# Prac 5: Topic Modelling using LSA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Configure plotting for the notebook.
# Render matplotlib figures inline, directly below the corresponding cell.
%matplotlib inline  
# Apply matplotlib's 'fivethirtyeight' style sheet first ...
style.use('fivethirtyeight')
# ... then seaborn's 'whitegrid' theme. This call runs second, so for any
# setting both calls touch, seaborn's value is the one that remains.
sns.set(style='whitegrid',color_codes=True)

#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [2]:
# Load the headlines CSV from the notebook's working directory.
df = pd.read_csv('abcnews-date-text.csv')

In [3]:
# The publish date is not used below, so remove that column.
df.drop(columns=['publish_date'], inplace=True)

In [4]:
def clean_text(headline):
  """Return `headline` with stop words and short tokens removed, the rest lemmatized.

  The headline is tokenized with NLTK's `word_tokenize`. Tokens that appear in
  `stop_words` (compared case-insensitively) or that have three or fewer
  characters are dropped. The remaining tokens are passed through
  `WordNetLemmatizer.lemmatize` and joined with single spaces.

  Args:
    headline: Raw headline text. Non-string values (for example the NaN that
      pandas uses for a missing cell) and blank strings produce an empty
      string instead of raising inside the tokenizer.

  Returns:
    The cleaned headline as one space-separated string ("" when nothing
    survives the filtering).
  """
  # Guard before touching NLTK: the tokenizer needs real text to work on.
  if not isinstance(headline, str) or not headline.strip():
    return ""
  le=WordNetLemmatizer()
  word_tokens=word_tokenize(headline)
  # NLTK's English stop-word list is lowercase, so compare the lowercased token;
  # otherwise capitalised stop words such as "This" would slip through.
  tokens=[le.lemmatize(w) for w in word_tokens
          if w.lower() not in stop_words and len(w)>3]
  cleaned_text=" ".join(tokens)
  return cleaned_text

In [5]:
# Clean every headline and store the result in a new column.
cleaned_headlines = df['headline_text'].apply(clean_text)
df['headline_cleaned_text'] = cleaned_headlines

In [6]:
# Only the cleaned text is used from here on; drop the raw headline column.
df.drop(columns=['headline_text'], inplace=True)

In [7]:
# Preview the first five cleaned headlines.
df.head(5)

Unnamed: 0,headline_cleaned_text
0,decides community broadcasting licence
1,fire witness must aware defamation
2,call infrastructure protection summit
3,staff aust strike rise
4,strike affect australian traveller


# Extracting the features

In [8]:
# TF-IDF vectorizer whose vocabulary is capped at 1000 terms.
# stop_words is passed as a (sorted) list rather than the set itself: recent
# scikit-learn releases validate this parameter and accept only 'english',
# a list, or None, so a set raises InvalidParameterError when fitting.
vect = TfidfVectorizer(stop_words=sorted(stop_words), max_features=1000)

In [9]:
# Fit the vectorizer on the cleaned headlines and build the document-term matrix.
headlines = df['headline_cleaned_text']
vect_text = vect.fit_transform(headlines)

In [10]:
# Show the matrix dimensions, then the matrix's own printed form.
print(vect_text.shape, vect_text, sep='\n')

(1226258, 1000)
  (0, 507)	0.7830964759517771
  (0, 180)	0.6219002406752289
  (1, 575)	0.6350011874790689
  (1, 982)	0.634252507913514
  (1, 322)	0.44101842150367176
  (2, 850)	0.6547003749683041
  (2, 681)	0.6236747415657183
  (2, 124)	0.42707989387150563
  (3, 743)	0.4535400720072263
  (3, 842)	0.4901743730039168
  (3, 56)	0.5225164976545785
  (3, 826)	0.5301009307789318
  (4, 58)	0.6373967041659779
  (4, 842)	0.7705358145591605
  (5, 977)	1.0
  (6, 709)	1.0
  (7, 542)	0.5180441448099484
  (7, 345)	0.48078322560362474
  (7, 960)	0.535924910123952
  (7, 55)	0.46180325325287314
  (8, 452)	0.42931170443114947
  (8, 202)	0.34965626131197547
  (8, 775)	0.4483609681844147
  (8, 13)	0.5219542868074372
  (8, 56)	0.4690075948807512
  :	:
  (1226249, 831)	0.3778999040452349
  (1226250, 100)	0.5663193054263862
  (1226250, 941)	0.503939080598977
  (1226250, 548)	0.4622873009035714
  (1226250, 135)	0.4600198895370982
  (1226251, 344)	0.6080525617065659
  (1226251, 969)	0.7938967704948061
  (12262

In [11]:
# IDF weights learned by the fitted vectorizer.
idf = vect.idf_

In [12]:
# Pair every vocabulary term with its IDF weight.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
dd = dict(zip(feature_names, idf))

# Terms ordered by ascending IDF: first has the lowest weight, last the highest.
l = sorted(dd, key=dd.get)
print(l[0], l[-1])
print(dd['police'])
print(dd['forecast'])

police walk
4.440524277043323
7.916979273509669


# Latent Semantic Analysis (LSA)

In [13]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD of the TF-IDF matrix with 10 components; random_state is fixed
# so the randomized solver gives repeatable results.
lsa_model = TruncatedSVD(
    n_components=10,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)

# Fit the model and project every document onto the 10 components.
lsa_top = lsa_model.fit_transform(vect_text)

In [14]:
# Show the projected documents followed by the array's dimensions.
print(lsa_top, lsa_top.shape, sep='\n')

[[ 1.53029397e-04  1.01492896e-02  2.19680672e-02 ... -2.21296268e-03
   3.17245107e-03  1.32956025e-04]
 [ 7.48585858e-04  5.08976424e-02  7.87240765e-02 ...  1.18530502e-01
  -9.25239380e-02  1.51522464e-02]
 [ 5.37521234e-04  3.66229969e-02  9.96668480e-02 ...  3.24419227e-01
  -4.05173232e-02 -2.13994257e-02]
 ...
 [ 7.30007417e-04  3.32113316e-02  6.16869649e-02 ...  6.26249380e-04
   6.26462090e-02  4.76153017e-03]
 [ 2.30190071e-04  1.17542405e-02  2.81349680e-02 ...  2.49101219e-03
   1.41661310e-02  1.56714585e-03]
 [ 9.47397042e-04  7.22237294e-02  7.51528438e-02 ... -1.76142492e-03
   2.51522741e-01 -1.62705114e-01]]
(1226258, 10)


In [15]:
# Component weights of the first document. The values are multiplied by 100
# purely for display; they are signed projections, not percentages.
l = lsa_top[0]
print("Document 0 :")
for i, topic in enumerate(l):
  scaled = topic * 100
  print("Topic ", i, " : ", scaled)

Document 0 :
Topic  0  :  0.015302939715647319
Topic  1  :  1.0149289606607355
Topic  2  :  2.1968067180559117
Topic  3  :  -0.8919740217810643
Topic  4  :  -0.8020581144200037
Topic  5  :  -0.677594878379342
Topic  6  :  0.06418594298345061
Topic  7  :  -0.22129626758997958
Topic  8  :  0.31724510712124443
Topic  9  :  0.01329560245325193


In [16]:
# Ten highest-weighted vocabulary terms for each LSA component (topic).
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    vocab = vect.get_feature_names_out()
else:
    vocab = vect.get_feature_names()

for i, comp in enumerate(lsa_model.components_):
    # Pair each term with its weight in this component and keep the 10 largest.
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

Topic 0: 
interview extended michael john david smith james police andrew mark 

Topic 1: 
police death woman fire crash court say probe call murder 

Topic 2: 
say australia plan council call back govt fire court water 

Topic 3: 
say police need trump minister must expert wont needed hunt 

Topic 4: 
australia south world police first test china coronavirus india cricket 

Topic 5: 
court face woman murder fire charged charge accused death crash 

Topic 6: 
fire house sydney home crew govt australia plan school damage 

Topic 7: 
call fire say medium home house death australian inquiry spark 

Topic 8: 
australian woman crash back dy year open killed charged world 

Topic 9: 
back court police fire australian fight world hit face school 

