# Prac 5: Topic Modelling using LSA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# Sets matplotlib to inline mode so that graphs are displayed below the corresponding cell.
%matplotlib inline  
# Apply the 'fivethirtyeight' matplotlib style sheet, then seaborn's
# 'whitegrid' settings (seaborn is applied second, as in the original order).
plt.style.use('fivethirtyeight')
sns.set(color_codes=True, style='whitegrid')

#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# English stop words from NLTK, held as a set so the membership tests in
# clean_text are constant-time.
stop_words = set(stopwords.words('english'))

In [2]:
# Load the headlines CSV. The path is relative, so the file must sit in the
# notebook's working directory. Later cells use its 'publish_date' and
# 'headline_text' columns.
df = pd.read_csv('abcnews-date-text.csv')

In [3]:
# Drop the publish date; only the headline text is used below.
# Rebinding with errors='ignore' (instead of an inplace drop) keeps this cell
# safe to re-run: executing it a second time no longer raises KeyError.
df = df.drop(columns=['publish_date'], errors='ignore')

In [4]:
def clean_text(headline):
    """Return a lemmatised, stop-word-free version of one headline.

    The headline is tokenised with NLTK's ``word_tokenize``. Tokens found in
    the notebook-level ``stop_words`` set (case-sensitive match) and tokens of
    three characters or fewer are discarded. Each remaining token is passed
    through ``WordNetLemmatizer.lemmatize`` and the results are joined with
    single spaces.

    Parameters
    ----------
    headline : str
        Raw headline text.

    Returns
    -------
    str
        Space-separated lemmas; an empty string when no token survives.
    """
    lemmatizer = WordNetLemmatizer()
    kept = []
    for token in word_tokenize(headline):
        # Skip stop words first, then very short tokens.
        if token in stop_words:
            continue
        if len(token) <= 3:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

In [5]:
# Run clean_text over every raw headline and keep the result in a new column.
df['headline_cleaned_text'] = df.headline_text.apply(clean_text)

In [6]:
# The raw headlines are not needed once the cleaned column exists.
# Rebinding with errors='ignore' (instead of an inplace drop) keeps this cell
# safe to re-run: executing it a second time no longer raises KeyError.
df = df.drop(columns=['headline_text'], errors='ignore')

In [7]:
# Preview the first five rows of the cleaned frame.
df.head(5)

Unnamed: 0,headline_cleaned_text
0,decides community broadcasting licence
1,fire witness must aware defamation
2,call infrastructure protection summit
3,staff aust strike rise
4,strike affect australian traveller


# Extracting the features

In [8]:
# TF-IDF vectoriser whose vocabulary is capped at 1000 terms.
# scikit-learn 1.2+ validates `stop_words` and accepts only 'english', a list
# or None, so passing the set raises InvalidParameterError. Convert it to a
# list (sorted, so the argument is deterministic across runs).
vect = TfidfVectorizer(stop_words=sorted(stop_words), max_features=1000)

In [9]:
# Learn the vocabulary and IDF weights from the cleaned headlines and build
# the document-term matrix in one pass.
vect_text = vect.fit_transform(df.headline_cleaned_text)

In [10]:
# Show the matrix dimensions, then its stored (row, column) weight entries.
print(vect_text.shape, vect_text, sep="\n")

(1093281, 1000)
  (0, 507)	0.7808664326566723
  (0, 178)	0.6246980185257855
  (1, 575)	0.6344261126788409
  (1, 982)	0.6340174024214789
  (1, 320)	0.44218258782762476
  (2, 683)	0.8255691240254475
  (2, 120)	0.5643010025295501
  (3, 745)	0.4555401202137725
  (3, 846)	0.48936875561594206
  (3, 52)	0.5178741905948874
  (3, 830)	0.533673816687887
  (4, 54)	0.6508194560051154
  (4, 846)	0.759232530707955
  (5, 472)	0.7947699014549592
  (5, 977)	0.6069108696845646
  (6, 711)	1.0
  (7, 543)	0.5164389518088216
  (7, 343)	0.4823478536254996
  (7, 960)	0.5398043943633849
  (7, 51)	0.4574304023383868
  (8, 452)	0.4263247344974713
  (8, 200)	0.34763893401982066
  (8, 778)	0.450702975997798
  (8, 12)	0.5264434547215587
  (8, 52)	0.4659597715648655
  :	:
  (1093273, 393)	0.5033599258208994
  (1093274, 49)	0.6289811610547521
  (1093274, 830)	0.7774205419451019
  (1093275, 945)	0.513993466209999
  (1093275, 549)	0.4147189949636653
  (1093275, 877)	0.4333914165329414
  (1093275, 805)	0.422979467517374

In [11]:
# Inverse-document-frequency weights learned by the fitted vectoriser; they
# are paired one-to-one with the vocabulary terms in the next cell.
idf=vect.idf_

In [12]:
# Pair every vocabulary term with its IDF weight and rank the terms from the
# lowest IDF (most widespread across headlines) to the highest (rarest).
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

dd = dict(zip(feature_names, idf))
l = sorted(dd, key=dd.get)

# Terms with the lowest and highest IDF, followed by two spot checks.
print(l[0], l[-1])
print(dd['police'])
print(dd['forecast'])

police shop
4.423470857785716
7.938670552264349


# Latent Semantic Analysis (LSA)

In [13]:
from sklearn.decomposition import TruncatedSVD

# LSA: truncated SVD of the TF-IDF matrix, reduced to 10 components (topics).
# random_state is pinned so the randomized solver gives repeatable results.
lsa_model = TruncatedSVD(
    n_components=10,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)

# Fit the model and project every headline onto the 10 components.
lsa_top = lsa_model.fit_transform(vect_text)

In [14]:
# Show the document-topic matrix, then its dimensions.
print(lsa_top, lsa_top.shape, sep="\n")

[[ 9.71372990e-05  9.74621196e-03  2.40828560e-02 ... -8.73837587e-05
  -2.76001881e-03 -4.71492128e-04]
 [ 5.38024755e-04  5.09018771e-02  8.05111623e-02 ... -4.44691202e-02
   2.69886897e-02  1.23102458e-03]
 [ 5.30903781e-04  4.84053526e-02  1.49968429e-01 ... -7.74930642e-02
  -7.00245363e-04 -3.97849327e-02]
 ...
 [ 1.68011206e-03  1.42960980e-02  2.22267421e-02 ... -1.02617038e-03
   3.58910841e-03 -1.39351363e-03]
 [ 7.31404460e-05  4.17196796e-03  1.01954634e-02 ...  6.16221894e-03
   1.34376432e-04 -1.05918795e-03]
 [ 1.53872057e-04  1.26988867e-02  3.50453244e-02 ...  1.47034561e-03
   2.08891226e-04  7.21618681e-03]]
(1093281, 10)


In [15]:
# Topic weights of the first headline, printed scaled by 100.
l = lsa_top[0]
print("Document 0 :")
for topic_idx in range(len(l)):
    print("Topic ", topic_idx, " : ", l[topic_idx] * 100)

Document 0 :
Topic  0  :  0.00971372990006908
Topic  1  :  0.9746211955444511
Topic  2  :  2.4082856047676215
Topic  3  :  -0.7016646013301822
Topic  4  :  -0.2249388745261046
Topic  5  :  -0.011775256729257678
Topic  6  :  0.05392210930771517
Topic  7  :  -0.008738375866253627
Topic  8  :  -0.27600188100988293
Topic  9  :  -0.047149212813676684


In [16]:
# Most important words for each topic: the ten vocabulary terms with the
# largest weight in each LSA component.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    vocab = list(vect.get_feature_names_out())
else:
    vocab = vect.get_feature_names()

for i, comp in enumerate(lsa_model.components_):
    # Rank (term, weight) pairs by descending weight and keep the first ten.
    top_terms = sorted(zip(vocab, comp), key=lambda pair: pair[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for term, weight in top_terms:
        print(term, end=" ")
    print("\n")

Topic 0: 
interview extended michael john david smith james andrew mark police 

Topic 1: 
police death probe fire woman crash call court missing drug 

Topic 2: 
say plan council call govt back fire australia water court 

Topic 3: 
say australia police report need minister must world could expert 

Topic 4: 
court fire face woman murder charged charge accused death crash 

Topic 5: 
call medium say australia inquiry report prompt spark change opposition 

Topic 6: 
fire house home govt crew say blaze damage threat school 

Topic 7: 
australia back world south first australian test take win india 

Topic 8: 
council australia crash fire rate year rise death seek dy 

Topic 9: 
back report council fight market rural hit business news police 

