# Latent Semantic Analysis

## Importing necessary libraries

In [1]:
import nltk
nltk.download('stopwords')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Getting the dataset

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                          remove=('headers', 'footers', 'qoutes'))
docs = dataset.data
print("Number of documents : ", len(docs))
print("Target names of dataset : ")
print(dataset.target_names)

Number of documents :  11314
Target names of dataset : 
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


## Preprocessing the dataset

In [3]:
news_df = pd.DataFrame({'document':docs})
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z#]", " ")
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,James Hogan writes:\n\ntimmbake@mcl.ucsb.edu (...,james hogan writes timmbake ucsb bake timmons ...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,ron.roth@rose.com (ron roth) writes:\n\n|JB> ...,roth rose roth writes romdas uclink berkeley e...
11311,In article <1qn6tqINNmnf@senator-bedfellow.MIT...,article tqinnmnf senator bedfellow athena char...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [4]:
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

news_df['clean_doc'] = detokenized_doc

news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure story seem biased disagree statement...
1,James Hogan writes:\n\ntimmbake@mcl.ucsb.edu (...,james hogan writes timmbake ucsb bake timmons ...
2,Although I realize that principle is not one o...,although realize principle strongest points wo...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss proposal much ...
4,"Well, I will have to change the scoring on my ...",well change scoring playoff pool unfortunately...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist speaking t...
11310,ron.roth@rose.com (ron roth) writes:\n\n|JB> ...,roth rose roth writes romdas uclink berkeley e...
11311,In article <1qn6tqINNmnf@senator-bedfellow.MIT...,article tqinnmnf senator bedfellow athena char...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet orange micros grappler system upd...


## Converting into TF-IDF Matrix

In [5]:
vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 2000,
max_df = 0.5, 
smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])

X

<11314x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 440858 stored elements in Compressed Sparse Row format>

## Creating and using a SVD Model

In [6]:
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)
len(svd_model.components_)

20

In [7]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("\nTopic "+str(i+1)+": \n")
    for t in sorted_terms:
        print(t[0])


Topic 1: 

article
like
people
know
think
good
time

Topic 2: 

windows
thanks
card
drive
file
files
mail

Topic 3: 

game
team
games
year
season
players
play

Topic 4: 

drive
scsi
drives
hard
controller
disk
floppy

Topic 5: 

chip
encryption
clipper
government
keys
space
phone

Topic 6: 

thanks
mail
advance
know
email
looking
address

Topic 7: 

card
video
monitor
drivers
chip
cards
driver

Topic 8: 

israel
drive
israeli
game
jews
team
government

Topic 9: 

israel
israeli
article
jews
arab
bike
arabs

Topic 10: 

space
nasa
sale
card
jesus
earth
shuttle

Topic 11: 

window
israel
problem
application
motif
display
manager

Topic 12: 

windows
sale
offer
condition
price
shipping
good

Topic 13: 

article
israel
uiuc
sale
news
jesus
says

Topic 14: 

window
people
article
mail
armenian
armenians
turkish

Topic 15: 

windows
window
space
thanks
bike
jesus
nasa

Topic 16: 

bike
file
armenian
jesus
armenians
files
card

Topic 17: 

armenian
armenians
know
turkish
windows
chip
armenia