## implementation of LSA(latent semantic analysis)

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_colwidth',200)

In [5]:
from sklearn.datasets import fetch_20newsgroups

In [7]:
dataset=fetch_20newsgroups(shuffle=True,random_state=1,
                           remove=('header','footers','quotes'))
documents=dataset.data
len(documents)

11314

In [8]:
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### data preprocessing

In [9]:
news_df=pd.DataFrame({'document':documents})

news_df.head()

Unnamed: 0,document
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubject: Re: Israeli Terrorism\nOrganization: University of Virginia\nLines: 15"
1,"From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSubject: Re: Amusing atheists and agnostics\nLines: 66\n\n\n\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\nath..."
2,"From: bc744@cleveland.Freenet.Edu (Mark Ira Kaufman)\nSubject: Re: rejoinder. Questions to Israelis\nOrganization: Case Western Reserve University, Cleveland, Ohio (USA)\nLines: 38\nNNTP-Posting-H..."
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: Clipper- business as usual?\nArticle-I.D.: ole.1993Apr20.173039.4722\nOrganization: Cascade Design Automation\nLines: 17\n\n\n Notwithstanding all t...
4,"From: kkeller@mail.sas.upenn.edu (Keith Keller)\nSubject: Playoff pool rule revision\nOrganization: University of Pennsylvania, School of Arts and Sciences\nLines: 10\nNntp-Posting-Host: mail.sas...."


In [10]:
#remove everything except alphabets
news_df['clean_doc']=news_df['document'].str.replace("[^a-zA-Z#]"," ")

In [11]:
#remove short words
news_df['clean_doc']=news_df['clean_doc'].apply(
                lambda x:' '.join([w for w in x.split() if len(w)>3]))

In [12]:
#make all text lowercase
news_df['clean_doc']=news_df['clean_doc'].apply(lambda x:x.lower())

#### remove stopwords

In [13]:
from nltk.corpus import stopwords

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [17]:
stop_words=stopwords.words('english')

In [18]:
#tokenization
tokenized_doc=news_df['clean_doc'].apply(lambda x:x.split())

In [19]:
#remove stop-words
tokenized_doc=tokenized_doc.apply(lambda x:
                        [item for item in x if item not in stop_words])

In [20]:
#de-tokenization
detokenized_doc=[]
for i in range(len(news_df)):
    t=' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
    
news_df['clean_doc']=detokenized_doc

### document-term matrix