<a href="https://colab.research.google.com/github/NavedAFZ/NLP/blob/master/Topic_modelling_using_svd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [3]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [None]:
print("\n".join(newsgroups_train.data[:3]))

In [None]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[:3]]


array(['comp.graphics', 'talk.religion.misc', 'sci.space'], dtype='<U18')

In [6]:
newsgroups_train.target[:10]


array([1, 3, 2, 0, 2, 0, 2, 1, 2, 1])

In [7]:

num_topics, num_top_words = 6, 8

In [8]:
from sklearn.feature_extraction import stop_words

sorted(list(stop_words.ENGLISH_STOP_WORDS))[:20]



['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst']

#Stemming and LemmatizationÂ¶

In [9]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [10]:
from nltk import stem

In [11]:
wnl = stem.WordNetLemmatizer()
porter = stem.porter.PorterStemmer()

In [12]:
word_list = ['feet', 'foot', 'foots', 'footing']

[wnl.lemmatize(word) for word in word_list]


['foot', 'foot', 'foot', 'footing']

In [13]:
[porter.stem(word) for word in word_list]

['feet', 'foot', 'foot', 'foot']

#Data preprocessing

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [15]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
vectorizer = CountVectorizer(stop_words='english') #, tokenizer=LemmaTokenizer())
vectors = vectorizer.fit_transform(newsgroups_train.data).todense() # (documents, vocab)
vectors.shape 

(2034, 26576)

In [17]:
print(len(newsgroups_train.data), vectors.shape)


2034 (2034, 26576)


In [18]:
vocab = np.array(vectorizer.get_feature_names())


In [19]:
vocab.shape


(26576,)

In [20]:
vocab[7000:7020]

array(['cosmonauts', 'cosmos', 'cosponsored', 'cost', 'costa', 'costar',
       'costing', 'costly', 'costruction', 'costs', 'cosy', 'cote',
       'couched', 'couldn', 'council', 'councils', 'counsel',
       'counselees', 'counselor', 'count'], dtype='<U80')

#Singular Value Decomposition (SVD)

In [21]:
%time U, s, Vh = linalg.svd(vectors, full_matrices=False)

CPU times: user 1min 14s, sys: 3.89 s, total: 1min 18s
Wall time: 40.4 s


In [22]:
print(U.shape, s.shape, Vh.shape)


(2034, 2034) (2034,) (2034, 26576)


In [23]:
s[:4]
np.diag(np.diag(s[:4]))

array([433.92698542, 291.51012741, 240.71137677, 220.00048043])

# confirm that U, s, Vh is a decomposition of `vectors`

In [24]:
a=U @ np.diag(s) @ Vh
np.allclose(a,vectors)

True

#Confirm that U, Vh are orthonormal

In [None]:
np.allclose((U @ np.transpose(U)), np.eye(U.shape[0]))

True

In [None]:
np.allclose(( np.transpose(Vh) @ Vh), np.eye(Vh.shape[0]))

# Getting sense of dats

In [25]:
num_top_words=8

def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [26]:
show_topics(Vh[:10])

['ditto critus propagandist surname galacticentric kindergarten surreal imaginative',
 'jpeg gif file color quality image jfif format',
 'graphics edu pub mail 128 3d ray ftp',
 'jesus god matthew people atheists atheism does graphics',
 'image data processing analysis software available tools display',
 'god atheists atheism religious believe religion argument true',
 'space nasa lunar mars probe moon missions probes',
 'image probe surface lunar mars probes moon orbit',
 'argument fallacy conclusion example true ad argumentum premises',
 'space larson image theory universe physical nasa material']

In [32]:
a1=Vh[:10]

In [40]:
for i in a1:
  t1=[vocab[i] for i in np.argsort(i)][:-8:-1]
  print(t1)


['ditto', 'critus', 'propagandist', 'surname', 'galacticentric', 'kindergarten', 'surreal']
['jpeg', 'gif', 'file', 'color', 'quality', 'image', 'jfif']
['graphics', 'edu', 'pub', 'mail', '128', '3d', 'ray']
['jesus', 'god', 'matthew', 'people', 'atheists', 'atheism', 'does']
['image', 'data', 'processing', 'analysis', 'software', 'available', 'tools']
['god', 'atheists', 'atheism', 'religious', 'believe', 'religion', 'argument']
['space', 'nasa', 'lunar', 'mars', 'probe', 'moon', 'missions']
['image', 'probe', 'surface', 'lunar', 'mars', 'probes', 'moon']
['argument', 'fallacy', 'conclusion', 'example', 'true', 'ad', 'argumentum']
['space', 'larson', 'image', 'theory', 'universe', 'physical', 'nasa']


In [31]:
len(top_words)

8

# Truncated svd

In [41]:
time u, s, v = np.linalg.svd(vectors, full_matrices=False)


CPU times: user 1min 14s, sys: 3.63 s, total: 1min 18s
Wall time: 40.3 s


In [43]:
from sklearn import decomposition
#import fbpca
%time u, s, v = decomposition.randomized_svd(vectors, 10)


CPU times: user 11.5 s, sys: 1.52 s, total: 13 s
Wall time: 9.16 s


In [44]:
print(u.shape,s.shape,v.shape)

(2034, 10) (10,) (10, 26576)


In [None]:
#Randomized SVD from Facebook's library fbpca:
%time u, s, v = fbpca.pca(vectors, 10)