## Eric Maxwell
## LSA Lab
<hr style="height:5px">

### Using sci.space data
<hr style="height:1px">

In [1]:
#Set up and get data
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.space']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [2]:
#Get stopwords set
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eek00\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Use BeautifulSoup to process documents and prepare for use with TFIDF
soup = BeautifulSoup(str(corpus), 'lxml')

corpusText = soup.findAll()
corpusDocs = [x.text for x in corpusText]
corpusDocs.pop(0)
corpusDocs = [x.lower() for x in corpusDocs]

In [4]:
#Update stopwords
stopset = set(stopwords.words('english'))
stopset.update(['edu', 'com', 'org','gov','bill','sci', 'space', 'mccall','higgins','nin','nthe',
                'larson','henry','spencer','ni','nlines','de','van', 'like','nsubject','posting',
                'article','see','net','na','gif','go','en','thing','faq','nnntp','host','much',
                'norganization','kc','au','dc','us','ca','also','would','far','un','could','university',
                'su','oz','use','must','cs','english','key3','ae','cmu','ics','uci','get','se','well',
                'writes','please', 'people', '14','92','93', 'princeton', 'one','two', 'good', 'toronto'])

In [5]:
#Before TFIDF
corpusDocs[0]



In [6]:
#Convert into sparse matrix with TFIDF
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpusDocs)

In [7]:
#Sparse matrix information
X[0]

<1x266020 sparse matrix of type '<class 'numpy.float64'>'
	with 266020 stored elements in Compressed Sparse Row format>

In [8]:
#View sparse matrix after TFIDF
print(X[0])

  (0, 33220)	0.00849003453689
  (0, 110202)	0.0122044246468
  (0, 23301)	0.0241435357143
  (0, 217416)	0.0100819160126
  (0, 179026)	0.0343633677836
  (0, 204293)	0.00126035189293
  (0, 78389)	0.00560456236066
  (0, 91784)	0.00453702667291
  (0, 36736)	0.0443104482515
  (0, 245392)	0.00640521412647
  (0, 7547)	0.0431418925975
  (0, 265694)	0.0433832401057
  (0, 37349)	0.00280303036465
  (0, 193263)	0.0348253831811
  (0, 179449)	0.0128433108647
  (0, 23240)	0.00141373012691
  (0, 19877)	0.0285565796472
  (0, 79575)	0.0149896692875
  (0, 200418)	0.00939623337832
  (0, 211242)	0.0691229357815
  (0, 25645)	0.0293676738766
  (0, 178421)	0.10915418609
  (0, 95529)	0.0232102347984
  (0, 146115)	0.0184694139435
  (0, 47042)	0.0114760086433
  :	:
  (0, 126065)	0.000259890919262
  (0, 47133)	0.000259890919262
  (0, 117181)	0.000259890919262
  (0, 20054)	0.000259890919262
  (0, 30219)	0.000259890919262
  (0, 120310)	0.000259890919262
  (0, 149579)	0.000259890919262
  (0, 83856)	0.000259890919262


In [9]:
#Shape of input matrix
X.shape

(338, 266020)

In [10]:
#Decompose input matrix
lsa = TruncatedSVD(n_components=25, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=25, n_iter=100,
       random_state=None, tol=0.0)

In [11]:
#Print first row of V matrix
lsa.components_[0]

array([ 0.02207287,  0.00109564,  0.00021662, ...,  0.00128298,
        0.00128298,  0.00128298])

In [12]:
import sys
print (sys.version)

3.6.1 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, 13:25:24) [MSC v.1900 64 bit (AMD64)]


In [13]:
#Display concepts
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
nasa
access
launch
earth
shuttle
time
moon
orbit
new
may
 
Concept 1:
many
model
ibm
fnal
oort
give
around
back
work
force
 
Concept 2:
oort
ibm
launch
many
cloud
venus
nicho
least
fnal
think
 
Concept 3:
launch
sky
oort
ibm
command
nicho
mass
night
boeing
lindgren
 
Concept 4:
system
data
universe
lunar
theory
ibm
earth
solar
images
information
 
Concept 5:
sky
science
loss
night
command
government
planet
think
first
orbit
 
Concept 6:
fnal
alaska
theory
cost
universe
physical
star
read
aurora
10
 
Concept 7:
launch
first
satellite
system
mars
spacecraft
theory
1993
10
earth
 
Concept 8:
hst
access
field
pat
access digex
digex
het
pluto
een
alaska
 
Concept 9:
mission
solar
venus
rockets
new
miles
surface
planet
first
lunar
 
Concept 10:
venus
theory
solar
dseg
vega
want
balloon
miles
power
universe
 
Concept 11:
nasa
het
een
satellite
images
pluto
op
right
nis
utrecht
 
Concept 12:
nasa
jpl
launch
mars
1993
year
project
orbit
field
observer
 
Concept 13:
center
theory
inco

In [14]:
lsa.components_

array([[ 0.02207287,  0.00109564,  0.00021662, ...,  0.00128298,
         0.00128298,  0.00128298],
       [-0.00218563, -0.00201432, -0.00073499, ..., -0.00116126,
        -0.00116126, -0.00116126],
       [-0.02640116,  0.00049571,  0.0008093 , ..., -0.00103922,
        -0.00103922, -0.00103922],
       ..., 
       [-0.01473928, -0.00076384, -0.00063096, ...,  0.00092467,
         0.00092467,  0.00092467],
       [ 0.00132992, -0.00052163, -0.00039902, ..., -0.00596557,
        -0.00596557, -0.00596557],
       [-0.01195573, -0.00437686, -0.00111742, ...,  0.00233148,
         0.00233148,  0.00233148]])