In [1]:
#  Jordan Hoover
#  CSC570R Fall 2016
#  Assignment: 
#  LSA lab, using Python 3.5

# LSA lab: 
Latent Semantic Analysis (LSA) on newsgroup posts

Steps: 
<li>First Steps are to import needed modules and grab the dataset, which are
the newsgroup posts.  I chose to use sci.electronics, but experimented using other newsgroups as well

<li> Then I need to split up or 'tokenize' the actual words that are in all these documents that I have 

<li> After pulling the words out of all the documents, then I need to run the LSA


In [2]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
categories = ['comp.sys.ibm.pc.hardware'] # name of newsgroup to use here
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data #corpus is now a list of documents(strings) from sci.electronics


In [3]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin130\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
stopset = set(stopwords.words('english'))
stopset.update(['\n'])

Now, we have the dataset, as a list of documents in 'corpus'
Next, I will use scikit-learn's TF-IDF vectorizer on my corpus and it will be 
converted into a sparse matrix of TFIDF Features 

In [6]:
# Before running the vectorizer, first document/post in my corpus looks like: 
corpus[0]

"From: nelson@crynwr.com (Russell Nelson)\nSubject: IP numbers on Ethernet Cards \nDistribution: world\nOrganization: Crynwr Software\nLines: 16\n\nIn article <1993Apr21.162512.217@uvm.edu> djohnson@moose.uvm.edu writes:\n\n   Is it possible through either pin configuration or through software\n   programming to change the IP numbers on an ethernet card?\n\nThe Ethernet card doesn't use the IP number (32-bits, usually\n#.#.#.#); it uses the Ethernet address (48-bits, usually\n#:#:#:#:#:#). I have never run across an Ethernet controller that\ncannot be programmed to use an address that is not assigned to it.\nDECNET requires this feature.\n\nHowever, that said, there is no reason to ever change the Ethernet\naddress.  They are globally unique, the first three bytes being\nassigned to the manufacturer by the IEEE, and the last three by the\nmanufacturer.\n\n-russ <nelson@crynwr.com> What canst *thou* say?\nCrynwr Software           Crynwr Software sells packet driver support.\n11 Grant S

In [7]:
# run vectorizer
vectorizer = TfidfVectorizer(stop_words=stopset, 
                             use_idf=True, ngram_range=(1, 3))
X= vectorizer.fit_transform(corpus)


After running vectorizer, first document looks like: 

In [8]:
X[0]

<1x178196 sparse matrix of type '<class 'numpy.float64'>'
	with 308 stored elements in Compressed Sparse Row format>

In [17]:
print(X[0])

  (0, 78293)	0.0524382715955
  (0, 67971)	0.0524382715955
  (0, 15642)	0.0524382715955
  (0, 7488)	0.0524382715955
  (0, 8450)	0.0524382715955
  (0, 2669)	0.0524382715955
  (0, 113648)	0.0524382715955
  (0, 124997)	0.0524382715955
  (0, 24436)	0.0524382715955
  (0, 102927)	0.0524382715955
  (0, 98103)	0.0524382715955
  (0, 169937)	0.0524382715955
  (0, 4715)	0.0524382715955
  (0, 7480)	0.0524382715955
  (0, 8449)	0.0524382715955
  (0, 149031)	0.0524382715955
  (0, 75894)	0.0524382715955
  (0, 1728)	0.0524382715955
  (0, 152849)	0.0524382715955
  (0, 58000)	0.0524382715955
  (0, 118580)	0.0524382715955
  (0, 141227)	0.0524382715955
  (0, 146397)	0.0524382715955
  (0, 48469)	0.0524382715955
  (0, 146160)	0.0524382715955
  :	:
  (0, 124280)	0.0247856220074
  (0, 176036)	0.0153717666755
  (0, 107138)	0.0473882358448
  (0, 55087)	0.0473882358448
  (0, 60098)	0.0236552107682
  (0, 167791)	0.0947764716895
  (0, 6257)	0.0444341543032
  (0, 3828)	0.0524382715955
  (0, 5071)	0.038261028874
  (0,

In [19]:
type(X)

scipy.sparse.csr.csr_matrix

In [18]:
type(X[0])

scipy.sparse.csr.csr_matrix

Now I can do the LSA

In [10]:
X.shape

(982, 178196)

In [11]:
X[0].shape

(1, 178196)

In [12]:
# I experimented with adjusting number of total concepts by adjusting n_components
lsa = TruncatedSVD(n_components=5, n_iter=200)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=5, n_iter=200,
       random_state=None, tol=0.0)

In [13]:
# show what the first row for V looks like
lsa.components_[0]

array([ 0.00953853,  0.00025195,  0.00025195, ...,  0.00034832,
        0.00034832,  0.00034832])

In [20]:
# Show versions I am using, for reference
import sys
print(sys.version)

3.5.2 |Anaconda 4.1.1 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]


In [15]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip (terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse =True)[:10]
    print("Concept %d:" %i)
    for term in sortedTerms:
        print(term[0])
    print("")

Concept 0:
scsi
ide
drive
edu
com
bus
controller
card
ibm
drives

Concept 1:
scsi
organization
16
use
problem
using
system
00 01 02
two
dos

Concept 2:
ide
drive
writes
one
use
drives
article
16
would
00 01 02

Concept 3:
com
ide
lines
know
bit
bus
os
dma
writes
need

Concept 4:
ide
article
time
host
austin
system
00 01 02
one
scsi
writes



In [16]:
lsa.components_

array([[  9.53852565e-03,   2.51951204e-04,   2.51951204e-04, ...,
          3.48322565e-04,   3.48322565e-04,   3.48322565e-04],
       [ -1.10937118e-02,   1.39663525e-02,   1.01909327e-02, ...,
         -3.92117440e-04,  -3.97861038e-04,  -3.97861038e-04],
       [ -1.08905451e-02,   1.01824907e-02,   1.38178411e-02, ...,
         -7.88291056e-05,  -5.41223723e-05,  -5.41223723e-05],
       [  1.05518408e-02,   1.15560580e-02,  -7.18131189e-02, ...,
         -5.64526306e-04,  -5.61507394e-04,  -5.61507394e-04],
       [  9.48883994e-03,  -3.17536422e-02,   2.92287969e-02, ...,
          1.34657866e-04,   1.26849385e-04,   1.26849385e-04]])