In [1]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\James\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.autos']
dataset = fetch_20newsgroups(subset='all',shuffle=True, random_state=42, categories=categories)
corpus = dataset.data

In [4]:
corpus[0]

u'From: mad9a@fermi.clas.Virginia.EDU (Michael A. Davis)\nSubject: Slick 50, any good?\nOrganization: University of Virginia\nLines: 9\n\n\n     Chances are that this has been discussed to death already, and\nif so could someone who has kept the discussion mail me or direct me \nto an archive site. Basically,\nI am just wondering if Slick 50 really does all it says that it does.\nAnd also, is there any data to support the claim.  Thanks for any info.\n\nMike Davis\nmad9a@fermi.clas.virginia.edu\n'

In [5]:
stopset = set(stopwords.words('english'))

In [6]:
stopset.update(['edu', 'uiuc', 'university', 'george', 'goucher', 'howell',
               'david', 'mi', 'charles', 'cc', '_', '__', '___', '____', 'berkeley', 'jim', 'com', 'bob', 'from', 'subject',
               'mike', 'davis', 'clas', '\n','alabama','al','alaska','ak','arizona','az','arkansas','ar',
                'california','ca','colorado','co','connecticut','ct','delaware','de','district of columbia',
                'dc','florida','fl','georgia','ga','hawaii','hi','idaho','id','illinois','il','indiana','in',
                'iowa','ia','kansas','ks''kentucky','ky','louisiana','la','maine','me','montana','mt','nebraska',
                'ne','nevada','nv','new hampshire','nh','new jersey','nj','new mexico','nm','new york','ny',
                'north carolina','nc','north dakota','nd','ohio','oh','oklahoma','ok','oregon','or','maryland',
                'md','massachusetts','ma','michigan','mi','minnesota','mn','mississippi','ms','missouri','mo',
                'pennsylvania','pa','rhode island','ri','south carolina','sc','south dakota','sd','tennessee',
                'tn','texas','tx','utah','ut','vermont','vt','virginia','va','washington','wa','west virginia',
                'wv','wisconsin','wi','wyoming','wy', 'umd', 'uwo', 'state', 'ak296', 'yfn', 'ysu', 'eliot', 're',
                'sfu', 'darren', 'gibbons', 'gibbonsa', 'andrew', 'spencer', 'john', 'dakar', '___________________the',
                'neilson', '_____', 'hou', 'us', 'cka52397', 'uxa', 'dave', 'davew', 'po', 'aas7', 'ucalgary', 'calgary',
                'kocrsv01', 'org', 'uwm', 'se', 'james', 'callison', 'tobias', '00', '______', 'danny', 'craig', 'boyle',
                'neil', 'ericsson', 'steve', 'mit', 'purdue', 'cmu', 'cmich', 'kevinh', 'tommy', 'erik', 'chris', 'brian',
                'austin', '000', 'cs', 'sig', 'usa', 'cleveland', 'robbie', 'uokmax', 'uoknor', 'ecn', 'att', 'nielsen',
                'les', 'bartel', 'ernie', 'smith', 'adn6285', 'erich', 'fred', 'ryan', 'wb3ffv', 'ampr', 'matthew',
                'houston', 'seattle', 'uk', 'nntp', 'drew', 'nancy', 'scott', 'thomas', 'ucs', 'au', 'oz', 'gov', 'caltech',
                'tom', 'tomm', 'wayne', 'byu', 'cso', 'acs', 'michael', 'oriolefan', 'cwru', 'lehigh', 'sho', 'ole', 'cdac',
                'navy', 'mil', 'uucp', 'robert', 'jeff', 'urbana'
])

In [7]:
vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1,3))
X = vectorizer.fit_transform(corpus)

In [8]:
X.shape

(990, 174808)

In [9]:
lsa = TruncatedSVD(n_components=990, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=990, n_iter=100,
       random_state=None, tol=0.0)

In [10]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_) :
    termsInComp = zip (terms,comp)
    sortedTerms = sorted(termsInComp, key=lambda x : x[1], reverse=True) [:10]
    print "Concept %d:" % i
    for term in sortedTerms:
        print term[0]
    print " "

Concept 0:
car
writes
article
would
cars
one
like
engine
lines
organization
 
Concept 1:
manta
opel
kadett
sold
sold buick
gt
buick
kadett sold
hard
news writes
 
Concept 2:
v12
v4
v6
v8
v12 vx
v4 v6 v8
v6 v8 v12
v8 v12
v8 v12 vx
v6 v8
 
Concept 3:
geico
insurance
claim
radar
heard
accident
also heard
policy
make claim
geico insurance
 
Concept 4:
geico
insurance
rocks
claim
read
kids
radar
also heard
companies esp
companies esp geico
 
Concept 5:
oil
diesels
diesel
emissions
changing oil
changing
magnus
particulate
installing
particulate matter
 
Concept 6:
oil
changing oil
mr2
magnus
article 1993apr15
changing
change
drain
self
cbnewsj cb
 
Concept 7:
mustang
oil
ford
magnus
drain
mach
mach iii
iii
camaro
nothing
 
Concept 8:
mr2
engine
isc rit
rit
isc
noisy
turbo
shafts
new
engr
 
Concept 9:
mr2
engine
engr
isc rit
rit
continental
isc
noisy
shafts
dec
 
Concept 10:
mr2
ssave
v6
engine
know
cars today
cars today engines
commericial
commericial cars
commericial cars today
 
Concept 11