In [1]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import gensim
import gzip
# this allows plots to appear directly in the notebook
%matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups  # 20 news categories

# Fetch the train and test splits with identical settings; the generator
# yields the 'train' Bunch first and the 'test' Bunch second.
newsgroup_train, newsgroup_test = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)
print('done')

done


In [3]:
# Show the newsgroup category names that the integer labels refer to.
print(newsgroup_train.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
# Number of distinct newsgroup categories.
len(newsgroup_train.target_names)

20

In [5]:
# Peek at the first raw training post (headers + body) as a one-element list.
newsgroup_train.data[:1]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"]

In [6]:
# Shape of the filenames array for the training split.
print(newsgroup_train.filenames.shape)

(11314,)


In [7]:
# Shape of the label array for the training split.
print(newsgroup_train.target.shape)

(11314,)


In [8]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer,SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator so later steps that draw
# from it start from a known state.
np.random.seed(40)

In [9]:
# Single English Snowball stemmer, shared by lemmatize_stemming().
stemmer = SnowballStemmer('english')

In [10]:
# Functions that perform the preprocessing steps.

# Shared lemmatizer: built once here instead of once per token inside
# lemmatize_stemming().
_LEMMATIZER = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``), so inflected
    forms such as the past tense are mapped to the base form, and the
    result is then passed through the module-level ``stemmer``.

    Args:
        text: One token (a single word) as a string.

    Returns:
        The string returned by ``stemmer.stem`` for the lemmatized token.
    """
    return stemmer.stem(_LEMMATIZER.lemmatize(text, pos='v'))

# Tokenize
def preprocess(text):
    """Turn one raw document into a list of normalized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that
    are in gensim's ``STOPWORDS`` set, or that are three characters or
    shorter, are discarded; every remaining token is normalized with
    ``lemmatize_stemming``.

    Args:
        text: The raw document as a string.

    Returns:
        A list of normalized tokens, in their order of appearance.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [11]:
# Run every training post through preprocess(); each entry is the token
# list for the post at the same position in newsgroup_train.data.
preprocessed_docs = [preprocess(raw_post) for raw_post in newsgroup_train.data]

In [12]:
# Build the gensim Dictionary (token <-> integer id mapping) from the
# preprocessed training documents.
dictionary=gensim.corpora.Dictionary(preprocessed_docs)

In [13]:
# Vocabulary size before any filtering is applied.
len(dictionary)

61411

In [14]:
# Print the first 21 (id, token) pairs of the dictionary as a sanity check.
count = 0
for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 20:
        break

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten
11 funki
12 histori
13 host
14 info
15 know
16 late
17 lerxst
18 line
19 look
20 mail


In [15]:
# Prune the vocabulary in place using gensim's document-frequency cut-offs:
# no_below=15 drops tokens that occur in fewer than 15 documents and
# no_above=0.1 drops tokens that occur in more than 10% of the documents.
# NOTE: this mutates `dictionary`, so the size printed earlier is the
# pre-filter size.
dictionary.filter_extremes(no_below=15,no_above=0.1)

In [16]:
# Build the bag-of-words corpus: each preprocessed document is converted by
# dictionary.doc2bow into (token_id, count) pairs saying which dictionary
# words it contains and how many times each one appears.
bow_corpus = list(map(dictionary.doc2bow, preprocessed_docs))

In [17]:
# Preview BOW for our sample preprocessed document
document_num = 12
bow_doc_x = bow_corpus[document_num]

# Each entry is a (token_id, count) pair; look the token text up in the
# dictionary for display.
for word_id, word_count in bow_doc_x:
    line = "Word {} (\"{}\") appears {} time.".format(word_id,
                                                     dictionary[word_id],
                                                     word_count)
    print(line)

Word 83 ("email") appears 1 time.
Word 145 ("newsread") appears 1 time.
Word 156 ("version") appears 1 time.
Word 522 ("collin") appears 2 time.
Word 523 ("east") appears 1 time.
Word 524 ("fort") appears 2 time.
Word 525 ("harmoni") appears 1 time.
Word 526 ("hewlett") appears 2 time.
Word 527 ("packard") appears 2 time.
Word 528 ("regard") appears 1 time.


In [18]:
# Train an 8-topic LDA model on the bag-of-words corpus (10 passes over the
# corpus, 2 worker processes).
# random_state pins the model's own random generator, so re-running this
# cell does not depend on how much of NumPy's global random stream has
# already been consumed. NOTE(review): with more than one worker, small
# run-to-run differences may still occur.
lda_model=gensim.models.LdaMulticore(bow_corpus,num_topics=8,id2word=dictionary,
                                    passes=10,workers=2,random_state=40)

In [19]:
# Print every topic (-1 = all topics) with its highest-weighted words.
# The loop variable is named topic_id rather than `id` so the builtin id()
# is not shadowed for the rest of the notebook.
for topic_id,topic in lda_model.print_topics(-1):
    print("Topic :{} \nwords :{}".format(topic_id,topic))
    print("\n")

Topic :0 
words :0.013*"imag" + 0.011*"file" + 0.007*"program" + 0.007*"avail" + 0.007*"graphic" + 0.007*"window" + 0.007*"version" + 0.007*"server" + 0.006*"softwar" + 0.006*"widget"


Topic :1 
words :0.007*"bike" + 0.005*"drive" + 0.005*"engin" + 0.004*"car" + 0.004*"power" + 0.004*"light" + 0.003*"leav" + 0.003*"speed" + 0.003*"turn" + 0.003*"littl"


Topic :2 
words :0.017*"game" + 0.015*"team" + 0.011*"play" + 0.010*"player" + 0.007*"hockey" + 0.006*"season" + 0.005*"leagu" + 0.005*"score" + 0.004*"basebal" + 0.003*"divis"


Topic :3 
words :0.011*"encrypt" + 0.010*"govern" + 0.009*"chip" + 0.008*"secur" + 0.008*"clipper" + 0.007*"public" + 0.006*"key" + 0.005*"wire" + 0.005*"protect" + 0.004*"escrow"


Topic :4 
words :0.013*"space" + 0.011*"nasa" + 0.006*"program" + 0.005*"presid" + 0.005*"research" + 0.004*"list" + 0.004*"servic" + 0.004*"launch" + 0.004*"orbit" + 0.004*"nation"


Topic :5 
words :0.010*"christian" + 0.006*"exist" + 0.006*"jesus" + 0.005*"moral" + 0.004*"bibl"

In [49]:
# Test the model on an unseen document: take the post at index 50 of the
# test split, which was not part of the training data.
unseen_doc=newsgroup_test.data[50]
print(unseen_doc)

From: carter@ecf.toronto.edu (CARTER EDWARD A)
Subject: Re: Good Reasons to Wave at each other
Organization: University of Toronto, Engineering Computing Facility
Lines: 19

jlevine@rd.hydro.on.ca (Jody Levine) writes:
>Has anyone, while driving a cage, ever waved at bikers? I get the urge,
>but I've never actually done it.

Oh yeah, all the time.  On a nice spring/summer day, I roll down the window
and drive around looking for bikes.  When a bike motors by in the opposite
direction, I stick my arm out and hi5'em.  My arm feels like a million 
bucks when I'm doing this a 60km/h.  I do the same thing with cyclists.
The only problem with hi5ing a cyclist is their always in the right hand lane.
I hafta roll down the other window and hi5 them on the back.  Oh well, I 
think they appreciate the thought. 

Regards, Ted.

---
University of Toronto Computer Engineering               
PowerUsersGroupChairman
'89 FZR600: I'm taking a ride with my best friend.                  DoD#:886699




In [50]:
# Apply the same preprocessing used for the training posts, then convert the
# tokens to a bag-of-words vector with the training dictionary.
bow_vector=dictionary.doc2bow(preprocess(unseen_doc))

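For each topic the model assigns to the unseen document (out of the 8 topics configured above), we print its score together with the topic itself. The second argument to `print_topic` is 5, so only the five highest-weighted words of each topic are shown; five words are enough to recognise what a topic is about.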

In [51]:
# lda_model[bow_vector] yields (topic_id, score) pairs. Rank them by score,
# highest first; a plain sorted() would order by topic id instead.
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index,score in ranked_topics:
    print("Score:{}? \nTopic :{} ".format(score,lda_model.print_topic(index,5)))

Score:0.8042205572128296? 
Topic :0.007*"bike" + 0.005*"drive" + 0.005*"engin" + 0.004*"car" + 0.004*"power" 
Score:0.13118712604045868? 
Topic :0.017*"game" + 0.015*"team" + 0.011*"play" + 0.010*"player" + 0.007*"hockey" 
Score:0.051262930035591125? 
Topic :0.015*"window" + 0.012*"drive" + 0.010*"card" + 0.009*"file" + 0.007*"program" 


In [52]:
# Ground-truth label of the same test post (index 50); presumably an integer
# index into newsgroup_test.target_names.
print('Actual Topic',newsgroup_test.target[50])

Actual Topic 8


In [53]:
# Category names for the test split, to look up the label printed above.
print(newsgroup_test.target_names)


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
