In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import nltk
from nltk.stem import WordNetLemmatizer 

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [8]:
print(list(newsgroups_train.target_names))


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [13]:
(newsgroups_train.data[:1])


["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"]

In [12]:
def preprocess(text):
    return [w for w in gensim.utils.simple_preprocess(text) if w not in gensim.parsing.preprocessing.STOPWORDS and len(w)>3]
def lemmatize(text):
    return [WordNetLemmatizer().lemmatize(w) for w in text]

In [33]:
preproc_doc = []
for s in newsgroups_train.data:
    preproc_doc.append(lemmatize(preprocess(s)))
    

In [27]:
len(preproc_doc[0])

44

In [34]:
dwords = gensim.corpora.Dictionary(preproc_doc)

In [35]:
c = 0
for k,v in dwords.iteritems():
    c+=1
    print(k,v)
    if c==10:
        break

0 addition
1 body
2 bricklin
3 brought
4 bumper
5 called
6 college
7 door
8 early
9 engine


In [36]:
dwords.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


In [38]:
bow = [dwords.doc2bow(s) for s in preproc_doc]

In [52]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow)
corpus_tfidf = tfidf[bow]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.17920454305638),
 (1, 0.1557045715038686),
 (2, 0.17820568234586015),
 (3, 0.26624707395453623),
 (4, 0.11954604412278087),
 (5, 0.14059081526041642),
 (6, 0.36086526296005295),
 (7, 0.15655924672588417),
 (8, 0.18084941519666148),
 (9, 0.2768156369603258),
 (10, 0.15243681468300666),
 (11, 0.03887214933427212),
 (12, 0.12924340004639012),
 (13, 0.0542904516932204),
 (14, 0.17496366595665394),
 (15, 0.17057735051182651),
 (16, 0.11954604412278087),
 (17, 0.09729596258119415),
 (18, 0.2163318997580803),
 (19, 0.15243681468300666),
 (20, 0.25995696600677937),
 (21, 0.039529620151615),
 (22, 0.18736171692223094),
 (23, 0.03611836572705195),
 (24, 0.2080489209364778),
 (25, 0.1495978287256002),
 (26, 0.18933388735948237),
 (27, 0.13433864351100366),
 (28, 0.19438869444071102),
 (29, 0.18127002267051487),
 (30, 0.08606840322386297),
 (31, 0.07764795142240166),
 (32, 0.04267758110441974),
 (33, 0.16688198665969742),
 (34, 0.07815772488392199)]


In [54]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dwords, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.002*"people" + 0.002*"know" + 0.002*"article" + 0.002*"team" + 0.002*"christian" + 0.002*"university" + 0.001*"like" + 0.001*"reply" + 0.001*"time" + 0.001*"problem"
Topic: 1 Word: 0.005*"window" + 0.002*"game" + 0.002*"file" + 0.002*"university" + 0.002*"thanks" + 0.002*"like" + 0.002*"uiuc" + 0.002*"help" + 0.002*"problem" + 0.001*"need"
Topic: 2 Word: 0.002*"people" + 0.002*"government" + 0.002*"right" + 0.002*"article" + 0.002*"time" + 0.001*"like" + 0.001*"state" + 0.001*"problem" + 0.001*"year" + 0.001*"university"
Topic: 3 Word: 0.002*"israeli" + 0.002*"like" + 0.002*"bike" + 0.002*"university" + 0.002*"article" + 0.002*"know" + 0.002*"israel" + 0.002*"posting" + 0.002*"nntp" + 0.002*"gordon"
Topic: 4 Word: 0.002*"university" + 0.002*"space" + 0.002*"think" + 0.002*"know" + 0.002*"people" + 0.002*"good" + 0.002*"time" + 0.002*"player" + 0.002*"window" + 0.002*"like"
Topic: 5 Word: 0.002*"columbia" + 0.002*"card" + 0.002*"know" + 0.002*"cunixb" + 0.002*"universit

In [57]:
bow_test = dwords.doc2bow(lemmatize(preprocess(newsgroups_test.data[0])))

for index, score in sorted(lda_model_tfidf[bow_test], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

Score: 0.513777494430542	 Topic: 0.002*"university" + 0.002*"space" + 0.002*"think" + 0.002*"know" + 0.002*"people" + 0.002*"good" + 0.002*"time" + 0.002*"player" + 0.002*"window" + 0.002*"like"
Score: 0.28822562098503113	 Topic: 0.002*"keith" + 0.002*"ohio" + 0.002*"people" + 0.002*"caltech" + 0.002*"christian" + 0.002*"state" + 0.002*"university" + 0.002*"jesus" + 0.002*"article" + 0.002*"posting"
Score: 0.18091291189193726	 Topic: 0.003*"window" + 0.003*"file" + 0.003*"card" + 0.002*"driver" + 0.002*"problem" + 0.002*"thanks" + 0.002*"drive" + 0.002*"know" + 0.002*"chip" + 0.002*"like"


In [58]:
print(newsgroups_test.target[0])


7


In [61]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_tfidf, corpus=bow, texts=preproc_doc)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,8.0,0.8325,"window, file, card, driver, problem, thanks, d...","[lerxst, thing, subject, nntp, posting, host, ..."
1,1,3.0,0.4668,"israeli, like, bike, university, article, know...","[guykuo, carson, washington, subject, clock, p..."
2,2,8.0,0.7118,"window, file, card, driver, problem, thanks, d...","[twillis, purdue, thomas, willis, subject, que..."
3,3,1.0,0.3895,"window, game, file, university, thanks, like, ...","[jgreen, amber, green, subject, weitek, organi..."
4,4,8.0,0.5514,"window, file, card, driver, problem, thanks, d...","[head, harvard, jonathan, mcdowell, subject, s..."
5,5,2.0,0.907,"people, government, right, article, time, like...","[vttoulu, foxvog, douglas, subject, rewording,..."
6,6,8.0,0.6439,"window, file, card, driver, problem, thanks, d...","[bmdelane, quad, uchicago, brian, manning, del..."
7,7,8.0,0.5212,"window, file, card, driver, problem, thanks, d...","[bgrubb, dante, nmsu, grubb, subject, scsi, or..."
8,8,8.0,0.947,"window, file, card, driver, problem, thanks, d...","[holmes, iscsvax, subject, icon, help, organiz..."
9,9,8.0,0.7724,"window, file, card, driver, problem, thanks, d...","[kerr, uiuc, stan, kerr, subject, sigma, desig..."


In [82]:
for x in df_dominant_topic[df_dominant_topic['Dominant_Topic']==8]['Document_No']:
    print(newsgroups_train.data[x])


From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





From: twillis@ec.ecn.purdue.edu (Thomas E Willis)
Subject: PB questions...
Organization: Purdue University Engineering Computer Network
Distribution: usa
Lines: 36

well folks, my mac plus finally gave up the ghost this weekend after
starting life as a 512k way back in 1985.  s

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [62]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, bow, dictionary=lda_model_tfidf.id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
