In [1]:
#This is a dummy topic modeling project using sklearn. The algorithms implemented are LSA, NMF and LDA.

#Apart from NMF, the remaining parts are copied from the sklearn website.
#The original code was authored by
#Olivier Grisel <olivier.grisel@ensta.org>, Lars Buitinck and Chyi-Kwei Yau <chyikwei.yau@gmail.com>

from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.datasets import fetch_20newsgroups

import pandas as pd

In [2]:
# Experiment settings read by the cells that follow.
n_samples = 2_000     # how many newsgroup posts are kept for fitting
n_features = 1_000    # vocabulary cap handed to the vectorizers (max_features)
n_components = 10     # number of topics requested from each model
n_top_words = 20      # words listed per topic when printing

In [3]:
def print_top_words(model, feature_names, n_top_words, vector):
    """Print each topic's top words, then summarise the document-topic matrix.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``components_`` (iterated row by row, one row per topic)
        and a ``transform`` method.
    feature_names : sequence of str
        Vocabulary, indexed by the positions returned from ``argsort`` on a
        row of ``model.components_``.
    n_top_words : int
        Number of highest-weighted terms printed for each topic.
    vector : matrix
        Vectorized documents handed to ``model.transform``.

    Returns
    -------
    None
        Everything is emitted through ``print`` / ``display``.

    Notes
    -----
    ``display`` is expected to be provided by the notebook environment and
    ``pd`` by the notebook's import cell.
    Printed topics are numbered from 0 ("Topic #0") while the DataFrame
    columns are numbered from 1 ("Topic1").
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so take the last n_top_words indices in
        # reverse to get the heaviest terms first.
        top_terms = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))

    doc_topic = model.transform(vector)
    # Derive the column labels from the matrix itself instead of the
    # notebook-level n_components, so the function works for any model
    # regardless of what the global currently holds.
    topic_labels = ["Topic" + str(i + 1) for i in range(doc_topic.shape[1])]
    docsVStopics = pd.DataFrame(doc_topic, columns=topic_labels)
    print("Created a (%dx%d) document-topic matrix." % (docsVStopics.shape[0], docsVStopics.shape[1]))
    display(docsVStopics.head())

    # For every document pick the topic with the largest weight, then count
    # how many documents land on each topic.
    most_likely_topics = docsVStopics.idxmax(axis=1)
    display(most_likely_topics.groupby(most_likely_topics).count())

In [4]:
# Fetch the 20 newsgroups corpus ahead of vectorizing it. Several heuristics
# discard useless terms early: headers, footers and quoted replies are
# stripped from the posts here, while common English words and words found in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
data, _ = fetch_20newsgroups(
    return_X_y=True,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
# Only the first n_samples posts of the shuffled corpus are used.
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 1.412s.


In [5]:
# tf-idf weighted features: these feed the NMF models.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Raw term counts (tf): these feed LDA. Same filter settings as above.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    min_df=2,
    max_df=0.95,
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

Extracting tf-idf features for NMF...
done in 0.327s.
Extracting tf features for LDA...
done in 0.310s.



In [6]:
# Fit the NMF model (Frobenius norm) on the tf-idf matrix and report its topics.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
# NOTE(review): newer scikit-learn releases replace NMF's `alpha` keyword with
# `alpha_W` / `alpha_H`, which are scaled differently -- confirm against the
# installed version before porting the value.
nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when the installed version offers it and fall back otherwise.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words, tfidf)

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.348s.

Topics in NMF model (Frobenius norm):
Topic #0: just people don think like know time good make way really say right ve want did ll new use years
Topic #1: windows use dos using window program os drivers application help software pc running ms screen files version card code work
Topic #2: god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3: thanks know does mail advance hi info interested email anybody looking card help like appreciated information send list video need
Topic #4: car cars tires miles 00 new engine insurance price condition oil power speed good 000 brake year models used bought
Topic #5: edu soon com send university internet mit ftp mail cc pub article information hope program mac email home contact blood
Topic #6: file problem files format win sound ftp pub read save sit

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10
0,0.042416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.060045,0.0,0.014343,0.0,0.0,0.0,0.0,0.0,0.022147,0.0
2,0.053466,0.0,0.0,0.003553,0.0,0.0,0.0,0.0,0.0,0.0
3,0.038737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049669,0.136817
4,0.063264,0.0,0.0,0.035055,0.0,0.071749,0.0,0.0,0.0,0.0


Topic1     1076
Topic10      76
Topic2      166
Topic3       94
Topic4      156
Topic5       71
Topic6      100
Topic7       53
Topic8      102
Topic9      106
dtype: int64

In [7]:
# Fit a second NMF model on the same tf-idf matrix, this time minimising the
# generalized Kullback-Leibler divergence (requires the 'mu' solver setting
# used below), and report its topics.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
# NOTE(review): newer scikit-learn releases replace NMF's `alpha` keyword with
# `alpha_W` / `alpha_H`, which are scaled differently -- confirm against the
# installed version before porting the value.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5).fit(tfidf)

print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when the installed version offers it and fall back otherwise.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words, tfidf)

Fitting the NMF model (generalized Kullback-Leibler divergence) with tf-idf features, n_samples=2000 and n_features=1000...
done in 1.411s.

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: people don just like think did say time make know really right said things way ve course didn question probably
Topic #1: windows help thanks using hi looking info video dos pc does anybody ftp appreciated mail know advance available use card
Topic #2: god does jesus true book christian bible christians religion faith believe life church christ says know read exist lord people
Topic #3: thanks know bike interested mail like new car edu heard just price list email hear want cars thing sounds reply
Topic #4: 10 00 sale time power 12 new 15 year 30 offer condition 14 16 model 11 monitor 100 old 25
Topic #5: space government number public data states earth security water research nasa general 1993 phone information science technology provide blood internet
Topic #6: edu file com 

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10
0,0.068317,1.095911e-24,0.02602353,5.739486e-07,9.944853e-11,1.664704e-06,0.006042472,2e-06,1.095998e-06,2.571219e-08
1,0.021188,3.4840750000000003e-17,0.05043634,0.05761136,1.006154e-13,2.120389e-37,0.02240192,0.000132,0.0345629,2.92562e-05
2,0.07567,3.654122e-12,1.283711e-16,2.012305e-10,4.847435e-17,0.0120679,7.051928e-24,0.010729,8.798645e-24,1.28328e-14
3,0.022773,0.00335687,7.426167e-11,0.006134818,4.084129e-07,0.04853427,1.665267e-16,0.019819,0.02019029,0.07903332
4,0.035811,1.200152e-10,0.0001183205,0.0455164,2.3729490000000002e-17,4.309721e-11,0.04210543,0.001308,2.741609e-13,0.004122818


Topic1     422
Topic10    187
Topic2     242
Topic3     181
Topic4     181
Topic5     153
Topic6     189
Topic7     161
Topic8     150
Topic9     134
dtype: int64

In [8]:
# Fit LSA (truncated SVD) on the tf-idf matrix and report its topics.
print("Fitting the LSA model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))

# Seed the solver so a fresh "Restart & Run All" reproduces the same topics;
# the NMF and LDA cells in this notebook already pin random_state.
svd = TruncatedSVD(n_components=n_components, random_state=1)
lsa = svd.fit(tfidf)
# tfidf_feature_names comes from the NMF cell that runs before this one.
print_top_words(lsa, tfidf_feature_names, n_top_words, tfidf)

Fitting the LSA model with tf-idf features, n_samples=2000 and n_features=1000...
Topic #0: just like don know people think does good use time new god way ve want make thanks need really say
Topic #1: god people think just jesus don bible say law did government life believe christian faith christians said christ agree religion
Topic #2: god does thanks jesus know bible windows file faith christian mail christians edu christ hi advance help ftp true info
Topic #3: edu thanks know mail game car interested bike new com email send year team price like list soon games advance
Topic #4: know key government chip thanks edu clipper encryption people don does keys public mail law advance rights com right information
Topic #5: edu 00 key god com soon use 10 chip government university encryption keys data phone new sale send public internet
Topic #6: edu file think windows game files win program team soon ftp don play window try games just dos com problem
Topic #7: game key team year does games t

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10
0,0.135844,0.071581,-0.024119,-0.059449,0.027625,0.010805,0.023009,-0.012641,0.044563,-0.033628
1,0.214571,0.050947,0.02511,0.041518,-0.019502,-0.028281,0.001472,-0.068504,0.033101,0.006057
2,0.168457,0.05478,-0.026935,0.000432,0.075655,-0.031977,0.004912,-0.007174,0.064807,-0.110913
3,0.217901,-0.027692,-0.122089,-0.133785,0.060776,0.108518,-0.135529,0.094964,6.3e-05,0.125857
4,0.258675,-0.028494,0.05558,0.166974,0.074547,0.065404,0.055637,-0.017171,-0.054052,0.05815


Topic1     1588
Topic10      19
Topic2        4
Topic3       72
Topic4       50
Topic5       34
Topic6       85
Topic7       47
Topic8       62
Topic9       39
dtype: int64

In [9]:
# Fit LDA on the raw term-count matrix and report its topics.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when the installed version offers it and fall back otherwise.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words, tf)

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 3.887s.

Topics in LDA model:
Topic #0: edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1: don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2: christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3: drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4: hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5: god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6: 55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7: car year just cars new engine like bike good oil i

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10
0,0.003449,0.628598,0.003449,0.003449,0.003449,0.343811,0.003448,0.003449,0.003449,0.003449
1,0.003334,0.003335,0.969994,0.003334,0.003333,0.003334,0.003334,0.003334,0.003334,0.003334
2,0.003031,0.671062,0.003031,0.003031,0.058081,0.249643,0.00303,0.003031,0.003031,0.003031
3,0.002942,0.905849,0.002942,0.070616,0.002941,0.002942,0.002941,0.002942,0.002942,0.002943
4,0.122198,0.585805,0.005557,0.005556,0.005556,0.253103,0.005556,0.005557,0.005557,0.005556


Topic1     180
Topic10     88
Topic2     557
Topic3      68
Topic4     407
Topic5      23
Topic6     272
Topic7      43
Topic8     161
Topic9     201
dtype: int64