In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pickle

In [2]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_components = n_topics
n_top_words = 20

In [3]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             # for i in topic.argsort()[:-n_top_words - 1:-1]])
                             for i in topic.argsort()[-n_top_words - 1:-1]])        
        print(message)
    print()

In [4]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                            remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]

In [5]:
tfidf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, 
                                max_features=n_features, 
                                stop_words='english')

tfidf = tfidf_vectorizer.fit_transform(data_samples)

In [6]:
# Fit the LDA model
print("Fitting LDA models with tf features")

lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, 
                               learning_method='online', learning_offset=5,
                               random_state=0)
lda.fit(tfidf)

print("\nTopics in LDA model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda, tfidf_feature_names, n_top_words)

Fitting LDA models with tf features





Topics in LDA model:
Topic #0: research anonymous program data sun message cs ca faq list information contact available pub university ftp graphics send mail com
Topic #1: let work bit got need ll point good want right make sure going think just use way ve know like
Topic #2: point book later faith does religion time state went know life christian don read just people believe bible jesus think
Topic #3: bios rom new help 16 scsi problem dos using software file use hard thanks pc drives card version windows disk
Topic #4: national school red low million dr test care dc cost 10 speed new medical years 1993 disease aids hiv health
Topic #5: mind matter thing men case doesn jews way did fact time things want israel make good don say does just
Topic #6: flyers 16 21 14 22 23 17 period 25 20 play 12 19 13 team 15 game 18 11 10
Topic #7: company numbers hit actually 500 model thing cubs don insurance cars bike engine think better new like good just year
Topic #8: called says told thought thi

In [7]:
lda.components_

array([[  2.96498777,   3.80509664,  12.58672843, ...,   0.634946  ,
          0.83225164,   0.14150743],
       [  0.11060803,   0.10390246,   4.47412157, ...,  30.27157747,
         52.90470471,   0.10107614],
       [  0.14274731,   0.11671734,   0.29606627, ...,   9.07623012,
         17.55213017,   0.10324781],
       ..., 
       [  6.1066453 ,  33.00823521,   9.46750063, ...,  30.33155237,
          0.50597395,   0.10109233],
       [  0.10648207,  63.23248969,  15.10528685, ...,  85.35526087,
         14.16544921,   0.15038472],
       [  1.5087933 ,   0.11818876,   4.72361323, ...,  25.79758316,
          0.10189482,   0.10094303]])

In [8]:
# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features")

nmf = NMF(n_components=n_components, random_state=1,
          alpha=.1, l1_ratio=.5)
nmf.fit(tfidf)

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fitting the NMF model (Frobenius norm) with tf-idf features

Topics in NMF model (Frobenius norm):
Topic #0: format package cs archive images image file files amiga objects server com ftp 3d 128 send ray mail pub graphics
Topic #1: high includes data transfer __ speed heads systems interface 16 floppy supports card feature controller rom drive bios hard drives
Topic #2: woman time did happened started took building come apartment like going came think went mamma don just know said didn
Topic #3: 24 23 31 38 51 32 21 86 40 28 75 70 60 42 van 36 44 48 66 72
Topic #4: dr page 20 number service study national april information research 10 1993 disease new medical children said care health aids
Topic #5: program theory sun purpose phone ibm commercial general mac dos math available anonymous pc ftp edu comments type contact machines
Topic #6: business authority code law designed act state following device use means application division person dangerous shall license military weapon firearm


In [9]:
# Fit the NMF model with Kullback-Leibler
print("Fitting the NMF model (generalized Kullback-Leibler divergence)")

nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5)
nmf.fit(tfidf)

print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Fitting the NMF model (generalized Kullback-Leibler divergence)

Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: list ftp message email image server using computer files pub stuff help use file graphics send windows thanks mail com
Topic #1: printer supports interface memory board feature high floppy use bios rom 16 power controller scsi speed drives card disk hard
Topic #2: says things old want say right come got time ll ve didn said think going did like don know just
Topic #3: 50 30 24 26 13 21 23 19 17 22 40 16 00 12 18 25 20 15 11 55
Topic #4: number insurance cost american greek aids information states president care 1993 time health year children national research hiv 000 public
Topic #5: current anonymous written runs ftp ibm edu university comments dos machines phone software mac program contact number pc available type
Topic #6: act shall following military weapons control gun means clipper keys encryption person section chip israel used state use gove