In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np

In [2]:
dataset= fetch_20newsgroups(shuffle=True, random_state=1, remove=('header','footers','quotes'))
documents=dataset.data


In [3]:
# a tf-idf transformer is applied to the bag of words matrix that NMF must process with the TfidfVectorizer.
# LDA on the other hand, being a probabilistic graphical model (dealing with probabilities) only requires raw counts
# LDA => CountVectorizer

no_features=1000


In [4]:
# NMF is able to use tf-idf (Non-negative Matrix Factorization)
tfidf_vectorizer=TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf=tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names=tfidf_vectorizer.get_feature_names()


In [5]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer=CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf=tfidf_vectorizer.fit_transform(documents)
tf_feature_names=tfidf_vectorizer.get_feature_names()

In [6]:
# initializing the W and H matrices in NMF with 'nndsvd' rather than random initialization improves the time
# it takes for NMF to converge.
no_topics=20
# run NMF
nmf_model=NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W=nmf_model.transform(tfidf)
nmf_H=nmf_model.components_
# run LDA
lda_model=LatentDirichletAllocation(n_components=no_topics,max_iter=5, learning_method='online', learning_offset=50, random_state=0).fit(tf)
lda_W=lda_model.transform(tf)
lda_H=lda_model.components_

In [9]:
"""
both the words to topics matrix(H) and the topics to documents matrix (W) as arguments
"""

def display_topics(H,W,feature_names,no_top_words,no_top_documents):
    for topic_idx, topic in enumerate(H):
        print('Topic %d:'% topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1: -1]]))
    top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]

In [10]:
no_top_words=5
no_top_documents=2
print("NMF results:\n")
display_topics(nmf_H,nmf_W,tf_feature_names,no_top_words,no_top_documents)
print("\n")
print("LDA results:\n")
display_topics(lda_H,lda_W,tf_feature_names,no_top_words,no_top_documents)

NMF results:

Topic 0:
don just like think people
Topic 1:
edu nntp host posting university
Topic 2:
com posting nntp host hp
Topic 3:
windows file dos window files
Topic 4:
cs pitt gordon banks science
Topic 5:
nasa gov space center research
Topic 6:
ca canada university posting nntp
Topic 7:
god jesus bible christian christ
Topic 8:
key clipper chip encryption escrow
Topic 9:
cleveland cwru freenet reserve western
Topic 10:
sale 00 10 new distribution
Topic 11:
uk ac university newsreader host
Topic 12:
state ohio acs university edu
Topic 13:
access net communications online public
Topic 14:
drive scsi ide drives hard
Topic 15:
armenian armenians israel turkish serdar
Topic 16:
cc columbia edu utexas university
Topic 17:
virginia edu university israeli network
Topic 18:
netcom 408 services com online
Topic 19:
caltech keith institute technology california


LDA results:

Topic 0:
rochester gm cc parts new
Topic 1:
virginia caltech edu sgi institute
Topic 2:
radio com university edu j