<a href="https://colab.research.google.com/github/LCaravaggio/AnalisisPredictivo/blob/master/06_clasificación/NMF_vs_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation



In [24]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d - %s :" % (topic_idx, dataset['target_names'][topic_idx]))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [None]:
dataset = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
documents = dataset.data

no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 20

In [25]:
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0 - alt.atheism :
people time right did good said say make way government point really years going course long believe state fact world
Topic 1 - comp.graphics :
window problem using server application screen display motif manager running widget program problems set error mouse work code fine run
Topic 2 - comp.os.ms-windows.misc :
god jesus bible christ faith believe christian christians sin church lord hell truth life man love belief say christianity father
Topic 3 - comp.sys.ibm.pc.hardware :
game team year games season players play hockey win league player teams nhl good runs best better hit division points
Topic 4 - comp.sys.mac.hardware :
new 00 sale 10 price offer shipping condition 20 15 50 interested 12 asking 30 space 11 25 used sell
Topic 5 - comp.windows.x :
thanks mail advance hi looking info help information address appreciated email post know anybody send interested appreciate need reply tell
Topic 6 - misc.forsale :
windows file files dos program version ftp ms di

In [23]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0 - alt.atheism  :
00 black 50 white cover new 20 dos picture appears 15 supply ed price red son 30 man st light
Topic 1 - comp.graphics  :
windows card scsi use bit mac memory dos pc video drive color does using disk screen bus monitor ms mode
Topic 2 - comp.os.ms-windows.misc  :
said gun people children health guns women medical police weapons control says home went killed firearms house city saw day
Topic 3 - comp.sys.ibm.pc.hardware  :
new research science center national 1993 april use data states high united american information power service washington scientific used university
Topic 4 - comp.sys.mac.hardware  :
thanks like help use need know does looking work used want hi advance appreciated keyboard mail good pin information info
Topic 5 - comp.windows.x  :
edu file information image ftp files program version com cs graphics pub available info email internet mit use images comp
Topic 6 - misc.forsale  :
key chip encryption keys clipper use security bit number des chips 