https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730

# NNMF

<img src = "600px-NMF.png">
<img src = "NMF - cost.png">



# Latent Dirichlet Allocation

<img src="https://github.com/deeplearning-itba/NLP-Tecnicas-Tradicionales/blob/master/LDA%20smoothed%20-%20plate.png?raw=1">

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <n>:`` header is printed,
    followed by one line holding the row's top terms separated by spaces,
    heaviest first.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            vector per topic; each vector must support ``argsort()``.
        feature_names: Indexable collection mapping a column index to a term.
        no_top_words: Number of top-weighted terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so flip it to rank the heaviest terms first.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print(" ".join(top_terms))

# Training split of 20 Newsgroups with headers, footers and quoted text removed.
dataset = fetch_20newsgroups(subset="train", remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Upper bound on the vocabulary size kept by both vectorizers.
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since scikit-learn 1.0).
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA is a probabilistic graphical model, so it is fitted on raw term counts
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 20

# Run NMF
# The single `alpha` argument was removed in scikit-learn 1.2. Its replacements,
# alpha_W and alpha_H, are multiplied internally by n_features and n_samples
# respectively, so divide by those to keep the penalty equivalent to alpha=.1.
nmf_alpha = .1
nmf_n_samples, nmf_n_features = tfidf.shape
nmf = NMF(
    n_components=no_topics,
    random_state=1,
    alpha_W=nmf_alpha / nmf_n_features,
    alpha_H=nmf_alpha / nmf_n_samples,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)

# Print the top terms of every topic, first for NMF and then for LDA.
no_top_words = 20
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
people time right did good said say make way government point really years going course long believe state fact world
Topic 1:
window problem using server application screen display motif manager running widget program problems set error mouse work code fine run
Topic 2:
god jesus bible christ faith believe christian christians sin church lord hell truth life man love belief say christianity father
Topic 3:
game team year games season players play hockey win league player teams nhl good runs best better hit division points
Topic 4:
new 00 sale 10 price offer shipping condition 20 15 50 interested 12 asking 30 space 11 25 used sell
Topic 5:
thanks mail advance hi looking info help information address appreciated email post know anybody send interested appreciate need reply tell
Topic 6:
windows file files dos program version ftp ms directory running pc run using os software drivers disk graphics win programs
Topic 7:
edu soon cs university ftp internet article email pub david s

In [2]:
# Project the documents into topic space with each fitted model, using the
# same matrix each model was fitted on (tf-idf for NMF, raw counts for LDA).
TNG_topics_nmf = nmf.transform(tfidf)
TNG_topics_lda = lda.transform(tf)



In [3]:
# Index of the largest NMF topic weight along the last axis (one per row).
TNG_topics_nmf.argmax(axis=-1)

array([15,  5,  5, ...,  1,  0,  4])

In [4]:
# One tally dict per topic, mapping every newsgroup name to a count of zero.
# Sized with no_topics so it stays in step with the fitted models instead of
# repeating the literal 20.
count_docs = [
    {label: 0 for label in dataset.target_names}
    for _ in range(no_topics)
]

In [5]:
# Tally, for every document, its newsgroup under its highest-weighted NMF topic.
for label, topic in zip(dataset.target, TNG_topics_nmf.argmax(axis=-1)):
    newsgroup = dataset.target_names[label]
    count_docs[topic][newsgroup] += 1

In [6]:
# Display the per-topic newsgroup tallies.
count_docs

[{'alt.atheism': 212,
  'comp.graphics': 63,
  'comp.os.ms-windows.misc': 57,
  'comp.sys.ibm.pc.hardware': 37,
  'comp.sys.mac.hardware': 56,
  'comp.windows.x': 31,
  'misc.forsale': 30,
  'rec.autos': 122,
  'rec.motorcycles': 183,
  'rec.sport.baseball': 111,
  'rec.sport.hockey': 68,
  'sci.crypt': 117,
  'sci.electronics': 109,
  'sci.med': 252,
  'sci.space': 209,
  'soc.religion.christian': 183,
  'talk.politics.guns': 325,
  'talk.politics.mideast': 411,
  'talk.politics.misc': 290,
  'talk.religion.misc': 179},
 {'alt.atheism': 3,
  'comp.graphics': 33,
  'comp.os.ms-windows.misc': 40,
  'comp.sys.ibm.pc.hardware': 26,
  'comp.sys.mac.hardware': 41,
  'comp.windows.x': 216,
  'misc.forsale': 2,
  'rec.autos': 18,
  'rec.motorcycles': 6,
  'rec.sport.baseball': 3,
  'rec.sport.hockey': 4,
  'sci.crypt': 4,
  'sci.electronics': 13,
  'sci.med': 12,
  'sci.space': 16,
  'soc.religion.christian': 3,
  'talk.politics.guns': 4,
  'talk.politics.mideast': 2,
  'talk.politics.misc': 

In [7]:
# Dimensions of the LDA-transformed matrix.
TNG_topics_lda.shape

(11314, 20)

In [8]:
from sklearn.neighbors import NearestNeighbors

In [9]:
# Two unfitted nearest-neighbour indexes using cosine distance, one per topic
# model; each is configured to return 20 neighbours per query.
neigh_lda = NearestNeighbors(n_neighbors=20, metric="cosine")
neigh_nmf = NearestNeighbors(n_neighbors=20, metric="cosine")

In [10]:
# Fit each index on the topic matrix produced by its own model.
neigh_lda.fit(TNG_topics_lda)
neigh_nmf.fit(TNG_topics_nmf)

NearestNeighbors(metric='cosine', n_neighbors=20)

In [11]:
# Query the 20 nearest neighbours of row 10; the [10:11] slice keeps the
# query two-dimensional.
# NOTE(review): neigh_nmf is fitted on the NMF topic matrix but is queried here
# with an LDA row; the two models' topic axes are not obviously aligned —
# confirm whether neigh_lda or TNG_topics_nmf[10:11] was intended.
neigh_nmf.kneighbors(TNG_topics_lda[10:11])

(array([[0.03345458, 0.03574881, 0.04886027, 0.06074025, 0.0620826 ,
         0.06545978, 0.0738174 , 0.07395766, 0.076047  , 0.08102642,
         0.08326114, 0.08328954, 0.08674353, 0.09631765, 0.09757136,
         0.09806657, 0.10118102, 0.1056692 , 0.11570665, 0.11947672]]),
 array([[ 7241,   426, 10631,  8883,  7262,  9572,  1690,  3943,  1990,
          3679,  6048, 11091,  7253,  1639,  9649, 10886,  4755,  6799,
         10250,  3312]]))

In [14]:
# Show the raw text of training document 724.
# NOTE(review): 724 is not among the neighbour indices printed by the
# kneighbors query above (7241 is) — confirm the intended index.
dataset.data[724]

'\nWell, actually, most of ours is based on what really happened and yours is\nbased on some fantasy of how it happened. But that\'s OK, I understand you\nhave a hockey background. Stats like "plus/minus" make RBI look good.\n\n\nOK, how about a straigh answer, then. Here\'s a very simele question to which\nI\'m sure a fair number of us are very interesed in the answer to. Please\nanswer yes or no, Roger:\n Can a pitcher cause the offensive players on his team to score more runs?\nAL only, please.\n\nFor anyone else following along, it is a well-known and demonstrable fact\nthat a team\'s win-loss record is closely related to the number of runs the\nteam scores and the number the team allows. It\'s not a definite,\nhard-and-fast function, but there is definitely a correlation. In fact, as a\nrule of thumb, if teams A and B both score X runs and team A allows Y runs,\nfor every 10 runs fewer than Y that team B allows, it will win another game.\nSo, for instance, if we look at the 1991 T