# Non-Negative Matrix Factorization

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.externals import joblib
from sklearn import metrics
from tqdm import tqdm

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [5]:
from sklearn.datasets import fetch_20newsgroups

In [6]:
# Load the 20 newsgroups corpus with scikit-learn's default arguments;
# `newsgroups.data` (iterated by the preprocessing cell) holds the message texts.
newsgroups = fetch_20newsgroups()

## Preprocess data

In [7]:
# Keep a lower-cased copy of every message for modelling, and the first 100
# characters of the original text as a short preview snippet.
raw_documents = [text.lower() for text in newsgroups.data]
snippets = [text[:100] for text in newsgroups.data]
print("Read %d raw text documents" % len(raw_documents))

Read 11314 raw text documents


In [8]:
from nltk.tokenize import RegexpTokenizer
# Regex-based NLTK tokenizer.
# NOTE(review): `toker` is not referenced by any other cell visible in this
# notebook - confirm whether it is still needed.
toker = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+')

In [9]:
# Load the custom stop-word list: one entry per line, trimmed and lower-cased.
custom_stop_words = []
with open( "stopwords.txt", "r" ) as stopword_file:
    custom_stop_words.extend( entry.strip().lower() for entry in stopword_file )

print("Stopword list has %d entries" % len(custom_stop_words) )

Stopword list has 350 entries


In [10]:
# create BoW + tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer


# Fit a TF-IDF vectorizer on the lower-cased documents, dropping the custom stop
# words and every term that occurs in fewer than 20 documents (min_df=20).
vectorizer = TfidfVectorizer(stop_words=custom_stop_words, min_df = 20)
# A is the document-term matrix: one row per document, one column per kept term.
A = vectorizer.fit_transform(raw_documents)
print( "Created %d X %d TF-IDF-normalized document-term matrix" % (A.shape[0], A.shape[1]) )

# Term strings, later indexed by column position of A.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() - confirm against the pinned version.
terms = vectorizer.get_feature_names()
# Cache the matrix, vocabulary and snippets so later sections can reload them.
joblib.dump((A,terms,snippets), "articles-tfidf.pkl") 

Created 11314 X 8889 TF-IDF-normalized document-term matrix


['articles-tfidf.pkl']

In [11]:
# top features from tf-idf model
import operator


# Total TF-IDF weight of every term over the whole corpus (column sums of A).
sums = np.array(A.sum(axis=0)).ravel()
# Pair each term with its column total and list the 20 heaviest terms.
weights = dict(zip(terms, sums))
ranking = sorted(weights.items(), key=lambda item: item[1], reverse=True)
for term, weight in ranking[:20]:
    print( "%s (%.2f)" % ( term, weight ) )

edu (476.15)
com (343.46)
subject (213.89)
lines (212.11)
organization (209.69)
writes (193.92)
article (191.88)
university (183.79)
posting (167.86)
host (160.93)
nntp (159.33)
ca (151.82)
know (151.48)
people (150.03)
get (143.27)
cs (131.41)
think (131.30)
good (120.71)
time (114.49)
distribution (113.65)


## Apply NMF

In [161]:
# Reload the cached TF-IDF matrix, vocabulary and snippets.
# NOTE: joblib.load unpickles the file - only load artefacts this notebook wrote.
(A,terms,snippets) = joblib.load( "articles-tfidf.pkl")

# Create and fit the model here so that `k`, `W` and `H`, which the following
# cells read, exist after Restart & Run All instead of relying on leftover
# kernel state.
k = 20
model = NMF( init="nndsvd", n_components=k )

# W: document-topic weights, H: topic-term weights
W = model.fit_transform( A )
H = model.components_

In [162]:
# terms

In [163]:
# Tokenizer callable built from the fitted vectorizer; it is reused later to
# build the inverted index of the corpus.
tokenizer_tf = vectorizer.build_tokenizer()

In [164]:
# Topic weights of the first document, rounded to two decimals.
W[0,:].round(2)

array([ 0.01,  0.01,  0.01,  0.01,  0.01,  0.01,  0.01,  0.01,  0.01,
        0.01,  0.01,  0.01,  0.03,  0.01,  0.01,  0.01,  0.01,  0.01,
        0.01,  0.83])

In [165]:
# Shape of the topic-term matrix H.
H.shape

(20, 8889)

In [166]:
# Weight of the term 'geb' in every topic, rounded to two decimals.
H[:,terms.index('geb')].round(2)

array([ 25.59,   0.05,   0.05,   0.05,   0.05,   0.3 ,   0.05,   0.05,
         0.05,   0.05,   0.05,   0.05,   0.05,   0.08,   0.05,   0.05,
         0.05,   0.05,   0.05,   0.05])

In [99]:
# show topic descriptors
def get_descriptor(terms, H, topic_index, top):
    """Return the `top` highest-weighted terms of one topic.

    terms       -- sequence mapping a column index of H to its term
    H           -- topic-term weight matrix (one row per topic)
    topic_index -- row of H to describe
    top         -- number of terms to return
    """
    # argsort is ascending, so reverse it to get the heaviest columns first
    ranked_columns = np.argsort( H[topic_index,:] )[::-1]
    return [ terms[column] for column in ranked_columns[:top] ]

# Describe each of the k topics of the current model by its 20 heaviest terms
# and print them with 1-based topic numbers.
descriptors = [ get_descriptor( terms, H, topic_index, 20 ) for topic_index in range(k) ]
for topic_number, descriptor in enumerate( descriptors, start=1 ):
    print("Topic %02d: %s" % ( topic_number, ", ".join( descriptor ) ) )

Topic 01: com, netcom, hp, ibm, article, writes, sun, organization, subject, lines, att, stratus, posting, nntp, distribution, host, reply, corp, austin, mot
Topic 02: god, jesus, bible, christian, christians, christ, faith, believe, people, church, christianity, truth, life, hell, religion, sin, rutgers, heaven, man, think
Topic 03: edu, university, cs, posting, nntp, host, organization, subject, lines, article, cmu, writes, uiuc, washington, andrew, distribution, thanks, please, berkeley, mit
Topic 04: windows, window, file, dos, files, program, card, graphics, mouse, help, problem, screen, version, thanks, video, color, drivers, de, pc, system
Topic 05: pitt, geb, gordon, banks, cs, cadre, dsl, n3jxp, chastity, shameful, skepticism, intellect, surrender, pittsburgh, edu, univ, science, soon, computer, reply
Topic 06: key, clipper, chip, encryption, keys, escrow, government, algorithm, security, crypto, secure, nsa, system, secret, privacy, des, public, law, encrypted, wiretap
Topic 

In [19]:
def get_top_snippets( all_snippets, W, topic_index, top ):
    """Return the snippets of the `top` documents that weigh most on a topic.

    all_snippets -- sequence of snippets, indexed like the rows of W
    W            -- document-topic weight matrix (one row per document)
    topic_index  -- column of W to rank the documents by
    top          -- number of snippets to return
    """
    # argsort is ascending, so reverse it to get the strongest documents first
    ranked_docs = np.argsort( W[:,topic_index] )[::-1]
    return [ all_snippets[doc_index] for doc_index in ranked_docs[:top] ]

# Show the 20 documents most strongly associated with the first topic.
topic_snippets = get_top_snippets( snippets, W, 0, 20 )
for position, snippet in enumerate( topic_snippets, start=1 ):
    print("%02d. %s" % ( position, snippet ) )

01. From: bmdelane@midway.uchicago.edu (brian manning delaney)
Subject: RESULT: sci.life-extension passe
02. From: k4bnc@cbnewsh.cb.att.com (john.a.siegel)
Subject: Can't set COM4
Organization: AT&T
Distributi
03. From: tomk@skywalker.bocaraton.ibm.com (Thomas Chun-Hong Kok)
Subject: Re: MOOLIT and OLIT
Organizat
04. From: cdt@sw.stratus.com (C. D. Tavares)
Subject: Re: Ax the ATF
Organization: Stratus Computer, Inc
05. From: dana@lando.la.locus.com (Dana H. Myers)
Subject: What is a squid? (was Re: Riceburner Respect)
06. From: essbaum@rchland.vnet.ibm.com (Alexander Essbaum)
Subject: Re: ++BIKE SOLD OVER NET 600 MILES A
07. From: channui@austin.ibm.com (Christopher Chan-Nui)
Subject: Re: Two pointing devices in one COM-por
08. From: rosen@kranz.enet.dec.com (Jim Rosenkranz)
Subject: Re: Metal powder,steel,iron.
Reply-To: rose
09. From: vinlai@cbnewsb.cb.att.com (vincent.lai)
Subject: Third party car antennas ...
Organization: AT
10. From: slagle@lmsc.lockheed.com (Mark Slagle)
Subjec

In [167]:
# Persist the factor matrices, vocabulary and snippets; the file name encodes
# the current value of k.
joblib.dump((W,H,terms,snippets), "articles-model-nmf-k%02d.pkl" % k) 

['articles-model-nmf-k20.pkl']

## Topic coherence

In [200]:
# Range of topic counts to evaluate (both ends inclusive).
kmin, kmax = 4, 27

# Fit one NMF model per k and keep (k, W, H) for the coherence comparison.
# The loop variables k, model, W and H stay bound to the last fit afterwards.
topic_models = []
for k in tqdm(range(kmin,kmax+1)):
    model = NMF( init="nndsvd", n_components=k ) 
    # W: document-topic weights, H: topic-term weights
    W = model.fit_transform( A )
    H = model.components_    
    topic_models.append( (k,W,H) )

100%|██████████| 24/24 [03:12<00:00, 15.73s/it]


In [198]:
import re
import gensim


class TokenGenerator:
    """Re-iterable stream of token lists, one list per document.

    Each document is split with the pattern ``(?u)\\b\\w\\w+\\b`` (runs of at
    least two word characters). Tokens found in `stopwords` are replaced by
    the placeholder ``"<stopword>"`` so their position in the sentence is kept.
    """

    def __init__( self, documents, stopwords ):
        """Store the documents and stop words and compile the token pattern.

        documents -- iterable of strings; must be re-iterable if the
                     generator is consumed more than once
        stopwords -- iterable of tokens to mask
        """
        self.documents = documents
        self.stopwords = stopwords
        # Hashed copy: membership is tested once per token of the corpus, so a
        # linear scan of the original list would dominate the run time.
        self._stopword_set = frozenset( stopwords )
        self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )

    def __iter__( self ):
        """Yield the token list of every document, in document order."""
        for doc in self.documents:
            # The pattern only matches tokens of two or more characters, so no
            # extra length check is needed here.
            yield [
                "<stopword>" if tok in self._stopword_set else tok
                for tok in self.tokenizer.findall( doc )
            ]
            

# Stream the tokenised corpus into a Word2Vec model (sg=1, 500-dimensional
# vectors, tokens seen fewer than 20 times are ignored).
# NOTE(review): `size=` and `wv.vocab` are the pre-4.0 gensim spellings -
# confirm the pinned gensim version.
docgen = TokenGenerator( raw_documents, custom_stop_words )
w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)
print( "Model has %d terms" % len(w2v_model.wv.vocab) )
# Save the trained model to disk.
w2v_model.save("w2v-model.bin")

Model has 12267 terms


## Calculate topic coherence TC-W2V as the mean pairwise similarity of the top words of each topic

In [201]:
from itertools import combinations


def calculate_coherence( w2v_model, term_rankings ):
    """Mean TC-W2V coherence over all topics.

    For every topic the similarity of each unordered pair of its terms is
    averaged; the per-topic means are then averaged over the topics.

    w2v_model     -- a trained Word2Vec model (its `.wv` vectors are used) or
                     any object that exposes `similarity(term_a, term_b)`
    term_rankings -- one list of terms per topic

    Returns 0.0 for an empty `term_rankings`; a topic with fewer than two
    terms has no pairs and contributes 0.0.
    """
    # Word2Vec.similarity is deprecated in favour of the KeyedVectors method,
    # so prefer `.wv` when the object has it.
    similarity = getattr( w2v_model, "wv", w2v_model ).similarity
    if not term_rankings:
        return 0.0
    overall_coherence = 0.0
    for ranking in term_rankings:
        # check each pair of terms
        pair_scores = [ similarity( first, second ) for first, second in combinations( ranking, 2 ) ]
        if pair_scores:
            # get the mean for all pairs in this topic
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # get the mean score across all topics
    return overall_coherence / len(term_rankings)


# TC-W2V coherence of every NMF model, computed on the top 20 terms per topic.
# NOTE(review): the LDA and LSA TC-W2V cells use the top 10 terms - confirm the
# scores are meant to be compared across models.
k_values, coherences = [], []
for k, W, H in topic_models:
    term_rankings = [ get_descriptor( terms, H, topic_index, 20 ) for topic_index in range(k) ]
    # mean pairwise Word2Vec similarity of those terms
    score = calculate_coherence( w2v_model, term_rankings )
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=0.2602
K=05: Coherence=0.3121
K=06: Coherence=0.3457
K=07: Coherence=0.3595
K=08: Coherence=0.3585
K=09: Coherence=0.3602
K=10: Coherence=0.3726
K=11: Coherence=0.3758
K=12: Coherence=0.3821
K=13: Coherence=0.4034
K=14: Coherence=0.3943
K=15: Coherence=0.4017
K=16: Coherence=0.4077
K=17: Coherence=0.4042
K=18: Coherence=0.4099
K=19: Coherence=0.4090
K=20: Coherence=0.4175
K=21: Coherence=0.4156
K=22: Coherence=0.4166
K=23: Coherence=0.4183
K=24: Coherence=0.4170
K=25: Coherence=0.4221
K=26: Coherence=0.4142
K=27: Coherence=0.4159


## Calculate UCI

In [172]:
from collections import defaultdict

# Inverted index: token -> set of ids of the documents that contain it.
inverse = defaultdict(set)
for doc_id, document in enumerate(raw_documents):
    for token in tokenizer_tf(document):
        inverse[token].add(doc_id)
    

In [202]:

# Corpus size used to turn document counts into probabilities. It is derived
# from the corpus itself so it cannot drift from the data the inverted index
# was built on.
N_docs = len(raw_documents)
# Small constant added to joint probabilities so log() never receives 0.
eps = 10**(-12)
def PMI(eps,w1,w2): #for 2 words
    """Document-level pointwise mutual information of two words.

    Probabilities are document frequencies taken from the `inverse` index and
    divided by `N_docs`; `eps` is added to the joint probability before the
    logarithm is taken.
    """
    docs_w1, docs_w2 = inverse[w1], inverse[w2]
    joint = len(docs_w1 & docs_w2) / N_docs
    independent = (len(docs_w1) / N_docs) * (len(docs_w2) / N_docs)
    return np.log((joint + eps) / independent)

In [203]:
# PMI(eps, 'word', 'religion')

In [204]:
# Kept only so the module-level name stays defined; the function below derives
# the number of pairs from the rankings it is given instead of reading N.
N=20
def calculate_coherence_2(k, term_rankings):
    """UCI-style coherence: mean pairwise PMI per topic, averaged over k topics.

    k             -- number of topics the summed score is divided by
    term_rankings -- one list of terms per topic

    The per-topic mean uses the actual number of term pairs, so rankings of
    any length are normalised correctly. A topic with fewer than two terms has
    no pairs and contributes 0.
    """
    overall_coherence = 0
    for ranking in term_rankings:
        # check each pair of terms
        pair_scores = [PMI(eps, first, second) for first, second in combinations(ranking, 2)]
        if pair_scores:
            # get the mean for all pairs in this topic
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # get the mean score across all topics
    return overall_coherence/k

In [205]:
# UCI (PMI-based) coherence of every NMF model, on the top 20 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models:
    term_rankings = [get_descriptor( terms, H, topic_index, 20) for topic_index in range(k)]
    # mean pairwise PMI of those terms, averaged over the k topics
    score = calculate_coherence_2(k, term_rankings)
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=0.5660
K=05: Coherence=1.0084
K=06: Coherence=1.2218
K=07: Coherence=1.2777
K=08: Coherence=1.2073
K=09: Coherence=1.2485
K=10: Coherence=1.2925
K=11: Coherence=1.1710
K=12: Coherence=1.2735
K=13: Coherence=1.4813
K=14: Coherence=1.3544
K=15: Coherence=1.3418
K=16: Coherence=1.2012
K=17: Coherence=1.2863
K=18: Coherence=1.3451
K=19: Coherence=1.2422
K=20: Coherence=1.3030
K=21: Coherence=1.2243
K=22: Coherence=1.2488
K=23: Coherence=1.3117
K=24: Coherence=1.3211
K=25: Coherence=1.1111
K=26: Coherence=1.2350
K=27: Coherence=1.2481


## Calculate UMass


In [206]:
def log_of_probs(eps,w1,w2): #for 2 words
    """Log of the smoothed conditional document probability of w2 given w1.

    Computes log((p(w1, w2) + eps) / p(w1)) from the document frequencies in
    the `inverse` index, divided by `N_docs`.

    The original body compared `n_w2 > n_w2`, which is never true, so the
    denominator was always p(w1). That behaviour is kept: callers pass the
    higher-ranked term as w1, which is the conditioning word of the UMass
    measure.
    """
    n_of_both_occ = len(inverse[w1]&inverse[w2])
    p_w1 = len(inverse[w1])/N_docs
    p_w1w2 = n_of_both_occ/N_docs
    return np.log((p_w1w2+ eps) / (p_w1))

# Kept only so the module-level name stays defined; the function below derives
# the number of pairs from the rankings it is given instead of reading N.
N=20
def calculate_coherence_3(k, term_rankings):
    """UMass-style coherence: mean pairwise log conditional probability per
    topic, averaged over k topics.

    k             -- number of topics the summed score is divided by
    term_rankings -- one list of terms per topic, strongest term first

    The per-topic mean uses the actual number of term pairs, so rankings of
    any length are normalised correctly. A topic with fewer than two terms has
    no pairs and contributes 0.
    """
    overall_coherence = 0
    for ranking in term_rankings:
        # check each pair of terms; the higher-ranked term is passed first
        pair_scores = [log_of_probs(eps, first, second) for first, second in combinations(ranking, 2)]
        if pair_scores:
            # get the mean for all pairs in this topic
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # get the mean score across all topics
    return overall_coherence / k

In [207]:
# UMass coherence of every NMF model, on the top 20 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models:
    term_rankings = [ get_descriptor( terms, H, topic_index, 20 ) for topic_index in range(k) ]
    # mean pairwise log conditional probability, averaged over the k topics
    score = calculate_coherence_3(k,term_rankings)
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=-1.8877
K=05: Coherence=-1.7028
K=06: Coherence=-1.6600
K=07: Coherence=-1.7040
K=08: Coherence=-1.8751
K=09: Coherence=-1.9149
K=10: Coherence=-1.9412
K=11: Coherence=-2.0786
K=12: Coherence=-1.9617
K=13: Coherence=-1.8816
K=14: Coherence=-1.9164
K=15: Coherence=-1.9913
K=16: Coherence=-2.2139
K=17: Coherence=-2.0457
K=18: Coherence=-1.9799
K=19: Coherence=-2.1113
K=20: Coherence=-2.1566
K=21: Coherence=-2.3137
K=22: Coherence=-2.1902
K=23: Coherence=-2.1527
K=24: Coherence=-2.1918
K=25: Coherence=-2.4945
K=26: Coherence=-2.2721
K=27: Coherence=-2.2655


# LDA

In [208]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.externals import joblib
from sklearn import metrics
from tqdm import tqdm

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

In [209]:
# Reload the cached TF-IDF matrix, vocabulary and snippets for the LDA section.
A,terms,snippets = joblib.load("articles-tfidf.pkl")

In [213]:
# Range of topic counts to evaluate (both ends inclusive).
kmin, kmax = 4, 25

# Fit one LDA model per k and keep (k, W, H) for the coherence comparison.
# NOTE(review): A is the TF-IDF matrix; LDA is usually fit on raw term counts -
# confirm this input is intended.
topic_models_LDA = []
for k in tqdm(range(kmin,kmax+1)):
    model = LatentDirichletAllocation(n_components=k, max_iter=5,
                                learning_method='batch',
                                learning_offset=50.,
                                random_state=1)
    # W: document-topic matrix, H: topic-term matrix
    W = model.fit_transform( A )
    H = model.components_    
    topic_models_LDA.append( (k,W,H) )

100%|██████████| 22/22 [12:22<00:00, 32.85s/it]


## TC-W2V

In [214]:
# TC-W2V coherence of every LDA model, computed on the top 10 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models_LDA:
    term_rankings = [ get_descriptor( terms, H, topic_index, 10 ) for topic_index in range(k) ]
    # mean pairwise Word2Vec similarity of those terms
    score = calculate_coherence( w2v_model, term_rankings )
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ))

K=04: Coherence=0.2889
K=05: Coherence=0.2940
K=06: Coherence=0.2924
K=07: Coherence=0.2810
K=08: Coherence=0.2980
K=09: Coherence=0.2899
K=10: Coherence=0.3175
K=11: Coherence=0.3091
K=12: Coherence=0.3064
K=13: Coherence=0.3341
K=14: Coherence=0.3131
K=15: Coherence=0.3392
K=16: Coherence=0.3429
K=17: Coherence=0.3614
K=18: Coherence=0.3452
K=19: Coherence=0.3330
K=20: Coherence=0.3659
K=21: Coherence=0.3848
K=22: Coherence=0.3750
K=23: Coherence=0.3888
K=24: Coherence=0.3590
K=25: Coherence=0.3982


## UCI

In [215]:
# UCI (PMI-based) coherence of every LDA model, on the top 20 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models_LDA:
    term_rankings = [get_descriptor( terms, H, topic_index, 20) for topic_index in range(k)]
    # mean pairwise PMI of those terms, averaged over the k topics
    score = calculate_coherence_2(k, term_rankings)
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=-0.4645
K=05: Coherence=-2.2140
K=06: Coherence=-2.1885
K=07: Coherence=-1.7606
K=08: Coherence=-1.3123
K=09: Coherence=-1.4677
K=10: Coherence=-1.6211
K=11: Coherence=-2.0313
K=12: Coherence=-1.5209
K=13: Coherence=-1.0770
K=14: Coherence=-2.2817
K=15: Coherence=-3.0259
K=16: Coherence=-2.9542
K=17: Coherence=-4.1425
K=18: Coherence=-2.2705
K=19: Coherence=-3.0868
K=20: Coherence=-3.0116
K=21: Coherence=-3.2068
K=22: Coherence=-4.1823
K=23: Coherence=-4.1239
K=24: Coherence=-4.5041
K=25: Coherence=-4.4365


## UMass

In [216]:
# UMass coherence of every LDA model, on the top 20 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models_LDA:
    term_rankings = [ get_descriptor( terms, H, topic_index, 20 ) for topic_index in range(k) ]
    # mean pairwise log conditional probability, averaged over the k topics
    score = calculate_coherence_3(k,term_rankings)
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=-2.3943
K=05: Coherence=-4.8831
K=06: Coherence=-4.9652
K=07: Coherence=-4.3926
K=08: Coherence=-4.0870
K=09: Coherence=-4.2706
K=10: Coherence=-4.5308
K=11: Coherence=-4.9858
K=12: Coherence=-4.4749
K=13: Coherence=-4.1959
K=14: Coherence=-5.4487
K=15: Coherence=-6.6292
K=16: Coherence=-6.6080
K=17: Coherence=-8.0624
K=18: Coherence=-5.7084
K=19: Coherence=-6.8624
K=20: Coherence=-6.8123
K=21: Coherence=-7.0802
K=22: Coherence=-8.3505
K=23: Coherence=-8.4650
K=24: Coherence=-8.6532
K=25: Coherence=-8.7034


# LSA

In [217]:
from sklearn.decomposition import TruncatedSVD
from sklearn.externals import joblib
from sklearn import metrics
from tqdm import tqdm

In [219]:
# Range of topic counts to evaluate (both ends inclusive).
kmin, kmax = 4, 25

# Fit one LSA (truncated SVD) model per k and keep (k, W, H) for the coherence
# comparison. TruncatedSVD uses a randomized solver by default, so the seed is
# fixed (same value as the LDA section) to make the scores reproducible.
topic_models_LSA = []
for k in tqdm(range(kmin,kmax+1)):
    model = TruncatedSVD(n_components=k, random_state=1)
    # W: document-topic matrix, H: topic-term matrix
    W = model.fit_transform( A )
    H = model.components_
    topic_models_LSA.append( (k,W,H) )

100%|██████████| 22/22 [00:08<00:00,  1.87it/s]


## TC-W2V

In [220]:
# TC-W2V coherence of every LSA model, computed on the top 10 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models_LSA:
    term_rankings = [ get_descriptor( terms, H, topic_index, 10 ) for topic_index in range(k) ]
    # mean pairwise Word2Vec similarity of those terms
    score = calculate_coherence( w2v_model, term_rankings )
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=0.3417
K=05: Coherence=0.3710
K=06: Coherence=0.3590
K=07: Coherence=0.3381
K=08: Coherence=0.3645
K=09: Coherence=0.3659
K=10: Coherence=0.3564
K=11: Coherence=0.3391
K=12: Coherence=0.3636
K=13: Coherence=0.3531
K=14: Coherence=0.3839
K=15: Coherence=0.3654
K=16: Coherence=0.3606
K=17: Coherence=0.3590
K=18: Coherence=0.3562
K=19: Coherence=0.3508
K=20: Coherence=0.3731
K=21: Coherence=0.3621
K=22: Coherence=0.3478
K=23: Coherence=0.3518
K=24: Coherence=0.3643
K=25: Coherence=0.3503


## UCI

In [221]:
# UCI (PMI-based) coherence of every LSA model, on the top 20 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models_LSA:
    term_rankings = [get_descriptor( terms, H, topic_index, 20) for topic_index in range(k)]
    # mean pairwise PMI of those terms, averaged over the k topics
    score = calculate_coherence_2(k, term_rankings)
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=-0.0296
K=05: Coherence=0.1723
K=06: Coherence=0.0790
K=07: Coherence=-0.3688
K=08: Coherence=0.3128
K=09: Coherence=-0.2880
K=10: Coherence=-0.2323
K=11: Coherence=-0.1361
K=12: Coherence=-0.0836
K=13: Coherence=-0.5638
K=14: Coherence=-0.0680
K=15: Coherence=-0.6136
K=16: Coherence=-0.4266
K=17: Coherence=-0.4078
K=18: Coherence=-0.5192
K=19: Coherence=-0.9936
K=20: Coherence=-0.5855
K=21: Coherence=-0.7810
K=22: Coherence=-0.9532
K=23: Coherence=-0.8845
K=24: Coherence=-0.8733
K=25: Coherence=-0.8894


## UMass

In [222]:
# UMass coherence of every LSA model, on the top 20 terms per topic.
k_values, coherences = [], []
for k, W, H in topic_models_LSA:
    term_rankings = [ get_descriptor( terms, H, topic_index, 20 ) for topic_index in range(k) ]
    # mean pairwise log conditional probability, averaged over the k topics
    score = calculate_coherence_3(k,term_rankings)
    k_values.append( k )
    coherences.append( score )
    print("K=%02d: Coherence=%.4f" % ( k, score ) )

K=04: Coherence=-2.8086
K=05: Coherence=-2.7923
K=06: Coherence=-2.8982
K=07: Coherence=-3.4353
K=08: Coherence=-2.8520
K=09: Coherence=-3.4670
K=10: Coherence=-3.3958
K=11: Coherence=-3.4566
K=12: Coherence=-3.3997
K=13: Coherence=-3.8663
K=14: Coherence=-3.5777
K=15: Coherence=-4.0955
K=16: Coherence=-3.9557
K=17: Coherence=-3.9231
K=18: Coherence=-4.0124
K=19: Coherence=-4.5190
K=20: Coherence=-4.1324
K=21: Coherence=-4.2777
K=22: Coherence=-4.4550
K=23: Coherence=-4.4478
K=24: Coherence=-4.4444
K=25: Coherence=-4.4463


# Choose 3 best models 

## NMF:

TC: K=25: Coherence=0.4221

UCI: K=13: Coherence=1.4813

Umass: K=06: Coherence=-1.6600

## LDA

TC: K=25: Coherence=0.3982

UCI: K=04: Coherence=-0.4645

Umass: K=04: Coherence=-2.3943

## LSA

TC: K=14: Coherence=0.3839
        
UCI: K=08: Coherence=0.3128

Umass: K=05: Coherence=-2.7923

## Теперь выбираем из них самую хорошую по метрике

TC: NMF

UCI: NMF

Umass: NMF

## Теперь посмотрим на дескрипторы у NMF:

In [232]:
# Refit NMF for the three topic counts picked by the coherence measures.
# k, model, W and H stay bound afterwards and are read by later cells.
topic_choose = []
for k in [6,13,25]:
    model = NMF( init="nndsvd", n_components=k )
    W = model.fit_transform( A )
    H = model.components_
    topic_choose.append( (k,W,H) )

# Print the top-20 descriptor of every topic of every chosen model.
for (k,W,H) in topic_choose:
    print(k)
    # Start a fresh list per model: `descriptors[topic_index]` must refer to
    # the current model, not to a topic of a previously printed one.
    descriptors = []
    for topic_index in range(k):
        descriptors.append( get_descriptor( terms, H, topic_index, 20 ) )
        str_descriptor = ", ".join( descriptors[topic_index] )
        print("Topic %02d: %s" % ( topic_index+1, str_descriptor ) )
    print('________________')

6
Topic 01: com, article, writes, netcom, hp, nasa, ibm, car, organization, sun, subject, lines, gov, ca, access, posting, nntp, distribution, host, digex
Topic 02: god, people, jesus, think, believe, bible, christian, christians, life, faith, see, christ, jews, know, religion, israel, church, time, way, truth
Topic 03: edu, university, posting, host, nntp, article, cs, organization, writes, subject, lines, cc, state, cwru, ohio, game, cleveland, team, cmu, ca
Topic 04: windows, drive, card, dos, uk, scsi, file, ac, thanks, window, pc, help, problem, system, files, program, video, disk, graphics, mac
Topic 05: pitt, geb, gordon, banks, cs, cadre, dsl, n3jxp, chastity, shameful, skepticism, intellect, edu, surrender, pittsburgh, science, univ, soon, computer, reply
Topic 06: key, clipper, chip, encryption, keys, escrow, government, security, algorithm, crypto, system, public, nsa, secure, secret, privacy, law, des, enforcement, wiretap
________________
13
Topic 01: com, article, writes,

## Я выбираю NMF с K=25

# 2. Строим рекомендательную систему

In [265]:
from sklearn.neighbors import NearestNeighbors

In [335]:
# Nearest-neighbour index over the rows of the notebook-level W, i.e. over
# whichever model was fitted last before this cell runs.
neigh = NearestNeighbors(leaf_size=30, metric='euclidean').fit(W) # euclidean is the most standard metric

In [336]:
class NewsRecommender:
    """Recommend corpus documents whose NMF topic profile is closest to a text.

    The fitted model and the neighbour index are stored on the instance, so
    recommendations no longer depend on the notebook-level `model` and `neigh`
    variables. New text is still vectorised with the notebook-level
    `vectorizer`, and results are taken from `raw_documents`.
    """

    def train(self, texts, n_components=25):
        """Fit NMF on `texts` and index the resulting document-topic rows.

        texts        -- document-term matrix whose rows are aligned with
                        `raw_documents` (the notebook passes the TF-IDF matrix)
        n_components -- number of NMF topics
        """
        self.model = NMF( init="nndsvd", n_components=n_components )
        self.W = self.model.fit_transform(texts)
        self.H = self.model.components_
        # Index the topic profiles of the training documents for lookup.
        self.neighbors = NearestNeighbors(leaf_size=30, metric='euclidean').fit(self.W)

    def recommend(self, text_sample, k):
        """Return the `k` training documents closest to `text_sample`.

        Raises RuntimeError if train() has not been called. At most as many
        documents as were used for training are returned; a non-positive `k`
        yields an empty list.
        """
        if not hasattr(self, "neighbors"):
            raise RuntimeError("NewsRecommender.train() must be called before recommend()")
        if k <= 0:
            return []
        doc = vectorizer.transform([text_sample])
        doc_nmf = self.model.transform(doc)
        # Honour the requested number of neighbours, capped at the corpus size.
        n_neighbors = min(k, len(self.W))
        indices = self.neighbors.kneighbors(doc_nmf, n_neighbors=n_neighbors)[1][0]
        return [raw_documents[ind] for ind in indices]


In [337]:
# Create the recommender; it still has to be trained before use.
recom = NewsRecommender()

In [338]:
# Train on the TF-IDF document-term matrix.
recom.train(A)

### Текст из новостей BBC-2017

In [339]:
# Ask for 2 recommendations for a text that is not part of the training corpus.
r = recom.recommend('Nokia once dominated the mobile phone market but struggled after the launch of the iPhone a decade ago, and the subsequent release of Googles Android operating system. HMD Global had previously indicated it would release several Nokia-branded Android phones in 2017. It is expected to provide details of at least some of the other launches at another trade show - Barcelonas Mobile World Congress - in February. "The decision by HMD to launch its first Android smartphone into China is a reflection of the desire to meet the real world needs of consumers in different markets around the world," the firm said in a statement. "With over 552 million smartphone users in China in 2016, a figure that is predicted to grow to more than 593 million users by 2017, it is a strategically important market where premium design and quality is highly valued by consumers."', 2)

In [340]:
# Print every recommended document followed by a separator line.
for txt in r:
    print(txt)
    print('///////////////////////////')

from: sp1marse@kristin (marco seirio)
subject: flat globe
lines: 13
x-newsreader: tin 1.1 pl3


does anybody have an algorithm for "flattening" out a globe, or any other
parametric surface, that is definied parametrically. 
that is, i would like to take a sheet of paper and a knife and to be
able to calculate how i must cut in the paper so i can fold it to a
globe (or any other object).


      marco seirio - in real life sp1marse@caligula.his.se

 

 

///////////////////////////
from: smb@research.att.com (steven bellovin)
subject: re: clipper will corrupt cops (was wh proposal from police point of view)
organization: at&t bell laboratories
lines: 65

in article <1993apr21.041033.16550@news.clarkson.edu>, tuinstra@signal.ece.clarkson.edu.soe (dwight tuinstra) writes:
> the clear implication is that there are "legal" authorizations other
> than a court order.  just how leaky are these?

i don't have the wiretap statute handy.  but here's what the law says
on pen registers.  this is al

### Текст из обучающего корпуса

In [341]:
# Use a document from the training corpus itself as the query.
text = raw_documents[101]

In [342]:
# text

In [343]:
# Ask for 4 recommendations for the in-corpus query text.
m = recom.recommend(text, 4)

In [344]:
# Print every recommended document followed by a separator line.
for i in m:
    print(i)
    print('//////////////////////')

from: exuptr@exu.ericsson.se (patrick taylor, the sounding board)
subject: re: how to the disks copy protected.
nntp-posting-host: 138.85.253.85
organization: ericsson network systems, inc.
x-disclaimer: this article was posted by a user at ericsson.
              any opinions expressed are strictly those of the
              user and not necessarily those of ericsson.
lines: 36

in article <1993apr21.131908.29582@uhura.neoucom.edu> wtm@uhura.neoucom.edu (bill mayhew) writes:
>from: wtm@uhura.neoucom.edu (bill mayhew)
>subject: re: how to the disks copy protected.
>date: wed, 21 apr 1993 13:19:08 gmt

>write a good manual to go with the software.  the hassle of
>photocopying the manual is offset by simplicity of purchasing
>the package for only $15.  also, consider offering an inexpensive
>but attractive perc for registered users.  for instance, a coffee
>mug.  you could produce and mail the incentive for a couple of
>dollars, so consider pricing the product at $17.95.

or, _documentat