In [52]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF as NMF_sklearn
import pickle
from nmf import NMF
import pymongo
from pymongo import MongoClient

def build_text_vectorizer(contents, use_tfidf=True, use_stemmer=False, max_features=None):
    '''
    Build and return a **callable** for transforming text documents to vectors,
    as well as a vocabulary to map document-vector indices to words from the
    corpus.

    Parameters
    ----------
    contents : iterable of str
        The text documents the vectorizer is fitted on.
    use_tfidf : bool
        If True a `TfidfVectorizer` is used, otherwise a `CountVectorizer`
        (Bag-of-Words).
    use_stemmer : bool
        If True each kept token is passed through the Porter stemmer.
    max_features : int or None
        Forwarded to the vectorizer; if not None the vocabulary is limited to
        the `max_features` most common words in the corpus.

    Returns
    -------
    vectorizer : callable
        `vectorizer(X)` transforms an iterable of documents into a dense
        2-D numpy array using the fitted model.
    vocabulary : numpy.ndarray of str
        `vocabulary[j]` is the word belonging to column `j` of the vectors.
    '''

    Vectorizer = TfidfVectorizer if use_tfidf else CountVectorizer
    tokenizer = RegexpTokenizer(r"[\w']+")
    stem = PorterStemmer().stem if use_stemmer else (lambda x: x)
    stop_set = set(stopwords.words('english'))

    # Closure over the tokenizer et al. Stop words are dropped *before*
    # stemming, so the stop list is matched against unstemmed tokens.
    def tokenize(text):
        tokens = tokenizer.tokenize(text)
        return [stem(token) for token in tokens if token not in stop_set]

    vectorizer_model = Vectorizer(tokenizer=tokenize, max_features=max_features)
    vectorizer_model.fit(contents)

    # `get_feature_names()` was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back to the old name on old installations.
    if hasattr(vectorizer_model, 'get_feature_names_out'):
        feature_names = vectorizer_model.get_feature_names_out()
    else:
        feature_names = vectorizer_model.get_feature_names()
    # Going through a list yields the same string-dtype array for both APIs.
    vocabulary = np.array(list(feature_names))

    # Closure over the vectorizer_model's transform method.
    def vectorizer(X):
        return vectorizer_model.transform(X).toarray()

    return vectorizer, vocabulary


def softmax(v, temperature=1.0):
    '''
    A heuristic to convert arbitrary values into probabilities.
    See: https://en.wikipedia.org/wiki/Softmax_function

    Parameters
    ----------
    v : array-like of numbers
        The scores to convert. The normalisation runs over *all* elements.
    temperature : float
        Divides the scores before exponentiation; values below 1 sharpen the
        distribution towards the largest score.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `v` whose elements sum to 1 (an empty
        input yields an empty array).
    '''
    scaled = np.asarray(v, dtype=float) / temperature
    if scaled.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return scaled
    # Subtracting the maximum leaves the result mathematically unchanged
    # (the common factor cancels) but keeps np.exp from overflowing to inf,
    # which would otherwise turn the output into NaN for small temperatures.
    expv = np.exp(scaled - np.max(scaled))
    return expv / np.sum(expv)


def hand_label_topics(H, vocabulary, n_top_words=20):
    '''
    Print the most influential words of each latent topic, and prompt the user
    to label each topic. The user should use their humanness to figure out what
    each latent topic is capturing.

    Parameters
    ----------
    H : 2-D array-like
        One row per latent topic; larger values in a row mark more
        influential vocabulary entries for that topic.
    vocabulary : numpy.ndarray of str
        Maps a column index of `H` to its word. Must support indexing with an
        integer array.
    n_top_words : int
        How many of the highest-weighted words to show per topic.

    Returns
    -------
    list of str
        The label typed by the user for each topic, in row order.
    '''
    hand_labels = []
    for i, row in enumerate(H):
        # argsort is ascending, so reverse it to get the heaviest words first.
        top_indices = np.argsort(row)[::-1][:n_top_words]
        print('topic', i)
        print('-->', ' '.join(vocabulary[top_indices]))
        label = input('please label this topic: ')
        hand_labels.append(label)
        print()
    return hand_labels


def analyze_article(article_index, contents, web_urls, W, hand_labels):
    '''
    Print the URL and text of a single NYT article and return the index of
    its dominant topic.

    The article's row of `W` is turned into probabilities with a
    low-temperature softmax, and the position of the largest probability is
    returned. Only as many topics as there are entries in `hand_labels` are
    considered; ties go to the earliest topic, and 0 is returned when no
    probability is greater than zero.
    '''
    print(web_urls[article_index])
    print(contents[article_index])
    probs = softmax(W[article_index], temperature=0.01)

    best_index, best_prob = 0, 0
    for position, (prob, _label) in enumerate(zip(probs, hand_labels)):
        # Strict comparison: the first topic reaching the maximum wins.
        if prob > best_prob:
            best_index, best_prob = position, prob
    return best_index
    # TODO: assign all articles to their bin, then run tf-idf per bin.


In [74]:
# Run the unsupervised analysis of the NYT corpus, using NMF to find latent
# topics. The user will be prompted to label each latent topic, then every
# article is assigned to its dominant topic and stored in MongoDB.

# Load the corpus.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
df = pd.read_pickle("data/articles.pkl")
contents = df.content
web_urls = df.web_url

# Build our text-to-vector vectorizer, then vectorize our corpus.
vectorizer, vocabulary = build_text_vectorizer(contents,
                             use_tfidf=True,
                             use_stemmer=False,
                             max_features=5000)
X = vectorizer(contents)

# We'd like to see consistent results, so set the seed.
np.random.seed(12345)

# Factorize with scikit-learn's NMF. `alpha=0.0` used to be passed here; it
# was the default (no regularization) and the parameter no longer exists in
# current scikit-learn, so it is simply omitted.
nmf = NMF_sklearn(n_components=7, max_iter=100, random_state=12345)
W = nmf.fit_transform(X)
H = nmf.components_
print('reconstruction error:', nmf.reconstruction_err_)

# Label the topics by hand. Suggested labels from an earlier run were:
#  "football", "arts", "baseball", "world news (middle eastern?)",
#  "politics", "world news (war?)", "economics"
# The order in which the topics are printed may differ from this list, so
# label each topic from the words actually shown.
hand_labels = hand_label_topics(H, vocabulary)

# Every article is analyzed. This must come after W is computed; it
# previously ran before W existed and failed on a fresh kernel.
rand_articles = list(range(len(W)))

# One client/collection for the whole run instead of one per article.
cl = MongoClient()
coll = cl["nyt_clustering"]["clustered_articles"]

for i in rand_articles:
    clus = analyze_article(i, contents, web_urls, W, hand_labels)

    data = {"_id" : i, "content" : contents[i], "url" : web_urls[i], "cluster" : clus}
    # Upsert keyed on _id so that re-running the cell overwrites earlier
    # results instead of failing with a duplicate-key error.
    coll.replace_one({"_id" : i}, data, upsert=True)

#pickle.dump(H,open('nyt_clusters_H.p','wb'))
#pickle.dump(W,open('nyt_clusters_W.p','wb'))


reconstruction error: 35.39696222782327
topic 0
--> mr ms new art work like music show one said p year york dance opera time city song museum people

topic 1
--> game team season yard 0 touchdown league 1 l said n player first giant 2 coach quarterback play 3 win

topic 2
--> iran rouhani nuclear mr iranian obama israel united netanyahu president nation sanction state weapon speech israeli meeting said syria leader

topic 3
--> republican house health government care senate party law obama mr shutdown democrat bill would vote senator congress president conservative cruz


KeyboardInterrupt: 

In [50]:
nmf

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=100,
  n_components=7, random_state=12345, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [51]:
# Scratch check: draw 15 random row indices of W (one row per article).
np.random.choice(list(range(len(W))), 15)

array([ 482, 1309,  129,  382, 1381,  546,  769, 1142, 1393, 1339,  654,
        105, 1115,  759, 1398])

In [20]:
W.shape

(1405, 7)

In [23]:
[i for i in range(0,10)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [25]:
web_urls.head()

0    http://www.nytimes.com/2013/10/03/sports/footb...
1    http://www.nytimes.com/2013/10/03/us/new-immig...
2    http://www.nytimes.com/2013/10/03/us/arizona-j...
3    http://www.nytimes.com/2013/10/03/us/texas-sta...
4    http://www.nytimes.com/2013/10/03/sports/tenni...
Name: web_url, dtype: object

In [33]:
rand_articles

NameError: name 'rand_articles' is not defined

In [46]:
rand_articles

array([ 482, 1309,  129,  382, 1381,  546,  769, 1142, 1393, 1339,  654,
        105, 1115,  759, 1398])

In [54]:
df.shape

(1405, 15)

In [56]:
# Handle to the collection the clustering cell writes to; relies on the
# MongoClient `cl` created in that cell still being alive in the kernel.
bane = cl["nyt_clustering"]["clustered_articles"]

In [61]:
bane

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'nyt_clustering'), 'clustered_articles')

In [67]:
# Read the stored documents back from MongoDB into a DataFrame to verify
# the inserts (relies on `cl` from the clustering cell).
test_df = pd.DataFrame(list(cl["nyt_clustering"]["clustered_articles"].find()))

In [68]:
test_df.shape

(1405, 4)

In [69]:
test_df.columns

Index(['_id', 'cluster', 'content', 'url'], dtype='object')

In [73]:
# Number of stored articles assigned to each cluster index.
test_df['cluster'].value_counts()

0    367
4    285
1    273
5    268
3    102
2     61
6     49
Name: cluster, dtype: int64

(1405,)