In [37]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
from src import utils, nmf_utils, lda_utils
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords
#nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.nmf import Nmf

from collections import Counter
from operator import itemgetter

# NMF - Wall Street Forum

In [4]:
def bow_vectorizer(docs_raw, min_doc_freq, max_doc_freq, max_feats, ngram_rng):
    '''
    
    Args:
        
    Return:
    
    '''
    vectorizer = CountVectorizer(min_df=min_doc_freq, max_df=max_doc_freq, max_features=max_feats, ngram_range=ngram_rng)
    docs_vectorized = vectorizer.fit_transform(docs_raw)
    return docs_vectorized, vectorizer


In [5]:
def tfidf_vectorizer(docs_raw, min_doc_freq, max_doc_freq, max_feats, ngram_rng):
    '''
    
    Args:
        
    Return:
    
    '''
    vectorizer = TfidfVectorizer(min_df=min_doc_freq, max_df=max_doc_freq, max_features=max_feats, ngram_range=ngram_rng)
    docs_vectorized = vectorizer.fit_transform(docs_raw)
    return docs_vectorized, vectorizer

In [6]:
def print_topics(model, count_vectorizer, n_top_words):
    '''
   
    Args:
    
        
    Return:
    
        
    '''
    
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        top_words_arr = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("\nTopic #{}:".format(topic_idx))
        print(" ".join(top_words_arr))

In [27]:
# wsm = Wall Street Market
# append_to_stopwords = ['wsm']

stop_words = stopwords.words("english")# .append(append_to_stopwords)
stopwords_dict = Counter(stop_words)


def tokenize_lemmatize(text):
    '''
    
    
    Args:
        comment (str) : Content of the author's post in Wall Street Market.
        
    Return:
        lem_tokens (arr) : The cleaned, tokenized and lemmetized version of comment.
        
    '''
    
    
    
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in stopwords_dict]
    
    wordnet_lemma = nltk.WordNetLemmatizer()
    lem_tokens = [wordnet_lemma.lemmatize(t) for t in tokens]
    
    return lem_tokens

## Import Dataset and Model Inputs

In [28]:
wallstreet = pd.read_csv('data/wallstreet_master.csv')

In [29]:
# Must perform again b/c tokens are transformed to strings when saved to "data/wallstreet_master.csv" in "wallstreet_feat_eng_eda.ipynb"
wallstreet['tokens_for_nmf'] = wallstreet['cleaned_text'].apply(tokenize_lemmatize)

In [31]:
wallstreet.head()

Unnamed: 0,postID,threadID,threadTitle,subforum,authorName,postAuthorMembership,authorReputation,postSequence,flatContent,contentWithHTMLTag,post_date_dt,author_join_date,num_days_member_when_posted,wordcloud_text,cleaned_text,lemmatized_tokens,tokens_for_nmf
0,6,5,Hi,Introductions,Punka421,New member,-3,1,\nJust thought I'd introduce myself. I am new ...,"<div class=""entry-content"">\n<p>Just thought I...",2016-10-26 13:58:36,2016-10-26 00:00:00,0.0,thought id introduce new communities trying le...,just thought id introduce myself i am new to ...,"['thought', 'id', 'introduce', 'new', 'communi...","[thought, id, introduce, new, community, tryin..."
1,7,5,Hi,Introductions,WSM,Administrator,66,2,\nHello Nice to see you here!Regards\n,"<div class=""entry-content"">\n<p>Hello <img alt...",2016-10-26 14:04:04,2016-10-02 00:00:00,24.0,hello nice see hereregards,hello nice to see you hereregards,"['hello', 'nice', 'see', 'hereregards']","[hello, nice, see, hereregards]"
2,8,6,WSM Updates - Changelog (Page 1 of 4),Announcements,WSM,Administrator,74,1,\nHello everyone.I would like to tell you that...,"<div class=""entry-content"">\n<p>Hello everyone...",2016-10-26 16:54:27,2016-10-02 00:00:00,24.0,hello everyonei would like tell weve implement...,hello everyonei would like to tell you that w...,"['hello', 'everyonei', 'would', 'like', 'tell'...","[hello, everyonei, would, like, tell, weve, im..."
3,11,5,Hi,Introductions,Estrazy,Banned,0,3,\nHello Punka! nice to meet you!As you asked h...,"<div class=""entry-content"">\n<p>Hello Punka! n...",2016-10-27 14:00:16,2016-10-27 00:00:00,0.0,hello punka nice meet youas asked stay secure ...,hello punka nice to meet youas you asked how ...,"['hello', 'punka', 'nice', 'meet', 'youas', 'a...","[hello, punka, nice, meet, youas, asked, stay,..."
4,13,6,WSM Updates - Changelog (Page 1 of 4),Announcements,WSM,Administrator,74,2,"\nChangelog from Wednesday, 2nd November 2016F...","<div class=""entry-content"">\n<h5>Changelog fro...",2016-11-02 15:42:27,2016-10-02 00:00:00,31.0,changelog wednesday november lot small issues ...,changelog from wednesday november a lot of ...,"['changelog', 'wednesday', 'november', 'lot', ...","[changelog, wednesday, november, lot, small, i..."


In [32]:
docs_raw = wallstreet['tokens_for_nmf']

In [33]:
# BOW and Tf-Idf variables
min_doc_freq = 0.1
max_doc_freq = 0.85
max_feats = 10000
ngram_rng = [1, 2]

# NMF variables
num_topics = 20
init = 'nndsvd'
distance_method = 'frobenius'
regularization = 0

---
## Use Gensim to Identify Optimal Number of Topics

In [35]:
gensim_dict = Dictionary(docs_raw)
gensim_dict.filter_extremes(no_below=0.1, no_above=0.85, keep_n=10000)

corpus = [gensim_dict.doc2bow(doc) for doc in docs_raw]



In [43]:
corpus[:5]

[[(0, 1),
  (1, 1),
  (2, 2),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 2),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1)],
 [(27, 1), (28, 1), (29, 1)],
 [(1, 1),
  (5, 1),
  (11, 2),
  (12, 2),
  (13, 1),
  (14, 2),
  (19, 1),
  (27, 1),
  (30, 2),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 4),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 2),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 2),
  (56, 1),
  (57, 3),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 2),
  (70, 1),
  (71, 1),
  (72, 2),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 3),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  

In [44]:
num_topics_rng = list(np.arange(5, 75 + 1, 5)) 
coherence_scores = []

# NMF MODEL PARAMS
num_passes_over_training_docs = 5
gradient_desc_step = 0.1

# the "normalized" arg must equal true to be relevant.
floor_probs_for_topics = 0.01
normalize = True
num_training_docs_per_chunk = 5000

# W = topics by words
w_max_iter=300
w_stop_condition=0.0001

# H = articles by topics
h_max_iter=100
h_stop_condition=0.001
eval_every=10

for num in num_topics_rng:
    nmf = Nmf(corpus=corpus, num_topics=num, id2word=gensim_dict, chunksize=num_training_docs_per_chunk,  passes=num_passes_over_training_docs, kappa=gradient_desc_step,
             normalize=normalize, minimum_probability=floor_probs_for_topics, w_max_iter=w_max_iter, w_stop_condition=w_stop_condition, h_max_iter=h_max_iter,
             h_stop_condition=h_stop_condition, eval_every=eval_every)
    
    coherence_model = CoherenceModel(model=nmf, texts=docs_raw, dictionary=gensim_dict, coherence='c_v')
    
    coherence_scores.append(round(coherence_model.get_coherence(), 5))

scores = list(zip(num_topics_rng, coherence_scores))
topics_sorted_by_co_score = sorted(scores, key=itemgetter(1), reverse=True)[0][0]

print(topics_sorted_by_score)

NameError: name 'topics_sorted_by_score' is not defined

In [46]:
topics_sorted_by_co_score = sorted(scores, key=itemgetter(1), reverse=True)
print(topics_sorted_by_co_score)

[(10, 0.57119), (15, 0.54468), (5, 0.53721), (35, 0.52805), (20, 0.52619), (30, 0.52435), (25, 0.50427), (55, 0.5036), (60, 0.49503), (45, 0.49125), (65, 0.48518), (70, 0.48433), (40, 0.48411), (50, 0.48408), (75, 0.47384)]


---
## NMF - Bag of Words 

In [17]:
# bow_docs_vectorized,  bow_vectorizer = bow_vectorizer(docs_raw, min_doc_freq, max_doc_freq, max_feats, ngram_rng)

vectorizer = CountVectorizer(min_df=min_doc_freq, max_df=max_doc_freq, max_features=max_feats, ngram_range=ngram_rng)
docs_vectorized = vectorizer.fit_transform(docs_raw)
nmf_bow = NMF(n_components=num_topics, init='nndsvd', beta_loss=distance_method, l1_ratio=regularization).fit(bow_docs_vectorized)

In [18]:
feature_names = vectorizer.get_feature_names()

In [28]:
n_top_words = 2

NameError: name 'topic_table' is not defined

In [29]:
print_topics(nmf_bow, vectorizer, n_top_words)

IndexError: list index out of range

In [25]:
words = vectorizer.get_feature_names()
for topic_idx, topic in enumerate(nmf_bow.components_):
    top_words_arr = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print("\nTopic #{}:".format(topic_idx))
    print(" ".join(top_words_arr))

IndexError: list index out of range

In [None]:
--