In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import utils, nlp_utils
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords

from datetime import datetime

nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\morga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Topic Modeling - Wall Street Market

## Data Processing

In [2]:
# Load the forum export, then take an independent copy of just the two columns
# used by the NLP steps so later column additions never touch `wallstreet`.
wallstreet = pd.read_csv('data/wallstreet.csv')
wall_nlp = wallstreet.loc[:, ['subforum', 'contentWithHTMLTag']].copy()

In [3]:
# Both derived columns are computed from the raw HTML column (not from each other).
# NOTE(review): judging by the helper names, the first yields cleaned text and the
# second lemmatized tokens -- confirm against nlp_utils.
raw_html = wall_nlp['contentWithHTMLTag']
wall_nlp['cleaned_content'] = raw_html.apply(nlp_utils.clean_parse_text)
wall_nlp['lemmatized_tokens'] = raw_html.apply(nlp_utils.clean_tokenize_lemmatize)

--- 
# Latent Dirichlet Allocation (LDA)

In [12]:
def vectorize_data_and_fit_lda_model(text_dataset, max_feats, freq_thresh, n_topics, learning_method, max_iter, random_state=3):
    '''
    Vectorizes the texts into a bag-of-words matrix and fits an LDA model on it.

    Args:
        text_dataset (arr) : An array comprised of numerous texts.
        max_feats (int) : Number of words to limit the bag-of-words
            (CountVectorizer `max_features`).
        freq_thresh (float) : Words that have a document frequency greater than
            the selected amount will be ignored (CountVectorizer `max_df`).
        n_topics (int) : Number of topics (LatentDirichletAllocation `n_components`).
        learning_method (str) : Forwarded to LatentDirichletAllocation `learning_method`.
        max_iter (int) : Forwarded to LatentDirichletAllocation `max_iter`.
        random_state (int) : Forwarded to LatentDirichletAllocation `random_state`.
    Return:
        doc_topics : Output of the LDA model's `fit_transform` on the vectorized texts.
        lda : The fitted LatentDirichletAllocation object.
        vect : The fitted CountVectorizer object.

    '''

    vectorizer = CountVectorizer(max_df=freq_thresh, max_features=max_feats)
    doc_term_matrix = vectorizer.fit_transform(text_dataset)

    topic_model = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method=learning_method,
        max_iter=max_iter,
        random_state=random_state,
    )
    doc_topic_matrix = topic_model.fit_transform(doc_term_matrix)

    return doc_topic_matrix, topic_model, vectorizer
    
# TODO: add a summarize_lda_topics(lda_model) helper.
    

In [13]:
# Fit the topic model on the cleaned post text.
cleaned_content = wall_nlp['cleaned_content'].values
doc_topics, lda_model, vect = vectorize_data_and_fit_lda_model(
    cleaned_content,
    max_feats=10000,
    freq_thresh=0.15,
    n_topics=10,
    learning_method='batch',
    max_iter=25,
    random_state=3,
)

# For every row of components_ (one row per topic), feature indices ordered
# from the largest weight to the smallest.
sorted_components = np.argsort(lda_model.components_, axis=1)[:, ::-1]

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on old installs.
if hasattr(vect, 'get_feature_names_out'):
    feat_names = np.asarray(vect.get_feature_names_out())
else:
    feat_names = np.array(vect.get_feature_names())

In [None]:
def print_topics(topics, feature_names, sorting, topics_per_chunk=6, n_words=20):
    '''
    Prints the top words of each topic as side-by-side, left-aligned columns.

    Args:
        topics (arr) : Topic indices to display; each one is used as a row index into `sorting`.
        feature_names (arr) : Vocabulary terms, looked up by the indices stored in `sorting`.
        sorting (arr) : 2-D array-like; row t holds the feature indices of topic t in
            display order (rank 0 is printed first).
        topics_per_chunk (int) : Number of topic columns printed next to each other.
        n_words (int) : Maximum number of words printed per topic. Capped at the
            number of columns in `sorting`.
    Return:
        None. The table is written to stdout.

    '''
    # Accept plain lists as well as arrays; the fancy indexing below needs ndarrays.
    feature_names = np.asarray(feature_names)
    sorting = np.asarray(sorting)

    # Never request more ranks than `sorting` provides. This replaces the old
    # blanket `except: pass`, which hid that case together with every real error.
    n_ranks = min(n_words, sorting.shape[1]) if sorting.ndim > 1 else 0

    for start in range(0, len(topics), topics_per_chunk):
        # The last chunk may hold fewer than topics_per_chunk topics.
        chunk = list(topics[start: start + topics_per_chunk])
        n_cols = len(chunk)

        # Header row followed by a separator row.
        print(("topic {:<8}" * n_cols).format(*chunk))
        print(("-------- {0:<5}" * n_cols).format(""))

        # One row per rank: the rank-th word of every topic in this chunk.
        for rank in range(n_ranks):
            print(("{:<14}" * n_cols).format(*feature_names[sorting[chunk, rank]]))
        print("\n")