In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
from src import utils, nlp_utils
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords

from datetime import datetime

#nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# LDA - Wall Street Market

## Data Processing

In [2]:
# One-off preprocessing, kept commented out: it builds the `wall_nlp` frame from the
# raw export and writes it to data/wall_nlp.csv, the file a later cell reads back.
# Uncomment and run this cell to regenerate that CSV.
# wallstreet = pd.read_csv('data/wallstreet.csv')
# wall_nlp = wallstreet[['subforum', 'contentWithHTMLTag']].copy()
# wall_nlp['cleaned_content'] = wall_nlp['contentWithHTMLTag'].apply(lambda x: nlp_utils.clean_parse_text(x))
# wall_nlp['lemmatized_tokens'] = wall_nlp['cleaned_content'].apply(lambda x: nlp_utils.tokenize_lemmatize(x))
# wall_nlp.to_csv('data/wall_nlp.csv', index=False)

--- 
# Bag of Words

In [3]:
# wall_nlp.head()

In [4]:
# Load the preprocessed posts from CSV; later cells read its 'lemmatized_tokens' column.
wall_nlp = pd.read_csv('data/wall_nlp.csv')

In [None]:
# Must be even for visualizations: the bar-chart cell splits the topics
# evenly across two side-by-side plots.
n_topics = 20
max_doc_freq = 0.3

cleaned_content = wall_nlp['lemmatized_tokens']

# Vectorize the tokens as a bag of words and fit LDA through the project helper.
# NOTE(review): the remaining positional arguments (10000, 'batch', 25, 3) are
# passed through as-is; their meaning is defined in nlp_utils -- confirm there.
lda_output_bow, lda_model_bow, docs_vectorized_bow, vect_bow = nlp_utils.convert_to_bow_and_fit_lda_model(cleaned_content, 10000, max_doc_freq, n_topics, 'batch', 25, 3)

# For each topic (row of components_), feature indices ordered from the
# highest to the lowest weight.
# Fixed: this previously read `lda_model_bow,components_` (comma instead of a
# dot), which raised a NameError on `components_`.
sorted_components = np.argsort(lda_model_bow.components_, axis=1)[:, ::-1]
feat_names = np.array(vect_bow.get_feature_names())

In [None]:
# Print a summary of the fitted bag-of-words topics via the project helper
# (presumably the top 10 terms per topic -- confirm in nlp_utils.print_topics).
nlp_utils.print_topics(lda_model_bow, vect_bow, 10)

In [None]:
# Evaluate each topic's (component's) overall weight.
# Topics are named by their two highest-weighted words.

num_cols = 2
# n_topics must be divisible by num_cols so every topic lands in exactly one plot.
topics_per_plot = n_topics // num_cols
barh_xlim = 2000
plot_pad = 100
fig_size = (15, 8)

fig, ax = plt.subplots(1, num_cols, figsize=fig_size)

topic_names = ['{} {}'.format(i, ' '.join(words)) for i, words in enumerate(feat_names[sorted_components[:, :2]])]

# Total weight of every topic, summed over all documents.
# Fixed: this previously summed `docs_bow`, a name that is never defined in the
# notebook. `lda_output_bow` is the only value returned by the fitting cell that
# is indexed per topic.
# NOTE(review): assumes lda_output_bow is the (documents x topics) matrix
# produced by the LDA fit -- confirm in nlp_utils.
topic_weights = np.sum(lda_output_bow, axis=0)

for col in range(num_cols):
    # Slice of topics shown in this column.
    start = col * topics_per_plot
    end = (col + 1) * topics_per_plot
    plot_width = topic_weights[start:end]
    ax[col].barh(np.arange(topics_per_plot), plot_width)
    ax[col].set_yticks(np.arange(topics_per_plot))
    ax[col].set_yticklabels(topic_names[start:end], ha='left', va='top')
    # Put the first topic of the slice at the top of the chart.
    ax[col].invert_yaxis()
    ax[col].set_xlim(0, barh_xlim)
    ax[col].set_xlabel('# of documents')
    # Labels are left-aligned, so pad the ticks to keep them clear of the bars.
    y_ax = ax[col].get_yaxis()
    y_ax.set_tick_params(pad=plot_pad)

plt.tight_layout();

In [None]:
# Topic order is based on topic popularity.
# The distance between circles represents an approximation of topic similarity;
# it is only an approximation because the map is 2D.
# When lambda = 0, the bar chart solely shows topic-specific terms (jargon).
#
# Fixed: prepare() takes the document-term matrix as its second argument. The
# previous code passed np.matrix(lda_model_bow.components_), which is the
# topic-term matrix, so each topic was treated as if it were a document.
doc_term_matrix = docs_vectorized_bow
pyLDAvis.sklearn.prepare(lda_model_bow, doc_term_matrix, vect_bow)

### Model Performance

In [None]:
# Show the fitted bag-of-words LDA estimator.
# Fixed: `lda_bow` is never defined; the model is bound to `lda_model_bow`.
print(lda_model_bow)

In [None]:
# Fixed: the model is bound to `lda_model_bow` (`lda_bow` is never defined), and
# score()/perplexity() must be given the document-term matrix rather than the
# model's own components_. This now mirrors the TF-IDF metrics cell.

# Log Likelihood: Higher the better
print('Log Likelihood: ', round(lda_model_bow.score(docs_vectorized_bow), 2))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print('Perplexity: ', round(lda_model_bow.perplexity(docs_vectorized_bow), 2))

# Model parameters
print(lda_model_bow.get_params())

---
# TF-IDF

In [None]:
# Keep the topic count even; the visualizations rely on it.
n_topics = 20
max_doc_freq = 0.3

tokens = wall_nlp['lemmatized_tokens']

# TF-IDF vectorization + LDA fit, delegated to the project helper.
(
    lda_output_tfidf,
    lda_model_tfidf,
    docs_vectorized_tfidf,
    vect_tfidf,
) = nlp_utils.calculate_tfidf_and_fit_lda_model(
    tokens,
    10000,
    max_doc_freq,
    n_topics,
    'batch',
    25,
    3,
)

# Per-topic feature indices, highest weight first (ascending argsort, columns reversed).
sorted_components = np.fliplr(
    np.argsort(lda_model_tfidf.components_, axis=1)
)
feat_names = np.array(vect_tfidf.get_feature_names())

In [None]:
# Print a summary of the fitted TF-IDF topics via the project helper
# (presumably the top 10 terms per topic -- confirm in nlp_utils.print_topics).
nlp_utils.print_topics(lda_model_tfidf, vect_tfidf, 10)

In [None]:
# Interactive pyLDAvis view of the TF-IDF topic model.
# Fixed: prepare() takes the document-term matrix as its second argument. The
# previous code passed np.matrix(lda_model_tfidf.components_), which is the
# topic-term matrix, so each topic was treated as if it were a document.
doc_term_matrix = docs_vectorized_tfidf
pyLDAvis.sklearn.prepare(lda_model_tfidf, doc_term_matrix, vect_tfidf)

In [None]:
# Report both fit metrics on the TF-IDF document-term matrix, rounded to 2 decimals.
#   Log Likelihood: higher is better.
#   Perplexity: lower is better; perplexity = exp(-1. * log-likelihood per word).
for label, metric in (
    ('Log Likelihood: ', lda_model_tfidf.score),
    ('Perplexity: ', lda_model_tfidf.perplexity),
):
    print(label, round(metric(docs_vectorized_tfidf), 2))

# Hyper-parameters the model was configured with.
print(lda_model_tfidf.get_params())