In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
from src import utils, nlp_utils
from bs4 import BeautifulSoup

import nltk
from nltk.corpus import stopwords

from datetime import datetime

#nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# LDA - Wall Street Market

## Data Processing

In [2]:
wallstreet = pd.read_csv('data/wallstreet.csv')
wall_nlp = wallstreet[['subforum', 'contentWithHTMLTag']].copy()
wall_nlp['cleaned_content'] = wall_nlp['contentWithHTMLTag'].apply(lambda x: nlp_utils.clean_parse_text(x))
wall_nlp['lemmatized_tokens'] = wall_nlp['contentWithHTMLTag'].apply(lambda x: nlp_utils.clean_tokenize_lemmatize(x))
wall_nlp.to_csv('data/wall_nlp.csv', index=False)

--- 
# Bag of Words

In [3]:
wall_nlp.head()

Unnamed: 0,subforum,contentWithHTMLTag,cleaned_content,lemmatized_tokens
0,Introductions,"<div class=""entry-content"">\n<p>Just thought I...",Just thought I d introduce myself I am new t...,"[thought, introduce, new, community, trying, l..."
1,Introductions,"<div class=""entry-content"">\n<p>Hello <img alt...",Hello Nice to see you here Regards,"[hello, nice, see, regard]"
2,Announcements,"<div class=""entry-content"">\n<p>Hello everyone...",Hello everyone I would like to tell you that ...,"[hello, everyone, would, like, tell, implement..."
3,Introductions,"<div class=""entry-content"">\n<p>Hello Punka! n...",Hello Punka nice to meet you As you asked ho...,"[hello, punka, nice, meet, asked, stay, secure..."
4,Announcements,"<div class=""entry-content"">\n<h5>Changelog fro...",Changelog from Wednesday 2nd November 2016Fi...,"[changelog, wednesday, 2nd, november, 2016fixe..."


In [None]:
stopwords.words('english')

In [None]:
# wall_nlp = pd.read_csv('data/wall_nlp.csv')

In [8]:
cleaned_content[:5]

0    [thought, introduce, new, community, trying, l...
1                           [hello, nice, see, regard]
2    [hello, everyone, would, like, tell, implement...
3    [hello, punka, nice, meet, asked, stay, secure...
4    [changelog, wednesday, 2nd, november, 2016fixe...
Name: lemmatized_tokens, dtype: object

In [7]:
# Must be even for visualizations.
n_topics = 20
max_doc_freq = 0.3

cleaned_content = wall_nlp['lemmatized_tokens']
docs_bow, lda_bow, vect_bow = nlp_utils.convert_to_bow_and_fit_lda_model(cleaned_content, 10000, max_doc_freq, n_topics, 'batch', 25, 3)

sorted_components = np.argsort(lda_bow.components_, axis=1)[:, ::-1]
feat_names = np.array(vect_bow.get_feature_names())

AttributeError: 'list' object has no attribute 'lower'

In [None]:
nlp_utils.print_topics(lda_bow, vect_bow, 10)

In [None]:
# Evaluate each topics (components) overall weight.
# Topics are named by the two most common words.

topics_per_plot = int(n_topics / 2)
barh_xlim = 2000
plot_pad = 100
fig_size = (15,8)
num_cols = 2

fig, ax = plt.subplots(1, 2, figsize=fig_size)

topic_names = ['{} {}'.format(i, ' '.join(words)) for i, words in enumerate(feat_names[sorted_components[:, :2]])]

for col in range(num_cols):
    start = col * topics_per_plot
    end = (col + 1) * topics_per_plot
    plot_width = np.sum(docs_bow, axis=0)[start:end]
    ax[col].barh(np.arange(topics_per_plot), plot_width)
    ax[col].set_yticks(np.arange(topics_per_plot))
    ax[col].set_yticklabels(topic_names[start:end], ha='left', va='top')
    ax[col].invert_yaxis()
    ax[col].set_xlim(0, barh_xlim)
    ax[col].set_xlabel('# of documents')
    y_ax = ax[col].get_yaxis()
    y_ax.set_tick_params(pad=plot_pad)

plt.tight_layout();

In [None]:
# Topic order is based on topic popularity.
# The distance b/t circles reprsents an approximation topic similarity. It's approximation b/c only a 2D map.
# When lambda = 0, the bar chart solely show topic-specific terms (jargon).
doc_term_matrix = np.matrix(lda_bow.components_)
pyLDAvis.sklearn.prepare(lda_bow, doc_term_matrix, vect_bow)

---
# TF-IDF

In [10]:
# Must be even for visualizations.
n_topics = 20
max_doc_freq = 0.3

tokens = wall_nlp['lemmatized_tokens']
docs_tfidf, lda_tfidf, vect_tfidf = nlp_utils.calculate_tfidf_and_fit_lda_model(tokens, 10000, max_doc_freq, n_topics, 'batch', 25, 3)

sorted_components = np.argsort(lda_tfidf.components_, axis=1)[:, ::-1]
feat_names = np.array(vect_tfidf.get_feature_names())

AttributeError: 'list' object has no attribute 'lower'

In [None]:
nlp_utils.print_topics(lda_tfidf, vect_tfidf, 10)

In [None]:
doc_term_matrix = np.matrix(lda_tfidf.components_)
pyLDAvis.sklearn.prepare(lda_tfidf, doc_term_matrix, vect_tfidf)