In [11]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

from scipy.spatial.distance import cdist

import nltk
import string

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from src import nlp_topic_utils

from joblib import parallel_backend

# Wall Street Forum Topic Modeling

## Data Import and Tf-Idf Vectorization

In [2]:
wallstreet = pd.read_csv('data/wallstreet_master.csv', engine='python')

In [3]:
docs_raw = wallstreet['processed_text']
n_topics = 20
max_doc_freq = 0.7
min_doc_freq = 4
max_features = 10000
ngram_rng = [1, 2]
max_iterations = 25
random_state=3

In [4]:
docs_vectorized, vectorizer = nlp_topic_utils.tfidf_vectorizer(docs_raw, min_doc_freq, max_doc_freq, max_features, ngram_rng)

## TruncatedSVD

In [7]:
docs_vectorized.shape

(45372, 10000)

In [5]:
def select_tsvd_num_components(var_ratio, var_explained_threshold):
    '''
    Determines the number of components required for TruncatedSVD to achieve the threshold of variance explained.
    '''
    total_var = 0
    n_components = 0
    
    for explained_var in var_ratio:
        total_var += explained_var
        n_components += 1
        
        if total_var >= goal_var:
            break
    
    return n_components

In [12]:
with parallel_backend('threading', n_jobs=-1):
    tsvd = TruncatedSVD(n_components=docs_vectorized.shape[1]-1, random_state=random_state)
    docs_tsvd = tsvd.fit(docs_vectorized)

MemoryError: Unable to allocate 3.38 GiB for an array with shape (10000, 45372) and data type float64

In [None]:
# Identify the required number of components to achieve the desired threshold of variance explained.
var_explained_threshold = 0.90

tsvd_var_ratios = tsvd.explained_variance_ratio_
tsvd_num_components = select_tsvd_num_components(tsvd_var_ratios, var_explained_threshold)

In [9]:
# with parallel_backend('threading', n_jobs=-1):
tsvd = TruncatedSVD(n_components=tsvd_num_components, random_state=random_state)
docs_tsvd = tsvd.fit_transform(docs_vectorized)

NameError: name 'tsvd_num_components' is not defined

---
## K-Means

In [None]:
# Identify number of cluseters using the "Elbow" method

# distortion is the average of the squared distances from the cluster centers of the respective clusters.
distortions = []
distortion_map = {}

# Inertia is the sum of squared distances of samples to their closest cluster center
inertias = []
inertia_map = {}

k_rng = np.arange(2, 30, 2)

for k in k_rng:
    kmeans = KMeans(n_clusters=k, init='k-means++').fit(docs_tsvd)
    pred = kmeans.predict(docs_tsvd)
    centroids = kmeans.cluster_centers_
    
    d = sum(np.min(cdist(docs_vectorized, centroids, metric='euclidean'), axis=1)) / docs_tsvd.shape[0]
    distortions.append(d)
    distortion_map[k] = d
    
    inertias.append(kmeans.inertia_)
    inertia_map[k] = kmeans.inertia_
    
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(6,8))

ax[0].plot(k_rng, distortions, linewidth=2)
ax[0].set_xlabel('# of Clusters')
ax[0].set_ylabel('Avg. of Squared Errors')

ax[1].plot(k_rng, inertias, linewidth=2)
ax[1].set_xlabel('# of Clusters')
ax[1].set_ylabel('Sum of Squared Errors')
    


In [None]:
# Instiate KMeans model object.

k = 20
kmeans = KMeans(n_clusters=k, random_state=random_state)
kmeans.fit(docs_tsvd)

In [None]:
clusters = kmeans.labels_
k_clusters_centers = kmeans.cluster_centers_
original_space_centroids = tsvd.inverse_transform(k_clusters_centers)
# argsort() returns the indices of the sorted values.
ordered_centroids = original_space_centroids.argsort()[:, ::-1]

In [None]:
words = vectorizer.get_feature_names
for topic_idx, topic in enumerate():
        top_words_arr = [words[i] for i in order_centroids[topic_idx, : n_top_words]]
        print("\nCluster #{}:".format(topic_idx))
        print(" ".join(top_words_arr))