In [1]:
%load_ext autotime

In [2]:
# time: 1.16 s

import pickle
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from lib.Text_Pre_Processing_in_Python import Preprocess
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)

time: 1.54 s


In [3]:
# time: 235 µs

# Presumably the intended vocabulary cap for the vectorizers below.
# NOTE(review): no vectorizer in the visible cells passes this as max_features — confirm whether it should.
vectorizer_max_features = 1500

time: 246 µs


In [4]:
# time: 6.5 s

# Load the raw subreddit export. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk.
all_text = pd.read_csv('archive/sub_reddits.csv', low_memory=False)
# Missing selftext is read as NaN; astype(str) alone would turn it into the
# literal word "nan", which would then pass through preprocessing as a real
# token. Map missing bodies to the empty string before casting.
all_text['selftext'] = all_text['selftext'].fillna('').astype(str)

time: 9.26 s


In [None]:
# time: 22min 24s

prep = Preprocess.Preprocess()

# (pattern, replacement) pairs applied in order to every post before the
# project preprocessor runs:
#   1. http/https URLs              -> single space
#   2. tokens starting with a slash -> single space
#   3. the literal text 'amp;'      -> removed (presumably left over from '&amp;')
_CLEANUP_STEPS = (
    (r'https*:\/\/[\w*\-*\.]*[\w\-*\/]*[\.\w]*[^\"\s]*', ' '),
    (r'\/[\w*\-*\.]*[\w\-*\/]*[\.\w]*[^\"\s]*', ' '),
    (r'amp;', ''),
)


def _sanitize_post(raw_post):
    """Strip URLs, slash-prefixed tokens and 'amp;' remnants, then join the preprocessor's tokens with spaces."""
    text = str(raw_post)
    for pattern, replacement in _CLEANUP_STEPS:
        text = re.sub(pattern, replacement, text)
    return " ".join(prep.preprocess(text))


sanitized_posts = [_sanitize_post(post) for post in all_text['selftext']]


In [None]:
# time: 191 ms

# Cache the sanitized posts on disk so the slow preprocessing cell above
# (recorded at roughly 22 minutes) does not have to be re-run every session.
with open('archive/pre_processed.pkl', 'wb') as picklefile:
    pickle.dump(sanitized_posts, picklefile)

In [8]:
# time: 238 ms

# Reload the cached list of sanitized posts into a one-column DataFrame.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load archives that were produced locally by this notebook.
with open('archive/pre_processed.pkl', 'rb') as picklefile:
    sanitized_posts_df = pd.DataFrame(pickle.load(picklefile), columns=["posts"])

time: 184 ms


In [9]:
# time: 2min 13s

# Posts that carry no usable text after sanitizing.
junk_posts = ['test', '', 'a', '_', ' ']

posts = sanitized_posts_df['posts']
# Keep a post only if it has at least one character that is not a digit,
# whitespace or '^' (the second '^' inside the class is a literal caret).
# The pattern is a raw string so '\d' and '\s' are not treated as (invalid)
# Python string escapes; na=False drops missing values instead of producing
# a NaN mask that cannot be used for indexing.
has_content = posts.str.contains(r'[^\d^\s]', na=False)
keep_mask = ~posts.isin(junk_posts) & has_content

# .copy() gives an independent frame, so the column assignment below does
# not write into a view of the unfiltered data (SettingWithCopyWarning).
sanitized_posts_df = sanitized_posts_df.loc[keep_mask].copy()
sanitized_posts_df['posts'] = sanitized_posts_df['posts'].astype(str)

sanitized_posts_df.to_csv('archive/sanitized.csv', index=False)

time: 3.21 s


In [None]:
# time: 703 µs

# Workspace for additional text cleaning

In [6]:
# time: 1.53 s

# Reload the filtered posts written by the cell above.
# keep_default_na=False stops pandas from parsing posts whose whole text is
# e.g. "nan" or "null" as missing values; a NaN in the 'posts' column is not
# a valid document for the vectorizers further down. dtype=str keeps the
# column as text.
sanitized_posts_df = pd.read_csv('archive/sanitized.csv', dtype={'posts': str}, keep_default_na=False)

time: 1.04 s


# The following section runs a Latent Semantic Analysis (LSA) on the corpus using TruncatedSVD

In [10]:
# time: 21.3 s

# Bag-of-words counts for the LSA section: NLTK English stop words are
# removed, and terms must appear in at least 5 documents and in no more than
# 70% of them.
english_stopwords = stopwords.words('english')
count_vectorizer = CountVectorizer(
    stop_words=english_stopwords,
    min_df=5,
    max_df=0.7,
)
count_vectorized_posts = count_vectorizer.fit_transform(sanitized_posts_df['posts'])

time: 12.8 s


In [11]:
# time: 5 s

from sklearn.decomposition import TruncatedSVD
# Reduce the count matrix to two components; random_state pins the
# randomized solver so the run is repeatable.
svd = TruncatedSVD(
    n_components=2,
    n_iter=7,
    random_state=42,
)
svd_fit_transform = svd.fit_transform(count_vectorized_posts)

# Per-component explained variance ratio, their total, and the singular values.
variance_ratio = svd.explained_variance_ratio_
for svd_stat in (variance_ratio, variance_ratio.sum(), svd.singular_values_):
    print(svd_stat)

[0.12226753 0.0818686 ]
0.20413612240416884
[2464.32562309 1920.00286489]
time: 4.4 s


In [12]:
# time: 63.8 ms

# Component-by-term weight table: one row per SVD component, one column per
# vocabulary term, weights rounded to 3 decimals.
component_labels = ["component_1", "component_2"]
topic_word = pd.DataFrame(
    svd.components_.round(3),
    index=component_labels,
    columns=count_vectorizer.get_feature_names(),
)
topic_word

Unnamed: 0,aa,aaa,aaaaa,aaaaaand,aaaaand,aaaand,aaand,aaarated,aaba,aac,...,то,уоu,что,это,ісо,الاستثمار,في,كما,من,ﬁnancial
component_1,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 55.4 ms


In [13]:
# time: 478 µs

def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_`` (one row of term
        weights per topic).
    feature_names : sequence of str
        Vocabulary terms, indexed like the columns of ``model.components_``.
    no_top_words : int
        Number of terms to print per topic, in descending weight order.
    topic_names : sequence or mapping, optional
        Display names keyed by topic index. A missing, empty or out-of-range
        entry falls back to the numeric topic index instead of raising.

    Returns
    -------
    None
        Output is written to stdout.
    """
    for ix, topic in enumerate(model.components_):
        # Tolerate topic_names that do not cover every component.
        try:
            name = topic_names[ix] if topic_names else None
        except (IndexError, KeyError):
            name = None

        if not name:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", name, "'")

        # argsort is ascending; the reversed slice takes the last
        # `no_top_words` indices, i.e. the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(", ".join([feature_names[i] for i in top_indices]))

time: 651 µs


In [14]:
# time: 37.7 ms

# Print the 5 highest-weighted vocabulary terms for each SVD component.
display_topics(svd, count_vectorizer.get_feature_names(), 5)


Topic  0
gt, earnings, stock, ha, market

Topic  1
nigger, america, nt, trade, post
time: 30.1 ms


In [15]:
# time: 25 ms

# One row per post (indexed by the post text itself) with its two SVD
# component scores, rounded to 5 decimals.
# NOTE(review): despite the name, this holds the per-document output of
# svd.fit_transform, not the component (term) matrix — confirm the naming.
component_columns = ["component_1", "component_2"]
Vt = pd.DataFrame(
    svd_fit_transform.round(5),
    columns=component_columns,
    index=sanitized_posts_df['posts'],
)
Vt

Unnamed: 0_level_0,component_1,component_2
posts,Unnamed: 1_level_1,Unnamed: 2_level_1
sober judge go sleep idea get go write notepad good idea get better one submit wrote expected variance time option put call price linear function respect time maximize call putt cost v gross rev combined prob dist stock price profit maybe get something going submit get working program use subreddits like department lol,1.59903,0.00004
sold first house project house lived renovating part time past year profit need invest money used part payment next house roughly year couple idea would like hear would situation edit etf large portion investment pro personal thought favorite fund etc,1.09331,0.00002
heard story numerous nonpenny stock sold penny level thursday pretty obvious happened us government must instructed largest volume trader institutional trader dump large volume stock penny level right pm eastern time thursday orchestrated effort drive key us stock market index thereby dramatically devalue us stock market objection would nt institution lose combined billion dollar dumping stock penny level yes indeed would believe us federal reserve promised recoup institution whatever loss incurred objection secret china among country ha trying slow economy stop currency quickly devalued hard imagine us would want thing covertly us dollar ha severely devalued recently week one ha wonder firm gained us stock sold penny level suspect could chinese investor word possible china took gigantic windfall profit expense us stock market precisely nt want take one us market manipulation capability assert dominance chinese market manipulation capability know seems counterintuitive doe idea slowing economy order bolster currency would surprised see currency warfare continue month big chinese firm learn fight back thursday proved american innovation leap bound ahead chinese regard market currency manipulation dollar ridiculously stronger today wa day ago euro yuan,6.48138,0.00001
want learn type investing may decide focus study certain area specifically shortly lot time hand love read study new thing doe anyone resource share book buy online literature etc,0.60494,0.00001
someone recommend good charting site see stock big long list displaying one year chart option would nice preferably moving average rsi etc use yahoo finance clicking stock take way long,1.30685,0.00000
...,...,...
disclaimer long equity please due diligence based minute quick analysis company overview xeris spec pharma company founded primary scope work involves developed injectable infusible drug present product launch first product wa approved september called gvoke pfs autoinjector ha glucagon treat severe hypoglycemia market two different product gvoke pfs nov gvoke hypopen july competition primary competition come tradition glucagon kit eli lilly ’ baqsimi baqsimi delivered via nasal passage legacy kit traditional syringe injection xeris ha advantage pfs autoinjector traditionally well received patient xb baqsimi xb legacy kit xb financials popular product likely twopack hypopen autoinjector carry awp adult prescription mg baqsimi cost similar mg dosage legacy kit lly cost xb gvoke micromedex one overhang xeris ’ financials longterm debt ha increased ye q however principal payment start interest expense year xeris generating enough fcf next two year service debt valuation takeaways · peak sale · wacc · positive ebitda xb dcf xb price target,2.90362,-0.00001
month ago ran across forum similar reddit people could anonymously publish short thesis unfortunately unable find since doe anyone know still know talking,0.50072,0.00001
love researching quality information interesting company however hard find intersection intriguing yet understandable outsider unfortunately rule pharma example really enjoyed following tesla always passionate alternative source energy lowcost airline flying around europe since wa month old love ryanair wizz though nt actually invested two u lowcost airline instead interesting note usually engaging company better ha done financially looking forward tip,1.42672,0.00001
knowing everything know investor could go back time give beginner self advice would share comment let spread wisdom everyone,0.63715,0.00001


time: 22 ms


# TODO Figure out cosine_similarity

In [None]:
# time: 4.09 ms

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the two component rows of topic_word.
# NOTE(review): .round() with no argument rounds to whole numbers, so the
# matrix only shows 0/1-style values — confirm that is the intended precision.
component_pair = (topic_word.iloc[0], topic_word.iloc[1])
cosine_similarity(component_pair).round()

# Implementing NMF

In [None]:
# time: 20.3 s

# Separate count matrix for NMF, using scikit-learn's built-in English stop
# word list and no document-frequency limits.
NMF_vectorizer = CountVectorizer(
    stop_words='english',
)
posts_for_nmf = sanitized_posts_df['posts']
NMF_posts = NMF_vectorizer.fit_transform(posts_for_nmf)

In [None]:
# time: 8.25 s

from sklearn.decomposition import NMF

# Two-component NMF; random_state fixes the initialisation for repeatability.
nmf_model = NMF(
    n_components=2,
    random_state=42,
)
NMF_nmf = nmf_model.fit_transform(NMF_posts)

In [None]:
# time: 382 ms

# Component-by-term weight table for the NMF model, rounded to 3 decimals.
nmf_component_labels = ['component_1', 'component_2']
NMF_df = pd.DataFrame(
    nmf_model.components_.round(3),
    index=nmf_component_labels,
    columns=NMF_vectorizer.get_feature_names(),
)

NMF_df

In [None]:
# time: 295 ms

# Print the 10 highest-weighted vocabulary terms for each NMF component.
display_topics(nmf_model, NMF_vectorizer.get_feature_names(), 10)

# Latent Dirichlet Allocation

In [None]:
# time: 17 s

# Term-count matrix for LDA. Tokens are runs of 3 or more ASCII letters;
# terms must occur in at least 10 documents and in at most half of them.
LDA_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.5,
    min_df=10,
)

LDA_transformed = LDA_vectorizer.fit_transform(sanitized_posts_df['posts'])

In [None]:
# time: 17.6 s

# Build the TF-IDF vectorizer from the count vectorizer's own parameters so
# both share the same tokenisation and document-frequency limits.
shared_vectorizer_params = LDA_vectorizer.get_params()
LDA_Tf_vectorizer = TfidfVectorizer(**shared_vectorizer_params)

tfidfvectorized_posts = LDA_Tf_vectorizer.fit_transform(sanitized_posts_df['posts'])

In [None]:
# time: 17.3 s

# Dense view of the TF-IDF matrix for inspection.
# Column labels come from the vectorizer that produced the matrix
# (LDA_Tf_vectorizer); the count vectorizer's vocabulary only matches
# because both were built from the same parameters and corpus.
# NOTE: .toarray() materialises the full documents x terms matrix in memory.
tfidf_df = pd.DataFrame(tfidfvectorized_posts.toarray(), columns=LDA_Tf_vectorizer.get_feature_names())
tfidf_df.head()

In [None]:
# time: 39min 22s

# for TF DTM
# Fit one 5-topic LDA model per document-term matrix, both with the same seed:
#   lda_tf    <- raw term counts (TF)
#   lda_tfidf <- TF-IDF weights
# fit() returns the estimator itself, so the generator yields fitted models.
lda_tf, lda_tfidf = (
    LatentDirichletAllocation(n_components=5, random_state=0).fit(dtm)
    for dtm in (LDA_transformed, tfidfvectorized_posts)
)
# Keep the fitted TF-IDF model as the cell's displayed value.
lda_tfidf

In [None]:
# time: 6.46 ms

# Persist both fitted LDA models so the long fit does not have to be repeated.
# Each file gets its own model: previously lda_tfidf was written to both
# files, so reloading silently replaced lda_tf with the TF-IDF model.
with open('archive/lda.20.0.pkl', 'wb') as picklefile:
    pickle.dump(lda_tf, picklefile)
with open('archive/lda_tfidf.20.0.pkl', 'wb') as picklefile:
    pickle.dump(lda_tfidf, picklefile)

In [None]:
# time: 4.84 ms

# Restore the two fitted LDA models from their pickle files.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load archives that were produced locally by this notebook.
with open('archive/lda.20.0.pkl', 'rb') as picklefile:
    lda_tf= pickle.load(picklefile)
with open('archive/lda_tfidf.20.0.pkl', 'rb') as picklefile:
    lda_tfidf = pickle.load(picklefile)

In [None]:
# time: 56.1 s

# Run the TF-IDF matrix through the LDA model fitted on it.
# NOTE(review): trans_lda is not read by any later cell visible here.
trans_lda = lda_tfidf.transform(tfidfvectorized_posts)

In [None]:
# time: 188 ms

import pyLDAvis
import pyLDAvis.sklearn

# Let pyLDAvis render its visualisations inline as notebook cell output.
pyLDAvis.enable_notebook()

In [None]:
# time: 1min 45s

# Visualise the count-based LDA model. Model, document-term matrix and
# vectorizer must all come from the same pipeline, so pass the count
# vectorizer (LDA_vectorizer) that produced LDA_transformed rather than the
# TF-IDF one.
pyLDAvis.sklearn.prepare(lda_tf, LDA_transformed, LDA_vectorizer)

In [None]:
# time: 54.9 s

# Visualise the TF-IDF-based LDA model with its own matrix and vectorizer.
pyLDAvis.sklearn.prepare(lda_tfidf, tfidfvectorized_posts, LDA_Tf_vectorizer)

In [None]:
# time: 1min 40s

# Count-based LDA visualisation using the 'mmds' layout for the topic map.
# Uses the count vectorizer (LDA_vectorizer) that produced LDA_transformed so
# that model, matrix and vocabulary all come from the same pipeline.
pyLDAvis.sklearn.prepare(lda_tf, LDA_transformed, LDA_vectorizer, mds='mmds')

In [None]:
# time: 1min 34s

# Count-based LDA visualisation using the 'tsne' layout for the topic map.
# Uses the count vectorizer (LDA_vectorizer) that produced LDA_transformed so
# that model, matrix and vocabulary all come from the same pipeline.
pyLDAvis.sklearn.prepare(lda_tf, LDA_transformed, LDA_vectorizer, mds='tsne')

In [None]:
# time: 561 ms

import scipy.sparse as ss

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

In [None]:
# time: 17.5 s

# Binary presence/absence matrix for CorEx: at most 20000 terms, tokens are
# two or more lowercase ASCII letters.
cor_vectorizer = CountVectorizer(
    binary=True,
    max_features=20000,
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
)

cor_posts = cor_vectorizer.fit_transform(sanitized_posts_df['posts'])
# Vocabulary in column order, as a plain list of numpy string scalars.
cor_feature_array = np.asarray(cor_vectorizer.get_feature_names())
cor_words = list(cor_feature_array)

In [None]:
# time: 51.3 s

# Unanchored CorEx topic model with 6 latent topics and a fixed seed.
topic_model = ct.Corex(
    n_hidden=6,
    words=cor_words,
    seed=1,
)
topic_model.fit(
    cor_posts,
    words=cor_words,
    docs=sanitized_posts_df['posts'],
)

In [None]:
# time: 19.8 ms

# Print each CorEx topic as "<index>: word1,word2,...".
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each topic is unpacked as a sequence of 2-tuples; only the first
    # element (the word) is printed.
    # NOTE(review): this unpacking requires exactly two fields per entry —
    # confirm against the installed corextopic version's get_topics() format.
    topic_words, _ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

In [None]:
# time: 227 ms

# Show the 2 top-ranked documents for topic index 4.
topic_model.get_top_docs(topic=4, n_docs=2)

In [None]:
# time: 372 ms

# Per-document topic predictions, one column per topic. The 6 column labels
# match n_hidden=6 used when the model was built.
topic_columns = ['topic' + str(i) for i in range(6)]
predictions = pd.DataFrame(
    topic_model.predict(cor_posts),
    columns=topic_columns,
)
predictions.head(3)

In [None]:
# time: 208 ms

# Bar chart of the per-topic total correlation (topic_model.tcs).
fig, ax = plt.subplots(figsize=(10, 5))
topic_positions = range(topic_model.tcs.shape[0])
ax.bar(topic_positions, topic_model.tcs, color='#4e79a7', width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16);

In [None]:
# time: 1min 40s

# Anchored CorEx: five of the six topics are each seeded with one anchor
# word; the sixth is left unanchored.
anchor_words = [
    ['options'],
    ['tendies'],
    ['yolo'],
    ['god'],
    ['politics'],
]

topic_model = ct.Corex(
    n_hidden=6,
    words=cor_words,
    max_iter=200,
    verbose=False,
    seed=1,
)

topic_model.fit(
    cor_posts,
    words=cor_words,
    docs=sanitized_posts_df['posts'],
    anchors=anchor_words,
    anchor_strength=2,
)

# Print every topic of the anchored model as "<index>: word1,word2,...".
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words, _ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

# Word2Vec

In [None]:
# Count vectorizer over unigrams and bigrams, fitted on the raw (unsanitized)
# selftext column rather than on sanitized_posts_df.
wv_vectorizer = CountVectorizer(ngram_range=(1, 2))

wv_vectorizer.fit(all_text['selftext'])

In [None]:

# Transform the raw selftext into the unigram+bigram count matrix.
wv = wv_vectorizer.transform(all_text['selftext'])

In [None]:
# Persist the fitted unigram+bigram vectorizer and the transformed matrix.
with open('archive/wv_vectorizer.pkl', "wb") as picklefile:
    pickle.dump(wv_vectorizer, picklefile)

with open('archive/wv_vectorized.pkl', "wb") as picklefile:
    pickle.dump(wv, picklefile)

# Crashes my machine.
# NOTE(review): .toarray() allocates a dense documents x (unigram+bigram
# vocabulary) array; presumably this exhausts memory — confirm, and prefer
# working on the matrix returned by transform() without densifying it.
wv_array = wv.toarray()

In [None]:


# Display the dense n-gram count matrix with one column per vocabulary term.
# NOTE(review): depends on wv_array from the previous cell, which the author
# marks as crashing; this cell cannot run until that is resolved.
pd.DataFrame(wv_array, columns=wv_vectorizer.get_feature_names())

# Doc2Vec

In [None]:
import gensim
import gensim.downloader as api
# Load gensim's "text8" sample corpus through the downloader API (this may
# fetch the dataset over the network if it is not cached locally).
# The assignment was commented out, which left `dataset` undefined and made
# the next line raise NameError.
# NOTE(review): `data` is not read by any later cell visible here — confirm
# it is still needed.
dataset = api.load("text8")
data = list(dataset)

In [None]:
# time: 2.07 s

def tagged_document(list_of_list_of_words):
    """Yield one gensim ``TaggedDocument`` per input document, tagged with its position.

    Parameters
    ----------
    list_of_list_of_words : iterable
        Documents to tag. Each item is either an already-tokenised list of
        words or a raw string. Raw strings are split on whitespace first;
        otherwise the string would be iterated character by character and
        the model would learn a vocabulary of single letters.

    Yields
    ------
    gensim.models.doc2vec.TaggedDocument
        The document's tokens with the single tag ``[i]``, where ``i`` is the
        document's index in the input.
    """
    for i, list_of_words in enumerate(list_of_list_of_words):
        if isinstance(list_of_words, str):
            list_of_words = list_of_words.split()
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])
# Materialise the tagged training corpus from the raw selftext column
# (one TaggedDocument per post, tagged with its row position).
data_for_training = list(tagged_document(all_text['selftext']))

In [None]:
# time: 297 µs

# Sanity check: show the first tagged document.
print(data_for_training [:1])

In [None]:
# time: 2.27 ms

# Configure an (as yet untrained) Doc2Vec model: 40-dimensional vectors,
# min_count=2 and 30 training epochs.
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)

In [None]:
# time: 1min 14s

# Build the model vocabulary from the tagged corpus; must run before train().
model.build_vocab(data_for_training)

In [None]:
# time: 20min 20s

# Train on the tagged corpus, using the document count recorded by
# build_vocab() and the epoch count configured on the model.
model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# time: 5.2 ms
# TODO fix infer_vector list
# Infer an embedding for an unseen document supplied as a list of word tokens.
print(model.infer_vector(['violent', 'means', 'to', 'destroy', 'the','organization']))

In [None]:
# Persist the trained Doc2Vec model with pickle.
# NOTE(review): gensim models also provide their own save()/load() methods;
# confirm whether plain pickle is adequate for a model of this size.
with open('archive/gensim_model.pkl', 'wb') as picklefile:
    pickle.dump(model, picklefile)

# PCA

In [None]:
from sklearn.decomposition import PCA

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm

In [None]:
# Take all of the data and plot it on 2 dimensions
# Two-component PCA, intended for projecting the data onto a 2-D plot.
pca = PCA(n_components=2)

In [None]:
# Densify the sparse count matrix and wrap it in a DataFrame for PCA.
# NOTE(review): .toarray() materialises the full documents x terms matrix;
# presumably this (or the fit below) is what exhausts memory — confirm.
cvp_arr = count_vectorized_posts.toarray()
cvp_df = pd.DataFrame(cvp_arr)
# Not the last expression of the cell, so this head() is computed but not displayed.
cvp_df.head()

# crashes
pca.fit(cvp_df)

In [None]:

# Project the documents onto the two principal components.
# PCA was fitted on the numeric count matrix (cvp_df), so the same
# representation must be passed to transform(); the raw selftext strings
# cannot be converted to numbers and would raise an error.
pcafeatures_train = pca.transform(cvp_df)

# KMeans

In [None]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle

In [None]:
# Helper function that displays data in 2 dimensions and highlights the clusters
def display_cluster(X, km=[], num_clusters=0):
    color = 'brgcmyk'
    alpha = 0.5
    s = 20
    
    if num_clusters == 0:
        plt.scatter(X[:,0], X[:,1], c=color[0], alpha=alpha, s=s)
        
    else:
        for i in range(num_clusters):
            plt.scatter(X[km.labels_==i,0],X[km.labels_==i,1],c = color[i],alpha = alpha,s=s)
            plt.scatter(km.cluster_centers_[i][0],km.cluster_centers_[i][1],c = color[i], marker = 'x', s = 100)