In [2]:
# Load the autotime extension; presumably the source of the "time: ..." line shown under each executed cell.
%load_ext autotime

In [3]:
import nltk
import pickle
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tools.functions

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# nltk.download('stopwords')
# nltk.download('wordnet')

time: 2.23 s


In [4]:
# Vocabulary size cap, by its name meant for the vectorizers.
# NOTE(review): none of the vectorizers constructed in the visible cells pass max_features,
# so this value looks unused — confirm before relying on it.
vectorizer_max_features = 1500

time: 235 µs


In [5]:
# Load the raw subreddit export.
all_text = pd.read_csv('archive/sub_reddits.csv')
# Replace missing bodies with '' before casting: astype(str) on its own turns NaN into the
# literal string 'nan', which is neither empty nor numeric and would therefore be treated
# as a real post by the later cleaning filters and end up in the topic-model corpus.
all_text['selftext'] = all_text['selftext'].fillna('').astype(str)

time: 6.5 s


In [6]:
# Run every post body through the project sanitizer and collect the results in a one-column frame.
sanitized_posts_df = pd.DataFrame(tools.functions.sanitize_posts(all_text['selftext']), columns=['posts'])

# Drop posts whose sanitized text is one of the known empty/placeholder values.
sanitized_posts_df = sanitized_posts_df[~sanitized_posts_df['posts'].isin(['', 'a', '_'])]

# Keep only posts containing at least one character that is not a digit, a caret or whitespace.
# The pattern is a raw string because '\d' and '\s' in a plain literal are invalid escape
# sequences (deprecation/syntax warning on newer Python versions); the regex itself is unchanged.
# NOTE(review): the second '^' inside the class is a literal caret, not a negation —
# if "not digit and not whitespace" was intended, the pattern should be r'[^\d\s]'.
sanitized_posts_df = sanitized_posts_df[sanitized_posts_df['posts'].str.contains(r'[^\d^\s]')]

time: 2min 13s


In [20]:
# Workspace for additional text cleaning

time: 703 µs


# The following section runs a Latent Semantic Analysis (LSA) on the corpus using TruncatedSVD

In [21]:
# Bag-of-words counts for the LSA section: NLTK English stop words removed, terms kept only
# if they occur in at least 5 posts and in at most 70% of posts.
count_vectorizer = CountVectorizer(
    stop_words=stopwords.words('english'),
    min_df=5,
    max_df=0.7,
)
count_vectorized_posts = count_vectorizer.fit_transform(sanitized_posts_df['posts'])



time: 21.3 s


In [26]:
from sklearn.decomposition import TruncatedSVD

# Two-component truncated SVD fitted on the count matrix; fit_transform also returns the
# projection of every post onto those two components.
svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
svd_fit_transform = svd.fit_transform(count_vectorized_posts)

# Print, in order: explained variance ratio per component, its total, and the singular values.
for diagnostic in (
    svd.explained_variance_ratio_,
    svd.explained_variance_ratio_.sum(),
    svd.singular_values_,
):
    print(diagnostic)

[0.13418152 0.11829433]
0.25247584468078527
[3170.21868301 2869.74127917]
time: 5 s


In [23]:
# Component-by-term loading table for the SVD model (values rounded to three decimals).
# Column labels are taken from count_vectorizer, the vectorizer that produced the matrix the
# SVD was fitted on. The name `vectorizer` used here before is not defined in any visible
# cell, so the cell raised NameError on a fresh kernel.
topic_word = pd.DataFrame(
    svd.components_.round(3),
    index=["component_1", "component_2"],
    columns=count_vectorizer.get_feature_names(),
)
topic_word

Unnamed: 0,00,000,0000,00000,000000,000001,00001,0001,0001104659,0001193125,...,то,уоu,что,это,ісо,الاستثمار,في,كما,من,ﬁnancial
component_1,0.037,0.024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-0.012,-0.005,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0


time: 63.8 ms


In [24]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every component of a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``, iterated row by row (one row per topic).
    feature_names : sequence indexed by term position, giving the label of each term.
    no_top_words : int, number of top terms printed per topic.
    topic_names : optional sequence of labels; a missing or falsy label falls back
        to printing the topic's numeric index.

    Returns
    -------
    None. Output is written with ``print``.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Look the label up only when a non-empty collection of names was supplied.
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)

        # argsort is ascending, so walk it backwards to take the largest weights first.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[pos] for pos in top_positions]
        print(", ".join(top_terms))

time: 478 µs


In [25]:
# Top five terms per SVD component. Term labels come from count_vectorizer, which built the
# matrix the SVD was fitted on; the name `vectorizer` is not defined in any visible cell.
display_topics(svd, count_vectorizer.get_feature_names(), 5)


Topic  0
amp, http, com, gt, earnings

Topic  1
comment, wallstreetbets, 17, 15, spy
time: 37.7 ms


In [28]:
# Coordinates of every post on the two SVD components, rounded to five decimals and
# indexed by the sanitized post text.
Vt = pd.DataFrame(
    data=svd_fit_transform.round(5),
    index=sanitized_posts_df['posts'],
    columns=["component_1", "component_2"],
)
Vt

Unnamed: 0_level_0,component_1,component_2
posts,Unnamed: 1_level_1,Unnamed: 2_level_1
so m sober a judge about to go to sleep when have this idea and get up and go write it down in notepad because that a good idea then get better one which is to submit it here so here is what wrote expected variance over time option put and call price are they linear function with respect to time maximize call and putt cost v gross rev combined with prob dist of stock price profit maybe if we get something going we can submit it to programing and get working program use subreddits like department lol,1.14076,-0.32376
i just sold my first house project house that ve lived in while renovating part time for the past 3 year and have 15 000 in profit that need to invest the money will be used a part of down payment on my next house in roughly 2 5 year have couple of idea but would like to hear what you would do in this situation edit what about etf a large portion of this investment pro con personal thought favorite fund etc,0.88391,-0.18157
we ve all heard the story about those numerous non penny stock that sold at penny level on thursday to me it pretty obvious what happened the s government must have instructed some of the largest volume trader institutional trader to dump large volume of those stock at penny level right at 2 38pm eastern time on thursday in an orchestrated effort to drive down key s stock market index and thereby dramatically devalue the s stock market objection 1 wouldn those institution lose combined billion of dollar by dumping those stock at penny level yes indeed they would but believe that the s federal reserve promised to recoup those institution whatever loss they incurred objection 2 it no secret that china among other country ha been trying to slow down it economy to stop t currency from being too quickly devalued is it so hard to imagine then that the s would want to do the same thing only more covertly the s dollar ha been severely devalued recently until this week that is one ha to wonder which firm gained from all of of these s stock being sold off at penny level suspect they could have been chinese investor in other word it very possible that china just took gigantic windfall profit at the expense of the s stock market precisely when they didn want to take one did the s market manipulation capability just assert it dominance over chinese market manipulation capability know it seems counter intuitive but so doe the idea of slowing down your own economy in order to bolster your currency would not be at all surprised to see this currency warfare continue for month until the big chinese firm learn how to fight back thursday just proved that american innovation is leap and bound ahead of the chinese in regard to market and currency manipulation the dollar is ridiculously stronger today than it wa only few day ago against both the euro and the yuan,4.34081,-1.40833
i want to learn about all type of investing then may decide to focus and study on certain area more specifically will shortly have lot more time on my hand and love to read and study new thing doe anyone have any resource they can share be it book to buy online literature etc,0.42224,-0.12648
can someone recommend good charting site where can see all of my stock in big long list with each displaying one year chart option for 2 3 5 would be nice preferably with moving average rsi etc a it is now use yahoo finance and clicking through to each stock 30 just take way too long,1.09618,-0.32689
...,...,...
disclaimer am long equity please do due diligence this is based off of 60 minute of quick analysis company overview xeris is spec pharma company founded in 2005 their primary scope of work involves developed injectable and infusible drug 2019 present product launch their first product wa approved in september 2019 called gvoke it is pfs and auto injector that ha glucagon to treat severe hypoglycemia this is market a two different product gvoke pfs nov 2019 and gvoke hypopen july 2020 competition primary competition come from tradition glucagon kit and eli lilly baqsimi baqsimi is delivered via the nasal passage legacy kit are traditional syringe injection xeris ha the advantage with both pfs and auto injector which traditionally are well received with patient amp x200b baqsimi http preview redd it ddtlxc9revg51 jpg width 960 amp format pjpg amp auto webp amp b046cc1e8c8253d0a7ebed32ae36fa61308834a6 amp x200b legacy kit http preview redd it pachhzbuevg51 jpg width 350 amp format pjpg amp auto webp amp 77df31e4c44ddb60207ebaf5f6103958ddcbd8b4 amp x200b financials the most popular product will most likely be the two pack hypopen auto injector this carry awp of 673 92 for adult the prescription is 1mg 0 2ml baqsimi cost is similar with 3 mg dosage and legacy kit from lly cost 280 kit amp x200b gvoke micromedex http preview redd it i5qo0ukvevg51 jpg width 717 amp format pjpg amp auto webp amp 953a76cddb98d17ec991e9a742fcbdc0db05d967 one overhang with xeris financials is their long term debt which ha increased from 58 3m ye19 to 109 5m 2q20 however principal payment do not start until 2022 and interest expense should be below 10m for the year xeris should be generating enough fcf over the next two year to service their debt valuation takeaway peak sale of 250m wacc 11 positive ebitda by 2026 amp x200b dcf http preview redd it 10k7gz5mevg51 jpg width 1304 amp format pjpg amp auto webp amp 263486dcc4c7fc9bb015888a646038b97f0a1ab8 amp x200b price target http preview redd it 
22e13qipevg51 jpg width 423 amp format pjpg amp auto webp amp 3b906a275c204f3e3ccde14d2f896e9b891198e0,12.45172,-3.36845
a few month ago ran across forum very similar to reddit where people could anonymously publish short thesis unfortunately ve been unable to find it since doe anyone know if it still up or know what m talking about,0.37083,-0.09373
love researching quality information about interesting company however it is hard to find those at the intersection of intriguing yet understandable to an outsider this unfortunately rule out most of pharma for example ve really enjoyed following tesla a ve always been passionate about alternative source of energy and low cost airline a ve been flying around europe since wa only few month old love ryanair and wizz though haven actually invested in any of those two but in u low cost airline instead what interesting to note is that usually the more engaging the company the better it ha done for me financially looking forward to your tip,0.97487,-0.31669
knowing everything you know now a an investor if you could go back in time and give your beginner self advice what would it be share yours in the comment let spread wisdom for everyone,0.61824,0.53665


time: 25 ms


# TODO Figure out cosine_similarity

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the two component rows of topic_word,
# rounded to whole numbers.
component_pair = (topic_word.iloc[0], topic_word.iloc[1])
cosine_similarity(component_pair).round()

array([[1., 0.],
       [0., 1.]])

time: 4.09 ms


# Implementing NMF

In [7]:
# Count matrix for NMF: scikit-learn's built-in English stop-word list, no frequency cut-offs.
NMF_vectorizer = CountVectorizer(stop_words='english')
NMF_posts = NMF_vectorizer.fit_transform(
    sanitized_posts_df['posts']
)

time: 20.3 s


In [37]:
from sklearn.decomposition import NMF

# Two-component NMF with a fixed seed; fit_transform returns the transformed count matrix.
nmf_model = NMF(n_components=2, random_state=42)
NMF_nmf = nmf_model.fit_transform(NMF_posts)

time: 8.25 s


In [38]:
# Component-by-term weight table for the NMF model, rounded to three decimals.
NMF_df = pd.DataFrame(
    data=nmf_model.components_.round(3),
    index=['component_1', 'component_2'],
    columns=NMF_vectorizer.get_feature_names(),
)

time: 382 ms


In [39]:
# Show the NMF component-by-term weight table.
NMF_df

Unnamed: 0,00,000,0000,00000,000000,00000000,000000000,000000000000000001,00000000000000f34b57e3bee97172558574a2b2a5d50e20e23b45e28955673f,0000000000000558,...,𝗽𝗿𝗶𝗰𝗲,𝘁𝗵𝗮𝗻,𝘁𝗵𝗲,𝘐𝘯𝘵𝘦𝘭𝘭𝘪𝘨𝘦𝘯𝘵,𝘐𝘯𝘷𝘦𝘴𝘵𝘰𝘳,𝘛𝘩𝘦,𝚃𝚊𝚗𝚔,𝟐𝟓,𝟙𝟟,𝟚𝟙𝟝ℙ
component_1,2.284,1.426,0.003,0.0,0.002,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 16.8 ms


In [41]:
# Print the ten highest-weighted terms of each NMF component, labelled with NMF_vectorizer's vocabulary.
display_topics(nmf_model, NMF_vectorizer.get_feature_names(), 10)


Topic  0
amp, http, gt, com, earnings, stock, market, ha, click, company

Topic  1
comment, wallstreetbets, 17, 15, spy, 24, 19, 20, www, reddit
time: 295 ms


# Latent Dirichlet Allocation

In [9]:
# Count vectorizer for LDA: accents stripped, text lower-cased, English stop words removed,
# tokens restricted to runs of three or more ASCII letters, and terms kept only if they
# appear in at least 10 posts and at most half of all posts.
LDA_vectorizer = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.5,
    min_df=10,
)

LDA_transformed = LDA_vectorizer.fit_transform(sanitized_posts_df['posts'])

time: 17 s


In [10]:
# TF-IDF vectorizer configured with exactly the parameters of the LDA count vectorizer.
shared_vectorizer_params = LDA_vectorizer.get_params()
LDA_Tf_vectorizer = TfidfVectorizer(**shared_vectorizer_params)

tfidfvectorized_posts = LDA_Tf_vectorizer.fit_transform(sanitized_posts_df['posts'])

time: 17.6 s


In [11]:
# Labelled view of the TF-IDF document-term matrix for a quick preview.
# The frame is built on pandas sparse columns rather than .toarray(): expanding the matrix
# allocates a full dense documents x vocabulary array just to look at five rows.
# Column labels come from LDA_Tf_vectorizer, the vectorizer that produced this matrix.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidfvectorized_posts,
    columns=LDA_Tf_vectorizer.get_feature_names(),
)
tfidf_df.head()

Unnamed: 0,aaa,aaaand,aaba,aaii,aal,aamrq,aaoi,aap,aapl,aapls,...,zuck,zuckerberg,zuckerman,zumiez,zumz,zuo,zuora,zurich,zweig,zynga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 17.3 s


In [12]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf.fit(LDA_transformed)

# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf.fit(tfidfvectorized_posts)

LatentDirichletAllocation(n_components=20, random_state=0)

time: 39min 22s


In [13]:
# Persist the fitted TF-IDF LDA model to disk.
with open('archive/lda.20.0.pkl', mode='wb') as model_out:
    pickle.dump(lda_tfidf, model_out)

time: 6.46 ms


In [None]:
# Restore the TF-IDF LDA model from the pickle written by this notebook.
# Only load pickle files you produced yourself: unpickling runs arbitrary code.
with open('archive/lda.20.0.pkl', mode='rb') as model_in:
    lda_tfidf = pickle.load(model_in)

In [15]:
# Pass the TF-IDF matrix through the fitted lda_tfidf model's transform and keep the result.
trans_lda = lda_tfidf.transform(tfidfvectorized_posts)

time: 56.1 s


In [16]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Visualise the term-count LDA model. Model, matrix and vectorizer all belong to the count
# pipeline: the TF-IDF vectorizer passed here before happens to share the same settings,
# but pairing it with the count model would silently mislabel terms if the two ever diverge.
pyLDAvis.sklearn.prepare(lda_tf, LDA_transformed, LDA_vectorizer)

time: 1min 39s


In [17]:
# Visualise the TF-IDF LDA model together with the TF-IDF matrix and the TF-IDF vectorizer.
pyLDAvis.sklearn.prepare(lda_tfidf, tfidfvectorized_posts, LDA_Tf_vectorizer)

  and should_run_async(code)


time: 54.9 s


In [18]:
# Term-count LDA model laid out with the 'mmds' option. The vectorizer is the count
# vectorizer that produced LDA_transformed, not the TF-IDF one, so the term labels are
# guaranteed to match the matrix.
pyLDAvis.sklearn.prepare(lda_tf, LDA_transformed, LDA_vectorizer, mds='mmds')

  and should_run_async(code)


time: 1min 40s


In [19]:
# Term-count LDA model laid out with the 'tsne' option. The vectorizer is the count
# vectorizer that produced LDA_transformed, not the TF-IDF one, so the term labels are
# guaranteed to match the matrix.
pyLDAvis.sklearn.prepare(lda_tf, LDA_transformed, LDA_vectorizer, mds='tsne')

  and should_run_async(code)


time: 1min 34s
