In [3]:
# Package installations
#!pip install yahoo_fin
#!pip install requests_html

# For named-entiry recognition (NER)
#!pip install spacy
#!python -m spacy download en_core_web_sm

#!pip install unidecode

# Requires MS Visual C++ 14.0 or above
#!pip install contractions 

Collecting contractions
  Using cached https://files.pythonhosted.org/packages/0a/04/d5e0bb9f2cef5d15616ebf68087a725c5dbdd71bd422bcfb35d709f98ce7/contractions-0.0.48-py2.py3-none-any.whl
Collecting textsearch>=0.0.21
  Using cached https://files.pythonhosted.org/packages/d3/fe/021d7d76961b5ceb9f8d022c4138461d83beff36c3938dc424586085e559/textsearch-0.0.21-py2.py3-none-any.whl
Collecting pyahocorasick
  Using cached https://files.pythonhosted.org/packages/4a/92/b3c70b8cf2b76f7e3e8b7243d6f06f7cb3bab6ada237b1bce57604c5c519/pyahocorasick-1.4.1.tar.gz
Building wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py): started
  Building wheel for pyahocorasick (setup.py): finished with status 'done'
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.1-cp37-cp37m-win_amd64.whl size=43600 sha256=07356c4f3d463d0acbd5d080efd9aefdabd6106974fb55120b4808e9260fe008
  Stored in directory: C:\Users\denni\AppData\Local\pip\Cache\wheels\e4\ab\f7\cb39270df8f612

In [2]:
# Imports
import pandas as pd
import numpy as np
import datetime as dt

# For financial data
import yahoo_fin
import yahoo_fin.stock_info as si
import yahoo_fin.options as ops

# For NER/NLP
import spacy
nlp = spacy.load('en_core_web_sm')
import re
import codecs
import unidecode
import contractions

In [41]:
# Read the files
nd = pd.read_csv("NASDAQ.csv")
amex = pd.read_csv("AMEX.csv")
nyse = pd.read_csv("NYSE.csv")
otc = pd.read_csv("OTC_List.csv")

# list of frames to concatenate together
frames = [nd, amex, nyse, otc]

# Concatenate all of them together
all_companies = pd.concat(frames, sort=False).reset_index()

# Select Symbol and Name columns
all_companies = all_companies[["Symbol", "Name"]]

# Convert the names to lowercase
all_companies["Name"] = all_companies["Name"].str.lower()

# Print the lengths to check that files have been read properly
print(len(nd))
print(len(amex))
print(len(nyse))
print(len(otc))

# Display (needs to be cleaned)
all_companies

3767
275
3036
653


Unnamed: 0,Symbol,Name
0,AACG,ata creativity global american depositary shares
1,AACQ,artius acquisition inc. class a common stock
2,AACQU,artius acquisition inc. unit
3,AACQW,artius acquisition inc warrant
4,AAL,american airlines group inc. common stock
...,...,...
7726,XSPY,spy
7727,YEWB,yew bio-pharm group
7728,ZFLO,zlato
7729,ZOSN,zosano new


In [42]:
all_companies.to_csv("./Data/all_companies.csv", index = False)

In [2]:
all_companies = pd.read_csv("./Data/all_companies.csv")

all_companies

Unnamed: 0,Symbol,Name
0,AACG,ata creativity global
1,AACQ,artius acquisition inc.
2,AACQU,artius acquisition inc.
3,AACQW,artius acquisition inc
4,AAL,american airlines group inc.
...,...,...
7726,XSPY,spy
7727,YEWB,yew bio-pharm group
7728,ZFLO,zlato
7729,ZOSN,zosano new


In [3]:
# Clean up text
# Source:  https://medium.com/the-innovation/can-we-actually-predict-market-change-by-analyzing-reddits-r-wallstreetbets-9d7716516c8e

def spacy_cleaner(text):
    try:
        decoded = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
    except:
        decoded = unidecode.unidecode(text)
    
    # Expand all contractions in text
    expanded = contractions.fix(decoded)
    
    parsed = nlp(expanded)
    
    final_tokens = []
    for t in parsed:
        if t.is_punct or t.is_space or t.like_num or t.like_url or str(t).startswith('@'):
            pass
        else:
            if t.lemma_ == '-PRON-':
                final_tokens.append(str(t))
            else:
                sc_removed = re.sub("[^a-zA-Z]", '', str(t.lemma_))
                if len(sc_removed) > 1:
                    final_tokens.append(sc_removed)
    joined = ' '.join(final_tokens)
    spell_corrected = re.sub(r'(.)\1+', r'\1\1', joined)
    return spell_corrected

In [30]:
""""
Desc: Extract the stock mentioned from the Reddit submission or comment.
Parameter: Submission or comment's text
output: list of entities labeled as ORG
Source: https://towardsdatascience.com/ner-for-extracting-stock-mentions-on-reddit-aa604e577be
"""
def get_company(text):
    
    BLACKLIST = ['ev', 'covid', 'etf', 'nyse', 'sec', 'spac', 'fda', 'treasury']
    
    # Read the text
    doc = nlp(text)
    
    # Empty list to store organization names
    orgs = []
    
    # For each entity in the doc.ents
    for entity in doc.ents:
        
        # If label_ is ORG and not in the BLACKLIST
        if entity.label_ == 'ORG' and entity.text.lower() not in BLACKLIST:
            
            # Add it to the list of organizations
            orgs.append(entity.text)
    
    # Convert to set() to remove duplicates, and then convert back to list
    orgs = list(set(orgs))
    
    # Each organization in orgs
    for i in range(len(orgs)):
        
        # If any of the companies in all_companies contain an organization mentioned, get the ticker
        if (all_companies["Name"].str.contains(orgs[i].lower(), regex = True).any()):
            orgs[i] = all_companies[all_companies["Name"].str.contains(orgs[i].lower())]['Symbol'].values[0]

        # Else if the ticker is mentioned
        elif (all_companies["Symbol"].str.match(orgs[i], case = False).any()):
            orgs[i] = all_companies[all_companies["Symbol"].str.match(orgs[i], case = False)]['Symbol'].values[0]

        # Else the orgs was not found
        else:
            orgs[i] = ""
    
    # Remove blank spaces in the list
    orgs[:] = [x for x in orgs if x.strip()]
    
    # If the list is empty
    if (len(orgs) == 0):
        orgs = ""
    
    return orgs


In [24]:
# Test to function to see if it matches multiple companies, and multi-word companies
print(get_company("GME is a rising stock"))

['GME']
['GME']


In [7]:
amazon_daily = si.get_data("AMZN", 
                           start_date = "02/15/2021", 
                           end_date = "03/08/2021", 
                           index_as_date = False, 
                           )
amazon_daily

#print(type(amazon_daily.date[0]))

Unnamed: 0,date,open,high,low,close,adjclose,volume,ticker
0,2021-02-16,3254.050049,3308.300049,3253.590088,3268.949951,3268.949951,2574700,AMZN
1,2021-02-17,3263.600098,3320.909912,3259.5,3308.639893,3308.639893,3297500,AMZN
2,2021-02-18,3282.419922,3338.0,3273.939941,3328.22998,3328.22998,3027400,AMZN
3,2021-02-19,3328.22998,3333.5,3245.75,3249.899902,3249.899902,4305200,AMZN
4,2021-02-22,3208.129883,3232.320068,3172.26001,3180.73999,3180.73999,3515700,AMZN
5,2021-02-23,3127.030029,3204.72998,3093.600098,3194.5,3194.5,4677200,AMZN
6,2021-02-24,3166.75,3171.22998,3125.379883,3159.530029,3159.530029,3011300,AMZN
7,2021-02-25,3136.73999,3178.26001,3047.76001,3057.159912,3057.159912,4533800,AMZN
8,2021-02-26,3095.199951,3122.439941,3036.699951,3092.929932,3092.929932,4273500,AMZN
9,2021-03-01,3127.889893,3149.560059,3097.98999,3146.139893,3146.139893,2729100,AMZN


In [10]:
# Read the comments data
ci = pd.read_csv("./Data/comment_aa_info.csv", 
                 usecols = ["body", "date", "comment_id", "score", "submission_id", "number_of_replies", "total_awards"],
                 low_memory = False)

ci2 = pd.read_csv("./Data/comment_ab_info.csv",
                  usecols = ["body", "date", "comment_id", "score", "submission_id", "number_of_replies", "total_awards"],
                  low_memory = False)

ci3 = pd.read_csv("./Data/comment_ac_info.csv",
                  usecols = ["body", "date", "comment_id", "score", "submission_id", "number_of_replies", "total_awards"],
                  low_memory = False)

ci4 = pd.read_csv("./Data/comment_ad_info.csv",
                  usecols = ["body", "date", "comment_id", "score", "submission_id", "number_of_replies", "total_awards"],
                  low_memory = False)

ci5 = pd.read_csv("./Data/comment_ae_info.csv",
                  usecols = ["body", "date", "comment_id", "score", "submission_id", "number_of_replies", "total_awards"],
                  low_memory = False)

# Concatenate them by stacking them on top of each other
coms = pd.concat([ci, ci2, ci3, ci4, ci5], axis = 0, ignore_index = True)

# Clear up memory
del ci, ci2, ci3, ci4, ci5

# Display
display(coms)

Unnamed: 0,body,date,comment_id,score,submission_id,number_of_replies,total_awards
0,I gotta say I'm impressed ot stayed above 15 a...,1.546349e+09,ed0exiy,16,abfam2,1,0.0
1,And it will do nothing or drop the entire year...,1.546430e+09,ed2nm06,2,abfam2,0,0.0
2,I was all in on amd from Nov 2017 to like Feb ...,1.546368e+09,ed0y55b,1,abfam2,0,0.0
3,Anyone selling puts on AMD here? I was consid...,1.546381e+09,ed1fwek,1,abfam2,1,0.0
4,Most people here aren't allowed by law to talk...,1.546320e+09,eczukex,8,abfckx,1,0.0
...,...,...,...,...,...,...,...
3941015,"keep holding, don’t lose hope",1.614295e+09,gorm5zy,1,lsjpqq,0,0.0
3941016,It's bots that are automatically deleting post...,1.614295e+09,gorm2nd,2,lsjpvj,0,0.0
3941017,cause there's 470k people here right now and h...,1.614295e+09,gorm2x6,1,lsjpvj,0,0.0
3941018,Short and plain titles instead of help try exp...,1.614295e+09,gorm6vw,1,lsjpvj,0,0.0


In [None]:
# Read the comment author info data
cai = pd.read_csv()

In [42]:
# Read the submissions data
si = pd.read_csv("./Data/sub_aa_info.csv", 
                 usecols = ["date", "submission_id", "link_flair_text", "num_comments", "score", "selftext", "title",
                           "upvote_ratio", "total_awards"],
                 low_memory = False)

si2 = pd.read_csv("./Data/sub_ab_info.csv", 
                 usecols = ["date", "submission_id", "link_flair_text", "num_comments", "score", "selftext", "title",
                           "upvote_ratio", "total_awards"],
                 low_memory = False)

si3 = pd.read_csv("./Data/sub_ac_info.csv", 
                 usecols = ["date", "submission_id", "link_flair_text", "num_comments", "score", "selftext", "title",
                           "upvote_ratio", "total_awards"],
                 low_memory = False)

si4 = pd.read_csv("./Data/sub_ad_info.csv", 
                 usecols = ["date", "submission_id", "link_flair_text", "num_comments", "score", "selftext", "title",
                           "upvote_ratio", "total_awards"],
                 low_memory = False)

si5 = pd.read_csv("./Data/sub_ae_info.csv", 
                 usecols = ["date", "submission_id", "link_flair_text", "num_comments", "score", "selftext", "title",
                           "upvote_ratio", "total_awards"],
                 low_memory = False)

# Concatenate them by stacking them on top of each other
subs = pd.concat([si, si2, si3, si4, si5], axis = 0, ignore_index = True)

# Clear up memory
del si, si2, si3, si4, si5

# Display
display(subs)

Unnamed: 0,date,submission_id,link_flair_text,num_comments,score,selftext,title,upvote_ratio,total_awards
0,1.546320e+09,abfam2,,9,63,"-43% since October 1st, most impressive.",Congratulations to AMD on winning the S&P 500 ...,0.92,0
1,1.546320e+09,abfckx,Discussion,12,0,[removed],"Alright cool dudes, I'm young and want to get ...",0.50,0
2,1.546321e+09,abfel2,,137,69,This sub has seen some steady growth 2018 aver...,2018 End of Year PnL,0.94,0
3,1.546321e+09,abffqs,,0,1,[deleted],btw for you newfags still eating Banquet tendi...,1.00,0
4,1.546321e+09,abfh8l,,0,1,[deleted],"GG 2018, let's start 2019 off right :)",1.00,0
...,...,...,...,...,...,...,...,...,...
1187407,1.614295e+09,lsjppv,Discussion,6,20,,Let’s make that dick tattoo happen 💎🙌,0.83,0
1187408,1.614295e+09,lsjppy,Meme,0,1,[deleted],"Hey AMC, I want my lunch money👀",1.00,0
1187409,1.614295e+09,lsjpqa,Discussion,4,0,[removed],CCIV,0.44,0
1187410,1.614295e+09,lsjpqq,YOLO,1,4,[removed],Got in first thing this morning and went full ...,1.00,0


In [43]:
# Read the submissions author info data
sai = pd.read_csv("./Data/sub_author_aa_info.csv", 
                  usecols = ["sub_author_commentkarma", "sub_author_id", "sub_author_name", "submission_id"],
                  low_memory = False)

sai2 = pd.read_csv("./Data/sub_author_ab_info.csv", 
                   usecols = ["sub_author_commentkarma", "sub_author_id", "sub_author_name", "submission_id"],
                   low_memory = False)

sai3 = pd.read_csv("./Data/sub_author_ac_info.csv", 
                   usecols = ["sub_author_commentkarma", "sub_author_id", "sub_author_name", "submission_id"],
                   low_memory = False)

sai4 = pd.read_csv("./Data/sub_author_ad_info.csv", 
                   usecols = ["sub_author_commentkarma", "sub_author_id", "sub_author_name", "submission_id"],
                   low_memory = False)

sai5 = pd.read_csv("./Data/sub_author_ae_info.csv", 
                   usecols = ["sub_author_commentkarma", "sub_author_id", "sub_author_name", "submission_id"],
                   low_memory = False)

# Concatenate them by stacking them on top of each other
sais = pd.concat([sai, sai2, sai3, sai4, sai5], axis = 0, ignore_index = True)

# Clear up memory
del sai, sai2, sai3, sai4, sai5

# Display
display(sais)

Unnamed: 0,sub_author_commentkarma,sub_author_id,sub_author_name,submission_id
0,32944,pnohb,jimmyjay90210,abfam2
1,11097,bo7wh,bawse1,abfel2
2,39188,13qo4rja,rustyryan27,abfn1c
3,13290,ao3h8,PetetheJuggler,abfzsq
4,391,v744nt8,cant-think-of-one-23,abg8ki
...,...,...,...,...
808997,0,a53arl2w,amiU_humm,lsjpk4
808998,8993,7r5q9n4,Driving4Success,lsjpof
808999,1120,aghzi,KevvCo,lsjppv
809000,0,a148hcj6,Superpants11,lsjpqa


In [44]:
# Merge subs and sais
subs = subs.merge(sais, how="outer", on="submission_id")

# Display
display(subs)

Unnamed: 0,date,submission_id,link_flair_text,num_comments,score,selftext,title,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name
0,1.546320e+09,abfam2,,9,63,"-43% since October 1st, most impressive.",Congratulations to AMD on winning the S&P 500 ...,0.92,0,32944.0,pnohb,jimmyjay90210
1,1.546320e+09,abfckx,Discussion,12,0,[removed],"Alright cool dudes, I'm young and want to get ...",0.50,0,,,
2,1.546321e+09,abfel2,,137,69,This sub has seen some steady growth 2018 aver...,2018 End of Year PnL,0.94,0,11097.0,bo7wh,bawse1
3,1.546321e+09,abffqs,,0,1,[deleted],btw for you newfags still eating Banquet tendi...,1.00,0,,,
4,1.546321e+09,abfh8l,,0,1,[deleted],"GG 2018, let's start 2019 off right :)",1.00,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1187407,1.614295e+09,lsjppv,Discussion,6,20,,Let’s make that dick tattoo happen 💎🙌,0.83,0,1120.0,aghzi,KevvCo
1187408,1.614295e+09,lsjppy,Meme,0,1,[deleted],"Hey AMC, I want my lunch money👀",1.00,0,,,
1187409,1.614295e+09,lsjpqa,Discussion,4,0,[removed],CCIV,0.44,0,0.0,a148hcj6,Superpants11
1187410,1.614295e+09,lsjpqq,YOLO,1,4,[removed],Got in first thing this morning and went full ...,1.00,0,5284.0,wzix2,irit8in


In [45]:
# Filter submissions

# Keep all subs where selftext is not removed
subs = subs[subs['selftext'] != "[removed]"]

# Keep all subs where selftext is not deleted
subs = subs[subs['selftext'] != "[deleted]"]

# Keep all subs where title is not removed
subs = subs[subs['title'] != "[removed]"]

# Keep all subs where title is not deleted
subs = subs[subs['title'] != "[deleted]"]

# Display
subs

Unnamed: 0,date,submission_id,link_flair_text,num_comments,score,selftext,title,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name
0,1.546320e+09,abfam2,,9,63,"-43% since October 1st, most impressive.",Congratulations to AMD on winning the S&P 500 ...,0.92,0,32944.0,pnohb,jimmyjay90210
2,1.546321e+09,abfel2,,137,69,This sub has seen some steady growth 2018 aver...,2018 End of Year PnL,0.94,0,11097.0,bo7wh,bawse1
6,1.546322e+09,abfluz,,9,16,,Is the film industry profitable? or is Disneys...,0.91,0,,,
11,1.546326e+09,abfzsq,Loss,80,2175,,Thank God I saved $2727.90 in commissions,0.97,1,13290.0,ao3h8,PetetheJuggler
13,1.546328e+09,abg8ki,,10,30,,Thank God I saved 2444$ on commissions,0.81,0,391.0,v744nt8,cant-think-of-one-23
...,...,...,...,...,...,...,...,...,...,...,...,...
1187400,1.614295e+09,lsjp7h,Discussion,22,11,,Robinhood won’t let me cancel my Gold membersh...,0.92,0,2862.0,5eqyn,wisedeezl
1187402,1.614295e+09,lsjpdk,Chart,2,14,,Trader Pro AH Being sus ... High @ $403???,0.94,0,3.0,1jycpvh,kethalles
1187404,1.614295e+09,lsjpk4,News,0,1,,Gamma Squeeze,1.00,0,0.0,a53arl2w,amiU_humm
1187406,1.614295e+09,lsjpof,Discussion,546,4516,Okay so we went from $45 to $184.68 and closed...,Expectations for Tomorrow,0.98,11,8993.0,7r5q9n4,Driving4Success


In [46]:
# Print all unique flairs in subs
print(subs.link_flair_text.unique())

# Print length of unique flairs
print(len(subs.link_flair_text.unique()))

[nan 'Loss' 'Shitpost' 'Daily Discussion' 'Discussion' 'Gain' 'Stocks'
 'Fundamentals' 'DD' 'Options' 'Futures' 'Mods' 'Technicals' 'YOLO'
 'Earnings Thread' 'News' 'Degeneracy at its finest' 'RAGE' 'MEME'
 'MNUCHIN' 'Announcement' 'Certified YOLO' 'Technicals:upvote:'
 'User banned for this post' 'OP has a gigantic dong' 'Find a new slant'
 'user was banned for this post' 'SEC LOOK AWAY' 'OP is FGT' 'Serious'
 'Honeypot 2, Electric Boogaloo' 'Kraft risked it all too' 'RH is trash'
 'bullshit word salad' 'Converted to NS contracts, details in comments'
 'im a dumbass' 'Gay' 'Idiot of the Day' '勤勉' 'Academic Research'
 'example of a trash post' 'Stonks' 'Friend Zone' 'Mods = gods'
 'Technically a Shitpost' 'OG YOLO' 'Nice' 'looks like model 3 woot'
 'OP actually posts info on his GAINZ' 'NEGATIVE COMMENTS = BAN'
 'OP Delivered' 'and permanently banned' 'Storytime'
 'user was un-banned for this' 'Satire' 'Meme'
 'This Just In: Fat broads can wear LULU' "oh no.. it's retarded"
 'Spicy Mem

In [47]:
# Filter the flairs
subs = subs[~subs['link_flair_text'].str.contains("remove", na=False, regex = True, flags=re.IGNORECASE)]
subs = subs[~subs['link_flair_text'].str.contains("shit", na=False, regex = True, flags=re.IGNORECASE)]

# Display
subs

Unnamed: 0,date,submission_id,link_flair_text,num_comments,score,selftext,title,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name
0,1.546320e+09,abfam2,,9,63,"-43% since October 1st, most impressive.",Congratulations to AMD on winning the S&P 500 ...,0.92,0,32944.0,pnohb,jimmyjay90210
2,1.546321e+09,abfel2,,137,69,This sub has seen some steady growth 2018 aver...,2018 End of Year PnL,0.94,0,11097.0,bo7wh,bawse1
6,1.546322e+09,abfluz,,9,16,,Is the film industry profitable? or is Disneys...,0.91,0,,,
11,1.546326e+09,abfzsq,Loss,80,2175,,Thank God I saved $2727.90 in commissions,0.97,1,13290.0,ao3h8,PetetheJuggler
13,1.546328e+09,abg8ki,,10,30,,Thank God I saved 2444$ on commissions,0.81,0,391.0,v744nt8,cant-think-of-one-23
...,...,...,...,...,...,...,...,...,...,...,...,...
1187400,1.614295e+09,lsjp7h,Discussion,22,11,,Robinhood won’t let me cancel my Gold membersh...,0.92,0,2862.0,5eqyn,wisedeezl
1187402,1.614295e+09,lsjpdk,Chart,2,14,,Trader Pro AH Being sus ... High @ $403???,0.94,0,3.0,1jycpvh,kethalles
1187404,1.614295e+09,lsjpk4,News,0,1,,Gamma Squeeze,1.00,0,0.0,a53arl2w,amiU_humm
1187406,1.614295e+09,lsjpof,Discussion,546,4516,Okay so we went from $45 to $184.68 and closed...,Expectations for Tomorrow,0.98,11,8993.0,7r5q9n4,Driving4Success


In [48]:
# Replace NaNs in selftext column
subs['selftext'] = subs['selftext'].fillna("")

# Replace NaNs in title column
subs['title'] = subs['title'].fillna("")

# Combine selftext and title into one column
subs["text"] = subs["title"] +" "+ subs["selftext"]

# Drop selftext and title columns
subs.drop(['selftext', 'title', 'link_flair_text'], axis=1, inplace=True)

# Display
subs

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,Congratulations to AMD on winning the S&P 500 ...
2,1.546321e+09,abfel2,137,69,0.94,0,11097.0,bo7wh,bawse1,2018 End of Year PnL This sub has seen some st...
6,1.546322e+09,abfluz,9,16,0.91,0,,,,Is the film industry profitable? or is Disneys...
11,1.546326e+09,abfzsq,80,2175,0.97,1,13290.0,ao3h8,PetetheJuggler,Thank God I saved $2727.90 in commissions
13,1.546328e+09,abg8ki,10,30,0.81,0,391.0,v744nt8,cant-think-of-one-23,Thank God I saved 2444$ on commissions
...,...,...,...,...,...,...,...,...,...,...
1187400,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood won’t let me cancel my Gold membersh...
1187402,1.614295e+09,lsjpdk,2,14,0.94,0,3.0,1jycpvh,kethalles,Trader Pro AH Being sus ... High @ $403???
1187404,1.614295e+09,lsjpk4,0,1,1.00,0,0.0,a53arl2w,amiU_humm,Gamma Squeeze
1187406,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,Expectations for Tomorrow Okay so we went from...


In [49]:
# Save the file for q1
subs.to_csv("./Data/sub_info_q1.csv", index=False)

In [50]:
# Filter sub_author_id != NaN
subs = subs[subs['sub_author_id'].notnull()]

# Filter sub_author_name != NaN
subs = subs[subs['sub_author_name'].notnull()]

# Display
subs

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,Congratulations to AMD on winning the S&P 500 ...
2,1.546321e+09,abfel2,137,69,0.94,0,11097.0,bo7wh,bawse1,2018 End of Year PnL This sub has seen some st...
11,1.546326e+09,abfzsq,80,2175,0.97,1,13290.0,ao3h8,PetetheJuggler,Thank God I saved $2727.90 in commissions
13,1.546328e+09,abg8ki,10,30,0.81,0,391.0,v744nt8,cant-think-of-one-23,Thank God I saved 2444$ on commissions
14,1.546328e+09,abg8m7,11,35,0.83,0,39188.0,13qo4rja,rustyryan27,"Thank god I saved $11,776.45 in commissions"
...,...,...,...,...,...,...,...,...,...,...
1187400,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood won’t let me cancel my Gold membersh...
1187402,1.614295e+09,lsjpdk,2,14,0.94,0,3.0,1jycpvh,kethalles,Trader Pro AH Being sus ... High @ $403???
1187404,1.614295e+09,lsjpk4,0,1,1.00,0,0.0,a53arl2w,amiU_humm,Gamma Squeeze
1187406,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,Expectations for Tomorrow Okay so we went from...


In [51]:
# Save the file for the remaining questions
subs.to_csv("./Data/sub_info_all.csv", index=False)

In [25]:
# Read the clean subs data
subs = pd.read_csv("./Data/sub_info_q1.csv")

# Display
subs

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,Congratulations to AMD on winning the S&P 500 ...
1,1.546321e+09,abfel2,137,69,0.94,0,11097.0,bo7wh,bawse1,2018 End of Year PnL This sub has seen some st...
2,1.546322e+09,abfluz,9,16,0.91,0,,,,Is the film industry profitable? or is Disneys...
3,1.546326e+09,abfzsq,80,2175,0.97,1,13290.0,ao3h8,PetetheJuggler,Thank God I saved $2727.90 in commissions
4,1.546328e+09,abg8ki,10,30,0.81,0,391.0,v744nt8,cant-think-of-one-23,Thank God I saved 2444$ on commissions
...,...,...,...,...,...,...,...,...,...,...
351064,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood won’t let me cancel my Gold membersh...
351065,1.614295e+09,lsjpdk,2,14,0.94,0,3.0,1jycpvh,kethalles,Trader Pro AH Being sus ... High @ $403???
351066,1.614295e+09,lsjpk4,0,1,1.00,0,0.0,a53arl2w,amiU_humm,Gamma Squeeze
351067,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,Expectations for Tomorrow Okay so we went from...


In [26]:
# Clean up the text in text column
subs["text"] = subs["text"].map(spacy_cleaner)

# Display
subs

  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  
  


Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...
1,1.546321e+09,abfel2,137,69,0.94,0,11097.0,bo7wh,bawse1,end of Year PnL this sub have see some steady ...
2,1.546322e+09,abfluz,9,16,0.91,0,,,,be the film industry profitable or be Disneys ...
3,1.546326e+09,abfzsq,80,2175,0.97,1,13290.0,ao3h8,PetetheJuggler,thank God save in commission
4,1.546328e+09,abg8ki,10,30,0.81,0,391.0,v744nt8,cant-think-of-one-23,thank God save on commission
...,...,...,...,...,...,...,...,...,...,...
351064,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...
351065,1.614295e+09,lsjpdk,2,14,0.94,0,3.0,1jycpvh,kethalles,Trader Pro ah be sus high
351066,1.614295e+09,lsjpk4,0,1,1.00,0,0.0,a53arl2w,amiU_humm,Gamma Squeeze
351067,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...


In [27]:
# Save cleaned texts
subs.to_csv("./Data/sub_info_all.csv", index=False)

In [31]:
# Get ticker names
subs["Companies"] = subs["text"].map(get_company)

In [32]:
# Save ticker names
subs.to_csv("./Data/sub_info_all.csv", index=False)

In [39]:
# Count how many are "" in subs["Companies"]
print(len(subs))
print(len(subs[subs["Companies"] == ""]))

count = 0

for i in range(len(subs["Companies"])):
    
    if (len(subs["Companies"][i]) > 1):
        count = count + 1

print(count)

351069
258491
21702


In [69]:
# Function to do exact match for tickers
def get_ticker(text):
    
    # List to store the ticker
    ticker = []
    
    # For each ticker
    for i in range(len(all_companies["Symbol"])):
        
        # Check if each ticker is in the text (added a space on each side to make each ticker unique)
        if (all_companies["Symbol"][i].center(len(all_companies["Symbol"][i])+2) in text):
            
            # Add it to the list of tickers if it is found
            ticker.append(all_companies["Symbol"][i])
    
    return ticker

In [70]:
get_ticker(subs["text"][0])

['AMD', 'SP']

In [71]:
# Add another column for ticker
subs["Ticker"] = subs["text"].map(get_ticker)

# Display
subs

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,Companies,Ticker
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,[CAC],"[AMD, SP]"
1,1.546321e+09,abfel2,137,69,0.94,0,11097.0,bo7wh,bawse1,end of Year PnL this sub have see some steady ...,,[]
2,1.546322e+09,abfluz,9,16,0.91,0,,,,be the film industry profitable or be Disneys ...,,[]
3,1.546326e+09,abfzsq,80,2175,0.97,1,13290.0,ao3h8,PetetheJuggler,thank God save in commission,,[]
4,1.546328e+09,abg8ki,10,30,0.81,0,391.0,v744nt8,cant-think-of-one-23,thank God save on commission,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...
351064,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,"[GDEN, ABST]",[]
351065,1.614295e+09,lsjpdk,2,14,0.94,0,3.0,1jycpvh,kethalles,Trader Pro ah be sus high,,[]
351066,1.614295e+09,lsjpk4,0,1,1.00,0,0.0,a53arl2w,amiU_humm,Gamma Squeeze,,[]
351067,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,"[GME, WWIO]","[GME, WOW]"


In [86]:
# Save ticker names
subs.to_csv("./Data/sub_info_all.csv", index=False)

In [110]:
subs.Ticker = subs.Ticker.fillna("")

subs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,Companies,Ticker
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,[CAC],"[AMD, SP]"
17,1.546362e+09,abjmvm,34,10,0.71,0,744.0,2dk1wq99,nightman_123,WSB demographic how much do you faggot make pe...,[WSBC],
28,1.546381e+09,abmpaz,8,6,0.80,0,7258.0,ggld6,sandalguy89,short NFLX,[NFLX],
30,1.546384e+09,abn4mo,23,65,0.92,0,2102.0,z68xh,Nonaluuluu,CNBC say Wells Fargo say that Pension Funds se...,[EAD],
32,1.546386e+09,abnere,0,1,1.00,0,,,,Mobile Computer Repair Boynton Beach FL pc revive,,[FL]
...,...,...,...,...,...,...,...,...,...,...,...,...
351055,1.614295e+09,lsjmtg,0,1,1.00,0,801.0,2b3w7q0x,soldiergrl101,Most Major Retail Brokerages suffer outage Aga...,[GME],[GME]
351059,1.614295e+09,lsjnui,0,1,1.00,0,6701.0,5bynp911,abutteredbiscuit,you be here the squeeze have not squozazzle ye...,,[GME]
351061,1.614295e+09,lsjott,6,4,0.75,0,5.0,8wlcs77w,Chance-Poet-4919,want my AMC lunch moneydrobinhood trynna take ...,[AMCX],[AMC]
351064,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,"[GDEN, ABST]",


In [111]:
subs = subs[(subs.Ticker != "") | (subs.Companies != "")]

In [112]:
subs

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,Companies,Ticker
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,[CAC],"[AMD, SP]"
17,1.546362e+09,abjmvm,34,10,0.71,0,744.0,2dk1wq99,nightman_123,WSB demographic how much do you faggot make pe...,[WSBC],
28,1.546381e+09,abmpaz,8,6,0.80,0,7258.0,ggld6,sandalguy89,short NFLX,[NFLX],
30,1.546384e+09,abn4mo,23,65,0.92,0,2102.0,z68xh,Nonaluuluu,CNBC say Wells Fargo say that Pension Funds se...,[EAD],
32,1.546386e+09,abnere,0,1,1.00,0,,,,Mobile Computer Repair Boynton Beach FL pc revive,,[FL]
...,...,...,...,...,...,...,...,...,...,...,...,...
351055,1.614295e+09,lsjmtg,0,1,1.00,0,801.0,2b3w7q0x,soldiergrl101,Most Major Retail Brokerages suffer outage Aga...,[GME],[GME]
351059,1.614295e+09,lsjnui,0,1,1.00,0,6701.0,5bynp911,abutteredbiscuit,you be here the squeeze have not squozazzle ye...,,[GME]
351061,1.614295e+09,lsjott,6,4,0.75,0,5.0,8wlcs77w,Chance-Poet-4919,want my AMC lunch moneydrobinhood trynna take ...,[AMCX],[AMC]
351064,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,"[GDEN, ABST]",


In [117]:
# Replace empty string with empty list
subs.Ticker = subs.Ticker.map(lambda x: [] if x == "" else x)
subs.Companies = subs.Companies.map(lambda x: [] if x == "" else x)

# Join the 2 lists
subs["ticker"] = subs["Companies"] + subs["Ticker"]

# Drop the Companies and Ticker Columns
subs.drop(columns = ["Companies", "Ticker"], inplace = True)

AttributeError: 'DataFrame' object has no attribute 'Ticker'

In [118]:
# Expand the list of tickers to be one ticker per row
subs = subs.explode("ticker").reset_index()

# Display
subs

Unnamed: 0,index,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,ticker
0,0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,CAC
1,0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,AMD
2,0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,SP
3,17,1.546362e+09,abjmvm,34,10,0.71,0,744.0,2dk1wq99,nightman_123,WSB demographic how much do you faggot make pe...,WSBC
4,28,1.546381e+09,abmpaz,8,6,0.80,0,7258.0,ggld6,sandalguy89,short NFLX,NFLX
...,...,...,...,...,...,...,...,...,...,...,...,...
268459,351064,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,ABST
268460,351067,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,GME
268461,351067,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,WWIO
268462,351067,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,GME


In [123]:
# Drop the index column
#subs.drop(columns=["index"], inplace = True)

# Drop duplicates
subs.drop_duplicates(inplace = True)

# Display
display(subs)

# Save ticker names
subs.to_csv("./Data/sub_info_all.csv", index=False)

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,ticker
0,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,CAC
1,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,AMD
2,1.546320e+09,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,SP
3,1.546362e+09,abjmvm,34,10,0.71,0,744.0,2dk1wq99,nightman_123,WSB demographic how much do you faggot make pe...,WSBC
4,1.546381e+09,abmpaz,8,6,0.80,0,7258.0,ggld6,sandalguy89,short NFLX,NFLX
...,...,...,...,...,...,...,...,...,...,...,...
268458,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,GDEN
268459,1.614295e+09,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,ABST
268460,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,GME
268461,1.614295e+09,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,WWIO


In [171]:
subs = pd.read_csv("./Data/sub_info_all.csv")

# # Convert date to month-day-Year format as string
# subs["date"] = subs["date"].map(dt.datetime.utcfromtimestamp)

# for i in range(len(subs["date"])):
#     subs["date"][i] = subs["date"][i].strftime("%m/%d/%Y")

# Reorder the date as month, day, year
for i in range(len(subs["date"])):
    subs["date"][i] = (subs["date"][i] + "/")[5:] + (subs["date"][i][:4])


subs

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,ticker
0,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,CAC
1,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,AMD
2,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,SP
3,01/01/2019,abjmvm,34,10,0.71,0,744.0,2dk1wq99,nightman_123,WSB demographic how much do you faggot make pe...,WSBC
4,01/01/2019,abmpaz,8,6,0.80,0,7258.0,ggld6,sandalguy89,short NFLX,NFLX
...,...,...,...,...,...,...,...,...,...,...,...
224048,02/25/2021,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,GDEN
224049,02/25/2021,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,ABST
224050,02/25/2021,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,GME
224051,02/25/2021,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,WWIO


In [8]:
subs = pd.read_csv("./Data/sub_info_all.csv")

subs

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,ticker
0,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,CAC
1,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,AMD
2,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,SP
3,01/01/2019,abjmvm,34,10,0.71,0,744.0,2dk1wq99,nightman_123,WSB demographic how much do you faggot make pe...,WSBC
4,01/01/2019,abmpaz,8,6,0.80,0,7258.0,ggld6,sandalguy89,short NFLX,NFLX
...,...,...,...,...,...,...,...,...,...,...,...
224048,02/25/2021,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,GDEN
224049,02/25/2021,lsjp7h,22,11,0.92,0,2862.0,5eqyn,wisedeezl,Robinhood wonat let cancel my Gold membership ...,ABST
224050,02/25/2021,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,GME
224051,02/25/2021,lsjpof,546,4516,0.98,11,8993.0,7r5q9n4,Driving4Success,expectation for Tomorrow okay so we go from to...,WWIO


In [10]:
# Remove all rows with 2021
subs = subs[~(subs.date.str.contains("2021"))]

subs

Unnamed: 0,date,submission_id,num_comments,score,upvote_ratio,total_awards,sub_author_commentkarma,sub_author_id,sub_author_name,text,ticker
0,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,CAC
1,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,AMD
2,01/01/2019,abfam2,9,63,0.92,0,32944.0,pnohb,jimmyjay90210,congratulation to AMD on win the SP in with re...,SP
3,01/01/2019,abjmvm,34,10,0.71,0,744.0,2dk1wq99,nightman_123,WSB demographic how much do you faggot make pe...,WSBC
4,01/01/2019,abmpaz,8,6,0.80,0,7258.0,ggld6,sandalguy89,short NFLX,NFLX
...,...,...,...,...,...,...,...,...,...,...,...
97466,12/31/2020,ko0zd1,47,30,0.82,0,14995.0,6g69n9y7,FatCatBoomerBanker,sell Leaps Vega play for those who do not know...,PSTH
97467,12/31/2020,ko0zd1,47,30,0.82,0,14995.0,6g69n9y7,FatCatBoomerBanker,sell Leaps Vega play for those who do not know...,DISCA
97468,12/31/2020,ko0zd1,47,30,0.82,0,14995.0,6g69n9y7,FatCatBoomerBanker,sell Leaps Vega play for those who do not know...,GME
97469,12/31/2020,ko0zd1,47,30,0.82,0,14995.0,6g69n9y7,FatCatBoomerBanker,sell Leaps Vega play for those who do not know...,NAV


In [11]:
# Save ticker names
subs.to_csv("./Data/sub_info_all.csv", index=False)

In [12]:
# Split the data frame into 10
a = np.array_split(subs, 10)

# For each chunk
for i in range(len(a)):
    
    # Save it as csv
    a[i].to_csv("./Data/sub_info_all_{}.csv".format(i+1), index=False)