# Import Initial Dependencies

In [108]:
from tickers import Ticker, scrape_tickers, query_list, recent_IPO_list, upcoming_IPO_list
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from pprint import pprint

In [109]:
#nltk.download("stopwords")
#nltk.download('punkt')
#nltk.download('vader_lexicon')

# Load Data

In [162]:
# Scrape to get updated tickers and company names for stocks and ETFs
scrape_tickers()

import_lists = [query_list, recent_IPO_list, upcoming_IPO_list]

tickers = []

# Build raw query list with ticker abbrevation and name of company
for i in import_lists:
    for item in i:
        tickers.append(item)

In [111]:
# Load in data for testing - will be replace with dataframes when in production
submissions_df = pd.read_csv('submissions.csv')
comments_df = pd.read_csv('comments.csv')

# Data Pre-Processing

### Remove Non-Alphanumeric Characters

In [148]:
import re
 
def scrub_data(text):
    clean = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    return clean

In [149]:
# Clean submission body and title
submissions_df["Clean_Body"] = submissions_df.Body.apply(scrub_data)
submissions_df["Clean_Title"] = submissions_df.Title.apply(scrub_data)

In [150]:
# Clean comments
comments_df["Clean_Body"] = comments_df.Body.apply(scrub_data)

### Tokenize Data

In [165]:
# Create Function to Handle Errors During Tokenization
def tokenize_text(text):
    if not text:
        print("Text cannot be tokenize due to type errors.")
        text = ""
    text = text.lower()
    return nltk.word_tokenize(text)

In [152]:
# Tokenize submission body and title
submissions_df["Body_Tokens"] = submissions_df.Clean_Body.apply(tokenize_text)
submissions_df["Title_Tokens"] = submissions_df.Clean_Title.apply(tokenize_text)

In [153]:
# Tokenize comments
comments_df["Body_Tokens"] = comments_df.Clean_Body.apply(tokenize_text)

Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.


### Remove stop words

In [154]:
# Remove defined stop words from company names to prevent inaccurate labeling
common_words = ['Corp', 'Corporation', 'Ltd', 'Acquisition', 'Pharmaceuticals', 'Holding', 'Group']
new_name = ""
ticker_names = []
for ticker in tickers:
    new_name = ""
    company_words = ticker.name.split()
    for word in company_words:
        if word in common_words:
            continue
        else:
            if new_name == "":
                new_name = word
            else:
                new_name = new_name + " " + word
    ticker.name = new_name

In [155]:
stop_words = list(set(stopwords.words("english")))
def remove_stop_words(text):
    if not text:
        print("Type errors.")
        text = ""
    return [w for w in text if w not in stop_words]

In [156]:
submissions_df["Clean_Body_Tokens"] = submissions_df.Body_Tokens.apply(remove_stop_words)
submissions_df["Clean_Title_Tokens"] = submissions_df.Title_Tokens.apply(remove_stop_words)

In [157]:
# Create TF-IDF matrix using SciKit-Learn
#from sklearn.feature_extraction.text import TfidfVectorizer
# Create instance of Vectorizer
#vectorizer = TfidfVectorizer()
#tfidf = vectorizer.fit_transform(submissions_tokens)

In [158]:
# convert sentences to array of words
#submissions_df['Body_Words'] = submissions_df.Body.str.replace("'", "").str.strip().str.split('[\W_]+')
#submissions_df['Title_Words'] = submissions_df.Title.str.replace("'", "").str.strip().str.split('[\W_]+')

### Find Tickers Mentioned

In [159]:
submissions_df

Unnamed: 0,Post_Type,Submission_ID,Title,Author,Body,Flair,Distinguished,Num_Comments,Post_ID,URL,Score,Upvote_Ratio,Created_Date_UTC,Clean_Body,Clean_Title,Body_Tokens,Title_Tokens,Clean_Body_Tokens,Clean_Title_Tokens,Tickers
0,Submission,me6ezj,Drinking and Trading don’t mix. I accidentally...,CostantlyLost,Title says it all. Had a few too many drinks o...,,,208,t3_me6ezj,https://www.reddit.com/r/stocks/comments/me6ez...,1298,0.95,[1616818448.0],Title says it all Had a few too many drinks of...,Drinking and Trading dont mix I accidentally b...,"[title, says, it, all, had, a, few, too, many,...","[drinking, and, trading, dont, mix, i, acciden...","[title, says, many, drinks, liquid, courage, b...","[drinking, trading, dont, mix, accidentally, b...",
1,Submission,me2d54,"Here is a Market Recap for today Friday, March...",psychotrader00,"\n\n**PsychoMarket Recap - Friday, March 26, ...",,,21,t3_me2d54,https://www.reddit.com/r/stocks/comments/me2d5...,72,0.92,[1616804555.0],PsychoMarket Recap Friday March 26 2021Stock...,Here is a Market Recap for today Friday March ...,"[psychomarket, recap, friday, march, 26, 2021s...","[here, is, a, market, recap, for, today, frida...","[psychomarket, recap, friday, march, 26, 2021s...","[market, recap, today, friday, march, 26, 2021...",
2,Submission,me8kb3,5 Considerations When Picking Stocks for Long-...,MinnesotaPower,"Like many of you, I started actively investing...",Advice,,18,t3_me8kb3,https://www.reddit.com/r/stocks/comments/me8kb...,81,0.87,[1616827758.0],Like many of you I started actively investing ...,5 Considerations When Picking Stocks for LongT...,"[like, many, of, you, i, started, actively, in...","[5, considerations, when, picking, stocks, for...","[like, many, started, actively, investing, yea...","[5, considerations, picking, stocks, longterm,...",
3,Submission,me1wx4,I believe two contradictory things about stock...,NotLegallyBinding,"First, I'm convinced that all the relevant, ac...",,,97,t3_me1wx4,https://www.reddit.com/r/stocks/comments/me1wx...,61,0.82,[1616803173.0],First Im convinced that all the relevant actio...,I believe two contradictory things about stock...,"[first, im, convinced, that, all, the, relevan...","[i, believe, two, contradictory, things, about...","[first, im, convinced, relevant, actionable, i...","[believe, two, contradictory, things, stocks, ...",[TWOA]
4,Submission,med6uw,Wall Street Week Ahead for the trading week be...,bigbear0083,Good Saturday morning to all of you here on r/...,,,19,t3_med6uw,https://www.reddit.com/r/stocks/comments/med6u...,97,0.96,[1616849490.0],Good Saturday morning to all of you here on rs...,Wall Street Week Ahead for the trading week be...,"[good, saturday, morning, to, all, of, you, he...","[wall, street, week, ahead, for, the, trading,...","[good, saturday, morning, rstocks, hope, every...","[wall, street, week, ahead, trading, week, beg...",[TWOA]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Submission,me5cw4,Wash sale concerns for taxes.,Barkleyslakjssrtqwe,I pulled $30k from my main account to try day ...,,,21,t3_me5cw4,https://www.reddit.com/r/stocks/comments/me5cw...,1,0.60,[1616814545.0],I pulled 30k from my main account to try day t...,Wash sale concerns for taxes,"[i, pulled, 30k, from, my, main, account, to, ...","[wash, sale, concerns, for, taxes]","[pulled, 30k, main, account, try, day, trading...","[wash, sale, concerns, taxes]",
69,Submission,mdrs63,Dollar cost average,noeljvt,Is there a way to calculate dollar cost averag...,,,14,t3_mdrs63,https://www.reddit.com/r/stocks/comments/mdrs6...,0,0.50,[1616774275.0],Is there a way to calculate dollar cost averag...,Dollar cost average,"[is, there, a, way, to, calculate, dollar, cos...","[dollar, cost, average]","[way, calculate, dollar, cost, averaging, prio...","[dollar, cost, average]",[TWOA]
70,Submission,mdz868,Tesla outlook,Shaun8030,Is Tesla market share of the ev sector going t...,Company Discussion,,35,t3_mdz868,https://www.reddit.com/r/stocks/comments/mdz86...,0,0.44,[1616795038.0],Is Tesla market share of the ev sector going t...,Tesla outlook,"[is, tesla, market, share, of, the, ev, sector...","[tesla, outlook]","[tesla, market, share, ev, sector, going, dwin...","[tesla, outlook]",
71,Submission,mdrafp,XPO Logistics Inc. (XPO) Soars 2.43% on March 25,thinkB4WeSpeak,XPO Logistics Inc. (XPO) had a good day on the...,Company News,,5,t3_mdrafp,https://www.reddit.com/r/stocks/comments/mdraf...,0,0.27,[1616772948.0],XPO Logistics Inc XPOhad a good day on the mar...,XPO Logistics Inc XPO Soars 243 on March 25,"[xpo, logistics, inc, xpohad, a, good, day, on...","[xpo, logistics, inc, xpo, soars, 243, on, mar...","[xpo, logistics, inc, xpohad, good, day, marke...","[xpo, logistics, inc, xpo, soars, 243, march, 25]",[TWOA]


In [166]:
# Tag all tickers found in array of words for each post
submissions_df["Tickers"] = ""

def tokenize_tickers(text):
    if not text:
        print("Text cannot be tokenize due to type errors.")
        text = ""
    return nltk.word_tokenize(text)

for index, row in submissions_df.iterrows():
    body = row[4]
    title = row[2]
    body_tokens = tokenize_tickers(body)
    title_tokens = tokenize_tickers(title)
    tickers_found = []
    body_set = set(body_tokens)
    title_set = set(title_tokens)
    for ticker in tickers:
        if ticker.abbrev == 'A' or ticker.abbrev == 'I':
            if (ticker.name in body_set) or (ticker.name in title_set):
                tickers_found.append(ticker.abbrev)
        else:
            if (ticker.abbrev in body_set) or (ticker.abbrev in title_set):
                if ticker.abbrev not in tickers_found:
                    tickers_found.append(ticker.abbrev)
            elif (ticker.name in body_set) or (ticker.name in title_set):
                if ticker.abbrev not in tickers_found:
                    tickers_found.append(ticker.abbrev)
    if len(tickers_found) == 0:
        submissions_df.at[index, "Tickers"] = ""
    else:
        submissions_df.at[index, "Tickers"]= tickers_found

In [167]:
submissions_df

Unnamed: 0,Post_Type,Submission_ID,Title,Author,Body,Flair,Distinguished,Num_Comments,Post_ID,URL,Score,Upvote_Ratio,Created_Date_UTC,Clean_Body,Clean_Title,Body_Tokens,Title_Tokens,Clean_Body_Tokens,Clean_Title_Tokens,Tickers
0,Submission,me6ezj,Drinking and Trading don’t mix. I accidentally...,CostantlyLost,Title says it all. Had a few too many drinks o...,,,208,t3_me6ezj,https://www.reddit.com/r/stocks/comments/me6ez...,1298,0.95,[1616818448.0],Title says it all Had a few too many drinks of...,Drinking and Trading dont mix I accidentally b...,"[title, says, it, all, had, a, few, too, many,...","[drinking, and, trading, dont, mix, i, acciden...","[title, says, many, drinks, liquid, courage, b...","[drinking, trading, dont, mix, accidentally, b...",
1,Submission,me2d54,"Here is a Market Recap for today Friday, March...",psychotrader00,"\n\n**PsychoMarket Recap - Friday, March 26, ...",,,21,t3_me2d54,https://www.reddit.com/r/stocks/comments/me2d5...,72,0.92,[1616804555.0],PsychoMarket Recap Friday March 26 2021Stock...,Here is a Market Recap for today Friday March ...,"[psychomarket, recap, friday, march, 26, 2021s...","[here, is, a, market, recap, for, today, frida...","[psychomarket, recap, friday, march, 26, 2021s...","[market, recap, today, friday, march, 26, 2021...","[ADS, BCS, DRI, HON, JP, MS, NDAQ, NIO, ORLY, ..."
2,Submission,me8kb3,5 Considerations When Picking Stocks for Long-...,MinnesotaPower,"Like many of you, I started actively investing...",Advice,,18,t3_me8kb3,https://www.reddit.com/r/stocks/comments/me8kb...,81,0.87,[1616827758.0],Like many of you I started actively investing ...,5 Considerations When Picking Stocks for LongT...,"[like, many, of, you, i, started, actively, in...","[5, considerations, when, picking, stocks, for...","[like, many, started, actively, investing, yea...","[5, considerations, picking, stocks, longterm,...","[DG, GOOGL, HD, SPGI, TMO, TXN, UNP, V]"
3,Submission,me1wx4,I believe two contradictory things about stock...,NotLegallyBinding,"First, I'm convinced that all the relevant, ac...",,,97,t3_me1wx4,https://www.reddit.com/r/stocks/comments/me1wx...,61,0.82,[1616803173.0],First Im convinced that all the relevant actio...,I believe two contradictory things about stock...,"[first, im, convinced, that, all, the, relevan...","[i, believe, two, contradictory, things, about...","[first, im, convinced, relevant, actionable, i...","[believe, two, contradictory, things, stocks, ...","[PS, TWOA]"
4,Submission,med6uw,Wall Street Week Ahead for the trading week be...,bigbear0083,Good Saturday morning to all of you here on r/...,,,19,t3_med6uw,https://www.reddit.com/r/stocks/comments/med6u...,97,0.96,[1616849490.0],Good Saturday morning to all of you here on rs...,Wall Street Week Ahead for the trading week be...,"[good, saturday, morning, to, all, of, you, he...","[wall, street, week, ahead, for, the, trading,...","[good, saturday, morning, rstocks, hope, every...","[wall, street, week, ahead, trading, week, beg...","[AM, AMAT, BB, BNTX, CHWY, CIO, ENPH, ET, ETSY..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Submission,me5cw4,Wash sale concerns for taxes.,Barkleyslakjssrtqwe,I pulled $30k from my main account to try day ...,,,21,t3_me5cw4,https://www.reddit.com/r/stocks/comments/me5cw...,1,0.60,[1616814545.0],I pulled 30k from my main account to try day t...,Wash sale concerns for taxes,"[i, pulled, 30k, from, my, main, account, to, ...","[wash, sale, concerns, for, taxes]","[pulled, 30k, main, account, try, day, trading...","[wash, sale, concerns, taxes]",[SOXL]
69,Submission,mdrs63,Dollar cost average,noeljvt,Is there a way to calculate dollar cost averag...,,,14,t3_mdrs63,https://www.reddit.com/r/stocks/comments/mdrs6...,0,0.50,[1616774275.0],Is there a way to calculate dollar cost averag...,Dollar cost average,"[is, there, a, way, to, calculate, dollar, cos...","[dollar, cost, average]","[way, calculate, dollar, cost, averaging, prio...","[dollar, cost, average]",[TWOA]
70,Submission,mdz868,Tesla outlook,Shaun8030,Is Tesla market share of the ev sector going t...,Company Discussion,,35,t3_mdz868,https://www.reddit.com/r/stocks/comments/mdz86...,0,0.44,[1616795038.0],Is Tesla market share of the ev sector going t...,Tesla outlook,"[is, tesla, market, share, of, the, ev, sector...","[tesla, outlook]","[tesla, market, share, ev, sector, going, dwin...","[tesla, outlook]","[GM, TSLA]"
71,Submission,mdrafp,XPO Logistics Inc. (XPO) Soars 2.43% on March 25,thinkB4WeSpeak,XPO Logistics Inc. (XPO) had a good day on the...,Company News,,5,t3_mdrafp,https://www.reddit.com/r/stocks/comments/mdraf...,0,0.27,[1616772948.0],XPO Logistics Inc XPOhad a good day on the mar...,XPO Logistics Inc XPO Soars 243 on March 25,"[xpo, logistics, inc, xpohad, a, good, day, on...","[xpo, logistics, inc, xpo, soars, 243, on, mar...","[xpo, logistics, inc, xpohad, good, day, marke...","[xpo, logistics, inc, xpo, soars, 243, march, 25]","[CEO, S, TWOA, XPO, LTL]"


In [174]:
# Remove submissions without any mention of tickers - unable to interpret relevancy of sentiment
submissions_df = submissions_df.loc[submissions_df['Tickers'] != ""]

In [176]:
comments_df['Tickers'] = ""
for index_c, row_c in comments_df.iterrows():
    for index_s, row_s in submissions_df.iterrows():
        if row_c[1] == row_s[1]:
             comments_df.at[index_c, 'Tickers'] = row_s[19]

In [177]:
comments_df = comments_df.loc[comments_df["Tickers"] != ""]

# Perform Sentiment Analysis Using NLTK

In [205]:
# Create function to perform sentiment analysis
def sent_analyzer(df, body_index, title_index = None, upvote_score_index = None):
    sia = SIA()
    results = []
    df['negative_score'] = ""
    df['neutral_score'] = ""
    df['positive_score'] = ""
    df['compound_score'] = ""
    df['sentiment_label'] = 0
    for index, row in df.iterrows():
        body = row[body_index]
        if title_index is not None:
            title = row[title_index]
            eval_text = title + " " + body
        else:
            eval_text = body
        pol_score= sia.polarity_scores(eval_text)
        df.at[index, 'negative_score'] = pol_score['neg']
        df.at[index, 'neutral_score'] = pol_score['neu']
        df.at[index, 'positive_score'] = pol_score['pos']
        df.at[index, 'compound_score'] = pol_score['compound']
        if upvote_score_index is not None:
            weighted_compound = row[upvote_score_index] * pol_score['compound']
            if weighted_compound > 0.2:
                df.at[index,'sentiment_label'] = 1
            elif weighted_compound < 0.2:
                df.at[index,'sentiment_label'] = -1
            else:
                df.at[index,'sentiment_label'] = 0
        else:
            if pol_score['compound'] > 0.2:
                df.at[index,'sentiment_label'] = 1
            elif pol_score['compound'] < 0.2:
                df.at[index,'sentiment_label'] = -1
            else:
                df.at[index,'sentiment_label'] = 0
    return df

In [206]:
sent_analyzer(submissions_df, 13, 14, 11)

Unnamed: 0,Post_Type,Submission_ID,Title,Author,Body,Flair,Distinguished,Num_Comments,Post_ID,URL,...,Body_Tokens,Title_Tokens,Clean_Body_Tokens,Clean_Title_Tokens,Tickers,negative_score,neutral_score,positive_score,compound_score,sentiment_label
1,Submission,me2d54,"Here is a Market Recap for today Friday, March...",psychotrader00,"\n\n**PsychoMarket Recap - Friday, March 26, ...",,,21,t3_me2d54,https://www.reddit.com/r/stocks/comments/me2d5...,...,"[psychomarket, recap, friday, march, 26, 2021s...","[here, is, a, market, recap, for, today, frida...","[psychomarket, recap, friday, march, 26, 2021s...","[market, recap, today, friday, march, 26, 2021...","[ADS, BCS, DRI, HON, JP, MS, NDAQ, NIO, ORLY, ...",0.047,0.843,0.111,0.9941,1
2,Submission,me8kb3,5 Considerations When Picking Stocks for Long-...,MinnesotaPower,"Like many of you, I started actively investing...",Advice,,18,t3_me8kb3,https://www.reddit.com/r/stocks/comments/me8kb...,...,"[like, many, of, you, i, started, actively, in...","[5, considerations, when, picking, stocks, for...","[like, many, started, actively, investing, yea...","[5, considerations, picking, stocks, longterm,...","[DG, GOOGL, HD, SPGI, TMO, TXN, UNP, V]",0.034,0.839,0.127,0.9891,1
3,Submission,me1wx4,I believe two contradictory things about stock...,NotLegallyBinding,"First, I'm convinced that all the relevant, ac...",,,97,t3_me1wx4,https://www.reddit.com/r/stocks/comments/me1wx...,...,"[first, im, convinced, that, all, the, relevan...","[i, believe, two, contradictory, things, about...","[first, im, convinced, relevant, actionable, i...","[believe, two, contradictory, things, stocks, ...","[PS, TWOA]",0.153,0.765,0.082,-0.7781,-1
4,Submission,med6uw,Wall Street Week Ahead for the trading week be...,bigbear0083,Good Saturday morning to all of you here on r/...,,,19,t3_med6uw,https://www.reddit.com/r/stocks/comments/med6u...,...,"[good, saturday, morning, to, all, of, you, he...","[wall, street, week, ahead, for, the, trading,...","[good, saturday, morning, rstocks, hope, every...","[wall, street, week, ahead, trading, week, beg...","[AM, AMAT, BB, BNTX, CHWY, CIO, ENPH, ET, ETSY...",0.033,0.865,0.102,0.9999,1
6,Submission,mdwibp,PLTR - bearish discussion to better understand...,wsbloverrrrrr,"(Please excuse my username, it's the only acco...",Company Discussion,,58,t3_mdwibp,https://www.reddit.com/r/stocks/comments/mdwib...,...,"[please, excuse, my, username, its, the, only,...","[pltr, bearish, discussion, to, better, unders...","[please, excuse, username, account, thats, dox...","[pltr, bearish, discussion, better, understand...","[FANG, PLTR, PS]",0.071,0.734,0.195,0.998,1
7,Submission,mdvvkm,Opportunity to gain about 10-14% on $MX (Bough...,RowanHarley,"Right now, $MX is trading far below what it wa...",Company Discussion,,52,t3_mdvvkm,https://www.reddit.com/r/stocks/comments/mdvvk...,...,"[right, now, mx, is, trading, far, below, what...","[opportunity, to, gain, about, 1014, on, mx, b...","[right, mx, trading, far, bought, likely, boug...","[opportunity, gain, 1014, mx, bought]",[MX],0.064,0.801,0.135,0.8927,1
10,Submission,me0ast,Some figures summarizing what has been happeni...,futureIsYes,The Hang Seng Index contains all the Chinese g...,,,62,t3_me0ast,https://www.reddit.com/r/stocks/comments/me0as...,...,"[the, hang, seng, index, contains, all, the, c...","[some, figures, summarizing, what, has, been, ...","[hang, seng, index, contains, chinese, giants,...","[figures, summarizing, happening, chinese, sto...",[USD],0.104,0.896,0.0,-0.8519,-1
11,Submission,mebzea,What stocks focus on sexual self-satisfaction?,Smur_,"To explain myself, I really do feel like this ...",Industry Discussion,,74,t3_mebzea,https://www.reddit.com/r/stocks/comments/mebze...,...,"[to, explain, myself, i, really, do, feel, lik...","[what, stocks, focus, on, sexual, selfsatisfac...","[explain, really, feel, like, part, market, us...","[stocks, focus, sexual, selfsatisfaction]",[DD],0.03,0.846,0.124,0.9102,1
12,Submission,mdtss5,Anyone own Viacom (VIAC) here?,rockinoutwith2,"If so, my condolences...damn, this stock just ...",,,56,t3_mdtss5,https://www.reddit.com/r/stocks/comments/mdtss...,...,"[if, so, my, condolencesdamn, this, stock, jus...","[anyone, own, viacom, viac, here]","[condolencesdamn, stock, doesnt, stop, falling...","[anyone, viacom, viac]",[VIAC],0.05,0.785,0.165,0.6129,1
13,Submission,meaofc,Suez Canal Tanker Play,TimThyTurtle,$STNG $TNK $NAT $TK $FRO\nThe Suez Canal block...,Industry Discussion,,32,t3_meaofc,https://www.reddit.com/r/stocks/comments/meaof...,...,"[stng, tnk, nat, tk, frothe, suez, canal, bloc...","[suez, canal, tanker, play]","[stng, tnk, nat, tk, frothe, suez, canal, bloc...","[suez, canal, tanker, play]","[CEO, FRO, NAT, STNG, TK, TNK]",0.037,0.834,0.13,0.91,1


In [207]:
sent_analyzer(comments_df, 5)

Unnamed: 0,Post_Type,Submission_ID,Comment_ID,Parent_ID,Author,Body,Flair,Distinguished,Is_Author,Score,Created_Date_UTC,Clean_Body,Body_Tokens,Tickers,negative_score,neutral_score,positive_score,compound_score,sentiment_label
187,Comment,me2d54,gsd1r54,t3_me2d54,pman6,['ok guys....\n\nwhat the fuck happened during...,,,False,35,[1616805654.0],ok guysnnwhat the fuck happened during the las...,"[ok, guysnnwhat, the, fuck, happened, during, ...","[ADS, BCS, DRI, HON, JP, MS, NDAQ, NIO, ORLY, ...",0.224,0.776,0,-0.7236,-1
188,Comment,me2d54,gsdbbya,t3_me2d54,NeelAsman,"['Shout out for yesterday’s post, you nailed S...",,,False,5,[1616809931.0],Shout out for yesterdays post you nailed SPY 394,"[shout, out, for, yesterdays, post, you, naile...","[ADS, BCS, DRI, HON, JP, MS, NDAQ, NIO, ORLY, ...",0,1,0,0,-1
189,Comment,me2d54,gsd2vy0,t3_me2d54,thatsjetfuel,"[""I'm not seeing how TSM is a bad play at this...",,,False,7,[1616806145.0],Im not seeing how TSM is a bad play at this pr...,"[im, not, seeing, how, tsm, is, a, bad, play, ...","[ADS, BCS, DRI, HON, JP, MS, NDAQ, NIO, ORLY, ...",0.147,0.767,0.087,-0.5423,-1
190,Comment,me2d54,gsd61xo,t1_gsd1r54,hitmon_gg,['God wanted me to have a better weekend?'],,,False,23,[1616807556.0],God wanted me to have a better weekend,"[god, wanted, me, to, have, a, better, weekend]","[ADS, BCS, DRI, HON, JP, MS, NDAQ, NIO, ORLY, ...",0,0.674,0.326,0.4404,1
191,Comment,me2d54,gsddp75,t1_gsd1r54,bothodler,"[""I don't want to get super conspiracy-ie but ...",,,False,10,[1616811018.0],I dont want to get super conspiracyie but late...,"[i, dont, want, to, get, super, conspiracyie, ...","[ADS, BCS, DRI, HON, JP, MS, NDAQ, NIO, ORLY, ...",0.299,0.598,0.102,-0.9497,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2111,Comment,me11cn,gsdhaar,t1_gsdf2zh,merlinsbeers,"[""He isn't self-made. He comes from a wealthy ...",,,False,2,[1616812668.0],He isnt selfmade He comes from a wealthy famil...,"[he, isnt, selfmade, he, comes, from, a, wealt...","[ARK, CEO, TSLA]",0,0.846,0.154,0.6249,1
2112,Comment,me11cn,gsge5va,t1_gsdf2zh,harrison_wintergreen,['> That dude is one of the few self made bill...,,,False,1,[1616856792.0],That dude is one of the few self made billion...,"[that, dude, is, one, of, the, few, self, made...","[ARK, CEO, TSLA]",0,1,0,0,-1
2113,Comment,me11cn,gsd6jo3,t1_gscuc3i,Ironleg01,"['Oh wow, news outlet reporting on thing that ...",,,False,4,[1616807785.0],Oh wow news outlet reporting on thing that hap...,"[oh, wow, news, outlet, reporting, on, thing, ...","[ARK, CEO, TSLA]",0.296,0.51,0.194,-0.25,-1
2114,Comment,me11cn,gsd0mbr,t1_gsczs3d,Tacoman404,"['Huh, Elon Musk is actually a vole?']",,,False,2,[1616805158.0],Huh Elon Musk is actually a vole,"[huh, elon, musk, is, actually, a, vole]","[ARK, CEO, TSLA]",0,1,0,0,-1


# Split Data into Training and Test Sets

In [208]:
#submissions_sentiment = submissions_df[["Body", "Title", "sentiment_label"]]
submissions_sentiment = submissions_df[["Clean_Body", "Clean_Title", "sentiment_label"]]

In [209]:
comment_sentiment = comments_df[["Body", "sentiment_label"]]

In [210]:
#dataset = submissions_df[["Body", "Title", "sentiment_label"]]
dataset = submissions_sentiment

In [211]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1].values

In [212]:
X_arr = []
for index, row in X.iterrows():
    X_arr.append(row.values)

In [213]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_arr, y, test_size = 0.2, random_state = 0)

In [214]:
X_train

[array(['The ARKK Trades At 209 pm Thursday a trader executed a put sweep of 7238 ARKK options with a 100 strike price expiring April 16 The trade represented a 38000 bearish bet for which the trader paid 525 per option contractAt 104 pm a trader executed a put sweep of 400 ARKK options with a 110 strike price expiring on Jan 20 2023 The trade represented a 14million bearish bet for which the trader paid 26 per option contractAt 1116 pm a trader executed a put sweep of 1500 ARKK options with a 125 strike price expiring May 21 The trade represented a 249million bearish bet for which the trader paid 1660 per option contractAt 1115 pm a trader executed a put sweep of 819 ARKK options with a 100 strike price expiring on May 21 The trade represented a 135million bearish bet for which the trader paid 1650 per option contractAt 1115 pm a trader executed a put sweep of 1028 ARKK options with a 125 strike price expiring on May 21 The trade represented a 17million bearish bet for which the trade

# Bag of Words Vectorization

In [215]:
def identity_tokenizer(text):
    return text

In [216]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=identity_tokenizer, lowercase=False)

# fit AND transform the model (only for training data)
X_train_vectors = vectorizer.fit_transform(X_train)

# transform the test data
X_test_vectors = vectorizer.transform(X_test)

In [217]:
# Review data ouput
print(X_train_vectors.toarray())
print(X_train[0])
print(X_train_vectors[0])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['The ARKK Trades At 209 pm Thursday a trader executed a put sweep of 7238 ARKK options with a 100 strike price expiring April 16 The trade represented a 38000 bearish bet for which the trader paid 525 per option contractAt 104 pm a trader executed a put sweep of 400 ARKK options with a 110 strike price expiring on Jan 20 2023 The trade represented a 14million bearish bet for which the trader paid 26 per option contractAt 1116 pm a trader executed a put sweep of 1500 ARKK options with a 125 strike price expiring May 21 The trade represented a 249million bearish bet for which the trader paid 1660 per option contractAt 1115 pm a trader executed a put sweep of 819 ARKK options with a 100 strike price expiring on May 21 The trade represented a 135million bearish bet for which the trader paid 1650 per option contractAt 1115 pm a trader executed a put sweep of 1028 ARKK opt

# Train Several Models on Training Set

In [218]:
# Linear SVM

from sklearn import svm

clf_svm = svm.SVC(kernel="linear")

clf_svm.fit(X_train_vectors, y_train)

SVC(kernel='linear')

In [219]:
clf_svm_pred = clf_svm.predict(X_test_vectors)

In [220]:
#Analyze some predictions
print(X_test[3])
print(clf_svm_pred[3])

['Good morning investorsTo cut to the chase Im at a crossroad After a massive loss for me on GME and AMC I told myself I need to stay out of stocks for a while Well I saved 3500 to invest and I dont feel right letting it get dusty in my checking account Heres the stocks currently on my list Im open for any suggestionsI have separated my allotted money into sections thats why some relatively good stocks are at lower number figuresSPY  600VUG  450APPL  375DKNG  400CCL  400TRIP  250WH  225SPWR  200FSLR  250AXP  350These are all meant to be medium term plays Im thinking around a year taking profits at my own discretion Thanks a lot everybody'
 'Heres my list thoughts']
1


In [221]:
# Evaluate Model Accuracy
from sklearn.metrics import accuracy_score

accuracy_score(y_test, clf_svm_pred)

0.6363636363636364

In [61]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(X_train_vectors, y_train)

DecisionTreeClassifier()

In [62]:
clf_dec_pred = clf_dec.predict(X_test_vectors)

In [63]:
#Analyze some predictions
print(X_test[3])
print(clf_dec_pred[3])

["['HCDI\\n\\nQK - Chinese airbnb']"]
-1


In [41]:
# Evaluate Model Accuracy
accuracy_score(y_test, clf_dec_pred)

0.6363636363636364

In [45]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(X_train_vectors, y_train)

clf_gnb_pred = clf_gnb.predict(X_test_vectors)

In [46]:
# Evaluate Model Accuracy
accuracy_score(y_test, clf_gnb_pred)

0.6363636363636364

In [None]:
sentiment_average_df['sentiment_weight'] = comments_df['Score']
sentiment_average_df

In [None]:
sentiment_average_df['sentiment_adjusted'] = sentiment_average_df['sentiment_label'] * sentiment_average_df['sentiment_weight']

In [None]:
sentiment_average = sentiment_average_df[['Submission_ID','sentiment_adjusted']].groupby('Submission_ID').mean()

In [None]:
sentiment_average['sentiment_output'] = 0
sentiment_average.loc[sentiment_average['sentiment_adjusted'] > 0.2, 'sentiment_output'] = 1
sentiment_average.loc[sentiment_average['sentiment_adjusted'] < -0.2, 'sentiment_output'] = -1
sentiment_average.reset_index(inplace=True)
#sentiment_average = sentiment_average.set_index('Submission_ID')

In [None]:
sentiment_average

In [None]:
#combined_df = submissions_df.join(sentiment_average)
combined_df = pd.merge(submissions_df, sentiment_average, how='left', on='Submission_ID',
         left_index=False, right_index=False, sort=True,
         suffixes=('_x', '_y'), copy=True, indicator=False,
         validate=None)

In [45]:
# Account for Upvote Ratio to get adjust sentiment
for index, row in submissions_df.iterrows():
    upvote_ratio = row[11]
    sentiment_label = row[18]
    if upvote_ratio <= 0.8:
        if sentiment_label == 1:
            submission_df.at[index, "sentiment_label"] = -1
        elif sentiment_label == -1:
            submission_df.at[index, "sentiment_label"] = 1

0.107
0.12200000000000001
0.081
0.098
0.188
0.152
0.0
0.127
0.165
0.128
0.019
0.09599999999999999
0.176
0.099
0.086
0.072
0.064
0.161
0.04
0.188
0.14999999999999997
0.03900000000000001
0.152
0.0
0.10200000000000001
0.11400000000000002
0.051
0.073
0.0
0.105
0.13
0.049
0.15399999999999997
0.142
0.132
0.131
0.124
0.14
0.178
0.135
0.043000000000000003
0.046
0.10999999999999999
0.203
0.041
0.172
0.146
0.052
0.042
0
0
0
0


In [None]:
combined_df.rename(columns = {'sentiment_adjusted':'comment_sentiment'}, inplace = True)

In [None]:
combined_df['submission_sentiment'] = combined_df['sentiment_label'] * combined_df['sentiment_weight']

In [None]:
combined_df.head()

In [None]:
combined_df['combined_sentiment'] = combined_df['comment_sentiment'] + combined_df['submission_sentiment']

In [None]:
combined_df['final_sentiment_label'] = 0
combined_df.loc[combined_df['combined_sentiment'] > 0.2, 'final_sentiment_label'] = 1
combined_df.loc[combined_df['combined_sentiment'] < -0.2, 'final_sentiment_label'] = -1

In [None]:
ticker_sentiment = pd.DataFrame()

In [None]:
ticker_sentiment["Ticker"] = ""
ticker_sentiment["Total_Count"] = 0
ticker_sentiment["Total_Sentiment"] = 0

In [None]:
index = 0
for ticker in tickers:
    ticker_sentiment.at[index, "Ticker"] = ticker.abbrev
    ticker_sentiment.at[index, "Total_Count"] = 0
    ticker_sentiment.at[index, "Total_Sentiment"] = 0
    index += 1

In [None]:
submissions_df['sentiment_weight'] = submissions_df['Score'] * submissions_df['Upvote_Ratio']

In [None]:
ticker_sentiment = ticker_sentiment.drop_duplicates(subset=['Ticker'])

In [None]:
ticker_sentiment

In [None]:
# Get running count of tickers mentioned (# of times mentioned) and with average sentiment
for index, row in ticker_sentiment.iterrows():
    ticker_abbrev = row[0]
    total_count = row[1]
    total_sentiment = row[2]
    for i, r in combined_df.iterrows():
        # Get column with tickers found
        ticker_set = set(r[15])
        post_sentiment = r[26]
        if ticker_abbrev in ticker_set:
            total_count += 1
            total_sentiment += post_sentiment
    ticker_sentiment.at[index, "Total_Count"] = total_count
    ticker_sentiment.at[index, "Total_Sentiment"] = total_sentiment

In [None]:
final_sentiment_results = ticker_sentiment.loc[ticker_sentiment['Total_Count'] != 0]

In [None]:
final_sentiment_results = final_sentiment_results.reset_index(drop=True)

In [None]:
final_sentiment_results

In [None]:
final_sentiment_results["Average_Sentiment"] = final_sentiment_results["Total_Count"] / final_sentiment_results["Total_Sentiment"]

In [None]:
final_sentiment_results

In [None]:
final_sentiment_results['Average_Sentiment'].unique()

In [None]:
final_sentiment_results['sentiment_label'] = "Neutral"
final_sentiment_results.loc[final_sentiment_results['Average_Sentiment'] > 0.2, 'sentiment_label'] = "Bullish"
final_sentiment_results.loc[final_sentiment_results['Average_Sentiment'] < -0.2, 'sentiment_label'] = "Bearish"

In [None]:
# Add "today's price change" to each ticker found - total count, average sentiment, today's price change
# Include graph for visuals
import yfinance as yf
from datetime import date

In [None]:
todays_date = date.today()
todays_date

In [None]:
final_sentiment_results["Price_Change"] = ""
final_sentiment_results["Date"] = ""
for index, row in final_sentiment_results.iterrows():
    try:
        df = yf.download(row[0], start=todays_date)
        date = df.index[0]
        close_price = df["Close"].values[0]
        open_price = df["Open"].values[0]
        percentage_change = round((((close_price - open_price) / open_price) * 100), 2)
        final_sentiment_results.at[index, "Price_Change"] = f'{percentage_change}%'
        final_sentiment_results.at[index, "Date"] = date
    except:
        print("Data not found")
        final_sentiment_results.at[index, "Price_Change"] = "Ticker unavailable"
        final_sentiment_results.at[index, "Date"] = date

In [None]:
final_sentiment_results

# Test database connection and process

In [None]:
# Import SQL Alchemy
from sqlalchemy import create_engine

# Import datetime
from datetime import datetime

# Import and establish Base for which classes will be constructed 
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

In [None]:
# Create the Ticker Sentiment class
class Ticker_Sentiment(Base):
    __tablename__ = 'ticker_sentiment'
    id = Column(Integer, primary_key=True)
    ticker = Column(String(255))
    date = Column(String(255))
    count = Column(Integer)
    sentiment = Column(String)
    percent_change = Column(Float)

In [None]:
# Create a connection to a SQLite database
engine = create_engine('sqlite:///reddit_sentiment.db')

In [None]:
# Create the ticker_sentiment table within the database
Base.metadata.create_all(engine)

In [None]:
# Push the objects made and query the server
from sqlalchemy.orm import Session
session = Session(bind=engine)

In [None]:
sql_df = final_sentiment_results[["Ticker", "Date", "Total_Count", "sentiment_label", "Price_Change"]]

In [None]:
sql_df = sql_df.rename(columns={"Ticker":"ticker", "Date":"date", "Total_Count":"count", "sentiment_label":"sentiment", "Price_Change":"percent_change"})

In [None]:
sql_df

In [None]:
sql_df.to_sql('ticker_sentiment', con=engine, if_exists='append', index=False)

In [None]:
from sqlalchemy.ext.automap import automap_base
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

# Save reference to the table
sentiment = Base.classes.ticker_sentiment

In [None]:
session = Session(engine)

results = session.query(sentiment.ticker).all()

session.close()

In [None]:
import numpy as np
sentiment = list(np.ravel(results))

In [None]:
print(sentiment)