# Import Initial Dependencies

In [12]:
from tickers import Ticker, scrape_tickers, query_list, recent_IPO_list, upcoming_IPO_list
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from pprint import pprint
import random 

In [13]:
#nltk.download("stopwords")
#nltk.download('punkt')
#nltk.download('vader_lexicon')

# Load Data

In [14]:
# Scrape to get updated tickers and company names for stocks and ETFs
scrape_tickers()

import_lists = [query_list, recent_IPO_list, upcoming_IPO_list]

tickers = []

# Build raw query list with ticker abbrevation and name of company
for i in import_lists:
    for item in i:
        tickers.append(item)

In [15]:
# Load in data for testing - will be replace with dataframes when in production
submissions_df = pd.read_csv('submissions.csv')
comments_df = pd.read_csv('comments.csv')

# Data Pre-Processing

### Remove Non-Alphanumeric Characters

In [16]:
import re
 
def scrub_data(text):
    clean = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    return clean

In [17]:
# Clean submission body and title
submissions_df["Clean_Body"] = submissions_df.Body.apply(scrub_data)
submissions_df["Clean_Title"] = submissions_df.Title.apply(scrub_data)

In [18]:
# Clean comments
comments_df["Clean_Body"] = comments_df.Body.apply(scrub_data)

### Tokenize Data

In [19]:
# Create Function to Handle Errors During Tokenization
def tokenize_text(text):
    if not text:
        print("Text cannot be tokenize due to type errors.")
        text = ""
    text = text.lower()
    return nltk.word_tokenize(text)

In [20]:
# Tokenize submission body and title
submissions_df["Body_Tokens"] = submissions_df.Clean_Body.apply(tokenize_text)
submissions_df["Title_Tokens"] = submissions_df.Clean_Title.apply(tokenize_text)

In [21]:
# Tokenize comments
comments_df["Body_Tokens"] = comments_df.Clean_Body.apply(tokenize_text)

Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to type errors.
Text cannot be tokenize due to t

### Remove stop words

In [22]:
# Remove defined stop words from company names to prevent inaccurate labeling
common_words = ['Corp', 'Corporation', 'Ltd', 'Acquisition', 'Pharmaceuticals', 'Holding', 'Group']
new_name = ""
ticker_names = []
for ticker in tickers:
    new_name = ""
    company_words = ticker.name.split()
    for word in company_words:
        if word in common_words:
            continue
        else:
            if new_name == "":
                new_name = word
            else:
                new_name = new_name + " " + word
    ticker.name = new_name

In [23]:
stop_words = list(set(stopwords.words("english")))
def remove_stop_words(text):
    if not text:
        print("Type errors.")
        text = ""
    return [w for w in text if w not in stop_words]

In [24]:
submissions_df["Clean_Body_Tokens"] = submissions_df.Body_Tokens.apply(remove_stop_words)
submissions_df["Clean_Title_Tokens"] = submissions_df.Title_Tokens.apply(remove_stop_words)

In [25]:
# Create TF-IDF matrix using SciKit-Learn
#from sklearn.feature_extraction.text import TfidfVectorizer
# Create instance of Vectorizer
#vectorizer = TfidfVectorizer()
#tfidf = vectorizer.fit_transform(submissions_tokens)

In [26]:
# convert sentences to array of words
#submissions_df['Body_Words'] = submissions_df.Body.str.replace("'", "").str.strip().str.split('[\W_]+')
#submissions_df['Title_Words'] = submissions_df.Title.str.replace("'", "").str.strip().str.split('[\W_]+')

### Find Tickers Mentioned

In [27]:
# Tag all tickers found in array of words for each post
submissions_df["Tickers"] = ""

def tokenize_tickers(text):
    if not text:
        print("Text cannot be tokenize due to type errors.")
        text = ""
    return nltk.word_tokenize(text)

for index, row in submissions_df.iterrows():
    body = row[4]
    title = row[2]
    body_tokens = tokenize_tickers(body)
    title_tokens = tokenize_tickers(title)
    tickers_found = []
    body_set = set(body_tokens)
    title_set = set(title_tokens)
    for ticker in tickers:
        if ticker.abbrev == 'A' or ticker.abbrev == 'I':
            if (ticker.name in body_set) or (ticker.name in title_set):
                tickers_found.append(ticker.abbrev)
        else:
            if (ticker.abbrev in body_set) or (ticker.abbrev in title_set):
                if ticker.abbrev not in tickers_found:
                    tickers_found.append(ticker.abbrev)
            elif (ticker.name in body_set) or (ticker.name in title_set):
                if ticker.abbrev not in tickers_found:
                    tickers_found.append(ticker.abbrev)
    if len(tickers_found) == 0:
        submissions_df.at[index, "Tickers"] = ""
    else:
        submissions_df.at[index, "Tickers"]= tickers_found

In [28]:
submissions_df

Unnamed: 0,Post_Type,Submission_ID,Title,Author,Body,Flair,Distinguished,Num_Comments,Post_ID,URL,Score,Upvote_Ratio,Created_Date_UTC,Clean_Body,Clean_Title,Body_Tokens,Title_Tokens,Clean_Body_Tokens,Clean_Title_Tokens,Tickers
0,Submission,n7eybf,96% of US users opt out of app tracking in iOS...,Karnes,"""It seems that in the United States, at least,...",Industry Discussion,,1635,t3_n7eybf,https://www.reddit.com/r/stocks/comments/n7eyb...,14318,0.97,[1620439925.0],It seems that in the United States at least ap...,96 of US users opt out of app tracking in iOS ...,"[it, seems, that, in, the, united, states, at,...","[96, of, us, users, opt, out, of, app, trackin...","[seems, united, states, least, app, developers...","[96, us, users, opt, app, tracking, ios, 145, ...","[AAPL, CEO, FB, ID, TV]"
1,Submission,nqk7qi,AMC to Offer Free Popcorn and Exclusive Screen...,rgujjula-csdude,Full Article Here: [https://www.cnbc.com/2021/...,,,1009,t3_nqk7qi,https://www.reddit.com/r/stocks/comments/nqk7q...,13764,0.92,[1622635400.0],Full Article Here httpswwwcnbccom20210602amcpl...,AMC to Offer Free Popcorn and Exclusive Screen...,"[full, article, here, httpswwwcnbccom20210602a...","[amc, to, offer, free, popcorn, and, exclusive...","[full, article, httpswwwcnbccom20210602amcplan...","[amc, offer, free, popcorn, exclusive, screeni...","[AMC, AONE, CEO, TWTR]"
2,Submission,n9a2nc,"Chipotle to hike wages, debut referral bonuses...",Brothanogood,Chipotle said it will increase restaurant wage...,Company News,,1665,t3_n9a2nc,https://www.reddit.com/r/stocks/comments/n9a2n...,12772,0.95,[1620665963.0],Chipotle said it will increase restaurant wage...,Chipotle to hike wages debut referral bonuses ...,"[chipotle, said, it, will, increase, restauran...","[chipotle, to, hike, wages, debut, referral, b...","[chipotle, said, increase, restaurant, wages, ...","[chipotle, hike, wages, debut, referral, bonus...",[CEO]
3,Submission,nanrlu,I analyzed 9000+ trades made by Members of the...,nobjos,**Preamble:** The ability of Congress to trade...,Meta,,369,t3_nanrlu,https://www.reddit.com/r/stocks/comments/nanrl...,7970,0.97,[1620823410.0],Preamble The ability of Congress to trade stoc...,I analyzed 9000 trades made by Members of the ...,"[preamble, the, ability, of, congress, to, tra...","[i, analyzed, 9000, trades, made, by, members,...","[preamble, ability, congress, trade, stocks, c...","[analyzed, 9000, trades, made, members, us, co...","[AONE, S, TWOA]"
4,Submission,nrou4z,"With wood prices so high, curiosity struck me....",chumbawamba56,Wood is crazy expensive right now. and most se...,Company Analysis,,1037,t3_nrou4z,https://www.reddit.com/r/stocks/comments/nrou4...,5330,0.95,[1622756602.0],Wood is crazy expensive right now and most see...,With wood prices so high curiosity struck me W...,"[wood, is, crazy, expensive, right, now, and, ...","[with, wood, prices, so, high, curiosity, stru...","[wood, crazy, expensive, right, seem, believe,...","[wood, prices, high, curiosity, struck, wood, ...","[AONE, DB, G, GP, PT, WY]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Submission,n98qqa,Arbitrage opportunity with 6 months timeframe,vasesimi,"\n\nHi all,\n\nI've been a lurker and got som...",Company News,,24,t3_n98qqa,https://www.reddit.com/r/stocks/comments/n98qq...,4,0.67,[1620662665.0],Hi allIve been a lurker and got some nice ide...,Arbitrage opportunity with 6 months timeframe,"[hi, allive, been, a, lurker, and, got, some, ...","[arbitrage, opportunity, with, 6, months, time...","[hi, allive, lurker, got, nice, ideas, form, t...","[arbitrage, opportunity, 6, months, timeframe]",
996,Submission,n93thn,Gasoline prices in relation to futures with on...,AIONisMINE,I dont follow oil stock. Much less futures in ...,,,1,t3_n93thn,https://www.reddit.com/r/stocks/comments/n93th...,5,0.73,[1620651481.0],I dont follow oil stock Much less futures in g...,Gasoline prices in relation to futures with on...,"[i, dont, follow, oil, stock, much, less, futu...","[gasoline, prices, in, relation, to, futures, ...","[dont, follow, oil, stock, much, less, futures...","[gasoline, prices, relation, futures, ongoing,...",
997,Submission,n7yug8,Anyone has an HSA account where they invest?,Dowdell2008,I guess the title says it all. I have one but ...,,,20,t3_n7yug8,https://www.reddit.com/r/stocks/comments/n7yug...,5,0.78,[1620509027.0],I guess the title says it all I have one but i...,Anyone has an HSA account where they invest,"[i, guess, the, title, says, it, all, i, have,...","[anyone, has, an, hsa, account, where, they, i...","[guess, title, says, one, bmo, doesnt, investm...","[anyone, hsa, account, invest]","[BMO, EDIT]"
998,Submission,n7rqy5,How to buy stocks like Curaleaf?,Sevendevils777,I’m having difficulty being able to find platf...,Advice Request,,23,t3_n7rqy5,https://www.reddit.com/r/stocks/comments/n7rqy...,7,0.74,[1620488192.0],Im having difficulty being able to find platfo...,How to buy stocks like Curaleaf,"[im, having, difficulty, being, able, to, find...","[how, to, buy, stocks, like, curaleaf]","[im, difficulty, able, find, platforms, certai...","[buy, stocks, like, curaleaf]",


In [29]:
# Remove submissions without any mention of tickers - unable to interpret relevancy of sentiment
submissions_df = submissions_df.loc[submissions_df['Tickers'] != ""]

In [30]:
#submissions_loop = submissions_df[["Submission_ID", "Tickers"]]

In [31]:
#comments_df['Tickers'] = ""
#for index_c, row_c in comments_df.iterrows():
   # for index_s, row_s in submissions_loop.iterrows():
        #if row_c[1] == row_s[0]:
             #comments_df.at[index_c, 'Tickers'] = row_s[1]

In [32]:
#comments_df = comments_df.loc[comments_df["Tickers"] != ""]

# Perform Sentiment Analysis Using NLTK

In [33]:
# Create function to perform sentiment analysis
def sent_analyzer(df, body_index, title_index = None, upvote_score_index = None):
    sia = SIA()
    results = []
    df['negative_score'] = ""
    df['neutral_score'] = ""
    df['positive_score'] = ""
    df['compound_score'] = ""
    df['sentiment_label'] = 0
    for index, row in df.iterrows():
        body = row[body_index]
        if title_index is not None:
            title = row[title_index]
            eval_text = title + " " + body
        else:
            eval_text = body
        pol_score= sia.polarity_scores(eval_text)
        df.at[index, 'negative_score'] = pol_score['neg']
        df.at[index, 'neutral_score'] = pol_score['neu']
        df.at[index, 'positive_score'] = pol_score['pos']
        df.at[index, 'compound_score'] = pol_score['compound']
        if upvote_score_index is not None:
            weighted_compound = row[upvote_score_index] * pol_score['compound']
            if weighted_compound > 0.3:
                df.at[index,'sentiment_label'] = 1
            elif weighted_compound < 0.3:
                df.at[index,'sentiment_label'] = -1
            else:
                df.at[index,'sentiment_label'] = 0
        else:
            if pol_score['compound'] > 0.3:
                df.at[index,'sentiment_label'] = 1
            elif pol_score['compound'] < 0.3:
                df.at[index,'sentiment_label'] = -1
            else:
                df.at[index,'sentiment_label'] = 0
    return df

In [34]:
sent_analyzer(submissions_df, 13, 14, 11)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

Unnamed: 0,Post_Type,Submission_ID,Title,Author,Body,Flair,Distinguished,Num_Comments,Post_ID,URL,...,Body_Tokens,Title_Tokens,Clean_Body_Tokens,Clean_Title_Tokens,Tickers,negative_score,neutral_score,positive_score,compound_score,sentiment_label
0,Submission,n7eybf,96% of US users opt out of app tracking in iOS...,Karnes,"""It seems that in the United States, at least,...",Industry Discussion,,1635,t3_n7eybf,https://www.reddit.com/r/stocks/comments/n7eyb...,...,"[it, seems, that, in, the, united, states, at,...","[96, of, us, users, opt, out, of, app, trackin...","[seems, united, states, least, app, developers...","[96, us, users, opt, app, tracking, ios, 145, ...","[AAPL, CEO, FB, ID, TV]",0.091,0.867,0.042,-0.9424,-1
1,Submission,nqk7qi,AMC to Offer Free Popcorn and Exclusive Screen...,rgujjula-csdude,Full Article Here: [https://www.cnbc.com/2021/...,,,1009,t3_nqk7qi,https://www.reddit.com/r/stocks/comments/nqk7q...,...,"[full, article, here, httpswwwcnbccom20210602a...","[amc, to, offer, free, popcorn, and, exclusive...","[full, article, httpswwwcnbccom20210602amcplan...","[amc, offer, free, popcorn, exclusive, screeni...","[AMC, AONE, CEO, TWTR]",0.005,0.821,0.174,0.9954,1
2,Submission,n9a2nc,"Chipotle to hike wages, debut referral bonuses...",Brothanogood,Chipotle said it will increase restaurant wage...,Company News,,1665,t3_n9a2nc,https://www.reddit.com/r/stocks/comments/n9a2n...,...,"[chipotle, said, it, will, increase, restauran...","[chipotle, to, hike, wages, debut, referral, b...","[chipotle, said, increase, restaurant, wages, ...","[chipotle, hike, wages, debut, referral, bonus...",[CEO],0.027,0.86,0.114,0.9798,1
3,Submission,nanrlu,I analyzed 9000+ trades made by Members of the...,nobjos,**Preamble:** The ability of Congress to trade...,Meta,,369,t3_nanrlu,https://www.reddit.com/r/stocks/comments/nanrl...,...,"[preamble, the, ability, of, congress, to, tra...","[i, analyzed, 9000, trades, made, by, members,...","[preamble, ability, congress, trade, stocks, c...","[analyzed, 9000, trades, made, members, us, co...","[AONE, S, TWOA]",0.032,0.892,0.076,0.991,1
4,Submission,nrou4z,"With wood prices so high, curiosity struck me....",chumbawamba56,Wood is crazy expensive right now. and most se...,Company Analysis,,1037,t3_nrou4z,https://www.reddit.com/r/stocks/comments/nrou4...,...,"[wood, is, crazy, expensive, right, now, and, ...","[with, wood, prices, so, high, curiosity, stru...","[wood, crazy, expensive, right, seem, believe,...","[wood, prices, high, curiosity, struck, wood, ...","[AONE, DB, G, GP, PT, WY]",0.052,0.815,0.133,0.9976,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,Submission,na2jnb,Mattel options short interest,QuantumVibing,I just noticed on E\*trade that open interest ...,,,9,t3_na2jnb,https://www.reddit.com/r/stocks/comments/na2jn...,...,"[i, just, noticed, on, etrade, that, open, int...","[mattel, options, short, interest]","[noticed, etrade, open, interest, put, options...","[mattel, options, short, interest]",[MAT],0.057,0.753,0.19,0.8858,1
993,Submission,na0826,Wash Sale question,PosterOfStinkyShits,"So I’m pretty dang new to trading, this mornin...",,,4,t3_na0826,https://www.reddit.com/r/stocks/comments/na082...,...,"[so, im, pretty, dang, new, to, trading, this,...","[wash, sale, question]","[im, pretty, dang, new, trading, morning, woke...","[wash, sale, question]",[IRS],0.037,0.854,0.108,0.8261,1
994,Submission,n9i81t,Where can I get real time quotes and charts fo...,Virus4762,I know that pretty much every broker has live ...,,,3,t3_n9i81t,https://www.reddit.com/r/stocks/comments/n9i81...,...,"[i, know, that, pretty, much, every, broker, h...","[where, can, i, get, real, time, quotes, and, ...","[know, pretty, much, every, broker, live, quot...","[get, real, time, quotes, charts, obscure, fut...",[WTI],0,0.833,0.167,0.8126,1
997,Submission,n7yug8,Anyone has an HSA account where they invest?,Dowdell2008,I guess the title says it all. I have one but ...,,,20,t3_n7yug8,https://www.reddit.com/r/stocks/comments/n7yug...,...,"[i, guess, the, title, says, it, all, i, have,...","[anyone, has, an, hsa, account, where, they, i...","[guess, title, says, one, bmo, doesnt, investm...","[anyone, hsa, account, invest]","[BMO, EDIT]",0,0.977,0.023,0.1154,-1


In [35]:
#sent_analyzer(comments_df, 5)

In [64]:
#submissions_sentiment = submissions_df[["Body", "Title", "sentiment_label"]]
submissions_sentiment = submissions_df[["Body", "sentiment_label"]]

In [65]:
#comment_sentiment = comments_df[["Body", "sentiment_label"]]

In [66]:
#dataset = submissions_df[["Body", "Title", "sentiment_label"]]
dataset = submissions_sentiment

In [69]:
dataset = dataset.reset_index(drop=True)

In [70]:
dataset

Unnamed: 0,Body,sentiment_label
0,"""It seems that in the United States, at least,...",-1
1,Full Article Here: [https://www.cnbc.com/2021/...,1
2,Chipotle said it will increase restaurant wage...,1
3,**Preamble:** The ability of Congress to trade...,1
4,Wood is crazy expensive right now. and most se...,1
...,...,...
723,I just noticed on E\*trade that open interest ...,1
724,"So I’m pretty dang new to trading, this mornin...",1
725,I know that pretty much every broker has live ...,1
726,I guess the title says it all. I have one but ...,-1


In [75]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 728):
  submission = re.sub('[^a-zA-Z]', ' ', dataset['Body'][i])
  submission = submission.lower()
  submission = submission.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  submission = [ps.stem(word) for word in submission if not word in set(all_stopwords)]
  submission = ' '.join(submission)
  corpus.append(submission)

In [76]:
print(corpus)

['seem unit state least app develop advertis reli target mobil advertis revenu see worst fear realiz analyt data publish week suggest us user choos opt track percent time wake io appl releas io late last month began enforc polici call app track transpar iphon ipad appl tv app requir request user permiss use techniqu like idfa id advertis track user activ across multipl app data collect ad target purpos chang met fierc resist compani like facebook whose market advantag revenu stream built leverag user data target effect ad user facebook went far take full page newspap ad claim chang would not hurt facebook would destroy small busi around world shortli appl ceo tim cook attend data privaci confer deliv speech harshli critic facebook busi model nonetheless facebook other compli appl new rule avoid reject iphon app store though app present screen explain user opt appl mandat prompt opt appear sourc http arstechnica com gadget us user opt app track io analyt find', 'full articl http www cnb

# Bag of Words Vectorization

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

# Split Data into Training and Test Sets

In [78]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify=y)

In [79]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 4, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [80]:
# Review data ouput
print(X_train[0])

[0 0 0 ... 0 0 0]


In [81]:
print(y_train[0])

-1


In [82]:
print(y_test)

[ 1  1  1  1 -1  1  1  1 -1  1  1  1  1  1  1 -1  1  1  1  1 -1  1  1  1
  1  1  1  1 -1 -1  1  1  1  1  1  1  1 -1  1  1 -1  1  1  1 -1 -1  1 -1
  1  1  1  1  1  1  1 -1  1  1 -1  1  1 -1  1  1  1  1  1  1  1  1 -1  1
  1  1  1  1  1 -1  1  1  1  1 -1 -1 -1  1  1  1 -1  1 -1  1  1  1  1 -1
  1  1  1 -1  1 -1  1  1  1 -1  1 -1 -1  1  1  1 -1  1  1  1 -1  1  1  1
  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1 -1  1  1 -1  1  1  1  1  1  1
 -1  1]


# Train Several Models with Training Set

In [101]:
# Linear SVM

from sklearn import svm

clf_svm = svm.SVC(kernel="linear")

clf_svm.fit(X_train, y_train)

SVC(kernel='linear')

In [102]:
clf_svm_pred = clf_svm.predict(X_test)

In [103]:
#Analyze some predictions
print(X_test[3])
print(clf_svm_pred[3])

[0 0 0 ... 0 0 0]
1


In [104]:
# Evaluate Model Accuracy
from sklearn.metrics import accuracy_score

accuracy_score(y_test, clf_svm_pred)

0.7876712328767124

In [105]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(X_train, y_train)

DecisionTreeClassifier()

In [106]:
clf_dec_pred = clf_dec.predict(X_test)

In [107]:
#Analyze some predictions
print(X_test[3])
print(clf_dec_pred[3])

[0 0 0 ... 0 0 0]
1


In [108]:
# Evaluate Model Accuracy
accuracy_score(y_test, clf_dec_pred)

0.636986301369863

In [109]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()
clf_gnb.fit(X_train, y_train)

GaussianNB()

In [110]:
clf_gnb_pred = clf_gnb.predict(X_test)

In [111]:
print(clf_gnb_pred)

[ 1  1  1  1  1  1  1  1  1 -1 -1  1 -1  1 -1  1  1  1  1  1  1 -1  1 -1
  1  1 -1  1 -1  1  1  1  1  1  1 -1 -1 -1  1 -1  1  1 -1  1  1  1 -1 -1
  1  1 -1  1  1  1 -1 -1  1  1  1  1  1 -1 -1  1  1  1 -1  1 -1  1  1  1
 -1 -1  1  1 -1  1  1  1  1  1  1 -1  1  1 -1 -1 -1 -1 -1  1  1  1 -1 -1
  1  1 -1  1  1 -1  1  1 -1 -1 -1  1 -1  1  1  1 -1  1 -1  1 -1  1  1  1
  1  1  1  1 -1  1  1  1 -1 -1 -1  1 -1 -1  1 -1  1 -1  1  1  1  1 -1  1
  1 -1]


In [112]:
# Evaluate Model Accuracy
accuracy_score(y_test, clf_gnb_pred)

0.6164383561643836

In [113]:
#combined_df = submissions_df.join(sentiment_average)
#combined_df = pd.merge(submissions_df, sentiment_average, how='left', on='Submission_ID',
         #left_index=False, right_index=False, sort=True,
         #suffixes=('_x', '_y'), copy=True, indicator=False,
         #validate=None)

In [115]:
# Random Forest 
from sklearn.ensemble import RandomForestClassifier
clf_ran = RandomForestClassifier(max_depth=2, random_state=0)
clf_ran.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [116]:
clf_ran_pred = clf_ran.predict(X_test)

In [117]:
# Evaluate Model Accuracy
accuracy_score(y_test, clf_ran_pred)

0.7671232876712328

In [118]:
# MLP Classifier model (built in neural network)
from sklearn.neural_network import MLPClassifier
clf_mlp = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [119]:
clf_mlp.fit(X_train, y_train)

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(300,),
              learning_rate='adaptive', max_iter=500)

In [120]:
clf_mlp_pred = clf_mlp.predict(X_test)

In [121]:
# Evaluate Model Accuracy
accuracy_score(y_test, clf_mlp_pred)

0.8356164383561644

### The MLP Classifier Model has the best accuracy score, so we will continue with this method.

In [None]:
ticker_sentiment = pd.DataFrame()

In [None]:
ticker_sentiment["Ticker"] = ""
ticker_sentiment["Total_Count"] = 0
ticker_sentiment["Total_Sentiment"] = 0

In [None]:
index = 0
for ticker in tickers:
    ticker_sentiment.at[index, "Ticker"] = ticker.abbrev
    ticker_sentiment.at[index, "Total_Count"] = 0
    ticker_sentiment.at[index, "Total_Sentiment"] = 0
    index += 1

In [None]:
ticker_sentiment = ticker_sentiment.drop_duplicates(subset=['Ticker'])

In [None]:
ticker_sentiment

In [None]:
# Get running count of tickers mentioned (# of times mentioned) and with average sentiment
for index, row in ticker_sentiment.iterrows():
    ticker_abbrev = row[0]
    total_count = row[1]
    total_sentiment = row[2]
    for i, r in submissions_df.iterrows():
        # Get column with tickers found
        ticker_set = set(r[19])
        post_sentiment = r[24]
        if ticker_abbrev in ticker_set:
            total_count += 1
            total_sentiment += post_sentiment
    ticker_sentiment.at[index, "Total_Count"] = total_count
    ticker_sentiment.at[index, "Total_Sentiment"] = total_sentiment

In [None]:
final_sentiment_results = ticker_sentiment.loc[ticker_sentiment['Total_Count'] != 0]

In [None]:
final_sentiment_results = final_sentiment_results.reset_index(drop=True)

In [None]:
final_sentiment_results

In [None]:
final_sentiment_results["Average_Sentiment"] = final_sentiment_results["Total_Count"] / final_sentiment_results["Total_Sentiment"]

In [None]:
final_sentiment_results

In [None]:
final_sentiment_results['Average_Sentiment'].unique()

In [None]:
final_sentiment_results['sentiment_label'] = "Neutral"
final_sentiment_results.loc[final_sentiment_results['Average_Sentiment'] > 0.2, 'sentiment_label'] = "Bullish"
final_sentiment_results.loc[final_sentiment_results['Average_Sentiment'] < -0.2, 'sentiment_label'] = "Bearish"

In [None]:
# Add "today's price change" to each ticker found - total count, average sentiment, today's price change
# Include graph for visuals
import yfinance as yf
from datetime import date

In [None]:
todays_date = date.today()
todays_date

In [None]:
final_sentiment_results["Price_Change"] = ""
final_sentiment_results["Date"] = ""
for index, row in final_sentiment_results.iterrows():
    try:
        df = yf.download(row[0], start=todays_date)
        date = df.index[0]
        close_price = df["Close"].values[0]
        open_price = df["Open"].values[0]
        percentage_change = round((((close_price - open_price) / open_price) * 100), 2)
        final_sentiment_results.at[index, "Price_Change"] = f'{percentage_change}%'
        final_sentiment_results.at[index, "Date"] = date
    except:
        print("Data not found")
        final_sentiment_results.at[index, "Price_Change"] = "Ticker unavailable"
        final_sentiment_results.at[index, "Date"] = date

In [None]:
final_sentiment_results

# Test database connection and process

In [None]:
# Import SQL Alchemy
from sqlalchemy import create_engine

# Import datetime
from datetime import datetime

# Import and establish Base for which classes will be constructed 
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Float

In [None]:
# Create the Ticker Sentiment class
class Ticker_Sentiment(Base):
    __tablename__ = 'ticker_sentiment'
    id = Column(Integer, primary_key=True)
    ticker = Column(String(255))
    date = Column(String(255))
    count = Column(Integer)
    sentiment = Column(String)
    percent_change = Column(Float)

In [None]:
# Create a connection to a SQLite database
engine = create_engine('sqlite:///reddit_sentiment.db')

In [None]:
# Create the ticker_sentiment table within the database
Base.metadata.create_all(engine)

In [None]:
# Push the objects made and query the server
from sqlalchemy.orm import Session
session = Session(bind=engine)

In [None]:
sql_df = final_sentiment_results[["Ticker", "Date", "Total_Count", "sentiment_label", "Price_Change"]]

In [None]:
sql_df = sql_df.rename(columns={"Ticker":"ticker", "Date":"date", "Total_Count":"count", "sentiment_label":"sentiment", "Price_Change":"percent_change"})

In [None]:
sql_df

In [None]:
sql_df.to_sql('ticker_sentiment', con=engine, if_exists='append', index=False)

In [None]:
from sqlalchemy.ext.automap import automap_base
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

# Save reference to the table
sentiment = Base.classes.ticker_sentiment

In [None]:
session = Session(engine)

results = session.query(sentiment.ticker).all()

session.close()

In [None]:
import numpy as np
sentiment = list(np.ravel(results))

In [None]:
print(sentiment)