In [1]:
import pandas as pd

from DataPlug import DataPlug as dp
from SqueezeNet import SqueezeNet as sq

# Util Imports
import time # timer for sentiment analysis
from tqdm import tqdm # progress bar for sentiment analysis

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jackk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Instantiate the two project helpers used throughout the notebook:
#   squeeze  - provides clean_text() and sentiment_analysis() for post titles
#   dataPlug - holds the post/price dataframes (df, mergedDF) and the merge helpers
# NOTE(review): the string argument looks like an instance name - confirm against the class constructors.
squeeze = sq("SqueezeNet")
dataPlug = dp("DataPlug")


In [3]:
# Load the raw r/wallstreetbets post dump, pinning the dtype of every column we read.
df = pd.read_csv(
    '../data/r_wallstreetbets_posts.csv',
    dtype={
        'id': 'str',
        'title': 'str',
        'score': 'int',
        'author': 'str',
        'author_flair_text': 'str',
        'removed_by': 'str',
        'total_awards_received': 'str',
        'awarders': 'str',
        'created_utc': 'int',
        'full_link': 'str',
        'num_comments': 'int',
        'over_18': 'bool',
    },
)

# created_utc is epoch seconds; convert it and truncate to the calendar day so
# posts can later be grouped per day.
created_at = pd.to_datetime(df['created_utc'], unit='s')
df['timestamp'] = created_at.astype('datetime64[ns]').dt.floor('D')

# Keep only title, score, num_comments and timestamp.
df = df.drop(
    columns=[
        'author',
        'created_utc',
        'id',
        'author_flair_text',
        'removed_by',
        'awarders',
        'full_link',
        'over_18',
        'total_awards_received',
    ]
)

# Force every title to a plain Python string (missing titles become 'nan').
df['title'] = df['title'].map(str)

display(df)


Unnamed: 0,title,score,num_comments,timestamp
0,Whats going on with PLTR?,1,2,2021-02-16
1,"Need explanations on Level 2 data for GME, why...",1,2,2021-02-16
2,XRT is being used as a laundry short machine,1,2,2021-02-16
3,Airlines?,1,2,2021-02-16
4,Buy TRXC 🚀,1,2,2021-02-16
...,...,...,...,...
1118858,"EBAY posts higher 1Q net income and revenue, s...",7,4,2012-04-19
1118859,Anyone betting on VVUS and their potential app...,1,0,2012-04-17
1118860,My poorly timed opening position for AAPL earn...,12,21,2012-04-16
1118861,"GOOG - beat estimates, price barely rises.",2,0,2012-04-12


In [None]:
# Per-day aggregation spec: collect every title of the day into a list and
# average the numeric columns.
agg_func = {
    'title': list,
    'score': 'mean',
    'num_comments': 'mean',
}

# One row per calendar day.
df_new = df.groupby(df['timestamp']).agg(agg_func)
display(df_new)

# Collapse each day's list of titles into a single '||'-delimited string so
# the whole day can be handled as one document.
df_new['title'] = df_new['title'].map('||'.join)

display(df_new)

In [4]:
# Hand the cleaned per-post frame to the DataPlug helper; later cells read it
# back through dataPlug.df.
dataPlug.df = df

# Number of distinct values in each column (quick sanity check on the load).
print(dataPlug.df.nunique())

display(dataPlug.df)

title           1015951
score              3385
num_comments       2913
timestamp          3020
dtype: int64


Unnamed: 0,title,score,num_comments,timestamp
0,Whats going on with PLTR?,1,2,2021-02-16
1,"Need explanations on Level 2 data for GME, why...",1,2,2021-02-16
2,XRT is being used as a laundry short machine,1,2,2021-02-16
3,Airlines?,1,2,2021-02-16
4,Buy TRXC 🚀,1,2,2021-02-16
...,...,...,...,...
1118858,"EBAY posts higher 1Q net income and revenue, s...",7,4,2012-04-19
1118859,Anyone betting on VVUS and their potential app...,1,0,2012-04-17
1118860,My poorly timed opening position for AAPL earn...,12,21,2012-04-16
1118861,"GOOG - beat estimates, price barely rises.",2,0,2012-04-12


In [5]:
# Ask DataPlug for its price dataframe; as the last expression of the cell,
# whatever the call returns is rendered as the cell output.
# NOTE(review): the ticker and data source are decided inside DataPlug - confirm there.
dataPlug.get_price_dataframe()

Unnamed: 0,timestamp,Open,High,Low,Close,Adj Close,Volume
0,2012-04-11,5.330000,5.380000,5.2350,5.3175,3.603972,19562000
1,2012-04-12,5.325000,5.427500,5.3125,5.3900,3.653110,8414800
2,2012-04-13,5.375000,5.375000,5.2800,5.3075,3.597196,8956000
3,2012-04-16,5.325000,5.452500,5.2425,5.4250,3.676833,13710000
4,2012-04-17,5.455000,5.550000,5.4500,5.5375,3.753081,16022400
...,...,...,...,...,...,...,...
2221,2021-02-08,18.102501,18.165001,14.5050,15.0000,15.000000,102749200
2222,2021-02-09,14.152500,14.250000,11.6300,12.5775,12.577500,107372400
2223,2021-02-10,12.692500,15.707500,11.6375,12.8000,12.800000,145820000
2224,2021-02-11,12.502500,13.830000,12.0550,12.7750,12.775000,52226800


In [None]:
# Merge the dataframes held by DataPlug; the result is read back from
# dataPlug.mergedDF below.
# NOTE(review): presumably joins posts and prices on `timestamp` - confirm in DataPlug.merge_dataframes.
dataPlug.merge_dataframes()


display(dataPlug.mergedDF)

In [None]:
# Inspect the post dataframe currently held by DataPlug.
dataPlug.df

In [6]:
# Score every post title with the sentiment helper, collecting one value per
# post for each of the four score components ('pos', 'neg', 'neu', 'compound').
positive = []
negative = []
neutral = []
compound = []

begin = time.time()  # timer for the entire process

# Iterate over the title values themselves instead of looking rows up with
# `df['title'][i]`. That lookup is label-based, so it only works while the
# index is the default 0..n-1 range and breaks (or reads the wrong row) once
# the frame has been filtered, re-indexed, or replaced by the per-day
# aggregate that is indexed by timestamp.
for text in tqdm(dataPlug.df['title']):
    clean_text = squeeze.clean_text(text)

    score = squeeze.sentiment_analysis(clean_text)
    # Append in lockstep so all four lists stay row-aligned with the frame.
    positive.append(score['pos'])
    negative.append(score['neg'])
    neutral.append(score['neu'])
    compound.append(score['compound'])

end = time.time()  # end timer for the entire process
print(f"Sentiment analysis took {end - begin:0.4f} seconds total")



  5%|▌         | 57953/1118863 [00:04<01:15, 13970.96it/s]


KeyboardInterrupt: 

In [None]:
# Attach the four sentiment score lists to the post dataframe and cache the
# result on disk so the slow scoring cell does not have to be re-run.

# Listed in the left-to-right order the columns should have after `title`.
sentiment_columns = {
    "Positive_Sentiment": positive,
    "Neutral_Sentiment": neutral,
    "Negative_Sentiment": negative,
    "Compound_Sentiment": compound,
}

# Validate everything before touching the frame so a failure never leaves it
# half-updated.
already_present = [name for name in sentiment_columns if name in dataPlug.df.columns]
if already_present:
    # Re-running this cell used to add duplicate columns silently.
    raise ValueError(f"Sentiment columns already present in dataPlug.df: {already_present}")

row_count = len(dataPlug.df)
for name, values in sentiment_columns.items():
    if len(values) != row_count:
        # Happens when the scoring loop was interrupted before finishing.
        raise ValueError(
            f"{name} has {len(values)} scores but dataPlug.df has {row_count} rows; "
            "re-run the sentiment analysis cell to completion first"
        )

# Every insert goes to position 1 (right after `title`) and pushes earlier
# inserts to the right, so insert in reverse of the desired final order.
for name, values in reversed(list(sentiment_columns.items())):
    dataPlug.df.insert(1, name, values, allow_duplicates=False)
print('Done!')

display(dataPlug.df)
# The index is written as the first CSV column; the reload cell drops it again.
dataPlug.df.to_csv('../data/all_posts_with_sentiment.csv')
print('saved to ../data/all_posts_with_sentiment.csv')

In [18]:
# Reload the cached per-post sentiment scores. The first CSV column is the
# saved index, which pandas names 'Unnamed: 0'; discard it.
df = pd.read_csv('../data/all_posts_with_sentiment.csv').drop(columns='Unnamed: 0')

# Titles must be plain strings for the '||' join performed later.
df['title'] = df['title'].map(str)

# Timestamps come back from CSV as text; parse them and keep day resolution.
parsed_stamps = df['timestamp'].astype('datetime64[ns]')
df['timestamp'] = parsed_stamps.dt.floor('D')

display(df)

Unnamed: 0,title,Positive_Sentiment,Neutral_Sentiment,Negative_Sentiment,Compound_Sentiment,score,num_comments,timestamp
0,Whats going on with PLTR?,0.000,1.000,0.0,0.0000,1,2,2021-02-16
1,"Need explanations on Level 2 data for GME, why...",0.000,1.000,0.0,0.0000,1,2,2021-02-16
2,XRT is being used as a laundry short machine,0.000,1.000,0.0,0.0000,1,2,2021-02-16
3,Airlines?,0.000,1.000,0.0,0.0000,1,2,2021-02-16
4,Buy TRXC 🚀,0.000,1.000,0.0,0.0000,1,2,2021-02-16
...,...,...,...,...,...,...,...,...
1118858,"EBAY posts higher 1Q net income and revenue, s...",0.000,1.000,0.0,0.0000,7,4,2012-04-19
1118859,Anyone betting on VVUS and their potential app...,0.205,0.795,0.0,0.4767,1,0,2012-04-17
1118860,My poorly timed opening position for AAPL earn...,0.000,1.000,0.0,0.0000,12,21,2012-04-16
1118861,"GOOG - beat estimates, price barely rises.",0.000,1.000,0.0,0.0000,2,0,2012-04-12


In [19]:
# Per-day aggregation spec: gather all titles of the day into a list and take
# the mean of every numeric column (engagement and sentiment scores).
agg_func = {'title': list}
agg_func.update(
    {
        column: 'mean'
        for column in (
            'score',
            'num_comments',
            'Positive_Sentiment',
            'Negative_Sentiment',
            'Neutral_Sentiment',
            'Compound_Sentiment',
        )
    }
)

# One row per calendar day, indexed by timestamp.
df_new = df.groupby(df['timestamp']).agg(agg_func)

# Collapse each day's list of titles into one '||'-delimited string.
df_new['title'] = df_new['title'].map('||'.join)

display(df_new)

# From here on DataPlug works with the daily aggregate instead of single posts.
dataPlug.df = df_new

Unnamed: 0_level_0,title,score,num_comments,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Compound_Sentiment
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-04-11,Earnings season is here. Place your bets.,13.000000,22.000000,0.000000,0.000000,1.000000,0.000000
2012-04-12,"GOOG - beat estimates, price barely rises.",2.000000,0.000000,0.000000,0.000000,1.000000,0.000000
2012-04-16,My poorly timed opening position for AAPL earn...,12.000000,21.000000,0.000000,0.000000,1.000000,0.000000
2012-04-17,Anyone betting on VVUS and their potential app...,1.000000,0.000000,0.205000,0.000000,0.795000,0.476700
2012-04-19,"After HGSI spikes 97%, will share price drop a...",5.500000,2.000000,0.082500,0.079000,0.838500,0.012900
...,...,...,...,...,...,...,...
2021-02-12,"When you Can't Afford to Lose, Sell Options! +...",1.002569,21.684764,0.133737,0.053982,0.809946,0.120769
2021-02-13,SatoshiSwap (SAS) is the future!!!!||Strategie...,1.000765,10.059153,0.140172,0.046565,0.810966,0.135202
2021-02-14,Created a cliff for myself to jump off of||I w...,1.000337,10.891326,0.139030,0.045846,0.812087,0.130886
2021-02-15,$EAT next up!||BNGO BNGO||Can y’all bad mamaja...,1.011086,18.004751,0.128405,0.050436,0.817675,0.110758


In [20]:
# Build the price dataframe, merge it with the daily post aggregate stored in
# dataPlug.df, and cache the merged frame on disk.
# NOTE(review): the join key and join type live in DataPlug.merge_dataframes - confirm there.
dataPlug.get_price_dataframe()
dataPlug.merge_dataframes()
display(dataPlug.mergedDF)
# The index is written as the first CSV column, so a later read_csv of this
# file yields an extra 'Unnamed: 0' column.
dataPlug.mergedDF.to_csv('../data/mergedDF.csv')
print(f'saved to ../data/mergedDF.csv')

Unnamed: 0,timestamp,Open,High,Low,Close,Adj Close,Volume,title,score,num_comments,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Compound_Sentiment
0,2012-04-11,5.330000,5.380000,5.2350,5.3175,3.603972,19562000,Earnings season is here. Place your bets.,13.000000,22.000000,0.000000,0.000000,1.000000,0.000000
1,2012-04-12,5.325000,5.427500,5.3125,5.3900,3.653110,8414800,"GOOG - beat estimates, price barely rises.",2.000000,0.000000,0.000000,0.000000,1.000000,0.000000
2,2012-04-16,5.325000,5.452500,5.2425,5.4250,3.676833,13710000,My poorly timed opening position for AAPL earn...,12.000000,21.000000,0.000000,0.000000,1.000000,0.000000
3,2012-04-17,5.455000,5.550000,5.4500,5.5375,3.753081,16022400,Anyone betting on VVUS and their potential app...,1.000000,0.000000,0.205000,0.000000,0.795000,0.476700
4,2012-04-19,5.562500,5.635000,5.5325,5.5725,3.776802,14128400,"After HGSI spikes 97%, will share price drop a...",5.500000,2.000000,0.082500,0.079000,0.838500,0.012900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,2021-02-08,18.102501,18.165001,14.5050,15.0000,15.000000,102749200,Me on the RH||Can you Help me giving the name ...,1.085725,10.603988,0.152163,0.048261,0.797151,0.147872
2141,2021-02-09,14.152500,14.250000,11.6300,12.5775,12.577500,107372400,Cindicator Capital Wants to Hire r/WallStreetB...,1.000829,13.947142,0.139888,0.048742,0.811039,0.129740
2142,2021-02-10,12.692500,15.707500,11.6375,12.8000,12.800000,145820000,Why the fuck isn’t anyone talking about then G...,6.014227,20.014591,0.152525,0.039616,0.805182,0.163156
2143,2021-02-11,12.502500,13.830000,12.0550,12.7750,12.775000,52226800,Who loves a Pi ?||Does anyone know about Atlis...,1.207335,17.038586,0.142902,0.045269,0.810214,0.138476


saved to ../data/mergedDF.csv


---

### Run if you want to load the dataframe from a file to avoid re-running sentiment analysis

In [21]:
# Optional shortcut: load the merged frame from the cached CSV instead of
# re-running the sentiment analysis and merge cells.
# The file was saved with its index, so the loaded frame carries an extra
# 'Unnamed: 0' column that a later cell drops.
dataPlug.mergedDF = pd.read_csv('../data/mergedDF.csv')

---

In [24]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score


In [25]:
# Targets: the daily 'High' price as floats, plus the traded volume.
prices = dataPlug.mergedDF['High'].tolist()
y = [float(price) for price in prices]
volume_y = dataPlug.mergedDF['Volume'].tolist()

# Remove every price-derived column so only post-derived columns are left as
# model inputs.
dataPlug.mergedDF = dataPlug.mergedDF.drop(
    columns=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
)


In [26]:
# The date itself is not used as a model feature; drop it and show what is left.
dataPlug.mergedDF = dataPlug.mergedDF.drop('timestamp', axis=1)
display(dataPlug.mergedDF)

Unnamed: 0.1,Unnamed: 0,title,score,num_comments,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Compound_Sentiment
0,0,Earnings season is here. Place your bets.,13.000000,22.000000,0.000000,0.000000,1.000000,0.000000
1,1,"GOOG - beat estimates, price barely rises.",2.000000,0.000000,0.000000,0.000000,1.000000,0.000000
2,2,My poorly timed opening position for AAPL earn...,12.000000,21.000000,0.000000,0.000000,1.000000,0.000000
3,3,Anyone betting on VVUS and their potential app...,1.000000,0.000000,0.205000,0.000000,0.795000,0.476700
4,4,"After HGSI spikes 97%, will share price drop a...",5.500000,2.000000,0.082500,0.079000,0.838500,0.012900
...,...,...,...,...,...,...,...,...
2140,2140,Me on the RH||Can you Help me giving the name ...,1.085725,10.603988,0.152163,0.048261,0.797151,0.147872
2141,2141,Cindicator Capital Wants to Hire r/WallStreetB...,1.000829,13.947142,0.139888,0.048742,0.811039,0.129740
2142,2142,Why the fuck isn’t anyone talking about then G...,6.014227,20.014591,0.152525,0.039616,0.805182,0.163156
2143,2143,Who loves a Pi ?||Does anyone know about Atlis...,1.207335,17.038586,0.142902,0.045269,0.810214,0.138476


In [29]:
# Drop the remaining non-feature columns so only numeric inputs reach the model:
#   'title'      - free text; the classifier cannot be fitted on strings
#   'Unnamed: 0' - saved index column, present only when mergedDF was reloaded
#                  from the cached CSV
# errors='ignore' keeps the cell re-runnable and valid on both paths (fresh
# merge or CSV reload), where one or both columns may already be absent.
dataPlug.mergedDF = dataPlug.mergedDF.drop(columns=['title', 'Unnamed: 0'], errors='ignore')
display(dataPlug.mergedDF)

Unnamed: 0,score,num_comments,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Compound_Sentiment
0,13.000000,22.000000,0.000000,0.000000,1.000000,0.000000
1,2.000000,0.000000,0.000000,0.000000,1.000000,0.000000
2,12.000000,21.000000,0.000000,0.000000,1.000000,0.000000
3,1.000000,0.000000,0.205000,0.000000,0.795000,0.476700
4,5.500000,2.000000,0.082500,0.079000,0.838500,0.012900
...,...,...,...,...,...,...
2140,1.085725,10.603988,0.152163,0.048261,0.797151,0.147872
2141,1.000829,13.947142,0.139888,0.048742,0.811039,0.129740
2142,6.014227,20.014591,0.152525,0.039616,0.805182,0.163156
2143,1.207335,17.038586,0.142902,0.045269,0.810214,0.138476


In [35]:
# Features are whatever columns remain in the merged frame.
X = dataPlug.mergedDF

# 50/50 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train_list, y_test_list = train_test_split(X, y, test_size=0.5, random_state=3)

# int() truncates each float price to its whole-number part; these integers
# are the class labels the classifier is trained and evaluated on.
y_train = [int(price) for price in y_train_list]
y_test = [int(price) for price in y_test_list]


In [36]:
# Fit a Bernoulli naive Bayes classifier on the training split, using the
# integer-truncated prices in y_train as class labels.
# NOTE(review): BernoulliNB is designed for binary features - confirm it is the intended model for these continuous columns.
clf = BernoulliNB()
clf.fit(X_train, y_train)



In [37]:
import sklearn.metrics as metrics

# Predict labels for the training and test sets.
# y_test_pred is not scored here; the classification report cell consumes it.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Accuracy on the data the model was fitted on (not a held-out estimate).
train_score = metrics.accuracy_score(y_train, y_train_pred)




print("Training score:", train_score)



Training score: 0.15671641791044777


In [38]:
from sklearn.metrics import classification_report
# NOTE(review): target_names is never passed to classification_report below, so it is currently unused.
target_names = ['High']
# Per-class precision / recall / F1 on the held-out test split; each class is
# one integer-truncated price level.
print(classification_report(y_test, y_test_pred.astype('int')))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.18      1.00      0.31       159
           2       0.00      0.00      0.00        47
           3       0.00      0.00      0.00       113
           4       0.00      0.00      0.00       107
           5       0.00      0.00      0.00       100
           6       0.12      0.03      0.05       119
           7       0.00      0.00      0.00        82
           8       0.00      0.00      0.00        42
           9       0.17      0.23      0.20        77
          10       0.18      0.08      0.11        98
          11       0.00      0.00      0.00        53
          12       1.00      0.03      0.05        38
          13       0.09      0.09      0.09        11
          14       0.00      0.00      0.00         6
          15       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1
          22       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
