The intent of this notebook is to visualize trends in the data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


import emoji
import re
from collections import Counter


%matplotlib inline

In [2]:
# Location of the munged dataset, relative to the notebook's working directory.
path = os.path.join('data', 'data_munged.csv')

# The first CSV column is used as the row index and `created_utc` is parsed
# into datetimes so the frame can later be grouped by day.
df = pd.read_csv(path, index_col=0, parse_dates=['created_utc'])

In [3]:
# Preview the first two rows to sanity-check the load.
df.head(2)

Unnamed: 0,total_awards_received,author,author_premium,created_utc,is_video,num_comments,score,self_text,title,is_submission,...,:rocket:,:gem_stone:,:clown_face:,:bear:,:face_with_tears_of_joy:,:rainbow:,:new_moon:,:gorilla:,:raising_hands:,:full_moon:
0,0,chibears20,False,2020-07-01 00:00:23,0,11,1,,Bring back polls. Macy’s earnings,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,slow_down_more,False,2020-07-01 00:01:21,0,0,31,Thanks i had no clue 😎,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['total_awards_received', 'author', 'author_premium', 'created_utc',
       'is_video', 'num_comments', 'score', 'self_text', 'title',
       'is_submission', 'is_op', 'subreddit_subscribers', 'upvote_ratio',
       'no_follow', 'time_amplification', 'flair_DD', 'flair_Discussion',
       'flair_Gain', 'flair_Loss', 'flair_Meme', 'flair_Options',
       'flair_Shitpost', 'flair_Stocks', 'flair_YOLO', 'flair_no_flair',
       ':rocket:', ':gem_stone:', ':clown_face:', ':bear:',
       ':face_with_tears_of_joy:', ':rainbow:', ':new_moon:', ':gorilla:',
       ':raising_hands:', ':full_moon:'],
      dtype='object')

In [5]:
# (rows, columns) of the loaded frame.
df.shape

(119353, 35)

In [6]:
# Per-column dtypes, e.g. to confirm that created_utc was parsed as a datetime.
df.dtypes

total_awards_received                int64
author                              object
author_premium                        bool
created_utc                 datetime64[ns]
is_video                             int64
num_comments                         int64
score                                int64
self_text                           object
title                               object
is_submission                        int64
is_op                                int64
subreddit_subscribers              float64
upvote_ratio                       float64
no_follow                             bool
time_amplification                 float64
flair_DD                             int64
flair_Discussion                     int64
flair_Gain                           int64
flair_Loss                           int64
flair_Meme                           int64
flair_Options                        int64
flair_Shitpost                       int64
flair_Stocks                         int64
flair_YOLO 

In [7]:
# Daily GME price data (a Date column plus the GME High/Low/Open/Close/Volume/
# Adj Close columns).
gme_data = pd.read_csv(os.path.join('data', 'gme_stock_daily.csv'))

# Convert the Date strings to datetimes so they can be matched against the
# daily-grouped Reddit data. `infer_datetime_format` is no longer passed: it is
# deprecated (and ignored) since pandas 2.0. Bracket assignment is used instead
# of attribute assignment so the column is always addressed explicitly.
gme_data['Date'] = pd.to_datetime(gme_data['Date'])

gme_data.head(2)

Unnamed: 0,Date,GME High,GME Low,GME Open,GME Close,GME Volume,GME Adj Close
0,2020-07-01,4.5,4.31,4.31,4.44,2303700,4.44
1,2020-07-02,4.51,4.29,4.49,4.29,1887600,4.29


In [8]:
# Collapse the posts to one row per calendar day, keeping the first non-null
# value of every column seen on that day.
df_member = df.groupby(by=pd.Grouper(key='created_utc', freq='D'),
                       as_index=True).first()

# Fill in some gaps from pushshift by carrying the last known subscriber count
# forward. `.ffill()` replaces the deprecated `fillna(method='ffill')`.
df_member['subreddit_subscribers'] = df_member['subreddit_subscribers'].ffill()

# Attach the GME prices for the same day. The join keys are passed as arrays,
# so pandas keeps them in an extra `key_0` column; the inner join drops every
# day that is missing from either frame.
df_member = df_member.merge(right=gme_data,
                            how='inner',
                            left_on=df_member.index,
                            right_on=gme_data['Date'])

In [9]:
# Inspect the merged daily frame (one row per day present in both sources).
df_member

Unnamed: 0,key_0,total_awards_received,author,author_premium,is_video,num_comments,score,self_text,title,is_submission,...,:gorilla:,:raising_hands:,:full_moon:,Date,GME High,GME Low,GME Open,GME Close,GME Volume,GME Adj Close
0,2020-07-01,0.0,chibears20,0.0,0.0,11.0,1.0,,Bring back polls. Macy’s earnings,1.0,...,0.0,0.0,0.0,2020-07-01,4.500000,4.310000,4.310000,4.440000,2303700,4.440000
1,2020-07-02,0.0,Youngboirick,0.0,0.0,16.0,1.0,,Soon to be 18 year old looking for advice.,1.0,...,0.0,0.0,0.0,2020-07-02,4.510000,4.290000,4.490000,4.290000,1887600,4.290000
2,2020-07-06,0.0,arayasem,0.0,0.0,1.0,1.0,,Markets will trend upwards until major banks a...,1.0,...,0.0,0.0,0.0,2020-07-06,4.340000,4.190000,4.310000,4.240000,2140900,4.240000
3,2020-07-07,0.0,OnioncuttingHattori,0.0,0.0,0.0,29.0,You know I never thought of the stock market a...,,0.0,...,0.0,0.0,0.0,2020-07-07,4.250000,4.060000,4.200000,4.090000,2456600,4.090000
4,2020-07-08,0.0,KillerAc1,0.0,0.0,2.0,1.0,,Family broker said that it’s stupid to buy NIO...,1.0,...,0.0,0.0,0.0,2020-07-08,4.290000,4.030000,4.100000,4.260000,2052800,4.260000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,2021-02-08,0.0,Ok-Hunt-5961,0.0,0.0,36.0,6.0,,Very complex theory I need u guys to know of. ...,1.0,...,0.0,0.0,0.0,2021-02-08,72.660004,58.020000,72.410004,60.000000,25687300,60.000000
153,2021-02-09,0.0,PGinn,0.0,0.0,0.0,39.0,Did i buy in @ 380? Yes. \nIs ape gang stronk?...,,0.0,...,0.0,0.0,0.0,2021-02-09,57.000000,46.520000,56.610001,50.310001,26843100,50.310001
154,2021-02-10,0.0,ApocalypseMao,0.0,0.0,0.0,328.0,We miss zeejayzee,,0.0,...,0.0,0.0,0.0,2021-02-10,62.830002,46.549999,50.770000,51.200001,36455000,51.200001
155,2021-02-11,1.0,WalkonWalrus,0.0,0.0,72.0,31.0,,Stop manipulating kids for your own profits,1.0,...,0.0,0.0,0.0,2021-02-11,55.320000,48.220001,50.009998,51.099998,12997400,51.099998


## Members over time

In [27]:
# Rebuild the daily membership frame inside this cell so the cell can be re-run
# on its own: one row per day, using the first non-null value seen that day.
df_member = df.groupby(by=pd.Grouper(key='created_utc', freq='D'),
                       as_index=True).first()

# Fill in some gaps from pushshift by carrying the last known subscriber count
# forward. `.ffill()` replaces the deprecated `fillna(method='ffill')`.
df_member['subreddit_subscribers'] = df_member['subreddit_subscribers'].ffill()

# Join the GME prices on the calendar day. Only `left_on`/`right_on` are given:
# additionally passing `left_index=True` contradicts `left_on` and raises a
# MergeError on current pandas.
df_member = df_member.merge(right=gme_data,
                            how='inner',
                            left_on=df_member.index,
                            right_on=gme_data['Date'])

df_member.index = df_member['Date']

# The HTML export below goes into ./plots; create the folder if it is missing.
os.makedirs('plots', exist_ok=True)

subfig = make_subplots(specs=[[{"secondary_y": True}]])

# Subscriber count, drawn against the primary y-axis. The x values are taken
# from the `Date` column by name so that the "Dates" label actually applies
# (the previous `created_utc` key matched no plotted column).
fig1 = px.line(df_member,
               x='Date',
               y="subreddit_subscribers",
               title='WSB Membership',
               labels={
                   "Date": "Dates",
                   "subreddit_subscribers": "WSB Member Count"},
               render_mode="webgl")

# Give the single subscriber trace an explicit legend entry.
fig1['data'][0]['showlegend']=True
fig1['data'][0]['name']='Member Count'

fig1.update_layout(showlegend=True)

# GME daily high/low, moved onto the secondary y-axis.
fig2 = px.line(gme_data,
               x='Date',
               y=['GME High',
                  'GME Low'],
               render_mode="webgl")

fig2.update_traces(yaxis="y2")

# Only the traces are copied into the combined figure; the layout (titles,
# axis names) of fig1/fig2 is not, so it is set on subfig directly.
subfig.add_traces(fig2.data + fig1.data)

subfig.layout.yaxis.title="Member Count"
subfig.layout.yaxis2.title="Price (USD)"
subfig.layout.title = 'WSB Member Count'

# Recolouring is necessary, otherwise the lines from fig1 and fig2 would share
# the same colours. NOTE(review): this copies marker.color onto line.color;
# presumably marker.color is unset on these traces, which clears the
# per-figure line colours so the combined figure assigns distinct ones.
subfig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

subfig.write_html(os.path.join('plots', 'member_count.html'))

subfig.show()




## Emojis

In [11]:
# Emoji-count columns that are accumulated over time.
emoji_cols = [':rocket:', ':gem_stone:', ':new_moon:', ':clown_face:', ':bear:',
              ':face_with_tears_of_joy:', ':rainbow:', ':full_moon:',
              ':raising_hands:', ':gorilla:']

# Select the emoji counts together with the timestamp in one indexing step.
# Adding `created_utc` afterwards by assignment wrote into a slice of `df` and
# triggered pandas' SettingWithCopyWarning.
df_emoj = df[emoji_cols + ['created_utc']]

#group by day
df_emoj = df_emoj.groupby(by=pd.Grouper(key='created_utc', freq='D'),
                          as_index=True).sum()

#get cumsum
df_emoj = df_emoj.cumsum()

#add GME data
# Only `left_on`/`right_on` are given: additionally passing `left_index=True`
# contradicts `left_on` and raises a MergeError on current pandas. The array
# keys are kept by pandas in an extra `key_0` column.
df_emoj = df_emoj.merge(right=gme_data,
                        how='inner',
                        left_on=df_emoj.index,
                        right_on=gme_data['Date'])

df_emoj.index = df_emoj['Date']


df_emoj.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,key_0,:rocket:,:gem_stone:,:new_moon:,:clown_face:,:bear:,:face_with_tears_of_joy:,:rainbow:,:full_moon:,:raising_hands:,:gorilla:,Date,GME High,GME Low,GME Open,GME Close,GME Volume,GME Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-07-01,2020-07-01,7.0,1.0,0.0,1.0,8.0,7.0,7.0,0.0,1.0,0.0,2020-07-01,4.5,4.31,4.31,4.44,2303700,4.44
2020-07-02,2020-07-02,13.0,2.0,0.0,13.0,12.0,11.0,10.0,1.0,2.0,0.0,2020-07-02,4.51,4.29,4.49,4.29,1887600,4.29
2020-07-06,2020-07-06,25.0,3.0,0.0,84.0,29.0,26.0,18.0,1.0,2.0,0.0,2020-07-06,4.34,4.19,4.31,4.24,2140900,4.24
2020-07-07,2020-07-07,49.0,4.0,0.0,91.0,41.0,28.0,20.0,1.0,3.0,0.0,2020-07-07,4.25,4.06,4.2,4.09,2456600,4.09
2020-07-08,2020-07-08,71.0,4.0,0.0,97.0,48.0,35.0,26.0,1.0,4.0,0.0,2020-07-08,4.29,4.03,4.1,4.26,2052800,4.26


In [12]:
# (rows, columns) after the daily aggregation and the inner join with gme_data.
df_emoj.shape

(157, 18)

In [29]:
# The HTML export below goes into ./plots; create the folder if it is missing.
os.makedirs('plots', exist_ok=True)

subfig = make_subplots(specs=[[{"secondary_y": True}]])

# Emoji column name -> glyph shown in the legend and hover text.
emoji_glyphs = {
    ':rocket:': '🚀',
    ':gem_stone:': '💎',
    ':new_moon:': '🌑',
    ':clown_face:': '🤡',
    ':bear:': '🐻',
    ':face_with_tears_of_joy:': '😂',
    ':rainbow:': '🌈',
    ':full_moon:': '🌕',
    ':raising_hands:': '🙌',
    ':gorilla:': '🦍',
}

#note that y=[<arr of values>] works only with plotly > 4.8
# The x values are taken from the `Date` column by name so that the "Dates"
# label applies (the previous `created_utc` key matched no plotted column).
fig1 = px.line(df_emoj,
               x='Date',
               y=list(emoji_glyphs),
               title='Emojis Cumulative Sum',
               labels={'Date': 'Dates'})


def _show_glyph(trace):
    """Replace a trace's emoji column name with its glyph in legend and hover text."""
    column = trace.name
    glyph = emoji_glyphs.get(column, column)
    trace.name = glyph
    trace.legendgroup = glyph
    if trace.hovertemplate:
        trace.hovertemplate = trace.hovertemplate.replace(column, glyph)


# With wide-form input (`y=[...]`) the `labels` argument does not rename the
# individual series, so the glyphs have to be applied trace by trace.
fig1.for_each_trace(_show_glyph)

fig1.update_layout(
    showlegend=True,
    yaxis_title="Cumulative Sum",
    legend_title="Emoji Description",
)

# GME daily high/low, moved onto the secondary y-axis.
fig2 = px.line(gme_data,
               x='Date',
               y=['GME High',
                  'GME Low'],
               render_mode="webgl")

fig2.update_traces(yaxis="y2")

# Only the traces are copied into the combined figure; the layout of
# fig1/fig2 is not, so titles are set on subfig directly.
subfig.add_traces(fig2.data + fig1.data)

subfig.layout.yaxis.title="Emoji Cumulative Sum"
subfig.layout.yaxis2.title="Price (USD)"
subfig.layout.title = 'Total Emoji Use Over Time'

# Recolouring is necessary, otherwise the lines from fig1 and fig2 would share
# the same colours. NOTE(review): this copies marker.color onto line.color;
# presumably marker.color is unset on these traces, which clears the
# per-figure line colours so the combined figure assigns distinct ones.
subfig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

# Export an embeddable fragment: plotly.js is loaded from the CDN and no
# surrounding <html> document is written.
subfig.write_html(os.path.join('plots', 'emoji_cumsum.html'),
                  include_plotlyjs="cdn",
                  full_html=False)

subfig.show()


## Mention of Stocks

In [14]:
# Stream the ticker-count CSV in 10,000-row pieces; the first column is used
# as the index and `created_utc` is parsed into datetimes.
ticker_reader = pd.read_csv(os.path.join('data', 'ticker_count.csv'),
                            chunksize=10_000,
                            parse_dates=['created_utc'],
                            index_col=0)

# Materialise every piece, then stitch them back into a single frame.
ticker_chunks = list(ticker_reader)
df_ticker = pd.concat(ticker_chunks)

In [15]:
# Preview the first rows of the ticker-count table.
df_ticker.head()

Unnamed: 0,GME,TSLA,PLTR,NIO,SPY,NKLA,AAPL,AMZN,BB,AMD,RKT,AMC,MSFT,BABA,NVDA,created_utc
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-07-01 00:00:23
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-07-01 00:01:21
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-07-01 00:04:02
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2020-07-01 00:07:43
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2020-07-01 00:09:27


In [18]:
# Total mentions of each ticker per calendar day.
df_ticker_daily = df_ticker.groupby(by=pd.Grouper(key='created_utc', freq='D'),
                                    as_index=True).sum()

# Running total of mentions over time.
df_ticker_cumsum = df_ticker_daily.cumsum()

#add GME data
# Only `left_on`/`right_on` are given: additionally passing `left_index=True`
# contradicts `left_on` and raises a MergeError on current pandas. The array
# keys are kept by pandas in an extra `key_0` column, and the inner join drops
# every day that is missing from either frame.
df_ticker_cumsum = df_ticker_cumsum.merge(right=gme_data,
                                          how='inner',
                                          left_on=df_ticker_cumsum.index,
                                          right_on=gme_data['Date'])

df_ticker_cumsum.index = df_ticker_cumsum['Date']

df_ticker_cumsum.head()

Unnamed: 0_level_0,key_0,GME,TSLA,PLTR,NIO,SPY,NKLA,AAPL,AMZN,BB,...,MSFT,BABA,NVDA,Date,GME High,GME Low,GME Open,GME Close,GME Volume,GME Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-01,2020-07-01,0,33,0,0,22,10,2,7,0,...,1,0,0,2020-07-01,4.5,4.31,4.31,4.44,2303700,4.44
2020-07-02,2020-07-02,0,55,0,3,43,16,2,22,0,...,1,0,2,2020-07-02,4.51,4.29,4.49,4.29,1887600,4.29
2020-07-06,2020-07-06,0,171,0,26,78,28,9,42,0,...,9,0,3,2020-07-06,4.34,4.19,4.31,4.24,2140900,4.24
2020-07-07,2020-07-07,0,240,0,28,88,31,9,52,0,...,13,1,6,2020-07-07,4.25,4.06,4.2,4.09,2456600,4.09
2020-07-08,2020-07-08,0,272,0,37,97,45,11,55,0,...,17,3,7,2020-07-08,4.29,4.03,4.1,4.26,2052800,4.26


In [30]:
# The HTML export below goes into ./plots; create the folder if it is missing.
os.makedirs('plots', exist_ok=True)

subfig = make_subplots(specs=[[{"secondary_y": True}]])

#note that y=[<arr of values>] works only with plotly > 4.8
# Cumulative mention count of every tracked ticker (primary y-axis).
fig_ticker = px.line(df_ticker_cumsum,
                     x=df_ticker_cumsum.index,
                     y=['GME', 'TSLA', 'PLTR', 'NIO', 'SPY', 'NKLA', 'AAPL', 'AMZN',
                        'BB', 'AMD', 'RKT', 'AMC', 'MSFT', 'BABA', 'NVDA'],
                     title='Stock Ticker Mentions Cumulative Sum')

fig_ticker.update_layout(
    yaxis_title="Cumulative Sum",
    legend_title="Ticker Symbol",
)

# GME daily high/low, moved onto the secondary y-axis.
fig2 = px.line(gme_data,
               x='Date',
               y=['GME High',
                  'GME Low'],
               render_mode="webgl")

fig2.update_traces(yaxis="y2")

# Only the traces are copied into the combined figure; the layout of
# fig_ticker/fig2 is not, so titles are set on subfig directly.
subfig.add_traces(fig2.data + fig_ticker.data)

# The primary axis shows ticker mentions; it was previously labelled
# "Member Count", a leftover from the membership chart.
subfig.layout.yaxis.title="Ticker Mentions Cumulative Sum"
subfig.layout.yaxis2.title="Price (USD)"
subfig.layout.title = 'Total Mention of Stocks Over Time'

# Recolouring is necessary, otherwise the lines from fig_ticker and fig2 would
# share the same colours. NOTE(review): this copies marker.color onto
# line.color; presumably marker.color is unset on these traces, which clears
# the per-figure line colours so the combined figure assigns distinct ones.
subfig.for_each_trace(lambda t: t.update(line=dict(color=t.marker.color)))

subfig.show()

# Export an embeddable fragment: plotly.js is loaded from the CDN and no
# surrounding <html> document is written.
subfig.write_html(os.path.join('plots', 'ticker_cumsum.html'),
                  include_plotlyjs="cdn",
                  full_html=False)
