In [3]:
import json
import pandas as pd
import numpy as np 
from datetime import datetime,timezone
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import zstandard
import os
import sys
import logging.handlers
import re

In [4]:
post_raw_df = pd.read_csv('posts_raw_15_09_24.csv')

In [6]:
# Retain posts from 15 Dec 2020 (US Eastern) onwards and derive the Eastern-time columns.
from zoneinfo import ZoneInfo

EASTERN_TZ = ZoneInfo('America/New_York')

# Convert the epoch seconds once, vectorised, and keep the values timezone-aware throughout.
# Formatting to text and then re-localising the naive wall-clock strings raises
# AmbiguousTimeError for any post made in the repeated hour at the end of DST.
created_et = (
    pd.to_datetime(post_raw_df['created_utc'], unit='s', utc=True)
    .dt.floor('s')  # whole seconds, the same resolution as the "%H:%M:%S" text columns
    .dt.tz_convert(EASTERN_TZ)
)

# assign() returns a new frame, so nothing is written into a filtered slice
# (no SettingWithCopyWarning). Column order: date, timestamp, time.
post_raw_df = post_raw_df.assign(
    created_et_date=created_et.dt.tz_localize(None).dt.normalize(),  # naive midnight of the ET calendar day
    created_et_timestamp=created_et,                                 # tz-aware ET timestamp
    created_et_time=created_et.dt.strftime('%H:%M:%S'),              # wall-clock time as text
)

# Keep 2020-12-15 onwards; the raw epoch column is no longer needed afterwards.
post_raw_df = post_raw_df[post_raw_df['created_et_date'] >= '2020-12-15'].drop(columns='created_utc')


In [9]:
# Remove two columns that no later cell references, then preview the frame.
post_raw_df = post_raw_df.drop(columns=['distinguished', 'id'])
post_raw_df.head()

Unnamed: 0,selftext,upvote_ratio,num_comments,title,score,created_et_date,created_et_timestamp,created_et_time
15905,[removed],1.0,1,WSB only shows PLTR and GME posts,1,2020-12-15,2020-12-15 00:02:53-05:00,00:02:53
15906,[deleted],1.0,0,"Finally, PFE does something about their share ...",1,2020-12-15,2020-12-15 00:05:07-05:00,00:05:07
15907,,1.0,0,I’ll get the WSB tattooed on my right buttchee...,1,2020-12-15,2020-12-15 00:10:43-05:00,00:10:43
15908,,0.82,19,Elon strikes again,20,2020-12-15,2020-12-15 00:17:03-05:00,00:17:03
15909,,0.94,75,Did you see this notice that just went out ton...,74,2020-12-15,2020-12-15 00:23:48-05:00,00:23:48


In [15]:
# Summary statistics aligned with the GME timeline.
# The row of the most-commented post is looked up first; only the cell's last expression
# (the comment total from 2021-01-28 onwards) is echoed as output.
post_raw_df.loc[post_raw_df["num_comments"].idxmax()]
post_raw_df.loc[post_raw_df["created_et_date"] >= "2021-01-28", "num_comments"].sum()

7408135

In [None]:
# Optional export of the time-series columns (disabled; uncomment both lines to write the CSV):
#post_ts_df = post_raw_df[['num_comments','score', 'created_et_date','created_et_timestamp','created_et_time']]
#post_ts_df.to_csv('out_data/post_ts_df.csv', index=True)

In [None]:
# Daily maximum post score, annotated with the pre-squeeze / squeeze / post-squeeze phases.
# `significant_dates` and `summary_scores` are read again by the next plotting cell,
# so both stay notebook-level names.
significant_dates = [pd.to_datetime(d) for d in ['2021-01-13', '2021-01-27']]

summary_scores = post_raw_df.groupby('created_et_date')['score'].agg('max').reset_index()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(summary_scores['created_et_date'], summary_scores['score'],
        marker='o', markersize=4, label='Max Score', color='darkgray')

ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

# Only the first vertical marker gets a legend entry; the second is hidden from the legend.
for idx, date in enumerate(significant_dates):
    ax.axvline(date, color='grey', linestyle='--', linewidth=1,
               label='Significant Date' if idx == 0 else '_nolegend_')

# Phase captions sit at the horizontal midpoint of each phase, slightly below the top of the y-axis.
first_day = summary_scores['created_et_date'].min()
last_day = summary_scores['created_et_date'].max()
squeeze_start, squeeze_end = significant_dates

y_pos = summary_scores['score'].max() * 0.95
pre_mid = first_day + (squeeze_start - first_day) / 2
squeeze_mid = squeeze_start + (squeeze_end - squeeze_start) / 2
post_mid = squeeze_end + (last_day - squeeze_end) / 2

for x_mid, phase_caption in ((pre_mid, 'PRE SQUEEZE'), (squeeze_mid, 'SQUEEZE'), (post_mid, 'POST SQUEEZE')):
    ax.text(x_mid, y_pos, phase_caption, ha='center', fontsize=13, color='black')

ax.set_xlabel('Date')
ax.set_ylabel('Score (Upvotes - Downvotes)')
ax.set_title('Daily Max Score received on posts in r/wsb')
ax.tick_params(axis='x', labelrotation=90)
# The plot/axvline labels above are only rendered if a legend is actually drawn.
ax.legend()
fig.tight_layout()
fig.savefig('max post scores by date.png')
plt.show()
plt.close(fig)

In [None]:
# Daily submission volume: every post plus all of its comments, per ET calendar day.
# `comment_counts` keeps its `created_et_date` / `num_comments` columns because a later
# cell derives `scaled_comments` from them; the extra columns are additive.
comment_counts = (
    post_raw_df.groupby('created_et_date')
    .agg(num_comments=('num_comments', 'sum'), num_posts=('num_comments', 'size'))
    .reset_index()
)
# Each post counts as one submission, so add the number of posts per day.
# Adding a constant 1 to the daily comment total would count a single post per day.
comment_counts['total_submissions'] = comment_counts['num_comments'] + comment_counts['num_posts']

# Normalise locally so the date arithmetic works whether `significant_dates`
# currently holds Timestamps or plain date strings.
squeeze_start, squeeze_end = (pd.to_datetime(d) for d in significant_dates)
first_day = comment_counts['created_et_date'].min()
last_day = comment_counts['created_et_date'].max()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(comment_counts['created_et_date'], comment_counts['total_submissions'],
        marker='o', markersize=4, color='darkgray', label='Posts + comments')
ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

# Only the first vertical marker gets a legend entry.
for idx, date in enumerate((squeeze_start, squeeze_end)):
    ax.axvline(date, color='grey', linestyle='--', linewidth=1,
               label='Significant Date' if idx == 0 else '_nolegend_')

# Phase captions: horizontally centred in each phase, slightly below the top of the plotted series.
y_pos = comment_counts['total_submissions'].max() * 0.95
pre_mid = first_day + (squeeze_start - first_day) / 2
squeeze_mid = squeeze_start + (squeeze_end - squeeze_start) / 2
post_mid = squeeze_end + (last_day - squeeze_end) / 2

for x_mid, phase_caption in ((pre_mid, 'PRE SQUEEZE'), (squeeze_mid, 'SQUEEZE'), (post_mid, 'POST SQUEEZE')):
    ax.text(x_mid, y_pos, phase_caption, ha='center', fontsize=13, color='black')

ax.set_xlabel('Date')
ax.set_ylabel('Total posts and comments')
ax.set_title('Daily submissions on r/wsb')
ax.tick_params(axis='x', labelrotation=90)
ax.legend()  # has labelled artists now, so no "no artists with labels" warning
fig.tight_layout()
fig.savefig('post comment freq by date.png')
plt.show()
plt.close(fig)


In [None]:
# Upvote ratio = ups / total votes; plotted as the mean over all posts of each ET calendar day.
posts_by_day = post_raw_df.groupby('created_et_date')
upvote_ratio_mean = posts_by_day['upvote_ratio'].mean().reset_index()

plt.figure(figsize=(10, 6))
plt.plot(
    upvote_ratio_mean['created_et_date'],
    upvote_ratio_mean['upvote_ratio'],
    marker='o',
)

# One labelled tick every second week, formatted as an ISO date.
date_axis = plt.gca().xaxis
date_axis.set_major_locator(mdates.WeekdayLocator(interval=2))
date_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

plt.xlabel('Date')
plt.ylabel('Upvote Ratio')
plt.title('Upvote ratio by Date')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Searchable text of each post: the title followed by the body.
# Missing titles/bodies become empty strings first: `str + NaN` yields NaN, which would
# wipe out the title of every post that has no body. The strip is applied to the combined
# text, not to the title alone, so the body is not discarded.
# NOTE(review): placeholder bodies such as '[removed]' / '[deleted]' are kept verbatim.
post_raw_df['fulltext'] = (
    post_raw_df['title'].fillna('') + ' ' + post_raw_df['selftext'].fillna('')
).str.strip()

In [None]:
# Count, per ET calendar day, the posts whose text mentions any GME-related keyword,
# and keep the daily max score of those posts for the comparison plots further down.
# Each entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python (this previously fused 'diamond hands' and 'paper hands').
gme_phrases = [
    'GME', 'this is the way', 'we like the stock', 'I like the stock', 'hold the line', 'diamond hands',
    'paper hands', 'tendies', 'stonks', '🚀','🦍💪','🚀🌕','🍗','🧻🤲','🧻','🐻‍🌈','💎🙌','gamestop', 'gamestonk',
    'melvin capital', 'gay bears', 'BUY THE DIP','robinhood', "he's still in, I'm still in","IF HE IS IN",
    "if he is still in",'HODL',"Diamond Hands", "To the Moon", "Apes Together Strong", 'deep fucking value',"YOLO",
    "Ape No Fight Ape", '🦍','Buy more', 'STOP THE COUNT','HOLD THE FUCKING LINE', 'anthem against robinhood',
    'he likes the stock', 'priced in', 'power to the players', 'Bear r fuk','lets fucking go',' this guy fucks',
    'this is a casino', "Let’s Go", 'lets go','this is not the way', "I’m Not Leaving", "It’s Not a Loss Until You Sell", 'WAGMI',
    'WE LOVE THE STOCK', 'Bagholder', 'Bag holders', 'I am holding', 'BUY BUY BUY',
    'like the stock', 'short squeeze', 'fellow apes', 'hedgefunds', 'citadel', 'melvin', 'shorts must cover',
    'rocket', 'tendie', 'stonk', "Let's Go", 'hold hold hold', "It’s not about the money; it’s about sending a message",
]

post_raw_df['fulltext'] = post_raw_df['fulltext'].fillna('')

# One alternation of the escaped phrases, matched case-insensitively as plain substrings.
# NOTE(review): substring matching means short entries also hit inside longer words
# (e.g. 'GME' inside "segment"); add word boundaries if that matters for the analysis.
gme_pattern = '|'.join(re.escape(phrase) for phrase in gme_phrases)
post_raw_df['contains_gme_phrase'] = (
    post_raw_df['fulltext'].str.contains(gme_pattern, case=False, regex=True).astype(int)
)
sum_gme_phrase = post_raw_df.groupby('created_et_date')['contains_gme_phrase'].sum().reset_index()

# Group by date for max score where contains_gme_phrase = 1
max_gme_score = (
    post_raw_df[post_raw_df['contains_gme_phrase'] == 1]
    .groupby('created_et_date')['score']
    .max()
    .reset_index()
)

plt.figure(figsize=(12, 6))
plt.plot(sum_gme_phrase['created_et_date'], sum_gme_phrase['contains_gme_phrase'], marker='o')
plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Counting posts mentioning GME related phrases')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
# Daily maximum score of all posts, drawn next to the daily maximum of the GME-phrase posts.
max_daily_score = post_raw_df.groupby('created_et_date')['score'].max().reset_index()

plt.figure(figsize=(10,5))

# (frame, marker, legend label, colour) for each line, drawn in this order.
score_lines = (
    (max_daily_score, 'o', 'Max Daily Score (All Posts)', 'blue'),
    (max_gme_score, 'x', 'Max Daily Score (GME Posts)', 'orange'),
)
for daily_max, marker_style, legend_label, line_color in score_lines:
    plt.plot(daily_max['created_et_date'], daily_max['score'],
             marker=marker_style, label=legend_label, color=line_color)

# Weekly ticks formatted as ISO dates.
date_axis = plt.gca().xaxis
date_axis.set_major_locator(mdates.WeekdayLocator(interval=1))
date_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xticks(rotation=90)

plt.xlabel('Date')
plt.ylabel('Max Daily Score')
plt.title('Max Daily Scores: General vs GME Related Posts')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Load the stock price files used for the comparative plots.
comb_5min_intra = pd.read_csv('./out_data/Stock output folder/combined_5min_stockdata.csv')
GME_daily = pd.read_csv('./out_data/Stock output folder/GME_daily_stockdata.csv')
SPX_daily = pd.read_csv('./out_data/Stock output folder/SPX_daily_stockdata.csv')
IWM_daily = pd.read_csv('./out_data/Stock output folder/IWM_daily_stockdata.csv')

# Parse the 'DateTime' column of each daily frame in place so it plots on a date axis.
for daily_prices in (IWM_daily, SPX_daily, GME_daily):
    daily_prices['DateTime'] = pd.to_datetime(daily_prices['DateTime'])

In [None]:
# Sanity check: (rows, columns) of the daily GME price frame.
GME_daily.shape

In [17]:
# Phase code per post: 0 = before 2021-01-13, 1 = 2021-01-13..2021-01-27 (both inclusive),
# 2 = after 2021-01-27. Rows matching no condition fall back to np.select's default of 0.
post_dates = post_raw_df['created_et_date']
phase_conditions = [
    post_dates < '2021-01-13',
    post_dates.between('2021-01-13', '2021-01-27'),
    post_dates > '2021-01-27',
]
post_raw_df['gme_timeline'] = np.select(phase_conditions, [0, 1, 2])

In [None]:
# Aggregating data and plotting at 1-hour resolution: keep only the columns the hourly roll-up needs.
hourly_input_columns = ['created_et_timestamp', 'gme_timeline', 'num_comments', 'score', 'created_et_date']
df = post_raw_df.loc[:, hourly_input_columns]

In [None]:
# Roll the per-post rows up into 1-hour bins keyed by the Eastern-time timestamp.
hourly_rules = {
    'num_comments': 'sum',       # total comments on posts created in the hour
    'score': 'max',              # best-scoring post of the hour
    'gme_timeline': 'min',       # phase code of the hour
    'created_et_date': 'first',  # calendar day the hour belongs to
}
aggregated_1h = (
    df.set_index('created_et_timestamp')
    .resample('1h')
    .agg(hourly_rules)
    .reset_index()
    .rename(columns={'created_et_timestamp': '1_hr'})
)

# Hour-of-day label ("HH:MM:SS") for each bin.
aggregated_1h['timeint'] = pd.to_datetime(aggregated_1h['1_hr']).dt.strftime('%H:%M:%S')

In [None]:
# Preview the first rows of the hourly aggregate.
aggregated_1h.head()

In [None]:
# Hourly rows inside the squeeze window (gme_timeline == 1), plus a combined
# "time + date" text key that the spike plot uses as its categorical x value.
# Guarded so the cell can be re-run: an unconditional in-place set_index raises
# KeyError once '1_hr' has already been moved into the index.
if '1_hr' in aggregated_1h.columns:
    aggregated_1h = aggregated_1h.set_index('1_hr')

# assign() builds a new frame instead of writing into a filtered slice
# (no SettingWithCopyWarning); later keyword arguments see the columns set by earlier ones.
filtered_df = aggregated_1h.loc[aggregated_1h['gme_timeline'] == 1].assign(
    timeint=lambda hourly: hourly['timeint'].astype(str),
    created_et_date=lambda hourly: hourly['created_et_date'].astype(str),
    newcol=lambda hourly: hourly['timeint'] + hourly['created_et_date'],
)

In [None]:

# Hourly activity during the squeeze window, with the top-decile hours highlighted.
# An hour is a "spike" when its value is strictly above the 90th percentile of the window.
comments_threshold = filtered_df['num_comments'].quantile(0.90)
score_threshold = filtered_df['score'].quantile(0.90)
comment_spikes = filtered_df[filtered_df['num_comments'] > comments_threshold]
score_spikes = filtered_df[filtered_df['score'] > score_threshold]


def _bold_spike_labels(tick_labels, spike_labels):
    """Return tick_labels with every label that also occurs in spike_labels wrapped in mathtext bold."""
    spike_set = set(spike_labels)  # O(1) membership instead of scanning an array per label
    return [f"$\\bf{{{label}}}$" if label in spike_set else label for label in tick_labels]


# sharex is deliberately off: axes that share x also share one tick formatter, so the
# second set_xticklabels call would overwrite the labels of the first panel and the
# visible axis would bold the wrong panel's spikes.
fig, axs = plt.subplots(2, 1, figsize=(15, 12), sharex=False)

x_ticks = range(0, len(filtered_df['newcol']), 4)  # label every 4th plotted hourly row
labels = filtered_df['newcol'].iloc[x_ticks]

bold_comment_ticks = _bold_spike_labels(labels, comment_spikes['newcol'])
bold_score_ticks = _bold_spike_labels(labels, score_spikes['newcol'])

# (axes, value column, spike rows, line label, spike label, title, y label, tick labels)
panels = (
    (axs[0], 'num_comments', comment_spikes, 'Comments', 'Comment Spikes',
     'r/wallstreetbets activity during the squeeze period', 'Number of Comments', bold_comment_ticks),
    (axs[1], 'score', score_spikes, 'Score', 'Score Spikes',
     'r/wallstreetbets scores during the  squeeze period', 'Score', bold_score_ticks),
)
for panel_ax, value_col, spike_rows, line_label, spike_label, panel_title, y_label, tick_text in panels:
    panel_ax.plot(filtered_df['newcol'], filtered_df[value_col],
                  marker='.', linewidth=0.5, label=line_label, color='gray')
    panel_ax.scatter(spike_rows['newcol'], spike_rows[value_col], color='black', label=spike_label)
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.legend()
    panel_ax.set_xticks(x_ticks)
    panel_ax.set_xticklabels(tick_text, rotation=90, fontsize=6)

plt.tight_layout()
plt.savefig('comment and score spikes by hour.png')
plt.show()
plt.close()

In [None]:
# Daily comment totals divided by 10000 so they fit on the GME price axis.
comment_counts['scaled_comments'] = comment_counts['num_comments'] / 10000


def _plot_gme_against(base_ax, benchmark_daily, benchmark_label, panel_title):
    """Draw GME close and scaled comment counts on base_ax and a benchmark close on a twin y-axis.

    benchmark_label is used both as the legend label and as the twin axis' y label.
    Returns the (base, twin) axes pair.
    """
    twin_ax = base_ax.twinx()

    base_ax.plot(GME_daily['DateTime'], GME_daily['Close'],
                 label='GME Close Price', color='black', alpha=0.9, linestyle='--')
    base_ax.set_ylabel('GME Close Price')
    base_ax.tick_params(axis='y')

    twin_ax.plot(benchmark_daily['DateTime'], benchmark_daily['Close'],
                 label=benchmark_label, color='lightgray', alpha=1, linestyle='--', linewidth=2)
    twin_ax.set_ylabel(benchmark_label)
    twin_ax.tick_params(axis='y')

    base_ax.plot(comment_counts['created_et_date'], comment_counts['scaled_comments'],
                 label='Comments Count (scaled)', color='gray', alpha=0.6)

    base_ax.xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
    base_ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    base_ax.set_xlabel('Date')
    base_ax.set_title(panel_title)
    plt.setp(base_ax.xaxis.get_majorticklabels(), rotation=90)
    return base_ax, twin_ax


# Left panel: GME vs SPX. Right panel: GME vs IWM.
fig, axs = plt.subplots(1, 2, figsize=(16, 6), sharey=False)
ax1, ax2 = _plot_gme_against(axs[0], SPX_daily, 'SPX Close Price',
                             'GME vs SPX Close Price with Comments Count')
ax3, ax4 = _plot_gme_against(axs[1], IWM_daily, 'IWM Close Price',
                             'GME vs IWM Close Price with Comments Count')

# Base axes legends go top-left, twin axes legends top-right.
for base_ax, twin_ax in ((ax1, ax2), (ax3, ax4)):
    base_ax.legend(loc='upper left')
    twin_ax.legend(loc='upper right')

plt.tight_layout()
plt.savefig('gme vs SPX and IWM.png')
plt.show()
plt.close()

In [None]:
# Daily max scores (all posts and GME-phrase posts) on the left axis,
# GME close price on a twin right axis.
fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.plot(max_daily_score['created_et_date'],
         max_daily_score['score'],
         marker='.', label='Max Daily Score (All Posts)', color='black', alpha=0.6, linestyle='--', linewidth=2)
ax1.plot(max_gme_score['created_et_date'],
         max_gme_score['score'],
         marker='.', label='Max Daily Score (GME Posts)', color='red', alpha=0.6, linestyle='--', linewidth=1.5)

ax2 = ax1.twinx()
ax2.plot(GME_daily['DateTime'],
         GME_daily['Close'],
         label='GME Close Price',
         color='orange', linewidth=0.75)
ax2.set_ylabel('GME Close Price')
ax2.tick_params(axis='y')

ax1.xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
# Rotate the labels on ax1 explicitly: after twinx() the *current* axes is the twin,
# whose x-axis is hidden, so plt.xticks(rotation=90) would leave the visible labels unrotated.
ax1.tick_params(axis='x', labelrotation=90)
ax1.set_xlabel('Date')
ax1.set_ylabel('Max Daily Score')
ax1.set_title('Max Daily Scores and GME Close Price')
ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
# From here on the plots use the cleaned posts file instead of the raw posts.
# Load it and preview the first rows.
clean_df = pd.read_csv('./out_data/posts output folder/clean_posts_with_trade_time_23_09_2024.csv')
clean_df.head()

In [None]:
# Remove the 'id' column from the cleaned posts.
clean_df = clean_df.drop(columns=['id'])

In [None]:
# Inspect the column dtypes as loaded from the CSV.
clean_df.dtypes

In [None]:
# Parse the two date/time text columns of the cleaned posts into datetimes.
for datetime_column in ('created_et_date', 'created_et_timestamp'):
    clean_df[datetime_column] = pd.to_datetime(clean_df[datetime_column])

In [None]:
# Order the cleaned posts chronologically and use the timestamp as the index.
clean_df = (
    clean_df
    .sort_values(by='created_et_timestamp')
    .set_index('created_et_timestamp')
)

In [None]:
# Preview the cleaned posts after sorting and re-indexing by timestamp.
clean_df.head()

In [None]:
# Flag (1/0) the cleaned posts whose text contains any phrase from `gme_phrases`.
clean_df['fulltext'] = clean_df['fulltext'].fillna('')

# Alternation of the escaped phrases; matched case-insensitively as a regex.
gme_pattern = '|'.join(map(re.escape, gme_phrases))
has_gme_phrase = clean_df['fulltext'].str.contains(gme_pattern, case=False, regex=True)
clean_df['contains_gme_phrase'] = has_gme_phrase.astype(int)

In [None]:
# Mean daily upvote ratio of the cleaned posts: all posts versus posts containing a GME phrase.
upvote_ratio_mean = clean_df.groupby('created_et_date')['upvote_ratio'].mean().reset_index()
upvote_ratio_gme = (
    clean_df[clean_df['contains_gme_phrase'] == 1]
    .groupby('created_et_date')['upvote_ratio']
    .mean()
    .reset_index()
)

plt.figure(figsize=(12, 6))
plt.plot(
    upvote_ratio_mean['created_et_date'],
    upvote_ratio_mean['upvote_ratio'],
    marker='.',
    label='All Posts'
)
plt.plot(
    upvote_ratio_gme['created_et_date'],
    upvote_ratio_gme['upvote_ratio'],
    marker='.',
    linestyle='--',
    label='Posts with GME Phrase'
)

# Keep `significant_dates` a list of Timestamps: the earlier plotting cells do date
# arithmetic on it, and rebinding it to plain strings would break them when re-run.
significant_dates = [pd.to_datetime(d) for d in ['2021-01-13', '2021-01-27']]
for date in significant_dates:
    plt.axvline(date, color='grey', linestyle='--', linewidth=1)

plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
plt.xlabel('Date')
plt.ylabel('Upvote Ratio')
plt.title('Upvote Ratio by Date')
plt.xticks(rotation=90)
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Preview the hourly aggregate again before rebuilding the squeeze-window subset.
aggregated_1h.head()

In [None]:
# Rebuild the squeeze-window subset (gme_timeline == 1) of the hourly aggregate.
# An earlier cell already moves '1_hr' into the index, so an unconditional
# set_index('1_hr') here raises KeyError on a top-to-bottom run; only set it if still a column.
if '1_hr' in aggregated_1h.columns:
    aggregated_1h = aggregated_1h.set_index('1_hr')

# assign() builds a new frame instead of writing into a filtered slice
# (no SettingWithCopyWarning); later keyword arguments see the columns set by earlier ones.
filtered_df = aggregated_1h.loc[aggregated_1h['gme_timeline'] == 1].assign(
    timeint=lambda hourly: hourly['timeint'].astype(str),
    created_et_date=lambda hourly: hourly['created_et_date'].astype(str),
    newcol=lambda hourly: hourly['timeint'] + hourly['created_et_date'],
)