In [None]:
#The purpose of this script is to analyze the relationship between mentions of a company on Reddit and the stock price
#it was created to help quantify the meme stock movement in 2021, specifically with respect to GameStop (GME)
#GameStop is hardcoded as the company being analyzed however it could be leveraged for any company
#It was creating using Pushshift, which no longer appears to work because Reddit decided to make it's API less accessible
#This code creates the dataset that feeds this "GME Mentions" dashboard so the final result can still be viewed here: https://public.tableau.com/views/GMEMentions/GMEStockvs_RedditMentions?:language=en-US&:sid=&:redirect=auth&:display_count=n&:origin=viz_share_link

#References that supported the creation of this code:
#https://www.jcchouinard.com/how-to-use-reddit-api-with-python/
#https://github.com/pushshift/api#using-the-subreddit-aggregation 
#https://reddit-api.readthedocs.io/en/latest/
#https://github.com/Watchful1/Sketchpad/blob/master/postDownloader.py

import plotly
import requests
import pandas as pd
from datetime import datetime
import time
import dateutil
import yfinance as yf
import numpy as np
import os

#Designate a filepath to save results to as an option to save time
output_path = os.environ['output_path']


C:/Users/matth/OneDrive/Documents/OAuth2.json


In [5]:
#The purpose of this function is to pull Reddit pushshift data through the API for one API call

def get_pushshift_data(data_type, **kwargs):
    """
    Gets data from the pushshift api.
 
    data_type can be 'comment' or 'submission'
    The rest of the args are interpreted as payload.
 
    Read more: https://github.com/pushshift/api
    """
 
    base_url = f"https://api.pushshift.io/reddit/search/{data_type}/"
    payload = kwargs
    request = requests.get(base_url, params=payload)
    
    #Cleaning things up to make move to a data frame
    data = request.json()
    
    #Return the final result
    return data

In [6]:
#The purpose of this section is to loop and incrementally pull all the Reddit data from pushshift and consolidate into one dataframe

data_type="submission"            # give me comments, use "submission" to publish something
query=" GME|Gamestop|$GME|"       # Add your query
duration="12d"                    # Select the timeframe. Epoch value or Integer + "s,m,h,d" (i.e. "second", "minute", "hour", "day")
size=1000                         # maximum comments
sort_type="score"                 # Sort by score (Accepted: "score", "num_comments", "created_utc")
sort="desc"                       # sort descending
aggs="subreddit"                  #"author", "link_id", "created_utc", "subreddit" #seems to be disabled
subreddit='wallstreetbets'
before = ""

df = pd.DataFrame()
counter = 0

#we are going to loop until there is no more data so just do a while loop and break it when we run out of data below
while True:
    data = get_pushshift_data(data_type=data_type,
                          q=query,
                          after=duration,
                          size=size,
                          before=before,
                          subreddit=subreddit)

    #get the next batch of data
    new_df = pd.DataFrame.from_records(data.get("data"))
    
    if new_df.shape[0] == 0:
        print("No more data on loop " + str(counter))
        break
        
    # take the final row (oldest entry)
    row = new_df.iloc[len(new_df)-1]
    
    #Create a variable so we know to pull data after this time in the next batch
    duration = row['created_utc']+1
    
    # append new_df to data
    df = df.append(new_df, ignore_index=True)
    
    #Slow down so we don't get rate limited
    time.sleep(5)

    #increse the counter and let the user know what number we are on
    counter = counter + 1
    print(counter)
    
#df['Created_Date'] = ''

#Loop to create the created_Date column, couldn't get this to work in one line
#for index, row in df.iterrows():
    
    #print(row['created_utc'])
#    row['Created_Date'] = datetime.fromtimestamp(row['created_utc']).strftime("%Y-%m-%d")

#This adds the created date as an actual date we can use
df['Created_Date'] = df.apply(lambda row: pd.Timestamp(datetime.fromtimestamp(row['created_utc']).strftime("%Y-%m-%d")), axis = 1) 

#This creates a new dataframe that groups the total records by date, using title as an anchor assuming that will always
#be present
df_Date = df.groupby(['Created_Date'])['title'].count().to_frame()

#Similar to sthe stock df below this makes the created_date an actual field to reference
df_Date = df_Date.reset_index()

#write the data to a csv for analysis as needed
df.to_csv(output_path + "Reddit_GMEAllData2.csv")
df_Date.to_csv(output_path + "Reddit_GMESummary2.csv")


TypeError: object of type 'NoneType' has no len()

In [None]:
#Pull the stock prices through yfinance

#Pull GME data
stock = yf.Ticker('GME')

#start at 11/23 because that's when we have Reddit data through
prices = stock.history(start="2021-06-28")

#I don't fully know what this does but it allows us to chart and moves the Date field from metadata to a field we can reference
prices = prices.reset_index()

#write the prices down to a csv file for analysis as needed
prices.to_csv(output_path + "Reddit_GMEPrices2.csv")

In [None]:
#This cell will merge the stock and Reddit data and create moving 3 day averages

rollwin = 3

#Before we merge, it will be easiest to do the moving average on stock prices first because the dates are more uneven
#as there are reddit mentions daily
#This would work better as it's own function that both areas are referencing rather than doing this process twice
prices['pandas_SMA_3_P'] = prices.iloc[:,1].rolling(window=rollwin).mean()
prices['SMA_pct_P'] = 0

#calculate the x-day moving average based on the rollwin variable
for ind in prices.index:
    if ind > rollwin-1:
        prices.loc[ind,'SMA_pct_P'] = prices.loc[ind,'Close']/prices.loc[ind-1,'pandas_SMA_3_P']-1

#Merge the stock and reddit data - outer join on date since we have reddit data daily but stock data only on trading days
#Combine the stock prices and the reddit mentions
df_all = df_Date.merge(prices,how='outer',left_on='Created_Date',right_on='Date')

#Create a new column to hold our combined date
df_all['EffDt'] = 0

#Populate the new date column with either the Created_Date or the Date from the stock price data to get one consolidated date field
for i in df_all.index:
    if pd.isnull(df_all.loc[i,'Created_Date']):
        df_all.loc[i,'EffDt'] = df_all.loc[i,'Date']
    else:
        df_all.loc[i,'EffDt'] = df_all.loc[i,'Created_Date']

#Sort on EffDt
df_all = df_all.sort_values('EffDt')
        
#Keep only the columns we need
df_all = df_all[['EffDt','title','Close','Volume','pandas_SMA_3_P','SMA_pct_P']]
df_all = df_all.rename(columns = {'title':'Reddit_Mentions'})

#********This is creating the X day moving average and creating the percentage change******************
df_all['pandas_SMA_3_RM'] = df_all.iloc[:,1].rolling(window=rollwin).mean()
df_all['SMA_pct_RM'] = 0

#I'm looping this because I want to compare a given row with the previous row's moving 3 day average.
#If we do a 3 day moving average including the current row then an increase or decrease for the current row might dilute the
#impact of a change
for ind in df_all.index:
    if ind > rollwin-1:
        df_all.loc[ind,'SMA_pct_RM'] = df_all.loc[ind,'Reddit_Mentions']/df_all.loc[ind-1,'pandas_SMA_3_RM']-1

df_all.to_csv(output_path + "Reddit_GMECombinedV2.csv")

In [None]:
#Create a visual of the data

import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=prices["Date"], y=prices["Close"], name="GME data"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=df_Date["Created_Date"], y=df_Date["title"], name="WSB mentions data"),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="GME Price vs. Reddit Mentions"
)

# Set x-axis title
fig.update_xaxes(title_text="Date")

# Set y-axes titles
fig.update_yaxes(title_text="<b>primary</b> GME Stock Price", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> WSB Mentions", secondary_y=True)

#show the chart
fig.show()