## 1. Imports and config

In [1]:
import datetime as dt
import logging
import re
import warnings
from urllib.parse import quote_plus

import pandas as pd
import yaml
from psaw import PushshiftAPI
from sqlalchemy import create_engine, types
from tqdm import tqdm
#import numpy as np

from data_extraction_utils import data_prep_posts, join_submission_title_and_body, find_stock_symbols
warnings.filterwarnings("ignore")

In [2]:
# Append timestamped log records of level INFO and above to submissions.log
logging.basicConfig(
    filename="submissions.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

## 2. Database configuration

In [3]:
# Read the database settings from config.yaml
with open("../../config.yaml", "r") as yamlconfig:
    # NOTE(review): yaml.safe_load would suffice if config.yaml only holds plain scalars/mappings
    config = yaml.load(yamlconfig, Loader=yaml.FullLoader)

# Pull the individual connection settings out of the db_config section
db_config = config["db_config"]
postgres_username = db_config["postgres_username"]
postgres_password = db_config["postgres_password"]
postgres_address = db_config["postgres_address"]
postgres_port = db_config["postgres_port"]
postgres_dbname = db_config["postgres_dbname"]

# Percent-encode the credentials: characters such as "@", ":" or "/" in a
# password would otherwise be parsed as URL delimiters and break the connection string.
# str() covers credentials that YAML parsed as numbers.
postgres_str = (
    f"postgresql://{quote_plus(str(postgres_username))}:{quote_plus(str(postgres_password))}"
    f"@{postgres_address}:{postgres_port}/{postgres_dbname}"
)

# Create the database engine with sqlalchemy
cnx = create_engine(postgres_str)

## 3. Create stock symbol list

In [4]:
# Read the ticker symbols from the "symbol" column of the Excel sheet
symbol_column = pd.read_excel("../../data/external/stock_ticker_list.xlsx")["symbol"]
stock_ticker_list = symbol_column.tolist()

# Second list with every symbol prefixed by a dollar sign, e.g. "GME" -> "$GME"
stock_ticker_list_with_dollar_sign = [f"${symbol}" for symbol in stock_ticker_list]

# Union of plain and dollar-prefixed symbols (a set, despite the variable name)
final_stock_ticker_list = {*stock_ticker_list, *stock_ticker_list_with_dollar_sign}

## 4. Extract Submissions from reddit

In [5]:
# Date range to extract from the API: one epoch timestamp per day, both ends included
start = dt.datetime.strptime("09-05-2022", "%d-%m-%Y")
end = dt.datetime.strptime("23-05-2022", "%d-%m-%Y")
# The datetimes are naive, so .timestamp() interprets them in the local timezone
day_offsets = range((end - start).days + 1)
timestamp_list = [int((start + dt.timedelta(days=offset)).timestamp()) for offset in day_offsets]

# Reddit API Object
# PushshiftAPI client (from psaw) created with default arguments; it is passed
# to data_prep_posts in the extraction loop.
api = PushshiftAPI()

In [6]:
# Fields to request from the API (identical for every day, so defined once)
filters = ['author', 'created_utc', 'title', "selftext", "score"]

# Pair each timestamp with its successor: one (start, end) window per day
day_windows = list(zip(timestamp_list[:-1], timestamp_list[1:]))

# Loop over the daily windows; pbar is kept so its description can show the current day
for start_date, end_date in (pbar := tqdm(day_windows)):
    day_label = dt.datetime.fromtimestamp(start_date)

    # Call API function to retrieve data from reddit as a dataframe
    df = data_prep_posts(api, "wallstreetbets", start_time=start_date, end_time=end_date, filters=filters)

    # An empty result would (presumably) lack the title/selftext columns and
    # raise a KeyError below, so skip the day and record it in the log instead
    if df.empty:
        logging.warning(f"0 rows returned by API - Date: {day_label}")
        pbar.set_description(f"{day_label}")
        continue

    # Join submission title and body to "post"
    df["post"] = df[["title", "selftext"]].apply(lambda x: join_submission_title_and_body(*x), axis=1)

    # Transform timestamp into datetime column (fromtimestamp yields local time)
    df["created_at"] = df["created_utc"].apply(lambda x: dt.datetime.fromtimestamp(x))

    # Get stock symbol from reddit post if available
    df["stock_symbol"] = df["post"].apply(lambda x: find_stock_symbols(x, final_stock_ticker_list))

    # Drop unused columns and rename the score column as preparation for the database
    df = (
        df
        .drop(columns=["created_utc", "selftext", "title", "created"])
        .rename(columns={'score': 'num_up_votes'})
    )

    # Write dataframe to database (rows are appended to the existing table)
    df.to_sql("r_wallstreetbets", cnx, index=False, if_exists="append")

    # Logging
    logging.info(f"{df.shape[0]} rows written to DB - Date: {day_label}")
    pbar.set_description(f"{day_label}")


2022-05-22 00:00:00: 100%|██████████| 14/14 [01:48<00:00,  7.74s/it]


In [6]:
df

Unnamed: 0,author,num_up_votes,type,post,created_at,stock_symbol
0,must_be_funny_bot,1,submission,The pillar of my destroyed meme-folio,2022-01-01 23:56:53,
1,Bobdaman_1989,1,submission,Am I the only one that is scared to death abou...,2022-01-01 23:56:29,
2,justramos69,1,submission,Attapoll referral bonus,2022-01-01 23:54:20,
3,ChillinwithDillon,1,submission,ES bustin 4800 next week 🤞,2022-01-01 23:54:03,{ES}
4,Broad_Tradition_98,1,submission,Lets Teach the Hedge Funds A Lesson [C3.ai](ht...,2022-01-01 23:50:11,"{AI, A}"
...,...,...,...,...,...,...
433,illusionist_iv,1,submission,The first pic is how my YOLO Tesla contracts a...,2022-01-01 00:13:25,{TSLA}
434,Upper_Combination_46,1,submission,Want to learn more I have surface level knowle...,2022-01-01 00:09:03,
435,illusionist_iv,1,submission,The first pic is how my YOLO Tesla contracts a...,2022-01-01 00:06:37,{TSLA}
436,FortSquidward69,1,submission,WSB,2022-01-01 00:03:52,


In [9]:
# Example: the same ticker found both with and without the dollar prefix
found_stock_raw = {"$GME", "GME"}
print(found_stock_raw)

# Removing "$" from every entry collapses both spellings into a single symbol
# (str.replace leaves entries without "$" unchanged)
found_stock_processed = {symbol.replace("$", "") for symbol in found_stock_raw}

print(found_stock_processed)

{'GME', '$GME'}
{'GME'}


In [10]:
post = set("Original TSLA DD - Part 2 - Update and Outlook $EBS $BAVA".split())

In [15]:
# Tokens of the post that are known ticker symbols (with or without "$").
# Sorting makes the pick reproducible; list(set)[0] depends on set ordering.
matching_symbols = sorted(post.intersection(final_stock_ticker_list))

# None when the post mentions no known symbol (indexing [0] would raise IndexError)
found_stock = matching_symbols[0].replace("$", "") if matching_symbols else None

In [10]:
# An empty dict is falsy, so the fallback text is printed
my_ser = {}

print(my_ser if my_ser else "not found")

not found


In [66]:
# Alternation over all ticker symbols, compiled once and reused for the search.
# NOTE: the symbols are neither escaped nor anchored to word boundaries, so a
# short symbol can match inside an ordinary word.
words_re = re.compile("|".join(stock_ticker_list))

match = words_re.search('Original TSLA DD - Part 2 - Update and Outlook $EBS $BAVA')

# Print the position of the first hit, if there is one
if match is not None:
    print(match.start())

0


In [21]:
((15*52 / 8) - 30) / 5

13.5

In [75]:
"GOEV" in "|".join(stock_ticker_list)

False

In [63]:
post = "Original TSLA DD - Part 2 - Update and Outlook $EBS $BAVA"

# First hit of the plain ticker alternation anywhere in the post
ticker_alternation = "|".join(stock_ticker_list)
match = re.search(ticker_alternation, post)
if match is not None:
    print("found!")
    print(match.group())

found!
O


In [2]:
post = "Original Monkeypox DD - Part 2 - Update and Outlook $EBS $BAVA"

# A dollar sign followed by one to ten uppercase letters
cashtag_re = re.compile(r'\$[A-Z]{1,10}')
match = cashtag_re.search(post)
if match is not None:
    print("found!")
    print(match.group())

found!
$EBS


In [3]:
len(" $GME ")

6

In [37]:
timestamp_list = pd.date_range(start="2022-01-01",end="2022-01-02").to_list()

In [47]:
# Print every timestamp except the first one (the second element of each consecutive pair)
for window_end in timestamp_list[1:]:
    print(window_end)

2022-01-02 00:00:00


In [52]:
start = dt.datetime.strptime("01-01-2022", "%d-%m-%Y")
end = dt.datetime.strptime("11-01-2022", "%d-%m-%Y")
# One epoch timestamp per day from start up to, but excluding, end
span_days = (end - start).days
timestamp_list = [int((start + dt.timedelta(days=offset)).timestamp()) for offset in range(span_days)]
timestamp_list

[1640991600,
 1641078000,
 1641164400,
 1641250800,
 1641337200,
 1641423600,
 1641510000,
 1641596400,
 1641682800,
 1641769200]

In [17]:
api = PushshiftAPI()              # Object of the API

In [18]:
def data_prep_posts(subreddit: str, start_time, end_time, filters: list[str]):
    """Fetch submissions of one subreddit and return them as a DataFrame.

    Uses the notebook-level ``api`` object instead of taking it as a parameter.
    NOTE: this definition shadows the ``data_prep_posts`` imported from
    ``data_extraction_utils``, which the extraction loop calls with ``api`` as
    its first argument.

    Args:
        subreddit: Name of the subreddit to query.
        start_time: Passed to the API as ``after`` (start of the window).
        end_time: Passed to the API as ``before`` (end of the window).
        filters: Field names to retrieve for every submission.

    Returns:
        DataFrame built from the search results, with an additional column
        ``type`` set to ``"submission"`` on every row.
    """
    query = {
        "subreddit": subreddit,
        "after": start_time,
        "before": end_time,
        "filter": filters,
    }
    submissions = pd.DataFrame(list(api.search_submissions(**query)))
    return submissions.assign(type="submission")

In [53]:
# Query parameters for a single exploratory request
subreddit = "wallstreetbets"                             # subreddit we are auditing
start_time = int(dt.datetime(2021, 1, 2).timestamp())    # starting date for our search
end_time = int(dt.datetime(2021, 1, 3).timestamp())      # ending date for our search
filters = ['author', 'created_utc', 'title', "selftext", "score"]

# Fetch submissions for a brief analysis.
# NOTE: the call passes literal epoch values, not start_time/end_time from above.
df_p = data_prep_posts(subreddit, 1640991600, 1641078000, filters)

In [42]:
def join_submission_title_and_body(title: str, body: str):
    """Combine a submission's title and body into a single text.

    The body is appended to the title, separated by one space, unless it
    carries no content of its own: a missing value (``None``/NaN), an empty or
    whitespace-only string, or one of the placeholders ``"[removed]"`` /
    ``"[deleted]"``. In those cases the title is returned unchanged, so the
    result never ends in a dangling space or a literal ``"nan"``.

    Args:
        title: Submission title.
        body: Submission body (selftext); may be missing.

    Returns:
        ``"<title> <body>"`` when the body has content, otherwise ``title``.
    """
    # Missing bodies arrive as None or NaN (a float) rather than as a string
    if not isinstance(body, str):
        return title

    # Placeholders and blank bodies add nothing to the post text
    if body.strip() in {"", "[removed]", "[deleted]"}:
        return title

    return f"{title} {body}"

In [54]:
df_p

Unnamed: 0,author,created_utc,score,selftext,title,created,d_,type
0,must_be_funny_bot,1641077813,1,,The pillar of my destroyed meme-folio,1.641071e+09,"{'author': 'must_be_funny_bot', 'created_utc':...",submission
1,Bobdaman_1989,1641077789,1,[removed],Am I the only one that is scared to death abou...,1.641071e+09,"{'author': 'Bobdaman_1989', 'created_utc': 164...",submission
2,justramos69,1641077660,1,[removed],Attapoll referral bonus,1.641070e+09,"{'author': 'justramos69', 'created_utc': 16410...",submission
3,ChillinwithDillon,1641077643,1,,ES bustin 4800 next week 🤞,1.641070e+09,"{'author': 'ChillinwithDillon', 'created_utc':...",submission
4,Broad_Tradition_98,1641077411,1,[C3.ai](https://C3.ai) (AI) was trading at abo...,Lets Teach the Hedge Funds A Lesson,1.641070e+09,"{'author': 'Broad_Tradition_98', 'created_utc'...",submission
...,...,...,...,...,...,...,...,...
433,illusionist_iv,1640992405,1,,The first pic is how my YOLO Tesla contracts a...,1.640985e+09,"{'author': 'illusionist_iv', 'created_utc': 16...",submission
434,Upper_Combination_46,1640992143,1,I have surface level knowledge of trading and ...,Want to learn more,1.640985e+09,"{'author': 'Upper_Combination_46', 'created_ut...",submission
435,illusionist_iv,1640991997,1,,The first pic is how my YOLO Tesla contracts a...,1.640985e+09,"{'author': 'illusionist_iv', 'created_utc': 16...",submission
436,FortSquidward69,1640991832,1,,WSB,1.640985e+09,"{'author': 'FortSquidward69', 'created_utc': 1...",submission


In [22]:
df_p["post"] = df_p[["title", "selftext"]].apply(lambda x: join_submission_title_and_body(*x), axis=1)

In [23]:
df_p
# filter correct columns

Unnamed: 0,author,created_utc,id,selftext,title,created,d_,type,post
0,Keep_It_Toasty,1609628291,kp7v52,,Good scripture reading for the day #StonksOnly...,1609621091.0,"{'author': 'Keep_It_Toasty', 'created_utc': 16...",submission,Good scripture reading for the day #StonksOnly...
1,DesperateSalad5981,1609628267,kp7uve,,Thank you based House for providing us GME buy...,1609621067.0,"{'author': 'DesperateSalad5981', 'created_utc'...",submission,Thank you based House for providing us GME buy...
2,Loess_inspired,1609628134,kp7tfw,,Reddit knows I started trading options thanks WSB,1609620934.0,"{'author': 'Loess_inspired', 'created_utc': 16...",submission,Reddit knows I started trading options thanks ...
3,KYJELLYTIME69,1609628119,kp7t9u,I've been noticing some GME bears coming out o...,"GME DD from a retard bull - The credit market,...",1609620919.0,"{'author': 'KYJELLYTIME69', 'created_utc': 160...",submission,"GME DD from a retard bull - The credit market,..."
4,ouroboros2decimal718,1609628054,kp7sga,,🌈🐻’s should thank RH for the DD,1609620854.0,"{'author': 'ouroboros2decimal718', 'created_ut...",submission,🌈🐻’s should thank RH for the DD
...,...,...,...,...,...,...,...,...,...
626,thetrollfromabove,1609542233,kol9iu,[removed],just bought 10 000 shares of MGM by accident a...,1609535033.0,"{'author': 'thetrollfromabove', 'created_utc':...",submission,just bought 10 000 shares of MGM by accident a...
627,NejKaj,1609542226,kol9fs,[removed],I want to be a degenerate,1609535026.0,"{'author': 'NejKaj', 'created_utc': 1609542226...",submission,I want to be a degenerate
628,The-Saint-Lee,1609542217,kol9bk,,How my 2020 went (started April 27),1609535017.0,"{'author': 'The-Saint-Lee', 'created_utc': 160...",submission,How my 2020 went (started April 27)
629,jeffreymeltz,1609542052,kol7ca,,The Golden Egg,1609534852.0,"{'author': 'jeffreymeltz', 'created_utc': 1609...",submission,The Golden Egg


In [24]:
df_p["selftext"].value_counts()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [25]:
dt.datetime(2021, 5, 2).day

2

In [55]:
df_p

Unnamed: 0,author,created_utc,score,selftext,title,created,d_,type
0,must_be_funny_bot,1641077813,1,,The pillar of my destroyed meme-folio,1.641071e+09,"{'author': 'must_be_funny_bot', 'created_utc':...",submission
1,Bobdaman_1989,1641077789,1,[removed],Am I the only one that is scared to death abou...,1.641071e+09,"{'author': 'Bobdaman_1989', 'created_utc': 164...",submission
2,justramos69,1641077660,1,[removed],Attapoll referral bonus,1.641070e+09,"{'author': 'justramos69', 'created_utc': 16410...",submission
3,ChillinwithDillon,1641077643,1,,ES bustin 4800 next week 🤞,1.641070e+09,"{'author': 'ChillinwithDillon', 'created_utc':...",submission
4,Broad_Tradition_98,1641077411,1,[C3.ai](https://C3.ai) (AI) was trading at abo...,Lets Teach the Hedge Funds A Lesson,1.641070e+09,"{'author': 'Broad_Tradition_98', 'created_utc'...",submission
...,...,...,...,...,...,...,...,...
433,illusionist_iv,1640992405,1,,The first pic is how my YOLO Tesla contracts a...,1.640985e+09,"{'author': 'illusionist_iv', 'created_utc': 16...",submission
434,Upper_Combination_46,1640992143,1,I have surface level knowledge of trading and ...,Want to learn more,1.640985e+09,"{'author': 'Upper_Combination_46', 'created_ut...",submission
435,illusionist_iv,1640991997,1,,The first pic is how my YOLO Tesla contracts a...,1.640985e+09,"{'author': 'illusionist_iv', 'created_utc': 16...",submission
436,FortSquidward69,1640991832,1,,WSB,1.640985e+09,"{'author': 'FortSquidward69', 'created_utc': 1...",submission


In [27]:
print(dt.datetime.fromtimestamp(start_time))
print(dt.datetime.fromtimestamp(end_time))

2021-01-02 00:00:00
2021-01-03 00:00:00


In [56]:
# Convert epoch seconds to datetimes. The column is overwritten in place, so the
# guard keeps a second run of this cell from failing on already-converted values.
if pd.api.types.is_numeric_dtype(df_p["created_utc"]):
    df_p["created_utc"] = df_p["created_utc"].apply(lambda x: dt.datetime.fromtimestamp(x))

In [57]:
df_p

Unnamed: 0,author,created_utc,score,selftext,title,created,d_,type
0,must_be_funny_bot,2022-01-01 23:56:53,1,,The pillar of my destroyed meme-folio,1.641071e+09,"{'author': 'must_be_funny_bot', 'created_utc':...",submission
1,Bobdaman_1989,2022-01-01 23:56:29,1,[removed],Am I the only one that is scared to death abou...,1.641071e+09,"{'author': 'Bobdaman_1989', 'created_utc': 164...",submission
2,justramos69,2022-01-01 23:54:20,1,[removed],Attapoll referral bonus,1.641070e+09,"{'author': 'justramos69', 'created_utc': 16410...",submission
3,ChillinwithDillon,2022-01-01 23:54:03,1,,ES bustin 4800 next week 🤞,1.641070e+09,"{'author': 'ChillinwithDillon', 'created_utc':...",submission
4,Broad_Tradition_98,2022-01-01 23:50:11,1,[C3.ai](https://C3.ai) (AI) was trading at abo...,Lets Teach the Hedge Funds A Lesson,1.641070e+09,"{'author': 'Broad_Tradition_98', 'created_utc'...",submission
...,...,...,...,...,...,...,...,...
433,illusionist_iv,2022-01-01 00:13:25,1,,The first pic is how my YOLO Tesla contracts a...,1.640985e+09,"{'author': 'illusionist_iv', 'created_utc': 16...",submission
434,Upper_Combination_46,2022-01-01 00:09:03,1,I have surface level knowledge of trading and ...,Want to learn more,1.640985e+09,"{'author': 'Upper_Combination_46', 'created_ut...",submission
435,illusionist_iv,2022-01-01 00:06:37,1,,The first pic is how my YOLO Tesla contracts a...,1.640985e+09,"{'author': 'illusionist_iv', 'created_utc': 16...",submission
436,FortSquidward69,2022-01-01 00:03:52,1,,WSB,1.640985e+09,"{'author': 'FortSquidward69', 'created_utc': 1...",submission


In [13]:
"""FOR COMMENTS"""
def data_prep_comments(term, start_time, end_time, filters, limit):
    """Search comments matching a term and return them as a DataFrame.

    Uses the notebook-level ``api`` object.

    Args:
        term: Search term passed to the API as ``q``. This is a text query,
            not a subreddit name, so hits are not restricted to one subreddit.
        start_time: Passed to the API as ``after`` (start of the window).
        end_time: Passed to the API as ``before`` (end of the window).
        filters: Field names to retrieve; ``None`` or an empty list selects
            the default fields below.
        limit: Maximum number of comments to retrieve.

    Returns:
        DataFrame built from the search results.
    """
    # Fall back to a useful default set of columns; `not filters` also covers
    # None, for which len() would raise a TypeError
    if not filters:
        filters = ['author', 'created_utc',
                   'body', 'permalink', "score"]

    comments = list(api.search_comments(
        q=term,                 # Term to search for
        after=start_time,       # Start date
        before=end_time,        # End date
        filter=filters,         # Column names we want to retrieve
        limit=limit))           # Max number of comments
    return pd.DataFrame(comments)  # Return dataframe for analysis

In [16]:
"""Here we are going to get comments for a brief analysis"""
# Search parameters for the comment request
term = 'wallstreetbets'     # term we want to search for
limit = 1000                # number of elements
filters = []                # empty list: data_prep_comments substitutes its default columns

# Build the comments dataframe; start_time / end_time come from an earlier cell
df_c = data_prep_comments(term, start_time, end_time, filters, limit)

In [17]:
df_c

Unnamed: 0,author,body,created_utc,id,permalink,score,created,d_
0,AleksAFG,Congrats and fuck you. Hopefully I'll be there...,1609541996,ghrq1lb,/r/Bitcoin/comments/kobrs1/my_friends_mocked_m...,1,1.609535e+09,"{'author': 'AleksAFG', 'body': 'Congrats and f..."
1,seb_dm,/r wallstreetbets,1609541987,ghrq111,/r/nevertellmetheodds/comments/ko6ets/the_last...,1,1.609535e+09,"{'author': 'seb_dm', 'body': '/r wallstreetbet..."
2,AutoModerator,This post was flaired as a YOLO so it's on the...,1609541639,ghrpea5,/r/wallstreetbets/comments/kol2v7/to_dee_moon/...,1,1.609534e+09,"{'author': 'AutoModerator', 'body': 'This post..."
3,brodega,"Yeah, all these poor people should just become...",1609541379,ghroxdm,/r/funny/comments/kogdjd/walmart_has_a_plan_fo...,15,1.609534e+09,"{'author': 'brodega', 'body': 'Yeah, all these..."
4,AutoModerator,This post was flaired as a YOLO so it's on the...,1609541147,ghroi46,/r/wallstreetbets/comments/kokxko/golden_egg/g...,1,1.609534e+09,"{'author': 'AutoModerator', 'body': 'This post..."
...,...,...,...,...,...,...,...,...
478,AutoModerator,Your post has been removed because a single li...,1609455994,ghnh6e9,/r/wallstreetbets/comments/ko0556/the_truth_ab...,1,1.609449e+09,"{'author': 'AutoModerator', 'body': 'Your post..."
479,AutoModerator,This post was flaired as a YOLO so it's on the...,1609455976,ghnh5ad,/r/wallstreetbets/comments/ko04yx/gme_yolo_upd...,1,1.609449e+09,"{'author': 'AutoModerator', 'body': 'This post..."
480,talesofstocks,"lol Oh no, the downvotes from reddit users in ...",1609455819,ghngw1u,/r/wallstreetbets/comments/knus1d/it_iz_what_i...,0,1.609449e+09,"{'author': 'talesofstocks', 'body': 'lol Oh no..."
481,AutoModerator,"Sir, this is the unemployment line.\n\n*I am a...",1609455753,ghngs64,/r/wallstreetbets/comments/ko02mo/crm_salesfor...,1,1.609449e+09,"{'author': 'AutoModerator', 'body': 'Sir, this..."


In [46]:
df_c

Unnamed: 0,author,body,created_utc,id,permalink,subreddit,created,d_
0,AleksAFG,Congrats and fuck you. Hopefully I'll be there...,1609541996,ghrq1lb,/r/Bitcoin/comments/kobrs1/my_friends_mocked_m...,Bitcoin,1.609538e+09,"{'author': 'AleksAFG', 'body': 'Congrats and f..."
1,seb_dm,/r wallstreetbets,1609541987,ghrq111,/r/nevertellmetheodds/comments/ko6ets/the_last...,nevertellmetheodds,1.609538e+09,"{'author': 'seb_dm', 'body': '/r wallstreetbet..."
2,AutoModerator,This post was flaired as a YOLO so it's on the...,1609541639,ghrpea5,/r/wallstreetbets/comments/kol2v7/to_dee_moon/...,wallstreetbets,1.609538e+09,"{'author': 'AutoModerator', 'body': 'This post..."
3,brodega,"Yeah, all these poor people should just become...",1609541379,ghroxdm,/r/funny/comments/kogdjd/walmart_has_a_plan_fo...,funny,1.609538e+09,"{'author': 'brodega', 'body': 'Yeah, all these..."
4,AutoModerator,This post was flaired as a YOLO so it's on the...,1609541147,ghroi46,/r/wallstreetbets/comments/kokxko/golden_egg/g...,wallstreetbets,1.609538e+09,"{'author': 'AutoModerator', 'body': 'This post..."
...,...,...,...,...,...,...,...,...
478,AutoModerator,Your post has been removed because a single li...,1609455994,ghnh6e9,/r/wallstreetbets/comments/ko0556/the_truth_ab...,wallstreetbets,1.609452e+09,"{'author': 'AutoModerator', 'body': 'Your post..."
479,AutoModerator,This post was flaired as a YOLO so it's on the...,1609455976,ghnh5ad,/r/wallstreetbets/comments/ko04yx/gme_yolo_upd...,wallstreetbets,1.609452e+09,"{'author': 'AutoModerator', 'body': 'This post..."
480,talesofstocks,"lol Oh no, the downvotes from reddit users in ...",1609455819,ghngw1u,/r/wallstreetbets/comments/knus1d/it_iz_what_i...,wallstreetbets,1.609452e+09,"{'author': 'talesofstocks', 'body': 'lol Oh no..."
481,AutoModerator,"Sir, this is the unemployment line.\n\n*I am a...",1609455753,ghngs64,/r/wallstreetbets/comments/ko02mo/crm_salesfor...,wallstreetbets,1.609452e+09,"{'author': 'AutoModerator', 'body': 'Sir, this..."


In [47]:
# TODO: get the comments of a given submission by its id and join them together in the database