## 1. Imports and config

In [3]:
import datetime as dt
import warnings
import yaml
import logging
import pywhatkit
import pandas as pd
from psaw import PushshiftAPI
from tqdm import tqdm
from sqlalchemy import create_engine
from data_extraction_utils import data_prep_posts, join_submission_title_and_body, find_stock_symbols, data_prep_comments
warnings.filterwarnings("ignore")

In [2]:
# Specify logging settings
logging.basicConfig(filename='comments.log', level=logging.INFO, filemode="a", format="%(asctime)s - %(levelname)s - %(message)s")

## 2. Database configuration

In [3]:
# Reading form config.yaml"
with open("../../config.yaml", "r") as yamlconfig:
    config = yaml.load(yamlconfig, Loader=yaml.FullLoader)

# Create postgres string with db-config
postgres_username = config["db_config"]["postgres_username"]
postgres_password = config["db_config"]["postgres_password"]
postgres_address = config["db_config"]["postgres_address"]
postgres_port = config["db_config"]["postgres_port"]
postgres_dbname = config["db_config"]["postgres_dbname"]

postgres_str = f"postgresql://{postgres_username}:{postgres_password}@{postgres_address}:{postgres_port}/{postgres_dbname}"

# create db connection with sqlalchemy
cnx = create_engine(postgres_str)

## 3. Create stock symbol list

In [4]:
# load stock list and transform it into a set
stock_ticker_list = pd.read_excel("../../data/external/stock_ticker_list.xlsx")["symbol"].to_list()

# Modified second stock list with dollar sign
stock_ticker_list_with_dollar_sign = [f"${ticker}" for ticker in stock_ticker_list]

# Combine the two lists to one set
final_stock_ticker_list = set(stock_ticker_list + stock_ticker_list_with_dollar_sign)

## 4. Extract Submissions from reddit

In [5]:
# Set date range to extract the data from the API
start = dt.datetime.strptime("25-01-2021", "%d-%m-%Y")
end = dt.datetime.strptime("01-01-2022", "%d-%m-%Y")
timestamp_list = [int((start + dt.timedelta(days=x)).timestamp()) for x in range(0, (end-start).days +1)]

# Reddit API Object
api = PushshiftAPI()

In [6]:
try:
    # loop over list of timestamps
    for i in (pbar := tqdm(range(len(timestamp_list) -1))):

        # Set start date and end date
        start_date = timestamp_list[i]
        end_date = timestamp_list[i + 1]

        # Specify data to extract from reddit
        filters = ['author', 'created_utc', "score", "body"]

        # Call API function to retrieve data from reddit as a dataframe
        df = data_prep_comments(api, "wallstreetbets", start_time=start_date, end_time=end_date, filters=filters)

        # Transform timestamp into datetime column
        df["created_at"] = df["created_utc"].apply(lambda x: dt.datetime.fromtimestamp(x))

        # Get stock symbol from reddit post if available
        df["stock_symbol"] = df["body"].apply(lambda x: find_stock_symbols(x, final_stock_ticker_list))

        # Drop unused columns
        df.drop(columns=["created_utc", "created"], inplace=True)

        # Rename columns as preparation for database
        df.rename(columns={'score': 'num_up_votes', "body": "post"}, inplace=True)

        # Write dataframe to database
        df.to_sql("r_wallstreetbets", cnx, index=False, if_exists="append")

        # Logging
        logging.info(f"{df.shape[0]} rows written to DB - Date: {dt.datetime.fromtimestamp(start_date)}")
        pbar.set_description(f"{dt.datetime.fromtimestamp(start_date)}")

except Exception as e:
    now = dt.datetime.now().time() # time object
    pywhatkit.sendwhatmsg("+4915737884472", f"ERROR - {e}", time_hour=now.hour, time_min=now.minute+1)


2021-01-24 00:00:00:   7%|▋         | 24/365 [5:39:11<80:19:22, 847.98s/it]  


KeyError: 'created_utc'