In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [11]:
import time
import pandas as pd
import praw
from datetime import datetime, date
from psaw import PushshiftAPI
import string
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
up_ratio = 0.75  # min post upvote ratio
post_ups = 20  # min # upvotes on post
cmt_ups = 2  # min # upvotes on comment
top_n_stocks = 5  # number of most mentioned stocks to consider
posts_perday = 1000 # Number of posts to consider for each day

# Modify these: Year, Month, Day
start_date = int(datetime(2022,4,1).timestamp())
end_date = int(datetime(2022,4,6).timestamp())

r = praw.Reddit(
    user_agent="sunflora",
    client_id="fQHJvnTwnElH2AOxXIt4nw",
    client_secret="UW3GeW8iCJCHbly-hYqd7Cgscx72Jw"
)
reddit = PushshiftAPI(r)

In [4]:
# Stock data adapted from https://github.com/jklepatch/eattheblocks/blob/master/screencast/290-wallstreetbets-sentiment-analysis/data.py
# csv file from https://www.nasdaq.com/market-activity/stocks/screener?exchange=nasdaq&letter=0&render=download\

stock_screener = pd.read_csv("nasdaq_screener_1649302163756.csv")
stocks = []
for i in range(stock_screener.shape[0]):
    stocks.append(stock_screener['Symbol'][i])

# blacklist words that might be confused as stock names
blacklist = {'I', 'ELON', 'WSB', 'THE', 'A', 'ROPE', 'YOLO', 'TOS', 'CEO', 'DD', 'IT', 'OPEN', 'ATH', 'PM', 'IRS', 'FOR','DEC', 'BE', 'IMO', 'ALL', 'RH', 'EV', 'TOS', 'CFO', 'CTO', 'DD', 'BTFD', 'WSB', 'OK', 'PDT', 'RH', 'KYS', 'FD', 'TYS', 'US', 'USA', 'IT', 'ATH', 'RIP', 'BMW', 'GDP', 'OTM', 'ATM', 'ITM', 'IMO', 'LOL', 'AM', 'BE', 'PR', 'PRAY', 'PT', 'FBI', 'SEC', 'GOD', 'NOT', 'POS', 'FOMO', 'TL;DR', 'EDIT', 'STILL', 'WTF', 'RAW', 'PM', 'LMAO', 'LMFAO', 'ROFL', 'EZ', 'RED', 'BEZOS', 'TICK', 'IS', 'PM', 'LPT', 'GOAT', 'FL', 'CA', 'IL', 'MACD', 'HQ', 'OP', 'PS', 'AH', 'TL', 'JAN', 'FEB', 'JUL', 'AUG', 'SEP', 'SEPT', 'OCT', 'NOV', 'FDA', 'IV', 'ER', 'IPO', 'MILF', 'BUT', 'SSN', 'FIFA', 'USD', 'CPU', 'AT', 'GG', 'Mar','ARE','GO',
             'ON','J','VERY','REAL','FAST','ANY','GET','UK','HAS','CAN','IQ'}

# adding words to update the dictionary of SentimentIntensityAnalyzer() based on reddit
new_words = {
    'citron': -4.0,
    'hidenburg': -4.0,
    'moon': 4.0,
    'highs': 2.0,
    'mooning': 4.0,
    'long': 2.0,
    'short': -2.0,
    'call': 4.0,
    'calls': 4.0,
    'put': -4.0,
    'puts': -4.0,
    'break': 2.0,
    'tendie': 2.0,
     'tendies': 2.0,
     'town': 2.0,
     'overvalued': -3.0,
     'undervalued': 3.0,
     'buy': 4.0,
     'sell': -4.0,
     'gone': -1.0,
     'gtfo': -1.7,
     'paper': -1.7,
     'bullish': 3.7,
     'bearish': -3.7,
     'bagholder': -1.7,
     'stonk': 1.9,
     'green': 1.9,
     'money': 1.2,
     'print': 2.2,
     'rocket': 2.2,
     'bull': 2.9,
     'bear': -2.9,
     'pumping': -1.0,
     'sus': -3.0,
     'offering': -2.3,
     'rip': -4.0,
     'downgrade': -3.0,
     'upgrade': 3.0,
     'maintain': 1.0,
     'pump': 1.9,
     'hot': 1.5,
     'drop': -2.5,
     'rebound': 1.5,
     'crack': 2.5,}

In [5]:
# clean up comments
def clean(cmt_string):
    punctuations = cmt_string.translate(str.maketrans('', '', string.punctuation))  # get rid of punctuations
    return punctuations


# limit = number of posts
def get_posts(start_date, end_date, limit):
    # We set by default some useful columns
    posts = list(reddit.search_submissions(
        subreddit='wallstreetbets',
        after=start_date,
        before=end_date,
        limit=limit
    ))
    return posts

In [6]:
def get_picks(start, end, limit):
    """
    :param start: Starting epoch
    :param end: Ending epoch
    :param limit: Number of posts to retrieve
    :return: dictionary: {top n_stocks: list of comments about that stock}
    """

    cmt_list = []  # stores all text
    relevant_comments = []
    stock_count = {}  # stores stock_name:count
    stock_cmts = {}  # stores stock_name: [comments]
    pick_cmts = {}  # stock_cmts for top n stocks
    
    posts = get_posts(start, end, limit)  # retrieve posts
    
    for submission in posts:
        if submission.score > post_ups:
            cmt_list.append(clean(submission.title))

            if submission.selftext != "":
                cmt_list.append(clean(submission.selftext))

            submission.comments.replace_more(limit=10)  # Number of more_comment objects to replace
            for comment in submission.comments.list():  # get comments + replies
                if comment.score > cmt_ups:
                    cmt_list.append(clean(comment.body))


    for cmt in cmt_list:
        word_list = cmt.split()
        for word in word_list:
            if word.isupper() and word in stocks and word not in blacklist:
                relevant_comments.append(cmt)

                if word not in stock_count:
                    stock_count[word] = 1
                    stock_cmts[word] = [cmt]

                else:
                    stock_count[word] += 1
                    stock_cmts[word].append(cmt)

    sorted_stock_count = dict(sorted(stock_count.items(), key=lambda item: item[1], reverse=True))
    picks = list(sorted_stock_count.keys())[0:top_n_stocks]

    for st in picks:
        pick_cmts[st] = stock_cmts[st]

    return pick_cmts

In [7]:
def sentiment_score(comment_dict):
    """
    :param comment_dict: dictionary of {stock_name: [list of stock comments]}
    :return: dictionary of {stock_name: dictionary of {sentiment:score}}
    """
    
    jack = SentimentIntensityAnalyzer()
    jack.lexicon.update(new_words)

    top_picks = list(comment_dict.keys())
    score_dict = dict.fromkeys(top_picks)
    for x in score_dict:  # stock = x
        x_comments = comment_dict[x]  # all relevant comments of stock x
        score_dict[x] = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}
        for c in x_comments:
            sentiment_dict = jack.polarity_scores(c)
            for key in sentiment_dict.keys():
                score_dict[x][key] += sentiment_dict[key]

        # need to average each entry in score_dict
        for k in score_dict[x]:
            score_dict[x][k] = score_dict[x][k]/len(x_comments)

    return score_dict


In [8]:
df_list = [] # List of data frames generated
df_commentDicts = {} # Date : comment_dictionary

starting_time = time.time()
for i in range(int(start_date), int(end_date)+86400,86400):
    curr_date = datetime.fromtimestamp(i).strftime('%Y-%m-%d')
    print(f'Now working on posts from {curr_date}...')
    comment_dictionary = get_picks(i, i+86400, posts_perday)
    df = pd.DataFrame(sentiment_score(comment_dictionary))
    df.index = ['Bearish', 'Neutral', 'Bullish', 'Total_Compound']
    df = df.T
    df.index.name = 'stock'
    dates = [i for x in range(top_n_stocks)]
    df['Date'] = dates
    df_list.append(df)
    df_commentDicts[i] = comment_dictionary
    diff = time.time() - starting_time
    print(f'{curr_date} completed in {diff} seconds')


Now working on posts from 2022-04-01...
2022-04-01 completed in 313.79141902923584 seconds
Now working on posts from 2022-04-02...
2022-04-02 completed in 755.1503059864044 seconds
Now working on posts from 2022-04-03...
2022-04-03 completed in 916.0593628883362 seconds
Now working on posts from 2022-04-04...
2022-04-04 completed in 1425.9793539047241 seconds
Now working on posts from 2022-04-05...
2022-04-05 completed in 1678.0168387889862 seconds
Now working on posts from 2022-04-06...
2022-04-06 completed in 2104.2222838401794 seconds


In [9]:
final_df = pd.concat(df_list,axis=0) # Concatenate all the dataframes for each day
final_df.reset_index() 

Unnamed: 0,stock,Bearish,Neutral,Bullish,Total_Compound,Date
0,AMD,0.137845,0.654931,0.207207,0.146643,1648785600
1,TSLA,0.103,0.711044,0.185978,0.173713,1648785600
2,TLRY,0.162314,0.660914,0.1768,-0.041674,1648785600
3,NVDA,0.2485,0.600143,0.151357,-0.182171,1648785600
4,AAPL,0.147333,0.71925,0.133417,0.084917,1648785600
5,TSLA,0.08836,0.75816,0.15352,0.151744,1648872000
6,WISH,0.10685,0.6062,0.28695,0.427565,1648872000
7,HMHC,0.057571,0.727857,0.214571,0.713243,1648872000
8,CLOV,0.17925,0.573,0.2475,0.246575,1648872000
9,TLRY,0.04025,0.86575,0.094,-0.0596,1648872000


In [10]:
# Save the dataframe as a csv file
x = datetime.fromtimestamp(start_date).strftime('%Y-%m-%d')
y = datetime.fromtimestamp(end_date).strftime('%Y-%m-%d')
final_df.to_csv(f'{x}-{y}.csv')

In [12]:
def daily_csv():
    today = date.today()
    y = today.year
    m = today.month
    d = today.day

    debut = int(datetime(y, m, d).timestamp()) - 86400
    fin = int(datetime(y, m, d).timestamp())

    comment_dictionary = get_picks(debut, fin, posts_perday)
    df = pd.DataFrame(sentiment_score(comment_dictionary))
    df.index = ['Bearish', 'Neutral', 'Bullish', 'Total_Compound']
    df = df.T
    df.index.name = 'stock'
    dates = [debut for x in range(top_n_stocks)]
    df['Date'] = dates
    df.reset_index()

    # Save the dataframe as a csv file
    x = datetime.fromtimestamp(start_date).strftime('%Y-%m-%d')
    y = datetime.fromtimestamp(end_date).strftime('%Y-%m-%d')
    df.to_csv(f'{str(today)}[1].csv')

    return comment_dictionary, df

In [13]:
stock_comments, df = daily_csv()

In [16]:
for stock in stock_comments:
    for comment in stock_comments[stock]:
        print(comment)
        print("****")

I know how you feel I was on the HMHC threads having fun and then flooded “hah idiots it’s over because xyz” when xyz was specifically KNOWN to be part of the play Either that or they just completely misunderstood the situation
****
Ive been checking the SEC filings page for HMHC and there are still no updates I dont think well find out until closer to market  open
****
Little gamble on HMHC guess I’ll post some loss porn if it goes to 0
****
HMHC HALT UPDATE
****
Houghton Mifflin Harcourt Successfully Completes Sale to Veritas Capital HMHC

HOUGHTON MIFFLIN HARCOURT COMPLETES SALE TO VERITAS CAPITAL

lol they actually did it
****
Decided to analyze the option chain all 5 expirations and all contracts for any signs of institutional positioning which are usually transacted at substantial metrics above the averagesenjoy degens no skin personally in HMHC

x200B

Timestamp estContractTradeValueOver Average

03282022 124441204CALL  225 on 202206174637  0251159k27658
04042022 102058673CALL  

ValueError: All arrays must be of the same length

In [21]:
import pickle

with open('daily_comments.pkl', 'wb') as f:
    pickle.dump(stock_comments, f)