In [35]:
import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import emoji
import re
import numpy as np

In [72]:
# Load the scraped comments, then clean them step by step. The cleaned
# 'Comment Body' strings are what the sentiment libraries consume further down.
banned_users = ["VisualMod", "[deleted]"]  # bots, spammers, deleted accounts, etc.

inputdata = (
    pd.read_csv("wsb_comments_2024-10-11.csv", header=[0])
    # Drop rows with a missing comment or username BEFORE any per-row string
    # processing: the emoji removal below is applied to every value and
    # expects a string, not the float NaN pandas uses for empty CSV cells.
    .dropna(subset=['Comment Body', 'Username'])
    # Strip surrounding whitespace so the banned-user comparison is exact.
    .assign(Username=lambda frame: frame['Username'].str.strip())
)

# .copy() detaches the filtered rows from the original frame so the column
# assignments that follow do not raise SettingWithCopyWarning.
inputdata = inputdata[~inputdata['Username'].isin(banned_users)].copy()

# Remove Unicode emoji from the comment text before processing.
inputdata['Comment Body'] = inputdata['Comment Body'].apply(lambda s: emoji.replace_emoji(s, ''))
# Function to clean reddit style emoticons text
def clean_comment_text(text):
    """Remove Reddit inline emote markup from a comment and trim whitespace.

    Emotes appear in the raw text as ``![img](emote|t5_<subreddit id>|<emote id>)``,
    e.g. ``![img](emote|t5_2th52|4271)``. Every such token is deleted.

    Args:
        text: The raw comment text. Non-string values (e.g. ``None`` or the
            float NaN pandas uses for missing cells) are returned unchanged so
            a later ``dropna`` can still find and remove them.

    Returns:
        The comment with all emote tokens removed and leading/trailing
        whitespace stripped, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # The emote id is matched with \d+ rather than a fixed digit count so that
    # ids of any length are removed, not only 4- and 5-digit ones.
    cleaned_text = re.sub(r'!\[img\]\(emote\|t5_[a-zA-Z0-9]+\|\d+\)', '', text)
    return cleaned_text.strip()  # Strip any leading/trailing spaces

# Discard rows without a comment or an author before running the per-row
# string cleaner, so it only ever sees real text.
inputdata = inputdata.dropna(subset=['Comment Body', 'Username'])

# Remove Reddit emote markup. clean_comment_text() already strips leading and
# trailing whitespace from its result, so no extra rstrip pass is needed.
inputdata['Comment Body'] = inputdata['Comment Body'].apply(clean_comment_text)

# Example list of NASDAQ and NYSE tickers
tickers = ['AAPL', 'GOOGL', 'TSLA', 'AMZN', 'MSFT']

def count_tickers(comment, tickers):
    """Count whole-word mentions of each ticker symbol in a comment.

    The comment is upper-cased before matching, so mentions are found
    regardless of the case they were typed in. A ticker only counts when it
    is delimited by word boundaries.

    Args:
        comment: The comment text to search.
        tickers: Iterable of ticker symbols to look for.

    Returns:
        A dict mapping each ticker that occurs at least once to its number of
        occurrences; tickers with no occurrence are left out.
    """
    shouted = comment.upper()
    # Pair every symbol with its number of word-bounded hits in the comment.
    tallies = (
        (symbol, len(re.findall(r'\b' + re.escape(symbol) + r'\b', shouted)))
        for symbol in tickers
    )
    # Keep only the symbols that were actually mentioned.
    return {symbol: hits for symbol, hits in tallies if hits > 0}

# Count ticker mentions per comment; each cell of the new column holds the
# {ticker: count} dict returned by count_tickers().
inputdata['Ticker Counts'] = inputdata['Comment Body'].apply(lambda comment: count_tickers(comment, tickers))

# Despite its name, `commentdictionary` is the 'Comment Body' column (a pandas
# Series). The name is kept because other cells in this notebook reference it.
commentdictionary = inputdata['Comment Body']
# Plain Python list of comment strings for the sentiment analyzers below.
commentlist = commentdictionary.tolist()
textblob_results_list = []
vader_results_list = []

# Build the VADER analyzer once and reuse it for every comment instead of
# constructing a new analyzer on each loop iteration.
vader_analyzer = SentimentIntensityAnalyzer()

for comment in commentlist:
    # TextBlob based sentiment analysis.
    # Polarity ranges from -1.0 (negative) to 1.0 (positive).
    # Subjectivity ranges from 0.0 (very objective) to 1.0 (very subjective).
    # .sentiment yields both values from a single TextBlob instance.
    textblob_sentiment = TextBlob(comment).sentiment
    textblob_results_list.append({
        "TextBlob Polarity Score": textblob_sentiment.polarity,
        "TextBlob Subjectivity Score": textblob_sentiment.subjectivity,
    })

    # VADER based sentiment analysis.
    # Each result is a dict with four scores: 'neg', 'neu', 'pos' and
    # 'compound'. The compound score is the overall score, normalized between
    # -1 (most extreme negative) and 1 (most extreme positive).
    vader_results_list.append(vader_analyzer.polarity_scores(comment))

# One row per comment, in comment order. Naming the columns explicitly keeps
# the frames well-formed (and the column lookups below valid) even when there
# are no comments at all.
textblobresults = pd.DataFrame(
    textblob_results_list,
    columns=["TextBlob Polarity Score", "TextBlob Subjectivity Score"],
)
vaderresults = pd.DataFrame(vader_results_list, columns=['neg', 'neu', 'pos', 'compound'])

# Give inputdata a fresh 0..n-1 index so it lines up row-for-row with the
# freshly built result frames (which already carry a default 0..n-1 index).
inputdata = inputdata.reset_index(drop=True)

inputdata['TextBlob Polarity Score'] = textblobresults['TextBlob Polarity Score']
inputdata['TextBlob Subjectivity Score'] = textblobresults['TextBlob Subjectivity Score']
inputdata['Vader Negative Polarity Score'] = vaderresults['neg']
inputdata['Vader Neutral Polarity Score'] = vaderresults['neu']
inputdata['Vader Positive Polarity Score'] = vaderresults['pos']
inputdata['Vader Compound Polarity Score'] = vaderresults['compound']

# Sanity check: every comment must have received a sentiment row.
assert len(textblobresults) == len(vaderresults) == len(inputdata)

# `file` is kept as a name for the exported frame in case other cells use it.
file = inputdata
file.to_csv('wsb_sentiment.csv', index=True, index_label="Index")
print("Done")

Done


In [70]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# This cell reads the cleaned comments from `inputdata`, which is built by the
# sentiment cell of this notebook; run that cell first.

# N-gram sizes to extract, as (min_n, max_n). (4, 4) extracts 4-grams only;
# use (1, 1) for unigrams or (2, 3) for bigrams plus trigrams.
NGRAM_RANGE = (4, 4)
vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE)

# Fit the vectorizer to the 'Comment Body' and transform the text into a
# sparse document-by-n-gram count matrix
ngrams = vectorizer.fit_transform(inputdata['Comment Body'])

# Get the feature names (n-grams) and their total counts over all comments.
# The column sums are taken on the sparse matrix directly; converting it to a
# dense array first would allocate comments x n-grams cells just to add them up.
ngram_features = vectorizer.get_feature_names_out()
ngram_counts = np.asarray(ngrams.sum(axis=0)).ravel()

# Combine n-grams with their counts into a DataFrame
ngram_df = pd.DataFrame({'n-gram': ngram_features, 'count': ngram_counts})

# Sort by count to see the most frequent n-grams
ngram_df = ngram_df.sort_values(by='count', ascending=False)

# Display the top 10 most frequent n-grams
print(ngram_df.head(10))

# Optionally, save n-gram data to CSV
ngram_df.to_csv('wsb_ngrams.csv', index=False)


                     n-gram  count
15292      what the fuck is      5
11487  someone in the crowd      4
1443       at the same time      4
6335       in the room with      4
12928      the room with us      3
10053       put the balm on      3
16012      you can think of      3
13055        the way to the      3
9215          on the way to      3
10870      scan the qr code      3


In [75]:
import yfinance as yf

# Build the list of ticker symbols to search for in comments. This does not
# download an exchange listing: yf.Tickers only wraps the symbols it is given.
# NOTE(review): this rebinds `tickers`, which the sentiment cell also defines
# and uses — confirm which definition is meant to win and the run order.
TICKER_SYMBOLS = 'AAPL MSFT TSLA'  # Add tickers you want, separated by spaces
# Only real symbols go in the string: a literal '...' placeholder would be
# treated as one more symbol and end up in `tickers`.
nasdaq_tickers = yf.Tickers(TICKER_SYMBOLS)
tickers = list(nasdaq_tickers.tickers.keys())

In [58]:
# Sanity check: number of comments held in `commentdictionary`.
len(commentdictionary)

1948

In [55]:
# Sanity check: number of VADER result rows. One row is appended per comment,
# so this should equal len(commentdictionary).
len(vaderresults)

1948

In [18]:
# NOTE(review): repeats the len(commentdictionary) sanity check from an
# earlier cell; one of the two can probably be deleted.
len(commentdictionary)

1948

In [56]:
# Display the VADER scores: one row per comment with the columns
# 'neg', 'neu', 'pos' and 'compound'.
vaderresults

Unnamed: 0,neg,neu,pos,compound
0,0.000,1.000,0.000,0.0000
1,0.000,1.000,0.000,0.0000
2,0.135,0.611,0.253,0.4019
3,0.000,1.000,0.000,0.0000
4,0.000,0.667,0.333,0.4576
...,...,...,...,...
1943,0.000,0.800,0.200,0.3612
1944,0.125,0.875,0.000,-0.4588
1945,0.000,1.000,0.000,0.0000
1946,0.000,1.000,0.000,0.0000


In [22]:
# Display every comment.
# NOTE(review): the saved output is a dict repr from an earlier execution; the
# sentiment cell currently binds this name to the 'Comment Body' column.
commentdictionary

{0: 'Take a shot every time Elon says "next year"',
 1: 'Going downtown to have zero girls talk to me',
 2: 'Self respect: Gone\n\n&nbsp;\n\nDignity: Decimated\n\n&nbsp;\n\nAccount: Deep Red\n\n&nbsp;\n\nHeterosexuality: Shattered\n\n&nbsp;\n\nBear Thesis: Acquired',
 3: 'There are 56 trading days left in the year.\n\nYou only need to make $17,857.14 per day to make it to a million.\n\nYou trained for this, you got this!\n\nBelieve in yourself! Make it happen!',
 4: '$TSLA opening flat would be fucking hilarious',
 5: 'Elon going to bring Hawk tuah girl into the Robotaxi event to demonstrate',
 6: 'I buy offbrand cereal to save 40 cents at the same time I’m yoloing $3k worth of 0DTEs',
 7: 'This is the dumbest shit I’ve ever watched, and Ive watched Joker 2',
 8: 'Puts on Tesla seemed way too obvious. I look forward to seeing how much I missed out on.',
 9: 'Clover Health PPO Medicare Advantage Plans Earn 4 Star Rating for 2025',
 10: 'Need to quickly turn 100 into 50k to fix my life, 

In [77]:
# List each distinct comment author once, in order of first appearance.
print(inputdata.loc[:, 'Username'].unique())

['WSB_Mods_are' 'longGERN' 'Innocent-Brat' 'ExceptionallyGreat'
 'broncocannon' 'BushLov3r' 'Wall_St_Bussy' 'yaboiRich'
 'YoIForgotMyPassAgain' 'ProfessorAkaliOnYT' 'SignalSalamander' 'CPA-hole'
 'No-Situation-2001' 'xeuropa' 'SensationalSeas' 'Fafa0098' 'tortoisepump'
 'GoZukkYourself' 'GraceBoorFan' 'Ok-Geologist5545' 'P-Diddy-69'
 'Kevontee324' 'LouieM13' 'Fit_Combination6988' 'Timely_Wafer2294' 'ilwb'
 'pharmadawg' 'NotSoTough-Tony' 'cinJESUS' 'WPG_Strong'
 'Buying_thefkindips' 'AffectionateBird8477' '9tacos' 'igotshrimps'
 'S-Club-Party' 'Such-Ice1325' 'ThatOneDrunkUncle' 'nateyp123'
 'FedPrinter69420' 'romt_25' 'hepukesyoudie' 'jangofett27' 'SwiftSG1'
 'Paulruswasdead' 'ChampagnePapi-' 'IncomingAxofKindness' 'NVDAismygod'
 'walrighti' 'convexdominance6' 'Slick_MF_iG' 'conflicted_humanist'
 'Pura112' 'marrful' 'BoneEvasion' 'westcoastlink' 'changez1'
 'HospitialHeadache' 'Dark_Overlord335786' 'Kooky-Letterhead1387'
 'Who_is_Your_Zaddy' 'mysuruhuduga' 'optionsCone' 'kk7766' 'bullrf

In [99]:
# Spot-check the author of the row labelled 1858.
inputdata.loc[1858, 'Username']

'-rigga'

In [67]:
# Spot-check the comment stored under key/label 156.
commentdictionary[156]

'Remember when Fisker bagholders were optimistic about TSLA buying out Fisker'

In [57]:
# Spot-check the comment stored under key/label 0.
commentdictionary[0]

'Take a shot every time Elon says "next year"'