The purpose of this file is to extract sentiment from the preprocessed Reddit WSB comments, which have been mapped to their respective tickers. Sentiment analysis is performed using VADER.

In [None]:
!pip install pandas vaderSentiment
!pip install tqdm

In [None]:
import os
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every per-ticker comment CSV with VADER, extended with WallStreetBets
# vocabulary, and write a copy of each file with an added 'sentiment_score'
# column holding the VADER compound score of the comment body.

# Initialize VADER sentiment analyzer
vader = SentimentIntensityAnalyzer()

# Add custom WallStreetBets words to the VADER lexicon
# source https://github.com/jklepatch
wsb_words = {
    'citron': -4.0,
    # 'hidenburg' looks like a misspelling of 'hindenburg'; the original key is
    # kept and the corrected spelling is added so that correctly spelled
    # comments are scored as well.
    'hidenburg': -4.0,
    'hindenburg': -4.0,
    'moon': 4.0,
    'highs': 2.0,
    'mooning': 4.0,
    'long': 2.0,
    'short': -2.0,
    'call': 4.0,
    'calls': 4.0,
    'put': -4.0,
    'puts': -4.0,
    'break': 2.0,
    'tendie': 2.0,
    'tendies': 2.0,
    'town': 2.0,
    'overvalued': -3.0,
    'undervalued': 3.0,
    'buy': 4.0,
    'sell': -4.0,
    'gone': -1.0,
    'gtfo': -1.7,
    'paper': -1.7,
    'bullish': 3.7,
    'bearish': -3.7,
    'bagholder': -1.7,
    'stonk': 1.9,
    'green': 1.9,
    'money': 1.2,
    'print': 2.2,
    'rocket': 2.2,
    'bull': 2.9,
    'bear': -2.9,
    'pumping': -1.0,
    'sus': -3.0,
    'offering': -2.3,
    'rip': -4.0,
    'downgrade': -3.0,
    'upgrade': 3.0,
    'maintain': 1.0,
    'pump': 1.9,
    'hot': 1.5,
    'drop': -2.5,
    'rebound': 1.5,
    'crack': 2.5,
}

# Update VADER lexicon with WSB words (overrides any existing entries)
vader.lexicon.update(wsb_words)

# Input and output directories - Google Colab was used, hence the filepaths
input_dir = "/content/drive/MyDrive/FYP/reddit part/ticker_csvs"
output_dir = "/content/drive/MyDrive/FYP/reddit part/ticker_sentiment_csvs"
os.makedirs(output_dir, exist_ok=True)

# Process each CSV file in the input directory.
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the log output) reproducible between runs.
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".csv"):
        continue

    print(f"Processing {filename}...")

    # Load the CSV file
    filepath = os.path.join(input_dir, filename)
    df = pd.read_csv(filepath)

    # Ensure the 'body' column exists
    if 'body' not in df.columns:
        print(f"Skipping {filename}: 'body' column not found.")
        continue

    # Calculate the compound sentiment score for each comment.
    # str() guards against non-string cells (e.g. missing bodies read as NaN).
    df['sentiment_score'] = df['body'].apply(
        lambda text: vader.polarity_scores(str(text))['compound']
    )

    # Save the updated dataframe under the same file name in the output dir
    output_filepath = os.path.join(output_dir, filename)
    df.to_csv(output_filepath, index=False)
    print(f"Finished processing {filename}. Saved to {output_filepath}.")

print(f"Sentiment analysis completed. Results saved in '{output_dir}' directory.")


In [None]:
import os
import pandas as pd
from tqdm import tqdm

# Attach the daily mean Reddit VADER sentiment of each ticker to the articles
# dataset. For every per-ticker sentiment CSV, comment scores are averaged per
# calendar day and written to the 'reddit vader sentiment' column of the
# articles that share the same ticker and date. Progress is saved after every
# ticker so an interrupted run can be resumed.

# File paths
articles_file = "/content/drive/MyDrive/FYP/articles_with_tickers_with_monthly_avg_volume_v2.csv"
reddit_sentiment_dir = "/content/drive/MyDrive/FYP/reddit part/ticker_sentiment_csvs"
output_file = "/content/drive/MyDrive/FYP/articles_with_reddit_sentiment_all.csv"

# Name of the column that receives the per-day Reddit sentiment
SENTIMENT_COL = 'reddit vader sentiment'


def safe_parse_date(date):
    """Return *date* formatted as DD/MM/YYYY, or None when it cannot be parsed.

    Dates are parsed day-first with per-value format inference, so a column
    may mix several date layouts.
    """
    try:
        parsed = pd.to_datetime(date, format='mixed', dayfirst=True)
    except (ValueError, TypeError, OverflowError):
        return None
    # Missing inputs parse to NaT/None, which cannot be formatted.
    if pd.isna(parsed):
        return None
    return parsed.strftime('%d/%m/%Y')


# Load the articles dataset
articles_df = pd.read_csv(articles_file)

# Ensure the sentiment column exists. NaN (rather than '') keeps the column
# numeric; both are written to CSV as an empty field.
if SENTIMENT_COL not in articles_df.columns:
    articles_df[SENTIMENT_COL] = float('nan')

# Load existing progress if output file exists
if os.path.exists(output_file):
    updated_articles_df = pd.read_csv(output_file)
    if SENTIMENT_COL not in updated_articles_df.columns:
        updated_articles_df[SENTIMENT_COL] = float('nan')
    # The output file always contains every article row, so the mere presence
    # of a ticker says nothing about progress. Only tickers that already hold
    # at least one sentiment value are treated as processed. Tickers that were
    # processed but had no matching dates are simply processed again.
    already_scored = updated_articles_df[SENTIMENT_COL].notna()
    processed_tickers = set(updated_articles_df.loc[already_scored, 'ticker'].unique())
else:
    updated_articles_df = articles_df.copy()
    processed_tickers = set()

# List all Reddit sentiment files
reddit_files = [f for f in os.listdir(reddit_sentiment_dir) if f.endswith(".csv")]

# Process each Reddit CSV
for reddit_file in tqdm(reddit_files, desc="Processing Reddit Sentiment Files"):
    # The file name (without extension) is the ticker symbol
    ticker = os.path.splitext(reddit_file)[0]

    # Skip already processed tickers
    if ticker in processed_tickers:
        print(f"Skipping already processed ticker: {ticker}")
        continue

    print(f"Processing ticker: {ticker}...")

    # Load the Reddit sentiment data for the ticker
    reddit_df = pd.read_csv(os.path.join(reddit_sentiment_dir, reddit_file))

    # Normalise comment dates and drop rows whose date could not be parsed
    reddit_df['Formatted_Date'] = reddit_df['comment_date'].apply(safe_parse_date)
    reddit_df = reddit_df[reddit_df['Formatted_Date'].notna()]

    # Articles for the current ticker. An explicit copy is taken because
    # columns are assigned below; writing to a filtered slice is unreliable.
    ticker_articles_df = articles_df[articles_df['ticker'] == ticker].copy()

    # Convert article dates to the same DD/MM/YYYY format. Unparseable dates
    # become missing instead of aborting the whole run, mirroring how Reddit
    # dates are handled above.
    ticker_articles_df['Formatted_Date'] = pd.to_datetime(
        ticker_articles_df['Formatted_Date'],
        format='mixed',
        dayfirst=True,
        errors='coerce',
    ).dt.strftime('%d/%m/%Y')

    # Keep only Reddit comments posted on a date that also has an article
    on_article_dates = reddit_df['Formatted_Date'].isin(ticker_articles_df['Formatted_Date'])
    reddit_common_df = reddit_df[on_article_dates]

    # Average sentiment score per date (Series indexed by Formatted_Date)
    average_sentiments = reddit_common_df.groupby('Formatted_Date')['sentiment_score'].mean()

    # Map the average sentiment scores onto the articles; dates without any
    # Reddit comments stay missing
    ticker_articles_df[SENTIMENT_COL] = ticker_articles_df['Formatted_Date'].map(average_sentiments)

    # Update the main DataFrame. Assignment aligns on the row index, which the
    # articles table and the progress table share.
    updated_articles_df.loc[
        updated_articles_df['ticker'] == ticker, SENTIMENT_COL
    ] = ticker_articles_df[SENTIMENT_COL]

    # Save progress after processing each ticker
    updated_articles_df.to_csv(output_file, index=False)
    print(f"Progress saved for ticker: {ticker}")

print(f"Sentiment analysis completed for all tickers. Final data saved to: {output_file}")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Compare article sentiment with Reddit VADER sentiment: print their
# correlation, then draw a scatter plot and overlaid density plots.

# File path
file_path = "/content/drive/MyDrive/FYP/articles_with_reddit_sentiment_all.csv"

# Load the dataset
df = pd.read_csv(file_path)

# Filter rows where both 'Sentiment_Score' and 'reddit vader sentiment' are not NaN
filtered_df = df[df['reddit vader sentiment'].notna() & df['Sentiment_Score'].notna()]

# Calculate correlation: off-diagonal entry of the 2x2 correlation matrix
correlation = filtered_df[['Sentiment_Score', 'reddit vader sentiment']].corr().iloc[0, 1]
print(f"Correlation between 'Sentiment_Score' and 'reddit vader sentiment': {correlation:.2f}")

# Scatter plot to visualize the relationship
plt.figure(figsize=(8, 6))
sns.scatterplot(
    x='Sentiment_Score',
    y='reddit vader sentiment',
    data=filtered_df,
    alpha=0.6,
    edgecolor=None,
)
plt.title(f"Scatter Plot: Article Sentiment vs Reddit Sentiment\nCorrelation = {correlation:.2f}")
plt.xlabel('Article Sentiment Score')
plt.ylabel('Reddit Vader Sentiment')
# Dashed reference lines at zero split the plot into sentiment quadrants
plt.axhline(0, color='gray', linestyle='--', linewidth=0.7)
plt.axvline(0, color='gray', linestyle='--', linewidth=0.7)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Density plot to compare sentiment distributions.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
plt.figure(figsize=(8, 6))
sns.kdeplot(
    filtered_df['Sentiment_Score'], label='Article Sentiment Score', fill=True
)
sns.kdeplot(
    filtered_df['reddit vader sentiment'], label='Reddit Vader Sentiment', fill=True
)
plt.title("Density Plot: Article Sentiment vs Reddit Sentiment")
plt.xlabel("Sentiment Score")
plt.ylabel("Density")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Count how often article sentiment and Reddit sentiment agree or disagree in
# sign, over the rows that carry both scores.

# Location of the merged article / Reddit sentiment dataset
file_path = "/content/drive/MyDrive/FYP/articles_with_reddit_sentiment_all.csv"

# Read the merged data
df = pd.read_csv(file_path)

# Keep only rows where both scores are present
has_both_scores = df['Sentiment_Score'].notna() & df['reddit vader sentiment'].notna()
filtered_df = df[has_both_scores]

# Sign of each score; a score of exactly 0 belongs to neither group
article_is_positive = filtered_df['Sentiment_Score'] > 0
article_is_negative = filtered_df['Sentiment_Score'] < 0
reddit_is_positive = filtered_df['reddit vader sentiment'] > 0
reddit_is_negative = filtered_df['reddit vader sentiment'] < 0

# The four sign combinations
positive_article_positive_reddit = article_is_positive & reddit_is_positive
positive_article_negative_reddit = article_is_positive & reddit_is_negative
negative_article_positive_reddit = article_is_negative & reddit_is_positive
negative_article_negative_reddit = article_is_negative & reddit_is_negative

# Number of rows in each combination (True counts as 1)
count_positive_positive = positive_article_positive_reddit.sum()
count_positive_negative = positive_article_negative_reddit.sum()
count_negative_positive = negative_article_positive_reddit.sum()
count_negative_negative = negative_article_negative_reddit.sum()

# Report the tallies
print("Number of cases:")
print(f"+ve Article Sentiment and +ve Reddit Sentiment: {count_positive_positive}")
print(f"+ve Article Sentiment and -ve Reddit Sentiment: {count_positive_negative}")
print(f"-ve Article Sentiment and +ve Reddit Sentiment: {count_negative_positive}")
print(f"-ve Article Sentiment and -ve Reddit Sentiment: {count_negative_negative}")
