<a href="https://colab.research.google.com/github/Lizzy-g54/GME-sentiment-analysis/blob/main/notebooks/GME_Sentiment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install yfinance pandas numpy

import pandas as pd
import yfinance as yf
import os
from google.colab import drive

# 1. MOUNT GOOGLE DRIVE
# Outputs are written under the mounted Drive so they outlive the Colab session.
drive.mount('/content/drive')

# Single Drive folder that receives every CSV produced by this cell;
# it is created (and announced) only when it is not there yet.
PROJECT_PATH = "/content/drive/My Drive/GME_Sentiment_Project"
if os.path.exists(PROJECT_PATH):
    pass  # folder is already present, nothing to create
else:
    os.makedirs(PROJECT_PATH)
    print(f"Created project folder at: {PROJECT_PATH}")

# 2. CONFIGURATION
# Ticker symbol and the date window shared by the market and Reddit steps below.
TICKER = "GME"
START_DATE = "2021-01-01"
END_DATE = "2021-02-28"

# 3. FETCH MARKET DATA (Yahoo Finance)
def fetch_market_data():
    """
    Fetches GME price and volume.
    Calculates Volatility_Intraday as a derived attribute[cite: 25].

    Reads the module-level TICKER, START_DATE, END_DATE and PROJECT_PATH.

    Returns:
        DataFrame with the columns Date, ClosePrice, Volume and
        Volatility_Intraday, where Volatility_Intraday = (High - Low) / Open.
        The same frame is written to "gme_market_data.csv" under PROJECT_PATH.

    Raises:
        ValueError: if the download returns no rows, so that an empty CSV is
            never saved for the later steps to pick up.
    """
    print(f"Fetching {TICKER} data...")

    # yfinance treats `end` as exclusive; request one extra day so END_DATE itself
    # is part of the window (Step 4 handles its end date the same way).
    end_exclusive = (pd.Timestamp(END_DATE) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")

    # auto_adjust is passed explicitly so the prices do not depend on the default
    # of whichever yfinance version pip happened to install.
    raw = yf.download(TICKER, start=START_DATE, end=end_exclusive, auto_adjust=True)

    if raw.empty:
        raise ValueError(
            f"No market data returned for {TICKER} between {START_DATE} and {END_DATE}"
        )

    # Clean up multi-index if present
    if isinstance(raw.columns, pd.MultiIndex):
        raw.columns = raw.columns.get_level_values(0)

    # Build the output as one chain of new frames: the derived attribute [cite: 25],
    # the selected variables [cite: 25], then the rename. No in-place edits on a
    # column slice, so pandas has nothing to warn about.
    df = (
        raw.reset_index()
        .assign(Volatility_Intraday=lambda d: (d['High'] - d['Low']) / d['Open'])
        [['Date', 'Close', 'Volume', 'Volatility_Intraday']]
        .rename(columns={'Close': 'ClosePrice'})
    )

    # Save to Google Drive
    save_path = os.path.join(PROJECT_PATH, "gme_market_data.csv")
    df.to_csv(save_path, index=False)
    print(f"Market data saved to: {save_path}")
    return df

# 4. PROCESS REDDIT DATA
def process_reddit_data(local_csv_path):
    """
    Filters WallStreetBets data for GME keywords and the Jan-Feb 2021 timeline[cite: 13, 19].

    Args:
        local_csv_path: Path to the raw Reddit CSV. The file must provide a
            'timestamp' column and a 'body' text column.

    Returns:
        DataFrame holding the rows whose timestamp falls on a calendar day from
        START_DATE to END_DATE (both inclusive) and whose body mentions one of
        the keywords, or None when the CSV does not exist. The result is also
        written to "filtered_reddit_data.csv" under PROJECT_PATH.
    """
    print("Loading Reddit data (this may take a minute)...")

    # Load dataset (Make sure you upload the Kaggle CSV to Colab or Drive first)
    try:
        # All columns are read: the filtered file is saved with every column so
        # later steps can pick the ones they need [cite: 21].
        df = pd.read_csv(local_csv_path, parse_dates=['timestamp'])
    except FileNotFoundError:
        print(f"Error: Please upload your Reddit CSV to {local_csv_path}")
        return None

    # Filter by Date [cite: 13]
    # Compare calendar days, not instants: "<= END_DATE" on the raw timestamp
    # means "<= END_DATE 00:00:00" and would drop nearly all of the final day.
    post_day = pd.to_datetime(df['timestamp'], errors='coerce').dt.normalize()
    mask_date = (post_day >= START_DATE) & (post_day <= END_DATE)
    df = df.loc[mask_date].copy()

    # Filter by Keywords [cite: 13]
    # The leading word boundary stops 'gme' from matching inside unrelated words
    # such as "segment" or "judgment", while "$GME", "GME's" or "GMEEE" still match.
    keywords = ['gme', 'gamestop']
    pattern = r'\b(?:' + '|'.join(keywords) + r')'
    mask_keyword = df['body'].str.contains(pattern, case=False, regex=True, na=False)
    df_filtered = df.loc[mask_keyword].copy()

    # Save processed text data to Google Drive
    save_path = os.path.join(PROJECT_PATH, "filtered_reddit_data.csv")
    df_filtered.to_csv(save_path, index=False)
    print(f"Filtered Reddit data saved to: {save_path}")
    return df_filtered

# --- EXECUTION ---

# Get Market Data
gme_market = fetch_market_data()

# Process Reddit Data
# Note: Change the path below to wherever you stored the raw Kaggle file,
# e.g., "/content/drive/My Drive/raw_data/reddit_wsb.csv"
reddit_data = process_reddit_data("/content/reddit_wsb.csv")

print("\n--- Summary ---")
print(f"Market Data Rows: {len(gme_market)}")
# process_reddit_data returns None when the raw CSV is missing, so guard the count
# and say so explicitly instead of leaving the Reddit line out of the summary.
if reddit_data is not None:
    print(f"Reddit Data Rows: {len(reddit_data)}")
else:
    print("Reddit Data Rows: n/a (raw Reddit CSV not found)")



  df = yf.download(TICKER, start=START_DATE, end=END_DATE)
[*********************100%***********************]  1 of 1 completed

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fetching GME data...
Market data saved to: /content/drive/My Drive/GME_Sentiment_Project/gme_market_data.csv
Loading Reddit data (this may take a minute)...
Error: Please upload your Reddit CSV to /content/reddit_wsb.csv

--- Summary ---
Market Data Rows: 38





In [None]:
!pip install transformers[torch] datasets pandas scikit-learn

import pandas as pd
import torch
import os
from google.colab import drive
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder



In [None]:
# ==========================================
# GME Sentiment Analysis - Step 2 (Merged Version)
# Strategy: Merging 12-Class Taxonomy -> 4-Class (Hype, Fear, Anger, Noise)
# Model: RoBERTa-base (Better for social media text)
# ==========================================

# 1. Install necessary libraries
!pip install transformers[torch] datasets pandas scikit-learn accelerate -U

import pandas as pd
import torch
import os
import numpy as np
from google.colab import drive
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, classification_report

# 2. Mount Google Drive
# Mounting is attempted only when the mount point is not present yet.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# 3. Path Configuration (Ensure files exist in these paths)
BASE_PATH = "/content/drive/My Drive/GME_Sentiment_Project/"
# Training CSV, validation CSV, and the folder the fine-tuned model is saved to
# (a separate folder name, so an older saved model is not overwritten).
TRAIN_PATH, VAL_PATH, MODEL_SAVE_PATH = (
    os.path.join(BASE_PATH, name)
    for name in ("train_stockemo.csv", "val_stockemo.csv", "fine_tuned_roberta_4class")
)

# 4. Define 4 Broad Classes and Mapping Logic
# Collapsing the fine-grained emotions into four broad classes is the core
# strategy of this step. The position of a label in NEW_LABELS is its integer id.
NEW_LABELS = ["Hype", "Fear", "Anger", "Noise"]
label2id = dict(zip(NEW_LABELS, range(len(NEW_LABELS))))
id2label = dict(enumerate(NEW_LABELS))

# Mapping Dictionary: Original 12 Labels -> New 4 Broad Classes
# Written as (broad class, member emotions) groups and flattened into the
# emotion -> broad class lookup that process_labels consults.
MAPPING = {
    emotion: broad_class
    for broad_class, emotions in (
        # 🟢 Hype (Bullish/Excitement)
        ("Hype", ("excitement", "optimism", "belief", "amusement")),
        # 🔴 Fear (Bearish/Panic) - Core for RQ1
        ("Fear", ("panic", "anxiety", "confusion", "depression")),
        # 🟠 Anger (Frustration/Conflict)
        ("Anger", ("anger", "disgust")),
        # ⚪️ Noise (Ambiguous/Surprise)
        ("Noise", ("surprise", "ambiguous")),
    )
    for emotion in emotions
}

# 5. Data Loading and Preprocessing
print("Loading data and merging labels...")
try:
    train_df = pd.read_csv(TRAIN_PATH)
    val_df = pd.read_csv(VAL_PATH)
except FileNotFoundError as e:
    # Either file can be the missing one; report the path the error actually
    # carries and fall back to naming both when it carries none.
    missing_path = e.filename or f"{TRAIN_PATH} or {VAL_PATH}"
    print(f"❌ Error: File not found! Please check path: {missing_path}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

def process_labels(row_label):
    """Turn one raw emotion label into the integer id of its broad class.

    The value is stringified, lower-cased and stripped of surrounding
    whitespace (so e.g. 'Panic ' is still recognised) before it is looked up
    in MAPPING. Labels that MAPPING does not know yield -1.
    """
    broad_class = MAPPING.get(str(row_label).lower().strip())
    if broad_class is None:
        return -1  # unrecognised label, filtered out by the caller
    return label2id[broad_class]

# Apply mapping
# Each split gets an integer 'label' column derived from its 'emo_label' column.
train_df['label'] = train_df['emo_label'].map(process_labels)
val_df['label'] = val_df['emo_label'].map(process_labels)

# Filter out unrecognized labels (-1)
train_df = train_df.loc[train_df['label'].ne(-1)]
val_df = val_df.loc[val_df['label'].ne(-1)]

# Print merged distribution (Verify Fear sample size increased)
separator = "-" * 30
print("✅ Merging Complete!")
print(f"Training set size: {len(train_df)}")
print(separator)
print("New Class Distribution (Training Set):")
# value_counts() lists the class ids from most to least frequent.
counts = train_df['label'].value_counts()
for idx, count in counts.items():
    print(f"{id2label[idx]}: {count}")
print(separator)

# 6. Tokenization (Using RoBERTa)
# model_name is reused further down when the classification model is created.
model_name = "roberta-base"
print(f"Loading {model_name} Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize one batch of examples with the module-level tokenizer.

    The text is taken from the 'processed' column when the batch has one and
    from 'original' otherwise. Every sequence is padded or truncated to 128 tokens.
    """
    if "processed" in examples:
        texts = examples["processed"]
    else:
        texts = examples["original"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

# Convert to Hugging Face Dataset format
# Each split is wrapped as a Dataset first and then tokenized batch-wise.
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 7. Model Initialization
print("Initializing Model...")
# The classification head is sized to the four broad classes, and both label
# lookups are stored in the model config (Step 3 reads id2label back from it).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(NEW_LABELS),
)

# 8. Training Setup
# Define metric computation function
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    eval_pred unpacks into (logits, labels); the predicted class of each row is
    the position of its largest logit. Returns {"accuracy": <score>}.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_ids)}

training_args = TrainingArguments(
    # Where checkpoints are written, and which logging integrations are used
    output_dir="./results_roberta",
    report_to="none",                # Disable wandb to prevent pop-ups
    logging_steps=100,               # Log every 100 steps
    # Optimisation schedule
    num_train_epochs=4,              # Train for 4 epochs to ensure convergence
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,  # Reduce to 8 if GPU runs out of memory (OOM)
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    # NOTE(review): no metric_for_best_model is passed, so "best" is whatever the
    # Trainer default selects - confirm that this is the intended criterion.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# 9. Start Training
print("\n🚀 Training Started... Please wait approx. 10-15 minutes")
trainer.train()

# 10. Final Evaluation and Saving
# evaluate() runs on the validation split that was passed as eval_dataset.
print("\n📊 Final Validation Evaluation:")
eval_results = trainer.evaluate()
print(f"🏆 Final Accuracy: {eval_results['eval_accuracy']:.2%}")

# Save the model
# Model and tokenizer go into the same folder so Step 3 can load both from one path.
print(f"Saving model to: {MODEL_SAVE_PATH}")
for artefact in (model, tokenizer):
    artefact.save_pretrained(MODEL_SAVE_PATH)
print("✅ All steps complete! You can now use this model for Step 3 (Inference).")

Loading data and merging labels...
✅ Merging Complete!
Training set size: 8000
------------------------------
New Class Distribution (Training Set):
Hype: 3771
Fear: 1994
Anger: 1346
Noise: 889
------------------------------
Loading roberta-base Tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Initializing Model...


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.



🚀 Training Started... Please wait approx. 10-15 minutes


Epoch,Training Loss,Validation Loss,Accuracy
1,0.984788,0.983302,0.611
2,0.859414,0.911196,0.641
3,0.770912,0.948293,0.639
4,0.666029,0.977047,0.647


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye


📊 Final Validation Evaluation:


🏆 Final Accuracy: 64.20%
Saving model to: /content/drive/My Drive/GME_Sentiment_Project/fine_tuned_roberta_4class


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ All steps complete! You can now use this model for Step 3 (Inference).


In [None]:
# ==========================================
# GME Sentiment Analysis - Step 3: Inference
# Objective: Apply the fine-tuned RoBERTa model to classify ~500k Reddit comments
# ==========================================

import pandas as pd
import torch
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from google.colab import drive

# 1. Mount Google Drive
# Mounting is attempted only when the mount point is not present yet.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# 2. Path Configuration
BASE_PATH = "/content/drive/My Drive/GME_Sentiment_Project/"

# In order:
#   INPUT_DATA_PATH - filtered Reddit data (generated in Step 1)
#   MODEL_PATH      - the fine-tuned model (saved in Step 2)
#   OUTPUT_PATH     - the final labeled dataset for visualization
INPUT_DATA_PATH, MODEL_PATH, OUTPUT_PATH = (
    os.path.join(BASE_PATH, name)
    for name in (
        "filtered_reddit_data.csv",
        "fine_tuned_roberta_4class",
        "gme_sentiment_labeled.csv",
    )
)

# 3. Load Model and Tokenizer
print(f"Loading model from: {MODEL_PATH} ...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    # Use the GPU when the runtime has one; a hard-coded "cuda" raises on a
    # CPU-only runtime before any comment has been classified.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print("✅ Model loaded successfully!")
    if device == "cpu":
        print("⚠️ No GPU detected - the model stays on the CPU, inference will be slow.")
except Exception as e:
    print(f"❌ Error loading model. Please check the path. Details: {e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

# 4. Load Data
print(f"Loading Reddit data from: {INPUT_DATA_PATH} ...")
try:
    df = pd.read_csv(INPUT_DATA_PATH)
    print(f"Total comments to process: {len(df)}")
except FileNotFoundError:
    print("❌ Input file not found. Please ensure Step 1 was completed.")
    raise

# Ensure the text column exists (handling 'body' or 'text')
# 'body' wins when both columns are present; the chosen column is stringified
# so the tokenizer always receives a list of str.
text_column = next((name for name in ('body', 'text') if name in df.columns), None)
if text_column is None:
    raise ValueError("Error: Could not find 'body' or 'text' column in the CSV.")
texts = df[text_column].astype(str).tolist()

# 5. Define Batch Prediction Function
# We process in batches to avoid running out of GPU memory (OOM)
def predict_batch(texts, batch_size=64, log_every=10000):
    """Classify `texts` with the module-level `model` and `tokenizer`, batch by batch.

    Args:
        texts: List of strings to classify.
        batch_size: Number of texts sent through the model at once.
        log_every: Print a progress line each time roughly this many texts have
            been processed. Pass 0 or None to disable progress output.

    Returns:
        (all_preds, all_probs): two lists aligned with `texts`. all_preds holds
        the predicted class id of each text, all_probs the softmax probability
        vector of each text.
    """
    all_preds = []
    all_probs = []

    print(f"Starting inference on {len(texts)} comments...")

    # Send the inputs to whichever device the model already lives on instead of
    # assuming "cuda", so the function also works when the model is on the CPU.
    device = model.device
    next_log_at = log_every

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize
        inputs = tokenizer(
            batch, padding=True, truncation=True, max_length=128, return_tensors="pt"
        ).to(device)

        # Inference
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=-1)  # Convert logits to probabilities

        # Move results to CPU to save memory
        preds = torch.argmax(probs, dim=-1).cpu().numpy()
        probs = probs.cpu().numpy()

        all_preds.extend(preds)
        all_probs.extend(probs)

        # Progress logging
        # Count processed texts and compare against a running threshold. Testing
        # `start % 10000 == 0` only fires when the batch offset is an exact
        # multiple of 10000, which with the default batch size of 64 happens
        # every 40000 texts.
        processed = start + len(batch)
        if log_every and processed >= next_log_at:
            print(f"Processed {processed} comments...")
            while next_log_at <= processed:
                next_log_at += log_every

    return all_preds, all_probs

# 6. Execute Prediction
predictions, probabilities = predict_batch(texts)

# 7. Integrate Results
# The id -> label lookup is read back from the loaded model's config
# (e.g., 0 -> Hype).
id2label = model.config.id2label
print(f"Label Mapping: {id2label}")

# Add columns to the DataFrame
# sentiment_label: name of the predicted class for each row
df['sentiment_label'] = [id2label[class_id] for class_id in predictions]
# sentiment_score: the largest class probability of each row (confidence score)
df['sentiment_score'] = [class_probs.max() for class_probs in probabilities]

# 8. Save Final Output
print(f"Saving labeled data to: {OUTPUT_PATH}")
df.to_csv(OUTPUT_PATH, index=False)
print("🎉 Step 3 Complete! The file 'gme_sentiment_labeled.csv' is ready for visualization.")

Loading model from: /content/drive/My Drive/GME_Sentiment_Project/fine_tuned_roberta_4class ...


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

✅ Model loaded successfully!
Loading Reddit data from: /content/drive/My Drive/GME_Sentiment_Project/filtered_reddit_data.csv ...
Total comments to process: 6664
Starting inference on 6664 comments...
Label Mapping: {0: 'Hype', 1: 'Fear', 2: 'Anger', 3: 'Noise'}
Saving labeled data to: /content/drive/My Drive/GME_Sentiment_Project/gme_sentiment_labeled.csv
🎉 Step 3 Complete! The file 'gme_sentiment_labeled.csv' is ready for visualization.


In [None]:
# ==========================================
# GME Sentiment Analysis - Step 4: Data Aggregation
# Objective: Create daily time-series of Sentiment vs. Stock Price
# ==========================================

import os

import pandas as pd
import yfinance as yf

# 1. Paths
# BASE_PATH is defined here as well (same value as in Steps 2 and 3), and os and
# pandas are imported above, so this cell does not depend on an earlier cell
# having been run in the same kernel session.
BASE_PATH = "/content/drive/My Drive/GME_Sentiment_Project/"
INPUT_LABELED_FILE = os.path.join(BASE_PATH, "gme_sentiment_labeled.csv")
FINAL_VIS_FILE = os.path.join(BASE_PATH, "final_dataset_for_vis.csv")

# 2. Load Labeled Data
print("Loading labeled data...")
df = pd.read_csv(INPUT_LABELED_FILE)

# Handle Date Parsing
# The calendar day of each comment is derived from the first time column found:
# 'created_utc' (read as epoch seconds), then 'timestamp', then 'date'.
if 'created_utc' in df.columns:
    df['date'] = pd.to_datetime(df['created_utc'], unit='s').dt.date
elif 'timestamp' in df.columns:
    df['date'] = pd.to_datetime(df['timestamp']).dt.date
elif 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date']).dt.date
else:
    # Still a KeyError, as before, but one that names what is missing.
    raise KeyError(
        "Could not find a 'created_utc', 'timestamp' or 'date' column in "
        f"{INPUT_LABELED_FILE}"
    )

# 3. Aggregate Daily Sentiment
print("Aggregating daily sentiment...")
# Count comments per (day, label), then pivot the labels into columns;
# a label that does not occur on a given day gets a count of 0.
label_counts = df.groupby(['date', 'sentiment_label']).size()
daily_sentiment = label_counts.unstack(fill_value=0)
daily_sentiment.columns = ["count_" + str(col) for col in daily_sentiment.columns]
daily_sentiment = daily_sentiment.reset_index()

# Calculate Ratios
# total_comments is the sum over all count_* columns of a day.
count_columns = daily_sentiment.filter(like='count_')
daily_sentiment['total_comments'] = count_columns.sum(axis=1)
# Fear Index = Fear / Total (a numerator of 0 is used if no comment was labeled Fear)
fear_counts = daily_sentiment.get('count_Fear', 0)
daily_sentiment['fear_ratio'] = fear_counts / daily_sentiment['total_comments']

# 4. Download GME Stock Data
print("Downloading GME stock data...")
start_date = df['date'].min()
end_date = df['date'].max()

# yfinance treats `end` as exclusive, hence the extra day. auto_adjust is passed
# explicitly so the prices do not depend on the installed yfinance version's default.
gme = yf.download(
    "GME",
    start=start_date,
    end=end_date + pd.Timedelta(days=1),
    auto_adjust=True,
)

# Fix MultiIndex columns if present (yfinance update)
if isinstance(gme.columns, pd.MultiIndex):
    gme.columns = gme.columns.get_level_values(0)

gme = gme.reset_index()
gme['Date'] = gme['Date'].dt.date

# Calculate Volatility (Intraday)
# Same definition as Volatility_Intraday in Step 1: the day's high-low range
# relative to the open. Dividing by the low instead gives a different figure
# for the same day, which made the two market tables disagree.
# The frame is built as one chain of new objects so no column is assigned on a slice.
gme = (
    gme.assign(volatility=(gme['High'] - gme['Low']) / gme['Open'])
    [['Date', 'Close', 'Volume', 'High', 'Low', 'volatility']]
    .rename(columns={
        'Date': 'date',
        'Close': 'close',
        'Volume': 'volume',
        'High': 'high',
        'Low': 'low',
    })
)

# 5. Merge and Save
print("Merging data...")
# Inner join on the calendar day: only days present in both the sentiment table
# and the price table are kept, ordered chronologically.
final_df = daily_sentiment.merge(gme, on='date', how='inner')
final_df = final_df.sort_values('date')

final_df.to_csv(FINAL_VIS_FILE, index=False)
print(f"✅ Final dataset saved to: {FINAL_VIS_FILE}")
print(final_df.head())

Loading labeled data...
Aggregating daily sentiment...
Downloading GME stock data...


  gme = yf.download("GME", start=start_date, end=end_date + pd.Timedelta(days=1))
  dt_now = pd.Timestamp.utcnow()
[*********************100%***********************]  1 of 1 completed

Merging data...
✅ Final dataset saved to: /content/drive/My Drive/GME_Sentiment_Project/final_dataset_for_vis.csv
         date  count_Anger  count_Fear  count_Hype  count_Noise  \
0  2021-01-28           10         141         106            0   
1  2021-01-29          136        1211        1179            1   
2  2021-02-01            7         148          62            0   
3  2021-02-02            6         160          61            0   
4  2021-02-03            9         224         129            0   

   total_comments  fear_ratio      close     volume        high      low  \
0             257    0.548638  48.400002  235263200  120.750000  28.0625   
1            2527    0.479224  81.250000  202264400  103.495003  62.5000   
2             217    0.682028  56.250000  149528800   80.500000  53.0000   
3             227    0.704846  22.500000  312732400   39.500000  18.5550   
4             362    0.618785  23.102501  170794000   28.350000  21.3125   

   volatility  
0    3.302


