In [3]:
!pip install emoji
!pip install vaderSentiment
from google.colab import drive
drive.mount('/content/drive')

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd

import yfinance as yf

import re
import emoji
from tqdm import tqdm

import torch
import nltk
from nltk.corpus import stopwords

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [7]:
##### Ingest Data #####

# Set data directory to read files
DATA_DIR = "/content/drive/MyDrive/UC Berkeley/Capstone Project/"

# Set save directory to save files
SAVE_DIR = "/content/drive/MyDrive/UC Berkeley/Capstone Project/"

# Reddit data comes from this Kaggle dataset: https://www.kaggle.com/datasets/gpreda/reddit-wallstreetsbets-posts?resource=download
reddit_data = pd.read_csv(DATA_DIR + "reddit_wsb.csv")

# Pull GME data for the same period as the Reddit data.
# auto_adjust is passed explicitly so the result does not depend on the installed yfinance
# default (the previous run emitted a warning on this call); True matches the columns seen
# in the saved output (no separate "Adj Close").
gme_data = yf.download("GME", start="2021-01-04", end="2021-04-01", auto_adjust=True)

# yfinance reports a failed download by returning an empty frame rather than raising;
# stop here instead of silently producing an empty merge later on.
if gme_data.empty:
    raise RuntimeError("yfinance returned no GME price data for 2021-01-04..2021-04-01")

# Keep only the first column level (the price field names) and expose Date as a regular column.
gme_data.columns = gme_data.columns.get_level_values(0)
gme_data = gme_data.reset_index()

  gme_data = yf.download("GME", start="2021-01-04", end="2021-04-01")
[*********************100%***********************]  1 of 1 completed


In [None]:
# Peek at the first rows of the Reddit posts to confirm the CSV loaded with the expected columns.
reddit_data.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [None]:
# Peek at the first rows of the GME price data after the column flattening and index reset.
gme_data.head()

Price,Date,Close,High,Low,Open,Volume
0,2021-01-04,4.3125,4.775,4.2875,4.75,40090000
1,2021-01-05,4.3425,4.52,4.3075,4.3375,19846000
2,2021-01-06,4.59,4.745,4.3325,4.335,24224800
3,2021-01-07,4.52,4.8625,4.505,4.6175,24517200
4,2021-01-08,4.4225,4.575,4.27,4.545,25928000


In [None]:
##### Data Cleaning and Feature Engineering #####

# Start from the daily prices; only Open/Close are needed to derive the target.
df = gme_data[["Date", "Open", "Close"]].copy()

# Calculate features matching the paper's methodology
df["Net_Movement"] = df["Close"] - df["Open"]
# Vectorised labelling: strictly positive movement is "up", everything else
# (including a flat day) is "down", exactly as the row-wise version did.
df["Direction"] = np.where(df["Net_Movement"] > 0, "up", "down")

# Reduce both sides to plain calendar dates (datetime.date objects) so the join keys compare equal
df["Date"] = pd.to_datetime(df["Date"]).dt.date
reddit_data["Date"] = pd.to_datetime(reddit_data["timestamp"]).dt.date

# Join reddit posts to their corresponding trading day's price data
# Inner join naturally removes posts on non-trading days
df = reddit_data.merge(df, on="Date", how="inner")

# Concatenating the title and body text as done in the reference paper.
# Both parts are NaN-filled first: string concatenation with NaN yields NaN, so a
# missing title would otherwise throw away the post's body as well.
df["title"] = df["title"].fillna("")
df["body"] = df["body"].fillna("")
df["text"] = df["title"] + " " + df["body"]


##### Text Preprocessing Pipeline #####

# Steps follow the methods outlined in the reference paper

def replace_emojis(text):
    """Replace every emoji in ``text`` with its textual name wrapped in pipe characters."""
    # Pipes on both sides make the tags easy to find again with a simple regex later on.
    pipe_delimiters = ("|", "|")
    return emoji.demojize(text, delimiters=pipe_delimiters)

def remove_urls(text):
    """Delete every run of non-space characters that starts with ``http`` or ``www.``."""
    url_pattern = re.compile(r'http\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_mentions(text):
    """Remove Reddit user mentions (``u/name`` or ``/u/name``) from ``text``.

    The mention must not be glued to a preceding word character, so ordinary
    words that merely contain ``u/`` (e.g. "you/me") are left untouched. The
    optional leading slash is consumed too, so "/u/name" does not leave a
    stray "/" behind. As before, the match runs up to the next whitespace.
    """
    return re.sub(r'(?<!\w)/?u/\S+', '', text)

def remove_punctuation(text):
    """Strip punctuation from ``text`` while leaving ``|emoji_name|`` tags intact.

    Outside of emoji tags, every character that is not a word character,
    whitespace or a period is removed. A period is removed only when it has no
    digit directly before it and no digit directly after it, so decimals such
    as "1.5" survive (and so does a period that touches a digit on one side).

    The text is split around the tags and only the pieces between them are
    cleaned. This replaces the earlier numbered-placeholder scheme, where
    placeholder 1 was a prefix of placeholders 10 and up and the restore step
    therefore corrupted any text containing more than ten tags; the
    placeholder's trailing digit also protected a period written right after
    an emoji.
    """
    # The capturing group makes re.split keep the tags: odd indexes hold tags,
    # even indexes hold the ordinary text between them.
    pieces = re.split(r'(\|[^|]+\|)', text)
    for idx in range(0, len(pieces), 2):
        pieces[idx] = re.sub(r'(?<!\d)\.(?!\d)|[^\w\s.]', '', pieces[idx])
    return ''.join(pieces)

def normalize_whitespace(text):
    """Collapse each run of whitespace to one space and drop leading/trailing whitespace."""
    # split() with no argument cuts on whitespace runs and ignores the edges.
    return " ".join(text.split())

def preprocess(text):
    """Run the full cleaning pipeline on one post.

    Non-string input (for example NaN) yields an empty string. Otherwise the
    stages run in this fixed order: emojis to tags, URL removal, mention
    removal, punctuation removal, lower-casing, whitespace normalisation.
    """
    if not isinstance(text, str):
        return ""
    stages = (
        replace_emojis,
        remove_urls,
        remove_mentions,
        remove_punctuation,
        str.lower,
        normalize_whitespace,
    )
    cleaned = text
    for stage in stages:
        cleaned = stage(cleaned)
    return cleaned

df["text_clean"] = df["text"].apply(preprocess)


##### Feature Extraction From The Paper #####


nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Tokenise each cleaned post once (whitespace split) and derive the counts from the token lists.
# NOTE(review): text_clean has had its apostrophes stripped by the punctuation step, so
# contractions appear as e.g. "dont"; stop-word entries spelled with an apostrophe can
# therefore never match. Confirm whether the reference paper counts them.
token_lists = df["text_clean"].str.split()

df["word_count"] = token_lists.map(len)
df["stopword_count"] = token_lists.map(lambda words: sum(1 for w in words if w in stop_words))
df["avg_word_length"] = token_lists.map(lambda words: np.mean([len(w) for w in words]) if words else 0)
# Each |name| tag left by the emoji replacement step counts as one emoji.
df["emoji_count"] = df["text_clean"].str.count(r'\|[^|]+\|')


# Dropping irrelevant columns
df = df.drop(columns=["title", "body", "url", "id", "created", "timestamp", "Open", "Close"])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Column / dtype overview of the post-level frame.
# NOTE(review): the saved output for this cell lists vader_* and finbert_* columns, which are
# only created by the sentiment-run cell further down — so it was last executed out of order.
# Re-run the notebook top to bottom before relying on this output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35735 entries, 0 to 35734
Data columns (total 54 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   score               35735 non-null  int64  
 1   comms_num           35735 non-null  int64  
 2   Date                35735 non-null  object 
 3   Net_Movement        35735 non-null  float64
 4   Direction           35735 non-null  object 
 5   text                35735 non-null  object 
 6   text_clean          35735 non-null  object 
 7   word_count          35735 non-null  int64  
 8   stopword_count      35735 non-null  int64  
 9   avg_word_length     35735 non-null  float64
 10  emoji_count         35735 non-null  int64  
 11  vader_pos           35735 non-null  float64
 12  vader_neu           35735 non-null  float64
 13  vader_neg           35735 non-null  float64
 14  vader_compound      35735 non-null  float64
 15  finbert_pos         35735 non-null  float64
 16  finb

In [None]:
# Spot-check ten random posts; the fixed seed makes the same rows appear on every run.
df.sample(10, random_state=42)

Unnamed: 0,score,comms_num,Date,Net_Movement,Direction,text,text_clean,word_count,stopword_count,avg_word_length,emoji_count,vader_pos,vader_neu,vader_neg,vader_compound
16281,582,56,2021-01-29,-13.677498,down,"It ain't much, but it is honest work. With lov...",it aint much but it is honest work with love f...,12,6,3.75,0,0.506,0.494,0.0,0.9052
10019,1,1,2021-01-29,-13.677498,down,I DECLARE THE WSB ANTHEM!! BUY THE DIP AND HOL...,i declare the wsb anthem buy the dip and hold,10,4,3.6,0,0.0,1.0,0.0,0.0
33138,20,0,2021-03-11,4.59,up,Keep holding apes!,keep holding apes,3,0,5.0,0,0.0,1.0,0.0,0.0
10709,1,3,2021-01-29,-13.677498,down,Action speaks louder than words (Robinhood) Or...,action speaks louder than words robinhood or i...,44,19,4.772727,0,0.0,0.847,0.153,-0.8227
23924,48,2,2021-02-05,2.4325,up,I heard yall like loss.,i heard yall like loss,5,1,3.6,0,0.321,0.385,0.295,0.0516
22765,162,30,2021-02-04,-9.422501,down,It’s already in the will to never sell- my gre...,its already in the will to never sell my great...,15,8,4.4,0,0.301,0.548,0.15,0.5116
3134,1,1,2021-01-29,-13.677498,down,American Airlines??,american airlines,2,0,8.0,0,0.0,1.0,0.0,0.0
24045,22,18,2021-02-05,2.4325,up,GME: Might be worth a punt at $60? Right now $...,gme might be worth a punt at 60 right now 60 g...,80,34,3.65,0,0.101,0.798,0.101,0.0
12982,1,0,2021-01-29,-13.677498,down,MEME STOCKS ARE RECOVERING SLOWLY My total los...,meme stocks are recovering slowly my total los...,31,12,8.032258,10,0.111,0.837,0.052,0.4684
23888,8,30,2021-02-05,2.4325,up,"Sold Sold at a loss for 12,000 American, since...",sold sold at a loss for 12000 american since i...,40,21,4.0,0,0.111,0.704,0.185,-0.5719


In [None]:
##### Sentiment Analysis Models #####

# ============================================================
# 1. VADER (Rule-based baseline — paper's original method)
# ============================================================
vader = SentimentIntensityAnalyzer()

def get_vader_scores(text):
    """Score one text with VADER.

    Returns:
        Tuple ``(pos, neu, neg, compound)`` taken from ``vader.polarity_scores``.
        If scoring raises, the neutral fallback ``(0.0, 1.0, 0.0, 0.0)`` is
        returned instead.
    """
    try:
        scores = vader.polarity_scores(text)
        return scores["pos"], scores["neu"], scores["neg"], scores["compound"]
    except Exception:
        # Deliberate best-effort fallback. Catching Exception rather than using a
        # bare except keeps KeyboardInterrupt/SystemExit working, so a long
        # scoring loop can still be interrupted.
        return 0.0, 1.0, 0.0, 0.0

# ============================================================
# 2. FinBERT (Financial domain sentiment)
# ============================================================
# top_k=3 asks for a score per label instead of only the best one (the code
# below reads "positive", "neutral" and "negative"); truncation/max_length
# cap the tokenised input at 512 tokens.
finbert_pipe = pipeline(
    task="sentiment-analysis",
    model="ProsusAI/finbert",
    tokenizer="ProsusAI/finbert",
    max_length=512,
    truncation=True,
    top_k=3,
)

def get_finbert_scores(text):
    """Score one text with the FinBERT pipeline.

    Only the first 512 characters of ``text`` are sent to the model.

    Returns:
        Tuple ``(positive, neutral, negative)``; a label missing from the
        pipeline output counts as 0. If scoring raises, the neutral fallback
        ``(0.0, 1.0, 0.0)`` is returned.
    """
    try:
        results = finbert_pipe(text[:512])[0]
        scores = {r["label"]: r["score"] for r in results}
        return scores.get("positive", 0), scores.get("neutral", 0), scores.get("negative", 0)
    except Exception:
        # Best-effort fallback; Exception (not a bare except) so Ctrl-C still interrupts.
        return 0.0, 1.0, 0.0

# ============================================================
# 3. Twitter-RoBERTa (Social media sentiment)
# ============================================================
# top_k=3 asks for a score per label instead of only the best one (the code
# below reads "positive", "neutral" and "negative"); truncation/max_length
# cap the tokenised input at 512 tokens.
roberta_pipe = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest",
    max_length=512,
    truncation=True,
    top_k=3,
)

def get_roberta_scores(text):
    """Score one text with the Twitter-RoBERTa sentiment pipeline.

    Only the first 512 characters of ``text`` are sent to the model.

    Returns:
        Tuple ``(positive, neutral, negative)``; a label missing from the
        pipeline output counts as 0. If scoring raises, the neutral fallback
        ``(0.0, 1.0, 0.0)`` is returned.
    """
    try:
        results = roberta_pipe(text[:512])[0]
        scores = {r["label"]: r["score"] for r in results}
        return scores.get("positive", 0), scores.get("neutral", 0), scores.get("negative", 0)
    except Exception:
        # Best-effort fallback; Exception (not a bare except) so Ctrl-C still interrupts.
        return 0.0, 1.0, 0.0

# ============================================================
# 4. Topic-Sentiment RoBERTa (Entity-targeted sentiment)
# ============================================================
# top_k=None returns a score for every label the model defines;
# truncation/max_length cap the tokenised input at 512 tokens.
topic_pipe = pipeline(
    task="sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-topic-sentiment-latest",
    tokenizer="cardiffnlp/twitter-roberta-base-topic-sentiment-latest",
    max_length=512,
    truncation=True,
    top_k=None,
)

def get_topic_sentiment_scores(text, target="GME"):
    """Score one text's sentiment towards ``target`` with the topic-sentiment pipeline.

    The model input is the first 450 characters of ``text``, followed by
    `` </s> `` and the target.

    Returns:
        Tuple ``(strongly positive, positive, negative or neutral, negative,
        strongly negative)``; a label missing from the pipeline output counts
        as 0. If scoring raises, the fallback ``(0.0, 0.0, 1.0, 0.0, 0.0)``
        (all mass on "negative or neutral") is returned.
    """
    try:
        text_input = f"{text[:450]} </s> {target}"
        results = topic_pipe(text_input)[0]
        scores = {r["label"]: r["score"] for r in results}
        return (
            scores.get("strongly positive", 0),
            scores.get("positive", 0),
            scores.get("negative or neutral", 0),
            scores.get("negative", 0),
            scores.get("strongly negative", 0)
        )
    except Exception:
        # Best-effort fallback; Exception (not a bare except) so Ctrl-C still interrupts.
        return 0.0, 0.0, 1.0, 0.0, 0.0

# ============================================================
# 5. GoEmotions (Reddit-trained, 28 emotion labels)
# ============================================================
# top_k=None returns a score for every emotion label;
# truncation/max_length cap the tokenised input at 512 tokens.
goemotions_pipe = pipeline(
    task="text-classification",
    model="SamLowe/roberta-base-go_emotions",
    tokenizer="SamLowe/roberta-base-go_emotions",
    max_length=512,
    truncation=True,
    top_k=None,
)

# Emotion labels read from the GoEmotions pipeline output. The order matters:
# it fixes the order of the scores returned per text and of the emo_* columns.
EMOTIONS = (
    "admiration amusement anger annoyance approval caring "
    "confusion curiosity desire disappointment disapproval "
    "disgust embarrassment excitement fear gratitude grief "
    "joy love nervousness optimism pride realization "
    "relief remorse sadness surprise neutral"
).split()

def get_goemotions_scores(text):
    """Score one text with the GoEmotions pipeline.

    Only the first 512 characters of ``text`` are sent to the model.

    Returns:
        List with one score per entry of ``EMOTIONS``, in that order; an
        emotion missing from the pipeline output counts as 0.0. If scoring
        raises, a list of zeros of the same length is returned.
    """
    try:
        results = goemotions_pipe(text[:512])[0]
        scores = {r["label"]: r["score"] for r in results}
        return [scores.get(e, 0.0) for e in EMOTIONS]
    except Exception:
        # Best-effort fallback; Exception (not a bare except) so Ctrl-C still interrupts.
        # Sized from EMOTIONS so the fallback cannot drift from the label list.
        return [0.0] * len(EMOTIONS)

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: ProsusAI/finbert
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.pooler.dense.weight     | UNEXPECTED |  | 
roberta.pooler.dense.bias       | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: cardiffnlp/twitter-roberta-base-topic-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: SamLowe/roberta-base-go_emotions
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
##### Sentiment Analysis Run #####

# ============================================================
# Batched inference helper
# ============================================================
def run_batched(pipe, texts, batch_size=64, max_chars=512):
    """Run a Hugging Face pipeline over ``texts`` in chunks, with a progress bar.

    Args:
        pipe: Pipeline callable; it is called as ``pipe(list_of_str, batch_size=...)``.
        texts: List of strings to score.
        batch_size: Number of texts per chunk and per forward pass.
        max_chars: Each text is cut to this many characters before being sent
            to the pipeline (default 512, the previously hard-coded value).

    Returns:
        Flat list with one pipeline result per input text, in input order.
    """
    results = []
    for start in tqdm(range(0, len(texts), batch_size)):
        chunk = [t[:max_chars] for t in texts[start:start + batch_size]]
        # Forward batch_size: without it the pipeline processes a list one item
        # at a time, which is what triggered the "using the pipelines
        # sequentially on GPU" warning in the saved run.
        results.extend(pipe(chunk, batch_size=batch_size))
    return results

# ============================================================
# Run all models
# ============================================================
texts = df["text_clean"].tolist()


def _label_score_maps(results):
    """Turn each pipeline result (a list of {"label", "score"} dicts) into one label -> score dict."""
    return [{r["label"]: r["score"] for r in res} for res in results]


def _add_score_columns(frame, results, column_to_label):
    """Add one column per (column name, model label) pair; a label missing from a result scores 0."""
    # Build the per-post lookup once instead of once per output column.
    score_maps = _label_score_maps(results)
    for column, label in column_to_label.items():
        frame[column] = [scores.get(label, 0) for scores in score_maps]


# --- VADER (no GPU needed, already fast) ---
print("Running VADER...")
vader_results = [get_vader_scores(t) for t in tqdm(texts)]
# get_vader_scores returns (pos, neu, neg, compound) in this order.
for position, column in enumerate(("vader_pos", "vader_neu", "vader_neg", "vader_compound")):
    df[column] = [scores[position] for scores in vader_results]
print("  VADER complete. Saving checkpoint...")
df.to_csv(SAVE_DIR + "checkpoint_after_vader.csv", index=False)

# --- FinBERT ---
print("Running FinBERT...")
finbert_results = run_batched(finbert_pipe, texts)
_add_score_columns(df, finbert_results, {
    "finbert_pos": "positive",
    "finbert_neu": "neutral",
    "finbert_neg": "negative",
})
print("  FinBERT complete. Saving checkpoint...")
df.to_csv(SAVE_DIR + "checkpoint_after_finbert.csv", index=False)

# --- Twitter-RoBERTa ---
print("Running Twitter-RoBERTa...")
roberta_results = run_batched(roberta_pipe, texts)
_add_score_columns(df, roberta_results, {
    "roberta_pos": "positive",
    "roberta_neu": "neutral",
    "roberta_neg": "negative",
})
print("  Twitter-RoBERTa complete. Saving checkpoint...")
df.to_csv(SAVE_DIR + "checkpoint_after_roberta.csv", index=False)

# --- Topic-Sentiment RoBERTa ---
print("Running Topic-Sentiment RoBERTa...")
# The target entity is appended after a separator; the text is capped at 450
# characters first so the suffix survives the 512-character cut in run_batched.
topic_texts = [f"{t[:450]} </s> GME" for t in texts]
topic_results = run_batched(topic_pipe, topic_texts)
_add_score_columns(df, topic_results, {
    "topic_strong_pos": "strongly positive",
    "topic_pos": "positive",
    "topic_neu": "negative or neutral",
    "topic_neg": "negative",
    "topic_strong_neg": "strongly negative",
})
print("  Topic-Sentiment complete. Saving checkpoint...")
df.to_csv(SAVE_DIR + "checkpoint_after_topic.csv", index=False)

# --- GoEmotions ---
print("Running GoEmotions...")
goemotions_results = run_batched(goemotions_pipe, texts)
_add_score_columns(df, goemotions_results, {f"emo_{emotion}": emotion for emotion in EMOTIONS})
print("  GoEmotions complete. Saving final output...")
df.to_csv(SAVE_DIR + "df_with_all_sentiments.csv", index=False)

print("\n" + "=" * 60)
print("ALL MODELS COMPLETE")
print(f"Final dataframe shape: {df.shape}")
print("=" * 60)

GPU available: True
Device: Tesla T4
Running VADER...


100%|██████████| 35735/35735 [00:55<00:00, 640.51it/s]


  VADER complete. Saving checkpoint...
Running FinBERT...


  0%|          | 0/559 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 559/559 [05:52<00:00,  1.59it/s]


  FinBERT complete. Saving checkpoint...
Running Twitter-RoBERTa...


100%|██████████| 559/559 [05:48<00:00,  1.60it/s]


  Twitter-RoBERTa complete. Saving checkpoint...
Running Topic-Sentiment RoBERTa...


100%|██████████| 559/559 [05:47<00:00,  1.61it/s]


  Topic-Sentiment complete. Saving checkpoint...
Running GoEmotions...


100%|██████████| 559/559 [05:50<00:00,  1.60it/s]


  GoEmotions complete. Saving final output...

ALL MODELS COMPLETE
Final dataframe shape: (35735, 54)


In [None]:
# Persist the post-level frame as Parquet as well, next to the CSV of the same name
# written at the end of the sentiment run.
df.to_parquet(SAVE_DIR + "df_with_all_sentiments.parquet", index=False)

In [None]:
# ============================================================
# Define feature groups for aggregation
# ============================================================

# Post metadata and simple text statistics.
metadata_features = ["score", "comms_num"]
text_features = ["word_count", "stopword_count", "avg_word_length", "emoji_count"]

# One group per sentiment model.
vader_features = [f"vader_{part}" for part in ("pos", "neu", "neg", "compound")]
finbert_features = [f"finbert_{part}" for part in ("pos", "neu", "neg")]
roberta_features = [f"roberta_{part}" for part in ("pos", "neu", "neg")]
topic_features = [f"topic_{part}" for part in ("strong_pos", "pos", "neu", "neg", "strong_neg")]
goemotions_features = [
    f"emo_{emotion}"
    for emotion in (
        "admiration", "amusement", "anger", "annoyance", "approval",
        "caring", "confusion", "curiosity", "desire", "disappointment",
        "disapproval", "disgust", "embarrassment", "excitement", "fear",
        "gratitude", "grief", "joy", "love", "nervousness",
        "optimism", "pride", "realization", "relief", "remorse",
        "sadness", "surprise", "neutral",
    )
]

# Model scores only: these are the columns that get the upvote-weighted aggregates.
weighted_features = [
    *vader_features,
    *finbert_features,
    *roberta_features,
    *topic_features,
    *goemotions_features,
]

# Everything numeric: metadata, text statistics, then all model scores.
all_numeric_features = [*metadata_features, *text_features, *weighted_features]


# ============================================================
# Dataset 2: Daily Aggregated (Unweighted)
# ============================================================
print("Building Dataset 2: Daily Aggregated (Unweighted)...")

# One grouping by trading day, reused for every aggregate below.
posts_by_date = df.groupby("Date")

# Per-day mean and standard deviation of every numeric feature.
mean_agg = posts_by_date[all_numeric_features].mean().add_suffix("_mean")
std_agg = posts_by_date[all_numeric_features].std().add_suffix("_std")

# Number of posts per day.
post_counts = posts_by_date.size().rename("post_count")

# The price targets were merged onto every post of a day, so the first row of each day is taken.
targets = posts_by_date.agg(
    Direction=("Direction", "first"),
    Net_Movement=("Net_Movement", "first")
)

df_daily = (
    pd.concat([targets, post_counts, mean_agg, std_agg], axis=1)
    .reset_index()
    .sort_values("Date")
    .reset_index(drop=True)
)

print(f"  Shape: {df_daily.shape}")
print(f"  Date range: {df_daily['Date'].min()} to {df_daily['Date'].max()}")
print(f"  Direction distribution:\n{df_daily['Direction'].value_counts().to_string()}")


# ============================================================
# Dataset 3: Daily Aggregated (Weighted by upvote score)
# ============================================================
print("\nBuilding Dataset 3: Daily Aggregated (Weighted)...")

def weighted_mean(group, features, weight_col="score"):
    """Weighted mean of each feature within one group of posts.

    Args:
        group: DataFrame holding ``weight_col`` and every column in ``features``.
        features: Column names to average.
        weight_col: Column used as weights; values below 1 are raised to 1
            before use.

    Returns:
        Series indexed by ``"<feature>_wmean"``.
    """
    post_weights = group[weight_col].clip(lower=1)
    return pd.Series({
        f"{name}_wmean": np.average(group[name], weights=post_weights)
        for name in features
    })

def weighted_std(group, features, weight_col="score"):
    """Weighted standard deviation of each feature within one group of posts.

    For every feature this is the square root of the weighted average of
    squared deviations from the weighted mean (no degrees-of-freedom
    correction is applied).

    Args:
        group: DataFrame holding ``weight_col`` and every column in ``features``.
        features: Column names to summarise.
        weight_col: Column used as weights; values below 1 are raised to 1
            before use.

    Returns:
        Series indexed by ``"<feature>_wstd"``.
    """
    post_weights = group[weight_col].clip(lower=1)

    def _spread(values):
        center = np.average(values, weights=post_weights)
        return np.sqrt(np.average((values - center) ** 2, weights=post_weights))

    return pd.Series({f"{name}_wstd": _spread(group[name]) for name in features})

# Hand the helpers only the weight column plus the features they average. Selecting the
# columns up front keeps the grouping column ("Date") out of the frames passed to apply,
# which avoids the pandas deprecation warning about apply operating on grouping columns
# and behaves the same across pandas versions.
weighted_inputs = df.groupby("Date")[["score"] + weighted_features]
weighted_mean_agg = weighted_inputs.apply(lambda g: weighted_mean(g, weighted_features))
weighted_std_agg = weighted_inputs.apply(lambda g: weighted_std(g, weighted_features))

# Metadata and text features use regular mean/std (weighting upvotes by upvotes is circular)
non_weighted_mean = df.groupby("Date")[metadata_features + text_features].mean()
non_weighted_mean.columns = [f"{c}_mean" for c in non_weighted_mean.columns]

non_weighted_std = df.groupby("Date")[metadata_features + text_features].std()
non_weighted_std.columns = [f"{c}_std" for c in non_weighted_std.columns]

# All pieces are indexed by Date, so the column-wise concat aligns them per trading day.
df_daily_weighted = pd.concat([
    targets, post_counts,
    non_weighted_mean, non_weighted_std,
    weighted_mean_agg, weighted_std_agg
], axis=1).reset_index()

df_daily_weighted = df_daily_weighted.sort_values("Date").reset_index(drop=True)

print(f"  Shape: {df_daily_weighted.shape}")
print(f"  Date range: {df_daily_weighted['Date'].min()} to {df_daily_weighted['Date'].max()}")
print(f"  Direction distribution:\n{df_daily_weighted['Direction'].value_counts().to_string()}")


# ============================================================
# Summary
# ============================================================
banner = "=" * 60
print("\n" + banner)
print("AGGREGATION COMPLETE")
print(banner)
# Shapes of the three datasets; the label strings carry the column alignment.
for label, frame in (
    ("  Dataset 1 (post-level):       ", df),
    ("  Dataset 2 (daily unweighted): ", df_daily),
    ("  Dataset 3 (daily weighted):   ", df_daily_weighted),
):
    print(f"{label}{frame.shape}")


# ============================================================
# Save
# ============================================================
# Same value as the SAVE_DIR set in the ingest cell; repeated here so this cell names its target.
SAVE_DIR = "/content/drive/MyDrive/UC Berkeley/Capstone Project/"

# Write both daily datasets as Parquet without the positional index.
for filename, frame in (
    ("dataset2_daily_unweighted.parquet", df_daily),
    ("dataset3_daily_weighted.parquet", df_daily_weighted),
):
    frame.to_parquet(SAVE_DIR + filename, index=False)

print("\nDatasets saved to Google Drive.")