In [1]:
import pandas as pd
import re
from itertools import chain
from tqdm.auto import tqdm
from transformers import pipeline
from datasets import Dataset  # HuggingFace dataset
import numpy as np
import matplotlib.pyplot as plt
import nasdaqdatalink
import os
from dotenv import load_dotenv
load_dotenv()
API_KEY= os.getenv('API_KEY')
nasdaqdatalink.ApiConfig.api_key = API_KEY

In [20]:
spy = pd.read_parquet('data/SPY_MINUTE.parquet')

In [21]:
spy.rename(columns={'c': 'price'}, inplace=True)
spy.drop(columns=['h', 'o', 'l', 'v', 'vw', 'n'], inplace=True)
spy.head()

Unnamed: 0_level_0,price
t,Unnamed: 1_level_1
2024-01-02 09:00:00+00:00,476.31
2024-01-02 09:01:00+00:00,476.29
2024-01-02 09:02:00+00:00,476.28
2024-01-02 09:03:00+00:00,476.27
2024-01-02 09:05:00+00:00,476.19


In [47]:
spy_returns = spy['price'].pct_change().rename('returns') # Rename the series
spy_returns = spy_returns.shift(-1)
spy_returns = spy_returns.dropna()

In [48]:
combined = pd.read_csv('data/finbert_sentiments.csv')
combined['date'] = pd.to_datetime(combined['date'])
combined.head()

Unnamed: 0,date,text,sentiment_label,sentiment_score
0,2025-05-17 10:27:00,Walmart should STOP trying to blame Tariffs as...,neutral,0.840971
1,2025-05-09 08:43:00,"Many Trade Deals in the hopper, all good (GREA...",neutral,0.716537
2,2025-05-02 12:08:00,Maria Bartiromo: “As President Trump would say...,positive,0.792937
3,2025-04-30 09:13:00,"This is Biden’s Stock Market, not Trump’s. I d...",neutral,0.757984
4,2025-04-17 08:02:00,Had a very productive call with the President ...,neutral,0.535753


In [49]:
# Prepare sentiment data# Create a numerical sentiment score: positive_score for positive, -negative_score for negative, 0 for neutral
combined['numeric_sentiment'] = np.select(
    [combined['sentiment_label'] == 'positive', combined['sentiment_label'] == 'negative'],
    [combined['sentiment_score'], -combined['sentiment_score']],
    default=0
)

# The 'date' column in 'combined' already has minute-level precision.
# We will group by this 'date' column to get minutely sentiment.

# Aggregate sentiment by minute (mean sentiment if multiple tweets in the same minute)
minutely_sentiment_df = combined.groupby('date')['numeric_sentiment'].mean().reset_index()
minutely_sentiment_df = minutely_sentiment_df.set_index('date')
# Localize the index to UTC to match spy_returns
minutely_sentiment_df.index = minutely_sentiment_df.index.tz_localize('UTC')
# The index is now 'date' and contains the minutely timestamps, localized to UTC.

print("\nMinutely Aggregated Sentiment (first 5 rows):")
minutely_sentiment_df.head()


Minutely Aggregated Sentiment (first 5 rows):


Unnamed: 0_level_0,numeric_sentiment
date,Unnamed: 1_level_1
2023-11-21 13:37:00+00:00,0.0
2023-11-30 12:15:00+00:00,0.0
2023-12-16 19:04:00+00:00,0.0
2023-12-26 20:54:00+00:00,0.0
2023-12-29 17:47:00+00:00,-0.886635


In [50]:
# Merge stock returns with sentiment data
# We'll do a left join on returns_df to keep all trading days
# Convert spy_returns Series to DataFrame before joining
merged_df = spy_returns.to_frame().join(minutely_sentiment_df, how='left')

# Forward-fill missing sentiment values (sentiment persists until a new tweet)
merged_df['numeric_sentiment'] = merged_df['numeric_sentiment'].fillna(method='ffill')

# Fill any remaining NaNs at the beginning (if no tweet before first trading day in range) with 0
merged_df['numeric_sentiment'] = merged_df['numeric_sentiment'].fillna(0)

print("\nMerged Data with Stock Returns and Sentiment (first 5 rows):")
display(merged_df.head())
print(f"\nShape of merged_df: {merged_df.shape}")
# Check for NaNs that might affect regression
print("\nNaNs in merged_df after processing:")
print(merged_df.isnull().sum())


Merged Data with Stock Returns and Sentiment (first 5 rows):


  merged_df['numeric_sentiment'] = merged_df['numeric_sentiment'].fillna(method='ffill')


Unnamed: 0_level_0,returns,numeric_sentiment
t,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-02 09:00:00+00:00,-4.2e-05,0.0
2024-01-02 09:01:00+00:00,-2.1e-05,0.0
2024-01-02 09:02:00+00:00,-2.1e-05,0.0
2024-01-02 09:03:00+00:00,-0.000168,0.0
2024-01-02 09:05:00+00:00,-2.1e-05,0.0



Shape of merged_df: (9916203, 2)

NaNs in merged_df after processing:
returns              0
numeric_sentiment    0
dtype: int64


In [51]:
import statsmodels.api as sm

# Run regressions

# Define dependent (Y) and independent (X) variables
Y = merged_df['returns'].copy()
X = merged_df['numeric_sentiment'].copy()

# Drop rows where Y is NaN (e.g., if a stock didn't trade on a particular day after initial processing)
# This also handles cases where X might be NaN if ffill/fill(0) didn't cover everything, though it should.
valid_data_idx = Y.notna() & X.notna()
Y = Y[valid_data_idx]
X = X[valid_data_idx]

# Add a constant for the intercept term
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(Y, X)
results = model.fit()


# Print the summary
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                returns   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     175.0
Date:                Mon, 19 May 2025   Prob (F-statistic):           5.90e-40
Time:                        18:04:03   Log-Likelihood:             6.0205e+07
No. Observations:             9916203   AIC:                        -1.204e+08
Df Residuals:                 9916201   BIC:                        -1.204e+08
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1.916e-07   1.77e-0