In [1]:
import pandas as pd
import re
from itertools import chain
from tqdm.auto import tqdm
from transformers import pipeline
from datasets import Dataset  # HuggingFace dataset
import numpy as np
import matplotlib.pyplot as plt
import nasdaqdatalink
import os
from dotenv import load_dotenv
load_dotenv()
API_KEY= os.getenv('API_KEY')
nasdaqdatalink.ApiConfig.api_key = API_KEY

In [3]:
combined = pd.read_csv('trump_truths_filtered.csv')
combined.rename(columns={'body_clean':'text', 'timestamp':'date'}, inplace=True)
combined['date'] = pd.to_datetime(combined['date'], format='%Y-%m-%d %H:%M:%S')
combined.drop(columns=['header_raw', 'market_related', 'body'], inplace=True)
combined.head()

Unnamed: 0,date,text
0,2025-05-17 10:27:00,Walmart should STOP trying to blame Tariffs as...
1,2025-05-09 08:43:00,"Many Trade Deals in the hopper, all good (GREA..."
2,2025-05-02 12:08:00,Maria Bartiromo: “As President Trump would say...
3,2025-04-30 09:13:00,"This is Biden’s Stock Market, not Trump’s. I d..."
4,2025-04-17 08:02:00,Had a very productive call with the President ...


In [5]:
# Initialize sentiment analysis pipeline with FinBERT
sentiment_pipeline = pipeline("sentiment-analysis", model="ProsusAI/finbert")

# Apply sentiment analysis to the 'text' column
# Using a loop with tqdm for progress bar, especially for larger datasets
# For smaller datasets like this, direct application is also fine
results = []
for text in tqdm(combined['text'], desc="Analyzing sentiment"):
    try:
        result = sentiment_pipeline(text)
        results.append(result[0]) # sentiment_pipeline returns a list of dicts
    except Exception as e:
        print(f"Error processing text: {text[:50]}... | Error: {e}")
        results.append({'label': 'ERROR', 'score': np.nan}) # Handle potential errors

# Convert results to DataFrame and merge
sentiment_df = pd.DataFrame(results)
combined['sentiment_label'] = sentiment_df['label']
combined['sentiment_score'] = sentiment_df['score']

# Display the first few rows with sentiment
print(combined.head())

# Display value counts for sentiment labels
print("\nSentiment Label Counts:")
print(combined['sentiment_label'].value_counts())

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use mps:0


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use mps:0


Analyzing sentiment:   0%|          | 0/66 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Device set to use mps:0


Analyzing sentiment:   0%|          | 0/66 [00:00<?, ?it/s]

                 date                                               text  \
0 2025-05-17 10:27:00  Walmart should STOP trying to blame Tariffs as...   
1 2025-05-09 08:43:00  Many Trade Deals in the hopper, all good (GREA...   
2 2025-05-02 12:08:00  Maria Bartiromo: “As President Trump would say...   
3 2025-04-30 09:13:00  This is Biden’s Stock Market, not Trump’s. I d...   
4 2025-04-17 08:02:00  Had a very productive call with the President ...   

  sentiment_label  sentiment_score  
0         neutral         0.840971  
1         neutral         0.716537  
2        positive         0.792937  
3         neutral         0.757984  
4         neutral         0.535753  

Sentiment Label Counts:
sentiment_label
neutral     38
negative    18
positive    10
Name: count, dtype: int64


In [6]:
# Save to CSV
combined.to_csv('finbert_sentiments.csv', index=False)

In [7]:
combined

Unnamed: 0,date,text,sentiment_label,sentiment_score
0,2025-05-17 10:27:00,Walmart should STOP trying to blame Tariffs as...,neutral,0.840971
1,2025-05-09 08:43:00,"Many Trade Deals in the hopper, all good (GREA...",neutral,0.716537
2,2025-05-02 12:08:00,Maria Bartiromo: “As President Trump would say...,positive,0.792937
3,2025-04-30 09:13:00,"This is Biden’s Stock Market, not Trump’s. I d...",neutral,0.757984
4,2025-04-17 08:02:00,Had a very productive call with the President ...,neutral,0.535753
...,...,...,...,...
61,2023-12-29 17:47:00,"THE ECONOMY IS TERRIBLE & INFLATION, WHICH BY ...",negative,0.886635
62,2023-12-26 20:54:00,Americans For Chinese Prosperity (Action?) is ...,neutral,0.859603
63,2023-12-16 19:04:00,"I kept America SAFE, I kept Israel SAFE, I ke...",neutral,0.872503
64,2023-11-30 12:15:00,Business Insider: Deutsche Bank executives pra...,neutral,0.855227


In [8]:
START_DATE = '2023-11-21'
END_DATE = '2025-05-17'
# Define the asset universe
tickers = {
    # US Equities
    'SPY': 'QUOTEMEDIA/PRICES',  # S&P 500 ETF
    'QQQ': 'QUOTEMEDIA/PRICES',  # Nasdaq 100 ETF
    'IWM': 'QUOTEMEDIA/PRICES',  # Russell 2000 ETF (Small Cap)
    # International Equities
    'EFA': 'QUOTEMEDIA/PRICES',  # MSCI EAFE ETF (Developed Markets ex-US/Canada)
    'EEM': 'QUOTEMEDIA/PRICES',  # MSCI Emerging Markets ETF
    # Fixed Income
    'AGG': 'QUOTEMEDIA/PRICES',  # US Aggregate Bond ETF
    'TLT': 'QUOTEMEDIA/PRICES',  # US 20+ Year Treasury Bond ETF
    # Commodities
    'GLD': 'QUOTEMEDIA/PRICES',  # Gold ETF
    'USO': 'QUOTEMEDIA/PRICES',  # Oil ETF
    # Real Estate
    'VNQ': 'QUOTEMEDIA/PRICES',   # US Real Estate ETF
    # Adding a couple more for diversity
    'GSG': 'QUOTEMEDIA/PRICES', # Broad Commodities ETF
    'HYG': 'QUOTEMEDIA/PRICES'  # High Yield Corporate Bond ETF
}



In [9]:
# Fetch data for each ticker
all_data = {}
for ticker, table in tickers.items():
    try:
        # Fetch data from the specified start_date
        data = nasdaqdatalink.get_table(
            table, 
            ticker=ticker, 
            qopts={'columns': ['date', 'adj_close']},
            date={'gte': START_DATE, 'lte': END_DATE},
            paginate=True
        )
        if not data.empty:
            data = data.set_index('date')
            all_data[ticker] = data['adj_close']
        else:
            print(f"No data found for {ticker}")
    except Exception as e:
        print(f"Could not retrieve data for {ticker}: {e}")

# Combine into a single DataFrame
if all_data:
    price_df = pd.DataFrame(all_data)
    price_df = price_df.sort_index() # Ensure dates are in ascending order
    print("\nSuccessfully fetched and combined data for all available tickers.")
    print(f"Price data from {price_df.index.min()} to {price_df.index.max()}")
else:
    print("No data was fetched. Please check ticker symbols and API key.")


Successfully fetched and combined data for all available tickers.
Price data from 2023-11-21 00:00:00 to 2025-05-16 00:00:00


In [10]:
price_df.head()

Unnamed: 0_level_0,SPY,QQQ,IWM,EFA,EEM,AGG,TLT,GLD,USO,VNQ,GSG,HYG
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2023-11-21,448.60493,385.82829,174.823245,69.710864,38.584562,92.099566,86.782334,185.35,72.53,75.930125,21.18,70.580802
2023-11-22,450.336919,387.407477,175.919471,69.875826,38.506614,92.186361,87.089019,184.56,71.61,76.304307,20.97,70.703223
2023-11-24,450.614038,386.861218,177.104579,70.409525,38.526101,91.762028,86.06354,185.52,70.82,76.524979,20.76,70.627887
2023-11-27,449.802477,386.52353,176.512025,70.166935,38.370204,92.2828,87.501127,186.77,69.96,76.803216,20.55,70.731474
2023-11-28,450.247846,387.536593,175.830587,70.186342,38.691742,92.687845,87.673638,189.26,71.26,77.31172,20.86,71.042237


In [11]:
# Calculate daily percentage returns
returns_df = price_df.pct_change()

# Drop the first row of NaNs created by pct_change()
returns_df = returns_df.dropna(how='all')

print("Daily Stock Returns (first 5 rows):")
print(returns_df.head())

Daily Stock Returns (first 5 rows):
                 SPY       QQQ       IWM       EFA       EEM       AGG  \
date                                                                     
2023-11-22  0.003861  0.004093  0.006270  0.002366 -0.002020  0.000942   
2023-11-24  0.000615 -0.001410  0.006737  0.007638  0.000506 -0.004603   
2023-11-27 -0.001801 -0.000873 -0.003346 -0.003445 -0.004047  0.005675   
2023-11-28  0.000990  0.002621 -0.003861  0.000277  0.008380  0.004389   
2023-11-29 -0.000703 -0.000974  0.005280  0.002074 -0.005792  0.004890   

                 TLT       GLD       USO       VNQ       GSG       HYG  
date                                                                    
2023-11-22  0.003534 -0.004262 -0.012684  0.004928 -0.009915  0.001734  
2023-11-24 -0.011775  0.005202 -0.011032  0.002892 -0.010014 -0.001066  
2023-11-27  0.016704  0.006738 -0.012143  0.003636 -0.010116  0.001467  
2023-11-28  0.001972  0.013332  0.018582  0.006621  0.015085  0.004394  
2023-11

In [12]:
import numpy as np

# Prepare sentiment data
# Create a numerical sentiment score: positive_score for positive, -negative_score for negative, 0 for neutral
combined['numeric_sentiment'] = np.select(
    [combined['sentiment_label'] == 'positive', combined['sentiment_label'] == 'negative'],
    [combined['sentiment_score'], -combined['sentiment_score']],
    default=0
)

# Ensure 'date' is just the date part, without time, for daily aggregation
combined['date_only'] = combined['date'].dt.normalize()

# Aggregate sentiment by day (mean sentiment if multiple tweets)
daily_sentiment_df = combined.groupby('date_only')['numeric_sentiment'].mean().reset_index()
daily_sentiment_df = daily_sentiment_df.set_index('date_only')
daily_sentiment_df.index.name = 'date' # Match index name for merging

print("\nDaily Aggregated Sentiment (first 5 rows):")
print(daily_sentiment_df.head())


Daily Aggregated Sentiment (first 5 rows):
            numeric_sentiment
date                         
2023-11-21           0.000000
2023-11-30           0.000000
2023-12-16           0.000000
2023-12-26           0.000000
2023-12-29          -0.886635


In [13]:
# Merge stock returns with sentiment data
# We'll do a left join on returns_df to keep all trading days
merged_df = returns_df.join(daily_sentiment_df, how='left')

# Forward-fill missing sentiment values (sentiment persists until a new tweet)
merged_df['numeric_sentiment'] = merged_df['numeric_sentiment'].fillna(method='ffill')

# Fill any remaining NaNs at the beginning (if no tweet before first trading day in range) with 0
merged_df['numeric_sentiment'] = merged_df['numeric_sentiment'].fillna(0)

print("\nMerged Data with Stock Returns and Sentiment (first 5 rows):")
print(merged_df.head())
print(f"\nShape of merged_df: {merged_df.shape}")
# Check for NaNs that might affect regression
print("\nNaNs in merged_df after processing:")
print(merged_df.isnull().sum())


Merged Data with Stock Returns and Sentiment (first 5 rows):
                 SPY       QQQ       IWM       EFA       EEM       AGG  \
date                                                                     
2023-11-22  0.003861  0.004093  0.006270  0.002366 -0.002020  0.000942   
2023-11-24  0.000615 -0.001410  0.006737  0.007638  0.000506 -0.004603   
2023-11-27 -0.001801 -0.000873 -0.003346 -0.003445 -0.004047  0.005675   
2023-11-28  0.000990  0.002621 -0.003861  0.000277  0.008380  0.004389   
2023-11-29 -0.000703 -0.000974  0.005280  0.002074 -0.005792  0.004890   

                 TLT       GLD       USO       VNQ       GSG       HYG  \
date                                                                     
2023-11-22  0.003534 -0.004262 -0.012684  0.004928 -0.009915  0.001734   
2023-11-24 -0.011775  0.005202 -0.011032  0.002892 -0.010014 -0.001066   
2023-11-27  0.016704  0.006738 -0.012143  0.003636 -0.010116  0.001467   
2023-11-28  0.001972  0.013332  0.018582  0.00662

  merged_df['numeric_sentiment'] = merged_df['numeric_sentiment'].fillna(method='ffill')


In [16]:
import statsmodels.api as sm

p_vals = []
betas = []
# Run regressions
# Iterate over each stock ticker column (excluding 'numeric_sentiment')
for ticker in returns_df.columns:
    print(f"\nRegression results for {ticker}:")
    
    # Define dependent (Y) and independent (X) variables
    Y = merged_df[ticker].copy()
    X = merged_df['numeric_sentiment'].copy()
    
    # Drop rows where Y is NaN (e.g., if a stock didn't trade on a particular day after initial processing)
    # This also handles cases where X might be NaN if ffill/fill(0) didn't cover everything, though it should.
    valid_data_idx = Y.notna() & X.notna()
    Y = Y[valid_data_idx]
    X = X[valid_data_idx]

    if len(Y) < 2 or len(X) < 2: # Not enough data points for regression
        print(f"Skipping {ticker} due to insufficient data after NaN removal (points: {len(Y)}).")
        continue
    if X.nunique() == 1: # Independent variable is constant
        print(f"Skipping {ticker} because sentiment score is constant for the available data.")
        continue

    # Add a constant for the intercept term
    X = sm.add_constant(X)
    
    # Fit the OLS model
    model = sm.OLS(Y, X)
    results = model.fit()

    # Store p-value and beta coefficient
    p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
    betas.append(results.params[1])  # beta coefficient for the sentiment
    
    # Print the summary
    print(results.summary())
    print("-" * 80)



Regression results for SPY:
                            OLS Regression Results                            
Dep. Variable:                    SPY   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     6.011
Date:                Mon, 19 May 2025   Prob (F-statistic):             0.0147
Time:                        14:16:41   Log-Likelihood:                 1143.3
No. Observations:                 371   AIC:                            -2283.
Df Residuals:                     369   BIC:                            -2275.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const    

  p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
  betas.append(results.params[1])  # beta coefficient for the sentiment
  p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
  betas.append(results.params[1])  # beta coefficient for the sentiment
  p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
  betas.append(results.params[1])  # beta coefficient for the sentiment
  p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
  betas.append(results.params[1])  # beta coefficient for the sentiment
  p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
  betas.append(results.params[1])  # beta coefficient for the sentiment
  p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
  betas.append(results.params[1])  # beta coefficient for the sentiment
  p_vals.append(results.pvalues[1])  # p-value for the sentiment coefficient
  betas.append(results.params

In [17]:
# make a DataFrame for p-values and betas with ticker
p_vals_df = pd.DataFrame({
    'Ticker': returns_df.columns,
    'P-Value': p_vals,
    'Beta': betas
})
p_vals_df

Unnamed: 0,Ticker,P-Value,Beta
0,SPY,0.014684,-0.002984
1,QQQ,0.015446,-0.003769
2,IWM,0.102616,-0.002587
3,EFA,0.372123,-0.00097
4,EEM,0.282616,-0.001312
5,AGG,0.086311,0.000673
6,TLT,0.026619,0.002232
7,GLD,0.698177,0.000443
8,USO,0.023515,-0.004492
9,VNQ,0.503622,-0.000835
