### Import required packages

In [19]:
# Two-Step: Headlines -> Sentiment (θ̂t) -> Returns regression
# Erwartete Spalten in deiner CSV: 'Date', 'Headline', 'CP'
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import pandas as pd
import statsmodels.api as sm

### Prepare Data

In [5]:
df = pd.read_csv('../data/sp500_headlines_2008_2004.csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

### Load FinBERT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
model.eval()

### Compute Sentiments

In [11]:
# Define batch size
batch_size = 32
sentiments = []

# Plug Headlines into FinBERT
for i in tqdm(range(0, len(df), batch_size)):
    batch = df["Headline"].iloc[i:i+batch_size].astype(str).tolist()
    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    # FinBERT labels: 0 = negative, 1 = neutral, 2 = positive
    batch_scores = probs[:, 2] - probs[:, 0]  # Positive - Negative = Sentiment Score
    sentiments.extend(batch_scores.numpy())

df["finbert_sent"] = sentiments

100%|██████████| 598/598 [30:42<00:00,  3.08s/it]


### Aggregation

In [24]:
# Aggregation: take mean mean of the raw sentiment estimate for every day (average of duplicate entries)
# Note: choosing median for a more robust result is an extra option here
daily = (
    df.groupby('Date', as_index=False)
      .agg(hat_theta=('finbert_sent','mean'),
           CP=('CP','last'),
           Return_simple=('Return','last'),
           Return_log=('Return_log','last'),
           n_headlines=('Headline','size'))
)

### Downstream economic model

In [27]:
# OLS with Newey-West standard errors (HAC=heteroskedasticity and autocorrelation consistent)
# Basismodell as introduced by Tetlock: R_t = γ0 + γ1 * θ̂_t + ε_t
def ols_hac(y, X, lags=5):
    Xc = sm.add_constant(X)
    return sm.OLS(y, Xc, missing='drop').fit(cov_type='HAC', cov_kwds={'maxlags': lags})

results_by_C = {}

# Define groups for the amount of unstructered data (n_headlines): low, medium, high
daily['C_group'] = pd.qcut(daily['n_headlines'], q=3, labels=["low", "medium", "high"])

for grp, dailyg in daily.groupby("C_group"):
    X = sm.add_constant(dailyg['hat_theta'])
    y = dailyg['Return_simple']
    model = sm.OLS(y, X).fit(cov_type='HAC', cov_kwds={'maxlags': 5})
    results_by_C[grp] = model

# Print results
for grp, model in results_by_C.items():
    print(f"=== Group: {grp} ===")
    print(model.summary().tables[1])
    print("\n")


=== Group: low ===
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.663e-05      0.000     -0.061      0.952      -0.001       0.001
hat_theta     -0.0002      0.001     -0.369      0.712      -0.001       0.001


=== Group: medium ===
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const      -6.616e-06      0.000     -0.033      0.973      -0.000       0.000
hat_theta     -0.0002      0.000     -0.683      0.495      -0.001       0.000


=== Group: high ===
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0004      0.000      2.177      0.029    3.55e-05       0.001
hat_theta      0.0004      0.000      1.179      0.238      -0.000

  for grp, dailyg in daily.groupby("C_group"):
