In [None]:
# Two-Step: Headlines -> Sentiment (θ̂t) -> Returns regression
# Expected columns in your CSV: 'Date', 'Title', 'CP', 'Return_simple', 'Return_log'
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [None]:

# ---- 0) Load & prepare ----
# Raw string: '\s' is not a valid escape sequence, so a plain literal triggers a
# DeprecationWarning/SyntaxWarning on newer Pythons. The path value itself is unchanged.
# NOTE(review): the leading backslash means "root of the current drive" on Windows and is
# a literal filename character on POSIX — confirm this is the intended location.
df = pd.read_csv(r'\sp500_headlines_2008_2024.csv')
df['Date'] = pd.to_datetime(df['Date'])
# Stable sort keeps the file's row order within a date, so order-dependent aggregations
# such as 'last' give a deterministic result.
df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)


# ---- 1) Per-headline sentiment ----
# Primary scorer: VADER compound score. Fallback: a very small positive/negative
# word lexicon, used when VADER cannot be imported or fails while scoring.
use_vader = True
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()
    df['sent_raw'] = df['Title'].astype(str).map(lambda x: sid.polarity_scores(x)['compound'])
except Exception as e:
    use_vader = False
    # Say why the fallback kicked in; otherwise the switch to the much cruder
    # lexicon scorer is invisible in the notebook output.
    print(f"VADER unavailable ({type(e).__name__}: {e}); falling back to the simple lexicon scorer.")
    import re
    pos = set("strong beat beats growth surge record upbeat rally gain improve upgrade robust".split())
    neg = set("fall falls fell drop drops miss concern concerns probe lawsuit volatility uncertainty downgrade weak".split())
    def lex_score(s: str) -> float:
        """Score one headline as (#positive words - #negative words) / 5, clipped to [-1, 1].

        The input is converted with ``str()`` and lower-cased. Tokens are runs of
        ASCII letters, so punctuation attached to a word ("rally,", "miss:") does
        not prevent a lexicon match.
        """
        words = re.findall(r"[a-z]+", str(s).lower())
        score = sum(w in pos for w in words) - sum(w in neg for w in words)
        return max(-1.0, min(1.0, score/5.0))
    df['sent_raw'] = df['Title'].map(lex_score)

# ---- 2) Aggregation: several headlines per day -> one daily sentiment θ̂t ----
# (The mean is the default choice; the median is more robust — pick whichever suits you)
# Output column -> (source column, aggregation). One output row per distinct 'Date' value.
daily_spec = {
    'hat_theta': ('sent_raw', 'mean'),
    'CP': ('CP', 'last'),
    'Return_simple': ('Return_simple', 'last'),
    'Return_log': ('Return_log', 'last'),
    'n_headlines': ('Title', 'size'),
}
by_date = df.groupby('Date', as_index=False)
daily = by_date.agg(**daily_spec)

# Optional: z-standardise the daily sentiment (for interpretable coefficients)
theta_hat = daily['hat_theta']
theta_center = theta_hat.mean()
theta_scale = theta_hat.std(ddof=0)  # ddof=0: population standard deviation
daily['hat_theta_z'] = (theta_hat - theta_center) / theta_scale

print(daily.head())

# ---- 3) OLS with Newey-West/HAC standard errors ----
# Baseline model as in Tetlock: R_t = γ0 + γ1 * θ̂_t + ε_t
def ols_hac(y, X, lags=5):
    """Fit an OLS regression of ``y`` on ``X`` plus an intercept, using HAC covariance.

    Parameters
    ----------
    y : dependent variable passed to ``sm.OLS``.
    X : regressor(s); a constant column is added via ``sm.add_constant``.
    lags : value passed as ``maxlags`` for the HAC covariance estimator.

    Returns
    -------
    The fitted statsmodels results object. The model is built with
    ``missing='drop'``, so observations with missing values are excluded.
    """
    design = sm.add_constant(X)
    ols_model = sm.OLS(y, design, missing='drop')
    hac_options = {'maxlags': lags}
    return ols_model.fit(cov_type='HAC', cov_kwds=hac_options)

# Both baseline regressions share the same regressor: the z-scored daily sentiment.
sentiment_z = daily['hat_theta_z']

# a) simple return
mod_simple = ols_hac(daily['Return_simple'], sentiment_z, lags=5)
print("\n=== Two-Step: Return_simple ~ hat_theta_z (HAC) ===", mod_simple.summary(), sep="\n")

# b) log return
mod_log = ols_hac(daily['Return_log'], sentiment_z, lags=5)
print("\n=== Two-Step: Return_log ~ hat_theta_z (HAC) ===", mod_log.summary(), sep="\n")

# ---- 4) (Optional) Robustness: lagged sentiment & day-of-week fixed effects ----
daily['hat_theta_z_lag1'] = daily['hat_theta_z'].shift(1)  # sentiment of the previous row
daily['dow'] = daily['Date'].dt.dayofweek  # 0=Mon ... 4=Fri (presumably mostly trading days in your data)
# dtype=float: newer pandas versions return boolean dummies by default. Mixed with the
# float sentiment columns that produces an object-dtype design matrix, which statsmodels
# rejects. drop_first=True leaves the first weekday out as the reference category.
dow_dummies = pd.get_dummies(daily['dow'], prefix='dow', drop_first=True, dtype=float)

X_rob = pd.concat([daily[['hat_theta_z','hat_theta_z_lag1']], dow_dummies], axis=1)
# The first row has no lagged sentiment (NaN); ols_hac builds the model with missing='drop'.
mod_log_rob = ols_hac(daily['Return_log'], X_rob, lags=5)
print("\n=== Robustness: Return_log ~ hat_theta_z + lag1 + DOW FEs (HAC) ===")
print(mod_log_rob.summary())
