# 2: Two-Step Method using VADER as Lexicon-Based Model

In this script, the two-step strategy is applied to the dataset sp500_headlines_2008_2024.csv:

- Each article headline is evaluated with the VADER sentiment analyzer, yielding a sentiment score for the headline.
- For each trading day, the average sentiment across all headlines is computed, where the number of available headlines on a given day serves as the measure C_i.
- The daily sentiment estimate is then used as a generated regressor in a downstream economic model via simple linear regression.
- To assess the role of C_i, the sample is split into three groups (low, medium, high number of headlines) and the regression is estimated separately for each group.

### Import required packages

In [2]:
# Two-Step: Headlines -> Sentiment (θ̂t) -> Returns regression
# Expected columns in the CSV: 'Date', 'Headline', 'CP', 'Return', 'Return_log'
import pandas as pd
import numpy as np
import statsmodels.api as sm

### Prepare Data

In [3]:
# Load the headline data set, parse the 'Date' column into datetimes and
# order the rows chronologically with a fresh 0..n-1 index.
# NOTE(review): the path below says "2008_2004" while the notebook
# introduction refers to "sp500_headlines_2008_2024.csv" — confirm which
# name matches the file on disk before changing it.
df = (
    pd.read_csv('../data/sp500_headlines_2008_2004.csv')
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
      .sort_values('Date')
      .reset_index(drop=True)
)

### Two-step strategy

In [4]:
# ---- 1) Sentiment per headline ----
# Primary scorer: VADER (the 'compound' score of each headline).
# Fallback: a very small positive/negative word lexicon, used ONLY when the
# vaderSentiment package cannot be imported. Any other error (e.g. a missing
# 'Headline' column) is allowed to propagate instead of silently switching
# the sentiment model.
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except ImportError:
    use_vader = False
else:
    use_vader = True

if use_vader:
    sid = SentimentIntensityAnalyzer()
    # astype(str) so that non-string entries (e.g. NaN) can still be scored.
    df['sent_raw'] = df['Headline'].astype(str).map(lambda x: sid.polarity_scores(x)['compound'])
else:
    import warnings

    # Make the model switch visible: the fallback scores are not comparable
    # to VADER compound scores.
    warnings.warn(
        "vaderSentiment is not installed; falling back to a minimal pos/neg word lexicon.",
        RuntimeWarning,
    )
    pos = set("strong beat beats growth surge record upbeat rally gain improve upgrade robust".split())
    neg = set("fall falls fell drop drops miss concern concerns probe lawsuit volatility uncertainty downgrade weak".split())

    def lex_score(s: str) -> float:
        """Return (#positive words - #negative words) / 5, clipped to [-1, 1]."""
        words = str(s).lower().split()
        score = sum(w in pos for w in words) - sum(w in neg for w in words)
        return max(-1.0, min(1.0, score / 5.0))

    df['sent_raw'] = df['Headline'].map(lex_score)


### Aggregation

In [5]:
# Aggregation: collapse all headline rows of one date into a single daily row.
#   hat_theta      mean of the raw headline sentiment scores of that date
#   CP / Return_simple / Return_log
#                  'last' aggregation of the CP / Return / Return_log columns
#   n_headlines    number of headline rows on that date
# Note: using the median instead of the mean would be a more robust alternative.
agg_spec = {
    'hat_theta': ('sent_raw', 'mean'),
    'CP': ('CP', 'last'),
    'Return_simple': ('Return', 'last'),
    'Return_log': ('Return_log', 'last'),
    'n_headlines': ('Headline', 'size'),
}
daily = df.groupby('Date').agg(**agg_spec).reset_index()

### Downstream economic model

In [6]:
# Baseline model as introduced by Tetlock: R_t = γ0 + γ1 * θ̂_t + ε_t,
# estimated by OLS with Newey-West standard errors
# (HAC = heteroskedasticity and autocorrelation consistent).
def ols_hac(y, X, lags=5):
    """Fit an OLS regression of ``y`` on ``X`` plus a constant with HAC errors.

    Args:
        y: Dependent variable.
        X: Regressor(s); a constant column is added via ``sm.add_constant``.
        lags: Value passed as ``maxlags`` to the HAC covariance estimator.

    Returns:
        The fitted statsmodels results object. Rows with missing values are
        dropped (``missing='drop'``).
    """
    design = sm.add_constant(X)
    regression = sm.OLS(y, design, missing='drop')
    hac_options = {'maxlags': lags}
    return regression.fit(cov_type='HAC', cov_kwds=hac_options)

# Fitted regression results keyed by headline-count group label.
results_by_C = {}

# Define groups for the amount of unstructured data (n_headlines): low, medium, high.
# pd.qcut with q=3 splits the days at the terciles of n_headlines.
daily['C_group'] = pd.qcut(daily['n_headlines'], q=3, labels=["low", "medium", "high"])

# observed=True is passed explicitly: grouping on a categorical column without
# it raises a FutureWarning in recent pandas versions.
# NOTE(review): the rows inside one group are presumably not consecutive
# trading days, so the HAC lags refer to neighbouring rows of the subsample —
# confirm this is intended.
for grp, dailyg in daily.groupby("C_group", observed=True):
    # Reuse the shared helper so every group is estimated with the same
    # specification (constant + hat_theta, HAC with 5 lags, missing rows dropped).
    results_by_C[grp] = ols_hac(dailyg['Return_simple'], dailyg['hat_theta'], lags=5)

# Print results: coefficient table only (second table of the summary).
for grp, model in results_by_C.items():
    print(f"=== Group: {grp} ===")
    print(model.summary().tables[1])
    print("\n")


=== Group: low ===
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const       8.069e-05      0.000      0.285      0.775      -0.000       0.001
hat_theta      0.0004      0.001      0.374      0.708      -0.002       0.003


=== Group: medium ===
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const       9.346e-05      0.000      0.849      0.396      -0.000       0.000
hat_theta  -9.097e-06      0.000     -0.020      0.984      -0.001       0.001


=== Group: high ===
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0001      0.000      1.188      0.235    -8.7e-05       0.000
hat_theta      0.0024      0.001      2.536      0.011       0.001       0.004

  for grp, dailyg in daily.groupby("C_group"):
