In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Seeded generator: every draw below comes from this single stream, so the
# order of the sampling calls determines the simulated data.
rng = np.random.default_rng(42)

# --- Design parameters ---
T = 800            # number of simulated days
gamma0 = 0.0       # intercept of the return equation
gamma1 = 0.02      # true effect of theta on returns
rho = 0.95         # AR(1) persistence of theta
tau = 0.05         # sd of the theta innovations
sigma_y = 0.01     # sd of the return noise
sigma_fin = 0.6    # per-headline measurement sd
C_low = 2          # minimum number of headlines per day
C_high = 30        # maximum number of headlines per day

# --- Latent state theta_t: AR(1) ---
# The first value is drawn with sd tau / sqrt(1 - rho^2); every later value
# is rho times the previous one plus a fresh N(0, tau) shock.
stationary_sd = tau / np.sqrt(1 - rho**2)
theta = np.zeros(T)
theta[0] = rng.normal(0, stationary_sd)
for day in range(1, T):
    shock = rng.normal(0, tau)
    theta[day] = rho * theta[day - 1] + shock

# --- Headlines and their daily mean ---
# Daily headline counts are integers in [C_low, C_high]
# (the upper bound passed to rng.integers is exclusive, hence the +1).
C_t = rng.integers(C_low, C_high + 1, size=T)

# For each day, draw one noisy signal per headline around that day's theta
# and keep only the average (the two-step proxy).
sbar = np.array([
    (level + rng.normal(0, sigma_fin, size=count)).mean()
    for level, count in zip(theta, C_t)
])

# --- Returns: linear in theta plus Gaussian noise ---
return_noise = rng.normal(0, sigma_y, size=T)
R = gamma0 + gamma1 * theta + return_noise

def ols(y, x, addc=True):
    """Fit an ordinary least squares regression of ``y`` on ``x``.

    Parameters
    ----------
    y : response passed to ``sm.OLS`` as the dependent variable.
    x : regressor(s) passed to ``sm.OLS`` as the design.
    addc : when true (the default), ``x`` is first passed through
        ``sm.add_constant`` so the model includes an intercept; when false,
        ``x`` is used exactly as given.

    Returns
    -------
    The object returned by ``sm.OLS(...).fit()``.
    """
    if addc:
        design = sm.add_constant(x)
    else:
        design = x
    model = sm.OLS(y, design)
    return model.fit()

# 1) Oracle (infeasible) regression: R on the true latent theta.
res_oracle = ols(R, theta)

# 2) Feasible two-step regression: R on the noisy daily proxy sbar.
res_feasible = ols(R, sbar)

# 3) Attenuation (reliability) ratio, kept for interpretation only.
#    Under classical measurement error (sbar = theta + noise, with the noise
#    unrelated to theta and to the return shock) the slope of R on sbar
#    converges to lambda * gamma1, where
#        lambda = Cov(theta, sbar) / Var(sbar).
#    This uses the latent theta, so it is a simulation diagnostic, not a
#    feasible correction. A feasible version would need a plug-in for the
#    measurement-error variance, which is about sigma_fin^2 / C_t per day.
#    The previous formula multiplied Var(theta)/Var(sbar) by Corr(theta, sbar),
#    which is not the attenuation factor.
cov_theta_sbar = np.cov(theta, sbar)  # 2x2 sample covariance matrix
lambda_att = cov_theta_sbar[0, 1] / cov_theta_sbar[1, 1]

print("--- Oracle (infeasible) ---")
print(res_oracle.params, "\n")
print("--- Feasible two-step ---")
print(res_feasible.params, "\n")

print(f"True gamma1: {gamma1:.4f}")
print(f"Oracle slope (≈ unbiased): {res_oracle.params[1]:.4f}")
print(f"Feasible slope (attenuated): {res_feasible.params[1]:.4f}")
print(f"Corr(theta, sbar): {np.corrcoef(theta, sbar)[0,1]:.3f},  Var(theta)={np.var(theta):.3f}, Var(sbar)={np.var(sbar):.3f}")


--- Oracle (infeasible) ---
[0.00013824 0.01642343] 

--- Feasible two-step ---
[-0.00019924  0.00457487] 

True gamma1: 0.0200
Oracle slope (≈ unbiased): 0.0164
Feasible slope (attenuated): 0.0046
Corr(theta, sbar): 0.578,  Var(theta)=0.019, Var(sbar)=0.053
