In [5]:
# =========================
# MICRO PROBE TRAIN (fast, self-contained)
# =========================
import time, torch, numpy as np
import torch.nn as nn, torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# --- Config ---
# Hyper-parameters for the probe run; all of them are read by the code further down in this cell.
PROBE_SEED          = 123         # seeds the train/val split and the torch / numpy RNGs
PROBE_WIDTH         = 192         # default hidden width of ProbeNet
PROBE_LAYERS        = 4           # default number of hidden Linear+Softplus stages in ProbeNet
PROBE_BATCH         = 8192        # mini-batch size used by both DataLoaders
# Number of optimizer steps (also the cosine schedule's T_max); bump to 4000 if you can.
# NOTE: the captured run of this cell reports 558.9s for 2000 steps, not the ~1-2 minutes first estimated.
PROBE_UPDATES       = 2000
PROBE_LR            = 1e-3        # initial Adam learning rate; annealed down to PROBE_LR/50
PROBE_WDECAY        = 1e-6        # Adam weight_decay
PROBE_LAMBDA_MAX    = 0.5         # weight of the delta+vega loss terms once warm-up is over
PROBE_WARMUP_STEPS  = 400         # steps over which that weight ramps linearly up to PROBE_LAMBDA_MAX
DEVICE              = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Use your loaded dfs/columns ---
# `df`, `feature_cols`, `TARGET_PRICE`, `delta_cols` and `vega_cols` must already exist in the
# notebook namespace; everything is cast to float32 to match the model's parameter dtype.
X = df[feature_cols].values.astype(np.float32)
y_p = df[TARGET_PRICE].values.astype(np.float32)
y_d = df[delta_cols].values.astype(np.float32)
y_v = df[vega_cols].values.astype(np.float32)

# train/val split
# Split row indices (not the arrays) so the same shuffled 99% / 1% partition is applied to
# features and all three label groups.
idx = np.arange(len(df))
tr_idx, val_idx = train_test_split(idx, test_size=0.01, random_state=PROBE_SEED, shuffle=True)
X_tr, X_val = X[tr_idx], X[val_idx]
p_tr, p_val = y_p[tr_idx], y_p[val_idx]
d_tr, d_val = y_d[tr_idx], y_d[val_idx]
v_tr, v_val = y_v[tr_idx], y_v[val_idx]

# Each dataset item is the 4-tuple (features, price, deltas, vegas).
train_ds = TensorDataset(torch.from_numpy(X_tr),  torch.from_numpy(p_tr),
                         torch.from_numpy(d_tr),  torch.from_numpy(v_tr))
val_ds   = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(p_val),
                         torch.from_numpy(d_val), torch.from_numpy(v_val))

# weighting
# Inverse-variance weights per Greek column, estimated on the training rows only.
# The std is floored at 1e-12 so a constant column cannot cause a division by zero.
d_std = np.maximum(d_tr.std(axis=0), 1e-12)
v_std = np.maximum(v_tr.std(axis=0), 1e-12)
w_delta = torch.tensor(1.0/(d_std**2), dtype=torch.float32, device=DEVICE)
w_vega  = torch.tensor(1.0/(v_std**2), dtype=torch.float32, device=DEVICE)

# model
class ProbeNet(nn.Module):
    """Small fully connected network mapping a feature row to one scalar.

    The body is a stack of ``Linear -> Softplus`` stages followed by a final
    ``Linear(width, 1)`` head. Softplus keeps the network smooth, so its
    input gradients (used as Greeks elsewhere in this cell) are well defined.

    Args:
        in_dim: number of input features.
        width: size of every hidden layer.
        layers: number of hidden ``Linear -> Softplus`` stages; values below
            1 still produce a single hidden stage.
    """

    def __init__(self, in_dim, width=PROBE_WIDTH, layers=PROBE_LAYERS):
        super().__init__()
        stack = []
        fan_in = in_dim
        # First stage projects in_dim -> width, every later stage is width -> width.
        for _ in range(max(layers, 1)):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.Softplus())
            fan_in = width
        stack.append(nn.Linear(width, 1))
        self.net = nn.Sequential(*stack)

        # Re-initialise the weights (biases keep the nn.Linear default) once
        # the whole stack has been built.
        linear_layers = [layer for layer in self.net if isinstance(layer, nn.Linear)]
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return the network output with the trailing size-1 axis removed."""
        out = self.net(x)
        return out.squeeze(1)

def weighted_mse(pred, true, w):
    """Return the mean of the squared error ``(pred - true)**2`` scaled by ``w``.

    ``w`` is multiplied element-wise (with broadcasting) before the mean is
    taken over every element.
    """
    err = pred - true
    weighted_sq = (err ** 2) * w
    return weighted_sq.mean()

# indices for autograd greeks
def require_feature(name: str) -> int:
    """Return the position of ``name`` in ``feature_cols``.

    Raises:
        ValueError: if ``name`` is not one of the feature columns.
    """
    return feature_cols.index(name)


# Input columns whose gradients are compared against the delta / vega labels.
delta_idx = [require_feature(col) for col in ("S0_0/K", "S0_1/K", "S0_2/K")]
vega_idx = [require_feature(col) for col in ("sigma_0", "sigma_1", "sigma_2")]

# --- train briefly ---
# Seed torch and numpy before the model is built so weight initialisation is repeatable.
torch.manual_seed(PROBE_SEED)
np.random.seed(PROBE_SEED)

model = ProbeNet(len(feature_cols)).to(DEVICE)
opt   = optim.Adam(model.parameters(), lr=PROBE_LR, weight_decay=PROBE_WDECAY)
# One scheduler step is taken per optimizer step, so T_max is the total number of updates.
sched = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=PROBE_UPDATES, eta_min=PROBE_LR/50)
mse   = nn.MSELoss()

# Pinned (page-locked) host memory only speeds up host->GPU copies. Asking for it on a
# CPU-only machine does nothing useful and makes DataLoader emit a warning, so enable it
# only when the batches will actually be moved to CUDA.
train_loader = DataLoader(train_ds, batch_size=PROBE_BATCH, shuffle=True,
                          pin_memory=(DEVICE.type == "cuda"))
val_loader   = DataLoader(val_ds,   batch_size=PROBE_BATCH, shuffle=False,
                          pin_memory=(DEVICE.type == "cuda"))

def lam_sob(step):
    """Return the weight applied to the delta+vega loss terms at ``step``.

    The weight grows linearly with ``step`` and is capped at
    ``PROBE_LAMBDA_MAX`` once ``step`` reaches ``PROBE_WARMUP_STEPS``.
    A non-positive ``PROBE_WARMUP_STEPS`` means "no warm-up": the full
    weight is returned immediately instead of dividing by zero (or
    producing a negative weight).
    """
    if PROBE_WARMUP_STEPS <= 0:
        return PROBE_LAMBDA_MAX
    return PROBE_LAMBDA_MAX * min(1.0, step / PROBE_WARMUP_STEPS)

# Fixed-budget training: run exactly PROBE_UPDATES optimizer steps, restarting the
# DataLoader iterator whenever it is exhausted (so an "epoch" boundary is invisible here).
print("\n[PROBE] Training…")
t0 = time.time()
it = iter(train_loader)
for step in range(1, PROBE_UPDATES+1):
    try: Xb, p_true, d_true, v_true = next(it)
    except StopIteration:
        it = iter(train_loader); Xb, p_true, d_true, v_true = next(it)
    # Inputs must track gradients so d(price)/d(features) can be taken below.
    Xb = Xb.to(DEVICE).requires_grad_(True)
    p_true, d_true, v_true = [t.to(DEVICE) for t in (p_true, d_true, v_true)]
    opt.zero_grad(set_to_none=True)

    p_pred = model(Xb)
    loss_p = mse(p_pred, p_true)
    # grad_outputs=ones sums the outputs before differentiating; because the network acts on
    # each row independently this yields the per-row input gradient. create_graph=True keeps
    # the graph so the Greek losses below can themselves be back-propagated to the weights.
    g = torch.autograd.grad(p_pred, Xb, grad_outputs=torch.ones_like(p_pred), create_graph=True)[0]
    d_pred = g[:, delta_idx]; v_pred = g[:, vega_idx]
    # Total loss = price MSE + lam_sob(step) * (weighted delta MSE + weighted vega MSE).
    loss = loss_p + lam_sob(step)*(weighted_mse(d_pred,d_true,w_delta)+weighted_mse(v_pred,v_true,w_vega))
    # Clip the global parameter-gradient norm to 1.0 before stepping.
    loss.backward(); torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step(); sched.step()

    # Validate on the first step, every 500th step, and the final step.
    if step % 500 == 0 or step==1 or step==PROBE_UPDATES:
        model.eval(); tot=cnt=0
        for Xv,pv,dv,vv in val_loader:
            # Gradients w.r.t. inputs are still needed here, so this cannot run under no_grad.
            Xv = Xv.to(DEVICE).requires_grad_(True); pv,dv,vv=[t.to(DEVICE) for t in (pv,dv,vv)]
            pp=model(Xv); lp=mse(pp,pv)
            gv=torch.autograd.grad(pp,Xv,grad_outputs=torch.ones_like(pp))[0]
            ld=weighted_mse(gv[:,delta_idx],dv,w_delta); lv=weighted_mse(gv[:,vega_idx],vv,w_vega)
            # Sample-weighted sum so the final figure is a per-row average over the val set.
            # The Greek terms use the current lam_sob(step), so values logged during warm-up
            # are not on the same scale as those logged after it.
            tot+=(lp+lam_sob(step)*(ld+lv)).item()*Xv.size(0); cnt+=Xv.size(0)
        # `train` is the loss of the most recent mini-batch only; `val` is the full-set average.
        print(f"[{step:>5}/{PROBE_UPDATES}] train={loss.item():.3e}  val={tot/cnt:.3e}")
        model.train()
print(f"[PROBE] Done in {time.time()-t0:.1f}s")

# --- evaluate on TEST ---
# The whole test set goes through the model in one batch, with gradients enabled on the
# inputs so the predicted Greeks can be read off as columns of d(price)/d(features).
X_test = torch.from_numpy(test_df[feature_cols].values.astype(np.float32)).to(DEVICE).requires_grad_(True)
p_pred = model(X_test)
g_test = torch.autograd.grad(p_pred, X_test, grad_outputs=torch.ones_like(p_pred))[0]
# Detach and move to host memory as NumPy arrays for the scikit-learn / NumPy metrics below.
price_pred = p_pred.detach().cpu().numpy()
delta_pred = g_test[:, delta_idx].detach().cpu().numpy()
vega_pred  = g_test[:, vega_idx].detach().cpu().numpy()

def r2_slope(pred, true, name):
    """Print R² plus the slope/intercept of a linear fit of ``pred`` on ``true``.

    When either array has zero standard deviation the fit is meaningless, so
    only ``R²=nan`` is printed. Nothing is returned in either case.
    """
    is_constant = np.std(true) == 0 or np.std(pred) == 0
    if is_constant:
        print(f"{name:>10s} | R²=nan")
        return

    y_true = true.ravel()
    y_pred = pred.ravel()
    r2 = r2_score(y_true, y_pred)
    # Degree-1 fit: pred ≈ a1 * true + a0.
    a1, a0 = np.polyfit(y_true, y_pred, 1)
    print(f"{name:>10s} | R²={r2:.6f}  slope={a1:.6f}  intercept={a0:.3e}")

# Report fit quality on the held-out test frame: price first, then each delta and vega
# column against the matching column of the network's input gradient.
print("\n[PROBE] Test-set R² and slopes")
r2_slope(price_pred, test_df[TARGET_PRICE].values, "Price")
for i in range(3): r2_slope(delta_pred[:,i], test_df[delta_cols].values[:,i], f"Delta_{i}")
for i in range(3): r2_slope(vega_pred[:,i],  test_df[vega_cols].values[:,i],  f"Vega_{i}")



[PROBE] Training…


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


[    1/2000] train=3.072e+00  val=6.092e-01
[  500/2000] train=6.420e-01  val=6.390e-01
[ 1000/2000] train=5.745e-01  val=5.707e-01
[ 1500/2000] train=5.715e-01  val=5.545e-01
[ 2000/2000] train=5.689e-01  val=5.515e-01
[PROBE] Done in 558.9s

[PROBE] Test-set R² and slopes
     Price | R²=0.961443  slope=0.971749  intercept=6.281e-03
   Delta_0 | R²=0.912088  slope=0.913034  intercept=1.221e-02
   Delta_1 | R²=0.912275  slope=0.917097  intercept=1.197e-02
   Delta_2 | R²=0.912716  slope=0.920972  intercept=1.247e-02
    Vega_0 | R²=-0.020745  slope=0.003393  intercept=-6.038e-02
    Vega_1 | R²=-0.012030  slope=0.002966  intercept=-5.221e-02
    Vega_2 | R²=-0.014062  slope=0.003082  intercept=-5.284e-02


In [6]:
# =========================
# VEGA DIAGNOSTIC: AD~FD + unit-mapping sweep
# (Run right now; uses your `model`, `test_df`, `feature_cols`, `delta_cols`, `vega_cols`)
# =========================
import numpy as np, torch
from sklearn.metrics import r2_score

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 1) AD vs FD sanity (vega_dim0) ---
# Checks that the autograd derivative w.r.t. sigma_0 agrees with a forward finite
# difference of the same model. This validates the gradient plumbing only; it says
# nothing about whether the model matches the labels.
n = min(len(test_df), 2048)  # cap the sample at the first 2048 test rows
Xb_np = test_df[feature_cols].values[:n].astype(np.float32)
Xb    = torch.from_numpy(Xb_np).to(DEVICE)

# Baseline prices at the unperturbed inputs.
with torch.no_grad():
    p0 = model(Xb).detach()

# choose one vega dim to sanity check
vega_cols_feat = [c for c in feature_cols if c.startswith("sigma_")]
assert len(vega_cols_feat) >= 1, "No sigma_* features found."
# Column position of sigma_0 inside the feature matrix.
jV = feature_cols.index("sigma_0")

# Forward difference: bump sigma_0 by eps and re-price.
# NOTE(review): everything here is float32, so with eps=1e-4 the quotient carries
# noticeable rounding noise; expect close-to-1 rather than exact agreement.
eps = 1e-4
Xe = Xb_np.copy(); Xe[:, jV] += eps
with torch.no_grad(): p_eps = model(torch.from_numpy(Xe).to(DEVICE)).detach()
vega_fd = (p_eps - p0).cpu().numpy() / eps

# Autograd derivative at the same points.
Xreq = Xb.clone().requires_grad_(True)
pp   = model(Xreq)
g    = torch.autograd.grad(pp, Xreq, grad_outputs=torch.ones_like(pp))[0]
vega_ad = g[:, jV].detach().cpu().numpy()

# Correlation and regression slope of AD on FD.
cV = np.corrcoef(vega_ad.ravel(), vega_fd.ravel())[0,1]
sV = np.polyfit(vega_fd.ravel(),  vega_ad.ravel(), 1)[0]
print(f"[AD~FD vega_dim0] Corr={cV:.6f}, slope(AD~FD)={sV:.6f}  -> should be ~1 & ~1")

# --- 2) Unit mapping sweep vs labels (all three vegas) ---
# Label matrix: one column per entry of vega_cols.
yV = test_df[vega_cols].values.astype(np.float32)

# get full predictions once
# Gradient of the model output w.r.t. every feature over the whole test set, then keep
# only the three sigma columns (in sigma_0, sigma_1, sigma_2 order).
X_all = torch.from_numpy(test_df[feature_cols].values.astype(np.float32)).to(DEVICE).requires_grad_(True)
p_all = model(X_all)
g_all = torch.autograd.grad(p_all, X_all, grad_outputs=torch.ones_like(p_all))[0]
vega_pred = g_all[:, [feature_cols.index("sigma_0"),
                      feature_cols.index("sigma_1"),
                      feature_cols.index("sigma_2")]].detach().cpu().numpy()

# helpers (pull K, sigma if available)
# If K is not explicit, we can often reconstruct from S0_i and S0_i/K if both exist; otherwise skip K tests.
def col_ok(name):
    """Return True when ``name`` is a column of ``test_df``."""
    return name in test_df.columns


# K is usable if it is stored directly, or if every leg has both S0_i and S0_i/K.
have_K = col_ok("K") or all(
    col_ok(col) for i in range(3) for col in (f"S0_{i}", f"S0_{i}/K")
)

if col_ok("K"):
    K = test_df["K"].values
elif have_K:
    # reconstruct K from leg 0: S0_0 / (S0_0/K)
    K = (test_df["S0_0"] / test_df["S0_0/K"]).values
else:
    K = None

# Volatility inputs, needed only for the variance re-mapping; None if any is missing.
if all(col_ok(f"sigma_{i}") for i in range(3)):
    sig = test_df[["sigma_0", "sigma_1", "sigma_2"]].values
else:
    sig = None

def eval_map(name, pred):
    """Print R², slope and intercept of ``pred`` against the vega labels ``yV``.

    R² is reported as NaN when either the labels or the predictions have
    zero standard deviation; the linear fit is computed in every case.
    """
    labels = yV.ravel()
    candidate = pred.ravel()
    if np.std(yV) > 0 and np.std(pred) > 0:
        r2 = r2_score(labels, candidate)
    else:
        r2 = np.nan
    # Degree-1 fit: pred ≈ a1 * label + a0.
    a1, a0 = np.polyfit(labels, candidate, 1)
    print(f"{name:>20s} | R²={r2:.6f}  slope={a1:.6f}  intercept={a0:.3e}")

# Try several candidate re-scalings of the autograd vega and score each against the labels.
# A mapping that matches the labels' units should give R² and slope close to 1.
print("\n[VEGA UNIT SWEEP] Comparing predicted vega under different mappings vs labels:")
eval_map("raw (∂p̂/∂σ)", vega_pred)  # p̂ = price/K, σ in decimals

# Strike scalings are only possible when K could be found or reconstructed.
if K is not None:
    eval_map("raw * K", vega_pred * K[:, None])
    eval_map("raw / K", vega_pred / K[:, None])

if sig is not None:
    # If labels are wrt variance v=σ^2, then ∂p̂/∂v = (1/(2σ))*∂p̂/∂σ  (when σ>0)
    # σ is clipped at 1e-8 to keep the division finite.
    eval_map("variance (1/(2σ))", vega_pred / (2.0 * np.clip(sig, 1e-8, None)))
    # If labels used percent-vol (σ% = 100*σ): ∂p̂/∂(σ%) = (1/100)*∂p̂/∂σ
    # NOTE: the two percent-vol mappings do not read `sig`; they just share this branch.
    eval_map("percent-vol (/100)", vega_pred / 100.0)
    eval_map("percent-vol (*100)", vega_pred * 100.0)

# Also try price vs price/K mismatch:
# If labels are ∂Price/∂σ but model gives ∂(Price/K)/∂σ, multiply by K
# NOTE: numerically this is the same product as the "raw * K" row above, printed under a second label.
if K is not None:
    eval_map("price-level (K * raw)", K[:, None] * vega_pred)


[AD~FD vega_dim0] Corr=0.999472, slope(AD~FD)=1.000211  -> should be ~1 & ~1

[VEGA UNIT SWEEP] Comparing predicted vega under different mappings vs labels:
        raw (∂p̂/∂σ) | R²=-0.015418  slope=0.003150  intercept=-5.518e-02
   variance (1/(2σ)) | R²=-0.017941  slope=0.001680  intercept=-1.017e-01
  percent-vol (/100) | R²=-0.022594  slope=0.000031  intercept=-5.518e-04
  percent-vol (*100) | R²=0.189626  slope=0.314996  intercept=-5.518e+00


In [1]:
import pandas as pd

# Load the training table to inspect its schema.
# NOTE: read_parquet is called without a column or row restriction, so this reads the
# entire file into memory (the captured output shows 4,821,510 rows), not a small sample.
df = pd.read_parquet("Train_Clean.parquet", engine="pyarrow")

# Show all column names
print("Columns:")
for col in df.columns:
    print(" •", col)

# Optional: show shape
print(f"\nRows: {len(df):,} | Columns: {len(df.columns)}")

# Peek at first 5 rows
print("\nSample rows:")
print(df.head())


Columns:
 • sigma_0
 • sigma_1
 • sigma_2
 • corr_0_1
 • corr_0_2
 • corr_1_2
 • r
 • T
 • delta_0
 • delta_1
 • delta_2
 • vega_0
 • vega_1
 • vega_2
 • delta_aad_0
 • delta_aad_1
 • delta_aad_2
 • vega_aad_0
 • vega_aad_1
 • vega_aad_2
 • S0_0/K
 • S0_1/K
 • S0_2/K
 • price/k

Rows: 4,821,510 | Columns: 24

Sample rows:
    sigma_0   sigma_1   sigma_2  corr_0_1  corr_0_2  corr_1_2     r         T  \
0  0.167014  0.166996  0.093563  0.595795  0.267429  0.404955  0.03  2.285714   
1  0.694955  0.560231  0.387874  0.040739  0.310864  0.304059  0.03  1.285714   
2  0.741406  0.116369  0.196987  0.542316  0.920858  0.695307  0.03  6.349206   
3  0.642632  0.504470  0.744726  0.323487  0.165483  0.568074  0.03  6.670635   
4  0.201289  0.721823  0.406528  0.160354  0.201242  0.880389  0.03  4.063492   

    delta_0   delta_1  ...  delta_aad_0  delta_aad_1  delta_aad_2  vega_aad_0  \
0  0.059344  0.840880  ...     0.059342     0.840888     0.030561  -18.786872   
1  0.170168  0.093786  ... 

In [2]:
import pandas as pd

def clean_parquet(in_path, out_path):
    """Replace the FD Greek columns of a parquet file with their AAD counterparts.

    For every ``delta_aad_i`` / ``vega_aad_i`` column present (i = 0..2), the
    column ``delta_i`` / ``vega_i`` is dropped and the AAD column is renamed
    to that canonical name. The result is written to ``out_path``.

    A canonical column is only dropped when its AAD replacement exists. This
    makes the function safe to run on a file that has already been cleaned
    (or that lacks some AAD columns): previously such a file would silently
    lose its delta/vega targets because they were dropped with nothing to
    rename in their place.

    Args:
        in_path: path of the parquet file to read.
        out_path: path the cleaned parquet file is written to.
    """
    # Load file
    df = pd.read_parquet(in_path, engine="pyarrow")
    print(f"Loaded {in_path}: {df.shape[0]:,} rows, {df.shape[1]} cols")

    # 1) Work out which AAD Greeks are available: {aad_name: canonical_name}
    rename_map = {
        f"{greek}_aad_{i}": f"{greek}_{i}"
        for greek in ("delta", "vega")
        for i in range(3)
        if f"{greek}_aad_{i}" in df.columns
    }

    # 2) Drop only the canonical columns that are about to be replaced
    replaced = set(rename_map.values())
    df = df.drop(columns=[c for c in replaced if c in df.columns])

    canonical = [f"delta_{i}" for i in range(3)] + [f"vega_{i}" for i in range(3)]
    kept = [c for c in canonical if c in df.columns]
    if kept:
        print(f"No AAD replacement found for {kept}; leaving them unchanged")

    # 3) Rename AAD Greeks → canonical names
    # NOTE(review): the sample row printed earlier in this notebook shows vega_aad_0 ≈ -18.8;
    # confirm the AAD vegas are in the units downstream code expects before swapping them in.
    df = df.rename(columns=rename_map)

    # 4) Save cleaned file
    df.to_parquet(out_path, engine="pyarrow", index=False)
    print(f"Saved cleaned file: {out_path} with {df.shape[1]} cols")

# Clean both train & test
# Outputs go to new "*_Cleaned.parquet" files; the input files are left untouched.
clean_parquet("Train_Clean.parquet", "Train_Clean_Cleaned.parquet")
clean_parquet("Test_Clean.parquet",  "Test_Clean_Cleaned.parquet")


Loaded Train_Clean.parquet: 4,821,510 rows, 24 cols
Saved cleaned file: Train_Clean_Cleaned.parquet with 18 cols
Loaded Test_Clean.parquet: 48,224 rows, 24 cols
Saved cleaned file: Test_Clean_Cleaned.parquet with 18 cols


In [3]:
# save as: check_train_clean.py
import numpy as np
import pandas as pd
from pathlib import Path

# File to validate.
# NOTE(review): the cleaning step in this notebook writes "Train_Clean_Cleaned.parquet";
# confirm this is the file you mean to check.
PATH = "Train_Clean.parquet"

# Columns expected after cleaning: model inputs first, then targets.
FEATURE_COLS = [
    *(f"sigma_{i}" for i in range(3)),
    "corr_0_1", "corr_0_2", "corr_1_2",
    "r", "T",
    *(f"S0_{i}/K" for i in range(3)),
]
TARGET_COLS = [
    "price/k",
    *(f"delta_{i}" for i in range(3)),
    *(f"vega_{i}" for i in range(3)),
]
EXPECTED = [*FEATURE_COLS, *TARGET_COLS]

# Raw (un-normalised) columns that must not be present in the checked file.
RAW_MUST_NOT_EXIST = ["K", "price", *(f"S0_{i}" for i in range(3)), "price_se"]

def main():
    """Load ``PATH`` and print a data-quality report.

    Sections printed, in order: schema, NA/Inf counts, range checks,
    delta sanity, vega scaling sanity, and a ``describe()`` summary.
    If any column in ``EXPECTED`` is missing, the report stops after the
    schema section.

    Raises:
        FileNotFoundError: if ``PATH`` does not exist.
    """
    if not Path(PATH).exists():
        raise FileNotFoundError(f"{PATH} not found")

    df = pd.read_parquet(PATH, engine="pyarrow")
    print(f"\n=== Loaded {PATH} → shape {df.shape} ===")

    # -------- 1) Schema checks --------
    missing = [c for c in EXPECTED if c not in df.columns]
    extra   = [c for c in df.columns if c not in EXPECTED]
    raw_left = [c for c in RAW_MUST_NOT_EXIST if c in df.columns] + \
               [c for c in df.columns if c.startswith("S0_") and "/K" not in c]

    print("\n[Schema]")
    print("Missing expected columns :", missing or "None ✅")
    print("Unexpected extra columns :", extra or "None ✅")
    print("RAW columns still present:", raw_left or "None ✅")

    # Stop early (no exception) if required columns are missing: the later
    # sections index df[EXPECTED] and would fail.
    if missing:
        print("\n❌ Schema incomplete — fix before training.")
        return

    # -------- 2) NA / Inf --------
    print("\n[NA / Inf]")
    na = df[EXPECTED].isna().sum()
    na = na[na > 0]
    print("NA counts:", ("None ✅" if na.empty else f"\n{na}"))
    inf_any = np.isinf(df[EXPECTED].select_dtypes(include=[float,int])).any().any()
    print("Any ±Inf? ", "No ✅" if not inf_any else "Yes ❌")

    # -------- 3) Generator-implied ranges --------
    print("\n[Range checks (from your data generator)]")
    def frac_bad(col, lo=None, hi=None):
        # Percentage of rows of `col` outside [lo, hi], with a 1e-6 tolerance on each bound.
        s = df[col]
        bad = np.zeros(len(s), dtype=bool)
        if lo is not None: bad |= (s < lo - 1e-6)
        if hi is not None: bad |= (s > hi + 1e-6)
        return 100 * bad.mean()

    def mark(ok):
        # Status glyph appended to each range-check line.
        return '✅' if ok else '⚠️'

    # Each percentage is computed once and reused for both the number and the
    # glyph; the table can have millions of rows, so scanning twice is wasteful.
    for i in range(3):
        col = f"sigma_{i}"
        pct = frac_bad(col, 0.05, 0.8)
        flag = mark(pct == 0)
        print(f"{col} in [0.05,0.8]: {pct:.3f}% {flag}")
    print(f"r == 0.03             : { (df['r']!=0.03).mean()*100:.3f}% {'✅' if (df['r']==0.03).all() else '⚠️'}")
    pct_T = frac_bad('T', 0.00397, 7.34)
    flag_T = mark(pct_T == 0)
    print(f"T in [0.00397,7.34]   : {pct_T:.3f}% {flag_T}")

    for c in ["corr_0_1","corr_0_2","corr_1_2"]:
        bad = ((df[c] < -1-1e-6) | (df[c] > 1+1e-6)).mean()*100
        print(f"{c:>8s} in [-1,1]       : {bad:.3f}% {'✅' if bad==0 else '⚠️'}")

    for c in ["S0_0/K","S0_1/K","S0_2/K"]:
        bad = (df[c] <= 0).mean()*100
        print(f"{c:>8s} > 0            : {bad:.3f}% {'✅' if bad==0 else '⚠️'}")

    bad = (df["price/k"] < 1e-6).mean()*100
    print(f"{'price/k':>8s} ≥ 1e-6       : {bad:.3f}% {'✅' if bad==0 else '⚠️'}")

    # -------- 4) Delta sanity (worst-of call) --------
    # Percentages here are the share of rows that PASS (100% is the good case).
    print("\n[Delta sanity]")
    D = df[[f"delta_{i}" for i in range(3)]].to_numpy()
    in_bounds = ((D >= -1e-6) & (D <= 1+1e-6)).all(axis=1).mean()*100
    sum_le_1  = (D.sum(axis=1) <= 1 + 1e-6).mean()*100
    print(f"each delta_i in [0,1] : {in_bounds:.2f}% {'✅' if in_bounds==100 else '⚠️'}")
    print(f"sum(delta_i) ≤ 1      : {sum_le_1:.2f}% {'✅' if sum_le_1==100 else '⚠️'}")

    # -------- 5) Vega scaling sanity (the key check) --------
    print("\n[Vega scaling sanity]")
    V = df[[f"vega_{i}" for i in range(3)]].copy()
    desc = V.describe(percentiles=[0.01,0.5,0.99]).T[["mean","std","min","1%","50%","99%","max"]]
    print(desc)

    # Heuristic flag: if many |vega| > 20, likely unscaled by K
    frac_large = (V.abs().gt(20).any(axis=1)).mean()*100
    print(f"Rows with any |vega| > 20: {frac_large:.3f}% {'✅ (scaled)' if frac_large==0 else '❌ (likely unscaled)'}")

    # -------- 6) Quick overall stats print --------
    print("\n[Quick .describe() of all features/targets]")
    print(df[EXPECTED].describe(percentiles=[0.01,0.5,0.99]).T)

    print("\n✅ Finished checks.")

if __name__ == "__main__":
    main()



=== Loaded Train_Clean.parquet → shape (4821510, 24) ===

[Schema]
Missing expected columns : None ✅
Unexpected extra columns : ['delta_aad_0', 'delta_aad_1', 'delta_aad_2', 'vega_aad_0', 'vega_aad_1', 'vega_aad_2']
RAW columns still present: None ✅

[NA / Inf]
NA counts: None ✅
Any ±Inf?  No ✅

[Range checks (from your data generator)]
sigma_0 in [0.05,0.8]: 0.000% ✅
sigma_1 in [0.05,0.8]: 0.000% ✅
sigma_2 in [0.05,0.8]: 0.000% ✅
r == 0.03             : 0.000% ✅
T in [0.00397,7.34]   : 1.656% ⚠️
corr_0_1 in [-1,1]       : 0.000% ✅
corr_0_2 in [-1,1]       : 0.000% ✅
corr_1_2 in [-1,1]       : 0.000% ✅
  S0_0/K > 0            : 0.000% ✅
  S0_1/K > 0            : 0.000% ✅
  S0_2/K > 0            : 0.000% ✅
 price/k ≥ 1e-6       : 0.000% ✅

[Delta sanity]
each delta_i in [0,1] : 99.23% ⚠️
sum(delta_i) ≤ 1      : 99.17% ⚠️

[Vega scaling sanity]
            mean       std       min        1%       50%       99%       max
vega_0 -0.031974  0.191593 -3.043398 -0.636702 -0.006985  0.520611 