# Overview of Project

# Data Generation 

# Explanation/ Reasoning 

## Code

In [None]:
import os, math, time, argparse, pathlib, sys
import numpy as np
import torch
import pyarrow as pa, pyarrow.parquet as pq

# --------- global knobs (keep from Code 1) ---------
# Speed-oriented backend settings: TF32 is allowed for CUDA matmuls and cuDNN
# is allowed to benchmark/select kernels. Default dtype stays float32; the
# Greek estimators below explicitly opt into float64 where they need it.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark       = True
torch.set_default_dtype(torch.float32)

N_ASSETS   = 3                       # number of underlyings per scenario
R_RATE     = 0.03                    # rate stored as `r` in every sampled scenario
SEED_BASE  = 42                      # base seed; main() adds --seed_offset (and the row index for CRN)
CHUNK_MAX  = 100_000                 # rows before flushing Parquet
NGPU       = torch.cuda.device_count()
DEVICES    = [torch.device(f"cuda:{i}") for i in range(NGPU)]
SIM_CHUNK  = 1_000_000               # per-GPU sub-batch size (paths)

# Everything below splits work across DEVICES, so abort at import time when
# no CUDA device is visible.
if not NGPU:
    sys.exit("No CUDA GPU visible – aborting.")

# --------- correlation sampler (Code 1) ---------
def cvine_corr_np(d, a: float = 5.0, b: float = 2.0) -> torch.Tensor:
    """Sample a random ``d x d`` correlation matrix with the C-vine method.

    Each partial correlation is drawn as ``2 * Beta(a, b) - 1`` and converted
    to a raw correlation by the usual C-vine recursion. Partial correlations
    are kept in their own matrix: the recursion for tree ``k`` needs the
    *partial* correlations of the earlier trees, not the raw correlations
    derived from them. (For ``d <= 3`` the two coincide, so results for
    three assets are unchanged.)

    The matrix is finally passed through an eigenvalue floor of ``1e-6`` as a
    numerical safeguard before being handed to a Cholesky factorisation.

    Args:
        d: Matrix dimension (number of assets).
        a: First shape parameter of the Beta distribution.
        b: Second shape parameter of the Beta distribution.

    Returns:
        A float32 tensor of shape ``(d, d)``; placed on the default CUDA
        device when CUDA is available, otherwise on the CPU.
    """
    partial = np.zeros((d, d))   # partial correlations (upper triangle only)
    P = np.eye(d)                # raw correlations
    for k in range(d - 1):
        for i in range(k + 1, d):
            partial[k, i] = 2.0 * np.random.beta(a, b) - 1.0
            rho = partial[k, i]
            # Walk back through the earlier trees to undo the conditioning.
            for m in range(k - 1, -1, -1):
                rho = (rho * np.sqrt((1 - partial[m, i] ** 2) * (1 - partial[m, k] ** 2))
                       + partial[m, i] * partial[m, k])
            P[k, i] = P[i, k] = rho
    ev, evec = np.linalg.eigh(P)
    P = evec @ np.diag(np.clip(ev, 1e-6, None)) @ evec.T
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.as_tensor(P, dtype=torch.float32, device=device)

# --------- random scenario generator (Code 1) ---------
def fg_sample():
    """Draw one random pricing scenario.

    Returns a dict with keys ``S0`` and ``sigma`` (float32 arrays of length
    ``N_ASSETS``), ``T`` (float, a squared integer in [1, 43] divided by 252 —
    presumably trading days per year), ``rho`` (correlation tensor from
    ``cvine_corr_np``), ``K`` (fixed at 100.0) and ``r`` (``R_RATE``).
    """
    # Draw order matters for reproducibility under a fixed NumPy seed:
    # normal -> uniform -> randint -> correlation sampler.
    log_ratio = np.random.normal(0.5, math.sqrt(0.25), N_ASSETS)
    vols      = np.random.uniform(0.05, 0.8, N_ASSETS)
    root_days = np.random.randint(1, 44)
    corr      = cvine_corr_np(N_ASSETS)

    spots = 100 * np.exp(log_ratio)
    return {
        "S0":    spots.astype(np.float32),
        "sigma": vols.astype(np.float32),
        "T":     float((root_days ** 2) / 252.0),
        "rho":   corr,
        "K":     100.0,
        "r":     R_RATE,
    }

# --------- helpers ---------
def _split_across_devices(total: int, ndev: int):
    base = total // ndev
    rem  = total % ndev
    return [base + (1 if i < rem else 0) for i in range(ndev)]

@torch.no_grad()
def terminal_prices(S0, sigma, T, rho, *, n_paths, r, device, gen=None):
    """Simulate ``n_paths`` correlated terminal prices in a single step.

    Standard normals of shape ``(n_paths, N_ASSETS)`` are correlated with the
    Cholesky factor of ``rho`` and plugged into
    ``exp(log(S0) + (r - sigma^2 / 2) * T + sigma * sqrt(T) * Y)``.

    The arithmetic runs inside a CUDA float16 autocast region.
    NOTE(review): which ops actually drop to half precision is decided by
    PyTorch's autocast policy — confirm this precision is acceptable.
    """
    chol    = torch.linalg.cholesky(rho.to(device))
    normals = torch.randn(n_paths, N_ASSETS, device=device, generator=gen)
    with torch.autocast('cuda', dtype=torch.float16):
        log_drift  = (r - 0.5 * sigma**2) * T
        correlated = normals @ chol.T
        log_shock  = sigma * math.sqrt(T) * correlated
        log_ST     = torch.log(S0) + log_drift + log_shock
        return torch.exp(log_ST)

# --------- streaming MC price (Code 1; fast path) ---------
def price_mc(params, n_paths, return_se=False):
    """Monte-Carlo price of the call on the minimum of the assets.

    Paths are split across all visible GPUs and simulated in sub-batches of at
    most ``SIM_CHUNK``. Per-chunk payoff sums are reduced on the device in
    float64 and only two scalars per chunk are copied to the host; this avoids
    both the full-array device->host transfer and the loss of precision from
    accumulating float32 sums (which matters for ``E[x^2] - mean^2``).

    Args:
        params: Scenario dict with keys ``S0``, ``sigma``, ``T``, ``rho``,
            ``K`` and ``r`` (as produced by ``fg_sample``).
        n_paths: Total number of simulated paths; must be positive.
        return_se: When true, also return the Monte-Carlo standard error.

    Returns:
        ``mean`` as a Python float, or ``(mean, se)`` when ``return_se``.

    Raises:
        ValueError: If ``n_paths`` is not positive.
    """
    if n_paths <= 0:
        raise ValueError(f"n_paths must be positive, got {n_paths}")

    counts = _split_across_devices(n_paths, NGPU)
    disc   = math.exp(-float(params['r']) * float(params['T']))

    # Undiscounted running sums; the discount factor is applied once at the end.
    pay_sum   = 0.0
    pay_sumsq = 0.0

    for dev, count in zip(DEVICES, counts):
        if count == 0:
            continue
        S0    = torch.tensor(params['S0'],    device=dev)
        sigma = torch.tensor(params['sigma'], device=dev)
        T     = torch.tensor(params['T'],     device=dev)
        r     = torch.tensor(params['r'],     device=dev)
        K     = torch.tensor(params['K'],     device=dev)

        for offset in range(0, count, SIM_CHUNK):
            sz  = min(SIM_CHUNK, count - offset)
            ST  = terminal_prices(S0, sigma, T, params['rho'], n_paths=sz, r=r, device=dev)
            pay = torch.clamp(ST.min(dim=1).values - K, min=0.0).double()
            pay_sum   += pay.sum().item()
            pay_sumsq += (pay * pay).sum().item()

    mean = disc * pay_sum / n_paths
    if not return_se:
        return mean
    var = disc * disc * pay_sumsq / n_paths - mean * mean
    # Guard against a tiny negative variance caused by rounding.
    se  = math.sqrt(max(var, 0.0) / n_paths)
    return mean, se

# ===================== FD Δ & ν WITH CRN (Code 2 semantics) =====================
def delta_vega_fd_crn(params, n_paths, rel=1e-4, base_seed=None):
    """
    Central finite-difference Δ and ν with common random numbers.

    For every chunk one set of correlated normals is drawn and reused for all
    up/down bumps, so the bumped payoffs differ only through the bumped input.
    • all tensors are float64
    • work is split across GPUs and processed in chunks of SIM_CHUNK paths
    • deterministic when base_seed is given (one generator per device/chunk)
    • bump size is max(rel * value, 1e-6) for both spot and volatility
    Returns (delta[N_ASSETS], vega[N_ASSETS]) as numpy float64 arrays.
    """
    discount  = math.exp(-float(params['r']) * float(params['T']))
    delta_acc = np.zeros(N_ASSETS, dtype=np.float64)
    vega_acc  = np.zeros(N_ASSETS, dtype=np.float64)

    def _bump_pair(vec, idx, step):
        """Return copies of `vec` with entry `idx` shifted by +step / -step."""
        up = vec.clone()
        up[idx] += step
        down = vec.clone()
        down[idx] -= step
        return up, down

    def _payoff_sum(spot, drift, diffusion, strike):
        """Sum of max(min_i S_T,i - strike, 0) over the chunk, as a Python float."""
        terminal = torch.exp(torch.log(spot) + drift + diffusion)
        payoff   = torch.clamp(terminal.min(dim=1).values - strike, 0.0)
        return payoff.sum().double().cpu().item()

    shares = _split_across_devices(n_paths, NGPU)
    for dev_idx, (dev, count) in enumerate(zip(DEVICES, shares)):
        if count == 0:
            continue

        # Scenario constants on this device, in float64.
        as_f64 = {"dtype": torch.float64, "device": dev}
        spot0  = torch.tensor(params['S0'],       **as_f64)
        vol0   = torch.tensor(params['sigma'],    **as_f64)
        Tf     = torch.tensor(float(params['T']), **as_f64)
        rf     = torch.tensor(float(params['r']), **as_f64)
        Kf     = torch.tensor(float(params['K']), **as_f64)
        chol   = torch.linalg.cholesky(params['rho'].to(dev).to(torch.float64))

        sqrt_T     = torch.sqrt(Tf)
        base_drift = (rf - 0.5 * vol0**2) * Tf      # unbumped drift, used by Δ bumps

        for chunk_idx, offset in enumerate(range(0, count, SIM_CHUNK)):
            n_chunk = min(SIM_CHUNK, count - offset)

            # Seed depends on (device, chunk) so reruns reproduce the same draws.
            if base_seed is None:
                gen = None
            else:
                gen = torch.Generator(device=dev)
                gen.manual_seed(int(base_seed) + 1_000_003*dev_idx + chunk_idx)

            # One draw per chunk, shared by every bump below.
            Z = torch.randn(n_chunk, N_ASSETS, dtype=torch.float64, device=dev, generator=gen)
            Y = Z @ chol.T
            base_diff = vol0 * sqrt_T * Y

            for i in range(N_ASSETS):
                # Δ_i: bump the i-th spot, keep volatility fixed.
                h_spot = float(max(rel * float(spot0[i].item()), 1e-6))
                spot_up, spot_dn = _bump_pair(spot0, i, h_spot)
                up_total = _payoff_sum(spot_up, base_drift, base_diff, Kf)
                dn_total = _payoff_sum(spot_dn, base_drift, base_diff, Kf)
                delta_acc[i] += discount * (up_total - dn_total) / (2.0 * h_spot)

                # ν_i: bump the i-th volatility; drift and diffusion both change.
                h_vol = float(max(rel * float(vol0[i].item()), 1e-6))
                vol_up, vol_dn = _bump_pair(vol0, i, h_vol)
                up_total = _payoff_sum(spot0, (rf - 0.5 * vol_up**2) * Tf, vol_up * sqrt_T * Y, Kf)
                dn_total = _payoff_sum(spot0, (rf - 0.5 * vol_dn**2) * Tf, vol_dn * sqrt_T * Y, Kf)
                vega_acc[i] += discount * (up_total - dn_total) / (2.0 * h_vol)

    # Accumulators hold sums over paths; divide to get derivatives of the mean.
    return delta_acc / n_paths, vega_acc / n_paths

# ===================== AAD Δ & ν (float64, chunked) =====================
def delta_vega_aad(params, n_paths, base_seed=None):
    """Pathwise Δ and ν of the discounted payoff via reverse-mode autograd.

    Paths are split across GPUs and processed in chunks of ``SIM_CHUNK``. For
    each chunk the discounted mean payoff is differentiated with respect to
    ``S0`` and ``sigma`` (float64 leaf tensors); chunk gradients are weighted
    by chunk size and finally divided by ``n_paths``. Seeding per device/chunk
    follows the same formula as ``delta_vega_fd_crn``.

    Returns ``(delta, vega)`` as numpy float64 arrays of length ``N_ASSETS``.
    """
    grad_spot = np.zeros(N_ASSETS, dtype=np.float64)
    grad_vol  = np.zeros(N_ASSETS, dtype=np.float64)

    shares = _split_across_devices(n_paths, NGPU)
    for dev_idx, (dev, count) in enumerate(zip(DEVICES, shares)):
        if count == 0:
            continue

        as_f64 = {"dtype": torch.float64, "device": dev}
        chol = torch.linalg.cholesky(params['rho'].to(dev).to(torch.float64))
        Tf   = torch.tensor(float(params['T']), **as_f64)
        rf   = torch.tensor(float(params['r']), **as_f64)
        Kf   = torch.tensor(float(params['K']), **as_f64)

        for chunk_idx, offset in enumerate(range(0, count, SIM_CHUNK)):
            n_chunk = min(SIM_CHUNK, count - offset)

            if base_seed is None:
                gen = None
            else:
                gen = torch.Generator(device=dev)
                gen.manual_seed(int(base_seed) + 1_000_003*dev_idx + chunk_idx)

            # Fresh leaves per chunk so each backward pass has its own graph.
            S0  = torch.tensor(params['S0'],    requires_grad=True, **as_f64)
            sig = torch.tensor(params['sigma'], requires_grad=True, **as_f64)

            Z = torch.randn(n_chunk, N_ASSETS, dtype=torch.float64, device=dev, generator=gen)
            Y = Z @ chol.T

            log_ST = torch.log(S0) + (rf - 0.5 * sig**2) * Tf + sig * torch.sqrt(Tf) * Y
            payoff = torch.clamp(torch.exp(log_ST).min(dim=1).values - Kf, 0.0)
            price  = torch.exp(-rf * Tf) * payoff.mean()

            dS, dV = torch.autograd.grad(price, (S0, sig), retain_graph=False, create_graph=False)
            # Gradients are of the chunk mean; re-weight by the chunk size.
            grad_spot += dS.detach().double().cpu().numpy() * n_chunk
            grad_vol  += dV.detach().double().cpu().numpy() * n_chunk

    return grad_spot / n_paths, grad_vol / n_paths

# ----------------- main driver -----------------
def main():
    """Generate scenarios, price them, estimate Greeks and stream to Parquet.

    For every row: sample a scenario, compute the Monte-Carlo price and its
    standard error, compute finite-difference Δ/ν with common random numbers
    and (unless ``--no-aad``) autograd Δ/ν, then append one flat record.
    Records are flushed to a single zstd-compressed Parquet file in batches of
    at most ``CHUNK_MAX`` rows.

    The Parquet writer is closed in a ``finally`` block so that rows already
    written are not lost in an unterminated file if a later row fails, and a
    non-positive ``--rows`` exits cleanly without creating a file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument('--rows',        type=int, default=5_000)
    ap.add_argument('--paths',       type=int, default=100_000_000)
    ap.add_argument('--seed_offset', type=int, default=0)
    ap.add_argument('--out',         type=str, default='Train_AllInOne.parquet')
    ap.add_argument('--no-aad',      dest='do_aad', action='store_false', default=True, help='disable AAD Δ/ν')
    args = ap.parse_args()

    np.random.seed(SEED_BASE + args.seed_offset)
    torch.manual_seed(SEED_BASE + args.seed_offset)

    out_path  = pathlib.Path(args.out)
    writer    = None
    total_t0  = time.time()
    sample_t = price_t = fd_t = aad_t = 0.0

    print(f"Launching Monte-Carlo for {args.rows:,} rows …", flush=True)
    global_row_idx = 0

    rows_left = args.rows
    try:
        while rows_left > 0:
            batch = min(rows_left, CHUNK_MAX)
            records = []

            for _ in range(batch):
                # ---- sample scenario (Code 1)
                t0 = time.perf_counter()
                p  = fg_sample()
                sample_t += time.perf_counter() - t0

                # ---- price & SE (Code 1)
                t1 = time.perf_counter()
                price, price_se = price_mc(p, args.paths, return_se=True)
                price_t += time.perf_counter() - t1

                # ---- FD Δ/ν with CRN (Code 2 semantics)
                # Per-row seed: FD and AAD see the same draws for this scenario.
                scen_seed = SEED_BASE + args.seed_offset + global_row_idx
                t2 = time.perf_counter()
                delta, vega = delta_vega_fd_crn(p, args.paths, rel=1e-4, base_seed=scen_seed)
                fd_t += time.perf_counter() - t2

                # ---- AAD Δ/ν (optional)
                if args.do_aad:
                    t3 = time.perf_counter()
                    delta_aad, vega_aad = delta_vega_aad(p, args.paths, base_seed=scen_seed)
                    aad_t += time.perf_counter() - t3
                else:
                    delta_aad = vega_aad = None

                # ---- flatten correlation matrix (upper triangle only)
                corr_mat = p['rho'].detach().cpu().numpy()
                corr_fields = {
                    f"corr_{i}_{j}": float(corr_mat[i, j])
                    for i in range(N_ASSETS) for j in range(i + 1, N_ASSETS)
                }

                # ---- assemble record
                rec = {
                    **{f"S0_{i}":     float(p['S0'][i])     for i in range(N_ASSETS)},
                    **{f"sigma_{i}":  float(p['sigma'][i])  for i in range(N_ASSETS)},
                    **corr_fields,
                    "K": float(p['K']),
                    "r": float(p['r']),
                    "T": float(p['T']),
                    "price":    float(price),
                    "price_se": float(price_se),
                    **{f"delta_{i}": float(delta[i]) for i in range(N_ASSETS)},
                    **{f"vega_{i}":  float(vega[i])  for i in range(N_ASSETS)},
                }
                if delta_aad is not None:
                    rec.update({f"delta_aad_{i}": float(delta_aad[i]) for i in range(N_ASSETS)})
                    rec.update({f"vega_aad_{i}":  float(vega_aad[i])  for i in range(N_ASSETS)})

                records.append(rec)
                global_row_idx += 1

            # ---- write Parquet (single file); schema comes from the first batch
            table = pa.Table.from_pylist(records)
            if writer is None:
                writer = pq.ParquetWriter(str(out_path), table.schema, compression='zstd')
            writer.write_table(table)
            rows_left -= batch
    finally:
        # Always finalise the file footer, even when a row raised.
        if writer is not None:
            writer.close()

    if writer is None:
        print("No rows requested – nothing written.")
        return

    print(f"Sampling: {sample_t:.1f}s | Pricing: {price_t:.1f}s | FD: {fd_t:.1f}s | AAD: {aad_t:.1f}s")
    print(f"Wrote {args.rows:,} rows → {out_path} in {time.time() - total_t0:.1f}s")

if __name__ == "__main__":
    main()


# Neural Net + AAD extraction of Greeks

## Explanation / Reasoning

## Code

In [None]:
# Cell 1 — imports & device setup
import os, math, random, time, pathlib, re
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from torchmetrics import R2Score

# Use inline plots
%matplotlib inline

# Set NCCL_P2P_LEVEL only if the environment has not already defined it.
os.environ.setdefault("NCCL_P2P_LEVEL", "0")
# Train on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", DEVICE)

# Seed every RNG used in this notebook (torch, NumPy, stdlib random).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Cell 2 — load & split cleaned Parquet into train/val/test TensorDatasets

# 1) Read cleaned Parquet files
#    (assumes you have already run clean_parquet on Train.parquet and Test100M.parquet)
train_df = pd.read_parquet("Train_clean_5m.parquet", engine="pyarrow")
test_df  = pd.read_parquet("Test_clean_5k.parquet", engine="pyarrow")

# 2) Split train_df → 99% train, 1% validation (test_size=0.01).
#    Every column except "price/k" is a feature; "price/k" is the target.
X_full = train_df.drop(columns=["price/k"]).values.astype(np.float32)
y_full = train_df["price/k"].values.astype(np.float32)
X_tr_np, X_val_np, y_tr_np, y_val_np = train_test_split(
    X_full, y_full, test_size=0.01, random_state=42
)

# 3) Build TensorDatasets (features, target) for train / validation / test
train_ds = TensorDataset(torch.from_numpy(X_tr_np), torch.from_numpy(y_tr_np))
val_ds   = TensorDataset(torch.from_numpy(X_val_np), torch.from_numpy(y_val_np))
test_ds  = TensorDataset(
    torch.from_numpy(test_df.drop(columns=["price/k"]).values.astype(np.float32)),
    torch.from_numpy(test_df["price/k"].values.astype(np.float32))
)

print(f"train {len(train_ds):,} rows")
print(f"valid {len(val_ds):,} rows")
print(f" test {len(test_ds):,} rows")



In [None]:
# Cell 3 — define the MLP (BasketNet) model

class BasketNet(nn.Module):
    """Fully connected ReLU network mapping ``in_dim`` features to one output."""

    def __init__(self, in_dim: int, width: int, layers: int):
        """
        in_dim = number of input features
        width  = nodes per hidden layer
        layers = number of hidden layers (at least one is always built)
        """
        super().__init__()
        # Input layer first, then (layers - 1) further hidden layers, then the head.
        stack = [nn.Linear(in_dim, width), nn.ReLU()]
        for _ in range(layers - 1):
            stack.extend((nn.Linear(width, width), nn.ReLU()))
        stack.append(nn.Linear(width, 1))
        self.net = nn.Sequential(*stack)

        # Xavier-uniform weights on every Linear layer; biases keep their defaults.
        linear_layers = [layer for layer in self.net if isinstance(layer, nn.Linear)]
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return predictions of shape (batch, 1)."""
        return self.net(x)


def run_experiment_updates_per_update(
    width: int = 300,
    layers: int = 4,
    batch_size: int = 50_000,
    n_updates: int = 100_000,     # total mini‐batch updates
    lr: float = 1e-3,
    optimizer_name: str = "Adam",
    seed: int = 42,
    save_model: bool = False,      # ← new flag
    log_every: int = 1             # record every `log_every` updates
):
    """
    Train for exactly n_updates mini‐batches, recording train/val MSE every
    log_every updates, plot the running-minimum curves, then evaluate on the
    test set.

    Args:
      width, layers:    model architecture
      batch_size:       mini‐batch size
      n_updates:        total optimizer steps (updates)
      lr:               learning rate
      optimizer_name:   "Adam" | "SGD" | "LBFGS"
      seed:             random seed
      save_model:       if True, save final weights to disk
      log_every:        record/compute validation every log_every steps
                        (values below 1 are treated as 1)

    Raises:
      ValueError: if optimizer_name is not one of the supported names.

    Note: when the model is wrapped in nn.DataParallel, the saved state dict
    keys carry the wrapper's "module." prefix.
    """
    # 5.1) Re‐seed for reproducibility
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # A non-positive interval would divide by zero in the modulo test below.
    log_every = max(1, int(log_every))

    # 5.2) Build DataLoaders
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, pin_memory=True)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, pin_memory=True)

    # 5.3) Instantiate the model
    in_dim = train_ds[0][0].shape[0]
    model = BasketNet(in_dim, width, layers).to(DEVICE)
    if DEVICE.type == "cuda" and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # 5.4) Loss & optimizer
    criterion = nn.MSELoss()
    if optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=lr)
    elif optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    elif optimizer_name == "LBFGS":
        optimizer = optim.LBFGS(model.parameters(), lr=lr, max_iter=20)
    else:
        raise ValueError(f"Unknown optimizer: {optimizer_name}")
    # LBFGS re-evaluates the loss internally and therefore needs a closure.
    needs_closure = optimizer_name == "LBFGS"

    # 5.5) Create an infinite iterator over shuffled train_loader
    def infinite_train_iter(loader):
        while True:
            for batch in loader:
                yield batch

    inf_train = infinite_train_iter(train_loader)

    def forward_backward(features, target):
        """Zero grads, compute the batch MSE, back-propagate and return the loss."""
        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(features), target)
        batch_loss.backward()
        return batch_loss

    # 5.6) Storage for train/val losses
    train_losses   = []
    valid_losses   = []
    steps_recorded = []

    # decide how often to print (every 10% of total updates)
    print_every = max(1, n_updates // 10)

    # 5.7) Main loop: exactly n_updates optimizer steps
    model.train()
    for step in range(1, n_updates + 1):
        # ----- 1) one optimizer step -----
        Xb, yb = next(inf_train)
        Xb, yb = Xb.to(DEVICE), yb.to(DEVICE).unsqueeze(1)
        if needs_closure:
            loss = optimizer.step(lambda: forward_backward(Xb, yb))
        else:
            loss = forward_backward(Xb, yb)
            optimizer.step()
        loss_train = loss.item()

        # ----- 2) record & validate every log_every steps -----
        if step == 1 or step == n_updates or (step % log_every == 0):
            # validation pass (size-weighted mean of per-batch MSE)
            model.eval()
            tot_val, cnt_val = 0.0, 0
            with torch.no_grad():
                for Xv, yv in val_loader:
                    Xv, yv = Xv.to(DEVICE), yv.to(DEVICE).unsqueeze(1)
                    ov = model(Xv)
                    tot_val += criterion(ov, yv).item() * Xv.size(0)
                    cnt_val += Xv.size(0)
            val_mse = tot_val / cnt_val

            # stash losses
            steps_recorded.append(step)
            train_losses.append(loss_train)
            valid_losses.append(val_mse)

            model.train()

            # ----- 3) print only at the first, last, or each 10% milestone -----
            if step == 1 or step == n_updates or (step % print_every == 0):
                print(f"[upd {step}/{n_updates}] "
                      f"train MSE={loss_train:.3e}  val MSE={val_mse:.3e}")

    # ---- 4) plot on the down-sampled grid (running minimum of each curve) ----
    train_min = np.minimum.accumulate(train_losses)
    valid_min = np.minimum.accumulate(valid_losses)

    plt.figure(figsize=(7, 4))
    plt.plot(steps_recorded, np.log10(train_min), label="train (cum-min)")
    plt.plot(steps_recorded, np.log10(valid_min), label="valid (cum-min)")
    plt.xlabel("update step")
    plt.ylabel("log10 MSE")
    plt.title(f"{layers}×{width} • bs={batch_size} • {optimizer_name}")
    plt.legend()
    plt.tight_layout()
    plt.show()

    # 5.9) Final test‐set evaluation
    model.eval()
    all_preds, all_truths = [], []
    with torch.no_grad():
        for Xb, yb in test_loader:
            # squeeze(1) keeps a 1-D tensor even when the batch has one row,
            # so torch.cat below always receives concatenable pieces.
            pr = model(Xb.to(DEVICE)).cpu().squeeze(1)
            all_preds.append(pr)
            all_truths.append(yb)
    preds  = torch.cat(all_preds)
    truths = torch.cat(all_truths)
    test_mse = criterion(preds.unsqueeze(1), truths.unsqueeze(1)).item()
    test_r2  = R2Score()(preds, truths).cpu().item()
    print(f"\nTest results → MSE = {test_mse:.3e}   R² = {test_r2:.4f}")

    # 5.10) Optionally save model weights
    if save_model:
        tag = f"w{width}_L{layers}_bs{batch_size}_upd{n_updates}_" \
              f"{optimizer_name}_lr{lr:g}"
        torch.save(model.state_dict(), f"model_{tag}.pt")
        print(f"Saved model to model_{tag}.pt")

# Train a 5×250 ReLU network with Adam for 100k updates, logging/validating
# every 100 steps, and save the final weights to disk.
run_experiment_updates_per_update(
        width=250,
        layers=5,
        batch_size=5000,
        n_updates=100000,
        lr=1e-3,
        optimizer_name="Adam",
        seed=42,
        save_model=True,
        log_every=100  
    )



# Dual Headed Neural Net 

## Explanation / Reasoning

## Code

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# 1) Device & seeds
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:", DEVICE)
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# 2) Load dataset with price and Greeks
#    NOTE(review): the file name suggests these are AAD-derived Greeks — confirm.
df = pd.read_parquet("Train_clean_5m_aad_greeks.parquet", engine="pyarrow")

# 3) Define target and feature columns
#    Targets: price/k, delta_0..2, vega_0..2. Every other column is a feature.
target_cols = ["price/k"] + [f"delta_{i}" for i in range(3)] + [f"vega_{i}" for i in range(3)]
feature_cols = [c for c in df.columns if c not in target_cols]

# 4) Extract NumPy arrays
X = df[feature_cols].values.astype(np.float32)
y = df[target_cols].values.astype(np.float32)

# 5) Train/val split (99/1)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.01, random_state=42)

# 6) Separate price, delta, vega targets (column order follows target_cols)
price_tr, delta_tr, vega_tr = y_tr[:,0], y_tr[:,1:4], y_tr[:,4:7]
price_val, delta_val, vega_val = y_val[:,0], y_val[:,1:4], y_val[:,4:7]

# 7) Build PyTorch datasets; each item is (features, price, deltas, vegas)
train_ds = TensorDataset(
    torch.from_numpy(X_tr),
    torch.from_numpy(price_tr),
    torch.from_numpy(delta_tr),
    torch.from_numpy(vega_tr)
)
val_ds = TensorDataset(
    torch.from_numpy(X_val),
    torch.from_numpy(price_val),
    torch.from_numpy(delta_val),
    torch.from_numpy(vega_val)
)
print(f"train {len(train_ds):,} rows")
print(f"valid {len(val_ds):,} rows")

# 8) Define single-output BasketNet model with Softplus activations
class BasketNet(nn.Module):
    """Fully connected Softplus network with a single scalar output per row."""

    def __init__(self, in_dim, width=300, layers=4):
        super().__init__()
        # Input layer first, then (layers - 1) further hidden layers, then the head.
        stack = [nn.Linear(in_dim, width), nn.Softplus()]
        for _ in range(layers - 1):
            stack.extend((nn.Linear(width, width), nn.Softplus()))
        stack.append(nn.Linear(width, 1))
        self.net = nn.Sequential(*stack)

        # Xavier-uniform weights on every Linear layer; biases keep their defaults.
        linear_layers = [layer for layer in self.net if isinstance(layer, nn.Linear)]
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return predictions of shape (batch,)."""
        out = self.net(x)
        return out.squeeze(1)

# 9) Feature indices for Greeks
# Column positions of the S0_i/K and sigma_i inputs inside feature_cols; the
# network's input gradient at these positions is read as delta / vega.
delta_idx = [feature_cols.index(f"S0_{i}/K") for i in range(3)]
vega_idx  = [feature_cols.index(f"sigma_{i}") for i in range(3)]

# 10) Training loop with Sobolev (differential) loss
def run_differential(
    width=300,
    layers=4,
    batch_size=50_000,
    n_updates=100_000,
    lr=1e-3,
    λ=1.0,
    log_every=5_000
):
    """Train a BasketNet with a price + input-gradient (Sobolev) loss.

    The loss is ``MSE(price) + λ * (MSE(delta) + MSE(vega))`` where the
    predicted deltas/vegas are the network's gradients with respect to the
    ``delta_idx`` / ``vega_idx`` input columns. Training runs for exactly
    ``n_updates`` Adam steps; the same combined loss is evaluated on the
    validation set at step 1, every ``log_every`` steps and at the last step.
    Plots log10 train/validation loss and returns the trained model.
    """
    for reseed in (torch.manual_seed, np.random.seed, random.seed):
        reseed(42)

    shared_kw = dict(batch_size=batch_size, pin_memory=True)
    train_loader = DataLoader(train_ds, shuffle=True, **shared_kw)
    val_loader   = DataLoader(val_ds,   shuffle=False, **shared_kw)

    model     = BasketNet(len(feature_cols), width, layers).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    def endless(loader):
        """Cycle over the loader forever (reshuffled on every pass)."""
        while True:
            yield from loader

    def combined_loss(batch, *, differentiable):
        """Return (price loss + λ * Greek loss, batch size) for one batch.

        `differentiable` keeps the graph of the input gradient so the Greek
        term can itself be back-propagated (needed for training only).
        """
        feats, p_ref, d_ref, v_ref = batch
        feats = feats.to(DEVICE).requires_grad_(True)
        p_ref, d_ref, v_ref = (t.to(DEVICE) for t in (p_ref, d_ref, v_ref))

        p_hat  = model(feats)
        price_term = criterion(p_hat, p_ref)

        sens = torch.autograd.grad(
            outputs=p_hat,
            inputs=feats,
            grad_outputs=torch.ones_like(p_hat),
            create_graph=differentiable
        )[0]
        greek_term = criterion(sens[:, delta_idx], d_ref) + criterion(sens[:, vega_idx], v_ref)

        return price_term + λ * greek_term, feats.size(0)

    history = {"step": [], "train": [], "valid": []}
    batches = endless(train_loader)

    model.train()
    for step in range(1, n_updates + 1):
        optimizer.zero_grad()
        loss, _ = combined_loss(next(batches), differentiable=True)
        loss.backward()
        optimizer.step()

        due = step == 1 or step % log_every == 0 or step == n_updates
        if not due:
            continue

        # validation & logging (size-weighted mean of per-batch losses)
        model.eval()
        weighted_sum, n_seen = 0.0, 0
        for val_batch in val_loader:
            val_term, n_rows = combined_loss(val_batch, differentiable=False)
            weighted_sum += val_term.item() * n_rows
            n_seen       += n_rows
        val_loss = weighted_sum / n_seen

        history["train"].append(loss.item())
        history["valid"].append(val_loss)
        history["step"].append(step)
        print(f"[upd {step}/{n_updates}] train loss={loss.item():.3e}  val loss={val_loss:.3e}")
        model.train()

    # plot training curves
    plt.figure(figsize=(7,4))
    for key, label in (("train", "train"), ("valid", "valid")):
        plt.plot(history["step"], np.log10(history[key]), label=label)
    plt.xlabel("update step")
    plt.ylabel("log10 loss")
    plt.legend()
    plt.tight_layout()
    plt.show()

    return model

# 11) Run training (5×250 Softplus network, equal weight on price and Greek losses)
model = run_differential(
    width=250,
    layers=5,
    batch_size=5_000,
    n_updates=100_000,
    lr=1e-3,
    λ=1.0,
    log_every=5_000
)

# 12) Test evaluation
# The whole test set is pushed through the model in one batch; inputs require
# grad so the Greeks can be read off the input gradient.
test_df = pd.read_parquet("Test_clean_5k_aad_greeks.parquet", engine="pyarrow")
X_test = torch.from_numpy(test_df[feature_cols].values.astype(np.float32)).to(DEVICE).requires_grad_(True)
y_test = test_df[target_cols].values.astype(np.float32)

model.eval()
# forward pass
p_pred_tensor = model(X_test)
# compute AAD Greeks: gradient of the summed prices w.r.t. the inputs gives
# each row's own input gradient
grad_test = torch.autograd.grad(
    outputs=p_pred_tensor,
    inputs=X_test,
    grad_outputs=torch.ones_like(p_pred_tensor)
)[0]

# convert to numpy
price_pred = p_pred_tensor.detach().cpu().numpy()
delta_pred = grad_test[:, delta_idx].detach().cpu().numpy()
vega_pred  = grad_test[:, vega_idx].detach().cpu().numpy()

# true targets (column order follows target_cols: price, 3 deltas, 3 vegas)
p_true = y_test[:, 0]
d_true = y_test[:, 1:4]
v_true = y_test[:, 4:7]

# 13) Metrics & plots
def report_and_plot(name, pred, true):
    """Print MAE / RMSE / R² for one target and draw a true-vs-predicted scatter.

    R² here is the squared Pearson correlation between `pred` and `true`.
    """
    err  = pred - true
    mae  = np.abs(err).mean()
    rmse = np.sqrt((err**2).mean())
    pearson = np.corrcoef(pred, true)[0,1]
    r2   = pearson**2
    print(f"{name:>8s}: MAE={mae:.3e}, RMSE={rmse:.3e}, R²={r2:.4f}")

    # Common axis range so the dashed identity line spans both series.
    lo = min(true.min(), pred.min())
    hi = max(true.max(), pred.max())

    plt.figure(figsize=(4,4))
    plt.scatter(true, pred, s=3, alpha=0.3)
    plt.plot([lo,hi],[lo,hi],'k--')
    plt.title(f"{name}: MAE={mae:.3e}, RMSE={rmse:.3e}, R²={r2:.4f}")
    plt.xlabel("True")
    plt.ylabel("Predicted")
    plt.grid(ls=":")
    plt.tight_layout()
    plt.show()

# Report price first, then each delta and each vega component separately.
print("\nTest set performance:")
report_and_plot("Price", price_pred, p_true)
for i in range(3):
    report_and_plot(f"Delta_{i}", delta_pred[:, i], d_true[:, i])
for i in range(3):
    report_and_plot(f"Vega_{i}", vega_pred[:, i], v_true[:, i])

# 14) Save NN predictions for Test and Train (5k) in comparison-ready format
pred_cols = ["price/k"] + [f"delta_{i}" for i in range(3)] + [f"vega_{i}" for i in range(3)]

def predict_df(df_like: pd.DataFrame, batch_size: int = 100_000) -> pd.DataFrame:
    """Return a DataFrame with columns ['price/k','delta_0..2','vega_0..2'] for rows in df_like.

    Rows are processed in order in batches of `batch_size`; the price is the
    model output and the Greeks are its input gradient at the `delta_idx` /
    `vega_idx` columns. The result carries `df_like`'s own index, row for row.

    Batches are produced in positional order, so the input index is attached
    directly. Re-indexing by label is unnecessary and would duplicate rows
    when the index contains repeated labels. An empty input yields an empty
    frame with the prediction columns.
    """
    model.eval()
    blocks = []
    for start in range(0, len(df_like), batch_size):
        chunk = df_like.iloc[start:start + batch_size]
        Xb = torch.from_numpy(chunk[feature_cols].values.astype(np.float32)).to(DEVICE)
        Xb.requires_grad_(True)

        pb = model(Xb)  # (B,)
        gb = torch.autograd.grad(outputs=pb, inputs=Xb, grad_outputs=torch.ones_like(pb))[0]  # (B, F)

        blocks.append(np.column_stack([
            pb.detach().cpu().numpy(),
            gb[:, delta_idx].detach().cpu().numpy(),
            gb[:, vega_idx].detach().cpu().numpy()
        ]))

    if blocks:
        out = np.vstack(blocks)
    else:
        out = np.empty((0, len(pred_cols)), dtype=np.float32)
    return pd.DataFrame(out, columns=pred_cols, index=df_like.index)

# Test predictions saved as Test_clean_5k_NN.parquet (index kept so rows can
# be matched back to the source frame)
test_pred_df = predict_df(test_df)
test_pred_df.to_parquet("Test_clean_5k_NN.parquet", engine="pyarrow", index=True)

# Train 5k sample predictions saved as Train_clean_5k_NN.parquet
# (fixed random_state so the same 5,000 training rows are drawn each run)
train_5k = df.sample(n=5000, random_state=42)
train_pred_df = predict_df(train_5k)
train_pred_df.to_parquet("Train_clean_5k_NN.parquet", engine="pyarrow", index=True)

print("Saved: Test_clean_5k_NN.parquet  and  Train_clean_5k_NN.parquet")


# Results

In [1]:
import pandas as pd
import numpy as np

# ——— load as before ———
# Reference Greeks (Monte-Carlo AAD and FD) and the two sets of model Greeks.
# NOTE(review): the comparisons below subtract frames label-wise, so all four
# files are presumably expected to share the same row index — confirm.
aad_mc     = pd.read_parquet("Test_clean_5k_aad_greeks.parquet")
fd_mc      = pd.read_parquet("Test_clean_5k_fd_greeks.parquet")
model_aad  = pd.read_parquet("Test_clean_5k_Model+AAD_greeks.parquet")
model_nn   = pd.read_parquet("Test_clean_5k_NN.parquet")   # <-- added NN predictions

# Column groups compared by enhanced_stats
delta_cols = [f"delta_{i}" for i in range(3)]
vega_cols  = [f"vega_{i}"  for i in range(3)]

def enhanced_stats(stored: pd.DataFrame, model: pd.DataFrame, cols):
    """Summarise the per-column error ``model[cols] - stored[cols]``.

    Args:
        stored: Frame holding the reference values.
        model: Frame holding the values being evaluated (a DataFrame, not a
            network). The subtraction aligns it with ``stored`` on the index.
        cols: Column labels to compare; they become the index of the result.

    Returns:
        DataFrame indexed by ``cols`` with the columns count, mean_diff,
        std_diff, min_diff, 25%, 50%, 75%, max_diff, MAE, MSE, RMSE and R2.
        R2 is the squared Pearson correlation between the two columns.
    """
    err = model[cols].sub(stored[cols])

    # Distribution summary first, then the aggregate error metrics.
    per_column = {
        'count': err.count(),
        'mean_diff': err.mean(),
        'std_diff': err.std(),
        'min_diff': err.min(),
        '25%': err.quantile(0.25),
        '50%': err.median(),
        '75%': err.quantile(0.75),
        'max_diff': err.max(),
        'MAE': err.abs().mean(),
        'MSE': err.pow(2).mean(),
    }
    stats = pd.DataFrame(
        {label: series.to_numpy() for label, series in per_column.items()},
        index=cols,
    )
    stats['RMSE'] = np.sqrt(stats['MSE'])

    # R^2 = (Pearson r)^2
    stats['R2'] = [stored[name].corr(model[name]) ** 2 for name in cols]

    return stats

# ——— error tables: Model+AAD predictions against each MC reference ———
delta_aad_vs_mc_aad = enhanced_stats(aad_mc, model_aad, delta_cols)
delta_aad_vs_mc_fd = enhanced_stats(fd_mc, model_aad, delta_cols)
vega_aad_vs_mc_aad = enhanced_stats(aad_mc, model_aad, vega_cols)
vega_aad_vs_mc_fd = enhanced_stats(fd_mc, model_aad, vega_cols)

# ——— error tables: NN predictions against each MC reference ———
delta_nn_vs_mc_aad = enhanced_stats(aad_mc, model_nn, delta_cols)
delta_nn_vs_mc_fd = enhanced_stats(fd_mc, model_nn, delta_cols)
vega_nn_vs_mc_aad = enhanced_stats(aad_mc, model_nn, vega_cols)
vega_nn_vs_mc_fd = enhanced_stats(fd_mc, model_nn, vega_cols)

# ——— display: one heading followed by its table, in a fixed order ———
_reports = (
    ("=== Delta: Model AAD vs MC AAD ===", delta_aad_vs_mc_aad),
    ("\n=== Delta: Model AAD vs MC FD ===", delta_aad_vs_mc_fd),
    ("\n=== Vega: Model AAD vs MC AAD ===", vega_aad_vs_mc_aad),
    ("\n=== Vega: Model AAD vs MC FD ===", vega_aad_vs_mc_fd),
    ("\n=== Delta: Model NN vs MC AAD ===", delta_nn_vs_mc_aad),
    ("\n=== Delta: Model NN vs MC FD ===", delta_nn_vs_mc_fd),
    ("\n=== Vega: Model NN vs MC AAD ===", vega_nn_vs_mc_aad),
    ("\n=== Vega: Model NN vs MC FD ===", vega_nn_vs_mc_fd),
)
for _heading, _table in _reports:
    print(_heading)
    display(_table)


=== Delta: Model AAD vs MC AAD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
delta_0,4806,-0.000707,0.014806,-0.15729,-0.005265,-0.000404,0.004152,0.110328,0.008789,0.00022,0.014821,0.993985
delta_1,4806,0.000362,0.016276,-0.152547,-0.004731,4.4e-05,0.004693,0.160013,0.009169,0.000265,0.016278,0.993185
delta_2,4806,-0.000707,0.014941,-0.124199,-0.005217,-0.000312,0.003866,0.183293,0.008755,0.000224,0.014956,0.993885



=== Delta: Model AAD vs MC FD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
delta_0,4806,-0.000707,0.014806,-0.15729,-0.005265,-0.000404,0.004151,0.110327,0.008789,0.00022,0.014821,0.993985
delta_1,4806,0.000362,0.016276,-0.152546,-0.004731,4.4e-05,0.004693,0.16001,0.009169,0.000265,0.016278,0.993185
delta_2,4806,-0.000707,0.014941,-0.124199,-0.005217,-0.000312,0.003866,0.183294,0.008755,0.000224,0.014956,0.993885



=== Vega: Model AAD vs MC AAD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
vega_0,4806,0.000123,0.023004,-0.176788,-0.009497,0.000137,0.008973,0.197933,0.014824,0.000529,0.023002,0.985453
vega_1,4806,0.000515,0.024158,-0.311557,-0.008432,0.000171,0.009013,0.248155,0.014766,0.000584,0.024161,0.982299
vega_2,4806,-0.000697,0.024758,-0.297383,-0.009155,-0.000226,0.008687,0.199936,0.014985,0.000613,0.024765,0.982617



=== Vega: Model AAD vs MC FD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
vega_0,4806,0.000123,0.023004,-0.176788,-0.009497,0.000138,0.008973,0.197929,0.014824,0.000529,0.023002,0.985453
vega_1,4806,0.000515,0.024158,-0.311557,-0.008432,0.000171,0.009013,0.248155,0.014766,0.000584,0.024161,0.982299
vega_2,4806,-0.000697,0.024758,-0.297384,-0.009155,-0.000226,0.008687,0.199935,0.014985,0.000613,0.024765,0.982617



=== Delta: Model NN vs MC AAD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
delta_0,4806,0.000782,0.004059,-0.0585,-0.000192,0.00048,0.001433,0.108188,0.001932,1.7e-05,0.004133,0.999558
delta_1,4806,-0.000553,0.005777,-0.113954,-0.001832,-0.000571,0.00026,0.109329,0.002547,3.4e-05,0.005803,0.99914
delta_2,4806,0.000767,0.003752,-0.045523,-0.000206,0.000585,0.001585,0.061217,0.001995,1.5e-05,0.003829,0.999615



=== Delta: Model NN vs MC FD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
delta_0,4806,0.000782,0.004059,-0.0585,-0.000192,0.00048,0.001433,0.108188,0.001932,1.7e-05,0.004133,0.999558
delta_1,4806,-0.000553,0.005777,-0.113954,-0.001832,-0.000571,0.00026,0.109329,0.002547,3.4e-05,0.005803,0.99914
delta_2,4806,0.000767,0.003752,-0.045524,-0.000205,0.000584,0.001585,0.061217,0.001995,1.5e-05,0.003829,0.999615



=== Vega: Model NN vs MC AAD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
vega_0,4806,0.001093,0.00536,-0.079406,-0.000852,0.000774,0.002787,0.115532,0.003029,3e-05,0.00547,0.99921
vega_1,4806,0.000312,0.006909,-0.20994,-0.001675,-0.000168,0.001805,0.16233,0.003273,4.8e-05,0.006915,0.998535
vega_2,4806,-0.000766,0.006298,-0.14359,-0.002406,-0.000534,0.001242,0.07115,0.003396,4e-05,0.006344,0.998876



=== Vega: Model NN vs MC FD ===


Unnamed: 0,count,mean_diff,std_diff,min_diff,25%,50%,75%,max_diff,MAE,MSE,RMSE,R2
vega_0,4806,0.001093,0.00536,-0.079406,-0.000852,0.000773,0.002787,0.115532,0.003029,3e-05,0.00547,0.99921
vega_1,4806,0.000312,0.006909,-0.209942,-0.001675,-0.000168,0.001805,0.16233,0.003273,4.8e-05,0.006915,0.998535
vega_2,4806,-0.000766,0.006298,-0.143589,-0.002406,-0.000534,0.001242,0.07115,0.003396,4e-05,0.006344,0.998876


# Conclusion