# VAE Latent Risk Factor - Pipeline Dashboard

Central configuration and execution notebook for the full walk-forward validation pipeline.

**Workflow:**
1. Set up the environment and configure all parameters (Sections 1-2)
2. Load data (Section 3)
3. Run pipeline (Section 4)
4. Inspect results (Sections 5-7) and export them (Section 8)

---
## 1. Setup

In [None]:
import os
import sys
import json
import logging
import tempfile
from dataclasses import replace, asdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

# Locate the project root: when the kernel was started inside notebooks/,
# the root is one directory up; otherwise the current directory is the root.
_NB_DIR = Path.cwd().resolve()
if _NB_DIR.name == "notebooks":
    PROJECT_ROOT = _NB_DIR.parent
else:
    PROJECT_ROOT = _NB_DIR
# Put the root first on the import path and make it the working directory.
sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

from src.config import (
    PipelineConfig,
    DataPipelineConfig,
    VAEArchitectureConfig,
    LossConfig,
    TrainingConfig,
    InferenceConfig,
    RiskModelConfig,
    PortfolioConfig,
    WalkForwardConfig,
)
from src.data_pipeline.data_loader import generate_synthetic_csv, load_stock_data
from src.data_pipeline.returns import compute_log_returns
from src.data_pipeline.features import compute_trailing_volatility
from src.integration.pipeline import FullPipeline
from src.integration.reporting import format_summary_table, serialize_for_json
from src.integration.visualization import (
    plot_fold_metrics,
    plot_e_star_distribution,
    plot_pairwise_heatmap,
    style_summary_table,
    style_fold_table,
)
from src.walk_forward.selection import aggregate_fold_metrics, summary_statistics

%matplotlib inline
# Figure resolution used for every plot in this notebook.
plt.rcParams.update({"figure.dpi": 120})

# INFO-level logging with timestamped records; `logger` is the notebook's own channel.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("dashboard")

# Environment banner: PyTorch version, accelerator availability, working directory.
print("PyTorch {} | Device: {}".format(
    torch.__version__,
    "cuda" if torch.cuda.is_available() else "cpu",
))
print("Working directory: {}".format(os.getcwd()))

---
## 2. Configuration

Two configuration profiles are available. **Run ONLY one section:**
- **Section 2a** — Synthetic data: minimal parameters for quick end-to-end testing
- **Section 2b** — Real data: full production configuration

Always run the **Global** cell (below) first, with `USE_SYNTHETIC = True` for Section 2a or `USE_SYNTHETIC = False` for Section 2b, then run that ONE section.

In [None]:
# ============================================================
# GLOBAL — switches shared by both configuration profiles
# ============================================================
# Profile selector: True activates Section 2a, False activates Section 2b.
USE_SYNTHETIC = True
# Set True for minimal config even with real data
QUICK_MODE = False

# Seed handed to the pipeline config, NumPy and the synthetic data generator.
SEED = 42
# Use the GPU whenever PyTorch reports one.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

### 2a. Synthetic Data (Quick Test)

Run **only this cell** to configure the pipeline for a minimal end-to-end test on synthetic data. Skip Section 2b entirely.

In [None]:
# ============================================================
# SYNTHETIC profile — smallest configuration that runs end to end
# Run ONLY this cell, then jump to Section 3
# ============================================================

if USE_SYNTHETIC:
    # Synthetic runs always use the quick settings.
    QUICK_MODE = True
    DATA_PATH = ""
    N_STOCKS, N_YEARS = 10, 6

    # K=10 for speed; r_max relaxed because C_L floor (384) makes the CNN
    # too large for small synthetic universes — acceptable for testing only.
    config = PipelineConfig(
        data=DataPipelineConfig(n_stocks=N_STOCKS, window_length=504, n_features=2),
        vae=VAEArchitectureConfig(K=10, window_length=504, n_features=2, r_max=1e6),
        loss=LossConfig(mode="P"),
        training=TrainingConfig(
            max_epochs=1, batch_size=256, learning_rate=1e-4, patience=1
        ),
        inference=InferenceConfig(),
        risk_model=RiskModelConfig(),
        portfolio=PortfolioConfig(n_starts=2),
        # Split sizes scale with N_YEARS but never drop below 3y training / 1y holdout.
        walk_forward=WalkForwardConfig(
            total_years=N_YEARS,
            min_training_years=max(3, N_YEARS // 3),
            holdout_years=max(1, N_YEARS // 5),
        ),
        seed=SEED,
    )

    # Single hyper-parameter combination.
    HP_GRID = [dict(mode="P", learning_rate=1e-4, alpha=1.0)]

    # One summary line per argument, emitted in a single call.
    print(
        f"[Synthetic mode] {N_STOCKS} stocks, {N_YEARS} years, K={config.vae.K}",
        f"  max_epochs={config.training.max_epochs}, patience={config.training.patience}, HP_GRID=1 config, n_starts=2",
        f"  r_max={config.vae.r_max:.0e} (relaxed for testing)",
        f"  Walk-forward: {config.walk_forward.total_years}y total, "
        f"{config.walk_forward.min_training_years}y min training, "
        f"{config.walk_forward.holdout_years}y holdout",
        f"  Device: {DEVICE}",
        sep="\n",
    )

### 2b. Real Data (Production)

Set `USE_SYNTHETIC = False` in the Global cell, then run **all cells below** (through "QUICK_MODE override", the cell after "ASSEMBLE FULL CONFIG") for full production configuration. Skip Section 2a.

In [None]:
# ============================================================
# DATA SOURCE — Real data
# ============================================================
# Applied only for real-data runs (same guard as the "ASSEMBLE FULL CONFIG"
# cell). Without it, executing every cell with USE_SYNTHETIC=True would
# overwrite the DATA_PATH / N_STOCKS / N_YEARS chosen in Section 2a, and the
# data-loading cell would generate synthetic data that no longer matches the
# synthetic config.
if not USE_SYNTHETIC:
    DATA_PATH = "data/stock_data.csv"  # <-- Set path to your stock data CSV

    # Synthetic parameters: defined for completeness, not read on the real-data path.
    N_STOCKS = 50
    N_YEARS = 10

In [None]:
# ============================================================
# DATA PIPELINE (MOD-001)
# Universe size, window shape and data-quality thresholds; passed as
# `data=` to PipelineConfig in the "ASSEMBLE FULL CONFIG" cell.
# window_length and n_features are repeated in the VAE architecture cell
# and must be kept equal there.
# ============================================================
data_cfg = DataPipelineConfig(
    n_stocks=1000,               # universe cap (max stocks)
    window_length=504,           # T: sliding window length (trading days)
    n_features=2,                # F: features per timestep (return + realized vol)
    vol_window=252,              # trailing vol lookback (days); used as `window=` for compute_trailing_volatility in Section 3
    vix_lookback_percentile=80.0,  # VIX percentile for crisis threshold
    min_valid_fraction=0.80,     # minimum valid data fraction per stock
)

In [None]:
# ============================================================
# VAE ARCHITECTURE (MOD-002)
# Production model shape; passed as `vae=` to PipelineConfig in the
# "ASSEMBLE FULL CONFIG" cell. The QUICK_MODE override cell later
# replaces K and r_max with the minimal test values.
# ============================================================
vae_cfg = VAEArchitectureConfig(
    K=200,                       # latent capacity ceiling
    sigma_sq_init=1.0,           # initial observation noise
    sigma_sq_min=1e-4,           # lower clamp for sigma^2
    sigma_sq_max=10.0,           # upper clamp for sigma^2
    window_length=504,           # T (must match data_cfg)
    n_features=2,                # F (must match data_cfg)
    r_max=5.0,                   # max parameter/data ratio (capacity guard); the synthetic/quick profiles use 1e6 instead
)

# Echo the values the config reports for encoder depth, final layer width and D.
# NOTE(review): these are not set above, so presumably derived inside
# VAEArchitectureConfig — confirm in src.config.
print(f"Encoder depth L={vae_cfg.encoder_depth}, "
      f"Final width C_L={vae_cfg.final_layer_width}, "
      f"D={vae_cfg.D}")

In [None]:
# ============================================================
# LOSS FUNCTION (MOD-004)
# Passed as `loss=` to PipelineConfig in the "ASSEMBLE FULL CONFIG" cell.
# NOTE(review): HP-grid entries also carry a "mode" key; presumably they
# take precedence over `mode` here during Phase A — confirm in FullPipeline.
# ============================================================
loss_cfg = LossConfig(
    mode="P",                    # 'P' (primary), 'F' (fallback), 'A' (advanced)
    gamma=3.0,                   # crisis overweighting factor
    lambda_co_max=0.5,           # max co-movement loss weight
    beta_fixed=1.0,              # fixed beta for Mode A
    warmup_fraction=0.20,        # fraction of epochs for Mode F warmup
    max_pairs=2048,              # max pairs for co-movement loss
    delta_sync=21,               # max date gap for synchronization (days)
)

In [None]:
# ============================================================
# TRAINING (MOD-005)
# Optimizer, learning-rate schedule, early stopping and batching curriculum;
# passed as `training=` to PipelineConfig in the "ASSEMBLE FULL CONFIG" cell.
# The QUICK_MODE override cell later replaces max_epochs, patience,
# batch_size and learning_rate with minimal test values.
# ============================================================
training_cfg = TrainingConfig(
    max_epochs=100,              # maximum training epochs
    batch_size=256,              # batch size
    learning_rate=1e-4,          # initial learning rate (eta_0)
    weight_decay=1e-5,           # Adam weight decay
    adam_betas=(0.9, 0.999),     # Adam betas
    adam_eps=1e-8,               # Adam epsilon
    patience=10,                 # early stopping patience (epochs)
    lr_patience=5,               # ReduceLROnPlateau patience
    lr_factor=0.5,               # ReduceLROnPlateau factor
    n_strata=15,                 # strata for synchronous batching
    curriculum_phase1_frac=0.30, # fraction of epochs: sync+stratified batching
    curriculum_phase2_frac=0.30, # fraction of epochs: + co-movement loss ramp
)

In [None]:
# ============================================================
# INFERENCE (MOD-006)
# Settings for the inference stage; passed as `inference=` to
# PipelineConfig in the "ASSEMBLE FULL CONFIG" cell.
# ============================================================
inference_cfg = InferenceConfig(
    batch_size=512,              # inference batch size (independent of the training batch size)
    au_threshold=0.01,           # KL threshold for active unit (nats)
    r_min=2,                     # min observations-per-parameter for AU_max
    aggregation_method="mean",   # profile aggregation method
)

In [None]:
# ============================================================
# RISK MODEL (MOD-007)
# Winsorization bounds and numerical safeguards; passed as `risk_model=`
# to PipelineConfig in the "ASSEMBLE FULL CONFIG" cell.
# ============================================================
risk_model_cfg = RiskModelConfig(
    winsorize_lo=5.0,            # lower percentile for vol ratio winsorization
    winsorize_hi=95.0,           # upper percentile
    d_eps_floor=1e-6,            # floor for idiosyncratic variance
    conditioning_threshold=1e6,  # condition number threshold for ridge fallback
    ridge_scale=1e-6,            # ridge regularization scale
)

In [None]:
# ============================================================
# PORTFOLIO OPTIMIZATION (MOD-008)
# Constraints identical for VAE and all 6 benchmarks (INV-012)
# Passed as `portfolio=` to PipelineConfig in the "ASSEMBLE FULL CONFIG"
# cell; the QUICK_MODE override cell later lowers n_starts to 2.
# ============================================================
portfolio_cfg = PortfolioConfig(
    lambda_risk=1.0,             # risk aversion
    w_max=0.05,                  # max weight per stock (hard cap)
    w_min=0.001,                 # min active weight (semi-continuous)
    w_bar=0.03,                  # concentration penalty threshold
    phi=25.0,                    # concentration penalty weight
    kappa_1=0.1,                 # linear turnover penalty
    kappa_2=7.5,                 # quadratic turnover penalty
    delta_bar=0.01,              # turnover penalty threshold
    tau_max=0.30,                # max one-way turnover (hard cap)
    n_starts=5,                  # multi-start initializations
    sca_max_iter=100,            # max SCA iterations
    sca_tol=1e-8,                # SCA convergence tolerance
    armijo_c=1e-4,               # Armijo sufficient decrease
    armijo_rho=0.5,              # Armijo backtracking factor
    armijo_max_iter=20,          # max Armijo backtracking steps
    max_cardinality_elim=100,    # max cardinality elimination rounds
    entropy_eps=1e-30,           # numerical stability for log()
    alpha_grid=[0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0],  # frontier alpha grid
)

In [None]:
# ============================================================
# WALK-FORWARD VALIDATION (MOD-009)
# Fold layout and composite-score weights for the production run; passed as
# `walk_forward=` to PipelineConfig in the "ASSEMBLE FULL CONFIG" cell.
# The synthetic profile (Section 2a) builds its own, much shorter layout.
# ============================================================
walk_forward_cfg = WalkForwardConfig(
    total_years=30,              # total history length
    min_training_years=10,       # minimum training window
    oos_months=6,                # out-of-sample period (months)
    embargo_days=21,             # embargo between train and OOS (trading days)
    holdout_years=3,             # final holdout period
    val_years=2,                 # nested validation for Phase A
    score_lambda_pen=5.0,        # MDD penalty weight in composite score
    score_lambda_est=2.0,        # estimation quality penalty weight
    score_mdd_threshold=0.20,    # MDD threshold in composite score
)

In [None]:
# ============================================================
# HP GRID for Phase A (set to None for default 18-config grid)
# ============================================================
# Applied only for real-data runs (same guard as the "ASSEMBLE FULL CONFIG"
# cell), so that executing every cell with USE_SYNTHETIC=True keeps the
# single-entry grid chosen in Section 2a instead of resetting it to None.
if not USE_SYNTHETIC:
    HP_GRID = None  # None = default: 3 modes x 2 LRs x 3 alphas = 18 configs

    # Uncomment to define a custom grid (keep it inside this `if` block):
    # HP_GRID = [
    #     {"mode": "P", "learning_rate": 5e-4, "alpha": 1.0},
    #     {"mode": "F", "learning_rate": 1e-3, "alpha": 0.5},
    #     {"mode": "A", "learning_rate": 1e-3, "alpha": 2.0},
    # ]

In [None]:
# ============================================================
# ASSEMBLE FULL CONFIG
# Combine the per-module configs defined above into one PipelineConfig.
# Does nothing in synthetic mode, where Section 2a already built `config`.
# ============================================================
if not USE_SYNTHETIC:
    config = PipelineConfig(
        seed=SEED,
        data=data_cfg,
        vae=vae_cfg,
        loss=loss_cfg,
        training=training_cfg,
        inference=inference_cfg,
        risk_model=risk_model_cfg,
        portfolio=portfolio_cfg,
        walk_forward=walk_forward_cfg,
    )

    # Echo the headline settings, one summary line per argument.
    print(
        "PipelineConfig assembled.",
        f"  Walk-forward: {config.walk_forward.total_years}y total, "
        f"{config.walk_forward.min_training_years}y min training, "
        f"{config.walk_forward.holdout_years}y holdout",
        f"  VAE: K={config.vae.K}, T={config.vae.window_length}, F={config.vae.n_features}",
        f"  Training: {config.training.max_epochs} max epochs, "
        f"bs={config.training.batch_size}, lr={config.training.learning_rate}",
        f"  Loss mode: {config.loss.mode}, gamma={config.loss.gamma}",
        f"  Capacity guard r_max: {config.vae.r_max}",
        f"  Device: {DEVICE}",
        sep="\n",
    )

In [None]:
# ============================================================
# QUICK_MODE override for real data
# With QUICK_MODE=True on real data, shrink the assembled config to the
# minimal VAE / training / portfolio values used by the synthetic profile
# and use a one-entry HP grid, for fast end-to-end testing.
# ============================================================
if not USE_SYNTHETIC and QUICK_MODE:
    # dataclasses.replace returns modified copies; the originals are left untouched.
    config = replace(
        config,
        vae=replace(config.vae, r_max=1e6, K=10),
        training=replace(
            config.training,
            learning_rate=1e-4,
            batch_size=256,
            patience=1,
            max_epochs=1,
        ),
        portfolio=replace(config.portfolio, n_starts=2),
    )
    HP_GRID = [dict(mode="P", learning_rate=1e-4, alpha=1.0)]
    print(
        "[QUICK MODE] Minimal config applied to real data",
        f"  K={config.vae.K}, max_epochs={config.training.max_epochs}, "
        f"patience={config.training.patience}, n_starts={config.portfolio.n_starts}",
        f"  HP_GRID=1 config, r_max={config.vae.r_max:.0e}",
        sep="\n",
    )

---
## 3. Data Loading

In [None]:
# Load the stock data for the run: generate a synthetic CSV and read it back
# (USE_SYNTHETIC=True) or read the CSV at DATA_PATH. Defines `stock_data` and
# `start_date`, both consumed by the pipeline cell.
np.random.seed(SEED)

if USE_SYNTHETIC:
    start_year = 2000
    end_year = start_year + N_YEARS
    start_date = f"{start_year}-01-03"
    # NOTE(review): the range runs to the END of end_year, i.e. N_YEARS + 1
    # calendar years of data — confirm the extra year is intended.
    end_date = f"{end_year}-12-31"

    # The CSV is only a hand-off between generator and loader. Keeping it in a
    # TemporaryDirectory guarantees it is removed even when generation or
    # loading raises (the previous delete=False + os.unlink pair leaked it).
    with tempfile.TemporaryDirectory() as tmp_dir:
        csv_path = os.path.join(tmp_dir, "synthetic_stock_data.csv")
        generate_synthetic_csv(
            csv_path,
            n_stocks=N_STOCKS,
            start_date=start_date,
            end_date=end_date,
            seed=SEED,
        )
        stock_data = load_stock_data(csv_path)
    print(f"Synthetic data: {N_STOCKS} stocks, {start_date} to {end_date}")
else:
    stock_data = load_stock_data(DATA_PATH)
    # First date present in the file, as an ISO date string.
    start_date = str(stock_data["date"].min().date())
    print(f"Loaded data from {DATA_PATH}")

print(f"Stock data shape: {stock_data.shape}")
print(f"Date range: {stock_data['date'].min()} to {stock_data['date'].max()}")
print(f"Unique stocks: {stock_data['permno'].nunique()}")
# Trailing expression: the notebook renders the first rows as the cell output.
stock_data.head()

In [None]:
# Derive log-returns and trailing volatility from the loaded stock data;
# the volatility lookback comes from the data config.
returns = compute_log_returns(stock_data)
trailing_vol = compute_trailing_volatility(returns, window=config.data.vol_window)

# Shape and date-range summary, one line per argument.
print(
    "Returns: {} dates x {} stocks".format(returns.shape[0], returns.shape[1]),
    "Trailing vol: {} (first {} rows NaN)".format(
        trailing_vol.shape, config.data.vol_window - 1
    ),
    "Returns date range: {} to {}".format(returns.index[0], returns.index[-1]),
    sep="\n",
)

---
## 4. Run Pipeline

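Executes the full walk-forward validation: Phase A (HP selection) + Phase B (deployment) on each fold, then benchmarks, statistical tests, and report generation. In synthetic mode (`USE_SYNTHETIC = True`) the run is started with `skip_phase_a=True`, so Phase A is skipped.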

In [None]:
# Build the pipeline from the assembled config and run it on the prepared inputs.
pipeline = FullPipeline(config)

results = pipeline.run(
    stock_data=stock_data,
    returns=returns,
    trailing_vol=trailing_vol,
    vix_data=None,
    start_date=start_date,
    hp_grid=HP_GRID,
    skip_phase_a=USE_SYNTHETIC,  # synthetic runs skip Phase A
    device=DEVICE,
)

# Completion banner with fold count and benchmark names.
print(
    "Pipeline complete.",
    "Folds processed: {}".format(len(results["vae_results"])),
    "Benchmarks: {}".format(list(results["benchmark_results"].keys())),
    sep="\n",
)

---
## 5. Results - Summary Report

In [None]:
# Render the report as plain text, then print it.
summary_text = format_summary_table(results["report"])
print(summary_text)

In [None]:
# Deployment verdict from the report, followed by a per-benchmark win tally.
deployment = results["report"]["deployment"]
print(
    "Scenario: {}".format(deployment["scenario"]),
    "Recommendation: {}".format(deployment["recommendation"]),
    "",
    "Per-benchmark wins (VAE vs benchmark on primary metrics):",
    sep="\n",
)
for bench, info in deployment["per_benchmark"].items():
    # Benchmark name padded to 20 characters so the tallies line up.
    print("  {:20s}: {}/{} metrics won".format(bench, info["wins"], info["total"]))

In [None]:
# VAE summary statistics
# Collapse the per-fold VAE results into one table, then summarise it.
# `vae_df` is reused by the per-fold detail cell and the export cell.
vae_df = aggregate_fold_metrics(results["vae_results"])
vae_summary = summary_statistics(vae_df)
print("VAE Summary Statistics:")
# Kept as the cell's trailing expression so the notebook renders the styled table.
style_summary_table(vae_summary)

In [None]:
# Same aggregate-then-summarise treatment for every benchmark. display() is
# needed because only a cell's final expression is rendered automatically.
for bench_name, bench_metrics in results["benchmark_results"].items():
    bench_df = aggregate_fold_metrics(bench_metrics)
    bench_summary = summary_statistics(bench_df)
    print("", "{} Summary:".format(bench_name), sep="\n")
    display(style_summary_table(bench_summary))

---
## 6. Results - Per-Fold Detail

In [None]:
# VAE per-fold metrics
# Uses `vae_df` built in the "VAE summary statistics" cell (Section 5).
print("VAE Per-Fold Metrics:")
# Kept as the cell's trailing expression so the notebook renders the styled table.
style_fold_table(vae_df)

In [None]:
# E* summary from the report, then the distribution plot.
e_star_summary = results["report"]["e_star_summary"]
print(
    "E* epochs: mean={:.1f}, std={:.1f}, range=[{}, {}]".format(
        e_star_summary["mean"],
        e_star_summary["std"],
        e_star_summary["min"],
        e_star_summary["max"],
    )
)

plot_e_star_distribution(results["e_stars"])
plt.show()

In [None]:
# Plot per-fold metrics for the VAE next to every benchmark.
vae_folds = results["vae_results"]
benchmark_folds = results["benchmark_results"]
plot_fold_metrics(vae_folds, benchmark_folds)
plt.show()

---
## 7. Results - Statistical Tests

In [None]:
# Heatmap of the pairwise test results stored in the report.
report = results["report"]
plot_pairwise_heatmap(report)
plt.show()

In [None]:
# Line-by-line listing of every VAE-vs-benchmark comparison in the report.
tests = results["report"]["statistical_tests"]
print(
    "Total comparisons: {} (alpha={})".format(tests["n_tests"], tests["alpha"]),
    "",
    sep="\n",
)

for bench_name, metrics in tests["pairwise"].items():
    print("VAE vs {}:".format(bench_name))
    for metric, result in metrics.items():
        if not result.get("skipped", False):
            # Trailing " *" marks results flagged significant after correction.
            if result.get("significant_corrected", False):
                sig = " *"
            else:
                sig = ""
            # The corrected p-value is shown when present, otherwise the raw one.
            print(
                "  {}: delta={:+.4f} [{:+.4f}, {:+.4f}] p={:.4f}{}".format(
                    metric,
                    result["median_delta"],
                    result["ci_lower"],
                    result["ci_upper"],
                    result.get("p_corrected", result["p_value"]),
                    sig,
                )
            )
        else:
            print("  {}: skipped ({})".format(metric, result["reason"]))
    print()

---
## 8. Export Results

In [None]:
# Write everything produced above to OUTPUT_DIR: per-fold metric CSVs, the
# text and JSON reports, and a snapshot of the config used for the run.
OUTPUT_DIR = "results/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fold metrics CSV
vae_df.to_csv(os.path.join(OUTPUT_DIR, "vae_fold_metrics.csv"), index=False)

for bench_name, bench_metrics in results["benchmark_results"].items():
    # NOTE(review): bench_name goes into the file name unchanged — confirm
    # benchmark names never contain path separators.
    bench_df = aggregate_fold_metrics(bench_metrics)
    bench_df.to_csv(os.path.join(OUTPUT_DIR, f"{bench_name}_fold_metrics.csv"), index=False)

# Text report. UTF-8 is set explicitly so the output does not depend on the
# platform's default encoding, which may be unable to encode non-ASCII
# characters in the report.
with open(os.path.join(OUTPUT_DIR, "report.txt"), "w", encoding="utf-8") as f:
    f.write(format_summary_table(results["report"]))

# JSON report
with open(os.path.join(OUTPUT_DIR, "report.json"), "w", encoding="utf-8") as f:
    json.dump(serialize_for_json(results["report"]), f, indent=2)

# Config snapshot (dataclass -> dict -> JSON-safe structure)
with open(os.path.join(OUTPUT_DIR, "config.json"), "w", encoding="utf-8") as f:
    json.dump(serialize_for_json(asdict(config)), f, indent=2)

print(f"Results saved to {OUTPUT_DIR}")
print("  vae_fold_metrics.csv")
print(f"  <benchmark>_fold_metrics.csv (x{len(results['benchmark_results'])})")
print("  report.txt")
print("  report.json")
print("  config.json")