# 📈 Google Stock ML Prediction Project

End-to-end machine learning pipeline for predicting Google (GOOGL) stock next-day returns.

## Models
- **XGBoost** — Gradient boosting with 3-stage HPO
- **LSTM & GRU** — Recurrent neural networks
- **Hybrid** — Sequential & Parallel architectures

## Sections
1. Configuration & Helpers
2. Data Ingestion & Preprocessing
3. Exploratory Data Analysis
4. Train/Valid/Test Split & NN Features
5. Feature Selection
6. XGBoost HPO
7. XGBoost Final Model
8. LSTM & GRU
9. Hybrid Neural Networks
10. Final Summary

---
# SECTION 1: Configuration & Helpers

**Setup, imports, paths, and helper functions**

**Blocks:** 0-1

## BOOT + BLOCK 0 — CONFIG + HELPERS

In [None]:

import os
import sys
import json
import time
import pickle
import shutil
import subprocess
import warnings
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, Optional, Tuple
from glob import glob

# Silence all warnings for the whole notebook session.
warnings.filterwarnings("ignore")

# --- Colab Detection & Drive Mount (once) ---
# The "google.colab" module is only loaded inside a Colab runtime.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    from google.colab import drive
    # Skip the mount when Drive is already visible (e.g. the cell is re-run).
    if not Path("/content/drive/MyDrive").exists():
        drive.mount("/content/drive", force_remount=False)

# --- Project Paths ---
# Both roots can be overridden through environment variables of the same name.
DRIVE_PROJECT_ROOT = Path(os.environ.get(
    "DRIVE_PROJECT_ROOT", "/content/drive/MyDrive/my_project"
)).expanduser()
LOCAL_PROJECT_ROOT = Path(os.environ.get(
    "LOCAL_PROJECT_ROOT", "/content/my_project"
)).expanduser()

DRIVE_PROJECT_ROOT.mkdir(parents=True, exist_ok=True)
LOCAL_PROJECT_ROOT.mkdir(parents=True, exist_ok=True)

# Sync Drive -> Local if local is empty but Drive has content
local_has_files = any(LOCAL_PROJECT_ROOT.rglob("*"))
drive_has_files = any(DRIVE_PROJECT_ROOT.rglob("*"))

if (not local_has_files) and drive_has_files:
    # The local root holds no entries at all in this branch (rglob found
    # nothing), so there is nothing to clear before copying.
    shutil.copytree(DRIVE_PROJECT_ROOT, LOCAL_PROJECT_ROOT, dirs_exist_ok=True)

# Active project root (local for fast I/O)
PROJECT_ROOT = LOCAL_PROJECT_ROOT

print("[BOOT] DRIVE_PROJECT_ROOT:", DRIVE_PROJECT_ROOT)
print("[BOOT] LOCAL_PROJECT_ROOT:", LOCAL_PROJECT_ROOT)
print("[BOOT] PROJECT_ROOT (active):", PROJECT_ROOT)

# --- Run ID & Directories ---
# Timestamp-based ID: every execution of this cell starts a new run folder.
RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")

LOCAL_RUNS_ROOT = Path(os.environ.get(
    "LOCAL_RUNS_ROOT", str(PROJECT_ROOT / "runs")
)).expanduser()
DRIVE_RUNS_ROOT = Path(os.environ.get(
    "DRIVE_RUNS_ROOT", str(DRIVE_PROJECT_ROOT / "runs")
)).expanduser()

LOCAL_RUN_DIR = LOCAL_RUNS_ROOT / RUN_ID
DRIVE_RUN_DIR = DRIVE_RUNS_ROOT / RUN_ID

def _mk_run_dirs(run_dir: Path) -> Dict[str, Path]:
    """Create standard run directory structure."""
    paths = {
        "run_dir": run_dir,
        "outputs_dir": run_dir / "outputs",
        "models_dir": run_dir / "models",
        "reports_dir": run_dir / "reports",
        "plots_dir": run_dir / "plots",
        "config_dir": run_dir / "config",
        "logs_dir": run_dir / "logs",
        "proc_dir": run_dir / "processed",
        "fs_dir": run_dir / "feature_selection",
        "ms_dir": run_dir / "model_selection",
    }
    for p in paths.values():
        p.mkdir(parents=True, exist_ok=True)
    return paths


# Create the same folder layout for the local run and its Drive mirror.
LOCAL_PATHS = _mk_run_dirs(LOCAL_RUN_DIR)
DRIVE_PATHS = _mk_run_dirs(DRIVE_RUN_DIR)

# Short aliases used at runtime; they all point at the LOCAL copy (fast I/O).
RUN_DIR = LOCAL_PATHS["run_dir"]
OUTPUTS_DIR = LOCAL_PATHS["outputs_dir"]
MODELS_DIR = LOCAL_PATHS["models_dir"]
REPORTS_DIR = LOCAL_PATHS["reports_dir"]
PLOTS_DIR = LOCAL_PATHS["plots_dir"]
CONFIG_DIR = LOCAL_PATHS["config_dir"]
LOGS_DIR = LOCAL_PATHS["logs_dir"]
PROC_DIR = LOCAL_PATHS["proc_dir"]
FS_DIR = LOCAL_PATHS["fs_dir"]
MS_DIR = LOCAL_PATHS["ms_dir"]

# Data folders shared by all runs (they live under the project, not the run).
DATA_DIRS_LOCAL = {
    stage: PROJECT_ROOT / "data" / stage
    for stage in ("raw", "interim", "processed")
}
DATA_DIRS_DRIVE = {
    stage: DRIVE_PROJECT_ROOT / "data" / stage
    for stage in ("raw", "interim", "processed")
}
for _d in (*DATA_DIRS_LOCAL.values(), *DATA_DIRS_DRIVE.values()):
    _d.mkdir(parents=True, exist_ok=True)

print("[CONFIG] RUN_ID:", RUN_ID)
print("[CONFIG] LOCAL_RUN_DIR:", LOCAL_RUN_DIR)
print("[CONFIG] DRIVE_RUN_DIR:", DRIVE_RUN_DIR)


# --- Helper Functions ---
def ensure_dir(p: Path) -> Path:
    """Make sure directory ``p`` exists (parents included) and return it as a ``Path``."""
    directory = Path(p)
    directory.mkdir(parents=True, exist_ok=True)
    return directory


def save_text(text: str, path: Path) -> Path:
    """Write ``text`` to ``path`` as UTF-8, creating the parent folder first.

    Returns the destination as a ``Path``.
    """
    target = Path(path)
    ensure_dir(target.parent)
    target.write_text(text, encoding="utf-8")
    return target


def save_json(obj: Any, path: Path, indent: int = 2) -> Path:
    """Serialize ``obj`` to a UTF-8 JSON file at ``path`` (non-ASCII kept as-is).

    The parent folder is created when missing. Returns the destination path.
    """
    target = Path(path)
    ensure_dir(target.parent)
    with target.open("w", encoding="utf-8") as handle:
        json.dump(obj, handle, ensure_ascii=False, indent=indent)
    return target


def save_pickle(obj: Any, path: Path) -> Path:
    """Pickle ``obj`` to ``path`` with the highest protocol available.

    The parent folder is created when missing. Returns the destination path.
    """
    target = Path(path)
    ensure_dir(target.parent)
    with target.open("wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return target


def load_pickle(path: Path) -> Any:
    """Read and return the object pickled at ``path``."""
    source = Path(path)
    with source.open("rb") as handle:
        restored = pickle.load(handle)
    return restored




def load_with_fallback(filename: str, run_dir_local: Path, fallback_dir_local: Path,
                        run_dir_drive: Optional[Path] = None,
                        fallback_dir_drive: Optional[Path] = None,
                        use_pandas: bool = False) -> Any:
    """Load a pickled file from the first location where it exists.

    Search order: RUN_ID_LOCAL -> data/processed_LOCAL -> RUN_ID_DRIVE ->
    data/processed_DRIVE. The two Drive locations are only tried when given.

    Files are unpickled, so only point this at artifacts you trust.

    Args:
        filename: Name of the file to load
        run_dir_local: Local run directory (runs/RUN_ID/...)
        fallback_dir_local: Local persistent directory (data/processed/)
        run_dir_drive: Drive run directory (optional)
        fallback_dir_drive: Drive persistent directory (optional)
        use_pandas: If True, use pd.read_pickle; otherwise use load_pickle

    Returns:
        Loaded object

    Raises:
        FileNotFoundError: If the file exists in none of the tried locations;
            the message lists every path that was checked.
    """
    # Build list of (path, label) candidates in priority order
    paths_to_try = [
        (Path(run_dir_local) / filename, "RUN_ID (LOCAL)"),
        (Path(fallback_dir_local) / filename, "data/processed (LOCAL)"),
    ]

    if run_dir_drive:
        paths_to_try.append((Path(run_dir_drive) / filename, "RUN_ID (DRIVE)"))
    if fallback_dir_drive:
        paths_to_try.append((Path(fallback_dir_drive) / filename, "data/processed (DRIVE)"))

    # Return from the first candidate that exists
    for path, source_name in paths_to_try:
        if not path.exists():
            continue
        print(f"  [LOAD] {filename} <- {source_name}")
        if use_pandas:
            # Imported here because this helper is defined before the cell
            # that imports pandas; relying on a module-level `pd` would raise
            # NameError when the helper is called ahead of that cell.
            import pandas as pd
            return pd.read_pickle(path)
        return load_pickle(path)

    # None found
    tried = "\n  ".join([f"{src}: {p}" for p, src in paths_to_try])
    raise FileNotFoundError(f"File not found: {filename}\nTried:\n  {tried}")

def copy_file(src: Path, dst: Path) -> Path:
    """Copy ``src`` to ``dst`` (metadata included via ``shutil.copy2``).

    The destination's parent folder is created when missing. Returns ``dst``
    as a ``Path``.
    """
    source = Path(src)
    target = Path(dst)
    ensure_dir(target.parent)
    shutil.copy2(source, target)
    return target


def copy_tree(src_dir: Path, dst_dir: Path, ignore: Optional[Any] = None) -> Path:
    """Replace ``dst_dir`` with a fresh copy of ``src_dir``.

    An existing ``dst_dir`` is deleted first. ``ignore`` is forwarded to
    ``shutil.copytree``. Returns ``dst_dir`` as a ``Path``.
    """
    source = Path(src_dir)
    target = Path(dst_dir)
    ensure_dir(target.parent)
    if target.exists():
        # Remove the old copy so the result mirrors the source exactly.
        shutil.rmtree(target)
    shutil.copytree(source, target, ignore=ignore)
    return target


def run_dirs() -> Tuple[Path, Path]:
    """Give back the current run's ``(plots_dir, reports_dir)`` pair."""
    plots = Path(PLOTS_DIR)
    reports = Path(REPORTS_DIR)
    return plots, reports


# --- RUN PARAMETERS ---
# Single configuration dictionary for the whole run. Each top-level key holds
# the settings of one pipeline stage; the "Used by" comments name the notebook
# blocks that read them.
RUN_PARAMS: Dict[str, Any] = {
    "run_id": RUN_ID,
    "random_state": 42,  # Global random state for reproducibility
    # Used by: Cell 5 (paths setup)
    # All paths are stored as strings so the dictionary can be written as JSON.
    "paths": {
        "project_root_local": str(PROJECT_ROOT),
        "project_root_drive": str(DRIVE_PROJECT_ROOT),
        "run_dir_local": str(LOCAL_RUN_DIR),
        "outputs_dir_local": str(LOCAL_PATHS["outputs_dir"]),
        "models_dir_local": str(LOCAL_PATHS["models_dir"]),
        "reports_dir_local": str(LOCAL_PATHS["reports_dir"]),
        "plots_dir_local": str(LOCAL_PATHS["plots_dir"]),
        "config_dir_local": str(LOCAL_PATHS["config_dir"]),
        "logs_dir_local": str(LOCAL_PATHS["logs_dir"]),
        "proc_dir_local": str(LOCAL_PATHS["proc_dir"]),
        "fs_dir_local": str(LOCAL_PATHS["fs_dir"]),
        "ms_dir_local": str(LOCAL_PATHS["ms_dir"]),
        "run_dir_drive": str(DRIVE_RUN_DIR),
        "outputs_dir_drive": str(DRIVE_PATHS["outputs_dir"]),
        "models_dir_drive": str(DRIVE_PATHS["models_dir"]),
        "reports_dir_drive": str(DRIVE_PATHS["reports_dir"]),
        "plots_dir_drive": str(DRIVE_PATHS["plots_dir"]),
        "config_dir_drive": str(DRIVE_PATHS["config_dir"]),
        "logs_dir_drive": str(DRIVE_PATHS["logs_dir"]),
        "proc_dir_drive": str(DRIVE_PATHS["proc_dir"]),
        "fs_dir_drive": str(DRIVE_PATHS["fs_dir"]),
        "ms_dir_drive": str(DRIVE_PATHS["ms_dir"]),
        # Project-level data directories (not run-specific)
        "data_raw_local": str(DATA_DIRS_LOCAL["raw"]),
        "data_interim_local": str(DATA_DIRS_LOCAL["interim"]),
        "data_processed_local": str(DATA_DIRS_LOCAL["processed"]),
        "data_raw_drive": str(DATA_DIRS_DRIVE["raw"]),
        "data_interim_drive": str(DATA_DIRS_DRIVE["interim"]),
        "data_processed_drive": str(DATA_DIRS_DRIVE["processed"]),
    },
    # Used by: Blocks 3,4,14,15,19,20 (data loading & split)
    "data": {
        "target": "GOOGL_logret_t1",  # same value as "target_col" below
        "target_src_col": "GOOGL_logret_cc",
        "target_col": "GOOGL_logret_t1",
        "start_date": "2004-09-01",
        "end_date": "2025-01-15",  # Set fixed date for reproducibility; use None for today's date
        "limit_start_date": "2015-12-31",
        #  Exact dates (takes priority if provided)
        "train_end": "2020-12-31",      # Last date for training
        "valid_start": "2021-01-01",    # First date for validation
        "valid_end": "2022-12-31",      # Last date for validation
        "test_start": "2023-01-01",     # First date for test
        "test_end": None,               # None = use all remaining data
    },
    # Used by: Blocks 7-13,18,19,20 (feature engineering)
    "features": {
        "feature_set_name": "XGB-30",
        "feature_selection_artifact": "selected_features_xgb.pkl",
        # Rolling windows
        "rolling_w_short": 5,
        "rolling_w_long": 21,
        "do_volume_rolling": True,
        # Cross-asset
        "cross_asset_base": "GOOGL",
        "cross_asset_peers": ["SPY", "QQQ", "^IXIC", "XLK"],
        "cross_asset_windows": [5, 21],
        # Regime
        "regime_base": "GOOGL",
        "market_vol_ticker": "SPY",
        # Exclusions
        "exclude_raw_ohlc": ["^VIX", "^TNX"],
        # Crisis periods
        "covid_start": "2020-02-01",
        "covid_end": "2023-05-05",
        "crisis_2008_start": "2007-07-01",
        "crisis_2008_end": "2009-09-01",
        # Numeric safety
        "eps": 1e-12,
    },
    # (XGBoost settings are found under "xgb_fs" and "hpo" further down.)


    # EU Break Close Flags
    # Used by: Block 2B (EU break close flags)
    "eu_break_close": {
        "enabled": True,
        "eu_ticker": "^GDAXI",
        "gap_days_threshold": 2,
        "apply_to": "next_us_trading_day",  # "same_calendar_date" or "next_us_trading_day"
    },
    # EDA
    # Used by: Blocks 17,18,19 (EDA)
    "eda": {
        "enabled": True,
        "returns_bins": 50,
    },
    # Sample weights
    # Used by: Block 20 (sample weights)
    "weights": {
        "c": 1.0,
        "max_w": 4.0,
    },
    # NN Feature Selection
    # Used by: Block 21 (NN feature groups)
    "nn_feature_select": {
        "n40": 40,
        "n80": 80,
        "per_group_40": 4,
        "per_group_80": 8,
        "corr_thr": 0.95,
        "mi_n_neighbors": 5,
        "mi_random_state": 42,
    },
    # Used by: Block 23 (XGB feature selection)
    "xgb_fs": {
        "spearman_thresh": 0.90,
        "gain_cum_thresh": 0.90,
        "min_features": 15,
        "neg_sigma": 1.0,
        "pos_sigma": 0.5,
        "min_gain": 0.0,
        "perm_repeats": 20,
        "n_estimators": 4000,
        "learning_rate": 0.05,
        "max_depth": 3,
        "min_child_weight": 10,
        "gamma": 0.5,
        "subsample": 0.70,
        "colsample_bytree": 0.70,
        "reg_alpha": 1e-4,
        "reg_lambda": 5.0,
        "max_delta_step": 1,
        "early_stopping_rounds": 80,
        "random_state": 42,
    },
    # Used by: Blocks 24,25,26,28 (HPO & model training)
    "hpo": {
        "n_estimators": 4000,
        "early_stopping_rounds": 80,
        "n_trials_stage1": 160,
        "n_trials_stage2": 80,
        "n_trials_stage2_lowlr": 40,
        "print_every_stage1": 20,
        "print_every_stage2": 20,
        "tie_tol": 1e-5,
        "random_state": 42,
        # Shared lookback for alignment with Neural Networks
        "lookback": 15,
        # XGBoost model settings
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "tree_method": "hist",
        # Date-based
        "valid_es_start": "2021-01-01",
        "valid_es_end": "2021-12-31",
        "valid_score_start": "2022-01-01",
        "valid_score_end": "2022-12-31",
        # Sampling parameters for HPO
        # Used by: Block 24 (HPO sampling)
        # NOTE: despite the indentation, "sampling" is a key of "hpo", and
        # "broad", "refine" and "refine_low_lr" are keys of "sampling".
    "sampling": {
            # Used by: Block 24 (HPO Stage 1)
    "broad": {
                "max_depth": [2, 7],
                "lr_low": [0.003, 0.06],
                "lr_high": [0.06, 0.12],
                "lr_high_prob": 0.15,
                "min_child_weight_log": [0.5, 20.0],
                "subsample": [0.6, 1.0],
                "colsample_bytree": [0.55, 1.0],
                "gamma": [0.0, 3.0],
                "reg_alpha_exp": [-9, -2],
                "reg_lambda_exp": [-2, 1.3],
                "max_delta_step": [0.0, 2.0],
            },
            # Used by: Block 24 (HPO Stage 2)
    "refine": {
                "max_depth_delta": [-1, 2],
                "max_depth_clip": [2, 8],
                "lr_sigma": 0.25,
                "lr_clip": [0.002, 0.15],
                "min_child_weight_sigma": 0.40,
                "min_child_weight_clip": [0.3, 30.0],
                "subsample_sigma": 0.06,
                "subsample_clip": [0.5, 1.0],
                "colsample_sigma": 0.06,
                "colsample_clip": [0.5, 1.0],
                "gamma_sigma": 0.30,
                "gamma_clip": [0.0, 5.0],
                "reg_alpha_sigma": 0.7,
                "reg_alpha_exp_clip": [-10, 0],
                "reg_lambda_sigma": 0.5,
                "reg_lambda_exp_clip": [-3, 2],
                "max_delta_step_sigma": 0.20,
                "max_delta_step_clip": [0.0, 4.0],
            },
            # Used by: Block 24 (HPO Stage 2 Low-LR)
    "refine_low_lr": {
                "lr_shift": -0.8,
                "lr_clip": [0.0015, 0.06],
            },
        },
    },
    # Used by: Blocks 25,27,29 (plotting)
    "plot": {
        "n_plot": 200,
        "figsize": [13, 5],
        "dpi": 150,
    },
    # Used by: Block 25 (SHAP analysis)
    "shap": {
        "enabled": True,
        "max_display": 20,
        "figsize": [10, 8],
        "plot_type_bar": True,
        "plot_type_beeswarm": True,
        "save_values": True,
    },
    # --- LightGBM Feature Selection ---
    "lgb_fs": {
        "spearman_thresh": 0.90,
        "gain_cum_thresh": 0.90,
        "min_features": 15,
        "neg_sigma": 1.0,
        "pos_sigma": 0.5,
        "min_gain": 0.0,
        "perm_repeats": 20,
        "n_estimators": 4000,
        "early_stopping_rounds": 80,
        "learning_rate": 0.05,
        "max_depth": 3,
        "num_leaves": 31,
        "min_child_samples": 20,
        "subsample": 0.70,
        "colsample_bytree": 0.70,
        "reg_alpha": 1e-4,
        "reg_lambda": 5.0,
        "random_state": 42,
    },
    # --- LightGBM HPO ---
    "lgb_hpo": {
        "n_estimators": 4000,
        "early_stopping_rounds": 80,
        "n_trials_stage1": 160,
        "n_trials_stage2": 80,
        "n_trials_stage2_lowlr": 40,
        "valid_es_start": "2021-01-01",
        "valid_es_end": "2021-12-31",
        "valid_score_start": "2022-01-01",
        "valid_score_end": "2022-12-31",
        "lookback": 15,
        "random_state": 42,
    },
    # Used by: Blocks 26,27 (LSTM model)
    "lstm": {
        "lookback": 15,
        "stride": 1,
        "units_1": 32,
        "units_2": 16,
        "dense_units": 16,
        "dropout": 0.20,
        "learning_rate": 5e-4,
        "clipnorm": 1.0,
        "loss": "mse",
        "dense_activation": "relu",
        "output_activation": "linear",
        "epochs": 80,
        "batch_size": 16,
        "patience": 10,
        "random_state": 42,
        "feature_sets": ["neural_40", "neural_80", "xgb_selected"],
    },
    # Used by: Blocks 26,27 (GRU model)
    "gru": {
        "lookback": 15,
        "stride": 1,
        "units_1": 32,
        "units_2": 16,
        "dense_units": 16,
        "dropout": 0.20,
        "learning_rate": 5e-4,
        "clipnorm": 1.0,
        "loss": "mse",
        "dense_activation": "relu",
        "output_activation": "linear",
        "epochs": 80,
        "batch_size": 16,
        "patience": 10,
        "random_state": 42,
        "feature_sets": ["neural_40", "neural_80", "xgb_selected"],
    },
    # Used by: Blocks 28,29 (Hybrid Sequential)
    "hybrid_seq": {
        "lookback": 15,
        "stride": 1,
        "lstm_units": 32,
        "gru_units": 16,
        "dense_units": 16,
        "dropout": 0.20,
        "learning_rate": 4e-4,
        "clipnorm": 1.0,
        "loss": "mse",
        "dense_activation": "relu",
        "output_activation": "linear",
        "epochs": 90,
        "batch_size": 16,
        "patience": 12,
        "random_state": 42,
        "feature_sets": ["neural_40", "neural_80", "xgb_selected"],
    },
    # Used by: Blocks 28,29 (Hybrid Parallel)
    "hybrid_par": {
        "lookback": 15,
        "stride": 1,
        "lstm_units": 24,
        "gru_units": 24,
        "dense_units": 16,
        "dropout": 0.20,
        "learning_rate": 4e-4,
        "clipnorm": 1.0,
        "loss": "mse",
        "dense_activation": "relu",
        "output_activation": "linear",
        "epochs": 90,
        "batch_size": 16,
        "patience": 12,
        "random_state": 42,
        "feature_sets": ["neural_40", "neural_80", "xgb_selected"],
    },
    # --- Ensemble ---
    "ensemble": {
        "method": "weighted_average",  # simple_average, weighted_average, stacking, rank_average
        "models": ["xgb", "lgb", "lstm", "gru", "hybrid_seq", "hybrid_par"],
        "weights": "auto",  # "auto" for inverse_wrmse, or dict
        "weight_method": "inverse_wrmse",
        "meta_model": "ridge",
        "meta_params": {
            "alpha": 1.0,
            "random_state": 42,
        },
    },
}

# Persist the run configuration beside both copies of the run (local first,
# then Drive), once as JSON and once as a plain-text dump of the same JSON.
for _cfg_dir in (LOCAL_PATHS["config_dir"], DRIVE_PATHS["config_dir"]):
    save_json(RUN_PARAMS, _cfg_dir / "run_params.json")
    save_text(
        json.dumps(RUN_PARAMS, indent=2, ensure_ascii=False),
        _cfg_dir / "run_params.txt",
    )


def save_run_outputs(
    metrics: Dict[str, Any],
    predictions_valid: Optional["pd.DataFrame"] = None,
    predictions_test: Optional["pd.DataFrame"] = None,
    extra_artifacts: Optional[Dict[str, Any]] = None,
    model: Optional[Any] = None,
    model_filename: str = "model.json",
) -> None:
    """Write the standard artifacts of a run into ``OUTPUTS_DIR`` / ``MODELS_DIR``.

    Args:
        metrics: Written as ``metrics.json`` and, one ``key: value`` line per
            entry, as ``metrics.txt``.
        predictions_valid: Optional frame saved as ``predictions_valid.csv``.
        predictions_test: Optional frame saved as ``predictions_test.csv``.
        extra_artifacts: Optional ``name -> object`` mapping; each object is
            pickled to ``<name>.pkl``.
        model: Optional model. Saved through its own ``save_model`` method when
            it has one, otherwise pickled.
        model_filename: File name used with ``save_model``; the pickle fallback
            keeps the stem and swaps the suffix for ``.pkl``.
    """
    save_json(metrics, OUTPUTS_DIR / "metrics.json")
    metric_lines = [f"{k}: {v}" for k, v in metrics.items()]
    save_text("\n".join(metric_lines), OUTPUTS_DIR / "metrics.txt")

    prediction_files = (
        (predictions_valid, "predictions_valid.csv"),
        (predictions_test, "predictions_test.csv"),
    )
    for frame, csv_name in prediction_files:
        if frame is not None:
            frame.to_csv(OUTPUTS_DIR / csv_name, index=True)

    for name, obj in (extra_artifacts or {}).items():
        save_pickle(obj, OUTPUTS_DIR / f"{name}.pkl")

    if model is None:
        return
    if hasattr(model, "save_model"):
        model.save_model(str(MODELS_DIR / model_filename))
    else:
        save_pickle(model, MODELS_DIR / (Path(model_filename).stem + ".pkl"))


# --- Code Snapshot ---
EXPORT_CODE = True


def snapshot_code(project_root: Path, local_run_dir: Path, drive_run_dir: Path) -> None:
    """Copy the current notebook into both run directories for reproducibility.

    Steps: ask Colab to save the notebook (best effort), locate an ``.ipynb``
    file, then write it together with ``snapshot_meta.json`` into
    ``<run_dir>/code_snapshot`` for each run directory. When no notebook is
    found, a ``NO_SNAPSHOT.txt`` listing the searched paths is written instead.

    Args:
        project_root: Project folder checked first for the notebook.
        local_run_dir: Local run directory that receives a snapshot.
        drive_run_dir: Drive run directory that receives a snapshot.
    """

    # Step 1: Save the notebook first (Colab-specific)
    try:
        from google.colab import _message
        _message.blocking_request('save_notebook', {'save': True})
        print("[SNAPSHOT] Notebook saved via Colab API")
    except Exception:
        # Deliberately broad: saving is best effort and must never abort the run.
        print("[SNAPSHOT] Could not auto-save notebook (not in Colab or already saved)")

    # Step 2: Find the notebook file (well-known names first)
    search_paths = [
        project_root / "google_stock_ml_unified.ipynb",
        Path("/content/google_stock_ml_unified.ipynb"),
        Path("/content/my_project/google_stock_ml_unified.ipynb"),
    ]

    # Also search for any .ipynb files
    for pattern in ["/content/*.ipynb", "/content/my_project/*.ipynb"]:
        # glob() yields matches in arbitrary order; sorting makes the choice
        # of notebook reproducible when several files match.
        for p in sorted(glob(pattern)):
            if "checkpoint" not in p.lower():
                search_paths.append(Path(p))

    # First existing candidate wins; None when nothing was found.
    notebook_path = next((p for p in search_paths if p.exists()), None)

    # Step 3: Copy to both locations
    def _save_snapshot(dst_run_dir: Path, location: str):
        """Write the snapshot (or a NO_SNAPSHOT marker) under ``dst_run_dir``."""
        export_dir = dst_run_dir / "code_snapshot"
        export_dir.mkdir(parents=True, exist_ok=True)

        if notebook_path and notebook_path.exists():
            dst = export_dir / notebook_path.name
            copy_file(notebook_path, dst)

            # Save metadata
            meta = {
                "run_id": RUN_ID,
                "timestamp": datetime.now().isoformat(),
                "source_path": str(notebook_path),
            }
            save_json(meta, export_dir / "snapshot_meta.json")
            print(f"[SNAPSHOT] {location}: Saved {notebook_path.name}")
        else:
            save_text(f"Notebook not found. Searched: {[str(p) for p in search_paths]}",
                     export_dir / "NO_SNAPSHOT.txt")
            print(f"[SNAPSHOT] {location}: Notebook not found")

    _save_snapshot(local_run_dir, "LOCAL")
    _save_snapshot(drive_run_dir, "DRIVE")

if EXPORT_CODE:
    snapshot_code(PROJECT_ROOT, LOCAL_RUN_DIR, DRIVE_RUN_DIR)
    # Leave a marker in the logs folder of both run copies (local, then Drive).
    for _run_paths in (LOCAL_PATHS, DRIVE_PATHS):
        save_text("Snapshot completed", _run_paths["logs_dir"] / "export_log.txt")

print("[OK] BOOT + BLOCK 0 complete.")

## BLOCK 1 — ENV + IMPORTS (XGB + LSTM/GRU)

In [None]:

# Colab installs (only if missing)
# Outside Colab nothing is installed here; the imports further down then
# require the packages to be present already.
if IN_COLAB:
    try:
        # Probe imports: if any one of these fails, all four are (re)installed.
        import yfinance  # noqa: F401
        import pandas_datareader  # noqa: F401
        import pandas_market_calendars  # noqa: F401
        import scipy  # noqa: F401
    except ImportError:
        # Install into the interpreter that runs this notebook; check=True
        # raises CalledProcessError when pip exits with a non-zero status.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q",
             "yfinance", "pandas_datareader", "pandas-market-calendars", "scipy"],
            check=True
        )
    try:
        # LightGBM is probed and installed separately from the group above.
        import lightgbm  # noqa: F401
    except ImportError:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", "lightgbm"],
            check=True
        )

import lightgbm as lgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch  # Required for EDA volatility plots

import re
import csv

import yfinance as yf
from pandas_datareader import data as pdr
import pandas_market_calendars as mcal

from scipy import stats

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import ParameterSampler
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_regression

try:
    from IPython.display import display
except ImportError:
    display = print

import xgboost as xgb
from xgboost import XGBRegressor

# Seed NumPy from the run-wide random state (42 when the key is absent)
RANDOM_STATE = int(RUN_PARAMS.get("random_state", 42))
np.random.seed(RANDOM_STATE)

# TensorFlow is optional: the flag stays False unless the import and the
# seeding both go through.
TF_AVAILABLE = False
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers

    tf.random.set_seed(RANDOM_STATE)
except ImportError:
    pass
else:
    TF_AVAILABLE = True

# Per-model NN settings live in RUN_PARAMS["lstm"], ["gru"], ["hybrid_seq"], ["hybrid_par"]

# Environment report: one line per library, then the seed in use
for _tag, _val in (
    ("[ENV] python:", sys.version.split()[0]),
    ("[ENV] numpy:", np.__version__),
    ("[ENV] pandas:", pd.__version__),
    ("[ENV] yfinance:", getattr(yf, "__version__", "unknown")),
    ("[ENV] pandas_datareader:", getattr(pdr, "__version__", "unknown")),
    ("[ENV] pandas_market_calendars:", getattr(mcal, "__version__", "unknown")),
    ("[ENV] scipy:", getattr(sys.modules.get("scipy"), "__version__", "unknown")),
    ("[ENV] xgboost:", getattr(xgb, "__version__", "unknown")),
    ("[ENV] tensorflow:", "OK" if TF_AVAILABLE else "NOT AVAILABLE"),
    ("[ENV] RANDOM_STATE:", RANDOM_STATE),
):
    print(_tag, _val)


# Constants pulled from the config once and reused by later blocks
TARGET_T1 = str(RUN_PARAMS["data"]["target_col"])
EPS = float(RUN_PARAMS["features"]["eps"])
print("[ENV] TARGET_T1:", TARGET_T1)
print("[ENV] EPS:", EPS)

print("[OK] BLOCK 1 complete.")

# -------------------------
# Shared Metric Functions (used across all model blocks)
# -------------------------
def _to_np(x):
    """Convert to numpy array."""
    return np.asarray(x, dtype=float)


def w_rmse(y_true, y_pred, w):
    """Weighted RMSE: sqrt(sum(w * err^2) / (sum(w) + EPS)).

    ``EPS`` keeps the division defined when the weights sum to zero.
    """
    residual = _to_np(y_true) - _to_np(y_pred)
    weights = _to_np(w)
    weighted_mse = np.sum(weights * residual ** 2) / (np.sum(weights) + EPS)
    return float(np.sqrt(weighted_mse))


def w_mae(y_true, y_pred, w):
    """Weighted MAE: sum(w * |err|) / (sum(w) + EPS).

    ``EPS`` keeps the division defined when the weights sum to zero.
    """
    abs_err = np.abs(_to_np(y_true) - _to_np(y_pred))
    weights = _to_np(w)
    return float(np.sum(weights * abs_err) / (np.sum(weights) + EPS))


def dir_acc(y_true, y_pred):
    """Share of samples where target and prediction agree on being > 0.

    A value of exactly zero counts as "not positive" on either side.
    """
    actual_up = _to_np(y_true) > 0
    predicted_up = _to_np(y_pred) > 0
    return float(np.mean(actual_up == predicted_up))


print("[ENV] Metric functions loaded: w_rmse, w_mae, dir_acc")


---
# SECTION 2: Data Ingestion & Preprocessing

**Download data, feature engineering, interim processing**

**Blocks:** 2-16

## BLOCK 2 — LOAD PRICES + EARNINGS

In [None]:
# Date range
start = RUN_PARAMS["data"]["start_date"]
# A missing/None end date falls back to today and is written back to the config.
end = RUN_PARAMS["data"].get("end_date") or datetime.now().strftime("%Y-%m-%d")
RUN_PARAMS["data"]["end_date"] = end

# --- 1. Define official calendar (NASDAQ) to avoid relying on Yahoo alone ---
# The variable is called `nyse` but holds the 'NASDAQ' calendar.
nyse = mcal.get_calendar('NASDAQ')
valid_days = nyse.valid_days(start_date=start, end_date=end)
master_index = pd.Index(valid_days.tz_localize(None).normalize(), name="Date")

# Price tickers (includes GDAXI in main list)
price_tickers = [
    "GOOGL", "MSFT", "NVDA",
    "^IXIC", "SPY", "QQQ",
    "^VIX", "^TNX",
    "XLK", "^GDAXI"
]

data_dict = {}

# Download price data
# NOTE(review): yfinance documents `end` as exclusive while the calendar above
# includes `end` — confirm the last row is not just forward-filled prices.
for t in price_tickers:
    df = yf.download(t, start=start, end=end, auto_adjust=True, progress=False)
    if df is None or df.empty:
        # Tickers without data are skipped silently (GOOGL is checked below).
        continue

    # Prefix every column with the ticker; tuple (multi-level) column labels
    # contribute their first element only.
    df.columns = [f"{t}_{c[0] if isinstance(c, tuple) else c}" for c in df.columns]

    if isinstance(df.index, pd.DatetimeIndex) and df.index.tz is not None:
        # Strip the timezone while keeping the exchange-local calendar date,
        # exactly as done for master_index. tz_convert(None) would first shift
        # the timestamps to UTC, which moves them off local midnight (and onto
        # the previous day for exchanges east of UTC), so the join against the
        # normalized calendar would not match.
        df.index = df.index.tz_localize(None).normalize()

    data_dict[t] = df

# --- 2. Build Master Table aligned to official calendar ---
if "GOOGL" not in data_dict:
    raise ValueError("GOOGL data is missing. Cannot build master timeline.")

# Use master_index as base for all data
prices_all = pd.DataFrame(index=master_index)

for t, df in data_dict.items():
    # Left join to calendar ensures we don't miss official trading days
    prices_all = prices_all.join(df, how="left")

# Forward fill prices only (volume columns are left untouched)
price_cols = [c for c in prices_all.columns if any(s in c for s in ['_Open', '_High', '_Low', '_Close'])]
prices_all[price_cols] = prices_all.sort_index()[price_cols].ffill()

# Earnings data (GOOGL)
tkr = yf.Ticker("GOOGL")
edf = tkr.get_earnings_dates(limit=100)

earnings = pd.DataFrame(index=prices_all.index)
earnings["is_earnings_day"] = 0

# Pre-create columns so the frame has the same schema when Yahoo returns nothing
earnings["eps_surprise_pct_yahoo"] = np.nan
earnings["has_eps_surprise_yahoo"] = 0
earnings["eps_surprise_pct_calc"] = np.nan
earnings["has_eps_surprise_calc"] = 0

if edf is not None and len(edf) > 0:
    edf = edf.copy()
    idx = pd.to_datetime(edf.index)
    if getattr(idx, "tz", None) is not None:
        # Drop the timezone but keep the local wall-clock date of the event.
        # tz_convert(None) would shift to UTC first, which can push a late
        # announcement onto the next calendar day before normalize().
        idx = idx.tz_localize(None)
    idx = idx.normalize()

    edf.index = idx
    # One row per calendar date; the last duplicate wins.
    edf = edf[~edf.index.duplicated(keep="last")].sort_index()

    # Case-insensitive lookup of the column names Yahoo returned
    cols_lower = {c.lower(): c for c in edf.columns}

    def pick_col(possible_names):
        """Return the real column name matching the first candidate found, else None."""
        for name in possible_names:
            key = name.lower()
            if key in cols_lower:
                return cols_lower[key]
        return None

    col_exp = pick_col(["EPS Estimate", "eps estimate", "Eps Estimate"])
    col_act = pick_col(["Reported EPS", "reported eps", "EPS Actual", "eps actual"])
    col_pct = pick_col(["Surprise(%)", "surprise(%)", "Surprise (%)", "surprise (%)"])

    eps_daily = pd.DataFrame(index=edf.index)
    eps_daily["eps_expected"] = edf[col_exp] if col_exp else np.nan
    eps_daily["eps_actual"] = edf[col_act] if col_act else np.nan

    if col_exp and col_act:
        # Own surprise figure: 100 * (actual - expected) / |expected|,
        # left as NaN when the expected EPS is zero or missing.
        eps_surprise = eps_daily["eps_actual"] - eps_daily["eps_expected"]
        denom = eps_daily["eps_expected"].abs()
        eps_daily["eps_surprise_pct_calc"] = np.where(denom > 0, 100.0 * (eps_surprise / denom), np.nan)
    else:
        eps_daily["eps_surprise_pct_calc"] = np.nan

    eps_daily["eps_surprise_pct_yahoo"] = edf[col_pct] if col_pct else np.nan
    # Kept alongside the price frames for later inspection
    data_dict["EARNINGS_EPS_DEBUG"] = eps_daily.copy()

    # Align to official trading days in prices_all
    eps_on_trading_days = eps_daily.reindex(prices_all.index)

    earnings["is_earnings_day"] = prices_all.index.isin(eps_daily.index).astype("int8")
    earnings["eps_surprise_pct_yahoo"] = eps_on_trading_days["eps_surprise_pct_yahoo"].values
    earnings["eps_surprise_pct_calc"] = eps_on_trading_days["eps_surprise_pct_calc"].values
    earnings["has_eps_surprise_yahoo"] = earnings["eps_surprise_pct_yahoo"].notna().astype("int8")
    earnings["has_eps_surprise_calc"] = earnings["eps_surprise_pct_calc"].notna().astype("int8")

# Merge prices + earnings
full_df = prices_all.join(earnings, how="left")

# Drop low-information volume columns
DROP_VOLUME_COLS = ["^VIX_Volume", "^TNX_Volume"]
full_df = full_df.drop(columns=[c for c in DROP_VOLUME_COLS if c in full_df.columns])

# --- Summary helpers ---
def feature_info(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize each column of ``df``: dtype, null counts and distinct values.

    Returns one row per column with ``feature``, ``dtype``, ``non_null``,
    ``null``, ``null_pct`` (0-100) and ``unique`` (NaN excluded), ordered by
    ``null_pct`` and then ``feature``.
    """
    missing = df.isna()
    summary = pd.DataFrame({
        "feature": df.columns,
        "dtype": df.dtypes.tolist(),
        "non_null": df.notna().sum().astype(int).tolist(),
        "null": missing.sum().astype(int).tolist(),
        "null_pct": (missing.mean() * 100.0).astype(float).tolist(),
        "unique": df.nunique(dropna=True).astype(int).tolist(),
    })
    ordered = summary.sort_values(["null_pct", "feature"])
    return ordered.reset_index(drop=True)

def frequency_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Count NaN / zero / non-zero values for every column of ``df``.

    For numeric columns, zeros and non-zeros are counted among the non-null
    values. For non-numeric columns ``n_zero`` is 0 and every non-null value
    counts as non-zero.

    Returns:
        One row per column with the fields ``feature``, ``dtype``, ``n_total``,
        ``n_nan``, ``n_nonnull``, ``n_zero``, ``n_nonzero``, ``pct_nan``,
        ``pct_nonzero`` (share of all rows) and ``pct_nonzero_of_nonnull``.
        Percentages are NaN when their denominator is 0. Rows are sorted by
        ``pct_nan`` then ``feature``. A frame without columns yields an empty
        result that still has all of the fields above.
    """
    out_columns = [
        "feature", "dtype", "n_total", "n_nan", "n_nonnull", "n_zero", "n_nonzero",
        "pct_nan", "pct_nonzero", "pct_nonzero_of_nonnull",
    ]
    rows = []
    n = len(df)
    for c in df.columns:
        s = df[c]
        n_nan = int(s.isna().sum())
        n_nonnull = int(s.notna().sum())
        if pd.api.types.is_numeric_dtype(s):
            s_nn = s.dropna()
            n_zero = int((s_nn == 0).sum())
            n_nonzero = int((s_nn != 0).sum())
        else:
            # "zero" is not meaningful for non-numeric data
            n_zero = 0
            n_nonzero = n_nonnull
        rows.append({
            "feature": c, "dtype": str(s.dtype), "n_total": n, "n_nan": n_nan,
            "n_nonnull": n_nonnull, "n_zero": n_zero, "n_nonzero": n_nonzero,
            "pct_nan": (100.0 * n_nan / n) if n else np.nan,
            "pct_nonzero": (100.0 * n_nonzero / n) if n else np.nan,
            "pct_nonzero_of_nonnull": (100.0 * n_nonzero / n_nonnull) if n_nonnull else np.nan,
        })
    # Passing the column list explicitly keeps the sort keys present even when
    # ``rows`` is empty (otherwise sort_values raises KeyError on a column-less frame).
    return (
        pd.DataFrame(rows, columns=out_columns)
        .sort_values(["pct_nan", "feature"])
        .reset_index(drop=True)
    )

# Print both column-level summaries of full_df, then the block-completion marker.
for _banner, _summarise in (
    ("\n=== FEATURE INFO SUMMARY ===", feature_info),
    ("\n=== FEATURE FREQUENCY SUMMARY ===", frequency_summary),
):
    print(_banner)
    print(_summarise(full_df).to_string(index=False))

print("[OK] BLOCK 2 complete. full_df shape:", full_df.shape)


## BLOCK 2B — EU BREAK CLOSE FLAGS

In [None]:
# =========================
# EU "GAP" FLAGS (US Closed, EU Open)
# =========================
def build_eu_info_gap_flags(
    full_df: pd.DataFrame,
    eu_ticker: str = "^GDAXI",
    apply_to: str = "next_us_trading_day"
) -> pd.DataFrame:
    """
    Flag US trading days that directly follow EU-only trading days.

    A "gap day" is a calendar day present in the EU table's index but absent
    from ``full_df.index``. The EU close-to-close log returns of all gap days
    are summed and attributed to the first US day after them.

    The EU table is read from the module-level ``data_dict[eu_ticker]`` and must
    contain the column ``f"{eu_ticker}_Close"``.

    Args:
        full_df: Frame whose DatetimeIndex lists the US trading days.
        eu_ticker: Key of the EU index table in ``data_dict``.
        apply_to: Currently unused; gap information is always mapped to the
            next US trading day. Kept for interface compatibility.

    Returns:
        DataFrame indexed by the normalized ``full_df`` index (same order and
        length as ``full_df``) with three int8 columns:
        - EU_break_close_flag: 1 if US was closed previously while EU was open.
        - EU_break_close_up:   1 if EU cumulative return during the gap was positive.
        - EU_break_close_down: 1 if EU cumulative return during the gap was negative.

    Raises:
        ValueError: If ``eu_ticker`` is not a key of ``data_dict``.
    """
    # 1. Get EU data from data_dict downloaded in block 2
    if eu_ticker not in data_dict:
        raise ValueError(f"{eu_ticker} missing from data_dict. Ensure Block 2 ran correctly.")

    eu_data = data_dict[eu_ticker]
    eu_days = eu_data.index.normalize()
    us_days = full_df.index.normalize()

    # searchsorted needs a sorted calendar; ``out`` below keeps the caller's row order.
    us_calendar = us_days.unique().sort_values()

    # 2. Identify gap days: Europe open, US closed
    # NOTE(review): EU days that precede the first US day are also treated as gap
    # days and land on the first US row — confirm this is intended when the EU
    # history starts earlier than full_df.
    gap_days = eu_days.difference(us_calendar)

    # 3. Compute log returns (allows summing over consecutive holiday days)
    close_col = f"{eu_ticker}_Close"
    eu_close = eu_data[close_col]
    eu_log_ret = np.log(eu_close / eu_close.shift(1))
    # Key the returns by calendar day: gap_days are normalized, so a lookup on the
    # raw index would miss every row whose timestamp carries a time-of-day part.
    eu_log_ret.index = eu_days
    eu_log_ret = eu_log_ret[~eu_log_ret.index.duplicated(keep="last")]
    gap_returns = eu_log_ret.reindex(gap_days)

    # 4./5. Map each gap day to the next US trading day.
    # side="left" finds the first US day >= gap day (never equal, by construction).
    pos = np.searchsorted(us_calendar, gap_days, side="left")
    has_target = pos < len(us_calendar)  # gap days after the last US day have no target
    target_days = us_calendar[pos[has_target]]

    # 6. Aggregate (for long holidays, sum all EU returns to US opening day)
    agg_gap = (
        pd.Series(gap_returns.to_numpy()[has_target], index=target_days)
        .groupby(level=0)
        .sum()
    )

    # 7. Create output table in original format
    out = pd.DataFrame(index=us_days)
    out["EU_break_close_flag"] = np.int8(0)
    out["EU_break_close_up"] = np.int8(0)
    out["EU_break_close_down"] = np.int8(0)

    if not agg_gap.empty:
        out.loc[agg_gap.index, "EU_break_close_flag"] = 1
        out.loc[agg_gap.index, "EU_break_close_up"] = (agg_gap > 0).astype("int8")
        out.loc[agg_gap.index, "EU_break_close_down"] = (agg_gap < 0).astype("int8")

    return out

# =========================
# USAGE
# =========================
# Read the optional "eu_break_close" section; the feature is on unless disabled explicitly.
EU_CFG = RUN_PARAMS.get("eu_break_close", {})
EU_ENABLED = bool(EU_CFG.get("enabled", True))

if not EU_ENABLED:
    print("[SKIP] BLOCK 2B — EU break close disabled in config.")
else:
    print("[INFO] Building EU break close flags...")

    eu_flags = build_eu_info_gap_flags(
        full_df=full_df,
        eu_ticker=EU_CFG.get("eu_ticker", "^GDAXI"),
    )

    # The builder returns one row per full_df row on a normalized index;
    # put the original labels back (positional) so the join below lines up.
    eu_flags.index = full_df.index

    # Re-run safety: discard flag columns left over from a previous execution.
    cols_to_drop = [name for name in eu_flags.columns if name in full_df.columns]
    full_df = full_df.drop(columns=cols_to_drop).join(eu_flags, how="left")

    # Any NaN introduced by the join becomes 0, and the flags go back to int8.
    for col in ("EU_break_close_flag", "EU_break_close_up", "EU_break_close_down"):
        if col in full_df.columns:
            full_df[col] = full_df[col].fillna(0).astype("int8")

    n_events = full_df["EU_break_close_flag"].sum()
    print(f"[INFO] EU break close events: {n_events}")
    for _direction in ("up", "down"):
        print(f"[INFO] EU break close {_direction}: {full_df[f'EU_break_close_{_direction}'].sum()}")
    print(f"[OK] BLOCK 2B complete. full_df shape: {full_df.shape}")


## BLOCK 3 — MACRO FEATURES (FRED)

In [None]:

# Preconditions
assert "full_df" in globals(), "[ERROR] full_df is not defined."
assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
full_df = full_df.sort_index()

# Use RUN_PARAMS dates (set in Block 2)
start = pd.to_datetime(RUN_PARAMS["data"]["start_date"])
end = pd.to_datetime(RUN_PARAMS["data"]["end_date"])

# --- Publication lags (look-ahead guard) ---
# FRED dates a monthly observation on the first day of the month it describes, but
# the number is only published after that month is over. Forward-filling from the
# observation date would therefore expose values to trading days on which they were
# not yet known. Each observation is re-dated by a publication lag before it is
# spread over trading days.
# Defaults are approximations (CPI: roughly mid following month; FEDFUNDS monthly
# average: start of following month) and can be overridden in RUN_PARAMS["features"].
_macro_feat_cfg = RUN_PARAMS.get("features", {})
CPI_PUBLICATION_LAG_DAYS = int(_macro_feat_cfg.get("cpi_publication_lag_days", 45))
FEDFUNDS_PUBLICATION_LAG_DAYS = int(_macro_feat_cfg.get("fedfunds_publication_lag_days", 31))
assert CPI_PUBLICATION_LAG_DAYS >= 0 and FEDFUNDS_PUBLICATION_LAG_DAYS >= 0, \
    "[ERROR] Publication lags must be non-negative."

# Pull extra history so that lagged values (and the diffs that need two prior
# months) already exist on the first trading day.
fred_start = start - pd.Timedelta(days=max(CPI_PUBLICATION_LAG_DAYS, FEDFUNDS_PUBLICATION_LAG_DAYS) + 93)

print("[INFO] full_df range:", full_df.index.min(), "->", full_df.index.max(), "| rows:", len(full_df))
print("[INFO] FRED pull range:", fred_start, "->", end)
print(f"[INFO] Publication lags (days): CPI={CPI_PUBLICATION_LAG_DAYS} | FEDFUNDS={FEDFUNDS_PUBLICATION_LAG_DAYS}")

# --- EPS surprise: flag + fill-0 ---
# Where the has_* flag is 0 there is no surprise value, so NaN is replaced by 0.0;
# rows with flag == 1 are left untouched.
eps_pairs = [
    ("eps_surprise_pct_yahoo", "has_eps_surprise_yahoo"),
    ("eps_surprise_pct_calc", "has_eps_surprise_calc"),
]
for val_col, flag_col in eps_pairs:
    if val_col in full_df.columns and flag_col in full_df.columns:
        mask_fill0 = (full_df[flag_col] == 0)
        full_df.loc[mask_fill0, val_col] = full_df.loc[mask_fill0, val_col].fillna(0.0)

        still_na_when_flag0 = int(full_df.loc[full_df[flag_col] == 0, val_col].isna().sum())
        assert still_na_when_flag0 == 0, f"[ERROR] {val_col} still has NaN where {flag_col}==0"

print("[OK] EPS surprise handled: eps_surprise_pct_* filled with 0 where has_* == 0 (flags preserved).")

# --- Pull monthly series from FRED ---
cpi = pdr.DataReader("CPIAUCSL", "fred", fred_start, end).rename(columns={"CPIAUCSL": "CPI"})
rate = pdr.DataReader("FEDFUNDS", "fred", fred_start, end)

cpi.index = pd.to_datetime(cpi.index)
rate.index = pd.to_datetime(rate.index)

# Compute MONTHLY features (BEFORE daily ffill)

# CPI features
cpi["CPI_pct_mom"] = cpi["CPI"].pct_change(1, fill_method=None)
cpi["CPI_accel_pct_mom"] = cpi["CPI_pct_mom"] - cpi["CPI_pct_mom"].shift(1)
cpi_feats_monthly = cpi[["CPI_pct_mom", "CPI_accel_pct_mom"]].copy()

# FEDFUNDS features
rate["FEDFUNDS_delta_mom"] = rate["FEDFUNDS"].diff(1)
rate["FEDFUNDS_changed"] = (rate["FEDFUNDS_delta_mom"].fillna(0) != 0).astype("int8")
rate["FEDFUNDS_level"] = rate["FEDFUNDS"].copy()
rate_feats_monthly = rate[["FEDFUNDS_delta_mom", "FEDFUNDS_changed", "FEDFUNDS_level"]].copy()

print("\n[INFO] Monthly CPI feats head:\n", cpi_feats_monthly.head(6))
print("\n[INFO] Monthly FEDFUNDS feats head:\n", rate_feats_monthly.head(6))

# Re-date every observation to its assumed publication day (see lags above).
cpi_feats_monthly.index = cpi_feats_monthly.index + pd.Timedelta(days=CPI_PUBLICATION_LAG_DAYS)
rate_feats_monthly.index = rate_feats_monthly.index + pd.Timedelta(days=FEDFUNDS_PUBLICATION_LAG_DAYS)

# Upsample FEATURES to daily and forward-fill (calendar daily)
cpi_feats_daily = cpi_feats_monthly.resample("D").ffill()
rate_feats_daily = rate_feats_monthly.resample("D").ffill()
macro_daily = pd.concat([cpi_feats_daily, rate_feats_daily], axis=1)

# Align macro index to full_df (exact same trading dates)
macro_daily.index = pd.to_datetime(macro_daily.index)
macro_daily.index.name = full_df.index.name
macro_aligned = macro_daily.reindex(full_df.index)

# Missingness flags BEFORE any fill (macro only)
FLAG_SUFFIX = "_is_missing"
macro_numeric_to_fill0 = ["CPI_pct_mom", "CPI_accel_pct_mom", "FEDFUNDS_delta_mom"]

for col in macro_numeric_to_fill0:
    if col in macro_aligned.columns:
        macro_aligned[f"{col}{FLAG_SUFFIX}"] = macro_aligned[col].isna().astype("int8")

# Fill only macro gaps (no global full_df ffill)
macro_cols = macro_aligned.columns.tolist()
macro_aligned[macro_cols] = macro_aligned[macro_cols].ffill()

# After ffill, leading NaNs may remain. Fill with 0 for selected numeric cols.
for col in macro_numeric_to_fill0:
    if col in macro_aligned.columns:
        macro_aligned[col] = macro_aligned[col].fillna(0.0)

# Enforce FEDFUNDS_changed to stay binary int8 after reindex/ffill
FLAG_COL = "FEDFUNDS_changed"
if FLAG_COL in macro_aligned.columns:
    macro_aligned[FLAG_COL] = (
        macro_aligned[FLAG_COL]
        .fillna(0)
        .clip(0, 1)
        .astype("int8")
    )


def _release_day_flags(release_dates, trading_days):
    """Return an int8 array aligned to ``trading_days`` with 1 on the first trading
    day on/after each release date. Dates outside the trading range are ignored.
    ``trading_days`` must be sorted ascending."""
    flags = np.zeros(len(trading_days), dtype="int8")
    if len(trading_days) == 0 or len(release_dates) == 0:
        return flags
    in_range = release_dates[release_dates >= trading_days[0]]
    pos = np.searchsorted(trading_days, in_range, side="left")
    flags[pos[pos < len(trading_days)]] = 1
    return flags


# Release-day flags (on trading-day index).
# Derived from the (lagged) observation dates rather than from value changes, so a
# month whose value equals the previous one is still flagged and the first row is
# not flagged merely because it has no predecessor.
if "CPI_pct_mom" in macro_aligned.columns:
    cpi_release_dates = cpi_feats_monthly.index[cpi_feats_monthly["CPI_pct_mom"].notna().to_numpy()]
    macro_aligned["CPI_release_day"] = _release_day_flags(cpi_release_dates, macro_aligned.index)

if "FEDFUNDS_level" in macro_aligned.columns:
    ff_release_dates = rate_feats_monthly.index[rate_feats_monthly["FEDFUNDS_level"].notna().to_numpy()]
    macro_aligned["FEDFUNDS_release_day"] = _release_day_flags(ff_release_dates, macro_aligned.index)

# Update macro_cols after adding flags
macro_cols = macro_aligned.columns.tolist()

# Remove existing macro columns from full_df to avoid duplicates
existing_macro_cols = [c for c in macro_cols if c in full_df.columns]
if existing_macro_cols:
    print(f"[INFO] Removing {len(existing_macro_cols)} existing macro cols from full_df before merge")
    full_df = full_df.drop(columns=existing_macro_cols)

# Merge into full_df
full_df_merged = pd.concat([full_df, macro_aligned], axis=1)

# Enforce missingness flags + release flags to int8
for col in [c for c in full_df_merged.columns if c.endswith(FLAG_SUFFIX)]:
    full_df_merged[col] = full_df_merged[col].fillna(0).clip(0, 1).astype("int8")

for col in ["CPI_release_day", "FEDFUNDS_release_day"]:
    if col in full_df_merged.columns:
        full_df_merged[col] = full_df_merged[col].fillna(0).clip(0, 1).astype("int8")

if FLAG_COL in full_df_merged.columns:
    full_df_merged[FLAG_COL] = (
        full_df_merged[FLAG_COL]
        .fillna(0)
        .clip(0, 1)
        .astype("int8")
    )

# --- Diagnostics (detailed) ---
print("\n[CHECK] Macro columns (including flags + release):", macro_cols)
print("\n[CHECK] Macro head (aligned to trading dates):\n", full_df_merged[macro_cols].head(15))
print("\n[CHECK] Macro NaNs count:\n", full_df_merged[macro_cols].isna().sum())

if FLAG_COL in full_df_merged.columns:
    print("\n[CHECK] FEDFUNDS_changed dtype:", full_df_merged[FLAG_COL].dtype)
    print("\n[CHECK] FEDFUNDS_changed value counts:\n", full_df_merged[FLAG_COL].value_counts(dropna=False))

# Release-day quick counts
for col in ["CPI_release_day", "FEDFUNDS_release_day"]:
    if col in full_df_merged.columns:
        print(f"\n[CHECK] {col} value counts:\n", full_df_merged[col].value_counts(dropna=False))

# EPS diagnostics (post-fill)
for val_col, flag_col in eps_pairs:
    if val_col in full_df_merged.columns and flag_col in full_df_merged.columns:
        na_total = int(full_df_merged[val_col].isna().sum())
        na_flag0 = int(full_df_merged.loc[full_df_merged[flag_col] == 0, val_col].isna().sum())
        print(f"\n[CHECK] EPS {val_col}: na_total={na_total} | na_when_{flag_col}==0 => {na_flag0}")

print("\n[CHECK] full_df_merged info():")
print(full_df_merged.info())

# Continue downstream
full_df = full_df_merged

print("[OK] BLOCK 3 complete. full_df shape:", full_df.shape)

## BLOCK 4 — TIME FEATURES

In [None]:

cols_before = set(full_df.columns)

# Normalise the index: datetime typed, named "Date", ascending.
full_df.index = pd.to_datetime(full_df.index)
full_df.index.name = "Date"
full_df = full_df.sort_index()
idx = full_df.index

# Discrete calendar fields straight from the index
# (day_of_week: 0=Mon ... 6=Sun, month: 1..12, quarter: 1..4).
for _name, _values in (
    ("day_of_week", idx.weekday),
    ("month", idx.month),
    ("quarter", idx.quarter),
):
    full_df[_name] = _values.astype("int8")

# One binary indicator per quarter.
q = full_df["quarter"].astype("int8")
for _quarter_no in range(1, 5):
    full_df[f"is_q{_quarter_no}"] = (q == _quarter_no).astype("int8")

# Cyclical encodings: weekday on a 5-step circle ...
weekday = idx.weekday.astype(int)
period_week = 5
_weekday_angle = 2 * np.pi * (weekday % period_week) / period_week
full_df["weekday_sin_5"] = np.sin(_weekday_angle)
full_df["weekday_cos_5"] = np.cos(_weekday_angle)

# ... and day-of-year on a circle whose length follows the leap-year rule.
day_of_year = idx.dayofyear.astype(int)  # 1..365/366
year_len = np.where(idx.is_leap_year, 366, 365)
_year_angle = 2 * np.pi * (day_of_year - 1) / year_len
full_df["day_of_year_sin"] = np.sin(_year_angle)
full_df["day_of_year_cos"] = np.cos(_year_angle)

# Columns that did not exist before this block ran.
cols_after = set(full_df.columns)
new_cols = sorted(cols_after - cols_before)

print("New time features added to full_df:")
for c in new_cols:
    print("  -", c)

preview_cols = (
    ["day_of_week", "month", "quarter"]
    + [f"is_q{_quarter_no}" for _quarter_no in range(1, 5)]
    + ["weekday_sin_5", "weekday_cos_5", "day_of_year_sin", "day_of_year_cos"]
)
_preview = full_df[preview_cols]

print("\nPreview:")
print(_preview.head(10))

print("\nNaN check (time features):")
print(_preview.isna().sum())

print("[OK] BLOCK 4 complete. full_df shape:", full_df.shape)

## BLOCK 5 — CONTROL CHECKS (leakage/time integrity)

In [None]:

print("=" * 70)
print("[CONTROL] Starting control checks for leakage/time integrity...")

# Basic index integrity
assert isinstance(full_df, pd.DataFrame), "[ERROR] full_df must be a pandas DataFrame."
assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."

full_df = full_df.sort_index()
full_df.index.name = full_df.index.name or "Date"

assert full_df.index.is_monotonic_increasing, "[ERROR] Index is not monotonic increasing after sort."
assert full_df.index.is_unique, "[ERROR] Index has duplicate timestamps."

print(
    f"[OK] Index: DatetimeIndex | sorted | unique | range={full_df.index.min()} -> {full_df.index.max()} | rows={len(full_df)}"
)

# Column sanity / duplicates
cols = list(full_df.columns)
dup_cols = pd.Index(cols)[pd.Index(cols).duplicated()].tolist()
assert len(dup_cols) == 0, f"[ERROR] Duplicate column names detected: {dup_cols}"
print(f"[OK] Columns: {len(cols)} total | no duplicate column names")

# Leakage guard by naming conventions (hard failure).
# NOTE(review): `\b` does not fire between a digit and "_", so a name like
# "x_t1_lag" is not caught here; the pattern is left as-is to avoid turning
# previously passing runs into assertion errors.
t1_cols = [c for c in cols if re.search(r"(_t1\b|_t\+1\b|t_plus_1\b)", c)]
assert len(t1_cols) == 0, f"[LEAKAGE ERROR] Found t+1 style columns in full_df: {t1_cols[:20]} (showing up to 20)"

# Soft guard (warning only). Tokens are delimited by "not a letter/digit" instead
# of `\b`: the underscore counts as a word character, so `\blead\b` can never match
# inside snake_case names such as "ret_lead_1" or "x_fwd_ret".
_TOKEN_START = r"(?<![a-z0-9])"
_TOKEN_END = r"(?![a-z0-9])"
suspicious_patterns = [_TOKEN_START + r"shift\(\s*-1\s*\)"] + [
    _TOKEN_START + token + _TOKEN_END
    for token in (r"lead", r"forward", r"fwd", r"next_day", r"tomorrow", r"t\+1")
]
_suspicious_res = [re.compile(p) for p in suspicious_patterns]
sus_cols = [c for c in cols if any(rx.search(c.lower()) for rx in _suspicious_res)]
if len(sus_cols) > 0:
    print(f"[WARN] Suspicious potential lead columns by NAME: {sus_cols[:25]} (showing up to 25)")
else:
    print("[OK] No suspicious lead/shift(-1) patterns found in column names")

print("[OK] Leakage guard (name-based) passed: no *_t1 columns")

# Missingness report
earnings_cols = [
    c for c in cols
    if c.startswith("eps_surprise")
    or c.startswith("has_eps_surprise")
    or c == "is_earnings_day"
]
macro_cols = [c for c in cols if c.startswith("CPI_") or c.startswith("FEDFUNDS_")]

focus_cols = [c for c in (earnings_cols + macro_cols) if c in cols]

print("\n[CONTROL] Missingness report (focus: Earnings + Macro)")
if len(focus_cols) == 0:
    print("[INFO] No focus columns found (earnings/macro) — skipping focus missingness table.")
else:
    miss = full_df[focus_cols].isna().sum().sort_values(ascending=False)
    miss_pct = (miss / len(full_df) * 100).round(2)
    miss_tbl = pd.DataFrame({"na_count": miss, "na_pct": miss_pct})
    print(miss_tbl)

na_any = full_df.isna().sum()
top_na = na_any[na_any > 0].sort_values(ascending=False).head(15)
print("\n[CONTROL] Global NaNs (top 15 cols with NA)")
if len(top_na) == 0:
    print("[OK] No NaNs in full_df.")
else:
    top_na_pct = (top_na / len(full_df) * 100).round(2)
    print(pd.DataFrame({"na_count": top_na, "na_pct": top_na_pct}))

# Event-like columns tags (governance lists)
EVENT_COLS_EARNINGS = earnings_cols
EVENT_COLS_MACRO = macro_cols
EVENT_COLS_ALL = sorted(set(EVENT_COLS_EARNINGS + EVENT_COLS_MACRO))

print("\n[CONTROL] Event-like column tagging:")
print(f"  - Earnings cols: {len(EVENT_COLS_EARNINGS)}")
print(f"  - Macro cols   : {len(EVENT_COLS_MACRO)}")
print(f"  - Total event  : {len(EVENT_COLS_ALL)}")

flag_checks = [c for c in ["is_earnings_day", "has_eps_surprise_yahoo", "has_eps_surprise_calc", "FEDFUNDS_changed"] if c in cols]
if len(flag_checks) > 0:
    print("\n[CONTROL] Flag dtype + value counts (quick):")
    for c in flag_checks:
        vc = full_df[c].value_counts(dropna=False)
        print(f"  - {c}: dtype={full_df[c].dtype} | values={vc.to_dict()}")

# Collect governance artifacts (kept in memory only; this block writes nothing to disk)
governance = {
    "event_cols_earnings": EVENT_COLS_EARNINGS,
    "event_cols_macro": EVENT_COLS_MACRO,
    "event_cols_all": EVENT_COLS_ALL,
    "focus_cols_missingness": focus_cols,
    "flag_checks": flag_checks,
}

print("\n[CONTROL] DONE. No features created. No missingness fixed. Leakage name-guards applied.")
print("=" * 70)

print("[OK] BLOCK 5 complete.")

## BLOCK 6 — RAW TRANSFORMATIONS (NO ROLLING)

In [None]:

eps = float(RUN_PARAMS["features"]["eps"])  # small constant added inside logs/divisions

# Preconditions
assert isinstance(full_df, pd.DataFrame), "[ERROR] full_df must be a DataFrame."
assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
full_df = full_df.sort_index()

cols_before = set(full_df.columns)


def has_cols(prefix: str, required: list) -> bool:
    """Return True when ``full_df`` has a ``{prefix}_{field}`` column for every field in ``required``."""
    for field in required:
        if f"{prefix}_{field}" not in full_df.columns:
            return False
    return True


required_ohlc = ["Open", "High", "Low", "Close"]
required_ohlcv = ["Open", "High", "Low", "Close", "Volume"]

# A ticker prefix is whatever precedes the trailing "_Close" of a column name.
prefixes = sorted({c.rsplit("_", 1)[0] for c in full_df.columns if c.endswith("_Close")})

# Prefixes listed in the config are kept out of the OHLC-style transforms.
EXCLUDE_RAW_OHLC = set(RUN_PARAMS["features"]["exclude_raw_ohlc"])
_candidate_prefixes = [p for p in prefixes if p not in EXCLUDE_RAW_OHLC]

ohlc_prefixes = [p for p in _candidate_prefixes if has_cols(p, required_ohlc)]
ohlcv_prefixes = [p for p in _candidate_prefixes if has_cols(p, required_ohlcv)]

print(f"[INFO] Found {len(ohlc_prefixes)} OHLC tickers (excluded {sorted(EXCLUDE_RAW_OHLC)}): {ohlc_prefixes}")
print(f"[INFO] Found {len(ohlcv_prefixes)} OHLCV tickers (with Volume, excluded {sorted(EXCLUDE_RAW_OHLC)}): {ohlcv_prefixes}")

# Raw (non-rolling) transforms, collected in insertion order.
new_cols = {}

for p in ohlc_prefixes:
    o, h, l, c = (full_df[f"{p}_{field}"].astype("float64") for field in required_ohlc)
    prev_c = c.shift(1)

    # close-to-close, open-to-close and previous-close-to-open log returns
    ret_cc = np.log((c + eps) / (prev_c + eps))
    ret_oc = np.log((c + eps) / (o + eps))
    ret_gap = np.log((o + eps) / (prev_c + eps))

    # position of the close inside the day's range; a zero range yields NaN
    hl_span = (h - l).replace(0.0, np.nan)
    close_pos = (c - l) / hl_span

    new_cols.update({
        f"{p}_logret_cc": ret_cc,
        f"{p}_logret_oc": ret_oc,
        f"{p}_logret_gap_co": ret_gap,
        f"{p}_abs_logret_cc": ret_cc.abs(),
        f"{p}_abs_logret_oc": ret_oc.abs(),
        f"{p}_abs_logret_gap_co": ret_gap.abs(),
        f"{p}_log_hl": np.log((h + eps) / (l + eps)),
        f"{p}_close_pos_hl": close_pos,
        f"{p}_close_pos_hl_centered": close_pos - 0.5,
    })
    # lagged copies of the close-to-close return
    new_cols.update({f"{p}_logret_cc_lag{lag}": ret_cc.shift(lag) for lag in (1, 5, 21)})

# Volume-based features, only for tickers that also carry a Volume column.
for p in ohlcv_prefixes:
    v = full_df[f"{p}_Volume"].astype("float64")
    c = full_df[f"{p}_Close"].astype("float64")

    log_vol = np.log(v + 1.0)
    dollar_vol = (c * v).astype("float64")

    new_cols.update({
        f"{p}_log_vol": log_vol,
        f"{p}_log_vol_chg_1d": log_vol - log_vol.shift(1),
        f"{p}_log_dollar_vol": np.log(dollar_vol + 1.0),
    })

# Attach to full_df; columns from an earlier run are dropped first to avoid duplicates.
new_df = pd.DataFrame(new_cols, index=full_df.index)
existing_new_cols = [name for name in new_cols if name in full_df.columns]
if existing_new_cols:
    print(f"[INFO] Removing {len(existing_new_cols)} existing cols from full_df before merge")
    full_df = full_df.drop(columns=existing_new_cols)
full_df = pd.concat([full_df, new_df], axis=1)

# Diagnostics
cols_after = set(full_df.columns)
added = sorted(cols_after - cols_before)

print(f"\n[OK] BLOCK 6 added {len(added)} raw feature columns.")
print("[INFO] Sample of added columns (first 40):")
for c in added[:40]:
    print("  -", c)

nan_counts = full_df[added].isna().sum().sort_values(ascending=False).head(15)
print("\n[CHECK] NaNs in NEW raw features (top 15):")
print(nan_counts)

print("[OK] BLOCK 6 complete. full_df shape:", full_df.shape)

## BLOCK 7 — ROLLING STATISTICS

In [None]:

full_df = full_df.sort_index()

# Windows and switches from RUN_PARAMS
W_SHORT = int(RUN_PARAMS["features"]["rolling_w_short"])
W_LONG = int(RUN_PARAMS["features"]["rolling_w_long"])
DO_VOLUME_ROLLING = bool(RUN_PARAMS["features"]["do_volume_rolling"])
eps = float(RUN_PARAMS["features"]["eps"])

cols_before = set(full_df.columns)

# Tickers are inferred from the plain "*_logret_cc" columns; the "*_abs_logret_cc"
# variants are excluded so they do not produce bogus prefixes.
tickers = sorted({
    c.replace("_logret_cc", "")
    for c in full_df.columns
    if c.endswith("_logret_cc") and not c.endswith("_abs_logret_cc")
})
print(f"[INFO] Rolling stats tickers detected from *_logret_cc (excluding *_abs_*): {tickers}")


def _rolling_features(name, series, with_z):
    """Rolling mean/std of ``series`` over W_SHORT and W_LONG (full windows only),
    keyed ``{name}_mean_<w>`` / ``{name}_std_<w>``; with ``with_z`` also the
    long-window z-score ``{name}_z_<W_LONG>``."""
    feats = {}
    stats = {}
    for w in (W_SHORT, W_LONG):
        window = series.rolling(w, min_periods=w)
        stats[w] = (window.mean(), window.std())
        feats[f"{name}_mean_{w}"] = stats[w][0]
        feats[f"{name}_std_{w}"] = stats[w][1]
    if with_z:
        mean_long, std_long = stats[W_LONG]
        feats[f"{name}_z_{W_LONG}"] = (series - mean_long) / (std_long + eps)
    return feats


new_cols = {}

# Returns (with z-score), absolute returns (no z-score), high/low range (with z-score)
for p in tickers:
    if f"{p}_logret_cc" not in full_df.columns:
        continue

    for suffix, with_z in (("logret_cc", True), ("abs_logret_cc", False), ("log_hl", True)):
        source_col = f"{p}_{suffix}"
        if source_col in full_df.columns:
            new_cols.update(
                _rolling_features(source_col, full_df[source_col].astype("float64"), with_z)
            )

# Optional: the same statistics for log volume
if DO_VOLUME_ROLLING:
    for p in tickers:
        col_lv = f"{p}_log_vol"
        if col_lv in full_df.columns:
            new_cols.update(
                _rolling_features(col_lv, full_df[col_lv].astype("float64"), True)
            )

# Attach; columns from an earlier run are dropped first to avoid duplicates.
roll_df = pd.DataFrame(new_cols, index=full_df.index)
existing_new_cols = [name for name in new_cols if name in full_df.columns]
if existing_new_cols:
    print(f"[INFO] Removing {len(existing_new_cols)} existing cols from full_df before merge")
    full_df = full_df.drop(columns=existing_new_cols)
full_df = pd.concat([full_df, roll_df], axis=1)

cols_after = set(full_df.columns)
added = sorted(cols_after - cols_before)

print(
    f"\n[OK] BLOCK 7 added {len(added)} rolling-stat columns "
    f"(W_SHORT={W_SHORT}, W_LONG={W_LONG}, volume_rolling={DO_VOLUME_ROLLING})."
)
print("[INFO] Sample added cols (first 40):")
for c in added[:40]:
    print("  -", c)

nan_top = full_df[added].isna().sum().sort_values(ascending=False).head(15)
print("\n[CHECK] NaNs in NEW rolling features (top 15):")
print(nan_top)

print("[OK] BLOCK 7 complete. full_df shape:", full_df.shape)

## BLOCK 8 — CROSS-ASSET RELATIONSHIPS

In [None]:

full_df = full_df.sort_index()

_feature_cfg = RUN_PARAMS["features"]
BASE = str(_feature_cfg["cross_asset_base"])
PEERS = list(_feature_cfg["cross_asset_peers"])
WINDOWS = list(_feature_cfg["cross_asset_windows"])
eps = float(_feature_cfg["eps"])

# Preconditions: the base and every peer need a close-to-close return column.
base_col = f"{BASE}_logret_cc"
assert base_col in full_df.columns, f"[ERROR] Missing base return column: {base_col}"

for col in (f"{p}_logret_cc" for p in PEERS):
    assert col in full_df.columns, f"[ERROR] Missing peer return column: {col}"

cols_before = set(full_df.columns)

# Rolling correlation and beta of the base against each peer, per window.
# beta = rolling cov(base, peer) / (rolling var(peer) + eps); full windows only.
new_cols = {}
r_base = full_df[base_col].astype("float64")

for p in PEERS:
    r_peer = full_df[f"{p}_logret_cc"].astype("float64")

    for w in map(int, WINDOWS):
        base_window = r_base.rolling(w, min_periods=w)
        peer_var = r_peer.rolling(w, min_periods=w).var()

        new_cols[f"{BASE}_corr_{p}_{w}"] = base_window.corr(r_peer)
        new_cols[f"{BASE}_beta_{p}_{w}"] = base_window.cov(r_peer) / (peer_var + eps)

# Attach; columns from an earlier run are dropped first to avoid duplicates.
cross_df = pd.DataFrame(new_cols, index=full_df.index)
existing_new_cols = [name for name in new_cols if name in full_df.columns]
if existing_new_cols:
    print(f"[INFO] Removing {len(existing_new_cols)} existing cols from full_df before merge")
    full_df = full_df.drop(columns=existing_new_cols)
full_df = pd.concat([full_df, cross_df], axis=1)

added = sorted(set(full_df.columns) - cols_before)

print(f"\n[OK] BLOCK 8 added {len(added)} cross-asset rolling columns (BASE={BASE}, windows={WINDOWS}).")
print("[INFO] Added columns:")
for c in added:
    print("  -", c)

nan_top = full_df[added].isna().sum().sort_values(ascending=False).head(10)
print("\n[CHECK] NaNs in NEW cross-asset features (top 10):")
print(nan_top)

print("[OK] BLOCK 8 complete. full_df shape:", full_df.shape)

## BLOCK 9 — REGIME & INTERACTION LAYER

In [None]:

full_df = full_df.sort_index()

_feature_cfg = RUN_PARAMS["features"]
BASE = str(_feature_cfg["regime_base"])
W_SHORT = int(_feature_cfg["rolling_w_short"])
W_LONG = int(_feature_cfg["rolling_w_long"])
PEERS = list(_feature_cfg["cross_asset_peers"])
eps = float(_feature_cfg["eps"])

cols_before = set(full_df.columns)
new_cols = {}

# Preconditions: rolling/raw columns of BASE that this layer reads.
req_cols = [
    f"{BASE}_{suffix}"
    for suffix in (
        f"logret_cc_std_{W_SHORT}",
        f"logret_cc_std_{W_LONG}",
        f"logret_cc_z_{W_LONG}",
        "abs_logret_cc",
        f"abs_logret_cc_mean_{W_LONG}",
        f"abs_logret_cc_std_{W_LONG}",
        "close_pos_hl",
        f"log_hl_z_{W_LONG}",
        f"logret_cc_mean_{W_LONG}",
    )
]
missing_req = [c for c in req_cols if c not in full_df.columns]
assert len(missing_req) == 0, f"[ERROR] Missing required columns for regime layer: {missing_req}"


def _base_series(suffix):
    """Return the ``{BASE}_{suffix}`` column of full_df as float64."""
    return full_df[f"{BASE}_{suffix}"].astype("float64")


# Volatility / Regime (BASE): short-vs-long return volatility
std_s = _base_series(f"logret_cc_std_{W_SHORT}")
std_l = _base_series(f"logret_cc_std_{W_LONG}")
vol_ratio = std_s / (std_l + eps)

new_cols[f"{BASE}_vol_ratio_{W_SHORT}_{W_LONG}"] = vol_ratio
new_cols[f"{BASE}_vol_diff_{W_SHORT}_{W_LONG}"] = std_s - std_l

z_ret = _base_series(f"logret_cc_z_{W_LONG}")
new_cols[f"{BASE}_vol_regime_score"] = z_ret * vol_ratio

# Price Structure (BASE): z-score of the absolute return against its long window
abs_ret = _base_series("abs_logret_cc")
abs_ret_mean_l = _base_series(f"abs_logret_cc_mean_{W_LONG}")
abs_ret_std_l = _base_series(f"abs_logret_cc_std_{W_LONG}")
abs_ret_z = (abs_ret - abs_ret_mean_l) / (abs_ret_std_l + eps)

new_cols[f"{BASE}_price_struct_1"] = _base_series("close_pos_hl") * abs_ret_z
new_cols[f"{BASE}_price_struct_2"] = _base_series(f"log_hl_z_{W_LONG}") * vol_ratio

# Market Context (BASE vs peers): only for peers whose long-window beta/corr exist
for p in PEERS:
    beta_col = f"{BASE}_beta_{p}_{W_LONG}"
    corr_col = f"{BASE}_corr_{p}_{W_LONG}"

    if beta_col in full_df.columns:
        new_cols[f"{BASE}_ctx_beta_vol_{p}"] = full_df[beta_col].astype("float64") * std_l

    if corr_col in full_df.columns:
        new_cols[f"{BASE}_ctx_corr_volratio_{p}"] = full_df[corr_col].astype("float64") * vol_ratio

# Macro context (event-aware)
trend_l = _base_series(f"logret_cc_mean_{W_LONG}")

# VIX level × long-window volatility (daily series)
if "^VIX_Close" in full_df.columns:
    new_cols[f"{BASE}_vix_vol_interact"] = full_df["^VIX_Close"].astype("float64") * std_l

# Macro value × release-day flag × long-window trend: the product is zero
# wherever the release-day flag is zero.
for _tag, _value_col, _release_col in (
    ("fedfunds", "FEDFUNDS_delta_mom", "FEDFUNDS_release_day"),
    ("cpi", "CPI_pct_mom", "CPI_release_day"),
):
    if _value_col in full_df.columns and _release_col in full_df.columns:
        new_cols[f"{BASE}_{_tag}_trend_interact"] = (
            full_df[_value_col].astype("float64") *
            full_df[_release_col].astype("float64") *
            trend_l
        )

# Attach; columns from an earlier run are dropped first to avoid duplicates.
inter_df = pd.DataFrame(new_cols, index=full_df.index)
existing_new_cols = [name for name in new_cols if name in full_df.columns]
if existing_new_cols:
    print(f"[INFO] Removing {len(existing_new_cols)} existing cols from full_df before merge")
    full_df = full_df.drop(columns=existing_new_cols)
full_df = pd.concat([full_df, inter_df], axis=1)

added = sorted(set(full_df.columns) - cols_before)

print(f"\n[OK] BLOCK 9 added {len(added)} regime/interaction columns (macro now event-aware).")
print("[INFO] Added columns:")
for c in added:
    print("  -", c)

nan_top = full_df[added].isna().sum().sort_values(ascending=False).head(10)
print("\n[CHECK] NaNs in NEW Block-9 features (top 10):")
print(nan_top)

print("[OK] BLOCK 9 complete. full_df shape:", full_df.shape)

## BLOCK 10 — VIX/TNX SPECIAL FEATURES

In [None]:

# Work on an index-sorted frame; eps is the small constant used by the ratios/logs below.
full_df = full_df.sort_index()
eps = float(RUN_PARAMS["features"]["eps"])

# Snapshot of the columns present before this block, plus the accumulator for new features.
cols_before = set(full_df.columns)
new_cols = {}


# Preconditions
def _need_cols(cols):
    """Assert that every name in ``cols`` is a column of the global ``full_df``.

    Raises AssertionError listing the missing names (in the order given).
    """
    available = set(full_df.columns)
    missing = [name for name in cols if name not in available]
    assert not missing, f"[ERROR] Missing required columns: {missing}"


# Both tickers need the full OHLC set for the features built in this block.
_need_cols([
    f"{symbol}_{field}"
    for symbol in ("^VIX", "^TNX")
    for field in ("Open", "High", "Low", "Close")
])


def _log_ratio(num, den):
    """Safe log ratio."""
    return np.log((num + eps) / (den + eps))


# VIX features (built from the daily OHLC columns)
vix_o = full_df["^VIX_Open"].astype("float64")
vix_h = full_df["^VIX_High"].astype("float64")
vix_l = full_df["^VIX_Low"].astype("float64")
vix_c = full_df["^VIX_Close"].astype("float64")

new_cols["VIX_log_level"] = np.log(vix_c + eps)
new_cols["VIX_delta_1d"] = vix_c.diff(1)
new_cols["VIX_abs_delta_1d"] = new_cols["VIX_delta_1d"].abs()
new_cols["VIX_log_hl"] = _log_ratio(vix_h, vix_l)
# High-low range expressed as a fraction of the close.
new_cols["VIX_range_frac"] = (vix_h - vix_l) / (vix_c.abs() + eps)
# Log(close / open) for the same day.
new_cols["VIX_gap_oc"] = _log_ratio(vix_c, vix_o)

# Lagged copies of the 1-day change and its magnitude (shift by 1, 5 and 21 rows).
for lag in [1, 5, 21]:
    new_cols[f"VIX_delta_1d_lag{lag}"] = new_cols["VIX_delta_1d"].shift(lag)
    new_cols[f"VIX_abs_delta_1d_lag{lag}"] = new_cols["VIX_abs_delta_1d"].shift(lag)

# TNX features (level is kept raw; range is kept absolute rather than relative)
tnx_o = full_df["^TNX_Open"].astype("float64")
tnx_h = full_df["^TNX_High"].astype("float64")
tnx_l = full_df["^TNX_Low"].astype("float64")
tnx_c = full_df["^TNX_Close"].astype("float64")

new_cols["TNX_level"] = tnx_c
new_cols["TNX_delta_1d"] = tnx_c.diff(1)
new_cols["TNX_abs_delta_1d"] = new_cols["TNX_delta_1d"].abs()
new_cols["TNX_delta_5d"] = tnx_c.diff(5)
new_cols["TNX_log_hl"] = _log_ratio(tnx_h, tnx_l)
new_cols["TNX_range"] = (tnx_h - tnx_l)
new_cols["TNX_gap_oc"] = _log_ratio(tnx_c, tnx_o)

for lag in [1, 5, 21]:
    new_cols[f"TNX_delta_1d_lag{lag}"] = new_cols["TNX_delta_1d"].shift(lag)
    new_cols[f"TNX_abs_delta_1d_lag{lag}"] = new_cols["TNX_abs_delta_1d"].shift(lag)

# Attach + diagnostics (remove existing to avoid duplicates on re-run)
vix_tnx_df = pd.DataFrame(new_cols, index=full_df.index)
existing_new_cols = [c for c in new_cols if c in full_df.columns]
if existing_new_cols:
    print(f"[INFO] Removing {len(existing_new_cols)} existing cols from full_df before merge")
    full_df = full_df.drop(columns=existing_new_cols)
full_df = pd.concat([full_df, vix_tnx_df], axis=1)

# Report every column produced by this block. Diffing against cols_before would
# yield an empty list on a re-run (the columns already existed when the cell
# started), which silently disabled the NaN diagnostics below.
added = sorted(new_cols)

print(f"\n[OK] BLOCK 10 added {len(added)} special VIX/TNX columns.")
print("[INFO] Added columns:")
for c in added:
    print("  -", c)

nan_top = full_df[added].isna().sum().sort_values(ascending=False).head(12)
print("\n[CHECK] NaNs in NEW VIX/TNX features (top 12):")
print(nan_top)

print("[OK] BLOCK 10 complete. full_df shape:", full_df.shape)

## BLOCK 11 — EVENTS & SPARSE SIGNALS

In [None]:

full_df = full_df.sort_index()
eps = float(RUN_PARAMS["features"]["eps"])

# Get params from RUN_PARAMS
_feature_cfg = RUN_PARAMS["features"]
BASE = str(_feature_cfg["regime_base"])
W_LONG = int(_feature_cfg["rolling_w_long"])
MARKET_VOL_TICKER = str(_feature_cfg["market_vol_ticker"])

# Rolling-std columns used for the flag × volatility interactions further down.
BASE_VOL_COL = f"{BASE}_logret_cc_std_{W_LONG}"
MARKET_VOL_COL = f"{MARKET_VOL_TICKER}_logret_cc_std_{W_LONG}"

cols_before = set(full_df.columns)
new_cols = {}

# Preconditions: event flags and values this block reads from full_df.
req = [
    "is_earnings_day",
    "eps_surprise_pct_yahoo", "has_eps_surprise_yahoo",
    "eps_surprise_pct_calc", "has_eps_surprise_calc",
    "CPI_pct_mom", "CPI_release_day",
    "FEDFUNDS_delta_mom", "FEDFUNDS_release_day",
]
missing = [name for name in req if name not in full_df.columns]
assert not missing, f"[ERROR] Missing required columns: {missing}"
for _vol_col in (BASE_VOL_COL, MARKET_VOL_COL):
    assert _vol_col in full_df.columns, f"[ERROR] missing {_vol_col}"


# Helper: event-based previous release (lag1/lag2) mapped to daily index
def prev_event_value_to_daily(
    df: pd.DataFrame,
    value_col: str,
    release_flag_col: str,
    lag_k: int = 1,
    fill_before_first: float = 0.0,
) -> pd.Series:
    """Map the value of the ``lag_k``-th previous release event onto every row of ``df``.

    Rows where ``release_flag_col == 1`` are treated as events. The event values
    are shifted by ``lag_k`` events, written back at the event rows, and then
    forward-filled, so on a release row the result is the value from ``lag_k``
    events earlier (not the current release).

    Args:
        df: Frame holding both columns; its index becomes the result index.
        value_col: Column with the per-event value.
        release_flag_col: Column equal to 1 on event rows.
        lag_k: Number of events to look back.
        fill_before_first: Value used while fewer than ``lag_k`` earlier events exist.

    Returns:
        float64 Series aligned to ``df.index`` with no NaNs left after the fill.
    """
    is_event = df[release_flag_col] == 1
    lagged_events = df.loc[is_event, value_col].astype("float64").shift(lag_k)

    # Start from an all-NaN daily series and drop the lagged values on the event rows.
    daily = pd.Series(float("nan"), index=df.index, dtype="float64")
    daily.loc[lagged_events.index] = lagged_events.to_numpy()

    carried = daily.ffill()
    return carried.fillna(fill_before_first)


# Earnings: EPS surprise lags by EARNINGS EVENTS (lag1/lag2 = last/prev earnings)
def eps_event_lags_to_daily(df: pd.DataFrame, val_col: str, flag_col: str, lag_k: int) -> pd.Series:
    """Return the EPS value from ``lag_k`` events back, carried forward onto every row.

    Only rows with ``flag_col == 1`` count as events; the shift is done in event
    space, the result is forward-filled, and rows before enough events exist get 0.0.
    """
    has_event = df[flag_col] == 1
    lagged = df.loc[has_event, val_col].astype("float64").shift(lag_k)

    # All-NaN daily series; only event rows receive a (lagged) value before the fill.
    daily = pd.Series(float("nan"), index=df.index, dtype="float64")
    daily.loc[lagged.index] = lagged.to_numpy()

    return daily.ffill().fillna(0.0)


# EPS surprise from the last (lag1) and second-to-last (lag2) earnings event, per source.
for src in ["yahoo", "calc"]:
    val_col = f"eps_surprise_pct_{src}"
    flag_col = f"has_eps_surprise_{src}"
    new_cols[f"eps_surprise_{src}_lag1"] = eps_event_lags_to_daily(full_df, val_col, flag_col, lag_k=1)
    new_cols[f"eps_surprise_{src}_lag2"] = eps_event_lags_to_daily(full_df, val_col, flag_col, lag_k=2)

# post-earnings day 1..5 dummies: the earnings flag shifted forward by k rows
earn = full_df["is_earnings_day"].astype("int8")
for k in range(1, 6):
    new_cols[f"post_earnings_day_{k}"] = earn.shift(k).fillna(0).astype("int8")

# EPS flag × vol
vol_base = full_df[BASE_VOL_COL].astype("float64")
new_cols["eps_flag_yahoo_x_vol"] = full_df["has_eps_surprise_yahoo"].astype("float64") * vol_base
new_cols["eps_flag_calc_x_vol"] = full_df["has_eps_surprise_calc"].astype("float64") * vol_base

# Macro: CPI (impulse + previous release)
cpi_release = full_df["CPI_release_day"].astype("int8")
cpi_val = full_df["CPI_pct_mom"].astype("float64")

# impulse only on release day
new_cols["CPI_impulse"] = cpi_val * cpi_release.astype("float64")

# previous release value carried forward (event-lag1)
new_cols["CPI_prev_release"] = prev_event_value_to_daily(
    full_df, value_col="CPI_pct_mom", release_flag_col="CPI_release_day", lag_k=1, fill_before_first=0.0
)

# change vs previous release, only meaningful on release days
new_cols["CPI_change_prev_release"] = (cpi_val - new_cols["CPI_prev_release"]) * cpi_release.astype("float64")

# Macro: FEDFUNDS (impulse + previous decision delta)
ff_release = full_df["FEDFUNDS_release_day"].astype("int8")
ff_delta = full_df["FEDFUNDS_delta_mom"].astype("float64")

new_cols["FEDFUNDS_impulse"] = ff_delta * ff_release.astype("float64")

new_cols["FEDFUNDS_prev_delta"] = prev_event_value_to_daily(
    full_df, value_col="FEDFUNDS_delta_mom", release_flag_col="FEDFUNDS_release_day", lag_k=1, fill_before_first=0.0
)

# release-day × market vol
mkt_vol = full_df[MARKET_VOL_COL].astype("float64")
new_cols["FEDFUNDS_release_day_x_mkt_vol"] = ff_release.astype("float64") * mkt_vol

# Attach + diagnostics (remove existing to avoid duplicates on re-run)
blk11_df = pd.DataFrame(new_cols, index=full_df.index)
existing_new_cols = [c for c in new_cols if c in full_df.columns]
if existing_new_cols:
    print(f"[INFO] Removing {len(existing_new_cols)} existing cols from full_df before merge")
    full_df = full_df.drop(columns=existing_new_cols)
full_df = pd.concat([full_df, blk11_df], axis=1)

# Report every column produced by this block. Diffing against cols_before would
# yield an empty list on a re-run (the columns already existed when the cell
# started), which silently disabled the NaN diagnostics below.
added = sorted(new_cols)

print(f"\n[OK] BLOCK 11 added {len(added)} event/sparse-signal columns.")
print("[INFO] Added columns:")
for c in added:
    print("  -", c)

nan_top = full_df[added].isna().sum().sort_values(ascending=False).head(12)
print("\n[CHECK] NaNs in NEW Block-11 features (top 12):")
print(nan_top)

# Sanity: impulse should be non-zero only on release days
# (nonzero_days can be lower than the flag count when the released value itself is 0).
for col_imp, flag in [("CPI_impulse", "CPI_release_day"), ("FEDFUNDS_impulse", "FEDFUNDS_release_day")]:
    if col_imp in full_df.columns and flag in full_df.columns:
        nz_days = int((full_df[col_imp].abs() > 0).sum())
        flag_days = int(full_df[flag].sum())
        print(f"\n[CHECK] {col_imp}: nonzero_days={nz_days} | {flag} count={flag_days}")

print("[OK] BLOCK 11 complete. full_df shape:", full_df.shape)

## BLOCK 12 — CRISIS PERIOD FLAGS

In [None]:

assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
full_df = full_df.sort_index()

cols_before = set(full_df.columns)

# Date windows from RUN_PARAMS
_window_cfg = RUN_PARAMS["features"]
covid_start = pd.Timestamp(_window_cfg["covid_start"])
covid_end = pd.Timestamp(_window_cfg["covid_end"])

crisis_2008_start = pd.Timestamp(_window_cfg["crisis_2008_start"])
crisis_2008_end = pd.Timestamp(_window_cfg["crisis_2008_end"])

# Build flags (overwrite-safe): one (inside, before, after) int8 triad per window,
# with both window bounds counted as "inside".
_dates = full_df.index
for _inside, _before, _after, _lo, _hi in (
    ("covid_period", "pre_covid", "post_covid", covid_start, covid_end),
    ("crisis_2008", "pre_crisis_2008", "post_crisis_2008", crisis_2008_start, crisis_2008_end),
):
    full_df[_inside] = ((_dates >= _lo) & (_dates <= _hi)).astype("int8")
    full_df[_before] = (_dates < _lo).astype("int8")
    full_df[_after] = (_dates > _hi).astype("int8")

# Sanity checks: mutually exclusive within each regime triad (pre / in / post),
# i.e. every row must have exactly one flag set per triad.
_covid_total = full_df["pre_covid"] + full_df["covid_period"] + full_df["post_covid"]
_crisis_total = full_df["pre_crisis_2008"] + full_df["crisis_2008"] + full_df["post_crisis_2008"]
bad_covid = int((_covid_total != 1).sum())
bad_2008 = int((_crisis_total != 1).sum())
assert bad_covid == 0, "[ERROR] COVID flags are not mutually exclusive."
assert bad_2008 == 0, "[ERROR] Crisis flags are not mutually exclusive."

cols_after = set(full_df.columns)
added = sorted(cols_after - cols_before)

# Row counts per flag, for a quick eyeball check of the windows.
counts = {
    flag_name: int(full_df[flag_name].sum())
    for flag_name in (
        "pre_covid", "covid_period", "post_covid",
        "pre_crisis_2008", "crisis_2008", "post_crisis_2008",
    )
}

print(f"[OK] BLOCK 12 crisis flags added/updated. Newly added cols: {len(added)}")
print("[INFO] Counts:", counts)

print("[OK] BLOCK 12 complete. full_df shape:", full_df.shape)

## BLOCK 13 — DEFINE TARGET (NEXT-DAY LOG RETURN)

In [None]:

assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
full_df = full_df.sort_index()

SRC_COL = str(RUN_PARAMS["data"]["target_src_col"])

assert SRC_COL in full_df.columns, f"[ERROR] Missing {SRC_COL}."

# Deterministic overwrite on rerun
# NOTE(review): the target is recomputed from the current full_df and the last row is
# dropped again further down, so each re-run of this cell shortens full_df by one more row.
if TARGET_T1 in full_df.columns:
    print(f"[WARN] {TARGET_T1} already exists — overwriting it deterministically (shift(-1) of {SRC_COL}).")

# Target for row t is the source column's value on the next row (t+1).
full_df[TARGET_T1] = full_df[SRC_COL].shift(-1).astype("float64")

# Sanity: expect exactly 1 NaN at the end
n_nan = int(full_df[TARGET_T1].isna().sum())
print("[INFO] Target NaNs:", n_nan, "out of", len(full_df))
assert n_nan == 1, f"[ERROR] Expected exactly 1 NaN in {TARGET_T1} (last row). Found {n_nan}."

print("[OK] Target defined:", TARGET_T1)
print("[INFO] Target tail preview (before drop):")
print(full_df[[TARGET_T1]].tail(3))

# Drop last row (NaN target)
before = len(full_df)
full_df = full_df.dropna(subset=[TARGET_T1]).copy()
after = len(full_df)

print(f"[OK] Dropped last row with NaN target: {before} -> {after}")
print("[INFO] New index range:", full_df.index.min(), "->", full_df.index.max())
print("[INFO] Target tail preview (after drop):")
print(full_df[[TARGET_T1]].tail(3))

# Persist target metadata into RUN_PARAMS
_data_cfg = RUN_PARAMS.setdefault("data", {})
_data_cfg["target_src_col"] = SRC_COL
_data_cfg["target_col"] = TARGET_T1

# Save target metadata (same JSON written to the local outputs dir and the Drive mirror)
meta = {
    "target_src_col": SRC_COL,
    "target_col": TARGET_T1,
    "rows_before_drop": int(before),
    "rows_after_drop": int(after),
    "min_date": str(full_df.index.min()),
    "max_date": str(full_df.index.max()),
}
for _meta_dir in (OUTPUTS_DIR, DRIVE_PATHS["outputs_dir"]):
    save_json(meta, _meta_dir / "target_meta.json")

print("[OK] BLOCK 13 complete. full_df shape:", full_df.shape)

## BLOCK 14 — DROP DUPLICATE FEATURE COLUMNS

In [None]:

assert isinstance(full_df, pd.DataFrame), "[ERROR] full_df must be a DataFrame."

# keep="last": for a repeated column name, every occurrence except the final one is flagged.
dup_mask = full_df.columns.duplicated(keep="last")
dup_cols = full_df.columns[dup_mask].tolist()

if dup_cols:
    before_shape = full_df.shape
    full_df = full_df.loc[:, ~dup_mask].copy()
    after_shape = full_df.shape

    print(f"[WARN] Dropped {len(dup_cols)} duplicate columns (keep_last).")
    print("[INFO] Shape:", before_shape, "->", after_shape)
    print("[INFO] Example dropped duplicates (first 30):", dup_cols[:30])
else:
    print("[OK] No duplicate column names found. Nothing to drop.")

print("[OK] BLOCK 14 complete. full_df shape:", full_df.shape)

## BLOCK 15 — LIMIT PERIOD + SAVE SNAPSHOT

In [None]:

assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
full_df = full_df.sort_index()

# Limit start date from RUN_PARAMS: keep rows from limit_start onward (label slice on the sorted index).
limit_start = str(RUN_PARAMS["data"]["limit_start_date"])
full_df = full_df.loc[limit_start:].copy()

# Save to interim: pickle locally first, then mirror the file to Drive.
INTERIM_DIR_LOCAL = DATA_DIRS_LOCAL["interim"]
INTERIM_DIR_DRIVE = DATA_DIRS_DRIVE["interim"]

_snapshot_name = "full_df.pkl"
out_local = INTERIM_DIR_LOCAL / _snapshot_name
out_drive = INTERIM_DIR_DRIVE / _snapshot_name

full_df.to_pickle(out_local)
copy_file(out_local, out_drive)

print("[OK] Saved full_df snapshot to:")
for _where, _path in (("local", out_local), ("drive", out_drive)):
    print(f"  - {_where}:", _path)
print("[INFO] full_df rows:", len(full_df), "| range:", full_df.index.min(), "->", full_df.index.max())

print("[OK] BLOCK 15 complete.")

## BLOCK 16 — FULL COMPLETENESS SUMMARY

In [None]:


def print_full_info_with_missing(df: pd.DataFrame, title: str = "DATAFRAME INFO") -> pd.DataFrame:
    """Print a per-column completeness report for ``df`` and return it.

    Args:
        df: Frame to summarise.
        title: Label printed in the banner.

    Returns:
        DataFrame indexed by column name with ``dtype``, ``non_null``,
        ``missing`` and ``missing_%`` columns, sorted by missing count
        (descending).
    """
    banner = "=" * 90
    print("\n" + banner)
    print(f"[{title}]")
    print(banner)

    n_rows, n_cols = df.shape
    print(f"Total rows: {n_rows:,}")
    print(f"Total cols: {n_cols:,}")
    if isinstance(df.index, pd.DatetimeIndex):
        print(f"Index range: {df.index.min()} -> {df.index.max()}")

    missing = df.isna().sum()
    # max(n_rows, 1) keeps the percentage defined for an empty frame.
    missing_pct = (missing / max(n_rows, 1) * 100).round(3)

    summary = pd.DataFrame({
        "dtype": df.dtypes.astype(str),
        "non_null": n_rows - missing,
        "missing": missing,
        "missing_%": missing_pct,
    })
    summary = summary.sort_values(["missing", "missing_%"], ascending=False)

    # Lift pandas' display limits only for the full table.
    display_opts = (
        "display.max_rows", None,
        "display.max_columns", None,
        "display.width", 220,
        "display.max_colwidth", 60,
    )
    with pd.option_context(*display_opts):
        print("\n[Column-wise completeness]")
        print(summary)

    print("\n[Top 20 columns by missing]")
    print(summary.head(20))

    total_missing = int(missing.sum())
    total_cells = int(n_rows * n_cols)
    overall_pct = total_missing / max(total_cells, 1) * 100
    print("\n[Overall missing]")
    print(
        f"Total missing cells: {total_missing:,} / {total_cells:,} "
        f"({overall_pct:.4f}%)"
    )

    return summary


summary = print_full_info_with_missing(full_df, title="full_df (after feature blocks + time limit)")

# Save artifacts: everything is written locally first, then copied to the Drive mirror.
FEATURE_LIST_DIR_LOCAL = OUTPUTS_DIR / "feature_lists"
FEATURE_LIST_DIR_DRIVE = DRIVE_PATHS["outputs_dir"] / "feature_lists"
for _feature_dir in (FEATURE_LIST_DIR_LOCAL, FEATURE_LIST_DIR_DRIVE):
    ensure_dir(_feature_dir)

# Every column of full_df is listed here, not only model inputs.
feature_list = list(full_df.columns)

# Feature lists (same content in three formats)
_list_stem = "feature_list_all_columns"
features_txt_local = FEATURE_LIST_DIR_LOCAL / f"{_list_stem}.txt"
features_pkl_local = FEATURE_LIST_DIR_LOCAL / f"{_list_stem}.pkl"
features_csv_local = FEATURE_LIST_DIR_LOCAL / f"{_list_stem}.csv"

features_txt_local.write_text("\n".join(feature_list), encoding="utf-8")
save_pickle(feature_list, features_pkl_local)
pd.DataFrame({"feature": feature_list}).to_csv(features_csv_local, index=False)

for _artifact in (features_txt_local, features_pkl_local, features_csv_local):
    copy_file(_artifact, FEATURE_LIST_DIR_DRIVE / _artifact.name)

# Missingness summary (the index of `summary` becomes the "feature" column)
missing_csv_local = FEATURE_LIST_DIR_LOCAL / "missing_summary_all_columns.csv"
summary.reset_index(names="feature").to_csv(missing_csv_local, index=False)
copy_file(missing_csv_local, FEATURE_LIST_DIR_DRIVE / missing_csv_local.name)

print("\n[OK] Saved feature list + missing summary to:")
print("  - local :", FEATURE_LIST_DIR_LOCAL)
print("  - drive :", FEATURE_LIST_DIR_DRIVE)

print("[OK] BLOCK 16 complete.")

---
# SECTION 3: Exploratory Data Analysis (EDA)

**Data visualization and analysis**

**Blocks:** 17-19

## BLOCK 17 — EDA RETURNS

In [None]:

# Check if EDA is enabled
if RUN_PARAMS["eda"]["enabled"]:
    # Guardrails
    assert isinstance(full_df, pd.DataFrame), "[ERROR] full_df must be a DataFrame."
    assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."

    full_df = full_df.sort_index()

    # EDA dirs inside RUN folders (local working copies + Drive mirrors)
    EDA_PLOTS_LOCAL = LOCAL_PATHS["plots_dir"] / "eda_returns"
    EDA_REPS_LOCAL = LOCAL_PATHS["reports_dir"] / "eda_returns"
    EDA_PLOTS_DRIVE = DRIVE_PATHS["plots_dir"] / "eda_returns"
    EDA_REPS_DRIVE = DRIVE_PATHS["reports_dir"] / "eda_returns"

    for _eda_dir in (EDA_PLOTS_LOCAL, EDA_REPS_LOCAL, EDA_PLOTS_DRIVE, EDA_REPS_DRIVE):
        _eda_dir.mkdir(parents=True, exist_ok=True)

    STATS_FILE_LOCAL = EDA_REPS_LOCAL / "eda_returns_stats.csv"
    STATS_FILE_DRIVE = EDA_REPS_DRIVE / "eda_returns_stats.csv"

    # Get bins from RUN_PARAMS
    EDA_BINS = int(RUN_PARAMS["eda"]["returns_bins"])

    def safe_name(s: str) -> str:
        """Turn a ticker into a filename stem: drop '^', replace other unsafe runs with '_'."""
        without_caret = s.replace("^", "")
        return re.sub(r"[^A-Za-z0-9\-_]+", "_", without_caret)

    def analyze_returns(series: pd.Series, ticker: str, bins: int = EDA_BINS) -> None:
        """Save a histogram and a boxplot for one return series and log its moments.

        Non-numeric entries are coerced to NaN and dropped. One row with the
        sample skewness and Pearson kurtosis is appended to the local stats CSV
        (a header row is written only when the file does not exist yet).
        """
        clean = pd.to_numeric(series, errors="coerce").dropna()
        if clean.empty:
            print(f"[WARN] {ticker}: empty after dropna(). Skipping.")
            return

        stem = safe_name(ticker)
        values = clean.values

        # Histogram
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.hist(values, bins=bins)
        ax.set_title(f"Histogram of {ticker} Daily Log Returns")
        ax.set_xlabel("Log return")
        ax.set_ylabel("Frequency")
        fig.tight_layout()
        fig.savefig(EDA_PLOTS_LOCAL / f"{stem}_hist.png", dpi=150)
        plt.close(fig)

        # Boxplot
        fig, ax = plt.subplots(figsize=(10, 3))
        ax.boxplot(values, vert=False)
        ax.set_title(f"Boxplot of {ticker} Daily Log Returns")
        ax.set_xlabel("Log return")
        fig.tight_layout()
        fig.savefig(EDA_PLOTS_LOCAL / f"{stem}_box.png", dpi=150)
        plt.close(fig)

        # Stats CSV (append)
        skew_val = float(stats.skew(values, bias=False))
        kurt_val = float(stats.kurtosis(values, fisher=False, bias=False))

        needs_header = not STATS_FILE_LOCAL.exists()
        with open(STATS_FILE_LOCAL, mode="a", newline="") as fh:
            writer = csv.writer(fh)
            if needs_header:
                writer.writerow(["run_id", "ticker", "col", "n", "skewness", "kurtosis_pearson"])
            writer.writerow([RUN_ID, ticker, series.name, int(len(clean)), skew_val, kurt_val])

        print(f"[OK] {ticker}: saved hist+box; skew={skew_val:.6f}, kurt={kurt_val:.6f}")

    # AUTO: find all *_logret_cc columns (exclude abs)
    ret_cols = sorted(
        name for name in full_df.columns
        if name.endswith("_logret_cc") and not name.endswith("_abs_logret_cc")
    )
    assert len(ret_cols) > 0, "[ERROR] No *_logret_cc columns found."

    print(f"[INFO] Found {len(ret_cols)} return columns.")

    for col in ret_cols:
        ticker = col.replace("_logret_cc", "")
        analyze_returns(full_df[col], ticker=ticker)

    # Mirror plots + stats to DRIVE
    for img in EDA_PLOTS_LOCAL.glob("*.png"):
        copy_file(img, EDA_PLOTS_DRIVE / img.name)

    if STATS_FILE_LOCAL.exists():
        copy_file(STATS_FILE_LOCAL, STATS_FILE_DRIVE)

    print("\n[OK] EDA Returns complete.")
    print("[INFO] LOCAL plots :", EDA_PLOTS_LOCAL)
    print("[INFO] DRIVE plots :", EDA_PLOTS_DRIVE)
    print("[INFO] LOCAL report:", STATS_FILE_LOCAL)
    print("[INFO] DRIVE report:", STATS_FILE_DRIVE)
else:
    # EDA can be switched off in RUN_PARAMS
    print("[SKIP] BLOCK 17 — EDA disabled in RUN_PARAMS.")

print("[OK] BLOCK 17 complete.")

## BLOCK 18 — EDA VOLATILITY

In [None]:

# Check if EDA is enabled
if not RUN_PARAMS["eda"]["enabled"]:
    print("[SKIP] BLOCK 18 — EDA disabled in RUN_PARAMS.")
else:
    # Guardrails
    assert isinstance(full_df, pd.DataFrame), "[ERROR] full_df must be a DataFrame."
    assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
    full_df = full_df.sort_index()

    # EDA dirs inside RUN folders
    EDA_PLOTS_LOCAL = LOCAL_PATHS["plots_dir"] / "eda_volatility"
    EDA_PLOTS_DRIVE = DRIVE_PATHS["plots_dir"] / "eda_volatility"
    EDA_PLOTS_LOCAL.mkdir(parents=True, exist_ok=True)
    EDA_PLOTS_DRIVE.mkdir(parents=True, exist_ok=True)

    def _safe_fname(s: str) -> str:
        """Convert column name to safe filename (drop '^', replace other unsafe runs with '_')."""
        s = s.replace("^", "")
        return re.sub(r"[^A-Za-z0-9\-_]+", "_", s)

    def plot_volatility_with_crisis_periods(
        df: pd.DataFrame,
        vol_col: str,
        covid_col: str = "covid_period",
        crisis_col: str = "crisis_2008",
    ) -> None:
        """Plot one volatility column with the COVID and 2008-crisis windows shaded.

        Args:
            df: Frame holding the volatility column and both 0/1 period flags.
            vol_col: Volatility column to draw; rows where it is NaN are dropped.
            covid_col: Flag column equal to 1 inside the COVID window.
            crisis_col: Flag column equal to 1 inside the 2008 crisis window.

        The figure is saved under the local EDA plots dir and copied to Drive.
        Nothing is plotted (a [SKIP] line is printed) when a column is missing
        or no volatility values remain.
        """
        missing = [c for c in [vol_col, covid_col, crisis_col] if c not in df.columns]
        if missing:
            print(f"[SKIP] {vol_col}: missing columns {missing}")
            return

        tmp = df[[vol_col, covid_col, crisis_col]].dropna(subset=[vol_col]).copy()
        if tmp.empty:
            print(f"[SKIP] {vol_col}: empty after dropping NaN vol.")
            return

        plt.figure(figsize=(14, 5))
        (line_handle,) = plt.plot(tmp.index, tmp[vol_col].astype("float64"), label=vol_col)

        def shade_period(mask: pd.Series, label: str, color: str) -> Patch:
            """Shade each contiguous run of True in ``mask`` and return a matching legend patch."""
            flags = mask.astype(bool).values
            seg_start = None  # first date of the run currently being scanned, if any
            for day, flagged in zip(tmp.index, flags):
                if flagged and seg_start is None:
                    seg_start = day
                elif not flagged and seg_start is not None:
                    plt.axvspan(seg_start, day, color=color, alpha=0.2)
                    seg_start = None

            # A run still open at the end of the data is closed at the last date.
            if seg_start is not None:
                plt.axvspan(seg_start, tmp.index[-1], color=color, alpha=0.2)

            return Patch(color=color, alpha=0.2, label=label)

        # Explicit, distinct colours: without them both windows (and their legend
        # patches) fall back to the same default patch colour and cannot be told apart.
        covid_patch = shade_period(tmp[covid_col] == 1, label="COVID period", color="tab:orange")
        crisis_patch = shade_period(tmp[crisis_col] == 1, label="2008 crisis", color="tab:red")

        plt.title(f"{vol_col} with COVID and 2008 crisis shading")
        plt.xlabel("Date")
        plt.ylabel("Volatility (std)")
        plt.legend(handles=[line_handle, covid_patch, crisis_patch], loc="upper right")
        plt.tight_layout()

        save_path = EDA_PLOTS_LOCAL / f"{_safe_fname(vol_col)}_with_covid_and_2008.png"
        plt.savefig(save_path, dpi=150)
        plt.close()
        print(f"[INFO] Saved (LOCAL): {save_path}")

        # Mirror to drive
        copy_file(save_path, EDA_PLOTS_DRIVE / save_path.name)

    # Run for all return-vol columns (*_logret_cc_std_{W_LONG}; window comes from RUN_PARAMS)
    W_LONG = int(RUN_PARAMS["features"]["rolling_w_long"])
    vol_cols = sorted([c for c in full_df.columns if c.endswith(f"_logret_cc_std_{W_LONG}")])

    print(f"[INFO] LOCAL EDA plots dir: {EDA_PLOTS_LOCAL}")
    print(f"[INFO] DRIVE EDA plots dir: {EDA_PLOTS_DRIVE}")
    print(f"[INFO] Found {len(vol_cols)} *_logret_cc_std_{W_LONG} columns.")

    if len(vol_cols) == 0:
        print(f"[WARN] No *_logret_cc_std_{W_LONG} columns found. Skipping volatility EDA.")
    else:
        for col in vol_cols:
            plot_volatility_with_crisis_periods(
                full_df,
                vol_col=col,
                covid_col="covid_period",
                crisis_col="crisis_2008",
            )

    print("\n[OK] Volatility EDA complete.")

print("[OK] BLOCK 18 complete.")

## BLOCK 19 — EDA CATEGORICAL

In [None]:

# Check if EDA is enabled
if not RUN_PARAMS["eda"]["enabled"]:
    print("[SKIP] BLOCK 19 — EDA disabled in RUN_PARAMS.")
else:
    # Guardrails
    assert isinstance(full_df, pd.DataFrame), "[ERROR] full_df must be a DataFrame."
    assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
    full_df = full_df.sort_index()

    # EDA dirs inside RUN folders
    EDA_PLOTS_LOCAL = LOCAL_PATHS["plots_dir"] / "eda_categorical"
    EDA_REPS_LOCAL = LOCAL_PATHS["reports_dir"] / "eda_categorical"
    EDA_PLOTS_DRIVE = DRIVE_PATHS["plots_dir"] / "eda_categorical"
    EDA_REPS_DRIVE = DRIVE_PATHS["reports_dir"] / "eda_categorical"

    for p in [EDA_PLOTS_LOCAL, EDA_REPS_LOCAL, EDA_PLOTS_DRIVE, EDA_REPS_DRIVE]:
        p.mkdir(parents=True, exist_ok=True)

    def _safe_fname(s: str) -> str:
        """Convert column name to safe filename (drop '^', replace other unsafe runs with '_')."""
        s = s.replace("^", "")
        return re.sub(r"[^A-Za-z0-9\-_]+", "_", s)

    # Categorical/binary columns for EDA (candidates; only those present in full_df are used)
    categorical_cols = [
        "is_earnings_day",
        "has_eps_surprise_yahoo",
        "has_eps_surprise_calc",
        "post_earnings_day_1", "post_earnings_day_2", "post_earnings_day_3",
        "post_earnings_day_4", "post_earnings_day_5",
        "CPI_release_day",
        "FEDFUNDS_release_day",
        "FEDFUNDS_changed",
        "CPI_pct_mom_is_missing",
        "CPI_accel_pct_mom_is_missing",
        "FEDFUNDS_delta_mom_is_missing",
        "is_q1", "is_q2", "is_q3", "is_q4",
        "covid_period",
        "crisis_2008",
        "pre_covid",
        "post_covid",
        "pre_crisis_2008",
        "post_crisis_2008",
    ]
    categorical_cols = [c for c in categorical_cols if c in full_df.columns]

    print("[INFO] Categorical/binary columns used for EDA:")
    print(categorical_cols)

    # Save value counts to CSV (one row per variable/value pair, NaN counted as a value)
    cat_counts_file_local = EDA_REPS_LOCAL / "categorical_value_counts.csv"
    cat_counts_file_drive = EDA_REPS_DRIVE / "categorical_value_counts.csv"

    with open(cat_counts_file_local, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["run_id", "variable", "value", "count"])
        for col in categorical_cols:
            vc = full_df[col].value_counts(dropna=False)
            for val, count in vc.items():
                writer.writerow([RUN_ID, col, val, int(count)])

    copy_file(cat_counts_file_local, cat_counts_file_drive)

    print("[INFO] Saved categorical value counts to:")
    print("  - local:", cat_counts_file_local)
    print("  - drive:", cat_counts_file_drive)

    # Bar plots for each categorical column
    for col in categorical_cols:
        vc = full_df[col].value_counts(dropna=False).sort_index()
        if vc.empty:
            continue

        plt.figure(figsize=(6, 4))
        plt.bar([str(x) for x in vc.index], vc.values)
        plt.title(f"Frequency of {col}")
        plt.xlabel(col)
        plt.ylabel("Count")
        plt.tight_layout()

        fname = f"{_safe_fname(col)}_bar.png"
        out_local = EDA_PLOTS_LOCAL / fname
        out_drive = EDA_PLOTS_DRIVE / fname

        plt.savefig(out_local, dpi=150)
        plt.close()
        copy_file(out_local, out_drive)

    # Boxplots: numeric targets by categorical
    BASE = str(RUN_PARAMS["features"]["regime_base"])
    W_LONG = int(RUN_PARAMS["features"]["rolling_w_long"])
    TARGET_COL = str(RUN_PARAMS["data"]["target_col"])

    numeric_targets = [f"{BASE}_logret_cc", f"{BASE}_logret_cc_std_{W_LONG}"]
    if TARGET_COL in full_df.columns:
        numeric_targets.append(TARGET_COL)
    numeric_targets = [c for c in numeric_targets if c in full_df.columns]

    print("[INFO] Numeric targets used for boxplots:", numeric_targets)

    for target in numeric_targets:
        for col in categorical_cols:
            # A constant flag gives a single group; nothing to compare.
            if full_df[col].nunique(dropna=True) < 2:
                continue

            tmp = full_df[[col, target]].dropna()
            if tmp.empty:
                continue

            groups, labels = [], []
            for k in sorted(tmp[col].unique()):
                groups.append(tmp.loc[tmp[col] == k, target].astype("float64").values)
                labels.append(str(k))

            # Dropping NaN targets can leave only one category.
            if len(groups) < 2:
                continue

            plt.figure(figsize=(8, 5))
            # Tick labels are set separately: boxplot's `labels=` keyword is deprecated
            # since Matplotlib 3.9 (renamed `tick_labels`). Boxes sit at x = 1..N.
            plt.boxplot(groups, showfliers=False)
            plt.xticks(range(1, len(labels) + 1), labels)
            plt.title(f"{target} by {col}")
            plt.xlabel(col)
            plt.ylabel(target)
            plt.tight_layout()

            fname = f"{_safe_fname(target)}_by_{_safe_fname(col)}_box.png"
            out_local = EDA_PLOTS_LOCAL / fname
            out_drive = EDA_PLOTS_DRIVE / fname

            plt.savefig(out_local, dpi=150)
            plt.close()
            copy_file(out_local, out_drive)

    # Cramér's V heatmap
    def cramers_v(x: pd.Series, y: pd.Series) -> float:
        """Calculate Cramér's V statistic for two categorical variables.

        Rows where either value is NaN are ignored. Returns NaN when no rows
        remain, 0.0 when either variable has fewer than two observed levels,
        otherwise sqrt(chi2 / (n * (min(rows, cols) - 1))) from the
        uncorrected chi-square test on the contingency table.
        """
        xy = pd.DataFrame({"x": x, "y": y}).dropna()
        if xy.empty:
            return np.nan

        confusion = pd.crosstab(xy["x"], xy["y"])
        if confusion.shape[0] < 2 or confusion.shape[1] < 2:
            return 0.0

        chi2 = stats.chi2_contingency(confusion, correction=False)[0]
        n = confusion.to_numpy().sum()
        r, k = confusion.shape
        denom = n * (min(r, k) - 1)
        return 0.0 if denom <= 0 else float(np.sqrt(chi2 / denom))

    n_cat = len(categorical_cols)
    if n_cat >= 2:
        # Cramér's V is symmetric, so each unordered pair is computed once and mirrored;
        # the diagonal is 1.0 by definition.
        mat = np.eye(n_cat, dtype=float)
        for i, c1 in enumerate(categorical_cols):
            for j in range(i + 1, n_cat):
                pair_v = cramers_v(full_df[c1], full_df[categorical_cols[j]])
                mat[i, j] = pair_v
                mat[j, i] = pair_v

        plt.figure(figsize=(max(10, n_cat * 0.6), max(8, n_cat * 0.6)))
        im = plt.imshow(mat, vmin=0.0, vmax=1.0)
        plt.title("Cramér's V Heatmap — Categorical/Binary Variables")
        plt.xticks(range(n_cat), categorical_cols, rotation=90)
        plt.yticks(range(n_cat), categorical_cols)
        plt.colorbar(im, fraction=0.046, pad=0.04)
        plt.tight_layout()

        heatmap_local = EDA_PLOTS_LOCAL / "categorical_cramersV_heatmap.png"
        heatmap_drive = EDA_PLOTS_DRIVE / "categorical_cramersV_heatmap.png"
        plt.savefig(heatmap_local, dpi=150)
        plt.close()
        copy_file(heatmap_local, heatmap_drive)

        print("[INFO] Saved Cramér's V heatmap to:")
        print("  - local:", heatmap_local)
        print("  - drive:", heatmap_drive)
    else:
        print("[INFO] Skipped Cramér's V heatmap: need at least 2 categorical columns.")

    print("\n[OK] Categorical/Binary EDA complete.")
    print("[INFO] LOCAL plots  :", EDA_PLOTS_LOCAL)
    print("[INFO] DRIVE plots  :", EDA_PLOTS_DRIVE)
    print("[INFO] LOCAL reports:", EDA_REPS_LOCAL)
    print("[INFO] DRIVE reports:", EDA_REPS_DRIVE)

print("[OK] BLOCK 19 complete.")

---
# SECTION 4: Train/Valid/Test Split & NN Features

**Data splitting and neural network feature preparation**

**Blocks:** 20-22

## BLOCK 20 — SPLIT + WEIGHTS

In [None]:

# Guardrails
assert isinstance(full_df, pd.DataFrame), "[ERROR] full_df must be a DataFrame."
assert isinstance(full_df.index, pd.DatetimeIndex), "[ERROR] full_df.index must be a DatetimeIndex."
full_df = full_df.sort_index()

# Target column check
if TARGET_T1 not in full_df.columns:
    raise KeyError(f"[ERROR] TARGET='{TARGET_T1}' not found in full_df. Run target-definition block first.")

# All features except target
feature_cols_t1 = [c for c in full_df.columns if c != TARGET_T1]

# Model df (features + target, target last); rows without a target cannot be used.
model_df = full_df.loc[:, feature_cols_t1 + [TARGET_T1]].copy()
before = len(model_df)
model_df = model_df.dropna(subset=[TARGET_T1]).copy()
after = len(model_df)

if len(model_df) == 0:
    raise ValueError("[ERROR] model_df became empty after dropping NaN target rows.")

print("[INFO] model_df range:", model_df.index.min(), "->", model_df.index.max(), "| rows:", len(model_df))
print("[INFO] Dropped rows due to missing target:", before - after)
print("[INFO] #Features (ALL):", len(feature_cols_t1))

# Time split from RUN_PARAMS (date-based)
data_cfg = RUN_PARAMS["data"]

train_end_date = data_cfg["train_end"]
valid_start_date = data_cfg["valid_start"]
valid_end_date = data_cfg["valid_end"]
test_start_date = data_cfg["test_start"]
test_end_date = data_cfg.get("test_end")  # optional; falls back to the last available date

print("[INFO] Data range:", model_df.index.min().date(), "->", model_df.index.max().date())

train_end = pd.Timestamp(train_end_date)
valid_start = pd.Timestamp(valid_start_date)
valid_end = pd.Timestamp(valid_end_date)
test_start = pd.Timestamp(test_start_date)
test_end = pd.Timestamp(test_end_date) if test_end_date else model_df.index.max()

# All bounds are inclusive; TRAIN has no lower bound.
mask_train = model_df.index <= train_end
mask_valid = (model_df.index >= valid_start) & (model_df.index <= valid_end)
mask_test = (model_df.index >= test_start) & (model_df.index <= test_end)

# Leakage guard: the masks are built independently from config dates, so a
# misconfigured date can put the same row into two splits. Warn rather than
# fail so an intentional configuration still runs.
for _split_a, _mask_a, _split_b, _mask_b in (
    ("TRAIN", mask_train, "VALID", mask_valid),
    ("VALID", mask_valid, "TEST", mask_test),
    ("TRAIN", mask_train, "TEST", mask_test),
):
    _n_shared = int((_mask_a & _mask_b).sum())
    if _n_shared:
        print(
            f"[WARN] {_split_a} and {_split_b} splits share {_n_shared} rows — "
            "check the split dates in RUN_PARAMS['data']."
        )

split_info = f"train: <= {train_end.date()} | valid: {valid_start.date()} - {valid_end.date()} | test: {test_start.date()} - {test_end.date()}"

train_df = model_df.loc[mask_train].copy()
valid_df = model_df.loc[mask_valid].copy()
test_df = model_df.loc[mask_test].copy()

if len(train_df) == 0:
    raise ValueError("[ERROR] train_df is empty. Check split configuration and data coverage.")

n_total = len(model_df)
pct_train = (len(train_df) / n_total * 100.0) if n_total else 0.0
pct_valid = (len(valid_df) / n_total * 100.0) if n_total else 0.0
pct_test = (len(test_df) / n_total * 100.0) if n_total else 0.0

print("[INFO] Split sizes:",
      f"TRAIN={len(train_df):,} ({pct_train:.2f}%) | "
      f"VALID={len(valid_df):,} ({pct_valid:.2f}%) | "
      f"TEST={len(test_df):,} ({pct_test:.2f}%)")
print(f"[INFO] {split_info}")

# Weights per split: time * |y| (normalize mean=1 per split)
_eps = float(RUN_PARAMS["features"]["eps"])
c = float(RUN_PARAMS["weights"]["c"])
max_w = float(RUN_PARAMS["weights"]["max_w"])


def _build_split_weights(df: pd.DataFrame, target_col: str, split_name: str):
    """Compute per-row sample weights for one split, plus summary statistics.

    Each weight is the product of two terms, divided by the product's mean so
    the split averages ~1:

    * a positional ramp running linearly from 1.0 (first row of ``df``) to
      2.0 (last row);
    * a magnitude term ``1 + c * sqrt(|y| / (median|y| + _eps))`` clipped to
      ``[1, max_w]``.

    ``c``, ``max_w`` and ``_eps`` are read from module scope.

    Args:
        df: Rows of the split; must contain ``target_col``.
        target_col: Name of the target column supplying ``y``.
        split_name: Label stored under ``"split"`` in the statistics.

    Returns:
        ``(weights, stats)`` where ``weights`` is a float array of length
        ``len(df)`` and ``stats`` is a dict of diagnostics. For an empty
        ``df`` the array is empty and every statistic except ``split``/``n``
        is NaN.
    """
    n_rows = len(df)
    if n_rows == 0:
        nan = np.nan
        return np.array([], dtype=float), dict(
            split=split_name, n=0, time_min=nan, time_max=nan, time_mean=nan,
            y_min=nan, y_max=nan, y_med_abs=nan, y_cap_rate=nan,
            w_min=nan, w_max=nan, w_mean=nan,
        )

    # Positional term: later rows of the split count up to twice as much.
    recency = np.linspace(1.0, 2.0, n_rows, dtype=float)

    # Magnitude term: larger |y| relative to the split median gets more weight.
    magnitude = np.abs(df[target_col].astype(float).to_numpy())
    median_magnitude = float(np.median(magnitude))
    relative = magnitude / (median_magnitude + _eps)
    boost = np.clip(1.0 + c * np.sqrt(relative), 1.0, max_w)
    # Share of rows whose magnitude term sits at the max_w ceiling.
    capped_share = float(np.mean(boost >= max_w))

    combined = recency * boost
    weights = combined / (combined.mean() + _eps)

    summary = {
        "split": split_name,
        "n": n_rows,
        "time_min": float(recency.min()),
        "time_max": float(recency.max()),
        "time_mean": float(recency.mean()),
        "y_min": float(boost.min()),
        "y_max": float(boost.max()),
        "y_med_abs": float(median_magnitude),
        "y_cap_rate": float(capped_share),
        "w_min": float(weights.min()),
        "w_max": float(weights.max()),
        "w_mean": float(weights.mean()),
    }
    return weights, summary


# Build sample weights independently for each split. Every call returns the
# weight array plus a stats dict that feeds the diagnostics printed below.
w_train, st_train = _build_split_weights(train_df, TARGET_T1, "TRAIN")
w_valid, st_valid = _build_split_weights(valid_df, TARGET_T1, "VALID")
w_test, st_test = _build_split_weights(test_df, TARGET_T1, "TEST")

# Cap rate = fraction of rows whose |y| weight term reached the max_w ceiling.
# Empty splits carry NaN stats, so they are reported as "empty" instead.
print("\n[INFO] y-weight cap rate (==max_w):")
print(f"  TRAIN: {st_train['y_cap_rate']:.4f} | c={c} max_w={max_w}")
print(f"  VALID: {st_valid['y_cap_rate']:.4f} | c={c} max_w={max_w}" if len(valid_df) else "  VALID: empty")
print(f"  TEST : {st_test['y_cap_rate']:.4f} | c={c} max_w={max_w}" if len(test_df) else "  TEST : empty")

# One line per split: range of the time term, range of the |y| term, and the
# range/mean of the final normalized weight.
print("\n[INFO] sample_weight diagnostics (per-split mean=1):")
for st in [st_train, st_valid, st_test]:
    if st["n"] == 0:
        print(f"  {st['split']}: empty")
        continue
    print(
        f"  {st['split']}: "
        f"time[{st['time_min']:.3f}->{st['time_max']:.3f}] "
        f"y[{st['y_min']:.3f}->{st['y_max']:.3f}] (median_abs_y={st['y_med_abs']:.6g}) "
        f"w[{st['w_min']:.3f}->{st['w_max']:.3f}] (mean={st['w_mean']:.3f})"
    )

# Keep the weights next to the rows they belong to (positional assignment).
train_df["sample_weight"] = w_train
valid_df["sample_weight"] = w_valid
test_df["sample_weight"] = w_test

# Final X/y matrices (source-of-truth = X_train_t1.columns).
# Selecting by feature_cols_t1 leaves the target and sample_weight out of X.
X_train_t1 = train_df.loc[:, feature_cols_t1].copy()
y_train_t1 = train_df.loc[:, TARGET_T1].copy()

X_valid_t1 = valid_df.loc[:, feature_cols_t1].copy()
y_valid_t1 = valid_df.loc[:, TARGET_T1].copy()

X_test_t1 = test_df.loc[:, feature_cols_t1].copy()
y_test_t1 = test_df.loc[:, TARGET_T1].copy()

# Re-derive the feature list from the matrix actually built.
feature_cols_t1 = list(X_train_t1.columns)

# Weights are stored as plain arrays, so only their length can be checked.
assert len(X_train_t1) == len(w_train), "[ERROR] w_train length mismatch"
assert len(X_valid_t1) == len(w_valid), "[ERROR] w_valid length mismatch"
assert len(X_test_t1) == len(w_test), "[ERROR] w_test length mismatch"

print("\n[INFO] Shapes:", X_train_t1.shape, X_valid_t1.shape, X_test_t1.shape)
print("[OK] feature_cols_t1 set from X_train_t1.columns | #Features=", len(feature_cols_t1))

# Persist the BLOCK 20 splits under data/processed, locally first and then
# mirrored to Drive. Each group is fully written before it is copied.
LOCAL_PROC_DATA_DIR = DATA_DIRS_LOCAL["processed"]
DRIVE_PROC_DATA_DIR = DATA_DIRS_DRIVE["processed"]

# Features (X) for XGBoost: written with DataFrame.to_pickle.
_xgb_frames = {
    "X_train_xgb.pkl": X_train_t1,
    "X_valid_xgb.pkl": X_valid_t1,
    "X_test_xgb.pkl": X_test_t1,
}
for _fname, _frame in _xgb_frames.items():
    _frame.to_pickle(LOCAL_PROC_DATA_DIR / _fname)
for _fname in _xgb_frames:
    copy_file(LOCAL_PROC_DATA_DIR / _fname, DRIVE_PROC_DATA_DIR / _fname)

# Targets (y) first, then sample weights: both are shared across all models
# and written with save_pickle.
_shared_groups = (
    {
        "y_train.pkl": y_train_t1,
        "y_valid.pkl": y_valid_t1,
        "y_test.pkl": y_test_t1,
    },
    {
        "weights_train.pkl": w_train,
        "weights_valid.pkl": w_valid,
        "weights_test.pkl": w_test,
    },
)
for _group in _shared_groups:
    for _fname, _obj in _group.items():
        save_pickle(_obj, LOCAL_PROC_DATA_DIR / _fname)
    for _fname in _group:
        copy_file(LOCAL_PROC_DATA_DIR / _fname, DRIVE_PROC_DATA_DIR / _fname)

print("\n[INFO] Saved XGBoost splits to:", LOCAL_PROC_DATA_DIR)
for _summary in (
    "  - X_train/valid/test_xgb.pkl",
    "  - y_train/valid/test.pkl",
    "  - weights_train/valid/test.pkl",
):
    print(_summary)

print("[OK] BLOCK 20 complete.")

## BLOCK 21 — NN FEATURE SELECTION

In [None]:

# Preconditions: this cell consumes the TRAIN matrices produced by BLOCK 20.
assert "X_train_t1" in globals(), "[ERROR] X_train_t1 missing. Run BLOCK 20 first."
assert "y_train_t1" in globals(), "[ERROR] y_train_t1 missing. Run BLOCK 20 first."

# Output directory for the selected-feature lists.
FS_OUT = ensure_dir(Path(FS_DIR))

# Params from RUN_PARAMS
NN_CFG = RUN_PARAMS["nn_feature_select"]
N40 = int(NN_CFG["n40"])  # target size of the smaller feature set
N80 = int(NN_CFG["n80"])  # target size of the larger feature set
per_group_40 = int(NN_CFG["per_group_40"])  # columns seeded from each group (small set)
per_group_80 = int(NN_CFG["per_group_80"])  # columns seeded from each group (large set)
corr_thr = float(NN_CFG["corr_thr"])  # |Spearman| at or above this counts as redundant
mi_neighbors = int(NN_CFG["mi_n_neighbors"])  # n_neighbors for mutual_info_regression
mi_random_state = int(NN_CFG["mi_random_state"])  # seed for mutual_info_regression


# Helpers
def _dedup_preserve_order(seq):
    """Return the non-empty strings of ``seq`` once each, in first-seen order.

    Items that are not ``str`` instances, and empty strings, are dropped.
    """
    first_seen = {}
    for item in seq:
        if not isinstance(item, str) or not item:
            continue
        # Dict keys keep insertion order; setdefault ignores repeats.
        first_seen.setdefault(item, None)
    return list(first_seen)


def _feature_group(col: str) -> str:
    """Return the group key of a feature column: the text before its first ``_``.

    A name without an underscore is its own group. Names starting with ``^``
    follow the same rule, so the ``^`` stays part of the key (the former
    dedicated ``^`` branch was identical to the general case and was removed).

    Args:
        col: Column name.

    Returns:
        The prefix before the first underscore, or ``"OTHER"`` when ``col`` is
        not a string or is empty.
    """
    if not isinstance(col, str) or not col:
        return "OTHER"
    return col.partition("_")[0]


def _safe_spearman_corr(df: pd.DataFrame) -> pd.DataFrame:
    """Return the pairwise Spearman rank-correlation matrix of ``df``'s columns."""
    spearman_matrix = df.corr(method="spearman")
    return spearman_matrix


def _select_with_decorrelation(ranked_cols, X_train_num, k, corr_threshold):
    """Greedily pick up to ``k`` columns, skipping near-duplicates of earlier picks.

    Columns are visited in ``ranked_cols`` order; names absent from
    ``X_train_num`` are ignored. A column is rejected when its absolute
    Spearman correlation with any already-picked column is
    ``>= corr_threshold``. A NaN correlation never rejects a column.

    Args:
        ranked_cols: Candidate column names, best first.
        X_train_num: DataFrame the correlations are computed on.
        k: Maximum number of columns to return.
        corr_threshold: Absolute-correlation cutoff for redundancy.

    Returns:
        The picked column names in visiting order (possibly fewer than ``k``);
        an empty list when ``k <= 0`` or no candidate is available.
    """
    candidates = [name for name in ranked_cols if name in X_train_num.columns]
    if k <= 0 or not candidates:
        return []

    # One correlation matrix up front, then cheap scalar lookups in the loop.
    abs_corr = _safe_spearman_corr(X_train_num.loc[:, candidates]).abs()

    def _clashes_with(name, kept):
        for other in kept:
            rho = abs_corr.at[name, other]
            if pd.notna(rho) and float(rho) >= corr_threshold:
                return True
        return False

    picked = []
    for name in candidates:
        if _clashes_with(name, picked):
            continue
        picked.append(name)
        if len(picked) >= k:
            break

    return picked


# Numeric matrix (TRAIN only) + cleaning.
# Everything in this cell is fitted on TRAIN so VALID/TEST cannot influence
# which features are selected. NOTE: X and y are module-level names here.
X = X_train_t1.copy()
y = y_train_t1.astype(float).to_numpy()

# Keep numeric dtypes only
X = X.select_dtypes(include=[np.number]).copy()

# Drop constant columns (NaN counts as a value because dropna=False)
nunique = X.nunique(dropna=False)
const_cols = nunique[nunique <= 1].index.tolist()
if const_cols:
    X = X.drop(columns=const_cols)
    print(f"[INFO] Dropped {len(const_cols)} constant columns")

# Replace inf -> nan, then drop cols that have any non-finite value.
# A single NaN/inf in TRAIN removes the whole column; nothing is imputed.
X = X.replace([np.inf, -np.inf], np.nan)
bad_cols = [c for c in X.columns if not np.isfinite(X[c].to_numpy(dtype=float)).all()]
if bad_cols:
    X = X.drop(columns=bad_cols)
    print(f"[INFO] Dropped {len(bad_cols)} columns with non-finite values")

if X.shape[1] == 0:
    raise ValueError("[ERROR] No usable numeric features left after cleaning on TRAIN.")

print(f"[INFO] Features after cleaning: {X.shape[1]}")

# Mutual Information ranking (TRAIN only)
print("[INFO] Computing Mutual Information (this may take a minute)...")
mi = mutual_info_regression(
    X.to_numpy(dtype=float),
    y,
    n_neighbors=mi_neighbors,
    random_state=mi_random_state,
)
# Highest-MI feature first; this ordering drives everything below.
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)

# Group-aware shortlist: bucket columns by name prefix. Iterating mi_series in
# sorted order means each group's list is itself ordered by descending MI.
groups = {}
for col in mi_series.index:
    g = _feature_group(col)
    groups.setdefault(g, []).append(col)


def _build_group_seed(per_group):
    """Gather the first ``per_group`` columns of every group into one list.

    Reads the module-level ``groups`` mapping and visits its groups in
    insertion order; the result is passed through ``_dedup_preserve_order``.
    """
    seed_cols = []
    for group_cols in groups.values():
        seed_cols += group_cols[:per_group]
    return _dedup_preserve_order(seed_cols)


# Seeds: the leading columns of every group, so each group gets a chance to be
# represented before the global ranking fills the remaining slots.
seed40 = _build_group_seed(per_group_40)
seed80 = _build_group_seed(per_group_80)

# Global ranked list (descending MI)
ranked_all = mi_series.index.tolist()

# Final pick with de-correlation: seeds are visited first, then every other
# column in MI order (duplicates of the seeds are removed by the dedup).
rank40 = _dedup_preserve_order(seed40 + ranked_all)
rank80 = _dedup_preserve_order(seed80 + ranked_all)

neural_40 = _select_with_decorrelation(rank40, X, N40, corr_thr)
neural_80 = _select_with_decorrelation(rank80, X, N80, corr_thr)

if len(neural_40) == 0 or len(neural_80) == 0:
    raise ValueError("[ERROR] NN feature selection produced empty sets.")

# Ensure 40 ⊆ 80 if possible: re-run the larger selection with the 40-set
# placed first in the visiting order. The subset property is not re-checked
# afterwards.
if not set(neural_40).issubset(set(neural_80)):
    base = _dedup_preserve_order(neural_40 + neural_80 + ranked_all)
    neural_80 = _select_with_decorrelation(base, X, N80, corr_thr)

# Save (lists of column names)
p40 = FS_OUT / "neural_feature_cols_40_bygroup.pkl"
p80 = FS_OUT / "neural_feature_cols_80_bygroup.pkl"
save_pickle(neural_40, p40)
save_pickle(neural_80, p80)

# Also save to DRIVE
FS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["fs_dir"]))
copy_file(p40, FS_OUT_DRIVE / p40.name)
copy_file(p80, FS_OUT_DRIVE / p80.name)

# n may be smaller than N40/N80 when de-correlation rejects too many columns.
print("[OK] NN feature lists saved:")
print("  -", p40, "| n=", len(neural_40))
print("  -", p80, "| n=", len(neural_80))

# Diagnostics
print("[INFO] Top-10 MI features (TRAIN):", mi_series.head(10).index.tolist())
print("[INFO] Groups covered in NEURAL40:", sorted({_feature_group(c) for c in neural_40}))
print("[INFO] Groups covered in NEURAL80:", sorted({_feature_group(c) for c in neural_80}))

print("[OK] BLOCK 21 complete.")

## BLOCK 22 — NEURAL FEATURE PREP

In [None]:

# Preconditions: the three feature matrices from BLOCK 20 must be in memory.
assert "X_train_t1" in globals(), "[ERROR] X_train_t1 missing. Run BLOCK 20 first."
assert "X_valid_t1" in globals(), "[ERROR] X_valid_t1 missing. Run BLOCK 20 first."
assert "X_test_t1" in globals(), "[ERROR] X_test_t1 missing. Run BLOCK 20 first."

FS_OUT = ensure_dir(Path(FS_DIR))
FS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["fs_dir"]))

# Load neural feature lists from BLOCK 21 (with fallback).
# Directories are passed to load_with_fallback in the order:
# local FS dir, local data/processed, Drive FS dir, Drive data/processed.
# NOTE(review): the lookup order is presumably the argument order; confirm
# against load_with_fallback.
FS_DIR_LOCAL = Path(LOCAL_PATHS["fs_dir"])
FS_DIR_DRIVE = Path(DRIVE_PATHS["fs_dir"])
FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]

neural_cols_40 = load_with_fallback(
    "neural_feature_cols_40_bygroup.pkl",
    FS_DIR_LOCAL, FALLBACK_PROC_LOCAL, FS_DIR_DRIVE, FALLBACK_PROC_DRIVE
)
neural_cols_80 = load_with_fallback(
    "neural_feature_cols_80_bygroup.pkl",
    FS_DIR_LOCAL, FALLBACK_PROC_LOCAL, FS_DIR_DRIVE, FALLBACK_PROC_DRIVE
)

# Clean and dedupe: keep non-blank strings only, first occurrence wins.
neural_cols_40 = list(dict.fromkeys([c for c in neural_cols_40 if isinstance(c, str) and c.strip()]))
neural_cols_80 = list(dict.fromkeys([c for c in neural_cols_80 if isinstance(c, str) and c.strip()]))

# Filter to columns that exist in X_train_t1 (a loaded list may come from a
# fallback location and not match the current matrices exactly).
train_cols = set(X_train_t1.columns)
neural_cols_40 = [c for c in neural_cols_40 if c in train_cols]
neural_cols_80 = [c for c in neural_cols_80 if c in train_cols]

if len(neural_cols_40) == 0:
    raise ValueError("[ERROR] neural_cols_40 became empty after filtering to X_train_t1.columns.")
if len(neural_cols_80) == 0:
    raise ValueError("[ERROR] neural_cols_80 became empty after filtering to X_train_t1.columns.")

# Verify columns exist in valid/test splits
valid_cols = set(X_valid_t1.columns)
test_cols = set(X_test_t1.columns)

# Sorted so the error report is deterministic.
missing_valid_40 = sorted(set(neural_cols_40) - valid_cols)
missing_test_40 = sorted(set(neural_cols_40) - test_cols)
missing_valid_80 = sorted(set(neural_cols_80) - valid_cols)
missing_test_80 = sorted(set(neural_cols_80) - test_cols)


def _report_missing(tag, miss):
    """Print an error line listing up to 20 entries of ``miss``.

    Returns ``True`` when ``miss`` is non-empty (something was reported) and
    ``False`` otherwise, in which case nothing is printed.
    """
    if not miss:
        return False
    print(f"[ERROR] Missing in {tag}: {len(miss)} columns (showing up to 20): {miss[:20]}")
    return True


# Report every inconsistency before failing, so one run shows all problems.
err = False
err |= _report_missing("X_valid_t1 (neural_40)", missing_valid_40)
err |= _report_missing("X_test_t1  (neural_40)", missing_test_40)
err |= _report_missing("X_valid_t1 (neural_80)", missing_valid_80)
err |= _report_missing("X_test_t1  (neural_80)", missing_test_80)

if err:
    raise KeyError("[ERROR] Inconsistent columns across splits for neural feature sets.")

# Build neural feature matrices: column subsets of the BLOCK 20 matrices, in
# the order given by the resolved feature lists.
X_train_neural_40 = X_train_t1.loc[:, neural_cols_40].copy()
X_valid_neural_40 = X_valid_t1.loc[:, neural_cols_40].copy()
X_test_neural_40 = X_test_t1.loc[:, neural_cols_40].copy()

X_train_neural_80 = X_train_t1.loc[:, neural_cols_80].copy()
X_valid_neural_80 = X_valid_t1.loc[:, neural_cols_80].copy()
X_test_neural_80 = X_test_t1.loc[:, neural_cols_80].copy()

print("\n[OK] Neural feature matrices created:")
print(f"  - 40: TRAIN={X_train_neural_40.shape} | VALID={X_valid_neural_40.shape} | TEST={X_test_neural_40.shape}")
print(f"  - 80: TRAIN={X_train_neural_80.shape} | VALID={X_valid_neural_80.shape} | TEST={X_test_neural_80.shape}")

# Save resolved neural feature lists to FS_DIR. "Resolved" = after the
# cleaning and column-existence filtering done earlier in this cell.
p40_res = FS_OUT / "neural_feature_cols_40_bygroup_resolved.pkl"
p80_res = FS_OUT / "neural_feature_cols_80_bygroup_resolved.pkl"
save_pickle(neural_cols_40, p40_res)
save_pickle(neural_cols_80, p80_res)

# Mirror to Drive.
copy_file(p40_res, FS_OUT_DRIVE / p40_res.name)
copy_file(p80_res, FS_OUT_DRIVE / p80_res.name)

# Save the Neural 40 and Neural 80 splits to data/processed: each set is
# written locally in full, then mirrored to Drive, before the next set starts.
LOCAL_PROC_DATA_DIR = DATA_DIRS_LOCAL["processed"]
DRIVE_PROC_DATA_DIR = DATA_DIRS_DRIVE["processed"]

_neural_sets = (
    {
        "X_train_neural_40.pkl": X_train_neural_40,
        "X_valid_neural_40.pkl": X_valid_neural_40,
        "X_test_neural_40.pkl": X_test_neural_40,
    },
    {
        "X_train_neural_80.pkl": X_train_neural_80,
        "X_valid_neural_80.pkl": X_valid_neural_80,
        "X_test_neural_80.pkl": X_test_neural_80,
    },
)
for _frames in _neural_sets:
    for _fname, _frame in _frames.items():
        _frame.to_pickle(LOCAL_PROC_DATA_DIR / _fname)
    for _fname in _frames:
        copy_file(LOCAL_PROC_DATA_DIR / _fname, DRIVE_PROC_DATA_DIR / _fname)

print("\n[OK] Saved Neural splits to:", LOCAL_PROC_DATA_DIR)
for _summary in (
    "  - X_train/valid/test_neural_40.pkl",
    "  - X_train/valid/test_neural_80.pkl",
):
    print(_summary)

print("[OK] BLOCK 22 complete.")

---
# SECTION 5: Feature Selection

**XGBoost feature selection: Spearman de-correlation filter, gain importance, and permutation importance**

**Block:** 23

## BLOCK 23 — XGB FEATURE SELECTION

In [None]:

# Config from RUN_PARAMS
XGB_FS_CFG = RUN_PARAMS["xgb_fs"]
# Naming convention: "<base>_is_missing" is the missingness flag of column "<base>".
MISSING_SUFFIX = "_is_missing"

# Input/Output directories
PROC_DATA_DIR = DATA_DIRS_LOCAL["processed"]
FS_OUT_LOCAL = ensure_dir(Path(FS_DIR))
FS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["fs_dir"]))

# -------------------------
# Load splits with fallback (RUN_ID -> data/processed -> DRIVE)
# -------------------------
# Reloading from disk lets this cell run without BLOCK 20 in the same session.
# NOTE(review): y_test / weights_test are not loaded; they are not used here.
RUN_PROC_LOCAL = Path(LOCAL_PATHS["proc_dir"])
FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
RUN_PROC_DRIVE = Path(DRIVE_PATHS["proc_dir"])
FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]

print("[INFO] Loading splits with fallback...")

X_train_t1 = load_with_fallback("X_train_xgb.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)
X_valid_t1 = load_with_fallback("X_valid_xgb.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)
X_test_t1 = load_with_fallback("X_test_xgb.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)

y_train_t1 = load_with_fallback("y_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
y_valid_t1 = load_with_fallback("y_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)

w_train = load_with_fallback("weights_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
w_valid = load_with_fallback("weights_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)

print(f"[INFO] Loaded: X_train={X_train_t1.shape} | X_valid={X_valid_t1.shape} | X_test={X_test_t1.shape}")

print("[INFO] XGB Feature Selection output dirs:")
print("  - LOCAL:", FS_OUT_LOCAL)
print("  - DRIVE:", FS_OUT_DRIVE)


# Metric functions are defined in an earlier cell; this cell uses w_rmse.


# -------------------------
# 1) Identify missingness flags (for dependency closure)
# -------------------------
all_train_cols = list(X_train_t1.columns)
missing_flags = [c for c in all_train_cols if c.endswith(MISSING_SUFFIX)]
missing_flag_set = set(missing_flags)

# Map each base column to its flag. A flag whose stripped name is itself a
# flag (e.g. "x_is_missing_is_missing") is skipped, as is a flag whose base
# column is absent from TRAIN.
base_to_missing = {}
for flag in missing_flags:
    base = flag[:-len(MISSING_SUFFIX)]
    if base in missing_flag_set:
        continue
    if base in all_train_cols:
        base_to_missing[base] = flag

print(f"[INFO] Found missingness flags: {len(missing_flags)}")

# -------------------------
# 2) Build matrices for FS (exclude flags)
# -------------------------
# Flags are left out of the ranking and re-attached in step 8 for any base
# column that gets selected. The target / weight names are excluded defensively.
nonflag_cols = [c for c in all_train_cols if c not in missing_flag_set and c != TARGET_T1 and c != "sample_weight"]

Xtr = X_train_t1.loc[:, nonflag_cols].copy()
Xva = X_valid_t1.loc[:, nonflag_cols].copy()

ytr = y_train_t1.astype(float).copy()
yva = y_valid_t1.astype(float).copy()

# NOTE(review): _to_np is defined elsewhere; presumably converts to a NumPy array.
wtr = _to_np(w_train)
wva = _to_np(w_valid)

# -------------------------
# 3) Drop constant TRAIN columns
# -------------------------
# NaN counts as a distinct value (dropna=False), so an all-NaN column is constant.
nunique = Xtr.nunique(dropna=False)
const_cols = nunique[nunique <= 1].index.tolist()
if const_cols:
    print(f"[INFO] Dropping constant TRAIN columns: {len(const_cols)}")
    Xtr = Xtr.drop(columns=const_cols)
    Xva = Xva.drop(columns=const_cols, errors="ignore")

# -------------------------
# 4) Spearman filter
# -------------------------
# For every pair of features whose |Spearman| exceeds the threshold, drop the
# one that is less correlated with the target. Computed on TRAIN only.
SPEARMAN_THRESH = float(XGB_FS_CFG["spearman_thresh"])

# Undefined correlations (NaN) are treated as 0, i.e. "not redundant".
corr_ff = Xtr.corr(method="spearman").abs().fillna(0.0)
cols = list(corr_ff.columns)

# |Spearman| of each feature with the target; NaN becomes 0 (weakest).
feat_to_y = Xtr.apply(lambda s: s.corr(ytr, method="spearman"))
feat_to_y_abs = feat_to_y.abs().fillna(0.0)

# Collect the upper-triangle pairs above the threshold (strictly greater).
# This is a Python-level O(n_features^2) scan.
pairs = []
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        cval = float(corr_ff.iat[i, j])
        if cval > SPEARMAN_THRESH:
            pairs.append((cols[i], cols[j], cval))
# Most-correlated pairs are resolved first (stable sort keeps scan order on ties).
pairs.sort(key=lambda t: t[2], reverse=True)

active = set(cols)

for a, b, cval in pairs:
    # A pair is only resolved while both members survive; once one member has
    # been dropped by an earlier pair, the other is kept.
    if (a in active) and (b in active):
        ay = float(feat_to_y_abs.get(a, 0.0))
        by = float(feat_to_y_abs.get(b, 0.0))
        # Drop the weaker predictor; on a tie the second column (b) is dropped.
        drop = a if ay < by else b
        active.remove(drop)

# Preserve the original column order among the survivors.
kept_cols = [c for c in cols if c in active]
print(f"[INFO] Spearman filter: start={len(cols)} | kept={len(kept_cols)} | thresh={SPEARMAN_THRESH}")

X_train_fs = Xtr.loc[:, kept_cols].copy()
X_valid_fs = Xva.loc[:, kept_cols].copy()

# -------------------------
# 5) XGBoost FS model (GAIN)
# -------------------------
# A single model fitted on the de-correlated features; it is used only to
# rank features (gain here, permutation importance in step 6).
xgb_params = dict(
    n_estimators=int(XGB_FS_CFG["n_estimators"]),
    learning_rate=float(XGB_FS_CFG["learning_rate"]),
    max_depth=int(XGB_FS_CFG["max_depth"]),
    min_child_weight=float(XGB_FS_CFG["min_child_weight"]),
    gamma=float(XGB_FS_CFG["gamma"]),
    subsample=float(XGB_FS_CFG["subsample"]),
    colsample_bytree=float(XGB_FS_CFG["colsample_bytree"]),
    reg_alpha=float(XGB_FS_CFG["reg_alpha"]),
    reg_lambda=float(XGB_FS_CFG["reg_lambda"]),
    max_delta_step=float(XGB_FS_CFG["max_delta_step"]),
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=int(XGB_FS_CFG["random_state"]),
    n_jobs=-1,
    early_stopping_rounds=int(XGB_FS_CFG["early_stopping_rounds"]),
)

print("[INFO] Training XGBoost for feature selection...")
model_fs = xgb.XGBRegressor(**xgb_params)
# y is re-aligned to X by index label; the weights are plain arrays and are
# matched by position. The VALID split is the early-stopping eval set.
model_fs.fit(
    X_train_fs, ytr.loc[X_train_fs.index],
    sample_weight=wtr,
    eval_set=[(X_valid_fs, yva.loc[X_valid_fs.index])],
    sample_weight_eval_set=[wva],
    verbose=False
)

# getattr with a default: these attributes may be absent on some fits.
best_iter = getattr(model_fs, "best_iteration", None)
best_score = getattr(model_fs, "best_score", None)
print(f"[INFO] XGB train done. best_iteration={best_iter} | best_score(valid_rmse)={best_score}")

booster = model_fs.get_booster()
score_gain = booster.get_score(importance_type="gain")

# Features missing from score_gain are given gain 0.
imp_gain = pd.DataFrame({"feature": kept_cols})
imp_gain["gain"] = imp_gain["feature"].map(score_gain).fillna(0.0).astype(float)
# After reset_index the row position equals the rank (0 = highest gain).
imp_gain = imp_gain.sort_values("gain", ascending=False).reset_index(drop=True)

# Normalized and cumulative gain; EPS guards against division by a zero total.
gain_sum = float(imp_gain["gain"].sum())
imp_gain["gain_frac"] = imp_gain["gain"] / (gain_sum + EPS)
imp_gain["cum_gain"] = imp_gain["gain_frac"].cumsum()

# -------------------------
# 6) Permutation importance on VALID
# -------------------------
def neg_weighted_rmse_scorer(estimator, X, y):
    """Score ``estimator`` on ``(X, y)`` as the negated ``w_rmse``.

    The weights are always the module-level VALID weights ``wva``, so ``X``
    and ``y`` must be the VALID rows in their original order. The sign is
    flipped so that a larger score means a smaller error.
    """
    predictions = estimator.predict(X)
    weighted_error = w_rmse(y, predictions, wva)
    return -weighted_error


PERM_REPEATS = int(XGB_FS_CFG["perm_repeats"])

# Permutation importance on VALID using the custom weighted scorer. Per the
# scikit-learn definition an importance is the drop in score after shuffling
# a column, so a positive value means the feature helped.
print(f"[INFO] Computing permutation importance (repeats={PERM_REPEATS})...")
perm = permutation_importance(
    model_fs,
    X_valid_fs,
    yva.loc[X_valid_fs.index],
    scoring=neg_weighted_rmse_scorer,
    n_repeats=PERM_REPEATS,
    random_state=int(XGB_FS_CFG["random_state"]),
    n_jobs=-1
)

# One row per feature, in X_valid_fs column order.
perm_df = pd.DataFrame({
    "feature": X_valid_fs.columns,
    "perm_importance_mean": perm.importances_mean,
    "perm_importance_std": perm.importances_std
}).reset_index(drop=True)

# Attach the gain statistics so both importance views live in one table.
perm_df = perm_df.merge(
    imp_gain.loc[:, ["feature", "gain", "gain_frac", "cum_gain"]],
    on="feature",
    how="left"
)

# -------------------------
# 7) Selection policy + fallback ladder
# -------------------------
# Start strict and relax step by step until at least MIN_FEATURES survive.
GAIN_CUM_THRESH = float(XGB_FS_CFG["gain_cum_thresh"])
MIN_FEATURES = int(XGB_FS_CFG["min_features"])
NEG_SIGMA = float(XGB_FS_CFG["neg_sigma"])
POS_SIGMA = float(XGB_FS_CFG["pos_sigma"])
MIN_GAIN = float(XGB_FS_CFG["min_gain"])

# "Strongly negative": mean importance below zero by more than NEG_SIGMA
# standard deviations (EPS keeps the test meaningful when std is 0).
perm_df["perm_strongly_negative"] = (
    (perm_df["perm_importance_mean"] < 0) &
    (np.abs(perm_df["perm_importance_mean"]) > (NEG_SIGMA * (perm_df["perm_importance_std"] + EPS)))
)

# "Confidently positive": mean importance above zero by more than POS_SIGMA
# standard deviations.
perm_df["perm_confident_positive"] = (
    (perm_df["perm_importance_mean"] > 0) &
    (perm_df["perm_importance_mean"] > (POS_SIGMA * (perm_df["perm_importance_std"] + EPS)))
)

strong_neg_set = set(perm_df.loc[perm_df["perm_strongly_negative"], "feature"].tolist())

# Candidates: features within the cumulative-gain budget, plus the top
# MIN_FEATURES rows by gain (imp_gain's index is the gain rank).
gain_candidates = imp_gain.loc[
    (imp_gain["cum_gain"] <= GAIN_CUM_THRESH) | (imp_gain.index < MIN_FEATURES),
    "feature"
].tolist()
print(f"[INFO] Candidates by GAIN: cum<={GAIN_CUM_THRESH} with min {MIN_FEATURES} => {len(gain_candidates)}")

# Per-feature lookup table for the loops below.
perm_map = perm_df.set_index("feature")[["perm_confident_positive", "perm_importance_mean", "gain"]]

# Strict pass: not strongly negative, gain > MIN_GAIN, confidently positive.
selected = []
for f in gain_candidates:
    if f in strong_neg_set:
        continue
    g = float(perm_map.loc[f, "gain"]) if f in perm_map.index else 0.0
    if g <= MIN_GAIN:
        continue
    is_pos = bool(perm_map.loc[f, "perm_confident_positive"]) if f in perm_map.index else False
    if not is_pos:
        continue
    selected.append(f)

print(f"[INFO] Selected after strict filters: {len(selected)}")

# Fallback 1: relax to perm_mean > 0 (drop the POS_SIGMA confidence requirement)
if len(selected) < MIN_FEATURES:
    print("[WARN] Too few features; relaxing to perm_mean > 0.")
    selected = []
    for f in gain_candidates:
        if f in strong_neg_set:
            continue
        g = float(perm_map.loc[f, "gain"]) if f in perm_map.index else 0.0
        if g <= MIN_GAIN:
            continue
        pm = float(perm_map.loc[f, "perm_importance_mean"]) if f in perm_map.index else -1.0
        if pm <= 0:
            continue
        selected.append(f)
    print(f"[INFO] Selected after relaxed filter: {len(selected)}")

# Fallback 2: gain candidates excluding strong-neg (no permutation requirement)
if len(selected) < MIN_FEATURES:
    print("[WARN] Still too few; final fallback to GAIN candidates excluding strongly-negative.")
    selected = []
    for f in gain_candidates:
        if f in strong_neg_set:
            continue
        g = float(perm_map.loc[f, "gain"]) if f in perm_map.index else 0.0
        if g <= MIN_GAIN:
            continue
        selected.append(f)
    print(f"[INFO] Selected after final fallback: {len(selected)}")

# Fallback 3: GUARANTEE min_features - take top by XGB GAIN
if len(selected) < MIN_FEATURES:
    print(f"[WARN] Final fallback: taking top {MIN_FEATURES} features by XGB GAIN.")
    # imp_gain is already sorted by gain (highest first), so take its first
    # MIN_FEATURES rows. NOTE: this ignores gain_candidates, strong_neg_set and
    # MIN_GAIN, and still yields fewer rows if imp_gain itself is shorter.
    top_by_gain = imp_gain.head(MIN_FEATURES)["feature"].tolist()
    selected = top_by_gain
    print(f"[INFO] Selected after GAIN-only fallback: {len(selected)}")

# -------------------------
# 8) Dependency closure: add X_is_missing if exists
# -------------------------
# The flags were excluded from ranking in step 2; each selected base column
# gets its missingness flag appended after the ranked features.
selected_set = set(selected)
flags_added = []

for base, flag in base_to_missing.items():
    if (base in selected_set) and (flag in X_train_t1.columns) and (flag not in selected_set):
        selected.append(flag)
        selected_set.add(flag)
        flags_added.append(flag)

print(f"[INFO] Missingness flags added: {len(flags_added)}")

# Order-preserving de-duplication.
selected_final = list(dict.fromkeys(selected))
print(f"[INFO] FINAL selected features: {len(selected_final)}")

# -------------------------
# 9) Save outputs (LOCAL + DRIVE)
# -------------------------
# Selected features: the same list in three formats (txt, pickle, csv).
sel_txt = FS_OUT_LOCAL / "selected_features_xgb.txt"
sel_pkl = FS_OUT_LOCAL / "selected_features_xgb.pkl"
sel_csv = FS_OUT_LOCAL / "selected_features_xgb.csv"

sel_txt.write_text("\n".join(selected_final), encoding="utf-8")
save_pickle(selected_final, sel_pkl)
pd.DataFrame({"feature": selected_final}).to_csv(sel_csv, index=False)

copy_file(sel_txt, FS_OUT_DRIVE / sel_txt.name)
copy_file(sel_pkl, FS_OUT_DRIVE / sel_pkl.name)
copy_file(sel_csv, FS_OUT_DRIVE / sel_csv.name)

# Importance tables
imp_gain_csv = FS_OUT_LOCAL / "feature_importance_gain.csv"
perm_csv = FS_OUT_LOCAL / "feature_importance_permutation_valid.csv"

imp_gain.to_csv(imp_gain_csv, index=False)
# Written sorted by mean permutation importance; perm_df itself is not reordered.
perm_df.sort_values("perm_importance_mean", ascending=False).to_csv(perm_csv, index=False)

copy_file(imp_gain_csv, FS_OUT_DRIVE / imp_gain_csv.name)
copy_file(perm_csv, FS_OUT_DRIVE / perm_csv.name)

# Create and save filtered matrices (xgb_selected) to data/processed/
DRIVE_PROC_DATA_DIR = DATA_DIRS_DRIVE["processed"]

# Subset the full matrices (not the FS copies) so the re-attached missingness
# flags are included. TEST is only sliced here; it played no part in selection.
X_train_xgb_selected = X_train_t1.loc[:, selected_final].copy()
X_valid_xgb_selected = X_valid_t1.loc[:, selected_final].copy()
X_test_xgb_selected = X_test_t1.loc[:, selected_final].copy()

X_train_xgb_selected.to_pickle(PROC_DATA_DIR / "X_train_xgb_selected.pkl")
X_valid_xgb_selected.to_pickle(PROC_DATA_DIR / "X_valid_xgb_selected.pkl")
X_test_xgb_selected.to_pickle(PROC_DATA_DIR / "X_test_xgb_selected.pkl")

copy_file(PROC_DATA_DIR / "X_train_xgb_selected.pkl", DRIVE_PROC_DATA_DIR / "X_train_xgb_selected.pkl")
copy_file(PROC_DATA_DIR / "X_valid_xgb_selected.pkl", DRIVE_PROC_DATA_DIR / "X_valid_xgb_selected.pkl")
copy_file(PROC_DATA_DIR / "X_test_xgb_selected.pkl", DRIVE_PROC_DATA_DIR / "X_test_xgb_selected.pkl")

print("\n[OK] Saved XGB Feature Selection artifacts:")
print("  -", sel_txt.name)
print("  -", sel_pkl.name)
print("  -", sel_csv.name)
print("  -", imp_gain_csv.name)
print("  -", perm_csv.name)
print(f"[OK] Saved to {PROC_DATA_DIR}:")
print(f"  - X_train_xgb_selected.pkl ({X_train_xgb_selected.shape})")
print(f"  - X_valid_xgb_selected.pkl ({X_valid_xgb_selected.shape})")
print(f"  - X_test_xgb_selected.pkl ({X_test_xgb_selected.shape})")

print("[OK] BLOCK 23 complete.")

---
# SECTION 6: XGBoost Hyperparameter Optimization

**3-stage HPO: Broad → Refine → Low-LR**

**Block:** 24

## BLOCK 24 — XGB HYPERPARAMETER OPTIMIZATION (HPO)

In [None]:

# Config
HPO_CFG = RUN_PARAMS["hpo"]

# Load selected features from BLOCK 23 (with fallback).
# NOTE(review): selected_features is only counted in this part of the cell;
# the matrices loaded further down are the already-filtered *_xgb_selected files.
FS_DIR_LOCAL = Path(LOCAL_PATHS["fs_dir"])
FS_DIR_DRIVE = Path(DRIVE_PATHS["fs_dir"])
FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]

selected_features = load_with_fallback(
    "selected_features_xgb.pkl",
    FS_DIR_LOCAL,           # 1. runs/RUN_ID/feature_selection (LOCAL)
    FALLBACK_PROC_LOCAL,    # 2. data/processed (LOCAL)
    FS_DIR_DRIVE,           # 3. runs/RUN_ID/feature_selection (DRIVE)
    FALLBACK_PROC_DRIVE     # 4. data/processed (DRIVE)
)
print(f"[INFO] Loaded {len(selected_features)} selected features")
# Directories

# --- Fallback Logic: Try RUN_ID first, then data/processed ---
def get_data_path(filename, run_local, fallback_local, run_drive=None, fallback_drive=None):
    """Return the first existing ``<directory>/filename`` among the candidates.

    Directories are tried in the order ``run_local``, ``fallback_local``,
    ``run_drive``, ``fallback_drive``; the two Drive directories are skipped
    when falsy. The location that matched is printed.

    Raises:
        FileNotFoundError: if ``filename`` exists in none of the directories.
    """
    locations = [
        (run_local, "RUN_ID (LOCAL)"),
        (fallback_local, "data/processed (LOCAL)"),
    ]
    if run_drive:
        locations.append((run_drive, "RUN_ID (DRIVE)"))
    if fallback_drive:
        locations.append((fallback_drive, "data/processed (DRIVE)"))

    candidates = [(Path(directory) / filename, label) for directory, label in locations]

    for candidate, label in candidates:
        if not candidate.exists():
            continue
        print(f"  [LOAD] {filename} <- {label}")
        return candidate

    raise FileNotFoundError(f"{filename} not found in any location")

# Search locations for the split files, passed to get_data_path in this order.
RUN_PROC_LOCAL = Path(LOCAL_PATHS["proc_dir"])
FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
RUN_PROC_DRIVE = Path(DRIVE_PATHS["proc_dir"])
FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]
print("[INFO] Data loading with fallback enabled")

# Output directories for HPO artifacts (local + Drive mirror).
MS_OUT_LOCAL = ensure_dir(Path(MS_DIR))
MS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["ms_dir"]))

print("[INFO] HPO output dirs:")
print("  - LOCAL:", MS_OUT_LOCAL)
print("  - DRIVE:", MS_OUT_DRIVE)

# -------------------------
# Load splits (TRAIN + VALID only) from the first location that has them
# -------------------------
# TEST is not loaded in this part of the cell.
print("[INFO] Loading splits with fallback logic...")

X_train_sel = pd.read_pickle(get_data_path("X_train_xgb_selected.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE))
X_valid_sel = pd.read_pickle(get_data_path("X_valid_xgb_selected.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE))

y_train_t1 = load_pickle(get_data_path("y_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE))
y_valid_t1 = load_pickle(get_data_path("y_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE))

w_train = load_pickle(get_data_path("weights_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE))
w_valid = load_pickle(get_data_path("weights_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE))

print(f"[INFO] Loaded: X_train_sel={X_train_sel.shape} | X_valid_sel={X_valid_sel.shape}")

# Working copies as float; the *_t1 objects loaded above stay untouched.
y_train = y_train_t1.astype(float).copy()
y_valid = y_valid_t1.astype(float).copy()

# Weights as plain float arrays: they are aligned to X by position, not by label.
w_train_arr = np.asarray(w_train, dtype=float)
w_valid_arr = np.asarray(w_valid, dtype=float)

print("[INFO] Shapes (selected):", X_train_sel.shape, X_valid_sel.shape)

# -------------------------
# Align VALID data with Neural Networks (skip first lookback-1 rows)
# -------------------------
# The first lookback-1 VALID rows are dropped so XGBoost is evaluated on the
# same rows as the sequence models.
LOOKBACK = int(HPO_CFG["lookback"])
# Clamp at 0: with lookback < 1 the raw value would be negative, and a negative
# slice start (e.g. iloc[-1:]) silently keeps only the LAST rows instead of
# skipping none.
SKIP_ROWS = max(LOOKBACK - 1, 0)

# X, y and the weight array are trimmed identically to stay row-aligned.
X_valid_sel = X_valid_sel.iloc[SKIP_ROWS:]
y_valid = y_valid.iloc[SKIP_ROWS:]
w_valid_arr = w_valid_arr[SKIP_ROWS:]

print(f"[INFO] Aligned VALID with lookback={LOOKBACK}: skipped first {SKIP_ROWS} rows")
print(f"[INFO] VALID shape after alignment: {X_valid_sel.shape}")


# -------------------------
# VALID split for ES vs SCORE (date-based)
# -------------------------
# Two date windows inside VALID: one drives early stopping, the other is used
# to score and compare configurations.
VALID_ES_START = HPO_CFG["valid_es_start"]
VALID_ES_END = HPO_CFG["valid_es_end"]
VALID_SCORE_START = HPO_CFG["valid_score_start"]
VALID_SCORE_END = HPO_CFG["valid_score_end"]


def split_valid_for_es_and_score(Xv: pd.DataFrame, yv: pd.Series, wv: np.ndarray,
                                  valid_es_start: str, valid_es_end: str,
                                  valid_score_start: str, valid_score_end: str):
    """Carve VALID into an early-stopping window and a model-selection window.

    Both windows are closed date intervals evaluated against ``Xv.index``. The
    weights ``wv`` are positional and are assumed to line up row-for-row with
    ``Xv``.

    Returns:
        ``((X_es, y_es, w_es), (X_sc, y_sc, w_sc), mode)``. ``mode`` describes
        the windows used, or is ``"FULL_VALID"`` when the index is not a
        ``DatetimeIndex`` or either window selects no rows; in that case both
        triples are the unsplit inputs.
    """
    unsplit = ((Xv, yv, wv), (Xv, yv, wv), "FULL_VALID")
    if not isinstance(Xv.index, pd.DatetimeIndex):
        return unsplit

    dates = Xv.index
    windows = {
        "es": (pd.Timestamp(valid_es_start), pd.Timestamp(valid_es_end)),
        "score": (pd.Timestamp(valid_score_start), pd.Timestamp(valid_score_end)),
    }
    selected = {
        name: (dates >= first) & (dates <= last)
        for name, (first, last) in windows.items()
    }

    # Give the positional weights the date index so they can be masked like X and y.
    weights = pd.Series(wv, index=dates)

    def _take(name):
        keep = selected[name]
        return Xv.loc[keep], yv.loc[keep], weights.loc[keep].to_numpy(dtype=float)

    es_part = _take("es")
    score_part = _take("score")

    if len(es_part[0]) == 0 or len(score_part[0]) == 0:
        return unsplit

    label = f"VALID_ES={valid_es_start}:{valid_es_end} / VALID_SCORE={valid_score_start}:{valid_score_end}"
    return es_part, score_part, label


# Split the lookback-aligned VALID data: the ES part is the eval_set used for early
# stopping in run_trial, the SCORE part is what trials are ranked on.
(X_valid_es, y_valid_es, w_valid_es), (X_valid_sc, y_valid_sc, w_valid_sc), valid_mode = split_valid_for_es_and_score(
    X_valid_sel, y_valid, w_valid_arr,
    VALID_ES_START, VALID_ES_END, VALID_SCORE_START, VALID_SCORE_END
)

# valid_mode is "FULL_VALID" when the splitter fell back to using all of VALID for both roles.
print("[INFO] VALID mode:", valid_mode)
print("[INFO] VALID_ES shape:", X_valid_es.shape, "| VALID_SCORE shape:", X_valid_sc.shape)


# -------------------------
# Weighted metrics
# -------------------------
# Metric functions (w_rmse, w_mae, dir_acc) defined in Cell 5


# -------------------------
# Search configuration from RUN_PARAMS
# -------------------------
# A single seeded generator drives every sampler below, so the sequence of sampled
# hyperparameters depends on the order in which the samplers are called.
RANDOM_SEED = int(HPO_CFG["random_state"])
rng = np.random.default_rng(RANDOM_SEED)

# Boosting-round cap and early-stopping patience shared by all trials.
N_ESTIMATORS = int(HPO_CFG["n_estimators"])
EARLY_STOP = int(HPO_CFG["early_stopping_rounds"])

# Trial budget per stage.
N_TRIALS_STAGE1 = int(HPO_CFG["n_trials_stage1"])
N_TRIALS_STAGE2 = int(HPO_CFG["n_trials_stage2"])
N_TRIALS_STAGE2_LOWLR = int(HPO_CFG["n_trials_stage2_lowlr"])

# Progress-line frequency (PRINT_EVERY_STAGE2 is also used by the low-LR stage).
PRINT_EVERY_STAGE1 = int(HPO_CFG["print_every_stage1"])
PRINT_EVERY_STAGE2 = int(HPO_CFG["print_every_stage2"])

# Two VALID_SCORE wRMSE values within this tolerance are treated as tied by is_better.
TIE_TOL = float(HPO_CFG["tie_tol"])

# Constructor arguments common to every trial; the sampled hyperparameters are merged
# in per trial by run_trial.
BASE_MODEL_CFG = dict(
    n_estimators=N_ESTIMATORS,
    objective=HPO_CFG["objective"],
    eval_metric=HPO_CFG["eval_metric"],
    tree_method=HPO_CFG["tree_method"],
    random_state=RANDOM_SEED,
    n_jobs=-1,
    verbosity=0,
    early_stopping_rounds=EARLY_STOP,
)


# -------------------------
# Parameter sampling (from RUN_PARAMS["hpo"]["sampling"])
# -------------------------
# One sub-config per sampler: broad (stage 1), refine (stage 2), refine_low_lr (stage 2B).
SAMP_CFG = HPO_CFG["sampling"]
BROAD_CFG = SAMP_CFG["broad"]
REFINE_CFG = SAMP_CFG["refine"]
LOWLR_CFG = SAMP_CFG["refine_low_lr"]


def _clip(v, lo, hi) -> float:
    return float(min(max(float(v), lo), hi))


def _log_uniform(rng, lo_exp, hi_exp) -> float:
    return float(10 ** rng.uniform(lo_exp, hi_exp))


def sample_broad(rng, broad_cfg):
    """Draw one stage-1 hyperparameter set from the wide search ranges.

    The learning rate is drawn from one of two bands: with probability
    ``1 - lr_high_prob`` log-uniformly from ``lr_low``, otherwise uniformly from
    ``lr_high``. All other parameters are drawn independently from their
    configured ranges. Random draws happen in a fixed order, so results are
    reproducible for a given generator state.
    """
    p_high = broad_cfg["lr_high_prob"]
    use_low_band = rng.random() < (1 - p_high)
    if use_low_band:
        band_lo, band_hi = broad_cfg["lr_low"]
        learning_rate = float(np.exp(rng.uniform(np.log(band_lo), np.log(band_hi))))
    else:
        band_lo, band_hi = broad_cfg["lr_high"]
        learning_rate = float(rng.uniform(band_lo, band_hi))

    # rng.integers excludes the upper bound, so max_depth < broad_cfg["max_depth"][1].
    depth_lo, depth_hi = broad_cfg["max_depth"]
    sampled = {
        "max_depth": int(rng.integers(depth_lo, depth_hi)),
        "learning_rate": learning_rate,
    }

    # Log-uniform on the natural-log scale.
    mcw_lo, mcw_hi = broad_cfg["min_child_weight_log"]
    sampled["min_child_weight"] = float(np.exp(rng.uniform(np.log(mcw_lo), np.log(mcw_hi))))

    # Plain uniform ranges.
    for name in ("subsample", "colsample_bytree", "gamma"):
        lo, hi = broad_cfg[name]
        sampled[name] = float(rng.uniform(lo, hi))

    # Regularisation strengths: ranges are given as base-10 exponents.
    for name, range_key in (("reg_alpha", "reg_alpha_exp"), ("reg_lambda", "reg_lambda_exp")):
        exp_lo, exp_hi = broad_cfg[range_key]
        sampled[name] = _log_uniform(rng, exp_lo, exp_hi)

    step_lo, step_hi = broad_cfg["max_delta_step"]
    sampled["max_delta_step"] = float(rng.uniform(step_lo, step_hi))
    return sampled


def _log_jitter(rng, v, sigma=0.6, lo_exp=-12, hi_exp=2):
    v = float(max(v, 1e-12))
    logv = np.log10(v) + rng.normal(0.0, sigma)
    logv = float(np.clip(logv, lo_exp, hi_exp))
    return float(10 ** logv)


def sample_refine(rng, best, refine_cfg):
    """Draw a hyperparameter set in the neighbourhood of ``best``.

    Each parameter is perturbed independently and then limited to the range
    given in ``refine_cfg``:

    * ``max_depth``: integer step from ``rng.integers`` (upper bound exclusive);
    * ``learning_rate``, ``min_child_weight``: multiplied by ``exp(N(0, sigma))``;
    * ``subsample``, ``colsample_bytree``, ``gamma``, ``max_delta_step``:
      additive ``N(0, sigma)`` noise;
    * ``reg_alpha``, ``reg_lambda``: Gaussian noise on the log10 scale.

    Random draws happen in the order the keys appear in the returned dict.
    """
    candidate = {}

    depth_step_range = refine_cfg["max_depth_delta"]
    depth_bounds = refine_cfg["max_depth_clip"]
    depth_step = rng.integers(depth_step_range[0], depth_step_range[1])
    shifted_depth = int(best["max_depth"] + depth_step)
    candidate["max_depth"] = int(np.clip(shifted_depth, depth_bounds[0], depth_bounds[1]))

    # Multiplicative log-normal jitter.
    multiplicative = (
        ("learning_rate", "lr_sigma", "lr_clip"),
        ("min_child_weight", "min_child_weight_sigma", "min_child_weight_clip"),
    )
    for name, sigma_key, clip_key in multiplicative:
        bounds = refine_cfg[clip_key]
        factor = float(np.exp(rng.normal(0.0, refine_cfg[sigma_key])))
        candidate[name] = _clip(best[name] * factor, bounds[0], bounds[1])

    # Additive Gaussian jitter.
    additive = (
        ("subsample", "subsample_sigma", "subsample_clip"),
        ("colsample_bytree", "colsample_sigma", "colsample_clip"),
        ("gamma", "gamma_sigma", "gamma_clip"),
    )
    for name, sigma_key, clip_key in additive:
        bounds = refine_cfg[clip_key]
        noise = rng.normal(0.0, refine_cfg[sigma_key])
        candidate[name] = _clip(best[name] + noise, bounds[0], bounds[1])

    # Jitter on the log10 scale; the clip entries are exponent bounds.
    log_scaled = (
        ("reg_alpha", "reg_alpha_sigma", "reg_alpha_exp_clip"),
        ("reg_lambda", "reg_lambda_sigma", "reg_lambda_exp_clip"),
    )
    for name, sigma_key, clip_key in log_scaled:
        exp_bounds = refine_cfg[clip_key]
        candidate[name] = _log_jitter(
            rng, best[name],
            sigma=refine_cfg[sigma_key], lo_exp=exp_bounds[0], hi_exp=exp_bounds[1],
        )

    step_bounds = refine_cfg["max_delta_step_clip"]
    step_noise = rng.normal(0.0, refine_cfg["max_delta_step_sigma"])
    candidate["max_delta_step"] = _clip(best["max_delta_step"] + step_noise, step_bounds[0], step_bounds[1])
    return candidate


def sample_refine_low_lr(rng, best, refine_cfg, lowlr_cfg):
    """Like ``sample_refine``, but with the learning rate drawn from a shifted band.

    The learning-rate factor is ``exp(N(lr_shift, lr_sigma))`` with ``lr_shift``
    and the clip range taken from ``lowlr_cfg``. It is drawn before the regular
    refine sample, whose own learning rate is then overwritten.
    """
    clip_lo, clip_hi = lowlr_cfg["lr_clip"][0], lowlr_cfg["lr_clip"][1]
    factor = float(np.exp(rng.normal(lowlr_cfg["lr_shift"], refine_cfg["lr_sigma"])))
    shifted_lr = _clip(best["learning_rate"] * factor, clip_lo, clip_hi)

    refined = sample_refine(rng, best, refine_cfg)
    return {**refined, "learning_rate": shifted_lr}


# -------------------------
# Single trial runner
# -------------------------
def run_trial(trial_id: int, stage: str, params: dict):
    """Fit one XGBoost candidate and score it on both validation windows.

    The model is trained on the module-level TRAIN data with sample weights and
    uses VALID_ES as its weighted eval set. It is then scored on VALID_ES and
    VALID_SCORE with the weighted metrics.

    Returns:
        ``(model, row)`` where ``row`` is a flat dict of stage, trial id, metrics,
        early-stopping info, fit time and the sampled ``params``.
    """
    estimator = xgb.XGBRegressor(**BASE_MODEL_CFG, **params)

    started = time.time()
    estimator.fit(
        X_train_sel,
        y_train,
        sample_weight=w_train_arr,
        eval_set=[(X_valid_es, y_valid_es)],
        sample_weight_eval_set=[w_valid_es],
        verbose=False,
    )
    fit_seconds = time.time() - started

    # Absent attributes are recorded as None.
    stopped_at = getattr(estimator, "best_iteration", None)
    es_eval_score = getattr(estimator, "best_score", None)

    es_predictions = estimator.predict(X_valid_es)
    es_error = w_rmse(y_valid_es, es_predictions, w_valid_es)

    score_predictions = estimator.predict(X_valid_sc)
    score_error = w_rmse(y_valid_sc, score_predictions, w_valid_sc)

    record = {"stage": stage, "trial": int(trial_id)}
    record["valid_sc_wrmse"] = score_error
    record["valid_sc_wmae"] = w_mae(y_valid_sc, score_predictions, w_valid_sc)
    record["valid_sc_diracc"] = dir_acc(y_valid_sc, score_predictions)
    record["best_iteration"] = int(stopped_at) if stopped_at is not None else None
    record["best_score_rmse_eval_es"] = float(es_eval_score) if es_eval_score is not None else None
    record["valid_es_wrmse_explicit"] = float(es_error)
    record["elapsed_sec"] = float(fit_seconds)
    # Sampled hyperparameters go last, one column each.
    record.update(params)
    return estimator, record


# -------------------------
# HPO loop (Stage 1 + Stage 2 + Stage 2 LOW-LR)
# -------------------------
# One result row per trial, appended in execution order across all stages.
results = []
# Running incumbent: fitted model, its result row, and its VALID_SCORE wRMSE.
# best_row stays None until the first trial has been accepted.
best_model = None
best_row = None
best_valid = np.inf

# Hyperparameter columns copied out of a result row to rebuild a parameter dict.
best_params_keys = [
    "max_depth", "learning_rate", "min_child_weight", "subsample", "colsample_bytree",
    "gamma", "reg_alpha", "reg_lambda", "max_delta_step"
]


def is_better(row, best_row, tie_tol):
    """Decide whether ``row`` should replace ``best_row`` as the best trial.

    Args:
        row: Result dict of the candidate trial; must contain ``valid_sc_wrmse``
            and ``best_iteration`` (which may be ``None``).
        best_row: Result dict of the current incumbent, or ``None`` when no trial
            has been accepted yet.
        tie_tol: Absolute wRMSE tolerance within which two trials count as tied.

    Returns:
        ``True`` if there is no incumbent, if the candidate's wRMSE is lower by
        more than ``tie_tol``, or if the two are tied and the candidate stopped
        at a higher ``best_iteration``. A NaN candidate never beats an existing
        incumbent, and any non-NaN candidate beats a NaN incumbent.
    """
    if best_row is None:
        return True

    a = float(row["valid_sc_wrmse"])
    b = float(best_row["valid_sc_wrmse"])

    # Every ordered comparison against NaN is False. Without these two guards a
    # NaN incumbent (e.g. a diverged first trial) could never be replaced.
    if np.isnan(a):
        return False
    if np.isnan(b):
        return True

    if a < (b - tie_tol):
        return True
    if abs(a - b) <= tie_tol:
        # Tie-break: prefer the trial with the higher best_iteration; a missing
        # iteration ranks lowest.
        ai = -1 if row["best_iteration"] is None else int(row["best_iteration"])
        bi = -1 if best_row["best_iteration"] is None else int(best_row["best_iteration"])
        return ai > bi
    return False


print(f"\n[INFO] HPO start | VALID mode: {valid_mode}")
print(f"[INFO] Stage1 trials={N_TRIALS_STAGE1} | Stage2 trials={N_TRIALS_STAGE2} | Stage2 LOW-LR={N_TRIALS_STAGE2_LOWLR}")
print(f"[INFO] n_estimators={N_ESTIMATORS} | early_stop={EARLY_STOP}")

# All three stages share one pattern: sample params, fit/score via run_trial, record
# the row, and promote the trial to incumbent when is_better says so. Trial ids
# restart at 0 in every stage, so (stage, trial) together identify a row.

# Stage 1 (broad)
for i in range(N_TRIALS_STAGE1):
    params = sample_broad(rng, BROAD_CFG)
    model_i, row_i = run_trial(trial_id=i, stage="STAGE1_BROAD", params=params)
    results.append(row_i)

    if is_better(row_i, best_row, TIE_TOL):
        best_model = model_i
        best_row = row_i
        best_valid = float(best_row["valid_sc_wrmse"])

    # The printed best_iter is the current trial's, not the incumbent's.
    if (i + 1) % PRINT_EVERY_STAGE1 == 0 or i == 0:
        print(f"[INFO] S1 {i:04d} | sc_wrmse={row_i['valid_sc_wrmse']:.6f} | best={best_valid:.6f} | best_iter={row_i['best_iteration']}")

# Centre of both refinement stages: the best parameters found in stage 1.
# NOTE(review): best_row is still None if N_TRIALS_STAGE1 is 0, in which case this
# line raises TypeError — confirm the config always requests at least one trial.
best_params_stage1 = {k: best_row[k] for k in best_params_keys}

# Stage 2 (refine around best)
for j in range(N_TRIALS_STAGE2):
    params = sample_refine(rng, best_params_stage1, REFINE_CFG)
    model_j, row_j = run_trial(trial_id=j, stage="STAGE2_REFINE", params=params)
    results.append(row_j)

    if is_better(row_j, best_row, TIE_TOL):
        best_model = model_j
        best_row = row_j
        best_valid = float(best_row["valid_sc_wrmse"])

    if (j + 1) % PRINT_EVERY_STAGE2 == 0 or j == 0:
        print(f"[INFO] S2 {j:04d} | sc_wrmse={row_j['valid_sc_wrmse']:.6f} | best={best_valid:.6f} | best_iter={row_j['best_iteration']}")

# Stage 2B (LOW-LR refine branch)
# This stage is also centred on best_params_stage1, not on a better point that
# stage 2 may have found.
for k in range(N_TRIALS_STAGE2_LOWLR):
    params = sample_refine_low_lr(rng, best_params_stage1, REFINE_CFG, LOWLR_CFG)
    model_k, row_k = run_trial(trial_id=k, stage="STAGE2_LOWLR", params=params)
    results.append(row_k)

    if is_better(row_k, best_row, TIE_TOL):
        best_model = model_k
        best_row = row_k
        best_valid = float(best_row["valid_sc_wrmse"])

    if (k + 1) % PRINT_EVERY_STAGE2 == 0 or k == 0:
        print(f"[INFO] S2L {k:04d} | sc_wrmse={row_k['valid_sc_wrmse']:.6f} | best={best_valid:.6f} | best_iter={row_k['best_iteration']}")

# Leaderboard sorted by VALID_SCORE wRMSE only. Its first row can differ from best_row
# when the tie-break in is_better preferred a trial with a marginally higher wRMSE.
res_df = pd.DataFrame(results).sort_values("valid_sc_wrmse", ascending=True).reset_index(drop=True)

print("\n[INFO] Top 10 trials by VALID_SCORE wRMSE:")
display(res_df.head(10))

print("\n[INFO] BEST summary:")
# selected_features is defined outside this chunk.
best_summary = {
    "valid_mode": valid_mode,
    "n_features": int(len(selected_features)),
    "n_trials_total": int(len(res_df)),
    "best_valid_sc_wrmse": float(best_row["valid_sc_wrmse"]),
    "best_valid_sc_wmae": float(best_row["valid_sc_wmae"]),
    "best_valid_sc_diracc": float(best_row["valid_sc_diracc"]),
    "best_iteration": best_row["best_iteration"],
}
print(pd.Series(best_summary))

# Rebuild the winning parameter dict with plain Python types (int for max_depth,
# float for everything else).
best_params = {k: best_row[k] for k in best_params_keys}
best_params["max_depth"] = int(best_params["max_depth"])
for k in best_params_keys:
    if k != "max_depth":
        best_params[k] = float(best_params[k])

print("\n[INFO] BEST params:")
print(best_params)


# -------------------------
# Save outputs (LOCAL + DRIVE)
# -------------------------
# Output files of this block, written to the run's local model-selection directory.
# Each file is written locally first and then mirrored to Drive with copy_file.
BEST_MODEL_PATH = MS_OUT_LOCAL / "best_model_xgb_reg_t1.json"
BEST_PARAMS_TXT = MS_OUT_LOCAL / "best_params_xgb_reg_t1.txt"
BEST_PARAMS_PKL = MS_OUT_LOCAL / "best_params_xgb_reg_t1.pkl"

# Save model
# best_model is the fitted estimator of the winning trial, not a refit.
best_model.get_booster().save_model(str(BEST_MODEL_PATH))
copy_file(BEST_MODEL_PATH, MS_OUT_DRIVE / BEST_MODEL_PATH.name)

# Save params as text
# One key=value per line: run summary first, then the winning hyperparameters.
lines = [
    f"valid_mode={valid_mode}",
    f"best_valid_sc_wrmse={best_row['valid_sc_wrmse']}",
    f"best_valid_sc_wmae={best_row['valid_sc_wmae']}",
    f"best_valid_sc_diracc={best_row['valid_sc_diracc']}",
    f"best_iteration={best_row['best_iteration']}",
    f"n_estimators={N_ESTIMATORS}",
    f"early_stop={EARLY_STOP}",
    f"trials_stage1={N_TRIALS_STAGE1}",
    f"trials_stage2={N_TRIALS_STAGE2}",
    f"trials_stage2_lowlr={N_TRIALS_STAGE2_LOWLR}",
    f"random_seed={RANDOM_SEED}",
]
for k in best_params_keys:
    lines.append(f"{k}={best_params[k]}")

BEST_PARAMS_TXT.write_text("\n".join(lines), encoding="utf-8")
copy_file(BEST_PARAMS_TXT, MS_OUT_DRIVE / BEST_PARAMS_TXT.name)

# Save params as pickle
save_pickle(best_params, BEST_PARAMS_PKL)
copy_file(BEST_PARAMS_PKL, MS_OUT_DRIVE / BEST_PARAMS_PKL.name)

# Save ALSO to persistent location (for future runs without HPO)
save_pickle(best_params, DATA_DIRS_LOCAL["processed"] / "best_params_xgb_reg_t1.pkl")
copy_file(DATA_DIRS_LOCAL["processed"] / "best_params_xgb_reg_t1.pkl", 
          DATA_DIRS_DRIVE["processed"] / "best_params_xgb_reg_t1.pkl")

# Report everything under one header. The "(persistent)" line used to be printed
# before the header it belongs to.
print("\n[OK] Saved HPO outputs:")
print("  -", BEST_MODEL_PATH.name)
print("  -", BEST_PARAMS_TXT.name)
print("  -", BEST_PARAMS_PKL.name)
print("  - best_params_xgb_reg_t1.pkl (persistent)")

print("[OK] BLOCK 24 complete.")

---
# SECTION 7: XGBoost Final Model

**Train and evaluate best XGBoost model**

**Block:** 25

## BLOCK 25 — FINAL MODEL TRAIN + BASELINE COMPARISON

In [None]:

# Config
# The final model reuses the HPO section of the run config (lookback, ES/SCORE windows,
# XGBoost settings), which keeps it consistent with the search block.
HPO_CFG = RUN_PARAMS["hpo"]

# Directories
# Fallback: RUN_ID_LOCAL -> data/processed_LOCAL -> RUN_ID_DRIVE -> data/processed_DRIVE
RUN_PROC_LOCAL = Path(LOCAL_PATHS["proc_dir"])  # runs/RUN_ID/processed (LOCAL)
FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]  # data/processed (LOCAL)
RUN_PROC_DRIVE = Path(DRIVE_PATHS["proc_dir"])  # runs/RUN_ID/processed (DRIVE)
FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]  # data/processed (DRIVE)

RUN_MS_LOCAL = Path(LOCAL_PATHS["ms_dir"])  # runs/RUN_ID/model_selection (LOCAL)
RUN_MS_DRIVE = Path(DRIVE_PATHS["ms_dir"])  # runs/RUN_ID/model_selection (DRIVE)
# NOTE(review): ensure_dir is defined outside this chunk; presumably it creates the
# directory if missing and returns the path — confirm.
MODELS_OUT_LOCAL = ensure_dir(Path(MODELS_DIR))
MODELS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["models_dir"]))

print("[INFO] Final Model output dirs:")
print("  - LOCAL:", MODELS_OUT_LOCAL)
print("  - DRIVE:", MODELS_OUT_DRIVE)

# -------------------------
# 1. Load data from snapshot (data/processed/)
# -------------------------
print("\n[INFO] Loading data (RUN_ID -> data/processed fallback):")

# Selected-feature matrices for all three splits (TEST is loaded here, unlike in the HPO block).
X_train_sel = load_with_fallback("X_train_xgb_selected.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)
X_valid_sel = load_with_fallback("X_valid_xgb_selected.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)
X_test_sel = load_with_fallback("X_test_xgb_selected.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)

# Targets.
y_train_t1 = load_with_fallback("y_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
y_valid_t1 = load_with_fallback("y_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
y_test_t1 = load_with_fallback("y_test.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)

# Per-sample weights.
w_train = load_with_fallback("weights_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
w_valid = load_with_fallback("weights_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
w_test = load_with_fallback("weights_test.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)

# Tuned hyperparameters: the run-specific locations passed here are the model_selection
# directories, with data/processed as fallback (the HPO block writes a copy there too).
best_params = load_with_fallback("best_params_xgb_reg_t1.pkl", RUN_MS_LOCAL, FALLBACK_PROC_LOCAL, RUN_MS_DRIVE, FALLBACK_PROC_DRIVE)

print(f"[INFO] Loaded: X_train_sel={X_train_sel.shape} | X_valid_sel={X_valid_sel.shape} | X_test_sel={X_test_sel.shape}")

# Float copies of the targets; the loaded objects stay untouched.
y_train = y_train_t1.astype(float).copy()
y_valid = y_valid_t1.astype(float).copy()
y_test = y_test_t1.astype(float).copy()

# Weights as plain float arrays (positional, no index attached).
w_train_arr = np.asarray(w_train, dtype=float)
w_valid_arr = np.asarray(w_valid, dtype=float)
w_test_arr = np.asarray(w_test, dtype=float)

# -------------------------
# Align VALID + TEST data with Neural Networks (skip first lookback-1 rows)
# This ensures metrics are comparable across XGBoost and Neural models
# -------------------------
LOOKBACK = int(HPO_CFG["lookback"])
SKIP_ROWS = LOOKBACK - 1

# Align VALID
# X, y and w get the same positional slice so they stay row-aligned.
X_valid_sel = X_valid_sel.iloc[SKIP_ROWS:]
y_valid = y_valid.iloc[SKIP_ROWS:]
w_valid_arr = w_valid_arr[SKIP_ROWS:]

# Align TEST
X_test_sel = X_test_sel.iloc[SKIP_ROWS:]
y_test = y_test.iloc[SKIP_ROWS:]
w_test_arr = w_test_arr[SKIP_ROWS:]

print(f"[INFO] Aligned VALID+TEST with lookback={LOOKBACK}: skipped first {SKIP_ROWS} rows")
print(f"[INFO] VALID shapes after alignment: X={X_valid_sel.shape}")
print(f"[INFO] TEST shapes after alignment: X={X_test_sel.shape}")


# -------------------------
# 2. Split VALID into ES and SCORE (date-based)
# -------------------------
# Window bounds; split_valid_es_score below reads these module-level names directly.
VALID_ES_START = HPO_CFG["valid_es_start"]
VALID_ES_END = HPO_CFG["valid_es_end"]
VALID_SCORE_START = HPO_CFG["valid_score_start"]
VALID_SCORE_END = HPO_CFG["valid_score_end"]


def split_valid_es_score(Xv, yv, wv):
    """Carve VALID into an early-stopping part and a model-selection part.

    The closed date windows come from the module-level ``VALID_ES_*`` and
    ``VALID_SCORE_*`` settings and are matched against ``Xv.index``. ``wv`` is
    positional and is assumed to line up row-for-row with ``Xv``.

    Returns:
        ``((X_es, y_es, w_es), (X_sc, y_sc, w_sc), mode)``. When the index is not
        a ``DatetimeIndex`` or either window selects no rows, both triples are
        the unsplit inputs and ``mode`` is ``"FULL_VALID"``.
    """
    whole = (Xv, yv, wv)
    if not isinstance(Xv.index, pd.DatetimeIndex):
        return whole, whole, "FULL_VALID"

    dates = Xv.index
    # Give the positional weights the date index so they can be masked like X and y.
    weights = pd.Series(wv, index=dates)

    def _between(first, last):
        return (dates >= pd.Timestamp(first)) & (dates <= pd.Timestamp(last))

    in_es = _between(VALID_ES_START, VALID_ES_END)
    in_score = _between(VALID_SCORE_START, VALID_SCORE_END)

    if not (in_es.sum() > 0 and in_score.sum() > 0):
        return whole, whole, "FULL_VALID"

    def _subset(keep):
        return Xv.loc[keep], yv.loc[keep], weights.loc[keep].to_numpy(float)

    label = f"VALID_ES={VALID_ES_START}:{VALID_ES_END} / VALID_SCORE={VALID_SCORE_START}:{VALID_SCORE_END}"
    return _subset(in_es), _subset(in_score), label


# ES part: eval_set for early stopping when fitting the final model.
# SCORE part: used for the VALID_SCORE metrics of the baselines and the final model.
(X_es, y_es, w_es), (X_sc, y_sc, w_sc), valid_mode = split_valid_es_score(
    X_valid_sel, y_valid, w_valid_arr
)

# valid_mode is "FULL_VALID" when both parts are the whole (aligned) VALID split.
print(f"[INFO] VALID mode: {valid_mode}")
print(f"[INFO] VALID_ES: {X_es.shape} | VALID_SCORE: {X_sc.shape}")


# -------------------------
# 3. Metrics functions
# -------------------------
# Metric functions (w_rmse, w_mae, dir_acc) defined in Cell 5


# -------------------------
# 4. BASELINES (Zero + Naive)
# -------------------------
print("\n[INFO] Computing BASELINES...")

# One dict per (baseline, split) with the same metric keys as the model rows built
# later, so both lists can be concatenated into one comparison table.
baseline_results = []


def _naive_lag1_forecast(y) -> np.ndarray:
    """Return the within-split last-value forecast: ``pred[t] = y[t-1]``, ``pred[0] = 0``."""
    pred = np.roll(np.asarray(y, dtype=float), 1)
    if pred.size:
        # np.roll wraps the last element round to the front. It is overwritten
        # with 0 because the split has no earlier observation to use.
        pred[0] = 0
    return pred


# ----- BASELINE ZERO (predict 0) -----
print("  [1] BASELINE_ZERO (predict 0):")

# Zero baseline on VALID_SCORE
pred_zero_sc = np.zeros(len(y_sc), dtype=float)
baseline_results.append({
    "model": "BASELINE_ZERO",
    "split": "VALID_SCORE",
    "n": int(len(y_sc)),
    "wRMSE": w_rmse(y_sc, pred_zero_sc, w_sc),
    "wMAE": w_mae(y_sc, pred_zero_sc, w_sc),
    "DirAcc": dir_acc(y_sc, pred_zero_sc),
})

# Zero baseline on TEST
pred_zero_test = np.zeros(len(y_test), dtype=float)
baseline_results.append({
    "model": "BASELINE_ZERO",
    "split": "TEST",
    "n": int(len(y_test)),
    "wRMSE": w_rmse(y_test, pred_zero_test, w_test_arr),
    "wMAE": w_mae(y_test, pred_zero_test, w_test_arr),
    "DirAcc": dir_acc(y_test, pred_zero_test),
})

for r in baseline_results[-2:]:
    print(f"      {r['split']}: wRMSE={r['wRMSE']:.6f} | DirAcc={r['DirAcc']:.4f}")

# ----- BASELINE NAIVE (last-value forecast) -----
# Predict: y[t] = y[t-1] (tomorrow's return = today's return)
print("  [2] BASELINE_NAIVE (last-value forecast):")

# The shift is done inside each split. An earlier version tried to load "y_full.pkl"
# in a try/except, never used it, and ran this same shift in both branches; the bare
# `except:` could only hide unrelated errors, so that dead path is gone.
pred_naive_sc = _naive_lag1_forecast(y_sc)
pred_naive_test = _naive_lag1_forecast(y_test)

# Kept for backward compatibility with any code that still checks this flag.
naive_available = True

# Naive baseline on VALID_SCORE
baseline_results.append({
    "model": "BASELINE_NAIVE",
    "split": "VALID_SCORE",
    "n": int(len(y_sc)),
    "wRMSE": w_rmse(y_sc, pred_naive_sc, w_sc),
    "wMAE": w_mae(y_sc, pred_naive_sc, w_sc),
    "DirAcc": dir_acc(y_sc, pred_naive_sc),
})

# Naive baseline on TEST
baseline_results.append({
    "model": "BASELINE_NAIVE",
    "split": "TEST",
    "n": int(len(y_test)),
    "wRMSE": w_rmse(y_test, pred_naive_test, w_test_arr),
    "wMAE": w_mae(y_test, pred_naive_test, w_test_arr),
    "DirAcc": dir_acc(y_test, pred_naive_test),
})

for r in baseline_results[-2:]:
    print(f"      {r['split']}: wRMSE={r['wRMSE']:.6f} | DirAcc={r['DirAcc']:.4f}")

print("\n[INFO] BASELINE Summary:")
for r in baseline_results:
    print(f"  - {r['model']:15} | {r['split']:12} | wRMSE={r['wRMSE']:.6f} | DirAcc={r['DirAcc']:.4f}")


# -------------------------
# 5. Train FINAL MODEL (early stop on VALID_ES)
# -------------------------
print("\n[INFO] Training FINAL MODEL...")

# Same boosting-round cap, patience and seed as the HPO trials.
N_ESTIMATORS = int(HPO_CFG["n_estimators"])
EARLY_STOP = int(HPO_CFG["early_stopping_rounds"])
RANDOM_SEED = int(HPO_CFG["random_state"])

# Ensure correct types in best_params
# Work on a copy and coerce to plain Python types before passing to the constructor.
best_params = dict(best_params)
best_params["max_depth"] = int(best_params["max_depth"])
for k in ["learning_rate", "min_child_weight", "subsample", "colsample_bytree",
          "gamma", "reg_alpha", "reg_lambda", "max_delta_step"]:
    if k in best_params:
        best_params[k] = float(best_params[k])

# XGBoost settings from config
OBJECTIVE = HPO_CFG["objective"]
EVAL_METRIC = HPO_CFG["eval_metric"]
TREE_METHOD = HPO_CFG["tree_method"]

model = xgb.XGBRegressor(
    n_estimators=N_ESTIMATORS,
    objective=OBJECTIVE,
    eval_metric=EVAL_METRIC,
    tree_method=TREE_METHOD,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    verbosity=0,
    early_stopping_rounds=EARLY_STOP,
    **best_params
)

# Fit on TRAIN only; VALID_ES (with its weights) is the early-stopping eval set.
model.fit(
    X_train_sel, y_train,
    sample_weight=w_train_arr,
    eval_set=[(X_es, y_es)],
    sample_weight_eval_set=[w_es],
    verbose=False
)

# Reported as None when the fitted model does not expose these attributes.
best_iter = getattr(model, "best_iteration", None)
best_score = getattr(model, "best_score", None)
print(f"[INFO] Training complete. best_iteration={best_iter} | best_score(ES)={best_score}")


# -------------------------
# 6. Evaluate FINAL MODEL on VALID_SCORE + TEST
# -------------------------
print("\n[INFO] Evaluating FINAL MODEL...")

# Rows use the same keys as baseline_results so they can share one table.
model_results = []

# Model on VALID_SCORE
pred_model_sc = model.predict(X_sc)
model_results.append({
    "model": "FINAL_XGB",
    "split": "VALID_SCORE",
    "n": int(len(y_sc)),
    "wRMSE": w_rmse(y_sc, pred_model_sc, w_sc),
    "wMAE": w_mae(y_sc, pred_model_sc, w_sc),
    "DirAcc": dir_acc(y_sc, pred_model_sc),
})

# Model on TEST
pred_model_test = model.predict(X_test_sel)
model_results.append({
    "model": "FINAL_XGB",
    "split": "TEST",
    "n": int(len(y_test)),
    "wRMSE": w_rmse(y_test, pred_model_test, w_test_arr),
    "wMAE": w_mae(y_test, pred_model_test, w_test_arr),
    "DirAcc": dir_acc(y_test, pred_model_test),
})

print("[INFO] FINAL MODEL results:")
for r in model_results:
    print(f"  - {r['split']}: wRMSE={r['wRMSE']:.6f} | DirAcc={r['DirAcc']:.4f}")


# -------------------------
# 7. Comparison: BASELINE vs MODEL
# -------------------------
all_results = baseline_results + model_results
metrics_df = pd.DataFrame(all_results)

# Add improvement column (compared to BASELINE_ZERO)
# Value is baseline wRMSE minus the row's wRMSE on the same split, so positive means
# the row beats the baseline; the baseline's own rows come out as 0.
baseline_zero_wrmse = {r["split"]: r["wRMSE"] for r in baseline_results if r["model"] == "BASELINE_ZERO"}
metrics_df["wRMSE_vs_zero"] = metrics_df.apply(
    lambda row: baseline_zero_wrmse.get(row["split"], 0) - row["wRMSE"], axis=1
)

# Add improvement vs BASELINE_NAIVE
# Same sign convention as above.
baseline_naive_wrmse = {r["split"]: r["wRMSE"] for r in baseline_results if r["model"] == "BASELINE_NAIVE"}
metrics_df["wRMSE_vs_naive"] = metrics_df.apply(
    lambda row: baseline_naive_wrmse.get(row["split"], 0) - row["wRMSE"], axis=1
)

print("\n[INFO] BASELINE vs FINAL MODEL comparison:")
display(metrics_df)


# -------------------------
# 8. Build predictions DataFrames
# -------------------------
# One row per date with the actual target, both baseline forecasts, the model
# forecast and the sample weight; the date is kept as a regular column.
preds_valid_score_df = pd.DataFrame({
    "date": X_sc.index,
    "actual": y_sc.values,
    "baseline_zero": pred_zero_sc,
    "baseline_naive": pred_naive_sc,
    "predicted": pred_model_sc,
    "sample_weight": w_sc,
}).reset_index(drop=True)

preds_test_df = pd.DataFrame({
    "date": X_test_sel.index,
    "actual": y_test.values,
    "baseline_zero": pred_zero_test,
    "baseline_naive": pred_naive_test,
    "predicted": pred_model_test,
    "sample_weight": w_test_arr,
}).reset_index(drop=True)


# -------------------------
# 9. Save artifacts (LOCAL + DRIVE)
# -------------------------
# Model
# Save model as JSON
# Every artifact is written locally first and then mirrored to Drive with copy_file.
model_path_local = MODELS_OUT_LOCAL / "final_model_xgb.json"
model.get_booster().save_model(str(model_path_local))
copy_file(model_path_local, MODELS_OUT_DRIVE / model_path_local.name)

# Also save as pickle for easier loading
# The pickle holds the whole XGBRegressor wrapper, the JSON only its booster.
model_pkl_local = MODELS_OUT_LOCAL / "final_model_xgb.pkl"
save_pickle(model, model_pkl_local)
copy_file(model_pkl_local, MODELS_OUT_DRIVE / model_pkl_local.name)

# Metrics
metrics_path_local = MODELS_OUT_LOCAL / "final_metrics.csv"
metrics_df.to_csv(metrics_path_local, index=False)
copy_file(metrics_path_local, MODELS_OUT_DRIVE / metrics_path_local.name)

# Baseline results JSON (for CLI summary)
OUTPUTS_LOCAL = ensure_dir(Path(LOCAL_PATHS["outputs_dir"]))
OUTPUTS_DRIVE = ensure_dir(Path(DRIVE_PATHS["outputs_dir"]))

# Only the TEST rows are exported, one file per baseline
# (baseline_zero_results.json, baseline_naive_results.json).
for baseline_row in baseline_results:
    if baseline_row["split"] == "TEST":
        baseline_json = {
            "model": baseline_row["model"],
            "test_wrmse": baseline_row["wRMSE"],
            "test_wmae": baseline_row.get("wMAE"),
            "test_diracc": baseline_row["DirAcc"],
        }
        baseline_name = baseline_row["model"].lower()
        save_json(baseline_json, OUTPUTS_LOCAL / f"{baseline_name}_results.json")
        save_json(baseline_json, OUTPUTS_DRIVE / f"{baseline_name}_results.json")

# Predictions -> predictions/xgb/
PRED_XGB_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions" / "xgb")
PRED_XGB_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions" / "xgb")

# predictions_valid.csv contains the VALID_SCORE rows only (not the ES part).
preds_valid_path_local = PRED_XGB_LOCAL / "predictions_valid.csv"
preds_valid_score_df.to_csv(preds_valid_path_local, index=False)
copy_file(preds_valid_path_local, PRED_XGB_DRIVE / preds_valid_path_local.name)

preds_test_path_local = PRED_XGB_LOCAL / "predictions_test.csv"
preds_test_df.to_csv(preds_test_path_local, index=False)
copy_file(preds_test_path_local, PRED_XGB_DRIVE / preds_test_path_local.name)

# The pickle and the baseline JSON files written above are not listed here.
print("\n[OK] Saved FINAL MODEL artifacts:")
print("  -", model_path_local.name)
print("  -", metrics_path_local.name)
print("  - predictions/xgb/", preds_valid_path_local.name)
print("  - predictions/xgb/", preds_test_path_local.name)



# -------------------------
# Tomorrow Prediction + Plot
# -------------------------
# Plot settings from the run config.
PLOT_CFG = RUN_PARAMS["plot"]
N_PLOT = int(PLOT_CFG["n_plot"])
FIGSIZE = tuple(PLOT_CFG["figsize"])
DPI = int(PLOT_CFG["dpi"])

# Predict tomorrow
# The forecast is made from the feature row of the last date in TEST.
last_date = X_test_sel.index[-1]
X_last = X_test_sel.iloc[[-1]]
pred_tomorrow = float(model.predict(X_last)[0])
# Place the marker on the next business day so that a Friday forecast lands on
# Monday rather than Saturday, matching the "next_trading_day" label written below.
# BDay skips weekends only; exchange holidays are not taken into account.
pred_tomorrow_date = last_date + pd.offsets.BDay(1)

# Save tomorrow prediction
# pred_return_pct converts the predicted log return into a simple return in percent.
pred_tomorrow_df = pd.DataFrame([{
    "feature_set": "xgb_selected",
    "last_data_date": last_date,
    "predicted_for": "next_trading_day",
    "pred_logret": pred_tomorrow,
    "pred_return_pct": float(np.expm1(pred_tomorrow) * 100),
}])
pred_tomorrow_df.to_csv(PRED_XGB_LOCAL / "tomorrow.csv", index=False)
copy_file(PRED_XGB_LOCAL / "tomorrow.csv", PRED_XGB_DRIVE / "tomorrow.csv")

# Create backtest dataframe
hist_df = pd.DataFrame({
    "date": X_test_sel.index,
    "actual": y_test.values,
    "y_pred": pred_model_test,
}).set_index("date")

# Only the last N_PLOT TEST rows are exported and plotted.
hist_tail = hist_df.tail(N_PLOT).copy()
hist_tail.to_csv(PRED_XGB_LOCAL / "backtest.csv")
copy_file(PRED_XGB_LOCAL / "backtest.csv", PRED_XGB_DRIVE / "backtest.csv")

# Plot
fig, ax = plt.subplots(figsize=FIGSIZE)
ax.plot(hist_tail.index, hist_tail["actual"].values, linewidth=1, label="Actual")
ax.plot(hist_tail.index, hist_tail["y_pred"].values, linewidth=1, label="Predicted (y_pred)")
ax.scatter([pred_tomorrow_date], [pred_tomorrow], s=90, marker="X", color="red", label=f"Tomorrow: {pred_tomorrow:.4f}")
ax.axhline(0.0, color="gray", linewidth=0.5, linestyle="--")
ax.set_title(f"XGBoost Predictions — xgb_selected — last {len(hist_tail)} days + tomorrow")
ax.set_xlabel("Date")
ax.set_ylabel("Log Return")
ax.legend(loc="upper right")
plt.tight_layout()
plt.savefig(PRED_XGB_LOCAL / "plot.png", dpi=DPI)
copy_file(PRED_XGB_LOCAL / "plot.png", PRED_XGB_DRIVE / "plot.png")
plt.close(fig)

print(f"[INFO] Tomorrow prediction: {pred_tomorrow:.6f} ({np.expm1(pred_tomorrow)*100:.4f}%)")
print(f"[OK] Saved: predictions/xgb/ (tomorrow.csv, backtest.csv, plot.png)")


# -------------------------
# SHAP Analysis (config-based)
# -------------------------
# SHAP is optional: a missing "shap" config section means "enabled with defaults".
SHAP_CFG = RUN_PARAMS.get("shap", {})
SHAP_ENABLED = bool(SHAP_CFG.get("enabled", True))

if SHAP_ENABLED:
    print("\n[INFO] Computing SHAP values...")
    
    # Best effort: a missing package or any SHAP failure is reported as a warning
    # and must not abort the block, because the model artifacts are already saved.
    try:
        import shap
        
        # Config
        SHAP_MAX_DISPLAY = int(SHAP_CFG.get("max_display", 20))
        SHAP_FIGSIZE = tuple(SHAP_CFG.get("figsize", [10, 8]))
        SHAP_BAR = bool(SHAP_CFG.get("plot_type_bar", True))
        SHAP_BEESWARM = bool(SHAP_CFG.get("plot_type_beeswarm", True))
        SHAP_SAVE_VALUES = bool(SHAP_CFG.get("save_values", True))
        
        # Create explainer (TreeExplainer is fast for XGBoost)
        explainer = shap.TreeExplainer(model)
        
        # Compute SHAP values on TEST set
        shap_values = explainer.shap_values(X_test_sel)
        
        # Save SHAP values as DataFrame
        if SHAP_SAVE_VALUES:
            shap_df = pd.DataFrame(shap_values, columns=X_test_sel.columns, index=X_test_sel.index)
            shap_df.to_csv(PRED_XGB_LOCAL / "shap_values_test.csv")
            copy_file(PRED_XGB_LOCAL / "shap_values_test.csv", PRED_XGB_DRIVE / "shap_values_test.csv")
        
        # Feature importance (mean |SHAP|)
        shap_importance = pd.DataFrame({
            "feature": X_test_sel.columns,
            "mean_abs_shap": np.abs(shap_values).mean(axis=0)
        }).sort_values("mean_abs_shap", ascending=False).reset_index(drop=True)
        shap_importance.to_csv(PRED_XGB_LOCAL / "shap_feature_importance.csv", index=False)
        copy_file(PRED_XGB_LOCAL / "shap_feature_importance.csv", PRED_XGB_DRIVE / "shap_feature_importance.csv")
        
        print(f"[INFO] Top 10 features by SHAP importance:")
        print(shap_importance.head(10).to_string(index=False))
        
        # SHAP Summary Plot (bar)
        # summary_plot resizes the current figure itself (plot_size="auto" by
        # default), which used to override the configured figsize. Passing
        # plot_size explicitly makes the config value take effect.
        if SHAP_BAR:
            fig_bar, ax_bar = plt.subplots(figsize=SHAP_FIGSIZE)
            shap.summary_plot(shap_values, X_test_sel, plot_type="bar", show=False,
                              max_display=SHAP_MAX_DISPLAY, plot_size=SHAP_FIGSIZE)
            plt.tight_layout()
            plt.savefig(PRED_XGB_LOCAL / "shap_summary_bar.png", dpi=DPI, bbox_inches="tight")
            copy_file(PRED_XGB_LOCAL / "shap_summary_bar.png", PRED_XGB_DRIVE / "shap_summary_bar.png")
            plt.close()
        
        # SHAP Summary Plot (beeswarm)
        if SHAP_BEESWARM:
            fig_bee, ax_bee = plt.subplots(figsize=SHAP_FIGSIZE)
            shap.summary_plot(shap_values, X_test_sel, show=False,
                              max_display=SHAP_MAX_DISPLAY, plot_size=SHAP_FIGSIZE)
            plt.tight_layout()
            plt.savefig(PRED_XGB_LOCAL / "shap_summary_beeswarm.png", dpi=DPI, bbox_inches="tight")
            copy_file(PRED_XGB_LOCAL / "shap_summary_beeswarm.png", PRED_XGB_DRIVE / "shap_summary_beeswarm.png")
            plt.close()
        
        print("[OK] Saved SHAP analysis:")
        if SHAP_SAVE_VALUES:
            print("  - shap_values_test.csv")
        print("  - shap_feature_importance.csv")
        if SHAP_BAR:
            print("  - shap_summary_bar.png")
        if SHAP_BEESWARM:
            print("  - shap_summary_beeswarm.png")
        
    except ImportError:
        print("[WARN] shap not installed. Run: pip install shap")
    except Exception as e:
        print(f"[WARN] SHAP analysis failed: {e}")
else:
    print("[INFO] SHAP analysis disabled in config.")

print("[OK] BLOCK 25 complete.")

---
# SECTION 7B: LightGBM Model

**Train and evaluate a LightGBM model (same pipeline structure as the XGBoost final model)**

**Blocks:** 25B

## BLOCK 25B — LIGHTGBM FINAL MODEL

In [None]:
# ============================================================
# LIGHTGBM FINAL MODEL (identical structure to XGBoost)
# ============================================================
# Optional dependency: the LightGBM section below only runs when the import succeeds.
LGB_AVAILABLE = True
try:
    import lightgbm as lgb
except ImportError:
    LGB_AVAILABLE = False

if LGB_AVAILABLE:
    print("[OK] LightGBM available")
else:
    print("[SKIP] LightGBM not installed")

if LGB_AVAILABLE:
    # Config: LightGBM overrides ("lgb_hpo") layered key-by-key over the shared "hpo" section.
    # The previous RUN_PARAMS.get("lgb_hpo", RUN_PARAMS["hpo"]) evaluated the fallback eagerly
    # (KeyError when "hpo" is absent even if "lgb_hpo" exists) and gave a partial "lgb_hpo"
    # no way to inherit lookback / date windows from "hpo".
    HPO_CFG = {**(RUN_PARAMS.get("hpo") or {}), **(RUN_PARAMS.get("lgb_hpo") or {})}

    # Directories
    RUN_PROC_LOCAL = Path(LOCAL_PATHS["proc_dir"])
    FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
    RUN_PROC_DRIVE = Path(DRIVE_PATHS["proc_dir"])
    FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]

    RUN_MS_LOCAL = Path(LOCAL_PATHS["ms_dir"])
    RUN_MS_DRIVE = Path(DRIVE_PATHS["ms_dir"])
    MODELS_OUT_LOCAL = ensure_dir(Path(MODELS_DIR))
    MODELS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["models_dir"]))

    print("[INFO] LightGBM Final Model output dirs:")
    print("  - LOCAL:", MODELS_OUT_LOCAL)
    print("  - DRIVE:", MODELS_OUT_DRIVE)

    # -------------------------
    # 1. Load data from snapshot
    # -------------------------
    print("\n[INFO] Loading data (RUN_ID -> data/processed fallback):")

    # Search order passed to load_with_fallback for every processed-data artifact.
    lgb_proc_dirs = (RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)

    # LightGBM reuses the feature subset selected for XGBoost.
    X_train_sel = load_with_fallback("X_train_xgb_selected.pkl", *lgb_proc_dirs, use_pandas=True)
    X_valid_sel = load_with_fallback("X_valid_xgb_selected.pkl", *lgb_proc_dirs, use_pandas=True)
    X_test_sel = load_with_fallback("X_test_xgb_selected.pkl", *lgb_proc_dirs, use_pandas=True)

    y_train_t1 = load_with_fallback("y_train.pkl", *lgb_proc_dirs)
    y_valid_t1 = load_with_fallback("y_valid.pkl", *lgb_proc_dirs)
    y_test_t1 = load_with_fallback("y_test.pkl", *lgb_proc_dirs)

    w_train = load_with_fallback("weights_train.pkl", *lgb_proc_dirs)
    w_valid = load_with_fallback("weights_valid.pkl", *lgb_proc_dirs)
    w_test = load_with_fallback("weights_test.pkl", *lgb_proc_dirs)

    # Load best_params; when no tuned file exists, build defaults from the config.
    try:
        best_params = load_with_fallback("best_params_lgb_reg_t1.pkl", RUN_MS_LOCAL, FALLBACK_PROC_LOCAL, RUN_MS_DRIVE, FALLBACK_PROC_DRIVE)
        print("[INFO] Loaded best_params_lgb from file")
    except FileNotFoundError:
        # Look up "xgb_fs" only when "lgb_fs" is really missing (lazy fallback).
        LGB_FS_CFG = RUN_PARAMS["lgb_fs"] if "lgb_fs" in RUN_PARAMS else RUN_PARAMS["xgb_fs"]
        # (parameter name, type cast, default used when the config omits it)
        lgb_param_defaults = (
            ("max_depth", int, 3),
            ("num_leaves", int, 31),
            ("learning_rate", float, 0.05),
            ("min_child_samples", int, 20),
            ("subsample", float, 0.7),
            ("colsample_bytree", float, 0.7),
            ("reg_alpha", float, 1e-4),
            ("reg_lambda", float, 5.0),
        )
        best_params = {
            name: cast(LGB_FS_CFG.get(name, default))
            for name, cast, default in lgb_param_defaults
        }
        print("[INFO] Using default LightGBM params from config")

    print(f"[INFO] Loaded: X_train_sel={X_train_sel.shape} | X_valid_sel={X_valid_sel.shape} | X_test_sel={X_test_sel.shape}")

    y_train = y_train_t1.astype(float).copy()
    y_valid = y_valid_t1.astype(float).copy()
    y_test = y_test_t1.astype(float).copy()

    w_train_arr = np.asarray(w_train, dtype=float)
    w_valid_arr = np.asarray(w_valid, dtype=float)
    w_test_arr = np.asarray(w_test, dtype=float)

    # Align VALID + TEST with Neural Networks: drop the first LOOKBACK-1 rows of each split.
    LOOKBACK = int(HPO_CFG["lookback"])
    # Clamp at 0: a lookback below 1 would give a negative offset, and iloc[-k:] would
    # silently keep only the last k rows instead of skipping nothing.
    SKIP_ROWS = max(LOOKBACK - 1, 0)

    X_valid_sel = X_valid_sel.iloc[SKIP_ROWS:]
    y_valid = y_valid.iloc[SKIP_ROWS:]
    w_valid_arr = w_valid_arr[SKIP_ROWS:]

    X_test_sel = X_test_sel.iloc[SKIP_ROWS:]
    y_test = y_test.iloc[SKIP_ROWS:]
    w_test_arr = w_test_arr[SKIP_ROWS:]

    print(f"[INFO] Aligned VALID+TEST with lookback={LOOKBACK}: skipped first {SKIP_ROWS} rows")
    print(f"[INFO] VALID shapes after alignment: X={X_valid_sel.shape}")
    print(f"[INFO] TEST shapes after alignment: X={X_test_sel.shape}")

    # -------------------------
    # 2. Split VALID into ES and SCORE
    # -------------------------
    # Date windows (inclusive) for the early-stopping and scoring parts of VALID.
    VALID_ES_START = HPO_CFG["valid_es_start"]
    VALID_ES_END = HPO_CFG["valid_es_end"]
    VALID_SCORE_START = HPO_CFG["valid_score_start"]
    VALID_SCORE_END = HPO_CFG["valid_score_end"]
    
    def split_valid_es_score_lgb(Xv, yv, wv):
        """Split VALID into an early-stopping part and a scoring part by date.

        Returns ``((X_es, y_es, w_es), (X_sc, y_sc, w_sc), mode)``. When the index
        of ``Xv`` is not a DatetimeIndex, or either date window selects no rows,
        both parts are the full inputs and ``mode`` is ``"FULL_VALID"``.
        """
        if not isinstance(Xv.index, pd.DatetimeIndex):
            return (Xv, yv, wv), (Xv, yv, wv), "FULL_VALID"

        dates = Xv.index
        # Weights arrive as a plain array; index them by date so the masks apply.
        weights_by_date = pd.Series(wv, index=dates)

        in_es = (dates >= pd.Timestamp(VALID_ES_START)) & (dates <= pd.Timestamp(VALID_ES_END))
        in_sc = (dates >= pd.Timestamp(VALID_SCORE_START)) & (dates <= pd.Timestamp(VALID_SCORE_END))

        if in_es.sum() == 0 or in_sc.sum() == 0:
            return (Xv, yv, wv), (Xv, yv, wv), "FULL_VALID"

        def _subset(mask):
            return Xv.loc[mask], yv.loc[mask], weights_by_date.loc[mask].to_numpy(float)

        label = f"VALID_ES={VALID_ES_START}:{VALID_ES_END} / VALID_SCORE={VALID_SCORE_START}:{VALID_SCORE_END}"
        return _subset(in_es), _subset(in_sc), label
    
    valid_es_part, valid_sc_part, valid_mode = split_valid_es_score_lgb(
        X_valid_sel, y_valid, w_valid_arr
    )
    X_es, y_es, w_es = valid_es_part
    X_sc, y_sc, w_sc = valid_sc_part

    print(f"[INFO] VALID mode: {valid_mode}")
    print(f"[INFO] VALID_ES: {X_es.shape} | VALID_SCORE: {X_sc.shape}")

    # -------------------------
    # 3. BASELINES
    # -------------------------
    print("\n[INFO] Computing BASELINES...")

    def _lgb_baseline_row(name, split, y_true, y_hat, weights):
        """One metrics record (same keys for every baseline and split)."""
        return {
            "model": name, "split": split, "n": int(len(y_true)),
            "wRMSE": w_rmse(y_true, y_hat, weights),
            "wMAE": w_mae(y_true, y_hat, weights),
            "DirAcc": dir_acc(y_true, y_hat),
        }

    def _lgb_lag_one(values):
        """Previous value as the forecast; the first row has no predecessor and gets 0."""
        shifted = np.roll(np.asarray(values), 1)
        shifted[0] = 0
        return shifted

    def _lgb_print_latest(rows):
        """Print the two most recently added records (VALID_SCORE and TEST)."""
        for row in rows[-2:]:
            print(f"      {row['split']}: wRMSE={row['wRMSE']:.6f} | DirAcc={row['DirAcc']:.4f}")

    baseline_results = []

    print("  [1] BASELINE_ZERO (predict 0):")
    pred_zero_sc = np.zeros(len(y_sc), dtype=float)
    baseline_results.append(_lgb_baseline_row("BASELINE_ZERO", "VALID_SCORE", y_sc, pred_zero_sc, w_sc))
    pred_zero_test = np.zeros(len(y_test), dtype=float)
    baseline_results.append(_lgb_baseline_row("BASELINE_ZERO", "TEST", y_test, pred_zero_test, w_test_arr))
    _lgb_print_latest(baseline_results)

    print("  [2] BASELINE_NAIVE (last-value forecast):")
    pred_naive_sc = _lgb_lag_one(y_sc)
    pred_naive_test = _lgb_lag_one(y_test)
    baseline_results.append(_lgb_baseline_row("BASELINE_NAIVE", "VALID_SCORE", y_sc, pred_naive_sc, w_sc))
    baseline_results.append(_lgb_baseline_row("BASELINE_NAIVE", "TEST", y_test, pred_naive_test, w_test_arr))
    _lgb_print_latest(baseline_results)

    print("\n[INFO] BASELINE Summary:")
    for row in baseline_results:
        print(f"  - {row['model']:15} | {row['split']:12} | wRMSE={row['wRMSE']:.6f} | DirAcc={row['DirAcc']:.4f}")
    
    # -------------------------
    # 4. Train FINAL MODEL
    # -------------------------
    print("\n[INFO] Training FINAL LightGBM MODEL...")

    N_ESTIMATORS, EARLY_STOP, RANDOM_SEED = (
        int(HPO_CFG[cfg_key])
        for cfg_key in ("n_estimators", "early_stopping_rounds", "random_state")
    )

    # Work on a copy and coerce the known hyper-parameters to the types the
    # estimator expects (values loaded from a pickle may be numpy scalars or floats).
    best_params = dict(best_params)
    param_casts = (
        (int, ("max_depth", "num_leaves", "min_child_samples")),
        (float, ("learning_rate", "subsample", "colsample_bytree", "reg_alpha", "reg_lambda")),
    )
    for caster, param_names in param_casts:
        for k in param_names:
            if k in best_params:
                best_params[k] = caster(best_params[k])

    # NOTE(review): LightGBM documents that `subsample` (bagging_fraction) only takes
    # effect when `subsample_freq` (bagging_freq) is non-zero — confirm whether row
    # subsampling is meant to be active for these params.
    fixed_lgb_kwargs = dict(
        n_estimators=N_ESTIMATORS,
        objective="regression",
        metric="rmse",
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=-1,
    )
    model = lgb.LGBMRegressor(**fixed_lgb_kwargs, **best_params)

    # Early stopping watches the VALID_ES part only; log_evaluation(period=0) keeps
    # per-iteration output silent.
    lgb_fit_callbacks = [
        lgb.early_stopping(stopping_rounds=EARLY_STOP, verbose=False),
        lgb.log_evaluation(period=0),
    ]
    model.fit(
        X_train_sel,
        y_train,
        sample_weight=w_train_arr,
        eval_set=[(X_es, y_es)],
        eval_sample_weight=[w_es],
        callbacks=lgb_fit_callbacks,
    )

    best_iter = getattr(model, "best_iteration_", None)
    print(f"[INFO] Training complete. best_iteration={best_iter}")
    
    # -------------------------
    # 5. Evaluate FINAL MODEL
    # -------------------------
    print("\n[INFO] Evaluating FINAL MODEL...")

    pred_model_sc = model.predict(X_sc)
    pred_model_test = model.predict(X_test_sel)

    # One record per evaluated split, using the same keys as the baseline records.
    model_results = [
        {
            "model": "FINAL_LGB", "split": split_name, "n": int(len(y_true)),
            "wRMSE": w_rmse(y_true, y_hat, weights),
            "wMAE": w_mae(y_true, y_hat, weights),
            "DirAcc": dir_acc(y_true, y_hat),
        }
        for split_name, y_true, y_hat, weights in (
            ("VALID_SCORE", y_sc, pred_model_sc, w_sc),
            ("TEST", y_test, pred_model_test, w_test_arr),
        )
    ]

    print("[INFO] FINAL MODEL results:")
    for row in model_results:
        print(f"  - {row['split']}: wRMSE={row['wRMSE']:.6f} | DirAcc={row['DirAcc']:.4f}")

    # -------------------------
    # 6. Comparison
    # -------------------------
    all_results = baseline_results + model_results
    metrics_df = pd.DataFrame(all_results)

    # Per-split reference wRMSE of each baseline.
    baseline_zero_wrmse = {
        row["split"]: row["wRMSE"] for row in baseline_results if row["model"] == "BASELINE_ZERO"
    }
    baseline_naive_wrmse = {
        row["split"]: row["wRMSE"] for row in baseline_results if row["model"] == "BASELINE_NAIVE"
    }

    # Positive values mean the row beats the baseline on the same split
    # (baseline wRMSE minus the row's wRMSE; a missing split counts as 0).
    for gain_col, reference in (
        ("wRMSE_vs_zero", baseline_zero_wrmse),
        ("wRMSE_vs_naive", baseline_naive_wrmse),
    ):
        metrics_df[gain_col] = [
            reference.get(split_name, 0) - wrmse
            for split_name, wrmse in zip(metrics_df["split"], metrics_df["wRMSE"])
        ]

    print("\n[INFO] BASELINE vs FINAL MODEL comparison:")
    display(metrics_df)
    
    # -------------------------
    # 7. Build predictions DataFrames
    # -------------------------
    def _lgb_prediction_frame(dates, actual, zero, naive, predicted, weights):
        """Assemble one per-row prediction table with a fresh 0..n-1 index."""
        frame = pd.DataFrame({
            "date": dates,
            "actual": actual,
            "baseline_zero": zero,
            "baseline_naive": naive,
            "predicted": predicted,
            "sample_weight": weights,
        })
        return frame.reset_index(drop=True)

    preds_valid_score_df = _lgb_prediction_frame(
        X_sc.index, y_sc.values, pred_zero_sc, pred_naive_sc, pred_model_sc, w_sc
    )
    preds_test_df = _lgb_prediction_frame(
        X_test_sel.index, y_test.values, pred_zero_test, pred_naive_test, pred_model_test, w_test_arr
    )

    # -------------------------
    # 8. Save artifacts
    # -------------------------
    # Each artifact is written locally first and then mirrored to Drive.

    # Model as a JSON dump of the booster.
    model_dict = model.booster_.dump_model()
    model_json_local = MODELS_OUT_LOCAL / "final_model_lgb.json"
    save_json(model_dict, model_json_local)
    copy_file(model_json_local, MODELS_OUT_DRIVE / model_json_local.name)

    # Model as a pickle of the fitted estimator.
    model_pkl_local = MODELS_OUT_LOCAL / "final_model_lgb.pkl"
    save_pickle(model, model_pkl_local)
    copy_file(model_pkl_local, MODELS_OUT_DRIVE / model_pkl_local.name)

    # Baseline vs model metrics table.
    metrics_path_local = MODELS_OUT_LOCAL / "final_metrics_lgb.csv"
    metrics_df.to_csv(metrics_path_local, index=False)
    copy_file(metrics_path_local, MODELS_OUT_DRIVE / metrics_path_local.name)

    # Predictions -> predictions/lgb/
    PRED_LGB_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions" / "lgb")
    PRED_LGB_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions" / "lgb")

    preds_valid_path_local = PRED_LGB_LOCAL / "predictions_valid.csv"
    preds_test_path_local = PRED_LGB_LOCAL / "predictions_test.csv"
    for pred_frame, pred_path in (
        (preds_valid_score_df, preds_valid_path_local),
        (preds_test_df, preds_test_path_local),
    ):
        pred_frame.to_csv(pred_path, index=False)
        copy_file(pred_path, PRED_LGB_DRIVE / pred_path.name)

    print("\n[OK] Saved FINAL LightGBM MODEL artifacts:")
    for saved_path in (model_json_local, model_pkl_local, metrics_path_local):
        print("  -", saved_path.name)
    for saved_path in (preds_valid_path_local, preds_test_path_local):
        print("  - predictions/lgb/", saved_path.name)
    
    # -------------------------
    # Tomorrow Prediction + Plot
    # -------------------------
    PLOT_CFG = RUN_PARAMS["plot"]
    N_PLOT = int(PLOT_CFG["n_plot"])
    FIGSIZE = tuple(PLOT_CFG["figsize"])
    DPI = int(PLOT_CFG["dpi"])

    # The most recent TEST row yields the forecast for the session after it.
    last_date = X_test_sel.index[-1]
    X_last = X_test_sel.iloc[[-1]]
    pred_tomorrow = float(model.predict(X_last)[0])

    pred_tomorrow_df = pd.DataFrame([{
        "feature_set": "xgb_selected",  # LightGBM is trained on the XGBoost-selected features
        "last_data_date": last_date,
        "predicted_for": "next_trading_day",
        "pred_logret": pred_tomorrow,
        "pred_return_pct": float(np.expm1(pred_tomorrow) * 100),
    }])
    pred_tomorrow_df.to_csv(PRED_LGB_LOCAL / "tomorrow.csv", index=False)
    copy_file(PRED_LGB_LOCAL / "tomorrow.csv", PRED_LGB_DRIVE / "tomorrow.csv")

    # Actual vs predicted over TEST; only the last N_PLOT rows are saved and plotted.
    hist_df = pd.DataFrame({
        "date": X_test_sel.index,
        "actual": y_test.values,
        "y_pred": pred_model_test,
    }).set_index("date")

    hist_tail = hist_df.tail(N_PLOT).copy()
    hist_tail.to_csv(PRED_LGB_LOCAL / "backtest.csv")
    copy_file(PRED_LGB_LOCAL / "backtest.csv", PRED_LGB_DRIVE / "backtest.csv")

    # Place the marker on the next business day: a plain +1 calendar day puts it on a
    # Saturday when the last data date is a Friday, which contradicts "next_trading_day".
    # NOTE(review): BDay skips weekends only; exchange holidays are not accounted for.
    pred_tomorrow_date = last_date + pd.offsets.BDay(1)

    fig, ax = plt.subplots(figsize=FIGSIZE)
    try:
        ax.plot(hist_tail.index, hist_tail["actual"].values, linewidth=1, label="Actual")
        ax.plot(hist_tail.index, hist_tail["y_pred"].values, linewidth=1, label="Predicted")
        ax.scatter([pred_tomorrow_date], [pred_tomorrow], s=90, marker="X", color="red", label=f"Tomorrow: {pred_tomorrow:.4f}")
        ax.axhline(0.0, color="gray", linewidth=0.5, linestyle="--")
        ax.set_title(f"LightGBM Predictions — last {len(hist_tail)} days + tomorrow")
        ax.set_xlabel("Date")
        ax.set_ylabel("Log Return")
        ax.legend(loc="upper right")
        plt.tight_layout()
        plt.savefig(PRED_LGB_LOCAL / "plot.png", dpi=DPI)
    finally:
        # Always release the figure, even when plotting or saving raises.
        plt.close(fig)
    copy_file(PRED_LGB_LOCAL / "plot.png", PRED_LGB_DRIVE / "plot.png")

    print(f"[INFO] Tomorrow prediction: {pred_tomorrow:.6f} ({np.expm1(pred_tomorrow)*100:.4f}%)")
    print(f"[OK] Saved: predictions/lgb/ (tomorrow.csv, backtest.csv, plot.png)")

    # Store for later comparison
    LGB_METRICS = metrics_df

    print("[OK] BLOCK 25B complete.")
else:
    LGB_METRICS = None
    print("[SKIP] LightGBM section skipped")


---
# SECTION 8: LSTM & GRU Neural Networks

**Train and predict with LSTM and GRU**

**Blocks:** 26-27

## BLOCK 26 — NEURAL NETWORK TRAINING (LSTM + GRU)

In [None]:

if not TF_AVAILABLE:
    print("[SKIP] BLOCK 26 — TensorFlow not available")
else:
    # Config: recurrent architectures trained by this block.
    MODEL_TYPES = ["lstm", "gru"]

    # Directories
    # Lookup order: RUN_ID_LOCAL -> data/processed_LOCAL -> RUN_ID_DRIVE -> data/processed_DRIVE
    RUN_PROC_LOCAL = Path(LOCAL_PATHS["proc_dir"])
    FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
    RUN_PROC_DRIVE = Path(DRIVE_PATHS["proc_dir"])
    FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]
    MODELS_OUT_LOCAL = ensure_dir(Path(MODELS_DIR))
    MODELS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["models_dir"]))

    print("[INFO] Neural Network output dirs:")
    print("  - LOCAL:", MODELS_OUT_LOCAL)
    print("  - DRIVE:", MODELS_OUT_DRIVE)

    # -------------------------
    # Load shared data (y, weights)
    # -------------------------
    # Targets and sample weights are shared by every feature set; only X differs.
    nn_proc_dirs = (RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
    nn_splits = ("train", "valid", "test")

    y_train_t1, y_valid_t1, y_test_t1 = (
        load_with_fallback(f"y_{split_name}.pkl", *nn_proc_dirs) for split_name in nn_splits
    )
    w_train, w_valid, w_test = (
        load_with_fallback(f"weights_{split_name}.pkl", *nn_proc_dirs) for split_name in nn_splits
    )

    # Plain float arrays for Keras.
    y_train, y_valid, y_test = (
        target.astype(float).to_numpy() for target in (y_train_t1, y_valid_t1, y_test_t1)
    )
    w_train_np, w_valid_np, w_test_np = (
        np.asarray(weights, dtype=float) for weights in (w_train, w_valid, w_test)
    )

    # -------------------------
    # Metrics
    # -------------------------
    # Metric functions (w_rmse, w_mae, dir_acc) defined in Cell 5

    # -------------------------
    # VALID split function (date-based)
    # -------------------------
    # Date windows for the early-stopping and scoring parts of VALID.
    HPO_CFG = RUN_PARAMS["hpo"]
    VALID_ES_START, VALID_ES_END, VALID_SCORE_START, VALID_SCORE_END = (
        HPO_CFG[cfg_key]
        for cfg_key in ("valid_es_start", "valid_es_end", "valid_score_start", "valid_score_end")
    )
    
    def split_valid_es_score_nn(Xv_df, yv, wv):
        """Split validation data into ES (early stopping) and SCORE parts by date.

        Returns ``((X_es, y_es, w_es), (X_sc, y_sc, w_sc), mode)``; the y/w parts
        are float arrays. When the index of ``Xv_df`` is not a DatetimeIndex, or
        either window selects no rows, both parts are the unmodified inputs and
        ``mode`` is ``"FULL_VALID"``.
        """
        if not isinstance(Xv_df.index, pd.DatetimeIndex):
            return (Xv_df, yv, wv), (Xv_df, yv, wv), "FULL_VALID"

        dates = Xv_df.index
        # y and w arrive as arrays; index them by date so the boolean masks apply.
        targets_by_date = pd.Series(yv, index=dates)
        weights_by_date = pd.Series(wv, index=dates)

        in_es = (dates >= pd.Timestamp(VALID_ES_START)) & (dates <= pd.Timestamp(VALID_ES_END))
        in_sc = (dates >= pd.Timestamp(VALID_SCORE_START)) & (dates <= pd.Timestamp(VALID_SCORE_END))

        # Fall back to the whole VALID set when a window is empty.
        if not (in_es.any() and in_sc.any()):
            return (Xv_df, yv, wv), (Xv_df, yv, wv), "FULL_VALID"

        def _subset(mask):
            return (
                Xv_df.loc[mask],
                targets_by_date.loc[mask].to_numpy(float),
                weights_by_date.loc[mask].to_numpy(float),
            )

        label = f"VALID_ES={VALID_ES_START}:{VALID_ES_END} / VALID_SCORE={VALID_SCORE_START}:{VALID_SCORE_END}"
        return _subset(in_es), _subset(in_sc), label
    
    # -------------------------
    # Sequence creation function
    # -------------------------
    def make_sequences_eod_nn(X_2d, y_1d, w_1d, idx, lookback, stride=1):
        """Create sequences: use X up to day t (inclusive) -> predict y[t].

        Args:
            X_2d: 2-D feature matrix, one row per day.
            y_1d: Targets, one per row of ``X_2d``.
            w_1d: Sample weights, one per row of ``X_2d``.
            idx: Row labels (dates) matching ``X_2d``; must support integer indexing.
            lookback: Rows per window (>= 1). Window ``t`` covers rows
                ``t - lookback + 1 .. t``.
            stride: Step between consecutive window end positions (>= 1).

        Returns:
            ``(X_seq, y_seq, w_seq, idx_seq)``: float32 arrays where ``X_seq`` has
            shape ``(n_windows, lookback, n_features)``, plus a ``DatetimeIndex``
            holding the label of the last row of every window.

        Raises:
            ValueError: If ``lookback`` or ``stride`` is below 1, the inputs differ
                in length, or there are fewer rows than ``lookback``.
        """
        X_2d, y_1d, w_1d = np.asarray(X_2d), np.asarray(y_1d), np.asarray(w_1d)
        N, F = X_2d.shape
        # A lookback of 0 used to return empty windows paired with wrong targets.
        if lookback < 1:
            raise ValueError(f"[ERROR] lookback must be >= 1, got {lookback}.")
        if stride < 1:
            raise ValueError(f"[ERROR] stride must be >= 1, got {stride}.")
        # Mismatched lengths would silently pair windows with the wrong target/weight.
        if not (len(y_1d) == len(w_1d) == len(idx) == N):
            raise ValueError(
                f"[ERROR] Length mismatch: X={N}, y={len(y_1d)}, w={len(w_1d)}, idx={len(idx)}."
            )
        if N < lookback:
            raise ValueError(f"[ERROR] Not enough rows N={N} for lookback={lookback}.")

        # Last row position of each window.
        ends = np.arange(lookback - 1, N, stride)
        # Row positions per window, shape (n_windows, lookback); one fancy-index
        # gather replaces the per-window Python loop.
        window_rows = ends[:, None] - (lookback - 1) + np.arange(lookback)[None, :]

        X_seq = X_2d[window_rows].astype(np.float32)
        y_seq = y_1d[ends].astype(np.float32)
        w_seq = w_1d[ends].astype(np.float32)
        idx_seq = pd.DatetimeIndex([idx[t] for t in ends.tolist()])
        return X_seq, y_seq, w_seq, idx_seq
    
    # -------------------------
    # Loop over model types (LSTM, GRU)
    # -------------------------
    for model_type in MODEL_TYPES:
        # Hyper-parameters for this architecture come from RUN_PARAMS["lstm"] / RUN_PARAMS["gru"].
        NN_CFG = RUN_PARAMS[model_type]

        # Integer-valued settings.
        (LOOKBACK, STRIDE, UNITS_1, UNITS_2, DENSE_UNITS,
         EPOCHS, BATCH_SIZE, PATIENCE, RANDOM_SEED) = (
            int(NN_CFG[cfg_key])
            for cfg_key in (
                "lookback", "stride", "units_1", "units_2", "dense_units",
                "epochs", "batch_size", "patience", "random_state",
            )
        )
        # Float-valued settings.
        DROPOUT, LR, CLIPNORM = (
            float(NN_CFG[cfg_key]) for cfg_key in ("dropout", "learning_rate", "clipnorm")
        )
        # Settings passed through unchanged.
        DENSE_ACT = NN_CFG["dense_activation"]
        OUTPUT_ACT = NN_CFG["output_activation"]
        FEATURE_SETS = NN_CFG["feature_sets"]

        # Collects one metrics record per feature set for this architecture.
        all_nn_results = []

        banner = "#" * 70
        print(f"\n{banner}")
        print(f"# TRAINING {model_type.upper()} MODELS")
        print(f"{banner}")
        
        for feature_set in FEATURE_SETS:
            rule = "=" * 60
            print(f"\n{rule}")
            print(f"[INFO] Training {model_type.upper()} with feature set: {feature_set}")
            print(f"{rule}")

            # Load feature-specific X matrices (train / valid / test).
            X_train_nn, X_valid_nn, X_test_nn = (
                load_with_fallback(
                    f"X_{split_name}_{feature_set}.pkl",
                    RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE,
                    use_pandas=True,
                )
                for split_name in ("train", "valid", "test")
            )

            n_features = X_train_nn.shape[1]
            print(f"[INFO] Shapes: TRAIN={X_train_nn.shape} | VALID={X_valid_nn.shape} | TEST={X_test_nn.shape}")

            # Split VALID -> ES (early stopping) + SCORE (model selection / reporting).
            valid_es_part, valid_sc_part, valid_mode = split_valid_es_score_nn(
                X_valid_nn, y_valid, w_valid_np
            )
            X_valid_es_df, y_valid_es, w_valid_es = valid_es_part
            X_valid_sc_df, y_valid_sc, w_valid_sc = valid_sc_part
            print(f"[INFO] VALID mode: {valid_mode}")

            # Scaling: statistics are fitted on TRAIN only and reused for the other splits.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_nn.values)
            X_valid_es_scaled, X_valid_sc_scaled, X_test_scaled = (
                scaler.transform(frame.values)
                for frame in (X_valid_es_df, X_valid_sc_df, X_test_nn)
            )

            # Create sequences: each split is windowed on its own rows.
            Xtr_seq, ytr_seq, wtr_seq, idx_tr = make_sequences_eod_nn(
                X_train_scaled, y_train, w_train_np, X_train_nn.index, LOOKBACK, STRIDE
            )
            Xes_seq, yes_seq, wes_seq, idx_es = make_sequences_eod_nn(
                X_valid_es_scaled, y_valid_es, w_valid_es, X_valid_es_df.index, LOOKBACK, STRIDE
            )
            Xsc_seq, ysc_seq, wsc_seq, idx_sc = make_sequences_eod_nn(
                X_valid_sc_scaled, y_valid_sc, w_valid_sc, X_valid_sc_df.index, LOOKBACK, STRIDE
            )
            Xte_seq, yte_seq, wte_seq, idx_te = make_sequences_eod_nn(
                X_test_scaled, y_test, w_test_np, X_test_nn.index, LOOKBACK, STRIDE
            )

            print(f"[INFO] Sequence shapes: TRAIN={Xtr_seq.shape} | VALID_ES={Xes_seq.shape} | VALID_SCORE={Xsc_seq.shape} | TEST={Xte_seq.shape}")
            
            # Build model
            # Models are created repeatedly (architecture x feature set); clearing the Keras
            # session first releases the global state left by the previous iteration so
            # memory does not grow with every model.
            keras.backend.clear_session()
            tf.keras.utils.set_random_seed(RANDOM_SEED)

            # Same topology for both architectures; only the recurrent layer class differs.
            rnn_layer = layers.LSTM if model_type == "lstm" else layers.GRU

            inp = keras.Input(shape=(LOOKBACK, n_features))
            x = rnn_layer(UNITS_1, return_sequences=True, dropout=DROPOUT)(inp)
            x = layers.LayerNormalization()(x)
            # The second recurrent layer returns only its last step.
            x = rnn_layer(UNITS_2, return_sequences=False, dropout=DROPOUT)(x)
            x = layers.Dense(DENSE_UNITS, activation=DENSE_ACT)(x)
            x = layers.Dropout(DROPOUT)(x)
            out = layers.Dense(1, activation=OUTPUT_ACT)(x)

            model = keras.Model(inp, out)
            model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=LR, clipnorm=CLIPNORM),
                loss=NN_CFG["loss"],
            )
            print(f"[INFO] Model built: {model.count_params()} parameters")
            
            # Custom callback for SCORE set tracking
            class ScoreSetCallback(keras.callbacks.Callback):
                """Track the weighted RMSE on the SCORE set and keep the best weights seen."""

                def __init__(self, X_score, y_score, w_score):
                    super().__init__()
                    self.Xs = X_score
                    self.ys = y_score
                    self.ws = w_score
                    self.best = np.inf        # lowest SCORE wRMSE so far
                    self.best_weights = None  # model weights at that epoch

                def on_epoch_end(self, epoch, logs=None):
                    # Score the current weights after every epoch.
                    epoch_pred = self.model.predict(self.Xs, verbose=0).reshape(-1)
                    epoch_score = w_rmse(self.ys, epoch_pred, self.ws)
                    if epoch_score >= self.best:
                        return
                    self.best = epoch_score
                    self.best_weights = self.model.get_weights()

            # EarlyStopping monitors the ES set (val_loss); the score callback records the
            # best epoch on the SCORE set independently.
            score_cb = ScoreSetCallback(Xsc_seq, ysc_seq, wsc_seq)
            early_stop_cb = keras.callbacks.EarlyStopping(
                monitor="val_loss", patience=PATIENCE, restore_best_weights=True
            )
            callbacks = [early_stop_cb, score_cb]
            
            # Train: the ES sequences are the Keras validation data that drive early stopping.
            print(f"[INFO] Training {model_type.upper()} (epochs={EPOCHS}, batch_size={BATCH_SIZE}, patience={PATIENCE})...")
            fit_kwargs = dict(
                sample_weight=wtr_seq,
                validation_data=(Xes_seq, yes_seq, wes_seq),
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                verbose=0,
                callbacks=callbacks,
            )
            history = model.fit(Xtr_seq, ytr_seq, **fit_kwargs)

            # Restore best weights by SCORE (overrides what EarlyStopping restored).
            best_score_weights = score_cb.best_weights
            if best_score_weights is not None:
                model.set_weights(best_score_weights)
                print(f"[INFO] Restored best weights by VALID_SCORE wRMSE = {score_cb.best:.6f}")

            # Evaluate: model predictions plus the all-zero baseline for both splits.
            pred_sc = model.predict(Xsc_seq, verbose=0).reshape(-1)
            pred_te = model.predict(Xte_seq, verbose=0).reshape(-1)
            baseline_sc = np.zeros_like(ysc_seq)
            baseline_te = np.zeros_like(yte_seq)
            
            # Metrics: one flat record per (model type, feature set). Insertion order is
            # the column order of the summary CSV.
            results = {
                "model_type": model_type,
                "feature_set": feature_set,
                "n_features": n_features,
                "valid_mode": valid_mode,
                "epochs_trained": len(history.history["loss"]),
            }
            # Config params for comparison
            results.update({
                f"{model_type}_units_1": UNITS_1,
                f"{model_type}_units_2": UNITS_2,
                "dropout": DROPOUT,
                "learning_rate": LR,
                "lookback": LOOKBACK,
                "epochs": EPOCHS,
                "batch_size": BATCH_SIZE,
            })

            # (tag, targets, weights, zero baseline, model prediction) per evaluated split.
            eval_splits = (
                ("valid", ysc_seq, wsc_seq, baseline_sc, pred_sc),
                ("test", yte_seq, wte_seq, baseline_te, pred_te),
            )
            for tag, y_true, weights, zero_pred, _model_pred in eval_splits:
                results[f"baseline_{tag}_wrmse"] = w_rmse(y_true, zero_pred, weights)
                results[f"baseline_{tag}_diracc"] = dir_acc(y_true, zero_pred)
            for tag, y_true, weights, _zero_pred, model_pred in eval_splits:
                results[f"model_{tag}_wrmse"] = w_rmse(y_true, model_pred, weights)
                results[f"model_{tag}_wmae"] = w_mae(y_true, model_pred, weights)
                results[f"model_{tag}_diracc"] = dir_acc(y_true, model_pred)
            # Positive improvement = the model's wRMSE is below the zero baseline's.
            for tag in ("valid", "test"):
                results[f"{tag}_wrmse_improvement"] = (
                    results[f"baseline_{tag}_wrmse"] - results[f"model_{tag}_wrmse"]
                )
            all_nn_results.append(results)

            print(f"\n[RESULT] {model_type.upper()} | {feature_set} | n_features={n_features}")
            print(f"  BASELINE VALID_SCORE: wRMSE={results['baseline_valid_wrmse']:.6f} | DirAcc={results['baseline_valid_diracc']:.4f}")
            print(f"  MODEL    VALID_SCORE: wRMSE={results['model_valid_wrmse']:.6f} | DirAcc={results['model_valid_diracc']:.4f}")
            print(f"  MODEL    TEST:        wRMSE={results['model_test_wrmse']:.6f} | DirAcc={results['model_test_diracc']:.4f}")
            print(f"  Improvement (VALID):  {results['valid_wrmse_improvement']:.6f}")
            
            # Save model + scaler + config (written locally, then mirrored to Drive).
            artifact_stem = f"{model_type}_{feature_set}"
            model_path = MODELS_OUT_LOCAL / f"{artifact_stem}.keras"
            scaler_path = MODELS_OUT_LOCAL / f"{artifact_stem}_scaler.pkl"
            config_path = MODELS_OUT_LOCAL / f"{artifact_stem}_config.json"

            model.save(model_path)
            save_pickle(scaler, scaler_path)
            save_json(NN_CFG, config_path)
            for artifact_path in (model_path, scaler_path, config_path):
                copy_file(artifact_path, MODELS_OUT_DRIVE / artifact_path.name)

            # Predictions -> predictions/{model_type}_{feature_set}/
            PRED_NN_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions" / artifact_stem)
            PRED_NN_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions" / artifact_stem)

            # Per-sequence rows: end date, target, zero baseline, prediction and weight.
            preds_valid_df = pd.DataFrame({
                "date": idx_sc,
                "actual": ysc_seq,
                "baseline_zero": baseline_sc,
                "predicted": pred_sc,
                "sample_weight": wsc_seq,
            }).reset_index(drop=True)
            preds_test_df = pd.DataFrame({
                "date": idx_te,
                "actual": yte_seq,
                "baseline_zero": baseline_te,
                "predicted": pred_te,
                "sample_weight": wte_seq,
            }).reset_index(drop=True)

            # Write both CSVs locally first, then copy both to Drive.
            pred_outputs = (
                (preds_valid_df, "predictions_valid.csv"),
                (preds_test_df, "predictions_test.csv"),
            )
            for pred_frame, pred_name in pred_outputs:
                pred_frame.to_csv(PRED_NN_LOCAL / pred_name, index=False)
            for _pred_frame, pred_name in pred_outputs:
                copy_file(PRED_NN_LOCAL / pred_name, PRED_NN_DRIVE / pred_name)

            print(f"[OK] Saved: {model_path.name}, {scaler_path.name}")
            print(f"[OK] Predictions: predictions/{model_type}_{feature_set}/")
        
        # Summary for this model type
        nn_results_df = pd.DataFrame(all_nn_results)
        print(f"\n{'='*60}")
        print(f"[INFO] {model_type.upper()} TRAINING SUMMARY")
        print(f"{'='*60}")
        display(nn_results_df[["feature_set", "n_features", "model_valid_wrmse", "model_test_wrmse", "valid_wrmse_improvement"]])
        
        summary_path = MODELS_OUT_LOCAL / f"{model_type}_summary.csv"
        nn_results_df.to_csv(summary_path, index=False)
        copy_file(summary_path, MODELS_OUT_DRIVE / summary_path.name)
        print(f"[OK] Saved {model_type.upper()} summary: {summary_path.name}")
    
    print("[OK] BLOCK 26 complete.")

## BLOCK 27 — NEURAL NETWORK PREDICT TOMORROW + BACKTEST PLOT

In [None]:

# BLOCK 27: reload every saved LSTM/GRU model, backtest it on the TEST split and
# emit a one-step-ahead ("tomorrow") prediction with a plot. The whole cell is
# skipped when TensorFlow could not be imported.
if not TF_AVAILABLE:
    print("[SKIP] BLOCK 27 — TensorFlow not available")
else:
    MODEL_TYPES = ["lstm", "gru"]
    # Plot settings are shared with the other prediction cells via RUN_PARAMS["plot"].
    PLOT_CFG = RUN_PARAMS["plot"]
    N_PLOT = int(PLOT_CFG["n_plot"])      # number of trailing backtest rows kept for CSV/plot
    FIGSIZE = tuple(PLOT_CFG["figsize"])
    DPI = int(PLOT_CFG["dpi"])

    # Directories
    # NOTE(review): the training cells read processed data through a
    # run-specific proc_dir first, while this cell reads the shared
    # DATA_DIRS_LOCAL["processed"] directory — confirm both hold the same split.
    PROC_DATA_DIR = DATA_DIRS_LOCAL["processed"]
    MODELS_DIR_LOCAL = Path(LOCAL_PATHS["models_dir"])
    PRED_OUT_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions")
    PRED_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions")

    print("[INFO] Neural Network Predictions output dirs:")
    print("  - LOCAL:", PRED_OUT_LOCAL)
    print("  - DRIVE:", PRED_OUT_DRIVE)
    
    # Sequence creation
    def make_sequences_pred_nn(X_2d, y_1d, idx, lookback):
        """Turn a 2-D feature matrix into overlapping look-back windows.

        One window is produced for every row ``t >= lookback - 1``; it holds
        rows ``t - lookback + 1 .. t`` and is paired with ``y_1d[t]`` and
        ``idx[t]``.

        Args:
            X_2d: 2-D array-like of shape (N, F).
            y_1d: 1-D array-like, indexed with the same row positions as X_2d.
            idx: Indexable of date-like labels, one per row of X_2d.
            lookback: Number of consecutive rows per window.

        Returns:
            Tuple ``(X_seq, y_seq, idx_seq)``: a float32 array of shape
            (N - lookback + 1, lookback, F), a float32 array of targets and a
            ``pd.DatetimeIndex`` of the window end labels.

        Raises:
            ValueError: If X_2d has fewer rows than ``lookback``. Without this
                guard the result would be a shape-(0,) array that is not a
                valid 3-D model input.
        """
        X_2d, y_1d = np.asarray(X_2d), np.asarray(y_1d)
        n_rows, _ = X_2d.shape  # unpacking also enforces a 2-D input
        if n_rows < lookback:
            raise ValueError(f"[ERROR] Not enough rows N={n_rows} for lookback={lookback}.")

        # Position of the last row of each window.
        window_ends = range(lookback - 1, n_rows)
        X_seq = [X_2d[t - lookback + 1:t + 1, :] for t in window_ends]
        y_seq = [y_1d[t] for t in window_ends]
        idx_seq = [idx[t] for t in window_ends]
        return (np.asarray(X_seq, dtype=np.float32),
                np.asarray(y_seq, dtype=np.float32),
                pd.DatetimeIndex(idx_seq))
    
    # Load shared data
    # TEST targets and sample weights are shared by every model / feature set below.
    y_test = load_with_fallback("y_test.pkl", PROC_DATA_DIR, DRIVE_PROC_DATA_DIR)
    w_test = load_with_fallback("weights_test.pkl", PROC_DATA_DIR, DRIVE_PROC_DATA_DIR)
    y_test_arr = y_test.astype(float).to_numpy()
    w_test_arr = np.asarray(w_test, dtype=float)

    # Loop over model types
    for model_type in MODEL_TYPES:
        NN_CFG = RUN_PARAMS[model_type]
        LOOKBACK = int(NN_CFG["lookback"])
        FEATURE_SETS = NN_CFG["feature_sets"]

        # One summary row per (model_type, feature_set) that was actually evaluated.
        all_pred_results = []

        print(f"\n{'#'*70}")
        print(f"# PREDICTING WITH {model_type.upper()} MODELS")
        print(f"{'#'*70}")

        for feature_set in FEATURE_SETS:
            print(f"\n{'='*60}")
            print(f"[INFO] Predicting with {model_type.upper()}: {feature_set}")
            print(f"{'='*60}")

            # Load model + scaler
            model_path = MODELS_DIR_LOCAL / f"{model_type}_{feature_set}.keras"
            scaler_path = MODELS_DIR_LOCAL / f"{model_type}_{feature_set}_scaler.pkl"

            if not model_path.exists():
                print(f"[WARN] Model not found: {model_path}, skipping...")
                continue
            # A model without its fitted scaler cannot be evaluated; skip it the
            # same way a missing model is skipped instead of crashing the loop.
            if not scaler_path.exists():
                print(f"[WARN] Scaler not found: {scaler_path}, skipping...")
                continue

            model = keras.models.load_model(model_path)
            scaler = load_pickle(scaler_path)

            # Load X_test
            X_test = load_with_fallback(f"X_test_{feature_set}.pkl", PROC_DATA_DIR, DRIVE_PROC_DATA_DIR, use_pandas=True)
            n_features = X_test.shape[1]
            print(f"[INFO] Loaded: model={model_path.name} | X_test={X_test.shape}")

            # Scale + Sequences (scaler was fitted at training time; only transform here)
            X_test_scaled = scaler.transform(X_test.values)
            X_seq, y_seq, idx_seq = make_sequences_pred_nn(X_test_scaled, y_test_arr, X_test.index, LOOKBACK)
            print(f"[INFO] Sequences: {X_seq.shape}")

            # Predict
            pred_seq = model.predict(X_seq, verbose=0).reshape(-1)
            # Indexed by the date of the last row of each look-back window.
            hist_df = pd.DataFrame({"date": idx_seq, "actual": y_seq, "y_pred": pred_seq}).set_index("date")

            # Tomorrow prediction
            # NOTE(review): this window is the same rows as the last backtest
            # window (X_seq[-1]), so pred_tomorrow duplicates pred_seq[-1] —
            # confirm that the last TEST row is the intended "tomorrow" input.
            last_date = X_test.index[-1]
            X_last_window = X_test_scaled[-LOOKBACK:, :].reshape(1, LOOKBACK, -1).astype(np.float32)
            pred_tomorrow = float(model.predict(X_last_window, verbose=0).reshape(-1)[0])
            # Place the marker on the next business day so it matches the
            # "next_trading_day" label written to tomorrow.csv; a plain +1 day
            # would land on a weekend after a Friday close. Exchange holidays
            # are not accounted for.
            pred_tomorrow_date = last_date + pd.offsets.BDay(1)

            # Metrics
            # Weights are trimmed to the rows that have a full look-back window.
            w_seq = w_test_arr[LOOKBACK - 1:]
            w_norm = w_seq / (w_seq.sum() + EPS)
            test_wrmse = float(np.sqrt(np.sum(w_norm * (y_seq - pred_seq) ** 2)))
            test_dir_acc = float(np.mean((y_seq > 0) == (pred_seq > 0)))

            print(f"[INFO] TEST wRMSE={test_wrmse:.6f} | DirAcc={test_dir_acc:.4f}")
            print(f"[INFO] Tomorrow prediction: {pred_tomorrow:.6f} ({np.expm1(pred_tomorrow)*100:.4f}%)")

            result = {
                "model_type": model_type,
                "feature_set": feature_set,
                "n_features": n_features,
                "test_wrmse": test_wrmse,
                "test_dir_acc": test_dir_acc,
                "last_data_date": str(last_date.date()),
                "pred_tomorrow_logret": pred_tomorrow,
                "pred_tomorrow_pct": float(np.expm1(pred_tomorrow) * 100),
            }
            all_pred_results.append(result)

            # Save -> predictions/{model_type}_{feature_set}/
            PRED_NN_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions" / f"{model_type}_{feature_set}")
            PRED_NN_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions" / f"{model_type}_{feature_set}")

            pred_tomorrow_df = pd.DataFrame([{
                "feature_set": feature_set, "last_data_date": last_date,
                "predicted_for": "next_trading_day",
                "pred_logret": pred_tomorrow,
                "pred_return_pct": float(np.expm1(pred_tomorrow) * 100),
            }])
            pred_tomorrow_df.to_csv(PRED_NN_LOCAL / "tomorrow.csv", index=False)
            copy_file(PRED_NN_LOCAL / "tomorrow.csv", PRED_NN_DRIVE / "tomorrow.csv")

            # Only the most recent N_PLOT rows are persisted and plotted.
            hist_tail = hist_df.tail(N_PLOT).copy()
            hist_tail.to_csv(PRED_NN_LOCAL / "backtest.csv")
            copy_file(PRED_NN_LOCAL / "backtest.csv", PRED_NN_DRIVE / "backtest.csv")

            # Plot
            fig, ax = plt.subplots(figsize=FIGSIZE)
            ax.plot(hist_tail.index, hist_tail["actual"].values, linewidth=1, label="Actual")
            ax.plot(hist_tail.index, hist_tail["y_pred"].values, linewidth=1, label="Predicted (y_pred)")
            ax.scatter([pred_tomorrow_date], [pred_tomorrow], s=90, marker="X", color="red", label=f"Tomorrow: {pred_tomorrow:.4f}")
            ax.axhline(0.0, color="gray", linewidth=0.5, linestyle="--")
            ax.set_title(f"{model_type.upper()} Predictions — {feature_set} — last {len(hist_tail)} days + tomorrow")
            ax.set_xlabel("Date")
            ax.set_ylabel("Log Return")
            ax.legend(loc="upper right")
            plt.tight_layout()
            plt.savefig(PRED_NN_LOCAL / "plot.png", dpi=DPI)
            copy_file(PRED_NN_LOCAL / "plot.png", PRED_NN_DRIVE / "plot.png")
            # Close the figure so figures do not accumulate across iterations.
            plt.close(fig)

            print(f"[OK] Saved: predictions/{model_type}_{feature_set}/ (tomorrow.csv, backtest.csv, plot.png)")

        # Summary for this model type
        pred_summary_df = pd.DataFrame(all_pred_results)
        print(f"\n{'='*60}")
        print(f"[INFO] {model_type.upper()} PREDICTION SUMMARY")
        print(f"{'='*60}")
        display(pred_summary_df)

        summary_path = PRED_OUT_LOCAL / f"{model_type}_predictions_summary.csv"
        pred_summary_df.to_csv(summary_path, index=False)
        copy_file(summary_path, PRED_OUT_DRIVE / summary_path.name)
        print(f"[OK] Saved summary: {summary_path.name}")

    print("[OK] BLOCK 27 complete.")

---
# SECTION 9: Hybrid Neural Networks

**Sequential and Parallel hybrid architectures**

**Blocks:** 28-29

## BLOCK 28 — HYBRID NEURAL NETWORK TRAINING (Sequential + Parallel)

In [None]:

# BLOCK 28: train the two hybrid recurrent architectures (sequential LSTM->GRU
# and parallel LSTM||GRU) for every configured feature set. The whole cell is
# skipped when TensorFlow could not be imported.
if not TF_AVAILABLE:
    print("[SKIP] BLOCK 28 — TensorFlow not available")
else:
    # Config
    HYBRID_TYPES = ["hybrid_seq", "hybrid_par"]

    # Directories
    # Fallback: RUN_ID_LOCAL -> data/processed_LOCAL -> RUN_ID_DRIVE -> data/processed_DRIVE
    RUN_PROC_LOCAL = Path(LOCAL_PATHS["proc_dir"])
    FALLBACK_PROC_LOCAL = DATA_DIRS_LOCAL["processed"]
    RUN_PROC_DRIVE = Path(DRIVE_PATHS["proc_dir"])
    FALLBACK_PROC_DRIVE = DATA_DIRS_DRIVE["processed"]
    # NOTE(review): BLOCK 29 loads the saved models from
    # LOCAL_PATHS["models_dir"] — confirm MODELS_DIR points at the same directory.
    MODELS_OUT_LOCAL = ensure_dir(Path(MODELS_DIR))
    MODELS_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["models_dir"]))

    print("[INFO] Hybrid Neural Network output dirs:")
    print("  - LOCAL:", MODELS_OUT_LOCAL)
    print("  - DRIVE:", MODELS_OUT_DRIVE)

    # -------------------------
    # Load shared data (y, weights)
    # -------------------------
    # Targets and sample weights do not depend on the feature set, so they are
    # loaded once and reused for every model trained below.
    y_train_t1 = load_with_fallback("y_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
    y_valid_t1 = load_with_fallback("y_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
    y_test_t1 = load_with_fallback("y_test.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)

    w_train = load_with_fallback("weights_train.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
    w_valid = load_with_fallback("weights_valid.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)
    w_test = load_with_fallback("weights_test.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE)

    # Plain float numpy arrays for the targets ...
    y_train = y_train_t1.astype(float).to_numpy()
    y_valid = y_valid_t1.astype(float).to_numpy()
    y_test = y_test_t1.astype(float).to_numpy()

    # ... and for the per-row sample weights.
    w_train_np = np.asarray(w_train, dtype=float)
    w_valid_np = np.asarray(w_valid, dtype=float)
    w_test_np = np.asarray(w_test, dtype=float)

    # -------------------------
    # Metrics
    # -------------------------
    # Metric functions (w_rmse, w_mae, dir_acc) defined in Cell 5

    # -------------------------
    # VALID split function (date-based)
    # -------------------------
    # Date boundaries of the two VALID sub-windows: ES feeds Keras early
    # stopping (val_loss), SCORE is used for checkpoint selection and reporting.
    HPO_CFG = RUN_PARAMS["hpo"]
    VALID_ES_START = HPO_CFG["valid_es_start"]
    VALID_ES_END = HPO_CFG["valid_es_end"]
    VALID_SCORE_START = HPO_CFG["valid_score_start"]
    VALID_SCORE_END = HPO_CFG["valid_score_end"]
    
    def split_valid_es_score_hybrid(Xv_df, yv, wv):
        """Split the VALID set into an early-stopping part and a scoring part.

        The split is done by date using the module-level VALID_ES_* and
        VALID_SCORE_* boundaries (both ends inclusive).

        Args:
            Xv_df: VALID feature frame; must carry a DatetimeIndex for the
                date-based split to apply.
            yv: Targets aligned row-by-row with Xv_df.
            wv: Sample weights aligned row-by-row with Xv_df.

        Returns:
            ``((X_es, y_es, w_es), (X_sc, y_sc, w_sc), mode)``. When the index
            is not a DatetimeIndex, or either window selects no rows, the full
            VALID set is returned for both parts and ``mode`` is "FULL_VALID".
        """
        whole_set = ((Xv_df, yv, wv), (Xv_df, yv, wv), "FULL_VALID")
        dates = Xv_df.index
        if not isinstance(dates, pd.DatetimeIndex):
            return whole_set

        # Re-index targets/weights by date so the boolean masks can slice them.
        targets = pd.Series(yv, index=dates)
        sample_w = pd.Series(wv, index=dates)

        es_lo = pd.Timestamp(VALID_ES_START)
        es_hi = pd.Timestamp(VALID_ES_END)
        score_lo = pd.Timestamp(VALID_SCORE_START)
        score_hi = pd.Timestamp(VALID_SCORE_END)
        in_es = (dates >= es_lo) & (dates <= es_hi)
        in_score = (dates >= score_lo) & (dates <= score_hi)

        es_part = (
            Xv_df.loc[in_es],
            targets.loc[in_es].to_numpy(float),
            sample_w.loc[in_es].to_numpy(float),
        )
        score_part = (
            Xv_df.loc[in_score],
            targets.loc[in_score].to_numpy(float),
            sample_w.loc[in_score].to_numpy(float),
        )

        # Fall back to the whole VALID set if either window came out empty.
        if len(es_part[0]) == 0 or len(score_part[0]) == 0:
            return whole_set
        description = f"VALID_ES={VALID_ES_START}:{VALID_ES_END} / VALID_SCORE={VALID_SCORE_START}:{VALID_SCORE_END}"
        return es_part, score_part, description
    
    # -------------------------
    # Sequence creation function
    # -------------------------
    def make_sequences_eod_hybrid(X_2d, y_1d, w_1d, idx, lookback, stride=1):
        """Build look-back windows, one ending on every `stride`-th row.

        Args:
            X_2d: 2-D array-like of shape (N, F).
            y_1d: Targets, indexed with the same row positions as X_2d.
            w_1d: Sample weights, indexed with the same row positions as X_2d.
            idx: Indexable of date-like labels, one per row.
            lookback: Number of consecutive rows in each window.
            stride: Step between consecutive window end positions.

        Returns:
            ``(X_seq, y_seq, w_seq, idx_seq)``: float32 arrays of windows,
            targets and weights, plus a ``pd.DatetimeIndex`` holding the label
            of the last row of each window.

        Raises:
            ValueError: If X_2d has fewer rows than ``lookback``.
        """
        features = np.asarray(X_2d)
        targets = np.asarray(y_1d)
        weights = np.asarray(w_1d)
        n_rows, _n_cols = features.shape
        if n_rows < lookback:
            raise ValueError(f"[ERROR] Not enough rows N={n_rows} for lookback={lookback}.")

        # Position of the last row of every window.
        window_ends = range(lookback - 1, n_rows, stride)
        windows = [features[end - lookback + 1:end + 1, :] for end in window_ends]
        window_targets = [targets[end] for end in window_ends]
        window_weights = [weights[end] for end in window_ends]
        window_labels = [idx[end] for end in window_ends]
        return (
            np.asarray(windows, dtype=np.float32),
            np.asarray(window_targets, dtype=np.float32),
            np.asarray(window_weights, dtype=np.float32),
            pd.DatetimeIndex(window_labels),
        )
    
    # -------------------------
    # Loop over hybrid types (Sequential, Parallel)
    # -------------------------
    # Train one model per (hybrid architecture, feature set) pair, evaluate it on
    # VALID_SCORE and TEST, then persist model, scaler, config and predictions.
    for hybrid_type in HYBRID_TYPES:
        # Hyper-parameters for this architecture come from RUN_PARAMS[hybrid_type];
        # numeric entries are cast explicitly before use.
        HYB_CFG = RUN_PARAMS[hybrid_type]
        LOOKBACK = int(HYB_CFG["lookback"])
        STRIDE = int(HYB_CFG["stride"])
        LSTM_UNITS = int(HYB_CFG["lstm_units"])
        GRU_UNITS = int(HYB_CFG["gru_units"])
        DROPOUT = float(HYB_CFG["dropout"])
        LR = float(HYB_CFG["learning_rate"])
        EPOCHS = int(HYB_CFG["epochs"])
        BATCH_SIZE = int(HYB_CFG["batch_size"])
        PATIENCE = int(HYB_CFG["patience"])
        RANDOM_SEED = int(HYB_CFG["random_state"])
        FEATURE_SETS = HYB_CFG["feature_sets"]
        DENSE_UNITS = int(HYB_CFG["dense_units"])
        CLIPNORM = float(HYB_CFG["clipnorm"])
        LOSS = HYB_CFG["loss"]
        DENSE_ACT = HYB_CFG["dense_activation"]
        OUTPUT_ACT = HYB_CFG["output_activation"]

        # One result row per feature set; reset for each architecture.
        all_hybrid_results = []

        arch_name = "Sequential (LSTM→GRU)" if hybrid_type == "hybrid_seq" else "Parallel (LSTM∥GRU)"
        print(f"\n{'#'*70}")
        print(f"# TRAINING {hybrid_type.upper()} — {arch_name}")
        print(f"{'#'*70}")

        for feature_set in FEATURE_SETS:
            print(f"\n{'='*60}")
            print(f"[INFO] Training {hybrid_type.upper()} with feature set: {feature_set}")
            print(f"{'='*60}")

            # Load feature-specific X matrices
            X_train_nn = load_with_fallback(f"X_train_{feature_set}.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)
            X_valid_nn = load_with_fallback(f"X_valid_{feature_set}.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)
            X_test_nn = load_with_fallback(f"X_test_{feature_set}.pkl", RUN_PROC_LOCAL, FALLBACK_PROC_LOCAL, RUN_PROC_DRIVE, FALLBACK_PROC_DRIVE, use_pandas=True)

            n_features = X_train_nn.shape[1]
            print(f"[INFO] Shapes: TRAIN={X_train_nn.shape} | VALID={X_valid_nn.shape} | TEST={X_test_nn.shape}")

            # Split VALID -> ES + SCORE
            (X_valid_es_df, y_valid_es, w_valid_es), (X_valid_sc_df, y_valid_sc, w_valid_sc), valid_mode = split_valid_es_score_hybrid(
                X_valid_nn, y_valid, w_valid_np
            )
            print(f"[INFO] VALID mode: {valid_mode}")

            # Scaling (fit on TRAIN only)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train_nn.values)
            X_valid_es_scaled = scaler.transform(X_valid_es_df.values)
            X_valid_sc_scaled = scaler.transform(X_valid_sc_df.values)
            X_test_scaled = scaler.transform(X_test_nn.values)

            # Create sequences
            # Each split is windowed independently, so no window spans two splits.
            Xtr_seq, ytr_seq, wtr_seq, idx_tr = make_sequences_eod_hybrid(X_train_scaled, y_train, w_train_np, X_train_nn.index, LOOKBACK, STRIDE)
            Xes_seq, yes_seq, wes_seq, idx_es = make_sequences_eod_hybrid(X_valid_es_scaled, y_valid_es, w_valid_es, X_valid_es_df.index, LOOKBACK, STRIDE)
            Xsc_seq, ysc_seq, wsc_seq, idx_sc = make_sequences_eod_hybrid(X_valid_sc_scaled, y_valid_sc, w_valid_sc, X_valid_sc_df.index, LOOKBACK, STRIDE)
            Xte_seq, yte_seq, wte_seq, idx_te = make_sequences_eod_hybrid(X_test_scaled, y_test, w_test_np, X_test_nn.index, LOOKBACK, STRIDE)

            print(f"[INFO] Sequence shapes: TRAIN={Xtr_seq.shape} | VALID_ES={Xes_seq.shape} | VALID_SCORE={Xsc_seq.shape} | TEST={Xte_seq.shape}")

            # Build model
            # Seed is reset before every build so each model starts from the
            # configured random state.
            tf.keras.utils.set_random_seed(RANDOM_SEED)

            inp = keras.Input(shape=(LOOKBACK, n_features))

            if hybrid_type == "hybrid_seq":
                # Sequential: Input → LSTM → LayerNorm → GRU → Dense → Output
                x = layers.LSTM(LSTM_UNITS, return_sequences=True, dropout=DROPOUT)(inp)
                x = layers.LayerNormalization()(x)
                x = layers.GRU(GRU_UNITS, return_sequences=False, dropout=DROPOUT)(x)
            else:
                # Parallel: Input → [LSTM, GRU] → Concat → Dense → Output
                lstm_out = layers.LSTM(LSTM_UNITS, return_sequences=False, dropout=DROPOUT)(inp)
                gru_out = layers.GRU(GRU_UNITS, return_sequences=False, dropout=DROPOUT)(inp)
                x = layers.Concatenate()([lstm_out, gru_out])

            # Shared head: Dense -> Dropout -> single-unit output.
            x = layers.Dense(DENSE_UNITS, activation=DENSE_ACT)(x)
            x = layers.Dropout(DROPOUT)(x)
            out = layers.Dense(1, activation=OUTPUT_ACT)(x)

            model = keras.Model(inp, out)
            model.compile(optimizer=keras.optimizers.Adam(learning_rate=LR, clipnorm=CLIPNORM), loss=LOSS)
            print(f"[INFO] Model built: {model.count_params()} parameters")

            # Custom callback for SCORE set tracking
            # After every epoch it scores the model on the VALID_SCORE sequences
            # with w_rmse and remembers the weights of the best-scoring epoch.
            class ScoreSetCallback(keras.callbacks.Callback):
                def __init__(self, X_score, y_score, w_score):
                    super().__init__()
                    self.Xs, self.ys, self.ws = X_score, y_score, w_score
                    self.best = np.inf          # lowest wRMSE seen so far
                    self.best_weights = None    # weights captured at that epoch
                def on_epoch_end(self, epoch, logs=None):
                    pred = self.model.predict(self.Xs, verbose=0).reshape(-1)
                    score = w_rmse(self.ys, pred, self.ws)
                    if score < self.best:
                        self.best = score
                        self.best_weights = self.model.get_weights()

            score_cb = ScoreSetCallback(Xsc_seq, ysc_seq, wsc_seq)
            # Early stopping watches val_loss on the VALID_ES sequences; the score
            # callback tracks VALID_SCORE in parallel.
            callbacks = [
                keras.callbacks.EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True),
                score_cb,
            ]

            # Train
            print(f"[INFO] Training {hybrid_type.upper()} (epochs={EPOCHS}, batch_size={BATCH_SIZE}, patience={PATIENCE})...")
            history = model.fit(
                Xtr_seq, ytr_seq,
                sample_weight=wtr_seq,
                validation_data=(Xes_seq, yes_seq, wes_seq),
                epochs=EPOCHS,
                batch_size=BATCH_SIZE,
                verbose=0,
                callbacks=callbacks
            )

            # Restore best weights by SCORE
            # This overrides whatever weights EarlyStopping left on the model.
            # NOTE(review): the checkpoint is chosen on VALID_SCORE and the
            # model_valid_* metrics below are computed on the same data, so they
            # are optimistically biased; TEST is not used for selection.
            if score_cb.best_weights is not None:
                model.set_weights(score_cb.best_weights)
                print(f"[INFO] Restored best weights by VALID_SCORE wRMSE = {score_cb.best:.6f}")

            # Evaluate
            pred_sc = model.predict(Xsc_seq, verbose=0).reshape(-1)
            pred_te = model.predict(Xte_seq, verbose=0).reshape(-1)
            # Baseline: always predict zero.
            baseline_sc, baseline_te = np.zeros_like(ysc_seq), np.zeros_like(yte_seq)

            # Metrics
            results = {
                "model_type": hybrid_type,
                "feature_set": feature_set,
                "n_features": n_features,
                "valid_mode": valid_mode,
                "epochs_trained": len(history.history["loss"]),
                # Config params for comparison
                "hybrid_lstm_units": LSTM_UNITS,
                "hybrid_gru_units": GRU_UNITS,
                "dropout": DROPOUT,
                "learning_rate": LR,
                "lookback": LOOKBACK,
                "epochs": EPOCHS,
                "batch_size": BATCH_SIZE,
                "baseline_valid_wrmse": w_rmse(ysc_seq, baseline_sc, wsc_seq),
                "baseline_valid_diracc": dir_acc(ysc_seq, baseline_sc),
                "baseline_test_wrmse": w_rmse(yte_seq, baseline_te, wte_seq),
                "baseline_test_diracc": dir_acc(yte_seq, baseline_te),
                "model_valid_wrmse": w_rmse(ysc_seq, pred_sc, wsc_seq),
                "model_valid_wmae": w_mae(ysc_seq, pred_sc, wsc_seq),
                "model_valid_diracc": dir_acc(ysc_seq, pred_sc),
                "model_test_wrmse": w_rmse(yte_seq, pred_te, wte_seq),
                "model_test_wmae": w_mae(yte_seq, pred_te, wte_seq),
                "model_test_diracc": dir_acc(yte_seq, pred_te),
            }
            # Positive improvement means the model beats the zero baseline.
            results["valid_wrmse_improvement"] = results["baseline_valid_wrmse"] - results["model_valid_wrmse"]
            results["test_wrmse_improvement"] = results["baseline_test_wrmse"] - results["model_test_wrmse"]
            all_hybrid_results.append(results)

            print(f"\n[RESULT] {hybrid_type.upper()} | {feature_set} | n_features={n_features}")
            print(f"  BASELINE VALID_SCORE: wRMSE={results['baseline_valid_wrmse']:.6f} | DirAcc={results['baseline_valid_diracc']:.4f}")
            print(f"  MODEL    VALID_SCORE: wRMSE={results['model_valid_wrmse']:.6f} | DirAcc={results['model_valid_diracc']:.4f}")
            print(f"  MODEL    TEST:        wRMSE={results['model_test_wrmse']:.6f} | DirAcc={results['model_test_diracc']:.4f}")
            print(f"  Improvement (VALID):  {results['valid_wrmse_improvement']:.6f}")

            # Save model + scaler + config
            # Written locally first, then mirrored to the Drive models directory.
            model_path = MODELS_OUT_LOCAL / f"{hybrid_type}_{feature_set}.keras"
            scaler_path = MODELS_OUT_LOCAL / f"{hybrid_type}_{feature_set}_scaler.pkl"
            config_path = MODELS_OUT_LOCAL / f"{hybrid_type}_{feature_set}_config.json"
            model.save(model_path)
            save_pickle(scaler, scaler_path)
            save_json(HYB_CFG, config_path)
            copy_file(model_path, MODELS_OUT_DRIVE / model_path.name)
            copy_file(scaler_path, MODELS_OUT_DRIVE / scaler_path.name)
            copy_file(config_path, MODELS_OUT_DRIVE / config_path.name)

            # Predictions -> predictions/{hybrid_type}_{feature_set}/
            PRED_HYB_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions" / f"{hybrid_type}_{feature_set}")
            PRED_HYB_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions" / f"{hybrid_type}_{feature_set}")

            # Per-row predictions; "date" is the label of each window's last row.
            preds_valid_df = pd.DataFrame({
                "date": idx_sc, "actual": ysc_seq, "baseline_zero": baseline_sc,
                "predicted": pred_sc, "sample_weight": wsc_seq,
            }).reset_index(drop=True)
            preds_test_df = pd.DataFrame({
                "date": idx_te, "actual": yte_seq, "baseline_zero": baseline_te,
                "predicted": pred_te, "sample_weight": wte_seq,
            }).reset_index(drop=True)

            preds_valid_df.to_csv(PRED_HYB_LOCAL / "predictions_valid.csv", index=False)
            preds_test_df.to_csv(PRED_HYB_LOCAL / "predictions_test.csv", index=False)
            copy_file(PRED_HYB_LOCAL / "predictions_valid.csv", PRED_HYB_DRIVE / "predictions_valid.csv")
            copy_file(PRED_HYB_LOCAL / "predictions_test.csv", PRED_HYB_DRIVE / "predictions_test.csv")

            print(f"[OK] Saved: {model_path.name}, {scaler_path.name}")
            print(f"[OK] Predictions: predictions/{hybrid_type}_{feature_set}/")

        # Summary for this hybrid type
        hybrid_results_df = pd.DataFrame(all_hybrid_results)
        print(f"\n{'='*60}")
        print(f"[INFO] {hybrid_type.upper()} TRAINING SUMMARY")
        print(f"{'='*60}")
        display(hybrid_results_df[["feature_set", "n_features", "model_valid_wrmse", "model_test_wrmse", "valid_wrmse_improvement"]])

        summary_path = MODELS_OUT_LOCAL / f"{hybrid_type}_summary.csv"
        hybrid_results_df.to_csv(summary_path, index=False)
        copy_file(summary_path, MODELS_OUT_DRIVE / summary_path.name)
        print(f"[OK] Saved {hybrid_type.upper()} summary: {summary_path.name}")

    print("[OK] BLOCK 28 complete.")

## BLOCK 29 — HYBRID NEURAL NETWORK PREDICT TOMORROW + BACKTEST PLOT

In [None]:

# BLOCK 29: reload every saved hybrid model, backtest it on the TEST split and
# emit a one-step-ahead ("tomorrow") prediction with a plot. The whole cell is
# skipped when TensorFlow could not be imported.
if not TF_AVAILABLE:
    print("[SKIP] BLOCK 29 — TensorFlow not available")
else:
    HYBRID_TYPES = ["hybrid_seq", "hybrid_par"]
    # Plot settings are shared with the other prediction cells via RUN_PARAMS["plot"].
    PLOT_CFG = RUN_PARAMS["plot"]
    N_PLOT = int(PLOT_CFG["n_plot"])      # number of trailing backtest rows kept for CSV/plot
    FIGSIZE = tuple(PLOT_CFG["figsize"])
    DPI = int(PLOT_CFG["dpi"])

    # Directories
    # NOTE(review): BLOCK 28 reads processed data through the run-specific
    # proc_dir first and saves models under MODELS_DIR, while this cell reads
    # DATA_DIRS_LOCAL["processed"] and LOCAL_PATHS["models_dir"] — confirm they
    # refer to the same data and models.
    PROC_DATA_DIR = DATA_DIRS_LOCAL["processed"]
    MODELS_DIR_LOCAL = Path(LOCAL_PATHS["models_dir"])
    PRED_OUT_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions")
    PRED_OUT_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions")

    print("[INFO] Hybrid Predictions output dirs:")
    print("  - LOCAL:", PRED_OUT_LOCAL)
    print("  - DRIVE:", PRED_OUT_DRIVE)
    
    # Sequence creation
    def make_sequences_pred_hybrid(X_2d, y_1d, idx, lookback):
        """Turn a 2-D feature matrix into overlapping look-back windows.

        One window is produced for every row ``t >= lookback - 1``; it holds
        rows ``t - lookback + 1 .. t`` and is paired with ``y_1d[t]`` and
        ``idx[t]``.

        Args:
            X_2d: 2-D array-like of shape (N, F).
            y_1d: 1-D array-like, indexed with the same row positions as X_2d.
            idx: Indexable of date-like labels, one per row of X_2d.
            lookback: Number of consecutive rows per window.

        Returns:
            Tuple ``(X_seq, y_seq, idx_seq)``: a float32 array of shape
            (N - lookback + 1, lookback, F), a float32 array of targets and a
            ``pd.DatetimeIndex`` of the window end labels.

        Raises:
            ValueError: If X_2d has fewer rows than ``lookback``. Without this
                guard the result would be a shape-(0,) array that is not a
                valid 3-D model input.
        """
        X_2d, y_1d = np.asarray(X_2d), np.asarray(y_1d)
        n_rows, _ = X_2d.shape  # unpacking also enforces a 2-D input
        if n_rows < lookback:
            raise ValueError(f"[ERROR] Not enough rows N={n_rows} for lookback={lookback}.")

        # Position of the last row of each window.
        window_ends = range(lookback - 1, n_rows)
        X_seq = [X_2d[t - lookback + 1:t + 1, :] for t in window_ends]
        y_seq = [y_1d[t] for t in window_ends]
        idx_seq = [idx[t] for t in window_ends]
        return (np.asarray(X_seq, dtype=np.float32),
                np.asarray(y_seq, dtype=np.float32),
                pd.DatetimeIndex(idx_seq))
    
    # Load shared data
    # TEST targets and sample weights are shared by every model / feature set below.
    y_test = load_with_fallback("y_test.pkl", PROC_DATA_DIR, DRIVE_PROC_DATA_DIR)
    w_test = load_with_fallback("weights_test.pkl", PROC_DATA_DIR, DRIVE_PROC_DATA_DIR)
    y_test_arr = y_test.astype(float).to_numpy()
    w_test_arr = np.asarray(w_test, dtype=float)

    # Loop over hybrid types
    for hybrid_type in HYBRID_TYPES:
        HYB_CFG = RUN_PARAMS[hybrid_type]
        LOOKBACK = int(HYB_CFG["lookback"])
        FEATURE_SETS = HYB_CFG["feature_sets"]

        # One summary row per (hybrid_type, feature_set) that was actually evaluated.
        all_pred_results = []

        arch_name = "Sequential (LSTM→GRU)" if hybrid_type == "hybrid_seq" else "Parallel (LSTM∥GRU)"
        print(f"\n{'#'*70}")
        print(f"# PREDICTING WITH {hybrid_type.upper()} — {arch_name}")
        print(f"{'#'*70}")

        for feature_set in FEATURE_SETS:
            print(f"\n{'='*60}")
            print(f"[INFO] Predicting with {hybrid_type.upper()}: {feature_set}")
            print(f"{'='*60}")

            # Load model + scaler
            model_path = MODELS_DIR_LOCAL / f"{hybrid_type}_{feature_set}.keras"
            scaler_path = MODELS_DIR_LOCAL / f"{hybrid_type}_{feature_set}_scaler.pkl"

            if not model_path.exists():
                print(f"[WARN] Model not found: {model_path}, skipping...")
                continue
            # A model without its fitted scaler cannot be evaluated; skip it the
            # same way a missing model is skipped instead of crashing the loop.
            if not scaler_path.exists():
                print(f"[WARN] Scaler not found: {scaler_path}, skipping...")
                continue

            model = keras.models.load_model(model_path)
            scaler = load_pickle(scaler_path)

            # Load X_test
            X_test = load_with_fallback(f"X_test_{feature_set}.pkl", PROC_DATA_DIR, DRIVE_PROC_DATA_DIR, use_pandas=True)
            n_features = X_test.shape[1]
            print(f"[INFO] Loaded: model={model_path.name} | X_test={X_test.shape}")

            # Scale + Sequences (scaler was fitted at training time; only transform here)
            X_test_scaled = scaler.transform(X_test.values)
            X_seq, y_seq, idx_seq = make_sequences_pred_hybrid(X_test_scaled, y_test_arr, X_test.index, LOOKBACK)
            print(f"[INFO] Sequences: {X_seq.shape}")

            # Predict
            pred_seq = model.predict(X_seq, verbose=0).reshape(-1)
            # Indexed by the date of the last row of each look-back window.
            hist_df = pd.DataFrame({"date": idx_seq, "actual": y_seq, "y_pred": pred_seq}).set_index("date")

            # Tomorrow prediction
            # NOTE(review): this window is the same rows as the last backtest
            # window (X_seq[-1]), so pred_tomorrow duplicates pred_seq[-1] —
            # confirm that the last TEST row is the intended "tomorrow" input.
            last_date = X_test.index[-1]
            X_last_window = X_test_scaled[-LOOKBACK:, :].reshape(1, LOOKBACK, -1).astype(np.float32)
            pred_tomorrow = float(model.predict(X_last_window, verbose=0).reshape(-1)[0])
            # Place the marker on the next business day so it matches the
            # "next_trading_day" label written to tomorrow.csv; a plain +1 day
            # would land on a weekend after a Friday close. Exchange holidays
            # are not accounted for.
            pred_tomorrow_date = last_date + pd.offsets.BDay(1)

            # Metrics
            # Weights are trimmed to the rows that have a full look-back window.
            w_seq = w_test_arr[LOOKBACK - 1:]
            w_norm = w_seq / (w_seq.sum() + EPS)
            test_wrmse = float(np.sqrt(np.sum(w_norm * (y_seq - pred_seq) ** 2)))
            test_dir_acc = float(np.mean((y_seq > 0) == (pred_seq > 0)))

            print(f"[INFO] TEST wRMSE={test_wrmse:.6f} | DirAcc={test_dir_acc:.4f}")
            print(f"[INFO] Tomorrow prediction: {pred_tomorrow:.6f} ({np.expm1(pred_tomorrow)*100:.4f}%)")

            result = {
                "model_type": hybrid_type,
                "feature_set": feature_set,
                "n_features": n_features,
                "test_wrmse": test_wrmse,
                "test_dir_acc": test_dir_acc,
                "last_data_date": str(last_date.date()),
                "pred_tomorrow_logret": pred_tomorrow,
                "pred_tomorrow_pct": float(np.expm1(pred_tomorrow) * 100),
            }
            all_pred_results.append(result)

            # Save -> predictions/{hybrid_type}_{feature_set}/
            PRED_HYB_LOCAL = ensure_dir(Path(LOCAL_PATHS["run_dir"]) / "predictions" / f"{hybrid_type}_{feature_set}")
            PRED_HYB_DRIVE = ensure_dir(Path(DRIVE_PATHS["run_dir"]) / "predictions" / f"{hybrid_type}_{feature_set}")

            pred_tomorrow_df = pd.DataFrame([{
                "feature_set": feature_set, "last_data_date": last_date,
                "predicted_for": "next_trading_day",
                "pred_logret": pred_tomorrow,
                "pred_return_pct": float(np.expm1(pred_tomorrow) * 100),
            }])
            pred_tomorrow_df.to_csv(PRED_HYB_LOCAL / "tomorrow.csv", index=False)
            copy_file(PRED_HYB_LOCAL / "tomorrow.csv", PRED_HYB_DRIVE / "tomorrow.csv")

            # Only the most recent N_PLOT rows are persisted and plotted.
            hist_tail = hist_df.tail(N_PLOT).copy()
            hist_tail.to_csv(PRED_HYB_LOCAL / "backtest.csv")
            copy_file(PRED_HYB_LOCAL / "backtest.csv", PRED_HYB_DRIVE / "backtest.csv")

            # Plot
            fig, ax = plt.subplots(figsize=FIGSIZE)
            ax.plot(hist_tail.index, hist_tail["actual"].values, linewidth=1, label="Actual")
            ax.plot(hist_tail.index, hist_tail["y_pred"].values, linewidth=1, label="Predicted (y_pred)")
            ax.scatter([pred_tomorrow_date], [pred_tomorrow], s=90, marker="X", color="red", label=f"Tomorrow: {pred_tomorrow:.4f}")
            ax.axhline(0.0, color="gray", linewidth=0.5, linestyle="--")
            ax.set_title(f"{hybrid_type.upper()} Predictions — {feature_set} — last {len(hist_tail)} days + tomorrow")
            ax.set_xlabel("Date")
            ax.set_ylabel("Log Return")
            ax.legend(loc="upper right")
            plt.tight_layout()
            plt.savefig(PRED_HYB_LOCAL / "plot.png", dpi=DPI)
            copy_file(PRED_HYB_LOCAL / "plot.png", PRED_HYB_DRIVE / "plot.png")
            # Close the figure so figures do not accumulate across iterations.
            plt.close(fig)

            print(f"[OK] Saved: predictions/{hybrid_type}_{feature_set}/ (tomorrow.csv, backtest.csv, plot.png)")

        # Summary for this hybrid type
        pred_summary_df = pd.DataFrame(all_pred_results)
        print(f"\n{'='*60}")
        print(f"[INFO] {hybrid_type.upper()} PREDICTION SUMMARY")
        print(f"{'='*60}")
        display(pred_summary_df)

        summary_path = PRED_OUT_LOCAL / f"{hybrid_type}_predictions_summary.csv"
        pred_summary_df.to_csv(summary_path, index=False)
        copy_file(summary_path, PRED_OUT_DRIVE / summary_path.name)
        print(f"[OK] Saved summary: {summary_path.name}")

    print("[OK] BLOCK 29 complete.")

---
# SECTION 9B: Ensemble Model

**Combine predictions from all models**

Methods:
- Simple Average
- Weighted Average (by inverse wRMSE)
- Stacking (meta-model)
- Rank Average

**Prerequisites:** Run Sections 7, 7B, 8, 9 first

## BLOCK 30 — ENSEMBLE MODEL

In [None]:

# ============================================================
# BLOCK 30 — ENSEMBLE MODEL
# ============================================================
# Supports: simple_average, weighted_average, rank_average, stacking
# Weight methods: inverse_wrmse, inverse_wrmse_squared

# Config
# Every setting is optional: a missing "ensemble" section or a missing key falls
# back to the default given here.
ENSEMBLE_CFG = RUN_PARAMS.get("ensemble", {})
ENSEMBLE_METHOD = ENSEMBLE_CFG.get("method", "weighted_average")
ENSEMBLE_MODELS = ENSEMBLE_CFG.get("models", ["xgb", "lgb", "lstm", "gru", "hybrid_seq", "hybrid_par"])
WEIGHT_METHOD = ENSEMBLE_CFG.get("weight_method", "inverse_wrmse")

print(f"[INFO] Ensemble method: {ENSEMBLE_METHOD}")
print(f"[INFO] Weight method: {WEIGHT_METHOD}")
print(f"[INFO] Models to combine: {ENSEMBLE_MODELS}")

# Directories
# Output directories are created up front; the model directories are only
# referenced here, not created.
MODELS_DIR_LOCAL = Path(LOCAL_PATHS["models_dir"])
MODELS_DIR_DRIVE = Path(DRIVE_PATHS["models_dir"])
OUTPUTS_LOCAL = ensure_dir(Path(LOCAL_PATHS["outputs_dir"]))
OUTPUTS_DRIVE = ensure_dir(Path(DRIVE_PATHS["outputs_dir"]))

# -------------------------
# 1. Load predictions from all models
# -------------------------
print("\n[INFO] Loading predictions...")

def load_predictions_ensemble(run_dir: Path, split: str, models: list) -> Tuple[pd.DataFrame, Optional[np.ndarray], Optional[np.ndarray]]:
    """Load per-model prediction CSVs for one split and combine them column-wise.

    For each requested model the first existing file among
    ``run_dir/predictions/{model}/predictions_{split}.csv`` (tree models) or
    ``run_dir/predictions/{model}_neural_40/...`` followed by
    ``run_dir/predictions/{model}_xgb_selected/...`` (neural models) is read.

    Args:
        run_dir: Run directory that contains the ``predictions`` folder.
        split: Split name used in the file name (e.g. ``"valid"`` or ``"test"``).
        models: Model keys to load; keys without a known location are skipped.

    Returns:
        Tuple of (predictions_df, actual_values, sample_weights).
        ``predictions_df`` has one column per loaded model. ``actual_values``
        and ``sample_weights`` are taken from the first loaded file that
        provides them, and are ``None`` when no file does. When nothing could
        be loaded the result is ``(empty DataFrame, None, None)``.
    """
    pred_base = run_dir / "predictions"
    csv_name = f"predictions_{split}.csv"

    # Model key -> candidate sub-directories, in priority order.
    model_dirs = {
        "xgb": ["xgb"],
        "lgb": ["lgb"],
        "lstm": ["lstm_neural_40", "lstm_xgb_selected"],
        "gru": ["gru_neural_40", "gru_xgb_selected"],
        "hybrid_seq": ["hybrid_seq_neural_40", "hybrid_seq_xgb_selected"],
        "hybrid_par": ["hybrid_par_neural_40", "hybrid_par_xgb_selected"],
    }
    # Column names differ between the blocks that wrote the files.
    pred_col_candidates = ("predicted", "prediction", "y_pred_model", "y_pred")
    actual_col_candidates = ("actual", "y_true")

    predictions = {}
    actual = None
    weights = None

    for model_name in models:
        if model_name not in model_dirs:
            # Previously skipped without any message, which hid config typos.
            print(f"  [SKIP] {model_name}: unknown model key")
            continue

        candidates = (pred_base / sub_dir / csv_name for sub_dir in model_dirs[model_name])
        file_path = next((path for path in candidates if path.exists()), None)
        if file_path is None:
            print(f"  [SKIP] {model_name}: file not found")
            continue

        df = pd.read_csv(file_path)

        pred_col = next((col for col in pred_col_candidates if col in df.columns), None)
        if pred_col is None:
            # Previously the model vanished silently from the ensemble.
            print(f"  [SKIP] {model_name}: no prediction column in {file_path.name}")
            continue
        actual_col = next((col for col in actual_col_candidates if col in df.columns), None)

        predictions[model_name] = df[pred_col].values
        if actual is None and actual_col is not None:
            actual = df[actual_col].values
        # Sample weights are optional; keep the first set encountered.
        if weights is None and "sample_weight" in df.columns:
            weights = df["sample_weight"].values
        print(f"  [OK] {model_name}: {len(predictions[model_name])} predictions")

    if not predictions:
        return pd.DataFrame(), None, None

    # Align lengths by keeping the first min_len rows of every series.
    # NOTE(review): this assumes all files start on the same date. If a model
    # drops leading rows instead (e.g. a sequence warm-up), head truncation
    # misaligns it with the others — confirm against the writers.
    lengths = {name: len(pred) for name, pred in predictions.items()}
    min_len = min(lengths.values())
    if len(set(lengths.values())) > 1:
        print(f"  [WARN] Prediction lengths differ {lengths}; keeping the first {min_len} rows of each")
    pred_df = pd.DataFrame({name: pred[:min_len] for name, pred in predictions.items()})

    return (pred_df,
            actual[:min_len] if actual is not None else None,
            weights[:min_len] if weights is not None else None)


# Predictions are looked up in the local run directory first and in the Drive
# run directory second; a split that has already been loaded is not reloaded.
valid_pred = valid_actual = valid_weights_loaded = None
test_pred = test_actual = test_weights_loaded = None

# The run directory is the parent of the models directory.
RUN_DIR_LOCAL, RUN_DIR_DRIVE = MODELS_DIR_LOCAL.parent, MODELS_DIR_DRIVE.parent

for run_dir in (RUN_DIR_LOCAL, RUN_DIR_DRIVE):
    if not (run_dir / "predictions").exists():
        continue
    if valid_pred is None or valid_pred.shape[1] == 0:
        valid_pred, valid_actual, valid_weights_loaded = load_predictions_ensemble(
            run_dir, "valid", ENSEMBLE_MODELS
        )
    if test_pred is None or test_pred.shape[1] == 0:
        test_pred, test_actual, test_weights_loaded = load_predictions_ensemble(
            run_dir, "test", ENSEMBLE_MODELS
        )

# Combine only models that have predictions for BOTH splits: every combiner
# below applies the validation columns to the test frame, so a model missing
# from one split would raise a KeyError or misalign the stacking input.
_ensemble_ready = (
    valid_pred is not None
    and test_pred is not None
    and len(valid_pred.columns) > 0
    and len(test_pred.columns) > 0
)
if _ensemble_ready:
    _common_models = [m for m in valid_pred.columns if m in test_pred.columns]
    _dropped_models = [
        m for m in list(valid_pred.columns) + list(test_pred.columns)
        if m not in _common_models
    ]
    if _dropped_models:
        print(f"  [WARN] Dropping models without both valid and test predictions: {_dropped_models}")
    valid_pred = valid_pred[_common_models]
    test_pred = test_pred[_common_models]
    _ensemble_ready = len(_common_models) > 0

if not _ensemble_ready:
    print("[ERROR] No predictions found! Run model training first.")
    ENSEMBLE_METRICS = None
else:
    print(f"\n[INFO] Loaded models: {list(valid_pred.columns)}")
    print(f"[INFO] Valid predictions: {len(valid_pred)} samples")
    print(f"[INFO] Test predictions: {len(test_pred)} samples")
    
    # -------------------------
    # 2. Load metrics for weighting
    # -------------------------
    print("\n[INFO] Loading model metrics...")
    
    model_metrics = {}
    
    metric_files = {
        "xgb": "final_metrics.csv",
        "lgb": "final_metrics_lgb.csv",
    }
    # Label of the tuned model's rows inside the long-format metrics files.
    # final_metrics.csv also carries BASELINE_* rows, so taking "the first TEST
    # row" could weight the model by a baseline's wRMSE.
    final_model_labels = {"xgb": "FINAL_XGB", "lgb": "FINAL_LGB"}
    
    for model_name in valid_pred.columns:
        if model_name in metric_files:
            for models_dir in [MODELS_DIR_LOCAL, MODELS_DIR_DRIVE]:
                file_path = models_dir / metric_files[model_name]
                if file_path.exists():
                    df = pd.read_csv(file_path)
                    if "test_wrmse" in df.columns:
                        model_metrics[model_name] = {"test_wrmse": float(df["test_wrmse"].iloc[0])}
                    elif "wRMSE" in df.columns and "split" in df.columns:
                        test_row = df[df["split"] == "TEST"]
                        if "model" in test_row.columns:
                            own_row = test_row[test_row["model"] == final_model_labels[model_name]]
                            if len(own_row) > 0:
                                test_row = own_row
                            elif len(test_row) > 0:
                                print(f"  [WARN] {model_name}: no {final_model_labels[model_name]} row in "
                                      f"{file_path.name}; using first TEST row")
                        if len(test_row) > 0:
                            model_metrics[model_name] = {"test_wrmse": float(test_row["wRMSE"].iloc[0])}
                    # Only the first existing metrics file is consulted.
                    break
        
        # Check for summary files (neural networks)
        for models_dir in [MODELS_DIR_LOCAL, MODELS_DIR_DRIVE]:
            summary_path = models_dir / f"{model_name}_summary.csv"
            if summary_path.exists() and model_name not in model_metrics:
                df = pd.read_csv(summary_path)
                if "model_test_wrmse" in df.columns:
                    model_metrics[model_name] = {"test_wrmse": float(df["model_test_wrmse"].iloc[0])}
                break
    
    for name, metrics in model_metrics.items():
        print(f"  {name}: wRMSE = {metrics['test_wrmse']:.6f}")
    
    # -------------------------
    # 3. Compute ensemble weights
    # -------------------------
    # NOTE(review): the weights are derived from each model's TEST wRMSE, and the
    # ensemble is then scored on the same test split, so the reported test
    # metrics of the weighted ensemble are optimistically biased. Consider
    # weighting by validation wRMSE instead.
    print(f"\n[INFO] Computing ensemble weights ({WEIGHT_METHOD})...")
    
    if ENSEMBLE_METHOD == "simple_average":
        weights = {m: 1.0 / len(valid_pred.columns) for m in valid_pred.columns}
        print("  Using equal weights (simple_average)")
        
    elif ENSEMBLE_METHOD in ["weighted_average", "stacking"]:
        n_models = len(valid_pred.columns)
        if len(model_metrics) > 0:
            # Non-finite metrics are treated like missing metrics so that a
            # single NaN cannot turn every weight into NaN.
            if WEIGHT_METHOD == "inverse_wrmse":
                raw_weights = {m: 1.0 / (model_metrics[m]["test_wrmse"] + EPS) 
                              for m in valid_pred.columns
                              if m in model_metrics and np.isfinite(model_metrics[m]["test_wrmse"])}
            elif WEIGHT_METHOD == "inverse_wrmse_squared":
                raw_weights = {m: 1.0 / ((model_metrics[m]["test_wrmse"] ** 2) + EPS) 
                              for m in valid_pred.columns
                              if m in model_metrics and np.isfinite(model_metrics[m]["test_wrmse"])}
            else:
                raise ValueError(f"Unknown weight_method: {WEIGHT_METHOD}")
            
            # Models without metrics each get the equal share 1/n; the models
            # with metrics split the remaining mass in proportion to their raw
            # weights, so the final weights always sum to 1. (Appending 1/n to
            # an already normalised set made the sum exceed 1 and scaled the
            # weighted average.)
            missing_models = [m for m in valid_pred.columns if m not in raw_weights]
            weights = {}
            if raw_weights:
                known_share = 1.0 - len(missing_models) / n_models
                total = sum(raw_weights.values())
                weights = {m: known_share * w / total for m, w in raw_weights.items()}
            
            for m in missing_models:
                weights[m] = 1.0 / n_models
                print(f"  [WARN] No metrics for {m}, using equal weight")
        else:
            weights = {m: 1.0 / n_models for m in valid_pred.columns}
            print("  [WARN] No model metrics found, using equal weights")
            
    elif ENSEMBLE_METHOD == "rank_average":
        weights = {m: 1.0 / len(valid_pred.columns) for m in valid_pred.columns}
        print("  Using equal weights (rank_average)")
    
    else:
        raise ValueError(f"Unknown ensemble method: {ENSEMBLE_METHOD}")
    
    print(f"  Weights: {weights}")
    
    # -------------------------
    # 4. Generate ensemble predictions
    # -------------------------
    print(f"\n[INFO] Running ensemble ({ENSEMBLE_METHOD})...")
    
    if ENSEMBLE_METHOD == "simple_average":
        ensemble_valid = valid_pred.mean(axis=1).values
        ensemble_test = test_pred.mean(axis=1).values
        
    elif ENSEMBLE_METHOD == "weighted_average":
        ensemble_valid = np.zeros(len(valid_pred))
        ensemble_test = np.zeros(len(test_pred))
        for model, weight in weights.items():
            if model in valid_pred.columns:
                ensemble_valid += weight * valid_pred[model].values
                ensemble_test += weight * test_pred[model].values
                
    elif ENSEMBLE_METHOD == "rank_average":
        # Percentile rank of every prediction within its own model column,
        # averaged across models row by row.
        valid_ranks = valid_pred.rank(pct=True)
        test_ranks = test_pred.rank(pct=True)
        
        avg_valid_rank = valid_ranks.mean(axis=1)
        avg_test_rank = test_ranks.mean(axis=1)
        
        # Convert back to prediction scale using inverse normal CDF.
        # The location/scale come from the VALIDATION predictions and are
        # reused for the test split. std_pred is the mean cross-model spread
        # per row, which is NaN when only one model is loaded.
        from scipy import stats
        mean_pred = valid_pred.mean(axis=1).mean()
        std_pred = valid_pred.std(axis=1).mean()
        
        # Ranks are clipped so that ppf never returns +/- infinity.
        ensemble_valid = mean_pred + std_pred * stats.norm.ppf(avg_valid_rank.clip(0.001, 0.999))
        ensemble_test = mean_pred + std_pred * stats.norm.ppf(avg_test_rank.clip(0.001, 0.999))
        
    elif ENSEMBLE_METHOD == "stacking":
        from sklearn.linear_model import Ridge
        
        # The meta-model is fitted on validation predictions only.
        meta_params = ENSEMBLE_CFG.get("meta_params", {"alpha": 1.0})
        meta_model = Ridge(**meta_params)
        meta_model.fit(valid_pred.values, valid_actual)
        
        ensemble_valid = meta_model.predict(valid_pred.values)
        ensemble_test = meta_model.predict(test_pred.values)
        
        # Update weights with stacking coefficients (may be negative and do
        # not include the intercept).
        weights = dict(zip(valid_pred.columns, meta_model.coef_))
        print(f"  Stacking coefficients: {weights}")
    
    # -------------------------
    # 5. Compute metrics
    # -------------------------
    print("\n[INFO] Computing ensemble metrics...")
    
    # Sample weights: prefer those stored with the predictions, then the
    # processed weight pickles, and finally uniform weights.
    valid_weights = valid_weights_loaded
    test_weights = test_weights_loaded
    
    if valid_weights is None:
        try:
            valid_weights = load_with_fallback("weights_valid.pkl", 
                Path(LOCAL_PATHS["proc_dir"]), DATA_DIRS_LOCAL["processed"],
                Path(DRIVE_PATHS["proc_dir"]), DATA_DIRS_DRIVE["processed"])[:len(valid_actual)]
        except Exception:
            # Best-effort fallback; `Exception` (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            valid_weights = np.ones(len(valid_actual))
            print("  [WARN] Using uniform weights for validation")
    
    if test_weights is None:
        try:
            test_weights = load_with_fallback("weights_test.pkl",
                Path(LOCAL_PATHS["proc_dir"]), DATA_DIRS_LOCAL["processed"],
                Path(DRIVE_PATHS["proc_dir"]), DATA_DIRS_DRIVE["processed"])[:len(test_actual)]
        except Exception:
            test_weights = np.ones(len(test_actual))
            print("  [WARN] Using uniform weights for test")
    
    ensemble_metrics = {
        "model": f"Ensemble-{ENSEMBLE_METHOD}",
        "method": ENSEMBLE_METHOD,
        "weight_method": WEIGHT_METHOD,
        "models": list(valid_pred.columns),
        "weights": weights,
        "valid_wrmse": w_rmse(valid_actual, ensemble_valid, valid_weights),
        "valid_wmae": w_mae(valid_actual, ensemble_valid, valid_weights),
        "valid_diracc": dir_acc(valid_actual, ensemble_valid),
        "test_wrmse": w_rmse(test_actual, ensemble_test, test_weights),
        "test_wmae": w_mae(test_actual, ensemble_test, test_weights),
        "test_diracc": dir_acc(test_actual, ensemble_test),
    }
    
    print(f"\n[RESULTS] Ensemble ({ENSEMBLE_METHOD}):")
    print(f"  Valid wRMSE: {ensemble_metrics['valid_wrmse']:.6f} | wMAE: {ensemble_metrics['valid_wmae']:.6f} | DirAcc: {ensemble_metrics['valid_diracc']:.4f}")
    print(f"  Test  wRMSE: {ensemble_metrics['test_wrmse']:.6f} | wMAE: {ensemble_metrics['test_wmae']:.6f} | DirAcc: {ensemble_metrics['test_diracc']:.4f}")
    
    # -------------------------
    # 6. Compare with individual models
    # -------------------------
    # Improvement is the relative wRMSE reduction of the ensemble versus each
    # individual model (positive = ensemble is better).
    print("\n[COMPARE] Individual models vs Ensemble:")
    print(f"  {'Model':<15} {'Test wRMSE':<12} {'Improvement':<12}")
    print(f"  {'-'*39}")
    for model_name, metrics in model_metrics.items():
        model_wrmse = metrics["test_wrmse"]
        improvement = (model_wrmse - ensemble_metrics["test_wrmse"]) / model_wrmse * 100
        print(f"  {model_name:<15} {model_wrmse:<12.6f} {improvement:>+.2f}%")
    print(f"  {'ENSEMBLE':<15} {ensemble_metrics['test_wrmse']:<12.6f} {'---':<12}")
    
    # -------------------------
    # 7. Save outputs (LOCAL + DRIVE)
    # -------------------------
    print(f"\n[INFO] Saving ensemble outputs...")
    print(f"  LOCAL:  {OUTPUTS_LOCAL}")
    print(f"  DRIVE:  {OUTPUTS_DRIVE}")
    
    # 7.1 Predictions with sample weights - Valid
    valid_df = pd.DataFrame({
        "actual": valid_actual,
        "predicted": ensemble_valid,
        "sample_weight": valid_weights[:len(valid_actual)],
    })
    valid_df.to_csv(OUTPUTS_LOCAL / "ensemble_predictions_valid.csv", index=False)
    copy_file(OUTPUTS_LOCAL / "ensemble_predictions_valid.csv", OUTPUTS_DRIVE / "ensemble_predictions_valid.csv")
    
    # 7.2 Predictions with sample weights - Test
    test_df = pd.DataFrame({
        "actual": test_actual,
        "predicted": ensemble_test,
        "sample_weight": test_weights[:len(test_actual)],
    })
    test_df.to_csv(OUTPUTS_LOCAL / "ensemble_predictions_test.csv", index=False)
    copy_file(OUTPUTS_LOCAL / "ensemble_predictions_test.csv", OUTPUTS_DRIVE / "ensemble_predictions_test.csv")
    
    # 7.3 Metrics CSV
    metrics_df = pd.DataFrame([{
        "model": ensemble_metrics["model"],
        "method": ensemble_metrics["method"],
        "weight_method": ensemble_metrics["weight_method"],
        "n_models": len(ensemble_metrics["models"]),
        "valid_wrmse": ensemble_metrics["valid_wrmse"],
        "valid_wmae": ensemble_metrics["valid_wmae"],
        "valid_diracc": ensemble_metrics["valid_diracc"],
        "test_wrmse": ensemble_metrics["test_wrmse"],
        "test_wmae": ensemble_metrics["test_wmae"],
        "test_diracc": ensemble_metrics["test_diracc"],
    }])
    metrics_df.to_csv(OUTPUTS_LOCAL / "ensemble_metrics.csv", index=False)
    copy_file(OUTPUTS_LOCAL / "ensemble_metrics.csv", OUTPUTS_DRIVE / "ensemble_metrics.csv")
    
    # 7.4 Full results JSON
    save_json(ensemble_metrics, OUTPUTS_LOCAL / "ensemble_results.json")
    copy_file(OUTPUTS_LOCAL / "ensemble_results.json", OUTPUTS_DRIVE / "ensemble_results.json")
    
    # 7.5 Weights JSON
    save_json(weights, OUTPUTS_LOCAL / "ensemble_weights.json")
    copy_file(OUTPUTS_LOCAL / "ensemble_weights.json", OUTPUTS_DRIVE / "ensemble_weights.json")
    
    # 7.6 Save stacking meta-model if used
    if ENSEMBLE_METHOD == "stacking":
        save_pickle(meta_model, OUTPUTS_LOCAL / "ensemble_meta_model.pkl")
        copy_file(OUTPUTS_LOCAL / "ensemble_meta_model.pkl", OUTPUTS_DRIVE / "ensemble_meta_model.pkl")
        print(f"  - ensemble_meta_model.pkl")
    
    # 7.7 Tomorrow prediction (ensemble of individual model tomorrow predictions)
    # NOTE(review): neural models are read from the *_xgb_selected folders
    # here, while the backtest predictions prefer *_neural_40 when both exist —
    # confirm that both refer to the same trained model.
    PRED_BASE = Path(LOCAL_PATHS["predictions_dir"])
    
    tomorrow_paths = {
        "xgb": PRED_BASE / "xgb" / "tomorrow.csv",
        "lgb": PRED_BASE / "lgb" / "tomorrow.csv",
        "lstm": PRED_BASE / "lstm_xgb_selected" / "tomorrow.csv",
        "gru": PRED_BASE / "gru_xgb_selected" / "tomorrow.csv",
        "hybrid_seq": PRED_BASE / "hybrid_seq_xgb_selected" / "tomorrow.csv",
        "hybrid_par": PRED_BASE / "hybrid_par_xgb_selected" / "tomorrow.csv",
    }
    
    tomorrow_preds = {}
    for model_name, path in tomorrow_paths.items():
        if path.exists():
            df = pd.read_csv(path)
            if "pred_logret" in df.columns and len(df) > 0:
                tomorrow_preds[model_name] = float(df["pred_logret"].iloc[0])
    
    if tomorrow_preds:
        print(f"\n[INFO] Tomorrow predictions from {len(tomorrow_preds)} models")
        
        ensemble_tomorrow = None
        stacked_models = list(valid_pred.columns)
        
        if ENSEMBLE_METHOD == "stacking" and all(m in tomorrow_preds for m in stacked_models):
            # Stacking coefficients can be negative and the model has an
            # intercept, so a positive-weight average is not the stacked
            # forecast. Ask the fitted meta-model directly instead.
            for model_name in stacked_models:
                print(f"  {model_name}: {tomorrow_preds[model_name]:.6f} (weight: {weights.get(model_name, 0.0):.4f})")
            tomorrow_features = np.array([[tomorrow_preds[m] for m in stacked_models]])
            ensemble_tomorrow = float(meta_model.predict(tomorrow_features)[0])
        else:
            if ENSEMBLE_METHOD == "stacking":
                print("  [WARN] Not every stacked model has a tomorrow prediction; "
                      "falling back to a positive-weight average")
            
            # Weighted average over the models that have a positive weight,
            # re-normalised by the weight actually used.
            total_weight = 0.0
            weighted_sum = 0.0
            
            for model_name, pred in tomorrow_preds.items():
                weight = weights.get(model_name, 0.0)
                if weight > 0:
                    weighted_sum += weight * pred
                    total_weight += weight
                    print(f"  {model_name}: {pred:.6f} (weight: {weight:.4f})")
            
            if total_weight > 0:
                ensemble_tomorrow = weighted_sum / total_weight
        
        if ensemble_tomorrow is not None:
            # Log return -> simple return in percent.
            ensemble_tomorrow_pct = float(np.expm1(ensemble_tomorrow) * 100)
            
            print(f"\n[INFO] ENSEMBLE Tomorrow prediction: {ensemble_tomorrow:.6f} ({ensemble_tomorrow_pct:.4f}%)")
            
            # Save tomorrow prediction
            tomorrow_df = pd.DataFrame([{
                "method": ENSEMBLE_METHOD,
                "n_models": len(tomorrow_preds),
                "predicted_for": "next_trading_day",
                "pred_logret": ensemble_tomorrow,
                "pred_return_pct": ensemble_tomorrow_pct,
            }])
            tomorrow_df.to_csv(OUTPUTS_LOCAL / "tomorrow.csv", index=False)
            copy_file(OUTPUTS_LOCAL / "tomorrow.csv", OUTPUTS_DRIVE / "tomorrow.csv")
            
            # Update metrics with tomorrow prediction
            ensemble_metrics["pred_tomorrow_logret"] = ensemble_tomorrow
            ensemble_metrics["pred_tomorrow_pct"] = ensemble_tomorrow_pct
            save_json(ensemble_metrics, OUTPUTS_LOCAL / "ensemble_results.json")
            copy_file(OUTPUTS_LOCAL / "ensemble_results.json", OUTPUTS_DRIVE / "ensemble_results.json")
            
            print(f"[OK] Saved: tomorrow.csv")
    
    print(f"\n[OK] Saved ensemble outputs:")
    print(f"  - ensemble_predictions_valid.csv")
    print(f"  - ensemble_predictions_test.csv")
    print(f"  - ensemble_metrics.csv")
    print(f"  - ensemble_results.json")
    print(f"  - ensemble_weights.json")
    
    # Store for summary
    ENSEMBLE_METRICS = ensemble_metrics


# ============================================================
# COMPARE ENSEMBLE METHODS (Optional)
# ============================================================
def _cmp_read_test_wrmse(model_name: str) -> Optional[float]:
    """Return the stored test wRMSE of *model_name*, or None when none is found.

    Tree models are looked up in their long-format metrics file first (local
    models directory, then Drive). Every model is then looked up in
    ``{model_name}_summary.csv``.
    """
    # Metrics file and the row label of the tuned model inside it. The label is
    # needed because final_metrics.csv also contains BASELINE_* rows.
    metric_files_cmp = {
        "xgb": ("final_metrics.csv", "FINAL_XGB"),
        "lgb": ("final_metrics_lgb.csv", "FINAL_LGB"),
    }

    if model_name in metric_files_cmp:
        file_name, row_label = metric_files_cmp[model_name]
        for md in [MODELS_DIR_LOCAL, MODELS_DIR_DRIVE]:
            fp = md / file_name
            if not fp.exists():
                continue
            df = pd.read_csv(fp)
            if "test_wrmse" in df.columns:
                return float(df["test_wrmse"].iloc[0])
            if "wRMSE" in df.columns and "split" in df.columns:
                tr = df[df["split"] == "TEST"]
                if "model" in tr.columns:
                    own = tr[tr["model"] == row_label]
                    if len(own) > 0:
                        tr = own
                if len(tr) > 0:
                    return float(tr["wRMSE"].iloc[0])
            # Only the first existing metrics file is consulted.
            break

    for md in [MODELS_DIR_LOCAL, MODELS_DIR_DRIVE]:
        sp = md / f"{model_name}_summary.csv"
        if sp.exists():
            df = pd.read_csv(sp)
            if "model_test_wrmse" in df.columns:
                return float(df["model_test_wrmse"].iloc[0])
            break
    return None


def _cmp_inverse_error_weights(model_names: list, test_wrmse: dict, weight_method: str) -> dict:
    """Build weights that sum to 1 from per-model test wRMSE values.

    Models without a finite metric receive the equal share ``1/n``; the others
    split the remaining mass in proportion to ``1/wRMSE`` or ``1/wRMSE**2``.

    Raises:
        ValueError: If *weight_method* is not a known weight method.
    """
    exponents = {"inverse_wrmse": 1, "inverse_wrmse_squared": 2}
    if weight_method not in exponents:
        raise ValueError(f"Unknown weight_method: {weight_method}")
    power = exponents[weight_method]

    n_models = len(model_names)
    raw_w = {
        m: 1.0 / ((test_wrmse[m] ** power) + EPS)
        for m in model_names
        if m in test_wrmse and np.isfinite(test_wrmse[m])
    }
    missing = [m for m in model_names if m not in raw_w]

    w = {}
    if raw_w:
        known_share = 1.0 - len(missing) / n_models
        total_w = sum(raw_w.values())
        w = {m: known_share * ww / total_w for m, ww in raw_w.items()}
    for m in missing:
        w[m] = 1.0 / n_models
    return w


def compare_ensemble_methods(
    run_dir_local: Path,
    run_dir_drive: Path,
    models: list,
    methods: list = None,
    weight_methods: list = None,
) -> pd.DataFrame:
    """Compare different ensemble methods.

    Predictions and per-model metrics are loaded once, then every requested
    (method, weight method) combination is scored on the validation and test
    splits. Methods that ignore the weight method (simple_average,
    rank_average) are evaluated once: under "inverse_wrmse" when it is listed,
    otherwise under the first listed weight method. A failing combination is
    reported with an [ERROR] line and skipped.

    Args:
        run_dir_local: Local run directory
        run_dir_drive: Drive run directory
        models: List of models to include
        methods: List of ensemble methods to compare (default: all)
        weight_methods: List of weight methods to compare (default: all)

    Returns:
        DataFrame with comparison results, sorted by test wRMSE with a 1-based
        ``rank`` column. It is also written to ``ensemble_comparison.csv`` in
        the outputs directories. An empty DataFrame is returned when nothing
        could be evaluated.
    """
    if methods is None:
        methods = ["simple_average", "weighted_average", "rank_average", "stacking"]
    if weight_methods is None:
        weight_methods = ["inverse_wrmse", "inverse_wrmse_squared"]

    unweighted_methods = ("simple_average", "rank_average")
    # Weight method under which the unweighted methods are evaluated once.
    if "inverse_wrmse" in weight_methods:
        anchor_wm = "inverse_wrmse"
    else:
        anchor_wm = weight_methods[0] if weight_methods else None

    print("\n" + "="*60)
    print("ENSEMBLE METHOD COMPARISON")
    print("="*60)

    # ---- Load predictions once (they do not depend on the method) ----
    valid_pred, valid_actual, valid_weights = None, None, None
    test_pred, test_actual, test_weights = None, None, None
    try:
        for run_dir in [run_dir_local, run_dir_drive]:
            if (run_dir / "predictions").exists():
                if valid_pred is None or len(valid_pred.columns) == 0:
                    valid_pred, valid_actual, valid_weights = load_predictions_ensemble(run_dir, "valid", models)
                if test_pred is None or len(test_pred.columns) == 0:
                    test_pred, test_actual, test_weights = load_predictions_ensemble(run_dir, "test", models)
    except Exception as e:
        print(f"  [ERROR] {e}")
        valid_pred, test_pred = None, None

    common_models = []
    if valid_pred is not None and test_pred is not None:
        # Every combiner applies the validation columns to the test frame.
        common_models = [m for m in valid_pred.columns if m in test_pred.columns]

    if len(common_models) == 0:
        print("  [SKIP] No predictions found")
        print("\n[WARN] No results to compare")
        return pd.DataFrame()

    valid_pred = valid_pred[common_models]
    test_pred = test_pred[common_models]

    # ---- Load per-model metrics once ----
    model_metrics_cmp = {}
    try:
        for mn in common_models:
            wrmse = _cmp_read_test_wrmse(mn)
            if wrmse is not None:
                model_metrics_cmp[mn] = wrmse
    except Exception as e:
        print(f"  [WARN] Could not load model metrics ({e}); using equal weights")
        model_metrics_cmp = {}

    # Sample weights fall back to uniform when the prediction files have none.
    sw_valid = valid_weights if valid_weights is not None else np.ones(len(valid_pred))
    sw_test = test_weights if test_weights is not None else np.ones(len(test_pred))

    all_results = []

    for method in methods:
        for wm in weight_methods:
            # Methods that do not use the weight method are run only once.
            if method in unweighted_methods and wm != anchor_wm:
                continue

            print(f"\n[INFO] Testing: {method} + {wm}")

            try:
                if method == "simple_average":
                    ens_valid = valid_pred.mean(axis=1).values
                    ens_test = test_pred.mean(axis=1).values
                elif method == "weighted_average":
                    if len(model_metrics_cmp) > 0:
                        w = _cmp_inverse_error_weights(common_models, model_metrics_cmp, wm)
                    else:
                        w = {m: 1.0 / len(common_models) for m in common_models}
                    ens_valid = np.zeros(len(valid_pred))
                    ens_test = np.zeros(len(test_pred))
                    for m, ww in w.items():
                        ens_valid += ww * valid_pred[m].values
                        ens_test += ww * test_pred[m].values
                elif method == "rank_average":
                    from scipy import stats
                    v_ranks = valid_pred.rank(pct=True).mean(axis=1)
                    t_ranks = test_pred.rank(pct=True).mean(axis=1)
                    # Location/scale are taken from the validation predictions.
                    mean_p = valid_pred.mean(axis=1).mean()
                    std_p = valid_pred.std(axis=1).mean()
                    ens_valid = mean_p + std_p * stats.norm.ppf(v_ranks.clip(0.001, 0.999))
                    ens_test = mean_p + std_p * stats.norm.ppf(t_ranks.clip(0.001, 0.999))
                elif method == "stacking":
                    from sklearn.linear_model import Ridge
                    meta = Ridge(alpha=1.0)
                    meta.fit(valid_pred.values, valid_actual)
                    ens_valid = meta.predict(valid_pred.values)
                    ens_test = meta.predict(test_pred.values)
                else:
                    # Without this branch an unknown method silently reused
                    # the predictions of the previous combination.
                    raise ValueError(f"Unknown ensemble method: {method}")

                # Compute metrics
                result = {
                    "method": method,
                    "weight_method": wm,
                    "n_models": len(valid_pred.columns),
                    "valid_wrmse": w_rmse(valid_actual, ens_valid, sw_valid),
                    "valid_diracc": dir_acc(valid_actual, ens_valid),
                    "test_wrmse": w_rmse(test_actual, ens_test, sw_test),
                    "test_diracc": dir_acc(test_actual, ens_test),
                }
                all_results.append(result)
                print(f"  Test wRMSE: {result['test_wrmse']:.6f} | DirAcc: {result['test_diracc']:.4f}")

            except Exception as e:
                # Best effort: one failing combination must not abort the rest.
                print(f"  [ERROR] {e}")

    if len(all_results) == 0:
        print("\n[WARN] No results to compare")
        return pd.DataFrame()

    # Create comparison DataFrame
    cmp_df = pd.DataFrame(all_results).sort_values("test_wrmse").reset_index(drop=True)
    cmp_df["rank"] = range(1, len(cmp_df) + 1)

    print("\n" + "="*60)
    print("COMPARISON RESULTS (sorted by Test wRMSE)")
    print("="*60)
    print(cmp_df[["rank", "method", "weight_method", "test_wrmse", "test_diracc"]].to_string(index=False))

    # Save comparison
    cmp_df.to_csv(OUTPUTS_LOCAL / "ensemble_comparison.csv", index=False)
    copy_file(OUTPUTS_LOCAL / "ensemble_comparison.csv", OUTPUTS_DRIVE / "ensemble_comparison.csv")
    print(f"\n[OK] Saved ensemble_comparison.csv")

    return cmp_df


# The method comparison is opt-in via the "compare_methods" config flag;
# ENSEMBLE_COMPARISON stays None when it is disabled.
ENSEMBLE_COMPARISON = (
    compare_ensemble_methods(
        run_dir_local=RUN_DIR_LOCAL,
        run_dir_drive=RUN_DIR_DRIVE,
        models=ENSEMBLE_MODELS,
    )
    if ENSEMBLE_CFG.get("compare_methods", False)
    else None
)

print("\n[OK] BLOCK 30 complete.")


## BLOCK 31 — SUMMARY & RESULTS EXPORT


In [None]:

# ============================================================
# BLOCK 31 — SUMMARY & RESULTS EXPORT
# ============================================================
# Scans ALL runs and creates unified comparison tables
# Outputs saved to: results_summary/ (project-level, not run-specific)

# ============================================================
# LOAD ALL MODEL RESULTS (from ALL RUN_IDs)
# ============================================================

# Root folders that hold one sub-directory per RUN_ID.
RUNS_DIR_DRIVE = Path(DRIVE_PROJECT_ROOT) / "runs"
RUNS_DIR_LOCAL = Path(PROJECT_ROOT) / "runs"

print(f"[INFO] Scanning all runs in: {RUNS_DIR_DRIVE}")

# One dict per (run, model) result is collected here.
all_results = []

# Run folders are discovered on Drive only, ordered by name descending.
run_folders = []
if RUNS_DIR_DRIVE.exists():
    run_folders = [entry for entry in RUNS_DIR_DRIVE.iterdir() if entry.is_dir()]
    run_folders.sort(reverse=True)
    print(f"[INFO] Found {len(run_folders)} runs")

if not run_folders:
    print("[WARN] No runs found!")

for run_folder in run_folders:
    run_id = run_folder.name
    models_dir = run_folder / "models"
    ms_dir = run_folder / "model_selection"
    config_dir = run_folder / "config"
    
    if not models_dir.exists():
        continue
    
    print(f"\n[INFO] Loading from RUN_ID: {run_id}")
    
    # --------------------------
    # Load run config for period info
    # --------------------------
    run_config = {}
    config_path = config_dir / "run_params.json"
    if config_path.exists():
        with open(config_path, "r") as f:
            run_config = json.load(f)
    
    # Extract period info from config (date-based)
    data_cfg = run_config.get("data", {})
    
    train_start = data_cfg.get("limit_start_date", "N/A")
    train_end = data_cfg.get("train_end", "N/A")
    train_period = f"{train_start[:10] if train_start != 'N/A' else 'N/A'} - {train_end}"
    
    valid_start = data_cfg.get("valid_start", "N/A")
    valid_end = data_cfg.get("valid_end", "N/A")
    valid_period = f"{valid_start} - {valid_end}"
    
    test_start = data_cfg.get("test_start", "N/A")
    test_end = data_cfg.get("test_end", "latest")
    test_period = f"{test_start} - {test_end if test_end else 'latest'}"
    
    # Get actual data range from full_df.pkl
    data_start = "N/A"
    data_end = "N/A"
    try:
        # Search paths for full_df.pkl
        full_df_paths = [
            Path(DRIVE_PROJECT_ROOT) / "data" / "interim" / "full_df.pkl",
            Path(PROJECT_ROOT) / "data" / "interim" / "full_df.pkl",
        ]
        
        for full_df_path in full_df_paths:
            if full_df_path.exists():
                full_df = load_pickle(full_df_path)
                if hasattr(full_df, 'index') and len(full_df.index) > 0:
                    data_start = str(full_df.index.min().date())
                    data_end = str(full_df.index.max().date())
                break
        
        # Fallback to config if still N/A
        if data_start == "N/A":
            data_start = str(data_cfg.get("limit_start_date", "N/A"))[:10]
        
        if data_end == "N/A":
            test_end = data_cfg.get("test_end")
            if test_end:
                data_end = test_end
            else:
                data_end = "latest"
            
    except Exception as e:
        print(f"    [WARN] Could not read data range: {e}")
    
    # --------------------------
    # 0. Baseline Results (from final_metrics.csv)
    # --------------------------
    xgb_metrics_path = models_dir / "final_metrics.csv"
    
    if xgb_metrics_path.exists():
        metrics_df = pd.read_csv(xgb_metrics_path)
        
        # Look for baseline rows
        for baseline_name in ["BASELINE_ZERO", "BASELINE_NAIVE"]:
            baseline_rows = metrics_df[metrics_df["model"] == baseline_name]
            for _, row in baseline_rows.iterrows():
                if row["split"] == "TEST":
                    all_results.append({
                        "run_id": run_id,
                        "model": baseline_name,
                        "feature_set": "baseline",
                        "train_period": train_period,
                        "valid_period": valid_period,
                        "test_period": test_period,
                        "data_start": data_start,
                        "data_end": data_end,
                        "test_wrmse": float(row["wRMSE"]),
                        "test_wmae": float(row.get("wMAE", 0)) if "wMAE" in row else None,
                        "test_diracc": float(row["DirAcc"]),
                        "params": "{}",
                    })
                    print(f"  ✓ {baseline_name}: wRMSE={row['wRMSE']:.6f}")
    
    # --------------------------
    # 1. XGBoost Results
    # --------------------------
    
    if xgb_metrics_path.exists():
        xgb_metrics = pd.read_csv(xgb_metrics_path)
        xgb_test = xgb_metrics[(xgb_metrics["model"] == "FINAL_XGB") & (xgb_metrics["split"] == "TEST")]
        
        if len(xgb_test) > 0:
            xgb_test = xgb_test.iloc[0]
            
            # Load best params
            xgb_params = {}
            params_path = ms_dir / "best_params_xgb_reg_t1.pkl"
            if params_path.exists():
                xgb_params = load_pickle(params_path)
            
            all_results.append({
                "run_id": run_id,
                "model": "XGBoost",
                "feature_set": "xgb_selected",
                "train_period": train_period,
                "valid_period": valid_period,
                "test_period": test_period,
                "data_start": data_start,
                "data_end": data_end,
                "test_wrmse": float(xgb_test["wRMSE"]),
                "test_wmae": float(xgb_test.get("wMAE", 0)) if "wMAE" in xgb_test else None,
                "test_diracc": float(xgb_test["DirAcc"]),
                "params": str(xgb_params),
            })
            print(f"  ✓ XGBoost: wRMSE={xgb_test['wRMSE']:.6f}")
    
    
    # --------------------------
    # 1B. LightGBM Results
    # --------------------------
    lgb_metrics_path = models_dir / "final_metrics_lgb.csv"
    lgb_json_path = run_folder / "outputs" / "lgb_results.json"
    
    if lgb_metrics_path.exists():
        lgb_metrics = pd.read_csv(lgb_metrics_path)
        lgb_test = lgb_metrics[(lgb_metrics["model"] == "FINAL_LGB") & (lgb_metrics["split"] == "TEST")]
        if len(lgb_test) > 0:
            lgb_row = lgb_test.iloc[0]
            
            # Load best params
            lgb_params = {}
            params_path = ms_dir / "best_params_lgb_reg_t1.pkl"
            if params_path.exists():
                lgb_params = load_pickle(params_path)
            
            all_results.append({
                "run_id": run_id,
                "model": "LightGBM",
                "feature_set": "xgb_selected",
                "train_period": train_period,
                "valid_period": valid_period,
                "test_period": test_period,
                "data_start": data_start,
                "data_end": data_end,
                "test_wrmse": float(lgb_row["wRMSE"]),
                "test_wmae": float(lgb_row["wMAE"]) if "wMAE" in lgb_row.index else None,
                "test_diracc": float(lgb_row["DirAcc"]),
                "params": str(lgb_params),
            })
            print(f"  ✓ LightGBM: wRMSE={lgb_row['wRMSE']:.6f}")
    elif lgb_json_path.exists():
        # Try loading from JSON
        with open(lgb_json_path, "r") as f:
            lgb_data = json.load(f)
        
        all_results.append({
            "run_id": run_id,
            "model": "LightGBM",
            "feature_set": "xgb_selected",
            "train_period": train_period,
            "valid_period": valid_period,
            "test_period": test_period,
            "data_start": data_start,
            "data_end": data_end,
            "test_wrmse": float(lgb_data.get("test_wrmse", 0)),
            "test_wmae": float(lgb_data.get("test_wmae", 0)) if "test_wmae" in lgb_data else None,
            "test_diracc": float(lgb_data.get("test_diracc", 0)),
            "params": str({k: lgb_data.get(k) for k in ["num_leaves", "max_depth", "learning_rate", "subsample", "colsample_bytree"] if k in lgb_data}),
        })
        print(f"  ✓ LightGBM: wRMSE={lgb_data.get('test_wrmse', 0):.6f}")

    # --------------------------
    # 2. LSTM & GRU Results
    # --------------------------
    for model_type in ["lstm", "gru"]:
        summary_path = models_dir / f"{model_type}_summary.csv"
        
        if summary_path.exists():
            df = pd.read_csv(summary_path)
            
            for _, row in df.iterrows():
                all_results.append({
                    "run_id": run_id,
                    "model": model_type.upper(),
                    "feature_set": row["feature_set"],
                    "train_period": train_period,
                    "valid_period": valid_period,
                    "test_period": test_period,
                    "data_start": data_start,
                    "data_end": data_end,
                    "test_wrmse": float(row["model_test_wrmse"]),
                    "test_wmae": float(row.get("model_test_wmae", 0)) if "model_test_wmae" in row else None,
                    "test_diracc": float(row["model_test_diracc"]),
                    "params": str(run_config.get(model_type, {})),
                })
            print(f"  ✓ {model_type.upper()}: {len(df)} configurations")
    
    # --------------------------
    # 3. Hybrid Results
    # --------------------------
    for hybrid_type in ["hybrid_seq", "hybrid_par"]:
        summary_path = models_dir / f"{hybrid_type}_summary.csv"
        
        if summary_path.exists():
            df = pd.read_csv(summary_path)
            
            for _, row in df.iterrows():
                model_name = "Hybrid-Seq" if hybrid_type == "hybrid_seq" else "Hybrid-Par"
                all_results.append({
                    "run_id": run_id,
                    "model": model_name,
                    "feature_set": row["feature_set"],
                    "train_period": train_period,
                    "valid_period": valid_period,
                    "test_period": test_period,
                    "data_start": data_start,
                    "data_end": data_end,
                    "test_wrmse": float(row["model_test_wrmse"]),
                    "test_wmae": float(row.get("model_test_wmae", 0)) if "model_test_wmae" in row else None,
                    "test_diracc": float(row["model_test_diracc"]),
                    "params": str(run_config.get(hybrid_type, {})),
                })
            print(f"  ✓ {hybrid_type}: {len(df)} configurations")
    
    # --------------------------
    # 4. Ensemble Results
    # --------------------------
    ensemble_path = run_folder / "outputs" / "ensemble_results.json"
    
    if ensemble_path.exists():
        with open(ensemble_path, "r") as f:
            ens_data = json.load(f)
        
        ens_method = ens_data.get("method", "weighted_average")
        all_results.append({
            "run_id": run_id,
            "model": f"Ensemble-{ens_method}",
            "feature_set": "all_models",
            "train_period": train_period,
            "valid_period": valid_period,
            "test_period": test_period,
            "data_start": data_start,
            "data_end": data_end,
            "test_wrmse": float(ens_data.get("test_wrmse", 0)),
            "test_wmae": float(ens_data.get("test_wmae", 0)) if "test_wmae" in ens_data else None,
            "test_diracc": float(ens_data.get("test_diracc", 0)),
            "params": str(ens_data.get("weights", {})),
        })
        print(f"  ✓ Ensemble ({ens_method}): wRMSE={ens_data.get('test_wrmse', 0):.6f}")

print(f"\n[OK] Loaded {len(all_results)} total results from {len(run_folders)} runs")

# ============================================================
# CREATE COMPARISON TABLES
# ============================================================
# Builds two frames that the save step and the bootstrap cell read later:
#   results_df      every collected result, ranked by test wRMSE (ascending)
#   best_per_model  the lowest-wRMSE row for each model name
# Both are always bound (possibly empty) so later code never hits a NameError
# when nothing could be loaded.

results_df = pd.DataFrame(all_results)
best_per_model = pd.DataFrame()

if results_df.empty:
    print("[ERROR] No results found! Run training sections first.")
else:
    # Filter out invalid results (wRMSE=0 is impossible except for perfect predictions)
    # Keep BASELINE_ZERO which has legitimate 0 values for some metrics
    invalid_mask = (
        (results_df["test_wrmse"] == 0) &
        (results_df["test_diracc"] == 0) &
        (~results_df["model"].str.contains("BASELINE", case=False, na=False))
    )
    if invalid_mask.any():
        n_invalid = invalid_mask.sum()
        print(f"[WARN] Filtering {n_invalid} invalid results (wRMSE=0 and DirAcc=0)")
        results_df = results_df[~invalid_mask]

    # Sort by test_wrmse (lower is better); NaN scores are placed last
    results_df = results_df.sort_values("test_wrmse").reset_index(drop=True)

    # Add rank
    results_df.insert(0, "rank", range(1, len(results_df) + 1))

    # Count unique runs
    n_runs = results_df["run_id"].nunique()
    n_models = len(results_df)

    # Best (lowest wRMSE) row per model across ALL runs. results_df is already
    # sorted ascending, so the first occurrence of each model is its best row.
    # Unlike groupby().idxmin(), this also copes with a model whose scores are
    # all NaN.
    best_per_model = (
        results_df.drop(columns=["rank"])
        .drop_duplicates(subset="model", keep="first")
        .reset_index(drop=True)
    )
    best_per_model.insert(0, "rank", range(1, len(best_per_model) + 1))

    if results_df.empty:
        # Everything was discarded by the invalid-result filter above.
        print("[ERROR] Every loaded result was filtered out as invalid; nothing to report.")
    else:
        # =========================================
        # REPORT 1: All Results from All RUN_IDs
        # =========================================
        print("\n" + "="*100)
        print(f"REPORT 1: ALL RESULTS ({n_models} results from {n_runs} runs)")
        print("="*100)

        display_cols = ["rank", "run_id", "model", "feature_set",
                        "train_period", "valid_period", "test_period", "data_start", "data_end",
                        "test_wrmse", "test_diracc"]
        display(results_df[display_cols])

        # =========================================
        # REPORT 2: Best per Model Type (across ALL runs)
        # =========================================
        print("\n" + "="*100)
        print("REPORT 2: BEST PER MODEL TYPE (across all runs)")
        print("="*100)

        display_cols_best = ["rank", "model", "feature_set", "run_id",
                             "train_period", "valid_period", "test_period", "data_start", "data_end",
                             "test_wrmse", "test_diracc"]
        display(best_per_model[display_cols_best])

        # =========================================
        # Overall Best Model
        # =========================================
        best = results_df.iloc[0]
        print("\n" + "="*100)
        print("OVERALL BEST MODEL (across all runs)")
        print("="*100)
        print(f"  Run ID:       {best['run_id']}")
        print(f"  Model:        {best['model']}")
        print(f"  Feature Set:  {best['feature_set']}")
        print(f"  Train Period: {best['train_period']}")
        print(f"  Valid Period: {best['valid_period']}")
        print(f"  Test Period:  {best['test_period']}")
        print(f"  Data Range:   {best['data_start']} to {best['data_end']}")
        print(f"  Test wRMSE:   {best['test_wrmse']:.6f}")
        print(f"  Test DirAcc:  {best['test_diracc']:.4f}")

        # =========================================
        # REPORT 3: Bootstrap Confidence Intervals
        # =========================================
        # Only the closing rule is printed here; the bootstrap table itself is
        # produced by the separate bootstrap cell (BLOCK 32).
        print("\n" + "="*100)

# ============================================================
# SAVE RESULTS (LOCAL + DRIVE)
# ============================================================

from datetime import datetime

# Summary artifacts are shared by every run, so they live at project level
# (not inside a run folder): once locally and once on the Drive mirror.
_SUMMARY_DIRNAME = "results_summary"
RESULTS_LOCAL = ensure_dir(Path(PROJECT_ROOT).joinpath(_SUMMARY_DIRNAME))
RESULTS_DRIVE = ensure_dir(Path(DRIVE_PROJECT_ROOT).joinpath(_SUMMARY_DIRNAME))

print("[INFO] Results output dirs:")
for _label, _target in (("LOCAL", RESULTS_LOCAL), ("DRIVE", RESULTS_DRIVE)):
    print(f"  - {_label}: {_target}")

if len(all_results) > 0:
    n_runs = results_df["run_id"].nunique()
    
    # =========================================
    # Save Report 1: All Results (all runs)
    # =========================================
    # Every report is written to both output folders (local first, then
    # Drive) and confirmed with one "[OK]" line.
    _report_dirs = (RESULTS_LOCAL, RESULTS_DRIVE)
    _period_cols = ["train_period", "valid_period", "test_period",
                    "data_start", "data_end"]
    _metric_cols = ["test_wrmse", "test_wmae", "test_diracc"]

    all_results_cols = ["rank", "run_id", "model", "feature_set"] + _period_cols + _metric_cols
    all_results_df = results_df.loc[:, all_results_cols].copy()
    for _dest in _report_dirs:
        all_results_df.to_csv(_dest / "all_results.csv", index=False)
    print("[OK] Saved: all_results.csv")

    # =========================================
    # Save Report 2: Best per Model (all runs)
    # =========================================
    best_cols = ["rank", "model", "feature_set", "run_id"] + _period_cols + _metric_cols
    best_per_model_df = best_per_model.loc[:, best_cols].copy()
    for _dest in _report_dirs:
        best_per_model_df.to_csv(_dest / "best_per_model.csv", index=False)
    print("[OK] Saved: best_per_model.csv")

    # =========================================
    # Save Full Results with Params
    # =========================================
    # Unlike Report 1, this dump keeps every column of results_df.
    for _dest in _report_dirs:
        results_df.to_csv(_dest / "all_results_with_params.csv", index=False)
    print("[OK] Saved: all_results_with_params.csv")
    
    # =========================================
    # Generate RESULTS.md
    # =========================================
    # Markdown digest of the run comparison: the ten best rows, the best row
    # per model, and the overall winner (row 0 of the wRMSE-sorted results_df).
    if results_df.empty:
        # Can happen when every loaded row was filtered out as invalid.
        print("[WARN] No valid results to summarize; skipped RESULTS.md")
    else:
        best = results_df.iloc[0]

        def _md_wrmse(value):
            """Format a wRMSE table cell; missing values become '-'."""
            return f"{value:.6f}" if pd.notna(value) else "-"

        def _md_diracc(value):
            """Format a DirAcc table cell as a percentage; missing values become '-'."""
            return f"{value:.2%}" if pd.notna(value) else "-"

        md_lines = [
            "# Results Summary",
            "",
            f"**Last Updated:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
            "",
            f"**Total Runs:** {n_runs} | **Total Configurations:** {len(results_df)}",
            "",
            "---",
            "",
            "## 🏆 Best Results (Top 10)",
            "",
            "| # | Model | Feature Set | wRMSE | DirAcc | Run |",
            "|---|-------|-------------|-------|--------|-----|",
        ]

        for _, row in results_df.head(10).iterrows():
            wrmse = _md_wrmse(row["test_wrmse"])
            diracc = _md_diracc(row["test_diracc"])
            # Show only the last 6 characters of the run id. str() keeps this
            # working when the id is not a string (e.g. a purely numeric id).
            run_short = str(row["run_id"])[-6:]
            md_lines.append(f"| {row['rank']} | {row['model']} | {row['feature_set']} | {wrmse} | {diracc} | {run_short} |")

        md_lines.extend([
            "",
            "---",
            "",
            "## 📊 Best per Model Type",
            "",
            "| # | Model | Feature Set | wRMSE | DirAcc |",
            "|---|-------|-------------|-------|--------|",
        ])

        for _, row in best_per_model.iterrows():
            wrmse = _md_wrmse(row["test_wrmse"])
            diracc = _md_diracc(row["test_diracc"])
            md_lines.append(f"| {row['rank']} | {row['model']} | {row['feature_set']} | {wrmse} | {diracc} |")

        md_lines.extend([
            "",
            "---",
            "",
            "## 🥇 Overall Best",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Model | **{best['model']}** |",
            f"| Feature Set | {best['feature_set']} |",
            f"| wRMSE | {best['test_wrmse']:.6f} |",
            f"| DirAcc | {best['test_diracc']:.2%} |",
            f"| Run ID | {best['run_id']} |",
            f"| Data Range | {best['data_start']} → {best['data_end']} |",
            "",
            "---",
            "",
            "## 📖 Metrics",
            "",
            "| Metric | Description |",
            "|--------|-------------|",
            "| wRMSE | Weighted Root Mean Squared Error (↓ lower is better) |",
            "| DirAcc | Directional Accuracy (↑ higher is better) |",
            "",
            "*Full details with period configurations available in CSV files.*",
        ])

        md_content = "\n".join(md_lines)

        # The report contains emoji and arrows, so the encoding is set
        # explicitly; the platform default is not guaranteed to be UTF-8.
        (RESULTS_LOCAL / "RESULTS.md").write_text(md_content, encoding="utf-8")
        (RESULTS_DRIVE / "RESULTS.md").write_text(md_content, encoding="utf-8")
        print("[OK] Saved: RESULTS.md")

    # =========================================
    # Save Bootstrap CI (if computed)
    # =========================================
    # bootstrap_df only exists if the bootstrap cell has already been run in
    # this session, hence the name lookup before use.
    if 'bootstrap_df' in dir() and len(bootstrap_df) > 0:
        bootstrap_df.to_csv(RESULTS_LOCAL / "bootstrap_ci.csv", index=False)
        bootstrap_df.to_csv(RESULTS_DRIVE / "bootstrap_ci.csv", index=False)
        print("[OK] Saved: bootstrap_ci.csv")
    
    # =========================================
    # TOMORROW PREDICTIONS SUMMARY
    # =========================================
    # Gather the first row of each model's tomorrow.csv from the latest run
    # and rank the models by predicted return.

    print("\n[INFO] Collecting tomorrow predictions...")

    tomorrow_results = []

    # Use the most recent run for tomorrow predictions
    if len(run_folders) > 0:
        # run_folders is expected to be sorted newest-first by the code that
        # builds it (not part of this section).
        latest_run = run_folders[0]
        run_id = latest_run.name
        pred_dir = latest_run / "predictions"
        outputs_dir = latest_run / "outputs"

        # (display name, feature set) -> tomorrow.csv location.
        # Insertion order fixes the order in which models are reported.
        tomorrow_paths = {
            ("XGBoost", "xgb_selected"): pred_dir / "xgb" / "tomorrow.csv",
            ("LightGBM", "xgb_selected"): pred_dir / "lgb" / "tomorrow.csv",
        }
        for display_name, dir_prefix in [("LSTM", "lstm"), ("GRU", "gru"),
                                         ("Hybrid-Seq", "hybrid_seq"), ("Hybrid-Par", "hybrid_par")]:
            for fs_name in ["xgb_selected", "neural_40", "neural_80"]:
                tomorrow_paths[(display_name, fs_name)] = (
                    pred_dir / f"{dir_prefix}_{fs_name}" / "tomorrow.csv"
                )
        tomorrow_paths[("Ensemble", "all_models")] = outputs_dir / "tomorrow.csv"

        for (model, feature_set), path in tomorrow_paths.items():
            if not path.exists():
                continue
            try:
                df = pd.read_csv(path)
                if len(df) == 0:
                    continue
                row = df.iloc[0]
                pred_logret = float(row.get("pred_logret", 0))
                # Without an explicit percentage column, fall back to
                # log-return * 100.
                pred_return_pct = float(row.get("pred_return_pct", pred_logret * 100))
                tomorrow_results.append({
                    "run_id": run_id,
                    "model": model,
                    "feature_set": row.get("feature_set", feature_set),
                    "last_data_date": row.get("last_data_date", ""),
                    "pred_logret": pred_logret,
                    "pred_return_pct": pred_return_pct,
                })
                # Print the value that was stored, so log and CSV agree.
                print(f"  ✓ {model} ({feature_set}): {pred_return_pct:.4f}%")
            except Exception as e:
                # Best effort: one unreadable file must not abort the summary,
                # but the failure is reported rather than silently dropped.
                print(f"  [WARN] Skipping {model} ({feature_set}): could not read {path}: {e}")

    if tomorrow_results:
        tomorrow_df = pd.DataFrame(tomorrow_results)
        # Sort by predicted return, highest first
        tomorrow_df = tomorrow_df.sort_values("pred_return_pct", ascending=False).reset_index(drop=True)
        tomorrow_df.insert(0, "rank", range(1, len(tomorrow_df) + 1))

        tomorrow_df.to_csv(RESULTS_LOCAL / "tomorrow_summary.csv", index=False)
        tomorrow_df.to_csv(RESULTS_DRIVE / "tomorrow_summary.csv", index=False)
        print(f"[OK] Saved: tomorrow_summary.csv ({len(tomorrow_df)} predictions)")

        # Show summary
        print("\n" + "="*70)
        print("TOMORROW PREDICTIONS (next trading day)")
        print("="*70)
        print(tomorrow_df[["rank", "model", "feature_set", "pred_return_pct"]].to_string(index=False))
    else:
        print("[WARN] No tomorrow predictions found")
    
    print("\n" + "="*70)
    print("FILES SAVED")
    print("="*70)
    print(f"\nLOCAL: {RESULTS_LOCAL}/")
    print(f"DRIVE: {RESULTS_DRIVE}/")
    print("\nFiles:")
    # Some reports are written only conditionally (bootstrap CIs, tomorrow
    # predictions, RESULTS.md), so a file is listed only when it is actually
    # present in the local output folder.
    saved_file_notes = [
        ("all_results.csv", "  - all_results.csv          (Report 1: all runs with periods)"),
        ("best_per_model.csv", "  - best_per_model.csv       (Report 2: best per model with periods)"),
        ("all_results_with_params.csv", "  - all_results_with_params.csv"),
        ("bootstrap_ci.csv", "  - bootstrap_ci.csv         (Report 3: bootstrap confidence intervals)"),
        ("tomorrow_summary.csv", "  - tomorrow_summary.csv     (Report 4: tomorrow predictions)"),
        ("RESULTS.md", "  - RESULTS.md"),
    ]
    for file_name, listing in saved_file_notes:
        if (RESULTS_LOCAL / file_name).exists():
            print(listing)

# ============================================================
# DOWNLOAD FILES FOR GIT (Colab only)
# ============================================================

# Three outcomes: nothing collected, results saved outside Colab, or results
# saved inside Colab (where the summary files are also pushed to the browser).
if not len(all_results) > 0:
    print("[INFO] No results to download")
elif not IN_COLAB:
    print(f"[INFO] Files saved to: {RESULTS_DRIVE}")
else:
    from google.colab import files

    print("[INFO] Downloading files for Git...")
    print("(Copy these to your repo: results/ folder)\n")

    # Download from DRIVE (project level - persistent location);
    # only the CSV and Markdown reports are offered.
    wanted_suffixes = (".csv", ".md")
    for artifact in RESULTS_DRIVE.glob("*"):
        if not artifact.is_file():
            continue
        if artifact.suffix not in wanted_suffixes:
            continue
        files.download(str(artifact))
        print(f"  Downloaded: {artifact.name}")

    print("\n".join([
        "\n[OK] Files downloaded!",
        "\nNext steps:",
        "  1. Copy downloaded files to your repo: results/",
        "  2. git add results/",
        '  3. git commit -m "Update results"',
        "  4. git push",
    ]))

print("\n[OK] BLOCK 31 complete.")


## BLOCK 32 — BOOTSTRAP CONFIDENCE INTERVALS

In [None]:
# ============================================================
# BOOTSTRAP CONFIDENCE INTERVALS
# ============================================================

# Run bootstrap analysis ONLY on models in best_per_model
# This provides statistical confidence for the metrics

# --- Bootstrap helper functions (standalone) ---

def bootstrap_metric(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    weights = None,
    metric_fn = None,
    n_bootstrap: int = 1000,
    confidence_level: float = 0.95,
    random_state: int = 42
):
    """Compute a bootstrap confidence interval for a single metric.

    Rows are resampled with replacement ``n_bootstrap`` times; the interval is
    taken from the percentiles of the resampled metric values (percentile
    method).

    Args:
        y_true: Observed targets.
        y_pred: Predictions aligned with ``y_true``.
        weights: Optional per-sample weights; defaults to all ones.
        metric_fn: Callable ``metric_fn(y_true, y_pred, weights)`` returning a
            scalar. Required despite the ``None`` default.
        n_bootstrap: Number of bootstrap resamples (at least 1).
        confidence_level: Interval coverage, between 0 and 1.
        random_state: Seed for the resampling generator.

    Returns:
        Dict with ``point_estimate`` (metric on the full sample),
        ``ci_lower``, ``ci_upper``, ``std`` (standard deviation of the
        resampled values), ``n_bootstrap`` and ``confidence_level``.

    Raises:
        TypeError: If ``metric_fn`` is not callable.
        ValueError: If the sample is empty, the inputs differ in length,
            ``n_bootstrap`` is below 1, or ``confidence_level`` is outside
            [0, 1].
    """
    if not callable(metric_fn):
        raise TypeError(
            "bootstrap_metric() requires a callable metric_fn(y_true, y_pred, weights)"
        )
    if n_bootstrap < 1:
        raise ValueError(f"n_bootstrap must be >= 1, got {n_bootstrap}")
    if not 0.0 <= confidence_level <= 1.0:
        raise ValueError(
            f"confidence_level must be between 0 and 1, got {confidence_level}"
        )

    y_true = _to_np(y_true)
    y_pred = _to_np(y_pred)
    n = len(y_true)
    if n == 0:
        raise ValueError("Cannot bootstrap an empty sample")

    if weights is None:
        weights = np.ones(n)
    weights = _to_np(weights)

    # Resampling indexes all three arrays with the same indices, so a length
    # mismatch would either fail mid-loop or silently ignore trailing rows.
    if len(y_pred) != n or len(weights) != n:
        raise ValueError(
            f"Length mismatch: y_true={n}, y_pred={len(y_pred)}, weights={len(weights)}"
        )

    rng = np.random.RandomState(random_state)

    # Point estimate on the full, un-resampled data
    point_estimate = metric_fn(y_true, y_pred, weights)

    # Bootstrap samples: n row indices drawn with replacement per resample
    bootstrap_values = []
    for _ in range(n_bootstrap):
        idx = rng.randint(0, n, size=n)
        bootstrap_values.append(metric_fn(y_true[idx], y_pred[idx], weights[idx]))
    bootstrap_values = np.array(bootstrap_values)

    # Confidence interval (percentile method)
    alpha = 1 - confidence_level
    ci_lower = np.percentile(bootstrap_values, 100 * alpha / 2)
    ci_upper = np.percentile(bootstrap_values, 100 * (1 - alpha / 2))

    return {
        "point_estimate": float(point_estimate),
        "ci_lower": float(ci_lower),
        "ci_upper": float(ci_upper),
        "std": float(np.std(bootstrap_values)),
        "n_bootstrap": n_bootstrap,
        "confidence_level": confidence_level,
    }


def bootstrap_all_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    weights = None,
    n_bootstrap: int = 1000,
    confidence_level: float = 0.95,
    random_state: int = 42
):
    """Bootstrap wRMSE, wMAE and DirAcc in one call.

    Each metric is passed to ``bootstrap_metric`` with the same data, resample
    count, confidence level and seed.

    Returns:
        Dict keyed by ``"wrmse"``, ``"wmae"`` and ``"diracc"``; each value is
        the result dict returned by ``bootstrap_metric`` for that metric.
    """
    # Adapters giving every metric the (y_true, y_pred, weights) call shape.
    metric_table = (
        ("wrmse", lambda yt, yp, w: w_rmse(yt, yp, w)),
        ("wmae", lambda yt, yp, w: w_mae(yt, yp, w)),
        # DirAcc doesn't use weights, so the third argument is dropped.
        ("diracc", lambda yt, yp, _w: dir_acc(yt, yp)),
    )

    return {
        label: bootstrap_metric(
            y_true, y_pred, weights, adapter,
            n_bootstrap=n_bootstrap,
            confidence_level=confidence_level,
            random_state=random_state
        )
        for label, adapter in metric_table
    }


def format_ci(ci_result, decimals: int = 6) -> str:
    """Render a bootstrap result as ``"point [lower, upper]"``.

    ``ci_result`` must provide the keys ``point_estimate``, ``ci_lower`` and
    ``ci_upper``; each value is shown in fixed-point notation with
    ``decimals`` digits after the decimal point.
    """
    spec = f".{decimals}f"
    point = format(ci_result["point_estimate"], spec)
    lower = format(ci_result["ci_lower"], spec)
    upper = format(ci_result["ci_upper"], spec)
    return f"{point} [{lower}, {upper}]"


# --- Run Bootstrap Analysis ---

print("[INFO] Running Bootstrap Confidence Interval Analysis...")

# best_per_model is built by the results-summary cell (BLOCK 31). If that cell
# found nothing, or was not run, fall back to an empty frame so this cell ends
# with a warning instead of a NameError.
if "best_per_model" not in globals():
    best_per_model = pd.DataFrame()

print(f"[INFO] Models to analyze: {len(best_per_model)} (from best_per_model)")

N_BOOTSTRAP = 1000
CONFIDENCE_LEVEL = 0.95

# Directory prefix under predictions/ for models stored per feature set
# (predictions/{prefix}_{feature_set}/predictions_test.csv).
_SEQ_MODEL_DIRS = {
    "LSTM": "lstm",
    "GRU": "gru",
    "Hybrid-Seq": "hybrid_seq",
    "Hybrid-Par": "hybrid_par",
}

bootstrap_results = []

# Get the best run for each model from best_per_model
for _, row in best_per_model.iterrows():
    model_name = row["model"]
    run_id = row["run_id"]
    feature_set = row.get("feature_set", "unknown")

    print(f"\n[INFO] Analyzing: {model_name} (run: {run_id})")

    # Find predictions file for this model/run
    run_dir = RUNS_DIR_DRIVE / run_id
    models_dir = run_dir / "models"
    pred_dir = run_dir / "predictions"

    y_true = None
    y_pred = None
    weights = None

    # Any failure for one model is reported and the loop moves on.
    try:
        pred_path = None

        if model_name in ("XGBoost", "BASELINE_ZERO", "BASELINE_NAIVE"):
            # Baselines have no prediction file of their own; only y_true is
            # needed, and it is read from the XGBoost test predictions.
            pred_path = pred_dir / "xgb" / "predictions_test.csv"

        elif model_name == "LightGBM":
            pred_path = pred_dir / "lgb" / "predictions_test.csv"

        elif model_name in _SEQ_MODEL_DIRS:
            prefix = _SEQ_MODEL_DIRS[model_name]
            # Try the feature set of the winning configuration first, so the
            # interval describes the row reported in best_per_model. The
            # previous fixed search order is kept as a fallback.
            candidates = list(dict.fromkeys([feature_set, "neural_40", "xgb_selected"]))
            for fs in candidates:
                p = pred_dir / f"{prefix}_{fs}" / "predictions_test.csv"
                if p.exists():
                    pred_path = p
                    if fs != feature_set:
                        print(
                            f"  [WARN] No predictions for feature_set '{feature_set}'; "
                            f"using '{fs}' instead - CI may not match the reported metrics"
                        )
                    break

        elif "Ensemble" in model_name:
            pred_path = run_dir / "outputs" / "ensemble_predictions_test.csv"

        # Load and parse predictions
        if pred_path and pred_path.exists():
            df = pd.read_csv(pred_path)

            # Get y_true (different column names)
            if "y_true" in df.columns:
                y_true = df["y_true"].values
            elif "actual" in df.columns:
                y_true = df["actual"].values
            else:
                raise ValueError(f"No y_true/actual column in {pred_path}")

            # Get predictions
            if model_name == "BASELINE_ZERO":
                y_pred = np.zeros(len(y_true))
            elif model_name == "BASELINE_NAIVE":
                # Previous observation as the forecast; the first row has no
                # predecessor and is set to 0.
                y_pred = np.roll(y_true, 1)
                y_pred[0] = 0
            elif "y_pred_model" in df.columns:
                y_pred = df["y_pred_model"].values
            elif "predicted" in df.columns:
                y_pred = df["predicted"].values
            else:
                raise ValueError(f"No prediction column in {pred_path}")

            # Get weights (uniform when the file carries none)
            if "sample_weight" in df.columns:
                weights = df["sample_weight"].values
            else:
                weights = np.ones(len(y_true))

        if y_true is not None and y_pred is not None:
            # Run bootstrap
            ci_results = bootstrap_all_metrics(
                y_true, y_pred, weights,
                n_bootstrap=N_BOOTSTRAP,
                confidence_level=CONFIDENCE_LEVEL,
                random_state=42
            )

            record = {
                "model": model_name,
                "run_id": run_id,
                "feature_set": feature_set,
                "n_samples": len(y_true),
                "n_bootstrap": N_BOOTSTRAP,
                "confidence_level": CONFIDENCE_LEVEL,
            }
            # Four columns per metric: point estimate, CI bounds, bootstrap std
            for metric in ("wrmse", "wmae", "diracc"):
                ci = ci_results[metric]
                record[metric] = ci["point_estimate"]
                record[f"{metric}_ci_lower"] = ci["ci_lower"]
                record[f"{metric}_ci_upper"] = ci["ci_upper"]
                record[f"{metric}_std"] = ci["std"]
            bootstrap_results.append(record)

            print(f"  wRMSE: {format_ci(ci_results['wrmse'])}")
            print(f"  DirAcc: {format_ci(ci_results['diracc'], decimals=4)}")
        else:
            print(f"  [WARN] Could not load predictions for {model_name}")

    except Exception as e:
        print(f"  [ERROR] {e}")

# Turn the per-model records into a ranked table and persist it
if not bootstrap_results:
    print("[WARN] No bootstrap results computed")
else:
    # Rank by point-estimate wRMSE, lowest first
    bootstrap_df = (
        pd.DataFrame(bootstrap_results)
        .sort_values("wrmse")
        .reset_index(drop=True)
    )
    bootstrap_df.insert(0, "rank", range(1, len(bootstrap_df) + 1))

    banner = "=" * 100
    print("\n" + banner)
    print(f"BOOTSTRAP CONFIDENCE INTERVALS ({CONFIDENCE_LEVEL*100:.0f}% CI, n={N_BOOTSTRAP})")
    print(banner)

    # Display summary (wRMSE and DirAcc with their interval bounds)
    display_cols = ["rank", "model"]
    for metric in ("wrmse", "diracc"):
        display_cols += [metric, f"{metric}_ci_lower", f"{metric}_ci_upper"]
    display(bootstrap_df[display_cols])

    # Save to results_summary (local first, then Drive)
    for out_dir in (RESULTS_LOCAL, RESULTS_DRIVE):
        bootstrap_df.to_csv(out_dir / "bootstrap_ci.csv", index=False)
    print("\n[OK] Saved: bootstrap_ci.csv")

    # Also save the raw records (original order, no rank) as JSON
    for out_dir in (RESULTS_LOCAL, RESULTS_DRIVE):
        save_json(bootstrap_results, out_dir / "bootstrap_ci.json")
    print("[OK] Saved: bootstrap_ci.json")
