In [6]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.2


In [7]:
import json
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    average_precision_score
)

import matplotlib
matplotlib.use('Agg')  # Non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns

# XGBoost imports
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("WARNING: XGBoost not installed. Install with: pip install xgboost")

In [8]:
# =========================================
# CONFIGURATION
# =========================================
# Folder holding one Bybit CSV per symbol; evaluation plots are also written beneath it.
# NOTE(review): absolute, machine-specific path - must be edited before running elsewhere.
DATA_DIR = Path("/Users/nitinlodha/Desktop/ML/ML_Project/Bybit_CSV_Data")
# Symbol -> CSV file for that symbol.
FILES = {
    "BTC": DATA_DIR / "Bybit_BTC.csv",
    "ETH": DATA_DIR / "Bybit_ETH.csv",
    "SOL": DATA_DIR / "Bybit_SOL.csv",
    "XRP": DATA_DIR / "Bybit_XRP.csv",
    "DOGE": DATA_DIR / "Bybit_DOGE.csv",
}

HORIZONS = [1, 3, 6]  # Forecast horizons in 4-hour bars
DEFAULT_COST_BP = {1: 8.0, 3: 10.0, 6: 12.0}  # Trading costs in basis points, keyed by horizon

# Policy thresholds
TAU_P = 0.60        # Probability gate for P(edge > cost)
TAU_MU = 0.0005     # Expected-return gate (log-return)
LAM = 2.0           # Kelly-lite multiplier
W_MAX = 0.50        # Max gross position (50% notional)

# Version tags used as defaults when exporting JSON records.
MODEL_VERSION = "xgboost_multiH_v1.0"
CALIBRATION_VERSION = "iso+conformal_v1"

In [9]:
# =========================================
# UTILITY FUNCTIONS
# =========================================
def bp_to_logret(bp: float) -> float:
    """Express a basis-point amount in log-return units (one basis point = 1e-4)."""
    one_basis_point = 1e-4
    return bp * one_basis_point


def _find_close_column(df: pd.DataFrame) -> str:
    """Find the close price column in a dataframe."""
    lower = {c.lower(): c for c in df.columns}
    for key in ("close", "closing_price", "close_price", "price_close", "last", "c"):
        if key in lower:
            return lower[key]
    # Fallback: any single float column
    float_cols = [c for c in df.columns if pd.api.types.is_float_dtype(df[c])]
    if len(float_cols) == 1:
        return float_cols[0]
    raise ValueError("Cannot identify 'close' column.")


def cumulative_log_returns(price: pd.Series, h: int) -> pd.Series:
    """Forward h-step log return log(P_{t+h} / P_t), indexed by t.

    Rows whose future price falls outside the series (the last ``h`` rows for
    positive ``h``) come out as NaN and are dropped from the result.
    """
    future_price = price.shift(-h)
    forward_return = np.log(future_price / price)
    return forward_return.dropna()


def brier_score(y: np.ndarray, p: np.ndarray) -> float:
    """Mean squared difference between outcomes ``y`` and predicted probabilities ``p``."""
    squared_error = np.square(y - p)
    return float(np.mean(squared_error))


def expected_calibration_error(y: np.ndarray, p: np.ndarray, bins: int = 10) -> float:
    """Expected Calibration Error (ECE) over equal-width probability bins.

    The unit interval is cut into ``bins`` equal-width bins. For each non-empty
    bin the absolute gap between the mean outcome and the mean predicted
    probability is weighted by the share of samples in that bin.

    Args:
        y: Binary outcomes (0/1), one per prediction.
        p: Predicted probabilities.
        bins: Number of equal-width bins on [0, 1].

    Returns:
        The weighted average calibration gap as a float (0.0 for empty input).
    """
    y = np.asarray(y, dtype=float)
    p = np.asarray(p, dtype=float)

    edges = np.linspace(0, 1, bins + 1)
    ece = 0.0
    for i in range(bins):
        lower, upper = edges[i], edges[i + 1]
        if i == bins - 1:
            # Close the last bin on the right so predictions of exactly 1.0
            # (common after isotonic calibration) are counted, not dropped.
            in_bin = (p >= lower) & (p <= upper)
        else:
            in_bin = (p >= lower) & (p < upper)

        count = in_bin.sum()
        if count == 0:
            continue
        ece += (count / len(p)) * np.abs(np.mean(y[in_bin]) - np.mean(p[in_bin]))
    return float(ece)

In [None]:
# =========================================
# EVALUATION METRICS
# =========================================
def compute_classification_metrics(y_true: np.ndarray, y_pred: np.ndarray,
                                   y_prob: np.ndarray | None = None) -> dict:
    """Compute confusion-matrix based metrics for a binary (0/1) classifier.

    Args:
        y_true: Ground-truth labels (0/1).
        y_pred: Predicted labels (0/1).
        y_prob: Optional predicted probabilities for the positive class; when
            given and both classes occur in ``y_true``, ROC-AUC and average
            precision are added.

    Returns:
        dict with the 2x2 ``confusion_matrix`` (rows = actual, columns =
        predicted, label order [0, 1]), the four cell counts, accuracy,
        precision, recall, f1_score, specificity, false positive/negative
        rates and balanced_accuracy. ``roc_auc`` / ``average_precision`` are
        present only when they were attempted; they are None if scoring failed.
    """
    metrics = {}

    # Pin the label order so the matrix is always 2x2. Without `labels`, data
    # containing a single class yields a 1x1 matrix and the unpacking below fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    metrics['confusion_matrix'] = cm
    metrics['true_negatives'] = int(tn)
    metrics['false_positives'] = int(fp)
    metrics['false_negatives'] = int(fn)
    metrics['true_positives'] = int(tp)

    # Basic metrics
    metrics['accuracy'] = float((tp + tn) / (tp + tn + fp + fn))
    metrics['precision'] = float(precision_score(y_true, y_pred, zero_division=0))
    metrics['recall'] = float(recall_score(y_true, y_pred, zero_division=0))
    metrics['f1_score'] = float(f1_score(y_true, y_pred, zero_division=0))

    # Additional metrics (rates default to 0.0 when their denominator is empty)
    metrics['specificity'] = float(tn / (tn + fp)) if (tn + fp) > 0 else 0.0
    metrics['false_positive_rate'] = float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0
    metrics['false_negative_rate'] = float(fn / (fn + tp)) if (fn + tp) > 0 else 0.0

    # Balanced accuracy
    metrics['balanced_accuracy'] = (metrics['recall'] + metrics['specificity']) / 2

    # Ranking metrics need probabilities and both classes in y_true
    if y_prob is not None and len(np.unique(y_true)) > 1:
        try:
            metrics['roc_auc'] = roc_auc_score(y_true, y_prob)
            metrics['average_precision'] = average_precision_score(y_true, y_prob)
        except (ValueError, TypeError):
            # Best effort: unusable scores (e.g. NaNs) leave the metrics unset
            # rather than aborting; KeyboardInterrupt etc. are no longer swallowed.
            metrics['roc_auc'] = None
            metrics['average_precision'] = None

    return metrics


def print_classification_report(metrics: dict, horizon: int, split: str = "test"):
    """Print a metrics dict as a human-readable report on stdout.

    Args:
        metrics: Must hold a 2x2 'confusion_matrix', the four cell counts and
            the scalar scores printed below; 'roc_auc' / 'average_precision'
            are printed only when 'roc_auc' is present and not None.
        horizon: Forecast horizon shown in the banner.
        split: Name of the data split shown in the banner.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"CLASSIFICATION METRICS - Horizon {horizon} ({split})")
    print(rule)

    # Confusion matrix (rows = actual, columns = predicted)
    cm = metrics['confusion_matrix']
    print("\nConfusion Matrix:")
    print("                Predicted Negative  Predicted Positive")
    print(f"Actual Negative        {cm[0,0]:6d}              {cm[0,1]:6d}")
    print(f"Actual Positive        {cm[1,0]:6d}              {cm[1,1]:6d}")

    # Scalar scores: (pre-padded label, metrics key)
    score_rows = [
        ("  Accuracy:           ", 'accuracy'),
        ("  Balanced Accuracy:  ", 'balanced_accuracy'),
        ("  Precision:          ", 'precision'),
        ("  Recall (TPR):       ", 'recall'),
        ("  Specificity (TNR):  ", 'specificity'),
        ("  F1 Score:           ", 'f1_score'),
    ]
    if metrics.get('roc_auc') is not None:
        score_rows.append(("  ROC-AUC:            ", 'roc_auc'))
        score_rows.append(("  Average Precision:  ", 'average_precision'))

    print("\nPerformance Metrics:")
    for label, key in score_rows:
        print(f"{label}{metrics[key]:.4f}")

    # Raw cell counts and their total
    tp = metrics['true_positives']
    tn = metrics['true_negatives']
    fp = metrics['false_positives']
    fn = metrics['false_negatives']
    print("\nDetailed Counts:")
    print(f"  True Positives:     {tp}")
    print(f"  True Negatives:     {tn}")
    print(f"  False Positives:    {fp}")
    print(f"  False Negatives:    {fn}")
    print(f"  Total Samples:      {tp + tn + fp + fn}")


def plot_confusion_matrix(cm: np.ndarray, horizon: int, save_path: Path | None = None):
    """Plot a confusion matrix as an annotated heatmap.

    Each cell shows the raw count and its share of the row (actual class) total.

    Args:
        cm: Confusion matrix of counts (rows = actual, columns = predicted).
        horizon: Forecast horizon shown in the title.
        save_path: If given, the figure is written there; otherwise it is shown.
    """
    fig = plt.figure(figsize=(8, 6))
    try:
        # Row-normalize to percentages. A class with no samples has a zero row
        # total; report 0% there instead of dividing by zero and printing "nan%".
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_pct = np.divide(
            cm.astype('float'), row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals > 0,
        ) * 100

        # Create annotations
        annot = np.array([[f'{cm[i,j]}\n({cm_pct[i,j]:.1f}%)'
                          for j in range(cm.shape[1])]
                          for i in range(cm.shape[0])])

        sns.heatmap(cm, annot=annot, fmt='', cmap='Blues',
                    xticklabels=['Negative', 'Positive'],
                    yticklabels=['Negative', 'Positive'],
                    cbar_kws={'label': 'Count'})

        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.title(f'Confusion Matrix - Horizon {horizon}')
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=150, bbox_inches='tight')
            print(f"  Saved confusion matrix to: {save_path}")
        else:
            plt.show()
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)


def plot_roc_curve(y_true: np.ndarray, y_prob: np.ndarray,
                   horizon: int, save_path: Path = None):
    """Draw the ROC curve with its AUC next to the chance diagonal.

    Prints a warning and returns without plotting when ``y_true`` contains a
    single class. The figure is saved to ``save_path`` when given, shown
    otherwise, and closed afterwards.
    """
    if np.unique(y_true).size < 2:
        print("  Warning: Cannot plot ROC curve - only one class present")
        return

    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc_value = roc_auc_score(y_true, y_prob)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {auc_value:.3f})')
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
            label='Random classifier')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate (Recall)')
    ax.set_title(f'ROC Curve - Horizon {horizon}')
    ax.legend(loc="lower right")
    ax.grid(alpha=0.3)
    fig.tight_layout()

    if not save_path:
        plt.show()
    else:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"  Saved ROC curve to: {save_path}")

    plt.close(fig)


def plot_precision_recall_curve(y_true: np.ndarray, y_prob: np.ndarray,
                                horizon: int, save_path: Path = None):
    """Draw the precision-recall curve with a prevalence baseline.

    Prints a warning and returns without plotting when ``y_true`` contains a
    single class. The figure is saved to ``save_path`` when given, shown
    otherwise, and closed afterwards.
    """
    if np.unique(y_true).size < 2:
        print("  Warning: Cannot plot PR curve - only one class present")
        return

    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    ap_value = average_precision_score(y_true, y_prob)
    # Share of positives: the horizontal reference line on the plot.
    prevalence = np.sum(y_true) / len(y_true)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(recall, precision, color='darkorange', lw=2,
            label=f'PR curve (AP = {ap_value:.3f})')
    ax.axhline(y=prevalence, color='navy', linestyle='--', lw=2,
               label=f'Baseline (prevalence = {prevalence:.3f})')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title(f'Precision-Recall Curve - Horizon {horizon}')
    ax.legend(loc="lower left")
    ax.grid(alpha=0.3)
    fig.tight_layout()

    if not save_path:
        plt.show()
    else:
        fig.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"  Saved PR curve to: {save_path}")

    plt.close(fig)


def save_metrics_to_csv(all_metrics: dict, save_path: Path):
    """Flatten per-symbol, per-horizon metrics into one CSV and print a summary.

    Args:
        all_metrics: Mapping symbol -> {horizon -> metrics dict}. Missing
            scores are written as NaN and missing counts as 0.
        save_path: Destination CSV path (written without the index).
    """
    # (CSV column, key in the metrics dict) for NaN-defaulted scores.
    score_columns = (
        ('accuracy', 'accuracy'),
        ('balanced_accuracy', 'balanced_accuracy'),
        ('precision', 'precision'),
        ('recall', 'recall'),
        ('specificity', 'specificity'),
        ('f1_score', 'f1_score'),
        ('roc_auc', 'roc_auc'),
        ('avg_precision', 'average_precision'),
        ('brier_score', 'brier_val'),
        ('ece', 'ece_val'),
    )
    count_columns = ('true_positives', 'false_positives',
                     'true_negatives', 'false_negatives')

    records = []
    for symbol, horizons in all_metrics.items():
        for h, metrics in horizons.items():
            record = {'symbol': symbol, 'horizon': h}
            for column, key in score_columns:
                record[column] = metrics.get(key, np.nan)
            for column in count_columns:
                record[column] = metrics.get(column, 0)
            records.append(record)

    summary = pd.DataFrame(records)
    summary.to_csv(save_path, index=False)
    print(f"\nSaved metrics summary to: {save_path}")

    # Descriptive statistics over the headline scores
    banner = "=" * 60
    print("\n" + banner)
    print("OVERALL METRICS SUMMARY (across all symbols and horizons)")
    print(banner)
    headline = ['accuracy', 'balanced_accuracy', 'precision', 'recall',
                'f1_score', 'roc_auc', 'brier_score', 'ece']
    print(summary[headline].describe().round(4))


In [None]:
# =========================================
# FEATURE ENGINEERING
# =========================================
def make_feature_table(close: pd.Series):
    """Derive the model's feature table from a close-price series.

    Features: 1/3/6-bar log returns, 6- and 12-bar rolling standard deviation
    of the 1-bar return, and the log ratio of the 10- to 20-bar moving average.
    Warm-up rows containing NaN are dropped; the index is NOT reset.

    Returns:
        (table, X): ``table`` holds "price" plus the feature columns, and ``X``
        is the float feature matrix (all columns except "price"), row-aligned
        with ``table``.
    """
    px = close.astype(float)
    table = pd.DataFrame(index=close.index)
    table["price"] = px

    # Trailing log returns over several look-backs
    for lag in (1, 3, 6):
        table[f"ret_{lag}"] = np.log(px / px.shift(lag))

    # Realized volatility of one-bar returns
    one_bar_return = table["ret_1"]
    for window in (6, 12):
        table[f"vol_{window}"] = one_bar_return.rolling(window).std()

    # Short vs. long moving-average spread
    table["ma_ratio"] = np.log(px.rolling(10).mean() / px.rolling(20).mean())

    # Remove warm-up rows where any rolling/shifted feature is undefined
    table = table.dropna()

    feature_columns = [name for name in table.columns if name != "price"]
    X = table[feature_columns].to_numpy().astype(float)

    return table, X

In [None]:
# =========================================
# WALK-FORWARD CV
# =========================================
def purged_walkforward_slices(n: int, n_folds: int = 3, embargo: int = 24):
    """Build expanding-window (train, val, test) index ranges.

    The series of length ``n`` is cut into ``n_folds + 2`` equal blocks. Fold k
    trains on the first k blocks, validates on the next block and tests on the
    one after, with ``embargo`` rows skipped before validation and before test.
    Generation stops at the first fold whose test range would be shorter than
    half a block.

    Returns:
        List of ((train_start, train_end), (val_start, val_end),
        (test_start, test_end)) tuples with half-open positional bounds.
    """
    block = n // (n_folds + 2)
    shortest_test = block // 2
    plan = []

    for fold in range(1, n_folds + 1):
        train_stop = fold * block
        val_lo = train_stop + embargo
        val_hi = val_lo + block
        test_lo = val_hi + embargo
        test_hi = min(test_lo + block, n)  # clip to the end of the data

        if test_hi - test_lo < shortest_test:
            break

        plan.append(((0, train_stop), (val_lo, val_hi), (test_lo, test_hi)))

    return plan

In [None]:
# =========================================
# XGBOOST MODEL
# =========================================
@dataclass
class XGBSnapshot:
    """Container for trained XGBoost model."""
    booster: xgb.Booster
    scaler: StandardScaler
    horizon: int
    feature_names: list[str]
    best_iteration: int | None = None
    feature_importance: dict | None = None


def fit_xgboost_classifier(X_train: np.ndarray, y_train: np.ndarray,
                           X_val: np.ndarray | None = None, y_val: np.ndarray | None = None,
                           horizon: int = 1,
                           feature_names: list[str] | None = None,
                           random_state: int = 123,
                           n_estimators: int = 100,
                           max_depth: int = 6,
                           learning_rate: float = 0.1,
                           subsample: float = 0.8,
                           colsample_bytree: float = 0.8,
                           reg_alpha: float = 0.0,
                           reg_lambda: float = 1.0,
                           scale_pos_weight: float | None = None,
                           early_stopping_rounds: int | None = 10) -> XGBSnapshot:
    """
    Train a binary XGBoost classifier on standardized features.

    Early stopping is applied only when BOTH ``X_val`` and ``y_val`` are given;
    it then monitors log-loss on that validation set.

    Args:
        X_train: Training features
        y_train: Training labels (binary)
        X_val: Validation features (for early stopping)
        y_val: Validation labels
        horizon: Forecast horizon (stored on the snapshot)
        feature_names: List of feature names (defaults to f0, f1, ...)
        random_state: Seed passed to XGBoost
        n_estimators: Number of boosting rounds (trees)
        max_depth: Maximum tree depth
        learning_rate: Step size shrinkage (eta)
        subsample: Subsample ratio of training instances
        colsample_bytree: Subsample ratio of columns per tree
        reg_alpha: L1 regularization term on weights
        reg_lambda: L2 regularization term on weights
        scale_pos_weight: Balancing of positive/negative weights
            (negatives / positives of ``y_train`` if None)
        early_stopping_rounds: Stop if no improvement for N rounds (None disables)

    Returns:
        XGBSnapshot with the booster, the fitted scaler, gain-based feature
        importance and ``best_iteration`` as a 0-based round index (the last
        trained round when early stopping did not run).

    Raises:
        ImportError: if xgboost could not be imported.
    """
    if not XGBOOST_AVAILABLE:
        raise ImportError("XGBoost not installed. Run: pip install xgboost")

    # Auto-balance classes if not specified
    if scale_pos_weight is None:
        neg_count = np.sum(y_train == 0)
        pos_count = np.sum(y_train == 1)
        scale_pos_weight = float(neg_count / pos_count) if pos_count > 0 else 1.0

    # Standardize features; the fitted scaler is returned so that inference
    # applies exactly the same transform.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    # Create DMatrix (XGBoost's internal data structure)
    if feature_names is None:
        feature_names = [f"f{i}" for i in range(X_train.shape[1])]

    dtrain = xgb.DMatrix(X_train_scaled, label=y_train, feature_names=feature_names)

    # Parameters
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': max_depth,
        'eta': learning_rate,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'alpha': reg_alpha,
        'lambda': reg_lambda,
        'scale_pos_weight': scale_pos_weight,
        'seed': random_state,
        'tree_method': 'auto',
        'verbosity': 0  # Silent
    }

    # Early stopping watches the LAST entry of `evals`, so it may only be
    # switched on when a real validation set was appended. Otherwise it would
    # stop on the training loss.
    evals = [(dtrain, 'train')]
    has_validation = X_val is not None and y_val is not None
    if has_validation:
        X_val_scaled = scaler.transform(X_val)
        dval = xgb.DMatrix(X_val_scaled, label=y_val, feature_names=feature_names)
        evals.append((dval, 'val'))

    # Train
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=n_estimators,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds if has_validation else None,
        verbose_eval=False
    )

    # Feature importance
    importance_dict = bst.get_score(importance_type='gain')  # 'gain', 'weight', or 'cover'

    # `best_iteration` is a 0-based round index and may be undefined when early
    # stopping did not run; fall back to the index of the last trained round so
    # the stored value always has the same meaning.
    best_iteration = getattr(bst, 'best_iteration', None)
    if best_iteration is None:
        best_iteration = bst.num_boosted_rounds() - 1

    return XGBSnapshot(
        booster=bst,
        scaler=scaler,
        horizon=horizon,
        feature_names=feature_names,
        best_iteration=int(best_iteration),
        feature_importance=importance_dict
    )

In [None]:
# =========================================
# FORECASTING
# =========================================
def forecast_multi_horizon_xgb(
    snapshots: dict[int, XGBSnapshot],
    X_seg: np.ndarray,
    price_seg: pd.Series,
    horizons: list[int],
    cost_bp: dict[int, float] | None = None
):
    """
    Generate multi-horizon forecasts using trained XGBoost models.

    For every horizon with a model, the first ``len(X_seg) - h`` rows of the
    segment are scored. The raw class probability ``p_edge_raw`` comes from the
    booster; the expected return, standard deviation and 10/50/90% quantiles
    are simple closed-form heuristics of that probability and the trading cost,
    not separately fitted models.

    Args:
        snapshots: horizon -> trained XGBSnapshot.
        X_seg: Feature matrix of the segment (rows aligned with ``price_seg``).
        price_seg: Prices of the segment; its index labels the output rows.
        horizons: Horizons to forecast; those without a snapshot are skipped
            with a warning.
        cost_bp: horizon -> cost in basis points (DEFAULT_COST_BP, with 8.0 as
            fallback, when None).

    Returns:
        (out, cost_log): ``out`` maps horizon -> DataFrame of forecasts and
        ``cost_log`` maps horizon -> cost in log-return units.
    """
    if cost_bp is None:
        cost_bp = {h: DEFAULT_COST_BP.get(h, 8.0) for h in horizons}
    cost_log = {h: bp_to_logret(float(cost_bp[h])) for h in horizons}

    Tseg = X_seg.shape[0]
    idx = price_seg.index
    out = {}

    for h in horizons:
        if h not in snapshots:
            print(f"Warning: No model for horizon {h}, skipping")
            continue

        snap = snapshots[h]
        out_h = pd.DataFrame(index=idx[:-h] if h < Tseg else idx[:0])
        T_h = Tseg - h

        if T_h <= 0:
            out[h] = out_h
            continue

        # Scale features
        X_scaled = snap.scaler.transform(X_seg[:T_h])

        # Create DMatrix
        dtest = xgb.DMatrix(X_scaled, feature_names=snap.feature_names)

        # Get probabilities. After early stopping the booster still contains the
        # rounds trained past the best one, so restrict prediction to the best
        # round. The bound is clamped to the number of trained rounds because
        # the stored value may be a round count rather than an index.
        predict_kwargs = {}
        if snap.best_iteration is not None:
            last_round = min(int(snap.best_iteration) + 1,
                             snap.booster.num_boosted_rounds())
            if last_round > 0:
                predict_kwargs["iteration_range"] = (0, last_round)
        p_edge = snap.booster.predict(dtest, **predict_kwargs)

        # Expected return estimation (heuristic payoff mix of win/lose outcomes)
        # NOTE(review): 0.002 / 0.001 look like assumed payoffs beyond cost - confirm.
        mu = p_edge * (cost_log[h] + 0.002) + (1 - p_edge) * (-cost_log[h] - 0.001)

        # Uncertainty estimate (based on probability)
        # NOTE(review): 0.04 is an unexplained scale factor - confirm.
        std_h = np.sqrt(p_edge * (1 - p_edge)) * 0.04

        # Quantiles (1.28 ~ z-score of the 10%/90% normal quantiles)
        q10 = mu - 1.28 * std_h
        q50 = mu
        q90 = mu + 1.28 * std_h

        # Populate DataFrame
        p_now = price_seg.iloc[:T_h].values

        out_h['mu'] = mu
        out_h['std'] = std_h
        out_h['p_edge_raw'] = p_edge
        out_h['ret_q10'] = q10
        out_h['ret_q50'] = q50
        out_h['ret_q90'] = q90
        out_h['price_pred'] = p_now * np.exp(mu)
        out_h['price_q10'] = p_now * np.exp(q10)
        out_h['price_q50'] = p_now * np.exp(q50)
        out_h['price_q90'] = p_now * np.exp(q90)

        out[h] = out_h

    return out, cost_log

In [15]:
# =========================================
# CALIBRATION
# =========================================
@dataclass
class ProbCalibrator:
    """Probability calibrator using isotonic regression.

    Built by fit_prob_calibrator_isotonic and consumed by apply_prob_calibrator.
    """
    # "isotonic" when a fitted model is attached; any other value (the fitter
    # uses "identity") makes apply_prob_calibrator return its input unchanged.
    method: str
    # Fitted isotonic model, or None for the identity calibrator.
    iso: IsotonicRegression | None = None


def fit_prob_calibrator_isotonic(p_raw: np.ndarray, y: np.ndarray,
                                 min_points: int = 30) -> ProbCalibrator:
    """Learn a monotone map from raw probabilities to observed outcomes.

    Non-finite pairs are discarded first. If fewer than ``min_points`` pairs
    remain, or the raw probabilities take fewer than three distinct values, an
    identity calibrator is returned instead of fitting.
    """
    scores = np.asarray(p_raw, float)
    outcomes = np.asarray(y, float)

    usable = np.isfinite(scores) & np.isfinite(outcomes)
    scores = scores[usable]
    outcomes = outcomes[usable]

    fittable = scores.size >= min_points and np.unique(scores).size >= 3
    if not fittable:
        return ProbCalibrator(method="identity", iso=None)

    # Inputs outside the fitted range are clipped to its ends at predict time.
    model = IsotonicRegression(out_of_bounds="clip")
    model.fit(scores, outcomes)
    return ProbCalibrator(method="isotonic", iso=model)


def apply_prob_calibrator(cal: ProbCalibrator, p_raw: np.ndarray) -> np.ndarray:
    """Map raw probabilities through ``cal``; non-isotonic calibrators pass them through."""
    probabilities = np.asarray(p_raw, float)
    use_isotonic = cal.method == "isotonic"
    return cal.iso.predict(probabilities) if use_isotonic else probabilities


@dataclass
class IntervalCalibrator:
    """Conformal prediction interval calibrator.

    Holds one symmetric half-width that apply_conformal_interval subtracts from
    and adds to a point forecast.
    """
    # Label of the fitting procedure; fit_conformal_interval sets "conformal_abs".
    method: str
    # Interval half-width: the (1 - alpha) quantile of absolute residuals
    # (0.0 when no finite residuals were available at fit time).
    q_alpha: float
    # Level passed to the fit; the half-width is the (1 - alpha) quantile.
    alpha: float


def fit_conformal_interval(residuals: np.ndarray, alpha: float = 0.2) -> IntervalCalibrator:
    """Derive a symmetric interval half-width from held-out residuals.

    The half-width is the (1 - alpha) quantile of the absolute finite
    residuals, or 0.0 when there are none.
    """
    finite = np.asarray(residuals, float)
    finite = finite[np.isfinite(finite)]

    if finite.size > 0:
        half_width = float(np.quantile(np.abs(finite), 1 - alpha))
    else:
        half_width = 0.0

    return IntervalCalibrator(method="conformal_abs", q_alpha=half_width, alpha=alpha)


def apply_conformal_interval(cal: IntervalCalibrator, mu: np.ndarray):
    """Return (lower, upper) bounds: the point forecast minus/plus the fitted half-width."""
    center = np.asarray(mu, float)
    half_width = cal.q_alpha
    lower = center - half_width
    upper = center + half_width
    return lower, upper


In [None]:
# =========================================
# FEATURE IMPORTANCE ANALYSIS
# =========================================
def analyze_feature_importance(snapshots: dict[int, XGBSnapshot]):
    """
    Print the ten most important features of each horizon's model.

    Uses the scores stored in ``snap.feature_importance`` and shows each one
    both as a raw value and as a share of the total. Snapshots without stored
    importance are skipped silently.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("FEATURE IMPORTANCE ANALYSIS (XGBoost)")
    print(banner)
    print("\nNote: 'Gain' measures average improvement in loss when splitting on feature.")
    print("Higher gain = more important for predictions.\n")

    for h, snap in snapshots.items():
        importance = snap.feature_importance
        if not importance:  # covers both None and an empty dict
            continue

        print(f"\nHorizon {h} (best iteration: {snap.best_iteration}):")

        # Largest score first
        ranked = sorted(importance.items(), key=lambda pair: pair[1], reverse=True)

        # Denominator for the percentage column
        total_gain = sum(score for _, score in ranked)

        print("  Top Features by Gain:")
        for rank, (feat, gain) in enumerate(ranked[:10], start=1):
            if total_gain > 0:
                pct = gain / total_gain * 100
            else:
                pct = 0
            print(f"    {rank}. {feat:15s}: {gain:8.2f} ({pct:5.1f}%)")


In [None]:
# =========================================
# MAIN TRAINING PIPELINE
# =========================================
def run_xgb_for_symbol(symbol: str, path: Path,
                       horizons: list[int] = HORIZONS,
                       n_folds: int = 3,
                       embargo: int = 24,
                       n_estimators: int = 100,
                       max_depth: int = 6,
                       learning_rate: float = 0.1,
                       early_stopping: bool = True,
                       save_plots: bool = True):
    """
    Train and evaluate XGBoost models for one symbol.

    Per walk-forward fold, one classifier is trained per horizon on the
    training slice. The validation slice is used for early stopping (when
    enabled) and to fit the isotonic probability calibrator and the conformal
    interval; the test slice is then scored with those calibrators and
    classification metrics are printed. Plots are written under
    DATA_DIR / "evaluation_plots" / symbol for the last fold only.

    Args:
        symbol: Asset symbol
        path: Path to CSV file
        horizons: Forecast horizons in bars
        n_folds: Number of walk-forward folds
        embargo: Embargo period between folds
        n_estimators: Number of boosting rounds
        max_depth: Maximum tree depth (3-10 typical)
        learning_rate: Step size shrinkage (0.01-0.3 typical)
        early_stopping: Use validation set for early stopping
        save_plots: If True, save confusion matrix and ROC curves

    Returns:
        results: dict[horizon] -> dict with
            'val' / 'test': DataFrame of all folds concatenated and sorted by
                index (an empty DataFrame if no fold produced output),
            'diag': list of per-fold calibration diagnostics dicts,
            'metrics': list of per-fold test classification-metric dicts.

    Raises:
        ImportError: if xgboost could not be imported.
    """
    if not XGBOOST_AVAILABLE:
        raise ImportError("XGBoost not installed. Run: pip install xgboost")

    # Load data
    df_raw = pd.read_csv(path)
    close_col = _find_close_column(df_raw)
    # Prices are re-indexed 0..N-1, so rows are identified by position in the CSV.
    close = pd.Series(df_raw[close_col].astype(float).values,
                      index=pd.RangeIndex(len(df_raw)), name="close")

    # feat_df keeps its original integer labels after the warm-up rows are dropped,
    # while X is a plain array: slices below are positional (.iloc / X[a:b]) and
    # label-based lookups (.loc) use the labels carried through the forecasts.
    feat_df, X = make_feature_table(close)
    price = feat_df["price"]
    n = len(price)

    # Get feature names
    feature_names = [c for c in feat_df.columns if c != "price"]

    folds = purged_walkforward_slices(n, n_folds=n_folds, embargo=embargo)

    results = {h: {"val": [], "test": [], "diag": [], "metrics": []} for h in horizons}

    # Create plots directory
    if save_plots:
        plots_dir = DATA_DIR / "evaluation_plots" / symbol
        plots_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Training XGBoost for {symbol}")
    print(f"Horizons: {horizons}")
    print(f"Folds: {n_folds}")
    print(f"Params: n_estimators={n_estimators}, max_depth={max_depth}, lr={learning_rate}")
    print(f"{'='*60}\n")

    for fold_idx, ((s0,e0), (s1,e1), (s2,e2)) in enumerate(folds):
        print(f"Fold {fold_idx + 1}/{len(folds)}: Train[{s0}:{e0}] Val[{s1}:{e1}] Test[{s2}:{e2}]")

        # Train one XGBoost per horizon
        snapshots = {}

        for h in horizons:
            print(f"  Training h={h}...", end=" ")

            # Create labels for this horizon.
            # Returns are computed inside each slice, so the last h rows of a slice
            # have no label and are dropped rather than peeking into the next slice.
            ret_train = cumulative_log_returns(price.iloc[s0:e0], h)
            ret_val = cumulative_log_returns(price.iloc[s1:e1], h)

            # Align features and labels (keep only the rows that have a label)
            n_train = min(len(X[s0:e0]), len(ret_train))
            X_train_aligned = X[s0:s0+n_train]
            ret_train_aligned = ret_train.iloc[:n_train]

            if len(X_train_aligned) < 50:
                print("SKIP (insufficient data)")
                continue

            # Binary classification: 1 when the forward return exceeds the trading cost
            y_train = (ret_train_aligned.values > bp_to_logret(DEFAULT_COST_BP[h])).astype(int)

            # Validation set for early stopping
            X_val_use, y_val_use = None, None
            if early_stopping:
                n_val = min(len(X[s1:e1]), len(ret_val))
                X_val_use = X[s1:s1+n_val]
                y_val_use = (ret_val.iloc[:n_val].values > bp_to_logret(DEFAULT_COST_BP[h])).astype(int)

            # Check class balance (warning only; training still proceeds)
            pos_frac = y_train.mean()
            if pos_frac < 0.1 or pos_frac > 0.9:
                print(f"WARN (imbalanced: {pos_frac:.2%} positive)", end=" ")

            snap = fit_xgboost_classifier(
                X_train_aligned, y_train,
                X_val=X_val_use, y_val=y_val_use,
                horizon=h,
                feature_names=feature_names,
                random_state=123 + fold_idx,
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate,
                early_stopping_rounds=10 if early_stopping else None
            )

            snapshots[h] = snap
            print(f"✓ (best_iter={snap.best_iteration})")

        if not snapshots:
            print("  No models trained, skipping fold")
            continue

        # Feature importance analysis (first fold only)
        if fold_idx == 0:
            analyze_feature_importance(snapshots)

        # Forecast on validation and test
        print("  Forecasting validation...", end=" ")
        out_val_raw, cost_log = forecast_multi_horizon_xgb(
            snapshots=snapshots,
            X_seg=X[s1:e1],
            price_seg=price.iloc[s1:e1],
            horizons=horizons
        )
        print("✓")

        print("  Forecasting test...", end=" ")
        out_test_raw, _ = forecast_multi_horizon_xgb(
            snapshots=snapshots,
            X_seg=X[s2:e2],
            price_seg=price.iloc[s2:e2],
            horizons=horizons
        )
        print("✓")

        # Calibration and evaluation
        for h in horizons:
            # Horizons whose model was skipped above have no forecasts
            if h not in out_val_raw or h not in out_test_raw:
                continue

            ret_val = cumulative_log_returns(price.iloc[s1:e1], h)
            idx_common = out_val_raw[h].index.intersection(ret_val.index)

            if len(idx_common) == 0:
                continue

            dfV = out_val_raw[h].loc[idx_common].copy()
            maskV = np.isfinite(dfV["p_edge_raw"].values) & np.isfinite(dfV["mu"].values)
            dfV = dfV[maskV]

            # Too few validation rows to calibrate on: skip this horizon for the fold
            if len(dfV) < 20:
                continue

            ret_val_aligned = ret_val.loc[dfV.index]
            y_val = (ret_val_aligned.values > cost_log[h]).astype(int)
            p_raw_val = dfV["p_edge_raw"].values
            mu_val = dfV["mu"].values

            # Fit calibrators (both on validation data only)
            cal_prob = fit_prob_calibrator_isotonic(p_raw_val, y_val, min_points=20)
            resid_val = ret_val_aligned.values - mu_val
            cal_pi = fit_conformal_interval(resid_val, alpha=0.2)

            # Apply to test
            ret_test = cumulative_log_returns(price.iloc[s2:e2], h)
            idx_test_common = out_test_raw[h].index.intersection(ret_test.index)
            dfT = out_test_raw[h].loc[idx_test_common].copy()

            maskT = np.isfinite(dfT["p_edge_raw"].values) & np.isfinite(dfT["mu"].values)
            dfT = dfT[maskT]

            if len(dfT) == 0:
                continue

            dfT["p_edge"] = apply_prob_calibrator(cal_prob, dfT["p_edge_raw"].values)
            mu_test = dfT["mu"].values
            lo, hi = apply_conformal_interval(cal_pi, mu_test)
            dfT["ret_lo"] = lo
            dfT["ret_hi"] = hi

            # Convert the return interval into a price interval around the current price
            p_now = price.loc[dfT.index].values
            dfT["price_lo"] = p_now * np.exp(lo)
            dfT["price_hi"] = p_now * np.exp(hi)

            # Expected return net of cost, raw and scaled by the forecast std
            dfT["edge"] = dfT["mu"] - cost_log[h]
            dfT["risk_edge"] = (dfT["mu"] - cost_log[h]) / (dfT["std"] + 1e-12)

            results[h]["test"].append(dfT)

            # Compute classification metrics (calibrated probability thresholded at 0.5)
            ret_test_aligned = ret_test.loc[dfT.index]
            y_test_true = (ret_test_aligned.values > cost_log[h]).astype(int)
            y_test_pred = (dfT["p_edge"].values > 0.5).astype(int)
            y_test_prob = dfT["p_edge"].values

            test_metrics = compute_classification_metrics(
                y_test_true, y_test_pred, y_test_prob
            )

            # Print detailed report
            print_classification_report(test_metrics, h, "test")

            # Save plots (only for last fold)
            if save_plots and fold_idx == len(folds) - 1:
                print(f"\n  Generating evaluation plots for horizon {h}...")

                cm_path = plots_dir / f"confusion_matrix_h{h}.png"
                plot_confusion_matrix(test_metrics['confusion_matrix'], h, cm_path)

                if test_metrics.get('roc_auc') is not None:
                    roc_path = plots_dir / f"roc_curve_h{h}.png"
                    plot_roc_curve(y_test_true, y_test_prob, h, roc_path)

                    pr_path = plots_dir / f"precision_recall_h{h}.png"
                    plot_precision_recall_curve(y_test_true, y_test_prob, h, pr_path)

            results[h]["metrics"].append(test_metrics)

            # Calibration diagnostics.
            # These are measured on the same validation rows the calibrators were
            # fitted on, so they are in-sample figures.
            if cal_prob.method == "isotonic":
                p_cal_val = apply_prob_calibrator(cal_prob, p_raw_val)
                brier = brier_score(y_val, p_cal_val)
                ece = expected_calibration_error(y_val, p_cal_val)
            else:
                brier = brier_score(y_val, p_raw_val)
                ece = expected_calibration_error(y_val, p_raw_val)

            coverage = float(np.mean((resid_val >= -cal_pi.q_alpha) & (resid_val <= cal_pi.q_alpha)))

            diag = {
                "h": h,
                "brier_val": float(brier),
                "ece_val": float(ece),
                "pi_coverage_val": coverage
            }
            results[h]["diag"].append(diag)

            # Store validation
            dfV["p_edge"] = apply_prob_calibrator(cal_prob, dfV["p_edge_raw"].values)
            loV, hiV = apply_conformal_interval(cal_pi, mu_val)
            dfV["ret_lo"], dfV["ret_hi"] = loV, hiV
            results[h]["val"].append(dfV)

    # Concatenate folds ('diag' and 'metrics' stay as per-fold lists)
    for h in horizons:
        for split in ("val", "test"):
            if results[h][split]:
                results[h][split] = pd.concat(results[h][split]).sort_index()
            else:
                results[h][split] = pd.DataFrame()

    print(f"\nCompleted {symbol}\n")
    return results

In [18]:
# =========================================
# JSON EXPORT
# =========================================
def build_json_records(all_outputs: dict,
                       model_version: str = MODEL_VERSION,
                       calibration_version: str = CALIBRATION_VERSION,
                       horizons: list[int] = HORIZONS):
    """Build JSONL records for trading agent.

    Flattens the per-symbol, per-horizon test predictions into a flat list of
    JSON-serialisable dicts, one per test row.

    Args:
        all_outputs: Mapping ``symbol -> results`` where ``results[h]["test"]``
            is either a DataFrame or a list/tuple of per-fold DataFrames.
            Each row must provide "mu", "std", "p_edge", "price_pred",
            "price_q10", "price_q50" and "price_q90"; "edge" is read only for
            rows whose "p_edge" passes the probability gate. Index labels must
            be convertible with ``int()``.
        model_version: Copied verbatim into every record.
        calibration_version: Copied verbatim into every record.
        horizons: Horizons to export; each must be a key of every ``results``.

    Returns:
        list[dict]: One record per (symbol, horizon, test row), in iteration
        order of ``all_outputs``, then ``horizons``, then the frame's rows.
    """
    records = []
    for sym, res in all_outputs.items():
        for h in horizons:
            df = res[h]["test"]
            if isinstance(df, (list, tuple)):
                # pd.concat raises ValueError on an empty sequence; an empty
                # fold list simply means there is nothing to export here.
                if not df:
                    continue
                df = pd.concat(df).sort_index()
            for t, row in df.iterrows():
                # Both directions share the same gate: calibrated probability
                # and edge must clear their thresholds (read at call time).
                passes_gate = row["p_edge"] >= TAU_P and row["edge"] >= TAU_MU
                if passes_gate and row["mu"] > 0:
                    action = "buy"
                elif passes_gate and row["mu"] < 0:
                    action = "sell"
                else:
                    # Gate failed, mu == 0, or a NaN made a comparison False.
                    action = "hold"

                rec = {
                    "timestamp_index": int(t),
                    "symbol": sym,
                    "horizon_bars": int(h),
                    "model_version": model_version,
                    "calibration_version": calibration_version,
                    "signals": {
                        "expected_return": float(row["mu"]),
                        "stdev_return": float(row["std"]),
                        "p_edge_gt_cost": float(row["p_edge"]),
                        "predicted_price": float(row["price_pred"]),
                        "price_PI": {
                            "p10": float(row["price_q10"]),
                            "p50": float(row["price_q50"]),
                            "p90": float(row["price_q90"])
                        }
                    },
                    "policy_suggestions": {
                        "gate_threshold_p": TAU_P,
                        "gate_threshold_edge": TAU_MU,
                        "suggested_action": action
                    }
                }
                records.append(rec)
    return records

In [None]:
# =========================================
# MAIN EXECUTION
# =========================================
if __name__ == "__main__":
    # Entry point: train/evaluate every configured symbol, print a per-symbol
    # summary, then write the metrics CSV and the JSONL trader feed.
    if not XGBOOST_AVAILABLE:
        print("\nERROR: XGBoost is not installed!")
        print("Please install with: pip install xgboost")
        print("Then run this script again.\n")
        # The `exit` helper is intended for interactive shells and is not
        # guaranteed to halt execution at this point; SystemExit is.
        raise SystemExit(1)

    # Process all symbols
    all_outputs = {}
    all_metrics = {}

    # Metrics averaged (unweighted) across folds vs. counts summed across folds.
    fold_mean_keys = ('accuracy', 'balanced_accuracy', 'precision',
                      'recall', 'specificity', 'f1_score')
    fold_sum_keys = ('true_positives', 'false_positives',
                     'true_negatives', 'false_negatives')

    for symbol, path in FILES.items():
        if not path.exists():
            print(f"Warning: {path} not found, skipping {symbol}")
            continue

        results = run_xgb_for_symbol(
            symbol=symbol,
            path=path,
            horizons=HORIZONS,
            n_folds=3,
            embargo=24,
            n_estimators=100,      # Try: 50 (faster), 200 (more accurate)
            max_depth=6,           # Try: 3-4 (less overfit), 8-10 (more complex)
            learning_rate=0.1,     # Try: 0.01-0.05 (slower learning), 0.2-0.3 (faster)
            early_stopping=True,   # Recommended: prevents overfitting
            save_plots=True
        )

        all_outputs[symbol] = results

        # Collect metrics
        all_metrics[symbol] = {}
        for h in HORIZONS:
            metrics_list = results[h]['metrics']
            if not metrics_list:
                continue

            avg_metrics = {
                key: np.mean([m[key] for m in metrics_list])
                for key in fold_mean_keys
            }

            # roc_auc / average_precision may be None for some folds; average
            # only the folds that have them and fall back to NaN explicitly
            # (np.mean([]) would also give NaN, but with a RuntimeWarning).
            roc_values = [m['roc_auc'] for m in metrics_list
                          if m.get('roc_auc') is not None]
            ap_values = [m['average_precision'] for m in metrics_list
                         if m.get('average_precision') is not None]
            avg_metrics['roc_auc'] = np.mean(roc_values) if roc_values else np.nan
            avg_metrics['average_precision'] = np.mean(ap_values) if ap_values else np.nan

            for key in fold_sum_keys:
                avg_metrics[key] = sum(m[key] for m in metrics_list)

            diag_list = results[h]['diag']
            if diag_list:
                avg_metrics['brier_val'] = np.mean([d['brier_val'] for d in diag_list])
                avg_metrics['ece_val'] = np.mean([d['ece_val'] for d in diag_list])

            all_metrics[symbol][h] = avg_metrics

        # Print summary
        print(f"\n{'='*60}")
        print(f"SUMMARY FOR {symbol}")
        print(f"{'='*60}")

        for h in HORIZONS:
            test_df = results[h]['test']

            if len(test_df) > 0 and h in all_metrics[symbol]:
                metrics = all_metrics[symbol][h]
                print(f"\nHorizon {h}:")
                print(f"  Test samples:      {len(test_df)}")
                print(f"  Accuracy:          {metrics['accuracy']:.4f}")
                print(f"  Balanced Acc:      {metrics['balanced_accuracy']:.4f}")
                print(f"  Precision:         {metrics['precision']:.4f}")
                print(f"  Recall:            {metrics['recall']:.4f}")
                print(f"  F1 Score:          {metrics['f1_score']:.4f}")
                # ROC-AUC is NaN when no fold produced one; omit the line then.
                if not np.isnan(metrics.get('roc_auc', np.nan)):
                    print(f"  ROC-AUC:           {metrics['roc_auc']:.4f}")
                print(f"  Brier Score:       {metrics.get('brier_val', np.nan):.4f}")
                print(f"  ECE:               {metrics.get('ece_val', np.nan):.4f}")

    # Save metrics summary
    metrics_csv_path = DATA_DIR / "xgb_metrics_summary.csv"
    save_metrics_to_csv(all_metrics, metrics_csv_path)

    # Export to JSON (one record per line).
    # NOTE(review): json.dumps emits bare NaN/Infinity tokens for non-finite
    # floats by default, which strict JSON parsers reject — confirm the
    # consumer tolerates them or that the signals are always finite.
    json_records = build_json_records(all_outputs)
    json_path = DATA_DIR / "trader_feed_xgb_multiH.jsonl"
    with open(json_path, "w", encoding="utf-8") as f:
        for r in json_records:
            f.write(json.dumps(r) + "\n")

    print(f"\n{'='*60}")
    print("FINAL OUTPUTS")
    print(f"{'='*60}")
    print(f"JSON feed:        {json_path}")
    print(f"Metrics CSV:      {metrics_csv_path}")
    print(f"Plots directory:  {DATA_DIR / 'evaluation_plots'}")
    print(f"{'='*60}\n")

    print("💡 Tip: XGBoost typically achieves the best accuracy!")
    print("Compare with other models using the metrics CSV files.")



Training XGBoost for BTC
Horizons: [1, 3, 6]
Folds: 3
Params: n_estimators=100, max_depth=6, lr=0.1

Fold 1/3: Train[0:1749] Val[1773:3522] Test[3546:5295]
  Training h=1... ✓ (best_iter=11)
  Training h=3... ✓ (best_iter=5)
  Training h=6... ✓ (best_iter=0)

FEATURE IMPORTANCE ANALYSIS (XGBoost)

Note: 'Gain' measures average improvement in loss when splitting on feature.
Higher gain = more important for predictions.


Horizon 1 (best iteration: 11):
  Top Features by Gain:
    1. ret_3          :     3.48 ( 17.3%)
    2. ret_1          :     3.45 ( 17.2%)
    3. ret_6          :     3.37 ( 16.8%)
    4. vol_6          :     3.31 ( 16.5%)
    5. ma_ratio       :     3.26 ( 16.2%)
    6. vol_12         :     3.23 ( 16.1%)

Horizon 3 (best iteration: 5):
  Top Features by Gain:
    1. vol_12         :     4.27 ( 18.0%)
    2. ret_3          :     4.26 ( 18.0%)
    3. ma_ratio       :     4.17 ( 17.6%)
    4. ret_6          :     3.89 ( 16.4%)
    5. vol_6          :     3.66 ( 15.5%)
 