In [1]:
!pip install torch torchvision



In [None]:
#!/usr/bin/env python3
"""
GRU Multi-Horizon Trading Signal Generator
Complete implementation for cryptocurrency trading with custom GRU (Gated Recurrent Unit)

Key features:
- Custom GRU implementation (from user's code)
- One GRU model per horizon (1, 3, 6 bars)
- Sequence modeling for time series
- Native handling of temporal dependencies
- Walk-forward cross-validation
- Comprehensive evaluation metrics
- JSON feed output for trading agent
- GPU acceleration support
"""

import json
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    average_precision_score
)

import matplotlib
matplotlib.use('Agg')  # Non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings("ignore")


# =========================================
# CUSTOM GRU IMPLEMENTATION (USER PROVIDED)
# =========================================
class CustomGRUCell(nn.Module):
    """One GRU time step assembled from six linear projections.

    Each of the update gate (z), reset gate (r) and candidate state (h~) has
    a ``W_*`` layer applied to the input and a ``U_*`` layer applied to the
    hidden state.

    Args:
        input_size: Width of the input vector ``x``.
        hidden_size: Width of the hidden state ``h``.
        bias: Whether every linear projection carries a bias term.
    """

    def __init__(self, input_size, hidden_size, bias=True):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        def projection(in_features):
            # All six layers map into the hidden space and share the bias flag.
            return nn.Linear(in_features, hidden_size, bias=bias)

        # Creation order is kept (z, r, h~) so parameter initialisation and
        # state_dict ordering do not change.
        self.W_z = projection(input_size)
        self.U_z = projection(hidden_size)

        self.W_r = projection(input_size)
        self.U_r = projection(hidden_size)

        self.W_h = projection(input_size)
        self.U_h = projection(hidden_size)

    def forward(self, x, h):
        """Return the next hidden state for input ``x`` and previous state ``h``."""
        update_gate = torch.sigmoid(self.W_z(x) + self.U_z(h))
        reset_gate = torch.sigmoid(self.W_r(x) + self.U_r(h))
        # The reset gate scales the previous state before the recurrent projection.
        candidate = torch.tanh(self.W_h(x) + self.U_h(reset_gate * h))
        # update_gate == 0 keeps the old state, update_gate == 1 takes the candidate.
        return (1 - update_gate) * h + update_gate * candidate


class CustomGRU(nn.Module):
    """Stacked GRU that unrolls ``CustomGRUCell`` layers one time step at a time.

    Args:
        input_size: Feature width of the input sequence.
        hidden_size: Width of every layer's hidden state.
        num_layers: Number of stacked cells. Layer 0 consumes the input;
            each later layer consumes the previous layer's output sequence.
        bias: Forwarded to every cell.
        batch_first: If True, ``forward`` takes and returns tensors laid out
            as (batch, seq_len, features); otherwise (seq_len, batch, features).
    """

    def __init__(self, input_size, hidden_size, num_layers=1, bias=True, batch_first=False):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_first = batch_first

        self.cells = nn.ModuleList([
            CustomGRUCell(input_size if i == 0 else hidden_size, hidden_size, bias)
            for i in range(num_layers)
        ])

    def forward(self, x, h=None):
        """Run the whole sequence through every layer.

        Args:
            x: Input sequence, laid out according to ``batch_first``.
            h: Optional initial hidden state of shape
                (num_layers, batch, hidden_size). Zeros are used when omitted.

        Returns:
            Tuple ``(output, h_n)``: the top layer's hidden state at every
            time step (same layout as ``x``) and the final hidden state of
            each layer stacked as (num_layers, batch, hidden_size).
        """
        # Work internally in (seq_len, batch, features).
        if self.batch_first:
            x = x.transpose(0, 1)

        seq_len, batch_size, _ = x.size()

        if h is None:
            # new_zeros inherits device AND dtype from x. The previous
            # torch.zeros(..., device=x.device) always produced float32 and
            # broke models converted with .double() / .half().
            h = x.new_zeros(self.num_layers, batch_size, self.hidden_size)

        h_n = []
        for layer, cell in enumerate(self.cells):
            h_t = h[layer]
            outputs = []
            for t in range(seq_len):
                h_t = cell(x[t], h_t)
                outputs.append(h_t)
            # This layer's output sequence is the next layer's input.
            x = torch.stack(outputs, dim=0)
            h_n.append(h_t)

        h_n = torch.stack(h_n, dim=0)

        # Restore the caller's layout.
        if self.batch_first:
            x = x.transpose(0, 1)

        return x, h_n


class GRUClassifier(nn.Module):
    """
    Binary classifier: GRU encoder -> dropout -> linear -> sigmoid.

    The recurrent encoder is either the hand-written ``CustomGRU`` or
    ``torch.nn.GRU``, selected by ``use_custom_gru``. Both are built with
    ``batch_first=True``.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2,
                 dropout=0.2, device='cpu', use_custom_gru=True):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

        shared_kwargs = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
        )
        if not use_custom_gru:
            # Inter-layer dropout is only passed to nn.GRU when layers are stacked.
            inter_layer_dropout = dropout if num_layers > 1 else 0
            self.gru = nn.GRU(dropout=inter_layer_dropout, **shared_kwargs)
        else:
            self.gru = CustomGRU(**shared_kwargs)

        # Regularisation applied to the final hidden representation
        self.dropout = nn.Dropout(dropout)

        # Projects the hidden representation to a single logit
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a (batch, seq_len, features) tensor to (batch, 1) probabilities."""
        sequence_out, _ = self.gru(x)

        # Only the representation at the final time step feeds the head.
        final_step = self.dropout(sequence_out[:, -1, :])

        probability = torch.sigmoid(self.fc(final_step))

        # Guarantee a (batch, 1) result
        return probability.view(-1, 1)


# =========================================
# CONFIGURATION
# =========================================
# Directory containing the per-symbol CSV files listed in FILES.
# NOTE(review): absolute, machine-specific path; evaluation plots are also
# written underneath it (see run_gru_for_symbol).
DATA_DIR = Path("/Users/nitinlodha/Desktop/ML/ML_Project/Bybit_CSV_Data")

# Symbol -> CSV file holding that symbol's price history.
FILES = {
    "BTC": DATA_DIR / "Bybit_BTC.csv",
    "ETH": DATA_DIR / "Bybit_ETH.csv",
    "SOL": DATA_DIR / "Bybit_SOL.csv",
    "XRP": DATA_DIR / "Bybit_XRP.csv",
    "DOGE": DATA_DIR / "Bybit_DOGE.csv",
}

HORIZONS = [1, 3, 6]  # Forecast horizons in 4-hour bars
# Keyed by horizon; forecast_multi_horizon_gru falls back to 8.0 bp for a
# horizon that is missing from this mapping.
DEFAULT_COST_BP = {1: 8.0, 3: 10.0, 6: 12.0}  # Trading costs in basis points

# GRU hyperparameters
# These are the default argument values of train_gru_classifier.
SEQUENCE_LENGTH = 20  # Look back 20 time steps (80 hours)
HIDDEN_SIZE = 64      # Hidden units in GRU
NUM_LAYERS = 2        # Stacked GRU layers
DROPOUT = 0.2         # Dropout rate
BATCH_SIZE = 32       # Training batch size
EPOCHS = 50           # Training epochs
LEARNING_RATE = 0.001 # Learning rate
EARLY_STOPPING = 10   # Early stopping patience

# Policy thresholds
# NOTE(review): not referenced in the part of the file visible here; the
# names suggest probability / expected-return cut-offs, a risk-aversion
# factor and a weight cap — confirm against the code that consumes them.
TAU_P = 0.60
TAU_MU = 0.0005
LAM = 2.0
W_MAX = 0.50

# Version tags; presumably embedded in the JSON feed output — confirm.
MODEL_VERSION = "gru_multiH_v1.0"
CALIBRATION_VERSION = "iso+conformal_v1"

# Device configuration
# CUDA is used when torch reports it as available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# =========================================
# UTILITY FUNCTIONS
# =========================================
def bp_to_logret(bp: float) -> float:
    """Scale a basis-point amount into return units (1 bp = 1e-4)."""
    per_basis_point = 1e-4
    return bp * per_basis_point


def _find_close_column(df: pd.DataFrame) -> str:
    """Find the close price column in a dataframe."""
    lower = {c.lower(): c for c in df.columns}
    for key in ("close", "closing_price", "close_price", "price_close", "last", "c"):
        if key in lower:
            return lower[key]
    float_cols = [c for c in df.columns if pd.api.types.is_float_dtype(df[c])]
    if len(float_cols) == 1:
        return float_cols[0]
    raise ValueError("Cannot identify 'close' column.")


def cumulative_log_returns(price: pd.Series, h: int) -> pd.Series:
    """Return log(P[t+h] / P[t]) indexed by t, with undefined rows dropped."""
    price_ahead = price.shift(-h)
    log_ratio = np.log(price_ahead / price)
    # Rows without a price h steps ahead are NaN and are removed here.
    return log_ratio.dropna()


def brier_score(y: np.ndarray, p: np.ndarray) -> float:
    """Mean squared difference between outcomes ``y`` and probabilities ``p``."""
    error = y - p
    return float(np.mean(error ** 2))


def expected_calibration_error(y: np.ndarray, p: np.ndarray, bins: int = 10) -> float:
    """Expected Calibration Error (ECE) over equal-width probability bins.

    For every bin, the absolute gap between the mean outcome and the mean
    predicted probability is weighted by the share of samples in that bin.

    Args:
        y: Observed outcomes, one per prediction.
        p: Predicted probabilities in [0, 1].
        bins: Number of equal-width bins spanning [0, 1].

    Returns:
        The weighted average calibration gap; 0.0 when there are no samples.
    """
    y = np.asarray(y, dtype=float)
    p = np.asarray(p, dtype=float)
    total = p.size
    if total == 0:
        return 0.0

    edges = np.linspace(0, 1, bins + 1)
    last_bin = bins - 1
    ece = 0.0
    for i in range(bins):
        lower, upper = edges[i], edges[i + 1]
        # Bins are half-open [lower, upper) except the last one, which is
        # closed so that predictions of exactly 1.0 are counted rather than
        # silently dropped.
        below_upper = (p <= upper) if i == last_bin else (p < upper)
        in_bin = (p >= lower) & below_upper
        count = in_bin.sum()
        if count == 0:
            continue
        ece += (count / total) * np.abs(np.mean(y[in_bin]) - np.mean(p[in_bin]))
    return float(ece)


# =========================================
# EVALUATION METRICS
# =========================================
def compute_classification_metrics(y_true: np.ndarray, y_pred: np.ndarray,
                                   y_prob: np.ndarray = None) -> dict:
    """Compute confusion-matrix based metrics for a binary (0/1) classifier.

    Args:
        y_true: Ground-truth labels, encoded as 0 (negative) / 1 (positive).
        y_pred: Predicted labels with the same encoding.
        y_prob: Optional predicted probabilities of the positive class, used
            for the ranking metrics.

    Returns:
        Dict with ``confusion_matrix`` (2x2 array), the four cell counts,
        ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``specificity``, ``false_positive_rate``, ``false_negative_rate`` and
        ``balanced_accuracy``. ``roc_auc`` and ``average_precision`` are added
        only when ``y_prob`` is given and ``y_true`` contains both classes;
        they are set to None if scikit-learn rejects the inputs.
    """
    metrics = {}

    # Pin both labels so the matrix is always 2x2. Without this, a split in
    # which only one class occurs yields a 1x1 matrix and the four-way
    # unpacking below raises.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    metrics['confusion_matrix'] = cm
    metrics['true_negatives'] = int(tn)
    metrics['false_positives'] = int(fp)
    metrics['false_negatives'] = int(fn)
    metrics['true_positives'] = int(tp)

    metrics['accuracy'] = float((tp + tn) / (tp + tn + fp + fn))
    metrics['precision'] = precision_score(y_true, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_true, y_pred, zero_division=0)
    metrics['f1_score'] = f1_score(y_true, y_pred, zero_division=0)

    # Rates fall back to 0.0 when their denominator is empty.
    metrics['specificity'] = float(tn / (tn + fp)) if (tn + fp) > 0 else 0.0
    metrics['false_positive_rate'] = float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0
    metrics['false_negative_rate'] = float(fn / (fn + tp)) if (fn + tp) > 0 else 0.0

    metrics['balanced_accuracy'] = (metrics['recall'] + metrics['specificity']) / 2

    # Ranking metrics are undefined when only one class is present.
    if y_prob is not None and len(np.unique(y_true)) > 1:
        try:
            metrics['roc_auc'] = roc_auc_score(y_true, y_prob)
            metrics['average_precision'] = average_precision_score(y_true, y_prob)
        except ValueError:
            # Invalid inputs are reported as "not available" rather than
            # aborting the evaluation; unrelated errors now propagate.
            metrics['roc_auc'] = None
            metrics['average_precision'] = None

    return metrics


def print_classification_report(metrics: dict, horizon: int, split: str = "test"):
    """Print the confusion matrix and headline metrics for one horizon/split.

    ``metrics`` is the dict produced by ``compute_classification_metrics``.
    The ROC-AUC / average-precision lines are printed only when ``roc_auc``
    is present and not None.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"CLASSIFICATION METRICS - Horizon {horizon} ({split})")
    print(f"{rule}")

    cm = metrics['confusion_matrix']
    print("\nConfusion Matrix:")
    print(f"                Predicted Negative  Predicted Positive")
    print(f"Actual Negative        {cm[0,0]:6d}              {cm[0,1]:6d}")
    print(f"Actual Positive        {cm[1,0]:6d}              {cm[1,1]:6d}")

    print(f"\nPerformance Metrics:")
    always_shown = (
        ("  Accuracy:           ", 'accuracy'),
        ("  Balanced Accuracy:  ", 'balanced_accuracy'),
        ("  Precision:          ", 'precision'),
        ("  Recall (TPR):       ", 'recall'),
        ("  Specificity (TNR):  ", 'specificity'),
        ("  F1 Score:           ", 'f1_score'),
    )
    for label, key in always_shown:
        print(f"{label}{metrics[key]:.4f}")

    if metrics.get('roc_auc') is None:
        return
    print(f"  ROC-AUC:            {metrics['roc_auc']:.4f}")
    print(f"  Average Precision:  {metrics['average_precision']:.4f}")


def plot_confusion_matrix(cm: np.ndarray, horizon: int, save_path: Path = None):
    """Plot a confusion matrix as an annotated heatmap.

    Every cell shows its count and its share of the row (true-label) total.

    Args:
        cm: Square confusion matrix of counts; rows are true labels and
            columns are predicted labels.
        horizon: Forecast horizon, used in the plot title.
        save_path: If given, the figure is written there; otherwise it is
            shown with ``plt.show()``.
    """
    cm = np.asarray(cm)
    plt.figure(figsize=(8, 6))
    try:
        # Row-wise percentages. A row with no samples (a class that never
        # occurs) is reported as 0% instead of dividing 0/0 into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_pct = np.divide(
            cm.astype(float),
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals > 0,
        ) * 100

        annot = np.array([[f'{cm[i,j]}\n({cm_pct[i,j]:.1f}%)'
                          for j in range(cm.shape[1])]
                          for i in range(cm.shape[0])])

        # fmt='' because the annotations are already formatted strings.
        sns.heatmap(cm, annot=annot, fmt='', cmap='Blues',
                    xticklabels=['Negative', 'Positive'],
                    yticklabels=['Negative', 'Positive'],
                    cbar_kws={'label': 'Count'})

        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.title(f'Confusion Matrix - Horizon {horizon}')
        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=150, bbox_inches='tight')
            print(f"  Saved confusion matrix to: {save_path}")
        else:
            plt.show()
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close()


def plot_roc_curve(y_true: np.ndarray, y_prob: np.ndarray,
                   horizon: int, save_path: Path = None):
    """Draw the ROC curve with its AUC and the chance diagonal.

    Prints a warning and returns without plotting when ``y_true`` holds a
    single class. The figure is saved to ``save_path`` when given, otherwise
    shown.
    """
    if np.unique(y_true).size < 2:
        print("  Warning: Cannot plot ROC curve - only one class present")
        return

    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_prob)
    auc_value = roc_auc_score(y_true, y_prob)

    plt.figure(figsize=(8, 6))
    plt.plot(
        false_pos_rate, true_pos_rate,
        color='darkorange', lw=2,
        label=f'ROC curve (AUC = {auc_value:.3f})',
    )
    # Diagonal = performance of a random classifier
    plt.plot(
        [0, 1], [0, 1],
        color='navy', lw=2, linestyle='--',
        label='Random classifier',
    )
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Recall)')
    plt.title(f'ROC Curve - Horizon {horizon}')
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)
    plt.tight_layout()

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"  Saved ROC curve to: {save_path}")

    plt.close()


def plot_precision_recall_curve(y_true: np.ndarray, y_prob: np.ndarray,
                                horizon: int, save_path: Path = None):
    """Draw the precision-recall curve with AP and the prevalence baseline.

    Prints a warning and returns without plotting when ``y_true`` holds a
    single class. The figure is saved to ``save_path`` when given, otherwise
    shown.
    """
    if np.unique(y_true).size < 2:
        print("  Warning: Cannot plot PR curve - only one class present")
        return

    precision_vals, recall_vals, _ = precision_recall_curve(y_true, y_prob)
    ap_value = average_precision_score(y_true, y_prob)

    plt.figure(figsize=(8, 6))
    plt.plot(
        recall_vals, precision_vals,
        color='darkorange', lw=2,
        label=f'PR curve (AP = {ap_value:.3f})',
    )

    # Horizontal reference line at the share of positive labels
    prevalence = np.sum(y_true) / len(y_true)
    plt.axhline(
        y=prevalence, color='navy', linestyle='--', lw=2,
        label=f'Baseline (prevalence = {prevalence:.3f})',
    )

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve - Horizon {horizon}')
    plt.legend(loc="lower left")
    plt.grid(alpha=0.3)
    plt.tight_layout()

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
        print(f"  Saved PR curve to: {save_path}")

    plt.close()


def save_metrics_to_csv(all_metrics: dict, save_path: Path):
    """Flatten nested metrics into one CSV row per (symbol, horizon).

    ``all_metrics`` maps symbol -> horizon -> metrics dict. Missing score
    entries become NaN and missing confusion counts become 0. After writing
    the file, descriptive statistics of the main scores are printed.
    """
    # (CSV column, key in the metrics dict, value used when the key is absent)
    column_spec = (
        ('accuracy', 'accuracy', np.nan),
        ('balanced_accuracy', 'balanced_accuracy', np.nan),
        ('precision', 'precision', np.nan),
        ('recall', 'recall', np.nan),
        ('specificity', 'specificity', np.nan),
        ('f1_score', 'f1_score', np.nan),
        ('roc_auc', 'roc_auc', np.nan),
        ('avg_precision', 'average_precision', np.nan),
        ('brier_score', 'brier_val', np.nan),
        ('ece', 'ece_val', np.nan),
        ('true_positives', 'true_positives', 0),
        ('false_positives', 'false_positives', 0),
        ('true_negatives', 'true_negatives', 0),
        ('false_negatives', 'false_negatives', 0),
    )

    records = []
    for symbol, per_horizon in all_metrics.items():
        for h, metrics in per_horizon.items():
            record = {'symbol': symbol, 'horizon': h}
            for column, key, fallback in column_spec:
                record[column] = metrics.get(key, fallback)
            records.append(record)

    table = pd.DataFrame(records)
    table.to_csv(save_path, index=False)
    print(f"\nSaved metrics summary to: {save_path}")

    banner = "="*60
    print("\n" + banner)
    print("OVERALL METRICS SUMMARY (across all symbols and horizons)")
    print(banner)
    summary_cols = ['accuracy', 'balanced_accuracy', 'precision', 'recall',
                   'f1_score', 'roc_auc', 'brier_score', 'ece']
    print(table[summary_cols].describe().round(4))


# =========================================
# FEATURE ENGINEERING
# =========================================
def make_feature_table(close: pd.Series):
    """Derive the model's feature columns from a close-price series.

    Returns:
        Tuple ``(table, X)``. ``table`` holds ``price`` plus the features
        ``ret_1``, ``ret_3``, ``ret_6``, ``vol_6``, ``vol_12`` and
        ``ma_ratio``, with every row containing a NaN removed. ``X`` is the
        float matrix of the feature columns only (``price`` excluded), in
        the same row order.
    """
    px = close.astype(float)
    table = pd.DataFrame(index=close.index)
    table["price"] = px

    # Log returns over 1, 3 and 6 bars looking backwards
    one_bar_return = np.log(px / px.shift(1))
    table["ret_1"] = one_bar_return
    table["ret_3"] = np.log(px / px.shift(3))
    table["ret_6"] = np.log(px / px.shift(6))

    # Rolling standard deviation of the one-bar return
    table["vol_6"] = one_bar_return.rolling(6).std()
    table["vol_12"] = one_bar_return.rolling(12).std()

    # Log ratio of the 10-bar to the 20-bar simple moving average
    short_ma = px.rolling(10).mean()
    long_ma = px.rolling(20).mean()
    table["ma_ratio"] = np.log(short_ma / long_ma)

    # Discard the warm-up rows where any window is still incomplete
    table = table.dropna()

    # Everything except the raw price is a model input
    feature_names = [name for name in table.columns if name != "price"]
    X = table[feature_names].values.astype(float)

    return table, X


# =========================================
# DATASET FOR SEQUENCES
# =========================================
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset pairing feature sequences with a label.

    Item ``idx`` is the feature window ``X[idx : idx + sequence_length]``
    together with the label ``y[idx + sequence_length]``, i.e. the label of
    the row immediately after the window.

    Args:
        X: Row-indexable feature matrix (one row per time step).
        y: Labels aligned row-for-row with ``X``.
        sequence_length: Number of consecutive rows in each window.
    """

    def __init__(self, X, y, sequence_length):
        self.X = X
        self.y = y
        self.sequence_length = sequence_length

        # Number of windows that still have a following row. Clamped at zero:
        # with fewer rows than sequence_length the raw difference is negative
        # and len(dataset) would raise ValueError instead of reporting an
        # empty dataset.
        self.valid_length = max(0, len(self.X) - self.sequence_length)

    def __len__(self):
        return self.valid_length

    def __getitem__(self, idx):
        """Return ``(window, label)`` as float tensors of shape (seq, F) and (1,)."""
        window_end = idx + self.sequence_length
        X_seq = self.X[idx:window_end]

        # y is aligned with X, so the target is the label right after the window.
        if window_end < len(self.y):
            y_target = self.y[window_end]
        else:
            # y is shorter than X: fall back to the last label inside the window.
            y_target = self.y[window_end - 1]

        return torch.FloatTensor(X_seq), torch.FloatTensor([y_target])


# =========================================
# WALK-FORWARD CV
# =========================================
def purged_walkforward_slices(n: int, n_folds: int = 3, embargo: int = 24):
    """Build expanding-window (train, val, test) index ranges.

    The ``n`` rows are divided into ``n_folds + 2`` equal blocks. Fold ``k``
    (1-based) trains on the first ``k`` blocks, validates on the next block
    and tests on the one after, with ``embargo`` rows skipped before the
    validation range and again before the test range. Generation stops at
    the first fold whose test range would be shorter than half a block.

    Returns:
        List of ``((train_start, train_end), (val_start, val_end),
        (test_start, test_end))`` tuples with half-open ranges.
    """
    block = n // (n_folds + 2)
    shortest_allowed_test = block // 2
    folds = []

    for k in range(1, n_folds + 1):
        train_stop = k * block
        val_lo = train_stop + embargo
        val_hi = val_lo + block
        test_lo = val_hi + embargo
        test_hi = min(test_lo + block, n)

        # The tail of the data cannot host a usable test range any more.
        if test_hi - test_lo < shortest_allowed_test:
            break

        folds.append(((0, train_stop), (val_lo, val_hi), (test_lo, test_hi)))

    return folds


# =========================================
# GRU MODEL
# =========================================
@dataclass
class GRUSnapshot:
    """Container for trained GRU model.

    Bundles the network with the preprocessing and settings needed to run it
    again, plus the training history recorded by ``train_gru_classifier``.
    """
    # Trained classifier returned by train_gru_classifier.
    model: GRUClassifier
    # Feature scaler fitted on the training features only.
    scaler: StandardScaler
    # Forecast horizon this model was trained for.
    horizon: int
    # Number of consecutive rows in each input sequence.
    sequence_length: int
    # 0-based epoch with the lowest validation loss; stays 0 when training
    # ran without a validation set.
    best_epoch: int | None = None
    # Mean training loss per epoch.
    train_losses: list | None = None
    # Mean validation loss per epoch (empty without a validation set).
    val_losses: list | None = None


def train_gru_classifier(X_train: np.ndarray, y_train: np.ndarray,
                        X_val: np.ndarray = None, y_val: np.ndarray = None,
                        horizon: int = 1,
                        sequence_length: int = SEQUENCE_LENGTH,
                        hidden_size: int = HIDDEN_SIZE,
                        num_layers: int = NUM_LAYERS,
                        dropout: float = DROPOUT,
                        batch_size: int = BATCH_SIZE,
                        epochs: int = EPOCHS,
                        learning_rate: float = LEARNING_RATE,
                        early_stopping: int = EARLY_STOPPING,
                        device: str = DEVICE) -> GRUSnapshot:
    """
    Train GRU classifier with early stopping.

    Features are standardised with a scaler fitted on ``X_train`` only, cut
    into sliding windows by ``TimeSeriesDataset`` and fed to a
    ``GRUClassifier`` (custom GRU variant) optimised with Adam on binary
    cross-entropy.

    Args:
        X_train: Training feature matrix, one row per time step.
        y_train: Training labels aligned with ``X_train``.
        X_val: Optional validation features; scaled with the training scaler.
        y_val: Optional validation labels. Validation is used only when both
            ``X_val`` and ``y_val`` are given.
        horizon: Forecast horizon; only stored on the returned snapshot.
        sequence_length: Rows per input window.
        hidden_size: Hidden width of the GRU.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout rate passed to the classifier.
        batch_size: Mini-batch size for both loaders.
        epochs: Maximum number of epochs.
        learning_rate: Adam learning rate.
        early_stopping: Number of consecutive epochs without a validation
            improvement after which training stops.
        device: Device the model and batches are moved to. The default is
            the module-level ``DEVICE``, a ``torch.device``.

    Returns:
        ``GRUSnapshot`` holding the model, the fitted scaler and the loss
        history. With validation, the weights of the epoch with the lowest
        validation loss are restored; without it, the weights after the last
        epoch are returned and ``best_epoch`` stays 0.

    Raises:
        ValueError: If there are fewer training rows than
            ``sequence_length + batch_size`` or fewer training sequences
            than ``batch_size``.
    """
    # Check if we have enough data
    if len(X_train) < sequence_length + batch_size:
        raise ValueError(f"Not enough training data: {len(X_train)} samples, need at least {sequence_length + batch_size}")

    # Standardize features (statistics come from the training split only)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    # Create datasets
    train_dataset = TimeSeriesDataset(X_train_scaled, y_train, sequence_length)

    if len(train_dataset) < batch_size:
        raise ValueError(f"Not enough sequences: {len(train_dataset)}, need at least {batch_size}")

    # Windows are shuffled between epochs; each window keeps its internal order.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Validation
    val_loader = None
    if X_val is not None and y_val is not None:
        X_val_scaled = scaler.transform(X_val)
        val_dataset = TimeSeriesDataset(X_val_scaled, y_val, sequence_length)
        # A validation split too short to form a window is skipped.
        if len(val_dataset) > 0:
            val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    model = GRUClassifier(
        input_size=X_train.shape[1],
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
        device=device,
        use_custom_gru=True  # Use custom GRU implementation
    ).to(device)

    # Loss and optimizer
    # BCELoss expects probabilities; the classifier already applies a sigmoid.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    best_val_loss = float('inf')
    patience_counter = 0
    best_epoch = 0
    best_model_state = None
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)  # [batch_size, seq_len, features]
            y_batch = y_batch.to(device)   # [batch_size, 1]

            optimizer.zero_grad()
            outputs = model(X_batch)       # [batch_size, 1]

            # Ensure shapes match exactly
            outputs = outputs.view(-1, 1)
            y_batch = y_batch.view(-1, 1)

            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Average over batches (not over samples)
        train_loss /= len(train_loader)
        train_losses.append(train_loss)

        # Validation
        if val_loader is not None:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)

                    outputs = model(X_batch)

                    # Ensure shapes match exactly
                    outputs = outputs.view(-1, 1)
                    y_batch = y_batch.view(-1, 1)

                    loss = criterion(outputs, y_batch)
                    val_loss += loss.item()

            val_loss /= len(val_loader)
            val_losses.append(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                patience_counter = 0
                # Save best model (CPU copies, detached from later updates)
                best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            else:
                patience_counter += 1

            if patience_counter >= early_stopping:
                print(f"    Early stopping at epoch {epoch+1}")
                # Restore best model
                if best_model_state is not None:
                    model.load_state_dict(best_model_state)
                break

        # Progress line every 10th epoch
        if (epoch + 1) % 10 == 0:
            print(f"    Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}" +
                  (f", Val Loss: {val_loss:.4f}" if val_loader is not None else ""))

    # If we saved a best model, restore it (also covers runs that used all
    # epochs without triggering early stopping)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return GRUSnapshot(
        model=model,
        scaler=scaler,
        horizon=horizon,
        sequence_length=sequence_length,
        best_epoch=best_epoch,
        train_losses=train_losses,
        val_losses=val_losses
    )


# =========================================
# FORECASTING
# =========================================
def forecast_multi_horizon_gru(
    snapshots: dict[int, GRUSnapshot],
    X_seg: np.ndarray,
    price_seg: pd.Series,
    horizons: list[int],
    cost_bp: dict[int, float] | None = None,
    device: str = DEVICE
):
    """
    Produce per-horizon forecast tables from trained GRU snapshots.

    For every horizon that has a snapshot, the segment is scaled with that
    snapshot's scaler and cut into windows of ``sequence_length`` rows. The
    model's probability for each window is turned into a heuristic expected
    return, a spread and 10/50/90% return and price bands.

    Returns:
        Tuple ``(out, cost_log)``. ``out`` maps horizon -> DataFrame indexed
        by the price-index entries that received a forecast (an empty frame
        when the segment is too short). ``cost_log`` maps horizon -> trading
        cost in return units. Horizons without a snapshot are skipped with a
        warning.
    """
    if cost_bp is None:
        cost_bp = {h: DEFAULT_COST_BP.get(h, 8.0) for h in horizons}
    cost_log = {h: bp_to_logret(float(cost_bp[h])) for h in horizons}

    n_rows = X_seg.shape[0]
    idx = price_seg.index
    forecasts = {}

    for h in horizons:
        if h not in snapshots:
            print(f"Warning: No model for horizon {h}, skipping")
            continue

        snap = snapshots[h]
        seq_len = snap.sequence_length

        # Rows that have both a full look-back window and h rows after them.
        n_predictable = n_rows - h - seq_len
        if n_predictable <= 0:
            forecasts[h] = pd.DataFrame(index=idx[:0])
            continue

        # Scale features with the snapshot's own scaler
        X_scaled = snap.scaler.transform(X_seg)

        # Window i covers the seq_len rows immediately before row i
        windows = [
            X_scaled[row - seq_len:row]
            for row in range(seq_len, len(X_scaled) - h)
        ]
        if not windows:
            forecasts[h] = pd.DataFrame(index=idx[:0])
            continue

        batch = torch.FloatTensor(np.array(windows)).to(device)

        # Model probabilities, one per window
        snap.model.eval()
        with torch.no_grad():
            p_edge = snap.model(batch).cpu().numpy().flatten()

        # Heuristic expected return: probability-weighted mix of a fixed
        # gain and a fixed loss around the trading cost
        gain_if_edge = cost_log[h] + 0.002
        loss_if_no_edge = -cost_log[h] - 0.001
        mu = p_edge * gain_if_edge + (1 - p_edge) * loss_if_no_edge

        # Spread grows with the Bernoulli standard deviation of p_edge
        std_h = np.sqrt(p_edge * (1 - p_edge)) * 0.04

        # Return quantiles at +/- 1.28 spreads around mu
        q10 = mu - 1.28 * std_h
        q50 = mu
        q90 = mu + 1.28 * std_h

        # Align the forecasts with the price index
        valid_indices = idx[seq_len:seq_len+len(p_edge)]
        p_now = price_seg.loc[valid_indices].values

        columns = {
            'mu': mu,
            'std': std_h,
            'p_edge_raw': p_edge,
            'ret_q10': q10,
            'ret_q50': q50,
            'ret_q90': q90,
            'price_pred': p_now * np.exp(mu),
            'price_q10': p_now * np.exp(q10),
            'price_q50': p_now * np.exp(q50),
            'price_q90': p_now * np.exp(q90),
        }
        frame = pd.DataFrame(index=valid_indices)
        for column_name, values in columns.items():
            frame[column_name] = values

        forecasts[h] = frame

    return forecasts, cost_log


# =========================================
# CALIBRATION
# =========================================
@dataclass
class ProbCalibrator:
    """Probability calibrator using isotonic regression.

    Produced by ``fit_prob_calibrator_isotonic`` and consumed by
    ``apply_prob_calibrator``.
    """
    # "isotonic" when a model was fitted, "identity" when probabilities are
    # passed through unchanged.
    method: str
    # Fitted isotonic model; None for the identity calibrator.
    iso: IsotonicRegression | None = None


def fit_prob_calibrator_isotonic(p_raw: np.ndarray, y: np.ndarray,
                                 min_points: int = 30) -> ProbCalibrator:
    """Fit an isotonic map from raw probabilities to observed outcomes.

    Pairs with a non-finite probability or outcome are ignored. When fewer
    than ``min_points`` pairs remain, or the remaining probabilities take
    fewer than three distinct values, an identity calibrator is returned.
    """
    scores = np.asarray(p_raw, float)
    outcomes = np.asarray(y, float)

    usable = np.isfinite(scores) & np.isfinite(outcomes)
    scores = scores[usable]
    outcomes = outcomes[usable]

    too_few_points = scores.size < min_points
    if too_few_points or np.unique(scores).size < 3:
        return ProbCalibrator(method="identity", iso=None)

    # Inputs outside the fitted range are clipped to the nearest end point.
    isotonic = IsotonicRegression(out_of_bounds="clip")
    isotonic.fit(scores, outcomes)
    return ProbCalibrator(method="isotonic", iso=isotonic)


def apply_prob_calibrator(cal: ProbCalibrator, p_raw: np.ndarray) -> np.ndarray:
    """Map raw probabilities through ``cal``; any non-isotonic method is a pass-through."""
    raw = np.asarray(p_raw, float)
    if cal.method != "isotonic":
        return raw
    return cal.iso.predict(raw)


@dataclass
class IntervalCalibrator:
    """Conformal prediction interval calibrator.

    Produced by ``fit_conformal_interval`` and consumed by
    ``apply_conformal_interval``.
    """
    # Name of the calibration scheme ("conformal_abs" when built by
    # fit_conformal_interval).
    method: str
    # Half-width added to and subtracted from a point forecast.
    q_alpha: float
    # Miscoverage level the half-width was computed for.
    alpha: float


def fit_conformal_interval(residuals: np.ndarray, alpha: float = 0.2) -> IntervalCalibrator:
    """Derive a symmetric interval half-width from absolute residuals.

    The half-width is the ``1 - alpha`` quantile of the finite absolute
    residuals, or 0.0 when no finite residual is available.
    """
    finite_resid = np.asarray(residuals, float)
    finite_resid = finite_resid[np.isfinite(finite_resid)]

    if finite_resid.size > 0:
        half_width = float(np.quantile(np.abs(finite_resid), 1 - alpha))
    else:
        half_width = 0.0

    return IntervalCalibrator(method="conformal_abs", q_alpha=half_width, alpha=alpha)


def apply_conformal_interval(cal: IntervalCalibrator, mu: np.ndarray):
    """Return ``(lower, upper)`` bounds: ``mu`` minus / plus the calibrated half-width."""
    center = np.asarray(mu, float)
    half_width = cal.q_alpha
    lower = center - half_width
    upper = center + half_width
    return lower, upper


# =========================================
# MAIN TRAINING PIPELINE
# =========================================
def run_gru_for_symbol(symbol: str, path: Path,
                       horizons: list[int] = HORIZONS,
                       n_folds: int = 3,
                       embargo: int = 24,
                       sequence_length: int = SEQUENCE_LENGTH,
                       hidden_size: int = HIDDEN_SIZE,
                       num_layers: int = NUM_LAYERS,
                       epochs: int = EPOCHS,
                       save_plots: bool = True):
    """
    Train and evaluate GRU models for one symbol.

    Reads a price CSV, builds the feature table, and iterates over the
    folds returned by ``purged_walkforward_slices``. Within each fold it
    trains one classifier per horizon, produces raw forecasts for the
    validation and test segments, fits a probability calibrator and an
    interval calibrator on validation, applies both to test, and computes
    classification metrics on test. Progress and reports are printed.

    Args:
        symbol: Label used in console output and as the plots sub-directory.
        path: CSV file to read; the close column is picked by
            ``_find_close_column``.
        horizons: Horizons to model; one classifier is trained per entry.
        n_folds: Forwarded to ``purged_walkforward_slices``.
        embargo: Forwarded to ``purged_walkforward_slices``.
        sequence_length: Forwarded to ``train_gru_classifier``; also sets the
            minimum training size (``sequence_length + BATCH_SIZE + 10``).
        hidden_size: Forwarded to ``train_gru_classifier``.
        num_layers: Forwarded to ``train_gru_classifier``.
        epochs: Forwarded to ``train_gru_classifier``.
        save_plots: When True, confusion-matrix / ROC / precision-recall
            plots are written for the last fold only, under
            ``DATA_DIR / "evaluation_plots_gru" / symbol``.

    Returns:
        dict keyed by horizon. Each value is a dict with "val" and "test"
        (DataFrames concatenated across folds and sorted by index; an empty
        DataFrame when nothing was produced) plus "diag" and "metrics"
        (lists with one dict per fold that reached evaluation).
    """
    # Load data
    df_raw = pd.read_csv(path)
    close_col = _find_close_column(df_raw)
    # Rebuild the close series on a fresh 0..len-1 RangeIndex.
    close = pd.Series(df_raw[close_col].astype(float).values,
                      index=pd.RangeIndex(len(df_raw)), name="close")

    feat_df, X = make_feature_table(close)
    price = feat_df["price"]
    n = len(price)

    # Each fold unpacks below as three (start, end) pairs: train, val, test.
    folds = purged_walkforward_slices(n, n_folds=n_folds, embargo=embargo)

    # Per-horizon accumulators; "val"/"test" hold per-fold frames until the
    # final concatenation at the bottom of this function.
    results = {h: {"val": [], "test": [], "diag": [], "metrics": []} for h in horizons}

    # Create plots directory
    # (plots_dir is only defined, and only used, when save_plots is True)
    if save_plots:
        plots_dir = DATA_DIR / "evaluation_plots_gru" / symbol
        plots_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n{'='*60}")
    print(f"Training GRU for {symbol}")
    print(f"Device: {DEVICE}")
    print(f"Horizons: {horizons}")
    print(f"Folds: {n_folds}")
    print(f"Sequence Length: {sequence_length}")
    print(f"Hidden Size: {hidden_size}, Layers: {num_layers}")
    print(f"{'='*60}\n")

    for fold_idx, ((s0,e0), (s1,e1), (s2,e2)) in enumerate(folds):
        print(f"Fold {fold_idx + 1}/{len(folds)}: Train[{s0}:{e0}] Val[{s1}:{e1}] Test[{s2}:{e2}]")

        # Train one GRU per horizon
        # snapshots maps horizon -> trained model snapshot for this fold only.
        snapshots = {}

        for h in horizons:
            print(f"  Training h={h}...")

            # Create labels
            # NOTE(review): cumulative_log_returns is defined elsewhere;
            # presumably the h-bar forward log return — confirm how the
            # last h rows of each segment are handled.
            ret_train = cumulative_log_returns(price.iloc[s0:e0], h)
            ret_val = cumulative_log_returns(price.iloc[s1:e1], h)

            # Align features and labels
            # (trim to the shorter of the feature slice and the return series)
            n_train = min(len(X[s0:e0]), len(ret_train))
            X_train_aligned = X[s0:s0+n_train]
            ret_train_aligned = ret_train.iloc[:n_train]

            # Check if we have enough data for sequences
            min_required = sequence_length + BATCH_SIZE + 10
            if len(X_train_aligned) < min_required:
                print(f"    SKIP (need {min_required} samples, have {len(X_train_aligned)})")
                continue

            # Binary classification
            # Label is 1 when the return exceeds the per-horizon cost threshold.
            y_train = (ret_train_aligned.values > bp_to_logret(DEFAULT_COST_BP[h])).astype(int)

            # Validation set
            n_val = min(len(X[s1:e1]), len(ret_val))
            X_val_use = X[s1:s1+n_val]
            y_val_use = (ret_val.iloc[:n_val].values > bp_to_logret(DEFAULT_COST_BP[h])).astype(int)

            # Check class balance
            # (warning only; training still proceeds)
            pos_frac = y_train.mean()
            if pos_frac < 0.1 or pos_frac > 0.9:
                print(f"    WARN: Imbalanced ({pos_frac:.2%} positive)")

            # A failure for one horizon is reported and skipped so the
            # remaining horizons in this fold can still be trained.
            try:
                snap = train_gru_classifier(
                    X_train_aligned, y_train,
                    X_val=X_val_use, y_val=y_val_use,
                    horizon=h,
                    sequence_length=sequence_length,
                    hidden_size=hidden_size,
                    num_layers=num_layers,
                    epochs=epochs,
                    device=DEVICE
                )

                snapshots[h] = snap
                # NOTE(review): the "âœ“" literals in this function look like a
                # UTF-8 check mark decoded as cp1252 — confirm the file encoding.
                print(f"    âœ“ Best epoch: {snap.best_epoch}")
            except Exception as e:
                print(f"    ERROR: {str(e)}")
                continue

        if not snapshots:
            print("  No models trained, skipping fold")
            continue

        # Forecast on validation and test
        # cost_log comes from the validation call and is reused for test
        # (the test call's second return value is discarded).
        print("  Forecasting validation...", end=" ")
        out_val_raw, cost_log = forecast_multi_horizon_gru(
            snapshots=snapshots,
            X_seg=X[s1:e1],
            price_seg=price.iloc[s1:e1],
            horizons=horizons,
            device=DEVICE
        )
        print("âœ“")

        print("  Forecasting test...", end=" ")
        out_test_raw, _ = forecast_multi_horizon_gru(
            snapshots=snapshots,
            X_seg=X[s2:e2],
            price_seg=price.iloc[s2:e2],
            horizons=horizons,
            device=DEVICE
        )
        print("âœ“")

        # Calibration and evaluation
        for h in horizons:
            # Horizons without a forecast in both segments are skipped.
            if h not in out_val_raw or h not in out_test_raw:
                continue

            ret_val = cumulative_log_returns(price.iloc[s1:e1], h)
            idx_common = out_val_raw[h].index.intersection(ret_val.index)

            if len(idx_common) == 0:
                continue

            # Keep validation rows whose raw probability and mean are finite.
            dfV = out_val_raw[h].loc[idx_common].copy()
            maskV = np.isfinite(dfV["p_edge_raw"].values) & np.isfinite(dfV["mu"].values)
            dfV = dfV[maskV]

            # Too few validation rows to calibrate: this fold contributes
            # nothing for this horizon (no test frame, metrics or diagnostics).
            if len(dfV) < 20:
                continue

            ret_val_aligned = ret_val.loc[dfV.index]
            y_val = (ret_val_aligned.values > cost_log[h]).astype(int)
            p_raw_val = dfV["p_edge_raw"].values
            mu_val = dfV["mu"].values

            # Fit calibrators
            # (validation data only; the test segment is never used for fitting)
            cal_prob = fit_prob_calibrator_isotonic(p_raw_val, y_val, min_points=20)
            resid_val = ret_val_aligned.values - mu_val
            cal_pi = fit_conformal_interval(resid_val, alpha=0.2)

            # Apply to test
            ret_test = cumulative_log_returns(price.iloc[s2:e2], h)
            idx_test_common = out_test_raw[h].index.intersection(ret_test.index)
            dfT = out_test_raw[h].loc[idx_test_common].copy()

            maskT = np.isfinite(dfT["p_edge_raw"].values) & np.isfinite(dfT["mu"].values)
            dfT = dfT[maskT]

            if len(dfT) == 0:
                continue

            dfT["p_edge"] = apply_prob_calibrator(cal_prob, dfT["p_edge_raw"].values)
            mu_test = dfT["mu"].values
            lo, hi = apply_conformal_interval(cal_pi, mu_test)
            dfT["ret_lo"] = lo
            dfT["ret_hi"] = hi

            # Convert the log-return bounds into price bounds around the
            # price at each test row.
            p_now = price.loc[dfT.index].values
            dfT["price_lo"] = p_now * np.exp(lo)
            dfT["price_hi"] = p_now * np.exp(hi)

            # edge: predicted mean net of cost; risk_edge: the same divided
            # by the predicted std (1e-12 guards against division by zero).
            dfT["edge"] = dfT["mu"] - cost_log[h]
            dfT["risk_edge"] = (dfT["mu"] - cost_log[h]) / (dfT["std"] + 1e-12)

            results[h]["test"].append(dfT)

            # Compute classification metrics
            # Predicted class is the calibrated probability thresholded at 0.5.
            ret_test_aligned = ret_test.loc[dfT.index]
            y_test_true = (ret_test_aligned.values > cost_log[h]).astype(int)
            y_test_pred = (dfT["p_edge"].values > 0.5).astype(int)
            y_test_prob = dfT["p_edge"].values

            test_metrics = compute_classification_metrics(
                y_test_true, y_test_pred, y_test_prob
            )

            # Print detailed report
            print_classification_report(test_metrics, h, "test")

            # Save plots (only for last fold)
            if save_plots and fold_idx == len(folds) - 1:
                print(f"\n  Generating evaluation plots for horizon {h}...")

                cm_path = plots_dir / f"confusion_matrix_h{h}.png"
                plot_confusion_matrix(test_metrics['confusion_matrix'], h, cm_path)

                # ROC and PR curves are drawn only when a ROC-AUC was computed.
                if test_metrics.get('roc_auc') is not None:
                    roc_path = plots_dir / f"roc_curve_h{h}.png"
                    plot_roc_curve(y_test_true, y_test_prob, h, roc_path)

                    pr_path = plots_dir / f"precision_recall_h{h}.png"
                    plot_precision_recall_curve(y_test_true, y_test_prob, h, pr_path)

            results[h]["metrics"].append(test_metrics)

            # Calibration diagnostics
            # NOTE(review): these are measured on the same validation rows
            # the calibrators were fitted on, so they are in-sample figures.
            if cal_prob.method == "isotonic":
                p_cal_val = apply_prob_calibrator(cal_prob, p_raw_val)
                brier = brier_score(y_val, p_cal_val)
                ece = expected_calibration_error(y_val, p_cal_val)
            else:
                brier = brier_score(y_val, p_raw_val)
                ece = expected_calibration_error(y_val, p_raw_val)

            # Share of validation residuals that fall inside +/- q_alpha.
            coverage = float(np.mean((resid_val >= -cal_pi.q_alpha) & (resid_val <= cal_pi.q_alpha)))

            diag = {
                "h": h,
                "brier_val": float(brier),
                "ece_val": float(ece),
                "pi_coverage_val": coverage
            }
            results[h]["diag"].append(diag)

            # Store validation
            # (validation frame gets the calibrated probability and return bounds)
            dfV["p_edge"] = apply_prob_calibrator(cal_prob, dfV["p_edge_raw"].values)
            loV, hiV = apply_conformal_interval(cal_pi, mu_val)
            dfV["ret_lo"], dfV["ret_hi"] = loV, hiV
            results[h]["val"].append(dfV)

    # Concatenate folds
    # After this loop "val"/"test" are DataFrames rather than lists.
    for h in horizons:
        for split in ("val", "test"):
            if results[h][split]:
                results[h][split] = pd.concat(results[h][split]).sort_index()
            else:
                results[h][split] = pd.DataFrame()

    print(f"\nCompleted {symbol}\n")
    return results


# =========================================
# JSON EXPORT
# =========================================
def build_json_records(all_outputs: dict,
                       model_version: str = MODEL_VERSION,
                       calibration_version: str = CALIBRATION_VERSION,
                       horizons: list[int] = HORIZONS):
    """Flatten per-symbol, per-horizon test forecasts into a list of dicts.

    ``all_outputs`` maps symbol -> results, where ``results[h]["test"]`` is
    a DataFrame (or a list/tuple of DataFrames, which is concatenated and
    sorted by index here). One record is emitted per row, carrying the
    forecast signals and a suggested action derived from the ``TAU_P`` and
    ``TAU_MU`` gates.
    """
    def _action_for(row) -> str:
        # Both gates must pass before the sign of mu selects a side.
        # NOTE(review): whether "sell" is reachable depends on TAU_MU and on
        # how the "edge" column relates to "mu" — confirm the intent.
        gated = row["p_edge"] >= TAU_P and row["edge"] >= TAU_MU
        if gated and row["mu"] > 0:
            return "buy"
        if gated and row["mu"] < 0:
            return "sell"
        return "hold"

    records = []
    for sym, res in all_outputs.items():
        for h in horizons:
            frame = res[h]["test"]
            if isinstance(frame, (list, tuple)):
                frame = pd.concat(frame).sort_index()

            for t, row in frame.iterrows():
                timestamp_index = int(t)
                horizon_bars = int(h)

                price_interval = {
                    "p10": float(row["price_q10"]),
                    "p50": float(row["price_q50"]),
                    "p90": float(row["price_q90"])
                }
                signals = {
                    "expected_return": float(row["mu"]),
                    "stdev_return": float(row["std"]),
                    "p_edge_gt_cost": float(row["p_edge"]),
                    "predicted_price": float(row["price_pred"]),
                }
                # price_PI is the last key of "signals", as in the feed format.
                signals["price_PI"] = price_interval

                policy = {
                    "gate_threshold_p": TAU_P,
                    "gate_threshold_edge": TAU_MU,
                    "suggested_action": _action_for(row)
                }

                records.append({
                    "timestamp_index": timestamp_index,
                    "symbol": sym,
                    "horizon_bars": horizon_bars,
                    "model_version": model_version,
                    "calibration_version": calibration_version,
                    "signals": signals,
                    "policy_suggestions": policy
                })
    return records


# =========================================
# MAIN EXECUTION
# =========================================
if __name__ == "__main__":
    # Report the compute device up front; name the GPU when one is in use.
    print(f"Using device: {DEVICE}")
    if DEVICE.type == 'cuda':
        print(f"GPU: {torch.cuda.get_device_name(0)}")

    # Metric keys grouped by how they are aggregated across folds. The
    # summary dict is filled in exactly this order.
    _mean_keys = ('accuracy', 'balanced_accuracy', 'precision',
                  'recall', 'specificity', 'f1_score')
    _mean_if_present_keys = ('roc_auc', 'average_precision')
    _count_keys = ('true_positives', 'false_positives',
                   'true_negatives', 'false_negatives')

    # Per-symbol pipeline results and their fold-averaged metrics.
    all_outputs = {}
    all_metrics = {}

    for symbol, path in FILES.items():
        if not path.exists():
            print(f"Warning: {path} not found, skipping {symbol}")
            continue

        results = run_gru_for_symbol(
            symbol=symbol, path=path, horizons=HORIZONS,
            n_folds=3, embargo=24,
            sequence_length=SEQUENCE_LENGTH, hidden_size=HIDDEN_SIZE,
            num_layers=NUM_LAYERS, epochs=EPOCHS,
            save_plots=True,
        )
        all_outputs[symbol] = results

        # Aggregate the per-fold test metrics for every horizon that has any.
        all_metrics[symbol] = {}
        for h in HORIZONS:
            if not results[h]['metrics']:
                continue
            metrics_list = results[h]['metrics']

            avg_metrics = {}
            for _key in _mean_keys:
                avg_metrics[_key] = np.mean([m[_key] for m in metrics_list])
            # These may be None for a fold; average only the folds that have them.
            for _key in _mean_if_present_keys:
                avg_metrics[_key] = np.mean(
                    [m[_key] for m in metrics_list if m.get(_key) is not None]
                )
            # Confusion-matrix cells are summed, not averaged.
            for _key in _count_keys:
                avg_metrics[_key] = sum(m[_key] for m in metrics_list)

            diag_list = results[h]['diag']
            if diag_list:
                for _key in ('brier_val', 'ece_val'):
                    avg_metrics[_key] = np.mean([d[_key] for d in diag_list])

            all_metrics[symbol][h] = avg_metrics

        # Console summary for this symbol.
        print(f"\n{'='*60}")
        print(f"SUMMARY FOR {symbol}")
        print(f"{'='*60}")

        for h in HORIZONS:
            test_df = results[h]['test']
            if len(test_df) == 0 or h not in all_metrics[symbol]:
                continue

            metrics = all_metrics[symbol][h]
            print(f"\nHorizon {h}:")
            print(f"  Test samples:      {len(test_df)}")
            print(f"  Accuracy:          {metrics['accuracy']:.4f}")
            print(f"  Balanced Acc:      {metrics['balanced_accuracy']:.4f}")
            print(f"  Precision:         {metrics['precision']:.4f}")
            print(f"  Recall:            {metrics['recall']:.4f}")
            print(f"  F1 Score:          {metrics['f1_score']:.4f}")
            # ROC-AUC is NaN when no fold produced one; omit the line then.
            if not np.isnan(metrics.get('roc_auc', np.nan)):
                print(f"  ROC-AUC:           {metrics['roc_auc']:.4f}")
            print(f"  Brier Score:       {metrics.get('brier_val', np.nan):.4f}")
            print(f"  ECE:               {metrics.get('ece_val', np.nan):.4f}")

    # Persist the fold-averaged metrics.
    metrics_csv_path = DATA_DIR / "gru_metrics_summary.csv"
    save_metrics_to_csv(all_metrics, metrics_csv_path)

    # Write the trading feed, one JSON object per line.
    json_records = build_json_records(all_outputs)
    json_path = DATA_DIR / "trader_feed_gru_multiH.jsonl"
    with open(json_path, "w") as f:
        for r in json_records:
            f.write(json.dumps(r) + "\n")

    print(f"\n{'='*60}")
    print(f"FINAL OUTPUTS")
    print(f"{'='*60}")
    print(f"JSON feed:        {json_path}")
    print(f"Metrics CSV:      {metrics_csv_path}")
    print(f"Plots directory:  {DATA_DIR / 'evaluation_plots_gru'}")
    print(f"{'='*60}\n")

    # NOTE(review): the leading characters of the next literal look like a
    # mis-decoded UTF-8 emoji — confirm the file encoding.
    print("ðŸ’¡ Tip: GRU excels at capturing temporal patterns!")
    print("Compare with other models using the metrics CSV files.")

Using device: cpu

Training GRU for BTC
Device: cpu
Horizons: [1, 3, 6]
Folds: 3
Sequence Length: 20
Hidden Size: 64, Layers: 2

Fold 1/3: Train[0:1749] Val[1773:3522] Test[3546:5295]
  Training h=1...
    Epoch 10/50 - Train Loss: 0.6858, Val Loss: 0.6872
    Epoch 20/50 - Train Loss: 0.6720, Val Loss: 0.6878
    Early stopping at epoch 21
    âœ“ Best epoch: 10
  Training h=3...
    Epoch 10/50 - Train Loss: 0.6478, Val Loss: 0.7081
    Early stopping at epoch 14
    âœ“ Best epoch: 3
  Training h=6...
    Epoch 10/50 - Train Loss: 0.5765, Val Loss: 0.8066
    Early stopping at epoch 11
    âœ“ Best epoch: 0
  Forecasting validation... âœ“
  Forecasting test... âœ“

CLASSIFICATION METRICS - Horizon 1 (test)

Confusion Matrix:
                Predicted Negative  Predicted Positive
Actual Negative           938                   4
Actual Positive           780                   6

Performance Metrics:
  Accuracy:           0.5463
  Balanced Accuracy:  0.5017
  Precision:          0.600