In [2]:
#!/usr/bin/env python3
"""
Random Forest Multi-Horizon Trading Signal Generator
Complete implementation for cryptocurrency trading with Random Forest classification/regression

Key features:
- One Random Forest model per horizon (1, 3, 6 bars)
- Native probability estimates from the forest (`predict_proba`), recalibrated on the validation split
- Walk-forward cross-validation
- Isotonic + conformal calibration
- JSON feed output for trading agent
- Feature importance analysis
"""

import json
import os
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.preprocessing import StandardScaler


In [None]:
# =========================================
# CONFIGURATION
# =========================================
# Directory holding the per-symbol Bybit CSV exports. The historical default is
# kept for backward compatibility, but it can be overridden without editing the
# notebook by setting the RF_DATA_DIR environment variable.
DATA_DIR = Path(
    os.environ.get("RF_DATA_DIR", "/Users/nitinlodha/Desktop/ML/ML_Project/Bybit_CSV_Data")
).expanduser()

# Symbol -> CSV path; every file follows the "Bybit_<SYMBOL>.csv" naming scheme.
FILES = {
    sym: DATA_DIR / f"Bybit_{sym}.csv"
    for sym in ("BTC", "ETH", "SOL", "XRP", "DOGE")
}

HORIZONS = [1, 3, 6]  # Forecast horizons in 4-hour bars
DEFAULT_COST_BP = {1: 8.0, 3: 10.0, 6: 12.0}  # Trading costs in basis points, keyed by horizon

# Policy thresholds
TAU_P = 0.60        # Probability gate for P(edge > cost)
TAU_MU = 0.0005     # Expected-return gate (log-return)
LAM = 2.0           # Kelly-lite multiplier
W_MAX = 0.50        # Max gross position (50% notional)

# Version tags copied verbatim into every exported JSON record.
MODEL_VERSION = "rf_multiH_v1.0"
CALIBRATION_VERSION = "iso+conformal_v1"


In [None]:
# =========================================
# UTILITY FUNCTIONS
# =========================================
def bp_to_logret(bp: float) -> float:
    """Rescale a basis-point amount so it is comparable with log returns.

    One basis point is 1e-4, so the value is simply multiplied by that factor.
    """
    one_basis_point = 1e-4
    return bp * one_basis_point


def _find_close_column(df: pd.DataFrame) -> str:
    """Find the close price column in a dataframe."""
    lower = {c.lower(): c for c in df.columns}
    for key in ("close", "closing_price", "close_price", "price_close", "last", "c"):
        if key in lower:
            return lower[key]
    # Fallback: any single float column
    float_cols = [c for c in df.columns if pd.api.types.is_float_dtype(df[c])]
    if len(float_cols) == 1:
        return float_cols[0]
    raise ValueError("Cannot identify 'close' column.")


def cumulative_log_returns(price: pd.Series, h: int) -> pd.Series:
    """Forward h-bar log return log(P_{t+h} / P_t), indexed by t.

    Rows whose result is NaN (e.g. the last h rows, which have no future
    price) are dropped from the returned series.
    """
    future_price = price.shift(-h)
    forward_log_ret = np.log(future_price / price)
    return forward_log_ret.dropna()


def brier_score(y: np.ndarray, p: np.ndarray) -> float:
    """Mean squared difference between outcomes ``y`` and probabilities ``p``."""
    error = y - p
    return float(np.mean(error ** 2))


def expected_calibration_error(y: np.ndarray, p: np.ndarray, bins: int = 10) -> float:
    """Expected Calibration Error (ECE) over equal-width probability bins.

    The unit interval is split into ``bins`` equal-width bins. For each
    non-empty bin the absolute gap between the mean outcome and the mean
    predicted probability is weighted by the share of samples in the bin.

    Args:
        y: Binary outcomes (0/1), same length as ``p``.
        p: Predicted probabilities in [0, 1].
        bins: Number of equal-width bins.

    Returns:
        The weighted average calibration gap; 0.0 for empty input.
    """
    y = np.asarray(y, dtype=float)
    p = np.asarray(p, dtype=float)
    if p.size == 0:
        return 0.0

    edges = np.linspace(0, 1, bins + 1)
    ece = 0.0
    for i in range(bins):
        lower, upper = edges[i], edges[i + 1]
        if i == bins - 1:
            # Close the last bin on the right so p == 1.0 is counted. Isotonic
            # calibration routinely emits exact 0.0 / 1.0 values, and a
            # half-open last bin would silently drop those samples.
            in_bin = (p >= lower) & (p <= upper)
        else:
            in_bin = (p >= lower) & (p < upper)
        count = in_bin.sum()
        if count == 0:
            continue
        ece += (count / p.size) * np.abs(np.mean(y[in_bin]) - np.mean(p[in_bin]))
    return float(ece)


In [None]:
# =========================================
# FEATURE ENGINEERING
# =========================================
def make_feature_table(close: pd.Series):
    """
    Derive the model's feature table from a close-price series.

    Columns produced (in order):
    - price: the close cast to float
    - ret_1, ret_3, ret_6: log return over the previous 1 / 3 / 6 bars
    - vol_6, vol_12: rolling standard deviation of ret_1 over 6 / 12 bars
    - ma_ratio: log of the 10-bar moving average over the 20-bar moving average

    Rows containing any NaN (the warm-up period of the lags and rolling
    windows) are removed.

    Returns:
        table: DataFrame with price and features
        X: Feature matrix (numpy float array, every column except price)
    """
    table = pd.DataFrame({"price": close.astype(float)})
    px = table["price"]

    # Trailing log returns at each lag
    for lag in (1, 3, 6):
        table[f"ret_{lag}"] = np.log(px / px.shift(lag))

    # Realised volatility of the one-bar return
    for window in (6, 12):
        table[f"vol_{window}"] = table["ret_1"].rolling(window).std()

    # Fast / slow moving-average spread, in log terms
    fast_ma = px.rolling(10).mean()
    slow_ma = px.rolling(20).mean()
    table["ma_ratio"] = np.log(fast_ma / slow_ma)

    # Discard the warm-up rows
    table = table.dropna()

    # Everything except the price column is a model input
    X = table.drop(columns=["price"]).values.astype(float)

    return table, X


In [None]:
# =========================================
# WALK-FORWARD CV
# =========================================
def purged_walkforward_slices(n: int, n_folds: int = 3, embargo: int = 24):
    """
    Build expanding-window (train, val, test) index ranges with embargo gaps.

    The sample is cut into ``n_folds + 2`` equal blocks. Fold ``k`` trains on
    the first ``k`` blocks, validates on the following block after skipping
    ``embargo`` bars, and tests on the block after a second embargo. A fold is
    dropped (and generation stops) once its test range would be shorter than
    half a block.

    Args:
        n: Total number of samples
        n_folds: Maximum number of folds to generate
        embargo: Gap in bars between train/val and between val/test

    Returns:
        List of ((train_start, train_end), (val_start, val_end), (test_start, test_end))
        with half-open ``[start, end)`` positional ranges.
    """
    block = n // (n_folds + 2)
    shortest_test = block // 2
    folds = []

    for k in range(1, n_folds + 1):
        train_stop = k * block
        val_lo = train_stop + embargo
        val_hi = val_lo + block
        test_lo = val_hi + embargo
        test_hi = min(test_lo + block, n)  # never run past the data

        if test_hi - test_lo < shortest_test:
            break

        folds.append(((0, train_stop), (val_lo, val_hi), (test_lo, test_hi)))

    return folds


In [None]:
# =========================================
# RANDOM FOREST MODEL
# =========================================
@dataclass
class RFSnapshot:
    """Bundle of one fitted Random Forest and the preprocessing that goes with it.

    Instances are produced by fit_rf_classifier / fit_rf_regressor and consumed
    by the forecasting and feature-importance helpers.
    """
    # Fitted forest; a regressor is stored here too when model_type is 'regression'.
    clf: RandomForestClassifier | RandomForestRegressor
    # Scaler fitted on the training features; apply it before calling clf.
    scaler: StandardScaler
    model_type: str  # 'classification' or 'regression'
    # Forecast horizon (in bars) this model was trained for.
    horizon: int
    # Column names matching the feature matrix, used when printing importances.
    feature_names: list[str]
    # The forest's feature_importances_ array, or None if it was unavailable.
    feature_importances: np.ndarray | None = None


def fit_rf_classifier(X_train: np.ndarray, y_train: np.ndarray,
                      horizon: int,
                      feature_names: list[str] = None,
                      random_state: int = 123,
                      n_estimators: int = 200,
                      max_depth: int | None = 10,
                      min_samples_split: int = 20,
                      min_samples_leaf: int = 10) -> RFSnapshot:
    """
    Fit a class-balanced Random Forest classifier on standardized features.

    Args:
        X_train: Feature matrix (T, D)
        y_train: Binary labels (1 if return > cost, else 0)
        horizon: Forecast horizon in bars (stored on the snapshot)
        feature_names: Names of the D features; defaults to feat_0..feat_{D-1}
        random_state: Seed passed to the forest
        n_estimators: Number of trees
        max_depth: Maximum tree depth (None = unlimited)
        min_samples_split: Minimum samples to split node
        min_samples_leaf: Minimum samples in leaf

    Returns:
        RFSnapshot holding the fitted classifier, the fitted scaler and the
        forest's feature importances.
    """
    # Scaling is not required by tree models; it is kept so every snapshot
    # carries the same preprocessing step.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    forest_params = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight='balanced',  # reweight classes inversely to their frequency
        random_state=random_state,
        n_jobs=-1,                # use every available core
        bootstrap=True,
        oob_score=True,           # exposes oob_score_ after fitting
        max_features='sqrt',      # features considered at each split
    )
    forest = RandomForestClassifier(**forest_params)
    forest.fit(X_scaled, y_train)

    importances = None
    if hasattr(forest, 'feature_importances_'):
        importances = forest.feature_importances_

    if not feature_names:
        feature_names = [f"feat_{i}" for i in range(X_train.shape[1])]

    return RFSnapshot(
        clf=forest,
        scaler=scaler,
        model_type='classification',
        horizon=horizon,
        feature_names=feature_names,
        feature_importances=importances,
    )


def fit_rf_regressor(X_train: np.ndarray, y_train: np.ndarray,
                     horizon: int,
                     feature_names: list[str] = None,
                     random_state: int = 123,
                     n_estimators: int = 200,
                     max_depth: int | None = 10,
                     min_samples_split: int = 20,
                     min_samples_leaf: int = 10) -> RFSnapshot:
    """
    Fit a Random Forest regressor on standardized features to predict returns.

    Args:
        X_train: Feature matrix
        y_train: Continuous log returns
        horizon: Forecast horizon in bars (stored on the snapshot)
        feature_names: Names of the features; defaults to feat_0..feat_{D-1}
        random_state: Seed passed to the forest
        n_estimators: Number of trees
        max_depth: Maximum tree depth (None = unlimited)
        min_samples_split: Minimum samples to split node
        min_samples_leaf: Minimum samples in leaf

    Returns:
        RFSnapshot holding the fitted regressor (in its ``clf`` field), the
        fitted scaler and the forest's feature importances.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    forest_params = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state,
        n_jobs=-1,
        bootstrap=True,
        oob_score=True,
        max_features='sqrt',
    )
    forest = RandomForestRegressor(**forest_params)
    forest.fit(X_scaled, y_train)

    importances = None
    if hasattr(forest, 'feature_importances_'):
        importances = forest.feature_importances_

    if not feature_names:
        feature_names = [f"feat_{i}" for i in range(X_train.shape[1])]

    return RFSnapshot(
        clf=forest,
        scaler=scaler,
        model_type='regression',
        horizon=horizon,
        feature_names=feature_names,
        feature_importances=importances,
    )


In [None]:
# =========================================
# FORECASTING
# =========================================
def forecast_multi_horizon_rf(
    snapshots: dict[int, RFSnapshot],
    X_seg: np.ndarray,
    price_seg: pd.Series,
    horizons: list[int],
    cost_bp: dict[int, float] | None = None,
    compute_tree_variance: bool = False
):
    """
    Generate multi-horizon forecasts using trained Random Forests.

    For each horizon ``h`` the last ``h`` rows of the segment are left out, so
    every forecast row has a realized h-bar return inside the same segment.

    Args:
        snapshots: Dict mapping horizon -> trained RFSnapshot
        X_seg: Feature matrix for forecast segment
        price_seg: Corresponding price series (same length and order as X_seg)
        horizons: List of forecast horizons
        cost_bp: Trading costs in basis points per horizon; defaults to
            DEFAULT_COST_BP with an 8 bp fallback
        compute_tree_variance: If True, derive std/quantiles from the spread of
            the individual trees instead of the fixed heuristics

    Returns:
        out: dict[horizon] -> DataFrame with columns mu, std, p_edge_raw,
            ret_q10/q50/q90, price_pred, price_q10/q50/q90. Horizons without a
            snapshot are omitted; horizons longer than the segment map to an
            empty frame.
        cost_log: dict[horizon] -> cost in log-return units

    Raises:
        ValueError: if a snapshot's model_type is neither 'classification'
            nor 'regression'.
    """
    if cost_bp is None:
        cost_bp = {h: DEFAULT_COST_BP.get(h, 8.0) for h in horizons}
    cost_log = {h: bp_to_logret(float(cost_bp[h])) for h in horizons}

    Tseg = X_seg.shape[0]
    idx = price_seg.index
    out = {}

    for h in horizons:
        if h not in snapshots:
            print(f"Warning: No model for horizon {h}, skipping")
            continue

        snap = snapshots[h]
        out_h = pd.DataFrame(index=idx[:-h] if h < Tseg else idx[:0])
        T_h = Tseg - h

        if T_h <= 0:
            out[h] = out_h
            continue

        # Scale features
        X_scaled = snap.scaler.transform(X_seg[:T_h])

        if snap.model_type == 'classification':
            # Raw forest probabilities. They are NOT assumed to be calibrated
            # (the forest is class-weighted); callers recalibrate p_edge_raw.
            proba = snap.clf.predict_proba(X_scaled)
            classes = list(snap.clf.classes_)
            if 1 in classes:
                p_edge = proba[:, classes.index(1)]
            else:
                # The forest never saw a positive label, so there is no
                # column for class 1: the edge probability is zero everywhere.
                p_edge = np.zeros(T_h)

            # Expected return estimation (probability-weighted payoff heuristic)
            mu = p_edge * (cost_log[h] + 0.002) + (1 - p_edge) * (-cost_log[h] - 0.001)

            # Per-tree spread needs both class columns; fall back to the
            # closed-form estimate for a single-class forest.
            if compute_tree_variance and len(classes) > 1:
                std_h, q10, q50, q90 = _tree_variance_classification(
                    snap, X_scaled, cost_log[h]
                )
            else:
                # Simple uncertainty estimate
                std_h = np.sqrt(p_edge * (1 - p_edge)) * 0.03  # Binomial variance scaled
                q10 = mu - 1.28 * std_h  # ~10th percentile
                q50 = mu
                q90 = mu + 1.28 * std_h  # ~90th percentile

        elif snap.model_type == 'regression':
            # Direct return prediction
            mu = snap.clf.predict(X_scaled)

            # Heuristic probability via sigmoid transform of the net edge
            p_edge = 1.0 / (1.0 + np.exp(-10 * (mu - cost_log[h])))

            if compute_tree_variance:
                std_h, q10, q50, q90 = _tree_variance_regression(snap, X_scaled)
            else:
                # Simple uncertainty estimate
                std_h = np.full(T_h, 0.015)
                q10 = mu - 0.025
                q50 = mu
                q90 = mu + 0.025

        else:
            # Previously this fell through and failed later with an
            # unbound-local error on `mu`; fail with a clear message instead.
            raise ValueError(
                f"Unknown model_type {snap.model_type!r} for horizon {h}; "
                "expected 'classification' or 'regression'"
            )

        # Populate DataFrame
        p_now = price_seg.iloc[:T_h].values

        out_h['mu'] = mu
        out_h['std'] = std_h
        out_h['p_edge_raw'] = p_edge
        out_h['ret_q10'] = q10
        out_h['ret_q50'] = q50
        out_h['ret_q90'] = q90
        out_h['price_pred'] = p_now * np.exp(mu)
        out_h['price_q10'] = p_now * np.exp(q10)
        out_h['price_q50'] = p_now * np.exp(q50)
        out_h['price_q90'] = p_now * np.exp(q90)

        out[h] = out_h

    return out, cost_log


def _tree_variance_classification(snap, X_scaled, cost_threshold):
    """
    Compute uncertainty from individual tree predictions (classification).

    Each tree in the forest votes for a class. We can use the distribution
    of these votes to estimate uncertainty.
    """
    # Get predictions from all trees
    tree_probs = np.array([tree.predict_proba(X_scaled)[:, 1]
                           for tree in snap.clf.estimators_])  # (n_trees, T)

    # Variance across trees
    std = np.std(tree_probs, axis=0)

    # Convert probabilities to return estimates
    mu_samples = tree_probs * (cost_threshold + 0.002) + (1 - tree_probs) * (-cost_threshold - 0.001)

    q10 = np.percentile(mu_samples, 10, axis=0)
    q50 = np.percentile(mu_samples, 50, axis=0)
    q90 = np.percentile(mu_samples, 90, axis=0)

    return std, q10, q50, q90


def _tree_variance_regression(snap, X_scaled):
    """
    Compute uncertainty from individual tree predictions (regression).
    """
    # Get predictions from all trees
    tree_preds = np.array([tree.predict(X_scaled)
                          for tree in snap.clf.estimators_])  # (n_trees, T)

    # Statistics across trees
    std = np.std(tree_preds, axis=0)
    q10 = np.percentile(tree_preds, 10, axis=0)
    q50 = np.percentile(tree_preds, 50, axis=0)
    q90 = np.percentile(tree_preds, 90, axis=0)

    return std, q10, q50, q90

In [9]:
# =========================================
# CALIBRATION
# =========================================
@dataclass
class ProbCalibrator:
    """Probability calibrator: an isotonic regression, or a pass-through.

    Built by fit_prob_calibrator_isotonic and consumed by apply_prob_calibrator.
    """
    # "isotonic" when a fitted model is attached, "identity" for pass-through.
    method: str
    # Fitted isotonic model; None when method is "identity".
    iso: IsotonicRegression | None = None


def fit_prob_calibrator_isotonic(p_raw: np.ndarray, y: np.ndarray,
                                 min_points: int = 30) -> ProbCalibrator:
    """Learn a monotone map from raw probabilities to observed outcomes.

    Non-finite pairs are discarded first. When fewer than ``min_points``
    pairs remain, or the raw probabilities take fewer than three distinct
    values, an identity calibrator is returned instead of a fitted one.
    """
    raw = np.asarray(p_raw, float)
    outcome = np.asarray(y, float)

    usable = np.isfinite(raw) & np.isfinite(outcome)
    raw, outcome = raw[usable], outcome[usable]

    # Too little data (or too few distinct inputs) for a meaningful fit.
    if raw.size < min_points or np.unique(raw).size < 3:
        return ProbCalibrator(method="identity", iso=None)

    model = IsotonicRegression(out_of_bounds="clip")
    model.fit(raw, outcome)
    return ProbCalibrator(method="isotonic", iso=model)


def apply_prob_calibrator(cal: ProbCalibrator, p_raw: np.ndarray) -> np.ndarray:
    """Map raw probabilities through ``cal``; pass them through unless it is isotonic."""
    raw = np.asarray(p_raw, float)
    if cal.method != "isotonic":
        return raw
    return cal.iso.predict(raw)


@dataclass
class IntervalCalibrator:
    """Conformal prediction interval calibrator."""
    # Identifier of the calibration scheme ("conformal_abs").
    method: str
    # Half-width of the symmetric interval, in the units of the residuals.
    q_alpha: float
    # Target miscoverage rate the half-width was fitted for.
    alpha: float


def fit_conformal_interval(residuals: np.ndarray, alpha: float = 0.2) -> IntervalCalibrator:
    """Fit a symmetric split-conformal interval from calibration residuals.

    The half-width is the k-th smallest absolute residual with
    ``k = ceil((n + 1) * (1 - alpha))``. This is the finite-sample-corrected
    quantile used by split conformal prediction; the plain ``(1 - alpha)``
    empirical quantile is slightly too narrow. When ``k`` exceeds ``n`` (too
    few residuals for the requested level) the largest residual is used as
    the best finite approximation.

    Args:
        residuals: Realized minus predicted values on the calibration set;
            non-finite entries are ignored.
        alpha: Target miscoverage rate (0.2 -> nominal 80% interval).

    Returns:
        IntervalCalibrator whose ``q_alpha`` is the interval half-width
        (0.0 when no finite residual is available).
    """
    resid = np.asarray(residuals, float)
    scores = np.sort(np.abs(resid[np.isfinite(resid)]))
    n_scores = scores.size

    if n_scores == 0:
        q = 0.0
    else:
        # The small epsilon guards against float noise pushing an exact
        # integer (e.g. 8.000000000000002) up to the next rank.
        rank = int(np.ceil((n_scores + 1) * (1.0 - alpha) - 1e-9))
        rank = min(max(rank, 1), n_scores)
        q = float(scores[rank - 1])

    return IntervalCalibrator(method="conformal_abs", q_alpha=q, alpha=alpha)


def apply_conformal_interval(cal: IntervalCalibrator, mu: np.ndarray):
    """Return ``(lower, upper)`` arrays: ``mu`` minus / plus the fitted half-width."""
    mu = np.asarray(mu, float)
    return mu - cal.q_alpha, mu + cal.q_alpha


In [None]:
# =========================================
# FEATURE IMPORTANCE ANALYSIS
# =========================================
def analyze_feature_importance(snapshots: dict[int, RFSnapshot]):
    """
    Print a per-horizon ranking of feature importances.

    For every snapshot that carries importances, the (up to) ten largest are
    listed in descending order, followed by the forest's out-of-bag score
    when the fitted model exposes one.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("FEATURE IMPORTANCE ANALYSIS")
    print(rule)

    for horizon, snapshot in snapshots.items():
        importances = snapshot.feature_importances
        if importances is None:
            continue

        print(f"\nHorizon {horizon}:")

        # Largest importance first, capped at the top 10
        top_columns = np.argsort(importances)[::-1][:10]
        for rank, column in enumerate(top_columns, start=1):
            print(f"  {rank}. {snapshot.feature_names[column]}: {importances[column]:.4f}")

        # OOB score if available
        if hasattr(snapshot.clf, 'oob_score_'):
            print(f"  Out-of-bag score: {snapshot.clf.oob_score_:.4f}")


In [None]:
# =========================================
# MAIN TRAINING PIPELINE
# =========================================
def run_rf_for_symbol(symbol: str, path: Path,
                      horizons: list[int] = HORIZONS,
                      n_folds: int = 3,
                      embargo: int = 24,
                      model_type: str = 'classification',
                      n_estimators: int = 200,
                      max_depth: int | None = 10,
                      compute_tree_variance: bool = True):
    """
    Train and evaluate Random Forest models for one symbol.

    For every walk-forward fold, one forest per horizon is fitted on the
    training range, forecasts are produced for the validation and test ranges,
    the probability and interval calibrators are fitted on validation, and the
    calibrated forecasts are stored for both splits.

    Args:
        symbol: Asset symbol (used for logging only)
        path: Path to CSV file containing a close-price column
        horizons: Forecast horizons in bars
        n_folds: Number of walk-forward folds
        embargo: Embargo period between folds
        model_type: 'classification'; any other value trains a regressor
        n_estimators: Number of trees in forest
        max_depth: Maximum tree depth
        compute_tree_variance: Use tree variance for uncertainty

    Returns:
        results: dict[horizon] -> dict with
            'val' / 'test': DataFrames of calibrated forecasts concatenated
                across folds (empty DataFrame when nothing was produced)
            'diag': list of per-fold dicts with validation Brier score, ECE
                and interval coverage
    """
    # Load data
    df_raw = pd.read_csv(path)
    close_col = _find_close_column(df_raw)
    close = pd.Series(df_raw[close_col].astype(float).values,
                      index=pd.RangeIndex(len(df_raw)), name="close")

    feat_df, X = make_feature_table(close)
    price = feat_df["price"]
    n = len(price)

    # Get feature names
    feature_names = [c for c in feat_df.columns if c != "price"]

    # One cost table shared by training labels, forecasts and calibration
    # labels. Horizons missing from DEFAULT_COST_BP use the same 8 bp fallback
    # as the forecaster instead of raising KeyError when labels are built.
    cost_bp = {h: DEFAULT_COST_BP.get(h, 8.0) for h in horizons}

    folds = purged_walkforward_slices(n, n_folds=n_folds, embargo=embargo)

    results = {h: {"val": [], "test": [], "diag": []} for h in horizons}

    print(f"\n{'='*60}")
    print(f"Training Random Forest for {symbol}")
    print(f"Model type: {model_type}")
    print(f"Horizons: {horizons}")
    print(f"Folds: {n_folds}")
    print(f"Trees: {n_estimators}")
    print(f"Max depth: {max_depth}")
    print(f"{'='*60}\n")

    for fold_idx, ((s0,e0), (s1,e1), (s2,e2)) in enumerate(folds):
        print(f"Fold {fold_idx + 1}/{len(folds)}: Train[{s0}:{e0}] Val[{s1}:{e1}] Test[{s2}:{e2}]")

        # Train one RF per horizon
        snapshots = {}

        for h in horizons:
            print(f"  Training h={h}...", end=" ")

            # Create labels for this horizon. Returns are computed inside the
            # training slice only, so no label looks past the train boundary.
            ret_train = cumulative_log_returns(price.iloc[s0:e0], h)

            # Align features and labels (the last h training rows have no label)
            n_train = min(len(X[s0:e0]), len(ret_train))
            X_train_aligned = X[s0:s0+n_train]
            ret_train_aligned = ret_train.iloc[:n_train]

            if len(X_train_aligned) < 50:
                print("SKIP (insufficient data)")
                continue

            if model_type == 'classification':
                # Binary classification: 1 when the forward return beats the cost
                y_train = (ret_train_aligned.values > bp_to_logret(float(cost_bp[h]))).astype(int)

                # Check class balance
                pos_frac = y_train.mean()
                if pos_frac < 0.1 or pos_frac > 0.9:
                    print(f"WARN (imbalanced: {pos_frac:.2%} positive)", end=" ")

                snap = fit_rf_classifier(
                    X_train_aligned, y_train,
                    horizon=h,
                    feature_names=feature_names,
                    random_state=123 + fold_idx,
                    n_estimators=n_estimators,
                    max_depth=max_depth
                )
            else:
                # Regression
                y_train = ret_train_aligned.values
                snap = fit_rf_regressor(
                    X_train_aligned, y_train,
                    horizon=h,
                    feature_names=feature_names,
                    random_state=123 + fold_idx,
                    n_estimators=n_estimators,
                    max_depth=max_depth
                )

            snapshots[h] = snap

            # Print OOB score
            if hasattr(snap.clf, 'oob_score_'):
                print(f"✓ (OOB: {snap.clf.oob_score_:.3f})")
            else:
                print("✓")

        if not snapshots:
            print("  No models trained, skipping fold")
            continue

        # Analyze feature importance (first fold only)
        if fold_idx == 0:
            analyze_feature_importance(snapshots)

        # Forecast on validation and test
        print("  Forecasting validation...", end=" ")
        out_val_raw, cost_log = forecast_multi_horizon_rf(
            snapshots=snapshots,
            X_seg=X[s1:e1],
            price_seg=price.iloc[s1:e1],
            horizons=horizons,
            cost_bp=cost_bp,
            compute_tree_variance=compute_tree_variance
        )
        print("✓")

        print("  Forecasting test...", end=" ")
        out_test_raw, _ = forecast_multi_horizon_rf(
            snapshots=snapshots,
            X_seg=X[s2:e2],
            price_seg=price.iloc[s2:e2],
            horizons=horizons,
            cost_bp=cost_bp,
            compute_tree_variance=compute_tree_variance
        )
        print("✓")

        # Calibration: fit on validation, apply to test
        for h in horizons:
            if h not in out_val_raw or h not in out_test_raw:
                continue

            ret_val = cumulative_log_returns(price.iloc[s1:e1], h)
            idx_common = out_val_raw[h].index.intersection(ret_val.index)

            if len(idx_common) == 0:
                continue

            dfV = out_val_raw[h].loc[idx_common].copy()
            maskV = np.isfinite(dfV["p_edge_raw"].values) & np.isfinite(dfV["mu"].values)
            dfV = dfV[maskV]

            if len(dfV) < 20:
                continue

            ret_val_aligned = ret_val.loc[dfV.index]
            y_val = (ret_val_aligned.values > cost_log[h]).astype(int)
            p_raw_val = dfV["p_edge_raw"].values
            mu_val = dfV["mu"].values

            # Fit calibrators
            cal_prob = fit_prob_calibrator_isotonic(p_raw_val, y_val, min_points=20)
            resid_val = ret_val_aligned.values - mu_val
            cal_pi = fit_conformal_interval(resid_val, alpha=0.2)

            # Apply to test
            ret_test = cumulative_log_returns(price.iloc[s2:e2], h)
            idx_test_common = out_test_raw[h].index.intersection(ret_test.index)
            dfT = out_test_raw[h].loc[idx_test_common].copy()

            maskT = np.isfinite(dfT["p_edge_raw"].values) & np.isfinite(dfT["mu"].values)
            dfT = dfT[maskT]

            if len(dfT) == 0:
                continue

            dfT["p_edge"] = apply_prob_calibrator(cal_prob, dfT["p_edge_raw"].values)
            mu_test = dfT["mu"].values
            lo, hi = apply_conformal_interval(cal_pi, mu_test)
            dfT["ret_lo"] = lo
            dfT["ret_hi"] = hi

            # price and dfT share index labels, so a label lookup is safe here
            p_now = price.loc[dfT.index].values
            dfT["price_lo"] = p_now * np.exp(lo)
            dfT["price_hi"] = p_now * np.exp(hi)

            dfT["edge"] = dfT["mu"] - cost_log[h]
            dfT["risk_edge"] = (dfT["mu"] - cost_log[h]) / (dfT["std"] + 1e-12)

            results[h]["test"].append(dfT)

            # Diagnostics. NOTE: these are measured on the same validation
            # rows the calibrators were fitted on, so they are in-sample and
            # optimistic; they are not an out-of-sample estimate.
            if cal_prob.method == "isotonic":
                p_cal_val = apply_prob_calibrator(cal_prob, p_raw_val)
                brier = brier_score(y_val, p_cal_val)
                ece = expected_calibration_error(y_val, p_cal_val)
            else:
                brier = brier_score(y_val, p_raw_val)
                ece = expected_calibration_error(y_val, p_raw_val)

            coverage = float(np.mean((resid_val >= -cal_pi.q_alpha) & (resid_val <= cal_pi.q_alpha)))

            diag = {
                "h": h,
                "brier_val": float(brier),
                "ece_val": float(ece),
                "pi_coverage_val": coverage
            }
            results[h]["diag"].append(diag)

            # Store validation
            dfV["p_edge"] = apply_prob_calibrator(cal_prob, dfV["p_edge_raw"].values)
            loV, hiV = apply_conformal_interval(cal_pi, mu_val)
            dfV["ret_lo"], dfV["ret_hi"] = loV, hiV
            results[h]["val"].append(dfV)

    # Concatenate folds ('diag' intentionally stays a list of dicts)
    for h in horizons:
        for split in ("val", "test"):
            if results[h][split]:
                results[h][split] = pd.concat(results[h][split]).sort_index()
            else:
                results[h][split] = pd.DataFrame()

    print(f"\nCompleted {symbol}\n")
    return results


In [12]:
# =========================================
# JSON EXPORT
# =========================================
def build_json_records(all_outputs: dict,
                       model_version: str = MODEL_VERSION,
                       calibration_version: str = CALIBRATION_VERSION,
                       horizons: list[int] = HORIZONS):
    """Build JSONL records for trading agent.

    Args:
        all_outputs: dict[symbol] -> dict[horizon] -> {"test": DataFrame or
            list/tuple of DataFrames, ...}.
        model_version: Tag copied into every record.
        calibration_version: Tag copied into every record.
        horizons: Horizons to export. Horizons with no test output for a
            symbol are skipped rather than raising.

    Returns:
        List of JSON-serializable dicts, one per test row, in symbol ->
        horizon -> index order.
    """
    records = []
    for sym, res in all_outputs.items():
        for h in horizons:
            # Tolerate symbols that were run with a different horizon set.
            if h not in res or "test" not in res[h]:
                continue
            df = res[h]["test"]
            if isinstance(df, (list, tuple)):
                if len(df) == 0:
                    # pd.concat raises ValueError on an empty sequence
                    continue
                df = pd.concat(df).sort_index()
            for t, row in df.iterrows():
                gates_pass = row["p_edge"] >= TAU_P and row["edge"] >= TAU_MU
                if gates_pass and row["mu"] > 0:
                    action = "buy"
                elif gates_pass and row["mu"] < 0:
                    # NOTE(review): with TAU_MU >= 0 and non-negative costs,
                    # edge = mu - cost >= TAU_MU implies mu > 0, so this
                    # branch cannot fire and the feed is effectively
                    # long-only. A real short signal would need
                    # P(return < -cost); kept unchanged pending a decision.
                    action = "sell"
                else:
                    action = "hold"

                rec = {
                    "timestamp_index": int(t),
                    "symbol": sym,
                    "horizon_bars": int(h),
                    "model_version": model_version,
                    "calibration_version": calibration_version,
                    "signals": {
                        "expected_return": float(row["mu"]),
                        "stdev_return": float(row["std"]),
                        "p_edge_gt_cost": float(row["p_edge"]),
                        "predicted_price": float(row["price_pred"]),
                        "price_PI": {
                            "p10": float(row["price_q10"]),
                            "p50": float(row["price_q50"]),
                            "p90": float(row["price_q90"])
                        }
                    },
                    "policy_suggestions": {
                        "gate_threshold_p": TAU_P,
                        "gate_threshold_edge": TAU_MU,
                        "suggested_action": action
                    }
                }
                records.append(rec)
    return records

In [None]:
# =========================================
# MAIN EXECUTION
# =========================================
if __name__ == "__main__":
    # Train every symbol whose CSV is present; missing files are reported and skipped.
    all_outputs = {}

    for symbol, path in FILES.items():
        if not path.exists():
            print(f"Warning: {path} not found, skipping {symbol}")
            continue

        results = run_rf_for_symbol(
            symbol=symbol,
            path=path,
            horizons=HORIZONS,
            n_folds=3,
            embargo=24,
            model_type='classification',  # or 'regression'
            n_estimators=200,  # More trees = better but slower
            max_depth=10,  # Deeper = more complex (risk overfitting)
            compute_tree_variance=True  # Use tree variance for uncertainty
        )

        all_outputs[symbol] = results

        # Print summary
        print(f"\n{'='*60}")
        print(f"RESULTS FOR {symbol}")
        print(f"{'='*60}")

        for h in HORIZONS:
            test_df = results[h]['test']
            diag_list = results[h]['diag']

            if len(test_df) > 0:
                print(f"\nHorizon {h}:")
                print(f"  Test samples: {len(test_df)}")
                print(f"  Mean p_edge: {test_df['p_edge'].mean():.3f}")
                print(f"  Mean mu: {test_df['mu'].mean():.4f}")

                if diag_list:
                    avg_brier = np.mean([d['brier_val'] for d in diag_list])
                    avg_ece = np.mean([d['ece_val'] for d in diag_list])
                    avg_cov = np.mean([d['pi_coverage_val'] for d in diag_list])
                    print(f"  Avg Brier: {avg_brier:.4f}")
                    print(f"  Avg ECE: {avg_ece:.4f}")
                    print(f"  Avg PI coverage: {avg_cov:.2%}")

    # Export to JSON. When no symbol could be trained (e.g. the data directory
    # does not exist on this machine) skip the export: opening the feed would
    # either fail with FileNotFoundError or truncate a previously written feed.
    if all_outputs:
        json_records = build_json_records(all_outputs)
        json_path = DATA_DIR / "trader_feed_rf_multiH.jsonl"
        with open(json_path, "w", encoding="utf-8") as f:
            for r in json_records:
                f.write(json.dumps(r) + "\n")

        print(f"\n{'='*60}")
        print(f"Wrote {len(json_records)} records to: {json_path}")
        print(f"{'='*60}\n")
    else:
        print(f"\nNo symbols were processed; no feed written (looked in {DATA_DIR})\n")


Training Random Forest for BTC
Model type: classification
Horizons: [1, 3, 6]
Folds: 3
Trees: 200
Max depth: 10

Fold 1/3: Train[0:1749] Val[1773:3522] Test[3546:5295]
  Training h=1... ✓ (OOB: 0.517)
  Training h=3... ✓ (OOB: 0.585)
  Training h=6... ✓ (OOB: 0.589)

FEATURE IMPORTANCE ANALYSIS

Horizon 1:
  1. ret_3: 0.1937
  2. ret_1: 0.1858
  3. vol_6: 0.1674
  4. ret_6: 0.1608
  5. vol_12: 0.1465
  6. ma_ratio: 0.1458
  Out-of-bag score: 0.5166

Horizon 3:
  1. vol_12: 0.1956
  2. vol_6: 0.1668
  3. ma_ratio: 0.1653
  4. ret_1: 0.1626
  5. ret_6: 0.1574
  6. ret_3: 0.1523
  Out-of-bag score: 0.5853

Horizon 6:
  1. vol_12: 0.2285
  2. vol_6: 0.1837
  3. ma_ratio: 0.1784
  4. ret_6: 0.1631
  5. ret_1: 0.1237
  6. ret_3: 0.1225
  Out-of-bag score: 0.5892
  Forecasting validation... ✓
  Forecasting test... ✓
Fold 2/3: Train[0:3498] Val[3522:5271] Test[5295:7044]
  Training h=1... ✓ (OOB: 0.522)
  Training h=3... ✓ (OOB: 0.576)
  Training h=6... ✓ (OOB: 0.597)
  Forecasting validation