# Classifier Training Notebook
## Train XGBoost and Meta-Model for Trading Signals

This notebook trains machine learning models for the quantitative trading bot:
1. **Primary Classifier**: XGBoost model with triple-barrier labeling
2. **Meta-Model**: Secondary filter model for trade acceptance/rejection
3. **Model Calibration**: Platt and Isotonic calibration for probability outputs
4. **Cross-Validation**: Purged K-fold validation for time series data

In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import yaml
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import brier_score_loss, confusion_matrix, roc_auc_score

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/sklearn/xgboost.
warnings.filterwarnings("ignore")

# Import our modules
import os
import sys

# Put the parent directory on the import path
# (presumably where the `algos` package lives - confirm against the repo layout).
sys.path.append("../")

from algos.core.cv_utils import PurgedKFold, combinatorial_purged_cv
from algos.core.feature_pipe import FeaturePipeline
from algos.core.labels import TripleBarrierLabeler, create_meta_labels

# Set style: matplotlib style sheet plus the seaborn colour palette used by all plots below
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

print("Libraries imported successfully")

## 1. Load Configuration and Data

In [None]:
# Read the YAML configuration that sits one directory above the notebook
with open("../config.yaml") as config_file:
    config = yaml.safe_load(config_file)

# Both components below are built from the same "trading" section
trading_config = config["trading"]

print("Configuration loaded:")
print(f"Features: {list(config['trading']['features'].keys())}")
print(f"Labels: {config['trading']['labels']}")

# Build the feature pipeline and the triple-barrier labeler
feature_pipeline = FeaturePipeline(trading_config)
labeler = TripleBarrierLabeler(trading_config)

print("Components initialized")

In [None]:
# Generate synthetic data for demonstration
# In practice, you would load real market data here

def generate_synthetic_data(n_samples=5000, n_symbols=5, seed=42):
    """Generate synthetic OHLCV market data for training.

    Args:
        n_samples: Number of 30-minute bars to generate per symbol.
        n_symbols: Number of symbols; they are named ``SYMBOL_0``, ``SYMBOL_1``, ...
        seed: Seed for NumPy's global random state. The default (42) keeps
            the close and volume series identical to earlier runs.

    Returns:
        dict mapping symbol name to a DataFrame indexed by timestamp with the
        columns ``close``, ``high``, ``low``, ``volume`` and ``open``.
        Every symbol shares the same timestamp index.
    """

    np.random.seed(seed)
    dates = pd.date_range("2020-01-01", periods=n_samples, freq="30min")

    all_data = {}

    for i in range(n_symbols):
        symbol = f"SYMBOL_{i}"

        # Log-returns: Gaussian noise (sigma = 2% per bar) plus a sinusoidal
        # component with a 100-bar period and 0.1 amplitude.
        returns = np.random.normal(0, 0.02, n_samples)
        returns += 0.1 * np.sin(np.arange(n_samples) * 2 * np.pi / 100)

        prices = 100 * np.exp(np.cumsum(returns))

        # Draw the remaining noise in a fixed order (high, low, volume) so the
        # random stream, and therefore close/volume, stay reproducible.
        high_noise = np.abs(np.random.normal(0, 0.005, n_samples))
        low_noise = np.abs(np.random.normal(0, 0.005, n_samples))
        volume = np.random.lognormal(10, 1, n_samples)

        # Open is the previous bar's close; the first bar opens at its own
        # close. Slice assignment keeps this valid for n_samples == 0 and
        # avoids integer-key lookups on the datetime-indexed Series.
        open_prices = np.empty_like(prices)
        open_prices[:1] = prices[:1]
        open_prices[1:] = prices[:-1]

        # Anchor the wicks on the bar body so that
        # low <= min(open, close) and high >= max(open, close) always hold.
        body_top = np.maximum(open_prices, prices)
        body_bottom = np.minimum(open_prices, prices)

        all_data[symbol] = pd.DataFrame({
            "close": prices,
            "high": body_top * (1 + high_noise),
            "low": body_bottom * (1 - low_noise),
            "volume": volume,
            "open": open_prices,
        }, index=dates)

    return all_data

# Build the demo dataset: 5 symbols with 5000 bars each
market_data = generate_synthetic_data(n_samples=5000, n_symbols=5)
print(f"Generated data for {len(market_data)} symbols")
print(f"Sample data shape: {list(market_data.values())[0].shape}")

# Preview the first rows of the first symbol's frame
sample_symbol = next(iter(market_data))
print(f"\nSample data for {sample_symbol}:")
print(market_data[sample_symbol].head())

## 2. Feature Engineering

In [None]:
# Per-symbol feature frames and labels, keyed by symbol name
all_features = {}
all_labels = {}

for symbol, data in market_data.items():
    print(f"Processing {symbol}...")

    # Run the feature pipeline on this symbol's frame
    features = feature_pipeline.build_features(data)

    # Symbols with 100 feature rows or fewer are skipped entirely
    if len(features) <= 100:
        continue

    all_features[symbol] = features

    # Label from the close series and the "atr" feature column. The other two
    # return values (touch_times, barriers_df) are not used in this cell.
    labels, touch_times, barriers_df = labeler.fit_transform(
        data["close"], features["atr"]
    )

    all_labels[symbol] = labels

    print(f"  Features shape: {features.shape}")
    print(f"  Labels shape: {labels.shape}")
    print(f"  Label distribution: {labels.value_counts().to_dict()}")

print(f"\nFeature generation complete for {len(all_features)} symbols")

In [None]:
# Combine all features and labels (raw stacks, kept for inspection)
combined_features = pd.concat(all_features.values(), axis=0)
combined_labels = pd.concat(all_labels.values(), axis=0)

# Align features and labels one symbol at a time, then stack.
# Symbols can share timestamps (the synthetic generator uses a single date
# range for all of them), so aligning the stacked objects on their duplicated
# index would pair one symbol's features with another symbol's labels. In
# addition, DataFrame.align(Series) requires an explicit axis and raises a
# ValueError without one.
feature_parts = []
label_parts = []
for symbol, symbol_features in all_features.items():
    symbol_labels = all_labels[symbol]
    shared_index = symbol_features.index.intersection(symbol_labels.index)
    feature_parts.append(symbol_features.loc[shared_index])
    label_parts.append(symbol_labels.loc[shared_index])

aligned_features = pd.concat(feature_parts, axis=0)
aligned_labels = pd.concat(label_parts, axis=0)

# NOTE(review): rows are ordered symbol by symbol, not chronologically.
# Confirm that the purged CV helpers used later accept this ordering.

print("Combined dataset:")
print(f"Features shape: {aligned_features.shape}")
print(f"Labels shape: {aligned_labels.shape}")
print(f"Combined label distribution: {aligned_labels.value_counts().to_dict()}")

# Handle missing values: NaN and +/-inf both become 0
aligned_features = aligned_features.fillna(0)
aligned_features = aligned_features.replace([np.inf, -np.inf], 0)

print("Missing values handled")
print(f"Feature columns: {list(aligned_features.columns)}")

## 3. Model Training with Cross-Validation

In [None]:
# Convert labels to binary (1 for profit, 0 for loss/timeout)
binary_labels = (aligned_labels == 1).astype(int)

# Class counts and shares. The negative share is derived from the positive
# one: applying `~` to the integer labels would be a bitwise NOT
# (0 -> -1, 1 -> -2) and report a negative percentage.
n_positive = binary_labels.sum()
n_negative = len(binary_labels) - n_positive
positive_rate = binary_labels.mean()

print("Binary label distribution:")
print(f"Positive (profit): {n_positive} ({positive_rate:.1%})")
print(f"Negative (loss/timeout): {n_negative} ({1 - positive_rate:.1%})")

# Shared CV settings so the splitter and the CV helper cannot drift apart
n_splits = 5
embargo_frac = 0.02

# Set up purged cross-validation.
# NOTE(review): `cv` is not passed to the helper below, which receives the
# split settings directly; it is kept in case later cells rely on it.
cv = PurgedKFold(n_splits=n_splits, embargo_frac=embargo_frac)

# XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss"
)

print("Starting XGBoost training with purged CV...")

# Perform cross-validation
cv_results = combinatorial_purged_cv(
    X=aligned_features,
    y=binary_labels,
    model=xgb_model,
    n_splits=n_splits,
    embargo_frac=embargo_frac
)

# The loop below expects "cv_stats" to map each metric name to a dict with
# "mean" and "std" entries.
print("\nCross-validation results:")
for metric, stats in cv_results["cv_stats"].items():
    print(f"{metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")

In [None]:
# Train final model on full dataset
print("Training final XGBoost model...")

# The eval_set is the training data itself, so any metric it records is in-sample.
xgb_model.fit(
    aligned_features,
    binary_labels,
    eval_set=[(aligned_features, binary_labels)],
    verbose=False
)

# In-sample probability of the positive class
train_predictions = xgb_model.predict_proba(aligned_features)[:, 1]

# Feature importance, most important first
feature_importance = pd.DataFrame({
    "feature": aligned_features.columns,
    "importance": xgb_model.feature_importances_
}).sort_values("importance", ascending=False)

# Take the top rows once and size the plot from them; with fewer than ten
# features, a hard-coded range(10) would not match the data length.
top_features = feature_importance.head(10)
bar_positions = range(len(top_features))

print("\nTop 10 most important features:")
print(top_features)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_features["importance"])
plt.yticks(bar_positions, top_features["feature"])
plt.xlabel("Feature Importance")
plt.title("Top 10 Feature Importances - XGBoost")
plt.gca().invert_yaxis()  # largest importance at the top
plt.tight_layout()
plt.show()

print("Model training complete")

## 4. Model Calibration

In [None]:
# Fit two calibrated wrappers around the XGBoost model and compare them
print("Calibrating model probabilities...")

# Platt scaling (sigmoid) with 3-fold internal CV
platt_calibrator = CalibratedClassifierCV(xgb_model, method="sigmoid", cv=3)
platt_calibrator.fit(aligned_features, binary_labels)

# Isotonic regression with 3-fold internal CV
isotonic_calibrator = CalibratedClassifierCV(xgb_model, method="isotonic", cv=3)
isotonic_calibrator.fit(aligned_features, binary_labels)

# Positive-class probabilities from each calibrator.
# NOTE(review): these are predictions on the same rows used for fitting, so
# the curves and Brier scores below are in-sample.
platt_predictions = platt_calibrator.predict_proba(aligned_features)[:, 1]
isotonic_predictions = isotonic_calibrator.predict_proba(aligned_features)[:, 1]

# One reliability diagram per model: (panel title, curve label, probabilities)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
calibration_panels = [
    ("Original Model", "XGBoost", train_predictions),
    ("Platt Calibration", "Platt", platt_predictions),
    ("Isotonic Calibration", "Isotonic", isotonic_predictions),
]

for panel_idx, (panel_title, curve_label, probabilities) in enumerate(calibration_panels):
    panel_ax = axes[panel_idx]
    fraction_of_positives, mean_predicted_value = calibration_curve(
        binary_labels, probabilities, n_bins=10
    )
    panel_ax.plot(mean_predicted_value, fraction_of_positives, "s-", label=curve_label)
    panel_ax.plot([0, 1], [0, 1], "k:", label="Perfect calibration")
    # Only the left-most panel carries the y-axis label
    if panel_idx == 0:
        panel_ax.set_ylabel("Fraction of positives")
    panel_ax.set_xlabel("Mean predicted probability")
    panel_ax.set_title(panel_title)
    panel_ax.legend()

plt.tight_layout()
plt.show()

# Brier score for the raw model and for each calibrator
original_brier = brier_score_loss(binary_labels, train_predictions)
platt_brier = brier_score_loss(binary_labels, platt_predictions)
isotonic_brier = brier_score_loss(binary_labels, isotonic_predictions)

print("\nBrier Scores (lower is better):")
print(f"Original: {original_brier:.4f}")
print(f"Platt: {platt_brier:.4f}")
print(f"Isotonic: {isotonic_brier:.4f}")

# Keep the calibrator with the strictly lower Brier score; ties go to isotonic
platt_is_better = platt_brier < isotonic_brier
final_model = platt_calibrator if platt_is_better else isotonic_calibrator
calibration_method = "platt" if platt_is_better else "isotonic"

print(f"\nSelected {calibration_method} calibration (best Brier score)")

## 5. Meta-Model Training

In [None]:
# Create meta-labels
print("Creating meta-labels...")

# Primary model probabilities for the positive class.
# NOTE(review): these are predictions on the rows the primary model was
# fitted on, so the meta-model is trained on in-sample probabilities.
primary_predictions = final_model.predict_proba(aligned_features)[:, 1]

# Build meta-labels from the primary probabilities and the original
# triple-barrier labels, using a 0.5 decision threshold.
meta_labels = create_meta_labels(
    pd.Series(primary_predictions, index=aligned_labels.index),
    aligned_labels,
    threshold=0.5,
)

print(f"Meta-labels created: {len(meta_labels)} samples")
print(f"Meta-label acceptance rate: {meta_labels.mean():.1%}")

# Prepare meta-features. Columns missing from aligned_features fall back to
# the constant given as the second argument of .get().
meta_features = pd.DataFrame({
    "primary_prob": primary_predictions,
    "atr_ratio": aligned_features["atr"] / aligned_features.get("sma_20", 100),
    "realized_vol": aligned_features.get("realized_vol", 0.1),
    "rsi_normalized": aligned_features.get("rsi", 50) / 100,
    "bb_position": aligned_features.get("bb_position", 0.5)
})

# Missing values in the features were filled with 0 upstream, so a zero
# "sma_20" turns atr_ratio into +/-inf (or NaN for 0/0). Map every non-finite
# value to 0 so the classifier never receives inf.
meta_features = meta_features.replace([np.inf, -np.inf], np.nan).fillna(0)

print(f"Meta-features shape: {meta_features.shape}")
print(f"Meta-features columns: {list(meta_features.columns)}")

# Train meta-model
meta_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42
)

print("Training meta-model...")
meta_model.fit(meta_features, meta_labels)

# Meta-model predictions (in-sample)
meta_predictions = meta_model.predict(meta_features)
meta_probabilities = meta_model.predict_proba(meta_features)[:, 1]

# Precision and recall from explicit boolean masks. This avoids bitwise `&`
# between an ndarray and a Series, which depends on dtype and index
# alignment, and guards against empty denominators.
predicted_accept = np.asarray(meta_predictions).astype(bool)
actual_accept = np.asarray(meta_labels).astype(bool)
true_accept_count = int((predicted_accept & actual_accept).sum())
predicted_accept_count = int(predicted_accept.sum())
actual_accept_count = int(actual_accept.sum())
meta_precision = true_accept_count / predicted_accept_count if predicted_accept_count else 0.0
meta_recall = true_accept_count / actual_accept_count if actual_accept_count else 0.0

print("\nMeta-model performance:")
print(f"Accuracy: {(meta_predictions == meta_labels).mean():.3f}")
print(f"Precision: {meta_precision:.3f}")
print(f"Recall: {meta_recall:.3f}")

# Meta-model feature importance, most important first
meta_importance = pd.DataFrame({
    "feature": meta_features.columns,
    "importance": meta_model.feature_importances_
}).sort_values("importance", ascending=False)

print("\nMeta-model feature importance:")
print(meta_importance)

## 6. Model Evaluation and Diagnostics

In [None]:
# Diagnostics for the calibrated primary model and the meta-model.
# NOTE(review): every metric here is computed on the training rows.
from sklearn.metrics import roc_curve

print("=== MODEL EVALUATION ===")

# Positive-class probabilities and hard predictions at a 0.5 cut-off
final_predictions = final_model.predict_proba(aligned_features)[:, 1]
binary_predictions = (final_predictions > 0.5).astype(int)

print("\nPrimary Model Metrics:")
print(f"AUC Score: {roc_auc_score(binary_labels, final_predictions):.4f}")
print(f"Brier Score: {brier_score_loss(binary_labels, final_predictions):.4f}")
print(f"Accuracy: {(binary_predictions == binary_labels).mean():.4f}")

# Confusion matrix heatmap
cm = confusion_matrix(binary_labels, binary_predictions)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=cm_ax)
cm_ax.set_title("Confusion Matrix - Primary Model")
cm_ax.set_ylabel("True Label")
cm_ax.set_xlabel("Predicted Label")
plt.show()

# ROC curve against the chance diagonal
fpr, tpr, thresholds = roc_curve(binary_labels, final_predictions)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc_score(binary_labels, final_predictions):.3f})")
roc_ax.plot([0, 1], [0, 1], "k--", label="Random")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve - Primary Model")
roc_ax.legend()
plt.show()

# Side-by-side histograms: primary probabilities per class, meta probabilities
dist_fig, (primary_ax, meta_ax) = plt.subplots(1, 2, figsize=(12, 4))

for class_value, class_name in ((0, "Negative"), (1, "Positive")):
    primary_ax.hist(
        final_predictions[binary_labels == class_value],
        alpha=0.7,
        label=class_name,
        bins=30,
    )
primary_ax.set_xlabel("Predicted Probability")
primary_ax.set_ylabel("Frequency")
primary_ax.set_title("Prediction Distribution")
primary_ax.legend()

meta_ax.hist(meta_probabilities, alpha=0.7, bins=30, color="green")
meta_ax.set_xlabel("Meta-Model Probability")
meta_ax.set_ylabel("Frequency")
meta_ax.set_title("Meta-Model Probability Distribution")

plt.tight_layout()
plt.show()

print("\nModel evaluation complete")

## 7. Save Models

In [None]:
# Create models directory
os.makedirs("../models", exist_ok=True)

# Save primary model (the selected calibrated wrapper)
primary_model_path = "../models/xgb.pkl"
joblib.dump(final_model, primary_model_path)
print(f"Primary model saved to {primary_model_path}")

# Save meta-model
meta_model_path = "../models/meta.pkl"
joblib.dump(meta_model, meta_model_path)
print(f"Meta-model saved to {meta_model_path}")

# Save model metadata.
# Metrics are cast to builtin float/str first: NumPy scalars are otherwise
# written as `!!python/object/apply:numpy...` tags that yaml.safe_load cannot
# read back.
model_metadata = {
    "primary_model": {
        "type": "XGBoost",
        "calibration": calibration_method,
        "features": [str(column) for column in aligned_features.columns],
        "n_features": len(aligned_features.columns),
        "training_samples": len(aligned_features),
        "auc_score": float(roc_auc_score(binary_labels, final_predictions)),
        "brier_score": float(brier_score_loss(binary_labels, final_predictions))
    },
    "meta_model": {
        "type": "XGBoost",
        "features": [str(column) for column in meta_features.columns],
        "acceptance_rate": float(meta_labels.mean()),
        "accuracy": float((meta_predictions == meta_labels).mean())
    },
    "training_config": config["trading"]
}

# safe_dump emits only standard YAML tags, so the file stays loadable with
# yaml.safe_load; the explicit encoding keeps the output platform-independent.
metadata_path = "../models/model_metadata.yaml"
with open(metadata_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(model_metadata, f, default_flow_style=False)
print(f"Model metadata saved to {metadata_path}")

print("\n=== TRAINING COMPLETE ===")
print(f"Primary model AUC: {model_metadata['primary_model']['auc_score']:.4f}")
print(f"Meta-model accuracy: {model_metadata['meta_model']['accuracy']:.4f}")
print(f"Meta-model acceptance rate: {model_metadata['meta_model']['acceptance_rate']:.1%}")
print("\nModels saved and ready for deployment!")