# Model Training for Market Regime Classification

This notebook trains machine learning models to classify market regimes based on labeled technical indicators.

**Label Classes:**
- 0 = ranging (sideways movement)
- 1 = trending_up (bullish trend)
- 2 = trending_down (bearish trend)

**Workflow:**
1. Load features and labels
2. Prepare training data
3. Train XGBoost classifier
4. Evaluate performance
5. Compare with RandomForest
6. Save best model for backtesting

## 1. Setup

Import required libraries and modules.

In [1]:
import sys
import pandas as pd
import numpy as np
import json
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(str(Path.cwd().parent))

# Import trainer functions
from src.trainer import (
    train_model,
    evaluate_model,
    save_model,
    get_feature_importance,
    prepare_features,
    cross_validate
)
from src.config import FEATURES_DIR, LABELS_DIR, MODELS_DIR, RANDOM_STATE, TRAIN_TEST_SPLIT

print("Libraries imported successfully!")
print(f"Features directory: {FEATURES_DIR}")
print(f"Labels directory: {LABELS_DIR}")
print(f"Models directory: {MODELS_DIR}")

Libraries imported successfully!
Features directory: C:\Users\jakers\Desktop\bot\features
Labels directory: C:\Users\jakers\Desktop\bot\labels
Models directory: C:\Users\jakers\Desktop\bot\models


## 2. Load Features & Labels

Load the features from parquet files and labels from JSON, then merge them together.

In [2]:
# Configuration
SYMBOL = 'BTCUSDT'
INTERVAL = '1h'

# Load features from parquet
features_path = FEATURES_DIR / f"{SYMBOL}_{INTERVAL}_features.parquet"
print(f"Loading features from: {features_path}")

if not features_path.exists():
    raise FileNotFoundError(f"Features file not found: {features_path}")

df_features = pd.read_parquet(features_path)
print(f"\nFeatures loaded: {len(df_features)} rows, {len(df_features.columns)} columns")
print(f"Date range: {df_features['open_time'].min()} to {df_features['open_time'].max()}")

# Load labels from JSON
labels_path = LABELS_DIR / f"{SYMBOL}_{INTERVAL}_labels.json"
print(f"\nLoading labels from: {labels_path}")

if not labels_path.exists():
    print(f"\nWARNING: Labels file not found: {labels_path}")
    print("Please run the labeling notebook (02_label_data.ipynb) first to create labels.")
    print("\nFor demonstration purposes, we'll create sample labels...")
    
    # Create sample labels for demonstration
    labels_data = {
        'metadata': {
            'symbol': SYMBOL,
            'interval': INTERVAL,
            'total_labels': 0
        },
        'labels': {}
    }
else:
    with open(labels_path, 'r') as f:
        labels_data = json.load(f)

# Convert labels dict to DataFrame
labels_list = []
for label_info in labels_data.get('labels', []):
    labels_list.append({
        # Label spans start_idx to end_idx
        'label': label_info['label']
    })

if labels_list:
    df_labels = pd.DataFrame(labels_list)
    print(f"\nLabels loaded: {len(df_labels)} labeled samples")
    print(f"\nLabel distribution:")
    label_counts = df_labels['label'].value_counts().sort_index()
    for label, count in label_counts.items():
        label_name = ['ranging', 'trending_up', 'trending_down'][label]
        print(f"  {label} ({label_name}): {count} ({count/len(df_labels)*100:.1f}%)")
else:
    print("\nNo labels found. Please label your data first!")
    df_labels = pd.DataFrame(columns=['open_time', 'label'])

Loading features from: C:\Users\jakers\Desktop\bot\features\BTCUSDT_1h_features.parquet

Features loaded: 51864 rows, 232 columns
Date range: 2020-01-01 00:00:00 to 2025-11-30 23:00:00

Loading labels from: C:\Users\jakers\Desktop\bot\labels\BTCUSDT_1h_labels.json


AttributeError: 'list' object has no attribute 'items'

In [None]:
# Merge features and labels
df = df_features.merge(df_labels, on='open_time', how='left')
print(f"Merged dataset: {len(df)} rows")
print(f"Labeled samples: {df['label'].notna().sum()}")
print(f"Unlabeled samples: {df['label'].isna().sum()}")

# Show sample of merged data
print("\nSample of merged data:")
df.head()

## 3. Prepare Training Data

Filter to only labeled rows and separate features (X) from labels (y).

In [None]:
# Filter to labeled data only
df_labeled = df.dropna(subset=['label']).copy()
print(f"Labeled dataset: {len(df_labeled)} samples")

if len(df_labeled) == 0:
    raise ValueError("No labeled data available. Please label data first using 02_label_data.ipynb")

# Check for minimum samples per class
min_samples_per_class = df_labeled['label'].value_counts().min()
print(f"\nMinimum samples per class: {min_samples_per_class}")

if min_samples_per_class < 10:
    print("\nWARNING: Very few samples in some classes. Consider labeling more data for better model performance.")

# Prepare features and target
y = df_labeled['label'].values.astype(int)
X, feature_names = prepare_features(df_labeled)

print(f"\nFeature matrix shape: {X.shape}")
print(f"Number of features: {len(feature_names)}")
print(f"Target shape: {y.shape}")

## 4. Feature Selection

Display which features will be used for training (excludes raw OHLCV and timestamps).

In [None]:
print(f"Total features for training: {len(feature_names)}\n")
print("Features being used:")
print("=" * 50)

# Group features by category for better readability
ma_features = [f for f in feature_names if 'ma_' in f or 'ema_' in f]
spread_features = [f for f in feature_names if 'spread' in f]
slope_features = [f for f in feature_names if 'slope' in f]
compression_features = [f for f in feature_names if 'compression' in f]
position_features = [f for f in feature_names if 'position' in f]
other_features = [f for f in feature_names if f not in ma_features + spread_features + 
                  slope_features + compression_features + position_features]

if ma_features:
    print(f"\nMoving Averages ({len(ma_features)}):")
    for f in ma_features[:10]:  # Show first 10
        print(f"  - {f}")
    if len(ma_features) > 10:
        print(f"  ... and {len(ma_features)-10} more")

if spread_features:
    print(f"\nSpread Features ({len(spread_features)}):")
    for f in spread_features:
        print(f"  - {f}")

if slope_features:
    print(f"\nSlope Features ({len(slope_features)}):")
    for f in slope_features:
        print(f"  - {f}")

if compression_features:
    print(f"\nCompression Features ({len(compression_features)}):")
    for f in compression_features:
        print(f"  - {f}")

if position_features:
    print(f"\nPosition Features ({len(position_features)}):")
    for f in position_features:
        print(f"  - {f}")

if other_features:
    print(f"\nOther Features ({len(other_features)}):")
    for f in other_features:
        print(f"  - {f}")

print("\n" + "=" * 50)
print("\nExcluded columns (metadata, not features):")
excluded = ['open_time', 'label', 'timestamp', 'date', 'datetime', 
            'close_time', 'open', 'high', 'low', 'close', 'volume']
actual_excluded = [col for col in excluded if col in df_labeled.columns]
for col in actual_excluded:
    print(f"  - {col}")

## 5. Train/Test Split

Split data into training (80%) and testing (20%) sets with stratification.

In [None]:
# Perform stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=(1 - TRAIN_TEST_SPLIT),
    random_state=RANDOM_STATE,
    stratify=y
)

print(f"Train/Test Split Configuration:")
print(f"  Split ratio: {TRAIN_TEST_SPLIT:.0%} train / {(1-TRAIN_TEST_SPLIT):.0%} test")
print(f"  Random state: {RANDOM_STATE}")
print(f"  Stratified: Yes\n")

print(f"Training set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)\n")

# Show class distribution in train and test sets
train_dist = pd.Series(y_train).value_counts().sort_index()
test_dist = pd.Series(y_test).value_counts().sort_index()

print("Class distribution in training set:")
for label, count in train_dist.items():
    label_name = ['ranging', 'trending_up', 'trending_down'][label]
    print(f"  {label} ({label_name}): {count} ({count/len(y_train)*100:.1f}%)")

print("\nClass distribution in test set:")
for label, count in test_dist.items():
    label_name = ['ranging', 'trending_up', 'trending_down'][label]
    print(f"  {label} ({label_name}): {count} ({count/len(y_test)*100:.1f}%)")

## 6. Train XGBoost Model

Train an XGBoost classifier with optimized hyperparameters.

In [None]:
print("Training XGBoost classifier...\n")
print("Model Configuration:")
print("  Algorithm: XGBoost")
print("  n_estimators: 100")
print("  max_depth: 6")
print("  learning_rate: 0.1")
print("  subsample: 0.8")
print("  colsample_bytree: 0.8")
print("  eval_metric: mlogloss")
print()

# Train XGBoost model
xgb_model = train_model(X_train, y_train, model_type='xgboost')

print("XGBoost model trained successfully!")

## 7. Evaluate XGBoost Performance

Calculate accuracy, precision, recall, F1 score, and visualize the confusion matrix.

In [None]:
# Evaluate XGBoost model
xgb_metrics = evaluate_model(xgb_model, X_test, y_test)

print("XGBoost Test Set Performance:")
print("=" * 50)
print(f"Accuracy:  {xgb_metrics['accuracy']:.4f}")
print(f"Precision: {xgb_metrics['precision']:.4f}")
print(f"Recall:    {xgb_metrics['recall']:.4f}")
print(f"F1 Score:  {xgb_metrics['f1']:.4f}")
print("=" * 50)

In [None]:
# Visualize confusion matrix as heatmap
cm = np.array(xgb_metrics['confusion_matrix'])
labels = ['Ranging', 'Trending Up', 'Trending Down']

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=labels,
    y=labels,
    text=cm,
    texttemplate='%{text}',
    textfont={"size": 16},
    colorscale='Blues',
    showscale=True
))

fig.update_layout(
    title='XGBoost Confusion Matrix',
    xaxis_title='Predicted Label',
    yaxis_title='True Label',
    width=600,
    height=500
)

fig.show()

# Calculate per-class metrics
print("\nPer-Class Performance:")
print("=" * 50)
for i, label_name in enumerate(labels):
    tp = cm[i, i]
    fp = cm[:, i].sum() - tp
    fn = cm[i, :].sum() - tp
    
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    print(f"{label_name}:")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print()

## 8. Feature Importance

Visualize the top 20 most important features for the XGBoost model.

In [None]:
# Get feature importances
importance_df = get_feature_importance(xgb_model, feature_names)

print("Top 20 Most Important Features:")
print("=" * 50)
print(importance_df.head(20).to_string(index=False))

In [None]:
# Visualize top 20 feature importances
top_n = 20
top_features = importance_df.head(top_n)

fig = go.Figure(go.Bar(
    x=top_features['importance'],
    y=top_features['feature'],
    orientation='h',
    marker=dict(
        color=top_features['importance'],
        colorscale='Viridis',
        showscale=True
    )
))

fig.update_layout(
    title=f'Top {top_n} Most Important Features (XGBoost)',
    xaxis_title='Importance Score',
    yaxis_title='Feature',
    height=600,
    width=900,
    yaxis={'autorange': 'reversed'}  # Most important at top
)

fig.show()

## 9. Cross-Validation

Perform 5-fold cross-validation to assess model stability and generalization.

In [None]:
print("Performing 5-fold cross-validation on XGBoost...\n")

# Perform cross-validation on full labeled dataset
cv_results = cross_validate(X, y, model_type='xgboost', cv=5)

print("Cross-Validation Results:")
print("=" * 50)
print(f"Fold scores: {[f'{s:.4f}' for s in cv_results['scores']]}")
print(f"\nMean CV Score: {cv_results['mean_score']:.4f}")
print(f"Std CV Score:  {cv_results['std_score']:.4f}")
print(f"95% CI:        {cv_results['mean_score']:.4f} +/- {1.96 * cv_results['std_score']:.4f}")
print("=" * 50)

In [None]:
# Visualize CV scores
fig = go.Figure()

# Add bars for each fold
fig.add_trace(go.Bar(
    x=[f'Fold {i+1}' for i in range(len(cv_results['scores']))],
    y=cv_results['scores'],
    name='CV Score',
    marker_color='lightblue'
))

# Add mean line
fig.add_hline(
    y=cv_results['mean_score'],
    line_dash="dash",
    line_color="red",
    annotation_text=f"Mean: {cv_results['mean_score']:.4f}",
    annotation_position="right"
)

fig.update_layout(
    title='5-Fold Cross-Validation Scores (XGBoost)',
    xaxis_title='Fold',
    yaxis_title='Accuracy Score',
    width=800,
    height=500,
    showlegend=True
)

fig.show()

## 10. Compare Models

Train a RandomForest model and compare performance with XGBoost.

In [None]:
print("Training RandomForest classifier...\n")
print("Model Configuration:")
print("  Algorithm: RandomForest")
print("  n_estimators: 100")
print("  max_depth: 10")
print("  min_samples_split: 5")
print("  min_samples_leaf: 2")
print()

# Train RandomForest model
rf_model = train_model(X_train, y_train, model_type='randomforest')

print("RandomForest model trained successfully!")

In [None]:
# Evaluate RandomForest model
rf_metrics = evaluate_model(rf_model, X_test, y_test)

print("RandomForest Test Set Performance:")
print("=" * 50)
print(f"Accuracy:  {rf_metrics['accuracy']:.4f}")
print(f"Precision: {rf_metrics['precision']:.4f}")
print(f"Recall:    {rf_metrics['recall']:.4f}")
print(f"F1 Score:  {rf_metrics['f1']:.4f}")
print("=" * 50)

In [None]:
# Compare both models side by side
comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'XGBoost': [
        xgb_metrics['accuracy'],
        xgb_metrics['precision'],
        xgb_metrics['recall'],
        xgb_metrics['f1']
    ],
    'RandomForest': [
        rf_metrics['accuracy'],
        rf_metrics['precision'],
        rf_metrics['recall'],
        rf_metrics['f1']
    ]
})

comparison_df['Difference'] = comparison_df['XGBoost'] - comparison_df['RandomForest']

print("\nModel Comparison:")
print("=" * 70)
print(comparison_df.to_string(index=False))
print("=" * 70)

# Determine best model
if xgb_metrics['f1'] > rf_metrics['f1']:
    best_model = xgb_model
    best_model_name = 'XGBoost'
    best_metrics = xgb_metrics
else:
    best_model = rf_model
    best_model_name = 'RandomForest'
    best_metrics = rf_metrics

print(f"\nBest Model: {best_model_name} (F1 Score: {best_metrics['f1']:.4f})")

In [None]:
# Visualize model comparison
fig = go.Figure()

metrics_list = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
xgb_scores = comparison_df['XGBoost'].values
rf_scores = comparison_df['RandomForest'].values

fig.add_trace(go.Bar(
    name='XGBoost',
    x=metrics_list,
    y=xgb_scores,
    marker_color='blue'
))

fig.add_trace(go.Bar(
    name='RandomForest',
    x=metrics_list,
    y=rf_scores,
    marker_color='green'
))

fig.update_layout(
    title='Model Performance Comparison',
    xaxis_title='Metric',
    yaxis_title='Score',
    barmode='group',
    width=800,
    height=500,
    yaxis=dict(range=[0, 1])
)

fig.show()

## 11. Save Best Model

Save the best performing model to the models directory for use in backtesting.

In [None]:
# Save the best model
model_path = save_model(best_model, SYMBOL, INTERVAL)

print(f"Best model ({best_model_name}) saved successfully!")
print(f"\nModel file: {model_path}")
print(f"Model size: {model_path.stat().st_size / 1024:.2f} KB")

# Save model metadata
metadata = {
    'symbol': SYMBOL,
    'interval': INTERVAL,
    'model_type': best_model_name,
    'training_date': pd.Timestamp.now().isoformat(),
    'n_samples_train': len(X_train),
    'n_samples_test': len(X_test),
    'n_features': len(feature_names),
    'metrics': {
        'accuracy': float(best_metrics['accuracy']),
        'precision': float(best_metrics['precision']),
        'recall': float(best_metrics['recall']),
        'f1': float(best_metrics['f1'])
    },
    'feature_names': feature_names,
    'label_mapping': {
        '0': 'ranging',
        '1': 'trending_up',
        '2': 'trending_down'
    }
}

metadata_path = MODELS_DIR / f"{SYMBOL}_{INTERVAL}_model_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"\nMetadata saved: {metadata_path}")
print("\n" + "=" * 70)
print("MODEL TRAINING COMPLETE!")
print("=" * 70)
print(f"\nThe trained {best_model_name} model is ready for backtesting.")
print(f"Use this model file in your backtesting notebook: {model_path.name}")

## Summary

This notebook successfully:
1. Loaded features from parquet files and labels from JSON
2. Prepared and split the training data (80/20 stratified split)
3. Trained an XGBoost classifier
4. Evaluated model performance with detailed metrics
5. Visualized feature importance
6. Performed 5-fold cross-validation
7. Compared XGBoost with RandomForest
8. Saved the best model for backtesting

**Next Steps:**
- Use the saved model in backtesting (04_backtest.ipynb)
- If model performance is poor, consider:
  - Labeling more data
  - Engineering additional features
  - Tuning hyperparameters
  - Trying different model architectures