### 📊 Model Evaluation and Backtesting
 
### **Purpose:** Final model evaluation and comprehensive backtesting
 
### This notebook provides:
- Loading and comparison of all trained models
- Comprehensive performance evaluation across multiple metrics
- Backtesting simulation with trading strategies
- Final performance reports and visualizations
- Statistical significance testing
- Model selection recommendations

In [None]:
# Core imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
import os
import pickle
import joblib
from pathlib import Path
warnings.filterwarnings('ignore')

# Project imports
import sys
sys.path.append('..')

from src.utils import (
    get_project_root, get_project_path, load_results_from_json,
    generate_performance_report, create_html_report, plot_model_comparison,
    plot_predictions_vs_actual, create_time_series_plot, save_results_to_json
)
from src.data import (
    load_config, setup_logging, load_raw_data, get_default_tickers,
    clean_data, calculate_returns
)
from src.models import (
    BasePredictor, NaiveLastValue, RandomWalkDrift, 
    RandomForestPredictor, LightGBMPredictor, XGBPredictor, LSTMPredictor,
    load_model_records, predict_multi_horizon
)
from src.evaluate import (
    compare_models, create_evaluation_report, walk_forward_validation,
    calculate_rmse, calculate_mae, calculate_mape, calculate_r2,
    calculate_profit_loss, calculate_max_drawdown, evaluate_trading_strategy,
    statistical_significance_test, plot_validation_results
)
from src.features import (
    create_all_features, scale_features, create_targets,
    process_stock_features
)

# Plot styling: the matplotlib style sheet is applied first so that the
# seaborn palette set afterwards is not overwritten by it.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# %%
# Resolve the project locations used throughout the notebook
project_root = get_project_root()
config_path = get_project_path("config/config.yaml")
models_dir = get_project_path("experiments/models")
results_dir = get_project_path("experiments/results")
figures_dir = get_project_path("experiments/figures")

# Configuration first, then logging (same order as the calls are made)
config = load_config(config_path)
logger = setup_logging()

# Echo the resolved locations so a misconfigured root is visible immediately
for label, location in (
    ("Project root", project_root),
    ("Models directory", models_dir),
    ("Results directory", results_dir),
):
    print(f"{label}: {location}")
print("Configuration loaded successfully")

### 📂 Load Available Models and Results

def load_saved_models(directory=None):
    """Load every ``*.pkl`` / ``*.joblib`` file found in a directory.

    Args:
        directory: Folder to scan. When ``None`` (the default) the
            notebook-level ``models_dir`` is used, which is the behavior the
            function had before this parameter existed.

    Returns:
        Dict mapping each file's stem (file name without extension) to the
        deserialized object. Files that fail to load are reported on stdout
        and skipped. Pickle files are processed before joblib files, each
        group in sorted order, so the result order is deterministic.
    """
    # SECURITY: pickle.load / joblib.load execute arbitrary code embedded in
    # the file. Only point this at model files produced by this project.
    search_dir = Path(models_dir if directory is None else directory)
    model_files = sorted(search_dir.glob("*.pkl")) + sorted(search_dir.glob("*.joblib"))

    models = {}
    print(f"Found {len(model_files)} model files:")
    for model_file in model_files:
        print(f"  - {model_file.name}")

        try:
            # Determine loading method based on file extension
            if model_file.suffix == '.pkl':
                with open(model_file, 'rb') as f:
                    model = pickle.load(f)
            else:  # .joblib
                model = joblib.load(model_file)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan
            print(f"    ✗ Failed to load: {e}")
            continue

        model_name = model_file.stem
        if model_name in models:
            # e.g. "rf.pkl" and "rf.joblib": the later file wins, as before,
            # but the collision is no longer silent.
            print(f"    ! Replacing previously loaded model named '{model_name}'")
        models[model_name] = model
        print("    ✓ Loaded successfully")

    return models

# Load all available models
# Keys of `saved_models` are file stems; they are merged with the baseline
# models further down to form the set of models that gets backtested.
saved_models = load_saved_models()
print(f"\nTotal models loaded: {len(saved_models)}")


def load_saved_results(directory=None):
    """Load every ``*.json`` evaluation result found in a directory.

    Args:
        directory: Folder to scan. When ``None`` (the default) the
            notebook-level ``results_dir`` is used, which is the behavior the
            function had before this parameter existed.

    Returns:
        Dict mapping each file's stem to whatever ``load_results_from_json``
        returns for it. Files that fail to load are reported on stdout and
        skipped. Files are processed in sorted order so the result order is
        deterministic.
    """
    search_dir = Path(results_dir if directory is None else directory)
    result_files = sorted(search_dir.glob("*.json"))

    results = {}
    print(f"Found {len(result_files)} result files:")
    for result_file in result_files:
        print(f"  - {result_file.name}")

        try:
            result = load_results_from_json(str(result_file))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan
            print(f"    ✗ Failed to load: {e}")
            continue

        results[result_file.stem] = result
        print("    ✓ Loaded successfully")

    return results

# Load all available results
# Keys of `saved_results` are the JSON file stems found in the results folder.
saved_results = load_saved_results()
print(f"\nTotal results loaded: {len(saved_results)}")

### 📊 Data Preparation for Backtesting

In [None]:
# Load and prepare data for backtesting
tickers = get_default_tickers()  # not referenced again in the visible cells
test_ticker = config['data']['primary_ticker']

print(f"Loading data for backtesting ticker: {test_ticker}")

# Load raw data
# The per-ticker CSV path is derived from the configured primary ticker.
raw_data = load_raw_data(get_project_path(f"data/raw/{test_ticker}.csv"))
print(f"Raw data shape: {raw_data.shape}")
print(f"Date range: {raw_data.index.min()} to {raw_data.index.max()}")

# Clean data
cleaned_data = clean_data(raw_data)
print(f"Cleaned data shape: {cleaned_data.shape}")

# Create features
# The three feature groups are taken from the `features` section of the config.
features_data = create_all_features(
    cleaned_data, 
    config['features']['technical_indicators'],
    config['features']['lag_features'],
    config['features']['rolling_features']
)
print(f"Features data shape: {features_data.shape}")

# Define backtesting period (last 252 trading days = ~1 year)
# Indexing with [-252] raises IndexError when fewer than 252 feature rows exist.
# NOTE(review): with a unique, sorted index this keeps exactly 252 rows, while
# backtest_model() requires 252 rows of history before its first prediction,
# so its walk-forward loop would yield no predictions on this frame — confirm
# whether a warm-up window should be included here.
backtest_start = features_data.index[-252]
backtest_data = features_data[features_data.index >= backtest_start].copy()
print(f"Backtesting period: {backtest_start} to {features_data.index.max()}")
print(f"Backtesting data shape: {backtest_data.shape}")

### 🔄 Model Reconstruction and Backtesting

In [None]:
def create_baseline_models():
    """Build fresh instances of the reference (non-ML) predictors.

    Returns:
        Dict keyed by model name with one new ``NaiveLastValue`` and one new
        ``RandomWalkDrift`` instance.
    """
    baselines = {}
    baselines['naive_last_value'] = NaiveLastValue()
    baselines['random_walk_drift'] = RandomWalkDrift()
    return baselines

def _predict_accepts_steps(model):
    """Return True when ``model.predict`` declares a ``steps`` parameter."""
    import inspect  # local import: keeps this cell self-contained

    try:
        parameters = inspect.signature(model.predict).parameters
    except (AttributeError, TypeError, ValueError):
        # No predict attribute, or a callable without an introspectable
        # signature: fall back to the feature-matrix calling convention.
        return False
    return 'steps' in parameters


def backtest_model(model, data, target_col='close', horizon=1, min_train_size=252):
    """
    Perform walk-forward backtesting on an already fitted model.

    For every step ``i`` the model only sees rows ``[0, i)`` of ``data`` and is
    asked for the value of ``target_col`` at row ``i + horizon - 1``. The model
    is never refitted here.

    Two calling conventions are supported and selected from the signature of
    ``model.predict``:

    * ``predict(history, steps=...)`` - series-style models (the baselines).
      They receive the target history as a 1-D array; when a list/array is
      returned its last element is used.
    * ``predict(X)`` - feature-matrix models. They receive a one-row DataFrame
      holding every column except ``target_col`` from the last visible row,
      and the first element of the returned sequence is used.

    The previous ``hasattr(model, 'predict')`` check was true for both kinds,
    so the series-style branch could never run.

    Args:
        model: Trained model instance exposing ``predict``.
        data: DataFrame with features and target, ordered oldest to newest.
        target_col: Target column name.
        horizon: Prediction horizon in rows.
        min_train_size: Number of leading rows reserved as history before the
            first prediction (252 is roughly one year of trading days).

    Returns:
        DataFrame indexed by ``date`` with ``actual`` and ``predicted``
        columns. It is empty when ``data`` has fewer than
        ``min_train_size + horizon`` rows. Steps whose prediction raises are
        reported on stdout and skipped.

    Raises:
        ValueError: If ``min_train_size`` is smaller than 1.
    """
    if min_train_size < 1:
        # With no history the "last visible row" would not exist.
        raise ValueError("min_train_size must be at least 1")

    uses_series_api = _predict_accepts_steps(model)

    # Loop-invariant views, built once instead of on every step
    feature_frame = data[[col for col in data.columns if col != target_col]]
    target_series = data[target_col]

    predictions = []
    actual_values = []
    dates = []

    for i in range(min_train_size, len(data) - horizon + 1):
        target_pos = i + horizon - 1
        target_date = data.index[target_pos]
        target_value = target_series.iloc[target_pos]

        try:
            if uses_series_api:
                # Baseline-style models forecast from the target history alone
                pred = model.predict(target_series.iloc[:i].values, steps=horizon)
                if isinstance(pred, (list, np.ndarray)):
                    pred = pred[-1]
            else:
                # sklearn-style models: features of the last visible row
                X = feature_frame.iloc[[i - 1]]
                pred = model.predict(X)[0]
        except Exception as e:
            # Best effort: keep going so one bad step does not void the run
            print(f"Error predicting for date {target_date}: {e}")
            continue

        predictions.append(pred)
        actual_values.append(target_value)
        dates.append(target_date)

    # Create results DataFrame
    results_df = pd.DataFrame({
        'date': dates,
        'actual': actual_values,
        'predicted': predictions
    })
    results_df.set_index('date', inplace=True)

    return results_df

# Baselines first, then the persisted models; a persisted model that shares a
# name with a baseline replaces it.
baseline_models = create_baseline_models()
all_models = dict(baseline_models)
all_models.update(saved_models)

print(f"Total models for backtesting: {len(all_models)}")
for name in all_models:
    print(f"  - {name}")

### 📈 Comprehensive Backtesting

In [None]:
# Split the backtest frame into the target and the columns passed to the scaler
target_column = 'close'
feature_columns = [
    col for col in backtest_data.columns
    if col not in ('close', 'target_1d', 'target_5d')
]

print(f"Using {len(feature_columns)} features for backtesting")
print(f"Target column: {target_column}")

# Scaled copy of the features with the unscaled close price appended
scaled_features = scale_features(backtest_data[feature_columns])
backtest_data_scaled = pd.concat([scaled_features, backtest_data[['close']]], axis=1)

# Per-model outputs: prediction frames and summary metrics
backtest_results = {}
model_performance = {}

print("Starting comprehensive backtesting...")
print("=" * 50)

for model_name, model in all_models.items():
    print(f"\n🔄 Backtesting {model_name}...")

    try:
        # The two baselines get the unscaled frame; everything else the scaled one
        is_baseline = model_name in ('naive_last_value', 'random_walk_drift')
        data_for_model = backtest_data if is_baseline else backtest_data_scaled

        results = backtest_model(model, data_for_model, target_column, horizon=1)

        if len(results) == 0:
            print("  ✗ No valid predictions generated")
            continue

        backtest_results[model_name] = results

        # Summary metrics over every prediction of this model
        actual = results['actual'].values
        predicted = results['predicted'].values

        metrics = {
            'rmse': calculate_rmse(actual, predicted),
            'mae': calculate_mae(actual, predicted),
            'mape': calculate_mape(actual, predicted),
            'r2': calculate_r2(actual, predicted),
            'num_predictions': len(results),
        }
        model_performance[model_name] = metrics

        print(f"  ✓ Completed: {len(results)} predictions")
        print(f"    RMSE: {metrics['rmse']:.4f}")
        print(f"    MAE: {metrics['mae']:.4f}")
        print(f"    R²: {metrics['r2']:.4f}")

    except Exception as e:
        # A failing model is reported and skipped; the remaining ones still run
        print(f"  ✗ Error: {e}")

print(f"\n✅ Backtesting completed for {len(backtest_results)} models")

### 📊 Performance Comparison and Visualization

In [None]:
# Tabulate the metrics (one row per model) and persist the ranking
if model_performance:
    performance_df = pd.DataFrame(model_performance).T.round(4)

    print("🏆 Model Performance Ranking")
    print("=" * 40)

    # Highest R² first; ties are broken by the lower RMSE
    performance_df_sorted = performance_df.sort_values(
        by=['r2', 'rmse'],
        ascending=[False, True],
    )
    print(performance_df_sorted)

    # Payload written to disk
    backtest_period = {
        'start': str(backtest_data.index.min()),
        'end': str(backtest_data.index.max()),
        'num_days': len(backtest_data),
    }
    performance_results = {
        'backtest_period': backtest_period,
        'model_performance': performance_df.to_dict(),
        'ranking': performance_df_sorted.index.tolist(),
    }

    save_results_to_json(
        performance_results,
        get_project_path("experiments/results/backtest_performance.json"),
    )
    print("\n💾 Performance results saved")


# One bar chart per metric, arranged on a 2x2 grid
if len(model_performance) > 1:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('📊 Model Performance Comparison - Backtesting Results', fontsize=16, fontweight='bold')

    metrics = ['rmse', 'mae', 'mape', 'r2']
    metric_names = ['RMSE', 'MAE', 'MAPE (%)', 'R²']

    # The model order is the same for every panel
    models = list(model_performance.keys())

    for idx, (metric, name) in enumerate(zip(metrics, metric_names)):
        row, col = divmod(idx, 2)
        ax = axes[row, col]

        values = [model_performance[model][metric] for model in models]

        bars = ax.bar(models, values)
        ax.set_title(f'{name} Comparison', fontweight='bold')
        ax.set_ylabel(name)

        # Rotated tick labels keep long model names readable
        ax.tick_params(axis='x', rotation=45)

        # Numeric label just beyond the end of each bar
        for bar, value in zip(bars, values):
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width()/2.,
                height + height*0.01,
                f'{value:.3f}',
                ha='center',
                va='bottom',
                fontweight='bold',
            )

        # Highlight the winner: largest value for R², smallest for error metrics
        best_idx = np.argmax(values) if metric == 'r2' else np.argmin(values)
        bars[best_idx].set_color('gold')

    plt.tight_layout()
    plt.savefig(
        get_project_path("experiments/figures/backtest_performance_comparison.png"),
        dpi=300,
        bbox_inches='tight',
    )
    plt.show()

### 📈 Prediction vs Actual Visualizations

In [None]:
# Plot prediction vs actual for top 3 models
# This cell assigns `top_models` (the first three rows of the R²/RMSE ranking);
# later cells reference that name as well.
if backtest_results:
    top_models = performance_df_sorted.head(3).index.tolist()
    
    # One stacked panel per model
    fig, axes = plt.subplots(len(top_models), 1, figsize=(15, 5 * len(top_models)))
    # With a single panel plt.subplots returns a bare Axes; wrap it so the
    # indexing below works uniformly.
    if len(top_models) == 1:
        axes = [axes]
    
    fig.suptitle('🎯 Top Models: Predictions vs Actual Prices', fontsize=16, fontweight='bold')
    
    for idx, model_name in enumerate(top_models):
        ax = axes[idx]
        results = backtest_results[model_name]
        
        # Plot actual vs predicted
        ax.plot(results.index, results['actual'], label='Actual', color='blue', linewidth=2)
        ax.plot(results.index, results['predicted'], label='Predicted', color='red', linewidth=1, alpha=0.8)
        
        # Add performance metrics to title
        metrics = model_performance[model_name]
        title = f"{model_name} | R²: {metrics['r2']:.3f} | RMSE: {metrics['rmse']:.3f} | MAE: {metrics['mae']:.3f}"
        ax.set_title(title, fontweight='bold')
        
        ax.set_ylabel('Price ($)')
        ax.legend()
        ax.grid(True, alpha=0.3)
        
        # Format x-axis
        ax.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    # The figure is written to disk before being displayed
    plt.savefig(
        get_project_path("experiments/figures/top_models_predictions.png"),
        dpi=300,
        bbox_inches='tight'
    )
    plt.show()



### 💰 Trading Strategy Evaluation

In [None]:
def _annualized_sharpe(returns, periods_per_year=252):
    """Annualized mean/volatility ratio; 0.0 when volatility is zero or undefined."""
    volatility = returns.std()
    if not np.isfinite(volatility) or volatility == 0:
        # A strategy that never trades (or a single observation) has no
        # dispersion; report 0.0 instead of propagating NaN/inf.
        return 0.0
    return (returns.mean() * periods_per_year) / (volatility * np.sqrt(periods_per_year))


def _max_drawdown_pct(values):
    """Largest peak-to-trough decline of a value series, in percent (<= 0)."""
    running_peak = values.expanding().max()
    return ((values - running_peak) / running_peak * 100).min()


def evaluate_trading_performance(results_df, initial_capital=10000):
    """
    Evaluate a simple long/short strategy driven by prediction signals.

    The signal on each row is +1 when the predicted price rose by more than
    0.1% versus the previous prediction, -1 when it fell by more than 0.1%,
    and 0 otherwise. A signal is applied to the *next* row's actual return.
    No transaction costs are modelled.

    Both equity curves start at ``initial_capital`` on the first row, so the
    buy-and-hold drawdown also measures declines from the starting capital
    (previously the first buy-and-hold value was NaN and such a decline was
    ignored).

    Args:
        results_df: DataFrame with ``actual`` and ``predicted`` price columns,
            ordered oldest to newest. Must contain at least one row.
        initial_capital: Starting capital

    Returns:
        Dictionary with trading performance metrics. Returns and drawdowns are
        percentages; ``total_trades`` is a plain ``int`` counting rows with a
        non-zero strategy return; ``portfolio_data`` is the working DataFrame
        with all intermediate columns.
    """
    df = results_df.copy()

    # Calculate returns
    df['actual_return'] = df['actual'].pct_change()
    df['predicted_return'] = df['predicted'].pct_change()

    # Generate signals (1 for buy, -1 for sell, 0 for hold)
    df['signal'] = np.where(
        df['predicted_return'] > 0.001, 1,   # Buy if predicted return > 0.1%
        np.where(df['predicted_return'] < -0.001, -1, 0)  # Sell if predicted return < -0.1%
    )

    # Yesterday's signal earns today's return; undefined rows count as flat
    df['strategy_return'] = (df['signal'].shift(1) * df['actual_return']).fillna(0)

    # Cumulative growth factors. The first actual return is undefined; treat
    # it as 0 so that both curves start at exactly 1.0.
    df['actual_cumulative'] = (1 + df['actual_return'].fillna(0)).cumprod()
    df['strategy_cumulative'] = (1 + df['strategy_return']).cumprod()

    # Calculate portfolio values
    df['buy_hold_value'] = initial_capital * df['actual_cumulative']
    df['strategy_value'] = initial_capital * df['strategy_cumulative']

    # Performance metrics
    total_return_bh = (df['buy_hold_value'].iloc[-1] / initial_capital - 1) * 100
    total_return_strategy = (df['strategy_value'].iloc[-1] / initial_capital - 1) * 100

    # Sharpe ratios (assuming 252 trading days per year, zero risk-free rate)
    sharpe_bh = _annualized_sharpe(df['actual_return'])
    sharpe_strategy = _annualized_sharpe(df['strategy_return'])

    # Maximum drawdown
    max_drawdown_bh = _max_drawdown_pct(df['buy_hold_value'])
    max_drawdown_strategy = _max_drawdown_pct(df['strategy_value'])

    # Win rate; cast to int so the counts are JSON-serializable
    winning_trades = int((df['strategy_return'] > 0).sum())
    total_trades = int((df['strategy_return'] != 0).sum())
    win_rate = (winning_trades / total_trades * 100) if total_trades > 0 else 0

    return {
        'total_return_buy_hold': total_return_bh,
        'total_return_strategy': total_return_strategy,
        'excess_return': total_return_strategy - total_return_bh,
        'sharpe_ratio_buy_hold': sharpe_bh,
        'sharpe_ratio_strategy': sharpe_strategy,
        'max_drawdown_buy_hold': max_drawdown_bh,
        'max_drawdown_strategy': max_drawdown_strategy,
        'win_rate': win_rate,
        'total_trades': total_trades,
        'final_value_buy_hold': df['buy_hold_value'].iloc[-1],
        'final_value_strategy': df['strategy_value'].iloc[-1],
        'portfolio_data': df
    }

# Evaluate trading performance for top models
trading_performance = {}

# Derive the ranking here rather than relying on the plotting cell: that cell
# only assigns `top_models` when `backtest_results` is non-empty, so an empty
# backtest would otherwise end in a NameError on the loop below. When models
# were evaluated this yields the same top-3 list.
top_models = performance_df_sorted.head(3).index.tolist() if model_performance else []

print("💰 Trading Strategy Evaluation")
print("=" * 40)

for model_name in top_models:
    print(f"\n📊 Evaluating {model_name}...")
    
    results = backtest_results[model_name]
    trading_metrics = evaluate_trading_performance(results, initial_capital=10000)
    trading_performance[model_name] = trading_metrics
    
    print(f"  Buy & Hold Return: {trading_metrics['total_return_buy_hold']:.2f}%")
    print(f"  Strategy Return: {trading_metrics['total_return_strategy']:.2f}%")
    print(f"  Excess Return: {trading_metrics['excess_return']:.2f}%")
    print(f"  Sharpe Ratio (Strategy): {trading_metrics['sharpe_ratio_strategy']:.3f}")
    print(f"  Max Drawdown: {trading_metrics['max_drawdown_strategy']:.2f}%")
    print(f"  Win Rate: {trading_metrics['win_rate']:.1f}%")
    print(f"  Total Trades: {trading_metrics['total_trades']}")

# Visualize trading performance
# Four panels: equity curves, total returns, Sharpe ratios and max drawdowns.
# Skipped entirely when no model produced trading metrics.
if trading_performance:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('💰 Trading Strategy Performance Analysis', fontsize=16, fontweight='bold')
    
    # 1. Cumulative returns comparison
    # Buy & hold is drawn once per model (dashed) next to that model's strategy curve.
    ax1 = axes[0, 0]
    for model_name in top_models:
        portfolio_data = trading_performance[model_name]['portfolio_data']
        ax1.plot(
            portfolio_data.index, portfolio_data['buy_hold_value'],
            label=f'{model_name} - Buy & Hold', linestyle='--', alpha=0.7
        )
        ax1.plot(
            portfolio_data.index, portfolio_data['strategy_value'],
            label=f'{model_name} - Strategy', linewidth=2
        )
    
    ax1.set_title('Portfolio Value Over Time', fontweight='bold')
    ax1.set_ylabel('Portfolio Value ($)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # 2. Total returns comparison
    ax2 = axes[0, 1]
    models = list(trading_performance.keys())
    bh_returns = [trading_performance[m]['total_return_buy_hold'] for m in models]
    strategy_returns = [trading_performance[m]['total_return_strategy'] for m in models]
    
    # Grouped bars: `x` holds the group centres, `width` the width of one bar;
    # both are reused by panels 3 and 4.
    x = np.arange(len(models))
    width = 0.35
    
    bars1 = ax2.bar(x - width / 2, bh_returns, width, label='Buy & Hold', alpha=0.7)
    bars2 = ax2.bar(x + width / 2, strategy_returns, width, label='Strategy')
    
    ax2.set_title('Total Returns Comparison', fontweight='bold')
    ax2.set_ylabel('Total Return (%)')
    ax2.set_xticks(x)
    ax2.set_xticklabels(models, rotation=45)
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # Add value labels
    # Labels sit a fixed 0.5 above the bar height (for negative bars that is
    # still inside the bar).
    for bar, value in zip(bars1, bh_returns):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
                 f'{value:.1f}%', ha='center', va='bottom', fontsize=9)
    
    for bar, value in zip(bars2, strategy_returns):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width() / 2., height + 0.5,
                 f'{value:.1f}%', ha='center', va='bottom', fontsize=9)
    
    # 3. Risk-adjusted returns (Sharpe ratio)
    ax3 = axes[1, 0]
    sharpe_bh = [trading_performance[m]['sharpe_ratio_buy_hold'] for m in models]
    sharpe_strategy = [trading_performance[m]['sharpe_ratio_strategy'] for m in models]
    
    # bars3/bars4 are not used after creation in this cell
    bars3 = ax3.bar(x - width / 2, sharpe_bh, width, label='Buy & Hold', alpha=0.7)
    bars4 = ax3.bar(x + width / 2, sharpe_strategy, width, label='Strategy')
    
    ax3.set_title('Sharpe Ratio Comparison', fontweight='bold')
    ax3.set_ylabel('Sharpe Ratio')
    ax3.set_xticks(x)
    ax3.set_xticklabels(models, rotation=45)
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    
    # 4. Maximum drawdown
    # Drawdowns are stored as non-positive percentages; abs() plots their magnitude.
    ax4 = axes[1, 1]
    drawdown_bh = [abs(trading_performance[m]['max_drawdown_buy_hold']) for m in models]
    drawdown_strategy = [abs(trading_performance[m]['max_drawdown_strategy']) for m in models]
    
    # bars5/bars6 are not used after creation in this cell
    bars5 = ax4.bar(x - width / 2, drawdown_bh, width, label='Buy & Hold', alpha=0.7)
    bars6 = ax4.bar(x + width / 2, drawdown_strategy, width, label='Strategy')
    
    ax4.set_title('Maximum Drawdown Comparison', fontweight='bold')
    ax4.set_ylabel('Maximum Drawdown (%)')
    ax4.set_xticks(x)
    ax4.set_xticklabels(models, rotation=45)
    ax4.legend()
    ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    # The figure is written to disk before being displayed
    plt.savefig(
        get_project_path("experiments/figures/trading_performance_analysis.png"),
        dpi=300,
        bbox_inches='tight'
    )
    plt.show()



### 📊 Statistical Significance Testing

In [None]:
# Pairwise significance tests on the absolute errors of the top models
print("🔬 Statistical Significance Testing")
print("=" * 40)

if len(backtest_results) >= 2:
    model_names = list(backtest_results.keys())

    # Every unordered pair (i < j) of the top-ranked models
    for i, model1 in enumerate(top_models):
        for j in range(i + 1, len(top_models)):
            model2 = top_models[j]

            if model1 not in backtest_results or model2 not in backtest_results:
                continue

            results1 = backtest_results[model1]
            results2 = backtest_results[model2]

            # Only dates predicted by both models can be compared
            common_dates = results1.index.intersection(results2.index)
            if len(common_dates) <= 10:  # Need minimum samples
                continue

            actual1 = results1.loc[common_dates, 'actual']
            pred1 = results1.loc[common_dates, 'predicted']
            pred2 = results2.loc[common_dates, 'predicted']

            # Absolute errors of both models against the same actual series
            errors1 = np.abs(actual1 - pred1)
            errors2 = np.abs(actual1 - pred2)

            stat_result = statistical_significance_test(errors1.values, errors2.values)

            print(f"\n{model1} vs {model2}:")
            print(f"  Mean MAE {model1}: {errors1.mean():.4f}")
            print(f"  Mean MAE {model2}: {errors2.mean():.4f}")
            print(f"  P-value: {stat_result['p_value']:.4f}")
            print(f"  Significant difference: {stat_result['is_significant']}")
            if stat_result['is_significant']:
                # The model with the smaller mean absolute error wins
                better_model = model1 if errors1.mean() < errors2.mean() else model2
                print(f"  Better model: {better_model}")


### 📋 Final Performance Report

In [None]:
# Assemble the final report: run metadata, accuracy metrics and the trading
# metrics with the per-model portfolio DataFrame left out.
report_data = {
    'evaluation_metadata': {
        'evaluation_date': datetime.now().isoformat(),
        'backtest_period_start': str(backtest_data.index.min()),
        'backtest_period_end': str(backtest_data.index.max()),
        'backtest_days': len(backtest_data),
        'ticker_evaluated': test_ticker,
        'models_evaluated': len(all_models),
    },
    'model_performance': model_performance,
    'trading_performance': {
        model_name: {k: v for k, v in perf.items() if k != 'portfolio_data'}
        for model_name, perf in trading_performance.items()
    },
}

# Persist the report
report_path = get_project_path("experiments/results/final_evaluation_report.json")
save_results_to_json(report_data, report_path)

# Console summary
print("📋 Final Performance Report")
print("=" * 50)
print(f"Evaluation Date: {report_data['evaluation_metadata']['evaluation_date']}")
print(f"Backtest Period: {report_data['evaluation_metadata']['backtest_period_start']} to {report_data['evaluation_metadata']['backtest_period_end']}")
print(f"Models Evaluated: {report_data['evaluation_metadata']['models_evaluated']}")

if model_performance:
    print("\n🏆 Top 3 Models by R² Score:")
    for i, model in enumerate(performance_df_sorted.head(3).index, start=1):
        metrics = model_performance[model]
        print(f"  {i}. {model}")
        print(f"     R²: {metrics['r2']:.4f} | RMSE: {metrics['rmse']:.4f} | MAE: {metrics['mae']:.4f}")

if trading_performance:
    print("\n💰 Trading Performance Summary:")
    for model in top_models:
        if model not in trading_performance:
            continue
        perf = trading_performance[model]
        print(f"  {model}:")
        print(f"    Total Return: {perf['total_return_strategy']:.2f}% vs Buy&Hold: {perf['total_return_buy_hold']:.2f}%")
        print(f"    Excess Return: {perf['excess_return']:.2f}%")
        print(f"    Sharpe Ratio: {perf['sharpe_ratio_strategy']:.3f}")
        print(f"    Max Drawdown: {perf['max_drawdown_strategy']:.2f}%")
        print(f"    Win Rate: {perf['win_rate']:.1f}%")

print(f"\n💾 Comprehensive report saved to: {report_path}")


### 🎯 Model Selection Recommendations

In [None]:
print("🎯 Model Selection Recommendations")
print("=" * 50)

if model_performance:
    # Rank 1 of the R² (desc) / RMSE (asc) ordering
    best_overall = performance_df_sorted.index[0]
    best_metrics = model_performance[best_overall]
    
    print(f"\n🥇 BEST OVERALL MODEL: {best_overall}")
    print(f"   Prediction Accuracy (R²): {best_metrics['r2']:.4f}")
    print(f"   Error Metrics - RMSE: {best_metrics['rmse']:.4f}, MAE: {best_metrics['mae']:.4f}")
    print(f"   MAPE: {best_metrics['mape']:.2f}%")
    
    if best_overall in trading_performance:
        trading_metrics = trading_performance[best_overall]
        print(f"   Trading Performance:")
        print(f"     - Strategy Return: {trading_metrics['total_return_strategy']:.2f}%")
        print(f"     - Excess over Buy&Hold: {trading_metrics['excess_return']:.2f}%")
        print(f"     - Risk-Adjusted Return (Sharpe): {trading_metrics['sharpe_ratio_strategy']:.3f}")
    
    # Model-specific recommendations
    print(f"\n📊 DETAILED ANALYSIS:")
    
    # Identify best performers in different categories
    best_accuracy = performance_df_sorted.index[0]  # Highest R²
    best_precision = performance_df.sort_values('rmse').index[0]  # Lowest RMSE
    
    print(f"   🎯 Most Accurate Predictions: {best_accuracy}")
    print(f"   🔍 Most Precise (Low RMSE): {best_precision}")
    
    if trading_performance:
        # Find best trading performer
        trading_returns = {model: perf['total_return_strategy'] 
                          for model, perf in trading_performance.items()}
        best_trading = max(trading_returns, key=trading_returns.get)
        
        best_sharpe = max(trading_performance, 
                         key=lambda x: trading_performance[x]['sharpe_ratio_strategy'])
        
        print(f"   💰 Best Trading Returns: {best_trading} ({trading_returns[best_trading]:.2f}%)")
        print(f"   ⚖️ Best Risk-Adjusted: {best_sharpe}")
    
    # Usage recommendations
    print(f"\n📋 USAGE RECOMMENDATIONS:")
    print(f"   • For Maximum Accuracy: Use {best_overall}")
    print(f"   • For Live Trading: Consider risk management with {best_overall}")
    print(f"   • For Research: Compare top 3 models for ensemble approaches")
    
    # Model insights
    # A separate name is used for the baseline identifiers so the module-level
    # `baseline_models` dict of model instances is not overwritten by a list.
    baseline_names = ('naive_last_value', 'random_walk_drift')
    ml_models = [m for m in model_performance if m not in baseline_names]
    # Only baselines that actually produced metrics can be compared; max() on
    # an empty sequence would raise ValueError.
    evaluated_baselines = [m for m in baseline_names if m in model_performance]
    
    if ml_models and evaluated_baselines:
        best_ml = max(ml_models, key=lambda x: model_performance[x]['r2'])
        best_baseline = max(evaluated_baselines, key=lambda x: model_performance[x]['r2'])
        
        ml_r2 = model_performance[best_ml]['r2']
        baseline_r2 = model_performance[best_baseline]['r2']
        
        print(f"\n🔬 ML vs BASELINE COMPARISON:")
        print(f"   Best ML Model: {best_ml} (R²: {ml_r2:.4f})")
        print(f"   Best Baseline: {best_baseline} (R²: {baseline_r2:.4f})")
        if baseline_r2 != 0:
            # Relative R² change versus the best baseline, in percent
            improvement = ((ml_r2 - baseline_r2) / abs(baseline_r2)) * 100
            print(f"   ML Improvement: {improvement:.1f}%")
        else:
            # A relative change against a zero baseline is undefined
            print("   ML Improvement: n/a (baseline R² is 0)")


### ⚠️ Risk Assessment and Limitations

In [None]:
print("⚠️ Risk Assessment and Limitations")
print("=" * 50)

# Volatility and error statistics for the two best-ranked models
if backtest_results:
    print("\n📊 RISK ANALYSIS:")

    for model_name in top_models[:2]:  # Top 2 models
        results = backtest_results[model_name]

        # Day-over-day changes of the predicted and the actual series
        pred_returns = results['predicted'].pct_change().dropna()
        actual_returns = results['actual'].pct_change().dropna()

        # Annualized (252 trading days) and expressed in percent
        pred_volatility = pred_returns.std() * np.sqrt(252) * 100
        actual_volatility = actual_returns.std() * np.sqrt(252) * 100

        # Signed prediction errors
        errors = results['actual'] - results['predicted']
        error_volatility = errors.std()
        max_error = errors.abs().max()

        print(f"\n  {model_name}:")
        print(f"    Prediction Volatility: {pred_volatility:.2f}% (Actual: {actual_volatility:.2f}%)")
        print(f"    Maximum Single Error: ${max_error:.2f}")
        print(f"    Error Standard Deviation: ${error_volatility:.2f}")

        # Trading risk metrics, when this model was part of the trading evaluation
        if model_name not in trading_performance:
            continue
        trading_perf = trading_performance[model_name]
        print(f"    Maximum Drawdown: {trading_perf['max_drawdown_strategy']:.2f}%")
        print(f"    Trade Success Rate: {trading_perf['win_rate']:.1f}%")

# Static caveats and mitigations, printed line by line
for line in (
    "\n⚠️ KEY LIMITATIONS:",
    "   1. Backtest Period: Limited to recent historical data",
    "   2. Market Conditions: Performance may vary in different market regimes",
    "   3. Transaction Costs: Real trading costs not included in analysis",
    "   4. Liquidity: Assumes perfect execution at predicted prices",
    "   5. Model Stability: Performance may degrade over time without retraining",
    "   6. Overfitting Risk: Models may not generalize to unseen market conditions",
    "\n🛡️ RISK MITIGATION STRATEGIES:",
    "   • Regular model retraining and validation",
    "   • Position sizing based on prediction confidence",
    "   • Stop-loss mechanisms for risk management",
    "   • Diversification across multiple models/strategies",
    "   • Real-time monitoring of model performance",
):
    print(line)


### 📈 Advanced Analysis and Insights

In [None]:
# Error analysis by time periods and market conditions
print("📈 Advanced Analysis and Insights")
print("=" * 50)

if backtest_results:
    # Analyze performance in different market conditions
    best_model = performance_df_sorted.index[0]
    results = backtest_results[best_model]
    
    # Calculate market conditions
    results_analysis = results.copy()
    results_analysis['actual_return'] = results_analysis['actual'].pct_change()
    # 20-row rolling statistics; they are NaN until the window is full
    results_analysis['volatility'] = results_analysis['actual_return'].rolling(20).std()
    results_analysis['trend'] = results_analysis['actual'].rolling(20).mean()
    results_analysis['error'] = abs(results_analysis['actual'] - results_analysis['predicted'])
    results_analysis['error_pct'] = results_analysis['error'] / results_analysis['actual'] * 100
    
    # Market regime classification: the top 30% of rolling volatility is "high"
    volatility_threshold = results_analysis['volatility'].quantile(0.7)
    results_analysis['market_regime'] = np.where(
        results_analysis['volatility'] > volatility_threshold, 'High_Volatility', 'Low_Volatility'
    )
    # A NaN volatility compares False against the threshold, which used to
    # file the warm-up rows under 'Low_Volatility'. Label them separately so
    # they do not distort the low-volatility statistics.
    results_analysis.loc[results_analysis['volatility'].isna(), 'market_regime'] = 'Undefined'
    
    print(f"\n🔍 PERFORMANCE BY MARKET CONDITIONS ({best_model}):")
    
    for regime in ['Low_Volatility', 'High_Volatility']:
        regime_data = results_analysis[results_analysis['market_regime'] == regime]
        # Regimes with 10 or fewer rows are not reported
        if len(regime_data) > 10:
            avg_error = regime_data['error'].mean()
            avg_error_pct = regime_data['error_pct'].mean()
            r2_regime = calculate_r2(regime_data['actual'].values, regime_data['predicted'].values)
            
            print(f"  {regime.replace('_', ' ')} Markets:")
            print(f"    Average Error: ${avg_error:.2f} ({avg_error_pct:.2f}%)")
            print(f"    R² Score: {r2_regime:.4f}")
            print(f"    Data Points: {len(regime_data)}")

    # Time-based performance analysis
    print(f"\n📅 PERFORMANCE BY TIME PERIODS:")
    
    # Monthly performance (calendar month of the index, 1-12)
    results_analysis['month'] = results_analysis.index.month
    monthly_performance = results_analysis.groupby('month').agg({
        'error': 'mean',
        'error_pct': 'mean'
    })
    
    best_month = monthly_performance['error'].idxmin()
    worst_month = monthly_performance['error'].idxmax()
    
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    
    print(f"    Best Performance Month: {month_names[best_month-1]} (Error: ${monthly_performance.loc[best_month, 'error']:.2f})")
    print(f"    Worst Performance Month: {month_names[worst_month-1]} (Error: ${monthly_performance.loc[worst_month, 'error']:.2f})")


### 📊 Final Visualization Dashboard

In [None]:
# Create comprehensive dashboard.
#
# Builds one 4x4-grid figure with: (1) an R² ranking bar chart, (2) actual vs
# predicted prices for the top-ranked model, (3) excess returns and (4) a
# drawdown/Sharpe scatter when trading results exist, and (5) a summary table of
# the first five models in performance_df_sorted. The figure is saved as a PNG
# under experiments/figures and then shown.
if backtest_results and model_performance:
    fig = plt.figure(figsize=(20, 16))
    fig.suptitle('📊 Comprehensive Model Evaluation Dashboard', fontsize=20, fontweight='bold')

    # Create grid layout
    gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)

    # 1. Model Performance Comparison (2x2)
    ax1 = fig.add_subplot(gs[0:2, 0:2])

    models = list(model_performance.keys())
    r2_scores = [model_performance[m]['r2'] for m in models]
    colors = plt.cm.viridis(np.linspace(0, 1, len(models)))
    # Colour lookup by model name, so panels that show a different subset or
    # ordering of models still paint each model with the colour of its bar.
    model_colors = dict(zip(models, colors))

    bars = ax1.barh(models, r2_scores, color=colors)
    ax1.set_xlabel('R² Score')
    ax1.set_title('Model Performance Ranking (R² Score)', fontweight='bold', fontsize=14)
    ax1.grid(True, alpha=0.3)

    # Add value labels just to the right of each bar end
    for bar, value in zip(bars, r2_scores):
        width = bar.get_width()
        ax1.text(width + 0.01, bar.get_y() + bar.get_height()/2,
                f'{value:.3f}', ha='left', va='center', fontweight='bold')

    # 2. Best Model Predictions (2x2)
    ax2 = fig.add_subplot(gs[0:2, 2:4])

    best_model = performance_df_sorted.index[0]
    best_results = backtest_results[best_model]

    # Plot only the last 60 rows for clarity
    recent_data = best_results.tail(60)
    ax2.plot(recent_data.index, recent_data['actual'], 'b-', label='Actual', linewidth=2)
    ax2.plot(recent_data.index, recent_data['predicted'], 'r--', label='Predicted', linewidth=2)

    ax2.set_title(f'Best Model Predictions: {best_model}', fontweight='bold', fontsize=14)
    ax2.set_ylabel('Price ($)')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    ax2.tick_params(axis='x', rotation=45)

    # Panels 3 and 4 both need trading results; they are created only when
    # such results exist so no empty axes is left in the figure.
    if trading_performance:
        # 3. Trading Performance (1x2)
        ax3 = fig.add_subplot(gs[2, 0:2])

        trading_models = list(trading_performance.keys())
        excess_returns = [trading_performance[m]['excess_return'] for m in trading_models]

        bars = ax3.bar(trading_models, excess_returns,
                      color=['green' if x > 0 else 'red' for x in excess_returns])
        ax3.set_title('Excess Returns vs Buy & Hold', fontweight='bold', fontsize=14)
        ax3.set_ylabel('Excess Return (%)')
        ax3.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        ax3.tick_params(axis='x', rotation=45)
        ax3.grid(True, alpha=0.3)

        # Add value labels: above positive bars, below non-positive ones
        for bar, value in zip(bars, excess_returns):
            height = bar.get_height()
            ax3.text(bar.get_x() + bar.get_width()/2., height + (0.5 if height > 0 else -0.5),
                    f'{value:.1f}%', ha='center', va='bottom' if height > 0 else 'top',
                    fontweight='bold')

        # 4. Risk Metrics (1x2)
        ax4 = fig.add_subplot(gs[2, 2:4])

        models_risk = list(trading_performance.keys())
        drawdowns = [abs(trading_performance[m]['max_drawdown_strategy']) for m in models_risk]
        sharpe_ratios = [trading_performance[m]['sharpe_ratio_strategy'] for m in models_risk]
        # Look colours up by name rather than slicing `colors` by position:
        # positional slicing mismatches models whenever the two dicts differ in
        # order or membership, and fails outright if there are more trading
        # entries than evaluated models. Unknown models fall back to mid-grey.
        risk_colors = [model_colors.get(m, (0.5, 0.5, 0.5, 1.0)) for m in models_risk]

        # Scatter plot: Drawdown vs Sharpe Ratio
        ax4.scatter(drawdowns, sharpe_ratios, s=100, alpha=0.7, c=risk_colors)
        ax4.set_xlabel('Maximum Drawdown (%)')
        ax4.set_ylabel('Sharpe Ratio')
        ax4.set_title('Risk vs Return Profile', fontweight='bold', fontsize=14)
        ax4.grid(True, alpha=0.3)

        # Add model labels
        for i, model in enumerate(models_risk):
            ax4.annotate(model, (drawdowns[i], sharpe_ratios[i]),
                        xytext=(5, 5), textcoords='offset points', fontsize=9)

    # 5. Performance Summary Table (1x4)
    ax5 = fig.add_subplot(gs[3, :])
    ax5.axis('tight')
    ax5.axis('off')

    # Create summary table: one row per model, trading columns filled with '-'
    # when the model has no trading results.
    summary_data = []
    for model in performance_df_sorted.head(5).index:
        row = [
            model,
            f"{model_performance[model]['r2']:.4f}",
            f"{model_performance[model]['rmse']:.3f}",
            f"{model_performance[model]['mae']:.3f}",
            f"{model_performance[model]['mape']:.1f}%"
        ]

        if model in trading_performance:
            row.extend([
                f"{trading_performance[model]['total_return_strategy']:.1f}%",
                f"{trading_performance[model]['excess_return']:.1f}%",
                f"{trading_performance[model]['sharpe_ratio_strategy']:.2f}"
            ])
        else:
            row.extend(['-', '-', '-'])

        summary_data.append(row)

    columns = ['Model', 'R²', 'RMSE', 'MAE', 'MAPE', 'Strategy Return', 'Excess Return', 'Sharpe Ratio']

    table = ax5.table(cellText=summary_data, colLabels=columns, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.5)

    # Style the table: row 0 is the header row
    for i in range(len(columns)):
        table[(0, i)].set_facecolor('#4CAF50')
        table[(0, i)].set_text_props(weight='bold', color='white')

    # Highlight best model row (row 1 = first data row = top-ranked model)
    for i in range(len(columns)):
        table[(1, i)].set_facecolor('#E8F5E8')

    ax5.set_title('Performance Summary - Top 5 Models', fontweight='bold', fontsize=14, pad=20)

    # Make sure the target directory exists; savefig does not create it.
    # Path() accepts both str and Path, whichever get_project_path returns.
    dashboard_path = get_project_path("experiments/figures/comprehensive_evaluation_dashboard.png")
    Path(dashboard_path).parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(dashboard_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.show()


### 🔚 Final Summary and Next Steps

In [None]:
# Final wrap-up: evaluation counts, the champion model's headline metrics,
# output locations and deployment recommendations.
print("🔚 Final Summary and Next Steps")
print("=" * 60)

# Resolve the champion (top-ranked model) once. It stays None when no model
# was evaluated, so the deployment section further down cannot fail on an
# unguarded performance_df_sorted lookup.
champion_model = performance_df_sorted.index[0] if model_performance else None

if champion_model is not None:
    total_models = len(model_performance)
    successful_models = len([m for m in model_performance.values() if m['r2'] > 0])

    print("✅ EVALUATION COMPLETED SUCCESSFULLY")
    print(f"   • Total Models Evaluated: {total_models}")
    print(f"   • Models with Positive R²: {successful_models}")
    print(f"   • Backtest Period: {len(backtest_data)} days")

    if trading_performance:
        profitable_strategies = len([m for m in trading_performance.values() if m['total_return_strategy'] > 0])
        print(f"   • Profitable Trading Strategies: {profitable_strategies}/{len(trading_performance)}")

    print(f"\n🏆 CHAMPION MODEL: {champion_model}")
    champion_metrics = model_performance[champion_model]
    print(f"   • Prediction Accuracy (R²): {champion_metrics['r2']:.4f}")
    print(f"   • Mean Absolute Error: ${champion_metrics['mae']:.2f}")

    # Trading figures exist only for models that went through the trading backtest.
    if champion_model in trading_performance:
        champion_trading = trading_performance[champion_model]
        print(f"   • Trading Return: {champion_trading['total_return_strategy']:.2f}%")
        print(f"   • Risk-Adjusted Return: {champion_trading['sharpe_ratio_strategy']:.3f}")

print("\n📁 ALL RESULTS SAVED TO:")
print(f"   • Models: {models_dir}")
print(f"   • Results: {results_dir}")
print(f"   • Figures: {figures_dir}")

print("\n🚀 RECOMMENDED NEXT STEPS:")
print("   1. Deploy champion model for live predictions")
print("   2. Set up automated retraining pipeline")
print("   3. Implement risk management system")
print("   4. Monitor model performance in production")
print("   5. Collect feedback for model improvements")

print("\n📊 FOR PRODUCTION DEPLOYMENT:")
if champion_model is not None:
    print(f"   • Use: {champion_model} model")
else:
    # Nothing was evaluated, so there is no model to recommend.
    print("   • Use: no champion model available (no models were evaluated)")
print("   • Monitor: Prediction accuracy and trading performance")
print("   • Retrain: Monthly or when performance degrades")
print("   • Risk: Implement position sizing and stop-losses")

print("\n🎯 EVALUATION COMPLETE!")
print("=" * 60)
