In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from itertools import combinations
import logging

# Report locations, given as relative paths (resolved against the kernel's working directory).
BASE_REPORT_DIR = Path("../data/reports/supervised_learning")
# Root that is searched recursively for per-run `cv_results/oof_predictions.csv` files.
THEMATIC_MODELS_DIR = BASE_REPORT_DIR / "thematic_models"
# CSV with per-run metric summaries; read below with pd.read_csv.
MASTER_RESULTS_PATH = BASE_REPORT_DIR / "master_results_summary.csv"
# CSV that must contain `date` and `sp500_adjusted_close` columns (the only ones used in this notebook).
SP500_PRICE_DATA_PATH = Path("../data/processed/regime_identification/smoothed/summaries/sp500_ret126d_logvol21d_3states_smoothed200_full_data_with_states.csv")

# Notebook-wide logging and plotting defaults.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
sns.set_theme(style="whitegrid")
plt.rcParams['figure.dpi'] = 100

def load_all_oof_predictions(base_dir: Path) -> dict:
    """Load every out-of-fold (OOF) prediction file found beneath ``base_dir``.

    Files are discovered recursively with the pattern
    ``**/cv_results/oof_predictions.csv``. Each file is keyed by the name of
    the directory that contains its ``cv_results`` folder (the "run name").

    Args:
        base_dir: Root directory to search.

    Returns:
        A dict mapping run name to a DataFrame indexed by the parsed ``date``
        column. Runs are inserted in sorted path order, so iteration order is
        reproducible across machines. Files that fail to load are logged and
        skipped; an empty dict is returned when no file is found.
    """
    # Path.glob yields entries in filesystem order; sort so that anything
    # relying on "the first run" behaves the same on every re-run.
    oof_files = sorted(base_dir.glob("**/cv_results/oof_predictions.csv"))

    if not oof_files:
        logging.warning(f"No oof_predictions.csv files found in {base_dir}")
        return {}

    logging.info(f"Found {len(oof_files)} OOF prediction files.")

    oof_data = {}
    for file_path in oof_files:
        run_name = file_path.parent.parent.name
        if run_name in oof_data:
            # Same run directory name under two different parents: the later
            # file wins, but make the collision visible instead of silent.
            logging.warning(
                f"Duplicate run name '{run_name}' found at {file_path}; "
                "overwriting previously loaded predictions."
            )
        try:
            oof_data[run_name] = pd.read_csv(file_path, parse_dates=['date']).set_index('date')
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            logging.error(f"Failed to load or process {file_path}: {e}")

    return oof_data

# OOF predictions for every thematic model run, keyed by run name.
all_oof_data = load_all_oof_predictions(THEMATIC_MODELS_DIR)

# Master results summary: fall back to an empty frame when the file is absent
# so later cells can test `.empty` instead of failing.
if not MASTER_RESULTS_PATH.exists():
    master_results_df = pd.DataFrame()
    logging.warning(f"Master results summary not found at {MASTER_RESULTS_PATH}")
else:
    master_results_df = pd.read_csv(MASTER_RESULTS_PATH)
    logging.info(f"Loaded master results summary with shape: {master_results_df.shape}")

# S&P 500 prices: only the adjusted close column is kept, indexed by date.
if not SP500_PRICE_DATA_PATH.exists():
    sp500_df = pd.DataFrame()
    logging.warning(f"S&P 500 price data not found at {SP500_PRICE_DATA_PATH}")
else:
    sp500_df = pd.read_csv(
        SP500_PRICE_DATA_PATH,
        parse_dates=['date'],
        index_col='date',
    )[['sp500_adjusted_close']]
    logging.info(f"Loaded S&P 500 price data with shape: {sp500_df.shape}")

# Quick look at the first loaded run as a sanity check.
if not all_oof_data:
    print("No OOF data was loaded.")
else:
    sample_key = next(iter(all_oof_data))
    print(f"Sample OOF data from '{sample_key}':")
    display(all_oof_data[sample_key].head())

In [None]:
def calculate_model_agreement(oof_data: dict) -> pd.DataFrame:
    """Compute the pairwise fraction of identical predictions between runs.

    Args:
        oof_data: Mapping of run name to a DataFrame with a
            ``predicted_label`` column.

    Returns:
        A square DataFrame (runs x runs). The diagonal is 1.0; each
        off-diagonal cell is the share of rows, among those where both runs
        have a non-null prediction, on which the two labels are equal. Pairs
        with no overlapping rows are NaN.
    """
    run_names = list(oof_data)

    # One column per run, aligned on the union of all indexes.
    label_table = pd.DataFrame(
        {run: frame['predicted_label'] for run, frame in oof_data.items()}
    )

    # Start from the identity: every run agrees with itself.
    matrix = pd.DataFrame(np.eye(len(run_names)), index=run_names, columns=run_names)

    for left, right in combinations(run_names, 2):
        # Only compare rows on which both runs actually predicted something.
        overlap = label_table[[left, right]].dropna()
        if overlap.empty:
            score = np.nan
        else:
            score = np.mean(overlap[left] == overlap[right])

        # Agreement is symmetric, so mirror the value across the diagonal.
        matrix.loc[left, right] = score
        matrix.loc[right, left] = score

    return matrix

if all_oof_data:
    agreement_df = calculate_model_agreement(all_oof_data)

    plt.figure(figsize=(18, 15))
    # calculate_model_agreement returns fractions in [0, 1], while the title
    # announces percentages. Scale by 100 so the annotations and the colour
    # bar match the "(%)" in the title; agreement_df itself stays a fraction.
    sns.heatmap(
        agreement_df * 100,
        annot=True,
        cmap="viridis",
        fmt=".0f",
        linewidths=.5
    )
    plt.title("Pairwise Model Prediction Agreement (%)", fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.show()
else:
    print("Cannot calculate model agreement without OOF data.")

In [None]:
def analyze_prediction_stability(oof_data: dict) -> pd.DataFrame:
    """Summarise how often each run's predicted label changes over time.

    A "spell" is a maximal run of consecutive rows with the same predicted
    label; a "flip" is a row whose label differs from the previous row.

    Args:
        oof_data: Mapping of run name to a DataFrame with a
            ``predicted_label`` column, in chronological row order.

    Returns:
        One row per run that has at least one non-null prediction, with:
        ``model_run``, ``flip_rate`` (flips divided by the number of
        row-to-row transitions; NaN when there is a single observation),
        and the mean / std / min / max spell length. The columns are present
        even when no run qualifies.
    """
    result_columns = [
        'model_run',
        'flip_rate',
        'avg_spell_duration_days',
        'std_spell_duration',
        'min_spell_duration_days',
        'max_spell_duration_days',
    ]
    stability_results = []

    for name, df in oof_data.items():
        preds = df['predicted_label'].dropna()
        if preds.empty:
            continue

        # True on every row that opens a new spell. Comparing against the
        # shifted series (rather than diff()) also works for non-numeric labels.
        spell_starts = preds.ne(preds.shift())
        # The first row always opens a spell but is NOT a flip: nothing
        # precedes it. Counting it would give a constant series a non-zero rate.
        spell_starts.iloc[0] = True
        flips = spell_starts.iloc[1:]
        flip_rate = flips.mean() if len(flips) else np.nan

        # Consecutive spell ids (1, 2, ...) -> number of rows in each spell.
        spell_durations = preds.groupby(spell_starts.cumsum()).size()

        stability_results.append({
            'model_run': name,
            'flip_rate': flip_rate,
            'avg_spell_duration_days': spell_durations.mean(),
            # Sample std: NaN when the run consists of a single spell.
            'std_spell_duration': spell_durations.std(),
            'min_spell_duration_days': spell_durations.min(),
            'max_spell_duration_days': spell_durations.max(),
        })

    return pd.DataFrame(stability_results, columns=result_columns)

if not all_oof_data:
    print("Cannot analyze stability without OOF data.")
else:
    stability_df = analyze_prediction_stability(all_oof_data)
    # The algorithm name is taken to be the last underscore-separated token of the run name.
    stability_df['model_type'] = [run.split('_')[-1] for run in stability_df['model_run']]

    # Most stable runs first: fewest flips at the top.
    by_flip_rate = stability_df.sort_values('flip_rate')
    plt.figure(figsize=(14, 7))
    sns.barplot(
        data=by_flip_rate,
        x='flip_rate',
        y='model_run',
        hue='model_type',
        dodge=False,
    )
    plt.title('Prediction Flip Rate (Lower is More Stable)', fontsize=16)
    plt.xlabel('Fraction of Days Prediction Changed')
    plt.ylabel('Model Run')
    plt.show()

    # Most stable runs first: longest average spell at the top.
    by_spell_length = stability_df.sort_values('avg_spell_duration_days', ascending=False)
    plt.figure(figsize=(14, 7))
    sns.barplot(
        data=by_spell_length,
        x='avg_spell_duration_days',
        y='model_run',
        hue='model_type',
        dodge=False,
    )
    plt.title('Average Prediction Spell Duration (Higher is More Stable)', fontsize=16)
    plt.xlabel('Average Consecutive Days with Same Prediction')
    plt.ylabel('Model Run')
    plt.show()

    print("\n--- Stability Metrics Summary ---")
    display(by_flip_rate.set_index('model_run'))

In [None]:
def analyze_prediction_confidence(oof_data: dict) -> pd.DataFrame:
    """Compare each run's confidence on correct versus incorrect predictions.

    "Confidence" is the probability the model assigned to the class it
    predicted, read from the ``proba_class_<id>`` columns.

    Args:
        oof_data: Mapping of run name to a DataFrame with ``predicted_label``,
            ``true_label`` and one ``proba_class_<id>`` column per class.

    Returns:
        One row per run with usable data: ``model_run``, mean and std of the
        confidence, mean confidence when correct / incorrect, and their
        difference (``confidence_lift``). Runs without probability columns or
        without complete rows are skipped. The columns are present even when
        no run qualifies.
    """
    proba_prefix = 'proba_class_'
    result_columns = [
        'model_run',
        'avg_confidence',
        'std_confidence',
        'avg_conf_when_correct',
        'avg_conf_when_incorrect',
        'confidence_lift',
    ]
    confidence_results = []

    for name, df in oof_data.items():
        df_clean = df.dropna(subset=['predicted_label', 'true_label']).copy()
        proba_cols = [c for c in df_clean.columns if c.startswith(proba_prefix)]
        if not proba_cols or df_clean.empty:
            continue

        # Resolve class id -> column position from the column names, so the
        # lookup does not depend on the order of the columns in the file.
        try:
            class_positions = {
                int(float(col[len(proba_prefix):])): pos
                for pos, col in enumerate(proba_cols)
            }
        except (ValueError, OverflowError):
            class_positions = {}
        if len(class_positions) != len(proba_cols):
            # Suffixes are not distinct integers: fall back to treating the
            # label as the column position.
            class_positions = {pos: pos for pos in range(len(proba_cols))}

        predicted_labels_int = df_clean['predicted_label'].astype(int)
        col_positions = predicted_labels_int.map(class_positions)

        # Rows whose label has no probability column cannot be scored. Plain
        # positional indexing would raise, or wrap around for negative labels.
        scorable = col_positions.notna().to_numpy()
        if not scorable.all():
            logging.warning(
                f"{name}: skipping {int((~scorable).sum())} rows whose predicted label "
                "has no matching probability column."
            )
            df_clean = df_clean.loc[scorable].copy()
            col_positions = col_positions[scorable]
            if df_clean.empty:
                continue

        proba_matrix = df_clean[proba_cols].to_numpy()
        row_positions = np.arange(len(df_clean))
        df_clean['confidence'] = proba_matrix[row_positions, col_positions.astype(int).to_numpy()]

        df_clean['is_correct'] = (df_clean['predicted_label'] == df_clean['true_label'])

        # Either mean is NaN when the run has no rows in that group.
        avg_conf_correct = df_clean.loc[df_clean['is_correct'], 'confidence'].mean()
        avg_conf_incorrect = df_clean.loc[~df_clean['is_correct'], 'confidence'].mean()

        confidence_results.append({
            'model_run': name,
            'avg_confidence': df_clean['confidence'].mean(),
            'std_confidence': df_clean['confidence'].std(),
            'avg_conf_when_correct': avg_conf_correct,
            'avg_conf_when_incorrect': avg_conf_incorrect,
            'confidence_lift': avg_conf_correct - avg_conf_incorrect,
        })

    return pd.DataFrame(confidence_results, columns=result_columns)

if not all_oof_data:
    print("Cannot analyze confidence without OOF data.")
else:
    confidence_df = analyze_prediction_confidence(all_oof_data)

    # Long format: one row per (run, outcome) so seaborn can draw paired bars.
    confidence_melted = pd.melt(
        confidence_df,
        id_vars='model_run',
        value_vars=['avg_conf_when_correct', 'avg_conf_when_incorrect'],
        var_name='condition',
        value_name='average_confidence',
    )
    # Replace the metric column names with short legend labels.
    confidence_melted['condition'] = confidence_melted['condition'].map({
        'avg_conf_when_correct': 'Correct',
        'avg_conf_when_incorrect': 'Incorrect',
    })

    plt.figure(figsize=(10, 12))
    sns.barplot(
        data=confidence_melted,
        y='model_run',
        x='average_confidence',
        hue='condition',
        palette={'Correct': 'g', 'Incorrect': 'r'},
    )
    plt.title('Model Confidence: Correct vs. Incorrect Predictions', fontsize=16)
    plt.xlabel('Average Prediction Probability')
    plt.ylabel('Model Run')
    # Confidence is a probability, so pin the axis to its full range.
    plt.xlim(0, 1)
    plt.legend(title='Prediction Outcome')
    plt.show()

    print("\n--- Confidence Metrics Summary (Higher Lift is Better) ---")
    ranked_by_lift = confidence_df.sort_values('confidence_lift', ascending=False)
    display(ranked_by_lift.set_index('model_run'))

In [None]:
if master_results_df.empty:
    print("Cannot analyze performance by algorithm without master results summary.")
else:
    # Collapse every LightGBM variant into a single 'lightgbm' bucket; other
    # model names are kept verbatim.
    master_results_df['model_type'] = [
        'lightgbm' if 'lightgbm' in model_name.lower() else model_name
        for model_name in master_results_df['model']
    ]

    # Mean and spread of each per-run metric within an algorithm type.
    grouped_by_type = master_results_df.groupby('model_type')
    agg_by_model = grouped_by_type.agg(
        avg_f1_macro=('f1_macro_mean', 'mean'),
        std_of_f1_macro_means=('f1_macro_mean', 'std'),
        avg_mcc=('mcc_mean', 'mean'),
        std_of_mcc_means=('mcc_mean', 'std'),
        avg_auc=('auc_roc_ovr_macro_mean', 'mean'),
        std_of_auc_means=('auc_roc_ovr_macro_mean', 'std'),
        num_models=('model', 'count')
    )
    agg_by_model = agg_by_model.sort_values('avg_f1_macro', ascending=False)

    print("\n--- Average Performance by Algorithm Type ---")
    display(agg_by_model)

    # One panel per metric, all sharing the same y-axis.
    fig, axes = plt.subplots(1, 3, figsize=(20, 6), sharey=True)
    fig.suptitle('Algorithm Performance Comparison Across All Themes', fontsize=16)

    metrics_to_plot = [('avg_f1_macro', 'F1 Macro'), ('avg_mcc', 'Matthews Corr Coef'), ('avg_auc', 'AUC ROC (OvR Macro)')]

    for i in range(len(metrics_to_plot)):
        metric, title = metrics_to_plot[i]
        panel = axes[i]
        sns.barplot(data=agg_by_model.reset_index(), x='model_type', y=metric, ax=panel, palette='viridis')
        panel.set_title(title)
        panel.set_xlabel('Algorithm')
        panel.set_ylabel('Average Score')
        panel.tick_params(axis='x', rotation=45)

    # Leave room at the top for the figure-level title.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

In [None]:
def plot_ensemble_predictions_over_time(oof_data: dict, price_data: pd.DataFrame):
    """Plot the majority-vote prediction against the true regime over time.

    The top panel shows the S&P 500 adjusted close on a log scale, shaded by
    the true regime and overlaid with points coloured by the majority vote of
    all runs. The bottom panel shows the share of runs that voted for the
    most common label on each date.

    Args:
        oof_data: Mapping of run name to a DataFrame with ``predicted_label``
            and ``true_label`` columns. The true labels are read from the
            first run only.
        price_data: DataFrame with a ``sp500_adjusted_close`` column, indexed
            compatibly with the OOF frames.

    Returns:
        None. Logs a warning and returns without plotting when there is no
        data to show.
    """
    if not oof_data or price_data.empty:
        logging.warning("Cannot plot ensemble predictions without OOF and price data.")
        return

    all_preds_df = pd.DataFrame({name: df['predicted_label'] for name, df in oof_data.items()})
    # NOTE(review): gaps are filled from neighbouring dates so that every run
    # votes on every date; bfill carries later predictions backwards. Confirm
    # this is acceptable for the periods where a run has no OOF prediction.
    all_preds_df = all_preds_df.ffill().bfill().dropna(how='all')
    if all_preds_df.empty:
        logging.warning("Cannot plot ensemble predictions: no non-null predictions found.")
        return

    # Most common label per date (ties resolve to the smallest label, which
    # mode() lists first) and the share of runs that voted for it.
    majority_vote = all_preds_df.mode(axis=1)[0].astype(int)
    agreement_score = all_preds_df.apply(lambda row: row.value_counts(normalize=True).max(), axis=1)

    true_label_series = next(iter(oof_data.values()))['true_label']

    plot_df = (
        price_data
        .join(true_label_series)
        .join(majority_vote.rename('majority_vote'))
        .join(agreement_score.rename('agreement'))
        .dropna(subset=['sp500_adjusted_close', 'true_label', 'majority_vote'])
    )
    if plot_df.empty:
        logging.warning("Cannot plot ensemble predictions: price and OOF data share no complete dates.")
        return

    price = plot_df['sp500_adjusted_close']

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 12), sharex=True, gridspec_kw={'height_ratios': [3, 1]})

    ax1.plot(plot_df.index, price, color='black', lw=0.5, label='S&P 500')
    ax1.set_yscale('log')
    if price.min() > 0:
        # Fix the limits explicitly so the regime shading below cannot
        # influence autoscaling of the log axis.
        ax1.set_ylim(price.min() * 0.9, price.max() * 1.1)
    ax1.set_title('Ensemble Prediction vs. True Regime Over Time', fontsize=16)
    ax1.set_ylabel('S&P 500 (Log Scale)')

    regime_colors = {0: 'red', 1: 'green', 2: 'gray'}
    regime_legend = [
        (0, 'True Regime 0 (Bear)'),
        (1, 'True Regime 1 (Bull)'),
        (2, 'True Regime 2 (Neutral)'),
    ]
    for regime, legend_label in regime_legend:
        # Shade the full panel height in axes coordinates (0..1). A data-space
        # lower bound of 0 is not representable on a log-scaled axis.
        ax1.fill_between(
            plot_df.index, 0, 1,
            where=(plot_df['true_label'] == regime).to_numpy(),
            transform=ax1.get_xaxis_transform(),
            facecolor=regime_colors[regime], alpha=0.15, label=legend_label,
        )

    # Labels outside the known regimes get a distinct colour instead of
    # making scatter() fail on a missing colour.
    vote_colors = plot_df['majority_vote'].map(regime_colors).fillna('blue')
    ax1.scatter(plot_df.index, price, c=vote_colors, marker='.', s=10, label='Majority Vote Prediction')
    ax1.legend()
    ax1.grid(which='both', linestyle='--', alpha=0.5)

    ax2.plot(plot_df.index, plot_df['agreement'], label='Model Agreement', color='purple', lw=1.5)
    ax2.set_title('Model Agreement Score', fontsize=14)
    ax2.set_ylabel('Agreement (0.0 to 1.0)')
    ax2.set_ylim(0, 1.05)
    # NOTE(review): 1/3 hard-codes three classes and is the lowest possible
    # value of the majority share rather than its expectation under random
    # voting — confirm the "Random Chance" label is what is intended.
    ax2.axhline(y=1/3, color='r', linestyle='--', lw=1, label='Random Chance')
    ax2.grid(True)
    ax2.legend()

    plt.tight_layout()
    plt.show()

# Both the OOF predictions and the price series are required for this plot.
if not all_oof_data or sp500_df.empty:
    print("Cannot generate ensemble time-series plot without OOF and Price data.")
else:
    plot_ensemble_predictions_over_time(all_oof_data, sp500_df)