# Phase 7 — Model Comparison Dashboard & Ensemble Methods

**Enhanced Evaluation:**
1. Load all trained models (classical ML + deep learning)
2. Evaluate on common test set
3. **Create ensemble models** (Voting, Stacking)
4. **Comprehensive metrics**: Accuracy, F1-score, inference time, model size
5. Per-class performance analysis
6. Visual comparison dashboard

In [2]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib
import time
import os
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.utils import to_categorical
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Confirmation message: this line is only reached once every import above
# in this cell has succeeded.
print("Loads")



Loads


In [3]:
# ============================================================
# Load Test Data
# ============================================================
print("\n" + "=" * 60)
print("Loading Test Data")
print("=" * 60)

# Features are read as float32 and labels as int32, as requested through the
# dtype arguments below.
with tqdm(total=2, desc="Loading test data") as pbar:
    X_test = pd.read_csv("../data/processed/ml_balance/test.csv", dtype=np.float32).values
    pbar.update(1)
    y_test = pd.read_csv("../data/processed/ml_balance/test_labels.csv", dtype=np.int32).values
    pbar.update(1)

# DataFrame.values is 2-D; the metric functions used later expect a 1-D label
# vector. ravel() is a no-op on an array that is already 1-D.
y_test = y_test.ravel()

print(f"Test set shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

# Fail early with a clear message rather than deep inside a metric call.
if y_test.size == 0:
    raise ValueError("Test label file contains no rows")
if len(X_test) != len(y_test):
    raise ValueError(
        f"Feature/label row count mismatch: {len(X_test)} feature rows vs {len(y_test)} labels"
    )

# np.unique sorts the whole label vector, so compute it once.
unique_labels = np.unique(y_test)

# The one-hot width must cover the largest label id. Counting distinct labels
# alone is too small whenever a class id is absent from the test split, which
# would make to_categorical index out of bounds.
num_classes = max(len(unique_labels), int(y_test.max()) + 1)
print(f"Number of classes: {num_classes}")

y_test_cat = to_categorical(y_test, num_classes)

# Shared state for the evaluation cells: where trained models live and the
# list that collects one metrics dict per evaluated model.
MODEL_DIR = Path("../trained_models")
results = []



Loading Test Data


Loading test data: 100%|██████████| 2/2 [01:20<00:00, 40.04s/it]


Test set shape: (9337316, 37)
Test labels shape: (9337316,)
Number of classes: 34


In [4]:
# ============================================================
# Chunked Inference Helper Functions
# ============================================================
print("\n" + "=" * 60, "Setting up Chunked Inference", "=" * 60, sep="\n")

CHUNK_SIZE = 100_000  # Adjust based on your memory constraints

def load_data_in_chunks(file_path, chunk_size=CHUNK_SIZE, dtype=np.float32):
    """Read a CSV file piece by piece and return the pieces as a list.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Maximum number of rows per piece.
        dtype: dtype passed through to ``pd.read_csv``.

    Returns:
        A list of NumPy arrays, one per piece read, in file order.
    """
    reader = pd.read_csv(file_path, dtype=dtype, chunksize=chunk_size)
    return [frame.values for frame in reader]

def predict_in_chunks(model, X_chunks, is_dl_model=False, reshape_for_cnn=False, chunk_size=None):
    """
    Run inference in chunks to manage memory

    Args:
        model: The model to use for prediction; must expose ``predict``.
        X_chunks: List of data chunks or single array. A single NumPy array
            is split into consecutive slices of ``chunk_size`` rows.
        is_dl_model: Whether this is a deep learning model. When True,
            ``predict`` is called with ``verbose=0`` and a 2-D output with
            more than one column is reduced to class ids with argmax.
        reshape_for_cnn: Whether to reshape input for CNN (add a trailing
            channel dimension).
        chunk_size: Rows per chunk when ``X_chunks`` is a single array.
            Defaults to the notebook-level ``CHUNK_SIZE``.

    Returns:
        predictions: Concatenated predictions (an empty array when there is
            nothing to predict on).
        inference_time: Total elapsed time in seconds, including splitting.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Resolved at call time so later changes to CHUNK_SIZE are honoured.
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # If X_chunks is a single array, split it into chunks
    if isinstance(X_chunks, np.ndarray):
        n_samples = len(X_chunks)
        X_chunks = [
            X_chunks[start:start + chunk_size]
            for start in range(0, n_samples, chunk_size)
        ]

    predictions = []
    for chunk in tqdm(X_chunks, desc="  Processing chunks", leave=False):
        # Prepare input for CNN if needed
        chunk_input = np.expand_dims(chunk, -1) if reshape_for_cnn else chunk

        # Run inference
        if is_dl_model:
            chunk_pred = model.predict(chunk_input, verbose=0)
            # For classification, get argmax
            if len(chunk_pred.shape) > 1 and chunk_pred.shape[1] > 1:
                chunk_pred = np.argmax(chunk_pred, axis=1)
        else:
            chunk_pred = model.predict(chunk_input)

        predictions.append(chunk_pred)

        # Drop the reference to the prepared input before the next iteration
        del chunk_input

    inference_time = time.perf_counter() - start_time

    # np.concatenate raises on an empty list; return an empty result instead
    # so callers can handle "no data" without a cryptic error.
    if not predictions:
        return np.empty(0, dtype=np.int64), inference_time

    return np.concatenate(predictions), inference_time

print(f"Chunk size set to: {CHUNK_SIZE:,} samples")
print("Memory-efficient inference enabled")


Setting up Chunked Inference
Chunk size set to: 100,000 samples
Memory-efficient inference enabled


In [None]:
# ============================================================
# Load and Evaluate Classical ML Models (with Chunked Inference)
# ============================================================
print("\n" + "=" * 60)
print("Evaluating Classical ML Models")
print("=" * 60)

# Each config lists candidate files in priority order; the first candidate
# that both loads and evaluates successfully is used for that model family.
ml_models = {}
ml_model_configs = [
    {
        'name': 'XGBoost',
        'files': ['final/final_xgb_optuna.pkl', 'xgboost_baseline.pkl'],
        'key': 'xgb'
    },
    {
        'name': 'LightGBM',
        'files': ['final/final_lgbm_optuna.pkl', 'lightgbm_baseline.pkl'],
        'key': 'lgbm'
    }
]

for config in tqdm(ml_model_configs, desc="Loading ML models"):
    model_loaded = False
    for model_name in config['files']:
        model_path = MODEL_DIR / model_name
        if not model_path.exists():
            continue

        print(f"\nLoading {model_name}...")
        # Loading and evaluation are guarded separately so the message says
        # which stage actually failed.
        try:
            # NOTE: joblib.load unpickles the file, which can execute
            # arbitrary code. Only load model files from a trusted source.
            model = joblib.load(model_path)
        except Exception as e:
            print(f" Error loading {model_name}: {e}")
            continue

        try:
            # Measure inference time with chunked prediction
            print("  Running chunked inference...")
            y_pred, inference_time = predict_in_chunks(
                model,
                X_test,
                is_dl_model=False
            )

            # Get model size
            print("Getting model Size")
            model_size = os.path.getsize(model_path) / (1024 * 1024)  # MB

            # Calculate metrics (zero_division=0 on every metric keeps the
            # handling of never-predicted / absent classes consistent)
            print("Calculating metrics")
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
            precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        except Exception as e:
            print(f" Error evaluating {model_name}: {e}")
            continue

        display_name = model_name.replace('.pkl', '').replace('final/', '')

        # Make the cell idempotent: re-running it replaces this model's row
        # instead of appending a duplicate.
        results[:] = [row for row in results if row['model'] != display_name]
        results.append({
            'model': display_name,
            'accuracy': acc,
            'f1_weighted': f1,
            'precision': precision,
            'recall': recall,
            'inference_time_sec': inference_time,
            'model_size_mb': model_size,
            # Guard against a zero elapsed time on very small inputs
            'predictions_per_sec': len(X_test) / inference_time if inference_time > 0 else float('inf')
        })

        print(f" Accuracy: {acc:.4f}, F1: {f1:.4f}, Time: {inference_time:.2f}s")

        # Save for ensemble
        ml_models[config['key']] = model
        model_loaded = True
        break

    if not model_loaded:
        print(f"  No valid model found for {config['name']}")

print(f"\n Successfully loaded {len(ml_models)} ML models")



Evaluating Classical ML Models


Loading ML models:   0%|          | 0/2 [00:00<?, ?it/s]


Loading final/final_xgb_optuna.pkl...
  Running chunked inference...




In [None]:
# ============================================================
# Load and Evaluate Deep Learning Models (with Chunked Inference)
# ============================================================
print("\n" + "=" * 60)
print("Evaluating Deep Learning Models")
print("=" * 60)

# 'reshape' marks models whose input needs a trailing channel dimension
# (passed to predict_in_chunks as reshape_for_cnn).
dl_models = {}
dl_model_configs = [
    {
        'name': 'FFNN Residual',
        'files': ['final_ffnn_residual.keras'],
        'key': 'ffnn',
        'reshape': False
    },
    {
        'name': 'CNN Stable',
        'files': ['final_cnn_stable.keras'],
        'key': 'cnn',
        'reshape': True
    }
]

for config in tqdm(dl_model_configs, desc="Loading DL models"):
    model_loaded = False
    for model_name in config['files']:
        model_path = MODEL_DIR / 'dl_models' / model_name
        if not model_path.exists():
            continue

        print(f"\nLoading {model_name}...")
        # Loading and evaluation are guarded separately so the message says
        # which stage actually failed.
        try:
            model = load_model(model_path)
        except Exception as e:
            print(f"Error loading {model_name}: {e}")
            continue

        try:
            # Measure inference time with chunked prediction
            print("  Running chunked inference...")
            y_pred, inference_time = predict_in_chunks(
                model,
                X_test,
                is_dl_model=True,
                reshape_for_cnn=config['reshape']
            )

            # Get model size (MB)
            model_size = os.path.getsize(model_path) / (1024 * 1024)

            # Calculate metrics (zero_division=0 on every metric keeps the
            # handling of never-predicted / absent classes consistent)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
            precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        except Exception as e:
            print(f"Error evaluating {model_name}: {e}")
            continue

        display_name = model_name.replace('.keras', '').replace('.h5', '')

        # Make the cell idempotent: re-running it replaces this model's row
        # instead of appending a duplicate.
        results[:] = [row for row in results if row['model'] != display_name]
        results.append({
            'model': display_name,
            'accuracy': acc,
            'f1_weighted': f1,
            'precision': precision,
            'recall': recall,
            'inference_time_sec': inference_time,
            'model_size_mb': model_size,
            # Guard against a zero elapsed time on very small inputs
            'predictions_per_sec': len(X_test) / inference_time if inference_time > 0 else float('inf')
        })

        print(f" Accuracy: {acc:.4f}, F1: {f1:.4f}, Time: {inference_time:.2f}s")

        # Save for ensemble
        dl_models[config['key']] = model
        model_loaded = True
        break

    if not model_loaded:
        print(f"No valid model found for {config['name']}")

print(f"\n Successfully loaded {len(ml_models)} ML models and {len(dl_models)} DL models")


In [None]:
# ============================================================
# Visualization Dashboard
# ============================================================
print("\n" + "=" * 60)
print("Visual Comparison Dashboard")
print("=" * 60)

# Build the ranked results table here so this cell works on a fresh
# top-to-bottom run; the summary cell that also creates res_df sits further
# down the notebook.
if not results:
    raise RuntimeError("No evaluation results available - run the model evaluation cells first")
res_df = pd.DataFrame(results).sort_values('f1_weighted', ascending=False).reset_index(drop=True)


def _score_axis_min(scores, floor=0.9, margin=0.01):
    """Lower axis limit for a 0-1 score: `floor` unless some score falls below it."""
    lowest = float(scores.min())
    if lowest >= floor:
        return floor
    # Widen the axis so weaker models stay visible instead of being clipped.
    return max(0.0, lowest - margin)


print("\nGenerating visualizations...")
with tqdm(total=7, desc="Creating dashboard") as pbar:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    pbar.update(1)

    # Extract data
    models = res_df['model']
    f1_scores = res_df['f1_weighted']
    # Green = ensembles, blue = deep learning models, red = everything else
    colors = ['#2ecc71' if 'ensemble' in m else '#3498db' if any(x in m for x in ['ffnn', 'cnn']) else '#e74c3c' for m in models]

    # 1. F1 Score Comparison
    ax = axes[0, 0]
    ax.barh(models, f1_scores, color=colors)
    ax.set_xlabel('F1 Score (Weighted)')
    ax.set_title('F1 Score Comparison')
    ax.set_xlim([_score_axis_min(f1_scores), 1.0])
    ax.grid(axis='x', alpha=0.3)
    pbar.update(1)

    # 2. Accuracy Comparison
    ax = axes[0, 1]
    ax.barh(models, res_df['accuracy'], color=colors)
    ax.set_xlabel('Accuracy')
    ax.set_title('Accuracy Comparison')
    ax.set_xlim([_score_axis_min(res_df['accuracy']), 1.0])
    ax.grid(axis='x', alpha=0.3)
    pbar.update(1)

    # 3. Inference Time
    ax = axes[0, 2]
    ax.barh(models, res_df['inference_time_sec'], color=colors)
    ax.set_xlabel('Inference Time (seconds)')
    ax.set_title('Inference Speed (Lower is Better)')
    ax.grid(axis='x', alpha=0.3)
    pbar.update(1)

    # 4. Predictions per Second
    ax = axes[1, 0]
    ax.barh(models, res_df['predictions_per_sec'], color=colors)
    ax.set_xlabel('Predictions/Second')
    ax.set_title('Throughput (Higher is Better)')
    ax.grid(axis='x', alpha=0.3)
    pbar.update(1)

    # 5. Model Size
    ax = axes[1, 1]
    non_zero_sizes = res_df[res_df['model_size_mb'] > 0]
    if len(non_zero_sizes) > 0:
        # res_df has a fresh 0..n-1 index, so its labels double as positions
        # into the colors list.
        ax.barh(non_zero_sizes['model'], non_zero_sizes['model_size_mb'],
                color=[colors[i] for i in non_zero_sizes.index])
        ax.set_xlabel('Model Size (MB)')
        ax.set_title('Model Size Comparison')
        ax.grid(axis='x', alpha=0.3)
    else:
        ax.text(0.5, 0.5, 'No size data available', ha='center', va='center', transform=ax.transAxes)
        ax.set_title('Model Size Comparison')
    pbar.update(1)

    # 6. Precision vs Recall
    ax = axes[1, 2]
    ax.scatter(res_df['recall'], res_df['precision'], s=100, c=range(len(res_df)), cmap='viridis', alpha=0.7)
    for i, model in enumerate(models):
        ax.annotate(model, (res_df.iloc[i]['recall'], res_df.iloc[i]['precision']),
                    fontsize=8, alpha=0.8, xytext=(5, 5), textcoords='offset points')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.set_title('Precision vs Recall')
    ax.grid(alpha=0.3)
    ax.set_xlim([_score_axis_min(res_df['recall']), 1.0])
    ax.set_ylim([_score_axis_min(res_df['precision']), 1.0])
    pbar.update(1)

plt.tight_layout()
plt.savefig(MODEL_DIR / 'model_comparison_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nSaved dashboard to {MODEL_DIR / 'model_comparison_dashboard.png'}")

In [None]:
# ============================================================
# Per-Class Performance Analysis (Best Model)
# ============================================================
print("\n" + "=" * 60)
print("Per-Class Performance Analysis")
print("=" * 60)

# Build the ranked results table here so this cell works on a fresh
# top-to-bottom run; the summary cell that also creates res_df sits further
# down the notebook.
if not results:
    raise RuntimeError("No evaluation results available - run the model evaluation cells first")
res_df = pd.DataFrame(results).sort_values('f1_weighted', ascending=False).reset_index(drop=True)

# Get best model (row 0 after sorting by weighted F1, descending)
best_model_name = res_df.iloc[0]['model']
print(f"\nAnalyzing best model: {best_model_name}")

# Load label encoder to get class names
print("\nLoading label encoder...")
try:
    # NOTE: joblib.load unpickles the file; only load trusted artifacts.
    encoder = joblib.load("../trained_models/encoder.pkl")
    class_names = encoder.classes_
    print(f"Loaded {len(class_names)} class names")
except Exception as e:
    print(f" Could not load encoder: {e}")
    class_names = [f"Class_{i}" for i in range(num_classes)]

# One label id per class name. Passing these explicitly keeps every per-class
# array aligned with class_names even when a class never occurs in y_test or
# in the predictions.
class_labels = np.arange(len(class_names))

# Same classification as the original branching: ensembles are pickles, names
# containing 'ffnn'/'cnn' are Keras models, everything else is a pickle.
is_dl_best = 'ensemble' not in best_model_name and any(x in best_model_name for x in ['ffnn', 'cnn'])

# Get predictions from best model
print(f"\nGetting predictions from {best_model_name}...")
with tqdm(total=1, desc="Running inference on best model") as pbar:
    if is_dl_best:
        candidate_paths = [MODEL_DIR / 'dl_models' / f"{best_model_name}{ext}" for ext in ['.keras', '.h5']]
    else:
        # The evaluation cell strips the 'final/' prefix from model names, so
        # tuned models live in MODEL_DIR / 'final', not directly in MODEL_DIR.
        candidate_paths = [
            MODEL_DIR / f"{best_model_name}.pkl",
            MODEL_DIR / 'final' / f"{best_model_name}.pkl",
        ]

    model_path = next((path for path in candidate_paths if path.exists()), None)
    if model_path is None:
        raise FileNotFoundError(f"Could not find model file for {best_model_name}")

    best_model = load_model(model_path) if is_dl_best else joblib.load(model_path)

    # Reuse the chunked helper so the full test set is never pushed through
    # the model (or its probability matrix materialised) in one piece.
    y_pred_best, best_inference_time = predict_in_chunks(
        best_model,
        X_test,
        is_dl_model=is_dl_best,
        reshape_for_cnn=is_dl_best and 'cnn' in best_model_name
    )
    pbar.update(1)

# Classification report
print("\nDetailed Classification Report:")
print("=" * 80)
report = classification_report(
    y_test, y_pred_best,
    labels=class_labels, target_names=class_names,
    digits=4, zero_division=0
)
print(report)

# Per-class F1 scores
print("\nCalculating per-class metrics...")
with tqdm(total=2, desc="Analyzing per-class performance") as pbar:
    per_class_f1 = f1_score(y_test, y_pred_best, labels=class_labels, average=None, zero_division=0)
    pbar.update(1)

    # Create DataFrame for analysis; bincount tallies every class in a single
    # pass over the labels.
    class_performance = pd.DataFrame({
        'class': class_names,
        'f1_score': per_class_f1,
        'support': np.bincount(y_test, minlength=len(class_labels))[:len(class_labels)]
    })
    class_performance = class_performance.sort_values('f1_score')
    pbar.update(1)

print("\nPer-Class F1 Scores (sorted by performance):")
print(class_performance.to_string(index=False))

# Plot per-class F1 scores
print("\nGenerating per-class visualization...")
with tqdm(total=1, desc="Creating per-class plot") as pbar:
    plt.figure(figsize=(12, 6))
    # Red below 0.95, orange below 0.98, green otherwise. Named bar_colors so
    # the dashboard cell's `colors` list is not overwritten.
    bar_colors = ['#e74c3c' if f1 < 0.95 else '#f39c12' if f1 < 0.98 else '#2ecc71' for f1 in class_performance['f1_score']]
    plt.barh(range(len(class_performance)), class_performance['f1_score'], color=bar_colors)
    plt.yticks(range(len(class_performance)), class_performance['class'])
    plt.xlabel('F1 Score')
    plt.title(f'Per-Class F1 Scores - {best_model_name}')
    plt.axvline(x=0.95, color='red', linestyle='--', alpha=0.5, label='0.95 threshold')
    plt.axvline(x=0.98, color='orange', linestyle='--', alpha=0.5, label='0.98 threshold')
    plt.legend()
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig(MODEL_DIR / 'per_class_performance.png', dpi=300, bbox_inches='tight')
    plt.show()
    pbar.update(1)

print(f"\n✓ Saved per-class analysis to {MODEL_DIR / 'per_class_performance.png'}")

# Identify weak classes
weak_classes = class_performance[class_performance['f1_score'] < 0.95]
if len(weak_classes) > 0:
    print("\nClasses with F1 < 0.95 (need improvement):")
    print(weak_classes.to_string(index=False))
else:
    print("\n All classes have F1 >= 0.95!")

# Additional statistics (class_performance is sorted ascending by F1, so the
# last row is the best class and the first row the worst)
print("\n" + "=" * 60)
print("Per-Class Statistics")
print("=" * 60)
print(f"Best performing class: {class_performance.iloc[-1]['class']} (F1: {class_performance.iloc[-1]['f1_score']:.4f})")
print(f"Worst performing class: {class_performance.iloc[0]['class']} (F1: {class_performance.iloc[0]['f1_score']:.4f})")
print(f"Mean F1 score: {per_class_f1.mean():.4f}")
print(f"Std F1 score: {per_class_f1.std():.4f}")

In [None]:
# ============================================================
# Results Summary Table
# ============================================================
print("\n" + "=" * 60, "Model Comparison Summary", "=" * 60, sep="\n")

print("\nProcessing results...")
with tqdm(total=3, desc="Generating summary") as pbar:
    # Rank models by weighted F1, best first, with a fresh 0..n-1 index
    res_df = (
        pd.DataFrame(results)
        .sort_values('f1_weighted', ascending=False)
        .reset_index(drop=True)
    )
    pbar.update(1)

    # Display copy: every metric column rendered as a fixed-precision string.
    # res_df itself keeps its numeric values for the CSV and the statistics.
    display_df = res_df.assign(**{
        column: res_df[column].map(template.format)
        for column, template in {
            'accuracy': "{:.4f}",
            'f1_weighted': "{:.4f}",
            'precision': "{:.4f}",
            'recall': "{:.4f}",
            'inference_time_sec': "{:.2f}",
            'model_size_mb': "{:.2f}",
            'predictions_per_sec': "{:.0f}",
        }.items()
    })
    pbar.update(1)

    # Persist the unformatted comparison table
    res_df.to_csv(MODEL_DIR / "model_comparison_enhanced.csv", index=False)
    pbar.update(1)

print("\n", display_df.to_string(index=False), sep="\n")

print(f"\nSaved detailed comparison to {MODEL_DIR / 'model_comparison_enhanced.csv'}")

# Headline figures; row 0 is the best model because res_df is sorted by F1
print("\n" + "=" * 60, "Summary Statistics", "=" * 60, sep="\n")
print(f"Total models evaluated: {len(res_df)}")
print(f"Best F1 Score: {res_df.loc[0, 'f1_weighted']:.4f} ({res_df.loc[0, 'model']})")
print(f"Best Accuracy: {res_df['accuracy'].max():.4f}")
print(f"Fastest inference: {res_df['inference_time_sec'].min():.2f}s")
print(f"Highest throughput: {res_df['predictions_per_sec'].max():.0f} predictions/sec")