# Phase 7 — Model Comparison Dashboard & Ensemble Methods

**Enhanced Evaluation:**
1. Load all trained models (classical ML + deep learning)
2. Evaluate on a common test set
3. **Create ensemble models** (Voting, Stacking)
4. **Comprehensive metrics**: Accuracy, F1-score, inference time, model size
5. Per-class performance analysis
6. Visual comparison dashboard

In [None]:
import pandas as pd
import numpy as np
import joblib
import time
import os
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.utils import to_categorical
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Read the held-out test split: feature matrix and label column as NumPy arrays
X_test = pd.read_csv("../data/processed/ml_balance/test.csv").values
y_test = pd.read_csv("../data/processed/ml_balance/test_labels.csv").values

# The metric functions expect a flat label vector, so collapse any
# multi-dimensional label array to 1-D
if y_test.ndim > 1:
    y_test = y_test.ravel()

print(f"Test set shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")
print(f"Number of classes: {len(np.unique(y_test))}")

# One-hot copy of the labels (assumes integer class ids — TODO confirm)
num_classes = len(np.unique(y_test))
y_test_cat = to_categorical(y_test, num_classes)

# Root folder of the saved models, and the accumulator that collects one
# metrics row per evaluated model
MODEL_DIR = Path("../trained_models")
results = []

# ============================================================
# Shared evaluation helper
# ============================================================
def _evaluate_first_available(candidates, directory, load_fn, predict_fn):
    """Evaluate the first model file in *candidates* that exists in *directory*.

    The file is loaded with ``load_fn(path)``. ``predict_fn(model)`` is timed
    and must return hard class labels for the test set. One metrics row is
    appended to the global ``results`` list.

    Args:
        candidates: File names to try, in order of preference.
        directory: ``Path`` of the folder that holds the candidate files.
        load_fn: Callable that turns a path into a model object.
        predict_fn: Callable that turns a model into predicted labels.

    Returns:
        ``(file_name, model)`` for the evaluated candidate, or ``(None, None)``
        when none of the candidate files exist.
    """
    for model_name in candidates:
        model_path = directory / model_name
        if not model_path.exists():
            continue

        print(f"\nLoading {model_name}...")
        model = load_fn(model_path)

        # perf_counter is a monotonic, high-resolution clock; time.time() can
        # report a zero interval for very fast predictions.
        start = time.perf_counter()
        y_pred = predict_fn(model)
        inference_time = time.perf_counter() - start

        model_size = os.path.getsize(model_path) / (1024 * 1024)  # MB

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

        # Guard the throughput against a zero elapsed time instead of raising
        # ZeroDivisionError.
        if inference_time > 0:
            throughput = len(X_test) / inference_time
        else:
            throughput = float('nan')

        results.append({
            'model': Path(model_name).stem,  # file name without its extension
            'accuracy': acc,
            'f1_weighted': f1,
            'precision': precision,
            'recall': recall,
            'inference_time_sec': inference_time,
            'model_size_mb': model_size,
            'predictions_per_sec': throughput
        })

        print(f"  Accuracy: {acc:.4f}, F1: {f1:.4f}, Time: {inference_time:.2f}s")
        return model_name, model  # Use first available

    return None, None


# ============================================================
# Load and Evaluate Classical ML Models
# ============================================================
print("\n" + "=" * 60)
print("Evaluating Classical ML Models")
print("=" * 60)

ml_models = {}

# Candidate files per model family, most preferred first.
# NOTE: joblib.load unpickles the file, so only load models you trust.
classical_candidates = [
    ('rf', ['final_rf_optuna.pkl', 'final_rf.pkl', 'best_baseline.pkl']),
    ('xgb', ['final_xgb_optuna.pkl', 'final_xgb.pkl']),
    ('lgbm', ['final_lgbm_optuna.pkl', 'final_lgbm.pkl']),
]

for family, candidates in classical_candidates:
    found_name, model = _evaluate_first_available(
        candidates, MODEL_DIR, joblib.load, lambda m: m.predict(X_test)
    )
    if model is None:
        continue
    # The 'rf' slot falls back to best_baseline.pkl, which is reported but
    # only registered for the ensembles when the file name contains 'rf'.
    if family == 'rf' and 'rf' not in found_name.lower():
        continue
    ml_models[family] = model

# ============================================================
# Load and Evaluate Deep Learning Models
# ============================================================
print("\n" + "=" * 60)
print("Evaluating Deep Learning Models")
print("=" * 60)

dl_models = {}
dl_dir = MODEL_DIR / 'dl_models'

# The CNN is fed the features with an extra trailing axis
X_test_cnn = np.expand_dims(X_test, -1)

deep_candidates = [
    ('ffnn', ['final_ffnn_residual.keras', 'final_ffnn.keras', 'final_ffnn.h5'], X_test),
    ('cnn', ['final_cnn_stable.keras', 'final_cnn.keras', 'final_cnn.h5'], X_test_cnn),
]

for family, candidates, inputs in deep_candidates:
    # The timed call includes the argmax that turns probabilities into labels
    _, model = _evaluate_first_available(
        candidates,
        dl_dir,
        load_model,
        lambda m, inputs=inputs: np.argmax(m.predict(inputs, verbose=0), axis=1),
    )
    if model is not None:
        dl_models[family] = model

print(f"\nLoaded {len(ml_models)} ML models and {len(dl_models)} DL models")

In [None]:
# ============================================================
# Create Ensemble Models
# ============================================================
def _fit_score_and_save_ensemble(clf, result_name, display_name, fit_message,
                                 X_fit, y_fit):
    """Fit *clf*, score it on the test set, save it and record its metrics.

    Args:
        clf: Unfitted scikit-learn ensemble classifier.
        result_name: Name stored in ``results`` and used as the ``.pkl`` stem.
        display_name: Human-readable name used in the printed summary line.
        fit_message: Progress message printed before fitting.
        X_fit: Training features.
        y_fit: Training labels (1-D).

    Returns:
        The predicted test-set labels.
    """
    print(fit_message)
    clf.fit(X_fit, y_fit)

    # perf_counter is a monotonic, high-resolution clock; time.time() can
    # report a zero interval for very fast predictions.
    start = time.perf_counter()
    y_pred = clf.predict(X_test)
    inference_time = time.perf_counter() - start

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)

    # Save before recording so the real on-disk size can be reported instead
    # of a placeholder 0.
    save_path = MODEL_DIR / f"{result_name}.pkl"
    joblib.dump(clf, save_path)
    model_size = os.path.getsize(save_path) / (1024 * 1024)  # MB

    if inference_time > 0:
        throughput = len(X_test) / inference_time
    else:
        throughput = float('nan')

    results.append({
        'model': result_name,
        'accuracy': acc,
        'f1_weighted': f1,
        'precision': precision,
        'recall': recall,
        'inference_time_sec': inference_time,
        'model_size_mb': model_size,
        'predictions_per_sec': throughput
    })

    print(f"{display_name} - Accuracy: {acc:.4f}, F1: {f1:.4f}")
    print(f"Saved to {save_path}")
    return y_pred


print("\n" + "=" * 60)
print("Creating Ensemble Models")
print("=" * 60)

if len(ml_models) >= 2:
    estimators = list(ml_models.items())

    # Training data is needed because scikit-learn ensembles fit clones of
    # the base estimators: the loaded models contribute their
    # hyper-parameters, not their fitted state.
    # NOTE(review): confirm train_original is the split the base models were
    # trained on.
    X_train = pd.read_csv("../data/processed/ml_balance/train_original.csv").values
    y_train = pd.read_csv("../data/processed/ml_balance/train_original_labels.csv").values
    if y_train.ndim > 1:
        y_train = y_train.ravel()

    # Voting Classifier (Soft voting - uses probabilities)
    print("\n--- Soft Voting Ensemble ---")
    voting_clf = VotingClassifier(
        estimators=estimators,
        voting='soft',
        n_jobs=-1
    )
    y_pred_voting = _fit_score_and_save_ensemble(
        voting_clf,
        'voting_ensemble',
        'Voting Ensemble',
        "Fitting voting ensemble...",
        X_train,
        y_train,
    )

    # Stacking Classifier: logistic regression on the cross-validated
    # predictions of the base estimators
    print("\n--- Stacking Ensemble ---")
    stacking_clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=1000, random_state=42),
        cv=5,
        n_jobs=-1
    )
    y_pred_stacking = _fit_score_and_save_ensemble(
        stacking_clf,
        'stacking_ensemble',
        'Stacking Ensemble',
        "Fitting stacking ensemble (this may take a while)...",
        X_train,
        y_train,
    )

else:
    print("Not enough ML models for ensemble (need at least 2)")

In [None]:
# ============================================================
# Visualization Dashboard
# ============================================================
print("\n" + "=" * 60)
print("Visual Comparison Dashboard")
print("=" * 60)

# Build the ranked table from `results` here, so this cell works when the
# notebook is run top to bottom (the summary cell that also builds res_df
# comes later in the file).
res_df = (
    pd.DataFrame(results)
    .sort_values('f1_weighted', ascending=False)
    .reset_index(drop=True)
)


def _score_axis_floor(*score_columns, default=0.9, margin=0.02):
    """Return the lower axis limit for score plots.

    Keeps the zoomed-in ``default`` floor when every score is at least that
    high; otherwise drops just below the lowest score so no model is clipped
    out of view.
    """
    lowest = min(column.min() for column in score_columns)
    if lowest >= default:
        return default
    return max(0.0, lowest - margin)


fig, axes = plt.subplots(2, 3, figsize=(18, 10))

models = res_df['model']
# Green = ensembles, blue = deep learning, red = classical ML
colors = ['#2ecc71' if 'ensemble' in m else '#3498db' if any(x in m for x in ['ffnn', 'cnn']) else '#e74c3c' for m in models]

# Panels 1-4: one horizontal bar chart per metric.
# Tuple layout: (axis, column, x label, title, is a 0-1 score)
bar_panels = [
    (axes[0, 0], 'f1_weighted', 'F1 Score (Weighted)', 'F1 Score Comparison', True),
    (axes[0, 1], 'accuracy', 'Accuracy', 'Accuracy Comparison', True),
    (axes[0, 2], 'inference_time_sec', 'Inference Time (seconds)', 'Inference Speed (Lower is Better)', False),
    (axes[1, 0], 'predictions_per_sec', 'Predictions/Second', 'Throughput (Higher is Better)', False),
]
for ax, column, xlabel, title, is_score in bar_panels:
    ax.barh(models, res_df[column], color=colors)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    if is_score:
        ax.set_xlim([_score_axis_floor(res_df[column]), 1.0])
    ax.grid(axis='x', alpha=0.3)

# 5. Model Size (rows without a recorded size are skipped)
ax = axes[1, 1]
non_zero_sizes = res_df[res_df['model_size_mb'] > 0]
# res_df has a fresh 0..n-1 index, so index labels double as positions in `colors`
ax.barh(non_zero_sizes['model'], non_zero_sizes['model_size_mb'],
        color=[colors[i] for i in non_zero_sizes.index])
ax.set_xlabel('Model Size (MB)')
ax.set_title('Model Size Comparison')
ax.grid(axis='x', alpha=0.3)

# 6. Precision vs Recall
ax = axes[1, 2]
ax.scatter(res_df['recall'], res_df['precision'], s=100, c=range(len(res_df)), cmap='viridis', alpha=0.7)
for model, recall_value, precision_value in zip(models, res_df['recall'], res_df['precision']):
    ax.annotate(model, (recall_value, precision_value),
                fontsize=8, alpha=0.8, xytext=(5, 5), textcoords='offset points')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision vs Recall')
ax.grid(alpha=0.3)
ax.set_xlim([_score_axis_floor(res_df['recall']), 1.0])
ax.set_ylim([_score_axis_floor(res_df['precision']), 1.0])

plt.tight_layout()
plt.savefig(MODEL_DIR / 'model_comparison_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nSaved dashboard to {MODEL_DIR / 'model_comparison_dashboard.png'}")

In [None]:
# ============================================================
# Per-Class Performance Analysis (Best Model)
# ============================================================
print("\n" + "=" * 60)
print("Per-Class Performance Analysis")
print("=" * 60)

# Build the ranked table from `results` here, so this cell works when the
# notebook is run top to bottom (the summary cell that also builds res_df
# comes later in the file).
res_df = (
    pd.DataFrame(results)
    .sort_values('f1_weighted', ascending=False)
    .reset_index(drop=True)
)

# Get best model (row 0 has the highest weighted F1)
best_model_name = res_df.iloc[0]['model']
print(f"\nAnalyzing best model: {best_model_name}")

# Load label encoder to get class names; fall back to generic names when it
# is missing, unreadable, or covers fewer classes than the test set.
generic_names = [f"Class_{i}" for i in range(num_classes)]
try:
    encoder = joblib.load("../trained_models/encoder.pkl")
    class_names = [str(c) for c in encoder.classes_]
except Exception as exc:  # best-effort: the names are cosmetic only
    print(f"Could not load label encoder ({exc}); using generic class names")
    class_names = generic_names
if len(class_names) < num_classes:
    class_names = generic_names

# Explicit label ids keep names, F1 scores and supports aligned even when a
# class is absent from the test set or never predicted.
# Assumes labels are integer-encoded 0..K-1 in encoder order — TODO confirm.
labels = np.arange(len(class_names))

# Get predictions from best model
is_deep_model = (
    'ensemble' not in best_model_name
    and any(tag in best_model_name for tag in ('ffnn', 'cnn'))
)
if is_deep_model:
    model_path = MODEL_DIR / 'dl_models' / f"{best_model_name}.keras"
    if not model_path.exists():
        model_path = MODEL_DIR / 'dl_models' / f"{best_model_name}.h5"
    best_model = load_model(model_path)
    # The CNN is fed the features with an extra trailing axis
    if 'cnn' in best_model_name:
        X_test_input = np.expand_dims(X_test, -1)
    else:
        X_test_input = X_test
    y_prob = best_model.predict(X_test_input, verbose=0)
    y_pred_best = np.argmax(y_prob, axis=1)
else:
    # Ensembles and classical models are both stored as <name>.pkl in MODEL_DIR
    best_model = joblib.load(MODEL_DIR / f"{best_model_name}.pkl")
    y_pred_best = best_model.predict(X_test)

# Classification report
print("\nDetailed Classification Report:")
print("=" * 80)
report = classification_report(
    y_test, y_pred_best, labels=labels, target_names=class_names,
    digits=4, zero_division=0
)
print(report)

# Per-class F1 scores
per_class_f1 = f1_score(y_test, y_pred_best, labels=labels, average=None, zero_division=0)

# Create DataFrame for analysis
class_performance = pd.DataFrame({
    'class': class_names,
    'f1_score': per_class_f1,
    'support': [(y_test == label).sum() for label in labels]
})
class_performance = class_performance.sort_values('f1_score')

print("\nPer-Class F1 Scores (sorted by performance):")
print(class_performance.to_string(index=False))

# Plot per-class F1 scores (red < 0.95, orange < 0.98, green otherwise)
plt.figure(figsize=(12, 6))
bar_colors = ['#e74c3c' if f1 < 0.95 else '#f39c12' if f1 < 0.98 else '#2ecc71' for f1 in class_performance['f1_score']]
plt.barh(range(len(class_performance)), class_performance['f1_score'], color=bar_colors)
plt.yticks(range(len(class_performance)), class_performance['class'])
plt.xlabel('F1 Score')
plt.title(f'Per-Class F1 Scores - {best_model_name}')
plt.axvline(x=0.95, color='red', linestyle='--', alpha=0.5, label='0.95 threshold')
plt.axvline(x=0.98, color='orange', linestyle='--', alpha=0.5, label='0.98 threshold')
plt.legend()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig(MODEL_DIR / 'per_class_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nSaved per-class analysis to {MODEL_DIR / 'per_class_performance.png'}")

# Identify weak classes
weak_classes = class_performance[class_performance['f1_score'] < 0.95]
if len(weak_classes) > 0:
    print("\n⚠️  Classes with F1 < 0.95 (need improvement):")
    print(weak_classes.to_string(index=False))
else:
    print("\n✓ All classes have F1 >= 0.95!")

In [None]:
# ============================================================
# Results Summary Table
# ============================================================
print("\n" + "=" * 60)
print("Model Comparison Summary")
print("=" * 60)

# Rank every evaluated model by weighted F1, best first
res_df = (
    pd.DataFrame(results)
    .sort_values('f1_weighted', ascending=False)
    .reset_index(drop=True)
)

# Display copy: each metric column is rendered as text with its own precision.
# res_df keeps the raw numbers for the CSV export.
column_formats = {
    'accuracy': "{:.4f}",
    'f1_weighted': "{:.4f}",
    'precision': "{:.4f}",
    'recall': "{:.4f}",
    'inference_time_sec': "{:.2f}",
    'model_size_mb': "{:.2f}",
    'predictions_per_sec': "{:.0f}",
}
display_df = res_df.copy()
for column, spec in column_formats.items():
    display_df[column] = display_df[column].map(spec.format)

print("\n")
print(display_df.to_string(index=False))

# Save comparison
comparison_path = MODEL_DIR / "model_comparison_enhanced.csv"
res_df.to_csv(comparison_path, index=False)
print(f"\nSaved detailed comparison to {MODEL_DIR / 'model_comparison_enhanced.csv'}")