# Phase 7 — Model Comparison Dashboard

Load all trained models and evaluate them on a common test set, then produce a performance table and bar charts.

In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical

# Load test data
# NOTE(review): assumes test.csv holds only feature columns and test_labels.csv a
# single column of integer class ids starting at 0 — confirm against the data-prep phase.
X_test = pd.read_csv("../data/test.csv").values
# `read_csv(squeeze=True)` was removed in pandas 2.0; squeezing the column axis of
# the loaded frame is the supported replacement and yields the same 1-D array.
y_test = pd.read_csv("../data/test_labels.csv").squeeze("columns").values

models_dir = Path("../trained_models")


def _load_first_existing(*candidates):
    """Return the joblib-loaded object from the first existing path, or None if none exist."""
    for candidate in candidates:
        if candidate.exists():
            return joblib.load(candidate)
    return None


def _score_row(label, y_true, y_pred):
    """Build one comparison-table row (weighted F1 and accuracy) for a model."""
    return {
        'model': label,
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'accuracy': accuracy_score(y_true, y_pred),
    }


# Load classical models. Each is None when no saved file is found, so the
# guards below skip it instead of the cell crashing inside joblib.load.
# NOTE(review): when final_rf.pkl is absent the fallback best_baseline.pkl is still
# reported as 'RandomForest' — confirm the baseline really is a random forest.
rf = _load_first_existing(models_dir / "final_rf.pkl", models_dir / "best_baseline.pkl")
xgb = _load_first_existing(models_dir / "final_xgb.pkl")

results = []

# Evaluate RF
if rf is not None:
    y_pred_rf = rf.predict(X_test)
    results.append(_score_row('RandomForest', y_test, y_pred_rf))
else:
    print("No RandomForest / baseline model found; skipping")

# Evaluate XGB
if xgb is not None:
    y_pred_xgb = xgb.predict(X_test)
    results.append(_score_row('XGBoost', y_test, y_pred_xgb))

# Evaluate DL models
num_classes = len(np.unique(y_test))
# One-hot labels are not used by the metrics below; kept for any later cell that needs them.
y_test_cat = to_categorical(y_test, num_classes=num_classes)

for name in ['final_ffnn.h5', 'final_cnn.h5', 'final_lstm.h5', 'final_autoencoder.h5']:
    model_path = models_dir / name
    if not model_path.exists():
        continue

    print("Loading", name)
    model = load_model(model_path)

    if 'autoencoder' in name:
        # Reconstruction error per sample, thresholded at its own 95th percentile.
        # The threshold is derived from the test set itself, so roughly 5% of the
        # samples are flagged by construction. These binary anomaly flags are not
        # comparable to the multi-class labels and are NOT added to the results table.
        recon = model.predict(X_test)
        mse = np.mean(np.square(recon - X_test), axis=1)
        threshold = np.percentile(mse, 95)
        preds = (mse > threshold).astype(int)  # 1 = anomaly
        print("Autoencoder threshold (95th pct):", threshold)
        continue

    if 'cnn' in name:
        # Append a trailing channel axis for the CNN.
        X_in = np.expand_dims(X_test, -1)
    else:
        # NOTE(review): the LSTM is fed the 2-D matrix unchanged; if the saved model
        # expects 3-D (samples, timesteps, features) input, predict() will fail — confirm.
        X_in = X_test

    y_prob = model.predict(X_in)
    y_pred = np.argmax(y_prob, axis=1)
    results.append(_score_row(name.replace('.h5', ''), y_test, y_pred))

# Fixing the columns up front keeps sort_values from raising KeyError when no
# model was found and `results` is empty.
res_df = (
    pd.DataFrame(results, columns=['model', 'f1', 'accuracy'])
    .sort_values('f1', ascending=False)
    .reset_index(drop=True)
)
print(res_df)

# Save comparison
res_df.to_csv("../trained_models/model_comparison.csv", index=False)
print("Saved comparison to ../trained_models/model_comparison.csv")
