# Model Evaluation and Comparison

This notebook provides a comprehensive evaluation of all trained models:
- Baseline models (Logistic Regression, Random Forest, Isolation Forest, LOF)
- Sequential models (LSTM, TCN, Autoencoder)

It includes:
- ROC curve comparison
- Score distributions
- Comprehensive metrics table

In [None]:
import sys
from pathlib import Path

# add project root to path
# Path().resolve() is the absolute current working directory; its parent is
# used as the project root.
# NOTE(review): this presumably expects the notebook to be launched from a
# direct subdirectory of the repository (e.g. notebooks/) — confirm.
project_root = Path().resolve().parent
# Prepend (index 0) so the `src.*` imports below are looked up here first.
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader

# set plotting style
# Try the versioned seaborn dark-grid style name first, then the unversioned
# one; an OSError from plt.style.use means that style name is unavailable.
for style_name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid"):
    try:
        plt.style.use(style_name)
    except OSError:
        continue
    break
else:
    # Neither seaborn style could be applied: fall back to the default style.
    plt.style.use("default")
sns.set_palette("husl")
plt.rcParams["figure.figsize"] = (12, 6)

# import project modules
from src.data.preprocess import load_and_preprocess
from src.data.sequence_preparation import prepare_sequences_for_training
from src.data.dataset import create_dataloaders_from_dict
from src.features.temporal_features import extract_temporal_features
from src.models.baselines import load_baseline_model
from src.models.lstm import LSTMModel
from src.models.tcn import TCNModel
from src.models.autoencoder import AutoencoderModel
from src.training.evaluate import (
    evaluate_sequential_model,
    compare_all_models,
    plot_roc_curve,
    plot_score_distributions,
    compute_metrics,
)
from src.utils.config import load_config

## 1. Load Data and Models

Load test data and all trained models for evaluation.

In [None]:
# load config
config = load_config(project_root.joinpath("config", "config.yaml"))

# load data
# The raw parquet file is passed through the project's preprocessing routine;
# normalization is switched off here and requested later for the sequences.
data_path = project_root.joinpath("data", "raw", "engagement_timeseries.parquet")
df = load_and_preprocess(
    file_path=str(data_path),
    target_timezone="UTC",
    resample_frequency="h",
    handle_missing=True,
    missing_method="forward",
    normalize=False,
)

# prepare features for baseline models
features_df = extract_temporal_features(df, aggregate_per_id=True)

# prepare sequences for sequential models
# The sequence length comes from the config.
sequences_dict = prepare_sequences_for_training(
    df,
    seq_len=config["data"]["seq_len"],
    normalize=True,
)

# pick the compute device: GPU when CUDA is available, otherwise CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# create test dataloader
# A 70/15/15 train/val/test split with a fixed seed; only the test split is
# kept for evaluation in this notebook.
dataloaders = create_dataloaders_from_dict(
    sequences_dict,
    batch_size=config["training"]["batch_size"],
    train_ratio=0.7,
    val_ratio=0.15,
    test_ratio=0.15,
    random_seed=42,
)
test_loader = dataloaders["test"]

print("Data prepared")

In [None]:
# load baseline models
# Each entry of baseline_results maps a model name to
# (model, X_test, y_test, y_pred, y_proba) for use by the later cells.
baseline_results = {}
baseline_dir = project_root / "models" / "baselines"

# The split depends only on features_df and fixed arguments (test_size=0.2,
# random_state=42), so it is computed once on first use and shared by every
# baseline instead of being recomputed per model.
baseline_split = None

# NOTE(review): the notebook introduction also lists LOF as a baseline, but no
# LOF model is loaded here — confirm whether a saved LOF model should be added.
for model_type in ["logistic_regression", "random_forest", "isolation_forest"]:
    model_path = baseline_dir / f"{model_type}.pkl"
    if not model_path.exists():
        # Report a missing artifact instead of skipping it silently.
        print(f"Skipping {model_type}: {model_path} not found")
        continue
    try:
        model = load_baseline_model(str(model_path))
        if baseline_split is None:
            # Imported lazily inside the try so that an import or split failure
            # is reported per model, exactly like any other load error.
            from src.training.train import prepare_data
            baseline_split = prepare_data(features_df, test_size=0.2, random_state=42)
        X_train, X_test, y_train, y_test, _ = baseline_split
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)
        baseline_results[model_type] = (model, X_test, y_test, y_pred, y_proba)
        print(f"Loaded {model_type}")
    except Exception as e:
        # Best effort: one broken model must not stop the others from loading.
        print(f"Error loading {model_type}: {e}")

In [None]:
# load sequential models
# Each entry of sequential_results maps a model name to
# (model, test_loader, device, model_type) for use by the later cells.
sequential_results = {}
sequential_dir = project_root / "models" / "sequential"

# One zero-argument builder per architecture. Construction is deferred so the
# config lookup and constructor call happen inside the try block below, after
# the checkpoint has been read.
model_builders = {
    "lstm": lambda: LSTMModel(**config["models"]["lstm"]),
    "tcn": lambda: TCNModel(**config["models"]["tcn"]),
    "autoencoder": lambda: AutoencoderModel(
        **config["models"]["autoencoder"], seq_len=config["data"]["seq_len"]
    ),
}

for model_type, build_model in model_builders.items():
    model_path = sequential_dir / f"{model_type}_best.pth"
    if not model_path.exists():
        continue
    try:
        # NOTE(security): weights_only=False allows full unpickling of the
        # checkpoint; only load files from a trusted source.
        checkpoint = torch.load(str(model_path), map_location=device, weights_only=False)
        model = build_model()
        model.load_state_dict(checkpoint["model_state_dict"])
        model.to(device)
        model.eval()
        sequential_results[model_type] = (model, test_loader, device, model_type)
        print(f"Loaded {model_type}")
    except Exception as e:
        # Best effort: report the failure and continue with the next model.
        print(f"Error loading {model_type}: {e}")

## 2. ROC Curves

Plot ROC curves for all models.

In [None]:
# A single shared axis so every model's ROC curve is drawn on the same plot.
fig, ax = plt.subplots(figsize=(10, 8))

# baseline models: labels and scores were stored when the models were loaded
for model_name, baseline_entry in baseline_results.items():
    model, X_test, y_test, y_pred, y_proba = baseline_entry
    plot_roc_curve(y_test, y_proba, model_name=model_name, ax=ax)

# sequential models: labels and scores are produced by running the evaluator
for model_name, sequential_entry in sequential_results.items():
    model, dataloader, device, model_type = sequential_entry
    y_true, y_pred, y_proba = evaluate_sequential_model(model, dataloader, device, model_type)
    plot_roc_curve(y_true, y_proba, model_name=model_name, ax=ax)

ax.set_title("ROC Curves - All Models", fontsize=16, fontweight="bold")
ax.legend(loc="lower right")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Score Distributions

Visualize the score distributions of the normal and fake classes.

In [None]:
def _positive_class_scores(y_proba):
    """Return a 1-D array of positive-class scores.

    A 2-D input with more than one column yields column 1; a 2-D input with a
    single column is flattened; a 1-D input is returned as an array unchanged.
    """
    scores = np.asarray(y_proba)
    if scores.ndim > 1:
        return scores[:, 1] if scores.shape[1] > 1 else scores.flatten()
    return scores


# collect predictions
# all_predictions maps a model name to (true labels, positive-class scores).
all_predictions = {}

for model_name, (model, X_test, y_test, y_pred, y_proba) in baseline_results.items():
    all_predictions[model_name] = (y_test, _positive_class_scores(y_proba))

for model_name, (model, dataloader, device, model_type) in sequential_results.items():
    y_true, y_pred, y_proba = evaluate_sequential_model(model, dataloader, device, model_type)
    all_predictions[model_name] = (y_true, _positive_class_scores(y_proba))

# plot distributions
n_models = len(all_predictions)
if n_models == 0:
    # plt.subplots cannot create a grid with zero rows, so skip plotting.
    print("No model predictions available; skipping score distribution plots.")
else:
    n_rows = (n_models + 1) // 2
    # squeeze=False always returns a 2-D array of axes, so flatten() works for
    # any model count, including a single model (one row of two axes).
    fig, axes = plt.subplots(n_rows, 2, figsize=(16, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, (model_name, (y_true, y_proba)) in zip(axes, all_predictions.items()):
        # Plain 1-D label array so the boolean masks index scores by position.
        labels = np.asarray(y_true).ravel()
        normal_scores = y_proba[labels == 0]
        fake_scores = y_proba[labels == 1]
        ax.hist(normal_scores, bins=50, alpha=0.6, label="Normal", color="blue", density=True, histtype="step", linewidth=2)
        ax.hist(fake_scores, bins=50, alpha=0.6, label="Fake", color="red", density=True, histtype="step", linewidth=2, linestyle="--")
        ax.set_xlabel("Prediction Score", fontsize=12)
        ax.set_ylabel("Density", fontsize=12)
        ax.set_title(f"Score Distribution - {model_name.upper()}", fontsize=14, fontweight="bold")
        ax.legend()
        ax.grid(True, alpha=0.3)
        # Reference line at a score of 0.5.
        ax.axvline(x=0.5, color="gray", linestyle=":", linewidth=1)

    # Hide the unused trailing axis when the number of models is odd.
    for unused_ax in axes[n_models:]:
        unused_ax.axis("off")

    plt.tight_layout()
    plt.show()

## 4. Comprehensive Metrics Table

Create a comparison table of all models.

In [None]:
# compute metrics
# all_metrics maps a model name to the dict returned by compute_metrics, with
# an added "model_type" tag ("baseline" or "sequential").
all_metrics = {}

for model_name, (model, X_test, y_test, y_pred, y_proba) in baseline_results.items():
    metrics = compute_metrics(y_test, y_pred, y_proba)
    metrics["model_type"] = "baseline"
    all_metrics[model_name] = metrics

for model_name, (model, dataloader, device, model_type) in sequential_results.items():
    y_true, y_pred, y_proba = evaluate_sequential_model(model, dataloader, device, model_type)
    metrics = compute_metrics(y_true, y_pred, y_proba)
    metrics["model_type"] = "sequential"
    all_metrics[model_name] = metrics

display_metrics = ["auc", "precision", "recall", "f1", "false_positive_rate"]

if not all_metrics:
    # Without any loaded model the table has no columns to select from.
    print("No trained models were loaded; nothing to compare.")
else:
    # create DataFrame (one row per model after the transpose)
    metrics_df = pd.DataFrame(all_metrics).T
    # The transpose leaves every column with object dtype because each model's
    # column mixed numbers with the "model_type" string. Convert the displayed
    # metrics to numeric so that rounding takes effect and sorting tolerates
    # missing values (coerced to NaN, which sort_values places last).
    metrics_display = metrics_df[display_metrics].apply(pd.to_numeric, errors="coerce")
    metrics_display["model_type"] = metrics_df["model_type"]
    metrics_display = metrics_display.sort_values("auc", ascending=False)

    print("=" * 80)
    print("COMPREHENSIVE MODEL COMPARISON")
    print("=" * 80)
    print("\nMetrics Table:")
    print(metrics_display.round(4))

    # best model: first row after sorting by AUC in descending order
    best_model_name = metrics_display.index[0]
    best_metrics = metrics_display.loc[best_model_name]

    print("\n" + "=" * 80)
    print(f"BEST MODEL: {best_model_name.upper()}")
    print("=" * 80)
    print(f"  AUC: {best_metrics['auc']:.4f}")
    print(f"  Precision: {best_metrics['precision']:.4f}")
    print(f"  Recall: {best_metrics['recall']:.4f}")
    print(f"  F1-Score: {best_metrics['f1']:.4f}")