# Model Training, Evaluation and Comparison

This notebook trains and compares baseline models (Logistic Regression, Random Forest, Isolation Forest, LOF) with sequential deep learning models (LSTM, TCN, Autoencoder) for fake engagement detection.


In [None]:
import sys
from pathlib import Path

# add project root to path
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# set plotting style: try the two seaborn dark-grid style names in order and
# fall back to matplotlib's 'default' style if neither can be loaded
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    try:
        plt.style.use(style_name)
    except OSError:
        # this style name is not available; try the next candidate
        continue
    break
else:
    plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)

# import project modules
from src.data.preprocess import load_and_preprocess
from src.data.sequence_preparation import prepare_sequences_for_training
from src.data.dataset import create_dataloaders_from_dict
from src.features.temporal_features import extract_temporal_features
from src.training.train import (
    train_multiple_baselines,
    train_model_from_config,
)
from src.training.evaluate import (
    compare_models,
    compare_all_models,
    evaluate_sequential_model,
    compute_metrics,
)
from src.utils.config import load_config, update_config_with_data

# seed the PyTorch and NumPy global random number generators with one fixed value
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)


## 1. Load Configuration and Data

Load configuration and prepare data for both baseline and sequential models.


In [None]:
# load configuration
config = load_config()
print("Configuration loaded successfully")

# read the engagement time series file and pass it through load_and_preprocess
data_path = project_root / "data" / "raw" / "engagement_timeseries.parquet"
preprocess_options = dict(
    target_timezone="UTC",
    resample_frequency="h",
    handle_missing=True,
    missing_method="forward",
    normalize=False,
)
df = load_and_preprocess(file_path=str(data_path), **preprocess_options)

# quick summary of what was loaded
print(f"\nTime series data shape: {df.shape}")
print(f"Number of videos: {df['id'].nunique()}")
print("Label distribution:")
print(df['label'].value_counts())


## 2. Train Baseline Models

Train baseline models on temporal features.


In [None]:
# extract temporal features (aggregated per id) from the time series frame
print("Extracting temporal features...")
feature_options = {
    "id_column": "id",
    "timestamp_column": "timestamp",
    "window_sizes": [6, 12, 24],
    "autocorr_lags": [1, 6, 12, 24],
    "aggregate_per_id": True,
}
features_df = extract_temporal_features(df, **feature_options)

print(f"Features extracted: {features_df.shape}")

# train every baseline model type on the extracted features
baseline_model_types = ['logistic_regression', 'random_forest', 'isolation_forest', 'lof']
baseline_save_dir = project_root / "models" / "baselines"
baseline_results = train_multiple_baselines(
    features_df,
    model_types=baseline_model_types,
    test_size=0.2,
    random_state=42,
    save_dir=str(baseline_save_dir),
)

print(f"\nBaseline models trained: {len(baseline_results)}")


## 3. Prepare Sequences for Sequential Models

Prepare time series sequences for LSTM, TCN, and Autoencoder.


In [None]:
# prepare sequences: every option is read from the "data" section of the
# config, with the fallback value listed here used when the key is absent
data_config = config.get("data", {})
seq_len = data_config.get("seq_len", 48)

sequence_defaults = {
    "stride": 1,
    "normalize": True,
    "normalization_method": "standardize",
    "normalize_per_series": False,
    "test_size": 0.2,
    "val_size": 0.1,
}
sequence_options = {
    option: data_config.get(option, fallback)
    for option, fallback in sequence_defaults.items()
}

print("Preparing sequences for sequential models...")
sequence_data = prepare_sequences_for_training(
    df,
    seq_len=seq_len,
    random_state=config.get("training", {}).get("random_seed", 42),
    **sequence_options,
)

print("Sequences prepared:")
for split_label, split_key in (("Train", "X_train"), ("Val", "X_val"), ("Test", "X_test")):
    print(f"  {split_label}: {sequence_data[split_key].shape}")

# update config with data dimensions (one input channel per feature name)
input_size = len(sequence_data['feature_names'])
config = update_config_with_data(config, input_size=input_size, seq_len=seq_len)

# create dataloaders, again driven by the "data" config section
loader_options = {
    "batch_size": data_config.get("batch_size", 32),
    "shuffle_train": True,
    "num_workers": data_config.get("num_workers", 0),
    "pin_memory": data_config.get("pin_memory", False),
}
dataloaders = create_dataloaders_from_dict(sequence_data, **loader_options)

print(f"\nDataLoaders created: {list(dataloaders.keys())}")


## 4. Train Sequential Models

Train LSTM, TCN, and Autoencoder models with early stopping.


In [None]:
# setup device: use CUDA when PyTorch reports it as available, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# train sequential models
sequential_models = ['lstm', 'tcn', 'autoencoder']
sequential_results = {}
training_histories = {}

sequential_save_dir = str(project_root / "models" / "sequential")
banner = '=' * 60

for model_type in sequential_models:
    model_label = model_type.upper()
    print(f"\n{banner}")
    print(f"Training {model_label}")
    print(banner)

    model, history = train_model_from_config(
        model_type=model_type,
        dataloaders=dataloaders,
        config=config,
        device=device,
        save_dir=sequential_save_dir,
    )

    # keep everything the evaluation cells need to score this model later:
    # (model, test loader, device, model type)
    sequential_results[model_type] = (model, dataloaders['test'], device, model_type)
    training_histories[model_type] = history

    print(f"{model_label} training completed")

print(f"\nSequential models trained: {len(sequential_results)}")


## 5. Training Curves Visualization

Visualize training and validation curves for sequential models.


In [None]:
# plot training curves: one panel per sequential model, two panels per row.
# The grid is sized from the number of histories instead of a fixed 2x2, so
# the cell neither overflows with more than four models nor leaves visible
# blank panels with fewer.
n_models = len(training_histories)
n_cols = 2
n_rows = max(1, (n_models + n_cols - 1) // n_cols)  # ceil division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)
panel_axes = axes.ravel()

for ax, (model_type, history) in zip(panel_axes, training_histories.items()):
    epochs = range(1, len(history['train_loss']) + 1)
    ax.plot(epochs, history['train_loss'], label='Train Loss', linewidth=2)
    ax.plot(epochs, history['val_loss'], label='Val Loss', linewidth=2)

    # accuracy goes on a secondary y-axis, and only when the history recorded
    # at least one non-zero value; .get() tolerates histories that carry no
    # accuracy entries at all
    train_acc = history.get('train_accuracy')
    val_acc = history.get('val_accuracy')
    if train_acc is not None and any(train_acc):
        ax2 = ax.twinx()
        ax2.plot(epochs, train_acc, label='Train Acc',
                 linewidth=2, linestyle='--', color='green')
        if val_acc is not None:
            ax2.plot(epochs, val_acc, label='Val Acc',
                     linewidth=2, linestyle='--', color='orange')
        ax2.set_ylabel('Accuracy', fontsize=10)
        ax2.legend(loc='upper right')

    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    ax.set_title(f'{model_type.upper()} Training Curves', fontsize=14, fontweight='bold')
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)

# hide every panel that did not receive a model
for unused_ax in panel_axes[n_models:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()


## 6. Evaluate Sequential Models

Evaluate sequential models on test set.


In [None]:
# evaluate sequential models: score each trained model on its stored test
# loader and collect the resulting metric dicts keyed by model type
sequential_metrics = {}
reported_metrics = (
    ("AUC", "auc"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1", "f1"),
)

for model_type, result in sequential_results.items():
    model, test_loader, device, _ = result
    print(f"\nEvaluating {model_type.upper()}...")
    y_true, y_pred, y_proba = evaluate_sequential_model(model, test_loader, device, model_type)
    metrics = compute_metrics(y_true, y_pred, y_proba)
    sequential_metrics[model_type] = metrics

    for display_name, metric_key in reported_metrics:
        print(f"  {display_name}: {metrics[metric_key]:.4f}")

# one row per model, one column per metric
sequential_metrics_df = pd.DataFrame(sequential_metrics).T
summary_columns = ['auc', 'precision', 'recall', 'f1', 'false_positive_rate']
print("\nSequential Models Metrics:")
print(sequential_metrics_df[summary_columns])


## 7. LSTM vs TCN Comparison

Compare LSTM and TCN models performance and training characteristics.


In [None]:
# compare LSTM vs TCN
#
# Builds a 2x2 figure (loss curves, accuracy curves, metric bars, ROC curves)
# and prints the headline metrics of both models.
#
# The ROC panel is drawn with sklearn.metrics.roc_curve: the previously used
# plot_roc_curve helper is not imported or defined anywhere in this notebook,
# so the cell failed with a NameError after the models had been evaluated.
# scikit-learn is imported locally because the notebook's shared import cell
# does not bring it in.
from sklearn.metrics import roc_curve

# (results key, display label, plot colour) for the two models being compared
compared_models = [('lstm', 'LSTM', 'blue'), ('tcn', 'TCN', 'red')]

if 'lstm' in sequential_results and 'tcn' in sequential_results:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # top row: loss (left) and accuracy (right) curves for both models.
    # Accuracy curves are skipped when every recorded value is zero/falsy.
    curve_panels = [
        (axes[0, 0], 'loss', 'Loss', False),
        (axes[0, 1], 'accuracy', 'Accuracy', True),
    ]
    for ax, curve, axis_label, skip_if_all_zero in curve_panels:
        for key, label, color in compared_models:
            history = training_histories.get(key) or {}
            train_curve = history.get(f'train_{curve}')
            if train_curve is None:
                continue
            if skip_if_all_zero and not any(train_curve):
                continue
            epochs = range(1, len(train_curve) + 1)
            ax.plot(epochs, train_curve, label=f'{label} Train', linewidth=2, color=color)
            val_curve = history.get(f'val_{curve}')
            if val_curve is not None:
                ax.plot(epochs, val_curve, label=f'{label} Val', linewidth=2,
                        color=color, linestyle='--')
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel(axis_label, fontsize=12)
        ax.set_title(f'{axis_label} Comparison: LSTM vs TCN', fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # evaluate both models on their stored test loaders
    evaluations = {}
    for key, label, color in compared_models:
        cmp_model, cmp_loader, cmp_device, _ = sequential_results[key]
        cmp_y_true, cmp_y_pred, cmp_y_proba = evaluate_sequential_model(
            cmp_model, cmp_loader, cmp_device, key
        )
        evaluations[key] = {
            'y_true': np.asarray(cmp_y_true).ravel(),
            'y_proba': np.asarray(cmp_y_proba),
            'metrics': compute_metrics(cmp_y_true, cmp_y_pred, cmp_y_proba),
        }

    # metrics bar chart (bottom-left): side-by-side bars per metric
    ax = axes[1, 0]
    metric_keys = ['auc', 'precision', 'recall', 'f1']
    metrics_names = ['AUC', 'Precision', 'Recall', 'F1']
    x = np.arange(len(metrics_names))
    width = 0.35
    for (key, label, color), offset in zip(compared_models, (-width / 2, width / 2)):
        values = [evaluations[key]['metrics'][metric_key] for metric_key in metric_keys]
        ax.bar(x + offset, values, width, label=label, color=color, alpha=0.7)
    ax.set_xlabel('Metric', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Performance Metrics: LSTM vs TCN', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics_names)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # ROC curves comparison (bottom-right)
    ax = axes[1, 1]
    for key, label, color in compared_models:
        scores = evaluations[key]['y_proba']
        # reduce a 2-D score array to the positive-class column, using the
        # same rule as the error-distribution plot later in the notebook
        if scores.ndim > 1:
            scores = scores[:, 1] if scores.shape[1] > 1 else scores.ravel()
        fpr, tpr, _ = roc_curve(evaluations[key]['y_true'], scores)
        auc_value = evaluations[key]['metrics']['auc']
        ax.plot(fpr, tpr, linewidth=2, color=color, label=f'{label} (AUC = {auc_value:.3f})')
    ax.plot([0, 1], [0, 1], linestyle=':', color='gray', label='Chance')
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves: LSTM vs TCN', fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # print comparison
    print("=" * 60)
    print("LSTM vs TCN Comparison")
    print("=" * 60)
    for key, label, color in compared_models:
        model_metrics = evaluations[key]['metrics']
        print(f"\n{label} Metrics:")
        print(f"  AUC: {model_metrics['auc']:.4f}")
        print(f"  Precision: {model_metrics['precision']:.4f}")
        print(f"  Recall: {model_metrics['recall']:.4f}")
        print(f"  F1: {model_metrics['f1']:.4f}")
    print("=" * 60)
else:
    print("Both LSTM and TCN models are required for comparison. Train them first.")


## 8. Evaluate Sequential Models

Re-run the test-set evaluation of the sequential models (this repeats Section 6) and rebuild the metrics summary table.


In [None]:
# evaluate sequential models
# NOTE(review): this cell performs the same evaluation as the earlier
# "Evaluate Sequential Models" cell and rebuilds sequential_metrics and
# sequential_metrics_df from scratch - consider removing one of the two.
sequential_metrics = {}
metric_lines = [
    ("AUC", "auc"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1", "f1"),
]

for model_type in sequential_results:
    model, test_loader, device, _ = sequential_results[model_type]
    print(f"\nEvaluating {model_type.upper()}...")
    y_true, y_pred, y_proba = evaluate_sequential_model(model, test_loader, device, model_type)
    metrics = compute_metrics(y_true, y_pred, y_proba)
    sequential_metrics[model_type] = metrics

    for line_label, metric_key in metric_lines:
        print(f"  {line_label}: {metrics[metric_key]:.4f}")

# transpose so that each model is a row and each metric a column
sequential_metrics_df = pd.DataFrame(sequential_metrics).T
table_columns = ['auc', 'precision', 'recall', 'f1', 'false_positive_rate']
print("\nSequential Models Metrics:")
print(sequential_metrics_df[table_columns])


## 9. Compare Baseline Models

Compare baseline models performance.


In [None]:
# baseline-only comparison: pass the trained baseline results to
# compare_models with plotting enabled and keep what it returns
baseline_metrics_df = compare_models(
    baseline_results,
    plot=True,
)


## 10. Comprehensive Model Comparison

Compare all models (baselines vs sequential) to identify the best model.


In [None]:
# comprehensive comparison of baseline and sequential models (with plots)
all_metrics_df = compare_all_models(baseline_results, sequential_results, plot=True)

# save results
results_path = project_root / "models" / "model_comparison_results.csv"
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists instead of relying on an earlier cell having made it
results_path.parent.mkdir(parents=True, exist_ok=True)
all_metrics_df.to_csv(results_path)
print(f"\nResults saved to: {results_path}")


## 11. Error Analysis

Analyze misclassifications and identify patterns in errors.


In [None]:
# error analysis for the model with the highest AUC in all_metrics_df:
# confusion matrix, score distribution of the misclassified samples, and
# summary error counts
from sklearn.metrics import confusion_matrix

# get best model predictions for error analysis
best_model_name = all_metrics_df['auc'].idxmax()
print(f"Analyzing errors for best model: {best_model_name}")

if best_model_name in sequential_results:
    # sequential model: predictions are recomputed on the stored test loader
    model, test_loader, device, model_type = sequential_results[best_model_name]
    y_true, y_pred, y_proba = evaluate_sequential_model(model, test_loader, device, model_type)
else:
    # baseline model: predictions were stored alongside the model at training time
    model, X_test, y_test, y_pred, y_proba = baseline_results[best_model_name]
    y_true = y_test

# work on flat numpy arrays so that the element-wise comparisons and boolean
# masks below behave the same for lists, pandas Series and arrays
y_true = np.asarray(y_true).ravel()
y_pred = np.asarray(y_pred).ravel()
y_proba = np.asarray(y_proba)

# confusion matrix; labels are pinned so the result is always 2x2 and lines up
# with the Normal/Fake tick labels even if one class is absent
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# confusion matrix heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=['Normal', 'Fake'], yticklabels=['Normal', 'Fake'])
axes[0].set_xlabel('Predicted', fontsize=12)
axes[0].set_ylabel('True', fontsize=12)
axes[0].set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')

# error distribution
errors = y_true != y_pred
if y_proba.ndim > 1:
    # two-column output: take the second column as the positive-class score
    y_proba_positive = y_proba[:, 1] if y_proba.shape[1] > 1 else y_proba.flatten()
else:
    y_proba_positive = y_proba

false_positive_mask = errors & (y_true == 0)
false_negative_mask = errors & (y_true == 1)

error_groups = (
    (false_positive_mask, 'False Positives', 'red'),
    (false_negative_mask, 'False Negatives', 'orange'),
)
for group_mask, group_label, group_color in error_groups:
    group_scores = y_proba_positive[group_mask]
    # a density-normalised histogram is undefined for an empty sample
    if group_scores.size:
        axes[1].hist(group_scores, bins=20,
                     alpha=0.6, label=group_label, color=group_color, density=True)
axes[1].set_xlabel('Prediction Score', fontsize=12)
axes[1].set_ylabel('Density', fontsize=12)
axes[1].set_title('Error Distribution', fontsize=14, fontweight='bold')
if axes[1].get_legend_handles_labels()[0]:
    axes[1].legend()
else:
    # nothing was drawn: say so instead of showing an empty panel
    axes[1].text(0.5, 0.5, 'No misclassifications', ha='center', va='center',
                 transform=axes[1].transAxes)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# print error statistics
print(f"\nError Statistics for {best_model_name}:")
print(f"  Total errors: {errors.sum()}")
print(f"  False Positives: {false_positive_mask.sum()}")
print(f"  False Negatives: {false_negative_mask.sum()}")
print(f"  Error rate: {errors.mean():.2%}")


In [None]:
# model selection summary: report the highest-AUC model, the average AUC of
# each model family, and the three best models overall
rule = "=" * 80
print(rule)
print("MODEL SELECTION SUMMARY")
print(rule)

# best model (row of all_metrics_df with the largest AUC)
best_model_name = all_metrics_df['auc'].idxmax()
best_metrics = all_metrics_df.loc[best_model_name]

print(f"\nChampion Model: {best_model_name}")
print(f"  Model Type: {best_metrics['model_type']}")
champion_lines = (
    ("AUC", "auc"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1"),
    ("False Positive Rate", "false_positive_rate"),
)
for line_label, metric_key in champion_lines:
    print(f"  {line_label}: {best_metrics[metric_key]:.4f}")

# comparison baseline vs sequential (mean AUC per model_type value)
is_baseline = all_metrics_df['model_type'] == 'baseline'
is_sequential = all_metrics_df['model_type'] == 'sequential'
baseline_avg_auc = all_metrics_df.loc[is_baseline, 'auc'].mean()
sequential_avg_auc = all_metrics_df.loc[is_sequential, 'auc'].mean()

auc_gain = sequential_avg_auc - baseline_avg_auc
auc_gain_pct = auc_gain / baseline_avg_auc * 100

print("\nAverage Performance:")
print(f"  Baseline models: {baseline_avg_auc:.4f}")
print(f"  Sequential models: {sequential_avg_auc:.4f}")
print(f"  Improvement: {auc_gain:.4f} ({auc_gain_pct:.1f}%)")

# top 3 models
print("\nTop 3 Models (by AUC):")
top_3 = all_metrics_df.nlargest(3, 'auc')
for rank, (model_name, row) in enumerate(top_3.iterrows(), start=1):
    print(f"  {rank}. {model_name}: AUC={row['auc']:.4f}, F1={row['f1']:.4f}")

print(f"\n{rule}")
