# Model Comparison

In [1]:
import json
from pathlib import Path
import pandas as pd

# Resolve the project root: if the kernel was started inside notebooks/, step up one level.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == 'notebooks' else PROJECT_ROOT

artifacts = PROJECT_ROOT / 'artifacts'

# Use the last matching run directory in sorted path order.
# NOTE(review): presumably the run suffix sorts chronologically, making this the newest run - confirm.
models_run = sorted(artifacts.glob('run_timecv_models_*'))
if len(models_run) == 0:
    raise FileNotFoundError('No run_timecv_models_* artifacts found. Run python -m src.timecv_models')
models_run = models_run[-1]

summary = json.loads((models_run / 'metrics_summary.json').read_text(encoding='utf-8'))

# One row per model; every metric is rendered as "mean +/- std" with four decimals.
rows = []
for model, metrics in summary['models'].items():
    row = {'model': model}
    for metric_name in ('precision', 'recall', 'f2', 'pr_auc'):
        stats = metrics[metric_name]
        row[metric_name] = f"{stats['mean']:.4f} +/- {stats['std']:.4f}"
    rows.append(row)

pd.DataFrame(rows)


Unnamed: 0,model,precision,recall,f2,pr_auc
0,LogisticRegression,0.2835 +/- 0.1706,0.6122 +/- 0.3180,0.4781 +/- 0.2405,0.6632 +/- 0.0173
1,RandomForest,0.3815 +/- 0.1536,0.4178 +/- 0.2963,0.3459 +/- 0.1271,0.4806 +/- 0.0317
2,GradientBoosting,0.6325 +/- 0.1319,0.8133 +/- 0.2225,0.7422 +/- 0.1593,0.8592 +/- 0.0511


## Tuning summary (top 10)

In [2]:
# Find the gradient-boosting tuning runs; the last one in sorted path order is used below.
tune_runs = sorted(artifacts.glob('run_timecv_tune_gb_*'))
if len(tune_runs) == 0:
    raise FileNotFoundError('No run_timecv_tune_gb_* artifacts found. Run python -m src.timecv_tune_gb')

latest_tune = tune_runs[-1]

# Columns displayed in the summary table.
cols = [
    'n_estimators',
    'learning_rate',
    'max_depth',
    'min_samples_leaf',
    'subsample',
    'f2_mean',
    'recall_mean',
    'pr_auc_mean',
    'alert_rate_mean',
]

candidates = pd.read_csv(latest_tune / 'metrics_candidates.csv')

# Rank by f2_mean, then recall_mean (both descending), and keep the first ten rows.
top10 = candidates.sort_values(by=['f2_mean', 'recall_mean'], ascending=False).head(10)
top10[cols]


Unnamed: 0,n_estimators,learning_rate,max_depth,min_samples_leaf,subsample,f2_mean,recall_mean,pr_auc_mean,alert_rate_mean
43,100,0.1,2,5,0.85,0.892812,0.935556,0.954563,0.088
36,100,0.1,2,1,0.7,0.876637,0.915556,0.951943,0.086667
39,100,0.1,2,3,0.7,0.858049,0.915556,0.95749,0.094667
40,100,0.1,2,3,0.85,0.853563,0.935556,0.949342,0.104
24,100,0.05,2,5,0.7,0.843892,0.935556,0.951969,0.109333
42,100,0.1,2,5,0.7,0.838068,0.935556,0.953704,0.114667
51,100,0.1,3,5,0.7,0.824485,0.935556,0.901463,0.121333
20,100,0.05,2,1,1.0,0.819146,0.918889,0.934556,0.117333
30,100,0.05,3,3,0.7,0.818771,0.935556,0.894142,0.125333
55,200,0.03,2,1,0.85,0.813515,0.915556,0.945652,0.118667


## Tuned vs baseline GradientBoosting

In [3]:
# Best configuration recorded by the tuning run selected earlier.
best_cfg = json.loads((latest_tune / 'best_config.json').read_text(encoding='utf-8'))

# Baseline figures come from the GradientBoosting entry of the model-comparison summary.
gb_base = summary['models']['GradientBoosting']
base_f2, base_recall = (gb_base[metric]['mean'] for metric in ('f2', 'recall'))

# Improvement of the tuned configuration over the baseline (tuned minus baseline).
delta = {}
delta['f2_delta'] = best_cfg['f2_mean'] - base_f2
delta['recall_delta'] = best_cfg['recall_mean'] - base_recall

{'base_f2': base_f2, 'tuned_f2': best_cfg['f2_mean'], **delta}


{'base_f2': 0.7422466727739623,
 'tuned_f2': 0.8928123538480875,
 'f2_delta': 0.15056568107412527,
 'recall_delta': 0.12222222222222223}

## Operating points

In [4]:
ops = json.loads((latest_tune / 'operating_points.json').read_text(encoding='utf-8'))

# Build one display row per operating point: threshold and metrics use four
# decimals, alerts_per_1000 uses one.
ops_rows = []
for op_name, op in ops.items():
    ops_rows.append({
        'name': op_name,
        'threshold': f"{op['threshold']:.4f}",
        'precision': f"{op['precision']:.4f}",
        'recall': f"{op['recall']:.4f}",
        'f2': f"{op['f2']:.4f}",
        'alerts_per_1000': f"{op['alerts_per_1000']:.1f}",
    })

ops_table = pd.DataFrame(ops_rows)
ops_table


Unnamed: 0,name,threshold,precision,recall,f2,alerts_per_1000
0,f2_opt,0.0255,0.4762,0.9434,0.7886,140.0
1,high_recall,0.0352,0.5102,0.9434,0.8065,130.7
2,alert_budget_5p,0.8509,1.0,0.3962,0.4506,28.0


## Feature importance

In [5]:
# Load the feature-importance table from the tuning run and show its first ten rows.
# NOTE(review): rows are shown in file order; this is the "top 10" only if the CSV
# is already sorted by importance - confirm against the writer.
importance_csv = latest_tune / 'feature_importance.csv'
fi = pd.read_csv(importance_csv)
fi.head(n=10)


Unnamed: 0,feature,importance_mean,importance_std
0,congestion,0.297041,0.027319
1,packet_loss,0.275927,0.022596
2,latency,0.121313,0.007948
3,jitter,0.119927,0.004197
4,throughput,0.100275,0.01229
5,congestion__lag3,0.016599,0.005957
6,throughput__lag2,0.015114,0.00553
7,congestion__lag1,0.011657,0.007891
8,throughput__lag3,0.006877,0.006947
9,congestion__roll6_std,0.005471,0.001267


## Example true/false positives

In [6]:
# Read the tuning run's markdown report and split it into individual lines.
report_path = latest_tune / 'report.md'
report = report_path.read_text(encoding='utf-8')
lines = report.splitlines()

def extract_section(header, report_lines=None):
    """Collect the bullet lines that follow a heading in a markdown report.

    The report is scanned top to bottom. Collection starts after the first
    line that begins with ``header`` and stops at the next line that begins
    with ``'## '``. Within that span only lines beginning with ``'- '`` are
    kept; all other lines are ignored. A later line that again begins with
    ``header`` is skipped rather than treated as the end of the section.

    Args:
        header: Prefix of the heading line that opens the section,
            e.g. ``'## Example true positives'``.
        report_lines: Optional iterable of report lines to scan. When omitted
            (``None``), the notebook-level ``lines`` variable is used, which
            keeps existing one-argument calls working.

    Returns:
        list[str]: The matching bullet lines in document order. Empty when
        the header is not found or the section has no bullet lines.
    """
    if report_lines is None:
        # Backward-compatible fallback to the notebook-global report lines.
        report_lines = lines

    bullets = []
    collecting = False
    for line in report_lines:
        if line.startswith(header):
            collecting = True
            continue
        if not collecting:
            continue
        if line.startswith('## '):
            # Next level-2 heading closes the section.
            break
        if line.startswith('- '):
            bullets.append(line)
    return bullets

# Map each output key to the report heading whose bullet lines it should hold.
section_headers = {
    'true_positives': '## Example true positives',
    'false_positives': '## Example false positives',
}
examples = {key: extract_section(heading) for key, heading in section_headers.items()}

true_pos = examples['true_positives']
false_pos = examples['false_positives']

examples


{'true_positives': ['- timestamp=2024-05-11T22:17:53+00:00, score=0.9988, top_features: congestion: value=110.190, roll3_mean=24.453, delta1=110.100; packet_loss: value=52.500, roll3_mean=8.333, delta1=27.500; throughput: value=0.110, roll3_mean=2.027, delta1=-1.480',
  '- timestamp=2024-05-11T22:17:53+00:00, score=0.9988, top_features: congestion: value=110.190, roll3_mean=24.453, delta1=110.100; packet_loss: value=52.500, roll3_mean=8.333, delta1=27.500; throughput: value=0.110, roll3_mean=2.027, delta1=-1.480',
  '- timestamp=2024-05-11T22:17:53+00:00, score=0.9982, top_features: congestion: value=110.190, roll3_mean=24.453, delta1=110.100; packet_loss: value=52.500, roll3_mean=8.333, delta1=27.500; throughput: value=0.110, roll3_mean=2.027, delta1=-1.480'],
 'false_positives': ['- timestamp=2024-05-11T22:04:52+00:00, score=0.1814, top_features: packet_loss: value=27.500, roll3_mean=7.500, delta1=27.500; congestion: value=63.210, roll3_mean=62.363, delta1=17.040; latency: value=9.86

## Interpretation

- F2 and recall matter most because missed anomalies are costly; F2 is the F-beta score with beta = 2, which weights recall more heavily than precision.
- PR-AUC measures ranking quality under class imbalance.
- Rolling time-CV preserves time order and prevents leakage.
