In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import json

def confusion_counts(y_true, probs, thr):
    y_true = y_true.astype(int)
    y_pred = (probs >= thr).astype(int)
    tn = int(((y_true == 0) & (y_pred == 0)).sum())
    fp = int(((y_true == 0) & (y_pred == 1)).sum())
    fn = int(((y_true == 1) & (y_pred == 0)).sum())
    tp = int(((y_true == 1) & (y_pred == 1)).sum())
    recall = tp / (tp + fn + 1e-12)
    fpr = fp / (fp + tn + 1e-12)
    return {"thr": float(thr), "tn": tn, "fp": fp, "fn": fn, "tp": tp, "recall": float(recall), "fpr": float(fpr)}

# Load CNN val probs
cnn_dir = Path("../artifacts/cnn")
cnn_val_probs = np.load(cnn_dir / "val_probs.npy")
y_val = np.load(cnn_dir / "val_y.npy")

import xgboost as xgb

# --- Recompute XGB val probs from Optuna model (SAFER than loading old .npy) ---
xgb_dir = Path("../artifacts/xgb")
FEAT_PATH = Path("../data/processed/vnat/xgb_features.parquet")

df_feat = pd.read_parquet(FEAT_PATH)

# Load the exact feature columns used in training
feature_cols = json.loads(Path("../artifacts/features/feature_columns.json").read_text(encoding="utf-8"))

# Align with CNN val ordering using flow_id from CNN artifacts
cnn_val_flow_ids = np.load(cnn_dir / "val_flow_ids.npy").astype(int)

# Select val rows in exactly the same order as CNN
val_df = df_feat.set_index("flow_id").loc[cnn_val_flow_ids].reset_index()

# y_val must match CNN y_val (sanity check below)
xgb_X_val = val_df[feature_cols].to_numpy(dtype=np.float32)

booster = xgb.Booster()
booster.load_model(str(xgb_dir / "xgb_model_optuna.json"))

dval = xgb.DMatrix(xgb_X_val, feature_names=feature_cols)
xgb_val_probs = booster.predict(dval)

# Sanity check alignment
y_val_from_feats = val_df["label"].to_numpy(dtype=np.int64)
if not np.array_equal(y_val, y_val_from_feats):
    raise ValueError("y_val mismatch between CNN artifacts and xgb_features.parquet (alignment problem).")


print("Loaded probs:", cnn_val_probs.shape, xgb_val_probs.shape, y_val.shape)

KeyError: 'label'

In [None]:
max_fpr = 0.01

thresholds = np.linspace(0.0, 1.0, 1001)
best = None

for thr in thresholds:
    m = confusion_counts(y_val, cnn_val_probs, thr)
    if m["fpr"] <= max_fpr:
        # maximize recall under constraint
        if best is None or m["recall"] > best["recall"]:
            best = m

print("Best CNN threshold under FPR<=1%:", best)

Best CNN threshold under FPR<=1%: {'thr': 0.9530000000000001, 'tn': 421, 'fp': 4, 'fn': 136, 'tp': 1, 'recall': 0.007299270072992648, 'fpr': 0.00941176470588233}


In [None]:
def ensemble_probs(p_xgb, p_cnn, w):
    return w * p_xgb + (1.0 - w) * p_cnn

rows = []
for w in np.arange(0.0, 1.0001, 0.05):
    p_final = ensemble_probs(xgb_val_probs, cnn_val_probs, w)
    best_w = None
    for thr in thresholds:
        m = confusion_counts(y_val, p_final, thr)
        if m["fpr"] <= max_fpr:
            if best_w is None or m["recall"] > best_w["recall"]:
                best_w = m
    if best_w is not None:
        best_w["w"] = float(w)
        rows.append(best_w)

df = pd.DataFrame(rows).sort_values(["recall", "fpr"], ascending=[False, True])
df.head(10)

Unnamed: 0,thr,tn,fp,fn,tp,recall,fpr,w
7,0.621,421,4,0,137,1.0,0.009412,0.35
8,0.573,421,4,0,137,1.0,0.009412,0.4
9,0.525,421,4,0,137,1.0,0.009412,0.45
15,0.255,422,3,1,136,0.992701,0.007059,0.75
6,0.669,421,4,1,136,0.992701,0.009412,0.3
10,0.484,421,4,1,136,0.992701,0.009412,0.5
11,0.436,421,4,1,136,0.992701,0.009412,0.55
12,0.389,421,4,1,136,0.992701,0.009412,0.6
13,0.346,421,4,1,136,0.992701,0.009412,0.65
14,0.302,421,4,1,136,0.992701,0.009412,0.7


In [None]:
best_row = df.iloc[0].to_dict()
print("Selected:", best_row)

ens_dir = Path("../artifacts/ensemble")
ens_dir.mkdir(parents=True, exist_ok=True)

(Path("../artifacts/ensemble/selected_w.json")).write_text(json.dumps({"w": best_row["w"]}, indent=2), encoding="utf-8")
(Path("../artifacts/ensemble/selected_T.json")).write_text(
    json.dumps({"T": float(best_row["thr"])}, indent=2),
    encoding="utf-8"
)

df.to_csv(ens_dir / "val_sweep.csv", index=False)
print("Saved ensemble artifacts to:", ens_dir)

Selected: {'thr': 0.621, 'tn': 421.0, 'fp': 4.0, 'fn': 0.0, 'tp': 137.0, 'recall': 0.9999999999999928, 'fpr': 0.00941176470588233, 'w': 0.35000000000000003}
Saved ensemble artifacts to: ..\artifacts\ensemble
