In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

import xgboost as xgb
import optuna
from sklearn.metrics import confusion_matrix

In [2]:
# paths
XGB_FEATS = Path("../data/processed/vnat/xgb_features.parquet")
FLOWS     = Path("../data/processed/vnat/flows.parquet")
SPLITS    = Path("../data/splits")

# load data
X_df = pd.read_parquet(XGB_FEATS)
flows = pd.read_parquet(FLOWS).set_index("flow_id")

# labels
y = flows.loc[X_df["flow_id"], "label"].to_numpy(dtype=np.int64)

# load capture splits
train_caps = set((SPLITS / "vnat_train_captures.txt").read_text().splitlines())
val_caps   = set((SPLITS / "vnat_val_captures.txt").read_text().splitlines())

is_train = flows.loc[X_df["flow_id"], "capture_id"].isin(train_caps).to_numpy()
is_val   = flows.loc[X_df["flow_id"], "capture_id"].isin(val_caps).to_numpy()

X_train = X_df.loc[is_train].drop(columns=["flow_id"]).to_numpy(dtype=np.float32)
y_train = y[is_train]

X_val = X_df.loc[is_val].drop(columns=["flow_id"]).to_numpy(dtype=np.float32)
y_val = y[is_val]

print("Train:", X_train.shape, "Val:", X_val.shape)
print("Train labels:", dict(zip(*np.unique(y_train, return_counts=True))))
print("Val labels:", dict(zip(*np.unique(y_val, return_counts=True))))

Train: (20240, 66) Val: (562, 66)
Train labels: {np.int64(0): np.int64(20011), np.int64(1): np.int64(229)}
Val labels: {np.int64(0): np.int64(425), np.int64(1): np.int64(137)}


In [3]:
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())

scale_pos_weight = n_neg / n_pos
scale_pos_weight

87.38427947598254

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val, label=y_val)

In [5]:
def recall_under_fpr(y_true, probs, max_fpr=0.01):
    best = {"thr": None, "recall": 0.0, "fpr": 1.0}

    for thr in np.linspace(0.0, 1.0, 501):
        y_pred = (probs >= thr).astype(int)

        tn = np.sum((y_true == 0) & (y_pred == 0))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))
        tp = np.sum((y_true == 1) & (y_pred == 1))

        fpr = fp / (fp + tn + 1e-12)
        recall = tp / (tp + fn + 1e-12)

        if fpr <= max_fpr and recall > best["recall"]:
            best = {"thr": thr, "recall": recall, "fpr": fpr}

    return best

In [6]:
def objective(trial):
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "hist",
        "verbosity": 0,

        # fixed imbalance handling
        "scale_pos_weight": float(scale_pos_weight),

        # tuned params
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 50.0),
        "eta": trial.suggest_float("eta", 0.01, 0.2),
    }

    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=3000,
        evals=[(dval, "val")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # best_iteration is 0-based; include it
    val_probs = booster.predict(dval, iteration_range=(0, int(booster.best_iteration) + 1))

    best = recall_under_fpr(y_val, val_probs, max_fpr=0.01)

    # If nothing satisfies constraint, kill the trial
    if best["thr"] is None:
        return -1.0

    # Objective: maximize recall under FPR<=1%
    score = float(best["recall"])

    trial.set_user_attr("best_thr", float(best["thr"]))
    trial.set_user_attr("best_fpr", float(best["fpr"]))
    trial.set_user_attr("best_recall", float(best["recall"]))
    trial.set_user_attr("best_iteration", int(booster.best_iteration))

    return score

In [7]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=60, show_progress_bar=True)

print("Best score:", study.best_value)
print("Best params:", study.best_params)
print("Best attrs:", study.best_trial.user_attrs)

study.best_params

[32m[I 2026-02-01 18:56:41,492][0m A new study created in memory with name: no-name-5b0f5150-bc5b-4ae3-9389-c51f471e15c3[0m


  0%|          | 0/60 [00:00<?, ?it/s]

[32m[I 2026-02-01 18:56:44,751][0m Trial 0 finished with value: 0.9927007299270001 and parameters: {'max_depth': 4, 'min_child_weight': 19, 'gamma': 2.9579717136443664, 'subsample': 0.8965452520443027, 'colsample_bytree': 0.6058423506524082, 'reg_alpha': 0.18366373553662219, 'reg_lambda': 8.267779316352817, 'eta': 0.13505912075960602}. Best is trial 0 with value: 0.9927007299270001.[0m
[32m[I 2026-02-01 18:56:49,687][0m Trial 1 finished with value: 0.9927007299270001 and parameters: {'max_depth': 6, 'min_child_weight': 12, 'gamma': 1.836187300219435, 'subsample': 0.6280307599389109, 'colsample_bytree': 0.818594123483058, 'reg_alpha': 0.011366236679027919, 'reg_lambda': 35.787059739697064, 'eta': 0.04473836240575383}. Best is trial 0 with value: 0.9927007299270001.[0m
[32m[I 2026-02-01 18:56:54,011][0m Trial 2 finished with value: 0.9999999999999928 and parameters: {'max_depth': 3, 'min_child_weight': 17, 'gamma': 0.1760298377919023, 'subsample': 0.8485430820459239, 'colsample_b

{'max_depth': 3,
 'min_child_weight': 17,
 'gamma': 0.1760298377919023,
 'subsample': 0.8485430820459239,
 'colsample_bytree': 0.659308765904645,
 'reg_alpha': 3.8562173851733506,
 'reg_lambda': 17.197020311773908,
 'eta': 0.04915273371556882}

In [8]:
ART_DIR = Path("../artifacts/xgb")
ART_DIR.mkdir(parents=True, exist_ok=True)

best = {
    "best_params": study.best_params,
    "best_recall": study.best_value,
    "best_iteration": study.best_trial.user_attrs["best_iteration"],
    "scale_pos_weight": scale_pos_weight,
}

(ART_DIR / "optuna_best_params.json").write_text(
    json.dumps(best, indent=2),
    encoding="utf-8"
)

print("Saved:", ART_DIR / "optuna_best_params.json")

Saved: ..\artifacts\xgb\optuna_best_params.json


In [9]:
best_params = study.best_params.copy()

# pull extra stuff we stored during Optuna
best_thr  = float(study.best_trial.user_attrs["best_thr"])
best_iter = int(study.best_trial.user_attrs["best_iteration"])

final_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "verbosity": 0,
    "scale_pos_weight": float(scale_pos_weight),
    **best_params,
}

# Train+Val
X_trva = np.vstack([X_train, X_val])
y_trva = np.concatenate([y_train, y_val])

dtrva = xgb.DMatrix(X_trva, label=y_trva)

final_booster = xgb.train(
    params=final_params,
    dtrain=dtrva,
    num_boost_round=best_iter + 1,
    verbose_eval=False,
)

# save final model + params + selected threshold
ART_DIR = Path("../artifacts/xgb")
ART_DIR.mkdir(parents=True, exist_ok=True)

model_path = ART_DIR / "xgb_model_optuna.json"
final_booster.save_model(str(model_path))

(ART_DIR / "optuna_best_params.json").write_text(
    json.dumps(best_params, indent=2), encoding="utf-8"
)

(ART_DIR / "optuna_selected_threshold.json").write_text(
    json.dumps({"thr": best_thr, "best_iteration": best_iter}, indent=2),
    encoding="utf-8"
)

print("Saved model:", model_path)
print("Saved params:", ART_DIR / "optuna_best_params.json")
print("Saved threshold:", ART_DIR / "optuna_selected_threshold.json")
print("Use this XGB threshold later:", best_thr)

Saved model: ..\artifacts\xgb\xgb_model_optuna.json
Saved params: ..\artifacts\xgb\optuna_best_params.json
Saved threshold: ..\artifacts\xgb\optuna_selected_threshold.json
Use this XGB threshold later: 0.262


In [10]:
print("Best score:", study.best_value)
print("Best recall:", study.best_trial.user_attrs["best_recall"])
print("Best FPR:", study.best_trial.user_attrs["best_fpr"])
print("Best thr:", study.best_trial.user_attrs["best_thr"])
print("Best iter:", study.best_trial.user_attrs["best_iteration"])

Best score: 0.9999999999999928
Best recall: 0.9999999999999928
Best FPR: 0.00941176470588233
Best thr: 0.262
Best iter: 1522
