In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

XGB_FEATS = Path("../data/processed/vnat/xgb_features.parquet")
FLOWS     = Path("../data/processed/vnat/flows.parquet")

train_caps = set(Path("../data/splits/vnat_train_captures.txt").read_text(encoding="utf-8").splitlines())
val_caps   = set(Path("../data/splits/vnat_val_captures.txt").read_text(encoding="utf-8").splitlines())
test_caps  = set(Path("../data/splits/vnat_test_captures.txt").read_text(encoding="utf-8").splitlines())

xgb_df = pd.read_parquet(XGB_FEATS)
flows  = pd.read_parquet(FLOWS).set_index("flow_id")

# Align by flow_id
xgb_df = xgb_df.set_index("flow_id").sort_index()
flows = flows.sort_index()

assert xgb_df.index.equals(flows.index), "flow_id mismatch between xgb_features and flows"

# Build indices by capture split
train_ids = flows.index[flows["capture_id"].isin(train_caps)]
val_ids   = flows.index[flows["capture_id"].isin(val_caps)]
test_ids  = flows.index[flows["capture_id"].isin(test_caps)]

print("Flows Train:", len(train_ids), "Val:", len(val_ids), "Test:", len(test_ids))
print("Train class_counts:", flows.loc[train_ids, "label"].value_counts().to_dict())
print("Val class_counts:", flows.loc[val_ids, "label"].value_counts().to_dict())
print("Test class_counts:", flows.loc[test_ids, "label"].value_counts().to_dict())

Flows Train: 20240 Val: 562 Test: 12909
Train class_counts: {0: 20011, 1: 229}
Val class_counts: {0: 425, 1: 137}
Test class_counts: {0: 12896, 1: 13}


In [None]:
feature_cols = [c for c in xgb_df.columns if c not in ["capture_id", "label"]]
# safety: drop anything non-numeric (shouldn't exist, but keep safe)
xgb_df = xgb_df[feature_cols].apply(pd.to_numeric, errors="coerce")
assert not xgb_df.isna().any().any(), "NaNs appeared after numeric coercion"

X_train = xgb_df.loc[train_ids].to_numpy(dtype=np.float32)
X_val   = xgb_df.loc[val_ids].to_numpy(dtype=np.float32)
X_test  = xgb_df.loc[test_ids].to_numpy(dtype=np.float32)

y_train = flows.loc[train_ids, "label"].to_numpy(dtype=np.int32)
y_val   = flows.loc[val_ids, "label"].to_numpy(dtype=np.int32)
y_test  = flows.loc[test_ids, "label"].to_numpy(dtype=np.int32)

print("X_train:", X_train.shape, "X_val:", X_val.shape, "X_test:", X_test.shape)

X_train: (20240, 66) X_val: (562, 66) X_test: (12909, 66)


In [3]:
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
scale_pos_weight = n_neg / max(n_pos, 1)

print("Train positives:", n_pos, "Train negatives:", n_neg)
print("scale_pos_weight:", scale_pos_weight)

Train positives: 229 Train negatives: 20011
scale_pos_weight: 87.38427947598254


In [None]:
import xgboost as xgb

params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 6,
    "learning_rate": 0.1,
    "n_estimators": 2000,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
    "scale_pos_weight": scale_pos_weight,
}

model = xgb.XGBClassifier(**params)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=50,
    early_stopping_rounds=50
)
print("Best iteration:", model.best_iteration)

[0]	validation_0-logloss:0.60352
[50]	validation_0-logloss:0.04263
[100]	validation_0-logloss:0.01557
[150]	validation_0-logloss:0.01489
[200]	validation_0-logloss:0.01598
[250]	validation_0-logloss:0.01613
[300]	validation_0-logloss:0.01595
[350]	validation_0-logloss:0.01598
[400]	validation_0-logloss:0.01617
[450]	validation_0-logloss:0.01602
[500]	validation_0-logloss:0.01601
[550]	validation_0-logloss:0.01593
[600]	validation_0-logloss:0.01600
[650]	validation_0-logloss:0.01588
[700]	validation_0-logloss:0.01586
[750]	validation_0-logloss:0.01607
[800]	validation_0-logloss:0.01583
[850]	validation_0-logloss:0.01587
[900]	validation_0-logloss:0.01611
[950]	validation_0-logloss:0.01590
[1000]	validation_0-logloss:0.01584
[1050]	validation_0-logloss:0.01581
[1100]	validation_0-logloss:0.01593
[1150]	validation_0-logloss:0.01599
[1200]	validation_0-logloss:0.01604
[1250]	validation_0-logloss:0.01606
[1300]	validation_0-logloss:0.01608
[1350]	validation_0-logloss:0.01601
[1400]	validati

In [5]:
from sklearn.metrics import confusion_matrix, recall_score

def eval_at_threshold(y_true, probs, thr=0.5):
    y_pred = (probs >= thr).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    recall = tp / (tp + fn + 1e-12)
    fpr = fp / (fp + tn + 1e-12)
    return {"thr": thr, "tn": tn, "fp": fp, "fn": fn, "tp": tp, "recall": recall, "fpr": fpr}

val_probs = model.predict_proba(X_val)[:, 1]
test_probs = model.predict_proba(X_test)[:, 1]

print("VAL @0.5:", eval_at_threshold(y_val, val_probs, 0.5))
print("TEST @0.5:", eval_at_threshold(y_test, test_probs, 0.5))

VAL @0.5: {'thr': 0.5, 'tn': np.int64(424), 'fp': np.int64(1), 'fn': np.int64(1), 'tp': np.int64(136), 'recall': np.float64(0.9927007299270001), 'fpr': np.float64(0.0023529411764705824)}
TEST @0.5: {'thr': 0.5, 'tn': np.int64(12893), 'fp': np.int64(3), 'fn': np.int64(3), 'tp': np.int64(10), 'recall': np.float64(0.7692307692307101), 'fpr': np.float64(0.00023263027295285357)}


In [6]:
import json
from pathlib import Path
import numpy as np

ART_DIR = Path("../artifacts/xgb")
ART_DIR.mkdir(parents=True, exist_ok=True)

# save model
model_path = ART_DIR / "xgb_model.json"
model.save_model(str(model_path))

# save feature columns
Path("../artifacts/features").mkdir(parents=True, exist_ok=True)
Path("../artifacts/features/feature_columns.json").write_text(
    json.dumps(feature_cols, indent=2), encoding="utf-8"
)

def to_py(obj):
    """Convert numpy scalars/arrays recursively to Python types for JSON."""
    if isinstance(obj, dict):
        return {k: to_py(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [to_py(v) for v in obj]
    if isinstance(obj, (np.integer,)):
        return int(obj)
    if isinstance(obj, (np.floating,)):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

metrics = {
    "train_pos": int(n_pos),
    "train_neg": int(n_neg),
    "scale_pos_weight": float(scale_pos_weight),
    "val_at_0_5": eval_at_threshold(y_val, val_probs, 0.5),
    "test_at_0_5": eval_at_threshold(y_test, test_probs, 0.5),
}

(Path("../artifacts/xgb/metrics.json")).write_text(
    json.dumps(to_py(metrics), indent=2),
    encoding="utf-8"
)

print("Saved model:", model_path)
print("Saved metrics:", ART_DIR / "metrics.json")

Saved model: ..\artifacts\xgb\xgb_model.json
Saved metrics: ..\artifacts\xgb\metrics.json


In [7]:
import numpy as np
from pathlib import Path

OUT = Path("../artifacts/xgb")
OUT.mkdir(parents=True, exist_ok=True)

np.save(OUT / "val_probs.npy", val_probs)
np.save(OUT / "val_y.npy", y_val)

np.save(OUT / "test_probs.npy", test_probs)
np.save(OUT / "test_y.npy", y_test)

print("Saved XGB probs to:", OUT)

Saved XGB probs to: ..\artifacts\xgb
