In [1]:
# XGB-only notebook — CELL 1: setup (use existing MLflow server)
from pathlib import Path
import os, mlflow

# Quiet GitPython warnings inside container
os.environ.setdefault("GIT_PYTHON_REFRESH", "quiet")

# Project layout inside the container; data and model artifacts live under BASE.
BASE = Path("/tf/notebooks/ids_unsw")
DATA_DIR  = BASE / "data"
MODEL_DIR = BASE / "models"

# Point MLflow at the already-running tracking server.
# MLFLOW_TRACKING_URI (if set) takes precedence, matching the persistence and
# registry cells further down; the default is the Docker host address
# (alternative inside a compose network: "http://mlflow:5000").
TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI", "http://host.docker.internal:5000")
mlflow.set_tracking_uri(TRACKING_URI)
EXPERIMENT_NAME = "unsw-nb15"
mlflow.set_experiment(EXPERIMENT_NAME)


<Experiment: artifact_location='/mlflow/artifacts/1', creation_time=1756847680158, experiment_id='1', last_update_time=1756847680158, lifecycle_stage='active', name='unsw-nb15', tags={}>

In [2]:
# XGB-only notebook — CELL 2: load features + model
import json, pickle
from pathlib import Path

# Artifact locations written by the training notebook.
MODEL_DIR = Path("/tf/notebooks/ids_unsw/models")
FEATURES_PATH = MODEL_DIR / "feature_names.json"
XGB_PKL_PATH  = MODEL_DIR / "best_xgboost_model.pkl"  # from your 04 notebook

# 1) Feature list (JSON); later cells use it to select the test columns.
FEATURES = json.loads(FEATURES_PATH.read_text())
print(f"Loaded {len(FEATURES)} feature names from:", FEATURES_PATH)

# 2) Trained XGB model.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
xgb = pickle.loads(XGB_PKL_PATH.read_bytes())
print("Loaded model:", type(xgb).__name__)
try:
    # Report two hyperparameters when the object exposes them ("n/a" otherwise).
    n_estimators = getattr(xgb, "n_estimators", "n/a")
    max_depth = getattr(xgb, "max_depth", "n/a")
    print("n_estimators:", n_estimators, "| max_depth:", max_depth)
except Exception as e:
    print("Model introspection note:", e)


Loaded 34 feature names from: /tf/notebooks/ids_unsw/models/feature_names.json
Loaded model: XGBClassifier
n_estimators: 200 | max_depth: 10


In [21]:
# ===== Corrected Cell — Load Test Data and Apply Scaling (no feature-name warning) =====
import pandas as pd
import numpy as np
from pathlib import Path
import pickle

# --- Input locations ---
DATA_DIR = Path("/tf/notebooks/ids_unsw/data")
MODEL_DIR = Path("/tf/notebooks/ids_unsw/models")
TEST_PARQUET = DATA_DIR / "UNSW_NB15_test_clean.parquet"
SCALER_PATH = MODEL_DIR / "scaler.pkl"

# --- 1) Read the cleaned test split ---
df_test = pd.read_parquet(TEST_PARQUET)
print("Loaded test shape:", df_test.shape)

# --- 2) Locate the label column ---
# The first frame column (in frame order) whose name is one of the candidates wins.
label_col_candidates = ["label", "y", "is_attack", "target"]
label_col = None
for column in df_test.columns:
    if column in label_col_candidates:
        label_col = column
        break
assert label_col is not None, f"Could not find label column among {label_col_candidates}. Columns: {list(df_test.columns)[:10]}..."

# --- 3) Restore the fitted scaler ---
assert SCALER_PATH.exists(), f"Scaler not found at {SCALER_PATH}. Please ensure it has been saved from the training notebook."
# NOTE: unpickling can execute arbitrary code — only load files you trust.
scaler = pickle.loads(SCALER_PATH.read_bytes())
print("Loaded scaler object.")

# --- 4) Build the scaled feature matrix and the integer label vector ---
# The scaler receives a plain float32 array (no column names), which avoids the
# feature-name warning; FEATURES (loaded in the model cell) fixes the column order.
features_frame = df_test[FEATURES]
X_test_np = features_frame.to_numpy(dtype=np.float32, copy=False)
X_test = scaler.transform(X_test_np).astype(np.float32, copy=False)

y_test = df_test[label_col].astype(int)

print(f"X_test shape after scaling: {X_test.shape} (n_features expected={len(FEATURES)})")
print(f"y_test base rate (mean of 1's): {y_test.mean():.6f}")


Loaded test shape: (82332, 36)
Loaded scaler object.
X_test shape after scaling: (82332, 34) (n_features expected=34)
y_test base rate (mean of 1's): 0.550600


In [4]:
# ===== Corrected Cell 4 — Predict Probabilities =====
import numpy as np

# X_test is already a NumPy array, so it is handed to the model as-is.
class_probs = xgb.predict_proba(X_test)
proba = class_probs[:, 1]  # P(class=1)

print("Proba shape:", proba.shape)

# Quick distribution summary of the positive-class scores.
quantile_levels = [0, 0.01, 0.05, 0.50, 0.95, 0.99, 1.0]
q = np.quantile(proba, quantile_levels)
print("Proba quantiles [min,1%,5%,50%,95%,99%,max]:", q.tolist())
print("First 10 probs:", np.round(proba[:10], 4))



Proba shape: (82332,)
Proba quantiles [min,1%,5%,50%,95%,99%,max]: [6.70561712468043e-05, 8.351331780431792e-05, 0.00011126959725515917, 0.8789591789245605, 0.9989271759986877, 0.9992443335056305, 0.999703586101532]
First 10 probs: [0.9147 0.8796 0.7478 0.5187 0.5628 0.6867 0.5735 0.8126 0.0012 0.0012]


In [5]:
# XGB-only notebook — CELL 5: sweep thresholds
import numpy as np, pandas as pd
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Use the underlying array when y_test exposes `.values` (e.g. a pandas Series); otherwise keep it unchanged.
y_true = getattr(y_test, "values", y_test)

def sweep_thresholds(y, proba):
    """Evaluate binary-classification metrics over a set of decision thresholds.

    Candidate thresholds are the union of a fixed grid (15 evenly spaced points
    in [0.50, 0.95]) and 12 quantiles of ``proba`` taken between its 50th and
    99th percentiles, clipped to [0, 1] and de-duplicated.

    Parameters
    ----------
    y : array-like
        True labels (0/1).
    proba : array-like
        Scores for the positive class; a sample is predicted positive when
        ``proba >= threshold``.

    Returns
    -------
    pandas.DataFrame
        One row per threshold, sorted by ascending ``threshold``, with columns
        ``threshold, precision, recall, f1, FPR, TP, FP, TN, FN``. ``FPR`` is
        reported as 0.0 when there are no negative samples.
    """
    # Coerce once so list-like scores also support the vectorized comparison below.
    proba = np.asarray(proba)

    # cover the range well: fixed grid + data-driven quantiles
    grid = np.linspace(0.50, 0.95, 15)
    qs   = np.quantile(proba, np.linspace(0.50, 0.99, 12))
    thresholds = np.unique(np.clip(np.concatenate([grid, qs]), 0, 1))

    rows = []
    for t in thresholds:
        y_hat = (proba >= t).astype(int)
        # labels=[0, 1] pins the 2x2 layout so ravel() always yields TN, FP, FN, TP.
        tn, fp, fn, tp = confusion_matrix(y, y_hat, labels=[0, 1]).ravel()
        prec, rec, f1, _ = precision_recall_fscore_support(
            y, y_hat, average="binary", zero_division=0
        )
        fpr = fp / (fp + tn) if (fp + tn) else 0.0
        rows.append(dict(threshold=float(t),
                         precision=float(prec), recall=float(rec), f1=float(f1),
                         FPR=float(fpr), TP=int(tp), FP=int(fp), TN=int(tn), FN=int(fn)))
    return pd.DataFrame(rows).sort_values("threshold").reset_index(drop=True)

# Run the sweep on the XGB scores and preview both ends of the threshold range.
sweep_xgb = sweep_thresholds(y_true, proba)

head_rows, tail_rows = sweep_xgb.head(), sweep_xgb.tail()
print("Top of sweep:\n", head_rows, "\n")
print("Bottom of sweep:\n", tail_rows)


Top of sweep:
    threshold  precision    recall        f1       FPR     TP     FP     TN  \
0   0.500000   0.814917  0.976132  0.888269  0.271622  44250  10050  26950   
1   0.532143   0.828455  0.970948  0.894060  0.246324  44015   9114  27886   
2   0.564286   0.843241  0.965675  0.900315  0.219946  43776   8138  28862   
3   0.596429   0.856453  0.959080  0.904866  0.196946  43477   7287  29713   
4   0.628571   0.869521  0.952594  0.909164  0.175135  43183   6480  30520   

     FN  
0  1082  
1  1317  
2  1556  
3  1855  
4  2149   

Bottom of sweep:
     threshold  precision    recall        f1  FPR     TP  FP     TN     FN
22   0.998112        1.0  0.341789  0.509453  0.0  15494   0  37000  29838
23   0.998438        1.0  0.269765  0.424906  0.0  12229   0  37000  33103
24   0.998703        1.0  0.179983  0.305061  0.0   8159   0  37000  37173
25   0.998898        1.0  0.099466  0.180935  0.0   4509   0  37000  40823
26   0.999244        1.0  0.018177  0.035705  0.0    824   0 

In [6]:
# XGB-only notebook — CELL 6: choose operating threshold
import numpy as np

# Select the operating threshold from `sweep_xgb` (computed in the sweep cell).
# Policy: among thresholds whose recall meets MIN_RECALL, take the lowest FPR;
# break ties by the highest precision, then by the higher threshold.
MIN_RECALL = 0.95
eligible = sweep_xgb[sweep_xgb["recall"] >= MIN_RECALL].copy()

if eligible.empty:
    print(f"No threshold reaches recall ≥ {MIN_RECALL:.2f} — falling back to the row with highest recall.")
    chosen = sweep_xgb.sort_values(["recall","threshold"], ascending=[False, True]).iloc[0]
else:
    # threshold sorts descending so that, on an FPR/precision tie, the higher
    # threshold is the one picked (the previous ascending sort picked the lower one).
    chosen = (
        eligible.sort_values(["FPR", "precision", "threshold"],
                             ascending=[True, False, False])
                .iloc[0]
    )

thr = float(chosen["threshold"])
print(f"Chosen threshold = {thr:.4f}")
display(chosen.to_frame().T)  # shows the full row nicely if you're in a notebook


Chosen threshold = 0.6286


Unnamed: 0,threshold,precision,recall,f1,FPR,TP,FP,TN,FN
4,0.628571,0.869521,0.952594,0.909164,0.175135,43183.0,6480.0,30520.0,2149.0


In [7]:
# CELL — recompute probabilities & set the chosen threshold
import numpy as np

# Operating threshold used by the evaluation and persistence cells that follow.
# NOTE(review): 0.757143 is a point on the sweep's fixed grid, but it is NOT the
# value printed by the selection cell (0.6286); per the evaluation output, recall
# at this threshold is about 0.9215, i.e. below the 0.95 target — confirm this
# manual override is intended.
thr = 0.757143  # manually pinned threshold

# Re-score the test set; requires `xgb` (XGBClassifier) and `X_test` from earlier cells.
proba_xgb = xgb.predict_proba(X_test)[:, 1]

print("proba_xgb shape:", proba_xgb.shape, "| thr:", thr)


proba_xgb shape: (82332,) | thr: 0.757143


In [8]:
# CELL 7 — finalize eval at chosen threshold (XGB-only)

import numpy as np
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_recall_fscore_support, roc_auc_score
)

# Inputs from earlier cells: proba_xgb = P(class=1), y_test = true 0/1 labels,
# thr = decision threshold.
y_true = np.asarray(y_test)
y_pred = (proba_xgb >= thr).astype(int)

# 2x2 confusion matrix with labels ordered [0, 1], flattened to TN, FP, FN, TP.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
negatives = fp + tn
fpr = fp / negatives if negatives else 0.0

# Metrics that depend on the threshold.
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, average="binary", zero_division=0
)

# ROC AUC is computed from the raw scores, so it does not depend on thr.
roc_auc = roc_auc_score(y_true, proba_xgb)

print(f"Confusion: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
summary = dict(
    precision=float(prec),
    recall=float(rec),
    f1=float(f1),
    roc_auc=float(roc_auc),
    FPR=float(fpr),
    threshold=float(thr),
)
print(summary)

print("\nClassification report:")
print(classification_report(y_true, y_pred, digits=4))


Confusion: TN=34258, FP=2742, FN=3560, TP=41772
{'precision': 0.9384014018061734, 'recall': 0.9214682784787788, 'f1': 0.9298577566057477, 'roc_auc': 0.9804886709704498, 'FPR': 0.07410810810810811, 'threshold': 0.757143}

Classification report:
              precision    recall  f1-score   support

           0     0.9059    0.9259    0.9158     37000
           1     0.9384    0.9215    0.9299     45332

    accuracy                         0.9235     82332
   macro avg     0.9221    0.9237    0.9228     82332
weighted avg     0.9238    0.9235    0.9235     82332



In [9]:
# CELL 8 — persist threshold & metrics, and log to MLflow (server)
import os, json, numpy as np
from pathlib import Path
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_fscore_support
import mlflow

# ensure MLflow points at the running server + experiment name used earlier
os.environ.setdefault("GIT_PYTHON_REFRESH", "quiet")
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://host.docker.internal:5000"))
mlflow.set_experiment("unsw-nb15")

# assumes: MODEL_DIR, y_test, proba_xgb, X_test already exist in memory
# NOTE(review): this literal repeats the value set in the previous threshold cell;
# keep the two in sync (or reuse `thr`) if the operating point changes.
thr = 0.757143  # chosen threshold

y_true = np.asarray(y_test)
y_pred = (proba_xgb >= thr).astype(int)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
auc  = roc_auc_score(y_true, proba_xgb)
# Guarded like the evaluation cell: with no negatives, NumPy would yield NaN here,
# which would then be written into metadata.json and MLflow.
fpr  = fp / (fp + tn) if (fp + tn) else 0.0

# Single source of truth for the numbers written to disk and to MLflow.
n_features = int(X_test.shape[1])
metrics_at_threshold = {
    "precision": float(prec),
    "recall": float(rec),
    "f1": float(f1),
    "roc_auc": float(auc),
    "FPR": float(fpr),
    "TP": int(tp), "FP": int(fp), "TN": int(tn), "FN": int(fn),
}

# ---- save/update local metadata (models/metadata.json)
# Existing keys in the file are preserved; only the keys below are overwritten.
meta_path = MODEL_DIR / "metadata.json"
meta = {}
if meta_path.exists():
    meta = json.loads(meta_path.read_text())

meta.update({
    "champion": "xgboost",
    "threshold": float(thr),
    "n_features": n_features,
    "metrics_at_threshold": metrics_at_threshold,
})
meta_path.write_text(json.dumps(meta, indent=2))
print(f"✅ Saved threshold + metrics to {meta_path}")

# ---- log to MLflow (artifacts under xgb/)
with mlflow.start_run(run_name=f"xgb_threshold@{thr:.4f}"):
    mlflow.log_params({
        "model": "xgboost",
        "threshold": float(thr),
        "n_features": n_features,
    })
    mlflow.log_metrics(metrics_at_threshold)

    # artifacts: optional model files are attached only when present
    for artifact_name in ("xgb.onnx", "feature_names.json"):
        artifact_file = MODEL_DIR / artifact_name
        if artifact_file.exists():
            mlflow.log_artifact(str(artifact_file), artifact_path="xgb")
    # push the same metadata.json that we saved locally
    mlflow.log_artifact(str(meta_path), artifact_path="xgb")

print("✅ Logged to MLflow.")


✅ Saved threshold + metrics to /tf/notebooks/ids_unsw/models/metadata.json
🏃 View run xgb_threshold@0.7571 at: http://host.docker.internal:5000/#/experiments/1/runs/1305f0b763514b2cba5ea26da06a5944
🧪 View experiment at: http://host.docker.internal:5000/#/experiments/1
✅ Logged to MLflow.


In [10]:
# CELL 9 — register ONNX in MLflow Model Registry (auto-unique name)
import os, onnx, mlflow, mlflow.onnx
import numpy as np, time, json
from pathlib import Path

# Filesystem layout and MLflow connection (same server and experiment as earlier cells).
BASE = Path("/tf/notebooks/ids_unsw")
MODEL_DIR = BASE / "models"

os.environ.setdefault("GIT_PYTHON_REFRESH", "quiet")
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://host.docker.internal:5000"))
mlflow.set_experiment("unsw-nb15")

# --- read the exported ONNX graph from disk
onnx_path = MODEL_DIR / "xgb.onnx"
assert onnx_path.exists(), f"Missing {onnx_path}"
onnx_model = onnx.load(str(onnx_path))

# --- feature count for the input example: taken from feature_names.json when it
# exists, otherwise the hard-coded default of 34 is used
feat_file = MODEL_DIR / "feature_names.json"
n_features = 34
if feat_file.exists():
    features = json.loads(feat_file.read_text())
    n_features = len(features)

# A single all-zero float32 row serves as the logged input example.
input_example = np.zeros((1, n_features), dtype=np.float32)

# --- pick the registry name: the base name if it is free, else a timestamped variant
base_name = "unsw_xgb_ids_onnx"
client = mlflow.tracking.MlflowClient()
existing = client.search_registered_models(filter_string=f"name='{base_name}'")
if not existing:
    model_name = base_name
else:
    # model already exists → create unique one
    ts = time.strftime("%Y%m%d-%H%M%S")
    model_name = f"{base_name}_{ts}"

# --- log the model (registering it under model_name) plus companion files
with mlflow.start_run(run_name="register_xgb_onnx"):
    mlflow.onnx.log_model(
        onnx_model,
        name="xgb",
        registered_model_name=model_name,
        input_example=input_example,
    )
    companions = [MODEL_DIR / fname for fname in ("feature_names.json", "metadata.json")]
    for p in companions:
        if p.exists():
            mlflow.log_artifact(str(p), artifact_path="xgb")

info = mlflow.get_experiment_by_name("unsw-nb15")
print(f"✅ Registered as `{model_name}`. Open:", f"http://host.docker.internal:5000/#/experiments/{info.experiment_id}")


Successfully registered model 'unsw_xgb_ids_onnx_20250903-231006'.
2025/09/03 23:10:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: unsw_xgb_ids_onnx_20250903-231006, version 1
Created version '1' of model 'unsw_xgb_ids_onnx_20250903-231006'.


🏃 View run register_xgb_onnx at: http://host.docker.internal:5000/#/experiments/1/runs/f2a627e144e84f93a15edb6db76161aa
🧪 View experiment at: http://host.docker.internal:5000/#/experiments/1
✅ Registered as `unsw_xgb_ids_onnx_20250903-231006`. Open: http://host.docker.internal:5000/#/experiments/1


In [11]:
# CELL 10 — assemble a local serving bundle
from pathlib import Path
import shutil, json

# Destination for the serving bundle: models/bundle_xgb (created if missing).
BASE = Path("/tf/notebooks/ids_unsw")
MODEL_DIR = BASE / "models"
BUNDLE = MODEL_DIR / "bundle_xgb"
BUNDLE.mkdir(exist_ok=True)

# Copy each artifact that exists; files that are absent are skipped silently.
bundle_files = ("xgb.onnx", "feature_names.json", "metadata.json")
for src in (MODEL_DIR / fname for fname in bundle_files):
    if not src.exists():
        continue
    shutil.copy2(src, BUNDLE / src.name)

bundle_entries = sorted(BUNDLE.iterdir())
print("📦 Bundle contents:", [entry.name for entry in bundle_entries])


📦 Bundle contents: ['feature_names.json', 'metadata.json', 'xgb.onnx']


In [12]:
# ===== Corrected Cell 11 — Minimal ONNX Prediction Check =====
import json
import numpy as np
import onnxruntime as ort
from pathlib import Path
import pandas as pd # <-- Make sure pandas is imported

BASE = Path("/tf/notebooks/ids_unsw")
BUNDLE = BASE / "models" / "bundle_xgb"

# Everything needed for scoring comes from the bundle: feature order, threshold, model.
features = json.loads((BUNDLE/"feature_names.json").read_text())
meta = json.loads((BUNDLE/"metadata.json").read_text())
thr = float(meta["threshold"])

sess = ort.InferenceSession(str(BUNDLE/"xgb.onnx"), providers=["CPUExecutionProvider"])

# Resolve tensor names from the model instead of hard-coding "input", so the
# check keeps working if the exported graph uses different names (the benchmark
# cell resolves them the same way).
onnx_input_name = sess.get_inputs()[0].name
onnx_proba_output = sess.get_outputs()[1].name  # second output: class probabilities


def score_df(df):
    """Score a DataFrame with the bundled ONNX model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``features``; columns are selected
        in that order and cast to float32.
        NOTE(review): the quick check below feeds already-scaled values, so the
        graph presumably contains no scaler — confirm before using raw data.

    Returns
    -------
    (probs, preds)
        ``probs`` is column 1 of the model's second output (positive-class
        probability); ``preds`` is ``probs >= thr`` as int32, with ``thr`` read
        from the bundle's metadata.json.
    """
    X = df[features].to_numpy(np.float32)
    probs = sess.run([onnx_proba_output], {onnx_input_name: X})[0][:, 1]
    preds = (probs >= thr).astype(np.int32)
    return probs, preds


# Smoke test: wrap the first 10 rows of the (already scaled) X_test array in a
# DataFrame whose columns are the bundle's feature names, then score it.
quick_test_df = pd.DataFrame(X_test[:10], columns=features)
probs, preds = score_df(quick_test_df)

print("probs[:5] =", np.round(probs[:5], 4))
print("preds[:5] =", preds[:5])

probs[:5] = [0.9147 0.8796 0.7478 0.5187 0.5628]
preds[:5] = [1 1 0 0 0]


In [20]:
# Section 12 — Option A: XGBoost GPU Predictor (no conversion)
# Runs your trained model on the GPU. Avoids timing the prediction cache.

import time, numpy as np
from xgboost import DMatrix

# Use float32 for best throughput
X_np = np.asarray(X_test, dtype=np.float32)

# Get Booster and select GPU (no deprecated 'predictor' param)
# NOTE: set_param mutates the booster owned by `xgb`, so the device setting
# persists for later cells that use the same model object.
booster = xgb.get_booster()
booster.set_param({'device': 'cuda'})

# --------- Path A: GPU via DMatrix (works without CuPy) ----------
# Warm-up on a tiny DMatrix to prime kernels (doesn't cache the big one)
_ = booster.predict(DMatrix(X_np[:2048]))

# Time **fresh** DMatrix to avoid cached predictions
t0 = time.perf_counter()
proba_gpu_dmatrix = booster.predict(DMatrix(X_np))    # P(class=1) for binary:logistic
t1 = time.perf_counter()
dt_dmat = t1 - t0
print(f"XGB GPU (DMatrix): {dt_dmat*1000:.1f} ms — {X_np.shape[0]/dt_dmat:,.0f} samples/sec")

# --------- Path B: GPU via inplace_predict + CuPy (fastest if available) ----------
# Best-effort: this path never aborts the cell, but a missing CuPy install and a
# genuine runtime failure are reported differently so real errors are not hidden.
proba_gpu_inplace = None
try:
    import cupy as cp
    X_gpu = cp.asarray(X_np)

    # warm-up
    _ = booster.inplace_predict(X_gpu[:2048])
    cp.cuda.Stream.null.synchronize()

    t0 = time.perf_counter()
    proba_gpu_inplace = booster.inplace_predict(X_gpu)  # CuPy vector, P(class=1)
    cp.cuda.Stream.null.synchronize()  # wait for the GPU before stopping the clock
    t1 = time.perf_counter()
    dt_inplace = t1 - t0
    print(f"XGB GPU (CuPy + inplace_predict): {dt_inplace*1000:.1f} ms — {X_np.shape[0]/dt_inplace:,.0f} samples/sec")
except ImportError:
    print("CuPy not available; skipped inplace_predict path (install: pip install cupy-cuda12x).")
except Exception as e:
    # Discard any partial result so the DMatrix probabilities are carried forward.
    proba_gpu_inplace = None
    print(f"XGB GPU (CuPy + inplace_predict) failed — {type(e).__name__}: {e}; using DMatrix result instead.")

# Choose probabilities to carry forward (numpy array)
if proba_gpu_inplace is not None:
    try:
        proba_xgb_gpu = cp.asnumpy(proba_gpu_inplace)
    except Exception:
        proba_xgb_gpu = proba_gpu_inplace  # already numpy in some builds
else:
    proba_xgb_gpu = proba_gpu_dmatrix


XGB GPU (DMatrix): 87.4 ms — 942,528 samples/sec
XGB GPU (CuPy + inplace_predict): 31.3 ms — 2,628,343 samples/sec


In [18]:
# Section 13 — Benchmarking inference (fixed: use Booster.inplace_predict)
import time, numpy as np, onnxruntime as ort
from xgboost import DMatrix

X_np = np.asarray(X_test, dtype=np.float32)
n = X_np.shape[0]
booster = xgb.get_booster()


def _sklearn_cpu_predict(X):
    """Fallback for very old xgboost builds without Booster.inplace_predict."""
    return xgb.predict_proba(X)[:, 1]


# --- XGB CPU (Booster.inplace_predict on NumPy)
booster.set_param({'device': 'cpu'})
# Resolve the predictor and warm it up BEFORE starting the clock, so the CPU
# figure is measured the same way as the GPU and ONNX paths below.
cpu_predict = getattr(booster, "inplace_predict", _sklearn_cpu_predict)
_ = cpu_predict(X_np[:2048])  # warm-up
t0 = time.perf_counter()
proba_cpu_inplace = cpu_predict(X_np)   # P(class=1)
t1 = time.perf_counter()
dt_cpu = t1 - t0
print(f"XGB (CPU, inplace_predict): {dt_cpu*1000:.1f} ms — {n/dt_cpu:,.0f} samples/sec")

# --- XGB GPU (fresh DMatrix to avoid cache)
# NOTE: the booster is left on 'cuda' after this cell.
booster.set_param({'device': 'cuda'})
_ = booster.predict(DMatrix(X_np[:2048]))  # warm-up
t0 = time.perf_counter()
proba_gpu_dmat = booster.predict(DMatrix(X_np))
t1 = time.perf_counter()
dt_gpu_dmat = t1 - t0
print(f"XGB (GPU, DMatrix): {dt_gpu_dmat*1000:.1f} ms — {n/dt_gpu_dmat:,.0f} samples/sec")

# --- XGB GPU (CuPy + inplace_predict) if available
try:
    import cupy as cp
    X_gpu = cp.asarray(X_np)
    _ = booster.inplace_predict(X_gpu[:2048])  # warm-up
    cp.cuda.Stream.null.synchronize()
    t0 = time.perf_counter()
    proba_gpu_inplace = booster.inplace_predict(X_gpu)
    cp.cuda.Stream.null.synchronize()  # wait for the GPU before stopping the clock
    t1 = time.perf_counter()
    dt_gpu_inplace = t1 - t0
    print(f"XGB (GPU, CuPy + inplace_predict): {dt_gpu_inplace*1000:.1f} ms — {n/dt_gpu_inplace:,.0f} samples/sec")
except Exception as e:
    proba_gpu_inplace = None
    print(f"XGB (GPU, CuPy) skipped — {e}")

# --- ONNXRuntime (CPU)
sess = ort.InferenceSession(str(MODEL_DIR / "xgb.onnx"), providers=["CPUExecutionProvider"])
inp_name = sess.get_inputs()[0].name
out_name = sess.get_outputs()[1].name  # 'probabilities' (Nx2)
_ = sess.run([out_name], {inp_name: X_np[:64]})  # warm-up

t0 = time.perf_counter()
probs2 = sess.run([out_name], {inp_name: X_np})[0]  # shape (n,2)
t1 = time.perf_counter()
dt_onnx = t1 - t0
proba_onnx = probs2[:, 1].astype(np.float32)
print(f"ONNXRuntime (CPU): {dt_onnx*1000:.1f} ms — {n/dt_onnx:,.0f} samples/sec")


XGB (CPU, inplace_predict): 48.9 ms — 1,684,618 samples/sec
XGB (GPU, DMatrix): 73.6 ms — 1,118,184 samples/sec
XGB (GPU, CuPy + inplace_predict): 32.0 ms — 2,574,538 samples/sec
ONNXRuntime (CPU): 93.3 ms — 881,993 samples/sec
