In [1]:
# ===== Cell 1 — Setup / Config =====
import json, gc, warnings
from pathlib import Path

import numpy as np
import pandas as pd
import psutil

import torch
import mlflow

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation notices from the ML libraries
# used below are hidden as well.
warnings.filterwarnings("ignore")

# ---- MLflow ----
# All runs in this notebook are recorded against a tracking server at this local address,
# under a single experiment (set_experiment creates it when it does not exist yet).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
EXPERIMENT_NAME = "unsw-nb15"
mlflow.set_experiment(EXPERIMENT_NAME)

# ---- Device ----
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Only device index 0 is reported.
    print("GPU:", torch.cuda.get_device_name(0))

# ---- Quick diagnostics helper ----
def gpu_cpu_status(tag=""):
    """Print a short memory readout for GPU 0 (when CUDA is available) and for the CPU.

    Args:
        tag: Text placed at the start of each printed line.

    The GPU figure is the memory currently allocated by PyTorch tensors on device 0,
    shown against the device's total memory. The CPU figure is the system-wide
    memory usage percentage reported by psutil.
    """
    bytes_per_mb = 1024**2
    if torch.cuda.is_available():
        allocated_mb = torch.cuda.memory_allocated(0) / bytes_per_mb
        capacity_mb = torch.cuda.get_device_properties(0).total_memory / bytes_per_mb
        print(f"{tag} GPU 0 | Mem Used: {allocated_mb:.1f}MB / {capacity_mb:.1f}MB")
    cpu_percent = psutil.virtual_memory().percent
    print(f"{tag} CPU Mem Used: {cpu_percent}%")


2025/09/02 23:14:40 INFO mlflow.tracking.fluent: Experiment with name 'unsw-nb15' does not exist. Creating a new experiment.


Torch: 2.5.1+cu121
CUDA available: True
GPU: NVIDIA GeForce RTX 4070 Laptop GPU


In [2]:
# ===== Cell 2b — Load cleaned parquet sitting next to this notebook =====
from pathlib import Path
import pandas as pd

# Both parquet files are looked up in the kernel's current working directory.
BASE = Path.cwd()
TRAIN_PQ = BASE / "UNSW_NB15_train_clean.parquet"
TEST_PQ  = BASE / "UNSW_NB15_test_clean.parquet"

# Fail fast, with the full path in the message, if either file is missing.
assert TRAIN_PQ.exists(), f"Missing file: {TRAIN_PQ}"
assert TEST_PQ.exists(),  f"Missing file: {TEST_PQ}"

train_df = pd.read_parquet(TRAIN_PQ)
test_df  = pd.read_parquet(TEST_PQ)

print("Loaded:")
print(f"  train_df: {train_df.shape}  -> {TRAIN_PQ.name}")
print(f"  test_df : {test_df.shape}   -> {TEST_PQ.name}")

# quick sanity checks: the target column must exist in both frames
assert "label" in train_df.columns, "train_df missing 'label'"
assert "label" in test_df.columns,  "test_df missing 'label'"
print("\nLabel distribution (train):")
# dropna=False gives missing labels, if any, their own row in the counts.
print(train_df["label"].value_counts(dropna=False))


Loaded:
  train_df: (96822, 36)  -> UNSW_NB15_train_clean.parquet
  test_df : (82332, 36)   -> UNSW_NB15_test_clean.parquet

Label distribution (train):
label
0    48894
1    47928
Name: count, dtype: int64


In [3]:
# ===== Cell 3 (Final Corrected) — Prep, Scale, and Load =====
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

TARGET = "label"
# Every column except the target and 'attack_cat' is used as a model input.
feature_cols = [c for c in train_df.columns if c not in [TARGET, "attack_cat"]]
print(f"Using {len(feature_cols)} features. 'attack_cat' has been excluded.")
print("-" * 30)


# If anything is still non-numeric, convert to category codes.
# pandas' own dtype check is used here because np.issubdtype raises TypeError on pandas
# extension dtypes (category, nullable integers, Arrow-backed strings), which
# read_parquet may return. Bool columns count as numeric and are handled by the
# float32 cast that follows this step.
non_numeric = [c for c in feature_cols if not pd.api.types.is_numeric_dtype(train_df[c].dtype)]
if non_numeric:
    print("Converting non-numeric columns to categorical codes:", non_numeric)
    for c in non_numeric:
        # Categories are learned from train only and re-used for test so the codes line up.
        cats = pd.Categorical(train_df[c]).categories
        train_df[c] = pd.Categorical(train_df[c], categories=cats).codes
        test_codes = pd.Categorical(test_df[c], categories=cats).codes
        # Code -1 marks test values that are missing or were never seen in train;
        # report them instead of letting them pass silently as a real category.
        n_unmapped = int((test_codes == -1).sum())
        if n_unmapped:
            print(f"  ⚠️ {c}: {n_unmapped} test values have no train category (encoded as -1)")
        test_df[c] = test_codes

# Step 1: Feature matrices / targets (float32 NumPy arrays taken from the frames)
X      = train_df[feature_cols].astype("float32").values
y      = train_df[TARGET].astype("float32").values
X_test = test_df[feature_cols].astype("float32").values
y_test = test_df[TARGET].astype("float32").values

# Step 2: Train/val split
# 15% of the training frame is held out for validation; stratify=y keeps the class
# ratio of y in both parts and random_state fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y, shuffle=True
)

# Step 3: Scale Features
# The scaler is fitted on the training part only; validation and test are transformed
# with those statistics. X_train / X_val / X_test are overwritten with the scaled
# arrays, so every later cell that uses these names works in scaled feature space.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)
print("Features have been scaled using StandardScaler.")
print("-" * 30)


# Step 4: Create Tensors from the scaled data
def _to_float32(array, as_column=False):
    """Copy ``array`` into a float32 tensor; ``as_column`` appends a trailing axis of size 1."""
    tensor = torch.tensor(array, dtype=torch.float32)
    return tensor.unsqueeze(1) if as_column else tensor


X_train_t, y_train_t = _to_float32(X_train), _to_float32(y_train, as_column=True)
X_val_t,   y_val_t   = _to_float32(X_val),   _to_float32(y_val,   as_column=True)
X_test_t,  y_test_t  = _to_float32(X_test),  _to_float32(y_test,  as_column=True)

# Step 5: Create DataLoaders
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 8192

# Options shared by all three loaders; pinned host memory is only requested on CUDA.
_loader_opts = dict(
    batch_size=BATCH_SIZE,
    pin_memory=(device.type == "cuda"),
    num_workers=0,
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), shuffle=True, **_loader_opts)
val_loader = DataLoader(TensorDataset(X_val_t, y_val_t), shuffle=False, **_loader_opts)
test_loader = DataLoader(TensorDataset(X_test_t, y_test_t), shuffle=False, **_loader_opts)

# Step 6: Re-calculate class imbalance weight
# pos is the sum of the training labels (the positive count when labels are 0/1) and
# neg is the remainder. The weight is neg / pos; max(pos, 1.0) avoids a division by
# zero when there are no positives. The result is a 0-dim float32 tensor on `device`.
pos = y_train.sum()
neg = len(y_train) - pos
pos_weight_val = torch.tensor(neg / max(pos, 1.0), dtype=torch.float32, device=device)

# Final diagnostics
print(f"device: {device}")
print(f"feature_cols: {len(feature_cols)} features")
print(f"train / val / test shapes: {X_train.shape} / {X_val.shape} / {X_test.shape}")
print(f"y_train pos rate: {pos/len(y_train):.4f}  -> pos_weight={pos_weight_val.item():.3f}")



Using 34 features. 'attack_cat' has been excluded.
------------------------------
Features have been scaled using StandardScaler.
------------------------------
device: cuda
feature_cols: 34 features
train / val / test shapes: (82298, 34) / (14524, 34) / (82332, 34)
y_train pos rate: 0.4950  -> pos_weight=1.020


In [4]:
# ===== Cell 4 — training helpers with progress + diagnostics =====
import json, math, time, pathlib
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix
)
import matplotlib.pyplot as plt

# tqdm (status bar)
# If tqdm cannot be imported, a minimal stand-in with the same call surface used by
# train_one_epoch (iteration, set_postfix, update, close) is defined instead, so the
# training helpers keep working without a progress bar.
try:
    from tqdm.auto import tqdm
except Exception:
    class tqdm:
        # Extra keyword arguments (e.g. total=, leave=) are accepted and ignored.
        def __init__(self, it, **kwargs): self.it = it
        def __iter__(self): return iter(self.it)
        def set_postfix(self, **kwargs): pass
        def update(self, *args, **kwargs): pass
        def close(self): pass

# Optional CPU mem readout
# psutil is set to None when it cannot be imported; gpu_cpu_diagnostics checks for that.
try:
    import psutil
except Exception:
    psutil = None

# Device used by the training / prediction helpers defined in this cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def gpu_cpu_diagnostics():
    """Collect GPU and CPU memory readings without ever raising.

    Returns:
        dict with three keys, in this order:
          - "gpu_used_mb":  total minus free device memory in MB (1 decimal), or None
          - "gpu_total_mb": total device memory in MB (1 decimal), or None
          - "cpu_mem_percent": psutil's virtual-memory usage percentage, or None

        A value is None when the corresponding source (CUDA / psutil) is unavailable
        or the query itself fails.
    """
    bytes_per_mb = 1024**2

    gpu_used = gpu_total = None
    if torch.cuda.is_available():
        try:
            free_bytes, total_bytes = torch.cuda.mem_get_info()
            gpu_used = round((total_bytes - free_bytes) / bytes_per_mb, 1)
            gpu_total = round(total_bytes / bytes_per_mb, 1)
        except Exception:
            # Keep the pair consistent: either both readings or neither.
            gpu_used = gpu_total = None

    cpu_percent = None
    if psutil is not None:
        try:
            cpu_percent = psutil.virtual_memory().percent
        except Exception:
            cpu_percent = None

    return {
        "gpu_used_mb": gpu_used,
        "gpu_total_mb": gpu_total,
        "cpu_mem_percent": cpu_percent,
    }

def train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and return the mean loss per sample.

    Args:
        model: Module to train; switched to training mode here.
        loader: Iterable of ``(inputs, targets)`` batches that also supports ``len()``.
        criterion: Loss callable applied as ``criterion(model(inputs), targets)``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of ``batch_loss * batch_size`` divided by the number of samples seen
        (0.0 when the loader yields nothing).
    """
    model.train()
    loss_sum, seen = 0.0, 0
    progress = tqdm(loader, total=len(loader), leave=False)
    for inputs, targets in progress:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        # Weight each batch by its size so a short final batch does not skew the mean.
        batch_len = inputs.size(0)
        loss_sum += batch_loss.item() * batch_len
        seen += batch_len
        progress.set_postfix(loss=f"{loss_sum/max(seen,1):.4f}")
    progress.close()
    return loss_sum / max(seen, 1)

@torch.inference_mode()
def predict_proba_torch(model, loader):
    """Return sigmoid probabilities for every sample yielded by ``loader``.

    Args:
        model: Module producing logits; switched to eval mode here.
        loader: Iterable of ``(inputs, targets)`` batches; targets are ignored.

    Returns:
        1-D NumPy array with the flattened per-batch probabilities, concatenated in
        the order the loader yields the batches.
    """
    model.eval()
    chunks = [
        torch.sigmoid(model(inputs.to(device, non_blocking=True))).detach().cpu().numpy().ravel()
        for inputs, _ in loader
    ]
    return np.concatenate(chunks, axis=0)

def compute_metrics(y_true, y_prob, threshold=0.5):
    """Score probabilistic predictions against ground-truth labels.

    Args:
        y_true: Ground-truth labels.
        y_prob: Array of predicted scores; it must support ``>=`` and ``.astype``.
        threshold: Scores at or above this value are predicted as class 1.

    Returns:
        dict of floats with keys "roc_auc", "accuracy", "f1", "precision", "recall".
        "roc_auc" is computed from the raw scores and becomes NaN when scikit-learn
        cannot compute it; the other four use the thresholded predictions.
    """
    hard_labels = (y_prob >= threshold).astype(int)

    # Some splits can be single-class — guard ROC-AUC
    try:
        auc = float(roc_auc_score(y_true, y_prob))
    except Exception:
        auc = float("nan")

    return {
        "roc_auc": auc,
        "accuracy": float(accuracy_score(y_true, hard_labels)),
        "f1": float(f1_score(y_true, hard_labels, zero_division=0)),
        "precision": float(precision_score(y_true, hard_labels, zero_division=0)),
        "recall": float(recall_score(y_true, hard_labels, zero_division=0)),
    }

def save_confusion_png(y_true, y_prob, path="confusion.png", threshold=0.5):
    """Render the confusion matrix of thresholded predictions and save it as an image.

    Args:
        y_true: Ground-truth labels (assumed to be encoded as 0/1).
        y_prob: Array of predicted scores; it must support ``>=`` and ``.astype``.
        path: Output file path; missing parent directories are created.
        threshold: Scores at or above this value are predicted as class 1.

    Returns:
        ``path``, unchanged, so the caller can pass it on (e.g. to an artifact logger).
    """
    y_pred = (y_prob >= threshold).astype(int)
    # Pin both classes so the matrix is always 2x2, even when a split (or the
    # predictions) contains a single class only.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        im = ax.imshow(cm, interpolation="nearest")
        ax.set_title("Confusion Matrix")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        # One tick per class instead of matplotlib's fractional default ticks.
        ax.set_xticks([0, 1])
        ax.set_yticks([0, 1])
        for (i, j), v in np.ndenumerate(cm):
            # imshow puts rows on the y axis, hence text(x=j, y=i).
            ax.text(j, i, str(v), ha="center", va="center")
        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
        fig.tight_layout()
        pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(path, dpi=140)
    finally:
        # Release the figure even if rendering or saving fails.
        plt.close(fig)
    return path

def _collect_targets(loader):
    """Concatenate the target batches of ``loader`` into one flat NumPy array."""
    return torch.cat([yb.reshape(-1) for _, yb in loader], dim=0).cpu().numpy()


def train_torch_model(model, train_loader, val_loader, epochs=8, lr=1e-3, pos_weight=None, mlflow_run=None):
    """Train a binary classifier with AdamW + BCE-with-logits, validating after every epoch.

    Args:
        model: Module that maps a feature batch to logits; moved to ``device``.
        train_loader: Loader of ``(inputs, targets)`` batches used for optimisation.
        val_loader: Loader of ``(inputs, targets)`` batches used for validation. It must
            yield the same order on every pass (no shuffling), because predictions
            and labels are gathered in separate passes.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        pos_weight: Optional tensor forwarded to ``BCEWithLogitsLoss(pos_weight=...)``.
        mlflow_run: When not None, per-epoch train loss and validation metrics are
            logged to the active MLflow run with ``step=epoch``.

    Returns:
        The trained model (on ``device``).
    """
    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight) if pos_weight is not None else nn.BCEWithLogitsLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)

    # The validation labels never change between epochs, so gather them once, batch by
    # batch. This visits samples in the same order predict_proba_torch does and avoids
    # a per-sample Python loop over the whole dataset on every epoch.
    y_val_true = _collect_targets(val_loader) if epochs > 0 else None

    if mlflow_run is not None:
        import mlflow

    for epoch in range(1, epochs + 1):
        train_loss = train_one_epoch(model, train_loader, criterion, optimizer)
        y_val_prob = predict_proba_torch(model, val_loader)
        val_metrics = compute_metrics(y_val_true, y_val_prob)

        diag = gpu_cpu_diagnostics()
        msg = (
            f"Epoch {epoch}/{epochs} | "
            f"Train Loss: {train_loss:.4f} | "
            f"Val F1: {val_metrics['f1']:.4f} | Val AUC: {val_metrics['roc_auc']:.4f} | "
            f"GPU {('on' if torch.cuda.is_available() else 'off')} "
            f"| Mem Used: {diag.get('gpu_used_mb')}MB / {diag.get('gpu_total_mb')}MB "
            f"| CPU: {diag.get('cpu_mem_percent')}%"
        )
        print(msg)

        # Log to MLflow if a run is open
        if mlflow_run is not None:
            mlflow.log_metric("train_loss", float(train_loss), step=epoch)
            for k, v in val_metrics.items():
                mlflow.log_metric(f"val_{k}", float(v), step=epoch)

    return model


In [5]:
# ===== Cell 5 — Train Torch logistic regression (GPU) =====
import torch.nn as nn, torch

n_features = X_train.shape[1]

# Hyperparameters are named once so the values logged to MLflow cannot drift from the
# values actually handed to the trainer.
LOGREG_EPOCHS = 8
LOGREG_LR = 1e-3

# Shape-(1,) weight tensor for BCEWithLogitsLoss, derived from the data-prep value
# without re-wrapping it in a Python list (as_tensor also accepts a plain float).
pos_weight_t = torch.as_tensor(pos_weight_val, dtype=torch.float32, device=device).reshape(1)

# simple logistic regression layer: one linear unit; the sigmoid lives in the loss
model = nn.Sequential(nn.Linear(n_features, 1))

with mlflow.start_run(run_name="logreg_torch_gpu") as run:
    mlflow.log_param("model", "LogisticRegression_Torch")
    mlflow.log_param("epochs", LOGREG_EPOCHS)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("lr", LOGREG_LR)
    mlflow.log_param("pos_weight", float(pos_weight_val))
    mlflow.log_param("n_features", int(n_features))

    # --- train (uses helper from Cell 4)
    model = train_torch_model(
        model,
        train_loader,
        val_loader,
        epochs=LOGREG_EPOCHS,
        lr=LOGREG_LR,
        pos_weight=pos_weight_t,
        mlflow_run=mlflow.active_run()
    )

print("✅ training complete, model still in memory")


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 1/8 | Train Loss: 0.7442 | Val F1: 0.4747 | Val AUC: 0.5132 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.8%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 2/8 | Train Loss: 0.7198 | Val F1: 0.5452 | Val AUC: 0.5834 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.7%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 3/8 | Train Loss: 0.6975 | Val F1: 0.6109 | Val AUC: 0.6562 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.8%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 4/8 | Train Loss: 0.6774 | Val F1: 0.6901 | Val AUC: 0.7264 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.7%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 5/8 | Train Loss: 0.6596 | Val F1: 0.7231 | Val AUC: 0.7770 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.7%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 6/8 | Train Loss: 0.6436 | Val F1: 0.7715 | Val AUC: 0.8028 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.8%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 7/8 | Train Loss: 0.6296 | Val F1: 0.7809 | Val AUC: 0.8149 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.7%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 8/8 | Train Loss: 0.6169 | Val F1: 0.7890 | Val AUC: 0.8223 | GPU on | Mem Used: 1189.5MB / 8187.5MB | CPU: 85.8%
🏃 View run logreg_torch_gpu at: http://127.0.0.1:5000/#/experiments/1/runs/0bd5dc38957c450c9d9c3d6afcb487ba
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
✅ training complete, model still in memory


In [6]:
# ===== Cell 7 (compatible with your XGBoost version) =====
import mlflow
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Pick GPU or CPU variants of the XGBoost settings from the torch device chosen earlier.
# NOTE(review): "gpu_hist" and the "predictor" parameter belong to the pre-2.0 XGBoost
# API; newer releases use tree_method="hist" with device="cuda" — confirm against the
# installed version before upgrading.
use_gpu   = (device.type == "cuda")
tree_meth = "gpu_hist" if use_gpu else "hist"
predictor = "gpu_predictor" if use_gpu else "auto"

params = {
    "max_depth": 6,
    "eta": 0.08,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,
    "min_child_weight": 1.0,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "tree_method": tree_meth,
    "predictor": predictor,
    # Accepts either the tensor from the data-prep cell or a plain number.
    "scale_pos_weight": float(pos_weight_val.item() if hasattr(pos_weight_val, "item") else pos_weight_val),
    "seed": 42,
}

# Wrap in DMatrix (the arrays are the StandardScaler-transformed features)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val, label=y_val)

with mlflow.start_run(run_name="xgboost_gpu") as run:
    mlflow.log_param("model", "XGBoost")
    mlflow.log_param("use_gpu", use_gpu)
    mlflow.log_params(params)

    # Up to 400 boosting rounds with early stopping (patience 50) on the configured
    # eval metric. NOTE(review): with several eval sets XGBoost is documented to use
    # the last one ("val" here) for early stopping — confirm for the installed version.
    evals = [(dtrain, "train"), (dval, "val")]
    xgb_model = xgb.train(
        params,
        dtrain,
        num_boost_round=400,
        evals=evals,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Validation metrics, predicted with the trees up to and including best_iteration
    val_prob = xgb_model.predict(dval, iteration_range=(0, xgb_model.best_iteration+1))
    val_metrics = compute_metrics(y_val, val_prob)   # from Cell 4

    for k, v in val_metrics.items():
        mlflow.log_metric(f"val_{k}", float(v))

    print(f"✅ XGBoost trained. use_gpu={use_gpu} | best_iteration={xgb_model.best_iteration}")
    print("VAL metrics:", val_metrics)

# keep xgb_model in memory for test eval


✅ XGBoost trained. use_gpu=True | best_iteration=376
VAL metrics: {'roc_auc': 0.9845176864058003, 'accuracy': 0.9261222803635362, 'f1': 0.9270613826388416, 'precision': 0.9066613482249701, 'recall': 0.9484005563282336}
🏃 View run xgboost_gpu at: http://127.0.0.1:5000/#/experiments/1/runs/519388586ded4e2c91578229afac293b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [7]:
# ===== Cell 9 — MLP (Neural Net) on GPU, log metrics only (no artifacts) =====
import torch
import torch.nn as nn
import mlflow

n_features = X_train.shape[1]

# Two hidden layers (128 -> 64) with ReLU and dropout; the final layer emits one logit
# per sample (the sigmoid is applied in the loss and in predict_proba_torch).
mlp = nn.Sequential(
    nn.Linear(n_features, 128),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(64, 1),
)

EPOCHS = 10
LR = 1e-3

with mlflow.start_run(run_name="mlp_torch_gpu") as run:
    # NOTE(review): hidden1 / hidden2 / dropout repeat the literals used in the
    # nn.Sequential above; keep both places in sync when changing the architecture.
    mlflow.log_params({
        "model": "MLP_Torch",
        "device": str(device),
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LR,
        "pos_weight": float(pos_weight_val.item() if hasattr(pos_weight_val, "item") else pos_weight_val),
        "n_features": int(n_features),
        "hidden1": 128,
        "hidden2": 64,
        "dropout": 0.2,
    })

    # Train (uses helpers from earlier cells); per-epoch val_* metrics are logged by
    # train_torch_model itself because a run handle is passed in.
    mlp = train_torch_model(
        mlp, train_loader, val_loader,
        epochs=EPOCHS, lr=LR,
        pos_weight=pos_weight_val,
        mlflow_run=mlflow.active_run()
    )

    # Validate & Test (default 0.5 threshold)
    val_prob  = predict_proba_torch(mlp, val_loader)
    test_prob = predict_proba_torch(mlp, test_loader)
    val_metrics  = compute_metrics(y_val,  val_prob)
    test_metrics = compute_metrics(y_test, test_prob)

    # NOTE(review): the same val_* keys were already logged once per epoch by
    # train_torch_model; this adds one more point per key without an explicit step.
    for k, v in val_metrics.items():
        mlflow.log_metric(f"val_{k}", float(v))
    for k, v in test_metrics.items():
        mlflow.log_metric(f"test_{k}", float(v))

    print(f"✅ MLP trained. Run ID: {run.info.run_id}")
    print("VAL:", val_metrics)
    print("TEST:", test_metrics)
    print("🏃 View runs at: http://127.0.0.1:5000/#/experiments")


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 1/10 | Train Loss: 0.6531 | Val F1: 0.8522 | Val AUC: 0.8544 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 86.5%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 2/10 | Train Loss: 0.5457 | Val F1: 0.8619 | Val AUC: 0.8720 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 86.4%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 3/10 | Train Loss: 0.4494 | Val F1: 0.8607 | Val AUC: 0.8903 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 86.4%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 4/10 | Train Loss: 0.3937 | Val F1: 0.8702 | Val AUC: 0.9066 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 86.5%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 5/10 | Train Loss: 0.3556 | Val F1: 0.8878 | Val AUC: 0.9216 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 86.6%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 6/10 | Train Loss: 0.3275 | Val F1: 0.8935 | Val AUC: 0.9321 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 86.6%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 7/10 | Train Loss: 0.3061 | Val F1: 0.8982 | Val AUC: 0.9409 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 87.1%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 8/10 | Train Loss: 0.2906 | Val F1: 0.9006 | Val AUC: 0.9442 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 87.1%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 9/10 | Train Loss: 0.2794 | Val F1: 0.9009 | Val AUC: 0.9478 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 87.0%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 10/10 | Train Loss: 0.2728 | Val F1: 0.9009 | Val AUC: 0.9492 | GPU on | Mem Used: 1231.5MB / 8187.5MB | CPU: 87.0%
✅ MLP trained. Run ID: 65fe0261d59744a19a26ecc5b4f6886a
VAL: {'roc_auc': 0.9492410128602546, 'accuracy': 0.8940374552464886, 'f1': 0.9009333762471837, 'precision': 0.8385859796285201, 'recall': 0.9732962447844228}
TEST: {'roc_auc': 0.9280904590397334, 'accuracy': 0.7988024097556236, 'f1': 0.8429545217531452, 'precision': 0.7391391091825029, 'recall': 0.980697961704756}
🏃 View runs at: http://127.0.0.1:5000/#/experiments
🏃 View run mlp_torch_gpu at: http://127.0.0.1:5000/#/experiments/1/runs/65fe0261d59744a19a26ecc5b4f6886a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [8]:
# ===== Cell 8 — Train Random Forest Classifier =====
import mlflow
from sklearn.ensemble import RandomForestClassifier

with mlflow.start_run(run_name="random_forest_cpu") as run:
    # --- Model setup ---
    # Using class_weight='balanced' helps with any residual class imbalance.
    # n_jobs=-1 lets scikit-learn use all available cores; random_state fixes the forest.
    rf_model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'
    )
    
    print("Training RandomForestClassifier...")

    # NOTE(review): these logged values repeat the constructor literals above.
    mlflow.log_param("model", "RandomForestClassifier")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("class_weight", "balanced")

    # --- Train the model ---
    # Fitted on the scaled training matrix produced in the data-prep cell.
    rf_model.fit(X_train, y_train)
    print("✅ RandomForest trained.")

    # --- Get validation metrics ---
    # We need the probability of the positive class (1) for ROC AUC
    val_prob = rf_model.predict_proba(X_val)[:, 1]
    
    # Use the helper function from Cell 4 to calculate metrics
    val_metrics = compute_metrics(y_val, val_prob)

    # Log metrics to MLflow
    for k, v in val_metrics.items():
        mlflow.log_metric(f"val_{k}", float(v))

    print("\nVAL metrics:", val_metrics)



Training RandomForestClassifier...
✅ RandomForest trained.

VAL metrics: {'roc_auc': 0.9821188432863417, 'accuracy': 0.9219911870008263, 'f1': 0.9230246620014947, 'precision': 0.9022446540045159, 'recall': 0.9447844228094576}
🏃 View run random_forest_cpu at: http://127.0.0.1:5000/#/experiments/1/runs/ed7d75d4064a4632b3b7a99c72d4b0e7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [9]:
# ===== Cell 10 — Compare all models on TEST (with per-model best thresholds) =====
import numpy as np
import pandas as pd

def best_threshold(y_true, y_prob, thresholds=None):
    """Grid-search the decision threshold that maximises F1.

    Args:
        y_true: Ground-truth labels.
        y_prob: Array of predicted scores (not pre-thresholded).
        thresholds: Optional iterable of candidate thresholds. Defaults to 19 evenly
            spaced values from 0.05 to 0.95 (step 0.05).

    Returns:
        ``(threshold, f1)`` as floats. On ties the earliest candidate wins; if no
        candidate is given, the fallback ``(0.5, -1.0)`` is returned.
    """
    if thresholds is None:
        thresholds = np.linspace(0.05, 0.95, 19)
    best_t, best_f1 = 0.5, -1.0
    for t in thresholds:
        # Hand the raw scores plus the candidate threshold to compute_metrics instead
        # of pre-binarising them and relying on its default 0.5 cut-off.
        f1 = compute_metrics(y_true, y_prob, threshold=t)["f1"]
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return float(best_t), float(best_f1)

# One result row per model that is still in kernel memory. For each model the decision
# threshold is chosen on the validation split (best F1) and then applied unchanged to
# the test split. A model whose variable is missing is skipped with a message.
rows = []

# ----- Logistic (torch) if present -----
# `model` is the logistic-regression network trained in the earlier torch cell.
if "model" in globals():
    val_prob_log = predict_proba_torch(model, val_loader)
    t_log, _ = best_threshold(y_val, val_prob_log)

    test_prob_log = predict_proba_torch(model, test_loader)
    m_log = compute_metrics(y_test, test_prob_log, threshold=t_log)
    rows.append({
        "model": "LogReg_Torch",
        "val_best_thr": t_log,
        **m_log
    })
else:
    print("↪️ Skipping logistic: variable `model` not found.")

# ----- XGBoost if present -----
if "xgb_model" in globals():
    import xgboost as xgb
    # The Booster API predicts on DMatrix objects, so val/test are wrapped here.
    dval  = xgb.DMatrix(X_val,  label=y_val)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Predict with the trees up to and including the early-stopping best iteration.
    val_prob_xgb  = xgb_model.predict(dval,  iteration_range=(0, xgb_model.best_iteration + 1))
    t_xgb, _      = best_threshold(y_val, val_prob_xgb)

    test_prob_xgb = xgb_model.predict(dtest, iteration_range=(0, xgb_model.best_iteration + 1))
    m_xgb = compute_metrics(y_test, test_prob_xgb, threshold=t_xgb)
    rows.append({
        "model": "XGBoost",
        "val_best_thr": t_xgb,
        **m_xgb
    })
else:
    print("↪️ Skipping XGBoost: variable `xgb_model` not found.")

# ----- MLP (torch) if present -----
if "mlp" in globals():
    val_prob_mlp = predict_proba_torch(mlp, val_loader)
    t_mlp, _     = best_threshold(y_val, val_prob_mlp)

    test_prob_mlp = predict_proba_torch(mlp, test_loader)
    m_mlp = compute_metrics(y_test, test_prob_mlp, threshold=t_mlp)
    rows.append({
        "model": "MLP_Torch",
        "val_best_thr": t_mlp,
        **m_mlp
    })
else:
    print("↪️ Skipping MLP: variable `mlp` not found.")

# ----- RandomForest if present -----
if "rf_model" in globals():
    # Use validation set to find the best threshold for F1 score
    val_prob_rf = rf_model.predict_proba(X_val)[:, 1]
    t_rf, _     = best_threshold(y_val, val_prob_rf)

    # Use the test set to get final performance metrics
    test_prob_rf = rf_model.predict_proba(X_test)[:, 1]
    m_rf = compute_metrics(y_test, test_prob_rf, threshold=t_rf)
    rows.append({
        "model": "RandomForest",
        "val_best_thr": t_rf,
        **m_rf
    })
else:
    print("↪️ Skipping RandomForest: variable `rf_model` not found.")


# ----- Show comparison -----
# Rows are ordered by test ROC-AUC, then F1 (best first). When every model was skipped
# there are no columns to select, so report that instead of raising a KeyError.
cols = ["model", "val_best_thr", "roc_auc", "accuracy", "f1", "precision", "recall"]
if rows:
    comparison_df = pd.DataFrame(rows)[cols].sort_values(by=["roc_auc","f1"], ascending=False)
    display(comparison_df)
else:
    print("No models were found to evaluate.")



Unnamed: 0,model,val_best_thr,roc_auc,accuracy,f1,precision,recall
1,XGBoost,0.45,0.979767,0.852378,0.879728,0.79771,0.980544
3,RandomForest,0.5,0.975089,0.865994,0.888784,0.818344,0.972492
2,MLP_Torch,0.35,0.92809,0.790555,0.838835,0.72775,0.989941
0,LogReg_Torch,0.45,0.715268,0.6921,0.774084,0.649391,0.958043


In [10]:
# ===== Cell 9 (Final Version) — Stable Tuning with Manual Logging =====
import mlflow
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from tqdm.auto import tqdm # Import tqdm for the progress bar

# --- 1. Define Models and Parameter Grids ---
# Each entry pairs an unfitted estimator with the discrete values RandomizedSearchCV
# samples from.
# NOTE(review): unlike the earlier Booster-based XGBoost cell, the XGBClassifier here
# sets neither scale_pos_weight nor any GPU option. `use_label_encoder` is an argument
# of older XGBoost releases — confirm it is still accepted by the installed version.
models_to_tune = {
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42, class_weight='balanced'),
        "params": {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    "XGBoost": {
        "model": xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
        "params": {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.05, 0.1, 0.2],
            'max_depth': [5, 7, 10],
            'subsample': [0.7, 0.8, 1.0]
        }
    }
}

# --- 2. Run RandomizedSearchCV for each model ---

# Maps model name -> best estimator found; read by the comparison and export cells.
best_estimators = {}

# Wrap the items() with tqdm to show progress for each model being tuned
for model_name, config in tqdm(models_to_tune.items(), desc="Overall Tuning Progress"):
    # One MLflow run per tuned model.
    with mlflow.start_run(run_name=f"random_tuning_{model_name}") as parent_run:
        print(f"\n--- Tuning {model_name} with RandomizedSearchCV ---")
        
        # NOTE: mlflow.sklearn.autolog() has been REMOVED to prevent crashes.

        # 10 sampled configurations x 3 folds = 30 fits per model, scored by F1.
        random_search = RandomizedSearchCV(
            estimator=config["model"],
            param_distributions=config["params"],
            n_iter=10,
            cv=3,
            scoring='f1',
            verbose=1,
            random_state=42,
            n_jobs=1  # Keep at 1 for stability
        )
        
        # The search only sees the (scaled) training split; X_val is not used here.
        random_search.fit(X_train, y_train)

        print(f"\n✅ Best parameters for {model_name}: {random_search.best_params_}")
        print(f"🏆 Best cross-validated F1-score: {random_search.best_score_:.4f}")
        
        # --- Manually log the most important results to the parent run ---
        mlflow.log_metric("best_cv_f1_score", random_search.best_score_)
        mlflow.log_params(random_search.best_params_)

        best_estimators[model_name] = random_search.best_estimator_

print("\n✅ Hyperparameter tuning complete.")

Overall Tuning Progress:   0%|          | 0/2 [00:00<?, ?it/s]


--- Tuning RandomForest with RandomizedSearchCV ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits

✅ Best parameters for RandomForest: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 30}
🏆 Best cross-validated F1-score: 0.9244
🏃 View run random_tuning_RandomForest at: http://127.0.0.1:5000/#/experiments/1/runs/c1a7f98460984e41930a422b0afe8942
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

--- Tuning XGBoost with RandomizedSearchCV ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits

✅ Best parameters for XGBoost: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.05}
🏆 Best cross-validated F1-score: 0.9247
🏃 View run random_tuning_XGBoost at: http://127.0.0.1:5000/#/experiments/1/runs/e01b0795985a45e3b68aff4ac1b9f32b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

✅ Hyperparameter tuning complete.


In [11]:
# ===== Cell 10 — Compare FINAL TUNED models on TEST =====
import numpy as np
import pandas as pd

# best_threshold is defined in this cell so the cell does not depend on the earlier
# comparison cell having been run.
def best_threshold(y_true, y_prob, thresholds=None):
    """Grid-search the decision threshold that maximises F1.

    Args:
        y_true: Ground-truth labels.
        y_prob: Array of predicted scores (not pre-thresholded).
        thresholds: Optional iterable of candidate thresholds. Defaults to 19 evenly
            spaced values from 0.05 to 0.95 (step 0.05).

    Returns:
        ``(threshold, f1)`` as floats. On ties the earliest candidate wins; if no
        candidate is given, the fallback ``(0.5, -1.0)`` is returned.
    """
    if thresholds is None:
        thresholds = np.linspace(0.05, 0.95, 19)
    best_t, best_f1 = 0.5, -1.0
    for t in thresholds:
        # Hand the raw scores plus the candidate threshold to compute_metrics instead
        # of pre-binarising them and relying on its default 0.5 cut-off.
        f1 = compute_metrics(y_true, y_prob, threshold=t)["f1"]
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return float(best_t), float(best_f1)

# One result row per tuned estimator. As in the earlier comparison, the threshold is
# picked on the validation split and then applied unchanged to the test split.
rows = []

# --- Use the 'best_estimators' dictionary from the tuning cell (Cell 9) ---

# ----- Tuned RandomForest -----
if "RandomForest" in best_estimators:
    tuned_rf = best_estimators["RandomForest"]
    
    # Use validation set to find the best threshold for F1 score
    val_prob_rf = tuned_rf.predict_proba(X_val)[:, 1]
    t_rf, _     = best_threshold(y_val, val_prob_rf)

    # Use the test set to get final performance metrics
    test_prob_rf = tuned_rf.predict_proba(X_test)[:, 1]
    m_rf = compute_metrics(y_test, test_prob_rf, threshold=t_rf)
    rows.append({
        "model": "RandomForest_Tuned",
        "val_best_thr": t_rf,
        **m_rf
    })
else:
    print("↪️ Skipping Tuned RandomForest: model not found in best_estimators.")

# ----- Tuned XGBoost -----
# The tuned model is a scikit-learn style XGBClassifier, so predict_proba works on the
# NumPy arrays directly (no DMatrix needed).
if "XGBoost" in best_estimators:
    tuned_xgb = best_estimators["XGBoost"]

    val_prob_xgb  = tuned_xgb.predict_proba(X_val)[:, 1]
    t_xgb, _      = best_threshold(y_val, val_prob_xgb)

    test_prob_xgb = tuned_xgb.predict_proba(X_test)[:, 1]
    m_xgb = compute_metrics(y_test, test_prob_xgb, threshold=t_xgb)
    rows.append({
        "model": "XGBoost_Tuned",
        "val_best_thr": t_xgb,
        **m_xgb
    })
else:
    print("↪️ Skipping Tuned XGBoost: model not found in best_estimators.")

# ----- Show final comparison -----
# Ordered by test ROC-AUC, then F1 (best first); guarded for the no-model case.
cols = ["model", "val_best_thr", "roc_auc", "accuracy", "f1", "precision", "recall"]
if rows:
    comparison_df = pd.DataFrame(rows)[cols].sort_values(by=["roc_auc", "f1"], ascending=False)
    display(comparison_df)
else:
    print("No tuned models were found to evaluate.")



Unnamed: 0,model,val_best_thr,roc_auc,accuracy,f1,precision,recall
1,XGBoost_Tuned,0.5,0.980489,0.864791,0.888269,0.814917,0.976132
0,RandomForest_Tuned,0.5,0.979094,0.856909,0.88281,0.803921,0.978867


In [13]:
# ===== Cell 11 — Create Model Artifacts =====
import pickle

# --- 1. Define artifact filenames ---
BEST_XGBOOST_FILENAME = "best_xgboost_model.pkl"
BEST_RF_FILENAME = "best_randomforest_model.pkl"
PREPROCESSING_FILENAME = "preprocessing.pkl"


def _dump_pickle(obj, filename):
    """Serialize ``obj`` to ``filename`` with pickle, overwriting any existing file."""
    with open(filename, "wb") as f:
        pickle.dump(obj, f)


# --- 2. Save the best tuned XGBoost model ---
# We select the tuned XGBoost model from the 'best_estimators' dictionary created during hyperparameter tuning.
if "XGBoost" in best_estimators:
    best_xgb_model = best_estimators["XGBoost"]
    _dump_pickle(best_xgb_model, BEST_XGBOOST_FILENAME)
    print(f"✅ Best XGBoost model saved to: {BEST_XGBOOST_FILENAME}")
else:
    print("↪️ Could not save best_xgboost_model.pkl: Tuned XGBoost model not found.")

# --- 3. Save the best tuned Random Forest model ---
# We select the tuned Random Forest model from the 'best_estimators' dictionary.
if "RandomForest" in best_estimators:
    best_rf_model = best_estimators["RandomForest"]
    _dump_pickle(best_rf_model, BEST_RF_FILENAME)
    print(f"✅ Best Random Forest model saved to: {BEST_RF_FILENAME}")
else:
    print("↪️ Could not save best_randomforest_model.pkl: Tuned Random Forest model not found.")

# --- 4. Save the preprocessing the models depend on ---
# Both estimators were fitted on StandardScaler-transformed features in the column
# order of `feature_cols`. Without these two objects the pickled models cannot be fed
# correctly prepared inputs, so they are exported next to the models.
if "scaler" in globals() and "feature_cols" in globals():
    _dump_pickle({"scaler": scaler, "feature_cols": list(feature_cols)}, PREPROCESSING_FILENAME)
    print(f"✅ Preprocessing (scaler + feature order) saved to: {PREPROCESSING_FILENAME}")
else:
    print("↪️ Could not save preprocessing.pkl: `scaler` / `feature_cols` not found.")



✅ Best XGBoost model saved to: best_xgboost_model.pkl
✅ Best Random Forest model saved to: best_randomforest_model.pkl


In [14]:
# Report the width (number of feature columns) of the training matrix the models were
# fitted on; inputs passed to the saved models need the same number of columns.
print(f"Number of features the model was trained on: {X_train.shape[1]}")

Number of features the model was trained on: 34
