# XGBoost

In [2]:
import pandas as pd

train_df = pd.read_csv("../data/processed_train_df.csv")
print(train_df.shape)

test_df = pd.read_csv("../data/processed_test_df.csv")
print(test_df.shape)

(227845, 31)
(56962, 31)


In [None]:
# Split out labels

y_train = train_df['Class']
X_train = train_df.drop(columns=['Class'])

y_test = test_df['Class']
X_test = test_df.drop(columns=['Class'])

In [11]:
from sklearn.model_selection import train_test_split

# Build a validation with 20% of the data (25% of the train dataset)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

## Model Building

In [12]:
import numpy as np

neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)

scale_pos_weight: 578.264406779661


In [None]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=2000,          # large + early stopping
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="aucpr",        # optimise PR AUC
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=100
)

xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

best_iters = xgb.best_iteration
best_iters

[0]	validation_0-aucpr:0.56423
[1]	validation_0-aucpr:0.55914
[2]	validation_0-aucpr:0.66437
[3]	validation_0-aucpr:0.64733
[4]	validation_0-aucpr:0.63668
[5]	validation_0-aucpr:0.64678
[6]	validation_0-aucpr:0.64763
[7]	validation_0-aucpr:0.64101
[8]	validation_0-aucpr:0.65703
[9]	validation_0-aucpr:0.67387
[10]	validation_0-aucpr:0.67378
[11]	validation_0-aucpr:0.66559
[12]	validation_0-aucpr:0.68118
[13]	validation_0-aucpr:0.68612
[14]	validation_0-aucpr:0.68774
[15]	validation_0-aucpr:0.69104
[16]	validation_0-aucpr:0.69180
[17]	validation_0-aucpr:0.69211
[18]	validation_0-aucpr:0.68611
[19]	validation_0-aucpr:0.68874
[20]	validation_0-aucpr:0.68793
[21]	validation_0-aucpr:0.68672
[22]	validation_0-aucpr:0.68676
[23]	validation_0-aucpr:0.69616
[24]	validation_0-aucpr:0.69572
[25]	validation_0-aucpr:0.69626
[26]	validation_0-aucpr:0.69626
[27]	validation_0-aucpr:0.69433
[28]	validation_0-aucpr:0.69758
[29]	validation_0-aucpr:0.69739
[30]	validation_0-aucpr:0.69685
[31]	validation_0-

57

In [26]:
from sklearn.metrics import confusion_matrix

def precision_recall_at_k(y_true, scores, k=0.01):
    n = len(scores)
    top_n = max(1, int(np.ceil(k * n)))
    idx = np.argsort(-scores)[:top_n]
    precision = y_true.iloc[idx].mean()
    recall = y_true.iloc[idx].sum() / y_true.sum()
    return float(precision), float(recall)


def confusion_at_k(y_true, scores, k=0.01):
    n = len(scores)
    top_n = max(1, int(np.ceil(k * n)))
    thresh = np.partition(scores, -top_n)[-top_n]  # threshold for top-k%
    y_pred = (scores >= thresh).astype(int)
    return confusion_matrix(y_true, y_pred)

In [27]:
from sklearn.metrics import roc_auc_score, average_precision_score

# Predict probabilities
proba_test = xgb.predict_proba(X_test)[:, 1]

roc = roc_auc_score(y_test, proba_test)
pr  = average_precision_score(y_test, proba_test)

p_at_1,  r_at_1  = precision_recall_at_k(y_test, proba_test, k=0.01)
p_at_05, r_at_05 = precision_recall_at_k(y_test, proba_test, k=0.005)

print(f"ROC AUC: {roc:.4f}")
print(f"PR  AUC: {pr:.4f}")
print()
print(f"P@1%={p_at_1:.4f}, R@1%={r_at_1:.4f}")
print(f"P@0.5%={p_at_05:.4f}, R@0.5%={r_at_05:.4f}")
print()

# Confusion matrices at operating points (optional)
cm_1  = confusion_at_k(y_test, proba_test, k=0.01)
cm_05 = confusion_at_k(y_test, proba_test, k=0.005)
print("Confusion @1%:\n", cm_1)
print()
print("Confusion @0.5%:\n", cm_05)


ROC AUC: 0.9736
PR  AUC: 0.7016

P@1%=0.1491, R@1%=0.8673
P@0.5%=0.2947, R@0.5%=0.8571

Confusion @1%:
 [[56379   485]
 [   13    85]]

Confusion @0.5%:
 [[56663   201]
 [   14    84]]


## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import StratifiedKFold, ParameterGrid

skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Narrowed down through multiple iterations

param_grid = {
    "max_depth":        [4, 5],
    "min_child_weight": [3, 4],
    # "subsample":        [0.8, 0.9, 1.0],
    # "colsample_bytree": [0.8, 0.9, 1.0],
    "learning_rate":    [0.04, 0.05, 0.06],
}

runs = []
best_params = None
best_cv_pr = -np.inf
best_model = None

for params in ParameterGrid(param_grid):

    print(params)

    pr_list, roc_list = [], []
    p1_list, r1_list = [], []
    p05_list, r05_list = [], []
    last_model = None

    # Split data into train and validation subsets
    for tr_idx, va_idx in skf.split(X_train, y_train):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

        # Fix class imbalance per fold
        pos = y_tr.sum()
        neg = len(y_tr) - pos
        spw = (neg / max(pos, 1)) if pos > 0 else 1.0

        model = XGBClassifier(
            n_estimators=2000,            # large cap + early stopping
            objective="binary:logistic",
            eval_metric="aucpr",          # PR AUC
            tree_method="hist",
            subsample = 1,                # chosen via the param_grid
            colsample_bytree = 1,         # chosen via the param_grid
            random_state=42,
            n_jobs=-1,
            scale_pos_weight=spw,
            early_stopping_rounds=50,
            **params
        )

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False
        )

        last_model = model

        proba = model.predict_proba(X_va)[:, 1]

        pr_list.append(average_precision_score(y_va, proba))
        roc_list.append(roc_auc_score(y_va, proba))

        n = len(proba)
        for k_frac, P_list, R_list in [(0.01, p1_list, r1_list), (0.005, p05_list, r05_list)]:
            top_n = max(1, int(np.ceil(k_frac * n)))
            thresh = np.partition(proba, -top_n)[-top_n]
            y_pred = (proba >= thresh).astype(int)

            flagged = y_pred == 1
            precision = float(y_va[flagged].mean()) if flagged.any() else 0.0
            recall = float((y_va[flagged].sum() / y_va.sum())) if y_va.sum() > 0 else 0.0

            P_list.append(precision)
            R_list.append(recall)

    cv_pr   = float(np.mean(pr_list))
    cv_roc  = float(np.mean(roc_list))
    cv_p1   = float(np.mean(p1_list))
    cv_r1   = float(np.mean(r1_list))
    cv_p05  = float(np.mean(p05_list))
    cv_r05  = float(np.mean(r05_list))

    runs.append({
        **params,
        "cv_pr_auc":  cv_pr,
        "cv_roc_auc": cv_roc,
        "p_at_1_cv":  cv_p1, 
        "r_at_1_cv":  cv_r1,
        "p_at_05_cv": cv_p05,
        "r_at_05_cv": cv_r05
    })

    # select by CV PR-AUC, tie-break by R@1%
    if (cv_pr > best_cv_pr) or (np.isclose(cv_pr, best_cv_pr) and cv_r1 > (runs[-1]["r_at_1_cv"])):
        best_cv_pr = cv_pr
        best_params = params
        best_model = last_model

results_df = pd.DataFrame(runs).sort_values(
    by=["cv_pr_auc", "r_at_1_cv"], ascending=[False, False]
).reset_index(drop=True)

print("Top configs (Stratified 4-fold CV):")
display(results_df.head(10))
print("\nSelected params:", best_params)
print(f"Selected CV PR AUC: {best_cv_pr:.4f}")

{'learning_rate': 0.04, 'max_depth': 4, 'min_child_weight': 3}
{'learning_rate': 0.04, 'max_depth': 4, 'min_child_weight': 4}
{'learning_rate': 0.04, 'max_depth': 5, 'min_child_weight': 3}
{'learning_rate': 0.04, 'max_depth': 5, 'min_child_weight': 4}
{'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 3}
{'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 4}
{'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 3}
{'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 4}
{'learning_rate': 0.06, 'max_depth': 4, 'min_child_weight': 3}
{'learning_rate': 0.06, 'max_depth': 4, 'min_child_weight': 4}
{'learning_rate': 0.06, 'max_depth': 5, 'min_child_weight': 3}
{'learning_rate': 0.06, 'max_depth': 5, 'min_child_weight': 4}
Top configs (Stratified 4-fold CV):


Unnamed: 0,learning_rate,max_depth,min_child_weight,cv_pr_auc,cv_roc_auc,p_at_1_cv,r_at_1_cv,p_at_05_cv,r_at_05_cv
0,0.05,5,4,0.85574,0.981864,0.15479,0.898371,0.303738,0.881387
1,0.04,5,4,0.855516,0.981213,0.155374,0.901749,0.30257,0.878008
2,0.06,5,3,0.85463,0.981068,0.155958,0.905128,0.301402,0.874537
3,0.04,5,3,0.851393,0.980916,0.153621,0.891568,0.30257,0.877916
4,0.05,5,3,0.850468,0.97954,0.155374,0.901703,0.301402,0.874583
5,0.06,5,4,0.848339,0.981221,0.153621,0.891568,0.301402,0.874583
6,0.05,4,3,0.841005,0.982572,0.155374,0.901749,0.299065,0.867734
7,0.06,4,3,0.834078,0.975187,0.154206,0.8949,0.301402,0.874537
8,0.05,4,4,0.825685,0.975459,0.153621,0.891522,0.300234,0.871159
9,0.04,4,4,0.822183,0.975328,0.153621,0.891522,0.300234,0.871159



Selected params: {'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 4}
Selected CV PR-AUC: 0.8557


## Evaluation

In [46]:
model = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.06,
    max_depth=5,
    min_child_weight=3,
    objective="binary:logistic",
    eval_metric="aucpr",
    tree_method="hist",
    subsample = 1,
    colsample_bytree = 1,
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    early_stopping_rounds=50,
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

proba = model.predict_proba(X_test)[:, 1]

roc = roc_auc_score(y_test, proba)
pr_auc = average_precision_score(y_test, proba)
p_at_1,  r_at_1  = precision_recall_at_k(y_test, proba, k=0.01)
p_at_05, r_at_05 = precision_recall_at_k(y_test, proba, k=0.005)

print({
    "pr_auc":  pr_auc,
    "roc_auc": roc,
    "p_at_1":  p_at_1,
    "r_at_1_cv":  r_at_1,
    "p_at_05": p_at_05,
    "r_at_05_cv": r_at_05
})

{'pr_auc': 0.8817067124420728, 'roc_auc': 0.9753096898579352, 'p_at_1': 0.156140350877193, 'r_at_1_cv': 0.9081632653061225, 'p_at_05': 0.312280701754386, 'r_at_05_cv': 0.9081632653061225}
