In [1]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.1.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.0-py3-none-macosx_12_0_arm64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.1.0


In [9]:
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Input and output directories, given relative to the notebook's working directory.
DATA_DIR, OUTPUT_DIR = "../dataset", "../results"

# Full paths of the training and testing CSV partitions inside DATA_DIR.
TRAIN_CSV, TEST_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("UNSW_NB15_training-set.csv", "UNSW_NB15_testing-set.csv")
)

In [4]:
# load data
def load_data(train_csv, test_csv):
    """Read the training and testing CSV files and normalise their headers.

    Parameters
    ----------
    train_csv, test_csv : str or path-like
        Locations of the two CSV files; each is passed to ``pd.read_csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` whose column names have surrounding whitespace
        removed and are lower-cased.

    Raises
    ------
    ValueError
        If either file has no ``label`` column once the headers are
        normalised. Downstream cleaning reads that column, so failing here
        gives a clearer message than a later ``KeyError``.
    """
    frames = {}
    for split_name, csv_path in (("train", train_csv), ("test", test_csv)):
        # low_memory=False: infer each column's dtype from the whole file
        # instead of chunk by chunk, which avoids mixed-type columns.
        frame = pd.read_csv(csv_path, low_memory=False)
        frame.columns = frame.columns.str.strip().str.lower()
        if "label" not in frame.columns:
            raise ValueError(
                f"{split_name} file {csv_path!r} has no 'label' column; "
                f"found columns: {list(frame.columns)}"
            )
        frames[split_name] = frame
    return frames["train"], frames["test"]

# Read both CSV partitions once; the cleaning and modelling cells start from these frames.
train_df, test_df = load_data(train_csv=TRAIN_CSV, test_csv=TEST_CSV)

In [5]:
# clean data
def _split_features_and_label(df, drop_attack_cat):
    """Return ``(features, y)`` with bookkeeping columns and the label removed."""
    features = df.copy()
    unused = [
        c
        for c in ("id", "label.1", "stime", "ltime", "timestamp", "time")
        if c in features.columns
    ]
    if unused:
        features = features.drop(columns=unused)
    y = features["label"].astype(int)
    features = features.drop(columns=["label"])
    if drop_attack_cat and "attack_cat" in features.columns:
        features = features.drop(columns=["attack_cat"])
    return features, y


def _finite_row_mask(features):
    """Boolean array that is True for rows holding no NaN and no +/-inf value."""
    return features.replace([np.inf, -np.inf], np.nan).notna().all(axis=1).to_numpy()


def clean(df, drop_attack_cat=True, reference=None):
    """Turn a raw frame into a standardised feature matrix and its labels.

    Steps: drop the ``id``/``label.1``/``stime``/``ltime``/``timestamp``/``time``
    columns when present, split off ``label``, optionally drop ``attack_cat``,
    integer-encode object-dtype columns, drop rows containing NaN or +/-inf,
    and standardise the remaining values with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; must contain a ``label`` column. It is not modified.
    drop_attack_cat : bool, default True
        Remove the ``attack_cat`` column from the features when it exists.
    reference : pandas.DataFrame, optional
        Frame the preprocessing is fitted on. When given, the category-to-code
        mapping and the scaler statistics are learned from ``reference`` and
        applied to ``df``; category values absent from ``reference`` are
        encoded as ``-1``. When omitted, both are learned from ``df`` itself,
        which is the original behaviour.

    Returns
    -------
    X : numpy.ndarray
        Standardised feature matrix, one row per retained row of ``df``.
    y : pandas.Series
        Integer labels for exactly the rows kept in ``X`` (same order, original
        index labels).

    Raises
    ------
    ValueError
        If ``reference`` lacks one of the feature columns of ``df``.
    """
    features, y = _split_features_and_label(df, drop_attack_cat)

    ref_features = None
    if reference is not None:
        ref_features, _ = _split_features_and_label(reference, drop_attack_cat)
        missing = [c for c in features.columns if c not in ref_features.columns]
        if missing:
            raise ValueError(f"reference is missing feature columns: {missing}")
        # Same columns in the same order, so the scaler statistics line up.
        ref_features = ref_features.loc[:, list(features.columns)].copy()

    object_cols = set(features.select_dtypes(include=["object"]).columns)
    if ref_features is not None:
        object_cols |= set(ref_features.select_dtypes(include=["object"]).columns)

    for col in [c for c in features.columns if c in object_cols]:
        if ref_features is None:
            # Codes follow the sorted unique values; missing values become -1.
            features[col] = pd.factorize(features[col], sort=True)[0]
        else:
            ref_codes, vocabulary = pd.factorize(ref_features[col], sort=True)
            ref_features[col] = ref_codes
            # Reuse the reference vocabulary; unseen or missing values map to -1.
            features[col] = pd.Index(vocabulary).get_indexer(features[col])

    # Filter X and y with the same mask. Dropping rows from the features alone
    # would leave y longer than X and shift every label after the first drop.
    keep = _finite_row_mask(features)
    features, y = features.loc[keep], y.loc[keep]

    scaler = StandardScaler()
    if ref_features is None:
        X = scaler.fit_transform(features.values)
    else:
        scaler.fit(ref_features.loc[_finite_row_mask(ref_features)].values)
        X = scaler.transform(features.values)
    return X, y

In [6]:
def eval_cls(model_name, y_true, y_hat, proba=None):
    """Summarise binary-classification quality as a flat dictionary.

    Parameters
    ----------
    model_name : str
        Label stored under the ``"model"`` key.
    y_true, y_hat : array-like
        Ground-truth and predicted class labels.
    proba : array-like, optional
        Scores passed to ``roc_auc_score``; when omitted no AUC is computed.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision_att``, ``recall_att``,
        ``f1_att`` and ``roc_auc``. Precision, recall and F1 use
        ``average='binary'``; the ``_att`` suffix presumably stands for the
        attack class. ``roc_auc`` is ``None`` when ``proba`` is not given or
        the AUC could not be computed.

    Warns
    -----
    RuntimeWarning
        When ``roc_auc_score`` raises. The failure is reported instead of
        being silently discarded, and the other metrics are still returned.
    """
    accuracy = accuracy_score(y_true, y_hat)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_hat, average="binary", zero_division=0
    )

    auc = None
    if proba is not None:
        try:
            auc = roc_auc_score(y_true, proba)
        except Exception as exc:
            # Best effort: keep the remaining metrics, but say why AUC is missing.
            warnings.warn(
                f"{model_name}: ROC AUC could not be computed ({exc}); reporting None.",
                RuntimeWarning,
                stacklevel=2,
            )

    return {
        "model": model_name,
        "accuracy": accuracy,
        "precision_att": precision,
        "recall_att": recall,
        "f1_att": f1,
        "roc_auc": auc,
    }

def show_report(name, y_true, y_hat):
    """Print a titled confusion matrix and classification report.

    Parameters
    ----------
    name : str
        Heading printed above the two reports.
    y_true, y_hat : array-like
        Ground-truth and predicted class labels.

    Uses the ``confusion_matrix`` and ``classification_report`` imported at the
    top of the notebook, so no function-local re-import is needed.
    """
    print(f"\n=== {name} ===")
    print(confusion_matrix(y_true, y_hat))
    print(classification_report(y_true, y_hat, digits=4))

In [7]:
# NOTE(review): clean() is invoked separately on each split, so the categorical
# codes and the scaling statistics are derived from each split on its own rather
# than from the training data. The same category can therefore receive different
# codes in train and test; preprocessing should be fitted on train and reused.
X_train, y_train = clean(train_df)
X_test,  y_test  = clean(test_df)

# Ratio of label-0 to label-1 training rows, used to re-weight the positive class.
class_counts = y_train.value_counts()

xgb_clf = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
    tree_method="hist",
    scale_pos_weight=class_counts[0] / class_counts[1],  # handle imbalance
)
xgb_clf.fit(X_train, y_train)
y_hat_xgb = xgb_clf.predict(X_test)
# XGBClassifier always provides predict_proba, so an exception here is a real
# error and is allowed to surface instead of being replaced by None.
proba_xgb = xgb_clf.predict_proba(X_test)[:, 1]

show_report("XGBoost Classifier", y_test, y_hat_xgb)
res_xgb = eval_cls("XGB", y_test, y_hat_xgb, proba_xgb)
# Show the summary metrics (including ROC AUC) as the cell's output.
res_xgb


=== XGBoost Classifier ===
[[25018 11982]
 [ 7314 38018]]
              precision    recall  f1-score   support

           0     0.7738    0.6762    0.7217     37000
           1     0.7604    0.8387    0.7976     45332

    accuracy                         0.7656     82332
   macro avg     0.7671    0.7574    0.7596     82332
weighted avg     0.7664    0.7656    0.7635     82332



In [13]:
MODEL_NAME = "XG Boost"

# Create the results directory up front: to_csv, savefig and open below do not
# create missing parent directories, so a fresh checkout would otherwise fail.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Confusion matrix as a labelled table (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_hat_xgb)
cm_table = pd.DataFrame(cm, index=["Actual_0", "Actual_1"], columns=["Pred_0", "Pred_1"])
cm_table.to_csv(os.path.join(OUTPUT_DIR, f"{MODEL_NAME}_cm.csv"), index=True)

# Same matrix as an image; the figure is closed after saving, so it is only written to disk.
fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm).plot(ax=ax, colorbar=False)
ax.set_title(f"{MODEL_NAME} Confusion Matrix")
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, f"{MODEL_NAME}_cm.png"), dpi=160)
plt.close(fig)

# Plain-text classification report next to the other artefacts.
rep = classification_report(y_test, y_hat_xgb, digits=4)
with open(os.path.join(OUTPUT_DIR, f"{MODEL_NAME}_report.txt"), "w", encoding="utf-8") as f:
    f.write(rep)

In [12]:
# Save the ROC curve when positive-class scores are available (proba_xgb is not None).
# No NameError guard: under Restart & Run All every name used here is already
# defined, and swallowing NameError would also hide typos and skip the plot silently.
if proba_xgb is not None:
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # savefig does not create missing directories
    fig, ax = plt.subplots()
    RocCurveDisplay.from_predictions(y_test, proba_xgb, ax=ax)
    ax.set_title(f"{MODEL_NAME} ROC")
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f"{MODEL_NAME}_roc.png"), dpi=160)
    plt.close(fig)