In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay
)
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression

import os

In [13]:
# Input and output locations, relative to the notebook's working directory.
DATA_DIR = "dataset"
OUTPUT_DIR = "results"

# Full paths of the two CSV splits stored inside DATA_DIR.
TRAIN_CSV, TEST_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("UNSW_NB15_training-set.csv", "UNSW_NB15_testing-set.csv")
)

# Create the results directory up front; exist_ok=True makes re-running this
# cell harmless when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [16]:
#load data
def load_data(train_csv, test_csv):
    """Read the train and test CSV files into DataFrames.

    Column names of both frames are normalised by stripping surrounding
    whitespace and lower-casing them, so later cells can refer to columns by
    one canonical spelling.

    Parameters
    ----------
    train_csv, test_csv : path-like
        Locations of the two CSV files, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` with normalised column names.

    Notes
    -----
    A ``label`` column is expected to be present, but this function does not
    check for it.
    """
    def _read_normalised(csv_path):
        # low_memory=False makes pandas parse the file in one pass instead of
        # in chunks.
        frame = pd.read_csv(csv_path, low_memory=False)
        frame.columns = [name.strip().lower() for name in frame.columns]
        return frame

    return _read_normalised(train_csv), _read_normalised(test_csv)

# Load both splits once; the cells below derive their inputs from these frames.
train_df, test_df = load_data(train_csv=TRAIN_CSV, test_csv=TEST_CSV)

In [17]:
#clean data
def clean(df, drop_attack_cat=True, scaler=None, fit_scaler=True):
    """Turn a raw frame into a standardized feature matrix plus its labels.

    Steps, in order: drop identifier/time columns, split off ``label``,
    optionally drop ``attack_cat``, integer-encode non-numeric columns,
    discard rows containing NaN or +/-inf, and standardize what is left.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing a ``label`` column castable to ``int``.
        The frame itself is not modified; a copy is processed.
    drop_attack_cat : bool, default True
        Remove the ``attack_cat`` column from the features when it exists.
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform``. When omitted, a
        fresh ``StandardScaler`` is fitted on this frame (the historical
        behaviour). Pass one shared instance to standardize the test split
        with statistics learned on the training split.
    fit_scaler : bool, default True
        Only consulted when ``scaler`` is given: ``True`` calls
        ``scaler.fit_transform``, ``False`` calls ``scaler.transform`` on an
        already fitted scaler.

    Returns
    -------
    X : array
        Standardized features as returned by the scaler, one row per kept
        sample.
    y : pandas.Series
        Integer labels for exactly the rows kept in ``X`` (original index
        labels are preserved).

    Raises
    ------
    KeyError
        If ``df`` has no ``label`` column.

    Notes
    -----
    Category codes come from ``pd.factorize`` on this frame only, so two
    frames with different sets of category values can map the same value to
    different codes.
    """
    features = df.copy()

    # Identifier / timestamp style columns are removed whenever they exist.
    unwanted = [
        name
        for name in ("id", "label.1", "stime", "ltime", "timestamp", "time")
        if name in features.columns
    ]
    if unwanted:
        features = features.drop(columns=unwanted)

    y = features["label"].astype(int)
    features = features.drop(columns=["label"])
    if drop_attack_cat and "attack_cat" in features.columns:
        features = features.drop(columns=["attack_cat"])

    # Encode every non-numeric, non-boolean column (object, pandas string
    # dtype, category, ...) as sorted integer codes; selecting only "object"
    # would let other text-like dtypes reach the scaler unencoded.
    # pd.factorize marks missing values with -1, so they are not dropped below.
    for col in features.select_dtypes(exclude=["number", "bool"]).columns:
        features[col] = pd.factorize(features[col], sort=True)[0]

    # Drop rows holding NaN or +/-inf, and drop the SAME rows from y so that
    # X and y stay aligned. A positional mask is used so the filter does not
    # depend on the index being unique.
    features = features.replace([np.inf, -np.inf], np.nan)
    keep = features.notna().all(axis=1).to_numpy()
    features = features[keep]
    y = y[keep]

    if scaler is None:
        X = StandardScaler().fit_transform(features.values)
    elif fit_scaler:
        X = scaler.fit_transform(features.values)
    else:
        X = scaler.transform(features.values)
    return X, y

In [20]:
# Build feature matrices and labels for both splits.
# NOTE: clean() is invoked once per split, so any fitting it does (scaling,
# category encoding) happens separately for train and test.
X_train, y_train = clean(train_df)
X_test, y_test = clean(test_df)

# Unsupervised detector: labels are not used for fitting, only for evaluation.
isof = IsolationForest(
    n_estimators=300,
    max_samples=0.8,
    contamination=0.1,
    n_jobs=-1,
    random_state=42,
)
isof.fit(X_train)

# The -1 predictions (the detector's outlier flag) become the positive class 1;
# everything else becomes 0, matching the encoding of `label`.
y_pred_raw = isof.predict(X_test)
y_hat = (y_pred_raw == -1).astype(int)

print("=== Isolation Forest on UNSW-NB15 dataset ===")
test_confusion = confusion_matrix(y_test, y_hat)
print(test_confusion)
test_report = classification_report(y_test, y_hat, digits=4)
print(test_report)

=== Isolation Forest on UNSW-NB15 dataset ===
[[ 1703 35297]
 [ 5131 40201]]
              precision    recall  f1-score   support

           0     0.2492    0.0460    0.0777     37000
           1     0.5325    0.8868    0.6654     45332

    accuracy                         0.5090     82332
   macro avg     0.3908    0.4664    0.3716     82332
weighted avg     0.4052    0.5090    0.4013     82332

