In [6]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, RocCurveDisplay, accuracy_score, precision_recall_fscore_support
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Directories are relative to the notebook's working directory.
# NOTE(review): OUTPUT_DIR is presumably where results get written; no cell shown here uses it yet.
DATA_DIR, OUTPUT_DIR = "../dataset", "../results"

# Full paths of the two CSV splits that are handed to load_data().
TRAIN_CSV, TEST_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("UNSW_NB15_training-set.csv", "UNSW_NB15_testing-set.csv")
)

In [3]:
#load data
def load_data(train_csv, test_csv):
    """Read the training and testing CSV files and normalise their column names.

    Parameters
    ----------
    train_csv, test_csv : str or path-like
        Locations of the two CSV files; each is passed to ``pandas.read_csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, with every column name stripped of surrounding
        whitespace and lower-cased.

    Raises
    ------
    ValueError
        If either file has no ``label`` column once the names are normalised.
    """
    frames = []
    for path in (train_csv, test_csv):
        frame = pd.read_csv(path, low_memory=False)
        frame.columns = [c.strip().lower() for c in frame.columns]
        # Fail here, naming the offending file, instead of with a bare KeyError
        # much later when the label column is first accessed.
        if "label" not in frame.columns:
            raise ValueError(
                f"{path}: expected a 'label' column, found {list(frame.columns)}"
            )
        frames.append(frame)
    train, test = frames
    return train, test

# Read both splits once; clean() copies its input before modifying it, so these raw frames stay intact.
train_df, test_df = load_data(TRAIN_CSV, TEST_CSV)

In [5]:
#clean data
def clean(df, drop_attack_cat=True):
    """Turn a raw frame into a standardised feature matrix and an aligned label vector.

    Steps: copy the input, drop identifier/time columns if present, split off
    ``label``, optionally drop ``attack_cat``, integer-encode every object
    column, discard rows containing NaN or +/-inf, and standardise the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a ``label`` column. It is not modified.
    drop_attack_cat : bool, default True
        Drop the ``attack_cat`` column when present. If False it is kept and
        integer-encoded like any other object column.
        NOTE(review): presumably dropped because it encodes the target - confirm.

    Returns
    -------
    X : numpy.ndarray
        Standardised features, one row per retained sample.
    y : pandas.Series
        Integer labels for exactly the rows kept in ``X``, in the same order and
        carrying the original index values.

    Raises
    ------
    KeyError
        If ``df`` has no ``label`` column.

    Notes
    -----
    The category codes and the ``StandardScaler`` statistics are derived from
    ``df`` alone on every call. Two frames cleaned separately (e.g. train and
    test) therefore only share an encoding and a scale if their category sets
    and distributions happen to match.
    """
    df = df.copy()
    drop_candidates = [c for c in ["id", "label.1", "stime", "ltime", "timestamp", "time"] if c in df.columns]
    if drop_candidates:
        df = df.drop(columns=drop_candidates)
    y = df["label"].astype(int)
    df = df.drop(columns=["label"])
    if drop_attack_cat and "attack_cat" in df.columns:
        df = df.drop(columns=["attack_cat"])
    cat_cols = df.select_dtypes(include=["object"]).columns
    for c in cat_cols:
        # sort=True makes the codes follow the sorted unique values; missing values become -1.
        df[c] = pd.factorize(df[c], sort=True)[0]
    df = df.replace([np.inf, -np.inf], np.nan)
    # Apply one positional row mask to both features and labels so they stay aligned;
    # dropping rows from the features only would leave y longer than X.
    finite_rows = df.notna().all(axis=1).to_numpy()
    df = df.loc[finite_rows]
    y = y.loc[finite_rows]
    X = StandardScaler().fit_transform(df.values)
    return X, y

In [7]:
def eval_cls(model_name, y_true, y_hat, proba=None):
    """Summarise binary-classification quality as a flat dict.

    Parameters
    ----------
    model_name : str
        Label stored under the ``"model"`` key.
    y_true : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels.
    proba : array-like, optional
        Scores for the positive class. When given, ROC AUC is attempted.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision_att``, ``recall_att``,
        ``f1_att`` and ``roc_auc``. Precision/recall/F1 use
        ``average='binary'``, i.e. they describe the positive class only.
        ``roc_auc`` is ``None`` when ``proba`` is ``None`` or when the score
        could not be computed.

    Warns
    -----
    RuntimeWarning
        If ``roc_auc_score`` raises; the reason is included in the message.
    """
    acc = accuracy_score(y_true, y_hat)
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
    auc = None
    if proba is not None:
        try:
            auc = roc_auc_score(y_true, proba)
        except Exception as exc:
            # AUC stays best-effort (the other metrics are still returned), but the
            # reason is surfaced so a None in the results is explainable.
            warnings.warn(
                f"{model_name}: ROC AUC could not be computed ({exc}); reporting None.",
                RuntimeWarning,
                stacklevel=2,
            )
    return {
        "model": model_name,
        "accuracy": acc,
        "precision_att": p,
        "recall_att": r,
        "f1_att": f1,
        "roc_auc": auc
    }

def show_report(name, y_true, y_hat):
    """Print a titled confusion matrix and per-class classification report.

    Parameters
    ----------
    name : str
        Title shown in the ``=== ... ===`` header line.
    y_true : array-like
        Ground-truth labels.
    y_hat : array-like
        Predicted labels.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    # confusion_matrix / classification_report come from the notebook's import cell;
    # re-importing them inside the function was redundant.
    print(f"\n=== {name} ===")
    print(confusion_matrix(y_true, y_hat))
    print(classification_report(y_true, y_hat, digits=4))

In [9]:
# NOTE(review): clean() derives category codes and scaling statistics from whichever
# frame it is given, so the two calls below encode and scale train and test independently.
# Confirm that both splits end up on a comparable encoding/scale before trusting the metrics.
X_train, y_train = clean(train_df)
X_test,  y_test  = clean(test_df)

# class_weight="balanced" reweights samples inversely to class frequency.
logreg = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=None)
logreg.fit(X_train, y_train)
y_hat_lr = logreg.predict(X_test)
# LogisticRegression always provides predict_proba, so the call is made directly:
# wrapping it in a catch-all except would only hide genuine errors.
# Column 1 holds the score for the second entry of logreg.classes_ (label 1 for 0/1 labels).
proba_lr = logreg.predict_proba(X_test)[:, 1]

show_report("Logistic Regression", y_test, y_hat_lr)
res_lr = eval_cls("LogReg", y_test, y_hat_lr, proba_lr)


=== Logistic Regression ===
[[31383  5617]
 [ 7001 38331]]
              precision    recall  f1-score   support

           0     0.8176    0.8482    0.8326     37000
           1     0.8722    0.8456    0.8587     45332

    accuracy                         0.8467     82332
   macro avg     0.8449    0.8469    0.8456     82332
weighted avg     0.8477    0.8467    0.8470     82332

