In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix
import pickle

# Dataset and model locations, all expressed relative to the notebook's
# working directory.
DATA_DIR = os.path.join(os.curdir, "datasets")
TRAIN_CSV, TEST_CSV = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("UNSW_NB15_training_set.csv", "UNSW_NB15_testing_set.csv")
)

# Output folder for the pickled artefacts; created up front when absent.
MODEL_DIR = os.path.join(os.pardir, "Python", "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Last expression: echo the resolved paths as the cell output.
TRAIN_CSV, TEST_CSV, MODEL_DIR

('.\\datasets\\UNSW_NB15_training_set.csv',
 '.\\datasets\\UNSW_NB15_testing_set.csv',
 '..\\Python\\models')

In [2]:
# Load the train/test partitions exactly once each; reading the same CSVs a
# second time only repeats the I/O and yields identical frames.
df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)

# Last expression: preview of the first training rows.
df_train.head()

Train shape: (175341, 45)
Test shape: (82332, 45)


Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


# Remove the cap on displayed columns so wide frames render in full, then
# show the training frame's column labels as the cell output.
pd.options.display.max_columns = None
df_train.columns

In [3]:
# Map UNSW-NB15 columns to your runtime features
#
# Hand-picked first draft of the feature list, plus the name of the label
# column.
# NOTE(review): `feature_cols` is reassigned by the candidate-feature filter in
# the following cell before anything reads it, so this list looks superseded —
# confirm before relying on it.
# NOTE(review): "sport" and "dsport" are not among the candidate names used in
# the following cell — verify that they exist in the CSV before using them.

feature_cols = [
    # approximate mapping
    "sbytes",     # ≈ packet_length / size sent
    "sttl",       # ≈ ttl
    "sport",      # src_port
    "dsport",     # dst_port
    # the rest are "flow-ish" but fine for ML
    "swin",       # window_size-ish
    "dwin",
    "proto",      # we will encode this
    "dbytes",     
    "ct_state_ttl",
    "ct_srv_src",
    "ct_dst_ltm",
]

label_col = "label"   # 0 = normal, 1 = attack


In [4]:
# Ground-truth column: 0 marks normal traffic, 1 marks an attack.
label_col = "label"

# Wish-list of UNSW-NB15 feature names. The order here is the order in which
# the surviving names end up in `feature_cols`.
# NOTE(review): names reported as missing below may simply be spelling variants
# of columns that do exist in the CSV (e.g. inter-packet-time columns) —
# compare against df_train.columns and confirm.
candidate_features = [
    "dur", "proto", "state",
    "spkts", "dpkts", "sbytes", "dbytes",
    "sttl", "dttl",
    "sload", "dload", "sloss", "dloss",
    "sintpkt", "dintpkt", "sjit", "djit",
    "swin", "dwin", "stcpb", "dtcpb",
    "tcprtt", "synack", "ackdat",
    "smean", "dmean",
    "trans_depth", "response_body_len",
    "ct_srv_src", "ct_state_ttl", "ct_dst_ltm",
    "ct_src_dport_ltm", "ct_dst_sport_ltm", "ct_dst_src_ltm",
    "ct_src_ltm", "ct_srv_dst",
    "is_ftp_login", "ct_ftp_cmd", "ct_flw_http_mthd",
    "ct_src_dst_ltm", "is_sm_ips_ports",
]

# Split the wish-list into names the training frame has and names it lacks.
available_columns = set(df_train.columns)
feature_cols = [name for name in candidate_features if name in available_columns]
absent_cols = [name for name in candidate_features if name not in available_columns]

print("Using feature cols:", feature_cols)
print("Missing cols (ignored):", absent_cols)


Using feature cols: ['dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sjit', 'djit', 'swin', 'dwin', 'stcpb', 'dtcpb', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_src_ltm', 'ct_srv_dst', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'is_sm_ips_ports']
Missing cols (ignored): ['sintpkt', 'dintpkt', 'ct_src_dst_ltm']


In [5]:
# Work on copies restricted to the selected features plus the label so the raw
# frames (df_train / df_test) are left untouched.
train = df_train[feature_cols + [label_col]].copy()
test  = df_test[feature_cols + [label_col]].copy()

# 1) Encode ALL categorical feature columns (proto/state/etc.).
#    A column counts as categorical when it is non-numeric in either split.
#    Checking with is_numeric_dtype (rather than `dtype == "object"`) also
#    catches columns loaded as pandas string/category dtypes; if those were
#    missed, step 2 would coerce them to NaN and step 3 would zero them out.
is_numeric = pd.api.types.is_numeric_dtype
cat_cols = [c for c in feature_cols if not (is_numeric(train[c]) and is_numeric(test[c]))]
print("Categorical cols detected:", cat_cols)

for c in cat_cols:
    # Use a shared category mapping so train/test use the same codes
    combined = pd.concat([train[c], test[c]], axis=0).astype("category")
    categories = combined.cat.categories

    train[c] = pd.Categorical(train[c], categories=categories).codes
    test[c]  = pd.Categorical(test[c],  categories=categories).codes

# 2) Coerce every feature column to a numeric dtype; values that cannot be
#    parsed become NaN. Categorical columns already hold integer codes here.
for c in feature_cols:
    train[c] = pd.to_numeric(train[c], errors="coerce")
    test[c]  = pd.to_numeric(test[c],  errors="coerce")

# 3) Fill missing values (important for scaler/models)
train[feature_cols] = train[feature_cols].fillna(0)
test[feature_cols]  = test[feature_cols].fillna(0)

# 4) Build float feature matrices and integer label vectors
X_train = train[feature_cols].astype(float).values
y_train = train[label_col].astype(int).values

X_test  = test[feature_cols].astype(float).values
y_test  = test[label_col].astype(int).values

print("X_train:", X_train.shape, "y_train bincount:", np.bincount(y_train))
print("X_test :", X_test.shape,  "y_test  bincount:", np.bincount(y_test))

# 5) Select the normal-traffic rows (label 0 assumed normal); fail loudly when
#    there are none, since the scaler below is fitted on them.
normal_mask = (y_train == 0)
if normal_mask.sum() == 0:
    raise ValueError("No normal samples found (y_train==0). Check label_col and label encoding!")

X_normal = X_train[normal_mask]

# The scaler is fitted on normal rows only, then applied to every matrix.
scaler = StandardScaler()
scaler.fit(X_normal)

X_train_scaled  = scaler.transform(X_train)
X_test_scaled   = scaler.transform(X_test)
X_normal_scaled = scaler.transform(X_normal)


Categorical cols detected: ['proto', 'state']
X_train: (175341, 38) y_train bincount: [ 56000 119341]
X_test : (82332, 38) y_test  bincount: [37000 45332]


In [11]:
# Fit three detectors on the scaled normal-only rows. scikit-learn's fit()
# returns the estimator itself, so construction and fitting are chained.

# Isolation Forest
iforest = IsolationForest(
    n_estimators=100, contamination=0.1, random_state=42
).fit(X_normal_scaled)

# LOF (novelty mode, so predict() can be called on unseen rows)
lof = LocalOutlierFactor(
    n_neighbors=20, contamination=0.1, novelty=True
).fit(X_normal_scaled)

# One-Class SVM
ocsvm = OneClassSVM(nu=0.1, kernel='rbf', gamma='auto').fit(X_normal_scaled)

# Last expression: display the fitted One-Class SVM as the cell output.
ocsvm


0,1,2
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,tol,0.001
,nu,0.1
,shrinking,True
,cache_size,200
,verbose,False
,max_iter,-1


In [7]:
def predict_to_labels(model, X):
    """Turn a detector's predictions into binary attack labels.

    Calls ``model.predict(X)`` and maps every ``-1`` (anomaly) to ``1``
    (attack); any other predicted value becomes ``0`` (normal).
    """
    is_anomaly = model.predict(X) == -1
    return np.where(is_anomaly, 1, 0)

# Score each fitted detector on the scaled test split and print a separator,
# its name, the confusion matrix and the classification report.
fitted_detectors = {"IForest": iforest, "LOF": lof, "OCSVM": ocsvm}
for name, model in fitted_detectors.items():
    y_pred = predict_to_labels(model, X_test_scaled)
    print("=" * 60, name, sep="\n")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=3))


IForest
[[32011  4989]
 [29640 15692]]
              precision    recall  f1-score   support

           0      0.519     0.865     0.649     37000
           1      0.759     0.346     0.475     45332

    accuracy                          0.579     82332
   macro avg      0.639     0.606     0.562     82332
weighted avg      0.651     0.579     0.553     82332

LOF
[[29876  7124]
 [10845 34487]]
              precision    recall  f1-score   support

           0      0.734     0.807     0.769     37000
           1      0.829     0.761     0.793     45332

    accuracy                          0.782     82332
   macro avg      0.781     0.784     0.781     82332
weighted avg      0.786     0.782     0.782     82332

OCSVM
[[31778  5222]
 [29718 15614]]
              precision    recall  f1-score   support

           0      0.517     0.859     0.645     37000
           1      0.749     0.344     0.472     45332

    accuracy                          0.576     82332
   macro avg     

In [8]:
# Pickle the fitted scaler and the three detectors into MODEL_DIR, one file
# per object.
# NOTE(review): a later cell re-saves these same four objects under a
# different MODEL_DIR — confirm which location the consumer actually loads.
scaler_path, iforest_path, lof_path, ocsvm_path = (
    os.path.join(MODEL_DIR, pkl_name)
    for pkl_name in (
        "scaler.pkl",
        "isolation_forest.pkl",
        "local_outlier_factor.pkl",
        "one_class_svm.pkl",
    )
)

for target_path, fitted_obj in (
    (scaler_path, scaler),
    (iforest_path, iforest),
    (lof_path, lof),
    (ocsvm_path, ocsvm),
):
    with open(target_path, "wb") as f:
        pickle.dump(fitted_obj, f)

# Last expression: echo where each artefact was written.
scaler_path, iforest_path, lof_path, ocsvm_path


('..\\Python\\models\\scaler.pkl',
 '..\\Python\\models\\isolation_forest.pkl',
 '..\\Python\\models\\local_outlier_factor.pkl',
 '..\\Python\\models\\one_class_svm.pkl')

In [9]:
from pathlib import Path
import os
import pickle

def find_project_root(start: Path) -> Path:
    """
    Return the nearest directory, at or above ``start``, that contains BOTH
    a ``Python`` and a ``RustSniffer`` sub-directory.

    ``start`` is resolved first; it and then each of its ancestors are
    checked in turn, nearest first.

    Raises:
        RuntimeError: if the filesystem root is reached without a match.
    """
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        if all((candidate / marker).is_dir() for marker in ("Python", "RustSniffer")):
            return candidate
    raise RuntimeError(
        "Could not locate project root. Expected folders 'Python' and 'RustSniffer' "
        "somewhere above this notebook."
    )

# 1) Start from the current working directory and walk up to the project root.
nb_dir = Path.cwd()
PROJECT_ROOT = find_project_root(nb_dir)

# 2) Artefacts go to <PROJECT_ROOT>/Python/src/models (created when absent).
MODEL_DIR = PROJECT_ROOT.joinpath("Python", "src", "models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

for caption, location in (
    ("Notebook dir :", nb_dir),
    ("PROJECT_ROOT :", PROJECT_ROOT),
    ("MODEL_DIR    :", MODEL_DIR),
):
    print(caption, location)

# 3) Refuse to continue unless every trained object is defined in the
#    notebook namespace.
required = ["scaler", "iforest", "lof", "ocsvm"]
defined_names = globals()
missing = [name for name in required if name not in defined_names]
if missing:
    raise RuntimeError(
        f"These objects are missing: {missing}. "
        "Run the training cell(s) first (the ones that fit scaler/iforest/lof/ocsvm)."
    )

# 4) Pickle each object under its target file name.
artifacts = {
    "scaler.pkl": scaler,
    "isolation_forest.pkl": iforest,
    "local_outlier_factor.pkl": lof,
    "one_class_svm.pkl": ocsvm,
}

for filename, obj in artifacts.items():
    out_path = MODEL_DIR / filename
    with out_path.open("wb") as f:
        pickle.dump(obj, f)

# 5) List every .pkl now present in the models directory.
print("Saved files:", sorted(p.name for p in MODEL_DIR.glob("*.pkl")))


Notebook dir : c:\Users\roden\Documents\Final_Year_Project\NIDS\Python\Notebooks\src
PROJECT_ROOT : C:\Users\roden\Documents\Final_Year_Project\NIDS
MODEL_DIR    : C:\Users\roden\Documents\Final_Year_Project\NIDS\Python\src\models
Saved files: ['isolation_forest.pkl', 'local_outlier_factor.pkl', 'one_class_svm.pkl', 'scaler.pkl']


In [10]:
import json
import os
import pandas as pd

# Export feature metadata as JSON: the feature column order, which columns
# are categorical, and the category list of each categorical column.

# Sanity checks: these MUST exist before exporting metadata
required = ["feature_cols", "cat_cols", "MODEL_DIR", "df_train", "df_test"]
missing = [name for name in required if name not in globals()]

if missing:
    raise NameError(
        f"Missing variables: {missing}\n"
        "Run the notebook cells that create these (or use Run All), "
        "then run this metadata cell again."
    )

# Category labels are read from the RAW frames (df_train / df_test). By this
# point the categorical columns of `train` / `test` have been overwritten with
# integer codes, so deriving the categories from them would export
# [0, 1, 2, ...] instead of the original labels. The same
# concat -> astype("category") recipe as the encoding step is used, so the
# label at list position i is the one that was encoded as integer code i.
meta = {
    "feature_cols": list(feature_cols),
    "cat_cols": list(cat_cols),
    "categories": {
        c: (
            pd.concat([df_train[c], df_test[c]], axis=0)
              .astype("category")
              .cat.categories
              .tolist()  # plain Python values so json can serialise them
        )
        for c in cat_cols
    },
}

meta_path = os.path.join(MODEL_DIR, "feature_meta.json")
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("Saved feature metadata to:", meta_path)
print("Total features:", len(feature_cols))

# Last expression: show the exported feature order as the cell output.
feature_cols


Saved feature metadata to: C:\Users\roden\Documents\Final_Year_Project\NIDS\Python\src\models\feature_meta.json
Total features: 38


['dur',
 'proto',
 'state',
 'spkts',
 'dpkts',
 'sbytes',
 'dbytes',
 'sttl',
 'dttl',
 'sload',
 'dload',
 'sloss',
 'dloss',
 'sjit',
 'djit',
 'swin',
 'dwin',
 'stcpb',
 'dtcpb',
 'tcprtt',
 'synack',
 'ackdat',
 'smean',
 'dmean',
 'trans_depth',
 'response_body_len',
 'ct_srv_src',
 'ct_state_ttl',
 'ct_dst_ltm',
 'ct_src_dport_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_src_ltm',
 'ct_srv_dst',
 'is_ftp_login',
 'ct_ftp_cmd',
 'ct_flw_http_mthd',
 'is_sm_ips_ports']