# Libraries Setup

In [1]:
# Install required libraries
!pip install numpy pandas scikit-learn matplotlib tensorflow keras

Collecting numpy
  Downloading numpy-1.24.3-cp38-cp38-win_amd64.whl.metadata (5.6 kB)
Collecting typing-extensions<4.6.0,>=3.6.6 (from tensorflow-intel==2.13.0->tensorflow)
  Downloading typing_extensions-4.5.0-py3-none-any.whl.metadata (8.5 kB)
Downloading numpy-1.24.3-cp38-cp38-win_amd64.whl (14.9 MB)
   ---------------------------------------- 0.0/14.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.9 MB 487.6 kB/s eta 0:00:31
   --- ------------------------------------ 1.2/14.9 MB 8.4 MB/s eta 0:00:02
   -------- ------------------------------- 3.1/14.9 MB 16.3 MB/s eta 0:00:01
   -------- ------------------------------- 3.2/14.9 MB 13.6 MB/s eta 0:00:01
   -------- ------------------------------- 3.2/14.9 MB 12.9 MB/s eta 0:00:01
   -------- ------------------------------- 3.2/14.9 MB 12.9 MB/s eta 0:00:01
   -------- ------------------------------- 3.3/14.9 MB 8.9 MB/s eta 0:00:02
   -------

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
pydantic 2.7.4 requires typing-extensions>=4.6.1, but you have typing-extensions 4.5.0 which is incompatible.
pydantic-core 2.18.4 requires typing-extensions!=4.7.0,>=4.6.0, but you have typing-extensions 4.5.0 which is incompatible.


In [14]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report, make_scorer
from sklearn.tree import export_text

# Data Loading and Preprocessing

In [15]:
# Directory (relative to the notebook) that holds the dataset files
ROOT = 'data/'

In [16]:
# Build the paths of the two Parquet partitions located under ROOT
train_url = f'{ROOT}UNSW_NB15_training-set.parquet'
test_url = f'{ROOT}UNSW_NB15_testing-set.parquet'

# Read both partitions into DataFrames
train_data, test_data = (pd.read_parquet(url) for url in (train_url, test_url))

In [17]:
# Inspect the dataset.
# A bare last expression lets the notebook render the frame as a rich table;
# print() falls back to plain text that wraps wide frames across several blocks.
train_data.head()

        dur proto service state  spkts  dpkts  sbytes  dbytes          rate  \
0  0.000011   udp       -   INT      2      0     496       0   90909.09375   
1  0.000008   udp       -   INT      2      0    1762       0  125000.00000   
2  0.000005   udp       -   INT      2      0    1068       0  200000.00000   
3  0.000006   udp       -   INT      2      0     900       0  166666.65625   
4  0.000010   udp       -   INT      2      0    2126       0  100000.00000   

         sload  ...  trans_depth  response_body_len  ct_src_dport_ltm  \
0  180363632.0  ...            0                  0                 1   
1  881000000.0  ...            0                  0                 1   
2  854400000.0  ...            0                  0                 1   
3  600000000.0  ...            0                  0                 2   
4  850400000.0  ...            0                  0                 2   

   ct_dst_sport_ltm  is_ftp_login  ct_ftp_cmd  ct_flw_http_mthd  \
0                 1

In [18]:
# Display the training frame as loaded (categorical and numeric columns, plus the label)
train_data

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sload,...,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,attack_cat,label
0,0.000011,udp,-,INT,2,0,496,0,90909.093750,1.803636e+08,...,0,0,1,1,0,0,0,0,Normal,0
1,0.000008,udp,-,INT,2,0,1762,0,125000.000000,8.810000e+08,...,0,0,1,1,0,0,0,0,Normal,0
2,0.000005,udp,-,INT,2,0,1068,0,200000.000000,8.544000e+08,...,0,0,1,1,0,0,0,0,Normal,0
3,0.000006,udp,-,INT,2,0,900,0,166666.656250,6.000000e+08,...,0,0,2,1,0,0,0,0,Normal,0
4,0.000010,udp,-,INT,2,0,2126,0,100000.000000,8.504000e+08,...,0,0,2,1,0,0,0,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,0.000005,udp,-,INT,2,0,104,0,200000.000000,8.320000e+07,...,0,0,1,1,0,0,0,0,Normal,0
82328,1.106101,tcp,-,FIN,20,8,18062,354,24.410067,1.241044e+05,...,0,0,1,1,0,0,0,0,Normal,0
82329,0.000000,arp,-,INT,1,0,46,0,0.000000,0.000000e+00,...,0,0,1,1,0,0,0,1,Normal,0
82330,0.000000,arp,-,INT,1,0,46,0,0.000000,0.000000e+00,...,0,0,1,1,0,0,0,1,Normal,0


In [19]:
# Preprocess the dataset
# Keep only the numeric columns (this removes the 4 non-numeric ones) and
# discard any row that still contains a missing value, for both partitions.
train_data, test_data = (
    frame.select_dtypes(include=[np.number]).dropna()
    for frame in (train_data, test_data)
)

In [20]:
# Display the training frame again to check that only numeric columns remain after preprocessing
train_data

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sload,dload,sloss,dloss,...,dmean,trans_depth,response_body_len,ct_src_dport_ltm,ct_dst_sport_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,is_sm_ips_ports,label
0,0.000011,2,0,496,0,90909.093750,1.803636e+08,0.000000,0,0,...,0,0,0,1,1,0,0,0,0,0
1,0.000008,2,0,1762,0,125000.000000,8.810000e+08,0.000000,0,0,...,0,0,0,1,1,0,0,0,0,0
2,0.000005,2,0,1068,0,200000.000000,8.544000e+08,0.000000,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0.000006,2,0,900,0,166666.656250,6.000000e+08,0.000000,0,0,...,0,0,0,2,1,0,0,0,0,0
4,0.000010,2,0,2126,0,100000.000000,8.504000e+08,0.000000,0,0,...,0,0,0,2,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,0.000005,2,0,104,0,200000.000000,8.320000e+07,0.000000,0,0,...,0,0,0,1,1,0,0,0,0,0
82328,1.106101,20,8,18062,354,24.410067,1.241044e+05,2242.109863,7,1,...,44,0,0,1,1,0,0,0,0,0
82329,0.000000,1,0,46,0,0.000000,0.000000e+00,0.000000,0,0,...,0,0,0,1,1,0,0,0,1,0
82330,0.000000,1,0,46,0,0.000000,0.000000e+00,0.000000,0,0,...,0,0,0,1,1,0,0,0,1,0


In [21]:
# Separate the predictors from the 'label' target column in each partition
X_train, y_train = train_data.drop(columns=['label']), train_data['label']
X_test, y_test = test_data.drop(columns=['label']), test_data['label']

In [22]:
# Standardize the features.
# The scaling statistics are learned from the training features only and then
# applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Clustering Models

## Isolation Forest

In [10]:
# Search space explored by the grid search for the Isolation Forest
param_grid = dict(
    # Number of base estimators in the ensemble
    n_estimators=[50, 100],
    # Amount of contamination of the data set, i.e. the proportion of outliers
    contamination=[0.05, 0.1, 0.2],
    # Number of features to draw from X to train each base estimator
    max_features=[1.0, 0.5],
)

In [11]:
# Custom scoring function for GridSearchCV
def unsupervised_score(estimator, X, y=None):
    """Score a fitted outlier detector by its mean decision-function value on X.

    The signature follows scikit-learn's scorer-callable convention
    ``scorer(estimator, X, y)``, so the function can be passed to
    ``GridSearchCV(scoring=...)`` as-is.

    Parameters
    ----------
    estimator : object
        Fitted estimator exposing ``decision_function(X)``.
    X : array-like
        Samples to score.
    y : ignored
        Accepted only so the scorer keeps working if labels are ever passed
        to the search; it is never read.

    Returns
    -------
    float
        Mean of ``estimator.decision_function(X)``.
    """
    # Lower decision-function values mean "more anomalous". GridSearchCV
    # maximizes the score, so the mean is returned as-is (NOT negated): the
    # search prefers candidates that rate held-out data as more normal.
    # NOTE(review): for IsolationForest the decision function is shifted by an
    # offset derived from `contamination`, so this criterion is expected to
    # favor small contamination values - confirm that is intended.
    scores = estimator.decision_function(X)
    return float(np.mean(scores))

In [12]:
# GridSearchCV accepts a callable with the signature scorer(estimator, X[, y])
# directly, which is exactly what unsupervised_score is.
# make_scorer must not be used here: it wraps *metric* functions of the form
# metric(y_true, y_pred), so the wrapped scorer never hands the estimator to
# unsupervised_score. Every scoring call then fails, each candidate gets
# GridSearchCV's default error_score (NaN), and the "best" parameters are
# simply the first grid entry.
scorer = unsupervised_score

In [13]:
# Base estimator; the fixed seed is the only setting not taken from the grid
iso_forest = IsolationForest(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation on all cores
grid_search = GridSearchCV(
    estimator=iso_forest,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the scaled training features (no labels are passed)
grid_search.fit(X_train_scaled)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




In [14]:
# Retrieve the hyper-parameter combination that ranked first in the grid search
# and print it so the choice is recorded in the notebook output.
best_params = grid_search.best_params_
print("Best parameters found by GridSearchCV:", best_params)

Best parameters found by GridSearchCV: {'contamination': 0.05, 'max_features': 1.0, 'n_estimators': 50}


In [15]:
# Build a fresh Isolation Forest from the selected hyper-parameters (same seed
# as during the search) and fit it on the full scaled training set.
best_iso_forest = IsolationForest(random_state=42, **best_params)
best_iso_forest.fit(X_train_scaled)

In [16]:
# Predict anomalies on the test set and relabel in one step:
# a prediction of 1 becomes 0 (normal), anything else becomes 1 (anomaly).
y_pred_best_iso_forest = (best_iso_forest.predict(X_test_scaled) != 1).astype(int)

In [17]:
# Print a titled per-class precision / recall / F1 report against the test labels
print(
    "Best Isolation Forest",
    classification_report(y_test, y_pred_best_iso_forest),
    sep="\n",
)

Best Isolation Forest
              precision    recall  f1-score   support

           0       0.32      0.93      0.47     56000
           1       0.64      0.05      0.10    119341

    accuracy                           0.34    175341
   macro avg       0.48      0.49      0.29    175341
weighted avg       0.54      0.34      0.22    175341



In [18]:
# Number of trees to print. Dumping every tree of the ensemble floods the
# notebook output, so only the first one is shown by default.
N_TREES_TO_SHOW = 1

# Access individual trees from the trained Isolation Forest
trees = best_iso_forest.estimators_

# Example: print the structure of the first tree(s)
for i, tree in enumerate(trees[:N_TREES_TO_SHOW]):
    # Each tree is fit only on the feature subset drawn for it (this matters
    # when max_features < 1.0), so map its local feature indices back to the
    # original column names before labelling the splits.
    tree_feature_names = X_train.columns[best_iso_forest.estimators_features_[i]].to_list()
    print(f"Tree {i}")
    # Split thresholds are in standardized units: the forest was fit on X_train_scaled
    print(export_text(tree, feature_names=tree_feature_names))
    print("\n" + "-"*50 + "\n")


Tree 0
|--- swin <= -0.09
|   |--- sload <= 2.65
|   |   |--- sjit <= -0.11
|   |   |   |--- sinpkt <= 0.10
|   |   |   |   |--- spkts <= 0.10
|   |   |   |   |   |--- smean <= 0.98
|   |   |   |   |   |   |--- smean <= 0.55
|   |   |   |   |   |   |   |--- dinpkt <= -0.09
|   |   |   |   |   |   |   |   |--- value: [0.54]
|   |   |   |   |   |   |   |--- dinpkt >  -0.09
|   |   |   |   |   |   |   |   |--- value: [0.71]
|   |   |   |   |   |   |--- smean >  0.55
|   |   |   |   |   |   |   |--- value: [0.58]
|   |   |   |   |   |--- smean >  0.98
|   |   |   |   |   |   |--- dbytes <= -0.08
|   |   |   |   |   |   |   |--- ct_src_dport_ltm <= 0.00
|   |   |   |   |   |   |   |   |--- value: [0.56]
|   |   |   |   |   |   |   |--- ct_src_dport_ltm >  0.00
|   |   |   |   |   |   |   |   |--- value: [0.51]
|   |   |   |   |   |   |--- dbytes >  -0.08
|   |   |   |   |   |   |   |--- value: [0.37]
|   |   |   |   |--- spkts >  0.10
|   |   |   |   |   |--- value: [0.20]
|   |   |   |--- 

## One-Class SVM

In [24]:
# One-Class SVM: every constructor argument is spelled out, then the model is
# fit on the scaled training features.
oc_svm = OneClassSVM(
    kernel='rbf',
    degree=3,
    gamma='auto',
    coef0=0.0,
    tol=0.001,
    nu=0.5,
    shrinking=True,
    cache_size=200,
    verbose=False,
    max_iter=-1,
)
oc_svm.fit(X_train_scaled)

In [25]:
# Predict anomalies on the test set and relabel in one step:
# a prediction of 1 becomes 0 (normal), anything else becomes 1 (anomaly).
y_pred_oc_svm = (oc_svm.predict(X_test_scaled) != 1).astype(int)

In [26]:
# Print a titled per-class precision / recall / F1 report against the test labels
print(
    "One-Class SVM",
    classification_report(y_test, y_pred_oc_svm),
    sep="\n",
)

One-Class SVM
              precision    recall  f1-score   support

           0       0.31      0.45      0.36     56000
           1       0.67      0.52      0.59    119341

    accuracy                           0.50    175341
   macro avg       0.49      0.48      0.47    175341
weighted avg       0.55      0.50      0.51    175341

