%%<br>
Core

In [1]:
import pandas as pd
import numpy as np

Visualization

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

Machine Learning

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier

Metrics

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

%%

In [5]:
unsw_df = pd.read_csv("UNSW_NB15_training-set.csv")
nsl_df = pd.read_csv("kdd_train.csv")

In [6]:
unsw_df.head(100)
# nsl_df.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.087490,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,0.000011,udp,-,INT,2,0,604,0,90909.090200,...,1,40,0,0,0,2,39,0,Normal,0
96,97,0.000010,udp,-,INT,2,0,552,0,100000.002500,...,1,40,0,0,0,2,39,0,Normal,0
97,98,37.578835,tcp,-,FIN,22,24,1920,4312,1.197483,...,1,2,0,0,0,3,7,0,Normal,0
98,99,0.000011,udp,-,INT,2,0,1754,0,90909.090200,...,1,63,0,0,0,8,62,0,Normal,0


%%<br>
Basic info

In [7]:
unsw_df.info()
unsw_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   proto              175341 non-null  object 
 3   service            175341 non-null  object 
 4   state              175341 non-null  object 
 5   spkts              175341 non-null  int64  
 6   dpkts              175341 non-null  int64  
 7   sbytes             175341 non-null  int64  
 8   dbytes             175341 non-null  int64  
 9   rate               175341 non-null  float64
 10  sttl               175341 non-null  int64  
 11  dttl               175341 non-null  int64  
 12  sload              175341 non-null  float64
 13  dload              175341 non-null  float64
 14  sloss              175341 non-null  int64  
 15  dloss              175341 non-null  int64  
 16  si

id                   0
dur                  0
proto                0
service              0
state                0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
attack_cat 

Encode categorical columns if any

In [8]:
categorical_cols = unsw_df.select_dtypes(include=['object']).columns
unsw_df[categorical_cols] = unsw_df[categorical_cols].apply(LabelEncoder().fit_transform)

Fill or drop missing values

In [9]:
unsw_df = unsw_df.dropna()

%%

%%

In [10]:
X = unsw_df.drop('label', axis=1)   
y = unsw_df['label']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

%%

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

Identify categorical columns

In [13]:
categorical_cols = X_train.select_dtypes(include=['object']).columns

Preprocessor

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ],
    remainder='passthrough'
)

Build the pipeline

In [15]:
model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', GaussianNB())
])

Train and predict

In [16]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

%%<br>
 models used in the paper: bagging, xgb,  decision tree, logistic regression , two nbs

In [17]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [18]:
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")

Accuracy:  0.8478
Precision: 0.8475
Recall:    0.8478
F1 Score:  0.8477
