In [1]:
# This should be included at the top of every .ipynb file in the project
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Project root is taken to be two directory levels above the current working directory.
PROJECT_ROOT = Path().resolve().parents[1]
# Guard the append so that re-running this cell does not stack duplicate
# entries on sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [4]:
# Location of the training CSV inside the project's data directory.
data_path = PROJECT_ROOT.joinpath('data', 'KDDTrain+.csv')

In [24]:
# Load the full training set into memory with pandas' default CSV parsing.
df = pd.read_csv(filepath_or_buffer=data_path)

In [6]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,attack_class
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,Normal
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,DoS
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,Normal
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,Normal
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,DoS


In [7]:
# Feature matrix: every column except the two label columns.
X = df.drop(['attack', 'attack_class'], axis=1)
# Target: the 'attack_class' label column.
y = df.loc[:, 'attack_class']

In [8]:
# Width of the feature matrix before any categorical encoding is applied.
n_features_before = X.shape[1]
print(f"Number of features before encoding: {n_features_before}")

Number of features before encoding: 41


In [9]:
# Columns stored with the object dtype are treated as the categorical features.
categorical_cols = X.select_dtypes(include=['object']).columns
categorical_cols

Index(['protocol_type', 'service', 'flag'], dtype='object')

In [10]:
# Columns with a numeric dtype; these are the ones passed to the scaler later on.
numerical_cols = X.select_dtypes(include=['number']).columns

In [11]:
# Display the numeric column names selected above.
numerical_cols

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [12]:
# Count missing values per column, then list the columns with the most gaps first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [13]:
# Number of distinct values in each categorical column (drives the one-hot width).
X.loc[:, categorical_cols].nunique()

protocol_type     3
service          70
flag             11
dtype: int64

In [14]:
# One-hot encode the categorical columns; drop_first=True removes the first
# level of each column so k categories become k-1 indicator columns.
# NOTE(review): the indicator columns are derived from the categories present
# in this frame only; if another dataset is encoded separately its columns may
# differ -- confirm alignment downstream.
X_encoded = pd.get_dummies(
    data=X,
    drop_first=True,
    columns=categorical_cols,
)

In [15]:
# Report the feature count after one-hot encoding. The message now says
# "encoding" (the step that was actually applied) and uses the same format as
# the "before encoding" message.
print(f"Number of features after encoding: {X_encoded.shape[1]}")

Number of features after decoding : 119


In [16]:
# Keep the fitted encoder so the integer labels can be mapped back to class
# names later (label_encoder.classes_ / label_encoder.inverse_transform).
label_encoder = LabelEncoder()
# Encode from the source column rather than from `y`, so re-running this cell
# never fits the encoder on labels that are already integers.
y = label_encoder.fit_transform(df['attack_class'])

In [18]:
# Fit the scaler on the numeric columns of the encoded frame and compute the
# scaled values; the one-hot indicator columns are not passed to the scaler.
scaler = RobustScaler()
scaled_numeric = scaler.fit_transform(X_encoded[numerical_cols])

# Work on a copy so X_encoded keeps the unscaled values for comparison.
X_scaled = X_encoded.copy()
X_scaled[numerical_cols] = scaled_numeric

In [21]:
# Preview the first five rows after scaling the numeric columns.
X_scaled.head(n=5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,0.369565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,0.0,-0.15942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,True,False,False,False,False,False
2,0.0,0.681159,15.800388,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,True,False
3,0.0,0.561594,0.813953,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,0.0,-0.15942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,False,False,False,False,False


Before scaling (mean of each numeric feature):

In [22]:
# Per-column mean of the numeric features in the unscaled frame.
X_encoded.loc[:, numerical_cols].mean()

duration                         287.146929
src_bytes                      45567.100824
dst_bytes                      19779.271433
land                               0.000198
wrong_fragment                     0.022688
urgent                             0.000111
hot                                0.204411
num_failed_logins                  0.001222
logged_in                          0.395739
num_compromised                    0.279253
root_shell                         0.001342
su_attempted                       0.001103
num_root                           0.302194
num_file_creations                 0.012669
num_shells                         0.000413
num_access_files                   0.004096
num_outbound_cmds                  0.000000
is_host_login                      0.000008
is_guest_login                     0.009423
count                             84.108207
srv_count                         27.738093
serror_rate                        0.284487
srv_serror_rate                 

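After scaling (mean of each numeric feature). RobustScaler centers each column on its median and divides by its interquartile range, so the means are not expected to be 0; columns whose mean is unchanged, such as `duration`, presumably have a median and an interquartile range of 0 and are therefore left as they were.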

In [23]:
# Per-column mean of the numeric features in the scaled frame.
X_scaled.loc[:, numerical_cols].mean()

duration                       287.146929
src_bytes                      164.938771
dst_bytes                       38.331921
land                             0.000198
wrong_fragment                   0.022688
urgent                           0.000111
hot                              0.204411
num_failed_logins                0.001222
logged_in                        0.395739
num_compromised                  0.279253
root_shell                       0.001342
su_attempted                     0.001103
num_root                         0.302194
num_file_creations               0.012669
num_shells                       0.000413
num_access_files                 0.004096
num_outbound_cmds                0.000000
is_host_login                    0.000008
is_guest_login                   0.009423
count                            0.497221
srv_count                        1.233631
serror_rate                      0.284487
srv_serror_rate                  0.282488
rerror_rate                      0