In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scipy.io import arff

In [2]:
largeDataSet = './data/raw/KDDTrain.arff'
smallDataSet = './data/raw/KDDTrain_20Percent.arff'
testDataSet = './data/raw/KDDTest.arff'

# Loading the .arff data format into a pandas dataframe
data, meta = arff.loadarff(smallDataSet)
df = pd.DataFrame(data)

test_data, test_deta = arff.loadarff(testDataSet)
test_df = pd.DataFrame(test_data)

pd.set_option('display.max_columns', None)

# Review of the dataset
df.head()
test_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'private',b'REJ',0.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',229.0,10.0,0.0,0.0,1.0,1.0,0.04,0.06,0.0,255.0,10.0,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,b'anomaly'
1,0.0,b'tcp',b'private',b'REJ',0.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',136.0,1.0,0.0,0.0,1.0,1.0,0.01,0.06,0.0,255.0,1.0,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,b'anomaly'
2,2.0,b'tcp',b'ftp_data',b'SF',12983.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,134.0,86.0,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,b'normal'
3,0.0,b'icmp',b'eco_i',b'SF',20.0,0.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',1.0,65.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3.0,57.0,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,b'anomaly'
4,1.0,b'tcp',b'telnet',b'RSTO',0.0,15.0,b'0',0.0,0.0,0.0,0.0,b'0',0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'0',b'0',1.0,8.0,0.0,0.12,1.0,0.5,1.0,0.0,0.75,29.0,86.0,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,b'anomaly'


In [3]:
# Print meta data
print(meta)

Dataset: 'KDDTrain-20Percent'
	duration's type is numeric
	protocol_type's type is nominal, range is ('tcp', 'udp', 'icmp')
	service's type is nominal, range is ('aol', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'harvest', 'hostnames', 'http', 'http_2784', 'http_443', 'http_8001', 'imap4', 'IRC', 'iso_tsap', 'klogin', 'kshell', 'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois', 'X11', 'Z39_50')
	flag's type is nominal, range is ('OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH')
	src_bytes's type is numeric
	dst_bytes's type 

In [4]:
# List all unique classes
print(df['class'].unique())

# So we have two mutual exclusive labels: 'normal' and 'anomaly'. This what we want to predict. 
# In other words, this is the range of the labelling function and thus also the range of the predictor function.

[b'normal' b'anomaly']


In [5]:
# Manually Checking if data include missing values
missingValues = df.isnull().sum()
print(f"Missing Values in every Column: \n{missingValues}")

# No missing values. Imputation not required.

Missing Values in every Column: 
duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_r

In [6]:
# Checking Anomolies in the dataset
print(f"Descriptive Statistics: \n{df.describe(include='all')}")

Descriptive Statistics: 
            duration protocol_type  service   flag     src_bytes  \
count   25192.000000         25192    25192  25192  2.519200e+04   
unique           NaN             3       66     11           NaN   
top              NaN        b'tcp'  b'http'  b'SF'           NaN   
freq             NaN         20526     8003  14973           NaN   
mean      305.054104           NaN      NaN    NaN  2.433063e+04   
std      2686.555640           NaN      NaN    NaN  2.410805e+06   
min         0.000000           NaN      NaN    NaN  0.000000e+00   
25%         0.000000           NaN      NaN    NaN  0.000000e+00   
50%         0.000000           NaN      NaN    NaN  4.400000e+01   
75%         0.000000           NaN      NaN    NaN  2.790000e+02   
max     42862.000000           NaN      NaN    NaN  3.817091e+08   

           dst_bytes   land  wrong_fragment       urgent           hot  \
count   2.519200e+04  25192    25192.000000  25192.00000  25192.000000   
unique    

In [7]:
# Dataset summary
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     25192 non-null  float64
 1   protocol_type                25192 non-null  object 
 2   service                      25192 non-null  object 
 3   flag                         25192 non-null  object 
 4   src_bytes                    25192 non-null  float64
 5   dst_bytes                    25192 non-null  float64
 6   land                         25192 non-null  object 
 7   wrong_fragment               25192 non-null  float64
 8   urgent                       25192 non-null  float64
 9   hot                          25192 non-null  float64
 10  num_failed_logins            25192 non-null  float64
 11  logged_in                    25192 non-null  object 
 12  num_compromised              25192 non-null  float64
 13  root_shell      

# Data preprocessing

### Encoding nominal features

In [8]:
# protocol_type
print(df['protocol_type'].unique()) # Only 3 unique values. We are going to use LabelEncoder
enc = LabelEncoder()
df['protocol_type_enc'] = enc.fit_transform(df['protocol_type'])

df[['protocol_type_enc', 'protocol_type']].head(400)
# 0: icmp, 1: tcp, 2: udp

[b'tcp' b'udp' b'icmp']


Unnamed: 0,protocol_type_enc,protocol_type
0,1,b'tcp'
1,2,b'udp'
2,1,b'tcp'
3,1,b'tcp'
4,1,b'tcp'
...,...,...
395,1,b'tcp'
396,1,b'tcp'
397,1,b'tcp'
398,0,b'icmp'


In [9]:
# List of nominal feature columns to encode
nominal_features = df.select_dtypes(include=['object']).columns

# Initialize a dictionary to store LabelEncoders for each feature
label_encoders = {}

# Encode each nominal feature and create new columns
for feature in nominal_features:
    le = LabelEncoder()
    df[f'{feature}_enc'] = le.fit_transform(df[feature])
    label_encoders[feature] = le
    df.drop(feature, axis=1, inplace=True)  # Drop the original feature column



In [16]:
df.head()
print(label_encoders)

# Apply the same encoders to the test data
for feature, encoder in label_encoders.items():
    test_df[f'{feature}_enc'] = encoder.transform(test_df[feature])
    test_df.drop(feature, axis=1, inplace=True)


# test_df.head(10)

{'protocol_type': LabelEncoder(), 'service': LabelEncoder(), 'flag': LabelEncoder(), 'land': LabelEncoder(), 'logged_in': LabelEncoder(), 'is_host_login': LabelEncoder(), 'is_guest_login': LabelEncoder(), 'class': LabelEncoder()}


ValueError: y contains previously unseen labels: b'tftp_u'