In [21]:
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

### 1. Reading data

In [22]:
col_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
            'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
            'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
            'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
            'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
            'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
            'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
            'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
            'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class']

train_set = pd.read_csv('KDDTrain+.txt', sep = ",", header = None)
test_set = pd.read_csv('KDDTest+.txt', sep = ",", header = None)

train_set = train_set.iloc[:, :-1]
test_set = test_set.iloc[:, :-1]

train_set.columns = col_names
test_set.columns = col_names

print(f'training set dim: {train_set.shape}')
print(f'test set dim: {test_set.shape}')

training set dim: (125973, 42)
test set dim: (22544, 42)


In [23]:
train_set.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [24]:
test_set.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan


In [25]:
print('train set - label distribution:')
print(train_set['class'].value_counts())

train set - label distribution:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: class, dtype: int64


In [26]:
print('test set - label distribution:')
print(test_set['class'].value_counts())

test set - label distribution:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
rootkit              13
xterm                13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
worm                  2
loadmodule            2
perl                  2
sqlattack             2
udpstorm              2
phf                   2
imap                  1
Name: class, dtype: int64


### 2. Data preprocessing

In [27]:
print('train set:')

for column_name in train_set.columns:
    if train_set[column_name].dtypes == 'object':
        unique_cat = len(train_set[column_name].unique())
        print(f'feature {column_name} has {unique_cat} categories')

train set:
feature protocol_type has 3 categories
feature service has 70 categories
feature flag has 11 categories
feature class has 23 categories


In [28]:
print('test set:')

for column_name in test_set.columns:
    if test_set[column_name].dtypes == 'object':
        unique_cat = len(test_set[column_name].unique())
        print(f'feature {column_name} has {unique_cat} categories')

test set:
feature protocol_type has 3 categories
feature service has 64 categories
feature flag has 11 categories
feature class has 38 categories


In [29]:
categorical_columns = ['protocol_type', 'service', 'flag']

train_set_categorical_values = train_set[categorical_columns]
test_set_categorical_values = train_set[categorical_columns]

train_set_categorical_values.head(5)

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [30]:
unique_protocol = sorted(train_set.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]

unique_service = sorted(train_set.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]

unique_flag = sorted(train_set.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]

dumcols = unique_protocol2 + unique_service2 + unique_flag2

unique_service_test = sorted(train_set.service.unique())
unique_service2_test = [string2 + x for x in unique_service_test]
testdumcols = unique_protocol2 + unique_service2_test + unique_flag2

In [31]:
train_set_categorical_values_enc = train_set_categorical_values.apply(LabelEncoder().fit_transform)

print(train_set_categorical_values.head(5))
print('--------------------')
print(train_set_categorical_values_enc.head(5))

test_set_categorical_values_enc = test_set_categorical_values.apply(LabelEncoder().fit_transform)

  protocol_type   service flag
0           tcp  ftp_data   SF
1           udp     other   SF
2           tcp   private   S0
3           tcp      http   SF
4           tcp      http   SF
--------------------
   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


In [32]:
enc = OneHotEncoder()

train_set_categorical_values_encenc = enc.fit_transform(train_set_categorical_values_enc)

train_set_cat_data = pd.DataFrame(train_set_categorical_values_encenc.toarray(),columns = dumcols)

test_set_categorical_values_encenc = enc.fit_transform(test_set_categorical_values_enc)

test_set_cat_data = pd.DataFrame(test_set_categorical_values_encenc.toarray(),columns = testdumcols)

train_set_cat_data.head(5)

Unnamed: 0,Protocol_type_icmp,Protocol_type_tcp,Protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
trainservice = train_set['service'].tolist()
testservice = test_set['service'].tolist()

difference = list(set(trainservice) - set(testservice))

string = 'service_'

difference = [string + x for x in difference]

print(difference)

['service_harvest', 'service_aol', 'service_red_i', 'service_urh_i', 'service_http_2784', 'service_http_8001']


In [34]:
for col in difference:
    test_set_cat_data[col] = 0

print(test_set_cat_data.shape)

(125973, 84)


In [35]:
new_train_set = train_set.join(train_set_cat_data)
new_train_set.drop('flag', axis = 1, inplace = True)
new_train_set.drop('protocol_type', axis = 1, inplace = True)
new_train_set.drop('service', axis = 1, inplace = True)

new_test_set = test_set.join(test_set_cat_data)
new_test_set.drop('flag', axis = 1, inplace = True)
new_test_set.drop('protocol_type', axis = 1, inplace = True)
new_test_set.drop('service', axis = 1, inplace = True)

print(new_train_set.shape)
print(new_test_set.shape)

(125973, 123)
(22544, 123)


In [36]:
label_train_set = new_train_set['class']
label_test_set = new_test_set['class']

new_label_train_set = label_train_set.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

new_label_test_set = label_test_set.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

new_train_set['label'] = new_label_train_set
new_test_set['label'] = new_label_test_set

In [37]:
to_drop_DoS = [2,3,4]
to_drop_Probe = [1,3,4]
to_drop_R2L = [1,2,4]
to_drop_U2R = [1,2,3]

DoS_train_set = new_train_set[~new_train_set['class'].isin(to_drop_DoS)]
Probe_train_set = new_train_set[~new_train_set['class'].isin(to_drop_Probe)]
R2L_train_set = new_train_set[~new_train_set['class'].isin(to_drop_R2L)]
U2R_train_set = new_train_set[~new_train_set['class'].isin(to_drop_U2R)]

DoS_test_set = new_test_set[~new_test_set['class'].isin(to_drop_DoS)]
Probe_test_set = new_test_set[~new_test_set['class'].isin(to_drop_Probe)]
R2L_test_set = new_test_set[~new_test_set['class'].isin(to_drop_R2L)]
U2R_test_set = new_test_set[~new_test_set['class'].isin(to_drop_U2R)]

print('train set:')
print('dimensions of DoS:', DoS_train_set.shape)
print('dimensions of Probe:', Probe_train_set.shape)
print('dimensions of R2L:', R2L_train_set.shape)
print('dimensions of U2R:', U2R_train_set.shape)

print()

print('test set:')
print('dimensions of DoS:', DoS_test_set.shape)
print('dimensions of Probe:', Probe_test_set.shape)
print('dimensions of R2L:', R2L_test_set.shape)
print('dimensions of U2R:', U2R_test_set.shape)

train set:
dimensions of DoS: (125973, 124)
dimensions of Probe: (125973, 124)
dimensions of R2L: (125973, 124)
dimensions of U2R: (125973, 124)

test set:
dimensions of DoS: (22544, 124)
dimensions of Probe: (22544, 124)
dimensions of R2L: (22544, 124)
dimensions of U2R: (22544, 124)


### 2. Feature scaling

In [38]:
X_DoS = DoS_train_set.drop('label', 1)
Y_DoS = DoS_train_set.label

X_Probe = Probe_train_set.drop('label', 1)
Y_Probe = Probe_train_set.label

X_R2L = R2L_train_set.drop('label', 1)
Y_R2L = R2L_train_set.label

X_U2R = U2R_train_set.drop('label', 1)
Y_U2R = U2R_train_set.label

X_DoS_test = DoS_test_set.drop('label', 1)
Y_DoS_test = DoS_test_set.label

X_Probe_test = Probe_test_set.drop('label', 1)
Y_Probe_test = Probe_test_set.label

X_R2L_test = R2L_test_set.drop('label', 1)
Y_R2L_test = R2L_test_set.label

X_U2R_test = U2R_test_set.drop('label', 1)
Y_U2R_test = U2R_test_set.label

  X_DoS = DoS_train_set.drop('label', 1)
  X_Probe = Probe_train_set.drop('label', 1)
  X_R2L = R2L_train_set.drop('label', 1)
  X_U2R = U2R_train_set.drop('label', 1)
  X_DoS_test = DoS_test_set.drop('label', 1)
  X_Probe_test = Probe_test_set.drop('label', 1)
  X_R2L_test = R2L_test_set.drop('label', 1)
  X_U2R_test = U2R_test_set.drop('label', 1)


In [39]:
colNames = list(X_DoS)
colNames_test = list(X_DoS_test)

In [40]:
scaler1 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS = scaler1.transform(X_DoS)

scaler2 = preprocessing.StandardScaler().fit(X_Probe)
X_Probe = scaler2.transform(X_Probe)

scaler3 = preprocessing.StandardScaler().fit(X_R2L)
X_R2L = scaler3.transform(X_R2L)

scaler4 = preprocessing.StandardScaler().fit(X_U2R)
X_U2R = scaler4.transform(X_U2R) 

scaler5 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test = scaler5.transform(X_DoS_test)

scaler6 = preprocessing.StandardScaler().fit(X_Probe_test)
X_Probe_test = scaler6.transform(X_Probe_test)

scaler7 = preprocessing.StandardScaler().fit(X_R2L_test)
X_R2L_test = scaler7.transform(X_R2L_test)

scaler8 = preprocessing.StandardScaler().fit(X_U2R_test)
X_U2R_test = scaler8.transform(X_U2R_test)

ValueError: could not convert string to float: 'normal'