In [1]:
import os
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable


In [None]:
!pip install imblearn

In [None]:
import warnings
warnings.filterwarnings('ignore')

## Reading and processing dataset 

In [None]:
# Root folder holding the dataset files. Set the NSL_KDD_ROOT environment variable to
# point at your own copy; the previously hard-coded local path remains the fallback.
dataset_root = os.environ.get('NSL_KDD_ROOT', '/home/julia/Downloads/NSL_KDD-master')

In [None]:
# Paths of the three dataset files, all located directly under dataset_root.
train_file, test_file, names_file = (
    os.path.join(dataset_root, file_name)
    for file_name in (
        'KDDTrain_Modify_v2.1.csv',
        'KDDTest_Modify_v2.1.csv',
        'Field Names.csv',
    )
)


In [None]:
# Column names applied to the header-less CSV files, in file order.
# 'attack_type' is the label column; 'success_pred' is the last column.
header_names = [
    'duration',
    'protocol_type',
    'service',
    'node_number',
    'cluster_number',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack_type',
    'success_pred',
]

In [None]:
# Number of column names handed to read_csv. Counting header_names itself keeps this
# check in sync with the list that is actually used (it includes node_number and
# cluster_number, which a separately typed-out copy of the names did not).
len(header_names)

In [None]:
# Differentiating between nominal, binary, and numeric features

# root_shell is marked as a continuous feature in the kddcup.names
# file, but it is supposed to be a binary feature according to the
# dataset documentation

col_names = np.array(header_names)

# Positions in header_names of the categorical columns:
# protocol_type, service, node_number, cluster_number, flag.
nominal_idx = [1, 2, 3, 4, 5]
# Positions of the indicator columns:
# land, logged_in, root_shell, su_attempted, is_host_login, is_guest_login.
binary_idx = [8, 13, 15, 16, 22, 23]

# Every other column that precedes 'attack_type' is treated as numeric; the label and
# 'success_pred' are left out. The bound is derived from the header instead of being a
# hard-coded count, and sorted() fixes the column order rather than relying on the
# iteration order of a set.
feature_count = header_names.index('attack_type')
numeric_idx = sorted(set(range(feature_count)) - set(nominal_idx) - set(binary_idx))

nominal_cols = col_names[nominal_idx].tolist()
binary_cols = col_names[binary_idx].tolist()
numeric_cols = col_names[numeric_idx].tolist()

In [None]:
# Category name -> list of raw labels; unknown categories start out as an empty list.
category = defaultdict(list, benign=['normal'])

## Generating and analyzing train and test sets

In [None]:
# Load both splits; the files are ';'-separated and get their column names from header_names.
train_df, test_df = (
    pd.read_csv(csv_path, names=header_names, sep=';')
    for csv_path in (train_file, test_file)
)

In [13]:
# Frequency of every raw label in the test split. The counts must come from the label
# column 'attack_type', not from the numeric 'dst_host_rerror_rate' feature.
test_attack_types = test_df['attack_type'].value_counts()

# Frequency per coarse category: 'normal' versus everything else ('not_normal').
# NOTE(review): the frame has no dedicated category column, so the two-way grouping
# is derived here from the label — confirm this is the grouping you want.
test_attack_cats = (
    test_df['attack_type']
    .where(test_df['attack_type'] == 'normal', 'not_normal')
    .value_counts()
)

In [14]:
# Summary statistics of the binary indicator columns, one row per feature.
# These columns are expected to stay within [0.0, 1.0]; the recorded output shows
# su_attempted reaching 2.0, so that column deserves a closer look.
binary_summary = train_df[binary_cols].describe()
binary_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
land,125972.0,0.000198,0.014086,0.0,0.0,0.0,0.0,1.0
logged_in,125972.0,0.395739,0.489011,0.0,0.0,0.0,1.0,1.0
root_shell,125972.0,0.001342,0.036603,0.0,0.0,0.0,0.0,1.0
su_attempted,125972.0,0.001103,0.045155,0.0,0.0,0.0,0.0,2.0
is_host_login,125972.0,8e-06,0.002817,0.0,0.0,0.0,0.0,1.0
is_guest_login,125972.0,0.009423,0.096613,0.0,0.0,0.0,0.0,1.0


## Data preparation

In [15]:
# Display the training frame (the notebook renders a truncated view of rows and columns).
train_df

Unnamed: 0,duration,protocol_type,service,node_number,cluster_number,flag,src_bytes,dst_bytes,land,wrong_fragment,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,success_pred
0,0,tcp,http,0T,T0,SF,305,1035,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,normal,21
1,0,tcp,private,00,00,S0,0,0,0,0,...,0.04,0.05,0.00,0.00,1.00,1.00,0.00,0.0,neptune,21
2,0,tcp,ftp_data,0T,TT,SF,71,0,0,0,...,0.29,0.02,0.16,0.00,0.50,0.45,0.00,0.0,normal,14
3,0,icmp,eco_i,10,01,SF,8,0,0,0,...,1.00,0.00,1.00,0.55,0.00,0.00,0.00,0.0,ipsweep,15
4,0,tcp,http,0T,01,SF,338,3581,0,0,...,1.00,0.00,0.01,0.02,0.00,0.00,0.00,0.0,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125967,0,tcp,private,1T,T1,S0,0,0,0,0,...,0.00,1.00,0.00,0.00,0.05,1.00,0.95,0.0,satan,18
125968,0,tcp,echo,01,11,RSTO,0,0,0,0,...,0.07,0.07,0.00,0.00,0.00,0.00,1.00,1.0,neptune,19
125969,0,tcp,private,10,00,REJ,0,0,0,0,...,0.00,1.00,1.00,0.00,0.02,0.00,0.98,1.0,portsweep,18
125970,0,tcp,private,0T,TT,S0,0,0,0,0,...,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.0,neptune,20


In [16]:
# Separate the label column from the feature columns for both splits.
# NOTE(review): 'success_pred' stays among the features — confirm that is intended.
label_col = "attack_type"
train_Y, test_Y = train_df[label_col], test_df[label_col]
train_x_raw = train_df.drop(columns=[label_col])
test_x_raw = test_df.drop(columns=[label_col])

In [17]:
# Display the training labels (one attack_type string per row).
train_Y

0            normal
1           neptune
2            normal
3           ipsweep
4            normal
            ...    
125967        satan
125968      neptune
125969    portsweep
125970      neptune
125971       normal
Name: attack_type, Length: 125972, dtype: object

In [18]:
# One-hot encode the nominal columns on train and test together so both splits end up
# with the same dummy columns; drop_first removes one level per nominal column.
combined_df_raw = pd.concat([train_x_raw, test_x_raw])
combined_df = pd.get_dummies(combined_df_raw, columns=nominal_cols, drop_first=True)

# Split back by position. Explicit copies make train_x / test_x independent frames, so
# the column assignments done later during scaling do not write into slices of
# combined_df (chained assignment).
n_train = len(train_x_raw)
train_x = combined_df.iloc[:n_train].copy()
test_x = combined_df.iloc[n_train:].copy()

# Store dummy variable feature names (sorted so the order is the same on every run)
dummy_variables = sorted(set(train_x) - set(combined_df_raw))

In [19]:
# Summary statistics of the unscaled 'duration' column, for comparison with the scaled versions.
train_x.loc[:, 'duration'].describe()

count    125972.000000
mean        287.146929
std        2604.525522
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       42908.000000
Name: duration, dtype: float64

In [21]:
# Try StandardScaler on the 'duration' column alone and inspect the result.
from sklearn.preprocessing import StandardScaler

# The column is reshaped to a single-column 2-D array before being passed to the scaler.
durations = train_x['duration'].to_numpy().reshape(-1, 1)

standard_scaler = StandardScaler()
standard_scaler.fit(durations)
scaled_durations = standard_scaler.transform(durations)

pd.Series(scaled_durations.ravel()).describe()

count    1.259720e+05
mean     4.004742e-17
std      1.000004e+00
min     -1.102497e-01
25%     -1.102497e-01
50%     -1.102497e-01
75%     -1.102497e-01
max      1.636422e+01
dtype: float64

In [22]:
# Try MinMaxScaler on the same single-column 'duration' array and inspect the result.
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
min_max_scaler.fit(durations)
min_max_scaled_durations = min_max_scaler.transform(durations)

pd.Series(min_max_scaled_durations.ravel()).describe()

count    125972.000000
mean          0.006692
std           0.060700
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
dtype: float64

In [23]:
# Experimenting with RobustScaler on the single 'duration' feature
from sklearn.preprocessing import RobustScaler

# The fitted scaler gets its own name so it no longer overwrites `min_max_scaler`,
# which holds the MinMaxScaler fitted in the MinMaxScaler experiment.
robust_scaler = RobustScaler().fit(durations)
robust_scaled_durations = robust_scaler.transform(durations)
pd.Series(robust_scaled_durations.flatten()).describe()

count    125972.000000
mean        287.146929
std        2604.525522
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       42908.000000
dtype: float64

In [24]:
# Display the names of the columns that will be standardized.
numeric_cols

['duration',
 'src_bytes',
 'dst_bytes',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'num_compromised',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate']

In [25]:
# StandardScaler is the scaler of choice: fit it on the training split only, then apply
# the same fitted transform to the numeric columns of both splits.
standard_scaler = StandardScaler()
standard_scaler.fit(train_x[numeric_cols])

for frame in (train_x, test_x):
    frame[numeric_cols] = standard_scaler.transform(frame[numeric_cols])

In [26]:
# Summary statistics of the training features after scaling of the numeric columns.
train_x.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
count,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,...,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0
mean,4.004742e-17,-1.9741689999999998e-19,7.473638e-19,0.000198,3.215074e-18,4.822612e-18,-6.373744e-18,-1.4552440000000002e-17,0.395739,-4.56879e-18,...,0.089171,0.0124,0.000818,0.019219,0.276657,0.002897,0.001008,0.000389,0.594926,0.002151
std,1.000004,1.000004,1.000004,0.014086,1.000004,1.000004,1.000004,1.000004,0.489011,1.000004,...,0.284991,0.110661,0.028583,0.137293,0.447347,0.05375,0.031736,0.019719,0.490908,0.046332
min,-0.1102497,-0.007762271,-0.004918664,0.0,-0.08948678,-0.007736016,-0.09507605,-0.02702293,0.0,-0.01166369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.1102497,-0.007762271,-0.004918664,0.0,-0.08948678,-0.007736016,-0.09507605,-0.02702293,0.0,-0.01166369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.1102497,-0.007754776,-0.004918664,0.0,-0.08948678,-0.007736016,-0.09507605,-0.02702293,0.0,-0.01166369,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,-0.1102497,-0.007715255,-0.004790346,0.0,-0.08948678,-0.007736016,-0.09507605,-0.02702293,1.0,-0.01166369,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,16.36422,235.0665,325.7473,1.0,11.74343,208.8183,35.7194,110.4967,1.0,312.3677,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
# Binary target: 0 for benign traffic, 1 for any attack.
# Benign rows carry the label 'normal' in attack_type, and strings have to be compared
# by value (== / !=). The former `x is 'benign'` test was an identity check against a
# label that never occurs, so it marked every row as an attack.
train_Y_bin = (train_Y != 'normal').astype(int)
test_Y_bin = (test_Y != 'normal').astype(int)

In [28]:
# Multi-class baseline: a decision tree trained on the raw attack_type labels.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss

classifier = DecisionTreeClassifier(random_state=0).fit(train_x, train_Y)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

[[  0  74   0 ...   0   0   0]
 [  0 302   0 ...   0   0   0]
 [  0   0   2 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   1 ...   0   0   0]]
0.30396593026350816


In [29]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour baseline on the full training split; n_jobs=-1 uses all cores.
classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=-1).fit(train_x, train_Y)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

[[  0 446   0 ...   0   0   0]
 [  0 357   0 ...   0   0   0]
 [  0   1   4 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
0.2938958388785379


In [30]:
from sklearn.svm import LinearSVC

# Linear SVM baseline on the full training split.
# The recorded run of this cell ended in a KeyboardInterrupt, i.e. it was stopped by hand.
classifier = LinearSVC(random_state=0).fit(train_x, train_Y)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

KeyboardInterrupt: 

In [None]:
# Gaussian Naive Bayes baseline on the full training split.
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x, train_Y)
pred_y = gnb.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

## Dealing with class imbalance

In [None]:
# Share of every label in the test split (count divided by the number of rows).
test_Y.value_counts() / float(len(test_Y))

In [None]:
# Share of every label in the training split (count divided by the number of rows).
train_Y.value_counts() / float(len(train_Y))

In [None]:
# label -> number of training rows; used below as the per-class sampling targets.
strategy = train_Y.value_counts().to_dict()

In [None]:
# Display the per-class sampling targets (label -> row count).
strategy

In [None]:
# Two-class view of the training labels: 'normal' stays, every other label becomes
# 'not_normal'. The result is stored as a one-column DataFrame.
lst = list(train_Y)
train_Y22 = [label if label == 'normal' else 'not_normal' for label in lst]
train_Y2 = pd.DataFrame(train_Y22)

In [None]:
# Display the original labels next to their two-class ('normal' / 'not_normal') version.
(train_Y, train_Y2)

In [None]:
# Display the sampling targets of the two-class problem.
# NOTE(review): `strategy2` is only assigned in a later cell of this notebook, so on a
# fresh top-to-bottom run this line raises NameError — confirm the intended cell order.
strategy2

In [None]:
from imblearn.over_sampling import SMOTE

# Over-sample the two-class problem with SMOTE.
# - fit_resample is the current imbalanced-learn API (fit_sample was removed).
# - random_state makes the synthetic samples reproducible, as in the multi-class SMOTE cell.
# - np.ravel hands the labels over as a 1-D array (train_Y2 is a one-column DataFrame).
# NOTE(review): `strategy2` is assigned in a later cell, and the values given there are
# smaller than the class sizes of the training data; SMOTE presumably needs targets at
# least as large as the current class sizes — confirm which targets this cell should use.
sm2 = SMOTE(sampling_strategy=strategy2, random_state=0)
train_x_sm2, train_Y_sm2 = sm2.fit_resample(train_x, np.ravel(train_Y2))
print(pd.Series(train_Y_sm2).value_counts())

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the two-class problem down to the targets in `strategy2`.
# This cell previously also halved the values of the multi-class `strategy` dict through
# a `keys` list that does not exist yet at this point of the notebook; the sampler here
# only reads `strategy2`, so that loop (and the unused mean_class_size) is gone and the
# multi-class targets are left untouched for the multi-class cells.
# - fit_resample is the current imbalanced-learn API (fit_sample was removed).
# - np.ravel hands the labels over as a 1-D array (train_Y2 is a one-column DataFrame).
# NOTE(review): `strategy2` is assigned in a later cell — run that cell first or move it up.
rus = RandomUnderSampler(sampling_strategy=strategy2, random_state=0, replacement=True)
train_x_rus2, train_Y_rus2 = rus.fit_resample(train_x, np.ravel(train_Y2))
print(pd.Series(train_Y_rus2).value_counts())

In [None]:
# Target number of rows per class for the two-class ('normal' / 'not_normal') resampling.
strategy2 = dict(normal=2927, not_normal=2538)

In [325]:
from imblearn.over_sampling import SMOTE

# SMOTE with the per-class targets in `strategy` (label -> row count).
# fit_resample is the current imbalanced-learn API; fit_sample was removed.
sm = SMOTE(sampling_strategy=strategy, random_state=0)
train_x_sm, train_Y_sm = sm.fit_resample(train_x, train_Y)
print(pd.Series(train_Y_sm).value_counts())

normal             67342
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: attack_type, dtype: int64


In [326]:
# Class labels of the sampling strategy, in the dict's insertion order.
keys = [*strategy]

In [327]:
from imblearn.under_sampling import RandomUnderSampler

# Under-sample every class to 1/23 of its size in the SMOTE output (rounded down).
# The targets are computed from the class counts of train_Y_sm rather than by dividing
# the existing `strategy` values in place, so re-running this cell does not shrink the
# targets a second time; it also no longer needs a separate, index-aligned `keys` list.
# Classes whose target rounds down to 0 get a target of 0 rows.
strategy = {
    label: int(count / 23)
    for label, count in pd.Series(train_Y_sm).value_counts().items()
}

# fit_resample is the current imbalanced-learn API; fit_sample was removed.
rus = RandomUnderSampler(sampling_strategy=strategy, random_state=0, replacement=True)
train_x_rus, train_Y_rus = rus.fit_resample(train_x_sm, train_Y_sm)
print(pd.Series(train_Y_rus).value_counts())

normal             2927
neptune            1791
satan               157
ipsweep             156
portsweep           127
smurf               115
nmap                 64
back                 41
warezclient          38
teardrop             38
pod                   8
guess_passwd          2
buffer_overflow       1
Name: attack_type, dtype: int64


In [328]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss

# Decision tree trained on the resampled training data, evaluated on the untouched test split.
classifier = DecisionTreeClassifier(random_state=1).fit(train_x_rus, train_Y_rus)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

[[  0 433   0 ...   0   0   0]
 [  0 356   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   5   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
0.2890604205483098


In [333]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier trained on the resampled training data.
classifier = KNeighborsClassifier(n_neighbors=3, n_jobs=-1).fit(train_x_rus, train_Y_rus)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

[[  0 202   0 ...   0   0   0]
 [  0 355   0 ...   0   0   0]
 [  0   1   4 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   3 ...   0   0   0]]
0.30170348682459414


In [330]:
# Gaussian Naive Bayes trained on the resampled training data.
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_x_rus, train_Y_rus)
pred_y = gnb.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

[[  0   1   0 ...   0   0   0]
 [  0 118   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
0.5594889539526218


In [382]:
# Imports used by this cell (datasets and train_test_split are imported but not used here).
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import time

# Polynomial-kernel SVC (despite the variable name) trained on the resampled data;
# the wall-clock time of the fit is printed.
start_time = time.time()
svm_estimator = SVC(
    kernel='poly',
    probability=True,
    coef0=0.6,
    class_weight='balanced',
    gamma='auto',
)
svm_model_linear = svm_estimator.fit(train_x_rus, train_Y_rus)
print("--- %s seconds ---" % (time.time() - start_time))

pred_y = svm_model_linear.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

--- 2.6480677127838135 seconds ---
[[  0 484   0 ...   0   0   0]
 [  0 358   0 ...   0   0   0]
 [  0   2   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   1 ...   0   0   0]]
0.27686097063259696


In [None]:
# Imports used by this cell (datasets and train_test_split are imported but not used here).
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import time

# Same polynomial-kernel SVC on the resampled data as the timed cell, without printing
# the elapsed time (start_time is still recorded).
start_time = time.time()
svm_estimator = SVC(
    kernel='poly',
    probability=True,
    coef0=0.6,
    class_weight='balanced',
    gamma='auto',
)
svm_model_linear = svm_estimator.fit(train_x_rus, train_Y_rus)

pred_y = svm_model_linear.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

In [400]:
# Display the test labels (one attack_type string per row).
test_Y

0              mscan
1             normal
2             normal
3             normal
4        warezmaster
            ...     
22537         normal
22538        neptune
22539          mscan
22540         normal
22541         normal
Name: attack_type, Length: 22542, dtype: object

In [401]:
# Two-class view of the test labels: 'normal' stays, every other label becomes
# 'not_normal'. The result is stored as a one-column DataFrame.
lst = list(test_Y)
test_Y22 = [label if label == 'normal' else 'not_normal' for label in lst]
test_Y2 = pd.DataFrame(test_Y22)

In [None]:
# Imports used by this cell (datasets and train_test_split are imported but not used here).
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import time

# Polynomial-kernel SVC (despite the variable name) on the two-class labels, trained on
# the full, non-resampled training split. start_time is recorded but not reported.
start_time = time.time()
svm_estimator = SVC(
    kernel='poly',
    probability=True,
    coef0=0.61,
    class_weight='balanced',
    gamma='auto',
)
svm_model_linear = svm_estimator.fit(train_x, train_Y2)

pred_y = svm_model_linear.predict(test_x)

# Confusion matrix and misclassification rate against the two-class test labels.
results, error = confusion_matrix(test_Y2, pred_y), zero_one_loss(test_Y2, pred_y)

print(results, error, sep='\n')

In [None]:
# Imports used by this cell (datasets and train_test_split are imported but not used here).
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import time

# Linear-kernel SVC on the two-class labels, trained on the full, non-resampled
# training split. start_time is recorded but not reported.
# NOTE(review): coef0 and gamma presumably have no effect with kernel='linear' —
# confirm against the SVC documentation before dropping them.
start_time = time.time()
svm_estimator = SVC(
    kernel='linear',
    probability=True,
    coef0=0.61,
    class_weight='balanced',
    gamma='auto',
)
svm_model_linear = svm_estimator.fit(train_x, train_Y2)

pred_y = svm_model_linear.predict(test_x)

# Confusion matrix and misclassification rate against the two-class test labels.
results, error = confusion_matrix(test_Y2, pred_y), zero_one_loss(test_Y2, pred_y)

print(results, error, sep='\n')

In [341]:
from sklearn.svm import LinearSVC

# Linear SVM trained on the resampled training data.
classifier = LinearSVC(random_state=0).fit(train_x_rus, train_Y_rus)
pred_y = classifier.predict(test_x)

# Confusion matrix and misclassification rate on the test split.
results, error = confusion_matrix(test_Y, pred_y), zero_one_loss(test_Y, pred_y)

print(results, error, sep='\n')

[[  0 495   0 ...   0   0   0]
 [  0 257   0 ...   0   0   0]
 [  0   4   1 ...   0   0   0]
 ...
 [  0   1   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   5   0 ...   0   0   0]]
0.3052524177091651
