In [283]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from numpy import random

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import Lasso, Ridge, LinearRegression, SGDClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV, RepeatedKFold
from sklearn.metrics import accuracy_score, precision_score, make_scorer, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.metrics import geometric_mean_score
from sklearn.utils import resample

from IPython.display import clear_output

In [284]:
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

In [285]:
file = 'Intrusion_detection_NSL_KDD.csv'
data = pd.read_csv(file)

In [301]:
def K_fold_cross_validation(clf, X, y, cv=10):
    _scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='micro'),
        'recall': make_scorer(recall_score, average='micro'),
        'geometric_mean_score': make_scorer(geometric_mean_score, average='micro', greater_is_better=True),
        'f1': make_scorer(f1_score, average='micro'),
    }

    cross = cross_validate(
        estimator=clf, 
        X=X, 
        y=y, 
        cv=cv, 
        scoring=_scoring,
        return_train_score=False,
        n_jobs=-1
        )

    return cross


def K_fold_times(clf, X, y, cv=10, times=100):
    cross_val_res = []
    for i in range(times):
        clear_output(wait=True)
        print('Iteration: ', i+1)
        cross_val = K_fold_cross_validation(clf, X, y, cv)
        cross_val_res.append(cross_val)
        
    return cross_val_res

    

In [287]:
def post_process(KFold_result): 
    acc = []
    pre = []
    rec = []
    gmean = []
    f1 = []
    for data in KFold_result:
        acc = np.append(acc, data['test_accuracy'])
        pre = np.append(pre, data['test_precision'])
        rec = np.append(rec, data['test_recall'])
        gmean = np.append(gmean, data['test_geometric_mean_score'])
        f1 = np.append(f1, data['test_f1'])

    data = {
        'accuracy': [max(acc), acc.mean(), acc.std()], 
        'precision': [max(pre), pre.mean(), pre.std()], 
        'recall': [max(rec), rec.mean(), rec.std()], 
        'geo mean': [max(gmean), gmean.mean(), gmean.std()],
        'f1': [max(f1), f1.mean(), f1.std()],
    }

    df = pd.DataFrame(data, index=['best', 'mean', 'std'])

    return df

## Question 1

In [288]:
np.sort([data['attack_type'].unique()])

array([['apache2', 'back', 'buffer_overflow', 'ftp_write',
        'guess_passwd', 'httptunnel', 'imap', 'ipsweep', 'land',
        'loadmodule', 'mailbomb', 'mscan', 'multihop', 'named',
        'neptune', 'nmap', 'normal', 'perl', 'phf', 'pod', 'portsweep',
        'processtable', 'ps', 'rootkit', 'saint', 'satan', 'sendmail',
        'smurf', 'snmpgetattack', 'snmpguess', 'spy', 'sqlattack',
        'teardrop', 'udpstorm', 'warezclient', 'warezmaster', 'worm',
        'xlock', 'xsnoop', 'xterm']], dtype=object)

In [289]:
# Data Preprocessing
# where is mail bomb?

Attack_Class = {
    'dos': ['apache2', 'back', 'pod', 'processtable', 'worm', 'neptune', 'smurf', 'land', 'udpstorm', 'teardrop'],
    'probe': ['satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint'],
    'r2l' : ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster', 'warezclient', 'spy', 'xlock', 'xsnoop', 'snmpguess', 'snmpgetattack', 'httptunnel', 'sendmail', 'named'],
    'u2r': ['buffer_overflow', 'xterm','sqlattack','perl', 'loadmodule', 'ps','rootkit'],
}

In [290]:
data['attack_class'] = data['attack_type'].map(lambda x: 'dos' if x in Attack_Class['dos'] else 'probe' if x in Attack_Class['probe'] else 'r2l' if x in Attack_Class['r2l'] else 'u2r' if x in Attack_Class['u2r'] else 'normal')

In [291]:
data['attack_class'].value_counts()

normal    77345
dos       53093
probe     14077
r2l        3880
u2r         119
Name: attack_class, dtype: int64

In [292]:
if 'attack_type' in data.columns:   
    data = data.drop(['attack_type'], axis=1)

data_dummy = pd.get_dummies(data = data, columns = ['protocol_type', 'service', 'flag'])

In [293]:
data_normal = data_dummy.loc[data_dummy['attack_class'] == 'normal']
data_dos = data_dummy.loc[data_dummy['attack_class'] == 'dos']
data_probe = data_dummy.loc[data_dummy['attack_class'] == 'probe']
data_r2l = data_dummy.loc[data_dummy['attack_class'] == 'r2l']
data_u2r = data_dummy.loc[data_dummy['attack_class'] == 'u2r']

In [294]:
target = {
    'normal': 6817,
    'dos': 11617,
    'probe': 988,
    'r2l': 53,
    'u2r': 3086,
}

### Step 1 
A certain amount of sample data was taken from each data set. Empty lines in some data sets have been removed. 

In [297]:
# pre process data

data_normal_keep = data_normal.iloc[random.randint(0, data_normal.shape[0], target['normal'])]

data_dos_keep = data_dos.iloc[random.randint(0, data_dos.shape[0], target['dos'])]

data_probe_keep = data_probe.iloc[random.randint(0, data_probe.shape[0], target['probe'])]

data_r2l_keep = data_r2l.iloc[random.randint(0, data_r2l.shape[0], target['r2l'])]

data_u2r_keep = data_u2r.iloc[random.randint(0, data_u2r.shape[0], target['u2r'])]


### Step 2
The data sets obtained are normalized with the min-max normalization.

In [298]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# X_train_scale = scaler.fit_transform(X_train)
# X_test_scale = scaler.transform(X_test)

combined_data = pd.concat([data_normal_keep, data_dos_keep, data_probe_keep, data_r2l_keep, data_u2r_keep])

X = scaler.fit_transform(combined_data.drop(['attack_class'], axis=1))
y = combined_data['attack_class']


### Step 3
The data sets are classified with SVM, KNN and DT machine learning methods

In [302]:
# SVM Classifier

svm_linearSVC_clf = svm.LinearSVC()
cv = KFold(n_splits=10, shuffle=True)
result_linearSVC = K_fold_times(svm_linearSVC_clf, X, y, cv=cv,times=100)
post_process(result_linearSVC)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.981383,0.981383,0.981383,0.98834,0.981383
mean,0.970234,0.970234,0.970234,0.981329,0.970234
std,0.003462,0.003462,0.003462,0.002178,0.003462


In [303]:
svm_quad = svm.SVC(kernel='poly', degree=2)
cv = KFold(n_splits=10, shuffle=True)
result_svm_quad = K_fold_times(svm_quad, X, y, cv=cv, times=100)
post_process(result_svm_quad)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.97961,0.97961,0.97961,0.987227,0.97961
mean,0.968386,0.968386,0.968386,0.980164,0.968386
std,0.003703,0.003703,0.003703,0.002333,0.003703


In [304]:
svm_cube = svm.SVC(kernel='poly', degree=3)
cv = KFold(n_splits=10, shuffle=True)
result_svm_cube = K_fold_times(svm_cube, X, y, cv = cv,times=100)
post_process(result_svm_cube)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.982713,0.982713,0.982713,0.989174,0.982713
mean,0.973019,0.973019,0.973019,0.98308,0.973019
std,0.003464,0.003464,0.003464,0.00218,0.003464


In [305]:
knn_fine = KNeighborsClassifier(n_neighbors=1)
cv = KFold(n_splits=10, shuffle=True)
result_knn_fine = K_fold_times(knn_fine, X, y, cv = cv,times=100)
post_process(result_knn_fine)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.99734,0.99734,0.99734,0.998337,0.99734
mean,0.992167,0.992167,0.992167,0.9951,0.992167
std,0.001867,0.001867,0.001867,0.001169,0.001867


In [306]:
knn_med = KNeighborsClassifier(n_neighbors=10)
cv = KFold(n_splits=10, shuffle=True)
result_knn_med = K_fold_times(knn_med, X, y, cv = cv,times=100)
post_process(result_knn_med)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.989805,0.989805,0.989805,0.993621,0.989805
mean,0.981828,0.981828,0.981828,0.988619,0.981828
std,0.002864,0.002864,0.002864,0.001797,0.002864


In [307]:
knn_cubic = KNeighborsClassifier(p=3)
cv = KFold(n_splits=10, shuffle=True)
result_knn_cubic = K_fold_times(knn_cubic, X, y, cv = cv,times=100)
post_process(result_knn_cubic)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.993351,0.993351,0.993351,0.995841,0.993351
mean,0.985357,0.985357,0.985357,0.990831,0.985357
std,0.002464,0.002464,0.002464,0.001545,0.002464


In [308]:
tree_fine = DecisionTreeClassifier(max_leaf_nodes = 100)
cv = KFold(n_splits=10, shuffle=True)
result_tree_fine = K_fold_times(tree_fine, X, y, cv = cv,times=100)
post_process(result_tree_fine)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.996454,0.996454,0.996454,0.997783,0.996454
mean,0.990318,0.990318,0.990318,0.993941,0.990318
std,0.002042,0.002042,0.002042,0.00128,0.002042


In [309]:
tree_med = DecisionTreeClassifier(max_leaf_nodes = 20)
cv = KFold(n_splits=10, shuffle=True)
result_tree_med = K_fold_times(tree_med, X, y, cv = cv,times=100)
post_process(result_tree_med)

Iteration:  100


Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.975177,0.975177,0.975177,0.984442,0.975177
mean,0.953127,0.953127,0.953127,0.970537,0.953127
std,0.005649,0.005649,0.005649,0.003567,0.005649


[1]. Mahfouz, A. M., Venugopal, D., & Shiva, S. G. (2020). Comparative analysis of ML classifiers for network intrusion detection. In Fourth international congress on information and communication technology (pp. 193-207). Springer, Singapore.

[2]. Dhanabal, L., & Shantharajah, S. P. (2015). A study on NSL-KDD dataset for intrusion detection system based on classification algorithms. International journal of advanced research in computer and communication engineering, 4(6), 446-452.

[3]. Johnson, J. M., & Yadav, A. (2018). Fault detection and classification technique for HVDC transmission lines using KNN. In Information and communication technology for sustainable development (pp. 245-253). Springer, Singapore.