In [98]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import Lasso, Ridge, LinearRegression, SGDClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV, RepeatedKFold
from sklearn.metrics import accuracy_score, precision_score, make_scorer, recall_score, f1_score, confusion_matrix, classification_report
from imblearn.metrics import geometric_mean_score
from sklearn.utils import resample

from IPython.display import clear_output

In [2]:
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

In [3]:
file = 'Intrusion_detection_NSL_KDD.csv'
data = pd.read_csv(file)

In [118]:
def K_fold_cross_validation(clf, X, y, cv=10):
    _scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='micro'),
        'recall': make_scorer(recall_score, average='micro'),
        'geometric_mean_score': make_scorer(geometric_mean_score, average='micro', greater_is_better=True),
        'f1': make_scorer(f1_score, average='micro'),
    }

    cross_linearSVC = cross_validate(
        estimator=clf, 
        X=X, 
        y=y, 
        cv=cv, 
        scoring=_scoring,
        return_train_score=False,
        n_jobs=-1
        )

    accuracy = [max(cross_linearSVC['test_accuracy']), cross_linearSVC['test_accuracy'].mean(), cross_linearSVC['test_accuracy'].std()]
    precision = [max(cross_linearSVC['test_precision']), cross_linearSVC['test_precision'].mean(), cross_linearSVC['test_precision'].std()]
    recall = [max(cross_linearSVC['test_recall']), cross_linearSVC['test_recall'].mean(), cross_linearSVC['test_recall'].std()]
    gmean = [max(cross_linearSVC['test_geometric_mean_score']), cross_linearSVC['test_geometric_mean_score'].mean(), cross_linearSVC['test_geometric_mean_score'].std()]
    f1 = [max(cross_linearSVC['test_f1']), cross_linearSVC['test_f1'].mean(), cross_linearSVC['test_f1'].std()]

    data = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'geo mean': gmean, 'f1': f1}
    df = pd.DataFrame(data, index=['best', 'mean', 'std'])

    return cross_linearSVC, df


def K_fold_times(clf, X, y, cv=10, times=100):
    cross_val_res = []
    for i in range(times):
        clear_output(wait=True)
        cross_val, df = K_fold_cross_validation(clf, X, y, cv)
        cross_val_res.append(cross_val)
        print('Iteration: ', i)
    return cross_val_res

    

In [117]:
def post_process(KFold_result): 
    acc = []
    pre = []
    rec = []
    gmean = []
    f1 = []
    for data in KFold_result:
        acc = np.append(acc, data['test_accuracy'])
        pre = np.append(pre, data['test_precision'])
        rec = np.append(rec, data['test_recall'])
        gmean = np.append(gmean, data['test_geometric_mean_score'])
        f1 = np.append(f1, data['test_f1'])

    data = {
        'accuracy': [max(acc), acc.mean(), acc.std()], 
        'precision': [max(pre), pre.mean(), pre.std()], 
        'recall': [max(rec), rec.mean(), rec.std()], 
        'geo mean': [max(gmean), gmean.mean(), gmean.std()],
        'f1': [max(f1), f1.mean(), f1.std()],
    }

    df = pd.DataFrame(data, index=['best', 'mean', 'std'])

    return df

## Question 1

In [4]:
np.sort([data['attack_type'].unique()])

array([['apache2', 'back', 'buffer_overflow', 'ftp_write',
        'guess_passwd', 'httptunnel', 'imap', 'ipsweep', 'land',
        'loadmodule', 'mailbomb', 'mscan', 'multihop', 'named',
        'neptune', 'nmap', 'normal', 'perl', 'phf', 'pod', 'portsweep',
        'processtable', 'ps', 'rootkit', 'saint', 'satan', 'sendmail',
        'smurf', 'snmpgetattack', 'snmpguess', 'spy', 'sqlattack',
        'teardrop', 'udpstorm', 'warezclient', 'warezmaster', 'worm',
        'xlock', 'xsnoop', 'xterm']], dtype=object)

In [5]:
# Data Preprocessing
# where is mail bomb?

Attack_Class = {
    'dos': ['apache2', 'back', 'pod', 'processtable', 'worm', 'neptune', 'smurf', 'land', 'udpstorm', 'teardrop'],
    'probe': ['satan', 'ipsweep', 'nmap', 'portsweep', 'mscan', 'saint'],
    'r2l' : ['guess_passwd', 'ftp_write', 'imap', 'phf', 'multihop', 'warezmaster', 'warezclient', 'spy', 'xlock', 'xsnoop', 'snmpguess', 'snmpgetattack', 'httptunnel', 'sendmail', 'named'],
    'u2r': ['buffer_overflow', 'xterm','sqlattack','perl', 'loadmodule', 'ps','rootkit'],
}

In [6]:
data['attack_class'] = data['attack_type'].map(lambda x: 'dos' if x in Attack_Class['dos'] else 'probe' if x in Attack_Class['probe'] else 'r2l' if x in Attack_Class['r2l'] else 'u2r' if x in Attack_Class['u2r'] else 'normal')

In [8]:
data['attack_class'].value_counts()

normal    77345
dos       53093
probe     14077
r2l        3880
u2r         119
Name: attack_class, dtype: int64

In [20]:
if 'attack_type' in data.columns:   
    data = data.drop(['attack_type'], axis=1)

data_dummy = pd.get_dummies(data = data, columns = ['protocol_type', 'service', 'flag'])

In [29]:
data_normal = data_dummy.loc[data_dummy['attack_class'] == 'normal']
data_dos = data_dummy.loc[data_dummy['attack_class'] == 'dos']
data_probe = data_dummy.loc[data_dummy['attack_class'] == 'probe']
data_r2l = data_dummy.loc[data_dummy['attack_class'] == 'r2l']
data_u2r = data_dummy.loc[data_dummy['attack_class'] == 'u2r']

### Step 1 
A certain amount of sample data was taken from each data set. Empty lines in some data sets have been removed. 

In [11]:
## wacky way to get the data
# X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data_normal.drop(['attack_class'], axis=1), data_normal['attack_class'], test_size=0.911, random_state=42)
# X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(data_dos.drop(['attack_class'], axis=1), data_dos['attack_class'], test_size=0.781, random_state=42)
# X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(data_probe.drop(['attack_class'], axis=1), data_probe['attack_class'], test_size=0.929, random_state=42)
# X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(data_r2l.drop(['attack_class'], axis=1), data_r2l['attack_class'], test_size=0.204, random_state=42)
# X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(data_u2r.drop(['attack_class'], axis=1), data_u2r['attack_class'], test_size=0.554, random_state=42)

In [30]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data_normal.drop(['attack_class'], axis=1), data_normal['attack_class'], test_size=0.3, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(data_dos.drop(['attack_class'], axis=1), data_dos['attack_class'], test_size=0.3, random_state=42)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(data_probe.drop(['attack_class'], axis=1), data_probe['attack_class'], test_size=0.3, random_state=42)
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(data_r2l.drop(['attack_class'], axis=1), data_r2l['attack_class'], test_size=0.3, random_state=42)
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(data_u2r.drop(['attack_class'], axis=1), data_u2r['attack_class'], test_size=0.3, random_state=42)

In [31]:
X_train = pd.concat([X_train_1, X_train_2, X_train_3, X_train_4, X_train_5])
X_test = pd.concat([X_test_1, X_test_2, X_test_3, X_test_4, X_test_5])
y_train = pd.concat([y_train_1, y_train_2, y_train_3, y_train_4, y_train_5])
y_test = pd.concat([y_test_1, y_test_2, y_test_3, y_test_4, y_test_5])

### Step 2
The data sets obtained are normalized with the min-max normalization.

In [19]:
data_dummy.select_dtypes(include=['object']).columns

Index(['attack_class'], dtype='object')

In [33]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

X = scaler.fit_transform(data_dummy.drop(['attack_class'], axis=1))
y = data_dummy['attack_class']


### Step 3
The data sets are classified with SVM, KNN and DT machine learning methods

In [99]:
def K_fold_cross_validation(clf, X, y, cv=10):
    _scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='micro'),
        'recall': make_scorer(recall_score, average='micro'),
        'geometric_mean_score': make_scorer(geometric_mean_score, average='micro', greater_is_better=True),
        'f1': make_scorer(f1_score, average='micro'),
    }

    cross = cross_validate(
        estimator=clf, 
        X=X, 
        y=y, 
        cv=cv, 
        scoring=_scoring,
        return_train_score=False,
        n_jobs=-1
        )

    accuracy = [max(cross['test_accuracy']), cross['test_accuracy'].mean(), cross['test_accuracy'].std()]
    precision = [max(cross['test_precision']), cross['test_precision'].mean(), cross['test_precision'].std()]
    recall = [max(cross['test_recall']), cross['test_recall'].mean(), cross['test_recall'].std()]
    gmean = [max(cross['test_geometric_mean_score']), cross['test_geometric_mean_score'].mean(), cross['test_geometric_mean_score'].std()]
    f1 = [max(cross['test_f1']), cross['test_f1'].mean(), cross['test_f1'].std()]

    data = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'geo mean': gmean, 'f1': f1}
    df = pd.DataFrame(data, index=['best', 'mean', 'std'])

    return cross, df


def K_fold_times(clf, X, y, cv=10, times=100):
    cross_val_res = []
    for i in range(times):
        clear_output(wait=True)
        cross_val, df = K_fold_cross_validation(clf, X, y, cv)
        cross_val_res.append(cross_val)
        print('Iteration: ', i)
    return cross_val_res

    

In [100]:
# SVM Classifier

svm_linearSVC_clf = svm.LinearSVC()
cv = KFold(n_splits=10, shuffle=True)
result_linearSVC = K_fold_times(svm_linearSVC_clf, X, y, cv = cv,times=100)

Iteration:  99


In [119]:
post_process(result_linearSVC)

Unnamed: 0,accuracy,precision,recall,geo mean,f1
best,0.967344,0.967344,0.967344,0.979514,0.967344
mean,0.963258,0.963258,0.963258,0.976939,0.963258
std,0.001494,0.001494,0.001494,0.000941,0.001494


In [120]:
svm_quad = svm.SVC(kernel='poly', degree=2)
cv = KFold(n_splits=10, shuffle=True)
result_svm_quad = K_fold_times(svm_quad, X, y, cv = cv,times=100)
post_process(result_svm_quad)

Iteration:  9


In [None]:
svm_cube = svm.SVC(kernel='poly', degree=3)
cv = KFold(n_splits=10, shuffle=True)
result_svm_cube = K_fold_times(svm_cube, X, y, cv = cv,times=100)
post_process(result_svm_cube)

              precision    recall  f1-score   support

         dos       1.00      0.98      0.99     26547
      normal       0.97      0.98      0.98     38673
       probe       0.98      0.98      0.98      7039
         r2l       0.81      0.84      0.82      1940
         u2r       0.65      0.47      0.54        60

    accuracy                           0.98     74259
   macro avg       0.88      0.85      0.86     74259
weighted avg       0.98      0.98      0.98     74259



In [None]:
knn_fine = KNeighborsClassifier(n_neighbors=1)
cv = KFold(n_splits=10, shuffle=True)
result_knn_fine = K_fold_times(knn_fine, X, y, cv = cv,times=100)
post_process(result_knn_fine)

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00     26547
      normal       0.99      0.99      0.99     38673
       probe       0.99      0.98      0.99      7039
         r2l       0.89      0.86      0.88      1940
         u2r       0.55      0.60      0.58        60

    accuracy                           0.99     74259
   macro avg       0.88      0.89      0.89     74259
weighted avg       0.99      0.99      0.99     74259



In [None]:
knn_med = KNeighborsClassifier(n_neighbors=10)
cv = KFold(n_splits=10, shuffle=True)
result_knn_med = K_fold_times(knn_med, X, y, cv = cv,times=100)
post_process(result_knn_med)

              precision    recall  f1-score   support

         dos       0.99      1.00      0.99     26547
      normal       0.99      0.99      0.99     38673
       probe       0.99      0.97      0.98      7039
         r2l       0.90      0.83      0.87      1940
         u2r       0.58      0.47      0.52        60

    accuracy                           0.98     74259
   macro avg       0.89      0.85      0.87     74259
weighted avg       0.98      0.98      0.98     74259



In [None]:
knn_cubic = KNeighborsClassifier(p=3)
cv = KFold(n_splits=10, shuffle=True)
result_knn_cubic = K_fold_times(knn_cubic, X, y, cv = cv,times=100)
post_process(result_knn_cubic)

              precision    recall  f1-score   support

         dos       0.99      1.00      0.99     26547
      normal       0.99      0.99      0.99     38673
       probe       0.99      0.97      0.98      7039
         r2l       0.90      0.85      0.87      1940
         u2r       0.65      0.50      0.57        60

    accuracy                           0.99     74259
   macro avg       0.90      0.86      0.88     74259
weighted avg       0.99      0.99      0.99     74259



In [None]:
tree_fine = DecisionTreeClassifier(max_leaf_nodes = 100)
cv = KFold(n_splits=10, shuffle=True)
result_tree_fine = K_fold_times(tree_fine, X, y, cv = cv,times=100)
post_process(result_tree_fine)

              precision    recall  f1-score   support

         dos       1.00      0.98      0.99     26547
      normal       0.97      0.99      0.98     38673
       probe       0.98      0.98      0.98      7039
         r2l       0.91      0.86      0.88      1940
         u2r       0.58      0.18      0.28        60

    accuracy                           0.98     74259
   macro avg       0.89      0.80      0.82     74259
weighted avg       0.98      0.98      0.98     74259



In [None]:
tree_med = DecisionTreeClassifier(max_leaf_nodes = 20)
cv = KFold(n_splits=10, shuffle=True)
result_tree_med = K_fold_times(tree_med, X, y, cv = cv,times=100)
post_process(result_tree_med)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         dos       0.98      0.97      0.97     26547
      normal       0.93      0.98      0.96     38673
       probe       0.91      0.91      0.91      7039
         r2l       0.81      0.30      0.43      1940
         u2r       0.00      0.00      0.00        60

    accuracy                           0.95     74259
   macro avg       0.73      0.63      0.65     74259
weighted avg       0.95      0.95      0.94     74259



  _warn_prf(average, modifier, msg_start, len(result))


[1]. Mahfouz, A. M., Venugopal, D., & Shiva, S. G. (2020). Comparative analysis of ML classifiers for network intrusion detection. In Fourth international congress on information and communication technology (pp. 193-207). Springer, Singapore.

[2]. Dhanabal, L., & Shantharajah, S. P. (2015). A study on NSL-KDD dataset for intrusion detection system based on classification algorithms. International journal of advanced research in computer and communication engineering, 4(6), 446-452.

[3]. Johnson, J. M., & Yadav, A. (2018). Fault detection and classification technique for HVDC transmission lines using KNN. In Information and communication technology for sustainable development (pp. 245-253). Springer, Singapore.