In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('../dataset/preprocessed.csv')
data.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes Sent,Bytes Received,Elapsed Time (sec)
0,57222,53,54587,53,allow,94,83,30
1,56258,3389,56258,3389,allow,1600,3168,17
2,6881,50321,43265,50321,allow,118,120,1199
3,50553,3389,50553,3389,allow,1438,1889,17
4,50002,443,45848,443,allow,6778,18580,16


In [3]:
Y = data['Action']
X = data.drop(['Action'], axis=1)
feature_names = X.columns.tolist()

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=42)

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train[['Source Port', 'Destination Port', 'NAT Source Port', 'NAT Destination Port']])
X_train[['Source Port', 'Destination Port', 'NAT Source Port', 'NAT Destination Port']] = scaler.transform(X_train[['Source Port', 'Destination Port', 'NAT Source Port', 'NAT Destination Port']])
X_test[['Source Port', 'Destination Port', 'NAT Source Port', 'NAT Destination Port']] = scaler.transform(X_test[['Source Port', 'Destination Port', 'NAT Source Port', 'NAT Destination Port']])

In [6]:
scaler = MinMaxScaler()
scaler.fit(X_train[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']])
X_train[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']] = scaler.transform(X_train[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']])
X_test[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']] = scaler.transform(X_test[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']])

In [7]:
X_train.describe()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Bytes Sent,Bytes Received,Elapsed Time (sec)
count,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0
mean,0.753108,0.161336,0.294259,0.04086,0.000633,0.0004584566,0.008231
std,0.233894,0.281651,0.335535,0.14924,0.010232,0.009720304,0.036569
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.750431,0.001221,0.0,0.0,1e-06,0.0,0.0
50%,0.820724,0.00679,0.133679,0.000809,7e-06,6.38314e-07,0.001916
75%,0.894786,0.228889,0.585204,0.00676,3.6e-05,3.611726e-06,0.003833
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
from sklearn.metrics import classification_report
from termcolor import colored

def report(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred), columns=['allow', 'drop', 'deny', 'reset-both'], index=['allow', 'drop', 'deny', 'reset-both']))
    print("---------------------------------------------------------------------------------")

In [9]:
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

In [10]:
report(knn, X_train, Y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00     28227
        deny       0.99      0.99      0.99     11240
        drop       1.00      1.00      1.00      9638
  reset-both       1.00      0.05      0.09        41

    accuracy                           1.00     49146
   macro avg       1.00      0.76      0.77     49146
weighted avg       1.00      1.00      1.00     49146

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
            allow   drop  deny  reset-both
allow       28135     92     0           0
drop           20  11183    37           0
deny            0      3  9635      

In [11]:
report(knn, X_test, Y_test, "test")

[32mClassification report for model KNeighborsClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      9410
        deny       0.99      0.99      0.99      3747
        drop       1.00      1.00      1.00      3213
  reset-both       1.00      0.00      0.00        13

    accuracy                           1.00     16383
   macro avg       1.00      0.75      0.75     16383
weighted avg       1.00      1.00      1.00     16383

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on test data [0m
---------------------------------------------------------------------------------
            allow  drop  deny  reset-both
allow        9380    29     1           0
drop           12  3726     9           0
deny            0     0  3213           0
reset

In [12]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV

params_grid = {'n_neighbors': range(5, 50, 5),
               'weights': ['uniform', 'distance'],
                'p': [1, 2],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
              }

estimator = GridSearchCV(KNeighborsClassifier(), params_grid, cv=3, verbose=4)

In [13]:
estimator.fit(X_train, Y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.994 total time=   0.8s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.994 total time=   0.7s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.995 total time=   0.7s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.995 total time=   0.5s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.996 total time=   0.4s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.996 total time=   0.4s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.993 total time=   0.7s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.994 total time=   0.6s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.995 total time=   0.6s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=2, wei

[CV 2/3] END algorithm=auto, n_neighbors=35, p=2, weights=distance;, score=0.992 total time=   0.7s
[CV 3/3] END algorithm=auto, n_neighbors=35, p=2, weights=distance;, score=0.992 total time=   0.7s
[CV 1/3] END algorithm=auto, n_neighbors=40, p=1, weights=uniform;, score=0.984 total time=   1.1s
[CV 2/3] END algorithm=auto, n_neighbors=40, p=1, weights=uniform;, score=0.986 total time=   0.9s
[CV 3/3] END algorithm=auto, n_neighbors=40, p=1, weights=uniform;, score=0.986 total time=   1.0s
[CV 1/3] END algorithm=auto, n_neighbors=40, p=1, weights=distance;, score=0.989 total time=   0.8s
[CV 2/3] END algorithm=auto, n_neighbors=40, p=1, weights=distance;, score=0.992 total time=   0.7s
[CV 3/3] END algorithm=auto, n_neighbors=40, p=1, weights=distance;, score=0.992 total time=   0.8s
[CV 1/3] END algorithm=auto, n_neighbors=40, p=2, weights=uniform;, score=0.984 total time=   1.0s
[CV 2/3] END algorithm=auto, n_neighbors=40, p=2, weights=uniform;, score=0.985 total time=   1.0s
[CV 3

[CV 1/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=uniform;, score=0.985 total time=   3.8s
[CV 2/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=uniform;, score=0.988 total time=   3.0s
[CV 3/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=uniform;, score=0.988 total time=   3.2s
[CV 1/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=distance;, score=0.991 total time=   3.1s
[CV 2/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=distance;, score=0.993 total time=   2.7s
[CV 3/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=distance;, score=0.993 total time=   2.5s
[CV 1/3] END algorithm=ball_tree, n_neighbors=30, p=1, weights=uniform;, score=0.985 total time=   4.0s
[CV 2/3] END algorithm=ball_tree, n_neighbors=30, p=1, weights=uniform;, score=0.987 total time=   2.6s
[CV 3/3] END algorithm=ball_tree, n_neighbors=30, p=1, weights=uniform;, score=0.988 total time=   1.9s
[CV 1/3] END algorithm=ball_tree, n_neighbors=30, p=1, weight

[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform;, score=0.991 total time=   2.1s
[CV 3/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform;, score=0.992 total time=   1.6s
[CV 1/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=distance;, score=0.992 total time=   1.1s
[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=distance;, score=0.995 total time=   1.0s
[CV 3/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=distance;, score=0.995 total time=   1.2s
[CV 1/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=uniform;, score=0.989 total time=   1.9s
[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=uniform;, score=0.991 total time=   1.8s
[CV 3/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=uniform;, score=0.992 total time=   1.8s
[CV 1/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=distance;, score=0.993 total time=   1.4s
[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=distance;, score=

[CV 1/3] END algorithm=kd_tree, n_neighbors=45, p=2, weights=distance;, score=0.988 total time=   1.9s
[CV 2/3] END algorithm=kd_tree, n_neighbors=45, p=2, weights=distance;, score=0.991 total time=   1.3s
[CV 3/3] END algorithm=kd_tree, n_neighbors=45, p=2, weights=distance;, score=0.991 total time=   1.1s
[CV 1/3] END algorithm=brute, n_neighbors=5, p=1, weights=uniform;, score=0.994 total time=   6.2s
[CV 2/3] END algorithm=brute, n_neighbors=5, p=1, weights=uniform;, score=0.994 total time=   5.6s
[CV 3/3] END algorithm=brute, n_neighbors=5, p=1, weights=uniform;, score=0.995 total time=   5.4s
[CV 1/3] END algorithm=brute, n_neighbors=5, p=1, weights=distance;, score=0.995 total time=   4.9s
[CV 2/3] END algorithm=brute, n_neighbors=5, p=1, weights=distance;, score=0.996 total time=   3.7s
[CV 3/3] END algorithm=brute, n_neighbors=5, p=1, weights=distance;, score=0.996 total time=   4.0s
[CV 1/3] END algorithm=brute, n_neighbors=5, p=2, weights=uniform;, score=0.993 total time=   

[CV 2/3] END algorithm=brute, n_neighbors=35, p=2, weights=uniform;, score=0.986 total time=   1.3s
[CV 3/3] END algorithm=brute, n_neighbors=35, p=2, weights=uniform;, score=0.987 total time=   1.3s
[CV 1/3] END algorithm=brute, n_neighbors=35, p=2, weights=distance;, score=0.989 total time=   1.0s
[CV 2/3] END algorithm=brute, n_neighbors=35, p=2, weights=distance;, score=0.992 total time=   1.0s
[CV 3/3] END algorithm=brute, n_neighbors=35, p=2, weights=distance;, score=0.992 total time=   1.0s
[CV 1/3] END algorithm=brute, n_neighbors=40, p=1, weights=uniform;, score=0.984 total time=   3.0s
[CV 2/3] END algorithm=brute, n_neighbors=40, p=1, weights=uniform;, score=0.986 total time=   3.0s
[CV 3/3] END algorithm=brute, n_neighbors=40, p=1, weights=uniform;, score=0.986 total time=   3.0s
[CV 1/3] END algorithm=brute, n_neighbors=40, p=1, weights=distance;, score=0.989 total time=   2.7s
[CV 2/3] END algorithm=brute, n_neighbors=40, p=1, weights=distance;, score=0.992 total time=   

In [14]:
print(estimator.best_params_, '\n')
print(estimator.best_score_)

{'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'} 

0.9956252797786188


In [15]:
report(estimator.best_estimator_, X_train, Y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00     28227
        deny       1.00      1.00      1.00     11240
        drop       1.00      1.00      1.00      9638
  reset-both       1.00      1.00      1.00        41

    accuracy                           1.00     49146
   macro avg       1.00      1.00      1.00     49146
weighted avg       1.00      1.00      1.00     49146

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
            allow   drop  deny  reset-both
allow       28220      7     0           0
drop            0  11240     0           0
deny            0      6  9632      

In [16]:
report(estimator.best_estimator_, X_test, Y_test, "test")

[32mClassification report for model KNeighborsClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      9410
        deny       0.99      1.00      0.99      3747
        drop       1.00      1.00      1.00      3213
  reset-both       0.50      0.08      0.13        13

    accuracy                           1.00     16383
   macro avg       0.87      0.77      0.78     16383
weighted avg       1.00      1.00      1.00     16383

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on test data [0m
---------------------------------------------------------------------------------
            allow  drop  deny  reset-both
allow        9386    22     1           1
drop            4  3733    10           0
deny            0     3  3210           0
reset

In [17]:
from sklearn.ensemble import BaggingClassifier

baggingKnn = BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=50)
baggingKnn.fit(X_train, Y_train)

In [18]:
report(baggingKnn, X_train, Y_train)

[32mClassification report for model BaggingClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00     28227
        deny       0.99      1.00      0.99     11240
        drop       1.00      1.00      1.00      9638
  reset-both       1.00      0.05      0.09        41

    accuracy                           1.00     49146
   macro avg       1.00      0.76      0.77     49146
weighted avg       1.00      1.00      1.00     49146

---------------------------------------------------------------------------------
[32mConfusion matrix for model BaggingClassifier on training data [0m
---------------------------------------------------------------------------------
            allow   drop  deny  reset-both
allow       28137     90     0           0
drop           17  11186    37           0
deny            0      1  9637           0

In [19]:
report(baggingKnn, X_test, Y_test, "test")

[32mClassification report for model BaggingClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      9410
        deny       0.99      0.99      0.99      3747
        drop       1.00      1.00      1.00      3213
  reset-both       1.00      0.00      0.00        13

    accuracy                           1.00     16383
   macro avg       1.00      0.75      0.75     16383
weighted avg       1.00      1.00      1.00     16383

---------------------------------------------------------------------------------
[32mConfusion matrix for model BaggingClassifier on test data [0m
---------------------------------------------------------------------------------
            allow  drop  deny  reset-both
allow        9378    31     1           0
drop           12  3725    10           0
deny            0     0  3213           0
reset-both 