In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('../dataset/preprocessed_cat.csv')
data.head()

Unnamed: 0,Action,Bytes Sent,Bytes Received,Elapsed Time (sec),Destination Port_80,Destination Port_443,Destination Port_445,Destination Port_25174,Destination Port_Other,NAT Source Port_48817,NAT Source Port_50116,NAT Source Port_57596,NAT Source Port_58638,NAT Source Port_Other,NAT Destination Port_53,NAT Destination Port_80,NAT Destination Port_443,NAT Destination Port_27015,NAT Destination Port_Other
0,allow,94,83,30,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
1,allow,1600,3168,17,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
2,allow,118,120,1199,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
3,allow,1438,1889,17,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
4,allow,6778,18580,16,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0


In [3]:
Y = data['Action']
X = data.drop(['Action'], axis=1)
feature_names = X.columns.tolist()

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, stratify=Y, random_state=42)

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X_train[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']])
X_train[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']] = scaler.transform(X_train[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']])
X_test[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']] = scaler.transform(X_test[['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']])

In [6]:
X_train.describe()

Unnamed: 0,Bytes Sent,Bytes Received,Elapsed Time (sec),Destination Port_80,Destination Port_443,Destination Port_445,Destination Port_25174,Destination Port_Other,NAT Source Port_48817,NAT Source Port_50116,NAT Source Port_57596,NAT Source Port_58638,NAT Source Port_Other,NAT Destination Port_53,NAT Destination Port_80,NAT Destination Port_443,NAT Destination Port_27015,NAT Destination Port_Other
count,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0,49146.0
mean,0.000633,0.0004584566,0.008231,0.062121,0.177817,0.196781,0.016726,0.310951,0.001262,0.000224,8.1e-05,0.000875,0.563464,0.230599,0.061999,0.177756,0.003357,0.092195
std,0.010232,0.009720304,0.036569,0.241378,0.382363,0.397569,0.128243,0.462888,0.035496,0.014959,0.009021,0.029567,0.495961,0.42122,0.241156,0.382311,0.057846,0.289304
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,7e-06,6.38314e-07,0.001916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,3.6e-05,3.611726e-06,0.003833,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
from sklearn.metrics import classification_report
from termcolor import colored

def report(model, x, y, text = "training"):
    y_pred = model.predict(x)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report(y, y_pred, zero_division=True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred), columns=['allow', 'drop', 'deny', 'reset-both'], index=['allow', 'drop', 'deny', 'reset-both']))
    print("---------------------------------------------------------------------------------")

In [8]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(metric='manhattan')
knn.fit(X_train, Y_train)

In [9]:
report(knn, X_train, Y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00     28227
        deny       1.00      1.00      1.00     11240
        drop       1.00      1.00      1.00      9638
  reset-both       0.64      0.22      0.33        41

    accuracy                           1.00     49146
   macro avg       0.91      0.80      0.83     49146
weighted avg       1.00      1.00      1.00     49146

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
            allow   drop  deny  reset-both
allow       28207     19     0           1
drop           12  11191    33           4
deny            0      0  9638      

In [10]:
report(knn, X_test, Y_test)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      9410
        deny       1.00      1.00      1.00      3747
        drop       1.00      1.00      1.00      3213
  reset-both       0.50      0.23      0.32        13

    accuracy                           1.00     16383
   macro avg       0.87      0.81      0.83     16383
weighted avg       1.00      1.00      1.00     16383

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
            allow  drop  deny  reset-both
allow        9405     4     0           1
drop            7  3731     7           2
deny            0     0  3213          

In [11]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV

params_grid = {'n_neighbors': range(5, 50, 5),
               'weights': ['uniform', 'distance'],
                'p': [1, 2],
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
              }

estimator = GridSearchCV(KNeighborsClassifier(), params_grid, cv=3, verbose=4)

In [12]:
estimator.fit(X_train, Y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV 1/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.998 total time=   5.2s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.997 total time=   5.1s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=1, weights=uniform;, score=0.998 total time=   5.2s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.998 total time=   4.8s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.998 total time=   4.8s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=1, weights=distance;, score=0.998 total time=   4.9s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.998 total time=   1.5s
[CV 2/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.997 total time=   1.4s
[CV 3/3] END algorithm=auto, n_neighbors=5, p=2, weights=uniform;, score=0.998 total time=   1.5s
[CV 1/3] END algorithm=auto, n_neighbors=5, p=2, wei

[CV 2/3] END algorithm=auto, n_neighbors=35, p=2, weights=distance;, score=0.998 total time=   1.2s
[CV 3/3] END algorithm=auto, n_neighbors=35, p=2, weights=distance;, score=0.998 total time=   1.2s
[CV 1/3] END algorithm=auto, n_neighbors=40, p=1, weights=uniform;, score=0.996 total time=   5.6s
[CV 2/3] END algorithm=auto, n_neighbors=40, p=1, weights=uniform;, score=0.996 total time=   5.4s
[CV 3/3] END algorithm=auto, n_neighbors=40, p=1, weights=uniform;, score=0.996 total time=   5.4s
[CV 1/3] END algorithm=auto, n_neighbors=40, p=1, weights=distance;, score=0.998 total time=   7.5s
[CV 2/3] END algorithm=auto, n_neighbors=40, p=1, weights=distance;, score=0.998 total time=   7.9s
[CV 3/3] END algorithm=auto, n_neighbors=40, p=1, weights=distance;, score=0.998 total time=   7.8s
[CV 1/3] END algorithm=auto, n_neighbors=40, p=2, weights=uniform;, score=0.996 total time=   2.8s
[CV 2/3] END algorithm=auto, n_neighbors=40, p=2, weights=uniform;, score=0.996 total time=   2.6s
[CV 3

[CV 1/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=uniform;, score=0.997 total time=   4.9s
[CV 2/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=uniform;, score=0.997 total time=   4.7s
[CV 3/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=uniform;, score=0.997 total time=   5.4s
[CV 1/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=distance;, score=0.998 total time=   5.0s
[CV 2/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=distance;, score=0.998 total time=   4.6s
[CV 3/3] END algorithm=ball_tree, n_neighbors=25, p=2, weights=distance;, score=0.998 total time=   4.7s
[CV 1/3] END algorithm=ball_tree, n_neighbors=30, p=1, weights=uniform;, score=0.997 total time=   4.6s
[CV 2/3] END algorithm=ball_tree, n_neighbors=30, p=1, weights=uniform;, score=0.996 total time=   5.0s
[CV 3/3] END algorithm=ball_tree, n_neighbors=30, p=1, weights=uniform;, score=0.996 total time=   5.1s
[CV 1/3] END algorithm=ball_tree, n_neighbors=30, p=1, weight

[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform;, score=0.997 total time=   3.7s
[CV 3/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=uniform;, score=0.997 total time=   5.2s
[CV 1/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=distance;, score=0.998 total time=   3.3s
[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=distance;, score=0.998 total time=   3.5s
[CV 3/3] END algorithm=kd_tree, n_neighbors=15, p=1, weights=distance;, score=0.998 total time=   5.4s
[CV 1/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=uniform;, score=0.998 total time=   3.4s
[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=uniform;, score=0.997 total time=   3.6s
[CV 3/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=uniform;, score=0.997 total time=   5.0s
[CV 1/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=distance;, score=0.998 total time=   3.4s
[CV 2/3] END algorithm=kd_tree, n_neighbors=15, p=2, weights=distance;, score=

[CV 1/3] END algorithm=kd_tree, n_neighbors=45, p=2, weights=distance;, score=0.998 total time=   3.5s
[CV 2/3] END algorithm=kd_tree, n_neighbors=45, p=2, weights=distance;, score=0.998 total time=   3.8s
[CV 3/3] END algorithm=kd_tree, n_neighbors=45, p=2, weights=distance;, score=0.998 total time=   5.1s
[CV 1/3] END algorithm=brute, n_neighbors=5, p=1, weights=uniform;, score=0.998 total time=   3.6s
[CV 2/3] END algorithm=brute, n_neighbors=5, p=1, weights=uniform;, score=0.997 total time=   3.7s
[CV 3/3] END algorithm=brute, n_neighbors=5, p=1, weights=uniform;, score=0.998 total time=   3.8s
[CV 1/3] END algorithm=brute, n_neighbors=5, p=1, weights=distance;, score=0.998 total time=   3.5s
[CV 2/3] END algorithm=brute, n_neighbors=5, p=1, weights=distance;, score=0.998 total time=   3.5s
[CV 3/3] END algorithm=brute, n_neighbors=5, p=1, weights=distance;, score=0.998 total time=   3.7s
[CV 1/3] END algorithm=brute, n_neighbors=5, p=2, weights=uniform;, score=0.998 total time=   

[CV 2/3] END algorithm=brute, n_neighbors=35, p=2, weights=uniform;, score=0.996 total time=   1.0s
[CV 3/3] END algorithm=brute, n_neighbors=35, p=2, weights=uniform;, score=0.996 total time=   1.0s
[CV 1/3] END algorithm=brute, n_neighbors=35, p=2, weights=distance;, score=0.998 total time=   0.7s
[CV 2/3] END algorithm=brute, n_neighbors=35, p=2, weights=distance;, score=0.998 total time=   0.8s
[CV 3/3] END algorithm=brute, n_neighbors=35, p=2, weights=distance;, score=0.998 total time=   0.8s
[CV 1/3] END algorithm=brute, n_neighbors=40, p=1, weights=uniform;, score=0.996 total time=   4.4s
[CV 2/3] END algorithm=brute, n_neighbors=40, p=1, weights=uniform;, score=0.996 total time=   4.4s
[CV 3/3] END algorithm=brute, n_neighbors=40, p=1, weights=uniform;, score=0.996 total time=   4.3s
[CV 1/3] END algorithm=brute, n_neighbors=40, p=1, weights=distance;, score=0.998 total time=   4.0s
[CV 2/3] END algorithm=brute, n_neighbors=40, p=1, weights=distance;, score=0.998 total time=   

In [13]:
print(estimator.best_params_, '\n')
print(estimator.best_score_)

{'algorithm': 'auto', 'n_neighbors': 25, 'p': 1, 'weights': 'distance'} 

0.9980873316241402


In [14]:
report(estimator.best_estimator_, X_train, Y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00     28227
        deny       1.00      1.00      1.00     11240
        drop       1.00      1.00      1.00      9638
  reset-both       1.00      0.22      0.36        41

    accuracy                           1.00     49146
   macro avg       1.00      0.80      0.84     49146
weighted avg       1.00      1.00      1.00     49146

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
            allow   drop  deny  reset-both
allow       28219      8     0           0
drop            7  11201    32           0
deny            0      0  9638      

In [15]:
report(estimator.best_estimator_, X_test, Y_test, "test")

[32mClassification report for model KNeighborsClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      9410
        deny       1.00      1.00      1.00      3747
        drop       1.00      1.00      1.00      3213
  reset-both       0.75      0.23      0.35        13

    accuracy                           1.00     16383
   macro avg       0.94      0.81      0.84     16383
weighted avg       1.00      1.00      1.00     16383

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on test data [0m
---------------------------------------------------------------------------------
            allow  drop  deny  reset-both
allow        9405     4     0           1
drop            6  3734     7           0
deny            0     0  3213           0
reset

In [16]:
from sklearn.ensemble import BaggingClassifier

baggingKnn = BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=50)
baggingKnn.fit(X_train, Y_train)

In [17]:
report(baggingKnn, X_train, Y_train)

[32mClassification report for model BaggingClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00     28227
        deny       1.00      1.00      1.00     11240
        drop       1.00      1.00      1.00      9638
  reset-both       0.75      0.22      0.34        41

    accuracy                           1.00     49146
   macro avg       0.94      0.80      0.83     49146
weighted avg       1.00      1.00      1.00     49146

---------------------------------------------------------------------------------
[32mConfusion matrix for model BaggingClassifier on training data [0m
---------------------------------------------------------------------------------
            allow   drop  deny  reset-both
allow       28207     19     0           1
drop           13  11192    33           2
deny            0      0  9638           0

In [18]:
report(baggingKnn, X_test, Y_test, "test")

[32mClassification report for model BaggingClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

       allow       1.00      1.00      1.00      9410
        deny       1.00      1.00      1.00      3747
        drop       1.00      1.00      1.00      3213
  reset-both       1.00      0.23      0.38        13

    accuracy                           1.00     16383
   macro avg       1.00      0.81      0.84     16383
weighted avg       1.00      1.00      1.00     16383

---------------------------------------------------------------------------------
[32mConfusion matrix for model BaggingClassifier on test data [0m
---------------------------------------------------------------------------------
            allow  drop  deny  reset-both
allow        9405     5     0           0
drop            7  3733     7           0
deny            0     0  3213           0
reset-both 