In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

KeyboardInterrupt: 

In [None]:
# Read the CSV at ../dataset/preprocessed.csv (path is relative to the kernel's
# working directory) and preview its first rows.
data = pd.read_csv(filepath_or_buffer='../dataset/preprocessed.csv')
data.head(n=5)

In [None]:
# 'Action' is the prediction target; every other column is used as a feature.
X = data.drop(columns=['Action'])
Y = data['Action']
feature_names = list(X.columns)

In [None]:
# Hold out 25% of the rows for testing, stratified on Y, with a fixed seed so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
    stratify=Y,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Port-number columns that are min-max scaled together.
port_columns = ['Source Port', 'Destination Port', 'NAT Source Port', 'NAT Destination Port']

# The frames returned by train_test_split may be tracked by pandas as slices of X;
# take explicit copies so the column assignments below write to independent frames
# instead of triggering SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then apply the same scaling to the test split
# so no information from the test rows leaks into the scaler.
scaler = MinMaxScaler()
X_train[port_columns] = scaler.fit_transform(X_train[port_columns])
X_test[port_columns] = scaler.transform(X_test[port_columns])

In [None]:
# Min-max scale the elapsed-time and byte-count columns: fit on the training split,
# then transform both splits with the fitted scaler.
# Note: this rebinds `scaler`, so any scaler previously held under that name is
# no longer reachable afterwards.
traffic_columns = ['Elapsed Time (sec)', 'Bytes Sent', 'Bytes Received']

scaler = MinMaxScaler().fit(X_train[traffic_columns])
X_train[traffic_columns] = scaler.transform(X_train[traffic_columns])
X_test[traffic_columns] = scaler.transform(X_test[traffic_columns])

In [None]:
# Summary statistics of the training features at this point in the notebook
# (after the scaling cells above have run).
X_train.describe()

In [None]:
from sklearn.metrics import classification_report
from termcolor import colored

def report(model, x, y, text = "training", class_names=None):
    """Print a classification report and a labelled confusion matrix for ``model``.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``; its ``classes_`` attribute, when
        present, defines the row/column order of the confusion matrix.
    x
        Feature data passed to ``model.predict``.
    y
        True labels corresponding to ``x``.
    text : str, default "training"
        Name of the data split; used only in the printed headings.
    class_names : sequence, optional
        Display names for the confusion-matrix rows/columns, given in the same
        order as the class labels. Defaults to the class labels themselves, so
        rows and columns can never be labelled with the wrong class.

    Raises
    ------
    ValueError
        If ``class_names`` is given but its length differs from the number of classes.
    """
    y_pred = model.predict(x)
    model_name = type(model).__name__
    separator = "---------------------------------------------------------------------------------"

    # confusion_matrix orders rows/columns by sorted label value unless `labels` is
    # passed, so the labels are resolved once here and reused for both the matrix
    # and its index. A hard-coded name list can silently mislabel rows (e.g. 'deny'
    # sorts before 'drop') or mismatch the matrix shape when a class is absent.
    labels = getattr(model, "classes_", None)
    if labels is None:
        labels = sorted(set(y).union(y_pred))
    labels = list(labels)

    if class_names is None:
        class_names = labels
    elif len(class_names) != len(labels):
        raise ValueError(
            "class_names has {} entries but there are {} classes: {}".format(
                len(class_names), len(labels), labels
            )
        )

    print(colored("Classification report for model {} on {} data".format(model_name, text), "green"))
    print(separator)
    # zero_division=True is presumably treated like 1 by scikit-learn (undefined
    # metrics reported as 1.0 without a warning) - confirm against the installed version.
    print(classification_report(y, y_pred, zero_division=True))
    print(separator)

    print(colored("Confusion matrix for model {} on {} data ".format(model_name, text), "green"))
    print(separator)
    matrix = confusion_matrix(y, y_pred, labels=labels)
    print(pd.DataFrame(matrix, columns=list(class_names), index=list(class_names)))
    print(separator)

In [None]:
from sklearn.neighbors import KNeighborsClassifier 
# Baseline k-nearest-neighbours classifier built with the constructor's default
# settings and fitted on the training split.
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=Y_train)

In [None]:
# Baseline KNN evaluated on the data it was fitted on.
report(model=knn, x=X_train, y=Y_train, text="training")

In [None]:
# Baseline KNN evaluated on the held-out split.
report(model=knn, x=X_test, y=Y_test, text="test")

In [None]:
# GridSearchCV

from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for KNN: 9 neighbour counts (5, 10, ..., 45) x 2 weightings
# x 2 values of p x 4 search algorithms = 144 candidate settings.
params_grid = dict(
    n_neighbors=range(5, 50, 5),
    weights=['uniform', 'distance'],
    p=[1, 2],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Exhaustive search over the grid with 3-fold cross-validation; verbose=4 makes
# the search print progress while it runs.
estimator = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params_grid,
    cv=3,
    verbose=4,
)

In [None]:
# Run the grid search on the training split. With 144 candidate settings and
# cv=3 this cross-validates 432 fits, so expect this cell to be slow.
estimator.fit(X=X_train, y=Y_train)

In [None]:
# Show the winning hyperparameters, a blank line, then the cross-validated score
# they achieved.
best_params = estimator.best_params_
best_score = estimator.best_score_
print(f"{best_params} \n")
print(best_score)

In [None]:
# Best estimator found by the grid search, evaluated on the training split.
report(model=estimator.best_estimator_, x=X_train, y=Y_train, text="training")

In [None]:
# Best estimator found by the grid search, evaluated on the held-out split.
report(model=estimator.best_estimator_, x=X_test, y=Y_test, text="test")

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 50 KNN classifiers (each with default settings), fitted on
# the training split. Bagging resamples the training data randomly, so the seed is
# fixed - using the same value as the train/test split - to make the reported
# metrics reproducible across kernel restarts.
baggingKnn = BaggingClassifier(
    estimator=KNeighborsClassifier(),
    n_estimators=50,
    random_state=42,
)
baggingKnn.fit(X_train, Y_train)

In [None]:
# Bagged KNN ensemble evaluated on the training split.
report(model=baggingKnn, x=X_train, y=Y_train, text="training")

In [None]:
# Bagged KNN ensemble evaluated on the held-out split.
report(model=baggingKnn, x=X_test, y=Y_test, text="test")