In [2]:
import mysql.connector
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.ensemble import BalancedRandomForestClassifier
import pickle
import matplotlib.pyplot as plt 
import seaborn 
from tqdm import tqdm
import warnings 

In [3]:
con = mysql.connector.connect(
    host = "localhost",
    user = "root",
    passwd = "G5thesis",
    database = "dataset_NF"
)
cursor = con.cursor()
q1 = "DESCRIBE Encoded_L7_DstPrt_Cleaned_NF_dataset"
cursor.execute(q1)
colnames = [row[0] for row in cursor.fetchall()]

## Binary Classifiers Using Balanced Random Forest

In [4]:
# Extract all the data
labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
data = []
for i in labels:
    cursor.execute("""SELECT * FROM Encoded_L7_DstPrt_Cleaned_NF_dataset WHERE Attack = %s ORDER BY RAND() """,(i,))
    rows = cursor.fetchall()
    data.extend(rows)

In [5]:
df = pd.DataFrame(data = data, columns = colnames)
df.head()

Unnamed: 0,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,SERVER_TCP_FLAGS,FLOW_DURATION_MILLISECONDS,DURATION_IN,DURATION_OUT,...,L4_DST_PORT_3306,L4_DST_PORT_8080,L4_DST_PORT_31808,L4_DST_PORT_OTHERS,L7_PROTO_7,L7_PROTO_79,L7_PROTO_92,L7_PROTO_131,L7_PROTO_178,L7_PROTO_OTHERS
0,2191,27,35433,35,219,219,27,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1719,18,7330,15,27,27,27,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1689,11,4727,11,30,30,26,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,1560,10,2092,11,222,222,90,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,2041,21,5533,18,27,27,27,4294951,16,16,...,0,0,0,0,0,0,0,0,0,1


In [6]:
# Benign Case (benign is 1 while others are 0)
warnings.filterwarnings('ignore') 
base_models = []
df_0 = df.copy()
df_0['Attack'][df_0['Attack'] != 0] = -1
df_0.replace({'Attack': 0}, {'Attack': 1}, inplace=True)
df_0.replace({'Attack': -1}, {'Attack': 0}, inplace=True)
X_0 = df_0.drop(columns=['Attack'])
Y_0 = df_0[['Attack']]
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_0, Y_0, test_size=0.2, random_state=30)
base_model = BalancedRandomForestClassifier(n_estimators=100, random_state=30, class_weight='balanced')
base_model.fit(X_train_0 , y_train_0 )
base_models.append(('M_Model_' + '0', base_model))

y_pred_copy = base_model.predict(X_test_0)
results_copy = classification_report(y_test_0,y_pred_copy,output_dict = True)
df_res = pd.DataFrame(results_copy).transpose()
print(0)
print(df_res)
print("\n")

# Storing the model
with open('M_Model_' + str(0) + '.pkl', 'wb') as file:  
    pickle.dump(base_model, file)

0
              precision    recall  f1-score        support
0              0.941114  0.969377  0.955036  103092.000000
1              0.990733  0.981811  0.986252  343770.000000
accuracy       0.978942  0.978942  0.978942       0.978942
macro avg      0.965923  0.975594  0.970644  446862.000000
weighted avg   0.979286  0.978942  0.979050  446862.000000




In [7]:
df_copy = df.copy()

In [8]:
warnings.filterwarnings('ignore') 
for i in tqdm(range(1, 15)):
    df_copy.truncate(before=-1, after=-1)
    df_copy = df.copy()
    df_copy['Attack'][df_copy['Attack'] != i] = 0
    df_copy.replace({'Attack': i}, {'Attack': 1}, inplace=True)
    X_copy = df_copy.drop(columns=['Attack'])
    Y_copy = df_copy[['Attack']]
    X_train_copy, X_test_copy, y_train_copy, y_test_copy = train_test_split(X_copy, Y_copy, test_size=0.2, random_state=30)
    base_model = BalancedRandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    base_model.fit(X_train_copy , y_train_copy )
    y_pred_copy = base_model.predict(X_test_copy)
    base_models.append(('M_Model_' + str(i), base_model))
    results_copy = classification_report(y_test_copy,y_pred_copy,output_dict = True)
    df_res = pd.DataFrame(results_copy).transpose()
    print('Model '+str(i))
    print(df_res)
    # Storing the model
    with open('M_Model_' + str(i) + '.pkl', 'wb') as file:  
        pickle.dump(base_model, file)

  7%|███▏                                        | 1/14 [00:28<06:06, 28.23s/it]

Model 1
              precision    recall  f1-score        support
0              1.000000  0.999993  0.999997  446738.000000
1              0.976378  1.000000  0.988048     124.000000
accuracy       0.999993  0.999993  0.999993       0.999993
macro avg      0.988189  0.999997  0.994022  446862.000000
weighted avg   0.999993  0.999993  0.999993  446862.000000


 14%|██████▎                                     | 2/14 [00:56<05:40, 28.39s/it]

Model 2
              precision    recall  f1-score        support
0              0.999995  0.959060  0.979100  446553.000000
1              0.016515  0.993528  0.032490     309.000000
accuracy       0.959084  0.959084  0.959084       0.959084
macro avg      0.508255  0.976294  0.505795  446862.000000
weighted avg   0.999315  0.959084  0.978445  446862.000000


 21%|█████████▍                                  | 3/14 [01:25<05:13, 28.55s/it]

Model 3
              precision    recall  f1-score        support
0              0.999995  0.958536  0.978827  446697.000000
1              0.008724  0.987879  0.017294     165.000000
accuracy       0.958546  0.958546  0.958546       0.958546
macro avg      0.504359  0.973207  0.498061  446862.000000
weighted avg   0.999629  0.958546  0.978472  446862.000000


 29%|████████████▌                               | 4/14 [02:03<05:24, 32.43s/it]

Model 4
              precision    recall  f1-score        support
0              1.000000  0.998867  0.999433  427138.000000
1              0.976049  1.000000  0.987879   19724.000000
accuracy       0.998917  0.998917  0.998917       0.998917
macro avg      0.988025  0.999433  0.993656  446862.000000
weighted avg   0.998943  0.998917  0.998923  446862.000000


 36%|███████████████▋                            | 5/14 [02:31<04:37, 30.84s/it]

Model 5
              precision   recall  f1-score       support
0              1.000000  0.99972  0.999860  446463.00000
1              0.761450  1.00000  0.864572     399.00000
accuracy       0.999720  0.99972  0.999720       0.99972
macro avg      0.880725  0.99986  0.932216  446862.00000
weighted avg   0.999787  0.99972  0.999739  446862.00000


 43%|██████████████████▊                         | 6/14 [03:01<04:04, 30.53s/it]

Model 6
              precision    recall  f1-score        support
0              1.000000  0.999937  0.999968  443439.000000
1              0.991886  1.000000  0.995927    3423.000000
accuracy       0.999937  0.999937  0.999937       0.999937
macro avg      0.995943  0.999968  0.997948  446862.000000
weighted avg   0.999938  0.999937  0.999937  446862.000000


 50%|██████████████████████                      | 7/14 [03:32<03:34, 30.64s/it]

Model 7
              precision    recall  f1-score        support
0              1.000000  0.999964  0.999982  441371.000000
1              0.997095  1.000000  0.998545    5491.000000
accuracy       0.999964  0.999964  0.999964       0.999964
macro avg      0.998547  0.999982  0.999264  446862.000000
weighted avg   0.999964  0.999964  0.999964  446862.000000
Model 8
              precision    recall  f1-score        support
0              0.999997  0.999997  0.999997  381964.000000
1              0.999985  0.999985  0.999985   64898.000000
accuracy       0.999996  0.999996  0.999996       0.999996
macro avg      0.999991  0.999991  0.999991  446862.000000
weighted avg   0.999996  0.999996  0.999996  446862.000000


 64%|████████████████████████████▎               | 9/14 [05:00<03:00, 36.15s/it]

Model 9
              precision    recall  f1-score        support
0              1.000000  0.997908  0.998953  446852.000000
1              0.010582  1.000000  0.020942      10.000000
accuracy       0.997908  0.997908  0.997908       0.997908
macro avg      0.505291  0.998954  0.509948  446862.000000
weighted avg   0.999978  0.997908  0.998931  446862.000000


 71%|██████████████████████████████▋            | 10/14 [05:28<02:14, 33.59s/it]

Model 10
              precision    recall  f1-score        support
0              1.000000  0.999705  0.999852  446815.000000
1              0.262570  1.000000  0.415929      47.000000
accuracy       0.999705  0.999705  0.999705       0.999705
macro avg      0.631285  0.999852  0.707891  446862.000000
weighted avg   0.999922  0.999705  0.999791  446862.000000


 79%|█████████████████████████████████▊         | 11/14 [05:55<01:35, 31.76s/it]

Model 11
              precision    recall  f1-score        support
0              1.000000  0.999731  0.999866  446854.000000
1              0.062500  1.000000  0.117647       8.000000
accuracy       0.999731  0.999731  0.999731       0.999731
macro avg      0.531250  0.999866  0.558756  446862.000000
weighted avg   0.999983  0.999731  0.999850  446862.000000


 86%|████████████████████████████████████▊      | 12/14 [06:33<01:07, 33.74s/it]

Model 12
              precision    recall  f1-score        support
0              0.998321  0.867582  0.928371  441413.000000
1              0.075961  0.881813  0.139873    5449.000000
accuracy       0.867756  0.867756  0.867756       0.867756
macro avg      0.537141  0.874698  0.534122  446862.000000
weighted avg   0.987074  0.867756  0.918756  446862.000000


 93%|███████████████████████████████████████▉   | 13/14 [07:02<00:32, 32.12s/it]

Model 13
              precision    recall  f1-score       support
0              1.000000  0.965535  0.982465  446799.00000
1              0.004075  1.000000  0.008116      63.00000
accuracy       0.965540  0.965540  0.965540       0.96554
macro avg      0.502037  0.982767  0.495291  446862.00000
weighted avg   0.999860  0.965540  0.982328  446862.00000


100%|███████████████████████████████████████████| 14/14 [07:31<00:00, 32.28s/it]

Model 14
              precision    recall  f1-score        support
0              1.000000  0.999993  0.999997  443880.000000
1              0.998995  1.000000  0.999497    2982.000000
accuracy       0.999993  0.999993  0.999993       0.999993
macro avg      0.999497  0.999997  0.999747  446862.000000
weighted avg   0.999993  0.999993  0.999993  446862.000000



