In [1]:
import pandas as pd
import os

dataset_path = '../../Datasets/CIC-DDOS2019/Kaggle Preprocessed/'
dataframes = []
file_list = os.listdir(dataset_path)
for file_name in file_list:
    if file_name.endswith('.parquet'):
        file_path = os.path.join(dataset_path, file_name)
        df = pd.read_parquet(file_path)
        dataframes.append(df)

In [2]:
# Concatenate dataset files and correct labels
df = pd.concat(dataframes)
del dataframes
df['Label'] = df['Label'].str.replace('DrDoS_DNS', 'DNS')
df['Label'] = df['Label'].str.replace('DrDoS_LDAP', 'LDAP')
df['Label'] = df['Label'].str.replace('DrDoS_MSSQL', 'MSSQL')
df['Label'] = df['Label'].str.replace('DrDoS_NTP', 'NTP')
df['Label'] = df['Label'].str.replace('DrDoS_NetBIOS', 'NetBIOS')
df['Label'] = df['Label'].str.replace('DrDoS_SNMP', 'SNMP')
df['Label'] = df['Label'].str.replace('DrDoS-SSDP', 'SSDP')
df['Label'] = df['Label'].str.replace('DrDoS_UDP', 'UDP')
df['Label'] = df['Label'].str.replace('UDPLag', 'UDP-lag')

In [3]:
# Remove classes that have few samples in the graphs approach for conformity
df = df[df['Label'] != 'DNS']
df = df[df['Label'] != 'LDAP']
df = df[df['Label'] != 'MSSQL']
df = df[df['Label'] != 'NTP']
df = df[df['Label'] != 'NetBIOS']
df = df[df['Label'] != 'SNMP']
df = df[df['Label'] != 'SSDP']
df = df[df['Label'] != 'Portmap']
df = df[df['Label'] != 'WebDDoS']

In [None]:
# Set all attack labels to Malicious class
df.loc[df['Label'] != 'Benign', 'Label'] = 'Malicious' # type: ignore

In [4]:
data = df[df.columns[:-1]] # type: ignore
categories = df['Label'].astype('category')

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
data = scaler.fit_transform(data)

In [6]:
# 10-fold data (for 10-fold cross validation), 
from sklearn.model_selection import train_test_split
from run_models.run_models import run_models
from imblearn.over_sampling import SMOTE
data_train, data_test, categories_train, categories_test = train_test_split(
    data,
    categories,
    random_state=100,
    test_size=0.1,
    shuffle=True,
    stratify=categories,
)
data_train, data_dev, categories_train, categories_dev = train_test_split(
    data_train,
    categories_train,
    random_state=100,
    test_size=0.1 / (1 - 0.1),
    shuffle=True,
    stratify=categories_train,
)
# Oversample infrequent data using SMOTE
ros = SMOTE(random_state=101)
data_train, categories_train = ros.fit_resample(data_train, categories_train) # type: ignore
results = run_models(data_train, categories_train, data_dev, categories_dev, data_test, categories_test)
print(results)

2023-10-04 21:15:17.960320: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-04 21:15:17.960354: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-04 21:15:17.960382: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Fold 1/5

Running MLP Classifier
Epoch 1/20


2023-10-04 21:15:19.574371: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


8628/8628 - 17s - loss: 0.0231 - binary_crossentropy: 0.0231 - categorical_accuracy: 0.9937 - val_loss: 0.0084 - val_binary_crossentropy: 0.0084 - val_categorical_accuracy: 0.9984 - 17s/epoch - 2ms/step
Epoch 2/20
8628/8628 - 16s - loss: 0.0134 - binary_crossentropy: 0.0134 - categorical_accuracy: 0.9962 - val_loss: 0.0054 - val_binary_crossentropy: 0.0054 - val_categorical_accuracy: 0.9988 - 16s/epoch - 2ms/step
Epoch 3/20
8628/8628 - 17s - loss: 0.0121 - binary_crossentropy: 0.0121 - categorical_accuracy: 0.9964 - val_loss: 0.0045 - val_binary_crossentropy: 0.0045 - val_categorical_accuracy: 0.9989 - 17s/epoch - 2ms/step
Epoch 4/20
8628/8628 - 16s - loss: 0.0112 - binary_crossentropy: 0.0112 - categorical_accuracy: 0.9968 - val_loss: 0.0058 - val_binary_crossentropy: 0.0058 - val_categorical_accuracy: 0.9988 - 16s/epoch - 2ms/step
Epoch 5/20
8628/8628 - 16s - loss: 0.0105 - binary_crossentropy: 0.0105 - categorical_accuracy: 0.9970 - val_loss: 0.0064 - val_binary_crossentropy: 0.0064

Epoch 2/10
8628/8628 - 43s - loss: 0.0132 - binary_crossentropy: 0.0132 - categorical_accuracy: 0.9962 - val_loss: 0.0048 - val_binary_crossentropy: 0.0048 - val_categorical_accuracy: 0.9987 - 43s/epoch - 5ms/step
Epoch 3/10
8628/8628 - 43s - loss: 0.0111 - binary_crossentropy: 0.0111 - categorical_accuracy: 0.9966 - val_loss: 0.0055 - val_binary_crossentropy: 0.0055 - val_categorical_accuracy: 0.9991 - 43s/epoch - 5ms/step
Epoch 4/10
8628/8628 - 43s - loss: 0.0103 - binary_crossentropy: 0.0103 - categorical_accuracy: 0.9969 - val_loss: 0.0039 - val_binary_crossentropy: 0.0039 - val_categorical_accuracy: 0.9990 - 43s/epoch - 5ms/step
Epoch 5/10
8628/8628 - 43s - loss: 0.0097 - binary_crossentropy: 0.0097 - categorical_accuracy: 0.9971 - val_loss: 0.0048 - val_binary_crossentropy: 0.0048 - val_categorical_accuracy: 0.9986 - 43s/epoch - 5ms/step
Epoch 6/10
8628/8628 - 43s - loss: 0.0092 - binary_crossentropy: 0.0092 - categorical_accuracy: 0.9972 - val_loss: 0.0036 - val_binary_crossentr

Accuracy: 0.9993972691656814
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00     19566
   Malicious       1.00      1.00      1.00     66708

    accuracy                           1.00     86274
   macro avg       1.00      1.00      1.00     86274
weighted avg       1.00      1.00      1.00     86274


Running Naive Bayes Classifier
Accuracy: 0.8801492917912697
              precision    recall  f1-score   support

      Benign       0.66      0.95      0.78     19566
   Malicious       0.98      0.86      0.92     66708

    accuracy                           0.88     86274
   macro avg       0.82      0.90      0.85     86274
weighted avg       0.91      0.88      0.89     86274


Running KNN Classifier
Accuracy: 0.9990263578830239
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00     19566
   Malicious       1.00      1.00      1.00     66708

    accuracy                           

Epoch 11/20
8628/8628 - 15s - loss: 0.0086 - binary_crossentropy: 0.0086 - categorical_accuracy: 0.9974 - val_loss: 0.0049 - val_binary_crossentropy: 0.0049 - val_categorical_accuracy: 0.9987 - 15s/epoch - 2ms/step
Accuracy: 0.9979252150126342
Macro-Precision: 0.9966065045989083
Macro-Recall: 0.9974844851022173
Macro-F-Score: 0.9970446490106377
              precision    recall  f1-score   support

      Benign       0.99      1.00      1.00     19566
   Malicious       1.00      1.00      1.00     66708

    accuracy                           1.00     86274
   macro avg       1.00      1.00      1.00     86274
weighted avg       1.00      1.00      1.00     86274


Running CNN Classifier
Epoch 1/10
8628/8628 - 45s - loss: 0.0249 - binary_crossentropy: 0.0249 - categorical_accuracy: 0.9933 - val_loss: 0.0054 - val_binary_crossentropy: 0.0054 - val_categorical_accuracy: 0.9989 - 45s/epoch - 5ms/step
Epoch 2/10
8628/8628 - 45s - loss: 0.0132 - binary_crossentropy: 0.0132 - categorical_ac