# Model Comparison

## Imports

In [1]:
# Custom
import sys
sys.path.append('../')
from utils.dataset_manager import fit_dataset, get_classes_weights
from utils.constant import ALL_ATTACKS, FEATURES, LABELS

# Models
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Other
import warnings
from tqdm import tqdm

# Ignore warnings
warnings.filterwarnings('ignore')

## Dataset

In [2]:
n_files = 20

df_train, df_test = fit_dataset(n_files, ALL_ATTACKS)

# Binary classification: Bening traffic vs. DDoS traffic
df_train = df_train[(df_train[LABELS] == ALL_ATTACKS['DDoS-SYN_Flood']) | (df_train[LABELS] == ALL_ATTACKS['BenignTraffic'])]
df_test = df_test[(df_test[LABELS] == ALL_ATTACKS['DDoS-SYN_Flood']) | (df_test[LABELS] == ALL_ATTACKS['BenignTraffic'])]

df_train[LABELS] = df_train[LABELS].apply(lambda x: 1 if x == ALL_ATTACKS['DDoS-SYN_Flood'] else 0)
df_test[LABELS] = df_test[LABELS].apply(lambda x: 1 if x == ALL_ATTACKS['DDoS-SYN_Flood'] else 0)

X_train, y_train = df_train[FEATURES], df_train[LABELS]
X_test, y_test = df_test[FEATURES], df_test[LABELS]

# Prints
print('Training Population: {}'.format(len(df_train)))
print('Testing Population: {}'.format(len(df_test)))

100%|██████████| 20/20 [00:18<00:00,  1.07it/s]
100%|██████████| 6/6 [00:04<00:00,  1.26it/s]


Training Population: 522642
Testing Population: 182230


In [3]:
get_classes_weights(df_train)

{1: 0.6351468279237982, 0: 2.3498399395726928}

## Models

Since we have unbalanced data, we have to add their weigth in each model

In [4]:
models = {
    'log_reg': LogisticRegression(class_weight=get_classes_weights(df_train)),
    'xgb': XGBClassifier(scale_pos_weight=sum(y_train == 0) / sum(y_train == 1)),
    'rf': RandomForestClassifier(class_weight=get_classes_weights(df_train)),
}

### Training

In [5]:
for model in tqdm(models):
    models[model].fit(X_train, y_train)

100%|██████████| 3/3 [01:09<00:00, 23.31s/it]


## Evaluation

In [6]:
# Predict
for model in tqdm(models):
    y_pred = list(models[model].predict(X_test))

    # Evaluate
    y_test = list(y_test)
    print('Model: ', model)
    print('  accuracy_score = ', accuracy_score(y_pred, y_test))
    print('  recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('  precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('  f1_score = ', f1_score(y_pred, y_test, average='macro'))
    print('  classification_report = \n', classification_report(y_pred, y_test), '\n\n')

  0%|          | 0/3 [00:00<?, ?it/s]

Model:  log_reg
  accuracy_score =  0.9991219886956044
  recall_score =  0.9979603475808431
  precision_score =  0.9994237241189936
  f1_score =  0.9986898702253497


 33%|███▎      | 1/3 [00:00<00:01,  1.32it/s]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38865
           1       1.00      1.00      1.00    143365

    accuracy                           1.00    182230
   macro avg       1.00      1.00      1.00    182230
weighted avg       1.00      1.00      1.00    182230
 


Model:  xgb
  accuracy_score =  0.9999945124293476
  recall_score =  0.9999965162135422
  precision_score =  0.9999870831072877
  f1_score =  0.9999917995709224


 67%|██████▋   | 2/3 [00:01<00:00,  1.21it/s]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38708
           1       1.00      1.00      1.00    143522

    accuracy                           1.00    182230
   macro avg       1.00      1.00      1.00    182230
weighted avg       1.00      1.00      1.00    182230
 


Model:  rf
  accuracy_score =  0.999989024858695
  recall_score =  0.9999835992965562
  precision_score =  0.9999835992965562
  f1_score =  0.9999835992965562


100%|██████████| 3/3 [00:04<00:00,  1.35s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38709
           1       1.00      1.00      1.00    143521

    accuracy                           1.00    182230
   macro avg       1.00      1.00      1.00    182230
weighted avg       1.00      1.00      1.00    182230
 





