# Model Comparison

## Imports

In [1]:
# Custom
from utils.dataset_manager import fit_dataset, get_classes_weights
from utils.constant import ALL_ATTACKS, FEATURES, LABELS

# Models
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Other
import warnings
from tqdm import tqdm
from joblib import dump

# Ignore warnings
warnings.filterwarnings('ignore')

## Dataset

In [2]:
n_files = 20

df_train, df_test = fit_dataset(n_files, ALL_ATTACKS, dataset_directory='./data/')

# Binary classification: Bening traffic vs. DDoS traffic
df_train = df_train[(df_train[LABELS] == ALL_ATTACKS['DDoS-SYN_Flood']) | (df_train[LABELS] == ALL_ATTACKS['BenignTraffic'])]
df_test = df_test[(df_test[LABELS] == ALL_ATTACKS['DDoS-SYN_Flood']) | (df_test[LABELS] == ALL_ATTACKS['BenignTraffic'])]

df_train[LABELS] = df_train[LABELS].apply(lambda x: 1 if x == ALL_ATTACKS['DDoS-SYN_Flood'] else 0)
df_test[LABELS] = df_test[LABELS].apply(lambda x: 1 if x == ALL_ATTACKS['DDoS-SYN_Flood'] else 0)

X_train, y_train = df_train[FEATURES], df_train[LABELS]
X_test, y_test = df_test[FEATURES], df_test[LABELS]

# Prints
print('Training Population: {}'.format(len(df_train)))
print('Testing Population: {}'.format(len(df_test)))

100%|██████████| 20/20 [00:19<00:00,  1.04it/s]
100%|██████████| 6/6 [00:04<00:00,  1.28it/s]


Training Population: 522642
Testing Population: 182230


## Models

Since we have unbalanced data, we have to add their weigth in each model

In [3]:
xgb_params = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 4, 5, 7],
    "min_samples_leaf": [1, 3, 5],
    "max_features": ["sqrt"],
    "class_weight": ["balanced", get_classes_weights(df_train)],
}

log_reg_params = {
    "penalty": ["l1", "l2"],
    "C": [0.1, 0.5, 1, 5, 10],
    "class_weight": ["balanced", get_classes_weights(df_train)],
}

voting_params = {
    "voting": ["soft", "hard"],
    "weights": [[1, 1, 1], [1, 2, 1], [1, 1, 2], [1, 2, 2]],
}

models = {
    'log_reg': (LogisticRegression(class_weight=get_classes_weights(df_train)), log_reg_params),
    'xgb': (XGBClassifier(), xgb_params),
    'random_forest': (RandomForestClassifier(class_weight=get_classes_weights(df_train)), rf_params),
    'voting_classifier': (VotingClassifier(estimators=[('xgb', XGBClassifier()), ('rf', RandomForestClassifier()), ('logistic', LogisticRegression())], voting='soft'), voting_params)
}

### Training

In [4]:
best_estimators = {}
for model in tqdm(models):
    print('Training {}'.format(model))
    grid = GridSearchCV(models[model][0], models[model][1], cv=5, scoring='f1_macro', n_jobs=-1)
    grid.fit(X_train, y_train)
    print('Best params: {}'.format(grid.best_params_))
    print('Best score: {}'.format(grid.best_score_))
    best_estimators[model] = grid.best_estimator_

  0%|          | 0/4 [00:00<?, ?it/s]

Training log_reg


 25%|██▌       | 1/4 [01:20<04:02, 80.86s/it]

Best params: {'C': 0.1, 'class_weight': 'balanced', 'penalty': 'l2'}
Best score: 0.9984350042428976
Training xgb


 50%|█████     | 2/4 [25:54<29:59, 899.87s/it]

Best params: {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'reg_lambda': 0, 'scale_pos_weight': 1, 'subsample': 0.8}
Best score: 0.9999942887426471
Training random_forest


 75%|███████▌  | 3/4 [1:16:38<31:19, 1879.13s/it]

Best params: {'class_weight': 'balanced', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 200}
Best score: 0.9985745085665645
Training voting_classifier


100%|██████████| 4/4 [1:22:59<00:00, 1244.96s/it]

Best params: {'voting': 'hard', 'weights': [1, 2, 1]}
Best score: 0.9999971444181665





In [5]:
best_estimators

{'log_reg': LogisticRegression(C=0.1, class_weight='balanced'),
 'xgb': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=0.5, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=0, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=0.1, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=3, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
 'random_forest': RandomForestClassifier(class_weight='balanced', max_depth=7, n_estimators=200),
 'voting_classifier': VotingClassifier(estimators=[('xgb',
          

## Evaluation

In [6]:
# Predict
best_model = None
best_score = 0
for model in tqdm(best_estimators):
    y_pred = list(best_estimators[model].predict(X_test))

    # Evaluate
    y_test = list(y_test)
    f1score = f1_score(y_pred, y_test, average='macro')
    print('Model: ', model)
    print('  accuracy_score = ', accuracy_score(y_pred, y_test))
    print('  recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('  precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('  f1_score = ', f1score)
    print('  classification_report = \n', classification_report(y_pred, y_test))

    if f1score > best_score:
        best_score = f1score
        best_model = best_estimators[model]

dump(best_model, './outputs/best_model_syn_attacks.joblib')

  0%|          | 0/4 [00:00<?, ?it/s]

Model:  log_reg
  accuracy_score =  0.9991329638369094
  recall_score =  0.9979859744863098
  precision_score =  0.9994306917404566
  f1_score =  0.9987062225107972


 25%|██▌       | 1/4 [00:00<00:02,  1.25it/s]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38863
           1       1.00      1.00      1.00    143367

    accuracy                           1.00    182230
   macro avg       1.00      1.00      1.00    182230
weighted avg       1.00      1.00      1.00    182230

Model:  xgb
  accuracy_score =  0.9999945124293476
  recall_score =  0.9999965162135422


 50%|█████     | 2/4 [00:01<00:01,  1.41it/s]

  precision_score =  0.9999870831072877
  f1_score =  0.9999917995709224
  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38708
           1       1.00      1.00      1.00    143522

    accuracy                           1.00    182230
   macro avg       1.00      1.00      1.00    182230
weighted avg       1.00      1.00      1.00    182230

Model:  random_forest
  accuracy_score =  0.9991933271140866
  recall_score =  0.9981176842313443
  precision_score =  0.9994784467404842
  f1_score =  0.9987961937204097


 75%|███████▌  | 3/4 [00:03<00:01,  1.40s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38854
           1       1.00      1.00      1.00    143376

    accuracy                           1.00    182230
   macro avg       1.00      1.00      1.00    182230
weighted avg       1.00      1.00      1.00    182230

Model:  voting_classifier
  accuracy_score =  0.999989024858695
  recall_score =  0.9999835992965562
  precision_score =  0.9999835992965562
  f1_score =  0.9999835992965562
  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     38709
           1       1.00      1.00      1.00    143521

    accuracy                           1.00    182230
   macro avg       1.00      1.00      1.00    182230
weighted avg       1.00      1.00      1.00    182230



100%|██████████| 4/4 [00:05<00:00,  1.47s/it]


['./outputs/best_model_syn_attacks.joblib']