# Model Comparison

## Imports

In [1]:
# Custom
from utils.dataset_manager import fit_dataset, get_classes_weights
from utils.constant import ALL_ATTACKS, FEATURES, LABELS

# Models
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Other
import warnings
from tqdm import tqdm
from joblib import dump

# Ignore warnings
warnings.filterwarnings('ignore')

## Dataset

In [2]:
n_files = 1

df_train, df_test = fit_dataset(n_files, ALL_ATTACKS, dataset_directory='./data/')

X_train, y_train = df_train[FEATURES], df_train[LABELS]
X_test, y_test = df_test[FEATURES], df_test[LABELS]

# Prints
print('Training Population: {}'.format(len(df_train)))
print('Testing Population: {}'.format(len(df_test)))

100%|██████████| 1/1 [00:00<00:00,  1.72it/s]
100%|██████████| 1/1 [00:00<00:00,  2.03it/s]


Training Population: 238687
Testing Population: 218805


## Models

Since we have unbalanced data, we have to add their weigth in each model

In [3]:
xgb_params = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 4, 5, 7],
    "min_samples_leaf": [1, 3, 5],
    "max_features": ["sqrt"],
    "class_weight": ["balanced", get_classes_weights(df_train)],
}

log_reg_params = {
    "penalty": ["l1", "l2"],
    "C": [0.1, 0.5, 1, 5, 10],
    "class_weight": ["balanced", get_classes_weights(df_train)],
}

voting_params = {
    "voting": ["soft", "hard"],
    "weights": [[1, 1, 1], [1, 2, 1], [1, 1, 2], [1, 2, 2]],
}

models = {
    'log_reg': (LogisticRegression(class_weight=get_classes_weights(df_train)), log_reg_params),
    'xgb': (XGBClassifier(), xgb_params),
    'random_forest': (RandomForestClassifier(class_weight=get_classes_weights(df_train)), rf_params),
    'voting_classifier': (VotingClassifier(estimators=[
        ('xgb', XGBClassifier()),
        ('rf', RandomForestClassifier(class_weight=get_classes_weights(df_train))),
        ('logistic', LogisticRegression(class_weight=get_classes_weights(df_train)))],
        voting='soft'), voting_params)
}

### Training

In [4]:
best_estimators = {}
for model in tqdm(models):
    print('Training {}'.format(model))
    grid = GridSearchCV(models[model][0], models[model][1], cv=5, scoring='f1_macro', n_jobs=-1)
    grid.fit(X_train, y_train)
    print('Best params: {}'.format(grid.best_params_))
    print('Best score: {}'.format(grid.best_score_))
    best_estimators[model] = grid.best_estimator_


  0%|          | 0/4 [00:00<?, ?it/s]

Training log_reg


 25%|██▌       | 1/4 [02:21<07:05, 141.88s/it]

Best params: {'C': 0.5, 'class_weight': 'balanced', 'penalty': 'l2'}
Best score: 0.47358156837975135
Training xgb


 50%|█████     | 2/4 [4:51:55<5:42:36, 10278.03s/it]

Best params: {'colsample_bytree': 0.5, 'gamma': 0.25, 'learning_rate': 0.1, 'max_depth': 5, 'reg_lambda': 0, 'scale_pos_weight': 1, 'subsample': 0.8}
Best score: 0.7143549109991147
Training random_forest


 75%|███████▌  | 3/4 [10:50:38<4:16:52, 15412.88s/it]

Best params: {'class_weight': 'balanced', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 300}
Best score: 0.6055219830914593
Training voting_classifier


100%|██████████| 4/4 [11:02:34<00:00, 9938.57s/it]   

Best params: {'voting': 'hard', 'weights': [1, 2, 1]}
Best score: 0.7232131477335944





In [6]:
dump(best_model, './outputs/best_model_all_attacks.joblib')

['./outputs/best_model_all_attacks.joblib']

## Evaluation

In [7]:
# Predict
best_model = None
best_score = 0
for model in tqdm(best_estimators):
    y_pred = list(best_estimators[model].predict(X_test))

    # Evaluate
    y_test = list(y_test)
    f1score = f1_score(y_pred, y_test, average='macro')
    print('Model: ', model)
    print('  accuracy_score = ', accuracy_score(y_pred, y_test))
    print('  recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('  precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('  f1_score = ', f1score)
    print('  classification_report = \n', classification_report(y_pred, y_test))

    if f1score > best_score:
        best_score = f1score
        best_model = best_estimators[model]

dump(best_model, './outputs/best_model_all_attacks.joblib')

  0%|          | 0/4 [00:00<?, ?it/s]

Model:  log_reg
  accuracy_score =  0.7776467630995635
  recall_score =  0.476677860611158
  precision_score =  0.5428789567644444
  f1_score =  0.4703202270343937


 25%|██▌       | 1/4 [00:02<00:06,  2.22s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19005
           1       0.98      1.00      0.99     19039
           2       0.93      0.66      0.77     27103
           3       0.88      0.73      0.80     30388
           4       0.80      0.65      0.72     25650
           5       1.00      1.00      1.00     33472
           6       0.62      0.78      0.69     13359
           7       0.98      0.84      0.90      1621
           8       0.99      0.95      0.97      1438
           9       0.97      0.98      0.98      2126
          10       0.53      0.11      0.19       466
          11       0.66      0.18      0.29       458
          12       0.46      0.70      0.56     10318
          13       0.20      0.47      0.28      4039
          14       0.27      0.44      0.34      7554
          15       0.75      0.53      0.62       495
          16       0.75      0.64      0.69      5572

 50%|█████     | 2/4 [00:05<00:06,  3.04s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19031
           1       1.00      1.00      1.00     19374
           2       1.00      1.00      1.00     19248
           3       1.00      1.00      1.00     25356
           4       1.00      1.00      1.00     20963
           5       1.00      1.00      1.00     33525
           6       1.00      1.00      1.00     16788
           7       1.00      1.00      1.00      1383
           8       0.99      1.00      1.00      1383
           9       1.00      1.00      1.00      2133
          10       0.98      0.92      0.95       106
          11       0.95      0.98      0.97       123
          12       1.00      1.00      1.00     15486
          13       1.00      1.00      1.00      9312
          14       1.00      1.00      1.00     12332
          15       0.97      0.98      0.98       344
          16       1.00      1.00      1.00      4722

 75%|███████▌  | 3/4 [00:32<00:13, 13.70s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19017
           1       1.00      1.00      1.00     19346
           2       0.92      1.00      0.95     17678
           3       0.99      1.00      0.99     25088
           4       0.98      0.80      0.88     25744
           5       1.00      1.00      1.00     33486
           6       0.99      0.96      0.98     17343
           7       0.98      0.98      0.98      1388
           8       0.98      0.94      0.96      1454
           9       0.97      1.00      0.99      2078
          10       0.74      0.26      0.39       282
          11       0.94      0.15      0.25       816
          12       0.99      0.98      0.99     15684
          13       0.97      0.95      0.96      9513
          14       0.57      0.96      0.72      7286
          15       0.92      0.51      0.65       633
          16       0.99      1.00      1.00      4699

100%|██████████| 4/4 [00:52<00:00, 13.04s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19020
           1       1.00      1.00      1.00     19364
           2       1.00      1.00      1.00     19263
           3       1.00      1.00      1.00     25349
           4       1.00      1.00      1.00     20947
           5       1.00      1.00      1.00     33500
           6       1.00      1.00      1.00     16761
           7       0.99      0.99      0.99      1386
           8       0.99      0.99      0.99      1389
           9       0.99      0.99      0.99      2136
          10       0.89      0.86      0.87       104
          11       0.89      0.99      0.94       114
          12       1.00      1.00      1.00     15484
          13       1.00      1.00      1.00      9319
          14       1.00      1.00      1.00     12317
          15       0.97      0.97      0.97       344
          16       1.00      1.00      1.00      4715




['./outputs/best_model_all_attacks.joblib']