# Model Comparison

## Imports

In [1]:
# Custom
from utils.dataset_manager import fit_dataset, get_classes_weights
from utils.constant import ALL_ATTACKS, FEATURES, LABELS

# Models
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Other
import warnings
from tqdm import tqdm
from joblib import dump

# Ignore warnings
warnings.filterwarnings('ignore')

## Dataset

In [2]:
n_files = 2

df_train, df_test = fit_dataset(n_files, ALL_ATTACKS, dataset_directory='./data/')

X_train, y_train = df_train[FEATURES], df_train[LABELS]
X_test, y_test = df_test[FEATURES], df_test[LABELS]

# Prints
print('Training Population: {}'.format(len(df_train)))
print('Testing Population: {}'.format(len(df_test)))

./data/


100%|██████████| 2/2 [00:07<00:00,  3.55s/it]
100%|██████████| 1/1 [00:04<00:00,  4.07s/it]


Training Population: 457492
Testing Population: 275258


## Models

Since we have unbalanced data, we have to add their weigth in each model

In [3]:
xgb_params = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 4, 5, 7],
    "min_samples_leaf": [1, 3, 5],
    "max_features": ["sqrt"],
    "class_weight": ["balanced", get_classes_weights(df_train)],
}

log_reg_params = {
    "penalty": ["l1", "l2"],
    "C": [0.1, 0.5, 1, 5, 10],
    "class_weight": ["balanced", get_classes_weights(df_train)],
}

voting_params = {
    "voting": ["soft", "hard"],
    "weights": [[1, 1, 1], [1, 2, 1], [1, 1, 2], [1, 2, 2]],
}

models = {
    'log_reg': (LogisticRegression(class_weight=get_classes_weights(df_train)), log_reg_params),
    'xgb': (XGBClassifier(), xgb_params),
    'random_forest': (RandomForestClassifier(class_weight=get_classes_weights(df_train)), rf_params),
    'voting_classifier': (VotingClassifier(estimators=[
        ('xgb', XGBClassifier()),
        ('rf', RandomForestClassifier(class_weight=get_classes_weights(df_train))),
        ('logistic', LogisticRegression(class_weight=get_classes_weights(df_train)))],
        voting='soft'), voting_params)
}

### Training

In [4]:
best_estimators = {}
for model in tqdm(models):
    print('Training {}'.format(model))
    grid = GridSearchCV(models[model][0], models[model][1], cv=5, scoring='f1_macro', n_jobs=-1)
    grid.fit(X_train, y_train)
    print('Best params: {}'.format(grid.best_params_))
    print('Best score: {}'.format(grid.best_score_))
    best_estimators[model] = grid.best_estimator_


  0%|          | 0/4 [00:00<?, ?it/s]

Training log_reg


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

## Evaluation

In [None]:
# Predict
best_model = None
best_score = 0
for model in tqdm(best_estimators):
    y_pred = list(best_estimators[model].predict(X_test))

    # Evaluate
    y_test = list(y_test)
    f1score = f1_score(y_pred, y_test, average='macro')
    print('Model: ', model)
    print('  accuracy_score = ', accuracy_score(y_pred, y_test))
    print('  recall_score = ', recall_score(y_pred, y_test, average='macro'))
    print('  precision_score = ', precision_score(y_pred, y_test, average='macro'))
    print('  f1_score = ', f1score)
    print('  classification_report = \n', classification_report(y_pred, y_test))

    if f1score > best_score:
        best_score = f1score
        best_model = best_estimators[model]

dump(best_model, './outputs/best_model_all_attacks.joblib')

  0%|          | 0/2 [00:00<?, ?it/s]

Model:  log_reg
  accuracy_score =  0.7741936655792021
  recall_score =  0.48097141706008584
  precision_score =  0.5256358749894402
  f1_score =  0.473294530373888


 50%|█████     | 1/2 [00:02<00:02,  2.74s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23779
           1       0.98      1.00      0.99     23474
           2       0.93      0.66      0.77     34163
           3       0.90      0.72      0.80     39454
           4       0.79      0.65      0.71     32416
           5       1.00      1.00      1.00     42280
           6       0.63      0.77      0.69     17480
           7       0.98      0.91      0.94      1758
           8       0.98      0.92      0.95      1800
           9       0.97      0.97      0.97      2680
          10       0.51      0.15      0.23       459
          11       0.69      0.26      0.37       396
          12       0.44      0.74      0.55     11745
          13       0.21      0.47      0.29      5279
          14       0.27      0.43      0.33     10205
          15       0.76      0.62      0.68       562
          16       0.65      0.59      0.62      6261

100%|██████████| 2/2 [00:11<00:00,  5.62s/it]

  classification_report = 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     23824
           1       1.00      1.00      1.00     23890
           2       1.00      1.00      1.00     24172
           3       1.00      1.00      1.00     31749
           4       1.00      1.00      1.00     26523
           5       1.00      1.00      1.00     42340
           6       1.00      1.00      1.00     21422
           7       1.00      1.00      1.00      1635
           8       1.00      1.00      1.00      1689
           9       1.00      1.00      1.00      2676
          10       0.98      0.97      0.98       133
          11       0.98      0.97      0.97       148
          12       1.00      1.00      1.00     19592
          13       1.00      1.00      1.00     12010
          14       1.00      1.00      1.00     15932
          15       0.99      1.00      0.99       450
          16       1.00      1.00      1.00      5683


