# XGBoost classification

## Imports

In [1]:
# Custom
import sys
sys.path.append('../')
from utils.dataset_manager import fit_dataset
from utils.constant import FEATURES, LABELS, ATTACKS 

# General
import warnings
import numpy as np
from joblib import dump

# Model and Metrics
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Warning
warnings.filterwarnings('ignore')

## Dataset

In [2]:
# Value forwarded to the dataset loader; it is also embedded in the saved model's filename.
n_files = 2

# fit_dataset hands back a (train, test) pair for the selected ATTACKS.
df_train, df_test = fit_dataset(n_files, ATTACKS)

# Training inputs and targets, selected by the project's column constants.
X_train = df_train[FEATURES]
y_train = df_train[LABELS]

# Report the size of each split
train_population = len(df_train)
test_population = len(df_test)
print('Training Population: {}'.format(train_population))
print('Testing Population: {}'.format(test_population))

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:09<00:00,  4.88s/it]
100%|██████████| 1/1 [00:04<00:00,  4.36s/it]


Training Population: 457492
Testing Population: 275258


In [3]:
# Calculate the scale_pos_weight as the negative/positive ratio of the training labels
positive_class = np.sum(y_train == 1)
negative_class = np.sum(y_train == 0)

# Warnings are silenced globally in the imports cell, so dividing by a zero
# positive count would quietly produce inf/nan and be passed on to the model.
# Fail loudly instead.
if not np.all(positive_class):
    raise ValueError(
        'y_train has no positive (label == 1) samples; scale_pos_weight is undefined'
    )

scale_pos_weight = negative_class / positive_class

## Model

In [4]:
# Model: hyper-parameters gathered in one place, then unpacked into the classifier
xgb_params = {
    'n_estimators': 100,
    'max_depth': 3,
    'learning_rate': 0.1,
    'scale_pos_weight': scale_pos_weight,  # class ratio computed from y_train
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Train on the training split
xgb_model.fit(X_train, y_train)

In [5]:
# Save Model: the filename records how many files the training set was built from
name = f"../outputs/xgboost_{n_files}.joblib"
# Left as the cell's last expression so the list of written paths is displayed.
dump(value=xgb_model, filename=name)

['../outputs/xgboost_2.joblib']

## Evaluation

In [6]:
# Features and ground-truth labels of the test split
X_test, y_test = df_test[FEATURES], df_test[LABELS]

# Predict
y_pred = xgb_model.predict(X_test)

# Evaluate
# Scores are printed with 4 decimal places. The spec must be `.4f`: a bare `4f`
# only requests a minimum field width of 4 and keeps the default 6 decimals.
# No `average`/`pos_label` argument is passed, so scikit-learn's defaults apply to
# recall, precision and F1; the classification report gives the per-class view.
print('Accuracy: {:.4f}'.format(accuracy_score(y_test, y_pred)))
print('Recall: {:.4f}'.format(recall_score(y_test, y_pred)))
print('Precision: {:.4f}'.format(precision_score(y_test, y_pred)))
print('F1: {:.4f}'.format(f1_score(y_test, y_pred)))
print("Classification Report:\n{}".format(classification_report(y_test, y_pred)))

Accuracy: 0.990776
Recall: 0.990568
Precision: 0.999989
F1: 0.995256
Classification Report:
              precision    recall  f1-score   support

           0       0.72      1.00      0.83      6387
           1       1.00      0.99      1.00    268871

    accuracy                           0.99    275258
   macro avg       0.86      1.00      0.91    275258
weighted avg       0.99      0.99      0.99    275258



## Param Grid XGBoost classifier

```python
# GridSearchCV is not imported in the notebook's import cell, so bring it in here
# to keep the snippet runnable when pasted into a cell.
from sklearn.model_selection import GridSearchCV

# Candidate values per hyper-parameter: 4 * 3 * 3 * 3 * 3 = 324 combinations.
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    # NOTE(review): the trained model above uses the negative/positive ratio computed
    # from y_train instead of these fixed weights - confirm which range is intended.
    "scale_pos_weight": [1, 3, 5],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

# Init classifier
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Init Grid Search: 3-fold cross-validation, scored by ROC AUC, n_jobs=-1
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# Fit on the training split (the notebook defines `X_train`, not `x_train`)
_ = grid_cv.fit(X_train, y_train)

grid_cv.best_score_
```