In [1]:
import os
import random
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from utils.constant import ATTACKS, FEATURES, LABELS, DATASET_DIRECTORY

# XGBoost classification

## Creating datasets

In [2]:
# Load a capped number of dataset CSV files into one dataframe, clean it, and
# split it into reproducible train/test partitions.
MAX_CSV_FILES = 4      # cap on files read; raise it to use more of the dataset
TRAIN_FRACTION = 0.8   # share of rows assigned to the training set
SEED = 42              # fixes the train/test split across kernel restarts

# os.listdir() returns entries in arbitrary order and may include non-CSV
# files, so filter first and sort to make the selected subset deterministic.
csv_filenames = sorted(
    name for name in os.listdir(DATASET_DIRECTORY) if name.endswith('.csv')
)[:MAX_CSV_FILES]
if not csv_filenames:
    raise FileNotFoundError(f"No .csv files found in {DATASET_DIRECTORY!r}")

# Read every file once and concatenate in a single call: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it each time.
frames = [
    pd.read_csv(os.path.join(DATASET_DIRECTORY, name))
    for name in tqdm(csv_filenames)
]
df = (
    pd.concat(frames, ignore_index=True)
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)   # positions == labels, so .iloc below is safe
)

# Determine the number of samples for the training set
indexes = list(df.index)
train_size = int(len(indexes) * TRAIN_FRACTION)
# Draw the training rows from a dedicated, seeded generator so the split does
# not depend on (or disturb) the global `random` state.
rng = random.Random(SEED)
train_indices = rng.sample(indexes, k=train_size)
test_indices = np.setdiff1d(indexes, train_indices)

df_train = df.iloc[train_indices]
df_test = df.iloc[test_indices]
assert len(df_train) + len(df_test) == len(df), "train/test split lost rows"

# Encode the raw labels through the ATTACKS lookup; an unknown label raises
# KeyError rather than silently becoming NaN.
# NOTE(review): assumes LABELS names a single column (so this is a Series) and
# ATTACKS maps each raw label to its numeric class - confirm in utils.constant.
x_train, y_train = df_train[FEATURES], df_train[LABELS].apply(lambda label: ATTACKS[label])
x_test, y_test = df_test[FEATURES], df_test[LABELS].apply(lambda label: ATTACKS[label])

  2%|▏         | 4/170 [00:06<04:33,  1.65s/it]


## Training

In [9]:
# Baseline model: an XGBoost classifier left at the library's default
# hyperparameters, fitted on the training split.
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(X=x_train, y=y_train)

In [11]:
# Predict classes for the held-out split with the fitted baseline, then
# report the fraction of rows classified correctly.
preds = xgb_cl.predict(x_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.9926037735849057

## Hyperparameter grid search for the XGBoost classifier

In [15]:
# Exhaustive grid search over XGBoost hyperparameters, scored by ROC AUC with
# 3-fold cross-validation on the training split.
param_grid = {
    "max_depth": [3, 4, 5, 7],
    "learning_rate": [0.1, 0.01, 0.05],
    "gamma": [0, 0.25, 1],
    "reg_lambda": [0, 1, 10],
    "subsample": [0.8],
    "colsample_bytree": [0.5],
}

# Pick objective and scorer from the number of classes actually present.
# NOTE(review): the class count comes from ATTACKS, which is not visible in
# this notebook - confirm which branch is expected to run.
n_classes = np.unique(y_train).size
if n_classes > 2:
    # Plain "roc_auc" is rejected by scikit-learn for multiclass targets, and
    # with warnings silenced the search could end with NaN scores; use the
    # one-vs-rest variant. scale_pos_weight is a binary-only knob, so it is
    # left out of the grid rather than tripling the search for nothing.
    objective = "multi:softprob"
    scoring = "roc_auc_ovr"
else:
    objective = "binary:logistic"
    scoring = "roc_auc"
    param_grid["scale_pos_weight"] = [1, 3, 5]

# Init classifier
# This rebinds `xgb_cl` to an unfitted estimator; GridSearchCV clones it, so
# the fitted model is `grid_cv.best_estimator_`, not `xgb_cl`.
xgb_cl = xgb.XGBClassifier(objective=objective)

# Init Grid Search
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring=scoring)

# Fit
_ = grid_cv.fit(x_train, y_train)

grid_cv.best_score_