In [None]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score,recall_score

In [78]:
train = pd.read_csv(r"..\API\data\processed\train_data.csv")
train_label = train["class"]
train.drop('class',inplace=True,axis=1)

train,val,train_label,val_label = train_test_split(train,train_label,random_state=42,test_size=0.1)

test = pd.read_csv(r"..\API\data\processed\test_data.csv")
test_label = test["class"]
test.drop('class',inplace=True,axis=1)


In [79]:
train.head(5)

Unnamed: 0,service,flag,src_bytes,dst_bytes,same_srv_rate,diff_srv_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_serror_rate
17527,17,5,0,0,0.33,0.17,36,0.14,0.05,1.0
14304,22,9,236,3222,1.0,0.0,229,0.9,0.02,0.0
13196,15,5,0,0,0.13,0.06,18,0.07,0.06,1.0
7735,22,9,221,2280,1.0,0.0,255,1.0,0.0,0.0
1866,22,9,207,27865,1.0,0.0,255,1.0,0.0,0.0


In [80]:
num_classes = len(train_label.unique())
xgbc = xgb.XGBClassifier(objective='binary:logistic',eval_metric = 'logloss')
xgbc.fit(train, train_label)

xgb_train = xgbc.score(train,train_label)
xgb_test = xgbc.score(test, test_label)

print("without any optimisation")
print(f"xgb_train : {xgb_train}")
print(f"xgb_test : {xgb_test}")

without any optimisation
xgb_train : 1.0
xgb_test : 0.9966263147449891


<h2> Tuning with Optuna</h2>

In [81]:
import optuna
from optuna import study
import json



In [90]:
# Define Objective
def objective(trial):
    h_params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
    }
    xgbc = xgb.XGBClassifier(**h_params, objective="binary:logistic", eval_metric="logloss", early_stopping_rounds = 50)
    xgbc.fit(train,train_label,eval_set=[(val,val_label)],verbose = False) 

    preds = xgbc.predict(test)
    accuracy = accuracy_score(test_label,preds)

    return accuracy


# Create the study
study = optuna.create_study(direction="maximize")
study.optimize(objective,n_trials=15)



[I 2025-03-26 17:03:13,397] A new study created in memory with name: no-name-7432136b-1bf8-4ebb-b66d-e8a298b7ba73
[I 2025-03-26 17:03:13,643] Trial 0 finished with value: 0.9944433419329232 and parameters: {'n_estimators': 200, 'learning_rate': 0.14499874833396048, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.8471980123207233, 'colsample_bytree': 0.5247702071613713, 'reg_alpha': 5.142322372558907, 'reg_lambda': 2.661140500467295}. Best is trial 0 with value: 0.9944433419329232.
[I 2025-03-26 17:03:13,800] Trial 1 finished with value: 0.9944433419329232 and parameters: {'n_estimators': 150, 'learning_rate': 0.17670622217915613, 'max_depth': 4, 'min_child_weight': 8, 'subsample': 0.8547057872236055, 'colsample_bytree': 0.8521050496153545, 'reg_alpha': 7.2346336106560205, 'reg_lambda': 5.848922632844548}. Best is trial 0 with value: 0.9944433419329232.
[I 2025-03-26 17:03:14,303] Trial 2 finished with value: 0.9946417940067473 and parameters: {'n_estimators': 950, 'learning_rate

In [91]:
study.best_params

{'n_estimators': 400,
 'learning_rate': 0.0332530293643473,
 'max_depth': 8,
 'min_child_weight': 1,
 'subsample': 0.9683292484556811,
 'colsample_bytree': 0.6740166745761794,
 'reg_alpha': 0.41270556463560176,
 'reg_lambda': 0.2921992917197971}

In [93]:
best_params = study.best_params
study.best_value

0.9956340543758683

In [94]:
# Save best hyperparameters
with open(r"..\API\model\Hyperparams\Xgboost_hparam.json", "w") as f:
    json.dump(study.best_params, f)