# 6.0.0 Hyperparameter Optimization for Classifier Model

### Methodology

The primary goal is to tune the XGBOOST parameters:

    - eta (learning rate): Extended to explore more conservative and slightly more aggressive learning rates.
    - gamma: Now starts at 0 up to 1 to explore the impact of making trees more conservative.
    - max_depth: Increased the upper limit to allow deeper trees which might capture more complex patterns.
    - min_child_weight: Broader range to better control overfitting by requiring nodes to justify splits with more samples.
    - subsample and colsample_bytree: Allowed to vary more widely to assess different levels of data and feature subsampling.
    - scale_pos_weight: Adjusted to better balance the classes given the known imbalance.
    - lambda and alpha: Expanded the range for regularization parameters to further control overfitting.
    - max_delta_step: Introduced a broader range to help stabilize updates in scenarios of high class imbalance.
    - n_estimators: Increased the maximum to allow more trees to be evaluated, which can be crucial when all other parameters are being optimized for better granularity.

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import yaml
import optuna 
import xgboost as xgb
from pathlib import Path
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from xgboost import XGBClassifier

from src.utils import calculate_metrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def objective(trial):
   
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',  # For binary classification
        'booster': 'gbtree',             # Tree-based learning algorithms
        'eval_metric': 'auc',            # Evaluation metric for the validation data
        'eta': trial.suggest_float('eta', 0.005, 0.05),  # Learning rate
        'gamma': trial.suggest_float('gamma', 0, 1),  # Minimum loss reduction required to make a further partition
        'max_depth': trial.suggest_int('max_depth', 3, 10),  # Depth of the tree
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),  # Minimum sum of instance weight needed in a child
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),  # Subsample ratio of the training instances
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),  # Subsample ratio of features
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),  # Balancing of positive and negative weights
        'lambda': trial.suggest_loguniform('lambda', 0.1, 5),  # L2 regularization
        'alpha': trial.suggest_loguniform('alpha', 0.01, 1),  # L1 regularization
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),  # Might be used in logistic regression when class is extremely imbalanced
        'n_estimators': trial.suggest_int('n_estimators', 50, 300)  # Number of trees
    }

    
    clf = XGBClassifier(**param)
    clf.fit(X_train, Y_train, eval_set=[(X_valid, Y_valid)], early_stopping_rounds=10, verbose=False)
    
    preds = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(Y_valid, preds)
    return auc

## 1. Data Preparation

In [3]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)
    
model_parameters = config["model_parameters"]["xgbm"]
numeric_features = config["filter_features"]["numerical"]
features = numeric_features
target = config["main"]["target"]
data_train_path = Path.cwd().parent / config["main"]["data_train_path"]
train_validation_path = Path.cwd().parent / config["main"]["data_validation_path"]

train_df = pd.read_pickle(data_train_path)
validation_df = pd.read_pickle(train_validation_path)

X_train, Y_train = train_df[features], train_df[target]
X_valid, Y_valid = validation_df[features], validation_df[target]

split_seed = config["main"]["random_seed"]

X_train.shape

(9479, 14)

## 3. Results

In [4]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Best trial:', study.best_trial.params)
trial = study.best_trial


[I 2024-05-01 18:12:52,629] A new study created in memory with name: no-name-f1dce6f3-2c2f-4d27-b18e-1e02343aec20
  'lambda': trial.suggest_loguniform('lambda', 0.1, 5),  # L2 regularization
  'alpha': trial.suggest_loguniform('alpha', 0.01, 1),  # L1 regularization
[I 2024-05-01 18:12:52,753] Trial 0 finished with value: 0.5923995932892729 and parameters: {'eta': 0.0460545061809906, 'gamma': 0.6409014738600065, 'max_depth': 6, 'min_child_weight': 3, 'subsample': 0.5318430600738346, 'colsample_bytree': 0.7633369382348736, 'scale_pos_weight': 8.285497961937995, 'lambda': 2.495397601926462, 'alpha': 0.013609052025450147, 'max_delta_step': 10, 'n_estimators': 83}. Best is trial 0 with value: 0.5923995932892729.
  'lambda': trial.suggest_loguniform('lambda', 0.1, 5),  # L2 regularization
  'alpha': trial.suggest_loguniform('alpha', 0.01, 1),  # L1 regularization
[I 2024-05-01 18:12:52,883] Trial 1 finished with value: 0.600248545444275 and parameters: {'eta': 0.03136574485309018, 'gamma': 

Best trial: {'eta': 0.04840804490284224, 'gamma': 0.19262710344139433, 'max_depth': 6, 'min_child_weight': 6, 'subsample': 0.6291574423748258, 'colsample_bytree': 0.8073318621840868, 'scale_pos_weight': 1.3609816779677464, 'lambda': 0.14263837351394984, 'alpha': 0.8334424441902536, 'max_delta_step': 0, 'n_estimators': 238}


### 3. Results

In [5]:
study.best_params

{'eta': 0.04840804490284224,
 'gamma': 0.19262710344139433,
 'max_depth': 6,
 'min_child_weight': 6,
 'subsample': 0.6291574423748258,
 'colsample_bytree': 0.8073318621840868,
 'scale_pos_weight': 1.3609816779677464,
 'lambda': 0.14263837351394984,
 'alpha': 0.8334424441902536,
 'max_delta_step': 0,
 'n_estimators': 238}

In [6]:
model_parameters

{'objective': 'binary:logistic',
 'booster': 'gbtree',
 'eval_metric': 'auc',
 'eta': 0.01,
 'gamma': 0.1,
 'max_depth': 6,
 'min_child_weight': 3,
 'subsample': 0.8,
 'colsample_bytree': 0.8,
 'scale_pos_weight': 4,
 'lambda': 1,
 'alpha': 0.1,
 'max_delta_step': 1,
 'n_estimators': 100}

In [7]:
xgbm_model = XGBClassifier(missing=np.nan, **model_parameters, random_state=split_seed)

xgbm_model.fit(X_train, Y_train)
xgbm_preds = xgbm_model.predict_proba(X_valid)[:, 1]

model_results = calculate_metrics(Y_valid, xgbm_preds)
model_results

{'roc_auc_score': 0.6110941648308197,
 'pr_auc': 0.28320533857646,
 'ks': 0.17798678190137265}

In [8]:
model = XGBClassifier(missing=np.nan,**study.best_params, random_state=split_seed )
model.fit(X_train, Y_train)
preds = model.predict_proba(X_valid)[:, 1]
roc_auc = metrics.roc_auc_score(y_true = Y_valid, y_score = preds)

print(roc_auc)

0.5897107834830254
