In [1]:
import pandas as pd
import numpy as np
import re
import time
import category_encoders as ce
from collections import Counter
from xgboost import XGBClassifier
from xgboost import plot_tree
from xgboost import plot_importance
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
import seaborn as sns
import matplotlib
from matplotlib import pyplot
%matplotlib inline

In [2]:
#data_path = '/kaggle/input/icr-identify-age-related-conditions/train.csv'
data_path = 'train.csv'

In [3]:
df = pd.read_csv(data_path)

In [4]:
df.head(3)

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0
2,013f2bd269f5,0.47003,2635.10654,85.200147,32.360553,8.138688,6.73284,0.025578,12.82457,1.2299,...,7.70956,0.97556,1.198821,37.077772,88.609437,13676.95781,28.022851,35.192676,0.196941,0


In [5]:
# drop 'Id' column
df.drop(['Id'], axis=1, inplace=True)

In [6]:
#dealing with Null values
#df.fillna(0,inplace=True)
#df.isnull().values.any()

In [7]:
# splitting the data
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [8]:
# encoding categorical variable
enc = ce.OneHotEncoder(cols=["EJ"])
enc.fit(X)
encoded_X = enc.transform(X)
encoded_X.head(2)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,...,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978


In [9]:
encoded_X.shape

(617, 57)

In [10]:
# splitting the data
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(encoded_X, y, test_size = test_size, random_state = 0)

#ec1_eval_set = [(X_train, y_train), (X_test, y_test)]

In [17]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split

def objective(trial, encoded_X, y):

    param = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'booster': 'gbtree',
        'verbosity': 0,
        'n_jobs': -1,
        'seed': 1
    }

    # Define the search space for hyperparameters
    param_space = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
        'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
    }

    # Train the XGBoost model with the selected hyperparameters
    clf = xgb.XGBClassifier(**param, **param_space)
    clf.fit(X_train, y_train)

    # Evaluate the model on the validation set
    #y_pred = model.predict_proba(X_valid)[:, 1]
    cv_scores = cross_val_score(clf, encoded_X, y, cv = 20, scoring="f1_macro") 
    score = np.mean(cv_scores)
    return score

In [18]:
# Create and run an Optuna study

# More options for creating the optuna study can be found at their webpage:
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.create_study.html
#
# The default sampler is called TPESampler and is very good, but there are others.

study = optuna.create_study(direction="maximize")

[I 2023-07-18 23:29:02,862] A new study created in memory with name: no-name-5714af48-1986-486e-a67d-dda0e029c313


In [19]:
study.optimize(lambda trial: objective(trial, encoded_X, y), n_trials=100,  gc_after_trial=True)

  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:29:04,227] Trial 0 finished with value: 0.45201640464798365 and parameters: {'max_depth': 6, 'learning_rate': 0.008388278011467982, 'subsample': 0.5, 'colsample_bytree': 0.5, 'min_child_weight': 42, 'gamma': 0.5136087647920262, 'reg_alpha': 0.9848103881778183, 'reg_lambda': 0.23068376824779394}. Best is trial 0 with value: 0.45201640464798365.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:29:06,325] Trial 1 finished with value: 0.6901655187811443 and parameters: {'max_depth': 9, 

[I 2023-07-18 23:29:12,611] Trial 4 finished with value: 0.45201640464798365 and parameters: {'max_depth': 5, 'learning_rate': 0.016063091916820527, 'subsample': 0.8, 'colsample_bytree': 0.7, 'min_child_weight': 39, 'gamma': 0.06401418907543246, 'reg_alpha': 0.17427253243834856, 'reg_lambda': 0.39514744680365305}. Best is trial 2 with value: 0.7398397416801968.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:29:13,963] Trial 5 finished with value: 0.45201640464798365 and parameters: {'max_depth': 2, 'learning_rate': 0.0068435416136507395, 'subsample': 0.5, 'colsample_bytree': 0.8, 'min_child_weight': 40, 'gamma': 0.13138692989318812, 'reg_alpha': 9.996872266357146, 'reg_lambda': 2.123995765444183}. Best is trial 2 with value: 0.7398397416801968.
  'learning

[I 2023-07-18 23:29:19,951] Trial 9 finished with value: 0.45201640464798365 and parameters: {'max_depth': 1, 'learning_rate': 0.00559299299175247, 'subsample': 0.5, 'colsample_bytree': 1.0, 'min_child_weight': 24, 'gamma': 0.044158096871965195, 'reg_alpha': 0.7622514460612725, 'reg_lambda': 0.024915234054676712}. Best is trial 2 with value: 0.7398397416801968.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:29:21,826] Trial 10 finished with value: 0.45201640464798365 and parameters: {'max_depth': 7, 'learning_rate': 1.4862278073364439e-05, 'subsample': 0.4, 'colsample_bytree': 0.9, 'min_child_weight': 15, 'gamma': 0.8807032927373113, 'reg_alpha': 0.010170036156111908, 'reg_lambda': 0.010970802688907011}. Best is trial 2 with value: 0.7398397416801968.
  'l

[I 2023-07-18 23:29:34,975] Trial 14 finished with value: 0.7406870264899441 and parameters: {'max_depth': 10, 'learning_rate': 6.705890449574089e-05, 'subsample': 1.0, 'colsample_bytree': 0.9, 'min_child_weight': 13, 'gamma': 0.46919583216349187, 'reg_alpha': 0.2862321217612535, 'reg_lambda': 0.03533770166604716}. Best is trial 11 with value: 0.7541516508334727.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:29:36,760] Trial 15 finished with value: 0.47704630895420375 and parameters: {'max_depth': 10, 'learning_rate': 4.6294851200432704e-05, 'subsample': 1.0, 'colsample_bytree': 0.3, 'min_child_weight': 16, 'gamma': 0.8467316998452135, 'reg_alpha': 3.134542526938685, 'reg_lambda': 0.043777401835011456}. Best is trial 11 with value: 0.7541516508334727.
  '

[I 2023-07-18 23:29:43,075] Trial 19 finished with value: 0.5436166686489428 and parameters: {'max_depth': 7, 'learning_rate': 0.08630909175780134, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 31, 'gamma': 0.08144153130238908, 'reg_alpha': 1.6992958402422358, 'reg_lambda': 4.804885470558383}. Best is trial 11 with value: 0.7541516508334727.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:29:44,909] Trial 20 finished with value: 0.4811604453051822 and parameters: {'max_depth': 9, 'learning_rate': 0.0002507845264837176, 'subsample': 1.0, 'colsample_bytree': 0.4, 'min_child_weight': 20, 'gamma': 0.29474188886829017, 'reg_alpha': 6.298073860296386, 'reg_lambda': 0.9616746174217448}. Best is trial 11 with value: 0.7541516508334727.
  'learning_

[I 2023-07-18 23:29:54,626] Trial 24 finished with value: 0.6777271396425553 and parameters: {'max_depth': 8, 'learning_rate': 9.477502518625135e-05, 'subsample': 1.0, 'colsample_bytree': 0.9, 'min_child_weight': 10, 'gamma': 0.3330624295664186, 'reg_alpha': 0.2642591991526534, 'reg_lambda': 0.16783714408224285}. Best is trial 11 with value: 0.7541516508334727.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:29:56,728] Trial 25 finished with value: 0.7450113290217937 and parameters: {'max_depth': 10, 'learning_rate': 2.603874863887875e-05, 'subsample': 1.0, 'colsample_bytree': 0.9, 'min_child_weight': 20, 'gamma': 0.48051227850532824, 'reg_alpha': 0.5539482685538515, 'reg_lambda': 0.07870704848374448}. Best is trial 11 with value: 0.7541516508334727.
  'lea

[I 2023-07-18 23:30:03,853] Trial 29 finished with value: 0.45201640464798365 and parameters: {'max_depth': 6, 'learning_rate': 0.0001399836228860661, 'subsample': 0.6, 'colsample_bytree': 0.5, 'min_child_weight': 17, 'gamma': 0.5487448223954221, 'reg_alpha': 3.748722089881561, 'reg_lambda': 0.2385171852372901}. Best is trial 11 with value: 0.7541516508334727.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:30:05,526] Trial 30 finished with value: 0.45201640464798365 and parameters: {'max_depth': 8, 'learning_rate': 0.002249859884985185, 'subsample': 1.0, 'colsample_bytree': 0.8, 'min_child_weight': 47, 'gamma': 0.48562030904565034, 'reg_alpha': 1.1025361371798512, 'reg_lambda': 0.666408019448132}. Best is trial 11 with value: 0.7541516508334727.
  'learnin

[I 2023-07-18 23:30:21,778] Trial 34 finished with value: 0.8186470306657796 and parameters: {'max_depth': 9, 'learning_rate': 1.8344752762154445e-05, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 1, 'gamma': 0.2866023849310324, 'reg_alpha': 0.37775490233319825, 'reg_lambda': 1.2036595454324606}. Best is trial 34 with value: 0.8186470306657796.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:30:25,247] Trial 35 finished with value: 0.7907423292556676 and parameters: {'max_depth': 9, 'learning_rate': 1.026912092334979e-05, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 2, 'gamma': 0.28024726325324495, 'reg_alpha': 0.36366650532358263, 'reg_lambda': 1.490188124128677}. Best is trial 34 with value: 0.8186470306657796.
  'learni

[I 2023-07-18 23:30:36,739] Trial 39 finished with value: 0.8114923444639401 and parameters: {'max_depth': 9, 'learning_rate': 1.0529323411617343e-05, 'subsample': 0.7, 'colsample_bytree': 1.0, 'min_child_weight': 1, 'gamma': 0.3042986162935564, 'reg_alpha': 0.13687861991920036, 'reg_lambda': 1.3800576529284923}. Best is trial 34 with value: 0.8186470306657796.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:30:38,956] Trial 40 finished with value: 0.7287488434599185 and parameters: {'max_depth': 6, 'learning_rate': 1.5616827652172706e-05, 'subsample': 0.7, 'colsample_bytree': 1.0, 'min_child_weight': 9, 'gamma': 0.21614294350286714, 'reg_alpha': 0.13357613107961827, 'reg_lambda': 2.781311304348586}. Best is trial 34 with value: 0.8186470306657796.
  'learn

[I 2023-07-18 23:30:51,885] Trial 44 finished with value: 0.749430790927614 and parameters: {'max_depth': 9, 'learning_rate': 1.0571616055770422e-05, 'subsample': 0.7, 'colsample_bytree': 1.0, 'min_child_weight': 4, 'gamma': 0.3977050631282384, 'reg_alpha': 0.8660545384400306, 'reg_lambda': 0.884770385740172}. Best is trial 34 with value: 0.8186470306657796.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:30:55,945] Trial 45 finished with value: 0.7740795820916883 and parameters: {'max_depth': 7, 'learning_rate': 3.202631227363517e-05, 'subsample': 0.7, 'colsample_bytree': 1.0, 'min_child_weight': 1, 'gamma': 0.18302914592007605, 'reg_alpha': 0.18141861138110726, 'reg_lambda': 3.013491604582547}. Best is trial 34 with value: 0.8186470306657796.
  'learning_

[I 2023-07-18 23:31:06,369] Trial 49 finished with value: 0.7687812872579799 and parameters: {'max_depth': 8, 'learning_rate': 3.451106870850551e-05, 'subsample': 0.4, 'colsample_bytree': 0.5, 'min_child_weight': 1, 'gamma': 0.7122342668156155, 'reg_alpha': 0.39218728193298574, 'reg_lambda': 1.024512316704313}. Best is trial 34 with value: 0.8186470306657796.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:31:07,756] Trial 50 finished with value: 0.45201640464798365 and parameters: {'max_depth': 1, 'learning_rate': 1.0380441661772561e-05, 'subsample': 0.7, 'colsample_bytree': 0.3, 'min_child_weight': 12, 'gamma': 0.4495175232597484, 'reg_alpha': 0.24980875047704673, 'reg_lambda': 2.154934442530192}. Best is trial 34 with value: 0.8186470306657796.
  'learni

[I 2023-07-18 23:31:18,955] Trial 54 finished with value: 0.7287488434599185 and parameters: {'max_depth': 9, 'learning_rate': 5.586453611381151e-05, 'subsample': 0.5, 'colsample_bytree': 1.0, 'min_child_weight': 8, 'gamma': 0.3190059705770511, 'reg_alpha': 0.0868118879855831, 'reg_lambda': 0.5556805356779515}. Best is trial 34 with value: 0.8186470306657796.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:31:23,461] Trial 55 finished with value: 0.8160724385911875 and parameters: {'max_depth': 8, 'learning_rate': 2.1036695510777382e-05, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 1, 'gamma': 0.14792525543157953, 'reg_alpha': 0.30186909747604657, 'reg_lambda': 0.7732335517205068}. Best is trial 34 with value: 0.8186470306657796.
  'learni

[I 2023-07-18 23:31:35,843] Trial 59 finished with value: 0.7398315936789459 and parameters: {'max_depth': 7, 'learning_rate': 4.272251107922027e-05, 'subsample': 0.8, 'colsample_bytree': 0.7, 'min_child_weight': 7, 'gamma': 0.08348507510180886, 'reg_alpha': 0.9585198788652499, 'reg_lambda': 0.758482725585423}. Best is trial 56 with value: 0.8212288883579116.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:31:39,486] Trial 60 finished with value: 0.8001278878633009 and parameters: {'max_depth': 8, 'learning_rate': 1.0333353945407326e-05, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 2, 'gamma': 0.1555115007766065, 'reg_alpha': 0.4828052415985416, 'reg_lambda': 0.3084246947278101}. Best is trial 56 with value: 0.8212288883579116.
  'learning

[I 2023-07-18 23:31:51,234] Trial 64 finished with value: 0.7465833687934132 and parameters: {'max_depth': 7, 'learning_rate': 2.773615771895994e-05, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 6, 'gamma': 0.13790269046010722, 'reg_alpha': 0.2981029943815736, 'reg_lambda': 0.3444735940098536}. Best is trial 56 with value: 0.8212288883579116.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:31:54,348] Trial 65 finished with value: 0.77269726123579 and parameters: {'max_depth': 8, 'learning_rate': 2.1851832376648342e-05, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'gamma': 0.1334761885241927, 'reg_alpha': 0.6467370987605287, 'reg_lambda': 0.311559402151992}. Best is trial 56 with value: 0.8212288883579116.
  'learning_r

[I 2023-07-18 23:32:04,079] Trial 69 finished with value: 0.7430222147544568 and parameters: {'max_depth': 6, 'learning_rate': 7.493466044726071e-05, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 5, 'gamma': 0.07061914142274778, 'reg_alpha': 0.23525077475486497, 'reg_lambda': 0.5391717814977446}. Best is trial 56 with value: 0.8212288883579116.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:32:05,989] Trial 70 finished with value: 0.7072840443561865 and parameters: {'max_depth': 7, 'learning_rate': 2.554120249645812e-05, 'subsample': 0.6, 'colsample_bytree': 0.8, 'min_child_weight': 15, 'gamma': 0.10894363805767265, 'reg_alpha': 0.28592748643211197, 'reg_lambda': 0.44939657232922986}. Best is trial 56 with value: 0.8212288883579116.
  'lea

[I 2023-07-18 23:32:18,548] Trial 74 finished with value: 0.7317157432529242 and parameters: {'max_depth': 5, 'learning_rate': 3.1769913634112e-05, 'subsample': 0.6, 'colsample_bytree': 0.4, 'min_child_weight': 1, 'gamma': 0.12705024661097258, 'reg_alpha': 0.3225466531601524, 'reg_lambda': 0.914203897827018}. Best is trial 56 with value: 0.8212288883579116.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:32:22,104] Trial 75 finished with value: 0.8173413832129148 and parameters: {'max_depth': 6, 'learning_rate': 4.538897092088949e-05, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 2, 'gamma': 0.19695417389232234, 'reg_alpha': 0.15589330410396937, 'reg_lambda': 0.6363227640299294}. Best is trial 56 with value: 0.8212288883579116.
  'learning_

[I 2023-07-18 23:32:31,292] Trial 79 finished with value: 0.7455501407773092 and parameters: {'max_depth': 7, 'learning_rate': 3.652743507197386e-05, 'subsample': 0.4, 'colsample_bytree': 0.7, 'min_child_weight': 3, 'gamma': 0.09166539939915426, 'reg_alpha': 0.15263903974967805, 'reg_lambda': 0.4606171583492119}. Best is trial 56 with value: 0.8212288883579116.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:32:33,459] Trial 80 finished with value: 0.5290395179868864 and parameters: {'max_depth': 5, 'learning_rate': 2.1555965463926334e-05, 'subsample': 0.8, 'colsample_bytree': 0.3, 'min_child_weight': 7, 'gamma': 0.18730366211362914, 'reg_alpha': 0.21574767644111348, 'reg_lambda': 1.1752641912193331}. Best is trial 56 with value: 0.8212288883579116.
  'lear

[I 2023-07-18 23:32:44,718] Trial 84 finished with value: 0.7749830894819993 and parameters: {'max_depth': 7, 'learning_rate': 1.3091759291941594e-05, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 4, 'gamma': 0.09968415312280211, 'reg_alpha': 0.19318231639772243, 'reg_lambda': 0.5139535136383867}. Best is trial 56 with value: 0.8212288883579116.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:32:48,651] Trial 85 finished with value: 0.8151367278596944 and parameters: {'max_depth': 7, 'learning_rate': 2.693171501481475e-05, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 1, 'gamma': 0.11855237678593808, 'reg_alpha': 0.3351927813497589, 'reg_lambda': 0.7042806482076978}. Best is trial 56 with value: 0.8212288883579116.
  'learn

[I 2023-07-18 23:33:00,882] Trial 89 finished with value: 0.7967446825411785 and parameters: {'max_depth': 4, 'learning_rate': 1.6233332972231305e-05, 'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_weight': 1, 'gamma': 0.07805908495260526, 'reg_alpha': 0.11619245076683175, 'reg_lambda': 1.1236557743850695}. Best is trial 56 with value: 0.8212288883579116.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:33:03,808] Trial 90 finished with value: 0.7570375913270118 and parameters: {'max_depth': 5, 'learning_rate': 2.5052474140274172e-05, 'subsample': 0.7, 'colsample_bytree': 0.9, 'min_child_weight': 4, 'gamma': 0.12641048181279896, 'reg_alpha': 0.1615868902235576, 'reg_lambda': 0.6462371599973301}. Best is trial 56 with value: 0.8212288883579116.
  'lear

[I 2023-07-18 23:33:16,278] Trial 94 finished with value: 0.8280592211654128 and parameters: {'max_depth': 7, 'learning_rate': 3.935770307293339e-05, 'subsample': 0.6, 'colsample_bytree': 0.6, 'min_child_weight': 1, 'gamma': 0.10295543042219762, 'reg_alpha': 0.18723120322324682, 'reg_lambda': 0.6128169300568804}. Best is trial 93 with value: 0.8318631427340403.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'gamma': trial.suggest_loguniform('gamma', 0.01, 1.0),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.01, 10.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.01, 10.0)
[I 2023-07-18 23:33:18,526] Trial 95 finished with value: 0.7095396143386472 and parameters: {'max_depth': 6, 'learning_rate': 3.933706636755788e-05, 'subsample': 0.5, 'colsample_bytree': 0.6, 'min_child_weight': 5, 'gamma': 0.1118878817309439, 'reg_alpha': 0.2670493285307297, 'reg_lambda': 0.571860389350705}. Best is trial 93 with value: 0.8318631427340403.
  'learning

[I 2023-07-18 23:33:31,855] Trial 99 finished with value: 0.7277496195997861 and parameters: {'max_depth': 7, 'learning_rate': 6.550425685489964e-05, 'subsample': 0.7, 'colsample_bytree': 0.6, 'min_child_weight': 6, 'gamma': 0.09561161515874066, 'reg_alpha': 0.11185057825476913, 'reg_lambda': 1.1750183427619179}. Best is trial 97 with value: 0.8408031788883619.


In [20]:
best_params = study.best_params
print('best_params: ', best_params)
best_score = study.best_value
print('best_score: ', best_score)

best_params:  {'max_depth': 7, 'learning_rate': 5.495568561536644e-05, 'subsample': 0.7, 'colsample_bytree': 0.6, 'min_child_weight': 1, 'gamma': 0.09713551852723058, 'reg_alpha': 0.14538846198743027, 'reg_lambda': 0.9524402910037023}
best_score:  0.8408031788883619


In [21]:
# All the details of the best trial
study.best_trial

FrozenTrial(number=97, state=TrialState.COMPLETE, values=[0.8408031788883619], datetime_start=datetime.datetime(2023, 7, 18, 23, 33, 22, 541704), datetime_complete=datetime.datetime(2023, 7, 18, 23, 33, 26, 463358), params={'max_depth': 7, 'learning_rate': 5.495568561536644e-05, 'subsample': 0.7, 'colsample_bytree': 0.6, 'min_child_weight': 1, 'gamma': 0.09713551852723058, 'reg_alpha': 0.14538846198743027, 'reg_lambda': 0.9524402910037023}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=1, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=1e-05, step=None), 'subsample': CategoricalDistribution(choices=(0.4, 0.5, 0.6, 0.7, 0.8, 1.0)), 'colsample_bytree': CategoricalDistribution(choices=(0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)), 'min_child_weight': IntDistribution(high=50, log=False, low=1, step=1), 'gamma': FloatDistribution(high=1.0, log=True, low=0.01, step=None), 'reg_alpha': FloatDistribut

In [22]:
# testing the model
test_df = pd.read_csv('test.csv')
test_df.head(3)

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,00eed32682bb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,010ebe33f668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,02fa521e1838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
new_test_df = test_df.drop(['Id'],axis=1)
new_test_df.head(3)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
encoded_test_df = enc.transform(new_test_df)
encoded_test_df.head(3)

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
clf = XGBClassifier(**study.best_params)
clf.fit(encoded_X,y)

In [26]:
#X_test_df = test_df.drop(['Id'], axis=1)

# Extract IDs from the test dataset
test_ids = test_df['Id']

# Make predictions
predictions = clf.predict_proba(encoded_test_df)

# Create a new DataFrame with prediction results
prediction_df = pd.DataFrame({'Id': test_ids, 'class_0': predictions[:, 0], 'class_1': predictions[:, 1]})

# Print the prediction DataFrame
print(prediction_df)

             Id   class_0   class_1
0  00eed32682bb  0.500758  0.499242
1  010ebe33f668  0.500758  0.499242
2  02fa521e1838  0.500758  0.499242
3  040e15f562a2  0.500758  0.499242
4  046e85c7cc7f  0.500758  0.499242


In [27]:
prediction_df.to_csv('submission.csv', index=False)