In [1]:
import optuna 
from xgboost import XGBClassifier
from optuna.integration import XGBoostPruningCallback 
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np


train_df = pd.read_csv("/home/mwanikii/Documents/data_science_projects/financial_inclusivity/input/train_clean.csv")
test_df =  pd.read_csv("/home/mwanikii/Documents/data_science_projects/financial_inclusivity/input/test_clean.csv")

#X_train, y_train and X_test
y = train_df["bank_account"]
X = train_df.drop(["bank_account"], axis=1)
X_test = test_df.drop(["bank_account"], axis=1)
y_test = test_df["bank_account"]
 
def objective(trial, X, y):
    parameter_grid = {

        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_leaves": trial.suggest_int("max_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda": trial.suggest_int("lambda", 0, 100, step=5),
        "alpha": trial.suggest_int("alpha", 0, 100, step=5),
        "gamma": trial.suggest_float("gamma", 0, 15),
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    cv_scores = np.empty(5)


    
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = XGBClassifier(objective="binary:logistic", **parameter_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=100,
            callbacks=[XGBoostPruningCallback(trial, "validation_0-logloss")])

        preds = model.predict(X_test)
        cv_scores[idx] = log_loss(y_test, preds)

    return np.mean(cv_scores)

study = optuna.create_study(direction="minimize", study_name="XGB Classifier")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=20)



KeyboardInterrupt: 

In [2]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")
    


	Best value (rmse): 3.87322
	Best params:
		n_estimators: 10000
		learning_rate: 0.20172781199546924
		max_leaves: 2560
		max_depth: 8
		max_bin: 280
		lambda: 40
		alpha: 5
		gamma: 3.3910506390918242
