In [None]:
import optuna 
from xgboost import XGBClassifier
from optuna.integration import XGBoostPruningCallback 
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


# Read the cleaned train and test tables from disk.
train_df = pd.read_csv(
    "/home/mwanikii/Documents/data_science_projects/financial_inclusivity/input/train_clean.csv"
)
test_df = pd.read_csv(
    "/home/mwanikii/Documents/data_science_projects/financial_inclusivity/input/test_clean.csv"
)

# For each table, separate the "bank_account" column (y_*) from every other
# column (X_*). `drop` returns a new frame, so train_df / test_df keep the
# column.
X_train = train_df.drop(columns=["bank_account"])
y_train = train_df["bank_account"]
X_test = test_df.drop(columns=["bank_account"])
y_test = test_df["bank_account"]
 
def objective(trial, X, y):
    """Return the mean cross-validated log loss for one Optuna trial.

    Hyperparameters for an ``XGBClassifier`` are sampled from ``trial``; the
    classifier is then fitted on each split of a shuffled, stratified 5-fold
    cross-validation with early stopping on the held-out fold, and the
    held-out log loss of every fold is averaged.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Class labels aligned row-for-row with ``X`` (pandas Series or any
            array-like accepted by ``np.asarray``).

    Returns:
        The mean log loss over the folds (lower is better).
    """
    parameter_grid = {
        # Single-choice categorical so the value is recorded with the trial.
        # It is only an upper bound: early stopping ends training once the
        # validation loss has not improved for 100 rounds.
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_leaves": trial.suggest_int("max_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        # The Optuna parameter names stay "lambda"/"alpha"; they are passed to
        # the scikit-learn wrapper under its documented keyword names.
        "reg_lambda": trial.suggest_int("lambda", 0, 100, step=5),
        "reg_alpha": trial.suggest_int("alpha", 0, 100, step=5),
        "gamma": trial.suggest_float("gamma", 0, 15),
    }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

    # cv.split yields positional indices. Indexing a Series with them via
    # y[...] would be label-based, so work on a plain array instead.
    y_values = np.asarray(y)

    fold_losses = []
    for fit_idx, val_idx in cv.split(X, y_values):
        X_fit, X_val = X.iloc[fit_idx], X.iloc[val_idx]
        y_fit, y_val = y_values[fit_idx], y_values[val_idx]

        # The evaluation metric and early stopping are estimator settings in
        # current XGBoost releases (fit() no longer accepts them).
        model = XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            early_stopping_rounds=100,
            **parameter_grid,
        )
        model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)])

        val_proba = model.predict_proba(X_val)
        # Score the fold with the same metric that drives early stopping.
        # Passing the label set keeps log_loss well defined even if a fold's
        # held-out part happens to contain a single class.
        fold_losses.append(log_loss(y_val, val_proba, labels=model.classes_))

    return np.mean(fold_losses)
