# Imports

In [None]:
# run as shell instead of python

!pip uninstall --quiet -y lightgbm cupy dask rapids-dask-dependency
!pip install --quiet lightgbm optuna optuna-integration

In [None]:
import warnings, random
import numpy as np
import pandas as pd

import os
os.environ["LIGHTGBM_USE_DASK"] = "0"
import lightgbm as lgb

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

import optuna

# Notebook-wide configuration.

# One seed value, applied to both the stdlib RNG and NumPy's legacy global RNG.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Suppress every warning category from this point on.
warnings.filterwarnings(action="ignore")


---
# Dataset

In [None]:
# Selects the data source: "california" (scikit-learn's California housing data)
# or "custo" (a Kaggle dataset whose target column is named 'custo').
loaded_dataset = "custo"

if loaded_dataset == "california":
    from sklearn.datasets import fetch_california_housing

    # as_frame=True returns pandas objects; the preprocessing cell reads
    # X.columns and per-column dtypes, which a plain ndarray does not offer.
    data = fetch_california_housing(as_frame=True)
    X = data.data
    y = data.target
elif loaded_dataset == "custo":
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

    # Both values are empty in this notebook.
    # NOTE(review): presumably the Kaggle dataset handle and the file name inside
    # it must be filled in before running — confirm.
    # NOTE(review): newer kagglehub releases appear to expose this loader as
    # `dataset_load` — confirm against the installed version.
    file_path = ""
    dataset_path = ""
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        dataset_path,
        file_path,
    )
    # Give the target column a dataset-independent name.
    df = df.rename(columns={'custo': 'Target'})

    X = df.drop(columns=["Target"])
    y = df["Target"]
else:
    # Fail here rather than with a NameError on X / y in a later cell.
    raise ValueError(
        f"Unknown loaded_dataset {loaded_dataset!r}; expected 'california' or 'custo'"
    )

## Preprocessing

In [None]:
# Column typing needs a DataFrame; wrap X if it was loaded as a plain array.
X_frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

# Categorical columns are those stored as object or pandas category dtype;
# everything else is treated as numeric.
cat_cols = [
    c for c in X_frame.columns
    if X_frame[c].dtype == 'object' or str(X_frame[c].dtype).startswith('category')
]
num_cols = [c for c in X_frame.columns if c not in cat_cols]

# Stratification for regression: bin the continuous target into quantiles and
# stratify the split on the bin labels (duplicate bin edges are dropped).
n_bins = 10
y_binned = pd.qcut(y, q=n_bins, duplicates='drop')

X_train, X_test, y_train, y_test = train_test_split(
    X_frame, y, test_size=0.1, random_state=RANDOM_STATE, stratify=y_binned
)

if len(cat_cols) > 0:
    # One-hot encode the categorical columns and pass the rest through unchanged.
    # The encoder is fitted on the training split only; categories unseen at fit
    # time are encoded as all zeros (handle_unknown="ignore").
    onehot_trees = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    preprocess_for_trees = ColumnTransformer(
        transformers=[("onehot_trees", onehot_trees, cat_cols)],
        remainder="passthrough"
    )

    X_train = preprocess_for_trees.fit_transform(X_train)
    X_test = preprocess_for_trees.transform(X_test)
else:
    # No encoding needed, but still hand arrays downstream: the tuning cell
    # indexes X_train / y_train positionally with KFold index arrays.
    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()

# Always convert the targets, whether or not any column was encoded.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)


---
# Optuna - Hyperparameter optimization

## Study Parameters

In [None]:
# Keyword arguments unpacked into study.optimize(...).
study_params = dict(
    show_progress_bar=True,
    # n_trials=50,
    timeout=7 * 60 * 60,  # seconds: 7 hours
)

# Names of the LightGBM parameters used by the study; starts out empty.
study_param_keys = list()

def get_params(trial: optuna.Trial) -> dict:
    """Build the LightGBM parameter dict for one Optuna trial.

    Fixed settings (objective, metric, verbosity, boosting type) are combined
    with values sampled from the trial's search space.

    Side effect: the module-level ``study_param_keys`` is replaced with the list
    of keys of the returned dict.

    Args:
        trial: The Optuna trial used to sample hyperparameter values.

    Returns:
        Parameter dict to pass to ``lgb.train``.
    """
    # Without this declaration the assignment below would only bind a local
    # name and the module-level list would stay empty.
    global study_param_keys

    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",

        # Earlier search range: 100-500.
        "n_estimators": trial.suggest_int("n_estimators", 400, 1500),
        # Earlier search range: 0.001-0.05 (log scale).
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.07, log=True),

        # Earlier search range: 2-256.
        "num_leaves": trial.suggest_int("num_leaves", 8, 64),
        # Earlier search range: 5-100.
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 120),

        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without it the tuned "subsample" value has no effect.
        "subsample_freq": 1,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),

        "lambda_l1": trial.suggest_float("lambda_l1", 0.1, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1.0, 20.0, log=True),
    }
    study_param_keys = list(params)

    return params

## Objective and study run

In [None]:
# Name given to the Optuna study created below.
study_name = "LightGBM tuning"

def objective(trial: optuna.Trial) -> float:
    """Score one hyperparameter set by 5-fold cross-validated RMSE.

    For each fold a LightGBM model is trained on the fold's training rows and
    evaluated on its held-out rows, with early stopping after 50 rounds without
    improvement on the validation set.

    Reads the module-level ``X_train`` / ``y_train`` and indexes them with
    integer index arrays.

    Args:
        trial: The Optuna trial supplying the hyperparameters and receiving
            intermediate values through the pruning callback.

    Returns:
        Mean RMSE over the five validation folds.
    """
    params = get_params(trial)
    kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    fold_rmse = []
    for train_idx, valid_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train[train_idx], X_train[valid_idx]
        y_tr, y_val = y_train[train_idx], y_train[valid_idx]

        # Train on the fold's training rows only. Building the Dataset from the
        # full X_train / y_train would include the validation rows and make the
        # validation RMSE (and early stopping) optimistic.
        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dvalid = lgb.Dataset(X_val, label=y_val)

        # NOTE(review): the pruning callback reports per boosting iteration, and
        # iterations restart in every fold — confirm how Optuna handles the
        # repeated step numbers from folds 2-5.
        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            callbacks=[
                optuna.integration.LightGBMPruningCallback(trial, "rmse"),
                lgb.early_stopping(50, verbose=False),
            ],
        )

        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_rmse.append(rmse)

    return float(np.mean(fold_rmse))

# Create and run the study (RMSE is minimized).
# The sampler is constructed explicitly with a seed: the sampler created by
# default is not seeded, so the sequence of suggested hyperparameters would
# differ between runs. The budget in study_params is a timeout, so the number
# of completed trials can still vary from run to run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=3),
    study_name=study_name,
)
study.optimize(objective, **study_params)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)