# Imports

In [None]:
# run as shell instead of python

!pip install --quiet catboost pytorch-tabnet optuna optuna-integration

In [None]:
import warnings, random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor

import optuna
from optuna.integration import CatBoostPruningCallback

# Single seed reused by every stochastic step in the notebook.
RANDOM_STATE = 42

# Seed the global generators of the standard library and of NumPy.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")


---
# Dataset

In [None]:
# Dataset selector. Supported values:
#   "california" - scikit-learn's California housing regression data
#   "custo"      - a Kaggle dataset whose target column is named 'custo'
loaded_dataset = "custo"

if loaded_dataset == "california":
    from sklearn.datasets import fetch_california_housing

    # as_frame=True returns a DataFrame / Series, which the preprocessing cell
    # needs because it inspects `X.columns` and per-column dtypes.
    data = fetch_california_housing(as_frame=True)
    X = data.data
    y = data.target
elif loaded_dataset == "custo":
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

    # Both values are empty here; presumably they must be set to the Kaggle
    # dataset handle and the file inside it before running - TODO confirm.
    file_path = ""
    dataset_path = ""
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        dataset_path,
        file_path,
    )
    # Normalise the target column name so the rest of the notebook is dataset-agnostic.
    df = df.rename(columns={'custo': 'Target'})

    X = df.drop(columns=["Target"])
    y = df["Target"]
else:
    # Fail loudly instead of leaving X / y undefined for the following cells.
    raise ValueError(
        f"Unknown dataset {loaded_dataset!r}; expected 'california' or 'custo'."
    )

## Preprocessing

In [None]:
# Work on a DataFrame so that column / dtype inspection is possible even when
# the features were loaded as a plain array.
X_frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

# Columns with object or category dtype are treated as categorical,
# everything else as numeric.
cat_cols = [
    c for c in X_frame.columns
    if X_frame[c].dtype == 'object' or str(X_frame[c].dtype).startswith('category')
]
num_cols = [c for c in X_frame.columns if c not in cat_cols]

# Stratification for regression: bin the continuous target into quantiles and
# stratify the split on the bin labels.
n_bins = 10
y_binned = pd.qcut(y, q=n_bins, duplicates='drop')

X_train, X_test, y_train, y_test = train_test_split(
    X_frame, y, test_size=0.1, random_state=RANDOM_STATE, stratify=y_binned
)

if cat_cols:
    # One-hot encode the categorical columns; categories unseen during fit
    # are encoded as all zeros. Remaining columns pass through untouched.
    onehot_trees = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    preprocess_for_trees = ColumnTransformer(
        transformers=[("onehot_trees", onehot_trees, cat_cols)],
        remainder="passthrough"
    )

    # Fit on the training split only, then apply the same mapping to the test split.
    X_train = preprocess_for_trees.fit_transform(X_train)
    X_test = preprocess_for_trees.transform(X_test)
else:
    # No encoding needed, but downstream cells index rows positionally
    # (X_train[idx]), so the features must still be NumPy arrays.
    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()

# Targets are always converted, for the same positional-indexing reason.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)


---
# Optuna - Hyperparameter optimization

## Study Parameters

In [None]:
# Keyword arguments unpacked into `study.optimize(...)`.
study_params = dict(
    show_progress_bar=True,
    # n_trials=50,
    timeout=7 * 60 * 60,  # seconds
)

def get_params(trial: optuna.Trial) -> dict:
    """Build the CatBoostRegressor keyword arguments for one Optuna trial.

    Fixed settings (loss, metric, seed, bootstrap and boosting type) are
    combined with hyperparameters sampled from ``trial``.

    Args:
        trial: The Optuna trial used to sample the tunable hyperparameters.

    Returns:
        A dict that can be unpacked into ``CatBoostRegressor(**params)``.
    """
    params = {
        # Only "loss_function" is set: "objective" is documented by CatBoost
        # as an alias of the same parameter, so passing both was redundant.
        "loss_function": "RMSE",
        "eval_metric": "RMSE",

        "random_state": RANDOM_STATE,

        # Other ranges left by the author: [0.08, 0.16], [0.04, 0.08].
        "learning_rate": trial.suggest_float(
            "learning_rate", 0.02, 0.08, log=True
        ),

        # - Number of boosting rounds (trees) to build.
        #   Higher = better fit, but higher risk of overfitting and longer training.
        #   Other range left by the author: [50, 200].
        "iterations": trial.suggest_int("iterations", 300, 1200),

        # Maximum depth of each decision tree. Controls model complexity.
        # Other range left by the author: [8, 12].
        "depth": trial.suggest_int("depth", 4, 7),

        # - L2 regularization term on leaf weights. Prevents overfitting.
        #   Higher = stronger regularization.
        #   Other range left by the author: [5, 10].
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 8, 30),

        # - Fraction of features used at each tree level (helps reduce overfitting).
        #   Other range left by the author: [0.1, 1.0].
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 0.9),

        # - Method for sampling data before each tree.
        #   "Bayesian", "Bernoulli", "MVS", or "Poisson".
        "bootstrap_type": "Bayesian",

        # "Ordered", "Plain"
        "boosting_type": "Ordered",
    }

    # The sampling-strength parameter depends on the chosen bootstrap type, so
    # it is only added to the search space for the type that is configured.
    if params["bootstrap_type"] == "Bayesian":
        # Other range left by the author: [0, 10].
        params["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 1.0, 8.0
        )
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    return params

## Objective and study run

In [None]:
# Name under which the Optuna study is created.
study_name = "catboost tuning"

def objective(trial: optuna.Trial) -> float:
    """Optuna objective: mean validation RMSE over a 5-fold cross-validation.

    For each fold a CatBoostRegressor is trained with the trial's parameters,
    using the held-out fold both as the early-stopping eval set and as the
    scoring set.

    Args:
        trial: The Optuna trial providing hyperparameters and pruning state.

    Returns:
        The mean of the per-fold RMSE values (lower is better).

    Raises:
        optuna.TrialPruned: Raised by ``check_pruned()`` when the pruning
            callback stopped training for this trial.
    """
    params = get_params(trial)

    # The fold indices are positional, so make sure we index NumPy arrays
    # (a DataFrame would interpret `X[idx]` as a column selection).
    X_all = np.asarray(X_train)
    y_all = np.asarray(y_train)

    # Use the notebook-wide seed rather than a separate hard-coded literal.
    kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    fold_rmse = []
    for train_idx, valid_idx in kf.split(X_all, y_all):
        X_tr, X_val = X_all[train_idx], X_all[valid_idx]
        y_tr, y_val = y_all[train_idx], y_all[valid_idx]

        model = CatBoostRegressor(**params)
        # NOTE(review): every fold creates a callback on the same trial, so
        # each fold reports starting from the first step again - confirm that
        # this is the pruning behaviour you want across folds.
        pruning_callback = CatBoostPruningCallback(trial, "RMSE")

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=100,
            verbose=0,
            callbacks=[pruning_callback],
        )

        # The callback only stops training; the pruned state has to be
        # checked explicitly after fit() returns.
        pruning_callback.check_pruned()

        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_rmse.append(rmse)

    # Return a built-in float, matching the annotated return type.
    return float(np.mean(fold_rmse))

# Create the study that minimises the cross-validated RMSE.
study = optuna.create_study(
    direction="minimize",
    # Seed the sampler explicitly: seeding NumPy's global RNG does not make
    # Optuna's suggestions reproducible, the sampler needs its own seed.
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    # Median pruning, disabled for the first 3 reported steps of a trial.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=3),
    study_name=study_name,
)

# Run the search with the settings from `study_params`.
study.optimize(objective, **study_params)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)