# Imports

In [None]:
# Notebook shell escape: the leading "!" makes the next line run in the system shell, not as Python

!pip install --quiet optuna optuna-integration pytorch-tabnet

In [None]:
import warnings, random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

import torch
from pytorch_tabnet.tab_model import TabNetRegressor

import optuna

# Single seed shared by every seeded component in the notebook.
RANDOM_STATE = 42

# Blanket-suppress all warnings to keep the long tuning log readable.
warnings.filterwarnings(action="ignore")

# Seed both the standard-library and the NumPy global generators.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)


---
# Dataset

In [None]:
# Which dataset to load: "california" (scikit-learn demo data) or "custo"
# (a Kaggle dataset whose target column is named 'custo').
loaded_dataset = "custo"

if loaded_dataset == "california":
    from sklearn.datasets import fetch_california_housing

    # as_frame=True returns a DataFrame / Series instead of NumPy arrays.
    # The preprocessing cell relies on `X.columns`, `X[c].dtype` and
    # `y_train.values`, all of which require pandas objects.
    data = fetch_california_housing(as_frame=True)
    X = data.data
    y = data.target
elif loaded_dataset == "custo":
    import kagglehub
    from kagglehub import KaggleDatasetAdapter

    # NOTE(review): both values are empty placeholders here -- fill in the
    # file name inside the dataset and the Kaggle dataset handle before running.
    file_path = ""
    dataset_path = ""
    df = kagglehub.load_dataset(
        KaggleDatasetAdapter.PANDAS,
        dataset_path,
        file_path,
    )
    # Normalise the target column name so the rest of the notebook is
    # independent of the dataset's own naming.
    df = df.rename(columns={'custo': 'Target'})

    X = df.drop(columns=["Target"])
    y = df["Target"]
else:
    # Fail here rather than with a NameError on `X` in a later cell.
    raise ValueError(
        f"Unknown loaded_dataset {loaded_dataset!r}; expected 'california' or 'custo'"
    )

## Preprocessing

In [None]:
# Split the feature columns by dtype: object/category columns are one-hot
# encoded, every other column is standardised.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

# Stratification for regression: bucket the continuous target into quantile
# bins and stratify the split on the bin index. `labels=False` yields plain
# integer codes rather than Interval objects, so the stratifier does not have
# to sort an object array. `duplicates='drop'` merges bins with identical edges.
n_bins = 10
y_binned = pd.qcut(y, q=n_bins, labels=False, duplicates='drop')

# Hold out 10% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=RANDOM_STATE, stratify=y_binned
)

# Dense one-hot output; categories unseen during fit are encoded as all zeros.
onehot_tab = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

scaler_standard = StandardScaler()

preprocess_for_tab = ColumnTransformer(
    transformers=[
        ("onehot_encoder", onehot_tab, cat_cols),
        ("scaler_standard", scaler_standard, num_cols),
    ],
    remainder="passthrough",
)

# Fit the encoders on the training split only, then reuse them on the test
# split so no test-set statistics leak into the preprocessing.
X_train = preprocess_for_tab.fit_transform(X_train)
X_test = preprocess_for_tab.transform(X_test)

# Targets become float32 column vectors of shape (n_samples, 1).
y_train = y_train.values.astype(np.float32).reshape(-1, 1)
y_test = y_test.values.astype(np.float32).reshape(-1, 1)


---
# Optuna - Hyperparameter optimization

In [None]:
# Keyword arguments forwarded to `study.optimize`.
# The search is bounded by wall-clock time (7 hours, in seconds); a fixed
# trial budget could be used instead by adding e.g. n_trials=50.
study_params = dict(
    show_progress_bar=True,
    timeout=7 * 60 * 60,
)

def get_params(trial: optuna.Trial) -> dict:
    """Sample one TabNet configuration from the search space.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        Keyword arguments for ``TabNetRegressor``. The seed, mask type and
        optimizer class are fixed; the remaining entries are sampled.
        The learning rate is registered with Optuna under the name
        ``"learning_rate"`` and passed to the optimizer as ``lr``.
    """
    # Network size: integer ranges.
    n_d = trial.suggest_int("n_d", 8, 16)
    n_a = trial.suggest_int("n_a", 8, 16)
    n_steps = trial.suggest_int("n_steps", 3, 5)

    # gamma is sampled uniformly, lambda_sparse on a log scale.
    gamma = trial.suggest_float("gamma", 1.0, 1.5)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-4, 1e-2, log=True)

    # Learning rate, also on a log scale.
    lr = trial.suggest_float("learning_rate", 1e-3, 1e-2, log=True)

    return dict(
        seed=42,
        n_d=n_d,
        n_a=n_a,
        n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        mask_type="entmax",
        optimizer_fn=torch.optim.Adam,
        optimizer_params={"lr": lr},
    )
 

In [None]:
# Label under which the Optuna study is created.
study_name = "tabnet tuning"

def objective(trial: optuna.Trial) -> float:
    """Score one hyperparameter configuration by 5-fold cross-validated RMSE.

    Reads the module-level ``X_train`` / ``y_train`` arrays, trains a fresh
    ``TabNetRegressor`` on each fold and reports the running mean RMSE to
    Optuna after every fold so the pruner can stop unpromising trials early.

    Args:
        trial: Optuna trial that supplies the hyperparameters (via
            ``get_params``) and receives the intermediate values.

    Returns:
        Mean validation RMSE over all folds (lower is better).

    Raises:
        optuna.TrialPruned: If the pruner decides to stop the trial before
            the last fold has been evaluated.
    """
    params = get_params(trial)

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    fold_rmse = []

    for fold_i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train), start=1):
        X_tr, X_val = X_train[train_idx], X_train[valid_idx]
        y_tr, y_val = y_train[train_idx], y_train[valid_idx]

        # A new model per fold so no weights carry over between folds.
        model = TabNetRegressor(**params)

        # The validation fold is monitored with RMSE; training is capped at
        # 200 epochs with a patience of 20.
        model.fit(
            X_tr,
            y_tr,
            eval_set=[(X_val, y_val)],
            eval_name=["val"],
            eval_metric=["rmse"],
            max_epochs=200,
            patience=20,
            batch_size=256,
            virtual_batch_size=64,
            drop_last=False,
        )

        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        fold_rmse.append(rmse)

        # Report the running mean so trials are compared on the same quantity
        # that is finally returned.
        mean_rmse_so_far = np.mean(fold_rmse)
        trial.report(mean_rmse_so_far, step=fold_i)

        # Only consult the pruner while folds remain. After the final fold all
        # the work is done, so pruning would save nothing and would discard a
        # fully evaluated result by marking the trial PRUNED, not COMPLETE.
        if fold_i < n_splits and trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_rmse))

# The objective reports one intermediate value per CV fold (steps 1..5).
# With n_warmup_steps=5 the pruner was disabled for folds 1-4 and could only
# act once every fold had already been trained, so it never saved any compute.
# n_warmup_steps=2 keeps the first fold's score from triggering a prune on its
# own while still allowing unpromising trials to stop early.
# The sampler is seeded so the sequence of suggested configurations is
# repeatable, in line with the seeding used elsewhere in the notebook.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=2),
    study_name=study_name,
)

# Run the search with the limits defined in `study_params`.
study.optimize(objective, **study_params)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)