# Hyperparameter Tuning with Optuna

## 1. Objective
Our previous experiments showed that the Tree-based models (XGBoost, LightGBM, CatBoost) trained on 2D features are the strongest. To beat our best score (23.97), we will rigorously optimize their hyperparameters using **Optuna**.

We will optimize for **MAE** (Mean Absolute Error) using 5-Fold Cross-Validation.

In [1]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import sys
import os

# Set Optuna verbosity
optuna.logging.set_verbosity(optuna.logging.INFO)

# Load Data
train_df = pd.read_csv('../data/processed/train_featurized.csv')
test_df = pd.read_csv('../data/processed/test_featurized.csv')

X = train_df.drop(['id', 'SMILES', 'Tm'], axis=1)
y = train_df['Tm']

print(f"Data Loaded: {X.shape}")

Data Loaded: (2662, 2856)


## 2. XGBoost Optimization

In [2]:
def objective_xgb(trial):
    params = {
        'n_estimators': 3000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_jobs': -1,
        'random_state': 42,
        'early_stopping_rounds': 100
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = xgb.XGBRegressor(**params)
        
        # Fit with early stopping
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        
        preds = model.predict(X_val)
        mae_scores.append(mean_absolute_error(y_val, preds))

    return np.mean(mae_scores)

print("Tuning XGBoost...")
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=20) # Running 20 trials for demonstration, increase for real run

print("Best XGB Params:", study_xgb.best_params)

[I 2026-01-12 08:43:34,598] A new study created in memory with name: no-name-d6534677-e6d9-4e19-b19a-ee5178b413cb


Tuning XGBoost...


[I 2026-01-12 08:49:03,158] Trial 0 finished with value: 28.429271867383754 and parameters: {'learning_rate': 0.005647754396745869, 'max_depth': 4, 'subsample': 0.7901385077662257, 'colsample_bytree': 0.8588804383759647, 'reg_alpha': 0.0011873047061612607, 'reg_lambda': 3.1075229186109445, 'min_child_weight': 2}. Best is trial 0 with value: 28.429271867383754.
[I 2026-01-12 08:50:25,677] Trial 1 finished with value: 27.783555694913712 and parameters: {'learning_rate': 0.04575566639924989, 'max_depth': 7, 'subsample': 0.7264707081945254, 'colsample_bytree': 0.9704555057741865, 'reg_alpha': 0.019057750144696822, 'reg_lambda': 0.018098380182874522, 'min_child_weight': 8}. Best is trial 1 with value: 27.783555694913712.
[I 2026-01-12 08:55:37,689] Trial 2 finished with value: 28.073921502909748 and parameters: {'learning_rate': 0.005183970221330391, 'max_depth': 8, 'subsample': 0.9919805431799427, 'colsample_bytree': 0.8310736494844495, 'reg_alpha': 5.3021595462380455, 'reg_lambda': 0.0050

Best XGB Params: {'learning_rate': 0.032149192305104574, 'max_depth': 5, 'subsample': 0.8182263532180157, 'colsample_bytree': 0.6438617477333907, 'reg_alpha': 0.011663247389322558, 'reg_lambda': 0.9180159830270932, 'min_child_weight': 1}


## 3. LightGBM Optimization

In [3]:
def objective_lgb(trial):
    params = {
        'n_estimators': 3000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'n_jobs': -1,
        'random_state': 42,
        'verbosity': -1
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMRegressor(**params)
        
        # LightGBM requires specific callbacks for early stopping in recent versions
        callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=False)]
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=callbacks
        )
        
        preds = model.predict(X_val)
        mae_scores.append(mean_absolute_error(y_val, preds))

    return np.mean(mae_scores)

print("Tuning LightGBM...")
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(objective_lgb, n_trials=20)

print("Best LightGBM Params:", study_lgb.best_params)

[I 2026-01-12 09:51:45,728] A new study created in memory with name: no-name-0e85ed81-a2ca-44cd-bd72-d157c89cfa56


Tuning LightGBM...


[I 2026-01-12 09:51:50,609] Trial 0 finished with value: 29.804475712302104 and parameters: {'learning_rate': 0.07792715728137667, 'num_leaves': 100, 'subsample': 0.7308420366559001, 'colsample_bytree': 0.793287680993924, 'reg_alpha': 0.0077265837915390595, 'reg_lambda': 0.0567537194418649, 'min_child_samples': 92}. Best is trial 0 with value: 29.804475712302104.
[I 2026-01-12 09:52:06,385] Trial 1 finished with value: 29.42913417264569 and parameters: {'learning_rate': 0.013094552320005587, 'num_leaves': 133, 'subsample': 0.8175064015940465, 'colsample_bytree': 0.7136212331609348, 'reg_alpha': 0.19023705831244916, 'reg_lambda': 0.1859263122277528, 'min_child_samples': 79}. Best is trial 1 with value: 29.42913417264569.
[I 2026-01-12 09:52:13,728] Trial 2 finished with value: 29.526610777268104 and parameters: {'learning_rate': 0.04063730025912167, 'num_leaves': 77, 'subsample': 0.6372510039798597, 'colsample_bytree': 0.8361164256668896, 'reg_alpha': 0.15853535462810997, 'reg_lambda': 

Best LightGBM Params: {'learning_rate': 0.009885825629476801, 'num_leaves': 45, 'subsample': 0.8550205074388135, 'colsample_bytree': 0.6386218409676736, 'reg_alpha': 0.0044909797577605826, 'reg_lambda': 0.0031770350899868174, 'min_child_samples': 5}


## 4. CatBoost Optimization

In [4]:
def objective_cat(trial):
    params = {
        'iterations': 3000,
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1.0, 10.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': 254,
        'verbose': False,
        'random_state': 42,
        'loss_function': 'MAE', # CatBoost can optimize MAE directly
        'task_type': 'CPU'
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    mae_scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostRegressor(**params)
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=100,
            verbose=False
        )
        
        preds = model.predict(X_val)
        mae_scores.append(mean_absolute_error(y_val, preds))

    return np.mean(mae_scores)

print("Tuning CatBoost...")
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(objective_cat, n_trials=10) # CatBoost is slower, fewer trials

print("Best CatBoost Params:", study_cat.best_params)

[I 2026-01-12 10:04:58,470] A new study created in memory with name: no-name-3ad61783-f978-46b5-8ad4-8af2aed34c8b


Tuning CatBoost...


[I 2026-01-12 10:10:51,877] Trial 0 finished with value: 27.09308222053185 and parameters: {'learning_rate': 0.0352337226770881, 'depth': 8, 'l2_leaf_reg': 7.68515693761527, 'random_strength': 9.679270622158992, 'bagging_temperature': 0.4743148262571497}. Best is trial 0 with value: 27.09308222053185.
[I 2026-01-12 10:13:08,979] Trial 1 finished with value: 29.922480957130375 and parameters: {'learning_rate': 0.008394403414311994, 'depth': 4, 'l2_leaf_reg': 4.075069765987971, 'random_strength': 3.5779950705963364, 'bagging_temperature': 0.5240983434273802}. Best is trial 0 with value: 27.09308222053185.
[I 2026-01-12 10:15:11,584] Trial 2 finished with value: 27.319745184561896 and parameters: {'learning_rate': 0.062344870892394696, 'depth': 6, 'l2_leaf_reg': 0.1181787488370042, 'random_strength': 3.6071505352094424, 'bagging_temperature': 0.012469266767700327}. Best is trial 0 with value: 27.09308222053185.
[I 2026-01-12 10:17:39,485] Trial 3 finished with value: 28.758612583220163 an

Best CatBoost Params: {'learning_rate': 0.03859044348158982, 'depth': 8, 'l2_leaf_reg': 0.8643903376115983, 'random_strength': 7.102351169364942, 'bagging_temperature': 0.10904200877579029}
