In [1]:
import optuna
import pandas as pd
import numpy as np
import torch
import json
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score, mean_squared_error
import lightgbm as lgb

In [2]:
X = np.load("model_data\X_fgp.npy")
print(X.shape)
print(len(X))

y = np.load("model_data\y_gfp.npy")
print(y.shape)
print(len(y))

(16648, 2777)
16648
(16648,)
16648


In [3]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(13318, 2777) (13318,)
(1665, 2777) (1665,)
(1665, 2777) (1665,)


In [5]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

def objective_xgb(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 0.001, 20.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.001, 20.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.001, 20.0, log=True),
        # keep these default and don't need to tune
        "n_estimators": 2000,
        "random_state": 42,
        "n_jobs": -1,
        "early_stopping_rounds": 50,
        "tree_method": "hist",
        "device": "cuda",
    }

    losses = []
    for f, (t_id, v_id) in enumerate(cv.split(X_train)):
        X_tr, X_va = X[t_id], X[v_id]
        y_tr, y_va = y[t_id], y[v_id]

        model = XGBRegressor(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)])
        
        pred = model.predict(X_va)
        rmse = np.sqrt(mean_squared_error(y_va, pred))
        losses.append(rmse)

        trial.report(rmse, step=f)
        
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(losses))

In [5]:
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=10, show_progress_bar=True)

print("RMSE:", study_xgb.best_value)
print(study_xgb.best_params)

[32m[I 2026-02-11 11:13:11,314][0m A new study created in memory with name: no-name-9a772ae4-770e-46cc-a047-e4eda2b64903[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[0]	validation_0-rmse:0.30039
[1]	validation_0-rmse:0.29452
[2]	validation_0-rmse:0.28921
[3]	validation_0-rmse:0.28425
[4]	validation_0-rmse:0.28040
[5]	validation_0-rmse:0.27672
[6]	validation_0-rmse:0.27342
[7]	validation_0-rmse:0.27013
[8]	validation_0-rmse:0.26711
[9]	validation_0-rmse:0.26412
[10]	validation_0-rmse:0.26155
[11]	validation_0-rmse:0.25999
[12]	validation_0-rmse:0.25788
[13]	validation_0-rmse:0.25600
[14]	validation_0-rmse:0.25425
[15]	validation_0-rmse:0.25292
[16]	validation_0-rmse:0.25146
[17]	validation_0-rmse:0.25005
[18]	validation_0-rmse:0.24877
[19]	validation_0-rmse:0.24770
[20]	validation_0-rmse:0.24650
[21]	validation_0-rmse:0.24582
[22]	validation_0-rmse:0.24430
[23]	validation_0-rmse:0.24366
[24]	validation_0-rmse:0.24272
[25]	validation_0-rmse:0.24136
[26]	validation_0-rmse:0.24054
[27]	validation_0-rmse:0.23969
[28]	validation_0-rmse:0.23870
[29]	validation_0-rmse:0.23775
[30]	validation_0-rmse:0.23727
[31]	validation_0-rmse:0.23662
[32]	validation_0-

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[0]	validation_0-rmse:0.30527
[1]	validation_0-rmse:0.29908
[2]	validation_0-rmse:0.29361
[3]	validation_0-rmse:0.28768
[4]	validation_0-rmse:0.28348
[5]	validation_0-rmse:0.27936
[6]	validation_0-rmse:0.27587
[7]	validation_0-rmse:0.27340
[8]	validation_0-rmse:0.27041
[9]	validation_0-rmse:0.26767
[10]	validation_0-rmse:0.26546
[11]	validation_0-rmse:0.26320
[12]	validation_0-rmse:0.26091
[13]	validation_0-rmse:0.25873
[14]	validation_0-rmse:0.25739
[15]	validation_0-rmse:0.25604
[16]	validation_0-rmse:0.25404
[17]	validation_0-rmse:0.25285
[18]	validation_0-rmse:0.25112
[19]	validation_0-rmse:0.24970
[20]	validation_0-rmse:0.24884
[21]	validation_0-rmse:0.24752
[22]	validation_0-rmse:0.24677
[23]	validation_0-rmse:0.24615
[24]	validation_0-rmse:0.24504
[25]	validation_0-rmse:0.24406
[26]	validation_0-rmse:0.24352
[27]	validation_0-rmse:0.24298
[28]	validation_0-rmse:0.24242
[29]	validation_0-rmse:0.24170
[30]	validation_0-rmse:0.24095
[31]	validation_0-rmse:0.23972
[32]	validation_0-

In [None]:
with open("model_params/xgb_params.json", "w") as f:
    json.dump(study_xgb.best_params, f, indent=2)

In [6]:
with open("model_params/xgb_params.json", "r") as f:
    params = json.load(f)

In [10]:
X_train_xgb = np.vstack([X_train, X_val])
y_train_xgb = np.concatenate([y_train, y_val])

xgb_model = XGBRegressor(
    **params,
    n_estimators=2000,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
    device="cuda"
)

xgb_model.fit(X_train_xgb, y_train_xgb)

y_pred = xgb_model.predict(X_test)
print(f"R Squared: {r2_score(y_test, y_pred):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")

R Squared: 0.724
RMSE: 0.158


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


### LGB Tuning

In [9]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def objective_lgb(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.001, 5.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.001, 5.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "subsample_freq": 1,
        
        "n_estimators": 2000,
        "random_state": 42,
        "n_jobs": -1,
    }

    losses = []
    for fold, (t_id, v_id) in enumerate(cv.split(X_train)):
        X_tr, X_va = X_train[t_id], X_train[v_id]
        y_tr, y_va = y_train[t_id], y_train[v_id]

        lgb_model = lgb.LGBMRegressor(**params)

        lgb_model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])

        lgb_pred = lgb_model.predict(X_va)
        rmse = np.sqrt(mean_squared_error(y_va, lgb_pred))
        losses.append(rmse)

        trial.report(rmse, step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(losses))

In [10]:
study_lgb = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner(n_warmup_steps=2))
study_lgb.optimize(objective_lgb, n_trials=10, show_progress_bar=True)

print("RMSE:", study_lgb.best_value)
print(study_lgb.best_params)

[32m[I 2026-02-11 14:01:03,472][0m A new study created in memory with name: no-name-ffbb3d16-737f-4aff-800f-7473cea72323[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077972 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148866
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1939
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070675 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150428
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1936
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.090684 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150133
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1929
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.227266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150308
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1925
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.078988 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150243
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1933
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:03:21,315][0m Trial 0 finished with value: 0.18013201348399902 and parameters: {'learning_rate': 0.12417264504859911, 'num_leaves': 24, 'max_depth': 10, 'min_child_samples': 52, 'reg_alpha': 0.4171863437465864, 'reg_lambda': 0.001429455097652511, 'subsample': 0.9702416320699112, 'colsample_bytree': 0.7297494775818801}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.245946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149769
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2224
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.255695 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151384
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2235
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.273143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151109
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2233
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.258878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151264
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 2229
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.287286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151153
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 2218
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:06:11,275][0m Trial 1 finished with value: 0.18833666485755649 and parameters: {'learning_rate': 0.03651165166146529, 'num_leaves': 137, 'max_depth': 4, 'min_child_samples': 35, 'reg_alpha': 1.0840177298589557, 'reg_lambda': 0.17324647478356786, 'subsample': 0.8718163766014012, 'colsample_bytree': 0.7747139108566076}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 147532
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1550
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149093
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1549
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148813
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1543
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148978
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1540
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148872
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1537
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:06:34,174][0m Trial 2 finished with value: 0.23471892833943092 and parameters: {'learning_rate': 0.10365611288804875, 'num_leaves': 45, 'max_depth': 1, 'min_child_samples': 88, 'reg_alpha': 2.544433291392748, 'reg_lambda': 0.0015526777257312938, 'subsample': 0.7611345042763796, 'colsample_bytree': 0.8691168525828625}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.110670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150745
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2540
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.091968 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152326
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2541
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152067
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2546
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152245
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 2545
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.096570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 152135
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 2536
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:09:56,775][0m Trial 3 finished with value: 0.18288035540096365 and parameters: {'learning_rate': 0.02369179915135801, 'num_leaves': 141, 'max_depth': 9, 'min_child_samples': 18, 'reg_alpha': 3.627409143898266, 'reg_lambda': 0.0031302287472706374, 'subsample': 0.7478947359325693, 'colsample_bytree': 0.8698799778211157}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 147901
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1655
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.071936 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149426
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1644
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149169
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1643
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.095450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149368
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1651
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149252
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1643
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:12:34,198][0m Trial 4 finished with value: 0.18469217332244248 and parameters: {'learning_rate': 0.017238094328022438, 'num_leaves': 34, 'max_depth': 8, 'min_child_samples': 77, 'reg_alpha': 0.28479043751700583, 'reg_lambda': 2.15883707956886, 'subsample': 0.9344889194467105, 'colsample_bytree': 0.9206075327439986}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148319
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1779
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149925
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1790
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072854 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149665
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1790
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149845
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1789
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079831 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149742
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1788
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:13:10,251][0m Trial 5 finished with value: 0.19001506135121174 and parameters: {'learning_rate': 0.15925771169270042, 'num_leaves': 177, 'max_depth': 3, 'min_child_samples': 63, 'reg_alpha': 0.0030205603529941966, 'reg_lambda': 0.2846284538648337, 'subsample': 0.8303529912638691, 'colsample_bytree': 0.7799581110462707}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070248 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148186
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1742
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068889 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149776
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1748
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149488
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1739
[LightGBM] [Info] Start training from score 0.345463




[32m[I 2026-02-11 14:13:30,901][0m Trial 6 pruned. [0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148785
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1914
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084854 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150333
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1906
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150043
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1901
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.073393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150231
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1902
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074988 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150123
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1897
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:15:02,242][0m Trial 7 finished with value: 0.18233898093425988 and parameters: {'learning_rate': 0.06929565282967196, 'num_leaves': 76, 'max_depth': 5, 'min_child_samples': 54, 'reg_alpha': 0.1003412954422577, 'reg_lambda': 0.009704257169097855, 'subsample': 0.7893805764914565, 'colsample_bytree': 0.9988348703665304}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 148729
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1899
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150301
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1897
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.075392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150009
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 1891
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150200
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1892
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087006 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150092
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 1889
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:16:02,550][0m Trial 8 finished with value: 0.18465268853112524 and parameters: {'learning_rate': 0.11052709036826124, 'num_leaves': 89, 'max_depth': 4, 'min_child_samples': 55, 'reg_alpha': 0.3718599474232717, 'reg_lambda': 0.0037660990237973323, 'subsample': 0.7394207988224052, 'colsample_bytree': 0.7896723096429956}. Best is trial 0 with value: 0.18013201348399902.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066979 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149977
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2287
[LightGBM] [Info] Start training from score 0.346192




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151533
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2282
[LightGBM] [Info] Start training from score 0.344651




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089481 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151261
[LightGBM] [Info] Number of data points in the train set: 10654, number of used features: 2282
[LightGBM] [Info] Start training from score 0.345463




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151439
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 2282
[LightGBM] [Info] Start training from score 0.344344




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.085745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 151365
[LightGBM] [Info] Number of data points in the train set: 10655, number of used features: 2284
[LightGBM] [Info] Start training from score 0.344269




[32m[I 2026-02-11 14:17:21,740][0m Trial 9 finished with value: 0.18291773337448242 and parameters: {'learning_rate': 0.12966209361891926, 'num_leaves': 101, 'max_depth': 10, 'min_child_samples': 32, 'reg_alpha': 1.5413038955985914, 'reg_lambda': 1.9224400920349358, 'subsample': 0.841353352424477, 'colsample_bytree': 0.9476458834225385}. Best is trial 0 with value: 0.18013201348399902.[0m
RMSE: 0.18013201348399902
{'learning_rate': 0.12417264504859911, 'num_leaves': 24, 'max_depth': 10, 'min_child_samples': 52, 'reg_alpha': 0.4171863437465864, 'reg_lambda': 0.001429455097652511, 'subsample': 0.9702416320699112, 'colsample_bytree': 0.7297494775818801}


In [11]:
with open('model_params/lgb_params.json', 'w') as f:
    json.dump(study_lgb.best_params, f, indent=2)

In [4]:
with open('model_params/lgb_params.json', 'r') as f:
    lgb_params = json.load(f)

In [5]:
lgb_model = lgb.LGBMRegressor(**lgb_params, n_estimators=2000, random_state=42)
lgb_model.fit(X_train, y_train, eval_metric="rmse", eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])

y_val_lgb = lgb_model.predict(X_val)
print("validation:")
print(f"R Squared: {r2_score(y_val, y_val_lgb):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_val, y_val_lgb)):.3f}")
print()
lgb_pred = lgb_model.predict(X_test)
print("test:")
print(f"R Squared: {r2_score(y_test, lgb_pred):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, lgb_pred)):.3f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 153095
[LightGBM] [Info] Number of data points in the train set: 13318, number of used features: 2102
[LightGBM] [Info] Start training from score 0.344984
validation:
R Squared: 0.652
RMSE: 0.184

test:
R Squared: 0.701
RMSE: 0.165




### MLP Regressor Tuning

In [6]:
X_train_mlp = np.vstack([X_train, X_val])
y_train_mlp = np.concatenate([y_train, y_val])

cv = KFold(n_splits=5, shuffle=True, random_state=42)

def objective_mlp(trial):
    n_layers = trial.suggest_int("n_layers", 1, 4)
    
    sizes = []
    for i in range(n_layers):
        sizes.append(trial.suggest_int(f"n_units_l{i}", 32, 512, log=True))
    hidden = tuple(sizes)

    params = {
        "hidden_layer_sizes": hidden,
        "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
        "learning_rate_init": trial.suggest_float("learning_rate_init", 0.001, 0.01, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [64, 128, 256]),
        "alpha": trial.suggest_float("alpha", 1e-6, 1e-1, log=True),
        
        "solver": "adam",
        "max_iter": 1000,  
        "random_state": 42,
        "early_stopping": True,
        "n_iter_no_change": 20,
    }

    losses = []
    for fold, (t_id, v_id) in enumerate(cv.split(X_train_mlp)):
        X_tr, X_va = X_train_mlp[t_id], X_train_mlp[v_id]
        y_tr, y_va = y_train_mlp[t_id], y_train_mlp[v_id]

        mlp_model = Pipeline([("scaler", StandardScaler(with_mean=True, with_std=True)), ("mlp", MLPRegressor(**params))])

        mlp_model.fit(X_tr, y_tr)
        mlp_pred = mlp_model.predict(X_va)
        
        rmse = np.sqrt(mean_squared_error(y_va, mlp_pred))
        losses.append(rmse)

        trial.report(rmse, step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(losses))

In [7]:
study_mlp = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner(n_warmup_steps=2))
study_mlp.optimize(objective_mlp, n_trials=10, show_progress_bar=True)

print("RMSE:", study_mlp.best_value)
print(study_mlp.best_params)

[32m[I 2026-02-11 14:37:47,620][0m A new study created in memory with name: no-name-debb3a84-64ed-41bf-9198-0081051a5aaa[0m


  0%|          | 0/10 [00:00<?, ?it/s]

[32m[I 2026-02-11 14:43:36,525][0m Trial 0 finished with value: 0.20511133561510184 and parameters: {'n_layers': 1, 'n_units_l0': 323, 'activation': 'tanh', 'learning_rate_init': 0.0069844819916329865, 'batch_size': 256, 'alpha': 0.010850928217907285}. Best is trial 0 with value: 0.20511133561510184.[0m
[32m[I 2026-02-11 14:46:46,678][0m Trial 1 finished with value: 0.20327647717961478 and parameters: {'n_layers': 1, 'n_units_l0': 33, 'activation': 'relu', 'learning_rate_init': 0.0016184527978948988, 'batch_size': 64, 'alpha': 0.01557931037284271}. Best is trial 1 with value: 0.20327647717961478.[0m
[32m[I 2026-02-11 14:49:50,103][0m Trial 2 finished with value: 0.1941030431970717 and parameters: {'n_layers': 4, 'n_units_l0': 113, 'n_units_l1': 36, 'n_units_l2': 37, 'n_units_l3': 170, 'activation': 'relu', 'learning_rate_init': 0.007961022260431602, 'batch_size': 256, 'alpha': 0.0006014806653372383}. Best is trial 2 with value: 0.1941030431970717.[0m
[32m[I 2026-02-11 15:11:1

In [9]:
with open('model_params/mlp_params.json', 'w') as f:
    json.dump(study_mlp.best_params, f, indent=2)

In [10]:
with open('model_params/mlp_params.json', 'r') as f:
    mlp_params = json.load(f)

In [11]:
n_layers = mlp_params["n_layers"]
hidden = tuple(mlp_params[f"n_units_l{i}"] for i in range(n_layers))

mlp_model = Pipeline([
    ("scaler", StandardScaler()),
    ("mlp", MLPRegressor(
        hidden_layer_sizes=hidden,
        activation=mlp_params["activation"],
        learning_rate_init=mlp_params["learning_rate_init"],
        batch_size=mlp_params["batch_size"],
        alpha=mlp_params["alpha"],

        solver="adam",
        max_iter=2000,
        random_state=42,
        early_stopping=True,
        n_iter_no_change=20,
    ))
])

mlp_model.fit(X_train_mlp, y_train_mlp)

mlp_pred = mlp_model.predict(X_test)
print("test:")
print(f"R2: {r2_score(y_test, mlp_pred):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, mlp_pred)):.3f}")

test:
R2: 0.695
RMSE: 0.166
