In [1]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import optuna
import os
import json
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split

from src.utils import get_kfold_data, convert_non_numeric_to_numeric, calculate_r2_score, calculate_metrics
from src.normalisation import Normaliser
from src.constants import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv(DATA_PATH)

In [3]:
data.head()

Unnamed: 0,outcome,carat,cut,color,clarity,depth,table,price,x,y,...,a6,a7,a8,a9,a10,b6,b7,b8,b9,b10
0,-26.701232,1.14,Ideal,G,VS1,62.3,56.0,7948,6.73,6.7,...,0.168836,-0.273758,1.107832,1.247795,0.482344,0.489511,-0.321138,0.573382,0.446871,-1.990581
1,6.548093,0.38,Premium,H,VS2,60.5,59.0,898,4.69,4.66,...,-0.256549,0.315373,-0.030326,-0.114335,-1.059588,-1.76136,-1.343951,-1.00255,-0.22503,-0.446653
2,6.612562,0.5,Very Good,E,SI1,60.7,58.0,1351,5.09,5.13,...,-1.193327,-0.657307,-0.591726,-0.446856,-0.765286,-0.816544,-1.397794,-0.47713,0.810509,1.725131
3,-5.073562,0.7,Premium,D,SI1,61.2,58.0,2512,5.74,5.7,...,-1.740788,-1.77886,-0.82507,0.444932,1.173109,0.453606,-0.26344,0.24621,-0.850503,-0.41295
4,-14.436557,0.83,Ideal,G,SI2,62.4,54.0,2751,6.01,6.08,...,-0.859322,1.409268,0.861992,1.109063,-1.436722,-1.461618,0.081787,0.258087,0.851146,2.204813


Inspecting columns

In [4]:
# Find columns
all_columns = data.columns.tolist()
print(all_columns)

numeric_columns = data.select_dtypes(include=["number"]).columns.tolist()
numeric_columns.remove("outcome") # Remove the target column
print(numeric_columns)

non_numeric_columns = data.select_dtypes(exclude=["number"]).columns.tolist()
print(non_numeric_columns)

['outcome', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['cut', 'color', 'clarity']


In [5]:
for non_numeric_column in non_numeric_columns:
    print(data[non_numeric_column].value_counts())

cut
Ideal        4040
Premium      2439
Very Good    2296
Good          925
Fair          300
Name: count, dtype: int64
color
G    2120
E    1873
F    1746
H    1506
D    1246
I     983
J     526
Name: count, dtype: int64
clarity
SI1     2408
VS2     2256
SI2     1743
VS1     1503
VVS2     951
VVS1     675
IF       318
I1       146
Name: count, dtype: int64


Converting non-numeric features to numerical features

In [6]:
data = convert_non_numeric_to_numeric(data=data)
print(data)

['G', 'E', 'F', 'H', 'D', 'I', 'J']
        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...      

Normalise data using each columns respective mean and std.

In [7]:
print(data)

        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...        b8        b9       b10  colour_G  

In [8]:
normaliser = Normaliser()
for column in numeric_columns:
    print(data[column])
    data[column] = normaliser.standardise(data[column])
    print("after", data[column])

0       1.14
1       0.38
2       0.50
3       0.70
4       0.83
        ... 
9995    0.33
9996    1.01
9997    0.52
9998    0.31
9999    0.37
Name: carat, Length: 10000, dtype: float64
after 0       0.723643
1      -0.886369
2      -0.632156
3      -0.208469
4       0.066928
          ...   
9995   -0.992290
9996    0.448246
9997   -0.589788
9998   -1.034659
9999   -0.907553
Name: carat, Length: 10000, dtype: float64
0       62.3
1       60.5
2       60.7
3       61.2
4       62.4
        ... 
9995    62.6
9996    69.5
9997    57.9
9998    62.0
9999    59.9
Name: depth, Length: 10000, dtype: float64
after 0       0.386072
1      -0.872995
2      -0.733098
3      -0.383358
4       0.456020
          ...   
9995    0.595916
9996    5.422336
9997   -2.691646
9998    0.176227
9999   -1.292683
Name: depth, Length: 10000, dtype: float64
0       56.0
1       59.0
2       58.0
3       58.0
4       54.0
        ... 
9995    57.0
9996    55.0
9997    61.0
9998    54.0
9999    59.0
Name: table, 

In [9]:
print(data)

        outcome     carat  cut  clarity     depth     table     price  \
0    -26.701232  0.723643    0        3  0.386072 -0.653020  1.024563   
1      6.548093 -0.886369    1        4 -0.872995  0.682072 -0.764609   
2      6.612562 -0.632156    2        5 -0.733098  0.237041 -0.649645   
3     -5.073562 -0.208469    1        5 -0.383358  0.237041 -0.355003   
4    -14.436557  0.066928    0        6  0.456020 -1.543082 -0.294349   
...         ...       ...  ...      ...       ...       ...       ...   
9995  10.718277 -0.992290    0        3  0.595916 -0.207990 -0.738215   
9996 -12.246698  0.448246    4        5  5.422336 -1.098051  0.239104   
9997  11.122516 -0.589788    2        6 -2.691646  1.572133 -0.669440   
9998 -24.730782 -1.034659    0        0  0.176227 -1.543082 -0.789226   
9999   8.735755 -0.907553    2        5 -1.292683  0.682072 -0.827801   

             x         y         z  ...        b8        b9       b10  \
0     0.893417  0.780367  0.923092  ...  0.593856 

Data splitting:
- Split the entire dataset into training and testing sets first.
- Use the training set to generate folds (one validation and the rest training folds) (K-Fold Cross Validation)

In [10]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=REPRODUCIBILITY_SEED)
print(f"Training set size: {len(train_data)} | Test set size: {len(test_data)}")    
print()

kfold_data = get_kfold_data(data=train_data, k=NUM_FOLDS, reproducibility_seed=REPRODUCIBILITY_SEED)

Training set size: 8000 | Test set size: 2000

Fold: 0/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 1/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 2/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 3/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 4/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%



Define models and hyperparameter tuning objectives for each model

In [11]:
models = {
        "linear_regression": LinearRegression,
        "lasso": Lasso,
        "ridge": Ridge,
        "xgb": xgb.XGBRegressor,
        "random_forest": RandomForestRegressor,
        "gradient_boosting": GradientBoostingRegressor,
        "ada_boost": AdaBoostRegressor,
        "lgbm": lgb.LGBMRegressor
        }

def objective(model_type, trial, x_train, y_train, x_val, y_val):
    if model_type == LinearRegression:
        parameters = {
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        }
    elif model_type == Lasso:
        parameters = {
            "alpha": trial.suggest_float("alpha", 1e-3, 0.1, log=True),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            "selection": trial.suggest_categorical("selection", ["cyclic", "random"]),
            "warm_start": trial.suggest_categorical("warm_start", [True, False]),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == Ridge:
        parameters = {
            "alpha": trial.suggest_float("alpha", 1e-3, 0.1, log=True),
            "solver": trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            "positive": False,
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == xgb.XGBRegressor:
        parameters = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "n_estimators": 100,
            "eta": trial.suggest_float("eta", 1e-2, 0.2, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 10, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 6),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "seed": REPRODUCIBILITY_SEED
        }
    elif model_type == RandomForestRegressor:
        parameters = {
            "n_estimators": 100,
            "criterion": trial.suggest_categorical("criterion", ["absolute_error", "squared_error"]),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "bootstrap": True,
            "oob_score": False,
            "n_jobs": -1,
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == GradientBoostingRegressor:
        parameters = {
            "n_estimators": 100,
            "loss": trial.suggest_categorical("loss", ["absolute_error", "squared_error", "huber", "quantile"]),
            "criterion": trial.suggest_categorical("criterion", ["friedman_mse", "squared_error"]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "subsample": trial.suggest_float("subsample", 0.05, 1.0),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 2**10),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == AdaBoostRegressor:
        parameters = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 100),
            "loss": trial.suggest_categorical("loss", ["linear", "square", "exponential"]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == lgb.LGBMRegressor:
        parameters = {
                    "objective": "regression",
                    "metric": "rmse",
                    "n_estimators": 100,
                    "verbosity": -1,
                    "bagging_freq": 1,
                    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
                    "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
                    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
                    "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
                    "seed": REPRODUCIBILITY_SEED
        }

    
    model = model_type(**parameters) # Create the model
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)
    metrics = calculate_metrics(targets=y_val, preds=predictions)
    rmse = metrics["rmse"]
    return rmse

In [12]:
# Train + Validate models
metrics = ["mae", "mse", "rmse", "pcc", "spearman_r", "r2_score"]
model_scores = {model_name: defaultdict(list) for model_name in models.keys()}

if os.path.exists("model_best_hyperparameters"):
    raise Exception("Directory for best hyperparameters already exists. Please delete it before running this script.")
os.makedirs("model_best_hyperparameters")

for fold in range(NUM_FOLDS):
    fold_data = kfold_data[fold]
     
    # Extract data
    train_data = fold_data["train"]
    val_data = fold_data["val"]

    train_y = train_data["outcome"]
    val_y = val_data["outcome"]
    
    train_x = train_data.drop(columns=["outcome"])
    val_x = val_data.drop(columns=["outcome"])

    # print(f"Fold {fold+1}/{NUM_FOLDS}")
    # print(f"Train data shape: {train_x.shape} | Train target shape: {train_y.shape}")
    # print(f"Val data shape: {val_x.shape} | Val target shape: {val_y.shape}")
    # print(f"Test data shape: {test_x.shape} | Test target shape: {test_y.shape}")

    # Train model
    for model_name, model in models.items():
        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: objective(trial=trial, 
                                               model_type=model, 
                                               x_train=train_x, 
                                               y_train=train_y, 
                                               x_val=val_x, 
                                               y_val=val_y
                                               ), n_trials=N_TRIALS)
        
        # Train model with best hyperparameters
        best_fold_params = study.best_params
        model = model(**best_fold_params)
        model.fit(train_x, train_y)
        preds = model.predict(val_x)
        
        # Save the best hyperparameters for this model at this fold.
        os.makedirs(f"model_best_hyperparameters/{model_name}", exist_ok=True)
        with open(f"model_best_hyperparameters/{model_name}/fold_{fold+1}.json", "w") as f:
            json.dump(best_fold_params, f)

        # Calculate metrics
        metrics = calculate_metrics(targets=val_y, preds=preds)
        mae = metrics["mae"]
        mse = metrics["mse"]
        rmse = metrics["rmse"]
        pcc = metrics["pcc"]
        spearman_r = metrics["spearman_r"]
        r2_score = metrics["r2_score"]

        for metric in metrics:
            model_scores[model_name][metric].append(metrics[metric])

        print(f"Fold: {fold+1}/{NUM_FOLDS}")
        print(f"Model name: {model_name}")
        print(f"MAE: {mae}")
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"PCC: {pcc}")
        print(f"Spearman R: {spearman_r}")
        print(f"R2 Score: {r2_score}")
        print()


[I 2025-02-17 17:19:09,875] A new study created in memory with name: no-name-17215b5f-bd45-49df-b257-7aab76cf69d2
[I 2025-02-17 17:19:09,886] Trial 0 finished with value: 10.789711542941271 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 17:19:09,896] Trial 1 finished with value: 10.789711542941271 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 17:19:09,904] Trial 2 finished with value: 10.789711542941271 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 17:19:09,914] Trial 3 finished with value: 10.789711542941271 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 17:19:09,923] Trial 4 finished with value: 10.789711542941271 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 17:19:09,931] Trial 5 finished with value: 1

Fold: 1/5
Model name: linear_regression
MAE: 8.70751717596474
MSE: 116.4178751798801
RMSE: 10.789711542941271
PCC: 0.5299360904553945
Spearman R: 0.5514073667606901
R2 Score: 0.2804847588737671



[I 2025-02-17 17:19:09,992] Trial 1 finished with value: 10.778365780198119 and parameters: {'alpha': 0.006636303017225869, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': True}. Best is trial 1 with value: 10.778365780198119.
[I 2025-02-17 17:19:09,999] Trial 2 finished with value: 10.772731929825774 and parameters: {'alpha': 0.02821377120279643, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 2 with value: 10.772731929825774.
[I 2025-02-17 17:19:10,029] Trial 3 finished with value: 10.77775286056672 and parameters: {'alpha': 0.008482905903487513, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 2 with value: 10.772731929825774.
[I 2025-02-17 17:19:10,044] Trial 4 finished with value: 10.777015397176955 and parameters: {'alpha': 0.011898940422561783, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 2 with value: 10.772731929825774.
[I 2025-02-17 17:19:10,111] Trial 5 fini

Fold: 1/5
Model name: lasso
MAE: 8.68876816055746
MSE: 116.04443165758475
RMSE: 10.772392104708441
PCC: 0.5319570619729997
Spearman R: 0.5540044507829887
R2 Score: 0.28279280912443683



[I 2025-02-17 17:19:10,801] Trial 0 finished with value: 10.788828698797593 and parameters: {'alpha': 0.007346256824117937, 'solver': 'saga', 'fit_intercept': True}. Best is trial 0 with value: 10.788828698797593.
[I 2025-02-17 17:19:10,808] Trial 1 finished with value: 10.789355034934347 and parameters: {'alpha': 0.042602037275108216, 'solver': 'lsqr', 'fit_intercept': False}. Best is trial 0 with value: 10.788828698797593.
[I 2025-02-17 17:19:10,812] Trial 2 finished with value: 10.789709717528646 and parameters: {'alpha': 0.001235080032974388, 'solver': 'auto', 'fit_intercept': False}. Best is trial 0 with value: 10.788828698797593.
[I 2025-02-17 17:19:11,157] Trial 3 finished with value: 10.789270796715257 and parameters: {'alpha': 0.001228709196102466, 'solver': 'sag', 'fit_intercept': True}. Best is trial 0 with value: 10.788828698797593.
[I 2025-02-17 17:19:11,163] Trial 4 finished with value: 10.789652032560891 and parameters: {'alpha': 0.006448028178453317, 'solver': 'sparse_c

Fold: 1/5
Model name: ridge
MAE: 8.707045164308557
MSE: 116.39874865865353
RMSE: 10.788825175089896
PCC: 0.5300433162135427
Spearman R: 0.5514642730329192
R2 Score: 0.28060296944504726



[I 2025-02-17 17:19:13,232] Trial 1 finished with value: 10.093963093981353 and parameters: {'eta': 0.01130349406603219, 'gamma': 0.0006108754993167166, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.712562349710006, 'colsample_bytree': 0.7614610776148094}. Best is trial 0 with value: 9.31737168656868.
[I 2025-02-17 17:19:13,296] Trial 2 finished with value: 9.162455276408393 and parameters: {'eta': 0.0883979098651366, 'gamma': 6.0276834331480576e-05, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.8046937189870189, 'colsample_bytree': 0.665340714971185}. Best is trial 2 with value: 9.162455276408393.
[I 2025-02-17 17:19:13,357] Trial 3 finished with value: 9.201385126453374 and parameters: {'eta': 0.14384217530336404, 'gamma': 5.52674858059526e-07, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.8284175444555005, 'colsample_bytree': 0.8399124916836613}. Best is trial 2 with value: 9.162455276408393.
[I 2025-02-17 17:19:13,462] Trial 4 finished with value: 9.57839116

Fold: 1/5
Model name: xgb
MAE: 7.3645282159137375
MSE: 83.47671563751388
RMSE: 9.136559288786664
PCC: 0.6966880294974864
Spearman R: 0.698612042426579
R2 Score: 0.48407605715576474



[I 2025-02-17 17:19:14,626] Trial 1 finished with value: 10.998699919887702 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 0 with value: 10.081786429710084.
[I 2025-02-17 17:19:14,740] Trial 2 finished with value: 9.854575218959527 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 10}. Best is trial 2 with value: 9.854575218959527.
[I 2025-02-17 17:19:16,435] Trial 3 finished with value: 9.881563751310088 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 2 with value: 9.854575218959527.
[I 2025-02-17 17:19:16,579] Trial 4 finished with value: 9.624692060404115 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 8, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 4

Fold: 1/5
Model name: random_forest
MAE: 7.769470825483581
MSE: 92.47047871697433
RMSE: 9.616157169939264
PCC: 0.6794696823423926
Spearman R: 0.6829940568336159
R2 Score: 0.428490524429356



[I 2025-02-17 17:19:18,999] Trial 0 finished with value: 9.453506282547712 and parameters: {'loss': 'absolute_error', 'criterion': 'friedman_mse', 'learning_rate': 0.03268042915942602, 'subsample': 0.5198008969100638, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 7, 'max_features': 'log2', 'max_leaf_nodes': 468}. Best is trial 0 with value: 9.453506282547712.
[I 2025-02-17 17:19:19,682] Trial 1 finished with value: 9.695408459780674 and parameters: {'loss': 'squared_error', 'criterion': 'squared_error', 'learning_rate': 0.09701501269738802, 'subsample': 0.3679116779578882, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'max_leaf_nodes': 474}. Best is trial 0 with value: 9.453506282547712.
[I 2025-02-17 17:19:20,742] Trial 2 finished with value: 9.351786206860783 and parameters: {'loss': 'squared_error', 'criterion': 'squared_error', 'learning_rate': 0.029428732260321794, 'subsample': 0.7634012308970842, 'max_depth': 7, 'min_sample

Fold: 1/5
Model name: gradient_boosting
MAE: 7.477765681197163
MSE: 86.5307859846834
RMSE: 9.30219253642298
PCC: 0.685316695749056
Spearman R: 0.6863436372436084
R2 Score: 0.46520051799251483



[I 2025-02-17 17:19:28,086] Trial 0 finished with value: 9.750683481924685 and parameters: {'n_estimators': 62, 'loss': 'square', 'learning_rate': 0.018194129976653168}. Best is trial 0 with value: 9.750683481924685.
[I 2025-02-17 17:19:29,893] Trial 1 finished with value: 9.787266678541823 and parameters: {'n_estimators': 64, 'loss': 'square', 'learning_rate': 0.012948618031382426}. Best is trial 0 with value: 9.750683481924685.
[I 2025-02-17 17:19:32,627] Trial 2 finished with value: 9.790101957183866 and parameters: {'n_estimators': 97, 'loss': 'square', 'learning_rate': 0.0024899824696830346}. Best is trial 0 with value: 9.750683481924685.
[I 2025-02-17 17:19:34,758] Trial 3 finished with value: 9.666869333624064 and parameters: {'n_estimators': 77, 'loss': 'square', 'learning_rate': 0.028771896848411697}. Best is trial 3 with value: 9.666869333624064.
[I 2025-02-17 17:19:37,011] Trial 4 finished with value: 9.807542854042326 and parameters: {'n_estimators': 80, 'loss': 'exponentia

Fold: 1/5
Model name: ada_boost
MAE: 7.687244054321398
MSE: 91.44327868541247
RMSE: 9.562597904618412
PCC: 0.6601379309273128
Spearman R: 0.6690214275636233
R2 Score: 0.4348390862567568



[I 2025-02-17 17:19:49,430] Trial 1 finished with value: 10.88202694526243 and parameters: {'learning_rate': 0.004584247642710583, 'num_leaves': 71, 'subsample': 0.4563703293060669, 'colsample_bytree': 0.9612393363644555, 'min_data_in_leaf': 21}. Best is trial 1 with value: 10.88202694526243.
[I 2025-02-17 17:19:49,511] Trial 2 finished with value: 12.544412189297235 and parameters: {'learning_rate': 0.0046582178303403304, 'num_leaves': 214, 'subsample': 0.46514146396139044, 'colsample_bytree': 0.05537200481166993, 'min_data_in_leaf': 35}. Best is trial 1 with value: 10.88202694526243.
[I 2025-02-17 17:19:49,683] Trial 3 finished with value: 12.474416520623631 and parameters: {'learning_rate': 0.0014348140528443745, 'num_leaves': 824, 'subsample': 0.8128919675963701, 'colsample_bytree': 0.2387279164161013, 'min_data_in_leaf': 37}. Best is trial 1 with value: 10.88202694526243.
[I 2025-02-17 17:19:49,750] Trial 4 finished with value: 12.14365067337382 and parameters: {'learning_rate': 0

Fold: 1/5
Model name: lgbm
MAE: 7.44773372616304
MSE: 85.32379602516649
RMSE: 9.237088070662013
PCC: 0.6881037686230529
Spearman R: 0.6919567624831103
R2 Score: 0.47266026307390296

Fold: 2/5
Model name: linear_regression
MAE: 8.589235102672504
MSE: 116.93096828103435
RMSE: 10.81346236323197
PCC: 0.5423829744003588
Spearman R: 0.5661288207534456
R2 Score: 0.2926921600863055



[I 2025-02-17 17:19:50,615] Trial 5 finished with value: 10.815559452412716 and parameters: {'alpha': 0.01977922832956571, 'fit_intercept': True, 'selection': 'random', 'warm_start': True}. Best is trial 1 with value: 10.812607386769882.
[I 2025-02-17 17:19:50,655] Trial 6 finished with value: 10.813045140847981 and parameters: {'alpha': 0.002797122365832284, 'fit_intercept': False, 'selection': 'random', 'warm_start': True}. Best is trial 1 with value: 10.812607386769882.
[I 2025-02-17 17:19:50,664] Trial 7 finished with value: 10.817553888757676 and parameters: {'alpha': 0.022289739696002318, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 1 with value: 10.812607386769882.
[I 2025-02-17 17:19:50,672] Trial 8 finished with value: 10.854886587666634 and parameters: {'alpha': 0.05884842425841623, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 1 with value: 10.812607386769882.
[I 2025-02-17 17:19:50,679] Trial 9 finis

Fold: 2/5
Model name: lasso
MAE: 8.591581507910353
MSE: 116.9124785004306
RMSE: 10.812607386769882
PCC: 0.5427752558704653
Spearman R: 0.5665187066479322
R2 Score: 0.292804003569444



[I 2025-02-17 17:19:51,150] Trial 8 finished with value: 10.813557261117575 and parameters: {'alpha': 0.06733990772321469, 'solver': 'saga', 'fit_intercept': True}. Best is trial 5 with value: 10.813453994886203.
[I 2025-02-17 17:19:51,159] Trial 9 finished with value: 10.81346650907505 and parameters: {'alpha': 0.05110214352756312, 'solver': 'svd', 'fit_intercept': True}. Best is trial 5 with value: 10.813453994886203.
[I 2025-02-17 17:19:51,164] A new study created in memory with name: no-name-c5962d4b-3482-4e4a-8607-9b62dee4bd1b
[I 2025-02-17 17:19:51,257] Trial 0 finished with value: 9.37938721284016 and parameters: {'eta': 0.041217375967481415, 'gamma': 0.000609645023646012, 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.8686783966114526, 'colsample_bytree': 0.6115637796229663}. Best is trial 0 with value: 9.37938721284016.


Fold: 2/5
Model name: ridge
MAE: 8.589251675294278
MSE: 116.93078729952038
RMSE: 10.813453994886203
PCC: 0.5423857769213549
Spearman R: 0.5661048441815798
R2 Score: 0.29269325483174025



[I 2025-02-17 17:19:51,672] Trial 1 finished with value: 9.846151011449445 and parameters: {'eta': 0.1417116180995765, 'gamma': 4.62769995090768e-08, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.9169114289571587, 'colsample_bytree': 0.8559022071964502}. Best is trial 0 with value: 9.37938721284016.
[I 2025-02-17 17:19:51,784] Trial 2 finished with value: 9.59036790527581 and parameters: {'eta': 0.1121259221417297, 'gamma': 0.28663179505616143, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.6413711081272727, 'colsample_bytree': 0.7808425896439615}. Best is trial 0 with value: 9.37938721284016.
[I 2025-02-17 17:19:51,867] Trial 3 finished with value: 9.572634993397305 and parameters: {'eta': 0.027318139045096407, 'gamma': 4.648641682508907e-05, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.7009094053408118, 'colsample_bytree': 0.5637320177667717}. Best is trial 0 with value: 9.37938721284016.
[I 2025-02-17 17:19:52,243] Trial 4 finished with value: 9.835836511872

Fold: 2/5
Model name: xgb
MAE: 7.455939393461204
MSE: 88.82234882607588
RMSE: 9.424560935453485
PCC: 0.6850326241394231
Spearman R: 0.6921474998623047
R2 Score: 0.4627193753049397



[I 2025-02-17 17:19:55,185] Trial 0 finished with value: 9.769432963042872 and parameters: {'criterion': 'absolute_error', 'max_features': 'sqrt', 'max_depth': 8, 'min_samples_split': 6, 'min_samples_leaf': 6}. Best is trial 0 with value: 9.769432963042872.
[I 2025-02-17 17:19:55,312] Trial 1 finished with value: 9.99446704625635 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 9.769432963042872.
[I 2025-02-17 17:19:55,466] Trial 2 finished with value: 9.813533400629844 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 0 with value: 9.769432963042872.
[I 2025-02-17 17:19:56,914] Trial 3 finished with value: 10.794546651128085 and parameters: {'criterion': 'absolute_error', 'max_features': 'sqrt', 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 0 

Fold: 2/5
Model name: random_forest
MAE: 7.622836638410183
MSE: 93.33718989709502
RMSE: 9.661117424868358
PCC: 0.6782451988088887
Spearman R: 0.6845191443434159
R2 Score: 0.43540939461769246



[I 2025-02-17 17:20:03,030] Trial 0 finished with value: 10.117680553677701 and parameters: {'loss': 'huber', 'criterion': 'squared_error', 'learning_rate': 0.019352536244499046, 'subsample': 0.7971821707842845, 'max_depth': 4, 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': 'log2', 'max_leaf_nodes': 219}. Best is trial 0 with value: 10.117680553677701.
[I 2025-02-17 17:20:03,658] Trial 1 finished with value: 12.45676502958365 and parameters: {'loss': 'absolute_error', 'criterion': 'squared_error', 'learning_rate': 0.001230687280043893, 'subsample': 0.21056908730900736, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_leaf_nodes': 350}. Best is trial 0 with value: 10.117680553677701.
[I 2025-02-17 17:20:04,243] Trial 2 finished with value: 12.441488776133653 and parameters: {'loss': 'squared_error', 'criterion': 'squared_error', 'learning_rate': 0.0010974509008079313, 'subsample': 0.37595044270625577, 'max_depth': 9, 'min_sample

Fold: 2/5
Model name: gradient_boosting
MAE: 7.465757626249674
MSE: 88.83591135149007
RMSE: 9.425280438877671
PCC: 0.6830701310432634
Spearman R: 0.6889802544844744
R2 Score: 0.4626373364687314



[I 2025-02-17 17:20:10,023] Trial 0 finished with value: 9.777175207345747 and parameters: {'n_estimators': 73, 'loss': 'square', 'learning_rate': 0.012440732530383575}. Best is trial 0 with value: 9.777175207345747.
[I 2025-02-17 17:20:11,822] Trial 1 finished with value: 9.838063109646198 and parameters: {'n_estimators': 64, 'loss': 'linear', 'learning_rate': 0.025790318911135767}. Best is trial 0 with value: 9.777175207345747.
[I 2025-02-17 17:20:13,234] Trial 2 finished with value: 9.79851044228357 and parameters: {'n_estimators': 50, 'loss': 'exponential', 'learning_rate': 0.0049043977672228985}. Best is trial 0 with value: 9.777175207345747.
[I 2025-02-17 17:20:15,014] Trial 3 finished with value: 9.81458894393103 and parameters: {'n_estimators': 63, 'loss': 'square', 'learning_rate': 0.007758318272205958}. Best is trial 0 with value: 9.777175207345747.
[I 2025-02-17 17:20:17,245] Trial 4 finished with value: 9.721957148869398 and parameters: {'n_estimators': 80, 'loss': 'exponen

Fold: 2/5
Model name: ada_boost
MAE: 7.707438002835854
MSE: 94.61570732039698
RMSE: 9.727060569380503
PCC: 0.6550425878623409
Spearman R: 0.6642183501609874
R2 Score: 0.4276757256824082



[I 2025-02-17 17:20:30,143] Trial 3 finished with value: 12.228001175364236 and parameters: {'learning_rate': 0.0014518963645589814, 'num_leaves': 129, 'subsample': 0.10998748168598085, 'colsample_bytree': 0.8830249758969451, 'min_data_in_leaf': 42}. Best is trial 2 with value: 9.401114605076588.
[I 2025-02-17 17:20:30,209] Trial 4 finished with value: 10.484347344022403 and parameters: {'learning_rate': 0.015363137680262439, 'num_leaves': 250, 'subsample': 0.7042218763783908, 'colsample_bytree': 0.3801956838645099, 'min_data_in_leaf': 97}. Best is trial 2 with value: 9.401114605076588.
[I 2025-02-17 17:20:30,254] Trial 5 finished with value: 11.33579416562191 and parameters: {'learning_rate': 0.020102479960952584, 'num_leaves': 350, 'subsample': 0.18701632888783049, 'colsample_bytree': 0.22983744882749063, 'min_data_in_leaf': 53}. Best is trial 2 with value: 9.401114605076588.
[I 2025-02-17 17:20:30,525] Trial 6 finished with value: 9.609139399707903 and parameters: {'learning_rate': 

Fold: 2/5
Model name: lgbm
MAE: 7.413961094227168
MSE: 88.15311804480801
RMSE: 9.388989191857023
PCC: 0.6834596417302095
Spearman R: 0.690349456191194
R2 Score: 0.46676750887691754

Fold: 3/5
Model name: linear_regression
MAE: 8.755107813660903
MSE: 120.93045179515703
RMSE: 10.996838263571808
PCC: 0.5082810320078581
Spearman R: 0.5304086007455472
R2 Score: 0.2563254092788819



[I 2025-02-17 17:20:32,392] Trial 5 finished with value: 10.994778334737934 and parameters: {'alpha': 0.002813038373584879, 'fit_intercept': True, 'selection': 'random', 'warm_start': False}. Best is trial 2 with value: 10.979450161996922.
[I 2025-02-17 17:20:32,408] Trial 6 finished with value: 10.992296780687006 and parameters: {'alpha': 0.005913358186401484, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': True}. Best is trial 2 with value: 10.979450161996922.
[I 2025-02-17 17:20:32,444] Trial 7 finished with value: 10.994961057332661 and parameters: {'alpha': 0.002807153964125775, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 2 with value: 10.979450161996922.
[I 2025-02-17 17:20:32,473] Trial 8 finished with value: 10.990562451138794 and parameters: {'alpha': 0.010246774155600508, 'fit_intercept': False, 'selection': 'random', 'warm_start': True}. Best is trial 2 with value: 10.979450161996922.
[I 2025-02-17 17:20:32,481] Trial 9 fini

Fold: 3/5
Model name: lasso
MAE: 8.738412309684096
MSE: 120.54832585977425
RMSE: 10.979450161996922
PCC: 0.5097898768888802
Spearman R: 0.5344389704839729
R2 Score: 0.2586753330936137



[I 2025-02-17 17:20:32,768] Trial 0 finished with value: 10.996915955036204 and parameters: {'alpha': 0.007861377519052422, 'solver': 'saga', 'fit_intercept': False}. Best is trial 0 with value: 10.996915955036204.
[I 2025-02-17 17:20:32,774] Trial 1 finished with value: 10.996828069399296 and parameters: {'alpha': 0.0028821356728493423, 'solver': 'sparse_cg', 'fit_intercept': True}. Best is trial 1 with value: 10.996828069399296.
[I 2025-02-17 17:20:32,778] Trial 2 finished with value: 10.996821610837877 and parameters: {'alpha': 0.07875283383620546, 'solver': 'auto', 'fit_intercept': False}. Best is trial 2 with value: 10.996821610837877.
[I 2025-02-17 17:20:32,784] Trial 3 finished with value: 10.997304579186874 and parameters: {'alpha': 0.0026754892697516356, 'solver': 'lsqr', 'fit_intercept': False}. Best is trial 2 with value: 10.996821610837877.
[I 2025-02-17 17:20:32,790] Trial 4 finished with value: 10.99682389040476 and parameters: {'alpha': 0.019469872977177102, 'solver': 's

Fold: 3/5
Model name: ridge
MAE: 8.755104122141239
MSE: 120.93008554059097
RMSE: 10.996821610837877
PCC: 0.5082814977022012
Spearman R: 0.5304001456641194
R2 Score: 0.25632766160003406



[I 2025-02-17 17:20:33,456] Trial 1 finished with value: 9.464786151490006 and parameters: {'eta': 0.029453751636772378, 'gamma': 5.581948904029847e-08, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.9135951414191358, 'colsample_bytree': 0.8719459647809569}. Best is trial 0 with value: 9.458397679448327.
[I 2025-02-17 17:20:33,561] Trial 2 finished with value: 9.438117209703162 and parameters: {'eta': 0.08261960764392552, 'gamma': 8.23329663996211e-07, 'max_depth': 6, 'min_child_weight': 3, 'subsample': 0.9152100334594458, 'colsample_bytree': 0.5079597638941542}. Best is trial 2 with value: 9.438117209703162.
[I 2025-02-17 17:20:33,734] Trial 3 finished with value: 9.903586309054688 and parameters: {'eta': 0.1308922595381426, 'gamma': 1.9374287065190332e-05, 'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.647680740403892, 'colsample_bytree': 0.823570771055143}. Best is trial 2 with value: 9.438117209703162.
[I 2025-02-17 17:20:33,842] Trial 4 finished with value: 9.6302834

Fold: 3/5
Model name: xgb
MAE: 7.371352904127673
MSE: 87.18963092443013
RMSE: 9.337538804440394
PCC: 0.6817668304033638
Spearman R: 0.6862242934469898
R2 Score: 0.46381815224932754



[I 2025-02-17 17:20:34,583] Trial 1 finished with value: 9.865141069629397 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 0 with value: 9.815386316141513.
[I 2025-02-17 17:20:34,721] Trial 2 finished with value: 9.681538045926342 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 2 with value: 9.681538045926342.
[I 2025-02-17 17:20:34,836] Trial 3 finished with value: 10.24884185608378 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 7}. Best is trial 2 with value: 9.681538045926342.
[I 2025-02-17 17:20:34,956] Trial 4 finished with value: 10.24914493378121 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 2 wi

Fold: 3/5
Model name: random_forest
MAE: 7.650928641143548
MSE: 93.70441298167363
RMSE: 9.6801039757677
PCC: 0.6639649535347948
Spearman R: 0.6725580195539139
R2 Score: 0.42375481164207884



[I 2025-02-17 17:20:42,317] Trial 0 finished with value: 11.04267047028154 and parameters: {'loss': 'huber', 'criterion': 'squared_error', 'learning_rate': 0.008842447280737986, 'subsample': 0.6073109941739371, 'max_depth': 4, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_leaf_nodes': 406}. Best is trial 0 with value: 11.04267047028154.
[I 2025-02-17 17:20:43,257] Trial 1 finished with value: 9.955688716121951 and parameters: {'loss': 'absolute_error', 'criterion': 'friedman_mse', 'learning_rate': 0.07827852541375771, 'subsample': 0.19334624930418737, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_leaf_nodes': 910}. Best is trial 1 with value: 9.955688716121951.
[I 2025-02-17 17:20:43,820] Trial 2 finished with value: 14.55027318971566 and parameters: {'loss': 'quantile', 'criterion': 'squared_error', 'learning_rate': 0.08712578242723797, 'subsample': 0.6438947664311017, 'max_depth': 4, 'min_samples_split': 3, 

Fold: 3/5
Model name: gradient_boosting
MAE: 7.528748435126797
MSE: 90.44681621530883
RMSE: 9.510353106762588
PCC: 0.6703896053588436
Spearman R: 0.6756208889144097
R2 Score: 0.4437877471516922



[I 2025-02-17 17:20:53,735] Trial 0 finished with value: 9.823291487520518 and parameters: {'n_estimators': 97, 'loss': 'exponential', 'learning_rate': 0.008745054662483247}. Best is trial 0 with value: 9.823291487520518.
[I 2025-02-17 17:20:55,387] Trial 1 finished with value: 9.79854899415703 and parameters: {'n_estimators': 58, 'loss': 'exponential', 'learning_rate': 0.0010560250274194746}. Best is trial 1 with value: 9.79854899415703.
[I 2025-02-17 17:20:57,310] Trial 2 finished with value: 9.82000938839456 and parameters: {'n_estimators': 67, 'loss': 'linear', 'learning_rate': 0.0014444053355033408}. Best is trial 1 with value: 9.79854899415703.
[I 2025-02-17 17:20:58,975] Trial 3 finished with value: 9.791985892965421 and parameters: {'n_estimators': 59, 'loss': 'exponential', 'learning_rate': 0.03891384091479507}. Best is trial 3 with value: 9.791985892965421.
[I 2025-02-17 17:21:01,408] Trial 4 finished with value: 9.788104033355419 and parameters: {'n_estimators': 87, 'loss': 

Fold: 3/5
Model name: ada_boost
MAE: 7.70686275729752
MSE: 95.9933206549204
RMSE: 9.797618111302379
PCC: 0.6407931348314446
Spearman R: 0.6587416827817428
R2 Score: 0.40967893206144557



[I 2025-02-17 17:21:14,308] Trial 1 finished with value: 10.151326389908307 and parameters: {'learning_rate': 0.010909269263512593, 'num_leaves': 444, 'subsample': 0.41247097585556747, 'colsample_bytree': 0.7210728751631533, 'min_data_in_leaf': 8}. Best is trial 0 with value: 9.60640430734289.
[I 2025-02-17 17:21:14,428] Trial 2 finished with value: 10.742898149813268 and parameters: {'learning_rate': 0.04472131222421437, 'num_leaves': 645, 'subsample': 0.934942269920914, 'colsample_bytree': 0.1681617895721685, 'min_data_in_leaf': 55}. Best is trial 0 with value: 9.60640430734289.
[I 2025-02-17 17:21:14,460] Trial 3 finished with value: 11.308695926796288 and parameters: {'learning_rate': 0.00405954120341952, 'num_leaves': 533, 'subsample': 0.08929972696614083, 'colsample_bytree': 0.9522599733638843, 'min_data_in_leaf': 66}. Best is trial 0 with value: 9.60640430734289.
[I 2025-02-17 17:21:14,491] Trial 4 finished with value: 9.676669270712964 and parameters: {'learning_rate': 0.026687

Fold: 3/5
Model name: lgbm
MAE: 7.39768445751914
MSE: 88.1996211480243
RMSE: 9.391465335506718
PCC: 0.6770047094059929
Spearman R: 0.6820384744290916
R2 Score: 0.4576071106546429

Fold: 4/5
Model name: linear_regression
MAE: 8.602220439887196
MSE: 115.140769914462
RMSE: 10.730366718545177
PCC: 0.5397874341390442
Spearman R: 0.5686795893279646
R2 Score: 0.29042379751749636



[I 2025-02-17 17:21:15,195] Trial 4 finished with value: 10.710351874003011 and parameters: {'alpha': 0.026559745057510777, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': True}. Best is trial 4 with value: 10.710351874003011.
[I 2025-02-17 17:21:15,252] Trial 5 finished with value: 10.729173012145997 and parameters: {'alpha': 0.001294371525183929, 'fit_intercept': True, 'selection': 'random', 'warm_start': True}. Best is trial 4 with value: 10.710351874003011.
[I 2025-02-17 17:21:15,268] Trial 6 finished with value: 10.72891355188574 and parameters: {'alpha': 0.001567912561378857, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 4 with value: 10.710351874003011.
[I 2025-02-17 17:21:15,282] Trial 7 finished with value: 10.74626540293383 and parameters: {'alpha': 0.01355185146238478, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': True}. Best is trial 4 with value: 10.710351874003011.
[I 2025-02-17 17:21:15,311] Trial 8 finished 

Fold: 4/5
Model name: lasso
MAE: 8.600046136216092
MSE: 114.71163726495983
RMSE: 10.710351874003011
PCC: 0.5426371595559425
Spearman R: 0.5716122193797732
R2 Score: 0.2930684065123921



[I 2025-02-17 17:21:15,664] Trial 6 finished with value: 10.730364993857407 and parameters: {'alpha': 0.027247710811377068, 'solver': 'sag', 'fit_intercept': True}. Best is trial 4 with value: 10.729750689236711.
[I 2025-02-17 17:21:15,820] Trial 7 finished with value: 10.730572925417812 and parameters: {'alpha': 0.03380836722668038, 'solver': 'sag', 'fit_intercept': False}. Best is trial 4 with value: 10.729750689236711.
[I 2025-02-17 17:21:15,829] Trial 8 finished with value: 10.730433126501966 and parameters: {'alpha': 0.07682942175642722, 'solver': 'svd', 'fit_intercept': False}. Best is trial 4 with value: 10.729750689236711.
[I 2025-02-17 17:21:15,835] Trial 9 finished with value: 10.729748799881417 and parameters: {'alpha': 0.010613681023106234, 'solver': 'lsqr', 'fit_intercept': False}. Best is trial 9 with value: 10.729748799881417.
[I 2025-02-17 17:21:15,842] A new study created in memory with name: no-name-fbbf81cf-d9ba-448e-aebe-d686b3016d0e


Fold: 4/5
Model name: ridge
MAE: 8.602145467623204
MSE: 115.12750930855672
RMSE: 10.729748799881417
PCC: 0.5398744230571586
Spearman R: 0.5686870395261874
R2 Score: 0.29050551844387085



[I 2025-02-17 17:21:16,039] Trial 0 finished with value: 9.297936040145686 and parameters: {'eta': 0.04579268679077602, 'gamma': 0.8080616202476726, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.5671063158537144, 'colsample_bytree': 0.7715685821725762}. Best is trial 0 with value: 9.297936040145686.
[I 2025-02-17 17:21:16,362] Trial 1 finished with value: 10.07892510921488 and parameters: {'eta': 0.15279853628518347, 'gamma': 2.0694277311136773e-08, 'max_depth': 10, 'min_child_weight': 2, 'subsample': 0.525986557309934, 'colsample_bytree': 0.6677577088639776}. Best is trial 0 with value: 9.297936040145686.
[I 2025-02-17 17:21:16,754] Trial 2 finished with value: 9.60588524358179 and parameters: {'eta': 0.012984993117475077, 'gamma': 0.01687675129226271, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.5637764512384176, 'colsample_bytree': 0.988005547127607}. Best is trial 0 with value: 9.297936040145686.
[I 2025-02-17 17:21:17,118] Trial 3 finished with value: 9.6512068030

Fold: 4/5
Model name: xgb
MAE: 7.264449472964047
MSE: 82.96778097799044
RMSE: 9.108665158956631
PCC: 0.7005243894295858
Spearman R: 0.7073104081290658
R2 Score: 0.4886957678110151



[I 2025-02-17 17:21:19,827] Trial 1 finished with value: 10.649789093361617 and parameters: {'criterion': 'absolute_error', 'max_features': 'sqrt', 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 9.680899097601518.
[I 2025-02-17 17:21:19,952] Trial 2 finished with value: 9.842836235939284 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 9.680899097601518.
[I 2025-02-17 17:21:20,086] Trial 3 finished with value: 9.897947583311199 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 2}. Best is trial 0 with value: 9.680899097601518.
[I 2025-02-17 17:21:20,210] Trial 4 finished with value: 10.073690658171344 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 3}. Best is trial 0

Fold: 4/5
Model name: random_forest
MAE: 7.597435556017292
MSE: 90.03866942454027
RMSE: 9.488870819256645
PCC: 0.6915989396714375
Spearman R: 0.6987379780226477
R2 Score: 0.44512011536568563



[I 2025-02-17 17:21:26,711] Trial 0 finished with value: 12.162467484301228 and parameters: {'loss': 'squared_error', 'criterion': 'squared_error', 'learning_rate': 0.0019157845283865433, 'subsample': 0.2568959475763274, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 638}. Best is trial 0 with value: 12.162467484301228.
[I 2025-02-17 17:21:27,454] Trial 1 finished with value: 9.256501487030048 and parameters: {'loss': 'squared_error', 'criterion': 'squared_error', 'learning_rate': 0.04534234362421003, 'subsample': 0.712344379091803, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 254}. Best is trial 1 with value: 9.256501487030048.
[I 2025-02-17 17:21:28,493] Trial 2 finished with value: 10.635004468214635 and parameters: {'loss': 'absolute_error', 'criterion': 'friedman_mse', 'learning_rate': 0.008129458458670684, 'subsample': 0.4187437780096739, 'max_depth': 8, 'min_samp

Fold: 4/5
Model name: gradient_boosting
MAE: 7.371174130701531
MSE: 85.49803138259432
RMSE: 9.246514553202969
PCC: 0.6907735106435807
Spearman R: 0.6971295232927825
R2 Score: 0.47310263364348737



[I 2025-02-17 17:21:40,207] Trial 0 finished with value: 9.751358880090349 and parameters: {'n_estimators': 53, 'loss': 'linear', 'learning_rate': 0.004126591178923212}. Best is trial 0 with value: 9.751358880090349.
[I 2025-02-17 17:21:42,821] Trial 1 finished with value: 9.472143193802236 and parameters: {'n_estimators': 94, 'loss': 'exponential', 'learning_rate': 0.06625438478159645}. Best is trial 1 with value: 9.472143193802236.
[I 2025-02-17 17:21:44,326] Trial 2 finished with value: 9.753314116648339 and parameters: {'n_estimators': 52, 'loss': 'linear', 'learning_rate': 0.004755904153325323}. Best is trial 1 with value: 9.472143193802236.
[I 2025-02-17 17:21:46,335] Trial 3 finished with value: 9.676878357155228 and parameters: {'n_estimators': 70, 'loss': 'square', 'learning_rate': 0.008018463244247242}. Best is trial 1 with value: 9.472143193802236.
[I 2025-02-17 17:21:48,601] Trial 4 finished with value: 9.73813226667836 and parameters: {'n_estimators': 79, 'loss': 'square',

Fold: 4/5
Model name: ada_boost
MAE: 7.551405232075513
MSE: 89.69795998969396
RMSE: 9.47090069579942
PCC: 0.6698839996626639
Spearman R: 0.6766082796732974
R2 Score: 0.44721980001351125



[I 2025-02-17 17:22:00,446] Trial 1 finished with value: 12.052962492723028 and parameters: {'learning_rate': 0.002581809018525862, 'num_leaves': 1009, 'subsample': 0.47923623012228056, 'colsample_bytree': 0.4320444267773224, 'min_data_in_leaf': 78}. Best is trial 0 with value: 9.225078336735539.
[I 2025-02-17 17:22:00,536] Trial 2 finished with value: 12.57308979369784 and parameters: {'learning_rate': 0.002158903998922566, 'num_leaves': 761, 'subsample': 0.5043632536854612, 'colsample_bytree': 0.16987212779861294, 'min_data_in_leaf': 51}. Best is trial 0 with value: 9.225078336735539.
[I 2025-02-17 17:22:00,579] Trial 3 finished with value: 9.690465456264397 and parameters: {'learning_rate': 0.013926107478291734, 'num_leaves': 51, 'subsample': 0.21525640605901225, 'colsample_bytree': 0.8305550925367815, 'min_data_in_leaf': 66}. Best is trial 0 with value: 9.225078336735539.
[I 2025-02-17 17:22:00,709] Trial 4 finished with value: 12.214750899808163 and parameters: {'learning_rate': 0

Fold: 4/5
Model name: lgbm
MAE: 7.398346463956858
MSE: 85.93473975408855
RMSE: 9.270099231081
PCC: 0.6884937106843567
Spearman R: 0.6967378766554206
R2 Score: 0.4704113378664364

Fold: 5/5
Model name: linear_regression
MAE: 8.280015810684853
MSE: 119.27670375940366
RMSE: 10.921387446629831
PCC: 0.497221315847363
Spearman R: 0.5898355302091915
R2 Score: 0.23207756770388854



[I 2025-02-17 17:22:01,807] Trial 1 finished with value: 10.326570040421505 and parameters: {'alpha': 0.049466780068221396, 'fit_intercept': True, 'selection': 'random', 'warm_start': True}. Best is trial 1 with value: 10.326570040421505.
[I 2025-02-17 17:22:01,814] Trial 2 finished with value: 10.32850458337165 and parameters: {'alpha': 0.0565175064735115, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 1 with value: 10.326570040421505.
[I 2025-02-17 17:22:01,823] Trial 3 finished with value: 10.31900942765934 and parameters: {'alpha': 0.013630895852381879, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 3 with value: 10.31900942765934.
[I 2025-02-17 17:22:01,884] Trial 4 finished with value: 10.315255708978327 and parameters: {'alpha': 0.0041860200290106545, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 4 with value: 10.315255708978327.
  model = cd_fast.enet_coordinate_descent(
[

Fold: 5/5
Model name: lasso
MAE: 8.200711149142023
MSE: 106.40470300802495
RMSE: 10.315265532599001
PCC: 0.56163582467636
Spearman R: 0.5906359182171557
R2 Score: 0.3149495604230603



[I 2025-02-17 17:22:02,520] Trial 7 finished with value: 10.914923244071744 and parameters: {'alpha': 0.02397509052530527, 'solver': 'sag', 'fit_intercept': True}. Best is trial 2 with value: 10.909637414825639.
[I 2025-02-17 17:22:02,524] Trial 8 finished with value: 10.907701186882935 and parameters: {'alpha': 0.09122968801979377, 'solver': 'auto', 'fit_intercept': True}. Best is trial 8 with value: 10.907701186882935.
[I 2025-02-17 17:22:02,530] Trial 9 finished with value: 10.45981737267271 and parameters: {'alpha': 0.018115768993250094, 'solver': 'lsqr', 'fit_intercept': False}. Best is trial 9 with value: 10.45981737267271.
[I 2025-02-17 17:22:02,538] A new study created in memory with name: no-name-21d141d9-6473-41e4-81ad-6b4296de8e32


Fold: 5/5
Model name: ridge
MAE: 8.235487843037355
MSE: 109.40777946966584
RMSE: 10.45981737267271
PCC: 0.5447067547361621
Spearman R: 0.5897089207847347
R2 Score: 0.2956152754528265



[I 2025-02-17 17:22:02,905] Trial 0 finished with value: 9.372654741053019 and parameters: {'eta': 0.02183491941862955, 'gamma': 0.0032165146294050938, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.6788508622879148, 'colsample_bytree': 0.7184996312922625}. Best is trial 0 with value: 9.372654741053019.
[I 2025-02-17 17:22:03,219] Trial 1 finished with value: 9.419376579292758 and parameters: {'eta': 0.02179978349993165, 'gamma': 1.711673680917428e-08, 'max_depth': 9, 'min_child_weight': 3, 'subsample': 0.923626323320152, 'colsample_bytree': 0.5948183540628171}. Best is trial 0 with value: 9.372654741053019.
[I 2025-02-17 17:22:03,330] Trial 2 finished with value: 9.11016954243307 and parameters: {'eta': 0.04771580680132028, 'gamma': 4.680288337286944, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.5154433292543668, 'colsample_bytree': 0.8786046232824078}. Best is trial 2 with value: 9.11016954243307.
[I 2025-02-17 17:22:03,750] Trial 3 finished with value: 9.258193559724

Fold: 5/5
Model name: xgb
MAE: 7.2879034424928975
MSE: 84.29273922267429
RMSE: 9.181107733965128
PCC: 0.6765307381797726
Spearman R: 0.683464377329835
R2 Score: 0.45730990806597804



[I 2025-02-17 17:22:06,580] Trial 1 finished with value: 9.861289346293553 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 7}. Best is trial 1 with value: 9.861289346293553.
[I 2025-02-17 17:22:06,702] Trial 2 finished with value: 9.857121561313113 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 2 with value: 9.857121561313113.
[I 2025-02-17 17:22:08,453] Trial 3 finished with value: 9.509275117378182 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 7}. Best is trial 3 with value: 9.509275117378182.
[I 2025-02-17 17:22:08,581] Trial 4 finished with value: 9.809907971478266 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 3 w

Fold: 5/5
Model name: random_forest
MAE: 7.4514257839081095
MSE: 87.18527667044717
RMSE: 9.337305642981125
PCC: 0.6812461887159562
Spearman R: 0.6869755095607459
R2 Score: 0.4386872908876741



[I 2025-02-17 17:22:48,086] Trial 0 finished with value: 9.097021191484354 and parameters: {'loss': 'huber', 'criterion': 'squared_error', 'learning_rate': 0.048903558995946304, 'subsample': 0.9609855178090588, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_leaf_nodes': 38}. Best is trial 0 with value: 9.097021191484354.
[I 2025-02-17 17:22:48,697] Trial 1 finished with value: 10.846704467513424 and parameters: {'loss': 'huber', 'criterion': 'friedman_mse', 'learning_rate': 0.007088094892574632, 'subsample': 0.6037767632663125, 'max_depth': 4, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 'log2', 'max_leaf_nodes': 29}. Best is trial 0 with value: 9.097021191484354.
[I 2025-02-17 17:22:49,441] Trial 2 finished with value: 12.180265834621126 and parameters: {'loss': 'absolute_error', 'criterion': 'friedman_mse', 'learning_rate': 0.0010687879463432705, 'subsample': 0.7368211173912499, 'max_depth': 4, 'min_samples_split': 7, 'mi

Fold: 5/5
Model name: gradient_boosting
MAE: 7.373714989960438
MSE: 85.18869715849249
RMSE: 9.229772324304239
PCC: 0.672337582704402
Spearman R: 0.6777668038151577
R2 Score: 0.4515415880535777



[I 2025-02-17 17:22:56,932] Trial 0 finished with value: 9.322238296615584 and parameters: {'n_estimators': 93, 'loss': 'exponential', 'learning_rate': 0.06064596255063649}. Best is trial 0 with value: 9.322238296615584.
[I 2025-02-17 17:22:59,171] Trial 1 finished with value: 9.350112875782925 and parameters: {'n_estimators': 80, 'loss': 'exponential', 'learning_rate': 0.04747019313423497}. Best is trial 0 with value: 9.322238296615584.
[I 2025-02-17 17:23:01,477] Trial 2 finished with value: 9.396050539567277 and parameters: {'n_estimators': 81, 'loss': 'exponential', 'learning_rate': 0.023948839824421277}. Best is trial 0 with value: 9.322238296615584.
[I 2025-02-17 17:23:03,746] Trial 3 finished with value: 9.43185759394802 and parameters: {'n_estimators': 79, 'loss': 'linear', 'learning_rate': 0.01070341975596317}. Best is trial 0 with value: 9.322238296615584.
[I 2025-02-17 17:23:06,280] Trial 4 finished with value: 9.394737485891573 and parameters: {'n_estimators': 89, 'loss': '

Fold: 5/5
Model name: ada_boost
MAE: 7.426850861174142
MSE: 87.12109462623654
RMSE: 9.333868149177839
PCC: 0.6630597106516927
Spearman R: 0.6739915683207954
R2 Score: 0.4391005051193433



[I 2025-02-17 17:23:20,498] Trial 1 finished with value: 11.37360891473757 and parameters: {'learning_rate': 0.0023170685921187714, 'num_leaves': 916, 'subsample': 0.8060543041411522, 'colsample_bytree': 0.9333554854157214, 'min_data_in_leaf': 68}. Best is trial 0 with value: 9.070071072951906.
[I 2025-02-17 17:23:21,166] Trial 2 finished with value: 11.094886501727308 and parameters: {'learning_rate': 0.0036144388127091255, 'num_leaves': 406, 'subsample': 0.798707352614215, 'colsample_bytree': 0.7719598852723173, 'min_data_in_leaf': 2}. Best is trial 0 with value: 9.070071072951906.
[I 2025-02-17 17:23:21,646] Trial 3 finished with value: 10.367075186134937 and parameters: {'learning_rate': 0.0071287264729174345, 'num_leaves': 761, 'subsample': 0.49950168766501396, 'colsample_bytree': 0.7229458697330952, 'min_data_in_leaf': 8}. Best is trial 0 with value: 9.070071072951906.
[I 2025-02-17 17:23:21,712] Trial 4 finished with value: 9.054729004548776 and parameters: {'learning_rate': 0.0

Fold: 5/5
Model name: lgbm
MAE: 7.240123700783736
MSE: 83.0225580728234
RMSE: 9.11167153012132
PCC: 0.6838796540925558
Spearman R: 0.6891210377425929
R2 Score: 0.4654875367839687



Compute average scores and rank models by R2 score

In [13]:
for model_name, model_metrics in model_scores.items():
    for metric, scores in model_metrics.items():
        model_scores[model_name][metric] = sum(scores) / len(scores)
    model_scores[model_name] = dict(model_scores[model_name])

model_scores = dict(sorted(model_scores.items(), key=lambda x: x[1]["r2_score"], reverse=True))

In [14]:
# Print results
for i, (model_name, model_metrics) in enumerate(model_scores.items()):
    print(f"No.{i+1} Model: {model_name}")
    for metric, score in model_metrics.items():
        print(f"{metric}: {score}")
    print()

No.1 Model: xgb
mae: 7.348834685791911
mse: 85.34984311773692
rmse: 9.23768638432046
pcc: 0.6881085223299264
spearman_r: 0.693551724238955
r2_score: 0.471323852117405

No.2 Model: lgbm
mae: 7.379569888529988
mse: 86.12676660898215
rmse: 9.279862671845615
pcc: 0.6841882969072336
spearman_r: 0.6900407215002818
r2_score: 0.4665867514511738

No.3 Model: gradient_boosting
mae: 7.443432172647121
mse: 87.30004841851382
rmse: 9.34282259191409
pcc: 0.6803775050998292
spearman_r: 0.6851682215500865
r2_score: 0.4592539646620007

No.4 Model: random_forest
mae: 7.618419488992542
mse: 91.34720553814608
rmse: 9.556711006562619
pcc: 0.678904992614694
spearman_r: 0.6851569416628679
r2_score: 0.43429242738849744

No.5 Model: ada_boost
mae: 7.615960181540885
mse: 91.77427225533208
rmse: 9.57840908605571
pcc: 0.6577834727870909
spearman_r: 0.6685162617000893
r2_score: 0.431702809826693

No.6 Model: lasso
mae: 8.563903852702005
mse: 114.92431525815486
rmse: 10.71801341201545
pcc: 0.5377590357929295
spearma