In [1]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import optuna
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split

from src.utils import get_kfold_data, convert_non_numeric_to_numeric, calculate_r2_score, calculate_metrics
from src.normalisation import Normaliser
from src.constants import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv(DATA_PATH)

In [3]:
data.head()

Unnamed: 0,outcome,carat,cut,color,clarity,depth,table,price,x,y,...,a6,a7,a8,a9,a10,b6,b7,b8,b9,b10
0,-26.701232,1.14,Ideal,G,VS1,62.3,56.0,7948,6.73,6.7,...,0.168836,-0.273758,1.107832,1.247795,0.482344,0.489511,-0.321138,0.573382,0.446871,-1.990581
1,6.548093,0.38,Premium,H,VS2,60.5,59.0,898,4.69,4.66,...,-0.256549,0.315373,-0.030326,-0.114335,-1.059588,-1.76136,-1.343951,-1.00255,-0.22503,-0.446653
2,6.612562,0.5,Very Good,E,SI1,60.7,58.0,1351,5.09,5.13,...,-1.193327,-0.657307,-0.591726,-0.446856,-0.765286,-0.816544,-1.397794,-0.47713,0.810509,1.725131
3,-5.073562,0.7,Premium,D,SI1,61.2,58.0,2512,5.74,5.7,...,-1.740788,-1.77886,-0.82507,0.444932,1.173109,0.453606,-0.26344,0.24621,-0.850503,-0.41295
4,-14.436557,0.83,Ideal,G,SI2,62.4,54.0,2751,6.01,6.08,...,-0.859322,1.409268,0.861992,1.109063,-1.436722,-1.461618,0.081787,0.258087,0.851146,2.204813


Inspecting columns

In [4]:
# Find columns
all_columns = data.columns.tolist()
print(all_columns)

numeric_columns = data.select_dtypes(include=["number"]).columns.tolist()
numeric_columns.remove("outcome") # Remove the target column
print(numeric_columns)

non_numeric_columns = data.select_dtypes(exclude=["number"]).columns.tolist()
print(non_numeric_columns)

['outcome', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['cut', 'color', 'clarity']


In [5]:
for non_numeric_column in non_numeric_columns:
    print(data[non_numeric_column].value_counts())

cut
Ideal        4040
Premium      2439
Very Good    2296
Good          925
Fair          300
Name: count, dtype: int64
color
G    2120
E    1873
F    1746
H    1506
D    1246
I     983
J     526
Name: count, dtype: int64
clarity
SI1     2408
VS2     2256
SI2     1743
VS1     1503
VVS2     951
VVS1     675
IF       318
I1       146
Name: count, dtype: int64


Converting non-numeric features to numerical features

In [6]:
data = convert_non_numeric_to_numeric(data=data)
print(data)

['G', 'E', 'F', 'H', 'D', 'I', 'J']
        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...      

Normalise data using each columns respective mean and std.

In [7]:
print(data)

        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...        b8        b9       b10  colour_G  

In [8]:
normaliser = Normaliser()
for column in numeric_columns:
    print(data[column])
    data[column] = normaliser.standardise(data[column])
    print("after", data[column])

0       1.14
1       0.38
2       0.50
3       0.70
4       0.83
        ... 
9995    0.33
9996    1.01
9997    0.52
9998    0.31
9999    0.37
Name: carat, Length: 10000, dtype: float64
after 0       0.723643
1      -0.886369
2      -0.632156
3      -0.208469
4       0.066928
          ...   
9995   -0.992290
9996    0.448246
9997   -0.589788
9998   -1.034659
9999   -0.907553
Name: carat, Length: 10000, dtype: float64
0       62.3
1       60.5
2       60.7
3       61.2
4       62.4
        ... 
9995    62.6
9996    69.5
9997    57.9
9998    62.0
9999    59.9
Name: depth, Length: 10000, dtype: float64
after 0       0.386072
1      -0.872995
2      -0.733098
3      -0.383358
4       0.456020
          ...   
9995    0.595916
9996    5.422336
9997   -2.691646
9998    0.176227
9999   -1.292683
Name: depth, Length: 10000, dtype: float64
0       56.0
1       59.0
2       58.0
3       58.0
4       54.0
        ... 
9995    57.0
9996    55.0
9997    61.0
9998    54.0
9999    59.0
Name: table, 

In [9]:
print(data)

        outcome     carat  cut  clarity     depth     table     price  \
0    -26.701232  0.723643    0        3  0.386072 -0.653020  1.024563   
1      6.548093 -0.886369    1        4 -0.872995  0.682072 -0.764609   
2      6.612562 -0.632156    2        5 -0.733098  0.237041 -0.649645   
3     -5.073562 -0.208469    1        5 -0.383358  0.237041 -0.355003   
4    -14.436557  0.066928    0        6  0.456020 -1.543082 -0.294349   
...         ...       ...  ...      ...       ...       ...       ...   
9995  10.718277 -0.992290    0        3  0.595916 -0.207990 -0.738215   
9996 -12.246698  0.448246    4        5  5.422336 -1.098051  0.239104   
9997  11.122516 -0.589788    2        6 -2.691646  1.572133 -0.669440   
9998 -24.730782 -1.034659    0        0  0.176227 -1.543082 -0.789226   
9999   8.735755 -0.907553    2        5 -1.292683  0.682072 -0.827801   

             x         y         z  ...        b8        b9       b10  \
0     0.893417  0.780367  0.923092  ...  0.593856 

Data splitting:
- Split the entire dataset into training and testing sets first.
- Use the training set to generate folds (one validation and the rest training folds) (K-Fold Cross Validation)

In [10]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=REPRODUCIBILITY_SEED)
print(f"Training set size: {len(train_data)} | Test set size: {len(test_data)}")    
print()

kfold_data = get_kfold_data(data=train_data, k=NUM_FOLDS, reproducibility_seed=REPRODUCIBILITY_SEED)

Training set size: 8000 | Test set size: 2000

Fold: 0/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 1/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 2/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 3/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%

Fold: 4/5
Train shape: (6400, 37) | 80.00%
Validation shape: (1600, 37) | 20.00%



Define models and hyperparameter tuning objectives for each model

In [11]:
models = {
        "linear_regression": LinearRegression,
        "lasso": Lasso,
        "ridge": Ridge,
        "xgb": xgb.XGBRegressor,
        "random_forest": RandomForestRegressor,
        "gradient_boosting": GradientBoostingRegressor,
        "ada_boost": AdaBoostRegressor,
        "lgbm": lgb.LGBMRegressor
        }

def objective(model_type, trial, x_train, y_train, x_val, y_val):
    if model_type == LinearRegression:
        parameters = {
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        }
    elif model_type == Lasso:
        parameters = {
            "alpha": trial.suggest_float("alpha", 1e-3, 0.1, log=True),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            "selection": trial.suggest_categorical("selection", ["cyclic", "random"]),
            "warm_start": trial.suggest_categorical("warm_start", [True, False]),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == Ridge:
        parameters = {
            "alpha": trial.suggest_float("alpha", 1e-3, 0.1, log=True),
            "solver": trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            "positive": False,
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == xgb.XGBRegressor:
        parameters = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "n_estimators": 100,
            "eta": trial.suggest_float("eta", 1e-2, 0.2, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 10, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 6),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "seed": REPRODUCIBILITY_SEED
        }
    elif model_type == RandomForestRegressor:
        parameters = {
            "n_estimators": 100,
            "criterion": trial.suggest_categorical("criterion", ["absolute_error", "squared_error"]),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "bootstrap": True,
            "oob_score": False,
            "n_jobs": -1,
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == GradientBoostingRegressor:
        parameters = {
            "n_estimators": 100,
            "loss": trial.suggest_categorical("loss", ["absolute_error", "squared_error", "huber", "quantile"]),
            "criterion": trial.suggest_categorical("criterion", ["friedman_mse", "squared_error"]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "subsample": trial.suggest_float("subsample", 0.05, 1.0),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 2**10),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == AdaBoostRegressor:
        parameters = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 100),
            "loss": trial.suggest_categorical("loss", ["linear", "square", "exponential"]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == lgb.LGBMRegressor:
        parameters = {
                    "objective": "regression",
                    "metric": "rmse",
                    "n_estimators": 100,
                    "verbosity": -1,
                    "bagging_freq": 1,
                    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
                    "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
                    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
                    "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
                    "seed": REPRODUCIBILITY_SEED
        }

    
    model = model_type(**parameters) # Create the model
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)
    metrics = calculate_metrics(targets=y_val, preds=predictions)
    rmse = metrics["rmse"]
    return rmse

In [12]:
# Train + Validate models
metrics = ["mae", "mse", "rmse", "pcc", "spearman_r", "r2_score"]
model_scores = {model_name: defaultdict(list) for model_name in models.keys()}

for fold in range(NUM_FOLDS):
    fold_data = kfold_data[fold]
     
    # Extract data
    train_data = fold_data["train"]
    val_data = fold_data["val"]

    train_y = train_data["outcome"]
    val_y = val_data["outcome"]
    
    train_x = train_data.drop(columns=["outcome"])
    val_x = val_data.drop(columns=["outcome"])

    # print(f"Fold {fold+1}/{NUM_FOLDS}")
    # print(f"Train data shape: {train_x.shape} | Train target shape: {train_y.shape}")
    # print(f"Val data shape: {val_x.shape} | Val target shape: {val_y.shape}")
    # print(f"Test data shape: {test_x.shape} | Test target shape: {test_y.shape}")

    # Train model
    for model_name, model in models.items():
        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: objective(trial=trial, 
                                               model_type=model, 
                                               x_train=train_x, 
                                               y_train=train_y, 
                                               x_val=val_x, 
                                               y_val=val_y
                                               ), n_trials=N_TRIALS)
        
        # Train model with best hyperparameters
        best_fold_params = study.best_params
        model = model(**best_fold_params)
        model.fit(train_x, train_y)
        preds = model.predict(val_x)

        metrics = calculate_metrics(targets=val_y, preds=preds)
        mae = metrics["mae"]
        mse = metrics["mse"]
        rmse = metrics["rmse"]
        pcc = metrics["pcc"]
        spearman_r = metrics["spearman_r"]
        r2_score = metrics["r2_score"]

        for metric in metrics:
            model_scores[model_name][metric].append(metrics[metric])

        print(f"Fold: {fold+1}/{NUM_FOLDS}")
        print(f"Model name: {model_name}")
        print(f"MAE: {mae}")
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"PCC: {pcc}")
        print(f"Spearman R: {spearman_r}")
        print(f"R2 Score: {r2_score}")
        print()


[I 2025-02-17 16:52:19,660] A new study created in memory with name: no-name-71bb3804-c960-4a39-aed6-28a4a5faa27f
[I 2025-02-17 16:52:19,671] Trial 0 finished with value: 10.789711542941271 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 16:52:19,680] Trial 1 finished with value: 10.789711542941271 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 16:52:19,690] Trial 2 finished with value: 10.789711542941271 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 16:52:19,700] Trial 3 finished with value: 10.789711542941271 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 16:52:19,707] Trial 4 finished with value: 10.789711542941271 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.789711542941271.
[I 2025-02-17 16:52:19,715] Trial 5 finished with value: 1

Fold: 1/5
Model name: linear_regression
MAE: 8.70751717596474
MSE: 116.4178751798801
RMSE: 10.789711542941271
PCC: 0.5299360904553945
Spearman R: 0.5514073667606901
R2 Score: 0.2804847588737671



[I 2025-02-17 16:52:19,758] Trial 0 finished with value: 10.830423289907934 and parameters: {'alpha': 0.07729778830633975, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 0 with value: 10.830423289907934.
[I 2025-02-17 16:52:19,765] Trial 1 finished with value: 10.828135486105305 and parameters: {'alpha': 0.07408219540380476, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 1 with value: 10.828135486105305.
[I 2025-02-17 16:52:19,784] Trial 2 finished with value: 10.777451723402349 and parameters: {'alpha': 0.018716613701131993, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 2 with value: 10.777451723402349.
[I 2025-02-17 16:52:19,796] Trial 3 finished with value: 10.824032705489495 and parameters: {'alpha': 0.06695571312109415, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 2 with value: 10.777451723402349.
  model = cd_fast.enet_coordinate_desce

Fold: 1/5
Model name: lasso
MAE: 8.69630113852176
MSE: 116.15284123132396
RMSE: 10.777422754597872
PCC: 0.531236813927418
Spearman R: 0.5524663261587213
R2 Score: 0.2821227888163971



[I 2025-02-17 16:52:20,773] Trial 0 finished with value: 10.788818645614917 and parameters: {'alpha': 0.019200037526401995, 'solver': 'saga', 'fit_intercept': True}. Best is trial 0 with value: 10.788818645614917.
[I 2025-02-17 16:52:20,779] Trial 1 finished with value: 10.78940055143547 and parameters: {'alpha': 0.010785443415656282, 'solver': 'lsqr', 'fit_intercept': False}. Best is trial 0 with value: 10.788818645614917.
[I 2025-02-17 16:52:20,783] Trial 2 finished with value: 10.789696236310164 and parameters: {'alpha': 0.014093806298744125, 'solver': 'cholesky', 'fit_intercept': True}. Best is trial 0 with value: 10.788818645614917.
[I 2025-02-17 16:52:21,414] Trial 3 finished with value: 10.788832929562263 and parameters: {'alpha': 0.0023634068254566954, 'solver': 'saga', 'fit_intercept': True}. Best is trial 0 with value: 10.788818645614917.
[I 2025-02-17 16:52:21,418] Trial 4 finished with value: 10.789707755677837 and parameters: {'alpha': 0.0025627593783451113, 'solver': 'aut

Fold: 1/5
Model name: ridge
MAE: 8.707021319073988
MSE: 116.3976921814434
RMSE: 10.788776213335941
PCC: 0.5300491105123213
Spearman R: 0.5514766802643283
R2 Score: 0.2806094989531097



[I 2025-02-17 16:52:23,274] Trial 0 finished with value: 9.54860128644346 and parameters: {'eta': 0.1342536450961115, 'gamma': 4.178623001124848e-06, 'max_depth': 7, 'min_child_weight': 6, 'subsample': 0.9553070299646376, 'colsample_bytree': 0.5578699456906179}. Best is trial 0 with value: 9.54860128644346.
[I 2025-02-17 16:52:23,603] Trial 1 finished with value: 9.37814590436285 and parameters: {'eta': 0.04405607657898593, 'gamma': 1.4058900480299355e-05, 'max_depth': 9, 'min_child_weight': 2, 'subsample': 0.5309961470291238, 'colsample_bytree': 0.9884590401896864}. Best is trial 1 with value: 9.37814590436285.
[I 2025-02-17 16:52:23,859] Trial 2 finished with value: 9.494377249336077 and parameters: {'eta': 0.058457119563401154, 'gamma': 5.61561013655387e-07, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.7414772947509954, 'colsample_bytree': 0.5338235112431065}. Best is trial 1 with value: 9.37814590436285.
[I 2025-02-17 16:52:24,006] Trial 3 finished with value: 9.4921017004

Fold: 1/5
Model name: xgb
MAE: 7.41592906084104
MSE: 84.8220034947302
RMSE: 9.209886182506828
PCC: 0.6929008504312886
Spearman R: 0.6949916650358067
R2 Score: 0.47576156837581074



[I 2025-02-17 16:52:27,675] Trial 0 finished with value: 9.565246232844824 and parameters: {'criterion': 'absolute_error', 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 9.565246232844824.
[I 2025-02-17 16:52:27,812] Trial 1 finished with value: 9.809701644289522 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 9, 'min_samples_split': 4, 'min_samples_leaf': 8}. Best is trial 0 with value: 9.565246232844824.
[I 2025-02-17 16:52:27,929] Trial 2 finished with value: 10.35776314537956 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 0 with value: 9.565246232844824.
[I 2025-02-17 16:52:29,197] Trial 3 finished with value: 10.977218503053804 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 3, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0

Fold: 1/5
Model name: random_forest
MAE: 7.70902217596265
MSE: 91.69320764314936
RMSE: 9.575657034540729
PCC: 0.6782027770091488
Spearman R: 0.6819127849268691
R2 Score: 0.4332944120045199



[I 2025-02-17 16:53:07,119] Trial 0 finished with value: 12.337984377202538 and parameters: {'loss': 'huber', 'criterion': 'squared_error', 'learning_rate': 0.0012476113469481126, 'subsample': 0.9941978504899442, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'max_leaf_nodes': 196}. Best is trial 0 with value: 12.337984377202538.
[I 2025-02-17 16:53:07,592] Trial 1 finished with value: 11.106482879810534 and parameters: {'loss': 'huber', 'criterion': 'friedman_mse', 'learning_rate': 0.009409624953689662, 'subsample': 0.6217031096826351, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_leaf_nodes': 801}. Best is trial 1 with value: 11.106482879810534.
[I 2025-02-17 16:53:08,673] Trial 2 finished with value: 10.192093953058466 and parameters: {'loss': 'squared_error', 'criterion': 'friedman_mse', 'learning_rate': 0.012945054128955517, 'subsample': 0.895622148572554, 'max_depth': 6, 'min_samples_split': 4, 

Fold: 1/5
Model name: gradient_boosting
MAE: 7.503798511119755
MSE: 86.72109141422041
RMSE: 9.312415981592554
PCC: 0.6912135678816844
Spearman R: 0.6934270325886845
R2 Score: 0.4640243441720481



[I 2025-02-17 16:53:19,089] Trial 0 finished with value: 9.792341926624463 and parameters: {'n_estimators': 86, 'loss': 'linear', 'learning_rate': 0.0030634680628351726}. Best is trial 0 with value: 9.792341926624463.
[I 2025-02-17 16:53:21,191] Trial 1 finished with value: 9.793370018033938 and parameters: {'n_estimators': 73, 'loss': 'square', 'learning_rate': 0.00615019985981721}. Best is trial 0 with value: 9.792341926624463.
[I 2025-02-17 16:53:23,828] Trial 2 finished with value: 9.68146183216389 and parameters: {'n_estimators': 93, 'loss': 'square', 'learning_rate': 0.019211338783601185}. Best is trial 2 with value: 9.68146183216389.
[I 2025-02-17 16:53:25,532] Trial 3 finished with value: 9.802316499162723 and parameters: {'n_estimators': 60, 'loss': 'square', 'learning_rate': 0.006474799473772369}. Best is trial 2 with value: 9.68146183216389.
[I 2025-02-17 16:53:27,428] Trial 4 finished with value: 9.73567500446339 and parameters: {'n_estimators': 67, 'loss': 'linear', 'learn

Fold: 1/5
Model name: ada_boost
MAE: 7.769508278735686
MSE: 93.84269775407978
RMSE: 9.687244074249383
PCC: 0.6483456056832557
Spearman R: 0.661956164659959
R2 Score: 0.42000958874971583



[I 2025-02-17 16:53:39,597] Trial 1 finished with value: 10.6703900490015 and parameters: {'learning_rate': 0.028792544242817346, 'num_leaves': 387, 'subsample': 0.8278542045152988, 'colsample_bytree': 0.2674409321536746, 'min_data_in_leaf': 22}. Best is trial 1 with value: 10.6703900490015.
[I 2025-02-17 16:53:39,721] Trial 2 finished with value: 10.658754678482788 and parameters: {'learning_rate': 0.013473496778301678, 'num_leaves': 765, 'subsample': 0.5774417726018258, 'colsample_bytree': 0.32642119241577566, 'min_data_in_leaf': 47}. Best is trial 2 with value: 10.658754678482788.
[I 2025-02-17 16:53:39,787] Trial 3 finished with value: 11.186174752292244 and parameters: {'learning_rate': 0.01937764689635404, 'num_leaves': 773, 'subsample': 0.13863353358366798, 'colsample_bytree': 0.27130655664555076, 'min_data_in_leaf': 39}. Best is trial 2 with value: 10.658754678482788.
[I 2025-02-17 16:53:39,961] Trial 4 finished with value: 10.669335540323978 and parameters: {'learning_rate': 0

Fold: 1/5
Model name: lgbm
MAE: 8.674771745300532
MSE: 117.07179337469402
RMSE: 10.81997196737099
PCC: 0.6718332585973127
Spearman R: 0.6752207139534039
R2 Score: 0.27644324800706055

Fold: 2/5
Model name: linear_regression
MAE: 8.589235102672504
MSE: 116.93096828103435
RMSE: 10.81346236323197
PCC: 0.5423829744003588
Spearman R: 0.5661288207534456
R2 Score: 0.2926921600863055



[I 2025-02-17 16:53:40,789] Trial 4 finished with value: 10.816116068613614 and parameters: {'alpha': 0.04614893013327525, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 1 with value: 10.813120771564973.
[I 2025-02-17 16:53:40,805] Trial 5 finished with value: 10.813614448043243 and parameters: {'alpha': 0.0030813644158862244, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': True}. Best is trial 1 with value: 10.813120771564973.
[I 2025-02-17 16:53:40,828] Trial 6 finished with value: 10.813958851068636 and parameters: {'alpha': 0.010793458698034315, 'fit_intercept': True, 'selection': 'random', 'warm_start': False}. Best is trial 1 with value: 10.813120771564973.
[I 2025-02-17 16:53:40,859] Trial 7 finished with value: 10.813570371869975 and parameters: {'alpha': 0.004717435555070359, 'fit_intercept': True, 'selection': 'random', 'warm_start': False}. Best is trial 1 with value: 10.813120771564973.
[I 2025-02-17 16:53:40,871] Trial 8 finis

Fold: 2/5
Model name: lasso
MAE: 8.590301906153568
MSE: 116.91659314255486
RMSE: 10.812797655674265
PCC: 0.5425952283621374
Spearman R: 0.5663402798204218
R2 Score: 0.2927791143662182



[I 2025-02-17 16:53:41,230] Trial 1 finished with value: 10.813452678374547 and parameters: {'alpha': 0.06087983962073559, 'solver': 'saga', 'fit_intercept': False}. Best is trial 1 with value: 10.813452678374547.
[I 2025-02-17 16:53:41,236] Trial 2 finished with value: 10.813721365412082 and parameters: {'alpha': 0.00191477291900844, 'solver': 'sparse_cg', 'fit_intercept': False}. Best is trial 1 with value: 10.813452678374547.
[I 2025-02-17 16:53:41,385] Trial 3 finished with value: 10.813466144951528 and parameters: {'alpha': 0.016564832162884173, 'solver': 'sag', 'fit_intercept': False}. Best is trial 1 with value: 10.813452678374547.
[I 2025-02-17 16:53:41,390] Trial 4 finished with value: 10.81344576623703 and parameters: {'alpha': 0.0054762581776392275, 'solver': 'sparse_cg', 'fit_intercept': True}. Best is trial 4 with value: 10.81344576623703.
[I 2025-02-17 16:53:41,394] Trial 5 finished with value: 10.813463043834153 and parameters: {'alpha': 0.008386501133770784, 'solver': '

Fold: 2/5
Model name: ridge
MAE: 8.589229731607379
MSE: 116.93060933934954
RMSE: 10.81344576623703
PCC: 0.5423846792083384
Spearman R: 0.5661178549288497
R2 Score: 0.29269433130126643



[I 2025-02-17 16:53:42,489] Trial 0 finished with value: 10.032158215248085 and parameters: {'eta': 0.15944795390283484, 'gamma': 0.002328100657246368, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.6493280661665483, 'colsample_bytree': 0.6694153703675165}. Best is trial 0 with value: 10.032158215248085.
[I 2025-02-17 16:53:42,609] Trial 1 finished with value: 9.484976073662096 and parameters: {'eta': 0.0851939739323465, 'gamma': 2.0956072427465178, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.9603679272086765, 'colsample_bytree': 0.5918782964150029}. Best is trial 1 with value: 9.484976073662096.
[I 2025-02-17 16:53:42,832] Trial 2 finished with value: 9.75295292482477 and parameters: {'eta': 0.08608399948044067, 'gamma': 0.5070220278665817, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.6362675256447895, 'colsample_bytree': 0.5409770018823725}. Best is trial 1 with value: 9.484976073662096.
[I 2025-02-17 16:53:43,052] Trial 3 finished with value: 10.35286115827

Fold: 2/5
Model name: xgb
MAE: 7.404366276491894
MSE: 87.48022355390098
RMSE: 9.35308631168883
PCC: 0.6865035975672337
Spearman R: 0.6940143824274932
R2 Score: 0.4708378039907781



[I 2025-02-17 16:53:45,801] Trial 1 finished with value: 9.895820008724238 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 9, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 1 with value: 9.895820008724238.
[I 2025-02-17 16:53:47,343] Trial 2 finished with value: 10.424875098831128 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 10}. Best is trial 1 with value: 9.895820008724238.
[I 2025-02-17 16:53:48,851] Trial 3 finished with value: 10.437241945518235 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 8}. Best is trial 1 with value: 9.895820008724238.
[I 2025-02-17 16:53:50,600] Trial 4 finished with value: 9.987617972412147 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is tria

Fold: 2/5
Model name: random_forest
MAE: 7.6363677020032075
MSE: 93.03770726442177
RMSE: 9.645605593451442
PCC: 0.6818737195502971
Spearman R: 0.6877115176607492
R2 Score: 0.4372209456304119



[I 2025-02-17 16:53:55,702] Trial 0 finished with value: 19.543575956536873 and parameters: {'loss': 'quantile', 'criterion': 'squared_error', 'learning_rate': 0.004962454717448584, 'subsample': 0.13600904858471807, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'max_leaf_nodes': 177}. Best is trial 0 with value: 19.543575956536873.
[I 2025-02-17 16:53:56,201] Trial 1 finished with value: 9.59892432423168 and parameters: {'loss': 'huber', 'criterion': 'squared_error', 'learning_rate': 0.09410462494486146, 'subsample': 0.2924987768765818, 'max_depth': 5, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_leaf_nodes': 199}. Best is trial 1 with value: 9.59892432423168.
[I 2025-02-17 16:53:56,498] Trial 2 finished with value: 9.495830780240693 and parameters: {'loss': 'absolute_error', 'criterion': 'friedman_mse', 'learning_rate': 0.055349105695156074, 'subsample': 0.19740827906327574, 'max_depth': 4, 'min_samples_split': 

Fold: 2/5
Model name: gradient_boosting
MAE: 7.506534552279702
MSE: 89.80003157695096
RMSE: 9.47628785848926
PCC: 0.6819416198752478
Spearman R: 0.6869907878870266
R2 Score: 0.45680543578311594



[I 2025-02-17 16:54:03,326] Trial 0 finished with value: 9.836822588320743 and parameters: {'n_estimators': 70, 'loss': 'exponential', 'learning_rate': 0.0034303138082165258}. Best is trial 0 with value: 9.836822588320743.
[I 2025-02-17 16:54:05,007] Trial 1 finished with value: 9.77427394825749 and parameters: {'n_estimators': 60, 'loss': 'linear', 'learning_rate': 0.03343758866438751}. Best is trial 1 with value: 9.77427394825749.
[I 2025-02-17 16:54:07,569] Trial 2 finished with value: 9.826487515096389 and parameters: {'n_estimators': 90, 'loss': 'linear', 'learning_rate': 0.005459053676961409}. Best is trial 1 with value: 9.77427394825749.
[I 2025-02-17 16:54:09,105] Trial 3 finished with value: 9.852836884174891 and parameters: {'n_estimators': 54, 'loss': 'exponential', 'learning_rate': 0.001363950721811646}. Best is trial 1 with value: 9.77427394825749.
[I 2025-02-17 16:54:11,584] Trial 4 finished with value: 9.703367500167607 and parameters: {'n_estimators': 98, 'loss': 'squar

Fold: 2/5
Model name: ada_boost
MAE: 7.6600027953568235
MSE: 93.44558333268564
RMSE: 9.666725574499653
PCC: 0.660996846942407
Spearman R: 0.6667557982495149
R2 Score: 0.434753729759054



[I 2025-02-17 16:54:24,742] Trial 0 finished with value: 11.478345360941528 and parameters: {'learning_rate': 0.004197896671473455, 'num_leaves': 950, 'subsample': 0.22264401196420208, 'colsample_bytree': 0.697191276243512, 'min_data_in_leaf': 8}. Best is trial 0 with value: 11.478345360941528.
[I 2025-02-17 16:54:24,949] Trial 1 finished with value: 12.427925038108603 and parameters: {'learning_rate': 0.0014321830037710942, 'num_leaves': 512, 'subsample': 0.8646371708571816, 'colsample_bytree': 0.4754649726582509, 'min_data_in_leaf': 29}. Best is trial 0 with value: 11.478345360941528.
[I 2025-02-17 16:54:25,187] Trial 2 finished with value: 9.704962326536247 and parameters: {'learning_rate': 0.03973448080828983, 'num_leaves': 435, 'subsample': 0.20545309188455185, 'colsample_bytree': 0.6267050201033133, 'min_data_in_leaf': 6}. Best is trial 2 with value: 9.704962326536247.
[I 2025-02-17 16:54:25,235] Trial 3 finished with value: 9.943230829038718 and parameters: {'learning_rate': 0.0

Fold: 2/5
Model name: lgbm
MAE: 7.510640637770284
MSE: 90.42615177801996
RMSE: 9.50926662671838
PCC: 0.6745308361423505
Spearman R: 0.6839847837831188
R2 Score: 0.45301807531347493

Fold: 3/5
Model name: linear_regression
MAE: 8.755107813660905
MSE: 120.93045179515703
RMSE: 10.996838263571808
PCC: 0.5082810320078581
Spearman R: 0.5304086007455472
R2 Score: 0.2563254092788819



[I 2025-02-17 16:54:26,721] Trial 6 finished with value: 10.992161849254824 and parameters: {'alpha': 0.006897600954286823, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 3 with value: 10.972291819975874.
[I 2025-02-17 16:54:26,745] Trial 7 finished with value: 10.995034525254392 and parameters: {'alpha': 0.002535590717239805, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 3 with value: 10.972291819975874.
[I 2025-02-17 16:54:26,779] Trial 8 finished with value: 10.990515988418153 and parameters: {'alpha': 0.008356633630984472, 'fit_intercept': True, 'selection': 'random', 'warm_start': True}. Best is trial 3 with value: 10.972291819975874.
[I 2025-02-17 16:54:26,826] Trial 9 finished with value: 10.995362416145989 and parameters: {'alpha': 0.002291173229758973, 'fit_intercept': False, 'selection': 'random', 'warm_start': True}. Best is trial 3 with value: 10.972291819975874.
[I 2025-02-17 16:54:26,833] A new study 

Fold: 3/5
Model name: lasso
MAE: 8.733965393350642
MSE: 120.39202272146598
RMSE: 10.972329867510636
PCC: 0.5101856030952661
Spearman R: 0.5367754557716624
R2 Score: 0.25963653575749457



[I 2025-02-17 16:54:27,180] Trial 3 finished with value: 10.996861146199718 and parameters: {'alpha': 0.03336109792437315, 'solver': 'sag', 'fit_intercept': False}. Best is trial 2 with value: 10.9968309870709.
[I 2025-02-17 16:54:27,186] Trial 4 finished with value: 10.996830690280891 and parameters: {'alpha': 0.007965007113321864, 'solver': 'sparse_cg', 'fit_intercept': True}. Best is trial 4 with value: 10.996830690280891.
[I 2025-02-17 16:54:27,475] Trial 5 finished with value: 10.99691627023197 and parameters: {'alpha': 0.006333971299784133, 'solver': 'saga', 'fit_intercept': False}. Best is trial 4 with value: 10.996830690280891.
[I 2025-02-17 16:54:27,479] Trial 6 finished with value: 10.996819915398472 and parameters: {'alpha': 0.09676361846887953, 'solver': 'auto', 'fit_intercept': True}. Best is trial 6 with value: 10.996819915398472.
[I 2025-02-17 16:54:27,486] Trial 7 finished with value: 10.997303890323009 and parameters: {'alpha': 0.005651286887047073, 'solver': 'lsqr', '

Fold: 3/5
Model name: ridge
MAE: 8.7550837647293
MSE: 120.93004825170445
RMSE: 10.996819915398472
PCC: 0.5082824046647236
Spearman R: 0.5304103292618474
R2 Score: 0.2563278909119816



[I 2025-02-17 16:54:27,725] Trial 0 finished with value: 9.556597895946506 and parameters: {'eta': 0.03368266155100214, 'gamma': 0.0009464139043356663, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.8083888145720756, 'colsample_bytree': 0.5855449527913253}. Best is trial 0 with value: 9.556597895946506.
[I 2025-02-17 16:54:27,810] Trial 1 finished with value: 9.783965054830317 and parameters: {'eta': 0.016133882769379538, 'gamma': 0.37940597380018787, 'max_depth': 4, 'min_child_weight': 6, 'subsample': 0.600447376077214, 'colsample_bytree': 0.7882464508087668}. Best is trial 0 with value: 9.556597895946506.
[I 2025-02-17 16:54:27,907] Trial 2 finished with value: 9.56915138337885 and parameters: {'eta': 0.021558653834521758, 'gamma': 0.003573854534199866, 'max_depth': 5, 'min_child_weight': 6, 'subsample': 0.8865495227767528, 'colsample_bytree': 0.7614609516362199}. Best is trial 0 with value: 9.556597895946506.
[I 2025-02-17 16:54:27,980] Trial 3 finished with value: 10.2956086

Fold: 3/5
Model name: xgb
MAE: 7.4374165125715805
MSE: 89.15269115959747
RMSE: 9.442070279318909
PCC: 0.6727004921169427
Spearman R: 0.6773906405822815
R2 Score: 0.45174610591792175



[I 2025-02-17 16:54:31,002] Trial 1 finished with value: 9.763059409435122 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 1 with value: 9.763059409435122.
[I 2025-02-17 16:54:32,185] Trial 2 finished with value: 11.121045884669542 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 7}. Best is trial 1 with value: 9.763059409435122.
[I 2025-02-17 16:54:32,310] Trial 3 finished with value: 10.505287842475196 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 1 with value: 9.763059409435122.
[I 2025-02-17 16:54:32,452] Trial 4 finished with value: 9.643939817164561 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is tria

Fold: 3/5
Model name: random_forest
MAE: 7.648012151665793
MSE: 93.40886156671837
RMSE: 9.66482599774659
PCC: 0.6633472211224046
Spearman R: 0.6732977864444479
R2 Score: 0.42557233629605284



[I 2025-02-17 16:54:38,388] Trial 0 finished with value: 10.355201028422092 and parameters: {'loss': 'absolute_error', 'criterion': 'squared_error', 'learning_rate': 0.012491832545531397, 'subsample': 0.501182237425421, 'max_depth': 9, 'min_samples_split': 8, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_leaf_nodes': 31}. Best is trial 0 with value: 10.355201028422092.
[I 2025-02-17 16:54:39,427] Trial 1 finished with value: 20.274040246564194 and parameters: {'loss': 'quantile', 'criterion': 'friedman_mse', 'learning_rate': 0.0010222838227453037, 'subsample': 0.5458423445346811, 'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 10, 'max_features': 'log2', 'max_leaf_nodes': 343}. Best is trial 0 with value: 10.355201028422092.
[I 2025-02-17 16:54:39,704] Trial 2 finished with value: 20.05573829917224 and parameters: {'loss': 'quantile', 'criterion': 'squared_error', 'learning_rate': 0.0017410001803400185, 'subsample': 0.07514808688975143, 'max_depth': 7, 'min_samples_sp

Fold: 3/5
Model name: gradient_boosting
MAE: 7.496037096987671
MSE: 90.35109459217374
RMSE: 9.505319278812982
PCC: 0.6679889626669259
Spearman R: 0.6731191867653072
R2 Score: 0.4443763973870253



[I 2025-02-17 16:54:50,660] Trial 0 finished with value: 9.827314796222547 and parameters: {'n_estimators': 100, 'loss': 'exponential', 'learning_rate': 0.009199010946538466}. Best is trial 0 with value: 9.827314796222547.
[I 2025-02-17 16:54:53,090] Trial 1 finished with value: 9.825355075757846 and parameters: {'n_estimators': 85, 'loss': 'linear', 'learning_rate': 0.007628677135177158}. Best is trial 1 with value: 9.825355075757846.
[I 2025-02-17 16:54:54,833] Trial 2 finished with value: 9.786794735468744 and parameters: {'n_estimators': 61, 'loss': 'exponential', 'learning_rate': 0.02754211052062974}. Best is trial 2 with value: 9.786794735468744.
[I 2025-02-17 16:54:56,926] Trial 3 finished with value: 9.825219767778798 and parameters: {'n_estimators': 73, 'loss': 'square', 'learning_rate': 0.0016839975563779396}. Best is trial 2 with value: 9.786794735468744.
[I 2025-02-17 16:54:59,731] Trial 4 finished with value: 9.809407647846239 and parameters: {'n_estimators': 98, 'loss': '

Fold: 3/5
Model name: ada_boost
MAE: 7.679417473466685
MSE: 95.46219202166289
RMSE: 9.770475526895448
PCC: 0.6433842205593424
Spearman R: 0.6587000930878172
R2 Score: 0.412945163710254



[I 2025-02-17 16:55:12,495] Trial 2 finished with value: 11.801757097793478 and parameters: {'learning_rate': 0.0025515420312026936, 'num_leaves': 839, 'subsample': 0.6327307337043715, 'colsample_bytree': 0.715902020198422, 'min_data_in_leaf': 68}. Best is trial 0 with value: 9.561152982146341.
[I 2025-02-17 16:55:12,659] Trial 3 finished with value: 10.502670953700266 and parameters: {'learning_rate': 0.010476528149051076, 'num_leaves': 131, 'subsample': 0.7739537543631246, 'colsample_bytree': 0.5166393158612744, 'min_data_in_leaf': 25}. Best is trial 0 with value: 9.561152982146341.
[I 2025-02-17 16:55:12,732] Trial 4 finished with value: 9.438598868719177 and parameters: {'learning_rate': 0.07599782051876308, 'num_leaves': 535, 'subsample': 0.6280401988013393, 'colsample_bytree': 0.45429028200911636, 'min_data_in_leaf': 96}. Best is trial 4 with value: 9.438598868719177.
[I 2025-02-17 16:55:13,051] Trial 5 finished with value: 11.979960329313686 and parameters: {'learning_rate': 0.0

Fold: 3/5
Model name: lgbm
MAE: 7.437808482149877
MSE: 89.16023848960317
RMSE: 9.442469935859112
PCC: 0.6727137225001714
Spearman R: 0.677572090848473
R2 Score: 0.4516996928146073

Fold: 4/5
Model name: linear_regression
MAE: 8.602220439887196
MSE: 115.140769914462
RMSE: 10.730366718545177
PCC: 0.5397874341390442
Spearman R: 0.5686795893279646
R2 Score: 0.29042379751749636



[I 2025-02-17 16:55:13,781] Trial 4 finished with value: 10.729269240479972 and parameters: {'alpha': 0.0011714759878202377, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 1 with value: 10.71669762795461.
[I 2025-02-17 16:55:13,792] Trial 5 finished with value: 10.847147073147282 and parameters: {'alpha': 0.08462797109144467, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 1 with value: 10.71669762795461.
[I 2025-02-17 16:55:13,808] Trial 6 finished with value: 10.728935275979895 and parameters: {'alpha': 0.0015413333962436774, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 1 with value: 10.71669762795461.
[I 2025-02-17 16:55:13,826] Trial 7 finished with value: 10.722620705155302 and parameters: {'alpha': 0.00753654778715231, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': True}. Best is trial 1 with value: 10.71669762795461.
[I 2025-02-17 16:55:13,836] Trial 8 finished

Fold: 4/5
Model name: lasso
MAE: 8.599988849607774
MSE: 114.70828399290666
RMSE: 10.710195329353553
PCC: 0.5426612460397977
Spearman R: 0.5716998434374388
R2 Score: 0.29308907166906206



[I 2025-02-17 16:55:14,028] Trial 5 finished with value: 10.73036860924288 and parameters: {'alpha': 0.007548529866401581, 'solver': 'sag', 'fit_intercept': True}. Best is trial 4 with value: 10.730142104602653.
[I 2025-02-17 16:55:14,192] Trial 6 finished with value: 10.73059359635086 and parameters: {'alpha': 0.058136054862025256, 'solver': 'sag', 'fit_intercept': False}. Best is trial 4 with value: 10.730142104602653.
[I 2025-02-17 16:55:14,197] Trial 7 finished with value: 10.73036683860379 and parameters: {'alpha': 0.020694149561951425, 'solver': 'sparse_cg', 'fit_intercept': True}. Best is trial 4 with value: 10.730142104602653.
[I 2025-02-17 16:55:14,203] Trial 8 finished with value: 10.730142812816954 and parameters: {'alpha': 0.0843444534294335, 'solver': 'lsqr', 'fit_intercept': True}. Best is trial 4 with value: 10.730142104602653.
[I 2025-02-17 16:55:14,208] Trial 9 finished with value: 10.73050086906901 and parameters: {'alpha': 0.0014699320753326992, 'solver': 'sparse_cg'

Fold: 4/5
Model name: ridge
MAE: 8.602195856924313
MSE: 115.13594958496664
RMSE: 10.730142104602653
PCC: 0.5398148898859719
Spearman R: 0.5686684711595592
R2 Score: 0.2904535036858724



[I 2025-02-17 16:55:14,689] Trial 1 finished with value: 9.40362556696121 and parameters: {'eta': 0.044734215609359314, 'gamma': 8.77381464461235e-06, 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.8126241439946221, 'colsample_bytree': 0.7211625265722406}. Best is trial 1 with value: 9.40362556696121.
[I 2025-02-17 16:55:14,845] Trial 2 finished with value: 9.345350644593607 and parameters: {'eta': 0.04076605168660906, 'gamma': 4.128373993784663e-05, 'max_depth': 7, 'min_child_weight': 2, 'subsample': 0.5034311912628586, 'colsample_bytree': 0.5966686045631542}. Best is trial 2 with value: 9.345350644593607.
[I 2025-02-17 16:55:14,902] Trial 3 finished with value: 9.093257868296316 and parameters: {'eta': 0.09409960167488594, 'gamma': 0.004274997298532611, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.8895000114894109, 'colsample_bytree': 0.7824268537406515}. Best is trial 3 with value: 9.093257868296316.
[I 2025-02-17 16:55:15,180] Trial 4 finished with value: 9.4579420

Fold: 4/5
Model name: xgb
MAE: 7.287025629577338
MSE: 83.36608600899348
RMSE: 9.13050305344637
PCC: 0.6978881049721999
Spearman R: 0.7046006600002579
R2 Score: 0.48624113969328775



[I 2025-02-17 16:55:16,369] Trial 1 finished with value: 9.438367981249224 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}. Best is trial 1 with value: 9.438367981249224.
[I 2025-02-17 16:55:17,794] Trial 2 finished with value: 10.764164582782893 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 4, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 1 with value: 9.438367981249224.
[I 2025-02-17 16:55:19,391] Trial 3 finished with value: 9.99137205385776 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 1 with value: 9.438367981249224.
[I 2025-02-17 16:55:19,493] Trial 4 finished with value: 10.655771198993584 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 3, 'min_samples_split': 6, 'min_samples_leaf': 4}. Best is trial 1

Fold: 4/5
Model name: random_forest
MAE: 7.510812856166285
MSE: 88.13941838130253
RMSE: 9.388259603425043
PCC: 0.6924802151919149
Spearman R: 0.7007743870212451
R2 Score: 0.45682459974444



[I 2025-02-17 16:55:27,353] Trial 0 finished with value: 16.877588103545726 and parameters: {'loss': 'quantile', 'criterion': 'squared_error', 'learning_rate': 0.014599561262577823, 'subsample': 0.7639791385082292, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_leaf_nodes': 670}. Best is trial 0 with value: 16.877588103545726.
[I 2025-02-17 16:55:27,834] Trial 1 finished with value: 15.32353782635578 and parameters: {'loss': 'quantile', 'criterion': 'squared_error', 'learning_rate': 0.031031494805532703, 'subsample': 0.14946432015534217, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_features': 'log2', 'max_leaf_nodes': 157}. Best is trial 1 with value: 15.32353782635578.
[I 2025-02-17 16:55:28,306] Trial 2 finished with value: 11.28612887609272 and parameters: {'loss': 'absolute_error', 'criterion': 'friedman_mse', 'learning_rate': 0.008075783690834453, 'subsample': 0.5959869785582275, 'max_depth': 3, 'min_samples_spli

Fold: 4/5
Model name: gradient_boosting
MAE: 7.58156667253814
MSE: 89.92610766705559
RMSE: 9.482937712916582
PCC: 0.6761337133703749
Spearman R: 0.680102187539917
R2 Score: 0.4458137979290391



[I 2025-02-17 16:55:34,131] Trial 0 finished with value: 9.509272387080713 and parameters: {'n_estimators': 87, 'loss': 'linear', 'learning_rate': 0.054182203400008386}. Best is trial 0 with value: 9.509272387080713.
[I 2025-02-17 16:55:35,622] Trial 1 finished with value: 9.749168585216191 and parameters: {'n_estimators': 52, 'loss': 'square', 'learning_rate': 0.005638180967991617}. Best is trial 0 with value: 9.509272387080713.
[I 2025-02-17 16:55:37,896] Trial 2 finished with value: 9.694926709829481 and parameters: {'n_estimators': 80, 'loss': 'exponential', 'learning_rate': 0.018714789185185234}. Best is trial 0 with value: 9.509272387080713.
[I 2025-02-17 16:55:40,096] Trial 3 finished with value: 9.734021549005975 and parameters: {'n_estimators': 77, 'loss': 'linear', 'learning_rate': 0.011429395448684824}. Best is trial 0 with value: 9.509272387080713.
[I 2025-02-17 16:55:41,971] Trial 4 finished with value: 9.651048231263882 and parameters: {'n_estimators': 66, 'loss': 'expone

Fold: 4/5
Model name: ada_boost
MAE: 7.558191279864056
MSE: 89.69316550090286
RMSE: 9.470647575583355
PCC: 0.6696650994893537
Spearman R: 0.6788451562429144
R2 Score: 0.4472493469337876



[I 2025-02-17 16:55:54,909] Trial 2 finished with value: 9.72794085926015 and parameters: {'learning_rate': 0.08912060309276744, 'num_leaves': 818, 'subsample': 0.18139681846941963, 'colsample_bytree': 0.26274175063290645, 'min_data_in_leaf': 24}. Best is trial 1 with value: 9.256265767155709.
[I 2025-02-17 16:55:55,012] Trial 3 finished with value: 9.401283379282344 and parameters: {'learning_rate': 0.02270150752347799, 'num_leaves': 390, 'subsample': 0.5618907954393326, 'colsample_bytree': 0.6074049797307022, 'min_data_in_leaf': 44}. Best is trial 1 with value: 9.256265767155709.
[I 2025-02-17 16:55:55,080] Trial 4 finished with value: 9.686248632634808 and parameters: {'learning_rate': 0.02005324902780516, 'num_leaves': 198, 'subsample': 0.4373925374681372, 'colsample_bytree': 0.49271297709247586, 'min_data_in_leaf': 57}. Best is trial 1 with value: 9.256265767155709.
[I 2025-02-17 16:55:55,164] Trial 5 finished with value: 12.251029001805833 and parameters: {'learning_rate': 0.0020

Fold: 4/5
Model name: lgbm
MAE: 7.4552322589789695
MSE: 86.70401078783715
RMSE: 9.311498847545284
PCC: 0.6887830357283526
Spearman R: 0.694995708006136
R2 Score: 0.46567056342821933

Fold: 5/5
Model name: linear_regression
MAE: 8.280015810684853
MSE: 119.27670375940366
RMSE: 10.921387446629831
PCC: 0.497221315847363
Spearman R: 0.5898355302091915
R2 Score: 0.23207756770388854



  model = cd_fast.enet_coordinate_descent(
[I 2025-02-17 16:55:56,736] Trial 3 finished with value: 10.320066304720648 and parameters: {'alpha': 0.0036527630808194625, 'fit_intercept': False, 'selection': 'random', 'warm_start': True}. Best is trial 2 with value: 10.320024564157642.
[I 2025-02-17 16:55:56,757] Trial 4 finished with value: 10.318854812976728 and parameters: {'alpha': 0.0359289899018746, 'fit_intercept': False, 'selection': 'random', 'warm_start': True}. Best is trial 4 with value: 10.318854812976728.
[I 2025-02-17 16:55:56,763] Trial 5 finished with value: 10.320186536268096 and parameters: {'alpha': 0.020793168162279933, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': True}. Best is trial 4 with value: 10.318854812976728.
  model = cd_fast.enet_coordinate_descent(
[I 2025-02-17 16:55:56,846] Trial 6 finished with value: 10.394836866477856 and parameters: {'alpha': 0.0026955581303648862, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': True}. Best

Fold: 5/5
Model name: lasso
MAE: 8.19921990815225
MSE: 106.30668170803398
RMSE: 10.310513164146291
PCC: 0.56248243419533
Spearman R: 0.5913827075713701
R2 Score: 0.31558063717764384



[I 2025-02-17 16:55:57,120] Trial 0 finished with value: 10.898987441679074 and parameters: {'alpha': 0.08955786209595713, 'solver': 'saga', 'fit_intercept': False}. Best is trial 0 with value: 10.898987441679074.
[I 2025-02-17 16:55:57,126] Trial 1 finished with value: 10.921079652154157 and parameters: {'alpha': 0.0020169751084135627, 'solver': 'sparse_cg', 'fit_intercept': True}. Best is trial 0 with value: 10.898987441679074.
[I 2025-02-17 16:55:57,338] Trial 2 finished with value: 10.9116704860266 and parameters: {'alpha': 0.0011699163699098773, 'solver': 'saga', 'fit_intercept': False}. Best is trial 0 with value: 10.898987441679074.
[I 2025-02-17 16:55:57,346] Trial 3 finished with value: 10.91667183990963 and parameters: {'alpha': 0.031088508685118015, 'solver': 'svd', 'fit_intercept': True}. Best is trial 0 with value: 10.898987441679074.
[I 2025-02-17 16:55:57,357] Trial 4 finished with value: 10.919908917061106 and parameters: {'alpha': 0.009962637698981211, 'solver': 'svd',

Fold: 5/5
Model name: ridge
MAE: 8.278247399369711
MSE: 118.78472469710167
RMSE: 10.898840520766495
PCC: 0.4993337365524501
Spearman R: 0.5898398954452717
R2 Score: 0.2352450073316953



[I 2025-02-17 16:55:58,082] Trial 0 finished with value: 9.157746145482967 and parameters: {'eta': 0.048482367237264234, 'gamma': 0.0013020107081006633, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.7348718143808628, 'colsample_bytree': 0.7095665599099041}. Best is trial 0 with value: 9.157746145482967.
[I 2025-02-17 16:55:58,163] Trial 1 finished with value: 9.061228320576959 and parameters: {'eta': 0.07888920117528672, 'gamma': 1.0519804177184448e-08, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.7885606426989934, 'colsample_bytree': 0.5850848717503602}. Best is trial 1 with value: 9.061228320576959.
[I 2025-02-17 16:55:58,235] Trial 2 finished with value: 9.024966170009382 and parameters: {'eta': 0.05642126823036662, 'gamma': 1.329555691712593e-06, 'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.6265907736778866, 'colsample_bytree': 0.9722441380097802}. Best is trial 2 with value: 9.024966170009382.
[I 2025-02-17 16:55:58,342] Trial 3 finished with value: 9.271

Fold: 5/5
Model name: xgb
MAE: 7.203648813248447
MSE: 82.11609239234927
RMSE: 9.061793000965608
PCC: 0.686973231620805
Spearman R: 0.6924062353149357
R2 Score: 0.47132350733146766



[I 2025-02-17 16:55:59,959] Trial 1 finished with value: 9.391547487700974 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 1 with value: 9.391547487700974.
[I 2025-02-17 16:56:00,088] Trial 2 finished with value: 10.03454340868289 and parameters: {'criterion': 'squared_error', 'max_features': 'log2', 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 10}. Best is trial 1 with value: 9.391547487700974.
[I 2025-02-17 16:56:01,801] Trial 3 finished with value: 9.463073523066825 and parameters: {'criterion': 'absolute_error', 'max_features': 'log2', 'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 1 with value: 9.391547487700974.
[I 2025-02-17 16:56:01,905] Trial 4 finished with value: 10.440233489385443 and parameters: {'criterion': 'squared_error', 'max_features': 'sqrt', 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 8}. Best is trial 1 

Fold: 5/5
Model name: random_forest
MAE: 7.521441199976957
MSE: 88.84498874667868
RMSE: 9.425761971675218
PCC: 0.6813809726793819
Spearman R: 0.6862793686638159
R2 Score: 0.42800179997184684



[I 2025-02-17 16:56:05,769] Trial 0 finished with value: 20.583265010412255 and parameters: {'loss': 'quantile', 'criterion': 'friedman_mse', 'learning_rate': 0.0014618077111956736, 'subsample': 0.4698869723499492, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'max_leaf_nodes': 938}. Best is trial 0 with value: 20.583265010412255.
[I 2025-02-17 16:56:06,000] Trial 1 finished with value: 18.646704654050783 and parameters: {'loss': 'quantile', 'criterion': 'friedman_mse', 'learning_rate': 0.009723759316180542, 'subsample': 0.1301188922303018, 'max_depth': 4, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_leaf_nodes': 598}. Best is trial 1 with value: 18.646704654050783.
[I 2025-02-17 16:56:08,115] Trial 2 finished with value: 11.57956226417625 and parameters: {'loss': 'huber', 'criterion': 'squared_error', 'learning_rate': 0.002230271035696182, 'subsample': 0.5866274138741037, 'max_depth': 9, 'min_samples_split': 9, 'm

Fold: 5/5
Model name: gradient_boosting
MAE: 7.962966439068109
MSE: 98.72098237334443
RMSE: 9.93584331465349
PCC: 0.6838565561787968
Spearman R: 0.6867568874050342
R2 Score: 0.36441857870486793



[I 2025-02-17 16:56:15,011] Trial 0 finished with value: 9.420028276539627 and parameters: {'n_estimators': 58, 'loss': 'linear', 'learning_rate': 0.012283315089453139}. Best is trial 0 with value: 9.420028276539627.
[I 2025-02-17 16:56:16,440] Trial 1 finished with value: 9.443127609797791 and parameters: {'n_estimators': 50, 'loss': 'exponential', 'learning_rate': 0.010195288401610207}. Best is trial 0 with value: 9.420028276539627.
[I 2025-02-17 16:56:17,952] Trial 2 finished with value: 9.449700619744172 and parameters: {'n_estimators': 53, 'loss': 'linear', 'learning_rate': 0.0010195010452670582}. Best is trial 0 with value: 9.420028276539627.
[I 2025-02-17 16:56:20,679] Trial 3 finished with value: 9.448307365652898 and parameters: {'n_estimators': 96, 'loss': 'square', 'learning_rate': 0.0010153905497053924}. Best is trial 0 with value: 9.420028276539627.
[I 2025-02-17 16:56:22,584] Trial 4 finished with value: 9.432877309092872 and parameters: {'n_estimators': 66, 'loss': 'line

Fold: 5/5
Model name: ada_boost
MAE: 7.421677456056014
MSE: 86.93640173483229
RMSE: 9.323969204948732
PCC: 0.6638800688807438
Spearman R: 0.6736801246552018
R2 Score: 0.44028958739546853



[I 2025-02-17 16:56:34,823] Trial 1 finished with value: 11.956192900182792 and parameters: {'learning_rate': 0.0013654465043500522, 'num_leaves': 428, 'subsample': 0.7972168702162381, 'colsample_bytree': 0.5825777040651329, 'min_data_in_leaf': 8}. Best is trial 0 with value: 9.45642315163072.
[I 2025-02-17 16:56:34,902] Trial 2 finished with value: 9.382548817017465 and parameters: {'learning_rate': 0.021053284637999468, 'num_leaves': 368, 'subsample': 0.5240622659050079, 'colsample_bytree': 0.5289705477054043, 'min_data_in_leaf': 57}. Best is trial 2 with value: 9.382548817017465.
[I 2025-02-17 16:56:34,999] Trial 3 finished with value: 9.173675899844817 and parameters: {'learning_rate': 0.01901189295022119, 'num_leaves': 656, 'subsample': 0.25590007499933287, 'colsample_bytree': 0.8932092609212973, 'min_data_in_leaf': 24}. Best is trial 3 with value: 9.173675899844817.
[I 2025-02-17 16:56:35,072] Trial 4 finished with value: 9.742505103377704 and parameters: {'learning_rate': 0.0315

Fold: 5/5
Model name: lgbm
MAE: 7.267925524292694
MSE: 83.00983274650297
RMSE: 9.110973205234608
PCC: 0.6833523611373424
Spearman R: 0.6880041466813073
R2 Score: 0.4655694644633206



Compute average scores and rank models by R2 score

In [13]:
for model_name, model_metrics in model_scores.items():
    for metric, scores in model_metrics.items():
        model_scores[model_name][metric] = sum(scores) / len(scores)
    model_scores[model_name] = dict(model_scores[model_name])

model_scores = dict(sorted(model_scores.items(), key=lambda x: x[1]["r2_score"], reverse=True))

In [14]:
# Print results
for i, (model_name, model_metrics) in enumerate(model_scores.items()):
    print(f"No.{i+1} Model: {model_name}")
    for metric, score in model_metrics.items():
        print(f"{metric}: {score}")
    print()

No.1 Model: xgb
mae: 7.349677258546061
mse: 85.38741932191428
rmse: 9.239467765585308
pcc: 0.687393255341694
spearman_r: 0.692680716672155
r2_score: 0.47118202506185314

No.2 Model: random_forest
mae: 7.6051312171549785
mse: 91.02483672045415
rmse: 9.540022040167804
pcc: 0.6794569811106295
spearman_r: 0.6859951689434254
r2_score: 0.43618281872945436

No.3 Model: gradient_boosting
mae: 7.610180654398675
mse: 91.10386152474902
rmse: 9.542560829292974
pcc: 0.680226883994606
spearman_r: 0.6840792164371939
r2_score: 0.43508771079521924

No.4 Model: ada_boost
mae: 7.617759456695853
mse: 91.8760080688327
rmse: 9.583812391235314
pcc: 0.6572543683110206
spearman_r: 0.6679874673790815
r2_score: 0.431049483309656

No.5 Model: lgbm
mae: 7.669275729698471
mse: 93.27440543533146
rmse: 9.638836116545674
pcc: 0.6782426428211059
spearman_r: 0.6839554886544879
r2_score: 0.4224802088053366

No.6 Model: lasso
mae: 8.5639554391572
mse: 114.8952845592571
rmse: 10.716651754256524
pcc: 0.5378322651239898
spea