In [1]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import optuna
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

from src.utils import get_kfold_data, convert_non_numeric_to_numeric, calculate_r2_score, calculate_metrics
from src.normalisation import Normaliser
from src.constants import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv(DATA_PATH)

In [3]:
data.head()

Unnamed: 0,outcome,carat,cut,color,clarity,depth,table,price,x,y,...,a6,a7,a8,a9,a10,b6,b7,b8,b9,b10
0,-26.701232,1.14,Ideal,G,VS1,62.3,56.0,7948,6.73,6.7,...,0.168836,-0.273758,1.107832,1.247795,0.482344,0.489511,-0.321138,0.573382,0.446871,-1.990581
1,6.548093,0.38,Premium,H,VS2,60.5,59.0,898,4.69,4.66,...,-0.256549,0.315373,-0.030326,-0.114335,-1.059588,-1.76136,-1.343951,-1.00255,-0.22503,-0.446653
2,6.612562,0.5,Very Good,E,SI1,60.7,58.0,1351,5.09,5.13,...,-1.193327,-0.657307,-0.591726,-0.446856,-0.765286,-0.816544,-1.397794,-0.47713,0.810509,1.725131
3,-5.073562,0.7,Premium,D,SI1,61.2,58.0,2512,5.74,5.7,...,-1.740788,-1.77886,-0.82507,0.444932,1.173109,0.453606,-0.26344,0.24621,-0.850503,-0.41295
4,-14.436557,0.83,Ideal,G,SI2,62.4,54.0,2751,6.01,6.08,...,-0.859322,1.409268,0.861992,1.109063,-1.436722,-1.461618,0.081787,0.258087,0.851146,2.204813


Inspecting columns

In [4]:
# Find columns
all_columns = data.columns.tolist()
print(all_columns)

numeric_columns = data.select_dtypes(include=["number"]).columns.tolist()
numeric_columns.remove("outcome") # Remove the target column
print(numeric_columns)

non_numeric_columns = data.select_dtypes(exclude=["number"]).columns.tolist()
print(non_numeric_columns)

['outcome', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['cut', 'color', 'clarity']


In [5]:
for non_numeric_column in non_numeric_columns:
    print(data[non_numeric_column].value_counts())

cut
Ideal        4040
Premium      2439
Very Good    2296
Good          925
Fair          300
Name: count, dtype: int64
color
G    2120
E    1873
F    1746
H    1506
D    1246
I     983
J     526
Name: count, dtype: int64
clarity
SI1     2408
VS2     2256
SI2     1743
VS1     1503
VVS2     951
VVS1     675
IF       318
I1       146
Name: count, dtype: int64


Converting non-numeric features to numerical features

In [6]:
data = convert_non_numeric_to_numeric(data=data)
print(data)

['G', 'E', 'F', 'H', 'D', 'I', 'J']
        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...      

Normalise data using each columns respective mean and std.

In [7]:
print(data)

        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...        b8        b9       b10  colour_G  

In [8]:
normaliser = Normaliser()
for column in numeric_columns:
    print(data[column])
    data[column] = normaliser.standardise(data[column])
    print("after", data[column])

0       1.14
1       0.38
2       0.50
3       0.70
4       0.83
        ... 
9995    0.33
9996    1.01
9997    0.52
9998    0.31
9999    0.37
Name: carat, Length: 10000, dtype: float64
after 0       0.723643
1      -0.886369
2      -0.632156
3      -0.208469
4       0.066928
          ...   
9995   -0.992290
9996    0.448246
9997   -0.589788
9998   -1.034659
9999   -0.907553
Name: carat, Length: 10000, dtype: float64
0       62.3
1       60.5
2       60.7
3       61.2
4       62.4
        ... 
9995    62.6
9996    69.5
9997    57.9
9998    62.0
9999    59.9
Name: depth, Length: 10000, dtype: float64
after 0       0.386072
1      -0.872995
2      -0.733098
3      -0.383358
4       0.456020
          ...   
9995    0.595916
9996    5.422336
9997   -2.691646
9998    0.176227
9999   -1.292683
Name: depth, Length: 10000, dtype: float64
0       56.0
1       59.0
2       58.0
3       58.0
4       54.0
        ... 
9995    57.0
9996    55.0
9997    61.0
9998    54.0
9999    59.0
Name: table, 

In [9]:
print(data)

        outcome     carat  cut  clarity     depth     table     price  \
0    -26.701232  0.723643    0        3  0.386072 -0.653020  1.024563   
1      6.548093 -0.886369    1        4 -0.872995  0.682072 -0.764609   
2      6.612562 -0.632156    2        5 -0.733098  0.237041 -0.649645   
3     -5.073562 -0.208469    1        5 -0.383358  0.237041 -0.355003   
4    -14.436557  0.066928    0        6  0.456020 -1.543082 -0.294349   
...         ...       ...  ...      ...       ...       ...       ...   
9995  10.718277 -0.992290    0        3  0.595916 -0.207990 -0.738215   
9996 -12.246698  0.448246    4        5  5.422336 -1.098051  0.239104   
9997  11.122516 -0.589788    2        6 -2.691646  1.572133 -0.669440   
9998 -24.730782 -1.034659    0        0  0.176227 -1.543082 -0.789226   
9999   8.735755 -0.907553    2        5 -1.292683  0.682072 -0.827801   

             x         y         z  ...        b8        b9       b10  \
0     0.893417  0.780367  0.923092  ...  0.593856 

In [10]:
kfold_data = get_kfold_data(data=data, k=NUM_FOLDS, reproducibility_seed=REPRODUCIBILITY_SEED)

Fold: 0/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 1/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 2/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 3/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 4/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%



Define models and hyperparameter tuning objectives for each model

In [11]:
models = {
        "linear_regression": LinearRegression,
        "lasso": Lasso,
        "ridge": Ridge,
        # "xgb": xgb.XGBRegressor,
        # "random_forest": RandomForestRegressor,
        # "gradient_boosting": GradientBoostingRegressor,
        # "ada_boost": AdaBoostRegressor,
        # "lgbm": lgb.LGBMRegressor
        }

def objective(model_type, trial, x_train, y_train, x_val, y_val):
    if model_type == LinearRegression:
        parameters = {
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        }
    elif model_type == Lasso:
        parameters = {
            "alpha": trial.suggest_float("alpha", 1e-3, 0.1, log=True),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            "selection": trial.suggest_categorical("selection", ["cyclic", "random"]),
            "warm_start": trial.suggest_categorical("warm_start", [True, False]),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == Ridge:
        parameters = {
            "alpha": trial.suggest_float("alpha", 1e-3, 0.1, log=True),
            "solver": trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]),
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            "positive": False,
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == xgb.XGBRegressor:
        parameters = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "n_estimators": 100,
            "eta": trial.suggest_float("eta", 1e-2, 0.2, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 10, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 6),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "seed": REPRODUCIBILITY_SEED
        }
    elif model_type == RandomForestRegressor:
        parameters = {
            "n_estimators": 100,
            "criterion": trial.suggest_categorical("criterion", ["absolute_error", "squared_error"]),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "bootstrap": True,
            "oob_score": False,
            "n_jobs": -1,
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == GradientBoostingRegressor:
        parameters = {
            "n_estimators": 100,
            "loss": trial.suggest_categorical("loss", ["absolute_error", "squared_error", "huber", "quantile"]),
            "criterion": trial.suggest_categorical("criterion", ["friedman_mse", "squared_error"]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "subsample": trial.suggest_float("subsample", 0.05, 1.0),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
            "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 2**10),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == AdaBoostRegressor:
        parameters = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 100),
            "loss": trial.suggest_categorical("loss", ["linear", "square", "exponential"]),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
            "random_state": REPRODUCIBILITY_SEED
        }
    elif model_type == lgb.LGBMRegressor:
        parameters = {
                    "objective": "regression",
                    "metric": "rmse",
                    "n_estimators": 100,
                    "verbosity": -1,
                    "bagging_freq": 1,
                    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
                    "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
                    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
                    "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
                    "seed": REPRODUCIBILITY_SEED
        }

    
    model = model_type(**parameters) # Create the model
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)
    metrics = calculate_metrics(targets=y_val, preds=predictions)
    rmse = metrics["rmse"]
    return rmse

In [12]:
# Train + Validate models
metrics = ["mae", "mse", "rmse", "pcc", "spearman_r", "r2_score"]
model_scores = {model_name: defaultdict(list) for model_name in models.keys()}

for fold in range(NUM_FOLDS):
    fold_data = kfold_data[fold]
     
    # Extract data
    train_data = fold_data["train"]
    val_data = fold_data["val"]
    test_data = kfold_data[fold]["test"]

    train_y = train_data["outcome"]
    val_y = val_data["outcome"]
    test_y = test_data["outcome"]

    train_x = train_data.drop(columns=["outcome"])
    val_x = val_data.drop(columns=["outcome"])
    test_x = test_data.drop(columns=["outcome"])

    # print(f"Fold {fold+1}/{NUM_FOLDS}")
    # print(f"Train data shape: {train_x.shape} | Train target shape: {train_y.shape}")
    # print(f"Val data shape: {val_x.shape} | Val target shape: {val_y.shape}")
    # print(f"Test data shape: {test_x.shape} | Test target shape: {test_y.shape}")

    # Train model
    for model_name, model in models.items():
        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: objective(trial=trial, 
                                               model_type=model, 
                                               x_train=train_x, 
                                               y_train=train_y, 
                                               x_val=val_x, 
                                               y_val=val_y
                                               ), n_trials=N_TRIALS)
        
        # Train model with best hyperparameters
        best_fold_params = study.best_params
        model = model(**best_fold_params)
        model.fit(train_x, train_y)
        preds = model.predict(val_x)

        metrics = calculate_metrics(targets=val_y, preds=preds)
        mae = metrics["mae"]
        mse = metrics["mse"]
        rmse = metrics["rmse"]
        pcc = metrics["pcc"]
        spearman_r = metrics["spearman_r"]
        r2_score = metrics["r2_score"]

        for metric in metrics:
            model_scores[model_name][metric].append(metrics[metric])

        print(f"Fold: {fold+1}/{NUM_FOLDS}")
        print(f"Model name: {model_name}")
        print(f"MAE: {mae}")
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"PCC: {pcc}")
        print(f"Spearman R: {spearman_r}")
        print(f"R2 Score: {r2_score}")
        print()


[I 2025-02-16 16:06:58,398] A new study created in memory with name: no-name-4045507f-949b-4578-8370-22aad4d780cc
[I 2025-02-16 16:06:58,406] Trial 0 finished with value: 10.913995409241268 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.913995409241268.
[I 2025-02-16 16:06:58,412] Trial 1 finished with value: 10.913995409241267 and parameters: {'fit_intercept': False}. Best is trial 1 with value: 10.913995409241267.
[I 2025-02-16 16:06:58,419] Trial 2 finished with value: 10.913995409241268 and parameters: {'fit_intercept': True}. Best is trial 1 with value: 10.913995409241267.
[I 2025-02-16 16:06:58,425] Trial 3 finished with value: 10.913995409241268 and parameters: {'fit_intercept': True}. Best is trial 1 with value: 10.913995409241267.
[I 2025-02-16 16:06:58,430] Trial 4 finished with value: 10.913995409241268 and parameters: {'fit_intercept': True}. Best is trial 1 with value: 10.913995409241267.
[I 2025-02-16 16:06:58,437] Trial 5 finished with value: 10.

Fold: 1/5
Model name: linear_regression
MAE: 8.77537026769259
MSE: 119.11529579293945
RMSE: 10.913995409241267
PCC: 0.5463216691195154
Spearman R: 0.5697531327160675
R2 Score: 0.298001097465473



[I 2025-02-16 16:06:59,274] Trial 10 finished with value: 10.91841944294354 and parameters: {'alpha': 0.01442867474308622, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 7 with value: 10.906660232011188.
[I 2025-02-16 16:06:59,294] Trial 11 finished with value: 10.908602314827775 and parameters: {'alpha': 0.007695226548480104, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': True}. Best is trial 7 with value: 10.906660232011188.
[I 2025-02-16 16:06:59,316] Trial 12 finished with value: 10.907799564635404 and parameters: {'alpha': 0.009352466597855881, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 7 with value: 10.906660232011188.
[I 2025-02-16 16:06:59,331] Trial 13 finished with value: 10.906571969372381 and parameters: {'alpha': 0.0174610538230186, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 13 with value: 10.906571969372381.
[I 2025-02-16 16:06:59,342] Trial 14 

Fold: 1/5
Model name: lasso
MAE: 8.775224549857615
MSE: 118.95281603148356
RMSE: 10.906549226564907
PCC: 0.5475468129283234
Spearman R: 0.5709848705019025
R2 Score: 0.29895866226407264



[I 2025-02-16 16:07:01,444] Trial 4 finished with value: 10.913987385345253 and parameters: {'alpha': 0.029838130419921064, 'solver': 'saga', 'fit_intercept': True}. Best is trial 1 with value: 10.913930184878181.
[I 2025-02-16 16:07:01,603] Trial 5 finished with value: 10.913894481783467 and parameters: {'alpha': 0.09960982980318053, 'solver': 'sag', 'fit_intercept': False}. Best is trial 5 with value: 10.913894481783467.
[I 2025-02-16 16:07:01,607] Trial 6 finished with value: 10.91398500667589 and parameters: {'alpha': 0.020465400868657536, 'solver': 'cholesky', 'fit_intercept': False}. Best is trial 5 with value: 10.913894481783467.
[I 2025-02-16 16:07:01,612] Trial 7 finished with value: 10.913923695066508 and parameters: {'alpha': 0.01466037726510766, 'solver': 'sparse_cg', 'fit_intercept': False}. Best is trial 5 with value: 10.913894481783467.
[I 2025-02-16 16:07:01,617] Trial 8 finished with value: 10.913936297441776 and parameters: {'alpha': 0.0014033692304273931, 'solver': '

Fold: 1/5
Model name: ridge
MAE: 8.775414908333685
MSE: 119.11147385650344
RMSE: 10.913820314468415
PCC: 0.5463492257001896
Spearman R: 0.569791505777932
R2 Score: 0.2980236218202653



[I 2025-02-16 16:07:23,151] Trial 30 finished with value: 10.75478290359764 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.75478290359764.
[I 2025-02-16 16:07:23,158] Trial 31 finished with value: 10.75478290359764 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.75478290359764.
[I 2025-02-16 16:07:23,164] Trial 32 finished with value: 10.75478290359764 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.75478290359764.
[I 2025-02-16 16:07:23,171] Trial 33 finished with value: 10.75478290359764 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.75478290359764.
[I 2025-02-16 16:07:23,177] Trial 34 finished with value: 10.75478290359764 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.75478290359764.
[I 2025-02-16 16:07:23,184] Trial 35 finished with value: 10.75478290359764 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.75478290359764.
[I 2025-02-16 1

Fold: 2/5
Model name: linear_regression
MAE: 8.572113483311382
MSE: 115.66535530351608
RMSE: 10.75478290359764
PCC: 0.5251877646872845
Spearman R: 0.5521546102166447
R2 Score: 0.2750306342091945



[I 2025-02-16 16:07:23,774] Trial 2 finished with value: 10.755454612720396 and parameters: {'alpha': 0.0018748168061514634, 'fit_intercept': False, 'selection': 'random', 'warm_start': False}. Best is trial 1 with value: 10.755295246379834.
[I 2025-02-16 16:07:23,813] Trial 3 finished with value: 10.753881768240172 and parameters: {'alpha': 0.001328200073463699, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 3 with value: 10.753881768240172.
[I 2025-02-16 16:07:23,844] Trial 4 finished with value: 10.761393051542028 and parameters: {'alpha': 0.008701494931099058, 'fit_intercept': False, 'selection': 'random', 'warm_start': True}. Best is trial 3 with value: 10.753881768240172.
[I 2025-02-16 16:07:23,871] Trial 5 finished with value: 10.750862216580147 and parameters: {'alpha': 0.007632642929728759, 'fit_intercept': True, 'selection': 'random', 'warm_start': False}. Best is trial 5 with value: 10.750862216580147.
[I 2025-02-16 16:07:23,896] Trial 6 fi

Fold: 2/5
Model name: lasso
MAE: 8.54440919326049
MSE: 115.43550098338339
RMSE: 10.744091445226227
PCC: 0.526153575894466
Spearman R: 0.5537575542021697
R2 Score: 0.27647131919437096



[I 2025-02-16 16:07:25,158] Trial 13 finished with value: 10.754764242885255 and parameters: {'alpha': 0.02549470473077374, 'solver': 'saga', 'fit_intercept': True}. Best is trial 10 with value: 10.754518253135366.
[I 2025-02-16 16:07:25,165] Trial 14 finished with value: 10.754518088776184 and parameters: {'alpha': 0.030560783047384143, 'solver': 'lsqr', 'fit_intercept': True}. Best is trial 14 with value: 10.754518088776184.
[I 2025-02-16 16:07:25,172] Trial 15 finished with value: 10.754520173212397 and parameters: {'alpha': 0.0034973481525401528, 'solver': 'lsqr', 'fit_intercept': True}. Best is trial 14 with value: 10.754518088776184.
[I 2025-02-16 16:07:25,179] Trial 16 finished with value: 10.754516865648217 and parameters: {'alpha': 0.046475920011233886, 'solver': 'lsqr', 'fit_intercept': True}. Best is trial 16 with value: 10.754516865648217.
[I 2025-02-16 16:07:25,262] Trial 17 finished with value: 10.754763300285774 and parameters: {'alpha': 0.0933908760491362, 'solver': 'sa

Fold: 2/5
Model name: ridge
MAE: 8.571466056026908
MSE: 115.64258671316516
RMSE: 10.753724318261332
PCC: 0.5253159183054947
Spearman R: 0.552171414910709
R2 Score: 0.2751733435836943



[I 2025-02-16 16:07:27,132] Trial 28 finished with value: 10.76861868449502 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.76861868449502.
[I 2025-02-16 16:07:27,138] Trial 29 finished with value: 10.76861868449502 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.76861868449502.
[I 2025-02-16 16:07:27,145] Trial 30 finished with value: 10.76861868449502 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.76861868449502.
[I 2025-02-16 16:07:27,151] Trial 31 finished with value: 10.76861868449502 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.76861868449502.
[I 2025-02-16 16:07:27,156] Trial 32 finished with value: 10.76861868449502 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.76861868449502.
[I 2025-02-16 16:07:27,163] Trial 33 finished with value: 10.76861868449502 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.76861868449502.
[I 2025-02-16 1

Fold: 3/5
Model name: linear_regression
MAE: 8.63664187923468
MSE: 115.96314837205529
RMSE: 10.76861868449502
PCC: 0.538746619155549
Spearman R: 0.5727311289965348
R2 Score: 0.28899901278645357



[I 2025-02-16 16:07:27,773] Trial 12 finished with value: 10.760706520185552 and parameters: {'alpha': 0.0381061614761106, 'fit_intercept': True, 'selection': 'random', 'warm_start': True}. Best is trial 12 with value: 10.760706520185552.
[I 2025-02-16 16:07:27,785] Trial 13 finished with value: 10.75950472217923 and parameters: {'alpha': 0.048312179031491896, 'fit_intercept': True, 'selection': 'random', 'warm_start': True}. Best is trial 13 with value: 10.75950472217923.
[I 2025-02-16 16:07:27,800] Trial 14 finished with value: 10.832122068504786 and parameters: {'alpha': 0.08947926611015883, 'fit_intercept': False, 'selection': 'random', 'warm_start': True}. Best is trial 13 with value: 10.75950472217923.
[I 2025-02-16 16:07:27,812] Trial 15 finished with value: 10.761712762643318 and parameters: {'alpha': 0.03067092679135238, 'fit_intercept': True, 'selection': 'random', 'warm_start': True}. Best is trial 13 with value: 10.75950472217923.
[I 2025-02-16 16:07:27,825] Trial 16 finish

Fold: 3/5
Model name: lasso
MAE: 8.626090581014205
MSE: 115.63956923231198
RMSE: 10.753584018005903
PCC: 0.5407774438311033
Spearman R: 0.576865025142588
R2 Score: 0.2909829627828875



[I 2025-02-16 16:07:29,209] Trial 3 finished with value: 10.76846124390162 and parameters: {'alpha': 0.008813819494341174, 'solver': 'saga', 'fit_intercept': False}. Best is trial 3 with value: 10.76846124390162.
[I 2025-02-16 16:07:29,215] Trial 4 finished with value: 10.768598389877772 and parameters: {'alpha': 0.05089495098149799, 'solver': 'sparse_cg', 'fit_intercept': True}. Best is trial 3 with value: 10.76846124390162.
[I 2025-02-16 16:07:29,224] Trial 5 finished with value: 10.768582439538614 and parameters: {'alpha': 0.08678985373342532, 'solver': 'svd', 'fit_intercept': False}. Best is trial 3 with value: 10.76846124390162.
[I 2025-02-16 16:07:29,229] Trial 6 finished with value: 10.768580577494062 and parameters: {'alpha': 0.09126814656349752, 'solver': 'auto', 'fit_intercept': False}. Best is trial 3 with value: 10.76846124390162.
[I 2025-02-16 16:07:29,367] Trial 7 finished with value: 10.7686081308121 and parameters: {'alpha': 0.0019561141774920446, 'solver': 'sag', 'fit_

Fold: 3/5
Model name: ridge
MAE: 8.636568929120049
MSE: 115.95898571178226
RMSE: 10.768425405405484
PCC: 0.5387820736558202
Spearman R: 0.5727363877876516
R2 Score: 0.2890245351666675



[I 2025-02-16 16:07:49,995] Trial 28 finished with value: 10.480654637722811 and parameters: {'fit_intercept': False}. Best is trial 0 with value: 10.480654637722811.
[I 2025-02-16 16:07:50,002] Trial 29 finished with value: 10.480654637722811 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.480654637722811.
[I 2025-02-16 16:07:50,010] Trial 30 finished with value: 10.480654637722811 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.480654637722811.
[I 2025-02-16 16:07:50,017] Trial 31 finished with value: 10.480654637722811 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.480654637722811.
[I 2025-02-16 16:07:50,024] Trial 32 finished with value: 10.480654637722811 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.480654637722811.
[I 2025-02-16 16:07:50,031] Trial 33 finished with value: 10.480654637722811 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 10.480654637722811.
[I 

Fold: 4/5
Model name: linear_regression
MAE: 8.275292415766604
MSE: 109.84412163522067
RMSE: 10.480654637722811
PCC: 0.5648315854058772
Spearman R: 0.5899297343475526
R2 Score: 0.3169390462193944



[I 2025-02-16 16:07:50,691] Trial 6 finished with value: 10.463749509478458 and parameters: {'alpha': 0.06700805450511063, 'fit_intercept': True, 'selection': 'random', 'warm_start': False}. Best is trial 6 with value: 10.463749509478458.
[I 2025-02-16 16:07:50,757] Trial 7 finished with value: 10.476736366006165 and parameters: {'alpha': 0.004063024533907817, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': False}. Best is trial 6 with value: 10.463749509478458.
[I 2025-02-16 16:07:50,776] Trial 8 finished with value: 10.4779772782322 and parameters: {'alpha': 0.00669092282337851, 'fit_intercept': False, 'selection': 'cyclic', 'warm_start': True}. Best is trial 6 with value: 10.463749509478458.
  model = cd_fast.enet_coordinate_descent(
[I 2025-02-16 16:07:50,856] Trial 9 finished with value: 10.478271506324594 and parameters: {'alpha': 0.0012243998798623082, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 6 with value: 10.463749509478458.

Fold: 4/5
Model name: lasso
MAE: 8.239858036054665
MSE: 109.48162076851052
RMSE: 10.463346537724464
PCC: 0.5684747469994225
Spearman R: 0.596416472232997
R2 Score: 0.3191932422935695



[I 2025-02-16 16:07:52,880] Trial 1 finished with value: 10.480139558257111 and parameters: {'alpha': 0.01891051035024349, 'solver': 'saga', 'fit_intercept': True}. Best is trial 1 with value: 10.480139558257111.
[I 2025-02-16 16:07:53,291] Trial 2 finished with value: 10.480362388747132 and parameters: {'alpha': 0.04884672689844555, 'solver': 'sag', 'fit_intercept': True}. Best is trial 1 with value: 10.480139558257111.
[I 2025-02-16 16:07:53,296] Trial 3 finished with value: 10.480604636800127 and parameters: {'alpha': 0.05839854916802006, 'solver': 'cholesky', 'fit_intercept': True}. Best is trial 1 with value: 10.480139558257111.
[I 2025-02-16 16:07:53,718] Trial 4 finished with value: 10.480333523670918 and parameters: {'alpha': 0.05676299270896878, 'solver': 'sag', 'fit_intercept': False}. Best is trial 1 with value: 10.480139558257111.
[I 2025-02-16 16:07:53,722] Trial 5 finished with value: 10.480650759078294 and parameters: {'alpha': 0.0043070554376385345, 'solver': 'cholesky'

Fold: 4/5
Model name: ridge
MAE: 8.275037060690744
MSE: 109.83089109495282
RMSE: 10.48002343007652
PCC: 0.5649157009743753
Spearman R: 0.5900857900725742
R2 Score: 0.3170213197659425



[I 2025-02-16 16:08:41,979] Trial 28 finished with value: 10.905947367181518 and parameters: {'fit_intercept': False}. Best is trial 3 with value: 10.905947367181518.
[I 2025-02-16 16:08:41,985] Trial 29 finished with value: 10.90594736718152 and parameters: {'fit_intercept': True}. Best is trial 3 with value: 10.905947367181518.
[I 2025-02-16 16:08:41,992] Trial 30 finished with value: 10.905947367181518 and parameters: {'fit_intercept': False}. Best is trial 3 with value: 10.905947367181518.
[I 2025-02-16 16:08:41,998] Trial 31 finished with value: 10.905947367181518 and parameters: {'fit_intercept': False}. Best is trial 3 with value: 10.905947367181518.
[I 2025-02-16 16:08:42,005] Trial 32 finished with value: 10.905947367181518 and parameters: {'fit_intercept': False}. Best is trial 3 with value: 10.905947367181518.
[I 2025-02-16 16:08:42,012] Trial 33 finished with value: 10.905947367181518 and parameters: {'fit_intercept': False}. Best is trial 3 with value: 10.905947367181518.


Fold: 5/5
Model name: linear_regression
MAE: 8.59605856688314
MSE: 118.9396879757335
RMSE: 10.905947367181518
PCC: 0.5235762981646424
Spearman R: 0.5555014425786886
R2 Score: 0.27210682468771663



[I 2025-02-16 16:08:42,653] Trial 2 finished with value: 10.792845191708164 and parameters: {'alpha': 0.011843477920111244, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 2 with value: 10.792845191708164.
  model = cd_fast.enet_coordinate_descent(
[I 2025-02-16 16:08:42,742] Trial 3 finished with value: 10.815996377989574 and parameters: {'alpha': 0.0018128310518095714, 'fit_intercept': True, 'selection': 'random', 'warm_start': False}. Best is trial 2 with value: 10.792845191708164.
[I 2025-02-16 16:08:42,747] Trial 4 finished with value: 10.774871742891381 and parameters: {'alpha': 0.07706988234529652, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': True}. Best is trial 4 with value: 10.774871742891381.
[I 2025-02-16 16:08:42,753] Trial 5 finished with value: 10.778844162861256 and parameters: {'alpha': 0.052406494277336735, 'fit_intercept': True, 'selection': 'cyclic', 'warm_start': False}. Best is trial 4 with value: 10.774871742891381

Fold: 5/5
Model name: lasso
MAE: 8.549139279895455
MSE: 116.04304225523414
RMSE: 10.772327615480052
PCC: 0.5384247273449996
Spearman R: 0.56237715815514
R2 Score: 0.2898338650653506



[I 2025-02-16 16:08:45,282] Trial 1 finished with value: 10.88932126717128 and parameters: {'alpha': 0.008939830386912652, 'solver': 'sag', 'fit_intercept': True}. Best is trial 1 with value: 10.88932126717128.
[I 2025-02-16 16:08:46,183] Trial 2 finished with value: 10.888930345273428 and parameters: {'alpha': 0.022859965762662703, 'solver': 'sag', 'fit_intercept': True}. Best is trial 2 with value: 10.888930345273428.
[I 2025-02-16 16:08:46,189] Trial 3 finished with value: 10.905693418125779 and parameters: {'alpha': 0.006037440661156117, 'solver': 'sparse_cg', 'fit_intercept': True}. Best is trial 2 with value: 10.888930345273428.
[I 2025-02-16 16:08:47,168] Trial 4 finished with value: 10.85430356277203 and parameters: {'alpha': 0.002335194731251321, 'solver': 'saga', 'fit_intercept': True}. Best is trial 4 with value: 10.85430356277203.
[I 2025-02-16 16:08:47,177] Trial 5 finished with value: 10.902506399337721 and parameters: {'alpha': 0.08209969251678136, 'solver': 'svd', 'fit_

Fold: 5/5
Model name: ridge
MAE: 8.59029287267013
MSE: 117.75969010950713
RMSE: 10.85171369459714
PCC: 0.5296325816128555
Spearman R: 0.5547682030344544
R2 Score: 0.27932823587793654





Compute average scores and rank models by R2 score

In [13]:
for model_name, model_metrics in model_scores.items():
    for metric, scores in model_metrics.items():
        model_scores[model_name][metric] = sum(scores) / len(scores)
    model_scores[model_name] = dict(model_scores[model_name])

model_scores = dict(sorted(model_scores.items(), key=lambda x: x[1]["r2_score"], reverse=True))

In [14]:
# Print results
for i, (model_name, model_metrics) in enumerate(model_scores.items()):
    print(f"No.{i+1} Model: {model_name}")
    for metric, score in model_metrics.items():
        print(f"{metric}: {score}")
    print()

No.1 Model: lasso
mae: 8.546944328016485
mse: 115.11050985418474
rmse: 10.72797976860031
pcc: 0.5442754613996629
spearman_r: 0.5720802160469594
r2_score: 0.2950880103200502

No.2 Model: ridge
mae: 8.569755965368305
mse: 115.66072549718214
rmse: 10.753541432561779
pcc: 0.5409991000497472
spearman_r: 0.5679106603166642
r2_score: 0.29171421124290126

No.3 Model: linear_regression
mae: 8.57109532257768
mse: 115.90552181589301
rmse: 10.764799800447651
pcc: 0.5397327873065738
spearman_r: 0.5680140097710977
r2_score: 0.2902153230736464

