In [1]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import optuna
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

from src.utils import get_kfold_data, convert_non_numeric_to_numeric, calculate_r2_score, calculate_metrics
from src.normalisation import Normaliser
from src.constants import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv(DATA_PATH)

In [3]:
data.head()

Unnamed: 0,outcome,carat,cut,color,clarity,depth,table,price,x,y,...,a6,a7,a8,a9,a10,b6,b7,b8,b9,b10
0,-26.701232,1.14,Ideal,G,VS1,62.3,56.0,7948,6.73,6.7,...,0.168836,-0.273758,1.107832,1.247795,0.482344,0.489511,-0.321138,0.573382,0.446871,-1.990581
1,6.548093,0.38,Premium,H,VS2,60.5,59.0,898,4.69,4.66,...,-0.256549,0.315373,-0.030326,-0.114335,-1.059588,-1.76136,-1.343951,-1.00255,-0.22503,-0.446653
2,6.612562,0.5,Very Good,E,SI1,60.7,58.0,1351,5.09,5.13,...,-1.193327,-0.657307,-0.591726,-0.446856,-0.765286,-0.816544,-1.397794,-0.47713,0.810509,1.725131
3,-5.073562,0.7,Premium,D,SI1,61.2,58.0,2512,5.74,5.7,...,-1.740788,-1.77886,-0.82507,0.444932,1.173109,0.453606,-0.26344,0.24621,-0.850503,-0.41295
4,-14.436557,0.83,Ideal,G,SI2,62.4,54.0,2751,6.01,6.08,...,-0.859322,1.409268,0.861992,1.109063,-1.436722,-1.461618,0.081787,0.258087,0.851146,2.204813


Inspecting columns

In [4]:
# Find columns
all_columns = data.columns.tolist()
print(all_columns)

numeric_columns = data.select_dtypes(include=["number"]).columns.tolist()
numeric_columns.remove("outcome") # Remove the target column
print(numeric_columns)

non_numeric_columns = data.select_dtypes(exclude=["number"]).columns.tolist()
print(non_numeric_columns)

['outcome', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['carat', 'depth', 'table', 'price', 'x', 'y', 'z', 'a1', 'a2', 'a3', 'a4', 'a5', 'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7', 'b8', 'b9', 'b10']
['cut', 'color', 'clarity']


In [5]:
for non_numeric_column in non_numeric_columns:
    print(data[non_numeric_column].value_counts())

cut
Ideal        4040
Premium      2439
Very Good    2296
Good          925
Fair          300
Name: count, dtype: int64
color
G    2120
E    1873
F    1746
H    1506
D    1246
I     983
J     526
Name: count, dtype: int64
clarity
SI1     2408
VS2     2256
SI2     1743
VS1     1503
VVS2     951
VVS1     675
IF       318
I1       146
Name: count, dtype: int64


Converting non-numeric features to numerical features

In [6]:
data = convert_non_numeric_to_numeric(data=data)
print(data)

['G', 'E', 'F', 'H', 'D', 'I', 'J']
        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...      

Normalise data using each columns respective mean and std.

In [7]:
print(data)

        outcome  carat  cut  clarity  depth  table  price     x     y     z  \
0    -26.701232   1.14    0        3   62.3   56.0   7948  6.73  6.70  4.18   
1      6.548093   0.38    1        4   60.5   59.0    898  4.69  4.66  2.83   
2      6.612562   0.50    2        5   60.7   58.0   1351  5.09  5.13  3.10   
3     -5.073562   0.70    1        5   61.2   58.0   2512  5.74  5.70  3.50   
4    -14.436557   0.83    0        6   62.4   54.0   2751  6.01  6.08  3.77   
...         ...    ...  ...      ...    ...    ...    ...   ...   ...   ...   
9995  10.718277   0.33    0        3   62.6   57.0   1002  4.42  4.40  2.76   
9996 -12.246698   1.01    4        5   69.5   55.0   4853  6.00  5.94  4.15   
9997  11.122516   0.52    2        6   57.9   61.0   1273  5.28  5.33  3.07   
9998 -24.730782   0.31    0        0   62.0   54.0    801  4.35  4.39  2.71   
9999   8.735755   0.37    2        5   59.9   59.0    649  4.68  4.70  2.81   

      ...        b8        b9       b10  colour_G  

In [8]:
normaliser = Normaliser()
for column in numeric_columns:
    print(data[column])
    data[column] = normaliser.standardise(data[column])
    print("after", data[column])
    break

0       1.14
1       0.38
2       0.50
3       0.70
4       0.83
        ... 
9995    0.33
9996    1.01
9997    0.52
9998    0.31
9999    0.37
Name: carat, Length: 10000, dtype: float64
after 0       0.723643
1      -0.886369
2      -0.632156
3      -0.208469
4       0.066928
          ...   
9995   -0.992290
9996    0.448246
9997   -0.589788
9998   -1.034659
9999   -0.907553
Name: carat, Length: 10000, dtype: float64


In [9]:
print(data)

        outcome     carat  cut  clarity  depth  table  price     x     y  \
0    -26.701232  0.723643    0        3   62.3   56.0   7948  6.73  6.70   
1      6.548093 -0.886369    1        4   60.5   59.0    898  4.69  4.66   
2      6.612562 -0.632156    2        5   60.7   58.0   1351  5.09  5.13   
3     -5.073562 -0.208469    1        5   61.2   58.0   2512  5.74  5.70   
4    -14.436557  0.066928    0        6   62.4   54.0   2751  6.01  6.08   
...         ...       ...  ...      ...    ...    ...    ...   ...   ...   
9995  10.718277 -0.992290    0        3   62.6   57.0   1002  4.42  4.40   
9996 -12.246698  0.448246    4        5   69.5   55.0   4853  6.00  5.94   
9997  11.122516 -0.589788    2        6   57.9   61.0   1273  5.28  5.33   
9998 -24.730782 -1.034659    0        0   62.0   54.0    801  4.35  4.39   
9999   8.735755 -0.907553    2        5   59.9   59.0    649  4.68  4.70   

         z  ...        b8        b9       b10  colour_G  colour_E  colour_F  \
0     4.

In [10]:
kfold_data = get_kfold_data(data=data, k=NUM_FOLDS, reproducibility_seed=REPRODUCIBILITY_SEED)

Fold: 0/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 1/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 2/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 3/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%

Fold: 4/5
Train shape: (6400, 37) | 64.00%
Validation shape: (1600, 37) | 16.00%
Test shape: (2000, 37) | 20.00%



In [11]:
def objective(model_type, trial, x_train, y_train, x_val, y_val):
    if model_type == lgb.LGBMRegressor:
        parameters = {
                    "objective": "regression",
                    "metric": "rmse",
                    "n_estimators": 100,
                    "verbosity": -1,
                    "bagging_freq": 1,
                    "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
                    "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
                    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
                    "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
                    "seed": REPRODUCIBILITY_SEED
        }
    elif model_type == xgb.XGBRegressor:
        parameters = {
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "n_estimators": 100,
            "eta": trial.suggest_float("eta", 1e-2, 0.2, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 10, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 6),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "seed": REPRODUCIBILITY_SEED
        }

    model = model_type(**parameters) # Create the model
    model.fit(x_train, y_train)
    predictions = model.predict(x_val)
    metrics = calculate_metrics(targets=y_val, preds=predictions)
    rmse = metrics["rmse"]
    return rmse


models = {
        # "linear_regression": LinearRegression(),
        "xgb": xgb.XGBRegressor,
        # "random_forest": RandomForestRegressor(**HYPERPARAMETERS_2),
        # "gradient_boosting": GradientBoostingRegressor(**HYPERPARAMETERS_2),
        # "ada_boost": AdaBoostRegressor(),
        # "ridge": Ridge(),
        # "lasso": Lasso(),
        "lgbm": lgb.LGBMRegressor
        }

In [12]:
# Train + Validate models
metrics = ["mae", "mse", "rmse", "pcc", "spearman_r", "r2_score"]
model_scores = {model_name: defaultdict(list) for model_name in models.keys()}

for fold in range(NUM_FOLDS):
    fold_data = kfold_data[fold]
     
    # Extract data
    train_data = fold_data["train"]
    val_data = fold_data["val"]
    test_data = kfold_data[fold]["test"]

    train_y = train_data["outcome"]
    val_y = val_data["outcome"]
    test_y = test_data["outcome"]

    train_x = train_data.drop(columns=["outcome"])
    val_x = val_data.drop(columns=["outcome"])
    test_x = test_data.drop(columns=["outcome"])

    # print(f"Fold {fold+1}/{NUM_FOLDS}")
    # print(f"Train data shape: {train_x.shape} | Train target shape: {train_y.shape}")
    # print(f"Val data shape: {val_x.shape} | Val target shape: {val_y.shape}")
    # print(f"Test data shape: {test_x.shape} | Test target shape: {test_y.shape}")

    # Train model
    for model_name, model in models.items():
        study = optuna.create_study(direction="minimize")
        study.optimize(lambda trial: objective(trial=trial, 
                                               model_type=model, 
                                               x_train=train_x, 
                                               y_train=train_y, 
                                               x_val=val_x, 
                                               y_val=val_y
                                               ), n_trials=30)
        
        # Train model with best hyperparameters
        best_fold_params = study.best_params
        model = model(**best_fold_params)
        model.fit(train_x, train_y)
        preds = model.predict(val_x)

        metrics = calculate_metrics(targets=val_y, preds=preds)
        mae = metrics["mae"]
        mse = metrics["mse"]
        rmse = metrics["rmse"]
        pcc = metrics["pcc"]
        spearman_r = metrics["spearman_r"]
        r2_score = metrics["r2_score"]

        for metric in metrics:
            model_scores[model_name][metric].append(metrics[metric])

        print(f"Fold: {fold+1}/{NUM_FOLDS}")
        print(f"Model name: {model_name}")
        print(f"MAE: {mae}")
        print(f"MSE: {mse}")
        print(f"RMSE: {rmse}")
        print(f"PCC: {pcc}")
        print(f"Spearman R: {spearman_r}")
        print(f"R2 Score: {r2_score}")
        print()

# Compute average scores
for model_name, model_metrics in model_scores.items():
    for metric, scores in model_metrics.items():
        model_scores[model_name][metric] = sum(scores) / len(scores)
    model_scores[model_name] = dict(model_scores[model_name])
print(model_scores)

[I 2025-02-16 14:41:40,626] A new study created in memory with name: no-name-134ecf2c-1fcc-4d18-b55c-2a38a9fe7626
[I 2025-02-16 14:41:40,998] Trial 0 finished with value: 9.732452353316903 and parameters: {'eta': 0.08167106443849242, 'gamma': 0.15999312725791798, 'max_depth': 9, 'min_child_weight': 5, 'subsample': 0.994505919138053, 'colsample_bytree': 0.6755794720214263}. Best is trial 0 with value: 9.732452353316903.
[I 2025-02-16 14:41:41,073] Trial 1 finished with value: 9.520773600734751 and parameters: {'eta': 0.03865058456332115, 'gamma': 7.527479471003771e-06, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.932069301003361, 'colsample_bytree': 0.767655145720773}. Best is trial 1 with value: 9.520773600734751.
[I 2025-02-16 14:41:41,334] Trial 2 finished with value: 10.108508865700609 and parameters: {'eta': 0.1686078660110199, 'gamma': 3.605153393033279e-08, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.8237776316009608, 'colsample_bytree': 0.9340187157331268}. Be

Fold: 1/5
Model name: xgb
MAE: 7.451147020646338
MSE: 89.44566428218742
RMSE: 9.457571796300963
PCC: 0.6879888271071875
Spearman R: 0.693400056015647
R2 Score: 0.47285730397112247



[I 2025-02-16 14:41:46,533] Trial 2 finished with value: 10.176101281742318 and parameters: {'learning_rate': 0.07696981950331315, 'num_leaves': 904, 'subsample': 0.5847684755111281, 'colsample_bytree': 0.49294037534448054, 'min_data_in_leaf': 3}. Best is trial 2 with value: 10.176101281742318.
[I 2025-02-16 14:41:46,562] Trial 3 finished with value: 11.856134790869199 and parameters: {'learning_rate': 0.027130416193802732, 'num_leaves': 196, 'subsample': 0.055643904832140455, 'colsample_bytree': 0.1615114306151802, 'min_data_in_leaf': 92}. Best is trial 2 with value: 10.176101281742318.
[I 2025-02-16 14:41:46,735] Trial 4 finished with value: 12.025022745564042 and parameters: {'learning_rate': 0.002936783322845734, 'num_leaves': 634, 'subsample': 0.12180260317904601, 'colsample_bytree': 0.6043176146867408, 'min_data_in_leaf': 6}. Best is trial 2 with value: 10.176101281742318.
[I 2025-02-16 14:41:46,821] Trial 5 finished with value: 9.630909130992626 and parameters: {'learning_rate':

Fold: 1/5
Model name: lgbm
MAE: 7.654095790235568
MSE: 92.62657043720785
RMSE: 9.624269865148621
PCC: 0.6742412334539789
Spearman R: 0.6811044486736129
R2 Score: 0.45411082296694216



[I 2025-02-16 14:41:50,171] Trial 2 finished with value: 9.82301861957684 and parameters: {'eta': 0.015288000699293846, 'gamma': 2.289358498027265e-08, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.7702132838703255, 'colsample_bytree': 0.7741230668141951}. Best is trial 1 with value: 9.49215987452655.
[I 2025-02-16 14:41:50,449] Trial 3 finished with value: 9.661397118443569 and parameters: {'eta': 0.06942264947682555, 'gamma': 4.146271366687371e-07, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.78923918602954, 'colsample_bytree': 0.9674052053770561}. Best is trial 1 with value: 9.49215987452655.
[I 2025-02-16 14:41:50,651] Trial 4 finished with value: 10.213312823269659 and parameters: {'eta': 0.01326802530456722, 'gamma': 0.06676436297151787, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.9909693883902506, 'colsample_bytree': 0.5564431733661146}. Best is trial 1 with value: 9.49215987452655.
[I 2025-02-16 14:41:50,813] Trial 5 finished with value: 9.48206808321

Fold: 2/5
Model name: xgb
MAE: 7.463817277354354
MSE: 87.62753435177169
RMSE: 9.360957982587664
PCC: 0.6722758461670779
Spearman R: 0.6678673360809907
R2 Score: 0.4507665857411256



[I 2025-02-16 14:41:53,558] Trial 1 finished with value: 12.137393541690507 and parameters: {'learning_rate': 0.0010147977223808492, 'num_leaves': 297, 'subsample': 0.883844580139113, 'colsample_bytree': 0.9196851433451531, 'min_data_in_leaf': 54}. Best is trial 0 with value: 10.325604273875149.
[I 2025-02-16 14:41:53,633] Trial 2 finished with value: 12.274481702605838 and parameters: {'learning_rate': 0.011418845032199223, 'num_leaves': 116, 'subsample': 0.11650985400885969, 'colsample_bytree': 0.06609394158504121, 'min_data_in_leaf': 8}. Best is trial 0 with value: 10.325604273875149.
[I 2025-02-16 14:41:53,663] Trial 3 finished with value: 10.508097663060981 and parameters: {'learning_rate': 0.007012329355498811, 'num_leaves': 270, 'subsample': 0.0722741042767143, 'colsample_bytree': 0.9647411994830545, 'min_data_in_leaf': 47}. Best is trial 0 with value: 10.325604273875149.
[I 2025-02-16 14:41:53,821] Trial 4 finished with value: 11.759391624885676 and parameters: {'learning_rate'

Fold: 2/5
Model name: lgbm
MAE: 7.508314401948074
MSE: 88.7783420217918
RMSE: 9.422225958964887
PCC: 0.6686746979311605
Spearman R: 0.6655975305068478
R2 Score: 0.443553533012481



[I 2025-02-16 14:41:57,107] Trial 1 finished with value: 9.59515558575141 and parameters: {'eta': 0.01900350192199086, 'gamma': 0.00015475904765982322, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.8238292752293744, 'colsample_bytree': 0.8119102797931836}. Best is trial 0 with value: 9.46261954058216.
[I 2025-02-16 14:41:57,188] Trial 2 finished with value: 10.493708526883267 and parameters: {'eta': 0.011723246140942827, 'gamma': 7.928146559654382e-08, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.5556479213649355, 'colsample_bytree': 0.502446204712008}. Best is trial 0 with value: 9.46261954058216.
[I 2025-02-16 14:41:57,480] Trial 3 finished with value: 10.162846355964541 and parameters: {'eta': 0.17400519768394995, 'gamma': 1.940960477309898e-06, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.8555350104711743, 'colsample_bytree': 0.5058685670676749}. Best is trial 0 with value: 9.46261954058216.
[I 2025-02-16 14:41:57,758] Trial 4 finished with value: 9.79841

Fold: 3/5
Model name: xgb
MAE: 7.331341744070153
MSE: 84.3707503457484
RMSE: 9.18535521064637
PCC: 0.6970368794686259
Spearman R: 0.7064742163961784
R2 Score: 0.4827004300080683



[I 2025-02-16 14:42:00,647] Trial 2 finished with value: 9.30544107561793 and parameters: {'learning_rate': 0.03200049198087024, 'num_leaves': 521, 'subsample': 0.30438225287797144, 'colsample_bytree': 0.8141349845005706, 'min_data_in_leaf': 73}. Best is trial 2 with value: 9.30544107561793.
[I 2025-02-16 14:42:00,777] Trial 3 finished with value: 11.866417191035108 and parameters: {'learning_rate': 0.0024430212969736263, 'num_leaves': 89, 'subsample': 0.7512396490309204, 'colsample_bytree': 0.7023287428592427, 'min_data_in_leaf': 25}. Best is trial 2 with value: 9.30544107561793.
[I 2025-02-16 14:42:00,824] Trial 4 finished with value: 9.363657112346777 and parameters: {'learning_rate': 0.09344483085740864, 'num_leaves': 965, 'subsample': 0.1930157481153515, 'colsample_bytree': 0.4871150267320048, 'min_data_in_leaf': 80}. Best is trial 2 with value: 9.30544107561793.
[I 2025-02-16 14:42:00,875] Trial 5 finished with value: 9.268923372263094 and parameters: {'learning_rate': 0.05381984

Fold: 3/5
Model name: lgbm
MAE: 7.3620307065791915
MSE: 84.81295005661778
RMSE: 9.209394662876479
PCC: 0.6962257673605439
Spearman R: 0.7049284746655068
R2 Score: 0.4799891856568461



[I 2025-02-16 14:42:03,573] Trial 1 finished with value: 9.28876275978517 and parameters: {'eta': 0.05904658463534432, 'gamma': 2.1863792966416173e-08, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.7508904916529873, 'colsample_bytree': 0.9761870316298173}. Best is trial 0 with value: 9.236196721120209.
[I 2025-02-16 14:42:03,767] Trial 2 finished with value: 9.538061342850936 and parameters: {'eta': 0.09280979323868159, 'gamma': 3.6190725920470435, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.5679274885274277, 'colsample_bytree': 0.7799418814355075}. Best is trial 0 with value: 9.236196721120209.
[I 2025-02-16 14:42:03,975] Trial 3 finished with value: 9.697874797219628 and parameters: {'eta': 0.021222577270732415, 'gamma': 5.933768138353069, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.7928878034068709, 'colsample_bytree': 0.5525320336528426}. Best is trial 0 with value: 9.236196721120209.
[I 2025-02-16 14:42:04,171] Trial 4 finished with value: 9.64892337797

Fold: 4/5
Model name: xgb
MAE: 7.212979049762589
MSE: 83.32585703282697
RMSE: 9.128299788724457
PCC: 0.6948293079630741
Spearman R: 0.7015240679781517
R2 Score: 0.4818417359788939



[I 2025-02-16 14:42:07,333] Trial 2 finished with value: 12.449248488523988 and parameters: {'learning_rate': 0.0014313437702203273, 'num_leaves': 848, 'subsample': 0.8388119383723778, 'colsample_bytree': 0.23624162433342527, 'min_data_in_leaf': 63}. Best is trial 0 with value: 12.379835783190718.
[I 2025-02-16 14:42:07,377] Trial 3 finished with value: 11.091173586852218 and parameters: {'learning_rate': 0.005626470765216354, 'num_leaves': 915, 'subsample': 0.14547609676993578, 'colsample_bytree': 0.5705230258842565, 'min_data_in_leaf': 61}. Best is trial 3 with value: 11.091173586852218.
[I 2025-02-16 14:42:07,428] Trial 4 finished with value: 9.240776658298511 and parameters: {'learning_rate': 0.03190461954913864, 'num_leaves': 101, 'subsample': 0.4754443077617689, 'colsample_bytree': 0.5735795910782396, 'min_data_in_leaf': 86}. Best is trial 4 with value: 9.240776658298511.
[I 2025-02-16 14:42:07,492] Trial 5 finished with value: 11.757546136492051 and parameters: {'learning_rate':

Fold: 4/5
Model name: lgbm
MAE: 7.31000254285919
MSE: 85.39047808921401
RMSE: 9.24069684002316
PCC: 0.6857835174251826
Spearman R: 0.6913901548399042
R2 Score: 0.46900297859272655



[I 2025-02-16 14:42:10,945] Trial 1 finished with value: 9.41973659512777 and parameters: {'eta': 0.022691063661380722, 'gamma': 0.0005012947502663314, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 0.5132866861410093, 'colsample_bytree': 0.7953079796928902}. Best is trial 0 with value: 9.199954854664243.
[I 2025-02-16 14:42:11,145] Trial 2 finished with value: 10.087434454204512 and parameters: {'eta': 0.01655881626227256, 'gamma': 1.1453150330163246, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.9790278753647725, 'colsample_bytree': 0.512998449644454}. Best is trial 0 with value: 9.199954854664243.
[I 2025-02-16 14:42:11,204] Trial 3 finished with value: 9.198598853037966 and parameters: {'eta': 0.09188714484260777, 'gamma': 3.544178631397142e-08, 'max_depth': 3, 'min_child_weight': 4, 'subsample': 0.9486023836757652, 'colsample_bytree': 0.9837843225352626}. Best is trial 3 with value: 9.198598853037966.
[I 2025-02-16 14:42:11,345] Trial 4 finished with value: 9.66649079

Fold: 5/5
Model name: xgb
MAE: 7.343272031317213
MSE: 84.43334407947884
RMSE: 9.188761836040744
PCC: 0.697395638297166
Spearman R: 0.701116006295315
R2 Score: 0.48328050989350657



[I 2025-02-16 14:42:14,809] Trial 0 finished with value: 11.288440332750003 and parameters: {'learning_rate': 0.004489719502989617, 'num_leaves': 784, 'subsample': 0.9251777425091258, 'colsample_bytree': 0.6535157532643664, 'min_data_in_leaf': 31}. Best is trial 0 with value: 11.288440332750003.
[I 2025-02-16 14:42:15,005] Trial 1 finished with value: 9.529276425958942 and parameters: {'learning_rate': 0.04285116068217127, 'num_leaves': 171, 'subsample': 0.1842160625946524, 'colsample_bytree': 0.8964474643919493, 'min_data_in_leaf': 6}. Best is trial 1 with value: 9.529276425958942.
[I 2025-02-16 14:42:15,113] Trial 2 finished with value: 10.412856376374291 and parameters: {'learning_rate': 0.015030296806603826, 'num_leaves': 714, 'subsample': 0.6583704896885971, 'colsample_bytree': 0.3869846478986904, 'min_data_in_leaf': 50}. Best is trial 1 with value: 9.529276425958942.
[I 2025-02-16 14:42:15,172] Trial 3 finished with value: 10.230714075024263 and parameters: {'learning_rate': 0.01

Fold: 5/5
Model name: lgbm
MAE: 7.341575621082354
MSE: 84.5759149641768
RMSE: 9.196516458103948
PCC: 0.6948921972204705
Spearman R: 0.6983889759331938
R2 Score: 0.4824079973139288

{'xgb': {'mae': 7.36051142463013, 'mse': 85.84063001840268, 'rmse': 9.26418932286004, 'pcc': 0.6899052998006263, 'spearman_r': 0.6940763365532565, 'r2_score': 0.4742893131185434}, 'lgbm': {'mae': 7.435203812540875, 'mse': 87.23685111380163, 'rmse': 9.338620757023419, 'pcc': 0.6839634826782672, 'spearman_r': 0.688281916923813, 'r2_score': 0.46581290350858495}}
