In [75]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
#website help from 
#https://medium.com/@walter_sperat/using-optuna-with-sklearn-the-right-way-part-1-6b4ad0ab2451

In [78]:
df = pd.read_csv('df_normal_quality.csv', index_col=0)
y = df.SalePrice
X = df.drop(['PID', 'SalePrice'], axis =1).copy()

In [4]:
def instantiate_xgb(trial : Trial) -> XGBRegressor:
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 20)
    }

    return XGBRegressor(**params)

def instantiate_gbr(trial : Trial) -> GradientBoostingRegressor:
    params = {
    'n_estimators': trial.suggest_int('n_estimators', 500, 1000),
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1, log=True),
    'max_depth': trial.suggest_int ('max_depth', 2, 6),
    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    'min_samples_split': trial.suggest_int ('min_sample_split', 6, 10),
    'min_samples_leaf': trial.suggest_int ('min_sample_leaf', 1, 10)
    }
    
    return GradientBoostingRegressor(**params)



def instantiate_ada(trial : Trial) -> AdaBoostRegressor:
    params = {
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1, log=True),
    'loss': trial.suggest_categorical('loss',['linear', 'square', 'exponential']),
    'n_estimators': trial.suggest_int('n_estimators', 1, 1000)
    }
    
    return AdaBoostRegressor(**params)



In [5]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_ordinal_encoder(trial: Trial)-> OrdinalEncoder:
    params = {
        'handle_unknown': "use_encoded_value", 
        'unknown_value': -1
    }
    
    return OrdinalEncoder(**params)

def instantiate_onehot_encoder(trial: Trial)-> OneHotEncoder:
    params = {
        'handle_unknown': 'ignore',
        'drop': trial.suggest_categorical('drop', [None, 'first'])
    }
    
    return OneHotEncoder(**params)
    
Encoder = (
    OrdinalEncoder |
    OneHotEncoder 
    )

def instantiate_encoder (trial : Trial) -> Encoder:
    encoding_method = trial.suggest_categorical(
        'encoding_method', ['ordinal', 'onehot'])
    if encoding_method =='ordinal':
        encoder = instantiate_ordinal_encoder(trial)
    elif encoding_method =='onehot':
        encoder = instantiate_onehot_encoder(trial)
    
    return encoder

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

Scaler = (
  StandardScaler |
  MinMaxScaler |
  MaxAbsScaler |
  RobustScaler
)

def instantiate_scaler(trial : Trial) -> Scaler:
    method = trial.suggest_categorical(
    'scaling_method', ['standard', 'minmax', 'maxabs', 'robust']
    )
    if method=='standard':
        scaler = StandardScaler()
    elif method=='minmax':
        scaler = MinMaxScaler()
    elif method=='maxabs':
        scaler = MaxAbsScaler()
    elif method=='robust':
        scaler = RobustScaler()
        
    return scaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    
    numerical_pipeline = instantiate_scaler(trial)
    categorical_pipeline = instantiate_encoder(trial)
   # numerical_pipeline = StandardScaler()
    #categorical_pipeline = OneHotEncoder(handle_unknown="ignore", drop='first')
    
#     numerical_pipeline = instantiate_numerical_pipeline(trial)
#     categorical_pipeline = instantiate_categorical_pipeline(trial)
    
    processor = ColumnTransformer([
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
        ('categorical_pipeline', categorical_pipeline, categorical_columns)
    ])
    return processor



In [21]:
def instantiate_lasso(trial : Trial) -> Lasso:
    params = {
        "alpha": trial.suggest_float("alpha", .00001, .001, log=True)
    }

    return Lasso(**params)

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_lasso(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_lasso(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [76]:
from optuna import create_study

study_lasso = create_study(study_name='optimization', direction='maximize')

study_lasso.optimize(lambda trial: objective_lasso(trial, X, y), n_trials=100)

[I 2024-06-17 11:10:59,715] A new study created in memory with name: optimization
[I 2024-06-17 11:11:00,887] Trial 0 finished with value: 0.9380287471878199 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'alpha': 6.437080223170085e-05, 'l1_ratio': 0.5391856219916714}. Best is trial 0 with value: 0.9380287471878199.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-17 11:11:02,514] Trial 1 finished with value: 0.9378284016452542 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 1.1455649798527888e-05, 'l1_ratio': 0.10807237335547105}. Best is trial 0 with value: 0.9380287471878199.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
 

[I 2024-06-17 11:11:08,439] Trial 6 finished with value: 0.9389359930606241 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.0008542668060526543, 'l1_ratio': 0.21447548511558276}. Best is trial 2 with value: 0.9430788576759868.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-17 11:11:11,204] Trial 7 finished with value: 0.9420678039427699 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 2.4882894396007986e-05, 'l1_ratio': 0.24258993930043637}. Best is trial 2 with value: 0.9430788576759868.
[I 2024-06-17 11:11:11,978] Trial 8 finished with value: 0.9370942393789949 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'alpha': 6.739626424466772e-05, 'l1_ratio': 0.19663396610209474}. Best is trial 2 with 

[I 2024-06-17 11:11:22,210] Trial 16 finished with value: 0.9465332741879916 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0004415060958645766, 'l1_ratio': 0.6686741408509588}. Best is trial 16 with value: 0.9465332741879916.
[I 2024-06-17 11:11:22,971] Trial 17 finished with value: 0.9464468052444013 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0004595632433304441, 'l1_ratio': 0.59936396427624}. Best is trial 16 with value: 0.9465332741879916.
[I 2024-06-17 11:11:23,611] Trial 18 finished with value: 0.942369168710437 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009455218027079588, 'l1_ratio': 0.5874001636655691}. Best is trial 16 with value: 0.9465332741879916.
[I 2024-06-17 11:11:24,654] Trial 19 finished with value: 0.9453316537459078 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 

[I 2024-06-17 11:11:27,848] Trial 21 finished with value: 0.9464568458284317 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0004090092807059781, 'l1_ratio': 0.6855972350527196}. Best is trial 16 with value: 0.9465332741879916.
[I 2024-06-17 11:11:28,521] Trial 22 finished with value: 0.9468233332748699 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000594988008870434, 'l1_ratio': 0.6053997083008502}. Best is trial 22 with value: 0.9468233332748699.
[I 2024-06-17 11:11:29,254] Trial 23 finished with value: 0.947065499890129 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006401418978917264, 'l1_ratio': 0.6971157757174008}. Best is trial 23 with value: 0.947065499890129.
[I 2024-06-17 11:11:30,338] Trial 24 finished with value: 0.9451088239190609 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot'

[I 2024-06-17 11:11:32,436] Trial 27 finished with value: 0.9464874830608812 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00032246165073756644, 'l1_ratio': 0.49172708138762095}. Best is trial 26 with value: 0.9482502701794825.
[I 2024-06-17 11:11:33,139] Trial 28 finished with value: 0.9476019140832422 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007587309744673502, 'l1_ratio': 0.30851863360097515}. Best is trial 26 with value: 0.9482502701794825.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-17 11:11:35,879] Trial 29 finished with value: 0.9426073782733464 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0001307595163028304, 'l1_ratio': 0.13910165793706125}. Best is trial 26 with value

[I 2024-06-17 11:11:38,437] Trial 32 finished with value: 0.9482571387674493 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009773265715906814, 'l1_ratio': 0.35641028930262647}. Best is trial 32 with value: 0.9482571387674493.
[I 2024-06-17 11:11:39,280] Trial 33 finished with value: 0.9482576947955584 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009854120483557254, 'l1_ratio': 0.3657816562632502}. Best is trial 33 with value: 0.9482576947955584.
[I 2024-06-17 11:11:40,004] Trial 34 finished with value: 0.9482700650834699 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009108023236950893, 'l1_ratio': 0.3563186420927809}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:11:40,695] Trial 35 finished with value: 0.9482571775798607 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'd

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-17 11:11:45,394] Trial 39 finished with value: 0.9425898253935767 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 5.732560841182597e-05, 'l1_ratio': 0.2640386315696354}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:11:46,087] Trial 40 finished with value: 0.9386048070646377 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.00031117358851029246, 'l1_ratio': 0.4080467748305629}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:11:47,027] Trial 41 finished with value: 0.9470676083152629 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005232391824155212, 'l1_ratio': 0.3712743234577159}. Best is trial 34 with value: 0.948270065083469

[I 2024-06-17 11:11:50,159] Trial 45 finished with value: 0.9482231521018069 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009957875565018278, 'l1_ratio': 0.4399601236671685}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:11:51,070] Trial 46 finished with value: 0.9467644550262712 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007154768871056983, 'l1_ratio': 0.2267846889201385}. Best is trial 34 with value: 0.9482700650834699.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-17 11:11:52,238] Trial 47 finished with value: 0.9378606182653545 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'alpha': 2.9712090938200142e-05, 'l1_ratio': 0.5356040465818266}. Bes

[I 2024-06-17 11:12:00,128] Trial 50 finished with value: 0.9456520282836638 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005205964823532592, 'l1_ratio': 0.27506667495435516}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:01,027] Trial 51 finished with value: 0.9482532091307869 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007117262889892437, 'l1_ratio': 0.4977869489849167}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:01,900] Trial 52 finished with value: 0.9482632099252 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0008250273193613604, 'l1_ratio': 0.3989970376045039}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:02,870] Trial 53 finished with value: 0.9480159770123032 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-17 11:12:09,310] Trial 57 finished with value: 0.9422468651393274 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 1.4673361983206823e-05, 'l1_ratio': 0.41476497316192545}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:09,957] Trial 58 finished with value: 0.9390570964631962 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.0004631059991739292, 'l1_ratio': 0.4666807535953337}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:10,798] Trial 59 finished with value: 0.9479926960759276 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000836567436956464, 'l1_ratio': 0.32497044887032794}. Best is trial 34 with value: 0.94827006508346

[I 2024-06-17 11:12:14,104] Trial 63 finished with value: 0.9482177082911379 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006949644911849363, 'l1_ratio': 0.6445136134273344}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:14,987] Trial 64 finished with value: 0.9478994720865568 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0004999184583367969, 'l1_ratio': 0.5421768871706641}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:15,783] Trial 65 finished with value: 0.9482589944431034 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006019589820790669, 'l1_ratio': 0.6370336730875555}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:16,811] Trial 66 finished with value: 0.946813433306176 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'dro

[I 2024-06-17 11:12:19,211] Trial 69 finished with value: 0.9466598863571771 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0003805901938949148, 'l1_ratio': 0.44273661869028913}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:20,522] Trial 70 finished with value: 0.9438570045499778 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0001854007138067895, 'l1_ratio': 0.34604338806358786}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:21,262] Trial 71 finished with value: 0.948222661044475 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007776775300881339, 'l1_ratio': 0.5694762951952351}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:22,096] Trial 72 finished with value: 0.9482549858134316 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'd

[I 2024-06-17 11:12:24,587] Trial 75 finished with value: 0.9464102285729434 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00046823888145656903, 'l1_ratio': 0.3135798383192537}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:25,169] Trial 76 finished with value: 0.9391634480399211 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.0005756738774548276, 'l1_ratio': 0.6141939939050636}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:25,993] Trial 77 finished with value: 0.9478856052086392 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007191823033372013, 'l1_ratio': 0.3654281780449428}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:26,646] Trial 78 finished with value: 0.9427116425754282 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', '

[I 2024-06-17 11:12:29,543] Trial 82 finished with value: 0.9482593133219381 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007177814145443179, 'l1_ratio': 0.5144221702167765}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:30,353] Trial 83 finished with value: 0.9478092999065599 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000494720776957687, 'l1_ratio': 0.5282236645787651}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:31,071] Trial 84 finished with value: 0.947612618725195 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00040476312304840815, 'l1_ratio': 0.6098614710975516}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:31,802] Trial 85 finished with value: 0.9482299673622931 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'dro

[I 2024-06-17 11:12:35,251] Trial 87 finished with value: 0.947854472288135 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005847893961836258, 'l1_ratio': 0.4504958912668066}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:35,769] Trial 88 finished with value: 0.9391639416496179 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.0006954947022616366, 'l1_ratio': 0.4186320114937904}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:37,094] Trial 89 finished with value: 0.9442719321596704 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00013811079914045646, 'l1_ratio': 0.6549974781318577}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:37,852] Trial 90 finished with value: 0.9482491861066501 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alph

[I 2024-06-17 11:12:40,958] Trial 94 finished with value: 0.9482576587713971 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009041427195821917, 'l1_ratio': 0.3759995279785199}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:41,760] Trial 95 finished with value: 0.9482495750880982 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007640997483152713, 'l1_ratio': 0.4006680355568806}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:42,583] Trial 96 finished with value: 0.9475268281828558 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006797373145265834, 'l1_ratio': 0.3377505069731236}. Best is trial 34 with value: 0.9482700650834699.
[I 2024-06-17 11:12:43,469] Trial 97 finished with value: 0.9474849708879158 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'dr

In [77]:
print(study_lasso.best_value)
print(study_lasso.best_params)

0.9482700650834699
{'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009108023236950893, 'l1_ratio': 0.3563186420927809}


In [17]:
def instantiate_ridge(trial : Trial) -> Ridge:
    params = {
        "alpha": trial.suggest_float("alpha", 1, 100, log=True)
    }

    return Ridge(**params)


def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_ridge(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_ridge(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [20]:
from optuna import create_study

study_ridge = create_study(study_name='optimization', direction='maximize')

study_ridge.optimize(lambda trial: objective_ridge(trial, X, y), n_trials=500)

[I 2024-05-29 14:46:39,540] A new study created in memory with name: optimization
[I 2024-05-29 14:46:39,890] Trial 0 finished with value: 0.9489513810750617 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 4.224215714175673}. Best is trial 0 with value: 0.9489513810750617.
[I 2024-05-29 14:46:40,182] Trial 1 finished with value: 0.9447229219725486 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 5.62343318358076}. Best is trial 0 with value: 0.9489513810750617.
[I 2024-05-29 14:46:40,434] Trial 2 finished with value: 0.9304956991457264 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 22.248940842824254}. Best is trial 0 with value: 0.9489513810750617.
[I 2024-05-29 14:46:40,734] Trial 3 finished with value: 0.9410264681204739 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 97.57095213396795}. Best is tria

[I 2024-05-29 14:46:44,720] Trial 14 finished with value: 0.9493767434924593 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 8.700601892953443}. Best is trial 14 with value: 0.9493767434924593.
[I 2024-05-29 14:46:45,332] Trial 15 finished with value: 0.9496900068919819 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 9.6553354941934}. Best is trial 15 with value: 0.9496900068919819.
[I 2024-05-29 14:46:45,810] Trial 16 finished with value: 0.9494589158427698 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 10.865913529572724}. Best is trial 15 with value: 0.9496900068919819.
[I 2024-05-29 14:46:46,382] Trial 17 finished with value: 0.9497900481877638 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 30.516293655006987}. Best is trial 17 with value: 0.9497900481877638.
[I 2024-05-2

[I 2024-05-29 14:46:48,215] Trial 20 finished with value: 0.9495111507934821 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 41.36118854680351}. Best is trial 17 with value: 0.9497900481877638.
[I 2024-05-29 14:46:48,750] Trial 21 finished with value: 0.9497721800681983 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 31.371388834959447}. Best is trial 17 with value: 0.9497900481877638.
[I 2024-05-29 14:46:49,310] Trial 22 finished with value: 0.9497815381491457 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 30.92871828028217}. Best is trial 17 with value: 0.9497900481877638.
[I 2024-05-29 14:46:49,915] Trial 23 finished with value: 0.9499152920477292 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 18.925744191220836}. Best is trial 23 with value: 0.9499152920477292.
[I 2024-05-29 

[I 2024-05-29 14:46:52,120] Trial 27 finished with value: 0.9499143798490011 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 20.834311993009308}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29 14:46:52,718] Trial 28 finished with value: 0.9499137479259223 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 21.022135970880317}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29 14:46:53,316] Trial 29 finished with value: 0.9492615962889877 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 49.10085410242272}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29 14:46:53,914] Trial 30 finished with value: 0.9497659568337182 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 11.165085687087524}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29

[I 2024-05-29 14:46:55,647] Trial 33 finished with value: 0.9494756871123078 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 6.905005748026618}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29 14:46:55,982] Trial 34 finished with value: 0.9369297500020453 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 13.934120462563872}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29 14:46:56,582] Trial 35 finished with value: 0.9498967444358417 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 23.582833642703136}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29 14:46:57,050] Trial 36 finished with value: 0.9419004314262921 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 46.79491138795538}. Best is trial 24 with value: 0.9499157785004215.
[I 2024-05-29 14:46:57,481] Tr

[I 2024-05-29 14:47:00,090] Trial 42 finished with value: 0.9499108740890542 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 17.847321100339546}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:00,670] Trial 43 finished with value: 0.9498728036592485 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 25.670955972354296}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:01,240] Trial 44 finished with value: 0.9499139647488585 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 18.488338692613365}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:01,676] Trial 45 finished with value: 0.9380188915003239 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 12.66916662019485}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:

[I 2024-05-29 14:47:04,375] Trial 51 finished with value: 0.9499139256954091 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 18.47795210697765}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:04,990] Trial 52 finished with value: 0.9498569279308556 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 26.781398261763986}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:05,600] Trial 53 finished with value: 0.949872480847332 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 14.70050139299995}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:06,190] Trial 54 finished with value: 0.9499129356352756 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 18.24059315146403}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:

[I 2024-05-29 14:47:08,333] Trial 58 finished with value: 0.9494162566496982 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 28.588260424123284}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:08,909] Trial 59 finished with value: 0.9499146419554924 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 18.68638296128222}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:09,490] Trial 60 finished with value: 0.9498674086586227 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 14.446617142734361}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14:47:10,120] Trial 61 finished with value: 0.9499038532315535 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 16.92448256905433}. Best is trial 41 with value: 0.949916005089724.
[I 2024-05-29 14

[I 2024-05-29 14:47:11,821] Trial 64 finished with value: 0.9252337705185638 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 24.034145967385818}. Best is trial 63 with value: 0.9499161826666598.
[I 2024-05-29 14:47:12,490] Trial 65 finished with value: 0.9473760136032172 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 95.96789285869681}. Best is trial 63 with value: 0.9499161826666598.
[I 2024-05-29 14:47:13,092] Trial 66 finished with value: 0.9498256967708858 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 12.802452195521697}. Best is trial 63 with value: 0.9499161826666598.
[I 2024-05-29 14:47:13,589] Trial 67 finished with value: 0.9419218737614135 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 42.98689004668654}. Best is trial 63 with value: 0.9499161826666598.
[I 2024-05-29 14:47:14,169] Tr

[I 2024-05-29 14:47:20,170] Trial 79 finished with value: 0.9214452240766159 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'alpha': 28.246791549814162}. Best is trial 73 with value: 0.9500457881985074.
[I 2024-05-29 14:47:20,636] Trial 80 finished with value: 0.9419244720854703 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 31.055432938374746}. Best is trial 73 with value: 0.9500457881985074.
[I 2024-05-29 14:47:21,190] Trial 81 finished with value: 0.9500483157685269 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.57601769638403}. Best is trial 81 with value: 0.9500483157685269.
[I 2024-05-29 14:47:21,720] Trial 82 finished with value: 0.9500402331104993 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 24.9452668021245}. Best is trial 81 with value: 0.9500483157685269.
[I 2024-05-29 14:47:22,281] Trial 83 fin

[I 2024-05-29 14:47:39,505] Trial 113 finished with value: 0.9500321075916132 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 26.05632069249111}. Best is trial 81 with value: 0.9500483157685269.
[I 2024-05-29 14:47:40,053] Trial 114 finished with value: 0.949920256415143 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 33.64760704248981}. Best is trial 81 with value: 0.9500483157685269.
[I 2024-05-29 14:47:40,716] Trial 115 finished with value: 0.9500483311202934 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.541193887356858}. Best is trial 115 with value: 0.9500483311202934.
[I 2024-05-29 14:47:41,353] Trial 116 finished with value: 0.9499559360517118 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 15.073479525951145}. Best is trial 115 with value: 0.9500483311202934.
[I 2024-05-29 14:47:4

[I 2024-05-29 14:47:59,471] Trial 147 finished with value: 0.9496893730880469 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.242479857598287}. Best is trial 130 with value: 0.9500483598435061.
[I 2024-05-29 14:47:59,984] Trial 148 finished with value: 0.950004028380772 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 28.645042759481385}. Best is trial 130 with value: 0.9500483598435061.
[I 2024-05-29 14:48:00,601] Trial 149 finished with value: 0.9499764219362963 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 15.837293698200659}. Best is trial 130 with value: 0.9500483598435061.
[I 2024-05-29 14:48:01,233] Trial 150 finished with value: 0.9500428259188961 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 24.485943930516566}. Best is trial 130 with value: 0.9500483598435061.
[I 2024-05-29 1

[I 2024-05-29 14:48:19,066] Trial 181 finished with value: 0.9500483407057937 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.514526077909984}. Best is trial 161 with value: 0.9500483600707013.
[I 2024-05-29 14:48:19,610] Trial 182 finished with value: 0.9500481768858856 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.76499271378878}. Best is trial 161 with value: 0.9500483600707013.
[I 2024-05-29 14:48:20,181] Trial 183 finished with value: 0.9500435492849221 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 24.340457607332958}. Best is trial 161 with value: 0.9500483600707013.
[I 2024-05-29 14:48:20,729] Trial 184 finished with value: 0.9500144049931787 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 27.80833953397618}. Best is trial 161 with value: 0.9500483600707013.
[I 2024-05-29 14:4

[I 2024-05-29 14:48:38,105] Trial 215 finished with value: 0.9500436362123672 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 20.56829301095356}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48:38,694] Trial 216 finished with value: 0.9500482193795555 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.71888861575131}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48:39,329] Trial 217 finished with value: 0.950045899290745 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 23.774900860455574}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48:39,943] Trial 218 finished with value: 0.9500286294233419 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 26.45016663292116}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48:

[I 2024-05-29 14:48:57,461] Trial 249 finished with value: 0.9500096551640993 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 17.43846587323692}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48:58,034] Trial 250 finished with value: 0.9500473525842551 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 23.27100931616415}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48:58,623] Trial 251 finished with value: 0.9500377473230918 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 25.32458843850761}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48:59,200] Trial 252 finished with value: 0.9500478606215254 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 21.788237267154564}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:48

[I 2024-05-29 14:49:15,539] Trial 283 finished with value: 0.9496903302864667 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 21.848395049432032}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:16,087] Trial 284 finished with value: 0.9500290577756381 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 18.80934336851527}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:16,588] Trial 285 finished with value: 0.9499931475856244 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 29.4373784397763}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:17,134] Trial 286 finished with value: 0.9500457934110716 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 23.804910086162103}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:

[I 2024-05-29 14:49:30,972] Trial 312 finished with value: 0.9500476665943747 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 21.68158956701195}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:31,514] Trial 313 finished with value: 0.9500423424216142 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 24.578228445592334}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:32,105] Trial 314 finished with value: 0.949953771510258 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 31.877651493313888}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:32,659] Trial 315 finished with value: 0.9499636037330823 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 15.345467101829586}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:4

[I 2024-05-29 14:49:46,439] Trial 341 finished with value: 0.9500482217855841 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.71608444537485}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:46,899] Trial 342 finished with value: 0.9322569010275412 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 20.499733984885726}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:47,421] Trial 343 finished with value: 0.9499128755777029 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 18.22751057013013}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:49:47,871] Trial 344 finished with value: 0.9247609647754318 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'alpha': 24.90627046638614}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14

[I 2024-05-29 14:49:59,759] Trial 366 finished with value: 0.9500462113969107 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 21.15105104807232}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:00,221] Trial 367 finished with value: 0.9283206508025575 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 25.1504862293041}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:00,787] Trial 368 finished with value: 0.9500110328660295 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 17.52028749586782}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:01,329] Trial 369 finished with value: 0.9500481858201946 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.755778854758248}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:

[I 2024-05-29 14:50:15,249] Trial 395 finished with value: 0.9475413800912751 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 99.24229776417987}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:15,790] Trial 396 finished with value: 0.9500256193251395 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 26.766538098045515}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:16,159] Trial 397 finished with value: 0.9215942968962885 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'alpha': 18.465677208472446}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:16,689] Trial 398 finished with value: 0.9500416876304545 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 20.23582942346699}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:17,274] Tri

[I 2024-05-29 14:50:30,849] Trial 424 finished with value: 0.9498982758034984 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 16.38911342696073}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:31,379] Trial 425 finished with value: 0.9500310960595115 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 18.99327756482061}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:31,920] Trial 426 finished with value: 0.9500482892222042 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.16351311039548}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:32,462] Trial 427 finished with value: 0.9500445874064949 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 20.757439365921787}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14

[I 2024-05-29 14:50:44,388] Trial 449 finished with value: 0.9499776726361617 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 15.888134595692799}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:44,948] Trial 450 finished with value: 0.9480757075307071 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 1.7057935050670603}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:45,555] Trial 451 finished with value: 0.9500413413153007 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 20.182452332392728}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:50:46,087] Trial 452 finished with value: 0.9500086293071217 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 17.378643925913124}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14

[I 2024-05-29 14:51:01,044] Trial 478 finished with value: 0.9499295609559629 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 14.236213475051581}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:51:01,700] Trial 479 finished with value: 0.9500382644345023 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 19.759927928719005}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:51:02,289] Trial 480 finished with value: 0.950013226738944 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 27.908381542438477}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14:51:02,759] Trial 481 finished with value: 0.9496845724984592 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 23.46890916959578}. Best is trial 207 with value: 0.9500483603074065.
[I 2024-05-29 14

In [24]:
print(study_ridge.best_value)
print(study_ridge.best_params)

0.9500483603074065
{'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 22.384020959927625}


# EN

In [64]:
def instantiate_en(trial : Trial) -> ElasticNet:
    params = {
        "alpha": trial.suggest_float("alpha", .00001, .001, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', .1, .7, log = True)
    }

    return ElasticNet(**params)



def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_en(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_en(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [65]:
from optuna import create_study

study_en = create_study(study_name='optimization', direction='maximize')

study_en.optimize(lambda trial: objective_en(trial, X, y), n_trials=200)

[I 2024-06-04 12:11:09,318] A new study created in memory with name: optimization
[I 2024-06-04 12:11:11,047] Trial 0 finished with value: 0.948728188983764 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 7.788102907248964e-05, 'l1_ratio': 0.6611655504096104}. Best is trial 0 with value: 0.948728188983764.
[I 2024-06-04 12:11:11,915] Trial 1 finished with value: 0.9501783277518389 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00023978682282845947, 'l1_ratio': 0.4590270612941629}. Best is trial 1 with value: 0.9501783277518389.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:15,549] Trial 2 finished with value: 0.9462631388575155 and parameters: {'scaling_method': 'robust', 'encoding_method': 'oneh

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:20,979] Trial 6 finished with value: 0.9460312612911462 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 1.0810755343611517e-05, 'l1_ratio': 0.10760765653450712}. Best is trial 1 with value: 0.9501783277518389.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:24,314] Trial 7 finished with value: 0.9464946045658464 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 1.5606337963784704e-05, 'l1_ratio': 0.3601779153297862}. Best is trial 1 with value: 0.9501783277518389.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model

[I 2024-06-04 12:11:32,673] Trial 14 finished with value: 0.9417705577027908 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'alpha': 0.00012726945204769044, 'l1_ratio': 0.6973862163128914}. Best is trial 13 with value: 0.9503906980010737.
[I 2024-06-04 12:11:33,538] Trial 15 finished with value: 0.9504647308835044 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00033039138120244133, 'l1_ratio': 0.5839963510057812}. Best is trial 15 with value: 0.9504647308835044.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:35,906] Trial 16 finished with value: 0.9463254479289654 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'alpha': 5.274077303921959e-05, 'l1_ratio': 0.26407634356768434}. Best is trial 15 with value: 0.9504647308835044.
[I 2024-06-04 12:11:36,614] Trial 17 finished with value: 0.9503193720200291 and parame

[I 2024-06-04 12:11:55,895] Trial 37 finished with value: 0.9488350820536899 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00024328192657776227, 'l1_ratio': 0.13196826812856255}. Best is trial 31 with value: 0.9505499049761467.
[I 2024-06-04 12:11:56,401] Trial 38 finished with value: 0.9417457762066904 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.0007354764476121989, 'l1_ratio': 0.3159175589563822}. Best is trial 31 with value: 0.9505499049761467.
[I 2024-06-04 12:11:57,104] Trial 39 finished with value: 0.9481549613128278 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0004022539129003468, 'l1_ratio': 0.5374274671812856}. Best is trial 31 with value: 0.9505499049761467.
[I 2024-06-04 12:11:58,872] Trial 40 finished with value: 0.9485464449761973 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alp

[I 2024-06-04 12:12:08,457] Trial 51 finished with value: 0.9504455869420523 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006417597873670123, 'l1_ratio': 0.6914189001452589}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:09,109] Trial 52 finished with value: 0.9502840877576274 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007951112055397861, 'l1_ratio': 0.6533440106770946}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:09,797] Trial 53 finished with value: 0.9505306538606046 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005353346286880682, 'l1_ratio': 0.6994398595142804}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:10,752] Trial 54 finished with value: 0.9499698885800566 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'dr

[I 2024-06-04 12:12:12,543] Trial 57 finished with value: 0.950577262408429 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00053370221372854, 'l1_ratio': 0.5928394187134195}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:13,264] Trial 58 finished with value: 0.9505674608233898 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005525033029698415, 'l1_ratio': 0.5892705906089115}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:13,905] Trial 59 finished with value: 0.9503876380888503 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0008031136821558983, 'l1_ratio': 0.5890833539789105}. Best is trial 46 with value: 0.9507322851800556.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coord

[I 2024-06-04 12:12:17,829] Trial 62 finished with value: 0.9505865977698754 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000547365281712682, 'l1_ratio': 0.5074348417297473}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:18,518] Trial 63 finished with value: 0.9505600104643548 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006763099329097973, 'l1_ratio': 0.5010753013816447}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:19,188] Trial 64 finished with value: 0.9505544385087727 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006752018854488276, 'l1_ratio': 0.512257772085704}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:19,872] Trial 65 finished with value: 0.9505610115918952 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop

[I 2024-06-04 12:12:22,536] Trial 69 finished with value: 0.9505668734975471 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000598033389476201, 'l1_ratio': 0.5499685352456576}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:23,254] Trial 70 finished with value: 0.9505872693783823 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005453676637810146, 'l1_ratio': 0.5549696077421559}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:23,967] Trial 71 finished with value: 0.9505837290328352 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005633168107613195, 'l1_ratio': 0.553216185104922}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:24,696] Trial 72 finished with value: 0.9505831084234 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:12:29,505] Trial 75 finished with value: 0.9464408856041644 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 1.1811470814604836e-05, 'l1_ratio': 0.4549627520644883}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:30,248] Trial 76 finished with value: 0.9505470917566982 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005019139832583904, 'l1_ratio': 0.5119738830505115}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:31,036] Trial 77 finished with value: 0.9504400076308397 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000391420775171671, 'l1_ratio': 0.5644241

[I 2024-06-04 12:12:33,084] Trial 80 finished with value: 0.9503268087820554 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007275600791069953, 'l1_ratio': 0.23651295309844964}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:33,777] Trial 81 finished with value: 0.9505659299662632 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005567673044213204, 'l1_ratio': 0.5881426047725423}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:34,584] Trial 82 finished with value: 0.9503494655977095 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0003672666704496957, 'l1_ratio': 0.5299262568646396}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:35,364] Trial 83 finished with value: 0.9504705997559354 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'd

[I 2024-06-04 12:12:37,567] Trial 86 finished with value: 0.9502056950388436 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00033409044291808236, 'l1_ratio': 0.4260790080666367}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:38,273] Trial 87 finished with value: 0.9503852809171167 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007297987523109834, 'l1_ratio': 0.651075654318817}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:39,056] Trial 88 finished with value: 0.9505514873806357 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00043198206015939914, 'l1_ratio': 0.60471944277708}. Best is trial 46 with value: 0.9507322851800556.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coo

  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:12:44,279] Trial 90 finished with value: 0.9477018148681259 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 4.3312611035599385e-05, 'l1_ratio': 0.5453161891928837}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:45,027] Trial 91 finished with value: 0.9505925573148751 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005948463475809608, 'l1_ratio': 0.49482340300377425}. Best is trial 46 with value: 0.9507322851800556.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:12:47,880] Trial 92 finished with value: 0.9493084580865908 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha':

[I 2024-06-04 12:12:49,889] Trial 95 finished with value: 0.9505629716198436 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006369459863925763, 'l1_ratio': 0.5249111385925039}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:50,608] Trial 96 finished with value: 0.9502464781282434 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007602353746941758, 'l1_ratio': 0.4525456098302309}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:51,557] Trial 97 finished with value: 0.9501491357601406 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0004136730847154995, 'l1_ratio': 0.2937106089812104}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:52,288] Trial 98 finished with value: 0.9505860534020304 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'dro

[I 2024-06-04 12:13:02,986] Trial 112 finished with value: 0.9508142864190117 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008538883065837565, 'l1_ratio': 0.33578661444506}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:03,759] Trial 113 finished with value: 0.9508191540538323 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008739171477669513, 'l1_ratio': 0.3401917031329937}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:04,545] Trial 114 finished with value: 0.9508140305778845 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008920977328144325, 'l1_ratio': 0.32063652487224986}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:05,307] Trial 115 finished with value: 0.9508206424132493 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': N

[I 2024-06-04 12:13:25,967] Trial 142 finished with value: 0.9508105015577785 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007872105072179762, 'l1_ratio': 0.3580349217835481}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:26,699] Trial 143 finished with value: 0.9508202201885936 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0009044251245493753, 'l1_ratio': 0.3330619199452988}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:27,447] Trial 144 finished with value: 0.9507954368725164 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0009265280516420815, 'l1_ratio': 0.37660970968558816}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:28,239] Trial 145 finished with value: 0.9506670212463165 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop':

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:13:47,685] Trial 168 finished with value: 0.9409312411500317 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 1.920788613331729e-05, 'l1_ratio': 0.2803920883031563}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:48,373] Trial 169 finished with value: 0.950212935030683 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007192835260415021, 'l1_ratio': 0.32718064223393006}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:49,199] Trial 170 finished with value: 0.9507060018349653 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008155876488844761, 'l1_ratio': 0.2939630778213785}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:49,959] Trial 171 finished with value: 0.9508171281704605 and param

[I 2024-06-04 12:14:06,796] Trial 189 finished with value: 0.9508211865943965 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007890891316732335, 'l1_ratio': 0.386603322661332}. Best is trial 189 with value: 0.9508211865943965.
[I 2024-06-04 12:14:07,632] Trial 190 finished with value: 0.9508115073280979 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007539799779360345, 'l1_ratio': 0.3758948220644501}. Best is trial 189 with value: 0.9508211865943965.
[I 2024-06-04 12:14:08,462] Trial 191 finished with value: 0.9508125388598604 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008330956165337611, 'l1_ratio': 0.3987889471811326}. Best is trial 189 with value: 0.9508211865943965.
[I 2024-06-04 12:14:09,309] Trial 192 finished with value: 0.9508210288482131 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop'

In [66]:
# after adding in the L1 value
print(study_en.best_value)
print(study_en.best_params)

0.9508214083737423
{'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0006921322276640134, 'l1_ratio': 0.4415555448483296}


In [29]:
print(study_en.best_value)
print(study_en.best_params)

0.950820633330468
{'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0006178621404820522}


In [69]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

en = ElasticNet(alpha =  0.0006921322276640134, l1_ratio= 0.4415555448483296)

numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")

numerical_preprocessor = RobustScaler()

preprocessor = ColumnTransformer(
    [
        ("robust-scaler", numerical_preprocessor, numerical_columns),
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
    ]
)

en_pipe = Pipeline(steps=[("Preprocess", preprocessor), ("EN", en)])

en_regr = compose.TransformedTargetRegressor(regressor= en_pipe,
                                                func=np.log, inverse_func=np.exp)

In [70]:
en_scores = cross_val_score(en_regr, X, y, cv = kf, error_score= 'raise')
en_scores.mean()
#0.950820633330468

0.9508214083737423

# RF

In [48]:
def instantiate_rf(trial : Trial) -> RandomForestRegressor:
    params = {
    'bootstrap':trial.suggest_categorical('bootstrap', [True]),
    'n_estimators': trial.suggest_int('n_estimators', 250, 500),
    'max_depth': trial.suggest_int('max_depth', 50, 75),
    'min_samples_split': trial.suggest_int('min_samples_split', 4, 6),
    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5)
    }
    
    return RandomForestRegressor(**params)
  
def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_rf(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_rf(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [49]:
from optuna import create_study

study_rf = create_study(study_name='optimization', direction='maximize')

study_rf.optimize(lambda trial: objective_en(trial, X, y), n_trials=50)

[I 2024-05-31 08:58:37,051] A new study created in memory with name: optimization
[I 2024-05-31 09:01:47,272] Trial 0 finished with value: 0.9041942334135129 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'bootstrap': True, 'n_estimators': 402, 'max_depth': 50, 'min_sample_split': 5, 'min_sample_leaf': 3}. Best is trial 0 with value: 0.9041942334135129.
[I 2024-05-31 09:05:06,098] Trial 1 finished with value: 0.9073980977362396 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'bootstrap': True, 'n_estimators': 336, 'max_depth': 69, 'min_sample_split': 6, 'min_sample_leaf': 1}. Best is trial 1 with value: 0.9073980977362396.
[I 2024-05-31 09:07:30,292] Trial 2 finished with value: 0.9076826989942189 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 260, 'max_depth': 73, 'min_sample_split': 5, 'min_sample_leaf': 1}. Best is trial 2 with value: 

[I 2024-05-31 09:27:43,062] Trial 8 finished with value: 0.9056804031622304 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'bootstrap': True, 'n_estimators': 420, 'max_depth': 57, 'min_sample_split': 5, 'min_sample_leaf': 3}. Best is trial 4 with value: 0.9097534039592309.
[I 2024-05-31 09:30:56,969] Trial 9 finished with value: 0.9093634132516474 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'bootstrap': True, 'n_estimators': 254, 'max_depth': 55, 'min_sample_split': 4, 'min_sample_leaf': 1}. Best is trial 4 with value: 0.9097534039592309.
[I 2024-05-31 09:32:39,310] Trial 10 finished with value: 0.8993885557189456 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 322, 'max_depth': 60, 'min_sample_split': 6, 'min_sample_leaf': 5}. Best is trial 4 with value: 0.9097534039592309.
[I 2024-05-31 09:34:36,582] Trial 11 finished with value: 0.9082

[I 2024-05-31 10:31:29,412] Trial 32 finished with value: 0.9094899639883157 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 327, 'max_depth': 63, 'min_sample_split': 6, 'min_sample_leaf': 1}. Best is trial 25 with value: 0.9104726627980002.
[I 2024-05-31 10:34:00,216] Trial 33 finished with value: 0.9095126002045921 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 277, 'max_depth': 67, 'min_sample_split': 5, 'min_sample_leaf': 1}. Best is trial 25 with value: 0.9104726627980002.
[I 2024-05-31 10:36:38,891] Trial 34 finished with value: 0.9102377098381838 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 305, 'max_depth': 71, 'min_sample_split': 4, 'min_sample_leaf': 2}. Best is trial 25 with value: 0.9104726627980002.
[I 2024-05-31 10:38:43,955] Trial 35 finished with value: 0.9064976064216932 and paramet

In [51]:
print(study_rf.best_value)
print(study_rf.best_params)

0.9106961840498735
{'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 338, 'max_depth': 73, 'min_sample_split': 4, 'min_sample_leaf': 2}


# XGB