In [22]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
#website help from 
#https://medium.com/@walter_sperat/using-optuna-with-sklearn-the-right-way-part-1-6b4ad0ab2451

In [2]:
df = pd.read_csv('df_normal_quality.csv', index_col=0)
y = df.SalePrice
X = df.drop(['PID', 'SalePrice'], axis =1).copy()

In [4]:


def instantiate_gbr(trial : Trial) -> GradientBoostingRegressor:
    params = {
    'n_estimators': trial.suggest_int('n_estimators', 500, 1000),
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1, log=True),
    'max_depth': trial.suggest_int ('max_depth', 2, 6),
    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    'min_samples_split': trial.suggest_int ('min_sample_split', 6, 10),
    'min_samples_leaf': trial.suggest_int ('min_sample_leaf', 1, 10)
    }
    
    return GradientBoostingRegressor(**params)



def instantiate_ada(trial : Trial) -> AdaBoostRegressor:
    params = {
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1, log=True),
    'loss': trial.suggest_categorical('loss',['linear', 'square', 'exponential']),
    'n_estimators': trial.suggest_int('n_estimators', 1, 1000)
    }
    
    return AdaBoostRegressor(**params)



In [3]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_ordinal_encoder(trial: Trial)-> OrdinalEncoder:
    params = {
        'handle_unknown': "use_encoded_value", 
        'unknown_value': -1
    }
    
    return OrdinalEncoder(**params)

def instantiate_onehot_encoder(trial: Trial)-> OneHotEncoder:
    params = {
        'handle_unknown': 'ignore',
        'drop': trial.suggest_categorical('drop', [None, 'first'])
    }
    
    return OneHotEncoder(**params)
    
Encoder = (
    OrdinalEncoder |
    OneHotEncoder 
    )

def instantiate_encoder (trial : Trial) -> Encoder:
    encoding_method = trial.suggest_categorical(
        'encoding_method', ['ordinal', 'onehot'])
    if encoding_method =='ordinal':
        encoder = instantiate_ordinal_encoder(trial)
    elif encoding_method =='onehot':
        encoder = instantiate_onehot_encoder(trial)
    
    return encoder

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

Scaler = (
  StandardScaler |
  MinMaxScaler |
  MaxAbsScaler |
  RobustScaler
)

def instantiate_scaler(trial : Trial) -> Scaler:
    method = trial.suggest_categorical(
    'scaling_method', ['standard', 'minmax', 'maxabs', 'robust']
    )
    if method=='standard':
        scaler = StandardScaler()
    elif method=='minmax':
        scaler = MinMaxScaler()
    elif method=='maxabs':
        scaler = MaxAbsScaler()
    elif method=='robust':
        scaler = RobustScaler()
        
    return scaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    
    numerical_pipeline = instantiate_scaler(trial)
    categorical_pipeline = instantiate_encoder(trial)
   # numerical_pipeline = StandardScaler()
    #categorical_pipeline = OneHotEncoder(handle_unknown="ignore", drop='first')
    
#     numerical_pipeline = instantiate_numerical_pipeline(trial)
#     categorical_pipeline = instantiate_categorical_pipeline(trial)
    
    processor = ColumnTransformer([
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
        ('categorical_pipeline', categorical_pipeline, categorical_columns)
    ])
    return processor



# Lasso

In [5]:
def instantiate_lasso(trial : Trial) -> Lasso:
    params = {
        "alpha": trial.suggest_float("alpha", .00001, .001, log=True)
    }

    return Lasso(**params)

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_lasso(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_lasso(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [8]:
from optuna import create_study

study_lasso = create_study(study_name='optimization', direction='maximize')

study_lasso.optimize(lambda trial: objective_lasso(trial, X, y), n_trials=200)

[I 2024-07-08 11:07:01,252] A new study created in memory with name: optimization
[I 2024-07-08 11:07:02,397] Trial 0 finished with value: 0.9467361127612117 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0003841185967841102}. Best is trial 0 with value: 0.9467361127612117.
[I 2024-07-08 11:07:02,902] Trial 1 finished with value: 0.9323125711122492 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'alpha': 0.0008788944418275458}. Best is trial 0 with value: 0.9467361127612117.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 11:07:04,125] Trial 2 finished with value: 0.9410933355982675 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 2.11890200594345e-05}. Best is trial 0 with value: 0.9467361127

[I 2024-07-08 11:07:09,886] Trial 5 finished with value: 0.9495680701364331 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 7.955963434862829e-05}. Best is trial 5 with value: 0.9495680701364331.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 11:07:11,201] Trial 6 finished with value: 0.9410989159546519 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'alpha': 1.4665959167784572e-05}. Best is trial 5 with value: 0.9495680701364331.
[I 2024-07-08 11:07:11,991] Trial 7 finished with value: 0.9393600439822679 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'alpha': 0.000111750098319519}. Best is trial 5 with value: 0.9495680701364331.
[I 2024-07-08 11:07:13,033] Trial 8 finished with value: 0.9412729598873465 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 5.

[I 2024-07-08 11:07:26,547] Trial 17 finished with value: 0.9501479138438587 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0004042906555245167}. Best is trial 12 with value: 0.9505316923005414.
[I 2024-07-08 11:07:27,168] Trial 18 finished with value: 0.9435612469682632 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0009218227581495386}. Best is trial 12 with value: 0.9505316923005414.
[I 2024-07-08 11:07:28,755] Trial 19 finished with value: 0.9504212218173027 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00014548211412720207}. Best is trial 12 with value: 0.9505316923005414.
[I 2024-07-08 11:07:31,698] Trial 20 finished with value: 0.9489740326737615 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 4.0668691741889e-05}. Best is trial 12 with value: 0.9505316923005414.
[I 20

[I 2024-07-08 11:07:36,043] Trial 24 finished with value: 0.9495421668902756 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005799084192342355}. Best is trial 12 with value: 0.9505316923005414.
[I 2024-07-08 11:07:37,118] Trial 25 finished with value: 0.9501007177440985 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00032052747816874096}. Best is trial 12 with value: 0.9505316923005414.
[I 2024-07-08 11:07:37,950] Trial 26 finished with value: 0.9413680883475847 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'alpha': 0.00013129644373445254}. Best is trial 12 with value: 0.9505316923005414.
[I 2024-07-08 11:07:39,253] Trial 27 finished with value: 0.9505899435979714 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00018866784606573576}. Best is trial 27 with value: 0.9505899435979714.
[I 2024-07-08 

[I 2024-07-08 11:07:55,203] Trial 45 finished with value: 0.9507746615014426 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0003649761025040236}. Best is trial 45 with value: 0.9507746615014426.
[I 2024-07-08 11:07:55,909] Trial 46 finished with value: 0.950740280680139 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0003884582051924655}. Best is trial 45 with value: 0.9507746615014426.
[I 2024-07-08 11:07:56,743] Trial 47 finished with value: 0.9507830470423917 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0003589201325166769}. Best is trial 47 with value: 0.9507830470423917.
[I 2024-07-08 11:07:57,330] Trial 48 finished with value: 0.9419103771855338 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.0007917367148203394}. Best is trial 47 with value: 0.9507830470423917.
[I 2024-07-08 11:07:58,101

[I 2024-07-08 11:08:23,207] Trial 74 finished with value: 0.9508111200046067 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00029785107381510016}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:08:23,956] Trial 75 finished with value: 0.9505262858095543 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0004994052225332545}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:08:24,524] Trial 76 finished with value: 0.9494912200852065 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.000648591770527666}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:08:25,330] Trial 77 finished with value: 0.9505012918215103 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00041703628264054717}. Best is trial 62 with value: 0.9508122835720298.
[I 202

[I 2024-07-08 11:08:42,628] Trial 98 finished with value: 0.9482600195320805 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00021712928836636005}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:08:43,444] Trial 99 finished with value: 0.9508044285580872 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0002881626250367436}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:08:44,177] Trial 100 finished with value: 0.9504583594521454 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0004389517417116825}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:08:44,974] Trial 101 finished with value: 0.9508106746800922 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0003129671054994815}. Best is trial 62 with value: 0.9508122835720298.
[I 202

[I 2024-07-08 11:09:03,297] Trial 122 finished with value: 0.9507317901036314 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0002554335225312177}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:09:04,057] Trial 123 finished with value: 0.9508108275011906 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0003137024887393732}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:09:04,934] Trial 124 finished with value: 0.9505756100830369 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00021948551363737593}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:09:05,740] Trial 125 finished with value: 0.9508114683472307 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0003115081647262303}. Best is trial 62 with value: 0.9508122835720298.
[I 2024

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 11:09:26,434] Trial 149 finished with value: 0.9472634812417441 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 1.895259818006878e-05}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:09:27,316] Trial 150 finished with value: 0.9506894891739108 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00024567849580295803}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:09:28,090] Trial 151 finished with value: 0.9508113274633262 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00029835874328685544}. Best is trial 62 with value: 0.9508122835720298.
[I 2024-07-08 11:09:28,852] Trial 152 finished with value: 0.9508119136442102 and parameters: {'scaling_method': 'robust', 'e

[I 2024-07-08 11:09:46,243] Trial 175 finished with value: 0.9505744777598629 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0003000082597888178}. Best is trial 168 with value: 0.950812490222533.
[I 2024-07-08 11:09:47,086] Trial 176 finished with value: 0.9507110413466382 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00025052280364583845}. Best is trial 168 with value: 0.950812490222533.
[I 2024-07-08 11:09:47,777] Trial 177 finished with value: 0.9507845845252273 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0003576620170855338}. Best is trial 168 with value: 0.950812490222533.
[I 2024-07-08 11:09:48,443] Trial 178 finished with value: 0.9507385004432749 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00038971466314564944}. Best is trial 168 with value: 0.950812490222533.
[I 

In [11]:
print(study_lasso.best_value)
print(study_lasso.best_params)

0.9508124989217249
{'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00030733929099045147}


In [13]:
trial_number = []
score = []
parameters = []
optuna_dict = {}
trials = range(200)

for trial in trials:
    trial_number.append(study_lasso.get_trials()[trial].number)
    score.append(study_lasso.get_trials()[trial].value)
    parameters.append(list(study_lasso.get_trials()[trial].params.items()))

optuna_dict['Trial'] = trial_number
optuna_dict['Score'] = score
optuna_dict['Parameters'] = parameters

optuna_lasso = pd.DataFrame.from_dict(optuna_dict)
optuna_lasso

Unnamed: 0,Trial,Score,Parameters
0,0,0.946736,"[(scaling_method, maxabs), (encoding_method, o..."
1,1,0.932313,"[(scaling_method, maxabs), (encoding_method, o..."
2,2,0.941093,"[(scaling_method, robust), (encoding_method, o..."
3,3,0.948149,"[(scaling_method, maxabs), (encoding_method, o..."
4,4,0.947131,"[(scaling_method, robust), (encoding_method, o..."
...,...,...,...
195,195,0.950399,"[(scaling_method, minmax), (encoding_method, o..."
196,196,0.950790,"[(scaling_method, robust), (encoding_method, o..."
197,197,0.950787,"[(scaling_method, robust), (encoding_method, o..."
198,198,0.950707,"[(scaling_method, robust), (encoding_method, o..."


# Ridge

In [15]:
def instantiate_ridge(trial : Trial) -> Ridge:
    params = {
        "alpha": trial.suggest_float("alpha", 1, 100, log=True)
    }

    return Ridge(**params)


def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_ridge(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_ridge(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [16]:
from optuna import create_study

study_ridge = create_study(study_name='optimization', direction='maximize')

study_ridge.optimize(lambda trial: objective_ridge(trial, X, y), n_trials=20)

[I 2024-07-08 11:16:20,304] A new study created in memory with name: optimization
[I 2024-07-08 11:16:20,770] Trial 0 finished with value: 0.9415020582137291 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 6.945508650106731}. Best is trial 0 with value: 0.9415020582137291.
[I 2024-07-08 11:16:21,251] Trial 1 finished with value: 0.9139707635504198 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 33.38377369283048}. Best is trial 0 with value: 0.9415020582137291.
[I 2024-07-08 11:16:21,744] Trial 2 finished with value: 0.9431168591430699 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 6.095029924693382}. Best is trial 2 with value: 0.9431168591430699.
[I 2024-07-08 11:16:22,226] Trial 3 finished with value: 0.9412291279726801 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 1.0187234120138662}. Best is trial 2 with 

[I 2024-07-08 11:16:26,701] Trial 12 finished with value: 0.9478465336558358 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 95.81528312969256}. Best is trial 5 with value: 0.9493349741120131.
[I 2024-07-08 11:16:27,144] Trial 13 finished with value: 0.949443149646983 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 40.85362827698336}. Best is trial 13 with value: 0.949443149646983.
[I 2024-07-08 11:16:27,600] Trial 14 finished with value: 0.9496764273611051 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 24.817075515507273}. Best is trial 14 with value: 0.9496764273611051.
[I 2024-07-08 11:16:28,047] Trial 15 finished with value: 0.9496530065090127 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 15.585377824848988}. Best is trial 14 with value: 0.9496764273611051.
[I 2024-07-08 11:1

In [17]:
print(study_ridge.best_value)
print(study_ridge.best_params)

0.9496764273611051
{'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 24.817075515507273}


# EN

In [64]:
def instantiate_en(trial : Trial) -> ElasticNet:
    params = {
        "alpha": trial.suggest_float("alpha", .00001, .001, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', .1, .7, log = True)
    }

    return ElasticNet(**params)



def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_en(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_en(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [65]:
from optuna import create_study

study_en = create_study(study_name='optimization', direction='maximize')

study_en.optimize(lambda trial: objective_en(trial, X, y), n_trials=200)

[I 2024-06-04 12:11:09,318] A new study created in memory with name: optimization
[I 2024-06-04 12:11:11,047] Trial 0 finished with value: 0.948728188983764 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 7.788102907248964e-05, 'l1_ratio': 0.6611655504096104}. Best is trial 0 with value: 0.948728188983764.
[I 2024-06-04 12:11:11,915] Trial 1 finished with value: 0.9501783277518389 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00023978682282845947, 'l1_ratio': 0.4590270612941629}. Best is trial 1 with value: 0.9501783277518389.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:15,549] Trial 2 finished with value: 0.9462631388575155 and parameters: {'scaling_method': 'robust', 'encoding_method': 'oneh

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:20,979] Trial 6 finished with value: 0.9460312612911462 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 1.0810755343611517e-05, 'l1_ratio': 0.10760765653450712}. Best is trial 1 with value: 0.9501783277518389.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:24,314] Trial 7 finished with value: 0.9464946045658464 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 1.5606337963784704e-05, 'l1_ratio': 0.3601779153297862}. Best is trial 1 with value: 0.9501783277518389.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model

[I 2024-06-04 12:11:32,673] Trial 14 finished with value: 0.9417705577027908 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'alpha': 0.00012726945204769044, 'l1_ratio': 0.6973862163128914}. Best is trial 13 with value: 0.9503906980010737.
[I 2024-06-04 12:11:33,538] Trial 15 finished with value: 0.9504647308835044 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00033039138120244133, 'l1_ratio': 0.5839963510057812}. Best is trial 15 with value: 0.9504647308835044.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:11:35,906] Trial 16 finished with value: 0.9463254479289654 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'alpha': 5.274077303921959e-05, 'l1_ratio': 0.26407634356768434}. Best is trial 15 with value: 0.9504647308835044.
[I 2024-06-04 12:11:36,614] Trial 17 finished with value: 0.9503193720200291 and parame

[I 2024-06-04 12:11:55,895] Trial 37 finished with value: 0.9488350820536899 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.00024328192657776227, 'l1_ratio': 0.13196826812856255}. Best is trial 31 with value: 0.9505499049761467.
[I 2024-06-04 12:11:56,401] Trial 38 finished with value: 0.9417457762066904 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 0.0007354764476121989, 'l1_ratio': 0.3159175589563822}. Best is trial 31 with value: 0.9505499049761467.
[I 2024-06-04 12:11:57,104] Trial 39 finished with value: 0.9481549613128278 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0004022539129003468, 'l1_ratio': 0.5374274671812856}. Best is trial 31 with value: 0.9505499049761467.
[I 2024-06-04 12:11:58,872] Trial 40 finished with value: 0.9485464449761973 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'alp

[I 2024-06-04 12:12:08,457] Trial 51 finished with value: 0.9504455869420523 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006417597873670123, 'l1_ratio': 0.6914189001452589}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:09,109] Trial 52 finished with value: 0.9502840877576274 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007951112055397861, 'l1_ratio': 0.6533440106770946}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:09,797] Trial 53 finished with value: 0.9505306538606046 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005353346286880682, 'l1_ratio': 0.6994398595142804}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:10,752] Trial 54 finished with value: 0.9499698885800566 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'dr

[I 2024-06-04 12:12:12,543] Trial 57 finished with value: 0.950577262408429 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00053370221372854, 'l1_ratio': 0.5928394187134195}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:13,264] Trial 58 finished with value: 0.9505674608233898 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005525033029698415, 'l1_ratio': 0.5892705906089115}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:13,905] Trial 59 finished with value: 0.9503876380888503 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0008031136821558983, 'l1_ratio': 0.5890833539789105}. Best is trial 46 with value: 0.9507322851800556.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coord

[I 2024-06-04 12:12:17,829] Trial 62 finished with value: 0.9505865977698754 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000547365281712682, 'l1_ratio': 0.5074348417297473}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:18,518] Trial 63 finished with value: 0.9505600104643548 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006763099329097973, 'l1_ratio': 0.5010753013816447}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:19,188] Trial 64 finished with value: 0.9505544385087727 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006752018854488276, 'l1_ratio': 0.512257772085704}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:19,872] Trial 65 finished with value: 0.9505610115918952 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop

[I 2024-06-04 12:12:22,536] Trial 69 finished with value: 0.9505668734975471 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000598033389476201, 'l1_ratio': 0.5499685352456576}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:23,254] Trial 70 finished with value: 0.9505872693783823 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005453676637810146, 'l1_ratio': 0.5549696077421559}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:23,967] Trial 71 finished with value: 0.9505837290328352 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005633168107613195, 'l1_ratio': 0.553216185104922}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:24,696] Trial 72 finished with value: 0.9505831084234 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:12:29,505] Trial 75 finished with value: 0.9464408856041644 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 1.1811470814604836e-05, 'l1_ratio': 0.4549627520644883}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:30,248] Trial 76 finished with value: 0.9505470917566982 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005019139832583904, 'l1_ratio': 0.5119738830505115}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:31,036] Trial 77 finished with value: 0.9504400076308397 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.000391420775171671, 'l1_ratio': 0.5644241

[I 2024-06-04 12:12:33,084] Trial 80 finished with value: 0.9503268087820554 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007275600791069953, 'l1_ratio': 0.23651295309844964}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:33,777] Trial 81 finished with value: 0.9505659299662632 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005567673044213204, 'l1_ratio': 0.5881426047725423}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:34,584] Trial 82 finished with value: 0.9503494655977095 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0003672666704496957, 'l1_ratio': 0.5299262568646396}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:35,364] Trial 83 finished with value: 0.9504705997559354 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'd

[I 2024-06-04 12:12:37,567] Trial 86 finished with value: 0.9502056950388436 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00033409044291808236, 'l1_ratio': 0.4260790080666367}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:38,273] Trial 87 finished with value: 0.9503852809171167 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0007297987523109834, 'l1_ratio': 0.651075654318817}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:39,056] Trial 88 finished with value: 0.9505514873806357 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.00043198206015939914, 'l1_ratio': 0.60471944277708}. Best is trial 46 with value: 0.9507322851800556.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coo

  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:12:44,279] Trial 90 finished with value: 0.9477018148681259 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 4.3312611035599385e-05, 'l1_ratio': 0.5453161891928837}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:45,027] Trial 91 finished with value: 0.9505925573148751 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0005948463475809608, 'l1_ratio': 0.49482340300377425}. Best is trial 46 with value: 0.9507322851800556.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:12:47,880] Trial 92 finished with value: 0.9493084580865908 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha':

[I 2024-06-04 12:12:49,889] Trial 95 finished with value: 0.9505629716198436 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0006369459863925763, 'l1_ratio': 0.5249111385925039}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:50,608] Trial 96 finished with value: 0.9502464781282434 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007602353746941758, 'l1_ratio': 0.4525456098302309}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:51,557] Trial 97 finished with value: 0.9501491357601406 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'alpha': 0.0004136730847154995, 'l1_ratio': 0.2937106089812104}. Best is trial 46 with value: 0.9507322851800556.
[I 2024-06-04 12:12:52,288] Trial 98 finished with value: 0.9505860534020304 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'dro

[I 2024-06-04 12:13:02,986] Trial 112 finished with value: 0.9508142864190117 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008538883065837565, 'l1_ratio': 0.33578661444506}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:03,759] Trial 113 finished with value: 0.9508191540538323 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008739171477669513, 'l1_ratio': 0.3401917031329937}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:04,545] Trial 114 finished with value: 0.9508140305778845 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008920977328144325, 'l1_ratio': 0.32063652487224986}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:05,307] Trial 115 finished with value: 0.9508206424132493 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': N

[I 2024-06-04 12:13:25,967] Trial 142 finished with value: 0.9508105015577785 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007872105072179762, 'l1_ratio': 0.3580349217835481}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:26,699] Trial 143 finished with value: 0.9508202201885936 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0009044251245493753, 'l1_ratio': 0.3330619199452988}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:27,447] Trial 144 finished with value: 0.9507954368725164 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0009265280516420815, 'l1_ratio': 0.37660970968558816}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:28,239] Trial 145 finished with value: 0.9506670212463165 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop':

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-06-04 12:13:47,685] Trial 168 finished with value: 0.9409312411500317 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'alpha': 1.920788613331729e-05, 'l1_ratio': 0.2803920883031563}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:48,373] Trial 169 finished with value: 0.950212935030683 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007192835260415021, 'l1_ratio': 0.32718064223393006}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:49,199] Trial 170 finished with value: 0.9507060018349653 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008155876488844761, 'l1_ratio': 0.2939630778213785}. Best is trial 110 with value: 0.950820843082717.
[I 2024-06-04 12:13:49,959] Trial 171 finished with value: 0.9508171281704605 and param

[I 2024-06-04 12:14:06,796] Trial 189 finished with value: 0.9508211865943965 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007890891316732335, 'l1_ratio': 0.386603322661332}. Best is trial 189 with value: 0.9508211865943965.
[I 2024-06-04 12:14:07,632] Trial 190 finished with value: 0.9508115073280979 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0007539799779360345, 'l1_ratio': 0.3758948220644501}. Best is trial 189 with value: 0.9508211865943965.
[I 2024-06-04 12:14:08,462] Trial 191 finished with value: 0.9508125388598604 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0008330956165337611, 'l1_ratio': 0.3987889471811326}. Best is trial 189 with value: 0.9508211865943965.
[I 2024-06-04 12:14:09,309] Trial 192 finished with value: 0.9508210288482131 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop'

In [66]:
# after adding in the L1 value
print(study_en.best_value)
print(study_en.best_params)

0.9508214083737423
{'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0006921322276640134, 'l1_ratio': 0.4415555448483296}


In [29]:
print(study_en.best_value)
print(study_en.best_params)

0.950820633330468
{'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'alpha': 0.0006178621404820522}


# XGB

In [18]:
def instantiate_xgb(trial : Trial) -> XGBRegressor:
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 20)
    }

    return XGBRegressor(**params)

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_xgb(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_xgb(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [20]:
from optuna import create_study

study_xgb = create_study(study_name='optimization', direction='maximize')

study_xgb.optimize(lambda trial: objective_xgb(trial, X, y), n_trials=5)

[I 2024-07-08 11:22:14,907] A new study created in memory with name: optimization
[I 2024-07-08 11:22:24,071] Trial 0 finished with value: -6.665611267490787 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'n_estimators': 967, 'learning_rate': 0.00037351480086420696, 'max_depth': 5, 'subsample': 0.5885644152741032, 'colsample_bytree': 0.5718369843240108, 'min_child_weight': 19}. Best is trial 0 with value: -6.665611267490787.
[I 2024-07-08 11:22:27,930] Trial 1 finished with value: 0.5202660253430551 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'n_estimators': 168, 'learning_rate': 0.02330663672649549, 'max_depth': 6, 'subsample': 0.9241194283730881, 'colsample_bytree': 0.8746460804429347, 'min_child_weight': 19}. Best is trial 1 with value: 0.5202660253430551.
[I 2024-07-08 11:22:32,101] Trial 2 finished with value: -6.669056481733607 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordi

In [21]:
print(study_xgb.best_value)
print(study_xgb.best_params)

0.941248550185122
{'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'n_estimators': 935, 'learning_rate': 0.1902318039321245, 'max_depth': 4, 'subsample': 0.7359502095892552, 'colsample_bytree': 0.9469594103722121, 'min_child_weight': 12}


# RF

In [48]:
def instantiate_rf(trial : Trial) -> RandomForestRegressor:
    params = {
    'bootstrap':trial.suggest_categorical('bootstrap', [True]),
    'n_estimators': trial.suggest_int('n_estimators', 250, 500),
    'max_depth': trial.suggest_int('max_depth', 50, 75),
    'min_samples_split': trial.suggest_int('min_samples_split', 4, 6),
    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5)
    }
    
    return RandomForestRegressor(**params)
  
def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_rf(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

def objective_rf(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [49]:
from optuna import create_study

study_rf = create_study(study_name='optimization', direction='maximize')

study_rf.optimize(lambda trial: objective_en(trial, X, y), n_trials=50)

[I 2024-05-31 08:58:37,051] A new study created in memory with name: optimization
[I 2024-05-31 09:01:47,272] Trial 0 finished with value: 0.9041942334135129 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'bootstrap': True, 'n_estimators': 402, 'max_depth': 50, 'min_sample_split': 5, 'min_sample_leaf': 3}. Best is trial 0 with value: 0.9041942334135129.
[I 2024-05-31 09:05:06,098] Trial 1 finished with value: 0.9073980977362396 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'bootstrap': True, 'n_estimators': 336, 'max_depth': 69, 'min_sample_split': 6, 'min_sample_leaf': 1}. Best is trial 1 with value: 0.9073980977362396.
[I 2024-05-31 09:07:30,292] Trial 2 finished with value: 0.9076826989942189 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 260, 'max_depth': 73, 'min_sample_split': 5, 'min_sample_leaf': 1}. Best is trial 2 with value: 

[I 2024-05-31 09:27:43,062] Trial 8 finished with value: 0.9056804031622304 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'bootstrap': True, 'n_estimators': 420, 'max_depth': 57, 'min_sample_split': 5, 'min_sample_leaf': 3}. Best is trial 4 with value: 0.9097534039592309.
[I 2024-05-31 09:30:56,969] Trial 9 finished with value: 0.9093634132516474 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'bootstrap': True, 'n_estimators': 254, 'max_depth': 55, 'min_sample_split': 4, 'min_sample_leaf': 1}. Best is trial 4 with value: 0.9097534039592309.
[I 2024-05-31 09:32:39,310] Trial 10 finished with value: 0.8993885557189456 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 322, 'max_depth': 60, 'min_sample_split': 6, 'min_sample_leaf': 5}. Best is trial 4 with value: 0.9097534039592309.
[I 2024-05-31 09:34:36,582] Trial 11 finished with value: 0.9082

[I 2024-05-31 10:31:29,412] Trial 32 finished with value: 0.9094899639883157 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 327, 'max_depth': 63, 'min_sample_split': 6, 'min_sample_leaf': 1}. Best is trial 25 with value: 0.9104726627980002.
[I 2024-05-31 10:34:00,216] Trial 33 finished with value: 0.9095126002045921 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 277, 'max_depth': 67, 'min_sample_split': 5, 'min_sample_leaf': 1}. Best is trial 25 with value: 0.9104726627980002.
[I 2024-05-31 10:36:38,891] Trial 34 finished with value: 0.9102377098381838 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 305, 'max_depth': 71, 'min_sample_split': 4, 'min_sample_leaf': 2}. Best is trial 25 with value: 0.9104726627980002.
[I 2024-05-31 10:38:43,955] Trial 35 finished with value: 0.9064976064216932 and paramet

In [51]:
print(study_rf.best_value)
print(study_rf.best_params)

0.9106961840498735
{'scaling_method': 'robust', 'encoding_method': 'ordinal', 'bootstrap': True, 'n_estimators': 338, 'max_depth': 73, 'min_sample_split': 4, 'min_sample_leaf': 2}
