In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
#website help from 
#https://medium.com/@walter_sperat/using-optuna-with-sklearn-the-right-way-part-1-6b4ad0ab2451

### Instantiate function for each regression model 

In [3]:
def instantiate_ridge(trial : Trial) -> Ridge:
    params = {
        "alpha": trial.suggest_float("alpha", 1e-4, 1000, log=True)
    }

    return Ridge(**params)

def instantiate_lasso(trial : Trial) -> Lasso:
    params = {
        "alpha": trial.suggest_float("alpha", 1e-4, 1000, log=True)
    }

    return Lasso(**params)

def instantiate_en(trial : Trial) -> ElasticNet:
    params = {
        "alpha": trial.suggest_float("alpha", 1e-4, 1000, log=True),
        'l1_ratio': trial.suggest_float('l1_ratio', .00001, 1, log = True)
    }

    return ElasticNet(**params)

def instantiate_xgb(trial : Trial) -> XGBRegressor:
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "max_depth": trial.suggest_int("max_depth", 4, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 20)
    }

    return XGBRegressor(**params)

def instantiate_gbr(trial : Trial) -> GradientBoostingRegressor:
    params = {
    'n_estimators': trial.suggest_int('n_estimators', 500, 1000),
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1, log=True),
    'max_depth': trial.suggest_int ('max_depth', 2, 6),
    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    'min_samples_split': trial.suggest_int ('min_sample_split', 6, 10),
    'min_samples_leaf': trial.suggest_int ('min_sample_leaf', 1, 10)
    }
    
    return GradientBoostingRegressor(**params)

def instantiate_rf(trial : Trial) -> RandomForestRegressor:
    params = {
    'bootstrap':trial.suggest_categorical('bootstrap', [True, False]),
    'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
    'max_depth': trial.suggest_int('max_depth', 1, 100),
    'min_samples_split': trial.suggest_int('min_sample_split', 2, 10),
    'min_samples_leaf': trial.suggest_int('min_sample_leaf', 1, 10)
    }
    
    return RandomForestRegressor(**params)

def instantiate_ada(trial : Trial) -> AdaBoostRegressor:
    params = {
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1, log=True),
    'loss': trial.suggest_categorical('loss',['linear', 'square', 'exponential']),
    'n_estimators': trial.suggest_int('n_estimators', 1, 1000)
    }
    
    return AdaBoostRegressor(**params)


### Instantiate function for regression model selection

In [4]:
Classifier = (
    Ridge |
    Lasso |
    ElasticNet |
    XGBRegressor |
    GradientBoostingRegressor |
    RandomForestRegressor |
    AdaBoostRegressor
    )

def instantiate_learner(trial : Trial) -> Classifier:
    algorithm = trial.suggest_categorical(
    'algorithm', ['ridge', 'lasso', 'en', 'xgb', 'gbr', 'rf', 'ada'])
    
    if algorithm =='ridge':
        model = instantiate_ridge(trial)
    elif algorithm=='lasso':
        model = instantiate_lasso(trial)
    elif algorithm=='en':
        model = instantiate_en(trial)
    elif algorithm=='xgb':
        model = instantiate_xgb(trial)
    elif algorithm=='gbr':
        model = instantiate_gbr(trial)
    elif algorithm == 'rf':
        model = instantiate_rf(trial)
    elif algorithm == 'ada':
        model = instantiate_ada(trial)
        
    return model

### Instantiate functions for encoding categorical columns 

In [5]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_ordinal_encoder(trial: Trial)-> OrdinalEncoder:
    params = {
        'handle_unknown': "use_encoded_value", 
        'unknown_value': -1
    }
    
    return OrdinalEncoder(**params)

def instantiate_onehot_encoder(trial: Trial)-> OneHotEncoder:
    params = {
        'handle_unknown': 'ignore',
        'drop': trial.suggest_categorical('drop', [None, 'first'])
    }
    
    return OneHotEncoder(**params)
    
Encoder = (
    OrdinalEncoder |
    OneHotEncoder 
    )

def instantiate_encoder (trial : Trial) -> Encoder:
    encoding_method = trial.suggest_categorical(
        'encoding_method', ['ordinal', 'onehot'])
    if encoding_method =='ordinal':
        encoder = instantiate_ordinal_encoder(trial)
    elif encoding_method =='onehot':
        encoder = instantiate_onehot_encoder(trial)
    
    return encoder

### Instantiate functions for encoding numerical columns 

In [6]:

Scaler = (
  StandardScaler |
  MinMaxScaler |
  MaxAbsScaler |
  RobustScaler
)

def instantiate_scaler(trial : Trial) -> Scaler:
    method = trial.suggest_categorical(
    'scaling_method', ['standard', 'minmax', 'maxabs', 'robust']
    )
    if method=='standard':
        scaler = StandardScaler()
    elif method=='minmax':
        scaler = MinMaxScaler()
    elif method=='maxabs':
        scaler = MaxAbsScaler()
    elif method=='robust':
        scaler = RobustScaler()
        
    return scaler

### Instantiate function to scale and encode 

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    
    numerical_pipeline = instantiate_scaler(trial)
    categorical_pipeline = instantiate_encoder(trial)
   # numerical_pipeline = StandardScaler()
    #categorical_pipeline = OneHotEncoder(handle_unknown="ignore", drop='first')
    
#     numerical_pipeline = instantiate_numerical_pipeline(trial)
#     categorical_pipeline = instantiate_categorical_pipeline(trial)
    
    processor = ColumnTransformer([
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
        ('categorical_pipeline', categorical_pipeline, categorical_columns)
    ])
    return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_learner(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

In [8]:
def objective(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [9]:
df = pd.read_csv('df_normal_quality.csv', index_col=0)
y = df.SalePrice
X = df.drop(['PID', 'SalePrice'], axis =1).copy()

In [10]:
from optuna import create_study

study = create_study(study_name='optimization', direction='maximize')

[I 2024-07-08 09:50:33,536] A new study created in memory with name: optimization


In [12]:
study.optimize(lambda trial: objective(trial, X, y), n_trials=200)

[I 2024-07-08 09:50:57,985] Trial 0 finished with value: -0.029441307066881618 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 1.596299953148117}. Best is trial 0 with value: -0.029441307066881618.
[I 2024-07-08 09:52:42,088] Trial 1 finished with value: 0.8393679950758675 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'algorithm': 'rf', 'bootstrap': False, 'n_estimators': 246, 'max_depth': 64, 'min_sample_split': 3, 'min_sample_leaf': 10}. Best is trial 1 with value: 0.8393679950758675.
[I 2024-07-08 09:53:11,436] Trial 2 finished with value: 0.8641449377737869 and parameters: {'scaling_method': 'standard', 'encoding_method': 'ordinal', 'algorithm': 'ada', 'learning_rate': 0.17448906672786785, 'loss': 'linear', 'n_estimators': 279}. Best is trial 2 with value: 0.8641449377737869.
[I 2024-07-08 09:53:40,784] Trial 3 finished with value: 0.8946663651310004 and parameters: {'scaling_me

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 09:56:54,429] Trial 13 finished with value: 0.9470588749996617 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 0.00011010104040317402, 'l1_ratio': 0.0001455025676712381}. Best is trial 13 with value: 0.9470588749996617.
[I 2024-07-08 09:56:54,805] Trial 14 finished with value: -0.025586122789539933 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 199.93883658061745, 'l1_ratio': 7.950477271767913e-05}. Best is trial 13 with value: 0.9470588749996617.
[I 2024-07-08 09:56:56,103] Trial 15 finished with value: 0.9506146499925329 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 0.005798418815720318, 'l1_ratio': 0.03787907308950946}. Best is trial 15 wi

[I 2024-07-08 09:58:22,916] Trial 27 finished with value: 0.1900980953448697 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.15243420261017115, 'l1_ratio': 0.31606174092095934}. Best is trial 15 with value: 0.9506146499925329.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 09:58:25,815] Trial 28 finished with value: 0.9489120573469423 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.0008608564486929174, 'l1_ratio': 0.02884428444223919}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 09:58:26,359] Trial 29 finished with value: -0.029441307066881618 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm':

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 10:00:30,764] Trial 31 finished with value: 0.9498560012919262 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.0008420495381024598, 'l1_ratio': 0.08792956359810299}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:03:59,158] Trial 32 finished with value: 0.8882791010176795 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'rf', 'bootstrap': True, 'n_estimators': 577, 'max_depth': 92, 'min_sample_split': 10, 'min_sample_leaf': 10}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:03:59,667] Trial 33 finished with value: 0.7368214868842125 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.05566575548888979, 'l1_ratio': 0.11820072246312327}. Best is trial 15 wit

[I 2024-07-08 10:06:44,164] Trial 37 finished with value: 0.745408937325795 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'ada', 'learning_rate': 0.0005601672835746196, 'loss': 'square', 'n_estimators': 405}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:06:44,614] Trial 38 finished with value: 0.941864120697199 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'lasso', 'alpha': 0.0005793817977169322}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:06:45,157] Trial 39 finished with value: 0.9411548656561404 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.006359993639482432, 'l1_ratio': 0.6251098987933636}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:06:45,495] Trial 40 finished with value: 0.9407217253975555 and parameters: {'scaling_method': 'standard', 'en

[I 2024-07-08 10:20:08,642] Trial 49 finished with value: 0.9118215291243491 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'gbr', 'n_estimators': 832, 'learning_rate': 0.0044836116972927385, 'max_depth': 3, 'subsample': 0.8138941342231104, 'min_sample_split': 9, 'min_sample_leaf': 7}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:09,193] Trial 50 finished with value: 0.9458266337548127 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 0.004772481935016255}. Best is trial 15 with value: 0.9506146499925329.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 10:20:11,865] Trial 51 finished with value: 0.9495981699591987 and parameters: {'scaling_method': 'robust', 'e

[I 2024-07-08 10:20:14,176] Trial 54 finished with value: 0.9499162969416757 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.0003685855154742009, 'l1_ratio': 0.25721071195017775}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:15,233] Trial 55 finished with value: 0.9499820008422049 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.00032540205918819246, 'l1_ratio': 0.3145147537278743}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:16,050] Trial 56 finished with value: 0.9502817747063315 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.0002861929856396532, 'l1_ratio': 0.46854683593831487}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:16,550] Trial 57 finished with value: 0.9431483448027016 and parameters: {

[I 2024-07-08 10:20:19,617] Trial 61 finished with value: 0.9504463386027325 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.0001750252331418698}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:20,873] Trial 62 finished with value: 0.9504214647535318 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.00014501287344099588}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:22,189] Trial 63 finished with value: 0.9504077370356221 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.00011849493864022492}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:23,275] Trial 64 finished with value: 0.9504550644937725 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': '

[I 2024-07-08 10:20:25,478] Trial 67 finished with value: 0.9484703228321049 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.0004973765104169647}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:25,825] Trial 68 finished with value: -0.029441307066881618 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 151.18896040578736}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:26,251] Trial 69 finished with value: 0.7400320378975258 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.012563827493497461}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:20:27,547] Trial 70 finished with value: 0.950520595111418 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'a

[I 2024-07-08 10:23:05,240] Trial 84 finished with value: 0.9504065509720707 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030881824809650106}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:23:05,959] Trial 85 finished with value: 0.9502085460302702 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00022570217272519397}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:23:06,413] Trial 86 finished with value: 0.9395212103576183 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0013428291937205817}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:23:06,947] Trial 87 finished with value: 0.945817700514205 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha

[I 2024-07-08 10:30:44,562] Trial 96 finished with value: 0.9500112102570567 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.00011296409806943494}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:30:46,257] Trial 97 finished with value: 0.9481262698391015 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 0.0007729618718592502, 'l1_ratio': 0.0011891018868038594}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:39,812] Trial 98 finished with value: 0.757303529912053 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'ada', 'learning_rate': 0.002133530766129601, 'loss': 'linear', 'n_estimators': 343}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:40,570] Trial 99 finished with value: 0.9495704757450211 and parameters: {'scaling_method': 'm

[I 2024-07-08 10:31:44,158] Trial 103 finished with value: 0.9499559033610787 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.0003399585080097176}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:44,592] Trial 104 finished with value: 0.8431383302498437 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.007521777529803439}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:45,146] Trial 105 finished with value: 0.9436889041904181 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.000907603113352332}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:46,022] Trial 106 finished with value: 0.9412812829172692 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'en', 'alpha': 0.000

[I 2024-07-08 10:31:50,841] Trial 110 finished with value: 0.9044751891765352 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.0038622861855030256}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:51,975] Trial 111 finished with value: 0.9504475645861536 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.00017600999519226733}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:52,966] Trial 112 finished with value: 0.9504299049727469 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.00024830550488678997}. Best is trial 72 with value: 0.950617005001886.
[I 2024-07-08 10:31:54,176] Trial 113 finished with value: 0.9504161842819692 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 

[I 2024-07-08 10:31:59,044] Trial 116 finished with value: 0.9508115794180636 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003112943623901131}. Best is trial 116 with value: 0.9508115794180636.
[I 2024-07-08 10:33:05,335] Trial 117 finished with value: 0.9405026432935989 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'gbr', 'n_estimators': 902, 'learning_rate': 0.34044036009965684, 'max_depth': 2, 'subsample': 0.6914912006901939, 'min_sample_split': 8, 'min_sample_leaf': 8}. Best is trial 116 with value: 0.9508115794180636.
[I 2024-07-08 10:33:08,292] Trial 118 finished with value: -6.667616802855942 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'xgb', 'n_estimators': 218, 'learning_rate': 0.0012998394824280223, 'max_depth': 6, 'subsample': 0.5730586373436786, 'colsample_bytree': 0.9841607126022083, 'mi

[I 2024-07-08 10:33:23,839] Trial 139 finished with value: 0.9508120418340444 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030406267928934067}. Best is trial 138 with value: 0.9508125334796595.
[I 2024-07-08 10:33:24,458] Trial 140 finished with value: 0.9500624585576197 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0006673544124237484}. Best is trial 138 with value: 0.9508125334796595.
[I 2024-07-08 10:33:25,212] Trial 141 finished with value: 0.9508124744936358 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030519412063227956}. Best is trial 138 with value: 0.9508125334796595.
[I 2024-07-08 10:33:26,004] Trial 142 finished with value: 0.9508008348893316 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso

[I 2024-07-08 10:41:20,971] Trial 169 finished with value: 0.9508080321678772 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002921277153152075}. Best is trial 138 with value: 0.9508125334796595.
[I 2024-07-08 10:41:22,091] Trial 170 finished with value: 0.9501689281680938 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00013598564982807433}. Best is trial 138 with value: 0.9508125334796595.
[I 2024-07-08 10:41:22,884] Trial 171 finished with value: 0.9508070583226983 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002908242205133235}. Best is trial 138 with value: 0.9508125334796595.
[I 2024-07-08 10:41:23,566] Trial 172 finished with value: 0.9505064311789182 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso'

[I 2024-07-08 10:46:07,440] Trial 198 finished with value: 0.9501547132194716 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0006402216747315148}. Best is trial 192 with value: 0.9508125685081981.
[I 2024-07-08 10:46:08,299] Trial 199 finished with value: 0.9508003810858396 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002842214495409678}. Best is trial 192 with value: 0.9508125685081981.


In [13]:
study.best_trial

FrozenTrial(number=192, state=1, values=[0.9508125685081981], datetime_start=datetime.datetime(2024, 7, 8, 10, 46, 1, 546028), datetime_complete=datetime.datetime(2024, 7, 8, 10, 46, 2, 365898), params={'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.000305954350488747}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'scaling_method': CategoricalDistribution(choices=('standard', 'minmax', 'maxabs', 'robust')), 'encoding_method': CategoricalDistribution(choices=('ordinal', 'onehot')), 'drop': CategoricalDistribution(choices=(None, 'first')), 'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'en', 'xgb', 'gbr', 'rf', 'ada')), 'alpha': FloatDistribution(high=1000.0, log=True, low=0.0001, step=None)}, trial_id=192, value=None)

In [14]:
trial_number = []
score = []
algorithm = []
parameters = []
optuna_dict = {}
trials = range(200)

for trial in trials:
    trial_number.append(study.get_trials()[trial].number)
    score.append(study.get_trials()[trial].value)
    algorithm.append(study.get_trials()[trial].params['algorithm'])
    parameters.append(list(study.get_trials()[trial].params.items()))

optuna_dict['Trial'] = trial_number
optuna_dict['Score'] = score
optuna_dict['Algorithm'] = algorithm
optuna_dict['Parameters'] = parameters


df_optuna = pd.DataFrame.from_dict(optuna_dict)

In [15]:
df_optuna

Unnamed: 0,Trial,Score,Algorithm,Parameters
0,0,-0.029441,lasso,"[(scaling_method, standard), (encoding_method,..."
1,1,0.839368,rf,"[(scaling_method, maxabs), (encoding_method, o..."
2,2,0.864145,ada,"[(scaling_method, standard), (encoding_method,..."
3,3,0.894666,rf,"[(scaling_method, maxabs), (encoding_method, o..."
4,4,0.854980,ada,"[(scaling_method, standard), (encoding_method,..."
...,...,...,...,...
195,195,0.950081,lasso,"[(scaling_method, robust), (encoding_method, o..."
196,196,0.950811,lasso,"[(scaling_method, robust), (encoding_method, o..."
197,197,0.950466,lasso,"[(scaling_method, robust), (encoding_method, o..."
198,198,0.950155,lasso,"[(scaling_method, robust), (encoding_method, o..."


In [26]:
df_optuna.to_csv('optuna_all_200.csv')

In [16]:
idx = df_optuna.groupby('Algorithm')['Score'].transform(max) == df_optuna['Score']
df_optuna[idx]

Unnamed: 0,Trial,Score,Algorithm,Parameters
15,15,0.950615,en,"[(scaling_method, robust), (encoding_method, o..."
34,34,0.873818,ada,"[(scaling_method, robust), (encoding_method, o..."
50,50,0.945827,ridge,"[(scaling_method, robust), (encoding_method, o..."
77,77,0.946084,xgb,"[(scaling_method, standard), (encoding_method,..."
95,95,0.907803,rf,"[(scaling_method, minmax), (encoding_method, o..."
176,176,0.947324,gbr,"[(scaling_method, robust), (encoding_method, o..."
192,192,0.950813,lasso,"[(scaling_method, robust), (encoding_method, o..."


In [17]:
df_optuna.Algorithm.value_counts()

Algorithm
lasso    118
en        38
rf         9
ada        9
xgb        9
ridge      9
gbr        8
Name: count, dtype: int64

In [37]:
import plotly.express as px


fig = px.scatter(df_optuna.loc[df_optuna.Score > .7], 
                 x="Trial", 
                 y="Score", 
                 color="Algorithm",
                 hover_data=['Parameters'],
                labels = {'Trial': 'Optuna Trial', 'Score': 'Score (R-Squared)'},
                title = 'Optuna Trials with R-Squared over 0.7  <br><sup> Best Score: 0.9508, Trial 192, alpha = 0.000305954350488747, scaling = robust, one hot encoding (no drop) </sup>')

fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=8,
        font_family="Rockwell"
    )
)

fig.show()

fig.write_image("images/optuna_all.png")