In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
# Module-level 5-fold splitter (shuffled, fixed seed).
# NOTE(review): objective() builds its own KFold, so this global may be unused.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
# Approach adapted from the article:
#https://medium.com/@walter_sperat/using-optuna-with-sklearn-the-right-way-part-1-6b4ad0ab2451

### Instantiate function for each regression model 

In [10]:
def instantiate_ridge(trial : Trial) -> Ridge:
    """Build a Ridge regressor whose penalty strength is chosen by the trial.

    Args:
        trial: Optuna trial used to sample the hyperparameter.

    Returns:
        A Ridge estimator with ``alpha`` sampled log-uniformly from
        [1e-4, 1000] under the Optuna parameter name "alpha".
    """
    penalty = trial.suggest_float("alpha", 1e-4, 1000, log=True)
    return Ridge(alpha=penalty)

def instantiate_lasso(trial : Trial) -> Lasso:
    """Build a Lasso regressor whose penalty strength is chosen by the trial.

    Args:
        trial: Optuna trial used to sample the hyperparameter.

    Returns:
        A Lasso estimator with ``alpha`` sampled log-uniformly from
        [1e-4, 1000] under the Optuna parameter name "alpha".
    """
    return Lasso(
        alpha=trial.suggest_float("alpha", 1e-4, 1000, log=True),
    )

def instantiate_en(trial : Trial) -> ElasticNet:
    """Build an ElasticNet regressor from trial-sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An ElasticNet estimator with ``alpha`` sampled log-uniformly from
        [1e-4, 1000] and ``l1_ratio`` sampled log-uniformly from [1e-5, 1].
    """
    # Sampling order is kept (alpha first, then l1_ratio) so the order of
    # parameters recorded on the trial does not change.
    penalty = trial.suggest_float("alpha", 1e-4, 1000, log=True)
    l1_share = trial.suggest_float('l1_ratio', .00001, 1, log = True)
    return ElasticNet(alpha=penalty, l1_ratio=l1_share)

def instantiate_xgb(trial : Trial) -> XGBRegressor:
    """Build an XGBRegressor from trial-sampled hyperparameters.

    Sampled values (Optuna parameter name -> range):
        n_estimators      -> int in [100, 1000]
        learning_rate     -> float in [1e-4, 1], log scale
        max_depth         -> int in [4, 8]
        subsample         -> float in [0.5, 1.0]
        colsample_bytree  -> float in [0.5, 1.0]
        min_child_weight  -> int in [10, 20]

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted XGBRegressor configured with the sampled values.
    """
    # Each value is drawn in the same order as the keys are listed above.
    n_trees = trial.suggest_int('n_estimators', 100, 1000)
    step_size = trial.suggest_float("learning_rate", 1e-4, 1, log=True)
    tree_depth = trial.suggest_int("max_depth", 4, 8)
    row_fraction = trial.suggest_float("subsample", 0.5, 1.0)
    column_fraction = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    child_weight = trial.suggest_int("min_child_weight", 10, 20)

    return XGBRegressor(
        n_estimators=n_trees,
        learning_rate=step_size,
        max_depth=tree_depth,
        subsample=row_fraction,
        colsample_bytree=column_fraction,
        min_child_weight=child_weight,
    )

def instantiate_gbr(trial : Trial) -> GradientBoostingRegressor:
    """Build a GradientBoostingRegressor from trial-sampled hyperparameters.

    Sampled values (Optuna parameter name -> range):
        n_estimators      -> int in [500, 1000]
        learning_rate     -> float in [1e-4, 1], log scale
        max_depth         -> int in [2, 6]
        subsample         -> float in [0.5, 1.0]
        min_sample_split  -> int in [6, 10]  (passed as ``min_samples_split``)
        min_sample_leaf   -> int in [1, 10]  (passed as ``min_samples_leaf``)

    NOTE(review): the Optuna names 'min_sample_split' / 'min_sample_leaf'
    omit the 's' of the estimator keywords they feed. They are kept as-is
    because renaming would change the keys recorded in the study results.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted GradientBoostingRegressor configured with the sampled values.
    """
    n_stages = trial.suggest_int('n_estimators', 500, 1000)
    step_size = trial.suggest_float('learning_rate', 1e-4, 1, log=True)
    tree_depth = trial.suggest_int ('max_depth', 2, 6)
    row_fraction = trial.suggest_float("subsample", 0.5, 1.0)
    split_floor = trial.suggest_int ('min_sample_split', 6, 10)
    leaf_floor = trial.suggest_int ('min_sample_leaf', 1, 10)

    return GradientBoostingRegressor(
        n_estimators=n_stages,
        learning_rate=step_size,
        max_depth=tree_depth,
        subsample=row_fraction,
        min_samples_split=split_floor,
        min_samples_leaf=leaf_floor,
    )

def instantiate_rf(trial : Trial) -> RandomForestRegressor:
    """Build a RandomForestRegressor from trial-sampled hyperparameters.

    Sampled values (Optuna parameter name -> range):
        bootstrap         -> True or False
        n_estimators      -> int in [10, 1000]
        max_depth         -> int in [1, 100]
        min_sample_split  -> int in [2, 10]  (passed as ``min_samples_split``)
        min_sample_leaf   -> int in [1, 10]  (passed as ``min_samples_leaf``)

    NOTE(review): the Optuna names 'min_sample_split' / 'min_sample_leaf'
    omit the 's' of the estimator keywords they feed. They are kept as-is
    because renaming would change the keys recorded in the study results.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted RandomForestRegressor configured with the sampled values.
    """
    use_bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    n_trees = trial.suggest_int('n_estimators', 10, 1000)
    tree_depth = trial.suggest_int('max_depth', 1, 100)
    split_floor = trial.suggest_int('min_sample_split', 2, 10)
    leaf_floor = trial.suggest_int('min_sample_leaf', 1, 10)

    return RandomForestRegressor(
        bootstrap=use_bootstrap,
        n_estimators=n_trees,
        max_depth=tree_depth,
        min_samples_split=split_floor,
        min_samples_leaf=leaf_floor,
    )

def instantiate_ada(trial : Trial) -> AdaBoostRegressor:
    """Build an AdaBoostRegressor from trial-sampled hyperparameters.

    Sampled values (Optuna parameter name -> range):
        learning_rate -> float in [1e-4, 1], log scale
        loss          -> one of 'linear', 'square', 'exponential'
        n_estimators  -> int in [1, 1000]

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        An unfitted AdaBoostRegressor configured with the sampled values.
    """
    step_size = trial.suggest_float('learning_rate', 1e-4, 1, log=True)
    loss_kind = trial.suggest_categorical('loss',['linear', 'square', 'exponential'])
    n_rounds = trial.suggest_int('n_estimators', 1, 1000)

    return AdaBoostRegressor(
        learning_rate=step_size,
        loss=loss_kind,
        n_estimators=n_rounds,
    )


### Instantiate function for regression model selection

In [3]:
# Union of every estimator type instantiate_learner can return.
# NOTE(review): despite the alias name, every member is a regression estimator.
Classifier = (
    Ridge | Lasso | ElasticNet | XGBRegressor
    | GradientBoostingRegressor | RandomForestRegressor | AdaBoostRegressor
)

def instantiate_learner(trial : Trial) -> Classifier:
    """Let the trial pick an algorithm, then build that estimator.

    The choice is recorded under the Optuna parameter name 'algorithm'; the
    matching ``instantiate_*`` function then samples that algorithm's own
    hyperparameters from the same trial.

    Args:
        trial: Optuna trial used to sample the algorithm and its hyperparameters.

    Returns:
        The unfitted estimator built for the selected algorithm.
    """
    # Insertion order doubles as the order of the categorical choices.
    builders = {
        'ridge': instantiate_ridge,
        'lasso': instantiate_lasso,
        'en': instantiate_en,
        'xgb': instantiate_xgb,
        'gbr': instantiate_gbr,
        'rf': instantiate_rf,
        'ada': instantiate_ada,
    }

    algorithm = trial.suggest_categorical('algorithm', list(builders))
    return builders[algorithm](trial)

### Instantiate functions for encoding categorical columns 

In [4]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_ordinal_encoder(trial: Trial)-> OrdinalEncoder:
    """Build an OrdinalEncoder with a fixed configuration.

    Nothing is sampled from ``trial``; it is accepted so the function has the
    same call shape as the other encoder builder.

    Args:
        trial: Optuna trial (unused).

    Returns:
        An OrdinalEncoder created with ``handle_unknown="use_encoded_value"``
        and ``unknown_value=-1``.
    """
    return OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )

def instantiate_onehot_encoder(trial: Trial)-> OneHotEncoder:
    """Build a OneHotEncoder whose ``drop`` setting is chosen by the trial.

    Args:
        trial: Optuna trial; samples 'drop' from ``[None, 'first']``.

    Returns:
        A OneHotEncoder created with ``handle_unknown='ignore'`` and the
        sampled ``drop`` value.
    """
    drop_choice = trial.suggest_categorical('drop', [None, 'first'])
    return OneHotEncoder(handle_unknown='ignore', drop=drop_choice)
    
Encoder = (
    OrdinalEncoder |
    OneHotEncoder 
    )

def instantiate_encoder (trial : Trial) -> Encoder:
    encoding_method = trial.suggest_categorical(
        'encoding_method', ['ordinal', 'onehot'])
    if encoding_method =='ordinal':
        encoder = instantiate_ordinal_encoder(trial)
    elif encoding_method =='onehot':
        encoder = instantiate_onehot_encoder(trial)
    
    return encoder

### Instantiate functions for scaling numerical columns 

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

# Union of the scaler types instantiate_scaler can return.
Scaler = StandardScaler | MinMaxScaler | MaxAbsScaler | RobustScaler

def instantiate_scaler(trial : Trial) -> Scaler:
    """Let the trial choose which scaler to apply to numerical columns.

    The choice is recorded under the Optuna parameter name 'scaling_method'.
    The selected scaler is created with its default constructor arguments.

    Args:
        trial: Optuna trial used to sample the scaling method.

    Returns:
        A new, unfitted instance of the selected scaler class.
    """
    # Insertion order doubles as the order of the categorical choices.
    scaler_classes = {
        'standard': StandardScaler,
        'minmax': MinMaxScaler,
        'maxabs': MaxAbsScaler,
        'robust': RobustScaler,
    }

    method = trial.suggest_categorical('scaling_method', list(scaler_classes))
    return scaler_classes[method]()

### Instantiate function to scale and encode 

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial,
                          numerical_columns : list[str],
                          categorical_columns : list[str]) -> ColumnTransformer:
    """Assemble the preprocessing step: scale numeric columns, encode categorical ones.

    The scaler is sampled first and the encoder second, both from ``trial``.
    No ``remainder`` argument is passed, so ColumnTransformer's default
    handling applies to any column not listed in either group.

    Args:
        trial: Optuna trial used to sample the scaler and the encoder.
        numerical_columns: Names of the columns routed to the scaler.
        categorical_columns: Names of the columns routed to the encoder.

    Returns:
        An unfitted ColumnTransformer with the steps 'numerical_pipeline'
        and 'categorical_pipeline'.
    """
    scaler = instantiate_scaler(trial)
    encoder = instantiate_encoder(trial)

    steps = [
        ('numerical_pipeline', scaler, numerical_columns),
        ('categorical_pipeline', encoder, categorical_columns),
    ]
    return ColumnTransformer(steps)

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> compose.TransformedTargetRegressor:
    """Build the full trial model: preprocessing + learner, fitted on a log target.

    The preprocessing step is sampled before the learner, so the trial records
    scaler/encoder parameters first and the algorithm parameters after them.

    Args:
        trial: Optuna trial used to sample every component.
        numerical_columns: Names of the columns routed to the scaler.
        categorical_columns: Names of the columns routed to the encoder.

    Returns:
        A TransformedTargetRegressor wrapping a Pipeline with the steps
        'processor' and 'model'. (The annotation used to say ``Pipeline``,
        which did not match the object actually returned.)
    """
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )

    learner = instantiate_learner(trial)

    model_pipe = Pipeline([
        ('processor', processor),
        ('model', learner)
    ])

    # The wrapped pipeline is trained on log(y); predictions are mapped back
    # with exp. NOTE(review): np.log assumes a strictly positive target.
    model = compose.TransformedTargetRegressor(
        regressor=model_pipe,
        func=np.log,
        inverse_func=np.exp,
    )

    return model

In [19]:
def objective(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    """Score one trial's model with 5-fold cross-validated R^2.

    Args:
        trial: Optuna trial used to sample the model configuration.
        X: Feature frame.
        y: Target values.
        numerical_columns: Columns to scale. When None, every column whose
            dtype is neither 'object' nor 'category' is used.
        categorical_columns: Columns to encode. When None, every column whose
            dtype is 'object' or 'category' is used.
        random_state: Seed for the shuffled KFold split.

    Returns:
        The smaller of the mean and the median of the five fold scores.
    """
    text_like_dtypes = ['object', 'category']

    if numerical_columns is None:
        numerical_columns = list(
            X.select_dtypes(exclude=text_like_dtypes).columns
        )

    if categorical_columns is None:
        categorical_columns = list(
            X.select_dtypes(include=text_like_dtypes).columns
        )

    estimator = instantiate_model(trial, numerical_columns, categorical_columns)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(
        estimator, X, y, scoring=make_scorer(r2_score), cv=folds
    )

    # Take the more pessimistic of the two summaries of the fold scores.
    mean_score = np.mean(fold_scores)
    median_score = np.median(fold_scores)
    return np.min([mean_score, median_score])

In [20]:
# Load the prepared data set; the first CSV column becomes the index.
df = pd.read_csv('df_normal_quality.csv', index_col=0)

# Target is SalePrice; features are everything except PID and the target itself.
y = df['SalePrice']
X = df.drop(columns=['PID', 'SalePrice']).copy()

In [21]:
from optuna import create_study

# Seed the sampler so the sequence of suggested configurations can be
# reproduced on a re-run.
# NOTE(review): estimators built without a random_state still add run-to-run
# noise to the scores, so later suggestions may differ between runs.
study = create_study(
    study_name='optimization',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2024-05-31 07:26:05,472] A new study created in memory with name: optimization


In [22]:
def _run_trial(trial):
    """Adapter for Optuna: evaluate one trial on the full (X, y) data set."""
    return objective(trial, X, y)

# Long-running cell: 200 trials, each one a 5-fold cross-validation.
study.optimize(_run_trial, n_trials=200)

[I 2024-05-31 07:27:14,676] Trial 0 finished with value: 0.8534689230641774 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 767, 'learning_rate': 0.0028366153642322566, 'max_depth': 3, 'subsample': 0.52779532452511, 'min_sample_split': 6, 'min_sample_leaf': 2}. Best is trial 0 with value: 0.8534689230641774.
[I 2024-05-31 07:27:17,530] Trial 1 finished with value: -6.669552756007664 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'xgb', 'n_estimators': 382, 'learning_rate': 0.0001701062406931514, 'max_depth': 5, 'subsample': 0.7974799759343989, 'colsample_bytree': 0.529374079799673, 'min_child_weight': 14}. Best is trial 0 with value: 0.8534689230641774.
[I 2024-05-31 07:27:23,014] Trial 2 finished with value: 0.8983485182718915 and parameters: {'scaling_method': 'standard', 'encoding_method': 'ordinal', 'algorithm': 'xgb', 'n_estimators': 357, 'learning_rate': 0.6707334874038324, 'ma


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-31 07:32:44,409] Trial 12 finished with value: 0.9026055886209007 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'rf', 'bootstrap': True, 'n_estimators': 112, 'max_depth': 100, 'min_sample_split': 10, 'min_sample_leaf': 2}. Best is trial 12 with value: 0.9026055886209007.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. Thes

[I 2024-05-31 07:34:59,074] Trial 23 finished with value: 0.577040401256889 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 957.5689866973827}. Best is trial 22 with value: 0.9458788067499121.
[I 2024-05-31 07:35:57,373] Trial 24 finished with value: 0.9453253751550055 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'gbr', 'n_estimators': 1000, 'learning_rate': 0.03744695920657587, 'max_depth': 2, 'subsample': 0.5091479863132224, 'min_sample_split': 6, 'min_sample_leaf': 10}. Best is trial 22 with value: 0.9458788067499121.
[I 2024-05-31 07:35:57,790] Trial 25 finished with value: 0.8902797960369311 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 0.00764193776184547}. Best is trial 22 with value: 0.9458788067499121.
[I 2024-05-31 07:35:58,191] Trial 26 finished with value: 0.9457794732062131 and p

[I 2024-05-31 07:45:49,961] Trial 51 finished with value: 0.949669214435516 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 25.752073839163717}. Best is trial 42 with value: 0.9499522348640677.
[I 2024-05-31 07:45:50,396] Trial 52 finished with value: 0.9496810295998003 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 24.11339626123322}. Best is trial 42 with value: 0.9499522348640677.
[I 2024-05-31 07:45:50,874] Trial 53 finished with value: 0.9495325341174727 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 11.284535868983939}. Best is trial 42 with value: 0.9499522348640677.
[I 2024-05-31 07:45:51,375] Trial 54 finished with value: 0.9477255936749225 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 

[I 2024-05-31 07:54:30,268] Trial 69 finished with value: -0.029441307066881618 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 80.96550536271954}. Best is trial 42 with value: 0.9499522348640677.
[I 2024-05-31 07:54:30,674] Trial 70 finished with value: 0.9479346630157572 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 1.542381419829497}. Best is trial 42 with value: 0.9499522348640677.
[I 2024-05-31 07:54:31,081] Trial 71 finished with value: 0.9496832036071915 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 23.73258423726294}. Best is trial 42 with value: 0.9499522348640677.
[I 2024-05-31 07:54:31,449] Trial 72 finished with value: 0.9496912164320612 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha'

[I 2024-05-31 08:04:12,454] Trial 95 finished with value: 0.95001628056841 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 17.8492450268799}. Best is trial 95 with value: 0.95001628056841.
[I 2024-05-31 08:04:12,853] Trial 96 finished with value: -0.027638635748393627 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'lasso', 'alpha': 2.9979996842440513}. Best is trial 95 with value: 0.95001628056841.
[I 2024-05-31 08:04:27,779] Trial 97 finished with value: 0.8988411428005779 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'xgb', 'n_estimators': 587, 'learning_rate': 0.009612827769989538, 'max_depth': 5, 'subsample': 0.7086472207807635, 'colsample_bytree': 0.8522694718270836, 'min_child_weight': 15}. Best is trial 95 with value: 0.95001628056841.
[I 2024-05-31 08:04:28,549] Trial 98 finished with value: 0.9499245295139204 and


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-31 08:12:44,642] Trial 120 finished with value: -0.029441307066881618 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 37.98733074610962}. Best is trial 95 with value: 0.95001628056841.
[I 2024-05-31 08:12:45,176] Trial 121 finished with value: 0.949985448893939 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 29.95922709367116}. Best is trial 95 with value: 0.950016280568


Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-31 08:15:20,335] Trial 147 finished with value: 0.9499147059212328 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'ridge', 'alpha': 20.724835853224008}. Best is trial 145 with value: 0.9500472965273005.
[I 2024-05-31 08:18:02,959] Trial 148 finished with value: 0.858382944545087 and parameters: {'scaling_method': 'robust', 'encoding_method':

[I 2024-05-31 08:19:01,303] Trial 173 finished with value: 0.9500483479389941 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 22.296786063651155}. Best is trial 173 with value: 0.9500483479389941.
[I 2024-05-31 08:19:01,794] Trial 174 finished with value: 0.9494107831944859 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 51.99051262413413}. Best is trial 173 with value: 0.9500483479389941.
[I 2024-05-31 08:19:02,260] Trial 175 finished with value: 0.9500477326155489 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 21.715934137032452}. Best is trial 173 with value: 0.9500483479389941.
[I 2024-05-31 08:19:02,665] Trial 176 finished with value: 0.9297935101376275 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 

[I 2024-05-31 08:23:46,788] Trial 199 finished with value: 0.7832808668840541 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'ada', 'learning_rate': 0.12567447772017895, 'loss': 'square', 'n_estimators': 21}. Best is trial 173 with value: 0.9500483479389941.


In [12]:
# Best trial recorded by the study.
# NOTE(review): the saved output below (trial number 19, dated 2024-05-29,
# execution count 12) does not match the optimization log above, which ends
# with trial 173 as best. It appears to come from an earlier run; re-run this
# cell to refresh it.
study.best_trial

FrozenTrial(number=19, state=1, values=[0.9498253167672198], datetime_start=datetime.datetime(2024, 5, 29, 14, 21, 30, 900293), datetime_complete=datetime.datetime(2024, 5, 29, 14, 22, 19, 688551), params={'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'gbr', 'n_estimators': 776, 'learning_rate': 0.050062717855334275, 'max_depth': 3, 'subsample': 0.8684466240320046, 'min_sample_split': 7, 'min_sample_leaf': 8}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'scaling_method': CategoricalDistribution(choices=('standard', 'minmax', 'maxabs', 'robust')), 'encoding_method': CategoricalDistribution(choices=('ordinal', 'onehot')), 'drop': CategoricalDistribution(choices=(None, 'first')), 'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'en', 'xgb', 'gbr', 'rf', 'ada')), 'n_estimators': IntDistribution(high=1000, log=False, low=500, step=1), 'learning_rate': FloatDistribution(high=1.0, log=True, low=0.0001, step=N

In [23]:
# Read the trial list once. Study.get_trials() deep-copies every trial by
# default, and the previous loop called it four times per trial; deepcopy=False
# is safe here because the trials are only read, never modified.
all_trials = study.get_trials(deepcopy=False)

# Sized from the study itself instead of a hard-coded 200, so the table stays
# correct if the number of trials changes.
trials = range(len(all_trials))

trial_number = [t.number for t in all_trials]
score = [t.value for t in all_trials]
# .get() keeps the table buildable for a trial that ended before 'algorithm'
# was suggested (its Algorithm cell is then None).
algorithm = [t.params.get('algorithm') for t in all_trials]
parameters = [list(t.params.items()) for t in all_trials]

# One row per trial: number, objective value, algorithm, full parameter list.
optuna_dict = {
    'Trial': trial_number,
    'Score': score,
    'Algorithm': algorithm,
    'Parameters': parameters,
}

df_optuna = pd.DataFrame.from_dict(optuna_dict)

In [24]:
# Per algorithm, keep the row(s) whose score equals that algorithm's best score.
# The string alias 'max' is used instead of the builtin max: recent pandas
# versions emit a FutureWarning when a builtin is passed to transform().
best_score_per_algorithm = df_optuna.groupby('Algorithm')['Score'].transform('max')
idx = df_optuna['Score'] == best_score_per_algorithm
df_optuna[idx]

Unnamed: 0,Trial,Score,Algorithm,Parameters
3,3,0.391197,lasso,"[(scaling_method, standard), (encoding_method,..."
20,20,0.876856,ada,"[(scaling_method, robust), (encoding_method, o..."
24,24,0.945325,gbr,"[(scaling_method, maxabs), (encoding_method, o..."
25,25,0.89028,en,"[(scaling_method, maxabs), (encoding_method, o..."
32,32,0.944913,xgb,"[(scaling_method, robust), (encoding_method, o..."
173,173,0.950048,ridge,"[(scaling_method, robust), (encoding_method, o..."
179,179,0.908172,rf,"[(scaling_method, robust), (encoding_method, o..."


In [191]:
# Dump (row label, parameter list) pairs for every trial.
# NOTE(review): the saved output below has execution count 191 and lists
# trial 0 as lasso, while the optimization log above shows trial 0 as gbr.
# It appears to come from a different run; re-run to refresh.
list(df_optuna['Parameters'].items())

[(0,
  [('scaling_method', 'robust'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'lasso'),
   ('alpha', 485.91494069511094)]),
 (1,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'gbr'),
   ('n_estimators', 436),
   ('learning_rate', 0.0012861054183096832),
   ('max_depth', 8),
   ('min_sample_split', 7)]),
 (2,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'ordinal'),
   ('algorithm', 'ridge'),
   ('alpha', 2.9564732483382494)]),
 (3,
  [('scaling_method', 'maxabs'),
   ('encoding_method', 'ordinal'),
   ('algorithm', 'ridge'),
   ('alpha', 17.622180822136105)]),
 (4,
  [('scaling_method', 'standard'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'ridge'),
   ('alpha', 11.016938443949593)]),
 (5,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'onehot'),
   ('drop', 'first'),
   ('algorithm', 'ridge'),
   ('alpha', 72.72185663024044)]),
 (6,
  [('scaling_method

In [193]:
# All lasso trials.
# NOTE(review): the saved output below (execution count 193) shows lasso
# scores up to about 0.95, while the best-per-algorithm table above lists
# 0.391197 as the best lasso score. It appears to come from a different run.
df_optuna.loc[df_optuna['Algorithm'] == 'lasso']

Unnamed: 0,Trial,Score,Algorithm,Parameters
0,0,-0.029441,lasso,"[(scaling_method, robust), (encoding_method, o..."
7,7,0.158099,lasso,"[(scaling_method, maxabs), (encoding_method, o..."
9,9,-0.029441,lasso,"[(scaling_method, maxabs), (encoding_method, o..."
28,28,-0.029441,lasso,"[(scaling_method, robust), (encoding_method, o..."
29,29,-0.029441,lasso,"[(scaling_method, robust), (encoding_method, o..."
...,...,...,...,...
193,193,0.950580,lasso,"[(scaling_method, robust), (encoding_method, o..."
194,194,0.943710,lasso,"[(scaling_method, robust), (encoding_method, o..."
195,195,0.948354,lasso,"[(scaling_method, robust), (encoding_method, o..."
196,196,0.950578,lasso,"[(scaling_method, robust), (encoding_method, o..."


In [26]:
# Print the full parameter list of the random-forest row shown in the
# best-per-algorithm table above (row label 179).
print('rf', df_optuna.loc[179, 'Parameters'])

rf [('scaling_method', 'robust'), ('encoding_method', 'onehot'), ('drop', None), ('algorithm', 'rf'), ('bootstrap', True), ('n_estimators', 355), ('max_depth', 67), ('min_sample_split', 5), ('min_sample_leaf', 1)]


In [25]:
import plotly.express as px


# Plot only trials with a positive score so strongly negative R^2 values do
# not stretch the y-axis.
positive_scores = df_optuna[df_optuna['Score'] > 0]

fig = px.scatter(
    positive_scores,
    x="Trial",
    y="Score",
    color="Algorithm",
    hover_data=['Parameters'],
)

# Small white hover box so the long parameter list stays readable.
hover_style = {
    "bgcolor": "white",
    "font_size": 8,
    "font_family": "Rockwell",
}
fig.update_layout(hoverlabel=hover_style)

fig.show()