In [59]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from optuna import samplers
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor

### Instantiate function for each regression model 

In [162]:
def instantiate_ridge(trial : Trial) -> Ridge:
    params = {
        "alpha": trial.suggest_float("alpha", 15, 25)
    }

    return Ridge(**params)

def instantiate_lasso(trial : Trial) -> Lasso:
    params = {
        "alpha": trial.suggest_float("alpha", .0001, .0004)
    }

    return Lasso(**params)

def instantiate_en(trial : Trial) -> ElasticNet:
    params = {
        "alpha": trial.suggest_float("alpha", .0001, .0004)
    }

    return ElasticNet(**params)

### Instantiate function for regression model selection

In [159]:
Classifier = (
    Ridge |
    Lasso |
    ElasticNet
    )

def instantiate_learner(trial : Trial) -> Classifier:
    algorithm = trial.suggest_categorical(
    'algorithm', ['ridge', 'lasso', 'en'])
    
    if algorithm =='ridge':
        model = instantiate_ridge(trial)
    elif algorithm=='lasso':
        model = instantiate_lasso(trial)
    elif algorithm=='en':
        model = instantiate_en(trial)
    return model

### Instantiate functions for encoding categorical columns 

In [144]:
from sklearn.preprocessing import OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_encoder(trial: Trial)-> OneHotEncoder:
    params = {
        'handle_unknown': 'ignore',
        'drop': trial.suggest_categorical('drop', [None, 'first'])
    }
    
    return OneHotEncoder(**params)
    

### Instantiate functions for encoding numerical columns 

In [145]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

Scaler = (
  StandardScaler |
  MinMaxScaler |
  MaxAbsScaler |
  RobustScaler
)

def instantiate_scaler(trial : Trial) -> Scaler:
    method = trial.suggest_categorical(
    'scaling_method', ['standard', 'minmax', 'maxabs', 'robust']
    )
    if method=='standard':
        scaler = StandardScaler()
    elif method=='minmax':
        scaler = MinMaxScaler()
    elif method=='maxabs':
        scaler = MaxAbsScaler()
    elif method=='robust':
        scaler = RobustScaler()
        
    return scaler

### Instantiate function to scale and encode 

In [163]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    
    numerical_pipeline = instantiate_scaler(trial)
    categorical_pipeline = instantiate_encoder(trial)
   # numerical_pipeline = StandardScaler()
    #categorical_pipeline = OneHotEncoder(handle_unknown="ignore", drop='first')
    
#     numerical_pipeline = instantiate_numerical_pipeline(trial)
#     categorical_pipeline = instantiate_categorical_pipeline(trial)
    
    processor = ColumnTransformer([
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
        ('categorical_pipeline', categorical_pipeline, categorical_columns)
    ])
    return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
#     learner = instantiate_lasso(trial)
    learner = instantiate_learner(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

In [147]:
def objective(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [148]:
df = pd.read_csv('df_normal_quality.csv', index_col=0)
y = df.SalePrice
X = df.drop(['PID', 'SalePrice'], axis =1).copy()

In [172]:
from optuna import create_study
#study = create_study(sampler= samplers.RandomSampler(42), 

study = create_study(study_name='optimization', 
                     direction='maximize')

[I 2024-05-09 15:11:09,186] A new study created in memory with name: optimization


In [173]:
study.optimize(lambda trial: objective(trial, X, y), n_trials=500)

[I 2024-05-09 15:11:12,195] Trial 0 finished with value: 0.9499516590290001 and parameters: {'scaling_method': 'standard', 'drop': None, 'algorithm': 'en', 'alpha': 0.0002763555611934212}. Best is trial 0 with value: 0.9499516590290001.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 15:11:13,250] Trial 1 finished with value: 0.9504063447200879 and parameters: {'scaling_method': 'minmax', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.000


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 15:11:16,885] Trial 7 finished with value: 0.9496567088711311 and parameters: {'scaling_method': 'standard', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.00019870306671335755}. Best is trial 5 with value: 0.950477374221143.
[I 2024-05-09 15:11:17,342] Trial 8 finished with value: 0.9504298375610452 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'en', 'alpha': 0.00035734815701707963}. Best is trial 5 with value: 0.950477374221143.
[I 2024-05-09 15:11:17,525] Trial 9 finished with valu


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 15:11:33,030] Trial 35 finished with value: 0.9498795863160696 and parameters: {'scaling_method': 'robust', 'drop': 'first', 'algorithm': 'ridge', 'alpha': 15.087951305732666}. Best is trial 21 with value: 0.9508118984390123.
[I 2024-05-09 15:11:33,944] Trial 36 finished with value: 0.9508004856839587 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002843236065576017}. Best is trial 21 with value: 0.9508118984390123.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] duri

[I 2024-05-09 15:11:42,290] Trial 50 finished with value: 0.9502243476716778 and parameters: {'scaling_method': 'standard', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00036822778206654406}. Best is trial 44 with value: 0.9508124279335561.
[I 2024-05-09 15:11:43,093] Trial 51 finished with value: 0.9508098554531846 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003273967344593095}. Best is trial 44 with value: 0.9508124279335561.
[I 2024-05-09 15:11:43,906] Trial 52 finished with value: 0.9508005245886842 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003436421762786338}. Best is trial 44 with value: 0.9508124279335561.
[I 2024-05-09 15:11:44,787] Trial 53 finished with value: 0.9508117711095098 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030165544892913964}. Best is trial 44 with value: 0.9508124279335561.
[I 2024-05-09 15:11:45,658] Trial 54

[I 2024-05-09 15:12:06,317] Trial 80 finished with value: 0.9499545416246533 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'ridge', 'alpha': 15.025583457740456}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:07,143] Trial 81 finished with value: 0.9508122884639405 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003066623528944073}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:07,973] Trial 82 finished with value: 0.9508117299963101 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003101556944283647}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:08,789] Trial 83 finished with value: 0.9508049299456391 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00033717644211051114}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:09,615] Trial 84 finis

[I 2024-05-09 15:12:29,322] Trial 106 finished with value: 0.9503145725171116 and parameters: {'scaling_method': 'minmax', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00032438417563930577}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:30,155] Trial 107 finished with value: 0.9508106139514355 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003126953566583453}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:30,890] Trial 108 finished with value: 0.9502887186166916 and parameters: {'scaling_method': 'standard', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002957401402025318}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:31,771] Trial 109 finished with value: 0.9508046638873923 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00028841149714700095}. Best is trial 74 with value: 0.9508124950361158.
[I 2024-05-09 15:12:32,614] Tria

[I 2024-05-09 15:12:49,274] Trial 136 finished with value: 0.9507941089298502 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002786430825834685}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:12:50,059] Trial 137 finished with value: 0.9508113478669445 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003173221343240015}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:12:50,778] Trial 138 finished with value: 0.9502841401630766 and parameters: {'scaling_method': 'standard', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003036451523779578}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:12:51,593] Trial 139 finished with value: 0.9508112904078028 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003008773691899138}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:12:52,378] Tr

[I 2024-05-09 15:13:11,033] Trial 166 finished with value: 0.9502343880668583 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00014842003505581474}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:11,863] Trial 167 finished with value: 0.9508121106942354 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.000304550468702008}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:12,558] Trial 168 finished with value: 0.9502720965835648 and parameters: {'scaling_method': 'standard', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003260887204957969}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:13,382] Trial 169 finished with value: 0.9508083172308067 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002925227566740934}. Best is trial 131 with value: 0.9508125921535637.

Found unknown categories in c

[I 2024-05-09 15:13:31,103] Trial 196 finished with value: 0.9508114852425068 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003208837844643989}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:31,492] Trial 197 finished with value: 0.9508089423320127 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002934053638364525}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:32,058] Trial 198 finished with value: 0.9503941256135956 and parameters: {'scaling_method': 'minmax', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003111401839461804}. Best is trial 131 with value: 0.9508125921535637.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in column

[I 2024-05-09 15:13:50,187] Trial 226 finished with value: 0.9508112955692946 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00029828223407289066}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:50,573] Trial 227 finished with value: 0.9508113871995179 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003179817446940748}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:51,037] Trial 228 finished with value: 0.9508119774090206 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003036843212781627}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:51,425] Trial 229 finished with value: 0.9508117103595705 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00031101918739783645}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:13:51,914] Tr

[I 2024-05-09 15:14:09,222] Trial 256 finished with value: 0.9508113847826728 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003151074266012936}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:09,835] Trial 257 finished with value: 0.9478311867025457 and parameters: {'scaling_method': 'maxabs', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030357515270717616}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:10,291] Trial 258 finished with value: 0.9508102097795929 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002962760414082596}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:11,151] Trial 259 finished with value: 0.9508050095447675 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00028875914072846364}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:11,945] Tr

[I 2024-05-09 15:14:30,398] Trial 282 finished with value: 0.9508112309074305 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030072155073200085}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:31,205] Trial 283 finished with value: 0.9508116982355255 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.000311045835705724}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:32,005] Trial 284 finished with value: 0.9508113348776817 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00031716377911832067}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:32,839] Trial 285 finished with value: 0.9508125913455245 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030615596278814073}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:33,755] Tr

[I 2024-05-09 15:14:55,651] Trial 312 finished with value: 0.950811440065276 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00031546659895494405}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:56,226] Trial 313 finished with value: 0.9508096123615388 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002946828928720624}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:57,192] Trial 314 finished with value: 0.9501846439586222 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0001388961813028587}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:58,042] Trial 315 finished with value: 0.9500669611141086 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00011809503033988937}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:14:58,690] Tri


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 15:15:16,759] Trial 339 finished with value: 0.9505705743199826 and parameters: {'scaling_method': 'robust', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.0003081788997250402}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:17,548] Trial 340 finished with value: 0.9508034032745026 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003395935933104427}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:18,120] Trial 341 finished with value: 0.9499539919394875 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'ridge', 'alpha': 15.006828346756718}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:19,071] Trial 342 fini

[I 2024-05-09 15:15:40,547] Trial 367 finished with value: 0.9508093956333943 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.000294246100155041}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:41,395] Trial 368 finished with value: 0.9508106740701244 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003129643289125872}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:42,337] Trial 369 finished with value: 0.9508033832796444 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002871553759229408}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:43,148] Trial 370 finished with value: 0.950811929569815 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030223205060260857}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:43,896] Trial


Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 15:15:56,740] Trial 397 finished with value: 0.95001512197965 and parameters: {'scaling_method': 'standard', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 0.0003078714871733506}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:15:57,308] Trial 398 finished with value: 0.9508114100338011 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha'

[I 2024-05-09 15:16:15,071] Trial 422 finished with value: 0.9508111916596075 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002985237141023597}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:16:15,816] Trial 423 finished with value: 0.9502786457751032 and parameters: {'scaling_method': 'standard', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003164626358160414}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:16:16,732] Trial 424 finished with value: 0.9508077124023824 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00029165411390804334}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:16:17,676] Trial 425 finished with value: 0.950812313736486 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030639313192955557}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:16:18,575] T

[I 2024-05-09 15:16:37,991] Trial 452 finished with value: 0.9499540811163747 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'ridge', 'alpha': 15.00986626023484}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:16:38,813] Trial 453 finished with value: 0.9508122810094026 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003067390679521191}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:16:39,640] Trial 454 finished with value: 0.9508113409566393 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003172368307717228}. Best is trial 131 with value: 0.9508125921535637.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5

[I 2024-05-09 15:16:58,690] Trial 477 finished with value: 0.9508124479961717 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030499902028138055}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:16:59,522] Trial 478 finished with value: 0.9508107485260073 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00029700698962539085}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:17:00,381] Trial 479 finished with value: 0.9508119594223926 and parameters: {'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.000302346113489379}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:17:01,792] Trial 480 finished with value: 0.9473481738795944 and parameters: {'scaling_method': 'maxabs', 'drop': None, 'algorithm': 'en', 'alpha': 0.00010035118104258578}. Best is trial 131 with value: 0.9508125921535637.
[I 2024-05-09 15:17:02,157] Trial

In [174]:
study.best_trial
#[0.9508125905283533]

FrozenTrial(number=497, state=1, values=[0.9508125946535184], datetime_start=datetime.datetime(2024, 5, 9, 15, 17, 12, 820009), datetime_complete=datetime.datetime(2024, 5, 9, 15, 17, 13, 693235), params={'scaling_method': 'robust', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003061900989342117}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'scaling_method': CategoricalDistribution(choices=('standard', 'minmax', 'maxabs', 'robust')), 'drop': CategoricalDistribution(choices=(None, 'first')), 'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'en')), 'alpha': FloatDistribution(high=0.0004, log=False, low=0.0001, step=None)}, trial_id=497, value=None)

In [175]:
trial_number = []
score = []
algorithm = []
parameters = []
optuna_dict = {}
trials = range(500)

for trial in trials:
    trial_number.append(study.get_trials()[trial].number)
    score.append(study.get_trials()[trial].value)
    algorithm.append(study.get_trials()[trial].params['algorithm'])
    parameters.append(list(study.get_trials()[trial].params.items()))

optuna_dict['Trial'] = trial_number
optuna_dict['Score'] = score
optuna_dict['Algorithm'] = algorithm
optuna_dict['Parameters'] = parameters


df_optuna = pd.DataFrame.from_dict(optuna_dict)

In [177]:
idx = df_optuna.groupby('Algorithm')['Score'].transform(max) == df_optuna['Score']
df_optuna[idx]

Unnamed: 0,Trial,Score,Algorithm,Parameters
8,8,0.95043,en,"[(scaling_method, robust), (drop, None), (algo..."
80,80,0.949955,ridge,"[(scaling_method, robust), (drop, None), (algo..."
497,497,0.950813,lasso,"[(scaling_method, robust), (drop, None), (algo..."


In [179]:
print('lasso', df_optuna.Parameters[497])
print('ridge', df_optuna.Parameters[80])
print('en', df_optuna.Parameters[8])

lasso [('scaling_method', 'robust'), ('drop', None), ('algorithm', 'lasso'), ('alpha', 0.0003061900989342117)]
ridge [('scaling_method', 'robust'), ('drop', None), ('algorithm', 'ridge'), ('alpha', 15.025583457740456)]
en [('scaling_method', 'robust'), ('drop', None), ('algorithm', 'en'), ('alpha', 0.00035734815701707963)]


In [178]:
import plotly.express as px

#fig = px.scatter(df_optuna.loc[df_optuna.Score > .5], 
fig = px.scatter(df_optuna, 
                 x="Trial", 
                 y="Score", 
                 color="Algorithm",
                 hover_data=['Parameters'])

fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=8,
        font_family="Rockwell"
    )
)

fig.show()

In [30]:
import plotly.graph_objects as go

fig = optuna.visualization.plot_rank(study)

fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
    )

fig.show()


plot_rank is experimental (supported from v3.2.0). The interface can change in the future.

