In [67]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
# Module-level 5-fold shuffled splitter with a fixed seed.
# NOTE(review): objective() builds its own local KFold, so none of the visible
# cells read this global - confirm nothing else uses it before removing.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor

### Instantiate function for each regression model 

In [115]:
def instantiate_ridge(trial : Trial) -> Ridge:
    """Build a Ridge regressor with an Optuna-sampled penalty.

    The trial draws ``alpha`` on a log scale between 1e-4 and 1000.
    """
    alpha = trial.suggest_float("alpha", 1e-4, 1000, log=True)
    return Ridge(alpha=alpha)

def instantiate_lasso(trial : Trial) -> Lasso:
    """Build a Lasso regressor with an Optuna-sampled penalty.

    Shares the ``alpha`` parameter name and range (1e-4 to 1000, log scale)
    with instantiate_ridge.
    """
    return Lasso(alpha=trial.suggest_float("alpha", 1e-4, 1000, log=True))

def instantiate_xgb(trial : Trial) -> XGBRegressor:
    """Build an XGBRegressor with Optuna-sampled tree and sampling settings.

    The objective, the number of estimators (1000) and the verbosity are
    fixed; learning rate, depth, row/column subsampling and minimum child
    weight are drawn from the trial.
    """
    # Sampled in this order so the trial records the parameters consistently.
    sampled = dict(
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        max_depth=trial.suggest_int("max_depth", 1, 10),
        subsample=trial.suggest_float("subsample", 0.05, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.05, 1.0),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 20),
    )

    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=1000,
        verbosity=0,
        **sampled,
    )

def instantiate_gbr(trial : Trial) -> GradientBoostingRegressor:
    """Build a GradientBoostingRegressor with Optuna-sampled hyperparameters.

    Samples the number of estimators (100-1000), the learning rate
    (1e-3 to 0.1, log scale), the tree depth (1-10) and the minimum number of
    samples required to split a node (2-10).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        # Tree depth is an integer count. Sampling it as an int also keeps the
        # 'max_depth' distribution the same kind as the one instantiate_xgb
        # registers under the same name in the study.
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        # The estimator's keyword is `min_samples_split`; as an absolute sample
        # count it must be an integer >= 2 (floats are read as fractions <= 1).
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
    }

    return GradientBoostingRegressor(**params)

# def instantiate_rf(trial : Trial) -> RandomForestRegressor:
#     params = {
#     'bootstrap':trial.suggest_categorical('bootstrap', [True, False]),
#     'max_features':trial.suggest_categorical('max_features', None, 'sqrt'),
#     'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#     'max_depth': trial.suggest_float('max_depth', 1, 10),
#     'min_samples_split': trial.suggest_float('min_sample_split', 2, 10),
#     'min_samples_leaf': trial.suggest_float('min_sample_leaf', 2, 10)
#     }
    
#     return RandomForestRegressor(**params)

# def instantiate_ada(trial : Trial) -> AdaBoostRegressor:
#     params = {
#     'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#     'loss': trial.suggest_categorical('loss',['linear', 'square', 'expoential']),
#     'n_estimators': trial.suggest_int('n_estimators', 100, 1000)
#     }
    
#     return AdaBoostRegressor(**params)



### Instantiate function for regression model selection

In [97]:
# Union of the estimator types instantiate_learner can return. Despite the
# name, every member is a regressor; the name is kept for existing references.
Classifier = (
    Ridge |
    Lasso |
    XGBRegressor |
    GradientBoostingRegressor
    )

def instantiate_learner(trial : Trial) -> Classifier:
    """Let the trial pick a regression algorithm and build it.

    The trial chooses among 'ridge', 'lasso' and 'xgb'; the matching
    ``instantiate_*`` function then samples that model's hyperparameters.

    Raises:
        ValueError: if the suggested label has no registered builder.
    """
    # NOTE(review): 'gbr' has a builder below but is not offered to the trial,
    # so it is never selected. Add it to this list to include gradient
    # boosting in the search.
    algorithm = trial.suggest_categorical(
        'algorithm', ['ridge', 'lasso', 'xgb'])

    builders = {
        'ridge': instantiate_ridge,
        'lasso': instantiate_lasso,
        'xgb': instantiate_xgb,
        'gbr': instantiate_gbr,
    }

    # Fail loudly if the choice list and the builders drift apart, instead of
    # hitting an unbound local at the return statement.
    if algorithm not in builders:
        raise ValueError(f"No builder registered for algorithm {algorithm!r}")

    return builders[algorithm](trial)

### Instantiate functions for encoding categorical columns 

In [98]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_ordinal_encoder(trial: Trial)-> OrdinalEncoder:
    """Build an OrdinalEncoder that encodes unseen categories as -1.

    ``trial`` is accepted for a uniform builder signature; nothing is sampled.
    """
    return OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

def instantiate_onehot_encoder(trial: Trial)-> OneHotEncoder:
    """Build a OneHotEncoder that ignores unseen categories.

    The trial decides whether to keep every level (``None``) or drop the
    first level of each feature (``'first'``).
    """
    drop_strategy = trial.suggest_categorical('drop', [None, 'first'])
    return OneHotEncoder(handle_unknown='ignore', drop=drop_strategy)
    
Encoder = (
    OrdinalEncoder |
    OneHotEncoder 
    )

def instantiate_encoder (trial : Trial) -> Encoder:
    encoding_method = trial.suggest_categorical(
        'encoding_method', ['ordinal', 'onehot'])
    if encoding_method =='ordinal':
        encoder = instantiate_ordinal_encoder(trial)
    elif encoding_method =='onehot':
        encoder = instantiate_onehot_encoder(trial)
    
    return encoder

### Instantiate functions for scaling numerical columns

In [99]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

Scaler = (
  StandardScaler |
  MinMaxScaler |
  MaxAbsScaler |
  RobustScaler
)

def instantiate_scaler(trial : Trial) -> Scaler:
    method = trial.suggest_categorical(
    'scaling_method', ['standard', 'minmax', 'maxabs', 'robust']
    )
    if method=='standard':
        scaler = StandardScaler()
    elif method=='minmax':
        scaler = MinMaxScaler()
    elif method=='maxabs':
        scaler = MaxAbsScaler()
    elif method=='robust':
        scaler = RobustScaler()
        
    return scaler

### Instantiate function to scale and encode 

In [100]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    """Build the preprocessing step: a scaler for numeric columns and an
    encoder for categorical columns, both chosen by the trial.

    Args:
        trial: Optuna trial used to sample the scaler and the encoder.
        numerical_columns: columns routed to the scaler.
        categorical_columns: columns routed to the encoder.

    Returns:
        A ColumnTransformer with the two named transformers.
    """
    # List items are evaluated left to right, so the scaler is sampled before
    # the encoder.
    transformers = [
        ('numerical_pipeline', instantiate_scaler(trial), numerical_columns),
        ('categorical_pipeline', instantiate_encoder(trial), categorical_columns),
    ]
    return ColumnTransformer(transformers)

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    """Assemble the full model for one trial.

    A preprocessing + learner pipeline is wrapped in a
    TransformedTargetRegressor that fits on ``log(y)`` and maps predictions
    back with ``exp``.

    Args:
        trial: Optuna trial used to sample every component.
        numerical_columns: columns passed to the scaler.
        categorical_columns: columns passed to the encoder.

    Returns:
        The TransformedTargetRegressor wrapping the pipeline.
    """
    # Preprocessing choices are sampled before the learner's.
    preprocessing = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    estimator = instantiate_learner(trial)

    steps = [('processor', preprocessing), ('model', estimator)]

    return compose.TransformedTargetRegressor(
        regressor=Pipeline(steps),
        func=np.log,
        inverse_func=np.exp,
    )

In [101]:
def objective(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    """Score one trial's model with 5-fold cross-validated R^2.

    Args:
        trial: Optuna trial that samples the preprocessing and the learner.
        X: feature frame.
        y: target values.
        numerical_columns: columns to scale; defaults to every column whose
            dtype is not object/category.
        categorical_columns: columns to encode; defaults to every
            object/category column.
        random_state: seed for the shuffled KFold split.

    Returns:
        The smaller of the mean and the median of the fold scores.
    """
    categorical_dtypes = ['object', 'category']

    if numerical_columns is None:
        numerical_columns = list(
            X.select_dtypes(exclude=categorical_dtypes).columns
        )

    if categorical_columns is None:
        categorical_columns = list(
            X.select_dtypes(include=categorical_dtypes).columns
        )

    candidate = instantiate_model(trial, numerical_columns, categorical_columns)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(
        candidate, X, y, scoring=make_scorer(r2_score), cv=folds
    )

    # Taking the lower of mean and median penalises a configuration whose
    # average is carried by a few folds.
    mean_score = np.mean(fold_scores)
    median_score = np.median(fold_scores)
    return np.min([mean_score, median_score])

In [102]:
# Load the dataset; the first CSV column becomes the index.
df = pd.read_csv('df_normal_quality.csv', index_col=0)

# Target is SalePrice; the features are everything except the PID column and the target.
y = df['SalePrice']
X = df.drop(columns=['PID', 'SalePrice']).copy()

In [103]:
from optuna import create_study

# In-memory study that maximises the objective's cross-validated R^2.
# TPESampler is Optuna's default sampler; seeding it makes a sequential search
# reproducible across kernel restarts.
study = create_study(
    study_name='optimization',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2024-05-08 12:58:21,889] A new study created in memory with name: optimization


In [141]:
# Run 200 trials of `objective` on X, y. Study.optimize appends to the trials
# already stored in `study`, so re-running this cell continues the search
# rather than restarting it (the saved log below reaches trial numbers > 200).
study.optimize(lambda trial: objective(trial, X, y), n_trials=200)

[I 2024-05-08 13:43:09,915] Trial 55 finished with value: 0.947573193447426 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'ridge', 'alpha': 1.0133772705820585}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:43:20,590] Trial 56 finished with value: -6.327682977814066 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'xgb', 'learning_rate': 0.0011496999812009986, 'max_depth': 3, 'subsample': 0.7420615881700867, 'colsample_bytree': 0.7045382582491888, 'min_child_weight': 11}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:43:21,088] Trial 57 finished with value: 0.9341610573023228 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'lasso', 'alpha': 0.008369726019144512}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:43:21,593] Trial 58 finished with value: -0.029441307066881618 and parameter

[I 2024-05-08 13:43:48,383] Trial 81 finished with value: 0.9508072074077194 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003337967069676376}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:43:49,257] Trial 82 finished with value: 0.9506023960185445 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0002255778359260152}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:43:49,941] Trial 83 finished with value: 0.9505171596101875 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0005030246518150047}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:43:50,552] Trial 84 finished with value: 0.9469192590709231 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 

[I 2024-05-08 13:44:30,662] Trial 103 finished with value: 0.9501425999641967 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0006439112772038698}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:44:31,491] Trial 104 finished with value: 0.9508117860857238 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030170892992476044}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:44:32,211] Trial 105 finished with value: 0.9507945219334403 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003502287711302046}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:44:33,079] Trial 106 finished with value: 0.950607610609782 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alph

[I 2024-05-08 13:45:06,616] Trial 130 finished with value: 0.9497274232948139 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0007554395215299302}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:45:07,536] Trial 131 finished with value: 0.9505636994883288 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00021688334622544975}. Best is trial 51 with value: 0.950811959316562.
[I 2024-05-08 13:45:08,290] Trial 132 finished with value: 0.950812416973192 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003079818441118096}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:45:08,877] Trial 133 finished with value: 0.9504708293544606 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alp

[I 2024-05-08 13:45:39,277] Trial 156 finished with value: 0.9503190758390293 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.000582278957455787}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:45:40,026] Trial 157 finished with value: 0.950802749890612 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0003405683386790038}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:45:40,605] Trial 158 finished with value: 0.9417167197580749 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'lasso', 'alpha': 0.000220507502390194}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:45:41,113] Trial 159 finished with value: 0.9440245807940079 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0009025

[I 2024-05-08 13:46:10,705] Trial 182 finished with value: 0.9506878313422911 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00024527641491545177}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:46:11,445] Trial 183 finished with value: 0.9506048078374774 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00046537198512391236}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:46:12,262] Trial 184 finished with value: 0.9508115897577711 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00031656424022910573}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:46:13,255] Trial 185 finished with value: 0.9504071557629356 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso',

[I 2024-05-08 13:46:32,488] Trial 209 finished with value: 0.9506613894695928 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0004356475589120512}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:46:33,136] Trial 210 finished with value: 0.9495152137275642 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0008049742690751011}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:46:33,986] Trial 211 finished with value: 0.950808015005929 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00029210188082808605}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:46:34,804] Trial 212 finished with value: 0.9508114792960454 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'a

[I 2024-05-08 13:47:30,487] Trial 235 finished with value: 0.9502491236523113 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00015155730280446285}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:47:31,118] Trial 236 finished with value: 0.9483299927469011 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.0005172791210192754}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:47:31,845] Trial 237 finished with value: 0.9507675248491247 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00036981770792040235}. Best is trial 132 with value: 0.950812416973192.
[I 2024-05-08 13:47:32,750] Trial 238 finished with value: 0.9505964665322632 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 

In [107]:
# Display the best trial recorded in the study so far.
# NOTE(review): the saved output reports trial 51, while the optimisation log
# ends with trial 132 as best - this cell looks like it was executed before the
# last optimize run; re-run it to refresh the output.
study.best_trial

FrozenTrial(number=51, state=1, values=[0.950811959316562], datetime_start=datetime.datetime(2024, 5, 8, 13, 2, 41, 239943), datetime_complete=datetime.datetime(2024, 5, 8, 13, 2, 42, 51942), params={'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'lasso', 'alpha': 0.00030234570649513435}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'scaling_method': CategoricalDistribution(choices=('standard', 'minmax', 'maxabs', 'robust')), 'encoding_method': CategoricalDistribution(choices=('ordinal', 'onehot')), 'drop': CategoricalDistribution(choices=(None, 'first')), 'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=1.0, log=True, low=0.0001, step=None)}, trial_id=51, value=None)

In [111]:
# Tabulate the study's trials into `df_optuna`
# (columns: Trial, Score, Algorithm, Parameters).

# Only the first MAX_TRIALS trials are tabulated; set to None to include all.
MAX_TRIALS = 50

# Fetch the trial list once: every get_trials() call copies the full list, so
# calling it per field per row was needlessly quadratic. deepcopy=False is
# safe here because the trials are only read.
all_trials = study.get_trials(deepcopy=False)

# Clamp to the trials that exist so a short study does not raise IndexError.
n_rows = len(all_trials) if MAX_TRIALS is None else min(MAX_TRIALS, len(all_trials))

trial_number = []
score = []
algorithm = []
parameters = []
optuna_dict = {}
trials = range(n_rows)

for trial in trials:
    frozen = all_trials[trial]
    trial_number.append(frozen.number)
    score.append(frozen.value)
    algorithm.append(frozen.params['algorithm'])
    # Skip the first recorded parameter.
    # NOTE(review): in the saved logs that is 'scaling_method' - confirm it is
    # meant to be left out of the table.
    parameters.append(list(frozen.params.items())[1:])

optuna_dict['Trial'] = trial_number
optuna_dict['Score'] = score
optuna_dict['Algorithm'] = algorithm
optuna_dict['Parameters'] = parameters


df_optuna = pd.DataFrame.from_dict(optuna_dict)

In [137]:
# Mask of the rows holding the best score within each algorithm (ties keep
# every tied row). The string alias 'max' is used because passing the builtin
# `max` to transform is deprecated in recent pandas and emits a FutureWarning.
idx = df_optuna.groupby('Algorithm')['Score'].transform('max') == df_optuna['Score']
df_optuna[idx]

Unnamed: 0,Trial,Score,Algorithm,Parameters
5,5,0.949782,xgb,"[(encoding_method, ordinal), (algorithm, xgb),..."
6,6,0.949665,ridge,"[(encoding_method, onehot), (drop, None), (alg..."
44,44,0.950812,lasso,"[(encoding_method, onehot), (drop, None), (alg..."


In [131]:
# Print the parameters of the best-scoring trial of each algorithm.
# The rows are looked up from the data instead of hard-coded labels, which
# only matched one particular optimisation run. Rows without a score are
# dropped first so idxmax always has a value to return.
scored = df_optuna.dropna(subset=['Score'])
best_rows = scored.loc[scored.groupby('Algorithm')['Score'].idxmax()]

# groupby sorts its keys, so the output order is alphabetical by algorithm.
for algo, params in zip(best_rows['Algorithm'], best_rows['Parameters']):
    print(algo, params)

lasso [('encoding_method', 'onehot'), ('drop', None), ('algorithm', 'lasso'), ('alpha', 0.0003203849779983578)]
ridge [('encoding_method', 'onehot'), ('drop', None), ('algorithm', 'ridge'), ('alpha', 26.191383015391732)]
xgb [('encoding_method', 'ordinal'), ('algorithm', 'xgb'), ('learning_rate', 0.032033941123685704), ('max_depth', 8), ('subsample', 0.31715243333753884), ('colsample_bytree', 0.4718623158670449), ('min_child_weight', 3)]


In [140]:
import plotly.express as px


# Scatter of score by trial, coloured by algorithm. Only trials scoring above
# 0.9 are shown, so badly failing configurations do not stretch the y-axis.
score_floor = .9
high_scoring = df_optuna.loc[df_optuna.Score > score_floor]

fig = px.scatter(
    high_scoring,
    x="Trial",
    y="Score",
    color="Algorithm",
    hover_data=['Parameters'],
)

# Small white hover box so the long parameter lists stay readable.
hover_style = {
    "bgcolor": "white",
    "font_size": 8,
    "font_family": "Rockwell",
}
fig.update_layout(hoverlabel=hover_style)

fig.show()