In [35]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial, create_study, samplers
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor

### Instantiate function for each regression model 

In [55]:
def instantiate_xgb(trial : Trial) -> XGBRegressor:
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", .001, 0.1, log = True),
        "max_depth": trial.suggest_int("max_depth", 4, 8),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 20)
    }

    return XGBRegressor(**params)

def instantiate_gbr(trial : Trial) -> GradientBoostingRegressor:
    params = {
    'n_estimators': trial.suggest_int('n_estimators', 500, 1000),
    'learning_rate': trial.suggest_float('learning_rate', .001, 0.1, log = True),
    'max_depth': trial.suggest_int ('max_depth', 2, 6),
    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
    'min_samples_split': trial.suggest_int ('min_sample_split', 6, 10),
    'min_samples_leaf': trial.suggest_int ('min_sample_leaf', 1, 10)
    }
    
    return GradientBoostingRegressor(**params)

# def instantiate_rf(trial : Trial) -> RandomForestRegressor:
#     params = {
#     'bootstrap':trial.suggest_categorical('bootstrap', [True, False]),
#     'max_features':trial.suggest_categorical('max_features', None, 'sqrt'),
#     'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#     'max_depth': trial.suggest_float('max_depth', 1, 10),
#     'min_samples_split': trial.suggest_float('min_sample_split', 2, 10),
#     'min_samples_leaf': trial.suggest_float('min_sample_leaf', 2, 10)
#     }
    
#     return RandomForestRegressor(**params)

# def instantiate_ada(trial : Trial) -> AdaBoostRegressor:
#     params = {
#     'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#     'loss': trial.suggest_categorical('loss',['linear', 'square', 'expoential']),
#     'n_estimators': trial.suggest_int('n_estimators', 100, 1000)
#     }
    
#     return AdaBoostRegressor(**params)



### Instantiate function for regression model selection

In [58]:
Classifier = (
    XGBRegressor |
    GradientBoostingRegressor
    )

def instantiate_learner(trial : Trial) -> Classifier:
    algorithm = trial.suggest_categorical(
    'algorithm', ['xgb', 'gbr'])
    
    if algorithm=='xgb':
        model = instantiate_xgb(trial)
    elif algorithm=='gbr':
        model = instantiate_gbr(trial)
        
    return model

### Instantiate functions for encoding categorical columns 

In [57]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_ordinal_encoder(trial: Trial)-> OrdinalEncoder:
    params = {
        'handle_unknown': "use_encoded_value", 
        'unknown_value': -1
    }
    
    return OrdinalEncoder(**params)

def instantiate_onehot_encoder(trial: Trial)-> OneHotEncoder:
    params = {
        'handle_unknown': 'ignore',
        'drop': trial.suggest_categorical('drop', [None, 'first'])
    }
    
    return OneHotEncoder(**params)
    
Encoder = (
    OrdinalEncoder |
    OneHotEncoder 
    )

def instantiate_encoder (trial : Trial) -> Encoder:
    encoding_method = trial.suggest_categorical(
        'encoding_method', ['ordinal', 'onehot'])
    if encoding_method =='ordinal':
        encoder = instantiate_ordinal_encoder(trial)
    elif encoding_method =='onehot':
        encoder = instantiate_onehot_encoder(trial)
    
    return encoder

### Instantiate functions for encoding numerical columns 

In [59]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

Scaler = (
  StandardScaler |
  MinMaxScaler |
  MaxAbsScaler |
  RobustScaler
)

def instantiate_scaler(trial : Trial) -> Scaler:
    method = trial.suggest_categorical(
    'scaling_method', ['standard', 'minmax', 'maxabs', 'robust']
    )
    if method=='standard':
        scaler = StandardScaler()
    elif method=='minmax':
        scaler = MinMaxScaler()
    elif method=='maxabs':
        scaler = MaxAbsScaler()
    elif method=='robust':
        scaler = RobustScaler()
        
    return scaler

### Instantiate function to scale and encode 

In [61]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    
    numerical_pipeline = instantiate_scaler(trial)
    categorical_pipeline = instantiate_encoder(trial)
   # numerical_pipeline = StandardScaler()
    #categorical_pipeline = OneHotEncoder(handle_unknown="ignore", drop='first')
    
#     numerical_pipeline = instantiate_numerical_pipeline(trial)
#     categorical_pipeline = instantiate_categorical_pipeline(trial)
    
    processor = ColumnTransformer([
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
        ('categorical_pipeline', categorical_pipeline, categorical_columns)
    ])
    return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_learner(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

In [62]:
def objective(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [63]:
df = pd.read_csv('df_normal_quality.csv', index_col=0)
y = df.SalePrice
X = df.drop(['PID', 'SalePrice'], axis =1).copy()

In [71]:
from optuna import create_study

study = create_study(study_name='optimization', 
                     direction='maximize')

[I 2024-05-09 14:26:03,222] A new study created in memory with name: optimization


In [72]:
study.optimize(lambda trial: objective(trial, X, y), n_trials=100)

[I 2024-05-09 14:29:28,569] Trial 0 finished with value: 0.8714640923128938 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'gbr', 'n_estimators': 675, 'learning_rate': 0.0027571811235649617, 'max_depth': 6, 'subsample': 0.7770656803997849, 'min_sample_split': 6, 'min_sample_leaf': 8}. Best is trial 0 with value: 0.8714640923128938.
[I 2024-05-09 14:33:27,086] Trial 1 finished with value: 0.9362739781445062 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 804, 'learning_rate': 0.0653121058890592, 'max_depth': 6, 'subsample': 0.961400784503118, 'min_sample_split': 9, 'min_sample_leaf': 7}. Best is trial 1 with value: 0.9362739781445062.
[I 2024-05-09 14:33:34,549] Trial 2 finished with value: -6.005122413427998 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'xgb', 'n_estimators': 914, 'learning_rate': 0.00147949019463729, 'max_depth'


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 14:39:20,329] Trial 12 finished with value: 0.9468467526040856 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'xgb', 'n_estimators': 409, 'learning_rate': 0.05896295568601594, 'max_depth': 4, 'subsample': 0.6106841271541497, 'colsample_bytree': 0.9825297193283792, 'min_child_weight': 10}. Best is trial 12 with value: 0.9468467526040856.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros




[I 2024-05-09 14:40:53,104] Trial 18 finished with value: 0.9431850595342992 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'gbr', 'n_estimators': 606, 'learning_rate': 0.04123738427866517, 'max_depth': 2, 'subsample': 0.6271633596141895, 'min_sample_split': 10, 'min_sample_leaf': 2}. Best is trial 13 with value: 0.9473361779418956.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 14:40:56


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 14:41:47,609] Trial 24 finished with value: 0.9444578658565188 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'xgb', 'n_estimators': 214, 'learning_rate': 0.06338942981075953, 'max_depth': 5, 'subsample': 0.6528762383740928, 'colsample_bytree': 0.9259085574579127, 'min_child_weight': 11}. Best is trial 13 with value: 0.9473361779418956.
[I 2024-05-09 14:41:58,849] Trial 25 finished with value: 0.9352098187991708 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'xgb', 'n_estimators': 526, 'learning_rate': 0.0168821135876


Found unknown categories in columns [9, 21] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [0] during transform. These unknown categories will be encoded as all zeros

[I 2024-05-09 14:44:56,472] Trial 31 finished with value: 0.9471928011745494 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'xgb', 'n_estimators': 375, 'learning_rate': 0.054103376320489026, 'max_depth': 5, 'subsample': 0.5943648442816784, 'colsample_bytree': 0.8234777377354241, 'min_child_weight': 10}. Best is trial 27 with value: 0.947731954317037.

Found unknown categories in columns [3, 10] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [3, 9, 14] during transform. These unknown categories will be encoded as all zeros


Found unknown categories in columns [5, 21] during transform. These unknown categories will be encoded as all zeros




[I 2024-05-09 14:47:42,601] Trial 38 finished with value: -6.187424986309174 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'xgb', 'n_estimators': 492, 'learning_rate': 0.002541777292387631, 'max_depth': 6, 'subsample': 0.5526931965226454, 'colsample_bytree': 0.9249117157757407, 'min_child_weight': 14}. Best is trial 27 with value: 0.947731954317037.
[I 2024-05-09 14:49:10,984] Trial 39 finished with value: 0.9478130958575682 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 857, 'learning_rate': 0.022914063703816382, 'max_depth': 3, 'subsample': 0.6884176107673963, 'min_sample_split': 7, 'min_sample_leaf': 4}. Best is trial 39 with value: 0.9478130958575682.
[I 2024-05-09 14:50:18,557] Trial 40 finished with value: 0.9417092868177488 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 846, 'learning_rate': 0.023585939299099686

[I 2024-05-09 15:23:32,959] Trial 60 finished with value: 0.9455676872154726 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 950, 'learning_rate': 0.06521661270888303, 'max_depth': 2, 'subsample': 0.9664983199913249, 'min_sample_split': 7, 'min_sample_leaf': 2}. Best is trial 42 with value: 0.9510312863246181.
[I 2024-05-09 15:25:31,746] Trial 61 finished with value: 0.9487900833784845 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 859, 'learning_rate': 0.025780123103406972, 'max_depth': 3, 'subsample': 0.7016295848132901, 'min_sample_split': 7, 'min_sample_leaf': 4}. Best is trial 42 with value: 0.9510312863246181.
[I 2024-05-09 15:27:37,025] Trial 62 finished with value: 0.9493716935278694 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 825, 'learning_rate': 0.02480029613723202, 'max_depth': 3, 's

[I 2024-05-09 16:19:37,657] Trial 82 finished with value: 0.9505262305946218 and parameters: {'scaling_method': 'standard', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 793, 'learning_rate': 0.04119681641767256, 'max_depth': 3, 'subsample': 0.7831765211827085, 'min_sample_split': 9, 'min_sample_leaf': 6}. Best is trial 42 with value: 0.9510312863246181.
[I 2024-05-09 16:21:11,608] Trial 83 finished with value: 0.9502539573491837 and parameters: {'scaling_method': 'standard', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 773, 'learning_rate': 0.04471451415063894, 'max_depth': 3, 'subsample': 0.7892325464671189, 'min_sample_split': 9, 'min_sample_leaf': 6}. Best is trial 42 with value: 0.9510312863246181.
[I 2024-05-09 16:22:43,607] Trial 84 finished with value: 0.9502019165412552 and parameters: {'scaling_method': 'standard', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 777, 'learning_rate': 0.041460813918546405, 'max_depth':

In [77]:
study.best_trial

FrozenTrial(number=42, state=1, values=[0.9510312863246181], datetime_start=datetime.datetime(2024, 5, 9, 14, 51, 45, 558381), datetime_complete=datetime.datetime(2024, 5, 9, 14, 53, 30, 514178), params={'scaling_method': 'minmax', 'encoding_method': 'ordinal', 'algorithm': 'gbr', 'n_estimators': 873, 'learning_rate': 0.07662776587148856, 'max_depth': 3, 'subsample': 0.6842960902488119, 'min_sample_split': 7, 'min_sample_leaf': 4}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'scaling_method': CategoricalDistribution(choices=('standard', 'minmax', 'maxabs', 'robust')), 'encoding_method': CategoricalDistribution(choices=('ordinal', 'onehot')), 'algorithm': CategoricalDistribution(choices=('xgb', 'gbr')), 'n_estimators': IntDistribution(high=1000, log=False, low=500, step=1), 'learning_rate': FloatDistribution(high=0.1, log=True, low=0.001, step=None), 'max_depth': IntDistribution(high=6, log=False, low=2, step=1), 'subsample': FloatDistribution(high=1.0, log=Fa

In [78]:
trial_number = []
score = []
algorithm = []
parameters = []
optuna_dict = {}
trials = range(5)

for trial in trials:
    trial_number.append(study.get_trials()[trial].number)
    score.append(study.get_trials()[trial].value)
    algorithm.append(study.get_trials()[trial].params['algorithm'])
    parameters.append(list(study.get_trials()[trial].params.items()))

optuna_dict['Trial'] = trial_number
optuna_dict['Score'] = score
optuna_dict['Algorithm'] = algorithm
optuna_dict['Parameters'] = parameters


df_optuna = pd.DataFrame.from_dict(optuna_dict)

In [79]:
idx = df_optuna.groupby('Algorithm')['Score'].transform(max) == df_optuna['Score']
df_optuna[idx]

Unnamed: 0,Trial,Score,Algorithm,Parameters
1,1,0.936274,gbr,"[(scaling_method, minmax), (encoding_method, o..."
4,4,0.939124,xgb,"[(scaling_method, standard), (encoding_method,..."


In [51]:
list(df_optuna.Parameters.items())

[(0,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'ordinal'),
   ('algorithm', 'gbr'),
   ('n_estimators', 801),
   ('learning_rate', 0.02607024758370768),
   ('max_depth', 2),
   ('min_sample_split', 10)]),
 (1,
  [('scaling_method', 'standard'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'xgb'),
   ('n_estimators', 363),
   ('learning_rate', 0.005404103854647328),
   ('max_depth', 6),
   ('subsample', 0.8925879806965068),
   ('colsample_bytree', 0.5998368910791798),
   ('min_child_weight', 15)]),
 (2,
  [('scaling_method', 'maxabs'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'xgb'),
   ('n_estimators', 716),
   ('learning_rate', 0.007591104805282696),
   ('max_depth', 4),
   ('subsample', 0.7475884550556351),
   ('colsample_bytree', 0.5171942605576092),
   ('min_child_weight', 20)]),
 (3,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'ordinal'),
   ('algorithm', 'xgb'),
   ('n_estimators', 946),
   ('learning

In [52]:
print('xgb', df_optuna.Parameters[3])
print('gbr', df_optuna.Parameters[46])

{'regressor__XGB__colsample_bytree': 0.9678010901635519, 
 'regressor__XGB__learning_rate': 0.03948737640519406, 
 'regressor__XGB__max_depth': 3, 'regressor__XGB__min_child_weight': 2, 
 'regressor__XGB__n_estimators': 1000, 
 'regressor__XGB__subsample': 0.3203617673528298}

xgb [('scaling_method', 'minmax'), ('encoding_method', 'ordinal'), ('algorithm', 'xgb'), ('n_estimators', 946), ('learning_rate', 0.06161049539380966), ('max_depth', 6), ('subsample', 0.9609371175115584), ('colsample_bytree', 0.5442462510259598), ('min_child_weight', 12)]
gbr [('scaling_method', 'minmax'), ('encoding_method', 'ordinal'), ('algorithm', 'gbr'), ('n_estimators', 794), ('learning_rate', 0.030965560706362184), ('max_depth', 4), ('min_sample_split', 6)]


In [70]:
import plotly.express as px


fig = px.scatter(df_optuna.loc[df_optuna.Score > -100], 
                 x="Trial", 
                 y="Score", 
                 color="Algorithm",
                 hover_data=['Parameters'])

fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=8,
        font_family="Rockwell"
    )
)

fig.show()