In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression

### Search function for each model 

In [95]:
def instantiate_ridge(trial : Trial) -> Ridge:
    params = {
        "alpha": trial.suggest_float("alpha", 1e-4, 100, log=True),
    }

    return Ridge(**params)

def instantiate_lasso(trial : Trial) -> Lasso:
    params = {
        "alpha": trial.suggest_float("alpha", 1e-4, 100, log=True),
    }

    return Lasso(**params)

def instantiate_xgb(trial : Trial) -> XGBRegressor:
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 1000,
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", .0001, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    return XGBRegressor(**params)

Classifier = (
    Ridge |
    Lasso |
    XGBRegressor )

def instantiate_learner(trial : Trial) -> Classifier:
    algorithm = trial.suggest_categorical(
    'algorithm', ['ridge', 'lasso', 'xgb'])
    
    if algorithm =='ridge':
        model = instantiate_ridge(trial)
    elif algorithm=='lasso':
        model = instantiate_lasso(trial)
    elif algorithm=='xgb':
        model = instantiate_xgb(trial)
    
    return model

In [None]:
# from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
# from category_encoders import WOEEncoder

# Encoder = (
#   OrdinalEncoder |
#   OneHotEncoder 
# )

# def instantiate_encoder(trial : Trial) -> Encoder:
#     method = trial.suggest_categorical(
#     'encoding_method', ['ordinal', 'onehot']
#   )
#     if method=='ordinal':
#     encoder = instantiate_ordinal_encoder(trial)
#     elif method=='onehot':
#     encoder = instantiate_onehot_encoder(trial)
    
#     return encoder

# from sklearn.preprocessing import (
#   StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
# )



In [80]:
hover_data=["algorithm", "Trial"] 

In [233]:
import plotly.graph_objects as go

fig = optuna.visualization.plot_rank(study)

fig.update_layout(
    autosize=False,
    width=1000,
    height=1000,
    )

fig.show()

AttributeError: 'dict' object has no attribute 'visualization'

In [93]:
optuna.visualization.plot_optimization_history(study)


In [111]:
fig = optuna.visualization.plot_slice(study, params = ['algorithm'])

fig.show()

In [96]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# def instantiate_numerical_pipeline(trial : Trial) -> Pipeline:
#     pipeline = Pipeline([
#     ('scaler', instantiate_scaler(trial))
#   ])
#     return pipeline

# def instantiate_categorical_function(trial : Trial) -> Pipeline:
#     pipeline = Pipeline([
#     ('encoder', instantiate_encoder(trial))
#   ])
#     return pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    
    numerical_pipeline = StandardScaler()
    categorical_pipeline = OneHotEncoder(handle_unknown="ignore")
    
#     numerical_pipeline = instantiate_numerical_pipeline(trial)
#     categorical_pipeline = instantiate_categorical_pipeline(trial)
    
    processor = ColumnTransformer([
        ('numerical_pipeline', numerical_pipeline, numerical_columns),
        ('categorical_pipeline', categorical_pipeline, categorical_columns)
    ])
    return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> Pipeline:
    
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    
    learner = instantiate_learner(trial)
    
    model_pipe = Pipeline([
    ('processor', processor),
    ('model', learner)
    ])
    
    model = compose.TransformedTargetRegressor(regressor= model_pipe,
                                                func=np.log, inverse_func=np.exp)
    
    return model

In [97]:
def objective(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    
    if numerical_columns is None:
        numerical_columns = [
            *X.select_dtypes(exclude=['object', 'category']).columns
        ]
    
    if categorical_columns is None:
        categorical_columns = [
            *X.select_dtypes(include=['object', 'category']).columns
        ]
    
    model = instantiate_model(trial, numerical_columns, categorical_columns)
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    r2 = make_scorer(r2_score)
    scores = cross_val_score(model, X, y, scoring= r2, cv=kf)
    
    return np.min([np.mean(scores), np.median([scores])])

In [98]:
df = pd.read_csv('df_normal_quality.csv', index_col=0)
y = df.SalePrice
X = df.drop(['PID', 'SalePrice'], axis =1).copy()

In [99]:
from optuna import create_study

study = create_study(study_name='optimization', direction='maximize')

[I 2024-05-06 14:58:59,518] A new study created in memory with name: optimization


In [100]:
study.optimize(lambda trial: objective(trial, X, y), n_trials=200)

[I 2024-05-06 14:59:03,344] Trial 0 finished with value: 0.9471554321468393 and parameters: {'algorithm': 'ridge', 'alpha': 0.6010505222092465}. Best is trial 0 with value: 0.9471554321468393.
[I 2024-05-06 14:59:03,735] Trial 1 finished with value: 0.9389672175428982 and parameters: {'algorithm': 'lasso', 'alpha': 0.006162467158491157}. Best is trial 0 with value: 0.9471554321468393.
[I 2024-05-06 14:59:04,136] Trial 2 finished with value: 0.9458097018173504 and parameters: {'algorithm': 'ridge', 'alpha': 0.001244729405956262}. Best is trial 0 with value: 0.9471554321468393.
[I 2024-05-06 14:59:04,573] Trial 3 finished with value: 0.9473545665481765 and parameters: {'algorithm': 'ridge', 'alpha': 0.7759373016028097}. Best is trial 3 with value: 0.9473545665481765.
[I 2024-05-06 14:59:04,969] Trial 4 finished with value: 0.9496058571087869 and parameters: {'algorithm': 'ridge', 'alpha': 31.29323725185007}. Best is trial 4 with value: 0.9496058571087869.
[I 2024-05-06 14:59:05,394] Tria

[I 2024-05-06 15:00:12,528] Trial 39 finished with value: 0.9485588883459293 and parameters: {'algorithm': 'ridge', 'alpha': 3.0896988698846903}. Best is trial 32 with value: 0.9502797535996578.
[I 2024-05-06 15:00:13,049] Trial 40 finished with value: 0.9490795432899979 and parameters: {'algorithm': 'lasso', 'alpha': 0.0007487749013093245}. Best is trial 32 with value: 0.9502797535996578.
[I 2024-05-06 15:00:13,752] Trial 41 finished with value: 0.9502825368726407 and parameters: {'algorithm': 'lasso', 'alpha': 0.00026577767050225043}. Best is trial 41 with value: 0.9502825368726407.
[I 2024-05-06 15:00:14,485] Trial 42 finished with value: 0.9502294508274328 and parameters: {'algorithm': 'lasso', 'alpha': 0.00023634827143863648}. Best is trial 41 with value: 0.9502825368726407.
[I 2024-05-06 15:00:14,910] Trial 43 finished with value: 0.9430131991735313 and parameters: {'algorithm': 'lasso', 'alpha': 0.002427898969308335}. Best is trial 41 with value: 0.9502825368726407.
[I 2024-05-0

[I 2024-05-06 15:01:10,766] Trial 79 finished with value: 0.9499926313562599 and parameters: {'algorithm': 'lasso', 'alpha': 0.0001529524006218151}. Best is trial 75 with value: 0.9502953233147071.
[I 2024-05-06 15:01:11,254] Trial 80 finished with value: 0.9481827933626569 and parameters: {'algorithm': 'lasso', 'alpha': 0.0009624329368942344}. Best is trial 75 with value: 0.9502953233147071.
[I 2024-05-06 15:01:11,960] Trial 81 finished with value: 0.9502672109276192 and parameters: {'algorithm': 'lasso', 'alpha': 0.00025615699836524443}. Best is trial 75 with value: 0.9502953233147071.
[I 2024-05-06 15:01:12,599] Trial 82 finished with value: 0.9502963988384143 and parameters: {'algorithm': 'lasso', 'alpha': 0.0002775282989311552}. Best is trial 82 with value: 0.9502963988384143.
[I 2024-05-06 15:01:13,037] Trial 83 finished with value: 0.9439723885844347 and parameters: {'algorithm': 'lasso', 'alpha': 0.001863957343085763}. Best is trial 82 with value: 0.9502963988384143.
[I 2024-05

[I 2024-05-06 15:02:13,063] Trial 120 finished with value: 0.9501437506612426 and parameters: {'algorithm': 'lasso', 'alpha': 0.00019366385317210036}. Best is trial 90 with value: 0.9502964822130379.
[I 2024-05-06 15:02:13,733] Trial 121 finished with value: 0.9502898226246657 and parameters: {'algorithm': 'lasso', 'alpha': 0.00029322512255107095}. Best is trial 90 with value: 0.9502964822130379.
[I 2024-05-06 15:02:14,178] Trial 122 finished with value: 0.950172458965406 and parameters: {'algorithm': 'lasso', 'alpha': 0.0003937165488046054}. Best is trial 90 with value: 0.9502964822130379.
[I 2024-05-06 15:02:14,889] Trial 123 finished with value: 0.9498818867954568 and parameters: {'algorithm': 'lasso', 'alpha': 0.00012651680490057297}. Best is trial 90 with value: 0.9502964822130379.
[I 2024-05-06 15:02:15,518] Trial 124 finished with value: 0.9502742213840186 and parameters: {'algorithm': 'lasso', 'alpha': 0.00032284555737474547}. Best is trial 90 with value: 0.9502964822130379.
[I

[I 2024-05-06 15:02:55,798] Trial 160 finished with value: 0.9502968446519408 and parameters: {'algorithm': 'lasso', 'alpha': 0.00027951955384252825}. Best is trial 160 with value: 0.9502968446519408.
[I 2024-05-06 15:02:56,501] Trial 161 finished with value: 0.9502895965361698 and parameters: {'algorithm': 'lasso', 'alpha': 0.0002939250037567679}. Best is trial 160 with value: 0.9502968446519408.
[I 2024-05-06 15:02:57,125] Trial 162 finished with value: 0.95021458199043 and parameters: {'algorithm': 'lasso', 'alpha': 0.00037385749754682466}. Best is trial 160 with value: 0.9502968446519408.
[I 2024-05-06 15:02:57,688] Trial 163 finished with value: 0.9500186429126172 and parameters: {'algorithm': 'lasso', 'alpha': 0.00046763996206561896}. Best is trial 160 with value: 0.9502968446519408.
[I 2024-05-06 15:02:58,476] Trial 164 finished with value: 0.9500779176060921 and parameters: {'algorithm': 'lasso', 'alpha': 0.00017255223505524656}. Best is trial 160 with value: 0.9502968446519408

In [138]:
study.best_trials

[FrozenTrial(number=160, state=1, values=[0.9502968446519408], datetime_start=datetime.datetime(2024, 5, 6, 15, 2, 55, 142023), datetime_complete=datetime.datetime(2024, 5, 6, 15, 2, 55, 797641), params={'algorithm': 'lasso', 'alpha': 0.00027951955384252825}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=100.0, log=True, low=0.0001, step=None)}, trial_id=160, value=None)]

In [224]:
list(study.get_trials()[10].params.items())[1:]

[('learning_rate', 0.07748571364848593),
 ('max_depth', 1),
 ('subsample', 0.1081539403017378),
 ('colsample_bytree', 0.8774623283402829),
 ('min_child_weight', 20)]

In [225]:
trial_number = []
score = []
algorithm = []
parameters = []
optuna = {}
trials = range(200)

for trial in trials:
    trial_number.append(study.get_trials()[trial].number)
    score.append(study.get_trials()[trial].value)
    algorithm.append(study.get_trials()[trial].params['algorithm'])
    parameters.append(list(study.get_trials()[trial].params.items())[1:])

optuna['Trial'] = trial_number
optuna['Score'] = score
optuna['Algorithm'] = algorithm
optuna['Parameters'] = parameters


df_optuna = pd.DataFrame.from_dict(optuna)


In [226]:
df_optuna

Unnamed: 0,Trial,Score,Algorithm,Parameters
0,0,0.947155,ridge,"[(alpha, 0.6010505222092465)]"
1,1,0.938967,lasso,"[(alpha, 0.006162467158491157)]"
2,2,0.945810,ridge,"[(alpha, 0.001244729405956262)]"
3,3,0.947355,ridge,"[(alpha, 0.7759373016028097)]"
4,4,0.949606,ridge,"[(alpha, 31.29323725185007)]"
...,...,...,...,...
195,195,0.949932,lasso,"[(alpha, 0.0005009178284093929)]"
196,196,0.949888,lasso,"[(alpha, 0.00012855378397921287)]"
197,197,0.950254,lasso,"[(alpha, 0.00034571676369411016)]"
198,198,0.950166,lasso,"[(alpha, 0.00020514835456026157)]"


In [182]:
df_optuna.

0      ridge
1      lasso
2      ridge
3      ridge
4      ridge
       ...  
195    lasso
196    lasso
197    lasso
198    lasso
199    ridge
Name: Algorithm, Length: 200, dtype: object

In [230]:
df_optuna.groupby('Algorithm').max('Score')

Unnamed: 0_level_0,Trial,Score
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1
lasso,198,0.950297
ridge,199,0.949606
xgb,188,0.946761


In [231]:
print('lasso', df_optuna.Parameters[198])
print('ridge', df_optuna.Parameters[199])
print('xgb', df_optuna.Parameters[188])

lasso [('alpha', 0.00020514835456026157)]
ridge [('alpha', 0.00028929130728814323)]
xgb [('learning_rate', 0.022432477519196928), ('max_depth', 5), ('subsample', 0.5819591447885447), ('colsample_bytree', 0.1949715272549276), ('min_child_weight', 13)]


In [228]:
import plotly.express as px


fig = px.scatter(df_optuna.loc[df_optuna.Score > 0], 
                 x="Trial", 
                 y="Score", 
                 color="Algorithm",
                 hover_data=['Parameters'])

fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=8,
        font_family="Rockwell"
    )
)

fig.show()

In [169]:
study.get_trials()[0].params['algorithm']

'ridge'

In [101]:
# 200 trials...does not want to pick xgb
study.best_trials

[FrozenTrial(number=160, state=1, values=[0.9502968446519408], datetime_start=datetime.datetime(2024, 5, 6, 15, 2, 55, 142023), datetime_complete=datetime.datetime(2024, 5, 6, 15, 2, 55, 797641), params={'algorithm': 'lasso', 'alpha': 0.00027951955384252825}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=100.0, log=True, low=0.0001, step=None)}, trial_id=160, value=None)]

In [87]:
# generic parameters and 50 trials
study.best_trials

[FrozenTrial(number=40, state=1, values=[0.9502656323414269], datetime_start=datetime.datetime(2024, 5, 6, 14, 36, 12, 170582), datetime_complete=datetime.datetime(2024, 5, 6, 14, 36, 12, 845745), params={'algorithm': 'lasso', 'alpha': 0.00033386661330344237}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=100.0, log=True, low=0.0001, step=None)}, trial_id=40, value=None)]

In [67]:
# lower the learning rate min for better visualization with xgb
study.best_trials

[FrozenTrial(number=17, state=1, values=[0.9496878313016601], datetime_start=datetime.datetime(2024, 5, 6, 14, 14, 25, 963589), datetime_complete=datetime.datetime(2024, 5, 6, 14, 14, 26, 319467), params={'algorithm': 'ridge', 'alpha': 22.71735402737913}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=100.0, log=True, low=0.1, step=None)}, trial_id=17, value=None)]

In [41]:
# study without drop and 20 trials 
study.best_trials

[FrozenTrial(number=10, state=1, values=[0.950206827235748], datetime_start=datetime.datetime(2024, 5, 6, 13, 40, 19, 276115), datetime_complete=datetime.datetime(2024, 5, 6, 13, 40, 19, 928448), params={'algorithm': 'lasso', 'alpha': 0.0003786753353101835}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=1.0, log=True, low=0.0001, step=None)}, trial_id=10, value=None)]

[FrozenTrial(number=10, state=1, values=[0.950206827235748], datetime_start=datetime.datetime(2024, 5, 6, 13, 40, 19, 276115), datetime_complete=datetime.datetime(2024, 5, 6, 13, 40, 19, 928448), params={'algorithm': 'lasso', 'alpha': 0.0003786753353101835}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=1.0, log=True, low=0.0001, step=None)}, trial_id=10, value=None)]

In [35]:
# study with drop and 20 trials and target variable transformed 
study.best_trials

[FrozenTrial(number=2, state=1, values=[0.9499955354069816], datetime_start=datetime.datetime(2024, 5, 6, 13, 32, 15, 632163), datetime_complete=datetime.datetime(2024, 5, 6, 13, 32, 16, 348714), params={'algorithm': 'lasso', 'alpha': 0.00022003193991056288}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=1.0, log=True, low=0.0001, step=None)}, trial_id=2, value=None)]