In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
# Module-level 5-fold splitter (shuffled, fixed seed 42).
# NOTE(review): `objective` constructs its own KFold from its `random_state`
# argument, so this global looks unused in the visible cells - confirm before relying on it.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
# Approach adapted from:
# https://medium.com/@walter_sperat/using-optuna-with-sklearn-the-right-way-part-1-6b4ad0ab2451

### Instantiation functions for each regression model

In [3]:
def instantiate_ridge(trial : Trial) -> Ridge:
    """Build a Ridge regressor with an Optuna-sampled regularisation strength.

    The trial parameter "alpha" is drawn log-uniformly from [1e-4, 1000]
    and passed straight to the estimator.
    """
    alpha = trial.suggest_float("alpha", 1e-4, 1000, log=True)
    return Ridge(alpha=alpha)

def instantiate_lasso(trial : Trial) -> Lasso:
    """Build a Lasso regressor with an Optuna-sampled regularisation strength.

    The trial parameter "alpha" is drawn log-uniformly from [1e-4, 1000]
    and passed straight to the estimator.
    """
    alpha = trial.suggest_float("alpha", 1e-4, 1000, log=True)
    return Lasso(alpha=alpha)

def instantiate_en(trial : Trial) -> ElasticNet:
    """Build an ElasticNet regressor from two Optuna-sampled hyper-parameters.

    Sampled (in this order, both on a log scale):
      * "alpha"    in [1e-4, 1000]
      * "l1_ratio" in [1e-5, 1]
    """
    alpha = trial.suggest_float("alpha", 1e-4, 1000, log=True)
    l1_ratio = trial.suggest_float('l1_ratio', .00001, 1, log = True)
    return ElasticNet(alpha=alpha, l1_ratio=l1_ratio)

def instantiate_xgb(trial : Trial) -> XGBRegressor:
    """Build an XGBRegressor from Optuna-sampled hyper-parameters.

    Sampled, in order:
      * "n_estimators"     int   in [100, 1000]
      * "learning_rate"    float in [1e-4, 1], log scale
      * "max_depth"        int   in [4, 8]
      * "subsample"        float in [0.5, 1.0]
      * "colsample_bytree" float in [0.5, 1.0]
      * "min_child_weight" int   in [10, 20]
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1, log=True)
    max_depth = trial.suggest_int("max_depth", 4, 8)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 10, 20)

    return XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
    )

def instantiate_gbr(trial : Trial) -> GradientBoostingRegressor:
    """Build a GradientBoostingRegressor from Optuna-sampled hyper-parameters.

    Sampled, in order:
      * "n_estimators"     int   in [500, 1000]
      * "learning_rate"    float in [1e-4, 1], log scale
      * "max_depth"        int   in [2, 6]
      * "subsample"        float in [0.5, 1.0]
      * "min_sample_split" int   in [6, 10]  -> estimator kwarg min_samples_split
      * "min_sample_leaf"  int   in [1, 10]  -> estimator kwarg min_samples_leaf

    Note the trial names for the last two are singular ("sample"), whereas
    the estimator keywords are plural ("samples").
    """
    n_estimators = trial.suggest_int('n_estimators', 500, 1000)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1, log=True)
    max_depth = trial.suggest_int('max_depth', 2, 6)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    min_samples_split = trial.suggest_int('min_sample_split', 6, 10)
    min_samples_leaf = trial.suggest_int('min_sample_leaf', 1, 10)

    return GradientBoostingRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
    )

def instantiate_rf(trial : Trial) -> RandomForestRegressor:
    """Build a RandomForestRegressor from Optuna-sampled hyper-parameters.

    Sampled, in order:
      * "bootstrap"        categorical {True, False}
      * "n_estimators"     int in [10, 1000]
      * "max_depth"        int in [1, 100]
      * "min_sample_split" int in [2, 10]  -> estimator kwarg min_samples_split
      * "min_sample_leaf"  int in [1, 10]  -> estimator kwarg min_samples_leaf
    """
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    n_estimators = trial.suggest_int('n_estimators', 10, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 100)
    min_samples_split = trial.suggest_int('min_sample_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_sample_leaf', 1, 10)

    return RandomForestRegressor(
        bootstrap=bootstrap,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
    )

def instantiate_ada(trial : Trial) -> AdaBoostRegressor:
    """Build an AdaBoostRegressor from Optuna-sampled hyper-parameters.

    Sampled, in order:
      * "learning_rate" float in [1e-4, 1], log scale
      * "loss"          categorical {'linear', 'square', 'exponential'}
      * "n_estimators"  int in [1, 1000]
    """
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1, log=True)
    loss = trial.suggest_categorical('loss',['linear', 'square', 'exponential'])
    n_estimators = trial.suggest_int('n_estimators', 1, 1000)

    return AdaBoostRegressor(
        learning_rate=learning_rate,
        loss=loss,
        n_estimators=n_estimators,
    )


### Instantiate function for regression model selection

In [4]:
# Union of every estimator type that `instantiate_learner` may return.
# All members are regressors; the alias name is kept as-is for its users.
Classifier = (
    Ridge | Lasso | ElasticNet
    | XGBRegressor
    | GradientBoostingRegressor | RandomForestRegressor | AdaBoostRegressor
)

def instantiate_learner(trial : Trial) -> Classifier:
    """Let Optuna pick a regression algorithm, then build it for this trial.

    The categorical trial parameter "algorithm" selects one of the
    per-model factory functions; only the selected factory is invoked, so
    only that model's hyper-parameters are sampled in the trial.
    """
    # Insertion order of this mapping defines the order of the choices.
    factories = {
        'ridge': instantiate_ridge,
        'lasso': instantiate_lasso,
        'en': instantiate_en,
        'xgb': instantiate_xgb,
        'gbr': instantiate_gbr,
        'rf': instantiate_rf,
        'ada': instantiate_ada,
    }

    algorithm = trial.suggest_categorical('algorithm', list(factories))
    return factories[algorithm](trial)

### Instantiate functions for encoding categorical columns 

In [5]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
#from category_encoders import WOEEncoder

def instantiate_ordinal_encoder(trial: Trial)-> OrdinalEncoder:
    """Return an OrdinalEncoder that maps unseen categories to -1.

    Nothing is sampled here; `trial` is accepted but not used.
    """
    return OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

def instantiate_onehot_encoder(trial: Trial)-> OneHotEncoder:
    """Return a OneHotEncoder that ignores unseen categories.

    The categorical trial parameter "drop" (None or 'first') decides
    whether the encoder is asked to drop the first level of each feature.
    """
    drop = trial.suggest_categorical('drop', [None, 'first'])
    return OneHotEncoder(handle_unknown='ignore', drop=drop)
    
# Union of the categorical encoders `instantiate_encoder` may return.
Encoder = OrdinalEncoder | OneHotEncoder

def instantiate_encoder (trial : Trial) -> Encoder:
    """Let Optuna choose between ordinal and one-hot encoding, then build it.

    The choice is recorded as the categorical trial parameter
    "encoding_method"; only the chosen encoder's factory is called.
    """
    encoding_method = trial.suggest_categorical(
        'encoding_method', ['ordinal', 'onehot'])

    factories = {
        'ordinal': instantiate_ordinal_encoder,
        'onehot': instantiate_onehot_encoder,
    }
    return factories[encoding_method](trial)

### Instantiate functions for scaling numerical columns

In [6]:

# Union of the numerical scalers `instantiate_scaler` may return.
Scaler = StandardScaler | MinMaxScaler | MaxAbsScaler | RobustScaler

def instantiate_scaler(trial : Trial) -> Scaler:
    """Let Optuna choose a scaler for the numerical columns and build it.

    The choice is recorded as the categorical trial parameter
    "scaling_method"; the selected scaler is created with default settings.
    """
    method = trial.suggest_categorical(
        'scaling_method', ['standard', 'minmax', 'maxabs', 'robust'])

    # Map each choice to its class; only the chosen class is instantiated.
    scaler_classes = {
        'standard': StandardScaler,
        'minmax': MinMaxScaler,
        'maxabs': MaxAbsScaler,
        'robust': RobustScaler,
    }
    return scaler_classes[method]()

### Instantiate function to scale and encode 

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def instantiate_processor(trial : Trial, 
                          numerical_columns : list[str], 
                          categorical_columns : list[str]) -> ColumnTransformer:
    """Assemble the preprocessing step: scale numerics, encode categoricals.

    Parameters
    ----------
    trial : Trial
        Optuna trial used to sample the scaler and the encoder.
    numerical_columns : list[str]
        Columns routed to the sampled scaler.
    categorical_columns : list[str]
        Columns routed to the sampled encoder.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with the steps 'numerical_pipeline' and
        'categorical_pipeline'.
    """
    # The scaler is sampled before the encoder, which fixes the order in
    # which the trial registers "scaling_method" and "encoding_method".
    scaler = instantiate_scaler(trial)
    encoder = instantiate_encoder(trial)

    transformers = [
        ('numerical_pipeline', scaler, numerical_columns),
        ('categorical_pipeline', encoder, categorical_columns),
    ]
    return ColumnTransformer(transformers)

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> compose.TransformedTargetRegressor:
    """Build the full model for one trial: preprocessing + learner on a log target.

    Parameters
    ----------
    trial : Trial
        Optuna trial used to sample the preprocessing and the learner.
    numerical_columns : list[str]
        Columns handed to the sampled scaler.
    categorical_columns : list[str]
        Columns handed to the sampled encoder.

    Returns
    -------
    compose.TransformedTargetRegressor
        Wraps a two-step Pipeline ('processor' -> 'model'). The target is
        transformed with np.log before fitting and predictions are mapped
        back with np.exp. The return annotation previously said `Pipeline`,
        which did not match the object actually returned.
    """
    # Preprocessing is sampled before the learner, which fixes the order in
    # which parameters are registered on the trial.
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )
    learner = instantiate_learner(trial)

    model_pipe = Pipeline([
        ('processor', processor),
        ('model', learner),
    ])

    return compose.TransformedTargetRegressor(
        regressor=model_pipe, func=np.log, inverse_func=np.exp
    )

In [8]:
def objective(trial : Trial, X : DataFrame,
              y : np.ndarray | Series, 
              numerical_columns : Optional[list[str]]=None, 
              categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
    """Optuna objective: cross-validated R^2 of the model sampled for `trial`.

    Parameters
    ----------
    trial : Trial
        Trial used to sample the whole model (preprocessing + learner).
    X : DataFrame
        Feature table.
    y : np.ndarray | Series
        Target values.
    numerical_columns, categorical_columns : list[str], optional
        Column groups. When omitted they are inferred from dtypes:
        'object'/'category' columns are categorical, everything else numerical.
    random_state : int
        Seed for the shuffled 5-fold split.

    Returns
    -------
    float
        The smaller of the mean and the median of the five fold scores.
    """
    categorical_dtypes = ['object', 'category']

    if numerical_columns is None:
        numerical_columns = list(
            X.select_dtypes(exclude=categorical_dtypes).columns)

    if categorical_columns is None:
        categorical_columns = list(
            X.select_dtypes(include=categorical_dtypes).columns)

    estimator = instantiate_model(trial, numerical_columns, categorical_columns)
    splitter = KFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(
        estimator, X, y, scoring=make_scorer(r2_score), cv=splitter)

    # Taking the lower of mean and median gives a conservative summary.
    return np.minimum(np.mean(fold_scores), np.median(fold_scores))

In [9]:
# Load the dataset; the first CSV column is used as the index.
df = pd.read_csv('df_normal_quality.csv', index_col=0)

# Target is SalePrice; features are everything except the PID and SalePrice columns.
y = df['SalePrice']
X = df.drop(columns=['PID', 'SalePrice']).copy()

In [10]:
from optuna import create_study
from optuna.samplers import TPESampler

# Seed the sampler so the sequence of suggested hyper-parameters is
# repeatable across kernel restarts.
# NOTE: the estimators built in the instantiate_* functions are created
# without a random_state, so scores of the stochastic models can still vary.
study = create_study(
    study_name='optimization',
    direction='maximize',
    sampler=TPESampler(seed=42),
)

[I 2024-07-08 09:50:33,536] A new study created in memory with name: optimization


In [None]:
def _objective_for_study(trial):
    """Bind the notebook's X and y so Optuna can call the objective with a trial only."""
    return objective(trial, X, y)

study.optimize(_objective_for_study, n_trials=200)

[I 2024-07-08 09:50:57,985] Trial 0 finished with value: -0.029441307066881618 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'lasso', 'alpha': 1.596299953148117}. Best is trial 0 with value: -0.029441307066881618.
[I 2024-07-08 09:52:42,088] Trial 1 finished with value: 0.8393679950758675 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'ordinal', 'algorithm': 'rf', 'bootstrap': False, 'n_estimators': 246, 'max_depth': 64, 'min_sample_split': 3, 'min_sample_leaf': 10}. Best is trial 1 with value: 0.8393679950758675.
[I 2024-07-08 09:53:11,436] Trial 2 finished with value: 0.8641449377737869 and parameters: {'scaling_method': 'standard', 'encoding_method': 'ordinal', 'algorithm': 'ada', 'learning_rate': 0.17448906672786785, 'loss': 'linear', 'n_estimators': 279}. Best is trial 2 with value: 0.8641449377737869.
[I 2024-07-08 09:53:40,784] Trial 3 finished with value: 0.8946663651310004 and parameters: {'scaling_me

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 09:56:54,429] Trial 13 finished with value: 0.9470588749996617 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 0.00011010104040317402, 'l1_ratio': 0.0001455025676712381}. Best is trial 13 with value: 0.9470588749996617.
[I 2024-07-08 09:56:54,805] Trial 14 finished with value: -0.025586122789539933 and parameters: {'scaling_method': 'minmax', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 199.93883658061745, 'l1_ratio': 7.950477271767913e-05}. Best is trial 13 with value: 0.9470588749996617.
[I 2024-07-08 09:56:56,103] Trial 15 finished with value: 0.9506146499925329 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': None, 'algorithm': 'en', 'alpha': 0.005798418815720318, 'l1_ratio': 0.03787907308950946}. Best is trial 15 wi

[I 2024-07-08 09:58:22,916] Trial 27 finished with value: 0.1900980953448697 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.15243420261017115, 'l1_ratio': 0.31606174092095934}. Best is trial 15 with value: 0.9506146499925329.
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 09:58:25,815] Trial 28 finished with value: 0.9489120573469423 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.0008608564486929174, 'l1_ratio': 0.02884428444223919}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 09:58:26,359] Trial 29 finished with value: -0.029441307066881618 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm':

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
[I 2024-07-08 10:00:30,764] Trial 31 finished with value: 0.9498560012919262 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.0008420495381024598, 'l1_ratio': 0.08792956359810299}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:03:59,158] Trial 32 finished with value: 0.8882791010176795 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'rf', 'bootstrap': True, 'n_estimators': 577, 'max_depth': 92, 'min_sample_split': 10, 'min_sample_leaf': 10}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:03:59,667] Trial 33 finished with value: 0.7368214868842125 and parameters: {'scaling_method': 'maxabs', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.05566575548888979, 'l1_ratio': 0.11820072246312327}. Best is trial 15 wit

[I 2024-07-08 10:06:44,164] Trial 37 finished with value: 0.745408937325795 and parameters: {'scaling_method': 'standard', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'ada', 'learning_rate': 0.0005601672835746196, 'loss': 'square', 'n_estimators': 405}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:06:44,614] Trial 38 finished with value: 0.941864120697199 and parameters: {'scaling_method': 'robust', 'encoding_method': 'ordinal', 'algorithm': 'lasso', 'alpha': 0.0005793817977169322}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:06:45,157] Trial 39 finished with value: 0.9411548656561404 and parameters: {'scaling_method': 'robust', 'encoding_method': 'onehot', 'drop': 'first', 'algorithm': 'en', 'alpha': 0.006359993639482432, 'l1_ratio': 0.6251098987933636}. Best is trial 15 with value: 0.9506146499925329.
[I 2024-07-08 10:06:45,495] Trial 40 finished with value: 0.9407217253975555 and parameters: {'scaling_method': 'standard', 'en

In [11]:
# Show the best trial of the study. The recorded output below is a ValueError
# ("No trials are completed yet."), i.e. the cell was run on a study without
# any finished trial - run the optimisation cell first.
study.best_trial

ValueError: No trials are completed yet.

In [23]:
# Collect every trial of the study into a DataFrame (one row per trial).
#
# The trial list is fetched once: get_trials() copies the list by default, so
# calling it per field and per row is needlessly quadratic. deepcopy=False is
# safe here because the trials are only read.
recorded_trials = study.get_trials(deepcopy=False)

# Row count follows the study itself rather than a hard-coded 200, so an
# interrupted or extended study does not raise IndexError or drop trials.
trials = range(len(recorded_trials))

trial_number = [t.number for t in recorded_trials]
score = [t.value for t in recorded_trials]
# .get(): a trial that stopped before "algorithm" was sampled has no such key.
algorithm = [t.params.get('algorithm') for t in recorded_trials]
parameters = [list(t.params.items()) for t in recorded_trials]

optuna_dict = {
    'Trial': trial_number,
    'Score': score,
    'Algorithm': algorithm,
    'Parameters': parameters,
}

df_optuna = pd.DataFrame.from_dict(optuna_dict)

In [None]:
# Persist the trial table (index included) next to the notebook.
df_optuna.to_csv('optuna_all_200.csv')

In [24]:
# For each algorithm keep the row(s) whose score equals that algorithm's best.
# The aggregation is named by the string 'max': passing the Python builtin
# `max` to GroupBy.transform is deprecated in recent pandas releases.
best_score_per_algorithm = df_optuna.groupby('Algorithm')['Score'].transform('max')
idx = best_score_per_algorithm == df_optuna['Score']
df_optuna[idx]

Unnamed: 0,Trial,Score,Algorithm,Parameters
3,3,0.391197,lasso,"[(scaling_method, standard), (encoding_method,..."
20,20,0.876856,ada,"[(scaling_method, robust), (encoding_method, o..."
24,24,0.945325,gbr,"[(scaling_method, maxabs), (encoding_method, o..."
25,25,0.89028,en,"[(scaling_method, maxabs), (encoding_method, o..."
32,32,0.944913,xgb,"[(scaling_method, robust), (encoding_method, o..."
173,173,0.950048,ridge,"[(scaling_method, robust), (encoding_method, o..."
179,179,0.908172,rf,"[(scaling_method, robust), (encoding_method, o..."


In [191]:
# Show (row label, parameter list) pairs for every trial.
list(df_optuna['Parameters'].items())

[(0,
  [('scaling_method', 'robust'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'lasso'),
   ('alpha', 485.91494069511094)]),
 (1,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'gbr'),
   ('n_estimators', 436),
   ('learning_rate', 0.0012861054183096832),
   ('max_depth', 8),
   ('min_sample_split', 7)]),
 (2,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'ordinal'),
   ('algorithm', 'ridge'),
   ('alpha', 2.9564732483382494)]),
 (3,
  [('scaling_method', 'maxabs'),
   ('encoding_method', 'ordinal'),
   ('algorithm', 'ridge'),
   ('alpha', 17.622180822136105)]),
 (4,
  [('scaling_method', 'standard'),
   ('encoding_method', 'onehot'),
   ('drop', None),
   ('algorithm', 'ridge'),
   ('alpha', 11.016938443949593)]),
 (5,
  [('scaling_method', 'minmax'),
   ('encoding_method', 'onehot'),
   ('drop', 'first'),
   ('algorithm', 'ridge'),
   ('alpha', 72.72185663024044)]),
 (6,
  [('scaling_method

In [193]:
# All trials in which the lasso model was selected.
df_optuna.loc[df_optuna['Algorithm'] == 'lasso']

Unnamed: 0,Trial,Score,Algorithm,Parameters
0,0,-0.029441,lasso,"[(scaling_method, robust), (encoding_method, o..."
7,7,0.158099,lasso,"[(scaling_method, maxabs), (encoding_method, o..."
9,9,-0.029441,lasso,"[(scaling_method, maxabs), (encoding_method, o..."
28,28,-0.029441,lasso,"[(scaling_method, robust), (encoding_method, o..."
29,29,-0.029441,lasso,"[(scaling_method, robust), (encoding_method, o..."
...,...,...,...,...
193,193,0.950580,lasso,"[(scaling_method, robust), (encoding_method, o..."
194,194,0.943710,lasso,"[(scaling_method, robust), (encoding_method, o..."
195,195,0.948354,lasso,"[(scaling_method, robust), (encoding_method, o..."
196,196,0.950578,lasso,"[(scaling_method, robust), (encoding_method, o..."


In [26]:
# Look-ups for the other algorithms, left disabled:
# print('lasso', df_optuna.Parameters[169])
# print('ridge', df_optuna.Parameters[53])
# print('xgb', df_optuna.Parameters[137])
# print('gbr', df_optuna.Parameters[135])

# Full parameter list of the row labelled 179 (a random-forest trial).
print('rf', df_optuna.loc[179, 'Parameters'])

rf [('scaling_method', 'robust'), ('encoding_method', 'onehot'), ('drop', None), ('algorithm', 'rf'), ('bootstrap', True), ('n_estimators', 355), ('max_depth', 67), ('min_sample_split', 5), ('min_sample_leaf', 1)]


In [1]:
import plotly.express as px

# NOTE(review): the recorded run of this cell failed with NameError because
# `df_optuna` did not exist in that kernel session; run the cell that builds
# it before this one.

# Only trials with a positive score are plotted.
positive_scores = df_optuna[df_optuna['Score'] > 0]

fig = px.scatter(
    positive_scores,
    x="Trial",
    y="Score",
    color="Algorithm",
    hover_data=['Parameters'],
)

# Small white hover box so the long parameter lists stay readable.
hover_style = {
    "bgcolor": "white",
    "font_size": 8,
    "font_family": "Rockwell",
}
fig.update_layout(hoverlabel=hover_style)

fig.show()

NameError: name 'df_optuna' is not defined