In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn import compose, pipeline
from sklearn.model_selection import KFold
from typing import Optional
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import roc_auc_score, make_scorer, r2_score
from pandas import DataFrame, Series
import numpy as np
kf = KFold(n_splits=5, shuffle=True, random_state=42)
import xgboost as xgb
from xgboost import XGBRegressor
import optuna
from optuna import Trial
from optuna import create_study
from sklearn import compose
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LinearRegression

### Search-space functions for each model

In [22]:
def instantiate_ridge(trial: Trial) -> Ridge:
    """Build an unfitted Ridge regressor with a trial-sampled penalty.

    Args:
        trial: Optuna trial used to sample ``alpha`` on a log scale
            from the interval [0.1, 100].

    Returns:
        A ``Ridge`` estimator configured with the sampled ``alpha``.
    """
    # NOTE(review): instantiate_lasso registers a parameter with the same
    # name ("alpha") but a different range in the same study — confirm the
    # sampler handles the shared name as intended.
    alpha = trial.suggest_float("alpha", .1, 100, log=True)
    return Ridge(alpha=alpha)

def instantiate_lasso(trial: Trial) -> Lasso:
    """Build an unfitted Lasso regressor with a trial-sampled penalty.

    Args:
        trial: Optuna trial used to sample ``alpha`` on a log scale
            from the interval [1e-4, 1].

    Returns:
        A ``Lasso`` estimator configured with the sampled ``alpha``.
    """
    # NOTE(review): instantiate_ridge registers a parameter with the same
    # name ("alpha") but a different range in the same study — confirm the
    # sampler handles the shared name as intended.
    alpha = trial.suggest_float("alpha", 1e-4, 1, log=True)
    return Lasso(alpha=alpha)

def instantiate_xgb(trial: Trial) -> XGBRegressor:
    """Build an unfitted XGBRegressor with trial-sampled hyperparameters.

    The objective (squared error), the number of estimators (1000) and the
    verbosity (0) are fixed; the remaining settings are sampled from the trial.

    Args:
        trial: Optuna trial used to sample the learning rate (log scale),
            tree depth, row/column subsampling ratios and minimum child weight.

    Returns:
        An ``XGBRegressor`` configured with the fixed and sampled settings.
    """
    # Sample in the same order every time so each trial records its
    # parameters consistently.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 10)
    subsample = trial.suggest_float("subsample", 0.05, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.05, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 20)

    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=1000,
        verbosity=0,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
    )

# Union of the estimator types that instantiate_learner can return.
# NOTE(review): all three members are regressors despite the alias name.
Classifier = Ridge | Lasso | XGBRegressor

def instantiate_learner(trial : Trial) -> Classifier:
    """Let the trial pick an algorithm and build the matching estimator.

    The categorical parameter ``'algorithm'`` is sampled from ``'ridge'``,
    ``'lasso'`` and ``'xgb'``; the chosen name selects the ``instantiate_*``
    builder that samples the remaining hyperparameters from the same trial.

    Args:
        trial: Optuna trial used to sample the algorithm name and, through
            the selected builder, its hyperparameters.

    Returns:
        The unfitted estimator produced by the selected builder.

    Raises:
        ValueError: If the sampled algorithm name has no registered builder.
    """
    # Single source of truth: the keys are the categorical choices (in this
    # order) and the values are the builders, so the two cannot drift apart.
    builders = {
        'ridge': instantiate_ridge,
        'lasso': instantiate_lasso,
        'xgb': instantiate_xgb,
    }

    algorithm = trial.suggest_categorical('algorithm', list(builders))

    builder = builders.get(algorithm)
    if builder is None:
        # Fail with a clear message instead of falling through to an
        # UnboundLocalError on an unassigned result variable.
        raise ValueError(f"Unknown algorithm: {algorithm!r}")

    return builder(trial)

In [None]:
# from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
# from category_encoders import WOEEncoder

# Encoder = (
#   OrdinalEncoder |
#   OneHotEncoder 
# )

# def instantiate_encoder(trial : Trial) -> Encoder:
#     method = trial.suggest_categorical(
#     'encoding_method', ['ordinal', 'onehot']
#   )
#     if method=='ordinal':
#     encoder = instantiate_ordinal_encoder(trial)
#     elif method=='onehot':
#     encoder = instantiate_onehot_encoder(trial)
    
#     return encoder

# from sklearn.preprocessing import (
#   StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
# )



In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# def instantiate_numerical_pipeline(trial : Trial) -> Pipeline:
#     pipeline = Pipeline([
#     ('scaler', instantiate_scaler(trial))
#   ])
#     return pipeline

# def instantiate_categorical_function(trial : Trial) -> Pipeline:
#     pipeline = Pipeline([
#     ('encoder', instantiate_encoder(trial))
#   ])
#     return pipeline

def instantiate_processor(trial: Trial,
                          numerical_columns: list[str],
                          categorical_columns: list[str]) -> ColumnTransformer:
    """Build the column-wise preprocessing step.

    Numerical columns are standardised with ``StandardScaler``; categorical
    columns are one-hot encoded with unknown categories ignored.

    Args:
        trial: Optuna trial. Not consulted at present — both transformers are
            fixed — but kept in the signature so the preprocessing could be
            made trial-dependent without changing callers.
        numerical_columns: Names of the columns to scale.
        categorical_columns: Names of the columns to one-hot encode.

    Returns:
        An unfitted ``ColumnTransformer`` with the two named steps
        ``'numerical_pipeline'`` and ``'categorical_pipeline'``.
    """
    steps = [
        ('numerical_pipeline', StandardScaler(), numerical_columns),
        ('categorical_pipeline',
         OneHotEncoder(handle_unknown="ignore"),
         categorical_columns),
    ]
    return ColumnTransformer(steps)

def instantiate_model(trial : Trial, numerical_columns : list[str], 
                      categorical_columns : list[str]) -> compose.TransformedTargetRegressor:
    """Assemble the full estimator evaluated in one trial.

    The preprocessing step and the trial-selected learner are chained in a
    ``Pipeline``, which is then wrapped in a ``TransformedTargetRegressor``
    configured with ``func=np.log`` and ``inverse_func=np.exp``.

    Args:
        trial: Optuna trial used to sample the learner and its hyperparameters.
        numerical_columns: Names of the columns passed to the numerical step.
        categorical_columns: Names of the columns passed to the categorical step.

    Returns:
        An unfitted ``TransformedTargetRegressor`` wrapping the
        preprocessing + learner pipeline (not a bare ``Pipeline``).
    """
    processor = instantiate_processor(
        trial, numerical_columns, categorical_columns
    )

    learner = instantiate_learner(trial)

    model_pipe = Pipeline([
        ('processor', processor),
        ('model', learner),
    ])

    # np.log is only defined for strictly positive values, so the target
    # passed to fit() must be > 0 everywhere.
    return compose.TransformedTargetRegressor(
        regressor=model_pipe,
        func=np.log,
        inverse_func=np.exp,
    )

In [37]:
def objective(trial: Trial, X: DataFrame,
              y: np.ndarray | Series,
              numerical_columns: Optional[list[str]] = None,
              categorical_columns: Optional[list[str]] = None,
              random_state: int = 42) -> float:
    """Score one trial by 5-fold cross-validated R².

    Args:
        trial: Optuna trial used to sample the model configuration.
        X: Feature table.
        y: Target values aligned with ``X``.
        numerical_columns: Columns to treat as numerical. When ``None``, every
            column whose dtype is neither ``object`` nor ``category`` is used.
        categorical_columns: Columns to treat as categorical. When ``None``,
            every ``object`` or ``category`` column is used.
        random_state: Seed for the shuffled ``KFold`` split.

    Returns:
        The smaller of the mean and the median of the per-fold R² scores, so
        that a single unusually good fold cannot inflate the objective.
    """
    categorical_dtypes = ['object', 'category']

    if numerical_columns is None:
        numerical_columns = list(
            X.select_dtypes(exclude=categorical_dtypes).columns
        )

    if categorical_columns is None:
        categorical_columns = list(
            X.select_dtypes(include=categorical_dtypes).columns
        )

    estimator = instantiate_model(trial, numerical_columns, categorical_columns)
    folds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_scores = cross_val_score(
        estimator, X, y, scoring=make_scorer(r2_score), cv=folds
    )

    mean_score = np.mean(fold_scores)
    median_score = np.median(fold_scores)
    return np.min([mean_score, median_score])

In [38]:
# Load the table; the first CSV column is used as the row index.
df = pd.read_csv('df_normal_quality.csv', index_col=0)

# Features are every column except PID and the target; the target is SalePrice.
X = df.drop(columns=['PID', 'SalePrice']).copy()
y = df['SalePrice']

In [39]:
from optuna import create_study

# Seed the sampler so that re-running the notebook proposes the same
# hyperparameters and separate studies can be compared like for like.
study = create_study(
    study_name='optimization',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2024-05-06 13:39:12,914] A new study created in memory with name: optimization


In [40]:
# Run 20 trials; each trial cross-validates a freshly sampled model on (X, y).
study.optimize(
    lambda trial: objective(trial, X, y),
    n_trials=20,
)

[I 2024-05-06 13:39:22,576] Trial 0 finished with value: 0.9493219060468837 and parameters: {'algorithm': 'ridge', 'alpha': 7.865444908605931}. Best is trial 0 with value: 0.9493219060468837.
[I 2024-05-06 13:39:34,759] Trial 1 finished with value: -6.469745399788328 and parameters: {'algorithm': 'xgb', 'learning_rate': 0.0010113605272903524, 'max_depth': 5, 'subsample': 0.8700579565766164, 'colsample_bytree': 0.8467118630224082, 'min_child_weight': 14}. Best is trial 0 with value: 0.9493219060468837.
[I 2024-05-06 13:40:16,373] Trial 2 finished with value: 0.9466220933014494 and parameters: {'algorithm': 'xgb', 'learning_rate': 0.031692312425249475, 'max_depth': 8, 'subsample': 0.7062348936986825, 'colsample_bytree': 0.9797195042020221, 'min_child_weight': 16}. Best is trial 0 with value: 0.9493219060468837.
[I 2024-05-06 13:40:16,742] Trial 3 finished with value: 0.9489470072119932 and parameters: {'algorithm': 'ridge', 'alpha': 4.8171047989234275}. Best is trial 0 with value: 0.9493

In [41]:
# Best trial from the study run without the drop step (20 trials)
study.best_trials

[FrozenTrial(number=10, state=1, values=[0.950206827235748], datetime_start=datetime.datetime(2024, 5, 6, 13, 40, 19, 276115), datetime_complete=datetime.datetime(2024, 5, 6, 13, 40, 19, 928448), params={'algorithm': 'lasso', 'alpha': 0.0003786753353101835}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=1.0, log=True, low=0.0001, step=None)}, trial_id=10, value=None)]

In [35]:
# Best trial from the study run with the drop step (20 trials, target variable transformed)
study.best_trials

[FrozenTrial(number=2, state=1, values=[0.9499955354069816], datetime_start=datetime.datetime(2024, 5, 6, 13, 32, 15, 632163), datetime_complete=datetime.datetime(2024, 5, 6, 13, 32, 16, 348714), params={'algorithm': 'lasso', 'alpha': 0.00022003193991056288}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'algorithm': CategoricalDistribution(choices=('ridge', 'lasso', 'xgb')), 'alpha': FloatDistribution(high=1.0, log=True, low=0.0001, step=None)}, trial_id=2, value=None)]