## 5. Modelagem - Boosting

Etapa 1 - Importando a base já trabalhada

In [1]:
import pickle

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

from utils import functions

In [2]:
# Load the analytical base table (ABT) from its pickled artifact.
# NOTE: unpickling can execute arbitrary code; only load artifacts you trust.
with open('./artefatos/ABT/abt.pkl', mode='rb') as abt_file:
    abt = pickle.load(abt_file)

Etapa 2 - Fazendo o split
* Separando 20% dos dados para teste
* Mantendo a taxa de evento original do problema

In [3]:
# Split the ABT into the target column (y) and the remaining predictors (X)
y = abt['class']
X = abt.drop(columns=['class'])

# The full table is not used again; release the reference
del abt

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratifying on y keeps the event rate
# of the target the same in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Sanity check: print the shape of each partition (train first, then test)
for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

(48000, 170) (48000,)
(12000, 170) (12000,)


Etapa 3 - Data Preparation

> Pipeline: Numéricos
* Imputando a mediana para valores faltantes (missings)
* Transformação por quantis
* Escalonamento por padronização

> Pipeline: Categóricos (cluster e anomaly)
* Imputando a moda
* One Hot Encoding

> Salvar o Artefato

In [5]:
# Column groups by dtype: 'object' columns are treated as categorical,
# every other column as numeric
dp_boost_cat_vars = X.select_dtypes(include='object').columns
dp_boost_num_vars = X.select_dtypes(exclude='object').columns

# Keep the row labels of each partition so the transformed frames can reuse them
train_idx = X_train.index
test_idx = X_test.index

# Numeric branch: median imputation -> quantile transform -> standardisation
dp_boost_num_pipeline = Pipeline(steps=[
    ('dp_boost_num_inputer', SimpleImputer(strategy='median')),
    ('dp_boost_num_transformer', QuantileTransformer(random_state=1)),
    ('dp_boost_num_scaler', StandardScaler()),
])

# Categorical branch: mode imputation -> one-hot encoding
# (dense int8 output, first level of each feature dropped)
dp_boost_cat_pipeline = Pipeline(steps=[
    ('dp_boost_cat_inputer', SimpleImputer(strategy='most_frequent')),
    ('dp_boost_cat_encoder', OneHotEncoder(sparse_output=False, drop='first', dtype=np.int8)),
])

# Route each column group to its branch
dp_boost_preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', dp_boost_num_pipeline, dp_boost_num_vars),
    ('cat_pipeline', dp_boost_cat_pipeline, dp_boost_cat_vars),
])

# Single-step outer pipeline wrapping the preprocessor
dp_boost_pipeline = Pipeline(steps=[('preprocessor', dp_boost_preprocessor)])

In [6]:
# Fit the preprocessing pipeline on the training partition only
dp_boost_pipeline = dp_boost_pipeline.fit(X_train, y_train)

# Names of the one-hot encoded columns. They are resolved through the fitted
# categorical sub-pipeline (imputer -> encoder) instead of passing
# `dp_boost_cat_vars` back in: this cell overwrites that variable with the
# encoded names, so feeding it to the encoder again made a re-run of the cell fail.
dp_boost_cat_vars = (
    dp_boost_pipeline.named_steps['preprocessor']
    .named_transformers_['cat_pipeline']
    .get_feature_names_out()
)

# Output column order of the ColumnTransformer: numeric block first, then categorical
dp_boost_vars = list(dp_boost_num_vars) + list(dp_boost_cat_vars)

# Transformed training data, keeping the original row labels
X_train_boosting = pd.DataFrame(
    data=dp_boost_pipeline.transform(X_train),
    columns=dp_boost_vars,
    index=train_idx,
)

# Transformed test data, using the pipeline fitted on the training partition
X_test_boosting = pd.DataFrame(
    data=dp_boost_pipeline.transform(X_test),
    columns=dp_boost_vars,
    index=test_idx,
)

In [7]:
# Persist the fitted preprocessing pipeline as an artifact
with open('./artefatos/Boosting/pipeline.pkl', mode='wb') as pipeline_file:
    pickle.dump(dp_boost_pipeline, pipeline_file)

In [9]:
# Persist the pipeline's output column names (in output order) next to the pipeline
features = X_train_boosting.columns.to_list()
with open('./artefatos/Boosting/pipeline_feature_names_out.pkl', mode='wb') as features_file:
    pickle.dump(features, features_file)

Etapa 4 - Feature Selection

* Removendo Features com Alta Correlação
* Removendo Features com baixa variância
* Removendo Features via RFE

In [8]:
# Correlation filter: the helper receives only the training frame and returns
# the filtered frame together with the list of columns it removed
X_train_boosting_fs, fs_corr_vars_to_drop = functions.remove_highly_correlated_features(
    df=X_train_boosting,
    threshold=0.6,
)

# Drop the same columns from the test partition
X_test_boosting_fs = X_test_boosting.drop(columns=fs_corr_vars_to_drop)

# Both partitions must expose the same columns in the same order
assert (X_train_boosting_fs.columns == X_test_boosting_fs.columns).all()

In [9]:
# Variance filter: fit the selector (default threshold) on the training partition
var_fs = VarianceThreshold().fit(X_train_boosting_fs)

# Columns the selector rejects are the ones outside its support mask
low_variance_mask = ~var_fs.get_support()
fs_VAR_vars_to_drop = X_train_boosting_fs.columns[low_variance_mask].to_list()

# Drop the rejected columns from both partitions
X_train_boosting_fs = X_train_boosting_fs.drop(columns=fs_VAR_vars_to_drop)
X_test_boosting_fs = X_test_boosting_fs.drop(columns=fs_VAR_vars_to_drop)

# Both partitions must still expose the same columns in the same order
assert (X_train_boosting_fs.columns == X_test_boosting_fs.columns).all()

In [None]:
# Keep the columns that survived feature selection in memory; the tuning
# objectives below use this list to filter the pipeline output.
# NOTE(review): the list is not written to disk here - confirm whether it should be persisted.
selected_features = list(X_train_boosting_fs.columns)

Etapa 5 - Teste de Modelo Pós Data Preparation

> Testar os seguintes modelos:

* GradientBoosting
* Light GBM
* XGBoost

> Para avaliar os modelos de forma que faça sentido para o negócio, não podemos levar em consideração apenas uma métrica em específico. Isso porque busca-se o cenário em que a empresa tenha o menor custo possível com as manutenções. Segundo as instruções da empresa, temos que:


1. Se um caminhão é mandado para manutenção, porém não possui um defeito, será cobrado $10

2. Se um caminhão é mandado para manutenção, e possui um defeito, será cobrado $25

3. Se um caminhão não é mandado para manutenção, e possui um defeito, será cobrado $500


> Observando o cenário acima, fica claro que existe uma preocupação muito grande com Falsos Negativos (FN), uma vez que seu custo é 20 vezes o de um Verdadeiro Positivo (TP) e 50 vezes o de um Falso Positivo (FP). Dessa forma, podemos criar uma função de custo (Loss Function) padronizada para avaliar o modelo aos olhos do negócio da seguinte maneira:


$$C_{total} = (FP \times V_{FP}) + (TP \times V_{TP}) + (FN \times V_{FN})$$

Onde $V$ indica o custo associado ao tipo de resultado (FP, TP ou FN) indicado no subscrito.

Logo, para o nosso problema, temos que:

$$C_{total} = (FP \times 10) + (TP \times 25) + (FN \times 500)$$

In [11]:
# Baseline comparison: fit each boosting implementation with its default
# hyperparameters on the selected features and report metrics plus cost
models = [
    GradientBoostingClassifier(random_state=1),
    XGBClassifier(random_state=1, n_jobs=-1),
    LGBMClassifier(random_state=1, n_jobs=-1)
]

for model in models:
    # Take the class name from the type itself; slicing repr() up to the first
    # "(" silently drops the last character when the repr has no parenthesis
    model_name = type(model).__name__

    # Training
    model.fit(X_train_boosting_fs, y_train)

    # Evaluation on both partitions
    metrics = functions.calculate_metrics(model_name, model, X_train_boosting_fs, y_train, X_test_boosting_fs, y_test)
    display(metrics)

    # Cost of the test-set predictions according to functions.loss_function
    custo = functions.loss_function(y_test, model.predict(X_test_boosting_fs))
    print(f'Modelo: {model_name} // Custo: {custo}')

Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,0.994396,0.899248,0.7475,0.990072,0.980143,0.923199
1,GradientBoostingClassifier,Teste,0.991083,0.756906,0.685,0.980002,0.960004,0.903475


Modelo: GradientBoostingClassifier // Custo: 35365


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,XGBClassifier,Treino,0.999646,1.0,0.97875,0.999951,0.999902,0.996081
1,XGBClassifier,Teste,0.9925,0.816092,0.71,0.981223,0.962445,0.903983


Modelo: XGBClassifier // Custo: 32870
[LightGBM] [Info] Number of positive: 800, number of negative: 47200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6170
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.016667 -> initscore=-4.077537
[LightGBM] [Info] Start training from score -4.077537


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LGBMClassifier,Treino,0.9995,0.996164,0.97375,0.999987,0.999975,0.999131
1,LGBMClassifier,Teste,0.992667,0.833333,0.7,0.976168,0.952336,0.915339


Modelo: LGBMClassifier // Custo: 33780


Etapa 6 - Tuning de Hiperparâmetros e Cross Validation

> Tunar os seguintes modelos, levando em consideração a função de custo:

* GradientBoosting
* Light GBM
* XGBoost

> Light GBM

In [12]:
import optuna
from sklearn.model_selection import StratifiedKFold

# Patience used by the early-stopping callback of the Optuna study
EARLY_STOPPING = 15


class EarlyStoppingExceeded(optuna.exceptions.OptunaError):
    """Raised by ``early_stopping_opt`` to abort a study that stopped improving.

    The class attributes double as the callback's shared bookkeeping.
    """

    # Best study value seen by the callback (None until its first invocation)
    best_score = None
    # Number of callback invocations since the last improvement
    early_stop_count = 0
    # The callback raises once ``early_stop_count`` exceeds this value
    early_stop = EARLY_STOPPING

def early_stopping_opt(study, trial):
    """Optuna callback that aborts the study after a run of non-improving trials.

    Bookkeeping lives on the ``EarlyStoppingExceeded`` class attributes
    (``best_score``, ``early_stop_count``, ``early_stop``).

    Args:
        study: object exposing ``best_value`` (lower is better).
        trial: the trial that just finished; not used.

    Raises:
        EarlyStoppingExceeded: when the callback is invoked while the
            non-improvement counter already exceeds ``early_stop``. The shared
            state is fully reset before raising.
    """
    state = EarlyStoppingExceeded

    # First invocation: seed the reference score with the study's current best.
    # That same invocation then falls through and counts as "no improvement".
    if state.best_score is None:
        state.best_score = study.best_value

    if study.best_value < state.best_score:
        # New best value: remember it and restart the patience counter
        state.best_score = study.best_value
        state.early_stop_count = 0
        return

    if state.early_stop_count > state.early_stop:
        # Reset the class-level state (not a local variable) so that a later
        # study does not start from this study's best score
        state.early_stop_count = 0
        state.best_score = None
        raise EarlyStoppingExceeded()

    state.early_stop_count += 1
    return

def objective(trial):
    """Optuna objective for LightGBM: mean cross-validated cost of one configuration.

    Reads the notebook globals ``X_train``, ``y_train``, ``dp_boost_pipeline``,
    ``dp_boost_vars`` and ``selected_features``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The average over the stratified folds of ``functions.loss_function``
        evaluated on each validation fold (the study minimises this value).
    """
    # Hyperparameter search space
    # NOTE(review): per LightGBM's parameter docs several pairs below appear to be
    # aliases of one underlying parameter (feature_fraction/colsample_bytree,
    # bagging_fraction/subsample, bagging_freq/subsample_freq,
    # min_data_in_leaf/min_child_samples), so one value of each pair is likely
    # ignored - confirm and prune the search space.
    params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 20.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 20.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.05, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.05, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 1000),
        "max_bin": trial.suggest_int("max_bin", 100, 1000),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0)
    }

    # Stratified cross-validation over the training partition
    cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    total_cost = 0

    for fold_train_pos, fold_valid_pos in cv.split(X_train, y_train):
        # Fresh model per fold
        model = LGBMClassifier(**params, random_state=1, n_jobs=-1)

        # Predictors and target for this fold
        X_train_cv = X_train.iloc[fold_train_pos]
        X_validation_cv = X_train.iloc[fold_valid_pos]
        y_train_cv = y_train.iloc[fold_train_pos]
        y_validation_cv = y_train.iloc[fold_valid_pos]

        # Fit an unfitted copy of the preprocessing pipeline on this fold.
        # Refitting the global `dp_boost_pipeline` here would leave it fitted on
        # the last CV fold instead of the full training partition.
        fold_pipeline = clone(dp_boost_pipeline)

        # Data preparation on the fold's training rows, then keep only the
        # columns chosen during feature selection
        X_train_cv_processed = pd.DataFrame(
            data=fold_pipeline.fit_transform(X_train_cv, y_train_cv),
            columns=dp_boost_vars,
        )[selected_features]

        # Same preparation applied (transform only) to the validation rows
        X_validation_cv_processed = pd.DataFrame(
            data=fold_pipeline.transform(X_validation_cv),
            columns=dp_boost_vars,
        )[selected_features]

        # Train, predict and accumulate the fold's cost
        model.fit(X_train_cv_processed, y_train_cv)
        y_pred = model.predict(X_validation_cv_processed)
        total_cost += functions.loss_function(y_validation_cv, y_pred)

    # Average over the actual number of folds rather than a hard-coded 5
    return total_cost / cv.get_n_splits()

# Create the persisted study, or resume it if it already exists in the sqlite
# file: without `load_if_exists` a re-run of this cell fails because the study
# name is already taken. The sampler is seeded like the rest of the notebook.
study = optuna.create_study(
    direction='minimize',
    study_name='LGBM_v01',
    storage='sqlite:///LGBM_v01.db',
    sampler=optuna.samplers.TPESampler(seed=1),
    load_if_exists=True,
)

# Optimise for at most two hours; the callback aborts earlier by raising
# EarlyStoppingExceeded once the study stops improving
try:
    study.optimize(objective, timeout=7200, callbacks=[early_stopping_opt])

except EarlyStoppingExceeded:
    print(f'Sem novas melhorias a partir da iteração {len(study.trials)}')

# Best trial found (lowest objective value)
best = study.best_trial

print(f'Melhor score: {best.value}')

[I 2024-07-12 17:48:43,329] A new study created in RDB with name: LGBM_v01
[I 2024-07-12 17:48:51,929] Trial 0 finished with value: 80000.0 and parameters: {'lambda_l1': 8.26578265308799e-07, 'lambda_l2': 0.8979220317570481, 'num_leaves': 212, 'feature_fraction': 0.7482983220747398, 'bagging_fraction': 0.41361673046197533, 'bagging_freq': 1, 'min_child_samples': 57, 'learning_rate': 0.0030831359239263303, 'max_depth': 20, 'min_data_in_leaf': 131, 'max_bin': 206, 'subsample': 0.694814856370432, 'colsample_bytree': 0.5679566074579296, 'subsample_freq': 6, 'min_split_gain': 0.7274415161160845}. Best is trial 0 with value: 80000.0.
[I 2024-07-12 17:49:00,739] Trial 1 finished with value: 30076.0 and parameters: {'lambda_l1': 6.845658675102691e-08, 'lambda_l2': 1.2384798722380232e-06, 'num_leaves': 213, 'feature_fraction': 0.7117381458600236, 'bagging_fraction': 0.5909716352672788, 'bagging_freq': 3, 'min_child_samples': 38, 'learning_rate': 0.09493558032042398, 'max_depth': 17, 'min_data_i

Sem novas melhorias a partir da iteração 19
Melhor score: 30076.0


In [13]:
# Persist the best LightGBM trial (the `best` object from the study above)
with open('./artefatos/LGBM/LGBM_best_config.pkl', 'wb') as best_config_file:
    pickle.dump(best, best_config_file)

> XGBoost

In [14]:
import optuna
from sklearn.model_selection import StratifiedKFold

# Patience used by the early-stopping callback of the Optuna study
EARLY_STOPPING = 15


class EarlyStoppingExceeded(optuna.exceptions.OptunaError):
    """Raised by ``early_stopping_opt`` to abort a study that stopped improving.

    The class attributes double as the callback's shared bookkeeping.
    """

    # Best study value seen by the callback (None until its first invocation)
    best_score = None
    # Number of callback invocations since the last improvement
    early_stop_count = 0
    # The callback raises once ``early_stop_count`` exceeds this value
    early_stop = EARLY_STOPPING

def early_stopping_opt(study, trial):
    """Optuna callback that aborts the study after a run of non-improving trials.

    Bookkeeping lives on the ``EarlyStoppingExceeded`` class attributes
    (``best_score``, ``early_stop_count``, ``early_stop``).

    Args:
        study: object exposing ``best_value`` (lower is better).
        trial: the trial that just finished; not used.

    Raises:
        EarlyStoppingExceeded: when the callback is invoked while the
            non-improvement counter already exceeds ``early_stop``. The shared
            state is fully reset before raising.
    """
    state = EarlyStoppingExceeded

    # First invocation: seed the reference score with the study's current best.
    # That same invocation then falls through and counts as "no improvement".
    if state.best_score is None:
        state.best_score = study.best_value

    if study.best_value < state.best_score:
        # New best value: remember it and restart the patience counter
        state.best_score = study.best_value
        state.early_stop_count = 0
        return

    if state.early_stop_count > state.early_stop:
        # Reset the class-level state (not a local variable) so that a later
        # study does not start from this study's best score
        state.early_stop_count = 0
        state.best_score = None
        raise EarlyStoppingExceeded()

    state.early_stop_count += 1
    return

def objective(trial):
    """Optuna objective for XGBoost: mean cross-validated cost of one configuration.

    Reads the notebook globals ``X_train``, ``y_train``, ``dp_boost_pipeline``,
    ``dp_boost_vars`` and ``selected_features``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The average over the stratified folds of ``functions.loss_function``
        evaluated on each validation fold (the study minimises this value).
    """
    # Hyperparameter search space
    # NOTE(review): the LightGBM log in this notebook reports 800 positives vs
    # 47200 negatives in the training split (~59:1), well above the upper bound
    # of 10 sampled for scale_pos_weight - confirm the range is intentional.
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "verbosity": 0,
        "booster": "gbtree",
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "eta": trial.suggest_float("eta", 1e-4, 1e-1, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
        "max_bin": trial.suggest_int("max_bin", 100, 1000),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1e-2, 10.0, log=True),
    }

    # Stratified cross-validation over the training partition
    cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    total_cost = 0

    for fold_train_pos, fold_valid_pos in cv.split(X_train, y_train):
        # Fresh model per fold
        model = XGBClassifier(**params, random_state=1, n_jobs=-1)

        # Predictors and target for this fold
        X_train_cv = X_train.iloc[fold_train_pos]
        X_validation_cv = X_train.iloc[fold_valid_pos]
        y_train_cv = y_train.iloc[fold_train_pos]
        y_validation_cv = y_train.iloc[fold_valid_pos]

        # Fit an unfitted copy of the preprocessing pipeline on this fold.
        # Refitting the global `dp_boost_pipeline` here would leave it fitted on
        # the last CV fold instead of the full training partition.
        fold_pipeline = clone(dp_boost_pipeline)

        # Data preparation on the fold's training rows, then keep only the
        # columns chosen during feature selection
        X_train_cv_processed = pd.DataFrame(
            data=fold_pipeline.fit_transform(X_train_cv, y_train_cv),
            columns=dp_boost_vars,
        )[selected_features]

        # Same preparation applied (transform only) to the validation rows
        X_validation_cv_processed = pd.DataFrame(
            data=fold_pipeline.transform(X_validation_cv),
            columns=dp_boost_vars,
        )[selected_features]

        # Train, predict and accumulate the fold's cost
        model.fit(X_train_cv_processed, y_train_cv)
        y_pred = model.predict(X_validation_cv_processed)
        total_cost += functions.loss_function(y_validation_cv, y_pred)

    # Average over the actual number of folds rather than a hard-coded 5
    return total_cost / cv.get_n_splits()

# Create the persisted study, or resume it if it already exists in the sqlite
# file: without `load_if_exists` a re-run of this cell fails because the study
# name is already taken. The sampler is seeded like the rest of the notebook.
study = optuna.create_study(
    direction='minimize',
    study_name='XGBClassifier_v01',
    storage='sqlite:///XGBClassifier_v01.db',
    sampler=optuna.samplers.TPESampler(seed=1),
    load_if_exists=True,
)

# Optimise for at most two hours; the callback aborts earlier by raising
# EarlyStoppingExceeded once the study stops improving
try:
    study.optimize(objective, timeout=7200, callbacks=[early_stopping_opt])

except EarlyStoppingExceeded:
    print(f'Sem novas melhorias a partir da iteração {len(study.trials)}')

# Best trial found (lowest objective value)
best = study.best_trial

print(f'Melhor score: {best.value}')

[I 2024-07-12 17:51:27,198] A new study created in RDB with name: XGBClassifier_v01
[I 2024-07-12 17:51:39,335] Trial 0 finished with value: 37412.0 and parameters: {'lambda': 0.0001495878742845022, 'alpha': 0.0027471681135135795, 'max_depth': 9, 'eta': 0.002303821512846776, 'gamma': 2.6851714638660824e-05, 'grow_policy': 'depthwise', 'min_child_weight': 39, 'subsample': 0.6935490448293671, 'colsample_bytree': 0.6753908691774138, 'colsample_bylevel': 0.5534726181406316, 'colsample_bynode': 0.8287326933404688, 'max_bin': 170, 'scale_pos_weight': 2.1147858206033257}. Best is trial 0 with value: 37412.0.
[I 2024-07-12 17:51:49,325] Trial 1 finished with value: 80000.0 and parameters: {'lambda': 0.0016042506856853928, 'alpha': 0.0026554853428155333, 'max_depth': 13, 'eta': 0.03358625939447244, 'gamma': 1.1545485666860259e-08, 'grow_policy': 'lossguide', 'min_child_weight': 96, 'subsample': 0.759191671031869, 'colsample_bytree': 0.6337217144887333, 'colsample_bylevel': 0.6641768689866292, '

Sem novas melhorias a partir da iteração 59
Melhor score: 15570.0


In [15]:
# Persist the best XGBoost trial (the `best` object from the study above)
with open('./artefatos/XGBM/XGBM_best_config.pkl', 'wb') as best_config_file:
    pickle.dump(best, best_config_file)

> GradientBoosting Classifier

In [16]:
import optuna
from sklearn.model_selection import StratifiedKFold

# Patience used by the early-stopping callback of the Optuna study
EARLY_STOPPING = 15


class EarlyStoppingExceeded(optuna.exceptions.OptunaError):
    """Raised by ``early_stopping_opt`` to abort a study that stopped improving.

    The class attributes double as the callback's shared bookkeeping.
    """

    # Best study value seen by the callback (None until its first invocation)
    best_score = None
    # Number of callback invocations since the last improvement
    early_stop_count = 0
    # The callback raises once ``early_stop_count`` exceeds this value
    early_stop = EARLY_STOPPING

def early_stopping_opt(study, trial):
    """Optuna callback that aborts the study after a run of non-improving trials.

    Bookkeeping lives on the ``EarlyStoppingExceeded`` class attributes
    (``best_score``, ``early_stop_count``, ``early_stop``).

    Args:
        study: object exposing ``best_value`` (lower is better).
        trial: the trial that just finished; not used.

    Raises:
        EarlyStoppingExceeded: when the callback is invoked while the
            non-improvement counter already exceeds ``early_stop``. The shared
            state is fully reset before raising.
    """
    state = EarlyStoppingExceeded

    # First invocation: seed the reference score with the study's current best.
    # That same invocation then falls through and counts as "no improvement".
    if state.best_score is None:
        state.best_score = study.best_value

    if study.best_value < state.best_score:
        # New best value: remember it and restart the patience counter
        state.best_score = study.best_value
        state.early_stop_count = 0
        return

    if state.early_stop_count > state.early_stop:
        # Reset the class-level state (not a local variable) so that a later
        # study does not start from this study's best score
        state.early_stop_count = 0
        state.best_score = None
        raise EarlyStoppingExceeded()

    state.early_stop_count += 1
    return

def objective(trial):
    """Optuna objective for scikit-learn's GradientBoostingClassifier.

    Reads the notebook globals ``X_train``, ``y_train``, ``dp_boost_pipeline``,
    ``dp_boost_vars`` and ``selected_features``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The average over the stratified folds of ``functions.loss_function``
        evaluated on each validation fold (the study minimises this value).
    """
    # Hyperparameter search space
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 4e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 100),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 100),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", 0.0, 0.5),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 0.0, 1.0),
        "max_features": trial.suggest_float("max_features", 0.1, 1.0),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 1000),
        "validation_fraction": trial.suggest_float("validation_fraction", 0.1, 0.5),
        "n_iter_no_change": trial.suggest_int("n_iter_no_change", 5, 20),
    }

    # Stratified cross-validation over the training partition
    cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    total_cost = 0

    for fold_train_pos, fold_valid_pos in cv.split(X_train, y_train):
        # Fresh model per fold
        # NOTE(review): random_state=2 here, while the other models in this
        # notebook use 1 - confirm this is intentional.
        model = GradientBoostingClassifier(**params, random_state=2)

        # Predictors and target for this fold
        X_train_cv = X_train.iloc[fold_train_pos]
        X_validation_cv = X_train.iloc[fold_valid_pos]
        y_train_cv = y_train.iloc[fold_train_pos]
        y_validation_cv = y_train.iloc[fold_valid_pos]

        # Fit an unfitted copy of the preprocessing pipeline on this fold.
        # Refitting the global `dp_boost_pipeline` here would leave it fitted on
        # the last CV fold instead of the full training partition.
        fold_pipeline = clone(dp_boost_pipeline)

        # Data preparation on the fold's training rows, then keep only the
        # columns chosen during feature selection
        X_train_cv_processed = pd.DataFrame(
            data=fold_pipeline.fit_transform(X_train_cv, y_train_cv),
            columns=dp_boost_vars,
        )[selected_features]

        # Same preparation applied (transform only) to the validation rows
        X_validation_cv_processed = pd.DataFrame(
            data=fold_pipeline.transform(X_validation_cv),
            columns=dp_boost_vars,
        )[selected_features]

        # Train, predict and accumulate the fold's cost
        model.fit(X_train_cv_processed, y_train_cv)
        y_pred = model.predict(X_validation_cv_processed)
        total_cost += functions.loss_function(y_validation_cv, y_pred)

    # Average over the actual number of folds rather than a hard-coded 5
    return total_cost / cv.get_n_splits()

# Create the persisted study, or resume it if it already exists in the sqlite
# file: without `load_if_exists` a re-run of this cell fails because the study
# name is already taken. The sampler is seeded like the rest of the notebook.
study = optuna.create_study(
    direction='minimize',
    study_name='GradBoostingClassifier_v01',
    storage='sqlite:///GradBoostingClassifier_v01.db',
    sampler=optuna.samplers.TPESampler(seed=1),
    load_if_exists=True,
)

# Optimise for at most two hours; the callback aborts earlier by raising
# EarlyStoppingExceeded once the study stops improving
try:
    study.optimize(objective, timeout=7200, callbacks=[early_stopping_opt])

except EarlyStoppingExceeded:
    print(f'Sem novas melhorias a partir da iteração {len(study.trials)}')

# Best trial found (lowest objective value)
best = study.best_trial

print(f'Melhor score: {best.value}')

[I 2024-07-12 18:02:51,635] A new study created in RDB with name: GradBoostingClassifier_v01
[I 2024-07-12 18:03:55,382] Trial 0 finished with value: 80000.0 and parameters: {'learning_rate': 0.011134399022711591, 'n_estimators': 929, 'min_samples_split': 52, 'min_samples_leaf': 81, 'min_weight_fraction_leaf': 0.16536805573186908, 'max_depth': 4, 'min_impurity_decrease': 0.49603468861177324, 'max_features': 0.738324758973326, 'max_leaf_nodes': 318, 'validation_fraction': 0.2731717084357926, 'n_iter_no_change': 17}. Best is trial 0 with value: 80000.0.
[I 2024-07-12 18:04:16,431] Trial 1 finished with value: 80000.0 and parameters: {'learning_rate': 0.00011742360349010136, 'n_estimators': 91, 'min_samples_split': 38, 'min_samples_leaf': 79, 'min_weight_fraction_leaf': 0.0879395377047586, 'max_depth': 13, 'min_impurity_decrease': 0.8756415120040659, 'max_features': 0.990612707608473, 'max_leaf_nodes': 396, 'validation_fraction': 0.4467801733819773, 'n_iter_no_change': 18}. Best is trial 

Sem novas melhorias a partir da iteração 30
Melhor score: 41436.0


In [17]:
# Persist the best GradientBoosting trial (the `best` object from the study above)
with open('./artefatos/GBM/GBM_best_config.pkl', 'wb') as best_config_file:
    pickle.dump(best, best_config_file)