In [1]:
import pandas as pd
import optuna as opt

import numpy as np

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib 
from sklearn.metrics import log_loss

In [2]:
df = pd.read_parquet('./Data/061200_unidos_categorizado_under.parquet')

In [3]:
X = df.drop(["categoria_time_to_failure"], axis=1)
y = df["categoria_time_to_failure"]
 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

XGBoost - Fine Tunnig

In [4]:

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_loguniform('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.1, 1.0),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    return log_loss(y_test, model.predict_proba(X_test))

# Loop for optimization (note that the range of the loop has been reduced for practicality)
for i in range(10):
    try:
        study = joblib.load('./Studies/study_xg.pkl')   
        study.optimize(objective, n_trials=20)
        joblib.dump(study, './Studies/study_xg.pkl')
        print("ali")
    except Exception as e:
        print(e)
        print("aqui")
        study = opt.create_study(direction='minimize')
        study.optimize(objective, n_trials=20)
        joblib.dump(study, './Studies/study_xg.pkl')


[I 2023-09-22 11:21:09,463] Trial 174 finished with value: 0.5679458072329943 and parameters: {'n_estimators': 409, 'max_depth': 10, 'learning_rate': 0.05757246257278111, 'min_child_weight': 3, 'subsample': 0.9046412042604529, 'colsample_bytree': 0.9912940869996985}. Best is trial 84 with value: 0.5470244689220417.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'subsample': trial.suggest_loguniform('subsample', 0.1, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.1, 1.0),
[I 2023-09-22 11:21:22,336] Trial 175 finished with value: 0.5514794851545988 and parameters: {'n_estimators': 333, 'max_depth': 10, 'learning_rate': 0.043957016772593054, 'min_child_weight': 2, 'subsample': 0.9970652517406463, 'colsample_bytree': 0.8147832625137402}. Best is trial 84 with value: 0.5470244689220417.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'subsample': trial.suggest_loguniform('subsample', 0.1, 1.0),
  'colsampl

ali


[I 2023-09-22 11:22:06,716] Trial 180 finished with value: 0.5710080695650676 and parameters: {'n_estimators': 338, 'max_depth': 9, 'learning_rate': 0.02481561437678076, 'min_child_weight': 2, 'subsample': 0.8736572921802073, 'colsample_bytree': 0.7401761196149206}. Best is trial 84 with value: 0.5470244689220417.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'subsample': trial.suggest_loguniform('subsample', 0.1, 1.0),
  'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.1, 1.0),
[I 2023-09-22 11:22:13,432] Trial 181 finished with value: 0.5537324766114536 and parameters: {'n_estimators': 170, 'max_depth': 10, 'learning_rate': 0.08179851967934908, 'min_child_weight': 2, 'subsample': 0.9469457945378271, 'colsample_bytree': 0.8243077769865608}. Best is trial 84 with value: 0.5470244689220417.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'subsample': trial.suggest_loguniform('subsample', 0.1, 1.0),
  'colsample_

ali


In [5]:
print(study.best_params)
print(study.best_value)

{'n_estimators': 371, 'max_depth': 10, 'learning_rate': 0.034228357352041355, 'min_child_weight': 1, 'subsample': 0.9395768035081312, 'colsample_bytree': 0.960856220045787}
0.5470244689220417


RandomForestClassifier - Fine tunning

In [None]:

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 15),
        'max_features': trial.suggest_categorical('max_features', [1, 'sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42
    }
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    return log_loss(y_test, model.predict_proba(X_test))

# Loop for optimization
for i in range(10):
    try:
        study = joblib.load('./Studies/study_rf.pkl')  # Carregando o estudo existente (se houver)
        study.optimize(objective, n_trials=10)  # Executando 10 experimentos
        joblib.dump(study, './Studies/study_rf.pkl')  # Salvando o estudo atualizado
        print("ali")
    except Exception as e:
        print(e)
        print("aqui")
        study = opt.create_study(direction='minimize')  # Criando um novo estudo
        study.optimize(objective, n_trials=10)  # Executando 10 experimentos
        joblib.dump(study, './Studies/study_rf.pkl')  # Salvando o novo estudo

In [None]:
print(study.best_params)
print(study.best_value)