In [1]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pandas as pd
import os

# Carregar base de dados
diretorio_atual = os.getcwd()

caminho_dataset = os.path.join(diretorio_atual, 'datasets', 'parkinsons.data')

df = pd.read_csv(caminho_dataset)

X = df.drop(['status', 'name', 'APQ', 'D2',
            'Fhi(Hz)', 'Flo(Hz)', 'Fo(Hz)',
             'PPQ', 'RAP', 'spread1', 'spread2'], axis=1)

y = df['status']

X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=0.2)



In [3]:
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def treinar_modelo(params):
    learning_rate = params[0]
    min_child_weight = params[1]
    max_depth = params[2]
    colsample_bytree = params[3]
    gamma = params[4]
    scale_pos_weight = params[5]
    
    print(params, '\n')

    pipe = Pipeline(steps=[('StandardScaler', StandardScaler()),
                       ('MinMaxScaler', MinMaxScaler()),
                       ('XGBClassifier', XGBClassifier(learning_rate=learning_rate, min_child_weight=min_child_weight,
                            max_depth=max_depth, colsample_bytree=colsample_bytree,
                            gamma=gamma, scale_pos_weight=scale_pos_weight, n_estimators=50))]) # número de arvores é definido como fixo, no caso igual a 50
    
    pipe.fit(X_treino, y_treino)

    proba = pipe.predict_proba(X_teste)[:, 1]

    return -1 * roc_auc_score(y_teste, proba) # multiplicado por -1 porque é preciso minimizar a negativa do auc não o próprio auc


space = [(1e-3, 1, 'log-uniform'), # learning_rate, log-uniform dá mais importância para números menores 
         (1, 10), # min_child_weight
         (3, 10), # max_depth
         (0.5, 1.0), # colsample_bytree
         (0, 5), # gamma
         (1, 10)] # scale_pos_weight


Otimização Bayesiana

In [5]:
from skopt import gp_minimize

resultados_gp = gp_minimize(treinar_modelo, space, verbose=1, n_calls=50, n_random_starts=10)

Iteration No: 1 started. Evaluating function at random point.
[0.002349038947364565, 8, 4, 0.5336991521433834, 1, 3] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5081
Function value obtained: -0.9290
Current minimum: -0.9290
Iteration No: 2 started. Evaluating function at random point.
[0.0046752356784930645, 9, 9, 0.9507796820946783, 3, 4] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.0703
Function value obtained: -0.9321
Current minimum: -0.9321
Iteration No: 3 started. Evaluating function at random point.
[0.005642644566725601, 5, 7, 0.8301089704616824, 3, 5] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.0222
Function value obtained: -0.9383
Current minimum: -0.9383
Iteration No: 4 started. Evaluating function at random point.
[0.004352961937871119, 7, 5, 0.7709910626527308, 3, 10] 

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.0252
Function value obtained: -0.9398
Current minimum

Exploration Exploitation Tradeoff

In [6]:
resultados_gp.x


[0.18740935969636607, 5, 3, 1.0, 0, 7]