In [None]:
from IPython.display import display
import random
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scikitplot as skplt


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn
from category_encoders.target_encoder import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder


import lightgbm as lgb
import xgboost as xgb


import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
def print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name):
    '''
    Print the best value found in a hyperparameter sweep and plot train vs. test results.

    Parameters
    ----------
    df : pandas.DataFrame
        Read by position: column 0 holds the tested parameter values,
        column 1 the results on the train dataset and column 2 the results
        on the test dataset.
    maximum_minimum : str
        'maximum' when the best test result is the largest one,
        'minimum' when it is the smallest one.
    tested_parameter : str
        Name of the swept hyperparameter (used in the printed text and the plot).
    metric_name : str
        Name of the metric (used in the printed text and the plot).

    Raises
    ------
    ValueError
        If maximum_minimum is neither 'maximum' nor 'minimum'.
    '''
    parameters_tested = df.iloc[:, 0]
    train_results = df.iloc[:, 1]
    test_results = df.iloc[:, 2]

    # Work on a positional copy so the lookup does not depend on df's index labels.
    positional_test_results = test_results.reset_index(drop=True)

    # Validate the direction before printing or plotting anything, so a typo
    # fails with a clear message instead of an unbound-variable error.
    if maximum_minimum == 'maximum':
        best_position = positional_test_results.idxmax()
    elif maximum_minimum == 'minimum':
        best_position = positional_test_results.idxmin()
    else:
        raise ValueError("maximum_minimum must be 'maximum' or 'minimum', got %r" % (maximum_minimum,))

    # idxmax/idxmin return the first occurrence, so ties resolve to the first tested value.
    best_parameter_result = positional_test_results.iloc[best_position]
    best_parameter = parameters_tested.iloc[best_position]

    print('Used metric:', metric_name)
    print('Best', tested_parameter, 'parameter:', best_parameter)
    print('Best', tested_parameter, 'parameter result:', best_parameter_result, '\n')

    # Plot train and test results against the tested values.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(parameters_tested, train_results, color='red', linewidth=0.7, linestyle='solid', label='Train')
    ax.scatter(parameters_tested, train_results, s=8, color='black')  # mark each tested point
    ax.plot(parameters_tested, test_results, color='blue', linewidth=0.7, linestyle='solid', label='Test')
    ax.scatter(parameters_tested, test_results, s=8, color='black')  # mark each tested point

    # Label the figure so it can be read on its own.
    ax.set_title(f'{metric_name} by {tested_parameter}')
    ax.set_xlabel(tested_parameter)
    ax.set_ylabel(metric_name)

    # The legend sits below the axes, lowered slightly to clear the x-axis label.
    ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.22))
    plt.show()
    plt.close(fig)

In [None]:
# target encoder
from category_encoders.target_encoder import TargetEncoder

X_train_target_encoder = X_train.copy()
X_test_target_encoder = X_test.copy()

# Target encoder for categorical variables.
# One encoder per categorical column is fitted on the training split only and
# then applied to both splits. The fitted encoders are kept in
# target_encoder_dict, keyed by column name.
target_encoder_dict = {}
for col in categorical_variables:
    target_encoder_dict[col] = TargetEncoder().fit(X_train_target_encoder[col], y_train)
for key in target_encoder_dict:
    X_train_target_encoder[key] = target_encoder_dict[key].transform(X_train_target_encoder[key])
    X_test_target_encoder[key] = target_encoder_dict[key].transform(X_test_target_encoder[key])

In [None]:
# one-hot encoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# sparse_threshold=0 makes the ColumnTransformer always return a dense array.
# This replaces OneHotEncoder(sparse=False), whose keyword was renamed to
# sparse_output in scikit-learn 1.2 and later removed.
ct = ColumnTransformer([("onehot", OneHotEncoder(drop="first"), categorical_variables)], remainder='drop', sparse_threshold=0)

# Fit on the training split only; the same fitted transformer encodes every split.
one_hot_fit = ct.fit(X_train[categorical_variables])
one_hot_columns = ct.get_feature_names_out()
X_train_cat_one_hot  = one_hot_fit.transform(X_train[categorical_variables])
X_test_cat_one_hot  = one_hot_fit.transform(X_test[categorical_variables])
X_out_of_time_cat_one_hot = one_hot_fit.transform(X_out_of_time[categorical_variables])

# The encoded arrays carry no index. Re-attach each split's own index before
# concatenating, otherwise pd.concat(axis=1) aligns on a fresh RangeIndex and
# pairs scalar and one-hot values from different rows (or fills with NaN).
X_train_one_hot = pd.concat([X_train[scalar_variables], pd.DataFrame(X_train_cat_one_hot, columns=one_hot_columns, index=X_train.index)], axis=1)
X_test_one_hot = pd.concat([X_test[scalar_variables], pd.DataFrame(X_test_cat_one_hot, columns=one_hot_columns, index=X_test.index)], axis=1)
X_out_of_time_one_hot = pd.concat([X_out_of_time[scalar_variables], pd.DataFrame(X_out_of_time_cat_one_hot, columns=one_hot_columns, index=X_out_of_time.index)], axis=1)

In [None]:
# Logistic regression with one-hot encoder
from sklearn.linear_model import LogisticRegression
def hyperparameter_logistic(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one LogisticRegression hyperparameter and compare train/test ROC AUC.

    For every value in test_hyperparameters a model is built from
    hyperparameters_dict with hyperparameter_test_name set to that value,
    fitted on the train data and scored on both splits using the predicted
    probability of the positive class.

    The choice of the algorithm depends on the penalty chosen. Supported penalties by solver:
        'newton-cg' - ['l2', 'none']
        'lbfgs' - ['l2', 'none']
        'liblinear' - ['l1', 'l2']
        'sag' - ['l2', 'none']
        'saga' - ['elasticnet', 'l1', 'l2', 'none']

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.
    hyperparameters_dict : dict
        Base keyword arguments for LogisticRegression. It is not modified.
    hyperparameter_test_name : str
        Name of the hyperparameter being swept.
    test_hyperparameters : iterable
        Values to try for that hyperparameter.

    Returns
    -------
    dict
        {hyperparameter_test_name: value with the highest test ROC AUC
         (first one on ties), 'resultados': DataFrame with columns
         'parameter', 'roc_auc_score_train', 'roc_auc_score_test'}.

    Raises
    ------
    ValueError
        If test_hyperparameters is empty.
    '''
    parameters_tested = []
    metrica_comparacao_train = []
    metrica_comparacao_test = []
    for parameter in test_hyperparameters:
        # Build the trial configuration on a copy so the caller's dict is left untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}
        lr_fit = LogisticRegression(**trial_hyperparameters).fit(X_train, y_train)

        # ROC AUC ranks continuous scores; hard labels from predict() would
        # collapse the curve to a single threshold.
        y_pred_train = lr_fit.predict_proba(X_train)[:, 1]
        y_pred_test = lr_fit.predict_proba(X_test)[:, 1]

        # Store the comparison metrics.
        parameters_tested.append(parameter)
        metrica_comparacao_train.append(roc_auc_score(y_train, y_pred_train))
        metrica_comparacao_test.append(roc_auc_score(y_test, y_pred_test))

    if not parameters_tested:
        raise ValueError('test_hyperparameters must contain at least one value')

    # Table comparing train and test for every tested value.
    metrica_comparacao_df = pd.DataFrame(
        list(zip(parameters_tested, metrica_comparacao_train, metrica_comparacao_test)),
        columns=['parameter', 'roc_auc_score_train', 'roc_auc_score_test'],
    )

    # idxmax returns the first row holding the maximum test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }



# Base configuration for the liblinear solver; 'penalty' and 'C' are varied below.
hyperparameters_dict = dict(
    random_state=1,
    max_iter=100,
    solver='liblinear',
    penalty='l1',
    C=1,
    l1_ratio=None,
)

print('Test solver liblinear')

# Settings shared by both sweeps.
hyperparameter_test_name = 'C'
tested_parameter = hyperparameter_test_name
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

# Sweep C once with the L1 penalty and once with the L2 penalty.
for penalty in ('l1', 'l2'):
    hyperparameters_dict['penalty'] = penalty
    test_hyperparameters = np.arange(start = 0.01, stop = 0.5, step = 0.05)
    hyperparameter_result = hyperparameter_logistic(
        X_train_one_hot, y_train[target_name],
        X_test_one_hot, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters,
    )
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)


In [None]:
# Nystroem + SGD
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier
# probability estimates are not available for loss='hinge'

kwargs_Nystroem = {'kernel': 'rbf', 'random_state': 1}
nystroem = Nystroem(**kwargs_Nystroem)
# Fit the kernel approximation on the training split only and reuse the fitted
# map for the test split. Calling fit_transform on the test data would refit
# the map, so train and test features would no longer share the same basis.
X_train_target_encoder_nystroem = nystroem.fit_transform(X_train_target_encoder)
X_test_target_encoder_nystroem = nystroem.transform(X_test_target_encoder)

def hyperparameter_SGDClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one SGDClassifier hyperparameter and compare train/test ROC AUC.

    For every value in test_hyperparameters a model is built from
    hyperparameters_dict with hyperparameter_test_name set to that value,
    fitted on the train data and scored on both splits using
    predict_proba(...)[:, 1].

    Hyperparameters (author's notes):
        random_state
        max_iter, default=1000
        loss: 'log_loss' or 'modified_huber'
        penalty: 'l2', 'l1' or 'elasticnet', default='l2'
        l1_ratio, default=0.15. Use only if penalty = elasticnet
        alpha, default=0.0001. Values must be in the range [0.0, inf)
        eta0, default=0.0

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.
    hyperparameters_dict : dict
        Base keyword arguments for SGDClassifier. It is not modified.
    hyperparameter_test_name : str
        Name of the hyperparameter being swept.
    test_hyperparameters : iterable
        Values to try for that hyperparameter.

    Returns
    -------
    dict
        {hyperparameter_test_name: value with the highest test ROC AUC
         (first one on ties), 'resultados': DataFrame with columns
         'parameter', 'roc_auc_score_train', 'roc_auc_score_test'}.

    Raises
    ------
    ValueError
        If test_hyperparameters is empty.
    '''
    parameters_tested = []
    metrica_comparacao_train = []
    metrica_comparacao_test = []
    for parameter in test_hyperparameters:
        # Build the trial configuration on a copy so the caller's dict is left untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}
        sgd_fit = SGDClassifier(**trial_hyperparameters).fit(X_train, y_train)

        # Score with the predicted probability of the positive class.
        y_pred_train = sgd_fit.predict_proba(X_train)[:, 1]
        y_pred_test = sgd_fit.predict_proba(X_test)[:, 1]

        # Store the comparison metrics.
        parameters_tested.append(parameter)
        metrica_comparacao_train.append(roc_auc_score(y_train, y_pred_train))
        metrica_comparacao_test.append(roc_auc_score(y_test, y_pred_test))

    if not parameters_tested:
        raise ValueError('test_hyperparameters must contain at least one value')

    # Table comparing train and test for every tested value.
    metrica_comparacao_df = pd.DataFrame(
        list(zip(parameters_tested, metrica_comparacao_train, metrica_comparacao_test)),
        columns=['parameter', 'roc_auc_score_train', 'roc_auc_score_test'],
    )

    # idxmax returns the first row holding the maximum test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }



# Starting configuration; each sweep below overwrites one entry with its best value.
hyperparameters_dict = dict(
    random_state=1,
    max_iter=50000,
    loss='log_loss',
    learning_rate='optimal',
    eta0=0.0,
    penalty='l2',
    alpha=175,
    l1_ratio=0.15,
)

# Settings shared by every sweep.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

# One entry per sweep: (printed message, hyperparameter name, values to test,
# learning_rate to use only while that sweep runs, or None to leave it as is).
sgd_sweeps = [
    ('Test loss: log_loss or modified_huber.', 'loss', ['log_loss', 'modified_huber'], None),
    ('Test penalty: l2, l1, elasticnet.', 'penalty', ['l2', 'l1', 'elasticnet'], None),
    ('Test eta0: 0 to 0.7.', 'eta0', np.arange(start = 0.001, stop = 0.7, step = 0.01), 'constant'),
    ('Test alpha', 'alpha',
     [0.0001, 0.001, 0.01, 0.1, 1, 10, 50, 100, 125, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 250, 300],
     None),
]

for sweep_message, hyperparameter_test_name, test_hyperparameters, sweep_learning_rate in sgd_sweeps:
    print(sweep_message)
    if sweep_learning_rate is not None:
        hyperparameters_dict['learning_rate'] = sweep_learning_rate
    hyperparameter_result = hyperparameter_SGDClassifier(
        X_train_target_encoder_nystroem, y_train[target_name],
        X_test_target_encoder_nystroem, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters,
    )
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Save the best result; a temporary learning_rate is put back to 'optimal' first.
    if sweep_learning_rate is not None:
        hyperparameters_dict['learning_rate'] = 'optimal'
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]


In [None]:
# K Neighbors Classifier.
from sklearn.neighbors import KNeighborsClassifier  

def hyperparameter_KNeighborsClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one KNeighborsClassifier hyperparameter and compare train/test ROC AUC.

    For every value in test_hyperparameters a model is built from
    hyperparameters_dict with hyperparameter_test_name set to that value,
    fitted on the train data and scored on both splits using
    predict_proba(...)[:, 1].

    Hyperparameters (author's notes):
        n_neighbors: int, default=5
        weights: 'uniform' or 'distance', default='uniform'
        algorithm: 'auto', 'ball_tree', 'kd_tree' or 'brute', default='auto'
        p: 1 for l1, 2 for l2
        metric: 'euclidean', 'manhattan', 'chebyshev' or 'minkowski', default='minkowski'
            (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.DistanceMetric.html#sklearn.metrics.DistanceMetric)

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.
    hyperparameters_dict : dict
        Base keyword arguments for KNeighborsClassifier. It is not modified.
    hyperparameter_test_name : str
        Name of the hyperparameter being swept.
    test_hyperparameters : iterable
        Values to try for that hyperparameter.

    Returns
    -------
    dict
        {hyperparameter_test_name: value with the highest test ROC AUC
         (first one on ties), 'resultados': DataFrame with columns
         'parameter', 'roc_auc_score_train', 'roc_auc_score_test'}.

    Raises
    ------
    ValueError
        If test_hyperparameters is empty.
    '''
    parameters_tested = []
    metrica_comparacao_train = []
    metrica_comparacao_test = []
    for parameter in test_hyperparameters:
        # Build the trial configuration on a copy so the caller's dict is left untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}
        knn_fit = KNeighborsClassifier(**trial_hyperparameters).fit(X_train, y_train)

        # Score with the predicted probability of the positive class.
        y_pred_train = knn_fit.predict_proba(X_train)[:, 1]
        y_pred_test = knn_fit.predict_proba(X_test)[:, 1]

        # Store the comparison metrics.
        parameters_tested.append(parameter)
        metrica_comparacao_train.append(roc_auc_score(y_train, y_pred_train))
        metrica_comparacao_test.append(roc_auc_score(y_test, y_pred_test))

    if not parameters_tested:
        raise ValueError('test_hyperparameters must contain at least one value')

    # Table comparing train and test for every tested value.
    metrica_comparacao_df = pd.DataFrame(
        list(zip(parameters_tested, metrica_comparacao_train, metrica_comparacao_test)),
        columns=['parameter', 'roc_auc_score_train', 'roc_auc_score_test'],
    )

    # idxmax returns the first row holding the maximum test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }

# Starting configuration; each sweep below overwrites one entry with its best value.
hyperparameters_dict = dict(
    n_neighbors=11,
    weights='distance',
    algorithm='auto',
    p=1,
    metric='manhattan',
)

# Settings shared by every sweep.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

# One entry per sweep: (printed message, hyperparameter name, values to test).
knn_sweeps = [
    ('Test metric:  euclidean, manhattan, chebyshev, minkowski.', 'metric',
     ['euclidean', 'manhattan', 'chebyshev', 'minkowski']),
    ('Test weights: uniform, distance.', 'weights', ['uniform', 'distance']),
    ('Test algorithm: auto, ball_tree, kd_tree, brute.', 'algorithm',
     ['auto', 'ball_tree', 'kd_tree', 'brute']),
    ('Test p: 1 or 2.', 'p', [1, 2]),
    ('Test n_neighbors: 1 to 30.', 'n_neighbors', np.arange(start = 1, stop = 31, step = 1)),
]

for sweep_message, hyperparameter_test_name, test_hyperparameters in knn_sweeps:
    print(sweep_message)
    hyperparameter_result = hyperparameter_KNeighborsClassifier(
        X_train_target_encoder, y_train[target_name],
        X_test_target_encoder, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters,
    )
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Save the best result so the following sweeps build on it.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]


# The best KNN model was:
# (kept as a dict because bare `'key': value` lines are not valid Python
# and stop the whole cell from running)
best_knn_hyperparameters = {
    'n_neighbors': 11,
    'weights': 'distance',
    'algorithm': 'auto',
    'p': 1,
    'metric': 'manhattan',
}

In [None]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

def hyperparameter_DecisionTreeClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one DecisionTreeClassifier hyperparameter and compare train/test ROC AUC.

    For every value in test_hyperparameters a model is built from
    hyperparameters_dict with hyperparameter_test_name set to that value,
    fitted on the train data and scored on both splits using
    predict_proba(...)[:, 1].

    Hyperparameters (author's notes):
        random_state
        criterion: 'gini', 'entropy' or 'log_loss', default='gini'
        min_samples_split, default=2
        min_samples_leaf, default=1
        max_depth, default=None

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.
    hyperparameters_dict : dict
        Base keyword arguments for DecisionTreeClassifier. It is not modified.
    hyperparameter_test_name : str
        Name of the hyperparameter being swept.
    test_hyperparameters : iterable
        Values to try for that hyperparameter.

    Returns
    -------
    dict
        {hyperparameter_test_name: value with the highest test ROC AUC
         (first one on ties), 'resultados': DataFrame with columns
         'parameter', 'roc_auc_score_train', 'roc_auc_score_test'}.

    Raises
    ------
    ValueError
        If test_hyperparameters is empty.
    '''
    parameters_tested = []
    metrica_comparacao_train = []
    metrica_comparacao_test = []
    for parameter in test_hyperparameters:
        # Build the trial configuration on a copy so the caller's dict is left untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}
        dt_fit = DecisionTreeClassifier(**trial_hyperparameters).fit(X_train, y_train)

        # Score with the predicted probability of the positive class.
        y_pred_train = dt_fit.predict_proba(X_train)[:, 1]
        y_pred_test = dt_fit.predict_proba(X_test)[:, 1]

        # Store the comparison metrics.
        parameters_tested.append(parameter)
        metrica_comparacao_train.append(roc_auc_score(y_train, y_pred_train))
        metrica_comparacao_test.append(roc_auc_score(y_test, y_pred_test))

    if not parameters_tested:
        raise ValueError('test_hyperparameters must contain at least one value')

    # Table comparing train and test for every tested value.
    metrica_comparacao_df = pd.DataFrame(
        list(zip(parameters_tested, metrica_comparacao_train, metrica_comparacao_test)),
        columns=['parameter', 'roc_auc_score_train', 'roc_auc_score_test'],
    )

    # idxmax returns the first row holding the maximum test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }

# Starting configuration; each sweep below overwrites one entry with its best value.
hyperparameters_dict = {'random_state': 1
                        , 'criterion': 'entropy'
                        , 'min_samples_split': 3000
                        , 'min_samples_leaf': 3000
                        , 'max_depth': 10}

# Settings shared by every sweep.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

# Grid used for both min_samples_leaf and min_samples_split.
sample_size_grid = [3000, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 15000]

# One entry per sweep: (printed message, hyperparameter name, values to test).
# The max_depth message states the range that is actually tested (10 to 40).
decision_tree_sweeps = [
    ('Test criterion:  gini, entropy, log_loss.', 'criterion', ['gini', 'entropy', 'log_loss']),
    ('Test min_samples_leaf:  3000 to 15000.', 'min_samples_leaf', list(sample_size_grid)),
    ('Test min_samples_split:  3000 to 15000.', 'min_samples_split', list(sample_size_grid)),
    ('Test max_depth:  10 to 40.', 'max_depth', np.arange(start = 10, stop = 41, step = 1)),
]

for sweep_message, hyperparameter_test_name, test_hyperparameters in decision_tree_sweeps:
    print(sweep_message)
    hyperparameter_result = hyperparameter_DecisionTreeClassifier(
        X_train_target_encoder, y_train[target_name],
        X_test_target_encoder, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters,
    )
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Save the best result so the following sweeps build on it.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]



In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

def hyperparameter_RandomForestClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one RandomForestClassifier hyperparameter and compare train/test ROC AUC.

    For every value in test_hyperparameters a model is built from
    hyperparameters_dict with hyperparameter_test_name set to that value,
    fitted on the train data and scored on both splits using
    predict_proba(...)[:, 1].

    Hyperparameters (author's notes):
        random_state
        n_estimators, default=100. The number of trees in the forest.
        criterion: 'gini', 'entropy' or 'log_loss', default='gini'
        min_samples_split, default=2
        min_samples_leaf, default=1
        max_depth, default=None
        bootstrap, default=True.
        oob_score, default=False. Whether to use out-of-bag samples to estimate
            the generalization score. Only available if bootstrap=True.
        max_samples, default=None. If int, then draw max_samples samples.
            If float, then draw max_samples * X.shape[0] samples. Thus,
            max_samples should be in the interval (0.0, 1.0].

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : test features and target.
    hyperparameters_dict : dict
        Base keyword arguments for RandomForestClassifier. It is not modified.
    hyperparameter_test_name : str
        Name of the hyperparameter being swept.
    test_hyperparameters : iterable
        Values to try for that hyperparameter.

    Returns
    -------
    dict
        {hyperparameter_test_name: value with the highest test ROC AUC
         (first one on ties), 'resultados': DataFrame with columns
         'parameter', 'roc_auc_score_train', 'roc_auc_score_test'}.

    Raises
    ------
    ValueError
        If test_hyperparameters is empty.
    '''
    parameters_tested = []
    metrica_comparacao_train = []
    metrica_comparacao_test = []
    for parameter in test_hyperparameters:
        # Build the trial configuration on a copy so the caller's dict is left untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}
        RF_fit = RandomForestClassifier(**trial_hyperparameters).fit(X_train, y_train)

        # Score with the predicted probability of the positive class.
        y_pred_train = RF_fit.predict_proba(X_train)[:, 1]
        y_pred_test = RF_fit.predict_proba(X_test)[:, 1]

        # Store the comparison metrics.
        parameters_tested.append(parameter)
        metrica_comparacao_train.append(roc_auc_score(y_train, y_pred_train))
        metrica_comparacao_test.append(roc_auc_score(y_test, y_pred_test))

    if not parameters_tested:
        raise ValueError('test_hyperparameters must contain at least one value')

    # Table comparing train and test for every tested value.
    metrica_comparacao_df = pd.DataFrame(
        list(zip(parameters_tested, metrica_comparacao_train, metrica_comparacao_test)),
        columns=['parameter', 'roc_auc_score_train', 'roc_auc_score_test'],
    )

    # idxmax returns the first row holding the maximum test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }


# Starting configuration; each sweep below overwrites one entry with its best value.
# max_samples must be the float 1.0 (use all rows): the integer 1 would make
# every tree draw exactly one sample, which makes the first four sweeps meaningless.
hyperparameters_dict = {'random_state': 1
                        , 'criterion': 'entropy'
                        , 'n_estimators': 50
                        , 'min_samples_split': 3000
                        , 'min_samples_leaf': 3000
                        , 'max_depth': 10
                        , 'oob_score': True
                        , 'max_samples': 1.0}

# Settings shared by every sweep.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

# Grid used for both min_samples_leaf and min_samples_split.
sample_size_grid = [3000, 3250, 3500, 3750, 4000, 4500, 5000, 6000, 7000]

# One entry per sweep: (printed message, hyperparameter name, values to test).
# The max_depth message states the range that is actually tested (10 to 19).
random_forest_sweeps = [
    ('Test n_estimators:  10 to 1000.', 'n_estimators', [10, 25, 50, 75, 100, 150, 200, 250, 500, 750, 1000]),
    ('Test criterion:  gini, entropy, log_loss.', 'criterion', ['gini', 'entropy', 'log_loss']),
    ('Test min_samples_leaf:  3000 to 7000.', 'min_samples_leaf', list(sample_size_grid)),
    ('Test min_samples_split:  3000 to 7000.', 'min_samples_split', list(sample_size_grid)),
    ('Test max_samples:  0.1 to 1.', 'max_samples', np.arange(start = 0.1, stop = 1.01, step = 0.1)),
    ('Test max_depth:  10 to 19.', 'max_depth', np.arange(start = 10, stop = 20, step = 1)),
]

for sweep_message, hyperparameter_test_name, test_hyperparameters in random_forest_sweeps:
    print(sweep_message)
    hyperparameter_result = hyperparameter_RandomForestClassifier(
        X_train_target_encoder, y_train[target_name],
        X_test_target_encoder, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters,
    )
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Save the best result so the following sweeps build on it.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]




In [None]:
# Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier

def hyperparameter_ExtraTreesClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one ExtraTreesClassifier hyperparameter and compare train/test ROC AUC.

    For each candidate value a classifier is built from ``hyperparameters_dict``
    with ``hyperparameter_test_name`` replaced by the candidate, fitted on the
    train split and scored with ``roc_auc_score`` on the second column of
    ``predict_proba`` for both splits.

    Hyperparameter reference:
    random_state
    n_estimators, default=100. The number of trees in the forest.
    criterion{'gini', 'entropy', 'log_loss'}, default='gini'
    min_samples_split, default=2
    min_samples_leaf, default=1
    max_depth, default=None
    bootstrap, default=False.
    oob_score, default=False. Whether to use out-of-bag samples to estimate the generalization score. Only available if bootstrap=True.
    max_samples, default=None. If int, then draw max_samples samples. If float, then draw max_samples * X.shape[0] samples. Thus, max_samples should be in the interval (0.0, 1.0].

    Parameters
    ----------
    X_train, y_train : data used to fit every candidate model.
    X_test, y_test : data used to score and rank the candidates.
    hyperparameters_dict : dict of keyword arguments for ExtraTreesClassifier.
        A copy is made for every candidate, so the caller's dict is not modified.
    hyperparameter_test_name : name of the hyperparameter being swept.
    test_hyperparameters : iterable with the candidate values, tested in order.

    Returns
    -------
    dict
        ``result[hyperparameter_test_name]`` is the candidate with the highest
        test ROC AUC (the first one on ties) and ``result['resultados']`` is a
        DataFrame with the columns 'parameter', 'roc_auc_score_train' and
        'roc_auc_score_test', one row per candidate.

    Raises
    ------
    ValueError
        If ``test_hyperparameters`` is empty.

    NOTE(review): candidates are ranked on the split passed as X_test/y_test, so
    scores reported on that same split are optimistic; consider a separate
    validation split.
    '''
    # Materialize the candidates so any iterable is accepted and emptiness can be checked.
    candidates = list(test_hyperparameters)
    if not candidates:
        raise ValueError('test_hyperparameters must contain at least one value to test.')

    rows = []
    for parameter in candidates:
        # Override the tested hyperparameter on a copy, leaving the caller's dict untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}

        # Fit on the train split, then take the second predict_proba column for both splits.
        EF_fit = ExtraTreesClassifier(**trial_hyperparameters).fit(X_train, y_train)
        y_pred_train = EF_fit.predict_proba(X_train)[:, 1]
        y_pred_test = EF_fit.predict_proba(X_test)[:, 1]

        # One row per candidate: (value, train ROC AUC, test ROC AUC).
        rows.append((parameter, roc_auc_score(y_train, y_pred_train), roc_auc_score(y_test, y_pred_test)))

    metrica_comparacao_df = pd.DataFrame(rows, columns = ['parameter', 'roc_auc_score_train', 'roc_auc_score_test'])

    # idxmax returns the first row that holds the highest test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }

# Starting point of the ExtraTreesClassifier search. Each sweep below overwrites
# the entry it tunes with the best value found, so later sweeps build on earlier ones.
hyperparameters_dict = {'random_state': 1
                        , 'criterion': 'gini'
                        , 'n_estimators': 750
                        , 'min_samples_split': 7000
                        , 'min_samples_leaf': 3000
                        , 'max_depth': 10
                        , 'bootstrap': True
                        , 'oob_score': True
                        , 'max_samples': 0.9}

# Sweeps run in order. Each entry is
# (hyperparameter name, candidate values, description of the candidates that is printed).
extra_trees_sweeps = [
    ('n_estimators', [100, 200, 500, 600, 650, 700, 750, 800, 850, 900, 1000], '100 to 1000'),
    ('criterion', ['gini', 'entropy', 'log_loss'], 'gini, entropy, log_loss'),
    ('min_samples_leaf', [3000, 3250, 3500, 3750, 4000, 4500, 5000, 6000, 7000], '3000 to 7000'),
    ('min_samples_split', [3000, 3500, 4000, 4500, 5000, 6000, 6500, 7000, 7500, 8000, 9000, 10000, 15000], '3000 to 15000'),
    ('max_samples', np.arange(start = 0.1, stop = 1.01, step = 0.1), '0.1 to 1'),
    ('max_depth', np.arange(start = 10, stop = 20, step = 1), '10 to 19'),
]

# Every sweep is judged by the highest ROC AUC.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

for hyperparameter_test_name, test_hyperparameters, tested_range in extra_trees_sweeps:
    print('Test ' + hyperparameter_test_name + ':  ' + tested_range + '.')
    hyperparameter_result = hyperparameter_ExtraTreesClassifier(
        X_train_target_encoder, y_train[target_name],
        X_test_target_encoder, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Keep the best value so the following sweep builds on it.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]


# Final model:
# 'random_state': 1
# 'n_estimators': 750
# 'criterion': 'gini'
# 'min_samples_split': 7000
# 'min_samples_leaf': 3000
# 'max_depth': 10
# 'oob_score': True
# 'max_samples': 0.9

In [None]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
def hyperparameter_GradientBoostingClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one GradientBoostingClassifier hyperparameter and compare train/test ROC AUC.

    For each candidate value a classifier is built from ``hyperparameters_dict``
    with ``hyperparameter_test_name`` replaced by the candidate, fitted on the
    train split and scored with ``roc_auc_score`` on the second column of
    ``predict_proba`` for both splits.

    Hyperparameter reference:
    random_state
    n_estimators, default=100. The number of boosting stages.
    loss{'log_loss', 'exponential'}, default='log_loss'
    learning_rate, default=0.1. It ranges from 0 to inf
    subsample, default=1.0. It ranges from 0 to 1
    criterion{'friedman_mse', 'squared_error'}, default='friedman_mse'
    min_samples_split, default=2
    min_samples_leaf, default=1
    max_depth, default=3

    Parameters
    ----------
    X_train, y_train : data used to fit every candidate model.
    X_test, y_test : data used to score and rank the candidates.
    hyperparameters_dict : dict of keyword arguments for GradientBoostingClassifier.
        A copy is made for every candidate, so the caller's dict is not modified.
    hyperparameter_test_name : name of the hyperparameter being swept.
    test_hyperparameters : iterable with the candidate values, tested in order.

    Returns
    -------
    dict
        ``result[hyperparameter_test_name]`` is the candidate with the highest
        test ROC AUC (the first one on ties) and ``result['resultados']`` is a
        DataFrame with the columns 'parameter', 'roc_auc_score_train' and
        'roc_auc_score_test', one row per candidate.

    Raises
    ------
    ValueError
        If ``test_hyperparameters`` is empty.

    NOTE(review): candidates are ranked on the split passed as X_test/y_test, so
    scores reported on that same split are optimistic; consider a separate
    validation split.
    '''
    # Materialize the candidates so any iterable is accepted and emptiness can be checked.
    candidates = list(test_hyperparameters)
    if not candidates:
        raise ValueError('test_hyperparameters must contain at least one value to test.')

    rows = []
    for parameter in candidates:
        # Override the tested hyperparameter on a copy, leaving the caller's dict untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}

        # Fit on the train split, then take the second predict_proba column for both splits.
        GBC_fit = GradientBoostingClassifier(**trial_hyperparameters).fit(X_train, y_train)
        y_pred_train = GBC_fit.predict_proba(X_train)[:, 1]
        y_pred_test = GBC_fit.predict_proba(X_test)[:, 1]

        # One row per candidate: (value, train ROC AUC, test ROC AUC).
        rows.append((parameter, roc_auc_score(y_train, y_pred_train), roc_auc_score(y_test, y_pred_test)))

    metrica_comparacao_df = pd.DataFrame(rows, columns = ['parameter', 'roc_auc_score_train', 'roc_auc_score_test'])

    # idxmax returns the first row that holds the highest test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }

# Starting point of the GradientBoostingClassifier search. Each sweep below overwrites
# the entry it tunes with the best value found, so later sweeps build on earlier ones.
hyperparameters_dict = {'random_state': 1
                        , 'loss': 'exponential'
                        , 'criterion': 'friedman_mse'
                        , 'n_estimators': 100
                        , 'min_samples_split': 3000
                        , 'min_samples_leaf': 3000
                        , 'max_depth': 15
                        , 'subsample': 1
                        , 'learning_rate': 0.5}

# Sweeps run in order. Each entry is
# (hyperparameter name, candidate values, description of the candidates that is printed).
gradient_boosting_sweeps = [
    ('loss', ['log_loss', 'exponential'], 'log_loss, exponential'),
    ('criterion', ['friedman_mse', 'squared_error'], 'friedman_mse, squared_error'),
    ('subsample', np.arange(start = 0.1, stop = 1.01, step = 0.1), '0.1 to 1'),
    ('min_samples_leaf', [3000, 3500, 4000, 5000, 6000, 7000], '3000 to 7000'),
    ('min_samples_split', [3000, 3500, 4000, 5000, 6000, 7000], '3000 to 7000'),
    ('max_depth', np.arange(start = 10, stop = 20, step = 1), '10 to 19'),
    ('learning_rate', np.arange(start = 0.1, stop = 1.1, step = 0.1), '0.1 to 1'),
]

# Every sweep is judged by the highest ROC AUC.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

for hyperparameter_test_name, test_hyperparameters, tested_range in gradient_boosting_sweeps:
    print('Test ' + hyperparameter_test_name + ':  ' + tested_range + '.')
    hyperparameter_result = hyperparameter_GradientBoostingClassifier(
        X_train_target_encoder, y_train[target_name],
        X_test_target_encoder, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Keep the best value so the following sweep builds on it.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]




Agora, vamos tentar otimizar os modelos anteriores usando o AdaBoostClassifier. \
Observação: o algoritmo AdaBoostClassifier somente funciona com modelos que possuem os atributos `classes_` e `n_classes_`.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

def hyperparameter_AdaBoostClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one AdaBoostClassifier hyperparameter and compare train/test ROC AUC.

    For each candidate value a classifier is built from ``hyperparameters_dict``
    with ``hyperparameter_test_name`` replaced by the candidate, fitted on the
    train split and scored with ``roc_auc_score`` on the second column of
    ``predict_proba`` for both splits.

    Hyperparameter reference:
    random_state
    base_estimator, default = DecisionTreeClassifier initialized with max_depth=1
    n_estimators, default = 50. It ranges from 1 to inf.
    learning_rate, default = 1.0. It ranges from 0 to inf.

    Parameters
    ----------
    X_train, y_train : data used to fit every candidate model.
    X_test, y_test : data used to score and rank the candidates.
    hyperparameters_dict : dict of keyword arguments for AdaBoostClassifier.
        A shallow copy is made for every candidate, so the caller's dict is not
        modified (an estimator object stored in it is still shared).
    hyperparameter_test_name : name of the hyperparameter being swept.
    test_hyperparameters : iterable with the candidate values, tested in order.

    Returns
    -------
    dict
        ``result[hyperparameter_test_name]`` is the candidate with the highest
        test ROC AUC (the first one on ties) and ``result['resultados']`` is a
        DataFrame with the columns 'parameter', 'roc_auc_score_train' and
        'roc_auc_score_test', one row per candidate.

    Raises
    ------
    ValueError
        If ``test_hyperparameters`` is empty.

    NOTE(review): candidates are ranked on the split passed as X_test/y_test, so
    scores reported on that same split are optimistic; consider a separate
    validation split.
    '''
    # Materialize the candidates so any iterable is accepted and emptiness can be checked.
    candidates = list(test_hyperparameters)
    if not candidates:
        raise ValueError('test_hyperparameters must contain at least one value to test.')

    rows = []
    for parameter in candidates:
        # Override the tested hyperparameter on a copy, leaving the caller's dict untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}

        # Fit on the train split, then take the second predict_proba column for both splits.
        ABC_fit = AdaBoostClassifier(**trial_hyperparameters).fit(X_train, y_train)
        y_pred_train = ABC_fit.predict_proba(X_train)[:, 1]
        y_pred_test = ABC_fit.predict_proba(X_test)[:, 1]

        # One row per candidate: (value, train ROC AUC, test ROC AUC).
        rows.append((parameter, roc_auc_score(y_train, y_pred_train), roc_auc_score(y_test, y_pred_test)))

    metrica_comparacao_df = pd.DataFrame(rows, columns = ['parameter', 'roc_auc_score_train', 'roc_auc_score_test'])

    # idxmax returns the first row that holds the highest test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }

# Hyperparameters of the decision tree that AdaBoost boosts as its base estimator.
DecisionTreeClassifier_hyperparameters_dict = dict(
    random_state=1,
    criterion='entropy',
    min_samples_split=3000,
    min_samples_leaf=3000,
    max_depth=10,
)

# RandomForestClassifier does not work well with AdaBoost.
# NOTE(review): 'max_samples': 1 below is an int; presumably that asks for a single sample
# per tree rather than the whole training set (1.0), which may explain the 0.5 ROC AUC
# recorded further down — confirm.
# RandomForestClassifier_hyperparameters_dict = {'random_state': 1
#                         , 'criterion': 'entropy'
#                         , 'n_estimators': 50
#                         , 'min_samples_split': 3000
#                         , 'min_samples_leaf': 3000
#                         , 'max_depth': 10
#                         , 'oob_score': True
#                         , 'max_samples': 1}

# Testing AdaBoost with ExtraTreesClassifier takes too long.
# ExtraTreesClassifier_hyperparameters_dict = {'random_state': 1
#                         , 'criterion': 'gini'
#                         , 'n_estimators': 750
#                         , 'min_samples_split': 7000
#                         , 'min_samples_leaf': 3000
#                         , 'max_depth': 10
#                         , 'bootstrap': True
#                         , 'oob_score': True
#                         , 'max_samples': 0.9}

# AdaBoost configuration used as the starting point of the sweep below.
# NOTE(review): newer scikit-learn releases name this argument `estimator` instead of
# `base_estimator` — confirm against the installed version.
hyperparameters_dict = dict(
    random_state=1,
    base_estimator=DecisionTreeClassifier(**DecisionTreeClassifier_hyperparameters_dict),
    n_estimators=50,       # Raising this hyperparameter gains little and increases overfitting a lot.
    learning_rate=0.3,
)

# # Test n_estimators for DecisionTreeClassifier
# print('Test n_estimators for DecisionTreeClassifier:  10 to 200.')
# hyperparameter_test_name = 'n_estimators'
# test_hyperparameters = [10, 50, 100, 200]
# hyperparameter_result = hyperparameter_AdaBoostClassifier(X_train_target_encoder, y_train[target_name], X_test_target_encoder, y_test[target_name], hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
# # Plot results
# df = hyperparameter_result['resultados'].copy()
# tested_parameter = hyperparameter_test_name
# maximum_minimum = 'maximum'
# metric_name = 'ROC AUC'
# print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# # Save the best result.
# hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]

# Test learning_rate for DecisionTreeClassifier.
# Recorded ROC without AdaBoost: 0.8374951583700451, with AdaBoost: 0.8666174044250164
print('Test learning_rate for DecisionTreeClassifier:  0.1 to 1.')
hyperparameter_test_name = 'learning_rate'
test_hyperparameters = [0.1, 0.2, 0.3, 0.5, 0.8, 1]
hyperparameter_result = hyperparameter_AdaBoostClassifier(
    X_train_target_encoder,
    y_train[target_name],
    X_test_target_encoder,
    y_test[target_name],
    hyperparameters_dict,
    hyperparameter_test_name,
    test_hyperparameters,
)
# Plot the train/test curves and report the best candidate.
df = hyperparameter_result['resultados'].copy()
tested_parameter = hyperparameter_test_name
maximum_minimum, metric_name = 'maximum', 'ROC AUC'
print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# Store the best learning_rate back into the configuration.
hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]



# # Test n_estimators for RandomForestClassifier
# hyperparameters_dict['base_estimator'] = RandomForestClassifier(**RandomForestClassifier_hyperparameters_dict)
# print('Test n_estimators for RandomForestClassifier:  10 to 200.')
# hyperparameter_test_name = 'n_estimators'
# test_hyperparameters = [10, 50, 100, 200]
# hyperparameter_result = hyperparameter_AdaBoostClassifier(X_train_target_encoder, y_train[target_name], X_test_target_encoder, y_test[target_name], hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
# # Plot results
# df = hyperparameter_result['resultados'].copy()
# tested_parameter = hyperparameter_test_name
# maximum_minimum = 'maximum'
# metric_name = 'ROC AUC'
# print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# # Save the best result.
# hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]

# # Test learning_rate for RandomForestClassifier. ROC without AdaBoost: 0.8412149997416809, with AdaBoost: 0.5
# print('Test learning_rate for RandomForestClassifier:  0.1 to 1.')
# hyperparameters_dict['base_estimator'] = RandomForestClassifier(**RandomForestClassifier_hyperparameters_dict)
# hyperparameter_test_name = 'learning_rate'
# test_hyperparameters = [0.1, 0.2, 0.3, 0.5, 0.8, 1]
# hyperparameter_result = hyperparameter_AdaBoostClassifier(X_train_target_encoder, y_train[target_name], X_test_target_encoder, y_test[target_name], hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
# # Plot results
# df = hyperparameter_result['resultados'].copy()
# tested_parameter = hyperparameter_test_name
# maximum_minimum = 'maximum'
# metric_name = 'ROC AUC'
# print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# # Save the best result.
# hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]



# Testing AdaBoost with ExtraTreesClassifier takes too long.
# # Test n_estimators for ExtraTreesClassifier
# hyperparameters_dict['base_estimator'] = ExtraTreesClassifier(**ExtraTreesClassifier_hyperparameters_dict)
# print('Test n_estimators for ExtraTreesClassifier:  10 to 200.')
# hyperparameter_test_name = 'n_estimators'
# test_hyperparameters = [10, 50, 100, 200]
# hyperparameter_result = hyperparameter_AdaBoostClassifier(X_train_target_encoder, y_train[target_name], X_test_target_encoder, y_test[target_name], hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
# # Plot results
# df = hyperparameter_result['resultados'].copy()
# tested_parameter = hyperparameter_test_name
# maximum_minimum = 'maximum'
# metric_name = 'ROC AUC'
# print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# # Save the best result.
# hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]

# # Test learning_rate for ExtraTreesClassifier. ROC without AdaBoost: 0.8195196098946576, with AdaBoost: takes too long!!
# hyperparameters_dict['base_estimator'] = ExtraTreesClassifier(**ExtraTreesClassifier_hyperparameters_dict)
# print('Test learning_rate for ExtraTreesClassifier:  0.1 to 1.')
# hyperparameter_test_name = 'learning_rate'
# test_hyperparameters = [0.1, 0.2, 0.3, 0.5, 0.8, 1]
# hyperparameter_result = hyperparameter_AdaBoostClassifier(X_train_target_encoder, y_train[target_name], X_test_target_encoder, y_test[target_name], hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
# # Plot results
# df = hyperparameter_result['resultados'].copy()
# tested_parameter = hyperparameter_test_name
# maximum_minimum = 'maximum'
# metric_name = 'ROC AUC'
# print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# # Save the best result.
# hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]





In [None]:
# # GradientBoostingClassifier com AdaBoost - Demora mais de 2:30 horas para correr.
# GradientBoostingClassifier_hyperparameters_dict = {'random_state': 1
#                         , 'loss': 'exponential'
#                         , 'criterion': 'friedman_mse'
#                         , 'n_estimators': 100
#                         , 'min_samples_split': 3000
#                         , 'min_samples_leaf': 3000
#                         , 'max_depth': 15
#                         , 'subsample': 1
#                         , 'learning_rate': 0.5}


# hyperparameters_dict = {'random_state': 1
#                         , 'base_estimator': GradientBoostingClassifier(**GradientBoostingClassifier_hyperparameters_dict)
#                         , 'n_estimators': 50       # O ganho em aumentar esse hiperparâmetro não é muito e aumenta muito o overfit.
#                         , 'learning_rate': 0.3}

# # # Testa n_estimators para GradientBoostingClassifier
# # hyperparameters_dict['base_estimator'] = GradientBoostingClassifier(**GradientBoostingClassifier_hyperparameters_dict)
# # print('Test n_estimators for GradientBoostingClassifier:  10 to 200.')
# # hyperparameter_test_name = 'n_estimators'
# # test_hyperparameters = [10, 50, 100, 200]
# # hyperparameter_result = hyperparameter_AdaBoostClassifier(X_train_target_encoder, y_train[target_name], X_test_target_encoder, y_test[target_name], hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
# # # Plota resultados
# # df = hyperparameter_result['resultados'].copy()
# # tested_parameter = hyperparameter_test_name
# # maximum_minimum = 'maximum'
# # metric_name = 'ROC AUC'
# # print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# # # Salva melhor resultado.
# # hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]

# # Testa learning_rate para GradientBoostingClassifier. ROC sem AdaBoost: 0.8722012787817987, com AdaBoost:
# hyperparameters_dict['base_estimator'] = GradientBoostingClassifier(**GradientBoostingClassifier_hyperparameters_dict)
# print('Test learning_rate for GradientBoostingClassifier:  0.1 to 1.')
# hyperparameter_test_name = 'learning_rate'
# test_hyperparameters = [0.1, 0.2, 0.3, 0.5, 0.8, 1]
# hyperparameter_result = hyperparameter_AdaBoostClassifier(X_train_target_encoder, y_train[target_name], X_test_target_encoder, y_test[target_name], hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
# # Plota resultados
# df = hyperparameter_result['resultados'].copy()
# tested_parameter = hyperparameter_test_name
# maximum_minimum = 'maximum'
# metric_name = 'ROC AUC'
# print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
# # Salva melhor resultado.
# hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]

In [None]:
# LightGBM
import lightgbm as lgb
def hyperparameter_LGBMClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep one lgb.LGBMClassifier hyperparameter and compare train/test ROC AUC.

    For each candidate value a classifier is built from ``hyperparameters_dict``
    with ``hyperparameter_test_name`` replaced by the candidate, fitted on the
    train split and scored with ``roc_auc_score`` on the second column of
    ``predict_proba`` for both splits.

    Hyperparameter reference (LightGBM parameter names):
    objective, metric, boosting_type, verbosity, seed
    min_data_in_leaf. Minimum number of samples in a leaf.
    num_leaves. Maximum number of leaves per tree.
    max_depth. Limit on the depth of each tree.
    feature_fraction. Fraction of the features sampled for each tree. It ranges from 0 to 1.
    learning_rate. Shrinkage rate.

    Parameters
    ----------
    X_train, y_train : data used to fit every candidate model.
    X_test, y_test : data used to score and rank the candidates.
    hyperparameters_dict : dict of keyword arguments for lgb.LGBMClassifier.
        A copy is made for every candidate, so the caller's dict is not modified.
    hyperparameter_test_name : name of the hyperparameter being swept.
    test_hyperparameters : iterable with the candidate values, tested in order.

    Returns
    -------
    dict
        ``result[hyperparameter_test_name]`` is the candidate with the highest
        test ROC AUC (the first one on ties) and ``result['resultados']`` is a
        DataFrame with the columns 'parameter', 'roc_auc_score_train' and
        'roc_auc_score_test', one row per candidate.

    Raises
    ------
    ValueError
        If ``test_hyperparameters`` is empty.

    NOTE(review): candidates are ranked on the split passed as X_test/y_test, so
    scores reported on that same split are optimistic; consider a separate
    validation split.
    '''
    # Materialize the candidates so any iterable is accepted and emptiness can be checked.
    candidates = list(test_hyperparameters)
    if not candidates:
        raise ValueError('test_hyperparameters must contain at least one value to test.')

    rows = []
    for parameter in candidates:
        # Override the tested hyperparameter on a copy, leaving the caller's dict untouched.
        trial_hyperparameters = {**hyperparameters_dict, hyperparameter_test_name: parameter}

        # Fit on the train split, then take the second predict_proba column for both splits.
        clf_fit = lgb.LGBMClassifier(**trial_hyperparameters).fit(X_train, y_train)
        y_pred_train = clf_fit.predict_proba(X_train)[:, 1]
        y_pred_test = clf_fit.predict_proba(X_test)[:, 1]

        # One row per candidate: (value, train ROC AUC, test ROC AUC).
        rows.append((parameter, roc_auc_score(y_train, y_pred_train), roc_auc_score(y_test, y_pred_test)))

    metrica_comparacao_df = pd.DataFrame(rows, columns = ['parameter', 'roc_auc_score_train', 'roc_auc_score_test'])

    # idxmax returns the first row that holds the highest test score.
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    return {
        hyperparameter_test_name: metrica_comparacao_df.loc[best_row, 'parameter'],
        'resultados': metrica_comparacao_df,
    }

# Starting point of the LightGBM search. Each sweep below overwrites the entry it
# tunes with the best value found, so later sweeps build on earlier ones.
hyperparameters_dict = {"objective": "binary",
            "metric": "binary_logloss",
            "boosting_type": "gbdt",
            'verbosity': -1,
            'seed': 1,
            "min_data_in_leaf": 3500,
            "num_leaves": 29,
            "max_depth": 16,
            "feature_fraction": 1,
            "learning_rate": 0.43}

# Sweeps run in order. Each entry is
# (hyperparameter name, candidate values, description of the candidates that is printed).
lightgbm_sweeps = [
    ('min_data_in_leaf', [3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 5000, 6000, 7000], 'from 3500 to 7000'),
    ('num_leaves', np.arange(start = 10, stop = 40, step = 1), 'from 10 to 39'),
    ('max_depth', np.arange(start = 2, stop = 31, step = 1), 'from 2 to 30'),
    ('learning_rate', np.arange(start = 0.01, stop = 0.701, step = 0.01), 'from 0.01 to 0.7'),
]

# Every sweep is judged by the highest ROC AUC.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

for hyperparameter_test_name, test_hyperparameters, tested_range in lightgbm_sweeps:
    print('Test ' + hyperparameter_test_name + ':  ' + tested_range + '.')
    # Unlike the scikit-learn sweeps in this notebook, LightGBM receives X_train / X_test
    # rather than the target-encoded matrices.
    hyperparameter_result = hyperparameter_LGBMClassifier(
        X_train, y_train[target_name],
        X_test, y_test[target_name],
        hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
    # Plot the results.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Keep the best value so the following sweep builds on it.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]



In [None]:
# XGBoost
import xgboost as xgb
def hyperparameter_XGBRegressor(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Sweep a single XGBoost hyperparameter and score every candidate with ROC AUC.

    For each value in ``test_hyperparameters`` an ``xgb.XGBRegressor`` is built from
    ``hyperparameters_dict`` with ``hyperparameter_test_name`` overridden by that value,
    fitted on the train split, and scored with ``roc_auc_score`` on train and test.

    Parameters
    ----------
    X_train, y_train : training features and target passed to ``fit``.
    X_test, y_test : hold-out features and target used to pick the best candidate.
    hyperparameters_dict : dict
        Base keyword arguments for ``xgb.XGBRegressor``. It is NOT modified.
    hyperparameter_test_name : str
        Name of the hyperparameter being swept.
    test_hyperparameters : iterable
        Candidate values, evaluated in the given order.

    Returns
    -------
    dict
        ``{hyperparameter_test_name: best_value, 'resultados': DataFrame}`` where the
        DataFrame has columns ``parameter``, ``roc_auc_score_train`` and
        ``roc_auc_score_test`` (one row per candidate). ``best_value`` is the first
        candidate reaching the highest test score.

    Raises
    ------
    ValueError
        If ``test_hyperparameters`` is empty.

    Reference values used for the hyperparameters in this notebook:
    'max_depth' = 6
    'max_leaves' = 62
    'min_child_weight' = 1
    'subsample' = 0.95       # between 0 and 1
    'colsample_bytree' = 1   # between 0 and 1
    'learning_rate' = 0.3
    'gamma' = 0.365
    'reg_lambda' = 0
    'reg_alpha' = 0

    NOTE(review): a regressor is scored with ROC AUC; this presumably relies on the
    objective in ``hyperparameters_dict`` producing probability-like outputs - confirm.
    '''
    parameters_tested = list(test_hyperparameters)
    if not parameters_tested:
        raise ValueError('test_hyperparameters must contain at least one value.')

    # Work on a private copy: the caller's dict must not be left holding whichever
    # candidate happened to be tried last (or a half-finished state after an error).
    base_hyperparameters = dict(hyperparameters_dict)

    metrica_comparacao_train = []
    metrica_comparacao_test = []
    for parameter in parameters_tested:
        # Override only the hyperparameter under test for this candidate.
        candidate_hyperparameters = {**base_hyperparameters, hyperparameter_test_name: parameter}
        XGBoost = xgb.XGBRegressor(**candidate_hyperparameters)

        # Fit on train, then predict on both splits.
        XGBoost_fit = XGBoost.fit(X_train, y_train)
        y_pred_train = XGBoost_fit.predict(X_train)
        y_pred_test = XGBoost_fit.predict(X_test)

        # Store the comparison metric for this candidate.
        metrica_comparacao_train.append(roc_auc_score(y_train, y_pred_train))
        metrica_comparacao_test.append(roc_auc_score(y_test, y_pred_test))

    # One row per candidate, in the order they were evaluated.
    metrica_comparacao_df = pd.DataFrame(list(zip(parameters_tested, metrica_comparacao_train, metrica_comparacao_test)), columns = ['parameter', 'roc_auc_score_train', 'roc_auc_score_test'])

    # idxmax returns the first row with the highest test score (same tie-breaking as
    # before, without the deprecated np.in1d lookup).
    best_row = metrica_comparacao_df['roc_auc_score_test'].idxmax()
    returned_values = {}
    returned_values[hyperparameter_test_name] = metrica_comparacao_df.loc[best_row, 'parameter']
    returned_values['resultados'] = metrica_comparacao_df
    return returned_values

# Starting point for the XGBoost sweep below; each tuning step overwrites one entry
# with the best value it found.
hyperparameters_dict = dict(
    objective='binary:logistic',
    eval_metric='auc',
    seed=1,
    verbosity=1,
    validate_parameters=True,
    tree_method="hist",
    booster='gbtree',  # gbtree, gblinear or dart.
    max_depth=6,
    max_leaves=62,
    min_child_weight=1,
    subsample=0.95,
    colsample_bytree=1,
    learning_rate=0.3,
    gamma=0.365,
    reg_lambda=0,
    reg_alpha=0,
)


# XGBoost tuning: one hyperparameter is swept at a time, in the order listed here.
# Each entry is (hyperparameter name, banner printed before the sweep, candidate values).
# After every sweep the best value is written back into hyperparameters_dict, so later
# sweeps run with the earlier winners already applied.
xgb_tuning_plan = [
    ('min_child_weight', 'Test min_child_weight:  from 1 to 3000.',
     [1, 50, 100, 200, 400, 600, 800, 1000, 1250, 1500, 1750, 2000, 2250, 2500, 2750, 3000]),
    ('max_depth', 'Test max_depth:  from 1 to 15.',
     np.arange(start = 1, stop = 16, step = 1)),
    ('max_leaves', 'Test max_leaves:  from 1 to 70.',
     np.arange(start = 1, stop = 71, step = 1)),
    ('subsample', 'Test subsample:  from 0.4 to 1.',
     np.arange(start = 0.4, stop = 1.04, step = 0.05)),
    ('colsample_bytree', 'Test colsample_bytree:  from 0.4 to 1.',
     np.arange(start = 0.4, stop = 1.04, step = 0.05)),
    ('learning_rate', 'Test learning_rate:  from 0.01 to 0.5.',
     np.arange(start = 0.01, stop = 0.501, step = 0.01)),
    ('gamma', 'Test gamma:  from 0 to 0.5.',
     np.arange(start = 0, stop = 0.5001, step = 0.005)),
    ('reg_lambda', 'Test reg_lambda:  from 0 to 200.',
     np.arange(start = 0, stop = 201, step = 5)),
    ('reg_alpha', 'Test reg_alpha:  from 0 to 200.',
     np.arange(start = 0, stop = 201, step = 5)),
]

# Every sweep is judged the same way: highest ROC AUC on the test split wins.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

for hyperparameter_test_name, tuning_banner, test_hyperparameters in xgb_tuning_plan:
    print(tuning_banner)
    hyperparameter_result = hyperparameter_XGBRegressor(
        X_train_target_encoder,
        y_train[target_name],
        X_test_target_encoder,
        y_test[target_name],
        hyperparameters_dict,
        hyperparameter_test_name,
        test_hyperparameters,
    )
    # Plot the results: 'resultados' holds the per-candidate metric table.
    df = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name)
    # Save the best result so the next sweep starts from it.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]



Agora, vamos juntar os melhores modelos e combiná-los em um Voting Classifier com votação soft.\
Primeiro, como usamos encoders diferentes em modelos diferentes, vamos recriar nossas tabelas de treino e teste e montar uma pipeline.

In [None]:
# Recap of the variables.
# Continuous variables.
scalar_variables = ['col_1', 'col_2']

# Categorical variables.
categorical_variables = ['col_3', 'col_4']

# Important: the preprocessing pipelines below only receive the columns listed for
# each step, so the frames are restricted to (and ordered as) scalar columns followed
# by categorical columns. Columns not listed would otherwise be dropped.
pipeline_column_order = scalar_variables + categorical_variables

# Targets for train, test and out-of-time (single-column DataFrames).
y_train = df_train[[target_name]].copy()
y_test = df_test[[target_name]].copy()
y_out_of_time = df_out_of_time[[target_name]].copy()

# Features for train, test and out-of-time, reduced to the pipeline columns.
X_train = df_train[X_features].copy()[pipeline_column_order]
X_test = df_test[X_features].copy()[pipeline_column_order]
X_out_of_time = df_out_of_time[X_features].copy()[pipeline_column_order]

# Three preprocessors share the same numeric step (standard scaling of the scalar
# columns) and differ only in how the categorical columns are encoded.
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn
from sklearn.preprocessing import StandardScaler
from category_encoders.target_encoder import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder

# Ordinal encoding (used by LightGBM).
pre_processamento_ordinal_encoder = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), scalar_variables),
        ('ordinal', OrdinalEncoder(), categorical_variables),
    ]
)

# One-hot encoding; the first level of each categorical is dropped.
# NOTE(review): `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 -
# confirm the pinned scikit-learn version before upgrading.
pre_processamento_onehot_encoder = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), scalar_variables),
        ('onehot', OneHotEncoder(sparse=False, drop="first"), categorical_variables),
    ]
)

# Target encoding.
pre_processamento_target_encoder = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), scalar_variables),
        ('target', TargetEncoder(), categorical_variables),
    ]
)


# Model using the one-hot encoder: L1-penalised logistic regression.
from sklearn.linear_model import LogisticRegression
linear_model_hyperparameters_dict = {
    'random_state': 1,
    'max_iter': 100,
    'solver': 'liblinear',
    'penalty': 'l1',
    'C': 1,
    'l1_ratio': None,
}
lr = LogisticRegression(**linear_model_hyperparameters_dict)
# Build the pipeline: one-hot preprocessing followed by the model.
linear_model_imba_pipeline = make_pipeline_imblearn(pre_processamento_onehot_encoder, lr)  # ROC = 0.
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
linear_model_fit = linear_model_imba_pipeline.fit(X_train, y_train[target_name])


# Models using the target encoder start here: k-nearest neighbours.
from sklearn.neighbors import KNeighborsClassifier
knn_hyperparameters_dict = {
    'n_neighbors': 11,
    'weights': 'distance',
    'algorithm': 'auto',
    'p': 1,
    'metric': 'manhattan',
}
knn = KNeighborsClassifier(**knn_hyperparameters_dict)
# Build the pipeline: target-encoder preprocessing followed by the model.
knn_imba_pipeline = make_pipeline_imblearn(pre_processamento_target_encoder, knn)  # ROC = 0.
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
knn_fit = knn_imba_pipeline.fit(X_train, y_train[target_name])

# AdaBoost over a depth-limited decision tree (target encoder).
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
DecisionTreeClassifier_hyperparameters_dict = {
    'random_state': 1,
    'criterion': 'entropy',
    'min_samples_split': 3000,
    'min_samples_leaf': 3000,
    'max_depth': 10,
}
# NOTE(review): `base_estimator` was renamed to `estimator` in scikit-learn 1.2 -
# confirm the pinned scikit-learn version before upgrading.
AdaBoost_hyperparameters_dict = {
    'random_state': 1,
    'base_estimator': DecisionTreeClassifier(**DecisionTreeClassifier_hyperparameters_dict),
    'n_estimators': 50,
    'learning_rate': 0.3,
}
ABC_DecisionTreeClassifier = AdaBoostClassifier(**AdaBoost_hyperparameters_dict)
# Build the pipeline: target-encoder preprocessing followed by the model.
ABC_DecisionTreeClassifier_imba_pipeline = make_pipeline_imblearn(pre_processamento_target_encoder, ABC_DecisionTreeClassifier)  # ROC = 0.8666174044250164
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
ABC_DecisionTreeClassifier_fit = ABC_DecisionTreeClassifier_imba_pipeline.fit(X_train, y_train[target_name])


# Random forest (target encoder).
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier_hyperparameters_dict = {
    'random_state': 1,
    'criterion': 'entropy',
    'n_estimators': 50,
    'min_samples_split': 3000,
    'min_samples_leaf': 3000,
    'max_depth': 10,
    'oob_score': True,
    # Must be the float 1.0 (use 100% of the rows per tree). The integer 1 is read by
    # scikit-learn as "draw exactly one sample per tree", which yields degenerate trees.
    'max_samples': 1.0,
}
RF = RandomForestClassifier(**RandomForestClassifier_hyperparameters_dict)
# Build the pipeline: target-encoder preprocessing followed by the model.
RF_imba_pipeline = make_pipeline_imblearn(pre_processamento_target_encoder, RF)  # ROC = 0.
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
RF_fit = RF_imba_pipeline.fit(X_train, y_train[target_name])


# Extra-trees (target encoder); bootstrap is enabled so oob_score/max_samples apply.
from sklearn.ensemble import ExtraTreesClassifier
ExtraTreesClassifier_hyperparameters_dict = {
    'random_state': 1,
    'criterion': 'gini',
    'n_estimators': 750,
    'min_samples_split': 7000,
    'min_samples_leaf': 3000,
    'max_depth': 10,
    'bootstrap': True,
    'oob_score': True,
    'max_samples': 0.9,
}
EF = ExtraTreesClassifier(**ExtraTreesClassifier_hyperparameters_dict)
# Build the pipeline: target-encoder preprocessing followed by the model.
EF_imba_pipeline = make_pipeline_imblearn(pre_processamento_target_encoder, EF)  # ROC = 0.
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
EF_fit = EF_imba_pipeline.fit(X_train, y_train[target_name])

# Gradient boosting with exponential loss (target encoder).
from sklearn.ensemble import GradientBoostingClassifier
GradientBoostingClassifier_hyperparameters_dict = {
    'random_state': 1,
    'loss': 'exponential',
    'criterion': 'friedman_mse',
    'n_estimators': 100,
    'min_samples_split': 3000,
    'min_samples_leaf': 3000,
    'max_depth': 15,
    'subsample': 1,
    'learning_rate': 0.5,
}
GBC = GradientBoostingClassifier(**GradientBoostingClassifier_hyperparameters_dict)
# Build the pipeline: target-encoder preprocessing followed by the model.
GBC_imba_pipeline = make_pipeline_imblearn(pre_processamento_target_encoder, GBC)  # ROC = 0.
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
GBC_fit = GBC_imba_pipeline.fit(X_train, y_train[target_name])


# XGBoost (target encoder).
import xgboost as xgb
xgb_hyperparameters_dict = {
    'objective': 'binary:logistic',
    'use_label_encoder': False,  # Necessary only when using the scikit-learn api.
    'eval_metric': 'auc',
    'seed': 1,
    'verbosity': 1,
    'validate_parameters': True,
    'tree_method': "hist",
    'booster': 'gbtree',  # gbtree, gblinear or dart.
    'max_depth': 6,
    'max_leaves': 62,
    'min_child_weight': 1,
    'subsample': 0.95,
    'colsample_bytree': 1,
    'learning_rate': 0.3,
    'gamma': 0.365,
    'reg_lambda': 0,
    'reg_alpha': 0,
}
# The scikit-learn classifier API is used (not XGBRegressor as in the tuning step)
# because the soft-vote model needs the predict_proba method.
XGBoost = xgb.XGBClassifier(**xgb_hyperparameters_dict)
# Build the pipeline: target-encoder preprocessing followed by the model.
XGBoost_imba_pipeline = make_pipeline_imblearn(pre_processamento_target_encoder, XGBoost)  # ROC = 0.
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
XGBoost_fit = XGBoost_imba_pipeline.fit(X_train, y_train[target_name])


# Model using the ordinal encoder: LightGBM.
import lightgbm as lgb
lightGBM_hyperparameters_dict = {
    "objective": "binary",
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    'verbosity': -1,
    'seed': 1,
    "min_data_in_leaf": 3500,
    "num_leaves": 29,
    "max_depth": 16,
    "feature_fraction": 1,
    "learning_rate": 0.43,
}
lightGBM = lgb.LGBMClassifier(**lightGBM_hyperparameters_dict)
# Build the pipeline: ordinal-encoder preprocessing followed by the model.
lightGBM_imba_pipeline = make_pipeline_imblearn(pre_processamento_ordinal_encoder, lightGBM) # ROC = 0.
# The ordinal preprocessor emits the scaled scalar columns first and the encoded
# categorical columns right after them, so the categorical positions are derived from
# the two column lists instead of being hard-coded (hard-coded indices silently go
# stale whenever either list changes).
lightGBM_categorical_feature_indices = list(
    range(len(scalar_variables), len(scalar_variables) + len(categorical_variables))
)
# Fit the model. The target column is addressed through target_name, the same name
# used to build y_train, instead of a hard-coded column label.
lightGBM_fit = lightGBM_imba_pipeline.fit(
    X_train,
    y_train[target_name],
    lgbmclassifier__categorical_feature = lightGBM_categorical_feature_indices,
)


# Soft voting model over all eight pipelines.
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
estimators = [
    ('linear_model_fit', linear_model_fit),
    ('knn_fit', knn_fit),
    ('ABC_DecisionTreeClassifier_fit', ABC_DecisionTreeClassifier_fit),
    ('RF_fit', RF_fit),
    ('EF_fit', EF_fit),
    ('GBC_fit', GBC_fit),
    ('XGBoost_fit', XGBoost_fit),
    ('lightGBM_fit', lightGBM_fit),
]

# NOTE(review): VotingClassifier.fit refits clones of the estimators, so fit-time
# arguments given to the individual pipelines earlier are presumably not reused - confirm.
VC = VotingClassifier(estimators, voting='soft')
# The target column is addressed through target_name (the name y_train / y_test were
# built with) instead of a hard-coded column label.
VC = VC.fit(X_train, y_train[target_name])
# Predict on the test split (probability of the positive class) and report ROC AUC.
y_pred_test = VC.predict_proba(X_test)[:,1]
roc_auc_score(y_test[target_name], y_pred_test)

Vamos tentar tirar alguns estimadores.

In [None]:
# Soft voting again, this time leaving out the logistic regression (linear_model_fit).
estimators = [
    ('knn_fit', knn_fit),
    ('ABC_DecisionTreeClassifier_fit', ABC_DecisionTreeClassifier_fit),
    ('RF_fit', RF_fit),
    ('EF_fit', EF_fit),
    ('GBC_fit', GBC_fit),
    ('XGBoost_fit', XGBoost_fit),
    ('lightGBM_fit', lightGBM_fit),
]

VC = VotingClassifier(estimators, voting='soft')
# The target column is addressed through target_name (the name y_train / y_test were
# built with) instead of a hard-coded column label.
VC = VC.fit(X_train, y_train[target_name])
# Predict on the test split (probability of the positive class) and report ROC AUC.
y_pred_test = VC.predict_proba(X_test)[:,1]
roc_auc_score(y_test[target_name], y_pred_test)

In [None]:
# Soft voting leaving out both the logistic regression (linear_model_fit) and the
# extra-trees model (EF_fit).
estimators = [
    ('knn_fit', knn_fit),
    ('ABC_DecisionTreeClassifier_fit', ABC_DecisionTreeClassifier_fit),
    ('RF_fit', RF_fit),
    ('GBC_fit', GBC_fit),
    ('XGBoost_fit', XGBoost_fit),
    ('lightGBM_fit', lightGBM_fit),
]

VC = VotingClassifier(estimators, voting='soft')
# The target column is addressed through target_name (the name y_train / y_test were
# built with) instead of a hard-coded column label.
VC = VC.fit(X_train, y_train[target_name])
# Predict on the test split (probability of the positive class) and report ROC AUC.
y_pred_test = VC.predict_proba(X_test)[:,1]
roc_auc_score(y_test[target_name], y_pred_test)

Em vez de usar um soft voting, vamos tentar o StackingClassifier com uma regressão logística como estimador final.

In [None]:
# Stacking over all eight pipelines, with a default logistic regression on top.
from sklearn.ensemble import StackingClassifier
estimators = [
    ('linear_model_fit', linear_model_fit),
    ('knn_fit', knn_fit),
    ('ABC_DecisionTreeClassifier_fit', ABC_DecisionTreeClassifier_fit),
    ('RF_fit', RF_fit),
    ('EF_fit', EF_fit),
    ('GBC_fit', GBC_fit),
    ('XGBoost_fit', XGBoost_fit),
    ('lightGBM_fit', lightGBM_fit),
]

SC = StackingClassifier(estimators = estimators, final_estimator=LogisticRegression())
# The target column is addressed through target_name (the name y_train / y_test were
# built with) instead of a hard-coded column label.
SC_fit = SC.fit(X_train, y_train[target_name])
# Predict on the test split (probability of the positive class) and report ROC AUC.
y_pred_test = SC_fit.predict_proba(X_test)[:,1]
roc_auc_score(y_test[target_name], y_pred_test)

Vamos tentar tirar alguns estimadores.

In [None]:
# Stacking again, this time leaving out the logistic regression base model
# (linear_model_fit); the final estimator is still a default logistic regression.
estimators = [
    ('knn_fit', knn_fit),
    ('ABC_DecisionTreeClassifier_fit', ABC_DecisionTreeClassifier_fit),
    ('RF_fit', RF_fit),
    ('EF_fit', EF_fit),
    ('GBC_fit', GBC_fit),
    ('XGBoost_fit', XGBoost_fit),
    ('lightGBM_fit', lightGBM_fit),
]

SC = StackingClassifier(estimators = estimators, final_estimator=LogisticRegression())
# The target column is addressed through target_name (the name y_train / y_test were
# built with) instead of a hard-coded column label.
SC_fit = SC.fit(X_train, y_train[target_name])
# Predict on the test split (probability of the positive class) and report ROC AUC.
y_pred_test = SC_fit.predict_proba(X_test)[:,1]
roc_auc_score(y_test[target_name], y_pred_test)