In [None]:
import io
from IPython.display import display
# from bbmagic import Hdfs

from trata_variaveis import trata_variaveis_categoricas

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patches as mpatches
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import scikitplot as skplt

import lightgbm as lgb
import xgboost as xgb

from sklearn.compose import ColumnTransformer
from imblearn.pipeline import make_pipeline as make_pipeline_imblearn
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders.target_encoder import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.impute import SimpleImputer


from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Export step, kept for reference only (with index = True the index would be written as an extra column):
# df.to_csv('df.csv', index = False)

# Read the extract and keep only the columns that are useful for the model.
model_columns = ['year_month', 'target', 'col_4', 'col_5', 'col_6', 'col_7', 'value']
df = pd.read_csv('df.csv')[model_columns]

n_rows, n_cols = df.shape
print('Observations:', n_rows) # XX.XXX
print('Variables:', n_cols) # XX
df.head(5)

In [None]:
# Out-of-time split: the months listed below are held out as the out-of-time sample.
out_of_time_months = [202111, 202112]
is_out_of_time = df['year_month'].isin(out_of_time_months)

# The out-of-time frame only loses the period column; it keeps 'target' and 'value'.
df_out_of_time = df[is_out_of_time].drop('year_month', axis = 1)

# The modelling frame additionally drops 'value', so it is not available as a feature.
df_model = df[~is_out_of_time].drop(['year_month', 'value'], axis = 1)

# Report the result of the split (the unsplit `df` was already shown when it was loaded).
print('Modelling observations:', len(df_model)) # XX.XXX
print('Modelling variables:', df_model.shape[1]) # XX
print('Out-of-time observations:', len(df_out_of_time)) # XX.XXX
df_model.head(5)

In [None]:
# Decide how many variables to use in the XGBoost model.
# The search is a greedy forward selection:
# first it fits one model per single variable and keeps the best one; then it fits every two-variable model
# that contains the variable already kept and again keeps the best; and so on until all variables are in.
xgb_hyperparameters_dict = { 'objective': 'binary:logistic',
                            'use_label_encoder': False,             # Necessary only when using the scikit-learn api.
                            'eval_metric': 'auc',
                            'verbosity': 1,
                            'validate_parameters': True,
                            'tree_method': "hist",
                            'booster': 'gbtree',       # gbtree, gblinear or dart.
                            }

# Column roles; they do not change between resamples, so they are defined once.
categorical_variables = ['col_6', 'col_7']
scalar_variables = ['col_4', 'col_5']

# Number of bootstrap resamples on which the whole forward selection is repeated.
n_bootstraps = 10

# Cross-validation
# One column per resample: the variables in the order they were selected, and the test ROC AUC after each step.
df_auc_roc = pd.DataFrame()
df_variables = pd.DataFrame()
for cross_i in range(0, n_bootstraps):
    # Bootstrap resample for training; the rows that were not drawn form the test set.
    # The resample is seeded with the iteration number so a re-run reproduces the same splits.
    df_train_bootstrap = df_model.sample(len(df_model), replace = True, random_state = cross_i).copy()
    df_test_bootstrap = df_model[~df_model.index.isin(df_train_bootstrap.index)].copy()

    y_train = df_train_bootstrap['target']
    df_train_bootstrap = df_train_bootstrap.drop('target', axis = 1)
    y_test = df_test_bootstrap['target']
    df_test_bootstrap = df_test_bootstrap.drop('target', axis = 1)

    # Fill the nulls of the scalar variables using the median of the train dataset.
    median_dict = {col: df_train_bootstrap[col].median() for col in scalar_variables}
    for key in median_dict:
        df_train_bootstrap[key] = df_train_bootstrap[key].fillna(median_dict[key])
        df_test_bootstrap[key] = df_test_bootstrap[key].fillna(median_dict[key])

    # Normalizing is important if we want to use regularization. The scaler is fitted on train only.
    scaler = StandardScaler()
    scaler.fit(df_train_bootstrap[scalar_variables])
    df_train_bootstrap[scalar_variables] = scaler.transform(df_train_bootstrap[scalar_variables])
    df_test_bootstrap[scalar_variables] = scaler.transform(df_test_bootstrap[scalar_variables])

    # Target encoder for the categorical variables, also fitted on train only.
    target_encoder_dict = {}
    for col in categorical_variables:
        target_encoder_dict[col] = TargetEncoder().fit(df_train_bootstrap[col], y_train)
    for key in target_encoder_dict:
        df_train_bootstrap[key] = target_encoder_dict[key].transform(df_train_bootstrap[key])
        df_test_bootstrap[key] = target_encoder_dict[key].transform(df_test_bootstrap[key])

    auc_roc = []
    # 'target' is only a placeholder for the first position; the selected variables are appended after it.
    variables = ['target']
    columns = df_train_bootstrap.columns

    for step in range(0, len(columns)):
        auc_roc_best = 0
        variables_best = None

        for candidate in columns:
            if candidate in variables:   # Already selected, so we don't test the same model twice.
                continue
            variables_test = variables[1:] + [candidate]
            # XGBRegressor with the 'binary:logistic' objective; its predict() output is used as the score.
            XGBoost = xgb.XGBRegressor(**xgb_hyperparameters_dict)
            # fit and predict
            XGBoost_fit = XGBoost.fit(df_train_bootstrap[variables_test], y_train)
            y_pred_test = XGBoost_fit.predict(df_test_bootstrap[variables_test])
            auc_roc_aux = roc_auc_score(y_test, y_pred_test)
            if auc_roc_aux > auc_roc_best:
                variables_best = candidate
                auc_roc_best = auc_roc_aux

        variables = variables + [variables_best]
        auc_roc = auc_roc + [auc_roc_best]
    # Save final results
    df_variables[cross_i] = variables
    df_auc_roc[cross_i] = auc_roc
    print(cross_i)
# `variables` has one more entry than there are selection steps because of the 'target' placeholder.
variables_count = np.arange(start = 1, stop = len(variables), step = 1)
df_variables['qnt_variables'] = df_variables.index
df_auc_roc['qnt_variables'] = variables_count

In [None]:
# Boxplot of the test ROC AUC by quantity of variables used (one observation per bootstrap resample).
# value_vars is left unset so that every bootstrap column is melted, however many resamples were run.
df_auc_roc_2 = pd.melt(df_auc_roc, id_vars = ['qnt_variables'], var_name = 'cros_val', value_name = 'roc_auc')
title = 'Performance by used variables'
plt.rcParams.update(plt.rcParamsDefault)
sns.set(rc = {'figure.figsize': (10, 6)})
bg_color = "white"
contorno = 'black'
color = 'black'
sns.set_style("darkgrid", {'axes.facecolor': bg_color
                          , 'axes.edgecolor': contorno})
meanpointprops = dict(color = color, linewidth = 1.5)

ax = sns.boxplot(y = 'roc_auc', data = df_auc_roc_2, x = 'qnt_variables'
                    , showmeans = True, meanline = True, meanprops = meanpointprops)
ax.set_title(title, fontsize = 10)
ax.set_xlabel('Quantity of variables in the model')
ax.set_ylabel('ROC AUC (bootstrap test set)')
plt.show()
plt.close()

In [None]:
# Verify which variables were selected at each step of the forward selection.
# Each bootstrap resample can select the variables in a different order, so the heat map shows, for every
# step, the proportion of resamples in which each variable had already been selected.
selection_steps = df_variables.iloc[1:,:-1]   # skip the 'target' placeholder row and the 'qnt_variables' column
n_resamples = selection_steps.shape[1]        # one column per bootstrap resample
df_variables_heatmap = selection_steps.apply(pd.Series.value_counts, axis=1).fillna(0) / n_resamples
df_variables_heatmap = df_variables_heatmap.cumsum()
# Zeros become NaN so that cells of variables not selected yet are left blank.
df_variables_heatmap = df_variables_heatmap.replace({0: np.nan})

heatmap_title = 'Proportion of bootstrap resamples that selected each variable (XGBoost)'
size_x = 10
size_y = 10
plt.figure(figsize = (size_x, size_y))
sns.set(font_scale = 1)
with sns.axes_style('white'):
    ax = sns.heatmap(df_variables_heatmap.T
                    , linewidth = 0.2
                    , annot = True, fmt = '.1f'
                    , cmap = 'seismic'
                    , vmin = -1, vmax = 1)
plt.title(heatmap_title)
# Widen the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for clipped heat-map rows in some matplotlib versions; confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()
plt.close()

In [None]:
# Decide how many variables to use in the LightGBM model.
# The search is a greedy forward selection:
# first it fits one model per single variable and keeps the best one; then it fits every two-variable model
# that contains the variable already kept and again keeps the best; and so on until all variables are in.
hyperparameters_LightGBM_dict = {"objective": "binary",
            "metric": "binary_logloss",
            "boosting_type": "gbdt",
            'verbose': -1,
            }

# Column roles; they do not change between resamples, so they are defined once.
categorical_variables = ['col_6', 'col_7']
scalar_variables = ['col_4', 'col_5']

# Number of bootstrap resamples on which the whole forward selection is repeated.
n_bootstraps = 10

# Cross-validation
# One column per resample: the variables in the order they were selected, and the test ROC AUC after each step.
df_variables_lightgbm = pd.DataFrame()
df_auc_roc_lightgbm = pd.DataFrame()
for cross_i in range(0, n_bootstraps):
    # Bootstrap resample for training; the rows that were not drawn form the test set.
    # The resample is seeded with the iteration number so a re-run reproduces the same splits.
    df_train_bootstrap = df_model.sample(len(df_model), replace = True, random_state = cross_i).copy()
    df_test_bootstrap = df_model[~df_model.index.isin(df_train_bootstrap.index)].copy()

    y_train = df_train_bootstrap['target']
    df_train_bootstrap = df_train_bootstrap.drop('target', axis = 1)
    y_test = df_test_bootstrap['target']
    df_test_bootstrap = df_test_bootstrap.drop('target', axis = 1)

    # Fill the nulls of the scalar variables using the median of the train dataset.
    median_dict = {col: df_train_bootstrap[col].median() for col in scalar_variables}
    for key in median_dict:
        df_train_bootstrap[key] = df_train_bootstrap[key].fillna(median_dict[key])
        df_test_bootstrap[key] = df_test_bootstrap[key].fillna(median_dict[key])

    # Normalizing is important if we want to use regularization. The scaler is fitted on train only.
    scaler = StandardScaler()
    scaler.fit(df_train_bootstrap[scalar_variables])
    df_train_bootstrap[scalar_variables] = scaler.transform(df_train_bootstrap[scalar_variables])
    df_test_bootstrap[scalar_variables] = scaler.transform(df_test_bootstrap[scalar_variables])

    # Give the categorical variables the pandas 'category' dtype.
    for col in categorical_variables:
        df_train_bootstrap[col] = df_train_bootstrap[col].astype('category')
        df_test_bootstrap[col] = df_test_bootstrap[col].astype('category')

    auc_roc = []
    # 'target' is only a placeholder for the first position; the selected variables are appended after it.
    variables = ['target']
    columns = df_train_bootstrap.columns

    for step in range(0, len(columns)):
        auc_roc_best = 0
        variables_best = None

        for candidate in columns:
            if candidate in variables:   # Already selected, so we don't test the same model twice.
                continue
            variables_test = variables[1:] + [candidate]
            LightGBM = lgb.LGBMClassifier(**hyperparameters_LightGBM_dict)
            # fit and predict (the second predict_proba column is used as the score)
            LightGBM_fit = LightGBM.fit(df_train_bootstrap[variables_test], y_train)
            y_pred_test = LightGBM_fit.predict_proba(df_test_bootstrap[variables_test])[:,1]
            auc_roc_aux = roc_auc_score(y_test, y_pred_test)
            if auc_roc_aux > auc_roc_best:
                variables_best = candidate
                auc_roc_best = auc_roc_aux

        variables = variables + [variables_best]
        auc_roc = auc_roc + [auc_roc_best]
    # Save final results
    df_variables_lightgbm[cross_i] = variables
    df_auc_roc_lightgbm[cross_i] = auc_roc
    print(cross_i)
# `variables` has one more entry than there are selection steps because of the 'target' placeholder.
variables_count = np.arange(start = 1, stop = len(variables), step = 1)
df_variables_lightgbm['qnt_variables'] = df_variables_lightgbm.index
df_auc_roc_lightgbm['qnt_variables'] = variables_count

In [None]:
# Boxplot of the test ROC AUC by quantity of variables used (one observation per bootstrap resample).
# value_vars is left unset so that every bootstrap column is melted, however many resamples were run.
df_auc_roc_lightgbm_2 = pd.melt(df_auc_roc_lightgbm, id_vars = ['qnt_variables'], var_name = 'cros_val', value_name = 'roc_auc')
title = 'Performance by used variables'
plt.rcParams.update(plt.rcParamsDefault)
sns.set(rc = {'figure.figsize': (10, 6)})
bg_color = "white"
contorno = 'black'
color = 'black'
sns.set_style("darkgrid", {'axes.facecolor': bg_color
                          , 'axes.edgecolor': contorno})
meanpointprops = dict(color = color, linewidth = 1.5)

ax = sns.boxplot(y = 'roc_auc', data = df_auc_roc_lightgbm_2, x = 'qnt_variables'
                    , showmeans = True, meanline = True, meanprops = meanpointprops)
ax.set_title(title, fontsize = 10)
ax.set_xlabel('Quantity of variables in the model')
ax.set_ylabel('ROC AUC (bootstrap test set)')
plt.show()
plt.close()

In [None]:
# Verify which variables were selected at each step of the forward selection.
# Each bootstrap resample can select the variables in a different order, so the heat map shows, for every
# step, the proportion of resamples in which each variable had already been selected.
selection_steps_lightgbm = df_variables_lightgbm.iloc[1:,:-1]   # skip the 'target' placeholder row and the 'qnt_variables' column
n_resamples_lightgbm = selection_steps_lightgbm.shape[1]        # one column per bootstrap resample
df_variables_lightgbm_heatmap = selection_steps_lightgbm.apply(pd.Series.value_counts, axis=1).fillna(0) / n_resamples_lightgbm
df_variables_lightgbm_heatmap = df_variables_lightgbm_heatmap.cumsum()
# Zeros become NaN so that cells of variables not selected yet are left blank.
df_variables_lightgbm_heatmap = df_variables_lightgbm_heatmap.replace({0: np.nan})

heatmap_title_lightgbm = 'Proportion of bootstrap resamples that selected each variable (LightGBM)'
size_x = 10
size_y = 10
plt.figure(figsize = (size_x, size_y))
sns.set(font_scale = 1)
with sns.axes_style('white'):
    ax = sns.heatmap(df_variables_lightgbm_heatmap.T
                    , linewidth = 0.2
                    , annot = True, fmt = '.1f'
                    , cmap = 'seismic'
                    , vmin = -1, vmax = 1)
plt.title(heatmap_title_lightgbm)
# Widen the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for clipped heat-map rows in some matplotlib versions; confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()
plt.close()

In [None]:
# Plot performance for train and test for each hyperparameter used.
def print_plot_best_hyperparameter_result(df, maximum_minimum, tested_parameter, metric_name):
    '''
    Print the best tested value of a hyperparameter and plot the train and test results.

    Parameters
    ----------
    df : pandas.DataFrame
        Read by position: the first column holds the parameter values tested, the second the
        results on the train dataset and the third the results on the test dataset.
    maximum_minimum : str
        'maximum' when the largest test result is the best one, 'minimum' when the smallest is.
    tested_parameter : str
        Name of the hyperparameter; used in the printed messages and in the plot title.
    metric_name : str
        Name of the metric; used in the printed messages and as the y-axis label.

    Raises
    ------
    ValueError
        If maximum_minimum is neither 'maximum' nor 'minimum', or if df has no rows.
    '''
    # Validate first: otherwise an unexpected mode would only fail later with an unbound-variable error.
    if maximum_minimum not in ('maximum', 'minimum'):
        raise ValueError("maximum_minimum must be 'maximum' or 'minimum', got %r" % (maximum_minimum,))
    if len(df) == 0:
        raise ValueError('df must contain at least one tested parameter')

    parameters_tested = df.iloc[:,0]
    train_results = df.iloc[:,1]
    test_results = df.iloc[:,2]

    # Find best parameter and print
    if maximum_minimum == 'maximum':
        best_parameter_result = test_results.max()
    else:
        best_parameter_result = test_results.min()
    # Position of the first row that reaches the best test result (ties resolve to the earliest row).
    best_position = int((test_results == best_parameter_result).to_numpy().argmax())
    print('Used metric:', metric_name)
    print('Best', tested_parameter, 'parameter:', parameters_tested.iloc[best_position])
    print('Best', tested_parameter, 'parameter result:', best_parameter_result, '\n')

    # Plot train test results
    plt.rcParams.update(plt.rcParamsDefault)
    fig, ax = plt.subplots(figsize=(10, 6))
    # Plot first the results for train
    ax.plot(parameters_tested, train_results, color='red', linewidth = 0.7, linestyle = 'solid', label = 'Train')
    # Add dots in tested points
    ax.scatter(parameters_tested, train_results, s = 8, color = 'black')

    # Plot the results for test
    ax.plot(parameters_tested, test_results, color='blue', linewidth = 0.7, linestyle = 'solid', label = 'Test')
    # Add dots in tested points
    ax.scatter(parameters_tested, test_results, s = 8, color = 'black')

    ax.set_title('%s by %s' % (metric_name, tested_parameter))
    ax.set_ylabel(metric_name)
    ax.legend(loc = 'lower center', bbox_to_anchor=(0.5, -0.18))
    plt.show()
    plt.close()

In [None]:
# Define some hyperparameters for LightGBM Classifier
def hyperparameter_LGBMClassifier(X_train, y_train, X_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters):
    '''
    Fit one LightGBM classifier per candidate value of a single hyperparameter and compare them by ROC AUC.

    Parameters
    ----------
    X_train, y_train : training features and target, passed to LGBMClassifier.fit.
    X_test, y_test : test features and target used to score every candidate.
    hyperparameters_dict : dict
        Base hyperparameters passed to lgb.LGBMClassifier. It is not modified.
    hyperparameter_test_name : str
        Name of the hyperparameter whose value is replaced by each candidate.
    test_hyperparameters : iterable
        Candidate values to test.

    Returns
    -------
    dict
        Under the key `hyperparameter_test_name`: the first candidate that reaches the highest test ROC AUC.
        Under the key 'resultados': a DataFrame with the columns 'parameter', 'roc_auc_score_train'
        and 'roc_auc_score_test', one row per candidate.

    Raises
    ------
    ValueError
        If test_hyperparameters is empty.
    '''
    parameters_tested = list(test_hyperparameters)
    if not parameters_tested:
        raise ValueError('test_hyperparameters must contain at least one value to test')

    returned_values = {}
    metrica_comparacao_train = []
    metrica_comparacao_test = []
    for parameter in parameters_tested:
        # Change the tested hyperparameter on a copy, so the caller's dictionary is left untouched.
        trial_hyperparameters = dict(hyperparameters_dict)
        trial_hyperparameters[hyperparameter_test_name] = parameter
        # Set model hyperparameters
        clf = lgb.LGBMClassifier(**trial_hyperparameters)

        # fit and predict (the second predict_proba column is used as the score)
        clf_fit = clf.fit(X_train, y_train)
        y_pred_train = clf_fit.predict_proba(X_train)[:,1]
        y_pred_test = clf_fit.predict_proba(X_test)[:,1]

        # Save metrics to compare the performance
        metrica_comparacao_train.append(roc_auc_score(y_train, y_pred_train))
        metrica_comparacao_test.append(roc_auc_score(y_test, y_pred_test))

    # Save results
    metrica_comparacao_df = pd.DataFrame({'parameter': parameters_tested,
                                          'roc_auc_score_train': metrica_comparacao_train,
                                          'roc_auc_score_test': metrica_comparacao_test})

    # Return the best hyperparameter and table with metrics
    test_scores = metrica_comparacao_df['roc_auc_score_test']
    # Position of the first row that reaches the highest test score (ties resolve to the earliest candidate).
    best_position = int((test_scores == test_scores.max()).to_numpy().argmax())
    returned_values[hyperparameter_test_name] = metrica_comparacao_df['parameter'].values[best_position]
    returned_values['resultados'] = metrica_comparacao_df
    return returned_values

# NOTE(review): `variaveis_escolhidas_lightgbm` (the list of variables chosen for LightGBM) is not defined
# in the part of the notebook visible here; it has to be defined before this point.
# df_train_bootstrap, df_test_bootstrap, y_train and y_test are the objects left behind by the last
# bootstrap iteration that ran before this cell.
df_train = df_train_bootstrap[variaveis_escolhidas_lightgbm]
df_test = df_test_bootstrap[variaveis_escolhidas_lightgbm]

# Starting point of the search; each tuned value replaces the corresponding entry below.
hyperparameters_dict = {"objective": "binary",
            "metric": "binary_logloss",
            "boosting_type": "gbdt",
            'verbose': -1,
            'seed': 1,
            "num_iterations": 300,
            "min_data_in_leaf": 3000,
            "num_leaves": 8,
            "max_depth": 7,
            "feature_fraction": 0.45,
            "learning_rate": 0.12,
            "lambda_l1": 0.001,
            "lambda_l2": 0.01,
            "cat_l2": 10,
            "cat_smooth": 10}

# Hyperparameters are tuned one at a time, in this order; the best value found for one hyperparameter
# is kept while the next ones are tested.
hyperparameter_grids = [
    ('num_iterations', np.arange(start = 50, stop = 501, step = 50)),
    ('min_data_in_leaf', [3000, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 5000, 6000, 7000]),
    ('feature_fraction', np.arange(start = 0.05, stop = 1, step = 0.05)),
    ('lambda_l1', [0, 0.0001, 0.001, 0.01, 0.1, 1, 2, 5, 7, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400]),
    ('lambda_l2', [0, 0.0001, 0.001, 0.01, 0.1, 1, 2, 5, 7, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400]),
    ('cat_l2', [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400]),
    ('cat_smooth', [0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 150, 200, 250, 300, 350, 400]),
    ('num_leaves', np.arange(start = 5, stop = 21, step = 1)),
    ('max_depth', np.arange(start = 2, stop = 16, step = 1)),
    ('learning_rate', np.arange(start = 0.01, stop = 0.701, step = 0.01)),
]

# Every hyperparameter is compared by the test ROC AUC, where larger is better.
maximum_minimum = 'maximum'
metric_name = 'ROC AUC'

for hyperparameter_test_name, test_hyperparameters in hyperparameter_grids:
    # The announced range is taken from the grid itself, so the message always matches what is tested.
    print('Test %s: from %g to %g.' % (hyperparameter_test_name, min(test_hyperparameters), max(test_hyperparameters)))
    hyperparameter_result = hyperparameter_LGBMClassifier(df_train, y_train, df_test, y_test, hyperparameters_dict, hyperparameter_test_name, test_hyperparameters)
    # Plot results
    df_hyperparameter_result_resultados = hyperparameter_result['resultados'].copy()
    tested_parameter = hyperparameter_test_name
    print_plot_best_hyperparameter_result(df_hyperparameter_result_resultados, maximum_minimum, tested_parameter, metric_name)
    # Save best result.
    hyperparameters_dict[hyperparameter_test_name] = hyperparameter_result[hyperparameter_test_name]

In [None]:
# Final model
# NOTE(review): these values are hard-coded; results of an earlier one-at-a-time tuning are not carried over
# automatically. Confirm they are the intended final settings.
hyperparameters_dict = {"objective": "binary",
            "metric": "binary_logloss",
            "boosting_type": "gbdt",
            'verbose': -1,
            'seed': 1,
            "num_iterations": 300,
            "min_data_in_leaf": 3000,
            "num_leaves": 8,
            "max_depth": 7,
            "feature_fraction": 0.45,
            "learning_rate": 0.12,
            "lambda_l1": 0.001,
            "lambda_l2": 0.01,
            "cat_l2": 10,
            "cat_smooth": 10}

lightGBM = lgb.LGBMClassifier(**hyperparameters_dict)
# fit and predict (the second predict_proba column is used as the score)
lightGBM_fit = lightGBM.fit(df_train, y_train)
y_pred_train = lightGBM_fit.predict_proba(df_train)[:,1]
y_pred_test = lightGBM_fit.predict_proba(df_test)[:,1]

# Mark 0 or 1 for each prediction: scores below the cut-off become 0, all the others 1.
classification_threshold = 0.5
y_train_pred_binario = np.where(y_pred_train < classification_threshold, 0, 1)
y_test_pred_binario = np.where(y_pred_test < classification_threshold, 0, 1)

# Out of time
# Build the out-of-time target and features with the same steps applied to the test data: median fill,
# scaling, 'category' dtype, and the columns the model was trained on.
# NOTE(review): `median_dict`, `scaler`, `scalar_variables` and `categorical_variables` are assumed to be the
# objects fitted together with df_train (last bootstrap iteration); confirm against the training cell.
y_out_of_time = df_out_of_time['target']
df_out_of_time_2 = df_out_of_time.drop('target', axis = 1).copy()
for key in median_dict:
    df_out_of_time_2[key] = df_out_of_time_2[key].fillna(median_dict[key])
df_out_of_time_2[scalar_variables] = scaler.transform(df_out_of_time_2[scalar_variables])
for col in categorical_variables:
    df_out_of_time_2[col] = df_out_of_time_2[col].astype('category')
df_out_of_time_2 = df_out_of_time_2[df_train.columns]

y_pred_out_of_time = lightGBM_fit.predict_proba(df_out_of_time_2)[:,1]
# Mark 0 or 1 for y_pred_out_of_time
y_pred_out_of_time_binario = np.where(y_pred_out_of_time < classification_threshold, 0, 1)

print()
print('Proportion of target = 1 in test:', np.mean(y_test))
print('Proportion of target = 1 predicted in test:', np.mean(y_test_pred_binario))
print('Proportion of target = 1 in out of time:', np.mean(y_out_of_time))
print('Proportion of target = 1 predicted in out of time:', np.mean(y_pred_out_of_time_binario))

In [None]:
# Plot the feature importance of the fitted LightGBM model.
importance_figsize = (10, 10)
lgb.plot_importance(lightGBM, figsize = importance_figsize, grid = False)

In [None]:
# Show some metrics of our model.
def roc_curve_graph(y_true, y_pred, title):
    '''
    Plot the ROC curve of a set of scores, with the AUC in the legend and the diagonal as reference.

    Parameters
    ----------
    y_true : true binary labels.
    y_pred : predicted scores for the positive class.
    title : str
        Title of the plot.
    '''
    # Imported here under explicit names: in this notebook the module-level name `metrics` is a function,
    # so `metrics.roc_curve` and `metrics.auc` cannot be resolved through it.
    from sklearn.metrics import auc, roc_curve

    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)

    plt.title(title)
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    plt.close()
    
def matriz_confusao(y_true, y_pred, title):
    '''
    Plot the 2x2 confusion matrix of a binary classification as an annotated heat map.

    Every cell shows its name (True Neg, False Pos, False Neg, True Pos), its count and its share
    of all observations.

    Parameters
    ----------
    y_true : true labels (0 or 1).
    y_pred : predicted labels (0 or 1).
    title : str
        Title of the plot.
    '''
    # Confusion matrix. The labels are fixed so the matrix is 2x2 even when one of the classes is
    # missing from y_true and y_pred; otherwise the reshape of the annotations below would fail.
    cf_matrix = confusion_matrix(y_true, y_pred, labels = [0, 1])
    group_names = ['True Neg','False Pos','False Neg', 'True Pos']
    group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    plt.figure(figsize=(8,5))
    # fmt = '' because the annotations are already formatted strings.
    ax = sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
    ax.set_xlabel('\nPredicted values')
    ax.set_ylabel('Actual values')
    ax.set_title(title, fontsize = 10)
    ax.xaxis.set_ticklabels(['0','1'])
    ax.yaxis.set_ticklabels(['0','1'])
    plt.tight_layout()
    plt.show()
    plt.close()
    
def metrics(y_real, y_pred, y_pred_binario, dataset_name = 'Out of Time'):
    '''
    Print the main classification metrics of a dataset and plot its diagnostic charts.

    Prints ROC AUC, accuracy, precision, recall and F1, then plots the confusion matrix and the
    scikit-plot ROC, cumulative gain and KS charts.

    Parameters
    ----------
    y_real : true binary labels.
    y_pred : predicted scores for the positive class.
    y_pred_binario : predicted labels (0 or 1).
    dataset_name : str, default 'Out of Time'
        Name of the dataset being evaluated; used in the confusion-matrix title and in the printed heading.
    '''
    roc = roc_auc_score(y_real, y_pred)
    accuracy = accuracy_score(y_real, y_pred_binario)
    precisao = precision_score(y_real, y_pred_binario)
    recall = recall_score(y_real, y_pred_binario)
    f1 = f1_score(y_real, y_pred_binario)

    print('roc:', roc)
    print('accuracy:', accuracy)
    print('precision:', precisao)
    print('recall:', recall)
    print('f1:', f1)
    print()

    title = 'Confusion matrix LightGBM: ' + dataset_name.lower() + '.'
    matriz_confusao(y_real, y_pred_binario, title)

    # ROC curves, Cumulative gains, and KS
    print('ROC, Cumulative Gains, KS: ' + dataset_name)
    # Two columns per row: the complement of the score first, then the score itself.
    # Converting to an array first keeps `1 - score` valid when y_pred is a plain list.
    positive_scores = np.asarray(y_pred, dtype = float)
    class_probabilities = np.column_stack([1 - positive_scores, positive_scores])
    skplt.metrics.plot_roc(y_real, class_probabilities)
    skplt.metrics.plot_cumulative_gain(y_real, class_probabilities)
    skplt.metrics.plot_ks_statistic(y_real, class_probabilities)
    plt.show()
    plt.close()

# The data evaluated here is the train set, so it is labelled as such in the title and heading.
metrics(y_train, y_pred_train, y_train_pred_binario, dataset_name = 'Train')

In [None]:
# Estimate the potential gain in using the model
def tabela_recall(df, y_pred, value_col = 'value'):
    '''
    Build a table with precision, recall and potential gain per band of predicted probability.

    The rows of df are grouped into ten probability bands ('0-10%', ..., '90-100%'). The bands are
    listed from the highest to the lowest, so the accumulated columns (suffix '_AC' / ' AC') describe
    what is obtained when every row in that band or in a higher one is selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns named by y_pred and value_col, and a 'target' column whose sum per band
        is reported as TARGET_1 (assumed to be 0/1). It is modified in place: the columns
        'POTENCIAL_GAIN' and 'PROBABILITY_CUT' are added to it.
    y_pred : str
        Name of the column with the predicted probability.
    value_col : str, default 'value'
        Name of the column whose values are summed as the potential gain.

    Returns
    -------
    pandas.DataFrame
        One row per probability band. Percentages and gains are formatted as strings.
    '''
    # The value is read from the frame received, so it always lines up with the scores of the same rows.
    df['POTENCIAL_GAIN'] = df[value_col]

    # Create the probability bands; the infinite outer edges make every finite score fall into a band.
    bins_probabilidade = [-np.inf, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, np.inf]
    band_labels = ["0-10%", "10-20%", "20-30%", "30-40%", "40-50%", "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"]
    df['PROBABILITY_CUT'] = pd.cut(df[y_pred], bins = bins_probabilidade, labels = band_labels)

    # observed = False keeps the bands that received no rows; dropna = False keeps rows without a band.
    por_faixa = df.groupby(by = 'PROBABILITY_CUT', dropna = False, observed = False)
    df_potencial_faixa = por_faixa[['POTENCIAL_GAIN']].sum()

    # Number of potential customers per band
    df_potencial_faixa['POTENTIAL_CUSTOMERS'] = por_faixa.size()
    total_customers = df_potencial_faixa['POTENTIAL_CUSTOMERS'].sum()
    df_potencial_faixa['% BASE'] = (df_potencial_faixa['POTENTIAL_CUSTOMERS'] / total_customers).map('{:,.1%}'.format)
    # Number of customers per band whose target is 1
    df_potencial_faixa['TARGET_1'] = por_faixa['target'].sum()
    total_target_1 = df_potencial_faixa['TARGET_1'].sum()

    df_potencial_faixa['TARGET_0'] = df_potencial_faixa['POTENTIAL_CUSTOMERS'] - df_potencial_faixa['TARGET_1']
    df_potencial_faixa['PRECISION'] = (df_potencial_faixa['TARGET_1'] / df_potencial_faixa['POTENTIAL_CUSTOMERS']).map('{:,.1%}'.format)
    df_potencial_faixa['RECALL'] = (df_potencial_faixa['TARGET_1'] / total_target_1).map('{:,.1%}'.format)

    # Accumulated metrics, starting from the band with the highest probability
    df_potencial_faixa = df_potencial_faixa.sort_index(ascending = False)
    df_potencial_faixa['POTENTIAL_CUSTOMERS_AC'] = df_potencial_faixa['POTENTIAL_CUSTOMERS'].cumsum()
    df_potencial_faixa['% BASE AC'] = (df_potencial_faixa['POTENTIAL_CUSTOMERS_AC'] / total_customers).map('{:,.1%}'.format)
    df_potencial_faixa['TARGET_1_AC'] = df_potencial_faixa['TARGET_1'].cumsum()
    df_potencial_faixa['PRECISION_AC'] = (df_potencial_faixa['TARGET_1_AC'] / df_potencial_faixa['POTENTIAL_CUSTOMERS_AC']).map('{:,.1%}'.format)
    df_potencial_faixa['RECALL_AC'] = (df_potencial_faixa['TARGET_1_AC'] / total_target_1).map('{:,.1%}'.format)
    df_potencial_faixa['POTENCIAL_GAIN_AC'] = df_potencial_faixa['POTENCIAL_GAIN'].cumsum()
    # The gains are formatted only after the accumulated sum was taken; plain column assignment is used
    # because the formatted values are strings replacing numeric columns.
    df_potencial_faixa['POTENCIAL_GAIN'] = df_potencial_faixa['POTENCIAL_GAIN'].map('{:,.2f}'.format)
    df_potencial_faixa['POTENCIAL_GAIN_AC'] = df_potencial_faixa['POTENCIAL_GAIN_AC'].map('{:,.2f}'.format)

    # Reorder the columns
    df_potencial_faixa = df_potencial_faixa[['POTENTIAL_CUSTOMERS', 'POTENTIAL_CUSTOMERS_AC', '% BASE', '% BASE AC'
                                     , 'TARGET_1', 'TARGET_0', 'TARGET_1_AC', 'PRECISION', 'PRECISION_AC', 'RECALL' ,'RECALL_AC', 'POTENCIAL_GAIN', 'POTENCIAL_GAIN_AC']]
    return df_potencial_faixa

In [None]:
# Attach the out-of-time scores to a copy of the out-of-time data and show the potential-gain table.
df_out_of_time_3 = df_out_of_time.assign(y_pred_out_of_time = y_pred_out_of_time)
print('Return table, período out of time:')
return_table_out_of_time = tabela_recall(df_out_of_time_3, 'y_pred_out_of_time')
display(return_table_out_of_time)