* Importando as Bibliotecas

In [1]:
import numpy             as np
import matplotlib.pyplot as plt
import pandas            as pd
import seaborn           as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, auc
from sklearn.linear_model import LogisticRegression
from model_metrics_functions import plot_metrics, calculate_metrics
from warnings import filterwarnings
filterwarnings(action= 'ignore')

* Carregando as bases e realizando o Split

In [2]:
# Read the labelled application data (df_train) and the second application file
# (df_test) from the local DATASETS folder, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./DATASETS/application_train.csv', './DATASETS/application_test.csv')
)

In [3]:
# The label is the TARGET column; the features are every other column of df_train.
y = df_train['TARGET']
X = df_train.drop(columns='TARGET')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

* Montando o Pipeline

In [4]:
# Partition the feature columns by dtype: object columns are handled as
# categorical, every other dtype as numeric.
cat_vars = X.select_dtypes(include='object')
num_vars = X.select_dtypes(exclude='object')

# Numeric branch: median imputation followed by standardization.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('encoder', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by target encoding.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', TargetEncoder()),
])

# Route each column group through its own branch ('cat' first, then 'num') and
# wrap the resulting ColumnTransformer in a single-step Pipeline.
preprocessor = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(transformers=[
        ('cat', cat_pipe, cat_vars.columns),
        ('num', num_pipe, num_vars.columns),
    ])),
])

* Aplicando o Pipeline

In [5]:
# The ColumnTransformer emits the categorical block first and the numeric block
# second, so the transformed arrays are NOT in the original column order of X.
# Label them with the order actually produced; reusing X_train.columns would
# silently attach the wrong name to most columns (the counts match, so pandas
# raises no error).
processed_columns = list(cat_vars.columns) + list(num_vars.columns)

# Training set: fit the preprocessing on the training rows only (y_train is
# passed because the categorical branch uses a target encoder), then transform.
# The original row index is kept so the frame stays aligned with y_train.
X_train_processed = pd.DataFrame(
    preprocessor.fit_transform(X_train, y_train),
    columns=processed_columns,
    index=X_train.index,
)

# Test set: transformed with the statistics learned on the training rows.
X_test_processed = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=processed_columns,
    index=X_test.index,
)

# Train and test rows stacked back together; because the source indices are
# preserved, the result has no duplicated index labels.
X_processed = pd.concat([X_train_processed, X_test_processed], axis=0)

# Rows loaded from application_test.csv (df_test), transformed with the same
# fitted preprocessing.
X_prod_processed = pd.DataFrame(
    preprocessor.transform(df_test),
    columns=processed_columns,
    index=df_test.index,
)

* Treinando e Avaliando os Modelos

In [6]:
# Candidate classifiers to compare; each one is seeded with random_state=1.
models = [
    DecisionTreeClassifier(criterion= 'gini', random_state=1),
    LogisticRegression(solver= 'liblinear', random_state=1),
    RandomForestClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1),
    XGBClassifier(random_state=1),
    lgb.LGBMClassifier(random_state=1)
]

for model in models:
    # Label each result with the estimator's class name. Reading it from the
    # type is more robust than slicing str(model) at the first "(", which would
    # drop the last character whenever the repr contains no parenthesis.
    model_name = type(model).__name__

    # Training
    model.fit(X_train_processed, y_train)

    # Evaluation on both the training and the held-out data. calculate_metrics
    # comes from the project module model_metrics_functions; its result is
    # shown as-is (presumably one row per data set -- confirm in that module).
    metrics = calculate_metrics(model_name, model, X_train_processed, y_train, X_test_processed, y_test)
    display(metrics)

Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,DecisionTreeClassifier,Treino,1.0,1.0,1.0,1.0,1.0,1.0
1,DecisionTreeClassifier,Teste,0.851993,0.134017,0.157772,0.534861,0.069722,0.070959


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LogisticRegression,Treino,0.918529,0.50431,0.009529,0.745826,0.491652,0.366089
1,LogisticRegression,Teste,0.920747,0.586957,0.010518,0.746223,0.492447,0.368067


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,RandomForestClassifier,Treino,0.999973,1.0,0.999674,1.0,1.0,1.0
1,RandomForestClassifier,Teste,0.920499,0.0,0.0,0.692609,0.385219,0.285866


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,GradientBoostingClassifier,Treino,0.919265,0.687708,0.016859,0.7661,0.5322,0.396157
1,GradientBoostingClassifier,Teste,0.92084,0.59322,0.013635,0.751106,0.502211,0.373384


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,XGBClassifier,Treino,0.929147,0.95667,0.136667,0.908577,0.817155,0.65001
1,XGBClassifier,Teste,0.919895,0.452323,0.036034,0.731318,0.462636,0.347898


[LightGBM] [Info] Number of positive: 12278, number of negative: 138401
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24035
[LightGBM] [Info] Number of data points in the train set: 150679, number of used features: 166
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081484 -> initscore=-2.422346
[LightGBM] [Info] Start training from score -2.422346


Unnamed: 0,Algoritmo,Conjunto,Acuracia,Precisao,Recall,AUC_ROC,GINI,KS
0,LGBMClassifier,Treino,0.920407,0.862595,0.02761,0.833379,0.666758,0.508259
1,LGBMClassifier,Teste,0.920747,0.559701,0.014608,0.750472,0.500945,0.376406
