## 00.Pacotes


In [None]:
import pandas as pd
import numpy as np
import random
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

import funcoes_basicas as funcs
from tqdm.notebook import tqdm
from tqdm import tqdm

# Setting the seed.
# Both global generators are seeded: seeding only the stdlib `random` module
# leaves NumPy's global RNG (np.random.*) unseeded, so any NumPy-based sampling
# would still vary between runs.
SEMENTE = 123
random.seed(SEMENTE)
np.random.seed(SEMENTE)

# NOTE(review): this silences every warning for the whole notebook, including
# deprecation warnings from pandas / scikit-learn.
import warnings
warnings.filterwarnings("ignore")

  from pandas.core import (


## 01. Baixando a base
- Realizando a separação em dados de treino e teste, marcando cada período na base final

In [None]:
# Loading the training data from the local parquet file.
# NOTE(review): this is a hard-coded absolute Windows path; the notebook only
# runs on a machine that has this exact directory layout.
CAMINHO_BASE_TREINO = r'D:\projeto_modelo_credito\dados\base_treino_final.parquet\base_treino_final.parquet'
df_treino_full = pd.read_parquet(CAMINHO_BASE_TREINO, engine='pyarrow')
df_treino_full.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,NAME_CONTRACT_TYPE_publico,CODE_GENDER_publico,FLAG_OWN_CAR_publico,FLAG_OWN_REALTY_publico,CNT_CHILDREN_publico,AMT_INCOME_TOTAL_publico,AMT_CREDIT_publico,AMT_ANNUITY_publico,...,VL_MAX_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_externo
0,100010,0,Cash loans,M,Y,Y,0,360000.0,1530000.0,42075.0,...,35.0,0.0,35.0,17.5,35.0,0.0,35.0,17.5,35.0,0.0
1,100263,0,Cash loans,F,N,Y,0,108000.0,585000.0,32665.5,...,5.0,0.0,79.0,19.75,5.0,0.0,79.0,19.75,5.0,0.0
2,100320,0,Cash loans,M,Y,Y,0,225000.0,481855.5,49374.0,...,,,,,,,,,,
3,100704,0,Cash loans,F,N,N,0,90000.0,760500.0,22234.5,...,,,,,,,,,,
4,100768,0,Cash loans,M,Y,N,0,112500.0,277969.5,10606.5,...,,,,,,,,,,


In [None]:
# Removing CODE_GENDER_publico: per the author's note it cannot be used in
# credit-risk models.
# The frame is reassigned (no inplace=True) and errors='ignore' is used so the
# cell is idempotent: re-running it after the column is gone no longer raises
# a KeyError.
df_treino_full = df_treino_full.drop(columns=['CODE_GENDER_publico'], errors='ignore')

In [None]:
# Cast the target to a numeric (integer) dtype
df_treino_full['TARGET_publico'] = df_treino_full['TARGET_publico'].astype(int)

# Target, IDs and feature matrix (the ID column is still part of X here)
y = df_treino_full['TARGET_publico']
ids = df_treino_full['SK_ID_CURR']
X = df_treino_full.drop('TARGET_publico', axis=1)

# Stratified hold-out split: 80% train / 20% test, fixed random_state
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# IDs that landed in each partition
id_train, id_test = X_train_full['SK_ID_CURR'], X_test_full['SK_ID_CURR']

# Modelling matrices without the ID column
X_train = X_train_full.drop('SK_ID_CURR', axis=1)
X_test = X_test_full.drop('SK_ID_CURR', axis=1)


In [None]:
# Tag every ID with the partition it was assigned to by the split
df_treino_ids = id_train.to_frame(name='SK_ID_CURR').assign(periodo='treino')
df_teste_ids = id_test.to_frame(name='SK_ID_CURR').assign(periodo='teste')

# Stack both partitions into one ID -> period lookup table
df_periodo = pd.concat([df_treino_ids, df_teste_ids], ignore_index=True)

# Bring the 'periodo' column into the full base with a left join on the ID
df_treino_full_marcado = pd.merge(df_treino_full, df_periodo, how='left', on='SK_ID_CURR')

In [None]:
# Preview the marked base; the new 'periodo' column is the last one
df_treino_full_marcado.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,NAME_CONTRACT_TYPE_publico,FLAG_OWN_CAR_publico,FLAG_OWN_REALTY_publico,CNT_CHILDREN_publico,AMT_INCOME_TOTAL_publico,AMT_CREDIT_publico,AMT_ANNUITY_publico,AMT_GOODS_PRICE_publico,...,VL_MIN_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_externo,periodo
0,100010,0,Cash loans,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,...,0.0,35.0,17.5,35.0,0.0,35.0,17.5,35.0,0.0,treino
1,100263,0,Cash loans,N,Y,0,108000.0,585000.0,32665.5,585000.0,...,0.0,79.0,19.75,5.0,0.0,79.0,19.75,5.0,0.0,treino
2,100320,0,Cash loans,Y,Y,0,225000.0,481855.5,49374.0,463500.0,...,,,,,,,,,,treino
3,100704,0,Cash loans,N,N,0,90000.0,760500.0,22234.5,760500.0,...,,,,,,,,,,treino
4,100768,0,Cash loans,Y,N,0,112500.0,277969.5,10606.5,229500.0,...,,,,,,,,,,treino


In [None]:
# Checking that the base was marked correctly (dropna=False also counts rows left without a period)
print(df_treino_full_marcado['periodo'].value_counts(dropna=False))

periodo
treino    147480
teste      36870
Name: count, dtype: int64


In [None]:
# Training partition: rows tagged 'treino', without the helper 'periodo' column
df_treino = (
    df_treino_full_marcado
    .loc[df_treino_full_marcado['periodo'] == 'treino']
    .drop(columns=['periodo'])
    .reset_index(drop=True)
    .copy()
)
df_treino.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,NAME_CONTRACT_TYPE_publico,FLAG_OWN_CAR_publico,FLAG_OWN_REALTY_publico,CNT_CHILDREN_publico,AMT_INCOME_TOTAL_publico,AMT_CREDIT_publico,AMT_ANNUITY_publico,AMT_GOODS_PRICE_publico,...,VL_MAX_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_externo
0,100010,0,Cash loans,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,...,35.0,0.0,35.0,17.5,35.0,0.0,35.0,17.5,35.0,0.0
1,100263,0,Cash loans,N,Y,0,108000.0,585000.0,32665.5,585000.0,...,5.0,0.0,79.0,19.75,5.0,0.0,79.0,19.75,5.0,0.0
2,100320,0,Cash loans,Y,Y,0,225000.0,481855.5,49374.0,463500.0,...,,,,,,,,,,
3,100704,0,Cash loans,N,N,0,90000.0,760500.0,22234.5,760500.0,...,,,,,,,,,,
4,100768,0,Cash loans,Y,N,0,112500.0,277969.5,10606.5,229500.0,...,,,,,,,,,,


In [None]:
# (rows, columns) of the training partition
df_treino.shape

(147480, 964)

In [None]:
# Test partition: rows tagged 'teste', without the helper 'periodo' column
df_teste = (
    df_treino_full_marcado
    .loc[df_treino_full_marcado['periodo'] == 'teste']
    .drop(columns=['periodo'])
    .reset_index(drop=True)
    .copy()
)
df_teste.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,NAME_CONTRACT_TYPE_publico,FLAG_OWN_CAR_publico,FLAG_OWN_REALTY_publico,CNT_CHILDREN_publico,AMT_INCOME_TOTAL_publico,AMT_CREDIT_publico,AMT_ANNUITY_publico,AMT_GOODS_PRICE_publico,...,VL_MAX_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_externo
0,101122,0,Cash loans,N,N,1,90000.0,640458.0,27265.5,517500.0,...,,,,,,,,,,
1,102745,0,Cash loans,N,Y,0,202500.0,607500.0,23670.0,607500.0,...,,,,,,,,,,
2,103634,0,Cash loans,N,Y,0,292500.0,1125000.0,32895.0,1125000.0,...,,,,,,,,,,
3,104454,0,Cash loans,N,N,0,101250.0,66222.0,7155.0,58500.0,...,,,,,,,,,,
4,104665,0,Cash loans,N,Y,0,180000.0,1024740.0,49428.0,900000.0,...,29.0,24.0,81.0,27.0,29.0,24.0,81.0,27.0,29.0,24.0


In [None]:
# (rows, columns) of the test partition
df_teste.shape

(36870, 964)

## 02. Ajustando os tipos das variáveis e os domínios das variáveis categóricas

In [None]:
# 1 - Variable treatment

# Helper that optimises column dtypes, downcasting each column where possible

def otimiza_tipos(df):
    """Downcast the columns of ``df`` IN PLACE and return the same DataFrame.

    Rules applied to every column, in order:
      * ``object`` columns are converted with ``pd.to_numeric`` when every
        value parses as a number; otherwise they stay ``object``.
      * integer columns become nullable ``Int32``, or ``Int64`` when a value
        falls outside the int32 range.
      * float columns become ``float32``, or stay ``float64`` when a value
        falls outside the float32 range.

    Args:
        df (pd.DataFrame): frame to optimise. It is modified in place.

    Returns:
        pd.DataFrame: the very same object that was passed in.
    """
    limites_int32 = np.iinfo(np.int32)
    maximo_float32 = np.finfo(np.float32).max

    for col in df.columns:
        # Try to turn 'object' columns into numbers when possible
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                # Non-numeric content: keep the column as object. Only
                # conversion failures are swallowed; anything else
                # (KeyboardInterrupt, MemoryError, ...) propagates.
                pass

        if pd.api.types.is_integer_dtype(df[col]):
            if df[col].max() > limites_int32.max or df[col].min() < limites_int32.min:
                df[col] = df[col].astype('Int64')
            else:
                df[col] = df[col].astype('Int32')

        elif pd.api.types.is_float_dtype(df[col]):
            if df[col].max() > maximo_float32 or df[col].min() < -maximo_float32:
                df[col] = df[col].astype('float64')  # Avoids the pandas nullable 'Float64'
            else:
                df[col] = df[col].astype('float32')

    return df

# otimiza_tipos assigns the converted columns back onto the frame it receives and
# returns that same frame, so df_dados and df_treino are the same converted DataFrame.
df_dados = otimiza_tipos(df_treino)
print('Variáveis otimizadas!')

Variáveis otimizadas!


In [None]:
# 2 - Splitting the variables by dtype: numeric, categorical and "error" (anything else).
# The first two columns, 'SK_ID_CURR' and 'TARGET_publico', are left out.

vars = df_treino.columns[2:].tolist()

lista_vars_num = []
lista_vars_categ = []
lista_vars_erro = []

# Each accepted dtype name points at the list that must receive the variable
destino_por_tipo = dict.fromkeys(
    ('Int32', 'Int64', 'int32', 'int64', 'Float32', 'Float64', 'float32', 'float64'),
    lista_vars_num,
)
destino_por_tipo.update(dict.fromkeys(('object', 'string', 'category'), lista_vars_categ))

for var in vars:
    tipo_coluna = df_treino[var].dtype
    # dtype names not in the table fall through to the error list
    destino_por_tipo.get(tipo_coluna.name, lista_vars_erro).append(var)

print(f'lista_vars_num: {len(lista_vars_num)}')
print()
print(f'lista_vars_categ: {len(lista_vars_categ)}')
print()
print(f'lista_vars_erro: {len(lista_vars_erro)}')


lista_vars_num: 947

lista_vars_categ: 15

lista_vars_erro: 0


In [None]:
# 3 - Normalising the domains of the categorical variables: removes '.', ':', ','
#     and '/', and joins the remaining whitespace-separated words with '_'

def limpar_valores(val):
    """Return a cleaned category label; non-string values are returned untouched.

    For strings, the characters '.', ':', ',' and '/' are deleted and the
    remaining whitespace-separated tokens are joined with underscores
    (e.g. 'Cash loans' -> 'Cash_loans').
    """
    if not isinstance(val, str):
        return val
    sem_pontuacao = val.translate(str.maketrans('', '', '.:,/'))
    return '_'.join(sem_pontuacao.split())


In [None]:
# Applying the cleaning function to every cell of the categorical columns.
# DataFrame.applymap is deprecated since pandas 2.1 (the warning is hidden by the
# global warnings filter); mapping column by column with Series.map gives the same
# element-wise result and works across pandas versions.
df_treino[lista_vars_categ] = df_treino[lista_vars_categ].apply(
    lambda coluna: coluna.map(limpar_valores)
)

## 03. Feature Selection inicial
 - Retirando as variáveis sem poder preditivo

In [None]:
# Every column name of the training frame, used as the starting list for feature selection
vars_lista = df_treino.columns.tolist()

In [None]:
# Checking which variables have a high share of nulls:
# one row per variable with its dtype and percentage of missing values

metadados = pd.DataFrame(
    {
        'Variável': vars_lista,
        'Tipo': df_treino[vars_lista].dtypes,
        '% Nulos': df_treino[vars_lista].isnull().sum().div(len(df_treino)).round(4).mul(100),
    }
).reset_index(drop=True)

metadados.sort_values(by='% Nulos', ascending=False).head(10)


Unnamed: 0,Variável,Tipo,% Nulos
195,QT_MIN_QT_MIN_DAYS_ENDDATE_FACT_ULTIMOS_3_MESE...,float32,98.41
194,QT_MAX_QT_MIN_DAYS_ENDDATE_FACT_ULTIMOS_3_MESE...,float32,98.41
193,QT_MIN_QT_MAX_DAYS_ENDDATE_FACT_ULTIMOS_3_MESE...,float32,98.41
192,QT_MAX_QT_MAX_DAYS_ENDDATE_FACT_ULTIMOS_3_MESE...,float32,98.41
307,VL_MIN_VL_MAX_AMT_ANNUITY_ULTIMOS_3_MESES_externo,float32,96.54
308,VL_TOT_VL_MIN_AMT_ANNUITY_ULTIMOS_3_MESES_externo,float32,96.54
311,VL_MIN_VL_MIN_AMT_ANNUITY_ULTIMOS_3_MESES_externo,float32,96.54
296,VL_TOT_VL_TOT_AMT_ANNUITY_ULTIMOS_3_MESES_externo,float32,96.54
297,VL_MED_VL_TOT_AMT_ANNUITY_ULTIMOS_3_MESES_externo,float32,96.54
310,VL_MAX_VL_MIN_AMT_ANNUITY_ULTIMOS_3_MESES_externo,float32,96.54


 ### 1- Retirando as variáveis com alta porcentagem de nulo

In [None]:
## 1 - Removing the variables with a high percentage of nulls

# Variables whose null percentage is strictly above this value are discarded
treshold_nulos = 80
vars_lista_nulos = metadados['Variável'][metadados['% Nulos'] > treshold_nulos].tolist()
print(f'Foram removidas {len(vars_lista_nulos)} variáveis com {treshold_nulos}%')
print(f'Variáveis removidas: {vars_lista_nulos}')

# Survivors, in their original column order
vars_lista_inicial = list(filter(lambda nome: nome not in vars_lista_nulos, vars_lista))


Foram removidas 184 variáveis com 80%
Variáveis removidas: ['QT_MAX_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_3_MESES_externo', 'QT_MIN_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_3_MESES_externo', 'QT_MAX_QT_MIN_CREDIT_DAY_OVERDUE_ULTIMOS_3_MESES_externo', 'QT_MIN_QT_MIN_CREDIT_DAY_OVERDUE_ULTIMOS_3_MESES_externo', 'QT_MAX_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_3_MESES_externo', 'QT_MIN_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_3_MESES_externo', 'QT_MAX_QT_MIN_DAYS_CREDIT_ENDDATE_ULTIMOS_3_MESES_externo', 'QT_MIN_QT_MIN_DAYS_CREDIT_ENDDATE_ULTIMOS_3_MESES_externo', 'QT_MAX_QT_MAX_DAYS_ENDDATE_FACT_ULTIMOS_3_MESES_externo', 'QT_MIN_QT_MAX_DAYS_ENDDATE_FACT_ULTIMOS_3_MESES_externo', 'QT_MAX_QT_MIN_DAYS_ENDDATE_FACT_ULTIMOS_3_MESES_externo', 'QT_MIN_QT_MIN_DAYS_ENDDATE_FACT_ULTIMOS_3_MESES_externo', 'VL_TOT_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIMOS_3_MESES_externo', 'VL_MED_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIMOS_3_MESES_externo', 'VL_MAX_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIMOS_3_MESES_externo', 'VL_MIN_VL_TOT_AMT_CREDIT_MA

In [None]:
# Number of variables left after the high-null filter
print(f'Quantidade de variáveis restantes: {len(vars_lista_inicial)}')

Quantidade de variáveis restantes: 780


In [None]:
# Column names that survived the high-null filter.
# NOTE: `vars` shadows the Python builtin of the same name.
vars = df_treino[vars_lista_inicial].columns[:].tolist()
len(vars)

780

In [None]:
# Helper that filters the variables by type

def filtrar_variaveis(lista_vars_num, lista_vars_categ, vars_lista_inicial):
    """
    Keep only the numeric and categorical variables present in the selected list.

    The order of both results follows ``vars_lista_inicial``.

    Args:
        lista_vars_num (list): Numeric variable names.
        lista_vars_categ (list): Categorical variable names.
        vars_lista_inicial (list): Variables selected so far.

    Returns:
        tuple: Two lists, the selected numeric variables and the selected
        categorical variables.
    """
    vars_num_filtradas = []
    vars_categ_filtradas = []

    for nome in vars_lista_inicial:
        # Independent checks: a name present in both inputs lands in both outputs
        if nome in lista_vars_num:
            vars_num_filtradas.append(nome)
        if nome in lista_vars_categ:
            vars_categ_filtradas.append(nome)

    return vars_num_filtradas, vars_categ_filtradas

# Restrict the numeric / categorical lists to the variables that survived the null filter
vars_num_filtradas, vars_categ_filtradas = filtrar_variaveis(lista_vars_num, lista_vars_categ, vars_lista_inicial)


In [None]:
# Print both filtered lists in full (the numeric list produces a very long output)
print("Variáveis Numéricas Filtradas:", vars_num_filtradas)
print("Variáveis Categóricas Filtradas:", vars_categ_filtradas)

Variáveis Numéricas Filtradas: ['CNT_CHILDREN_publico', 'AMT_INCOME_TOTAL_publico', 'AMT_CREDIT_publico', 'AMT_ANNUITY_publico', 'AMT_GOODS_PRICE_publico', 'REGION_POPULATION_RELATIVE_publico', 'DAYS_BIRTH_publico', 'DAYS_EMPLOYED_publico', 'DAYS_REGISTRATION_publico', 'DAYS_ID_PUBLISH_publico', 'OWN_CAR_AGE_publico', 'FLAG_MOBIL_publico', 'FLAG_EMP_PHONE_publico', 'FLAG_WORK_PHONE_publico', 'FLAG_CONT_MOBILE_publico', 'FLAG_PHONE_publico', 'FLAG_EMAIL_publico', 'CNT_FAM_MEMBERS_publico', 'REGION_RATING_CLIENT_publico', 'REGION_RATING_CLIENT_W_CITY_publico', 'HOUR_APPR_PROCESS_START_publico', 'REG_REGION_NOT_LIVE_REGION_publico', 'REG_REGION_NOT_WORK_REGION_publico', 'LIVE_REGION_NOT_WORK_REGION_publico', 'REG_CITY_NOT_LIVE_CITY_publico', 'REG_CITY_NOT_WORK_CITY_publico', 'LIVE_CITY_NOT_WORK_CITY_publico', 'EXT_SOURCE_1_publico', 'EXT_SOURCE_2_publico', 'EXT_SOURCE_3_publico', 'APARTMENTS_AVG_publico', 'BASEMENTAREA_AVG_publico', 'YEARS_BEGINEXPLUATATION_AVG_publico', 'YEARS_BUILD_AVG_

In [None]:
# Identifying the categorical variables: how many there are and their names

print(len(vars_categ_filtradas))
print(vars_categ_filtradas)

15
['NAME_CONTRACT_TYPE_publico', 'FLAG_OWN_CAR_publico', 'FLAG_OWN_REALTY_publico', 'NAME_TYPE_SUITE_publico', 'NAME_INCOME_TYPE_publico', 'NAME_EDUCATION_TYPE_publico', 'NAME_FAMILY_STATUS_publico', 'NAME_HOUSING_TYPE_publico', 'OCCUPATION_TYPE_publico', 'WEEKDAY_APPR_PROCESS_START_publico', 'ORGANIZATION_TYPE_publico', 'FONDKAPREMONT_MODE_publico', 'HOUSETYPE_MODE_publico', 'WALLSMATERIAL_MODE_publico', 'EMERGENCYSTATE_MODE_publico']


In [None]:
# Same list again, displayed one name per line
vars_categ_filtradas

['NAME_CONTRACT_TYPE_publico',
 'FLAG_OWN_CAR_publico',
 'FLAG_OWN_REALTY_publico',
 'NAME_TYPE_SUITE_publico',
 'NAME_INCOME_TYPE_publico',
 'NAME_EDUCATION_TYPE_publico',
 'NAME_FAMILY_STATUS_publico',
 'NAME_HOUSING_TYPE_publico',
 'OCCUPATION_TYPE_publico',
 'WEEKDAY_APPR_PROCESS_START_publico',
 'ORGANIZATION_TYPE_publico',
 'FONDKAPREMONT_MODE_publico',
 'HOUSETYPE_MODE_publico',
 'WALLSMATERIAL_MODE_publico',
 'EMERGENCYSTATE_MODE_publico']

## 04. Pré Processamento e Pipeline

In [None]:
# New base restricted to the variables that passed the high-missing filter

df_treino = df_treino.loc[:, vars_lista_inicial]
print(df_treino.shape)
df_treino.head()

(147480, 780)


Unnamed: 0,SK_ID_CURR,TARGET_publico,NAME_CONTRACT_TYPE_publico,FLAG_OWN_CAR_publico,FLAG_OWN_REALTY_publico,CNT_CHILDREN_publico,AMT_INCOME_TOTAL_publico,AMT_CREDIT_publico,AMT_ANNUITY_publico,AMT_GOODS_PRICE_publico,...,VL_MAX_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_externo
0,100010,0,Cash_loans,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,...,35.0,0.0,35.0,17.5,35.0,0.0,35.0,17.5,35.0,0.0
1,100263,0,Cash_loans,N,Y,0,108000.0,585000.0,32665.5,585000.0,...,5.0,0.0,79.0,19.75,5.0,0.0,79.0,19.75,5.0,0.0
2,100320,0,Cash_loans,Y,Y,0,225000.0,481855.5,49374.0,463500.0,...,,,,,,,,,,
3,100704,0,Cash_loans,N,N,0,90000.0,760500.0,22234.5,760500.0,...,,,,,,,,,,
4,100768,0,Cash_loans,Y,N,0,112500.0,277969.5,10606.5,229500.0,...,,,,,,,,,,


In [None]:
## Replacing pd.NA with np.nan
# NOTE(review): presumably needed so scikit-learn receives plain NaN rather than
# pd.NA from the nullable Int32/Int64 columns — confirm.

df_treino = df_treino.replace({pd.NA: np.nan})

In [None]:
# One-hot encoding
# Splitting the attributes by dtype, skipping the first two columns (the keys)
cat_atributos = df_treino.iloc[:, 2:].select_dtypes(include='object')
num_atributos = df_treino.iloc[:, 2:].select_dtypes(exclude='object')
key_atributos = ['SK_ID_CURR', 'TARGET_publico']

# Categorical pipeline: OneHotEncoder first, then constant imputation with -999
# NOTE(review): because the encoder runs before the imputer, missing categories
# appear to become their own '<column>_nan' dummy and the dense one-hot output
# should contain no NaN, which would make this imputer a no-op — confirm.
cat_pipe = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),  # Emits dense one-hot columns
    ('imputer', SimpleImputer(strategy='constant', fill_value=-999))  # Replaces NaN with -999
])

# Numeric pipeline: constant imputation of missing values with -999
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-999))
])

# Combining the categorical and numeric pipelines, each applied to its own columns
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, cat_atributos.columns),
    ('num', num_pipe, num_atributos.columns)
])

# Wrapping the ColumnTransformer in a single-step pipeline
pipeline_preprocessor = Pipeline([
    ('preprocessor', preprocessor)
])

In [None]:
# Fitting the preprocessing pipeline on the training frame and transforming it
df_transformado = pipeline_preprocessor.fit_transform(df_treino)

# Names of the columns produced by the ColumnTransformer (prefixed with cat__ / num__)
nomes_colunas_transformadas = pipeline_preprocessor.named_steps['preprocessor'].get_feature_names_out()

# Building the processed DataFrame
df_processado = pd.DataFrame(df_transformado, columns=nomes_colunas_transformadas)

# Adding the key columns back (SK_ID_CURR and TARGET_publico); the index is reset
# so the rows line up with the freshly built RangeIndex of df_processado
df_processado[key_atributos] = df_treino[key_atributos].reset_index(drop=True)

print(df_processado.shape)
df_processado.info(verbose=True)


(147480, 907)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147480 entries, 0 to 147479
Data columns (total 907 columns):
 #    Column                                                              Dtype  
---   ------                                                              -----  
 0    cat__NAME_CONTRACT_TYPE_publico_Cash_loans                          float64
 1    cat__NAME_CONTRACT_TYPE_publico_Revolving_loans                     float64
 2    cat__FLAG_OWN_CAR_publico_N                                         float64
 3    cat__FLAG_OWN_CAR_publico_Y                                         float64
 4    cat__FLAG_OWN_REALTY_publico_N                                      float64
 5    cat__FLAG_OWN_REALTY_publico_Y                                      float64
 6    cat__NAME_TYPE_SUITE_publico_Children                               float64
 7    cat__NAME_TYPE_SUITE_publico_Family                                 float64
 8    cat__NAME_TYPE_SUITE_publico_Group_of_people    

#### Limpeza nas variáveis categóricas

In [None]:
# Removing the dummies that end in 'nan', the XNA level, and keeping only one of Y / N.
# This cell only lists the columns whose name ends with 'nan'.

cols_para_remover = [col for col in df_processado.columns if col.endswith('nan')]
cols_para_remover

['cat__NAME_TYPE_SUITE_publico_nan',
 'cat__OCCUPATION_TYPE_publico_nan',
 'cat__FONDKAPREMONT_MODE_publico_nan',
 'cat__HOUSETYPE_MODE_publico_nan',
 'cat__WALLSMATERIAL_MODE_publico_nan',
 'cat__EMERGENCYSTATE_MODE_publico_nan']

In [None]:
# Dropping redundant dummy columns: the '_nan' indicators, one side of each
# binary flag (FLAG_OWN_CAR Y, FLAG_OWN_REALTY N, EMERGENCYSTATE_MODE Yes) and
# the ORGANIZATION_TYPE 'XNA' level.
# Each label appears once (the previous list repeated two of them), and the
# frame is reassigned with errors='ignore' instead of dropped in place, so
# re-running the cell after the columns are gone does not raise a KeyError.
df_processado = df_processado.drop(
    columns=[
        'cat__NAME_TYPE_SUITE_publico_nan',
        'cat__OCCUPATION_TYPE_publico_nan',
        'cat__FONDKAPREMONT_MODE_publico_nan',
        'cat__HOUSETYPE_MODE_publico_nan',
        'cat__WALLSMATERIAL_MODE_publico_nan',
        'cat__EMERGENCYSTATE_MODE_publico_nan',
        'cat__FLAG_OWN_CAR_publico_Y',
        'cat__FLAG_OWN_REALTY_publico_N',
        'cat__EMERGENCYSTATE_MODE_publico_Yes',
        'cat__ORGANIZATION_TYPE_publico_XNA',
    ],
    errors='ignore',
)

In [None]:
## Splitting the processed variables by dtype: numeric, categorical and "error" (anything else).
# The last two columns, 'SK_ID_CURR' and 'TARGET_publico', are left out.
vars = df_processado.columns[:-2].tolist()

lista_vars_num = []
lista_vars_categ = []
lista_vars_erro = []

# Each accepted dtype name points at the list that must receive the variable
destino_por_tipo = dict.fromkeys(
    ('Int32', 'Int64', 'int32', 'int64', 'Float32', 'Float64', 'float32', 'float64'),
    lista_vars_num,
)
destino_por_tipo.update(dict.fromkeys(('object', 'string', 'category'), lista_vars_categ))

for var in vars:
    tipo_coluna = df_processado[var].dtype
    # dtype names not in the table fall through to the error list
    destino_por_tipo.get(tipo_coluna.name, lista_vars_erro).append(var)

print(f'lista_vars_num: {len(lista_vars_num)}')
print()
print(f'lista_vars_categ: {len(lista_vars_categ)}')
print()
print(f'lista_vars_erro: {len(lista_vars_erro)}')


lista_vars_num: 895

lista_vars_categ: 0

lista_vars_erro: 0


## 05. Processo mais robusto para retirar mais variáveis

- refazer a univariada considerando os missing como -999
- retirar as variáveis constantes e quase constantes
- calcular o IV e o KS
- calcular as correlações de Pearson e Spearman

### 05.01. Univariada, exclusão constantes, quase constantes e missing

In [None]:
# Univariate analysis of the variables; missing numeric values were already imputed as -999
# NOTE(review): funcs.univariada_numerica_v2 is a project helper not shown here — confirm what it returns.

uni_num = funcs.univariada_numerica_v2(df_processado, lista_vars_num)

In [None]:

# Quasi-constant filter with threshold 0.92.
# NOTE(review): funcs.exclusao_vars_quase_constante_num is a project helper not shown here;
# presumably it flags variables whose most frequent value covers more than `threshold` of the rows — confirm.
variaveis_excluir, variaveis_manter = funcs.exclusao_vars_quase_constante_num(df_processado, lista_vars_num, 'SK_ID_CURR', threshold=0.92, dropna=True)

print(len(variaveis_excluir))

Analisando variáveis: 100%|█| 895/895 [00:14<00:00, 61.30it/s, variável=num__VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_exter

128





In [None]:
# Constant (zero-variance) and high-missing filters, with threshold_missing=0.9.
# NOTE(review): funcs.excluindo_vars_constante_missing_num is a project helper not shown here; the index
# order below (0 = high missing, 1 = constant) follows how this cell labels the results — confirm.
vars_excluir = funcs.excluindo_vars_constante_missing_num(df_processado, lista_vars_num, threshold_missing=0.9)
vars_constante_excluir = vars_excluir[1]
vars_missing_excluir = vars_excluir[0]

print(f'Removendo {len(vars_constante_excluir)} variáveis com variância zero, ou valor constante: \n', vars_constante_excluir)
print(f'Removendo {len(vars_missing_excluir)} variáveis com alta % missing: \n', vars_missing_excluir)


Removendo 0 variáveis com variância zero, ou valor constante: 
 []
Removendo 3 variáveis com alta % missing: 
 ['num__FLAG_MOBIL_publico', 'num__FLAG_DOCUMENT_2_publico', 'num__FLAG_DOCUMENT_12_publico']


In [None]:
# Keep every numeric variable that none of the three filters flagged, in the
# original order of lista_vars_num.
# The exclusion collection is built once as a set; previously the three lists
# were concatenated again for every candidate variable.
vars_descartadas = set(variaveis_excluir) | set(vars_constante_excluir) | set(vars_missing_excluir)
variaveis_finalistas = [variavel for variavel in lista_vars_num if variavel not in vars_descartadas]
print(len(variaveis_finalistas))
print()
print(variaveis_finalistas)

767

['cat__NAME_CONTRACT_TYPE_publico_Cash_loans', 'cat__NAME_CONTRACT_TYPE_publico_Revolving_loans', 'cat__FLAG_OWN_CAR_publico_N', 'cat__FLAG_OWN_REALTY_publico_Y', 'cat__NAME_TYPE_SUITE_publico_Family', 'cat__NAME_TYPE_SUITE_publico_Unaccompanied', 'cat__NAME_INCOME_TYPE_publico_Commercial_associate', 'cat__NAME_INCOME_TYPE_publico_Pensioner', 'cat__NAME_INCOME_TYPE_publico_Working', 'cat__NAME_EDUCATION_TYPE_publico_Higher_education', 'cat__NAME_EDUCATION_TYPE_publico_Secondary_secondary_special', 'cat__NAME_FAMILY_STATUS_publico_Civil_marriage', 'cat__NAME_FAMILY_STATUS_publico_Married', 'cat__NAME_FAMILY_STATUS_publico_Single_not_married', 'cat__NAME_HOUSING_TYPE_publico_House_apartment', 'cat__OCCUPATION_TYPE_publico_Core_staff', 'cat__OCCUPATION_TYPE_publico_Laborers', 'cat__OCCUPATION_TYPE_publico_Sales_staff', 'cat__WEEKDAY_APPR_PROCESS_START_publico_FRIDAY', 'cat__WEEKDAY_APPR_PROCESS_START_publico_MONDAY', 'cat__WEEKDAY_APPR_PROCESS_START_publico_SATURDAY', 'cat__WEEKDAY_A

### 05.02. Calculando IV, KS e WOE das variáveis

In [None]:
# IV, KS and WOE computation for the finalist variables, ordered by IV and then KS
# NOTE(review): funcs.calcular_iv_ks_woe is a project helper not shown here; it returns two
# tables, of which only the summary one (Variavel / IV / KS) is displayed — confirm the second.

tabela_iv_ks, tabela_iv_ks_completa = funcs.calcular_iv_ks_woe(
    dados=df_processado,
    variaveis=variaveis_finalistas,
    target='TARGET_publico',
    chave='SK_ID_CURR',
    bins=10,
    exibir_woe=False,
    ordenacao=['IV', 'KS']
)

tabela_iv_ks.head(15)

Calculando IV/KS/WOE: 100%|██████████████████████████████████████████████████████████| 767/767 [06:16<00:00,  2.04it/s]


Unnamed: 0,Variavel,IV,KS
0,num__EXT_SOURCE_3_publico,0.326,0.252
1,num__EXT_SOURCE_2_publico,0.28,0.212
2,num__EXT_SOURCE_1_publico,0.103,0.118
3,num__PAYMENT_RATE_publico,0.102,0.094
4,num__AMT_GOODS_PRICE_publico,0.101,0.092
5,num__DAYS_EMPLOYED_publico,0.098,0.085
6,num__DAYS_BIRTH_publico,0.08,0.115
7,num__AGE_YEARS_publico,0.079,0.116
8,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS...,0.078,0.133
9,num__VL_TOT_VL_MED_AMT_CREDIT_SUM_DEBT_ULTIMOS...,0.078,0.133


### 05.03. Selecionando as variáveis pelas correlações de Pearson e Spearman

In [None]:
# Pearson correlation filter: variables are passed in the order of the IV/KS table and
# the threshold is 0.7
# NOTE(review): funcs.seleciona_variaveis_nao_correlacionadas is a project helper not shown
# here; presumably it keeps the higher-priority variable of each correlated pair — confirm.
tabela_decisao, variaveis_selecionadas, matriz_corr = funcs.seleciona_variaveis_nao_correlacionadas(
    dados=df_processado,
    feature_priority=tabela_iv_ks['Variavel'].tolist(),
    threshold=0.7,
    method='pearson'
)

print(f'Variáveis selecionadas: {len(variaveis_selecionadas)}\n', variaveis_selecionadas)
display(tabela_decisao)

Analisando correlação: 100%|███████████████████████████████████████████████████████| 767/767 [00:00<00:00, 2897.93it/s]


Variáveis selecionadas: 142
 ['num__EXT_SOURCE_3_publico', 'num__EXT_SOURCE_2_publico', 'num__EXT_SOURCE_1_publico', 'num__PAYMENT_RATE_publico', 'num__AMT_GOODS_PRICE_publico', 'num__DAYS_EMPLOYED_publico', 'num__DAYS_BIRTH_publico', 'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_12_MESES_externo', 'num__CREDIT_TO_GOODS_RATIO_publico', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_12_MESES_externo', 'num__INCOME_TO_EMPLOYED_RATIO_publico', 'num__VL_MED_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_24_MESES_externo', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_24_MESES_externo', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_6_MESES_externo', 'num__QT_MIN_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_12_MESES_externo', 'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_LIMIT_ULTIMOS_12_MESES_externo', 'num__REGION_RATING_CLIENT_W_CITY_publico', 'num__QT_MIN_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_6_MESES_externo', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_ENDDATE_ULT

Unnamed: 0,Feature,Decision,Reason,Method,Threshold
0,num__EXT_SOURCE_3_publico,chosen,,pearson,0.7
1,num__EXT_SOURCE_2_publico,chosen,,pearson,0.7
2,num__EXT_SOURCE_1_publico,chosen,,pearson,0.7
3,num__PAYMENT_RATE_publico,chosen,,pearson,0.7
4,num__AMT_GOODS_PRICE_publico,chosen,,pearson,0.7
5,num__DAYS_EMPLOYED_publico,chosen,,pearson,0.7
6,num__DAYS_BIRTH_publico,chosen,,pearson,0.7
7,num__AGE_YEARS_publico,drop,correlated to num__DAYS_BIRTH_publico (-100.00%),pearson,0.7
8,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS...,chosen,,pearson,0.7
9,num__VL_TOT_VL_MED_AMT_CREDIT_SUM_DEBT_ULTIMOS...,drop,correlated to num__VL_TOT_VL_TOT_AMT_CREDIT_SU...,pearson,0.7


In [None]:
# Spearman correlation filter, applied to the variables that survived the Pearson step.
# NOTE: this overwrites tabela_decisao, variaveis_selecionadas and matriz_corr from the Pearson
# step, and reads variaveis_selecionadas as its own input — re-running this cell alone
# filters the already-filtered list again.
tabela_decisao, variaveis_selecionadas, matriz_corr = funcs.seleciona_variaveis_nao_correlacionadas(
    dados=df_processado,
    feature_priority=variaveis_selecionadas,
    threshold=0.7,
    method='spearman'
)

print(f'Variáveis selecionadas: {len(variaveis_selecionadas)}\n', variaveis_selecionadas)
display(tabela_decisao)

Analisando correlação: 100%|███████████████████████████████████████████████████████| 142/142 [00:00<00:00, 3376.12it/s]

Variáveis selecionadas: 118
 ['num__EXT_SOURCE_3_publico', 'num__EXT_SOURCE_2_publico', 'num__EXT_SOURCE_1_publico', 'num__PAYMENT_RATE_publico', 'num__AMT_GOODS_PRICE_publico', 'num__DAYS_EMPLOYED_publico', 'num__DAYS_BIRTH_publico', 'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo', 'num__CREDIT_TO_GOODS_RATIO_publico', 'num__INCOME_TO_EMPLOYED_RATIO_publico', 'num__VL_MED_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_24_MESES_externo', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_24_MESES_externo', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_6_MESES_externo', 'num__REGION_RATING_CLIENT_W_CITY_publico', 'num__QT_MAX_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_36_MESES_externo', 'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_LIMIT_ULTIMOS_24_MESES_externo', 'num__VL_TOT_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIMOS_24_MESES_externo', 'cat__NAME_EDUCATION_TYPE_publico_Higher_education', 'num__DAYS_LAST_PHONE_CHANGE_publico', 'cat__NAME_INCOME_TYPE_publico_Working', 'num__VL_TOT_VL_TOT_AMT_CREDIT_MAX_OVER




Unnamed: 0,Feature,Decision,Reason,Method,Threshold
0,num__EXT_SOURCE_3_publico,chosen,,spearman,0.7
1,num__EXT_SOURCE_2_publico,chosen,,spearman,0.7
2,num__EXT_SOURCE_1_publico,chosen,,spearman,0.7
3,num__PAYMENT_RATE_publico,chosen,,spearman,0.7
4,num__AMT_GOODS_PRICE_publico,chosen,,spearman,0.7
5,num__DAYS_EMPLOYED_publico,chosen,,spearman,0.7
6,num__DAYS_BIRTH_publico,chosen,,spearman,0.7
7,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS...,chosen,,spearman,0.7
8,num__QT_MAX_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS...,drop,correlated to num__VL_TOT_VL_TOT_AMT_CREDIT_SU...,spearman,0.7
9,num__CREDIT_TO_GOODS_RATIO_publico,chosen,,spearman,0.7


In [None]:
# IV/KS rows of the variables that survived both correlation filters
tabela_iv_ks_filtrado = (
    tabela_iv_ks
    .loc[tabela_iv_ks['Variavel'].isin(variaveis_selecionadas)]
    .reset_index(drop=True)
)

tabela_iv_ks_filtrado

Unnamed: 0,Variavel,IV,KS
0,num__EXT_SOURCE_3_publico,0.326,0.252
1,num__EXT_SOURCE_2_publico,0.28,0.212
2,num__EXT_SOURCE_1_publico,0.103,0.118
3,num__PAYMENT_RATE_publico,0.102,0.094
4,num__AMT_GOODS_PRICE_publico,0.101,0.092
5,num__DAYS_EMPLOYED_publico,0.098,0.085
6,num__DAYS_BIRTH_publico,0.08,0.115
7,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS...,0.078,0.133
8,num__CREDIT_TO_GOODS_RATIO_publico,0.076,0.122
9,num__INCOME_TO_EMPLOYED_RATIO_publico,0.075,0.12


In [None]:
# Same selection, additionally requiring IV >= 0.009
# NOTE(review): 0.009 is an unexplained magic number — confirm the intended IV cut-off.
iv_ks_filtrado = tabela_iv_ks.loc[
    tabela_iv_ks['Variavel'].isin(variaveis_selecionadas) & tabela_iv_ks['IV'].ge(0.009)
].reset_index(drop=True)

iv_ks_filtrado

Unnamed: 0,Variavel,IV,KS
0,num__EXT_SOURCE_3_publico,0.326,0.252
1,num__EXT_SOURCE_2_publico,0.28,0.212
2,num__EXT_SOURCE_1_publico,0.103,0.118
3,num__PAYMENT_RATE_publico,0.102,0.094
4,num__AMT_GOODS_PRICE_publico,0.101,0.092
5,num__DAYS_EMPLOYED_publico,0.098,0.085
6,num__DAYS_BIRTH_publico,0.08,0.115
7,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS...,0.078,0.133
8,num__CREDIT_TO_GOODS_RATIO_publico,0.076,0.122
9,num__INCOME_TO_EMPLOYED_RATIO_publico,0.075,0.12


In [None]:
# Final selected variables: the names that passed the correlation filters and the IV cut-off

variaveis_iv_selecionadas = iv_ks_filtrado['Variavel'].tolist()

print(len(variaveis_iv_selecionadas))

variaveis_iv_selecionadas

41


['num__EXT_SOURCE_3_publico',
 'num__EXT_SOURCE_2_publico',
 'num__EXT_SOURCE_1_publico',
 'num__PAYMENT_RATE_publico',
 'num__AMT_GOODS_PRICE_publico',
 'num__DAYS_EMPLOYED_publico',
 'num__DAYS_BIRTH_publico',
 'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo',
 'num__CREDIT_TO_GOODS_RATIO_publico',
 'num__INCOME_TO_EMPLOYED_RATIO_publico',
 'num__VL_MED_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_24_MESES_externo',
 'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_24_MESES_externo',
 'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_6_MESES_externo',
 'num__REGION_RATING_CLIENT_W_CITY_publico',
 'num__QT_MAX_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_36_MESES_externo',
 'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_LIMIT_ULTIMOS_24_MESES_externo',
 'num__VL_TOT_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIMOS_24_MESES_externo',
 'cat__NAME_EDUCATION_TYPE_publico_Higher_education',
 'num__DAYS_LAST_PHONE_CHANGE_publico',
 'cat__NAME_INCOME_TYPE_publico_Working',
 'num__VL_TOT_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIM

In [None]:
# Columns of the final base: record id and target first, then the 41 selected features

variaveis_finais = ['SK_ID_CURR', 'TARGET_publico'] + [
    'num__EXT_SOURCE_3_publico',
    'num__EXT_SOURCE_2_publico',
    'num__EXT_SOURCE_1_publico',
    'num__PAYMENT_RATE_publico',
    'num__AMT_GOODS_PRICE_publico',
    'num__DAYS_EMPLOYED_publico',
    'num__DAYS_BIRTH_publico',
    'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo',
    'num__CREDIT_TO_GOODS_RATIO_publico',
    'num__INCOME_TO_EMPLOYED_RATIO_publico',
    'num__VL_MED_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_24_MESES_externo',
    'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_24_MESES_externo',
    'num__QT_MAX_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_6_MESES_externo',
    'num__REGION_RATING_CLIENT_W_CITY_publico',
    'num__QT_MAX_QT_MAX_DAYS_CREDIT_ENDDATE_ULTIMOS_36_MESES_externo',
    'num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_LIMIT_ULTIMOS_24_MESES_externo',
    'num__VL_TOT_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIMOS_24_MESES_externo',
    'cat__NAME_EDUCATION_TYPE_publico_Higher_education',
    'num__DAYS_LAST_PHONE_CHANGE_publico',
    'cat__NAME_INCOME_TYPE_publico_Working',
    'num__VL_TOT_VL_TOT_AMT_CREDIT_MAX_OVERDUE_ULTIMOS_12_MESES_externo',
    'num__TOTALAREA_MODE_publico',
    'num__REGION_POPULATION_RELATIVE_publico',
    'num__VL_MIN_VL_TOT_AMT_CREDIT_SUM_ULTIMOS_36_MESES_externo',
    'num__DAYS_ID_PUBLISH_publico',
    'num__CAR_TO_EMPLOYED_RATIO_publico',
    'num__REG_CITY_NOT_WORK_CITY_publico',
    'num__FLAG_DOCUMENT_3_publico',
    'num__QT_MIN_QT_MAX_DAYS_CREDIT_UPDATE_ULTIMOS_24_MESES_externo',
    'num__DAYS_REGISTRATION_publico',
    'cat__OCCUPATION_TYPE_publico_Laborers',
    'num__INCOME_TO_BIRTH_RATIO_publico',
    'num__LIVINGAPARTMENTS_MEDI_publico',
    'num__VL_TOT_VL_TOT_AMT_ANNUITY_ULTIMOS_24_MESES_externo',
    'num__DEF_30_CNT_SOCIAL_CIRCLE_publico',
    'cat__WALLSMATERIAL_MODE_publico_Panel',
    'num__VL_MIN_QTD_STATUS_C_ULTIMOS_36_MESES_externo',
    'num__QT_MAX_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_36_MESES_externo',
    'cat__NAME_CONTRACT_TYPE_publico_Cash_loans',
    'num__FLAG_DOCUMENT_6_publico',
    'num__FLAG_WORK_PHONE_publico',
]


In [None]:
# Final training frame: id, target and selected features, in the order given by variaveis_finais

df_treino_processado = df_processado.loc[:, variaveis_finais].copy()
df_treino_processado.head()

In [None]:
# Per-column summary (non-null counts and dtypes) of the final training frame
df_treino_processado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147480 entries, 0 to 147479
Data columns (total 43 columns):
 #   Column                                                              Non-Null Count   Dtype  
---  ------                                                              --------------   -----  
 0   SK_ID_CURR                                                          147480 non-null  int32  
 1   TARGET_publico                                                      147480 non-null  int32  
 2   num__EXT_SOURCE_3_publico                                           147480 non-null  float32
 3   num__EXT_SOURCE_2_publico                                           147480 non-null  float32
 4   num__EXT_SOURCE_1_publico                                           147480 non-null  float32
 5   num__PAYMENT_RATE_publico                                           147480 non-null  float32
 6   num__AMT_GOODS_PRICE_publico                                        147480 non-null  float32
 7   nu

In [None]:
# ## Salvando a base de treino em parquet
# df_treino_processado.to_parquet(r'D:\projeto_modelo_credito\dados\df_treino_processado.parquet', engine='pyarrow', index=False)

In [None]:
# Read the saved training base back from disk to check that the save worked
df_treino_processado1 = pd.read_parquet(
    r'D:\projeto_modelo_credito\dados\df_treino_processado.parquet',
    engine='pyarrow',
)
df_treino_processado1.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,num__EXT_SOURCE_3_publico,num__EXT_SOURCE_2_publico,num__EXT_SOURCE_1_publico,num__PAYMENT_RATE_publico,num__AMT_GOODS_PRICE_publico,num__DAYS_EMPLOYED_publico,num__DAYS_BIRTH_publico,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo,...,num__INCOME_TO_BIRTH_RATIO_publico,num__LIVINGAPARTMENTS_MEDI_publico,num__VL_TOT_VL_TOT_AMT_ANNUITY_ULTIMOS_24_MESES_externo,num__DEF_30_CNT_SOCIAL_CIRCLE_publico,cat__WALLSMATERIAL_MODE_publico_Panel,num__VL_MIN_QTD_STATUS_C_ULTIMOS_36_MESES_externo,num__QT_MAX_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_36_MESES_externo,cat__NAME_CONTRACT_TYPE_publico_Cash_loans,num__FLAG_DOCUMENT_6_publico,num__FLAG_WORK_PHONE_publico
0,100010,0,0.540654,0.714279,-999.0,0.03,1530000.0,-449.0,-18850.0,-999.0,...,-19.1,-999.0,-999.0,0.0,0.0,0.0,-999.0,1.0,0.0,1.0
1,100263,0,0.270707,0.235314,0.770814,0.06,585000.0,365243.0,-23456.0,133240.5,...,-4.6,-999.0,-999.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,100320,0,-999.0,0.610447,-999.0,0.1,463500.0,-2907.0,-11996.0,-999.0,...,-18.76,-999.0,-999.0,0.0,0.0,-999.0,0.0,1.0,0.0,0.0
3,100704,0,0.621226,0.398296,0.67617,0.03,760500.0,-7042.0,-17781.0,29682.0,...,-5.06,-999.0,-999.0,1.0,0.0,-999.0,0.0,1.0,0.0,1.0
4,100768,0,0.556727,0.639849,0.722559,0.04,229500.0,-2271.0,-20438.0,-999.0,...,-5.5,-999.0,-999.0,1.0,0.0,-999.0,0.0,1.0,0.0,0.0


In [None]:
# Per-column summary of the frame read back from parquet, for comparison with the in-memory frame
df_treino_processado1.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147480 entries, 0 to 147479
Data columns (total 43 columns):
 #   Column                                                              Non-Null Count   Dtype  
---  ------                                                              --------------   -----  
 0   SK_ID_CURR                                                          147480 non-null  int32  
 1   TARGET_publico                                                      147480 non-null  int32  
 2   num__EXT_SOURCE_3_publico                                           147480 non-null  float32
 3   num__EXT_SOURCE_2_publico                                           147480 non-null  float32
 4   num__EXT_SOURCE_1_publico                                           147480 non-null  float32
 5   num__PAYMENT_RATE_publico                                           147480 non-null  float32
 6   num__AMT_GOODS_PRICE_publico                                        147480 non-null  float32
 7   nu

## 06. Replicando o tratamento para a base de teste que separamos acima e também para a base de teste fornecida

In [None]:
# Test split separated earlier in the notebook: (rows, columns)
df_teste.shape

(36870, 964)

In [None]:
# Load the provided test base
df_teste_full = pd.read_parquet(
    r'D:\projeto_modelo_credito\dados\base_teste_final.parquet\base_teste_final.parquet',
    engine='pyarrow',
)
df_teste_full.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE_publico,CODE_GENDER_publico,FLAG_OWN_CAR_publico,FLAG_OWN_REALTY_publico,CNT_CHILDREN_publico,AMT_INCOME_TOTAL_publico,AMT_CREDIT_publico,AMT_ANNUITY_publico,AMT_GOODS_PRICE_publico,...,VL_MAX_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_X_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_5_ULTIMOS_36_MESES_externo,VL_TOT_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MED_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MAX_QTD_STATUS_1_ULTIMOS_36_MESES_externo,VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_externo
0,100227,Cash loans,M,N,N,0,292500.0,1024740.0,55719.0,900000.0,...,8.0,12.0,60.0,15.0,8.0,12.0,60.0,15.0,8.0,12.0
1,100735,Cash loans,F,Y,Y,1,292500.0,720000.0,25636.5,720000.0,...,,,,,,,,,,
2,100964,Cash loans,F,N,Y,0,112500.0,327024.0,16033.5,270000.0,...,,,,,,,,,,
3,102521,Cash loans,F,N,Y,0,112500.0,296280.0,23539.5,225000.0,...,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
4,102536,Cash loans,F,Y,N,0,247500.0,1575000.0,41679.0,1575000.0,...,,,,,,,,,,


In [None]:
# (rows, columns) of the provided test base as loaded
df_teste_full.shape

(79141, 964)

In [None]:
## The provided test base has no target: add an all-NaN TARGET_publico column so the
## treatment pipeline can run on it without any adjustment

df_teste_full.loc[:, 'TARGET_publico'] = np.nan

In [None]:
# Função de tratamento, Pipeline

def tratar_base(df, is_treino=False):
    """Apply the notebook's preprocessing to a raw base and return the encoded frame.

    Steps, in order: drop ``CODE_GENDER_publico`` if present, run
    ``otimiza_tipos``, clean the categorical columns with ``limpar_valores``,
    replace ``pd.NA`` with ``np.nan``, keep only ``vars_lista_inicial`` and run
    the one-hot / constant-imputation pipeline.

    Depends on notebook-level names defined in other cells: ``otimiza_tipos``,
    ``limpar_valores``, ``lista_vars_categ`` and ``vars_lista_inicial``. The
    pipeline and its output column names live in the globals
    ``pipeline_preprocessor`` and ``nomes_colunas_transformadas``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw base. It must contain every column of ``lista_vars_categ`` and
        ``vars_lista_inicial``; the latter has to include ``SK_ID_CURR`` and
        ``TARGET_publico``, which are copied to the output.
    is_treino : bool, default False
        True: build the pipeline, fit it on ``df`` and overwrite both globals.
        False: only transform ``df`` with the pipeline already stored in the
        globals.

    Returns
    -------
    pandas.DataFrame
        Pipeline output columns followed by ``SK_ID_CURR`` and
        ``TARGET_publico``, minus the columns listed in ``colunas_remover``,
        indexed from 0.

    Raises
    ------
    RuntimeError
        If ``is_treino`` is False and the fitted pipeline or its column names
        are not defined yet.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.impute import SimpleImputer

    # Fitted state shared between calls (written when is_treino=True, read otherwise)
    global pipeline_preprocessor, nomes_colunas_transformadas

    # Key columns: carried through untouched, never fed to the pipeline as features
    key_atributos = ['SK_ID_CURR', 'TARGET_publico']

    # 1. Drop the column the author marked as prohibited; returns a new frame,
    #    so the caller's DataFrame is not modified by the steps below
    df = df.drop(columns=['CODE_GENDER_publico'], errors='ignore')

    # 2. Optimise dtypes
    df = otimiza_tipos(df)

    # 3. Clean the categorical values (same treatment for train and test).
    #    DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    #    whichever the installed version provides.
    if hasattr(pd.DataFrame, 'map'):
        df[lista_vars_categ] = df[lista_vars_categ].map(limpar_valores)
    else:
        df[lista_vars_categ] = df[lista_vars_categ].applymap(limpar_valores)

    # 4. Replace pd.NA with np.nan
    df = df.replace({pd.NA: np.nan})

    # 5. Keep only the initially selected variables
    df = df[vars_lista_inicial]

    if is_treino:
        # 6a. Fit: split the features by dtype. Keys are excluded by name
        #     rather than by position, so the column order cannot leak them in.
        atributos = df.drop(columns=key_atributos)
        colunas_cat = atributos.select_dtypes(include='object').columns
        colunas_num = atributos.select_dtypes(exclude='object').columns

        # Categorical pipeline.
        # NOTE(review): the imputer runs after the one-hot encoder, so missing
        # categories appear to be handled by the encoder itself (see the
        # '*_nan' columns removed below) - confirm this step order is intended.
        cat_pipe = Pipeline([
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999))
        ])

        # Numeric pipeline: missing values become the constant -999
        num_pipe = Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999))
        ])

        pipeline_preprocessor = Pipeline([
            ('preprocessor', ColumnTransformer([
                ('cat', cat_pipe, colunas_cat),
                ('num', num_pipe, colunas_num)
            ]))
        ])

        df_transformado = pipeline_preprocessor.fit_transform(df)

        # Output column names, stored for later transform-only calls
        nomes_colunas_transformadas = (
            pipeline_preprocessor.named_steps['preprocessor'].get_feature_names_out()
        )
    else:
        # 6b. Transform only: fail with a clear message if the fitted state is missing
        faltando = [
            nome
            for nome in ('pipeline_preprocessor', 'nomes_colunas_transformadas')
            if nome not in globals()
        ]
        if faltando:
            raise RuntimeError(
                'tratar_base(is_treino=False) needs a fitted pipeline; missing: '
                + ', '.join(faltando)
                + '. Fit it on the training base first.'
            )
        df_transformado = pipeline_preprocessor.transform(df)

    # Rebuild a DataFrame and append the keys (index reset so rows align by position)
    df_processado = pd.DataFrame(df_transformado, columns=nomes_colunas_transformadas)
    df_processado[key_atributos] = df[key_atributos].reset_index(drop=True)

    # Drop the same dummy columns that were removed from the training base
    colunas_remover = [
        'cat__NAME_TYPE_SUITE_publico_nan',
        'cat__OCCUPATION_TYPE_publico_nan',
        'cat__FONDKAPREMONT_MODE_publico_nan',
        'cat__HOUSETYPE_MODE_publico_nan',
        'cat__WALLSMATERIAL_MODE_publico_nan',
        'cat__EMERGENCYSTATE_MODE_publico_nan',
        'cat__FLAG_OWN_CAR_publico_Y',
        'cat__FLAG_OWN_REALTY_publico_N',
        'cat__EMERGENCYSTATE_MODE_publico_Yes',
        'cat__ORGANIZATION_TYPE_publico_XNA'
    ]
    return df_processado.drop(columns=colunas_remover, errors='ignore')

In [None]:
# Process the test split, transforming only (is_treino=False)

df_teste_processado = tratar_base(df_teste, is_treino=False)
df_teste_processado.shape

(36870, 897)

In [None]:
# Preview of the processed test split (key columns are appended at the end)
df_teste_processado.head()

Unnamed: 0,cat__NAME_CONTRACT_TYPE_publico_Cash_loans,cat__NAME_CONTRACT_TYPE_publico_Revolving_loans,cat__FLAG_OWN_CAR_publico_N,cat__FLAG_OWN_REALTY_publico_Y,cat__NAME_TYPE_SUITE_publico_Children,cat__NAME_TYPE_SUITE_publico_Family,cat__NAME_TYPE_SUITE_publico_Group_of_people,cat__NAME_TYPE_SUITE_publico_Other_A,cat__NAME_TYPE_SUITE_publico_Other_B,cat__NAME_TYPE_SUITE_publico_Spouse_partner,...,num__VL_TOT_QTD_STATUS_5_ULTIMOS_36_MESES_externo,num__VL_MED_QTD_STATUS_5_ULTIMOS_36_MESES_externo,num__VL_MAX_QTD_STATUS_5_ULTIMOS_36_MESES_externo,num__VL_MIN_QTD_STATUS_5_ULTIMOS_36_MESES_externo,num__VL_TOT_QTD_STATUS_1_ULTIMOS_36_MESES_externo,num__VL_MED_QTD_STATUS_1_ULTIMOS_36_MESES_externo,num__VL_MAX_QTD_STATUS_1_ULTIMOS_36_MESES_externo,num__VL_MIN_QTD_STATUS_1_ULTIMOS_36_MESES_externo,SK_ID_CURR,TARGET_publico
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,101122,0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,102745,0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,103634,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,104454,0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,81.0,27.0,29.0,24.0,81.0,27.0,29.0,24.0,104665,0


In [None]:
# Process the provided test base, transforming only (is_treino=False)

df_teste_processado_full = tratar_base(df_teste_full, is_treino=False)
df_teste_processado_full.shape

(79141, 897)

In [None]:
## Final variables for the test bases: reuse the training selection instead of a
## hand-copied duplicate, so both bases always share the same columns in the same order

variaveis_finais_teste = list(variaveis_finais)

In [None]:
## Final test split: keep only the variables selected on the training base, in the same order

df_teste_processado = df_teste_processado.loc[:, variaveis_finais_teste].copy()
df_teste_processado.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,num__EXT_SOURCE_3_publico,num__EXT_SOURCE_2_publico,num__EXT_SOURCE_1_publico,num__PAYMENT_RATE_publico,num__AMT_GOODS_PRICE_publico,num__DAYS_EMPLOYED_publico,num__DAYS_BIRTH_publico,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo,...,num__INCOME_TO_BIRTH_RATIO_publico,num__LIVINGAPARTMENTS_MEDI_publico,num__VL_TOT_VL_TOT_AMT_ANNUITY_ULTIMOS_24_MESES_externo,num__DEF_30_CNT_SOCIAL_CIRCLE_publico,cat__WALLSMATERIAL_MODE_publico_Panel,num__VL_MIN_QTD_STATUS_C_ULTIMOS_36_MESES_externo,num__QT_MAX_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_36_MESES_externo,cat__NAME_CONTRACT_TYPE_publico_Cash_loans,num__FLAG_DOCUMENT_6_publico,num__FLAG_WORK_PHONE_publico
0,101122,0,0.605836,0.347977,-999.0,0.04,517500.0,-2058.0,-12198.0,-999.0,...,-7.38,-999.0,-999.0,0.0,1.0,-999.0,0.0,0.0,0.0,0.0
1,102745,0,0.357293,0.591461,0.336847,0.04,607500.0,-2024.0,-17131.0,313573.5,...,-11.82,-999.0,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,1.0
2,103634,0,0.58499,0.707927,-999.0,0.03,1125000.0,-3391.0,-17445.0,-999.0,...,-16.77,-999.0,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,1.0
3,104454,0,0.519097,-999.0,-999.0,0.11,58500.0,-5093.0,-13772.0,0.0,...,-7.35,0.0154,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,0.0
4,104665,0,0.495666,0.303363,0.285083,0.05,900000.0,-191.0,-9269.0,-999.0,...,-19.42,0.1026,-999.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0


In [None]:
# (rows, columns) of the test split after column selection
df_teste_processado.shape

(36870, 43)

In [None]:
## Final provided test base: keep only the variables selected on the training base, in the same order

df_teste_processado_full = df_teste_processado_full.loc[:, variaveis_finais_teste].copy()
df_teste_processado_full.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,num__EXT_SOURCE_3_publico,num__EXT_SOURCE_2_publico,num__EXT_SOURCE_1_publico,num__PAYMENT_RATE_publico,num__AMT_GOODS_PRICE_publico,num__DAYS_EMPLOYED_publico,num__DAYS_BIRTH_publico,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo,...,num__INCOME_TO_BIRTH_RATIO_publico,num__LIVINGAPARTMENTS_MEDI_publico,num__VL_TOT_VL_TOT_AMT_ANNUITY_ULTIMOS_24_MESES_externo,num__DEF_30_CNT_SOCIAL_CIRCLE_publico,cat__WALLSMATERIAL_MODE_publico_Panel,num__VL_MIN_QTD_STATUS_C_ULTIMOS_36_MESES_externo,num__QT_MAX_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_36_MESES_externo,cat__NAME_CONTRACT_TYPE_publico_Cash_loans,num__FLAG_DOCUMENT_6_publico,num__FLAG_WORK_PHONE_publico
0,100227,,0.297087,0.634896,0.412944,0.05,900000.0,-240.0,-10010.0,63680.0,...,-29.219999,-999.0,-999.0,0.0,0.0,12.0,0.0,0.0,0.0,1.0
1,100735,,0.479449,0.626088,0.465245,0.04,720000.0,-7807.0,-14569.0,45072.0,...,-20.08,-999.0,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,0.0
2,100964,,-999.0,0.642601,0.591352,0.05,270000.0,-63.0,-12956.0,-999.0,...,-8.68,-999.0,-999.0,0.0,1.0,-999.0,-999.0,0.0,0.0,0.0
3,102521,,0.429424,0.121702,0.359249,0.08,225000.0,-415.0,-13675.0,-999.0,...,-8.23,-999.0,-999.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0
4,102536,,0.657784,0.512349,0.698138,0.03,1575000.0,-5670.0,-19937.0,-999.0,...,-12.41,-999.0,-999.0,0.0,1.0,-999.0,0.0,0.0,0.0,0.0


In [None]:
# Per-column summary of the processed provided test base.
# NOTE(review): the recorded output shows float64 features and a nullable Int32 id here,
# while the training frame shows float32 / int32 - confirm the dtype difference is acceptable downstream.
df_teste_processado_full.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79141 entries, 0 to 79140
Data columns (total 43 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   SK_ID_CURR                                                          79141 non-null  Int32  
 1   TARGET_publico                                                      0 non-null      float32
 2   num__EXT_SOURCE_3_publico                                           79141 non-null  float64
 3   num__EXT_SOURCE_2_publico                                           79141 non-null  float64
 4   num__EXT_SOURCE_1_publico                                           79141 non-null  float64
 5   num__PAYMENT_RATE_publico                                           79141 non-null  float64
 6   num__AMT_GOODS_PRICE_publico                                        79141 non-null  float64
 7   num__DAYS_EMP

In [None]:
# ## Salvando a base de teste em parquet
# df_teste_processado.to_parquet(r'D:\projeto_modelo_credito\dados\df_teste_processado.parquet', engine='pyarrow', index=False)

In [None]:
# ## Salvando a base de teste full dada em parquet
# df_teste_processado_full.to_parquet(r'D:\projeto_modelo_credito\dados\df_teste_processado_full.parquet', engine='pyarrow', index=False)

In [None]:
# Read the saved test split back from disk
df_teste_processado_t = pd.read_parquet(
    r'D:\projeto_modelo_credito\dados\df_teste_processado.parquet',
    engine='pyarrow',
)
df_teste_processado_t.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,num__EXT_SOURCE_3_publico,num__EXT_SOURCE_2_publico,num__EXT_SOURCE_1_publico,num__PAYMENT_RATE_publico,num__AMT_GOODS_PRICE_publico,num__DAYS_EMPLOYED_publico,num__DAYS_BIRTH_publico,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo,...,num__INCOME_TO_BIRTH_RATIO_publico,num__LIVINGAPARTMENTS_MEDI_publico,num__VL_TOT_VL_TOT_AMT_ANNUITY_ULTIMOS_24_MESES_externo,num__DEF_30_CNT_SOCIAL_CIRCLE_publico,cat__WALLSMATERIAL_MODE_publico_Panel,num__VL_MIN_QTD_STATUS_C_ULTIMOS_36_MESES_externo,num__QT_MAX_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_36_MESES_externo,cat__NAME_CONTRACT_TYPE_publico_Cash_loans,num__FLAG_DOCUMENT_6_publico,num__FLAG_WORK_PHONE_publico
0,101122,0,0.605836,0.347977,-999.0,0.04,517500.0,-2058.0,-12198.0,-999.0,...,-7.38,-999.0,-999.0,0.0,1.0,-999.0,0.0,0.0,0.0,0.0
1,102745,0,0.357293,0.591461,0.336847,0.04,607500.0,-2024.0,-17131.0,313573.5,...,-11.82,-999.0,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,1.0
2,103634,0,0.58499,0.707927,-999.0,0.03,1125000.0,-3391.0,-17445.0,-999.0,...,-16.77,-999.0,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,1.0
3,104454,0,0.519097,-999.0,-999.0,0.11,58500.0,-5093.0,-13772.0,0.0,...,-7.35,0.0154,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,0.0
4,104665,0,0.495666,0.303363,0.285083,0.05,900000.0,-191.0,-9269.0,-999.0,...,-19.42,0.1026,-999.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0


In [None]:
# Read the saved provided test base (no target) back from disk
df_teste_processado_full_d = pd.read_parquet(
    r'D:\projeto_modelo_credito\dados\df_teste_processado_full.parquet',
    engine='pyarrow',
)
df_teste_processado_full_d.head()

Unnamed: 0,SK_ID_CURR,TARGET_publico,num__EXT_SOURCE_3_publico,num__EXT_SOURCE_2_publico,num__EXT_SOURCE_1_publico,num__PAYMENT_RATE_publico,num__AMT_GOODS_PRICE_publico,num__DAYS_EMPLOYED_publico,num__DAYS_BIRTH_publico,num__VL_TOT_VL_TOT_AMT_CREDIT_SUM_DEBT_ULTIMOS_12_MESES_externo,...,num__INCOME_TO_BIRTH_RATIO_publico,num__LIVINGAPARTMENTS_MEDI_publico,num__VL_TOT_VL_TOT_AMT_ANNUITY_ULTIMOS_24_MESES_externo,num__DEF_30_CNT_SOCIAL_CIRCLE_publico,cat__WALLSMATERIAL_MODE_publico_Panel,num__VL_MIN_QTD_STATUS_C_ULTIMOS_36_MESES_externo,num__QT_MAX_QT_MAX_CREDIT_DAY_OVERDUE_ULTIMOS_36_MESES_externo,cat__NAME_CONTRACT_TYPE_publico_Cash_loans,num__FLAG_DOCUMENT_6_publico,num__FLAG_WORK_PHONE_publico
0,100227,,0.297087,0.634896,0.412944,0.05,900000.0,-240.0,-10010.0,63680.0,...,-29.219999,-999.0,-999.0,0.0,0.0,12.0,0.0,0.0,0.0,1.0
1,100735,,0.479449,0.626088,0.465245,0.04,720000.0,-7807.0,-14569.0,45072.0,...,-20.08,-999.0,-999.0,0.0,0.0,-999.0,0.0,0.0,0.0,0.0
2,100964,,-999.0,0.642601,0.591352,0.05,270000.0,-63.0,-12956.0,-999.0,...,-8.68,-999.0,-999.0,0.0,1.0,-999.0,-999.0,0.0,0.0,0.0
3,102521,,0.429424,0.121702,0.359249,0.08,225000.0,-415.0,-13675.0,-999.0,...,-8.23,-999.0,-999.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0
4,102536,,0.657784,0.512349,0.698138,0.03,1575000.0,-5670.0,-19937.0,-999.0,...,-12.41,-999.0,-999.0,0.0,1.0,-999.0,0.0,0.0,0.0,0.0


In [None]:
# (rows, columns) of the provided test base read back from parquet
df_teste_processado_full_d.shape

(79141, 43)

In [None]:
# Per-column summary of the provided test base read back from parquet.
# NOTE(review): the recorded output again shows float64 features and a nullable Int32 id,
# unlike the float32 / int32 training frame - confirm this is acceptable downstream.
df_teste_processado_full_d.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79141 entries, 0 to 79140
Data columns (total 43 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   SK_ID_CURR                                                          79141 non-null  Int32  
 1   TARGET_publico                                                      0 non-null      float32
 2   num__EXT_SOURCE_3_publico                                           79141 non-null  float64
 3   num__EXT_SOURCE_2_publico                                           79141 non-null  float64
 4   num__EXT_SOURCE_1_publico                                           79141 non-null  float64
 5   num__PAYMENT_RATE_publico                                           79141 non-null  float64
 6   num__AMT_GOODS_PRICE_publico                                        79141 non-null  float64
 7   num__DAYS_EMP