# Pipeline 

Importando bibliotecas necessárias: 

In [22]:
# Manipulação dos dados
import pandas as pd
import numpy as np
from pathlib import Path 

# Visualização
import matplotlib.pyplot as plt
import seaborn as sns

#  Modelagem
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

Carregando os dados 

In [18]:
# Resolve the dataset location from the notebook's working directory:
# the CSV lives in a "data" folder that is a sibling of the current directory.
base_dir = Path.cwd()
data_dir = base_dir.parent.joinpath("data")
df_path = data_dir.joinpath("dataset_2021-5-26-10-14.csv")

# Load the tab-separated, UTF-8 encoded file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=df_path, sep="\t", encoding="utf-8")

# Preview the first rows
df.head()

Unnamed: 0,default_3months,ioi_36months,ioi_3months,valor_por_vencer,valor_vencido,valor_quitado,quant_protestos,valor_protestos,quant_acao_judicial,acao_judicial_valor,participacao_falencia_valor,dividas_vencidas_valor,dividas_vencidas_qtd,falencia_concordata_qtd,tipo_sociedade,opcao_tributaria,atividade_principal,forma_pagamento,valor_total_pedido,month,year,default
0,0,58.0,18.236092,0.0,0.0,242100.7,0,0.0,0,0.0,0.0,0.0,0,0,empresario (individual),simples nacional,papelaria,30/60/90,34665.674938,6,2019,0
1,1,16.052632,7.5,224132.85,0.0,4960846.21,0,0.0,0,0.0,0.0,0.0,0,0,sociedade empresaria limitada,missing,com de equipamentos de informatica,30/60/90,7134.489373,10,2018,0
2,0,13.25,3.904762,513043.83,0.0,158631.93,1,1800.0,0,0.0,0.0,0.0,0,0,sociedade empresaria limitada,simples nacional,servicos de vigilancia e guarda,missing,72653.621143,4,2018,0
3,0,136.925,10.144219,23273.64,0.0,669644.16,0,0.0,0,0.0,0.0,0.0,0,0,empresario (individual),simples nacional,com de equipamentos de informatica,missing,14576.805783,4,2017,1
4,0,140.333333,17.651678,0.0,0.0,2010.56,0,0.0,0,0.0,0.0,0.0,0,0,sociedade empresaria limitada,simples nacional,com de compon eletron e pecas para eletrod,30/60/90,2655.505663,10,2017,0


Limpeza dos dados

In [None]:
# Drop rows whose 'tipo_sociedade' or 'atividade_principal' holds the literal
# placeholder "missing" (about 0.5% of the rows, per the original author's note).
has_known_company_info = (
    (df['tipo_sociedade'] != 'missing') & (df['atividade_principal'] != 'missing')
)
df = df.loc[has_known_company_info].copy()

# Drop rows with a negative 'valor_total_pedido'
df = df[df['valor_total_pedido'] >= 0].copy()

# Drop the column that carries a single value.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore'
# makes this cell safe to re-run: the second execution no longer raises
# KeyError because the column has already been removed from `df`.
df = df.drop(columns='participacao_falencia_valor', errors='ignore')

# Round 'ioi_36months' and 'ioi_3months' to the nearest integer and store them as ints.
# NOTE(review): astype(int) raises if these columns contain NaN — presumably
# they do not; confirm against the exploratory analysis.
df["ioi_36months"] = df["ioi_36months"].round().astype(int)
df["ioi_3months"] = df["ioi_3months"].round().astype(int)

Transformação dos dados

In [25]:
# Binary indicator columns for 'opcao_tributaria' and 'forma_pagamento':
# 1 where the value is the literal "missing", 0 otherwise.
for flagged in ("opcao_tributaria", "forma_pagamento"):
    df[f"{flagged}_missing"] = np.where(df[flagged] == "missing", 1, 0)

# Financial variables that receive a log1p-transformed companion column
cols_to_log = [
    "valor_por_vencer",
    "valor_vencido",
    "valor_quitado",
    "valor_protestos",
    "acao_judicial_valor",
    "dividas_vencidas_valor",
    "valor_total_pedido",
]

# Map each source column to the name of its transformed column, then add
# the new columns in the same order as `cols_to_log`.
log_names = {source: f"log1p_{source}" for source in cols_to_log}
for col, log_col in log_names.items():
    df[log_col] = np.log1p(df[col])

In [26]:
# Numeric model inputs (counts/ratios plus the log1p-transformed financial values)
numeric_features = [
    "default_3months",
    "ioi_36months",
    "ioi_3months",
    "log1p_valor_por_vencer",
    "log1p_valor_vencido",
    "log1p_valor_quitado",
    "log1p_valor_protestos",
    "log1p_acao_judicial_valor",
    "log1p_dividas_vencidas_valor",
    "log1p_valor_total_pedido",
]

# Categorical model inputs
categorical_features = [
    "tipo_sociedade",
    "atividade_principal",
    "opcao_tributaria",
    "forma_pagamento",
]

# Binary "value was missing" indicator columns
missing_features = [
    "opcao_tributaria_missing",
    "forma_pagamento_missing",
]

# Full feature list, in the order: numeric, categorical, missing indicators
selected_features = [*numeric_features, *categorical_features, *missing_features]

# Feature matrix (an independent copy of the selected columns) and target array
X = df.loc[:, selected_features].copy()
y = df["default"].values