## Importação das bibliotecas

In [33]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from sklearn.pipeline import Pipeline 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import roc_auc_score

import mlflow

sys.path.append(os.path.abspath(".."))
from src.utils import DataWrangling
from src.utils import OneHotFeatureEncoder

### Configurações

In [34]:
pd.set_option('display.max_columns', 99)
pd.set_option('future.no_silent_downcasting', True)

sns.set_style('darkgrid')

## MLflow

O MLflow é uma plataforma open-source para gerenciar o ciclo de vida do Machine Learning. Ele permite rastrear experimentos, armazenar modelos e facilitar a implantação. Seus principais componentes incluem o **Tracking** (registro de métricas e parâmetros), **Projects** (padronização de código), **Models** (armazenamento e versionamento) e **Registry** (gestão de modelos para produção).

In [35]:
# Define a URL do servidor MLflow
mlflow.set_tracking_uri('http://127.0.0.1:5000')

# Define o experimento no qual os dados serão registrados
mlflow.set_experiment(experiment_id=654000327895154401)

<Experiment: artifact_location='mlflow-artifacts:/654000327895154401', creation_time=1741352105343, experiment_id='654000327895154401', last_update_time=1741352105343, lifecycle_stage='active', name='CreditRisk-Matheus', tags={}>

## Leitura e limpeza dos dados

Os dados serão carregados e limpos seguindo o mesmo processo utilizado no notebook de Análise Exploratória de Dados (EDA).

In [36]:
# Caminho do arquivo que contém os dados brutos
raw_data_path = '../data/raw/default_of_credit_card_clients__courseware_version_1_21_19.xls'

# O pandas é usado para ler o arquivo em excel
df = pd.read_excel(raw_data_path)

# Classe definida em utils.py que contém os métodos criados para limpeza e transformação dos dados
df = DataWrangling().fit_transform(df)
df

Unnamed: 0,LIMIT_BAL,EDUCATION,MARRIAGE,AGE,PAY_1,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT
0,3527.2,2,1,24,2,690.09668,547.06872,121.51204,0.00000,0.00000,0.00000,0.00000,121.51204,0.00000,0.00000,0.00000,0.00000,1,university
1,21163.2,2,2,26,-1,472.99752,304.22100,472.99752,577.04992,609.32380,575.10996,0.00000,176.36000,176.36000,176.36000,0.00000,352.72000,1,university
2,15872.4,2,2,34,0,5156.59004,2473.80172,2391.26524,2527.41516,2636.22928,2742.22164,267.71448,264.54000,176.36000,176.36000,176.36000,881.80000,0,university
3,8818.0,2,1,37,0,8287.15640,8506.37188,8692.96076,4993.45704,5107.20924,5210.90892,352.72000,356.07084,211.63200,193.99600,188.52884,176.36000,0,university
4,8818.0,2,1,57,-1,1519.69412,999.96120,6319.86060,3692.97840,3376.58856,3373.94316,352.72000,6469.06116,1763.60000,1587.24000,121.51204,119.74844,0,university
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,38799.2,3,1,39,0,33322.86928,34004.85340,36747.25140,15520.38544,5508.95732,2818.23280,1499.06000,3527.20000,882.32908,537.36892,881.80000,176.36000,0,high school
29996,26454.0,3,2,43,-1,296.81388,322.38608,617.61272,1583.53644,915.30840,0.00000,323.97332,621.84536,1586.88728,22.75044,0.00000,0.00000,0,high school
29997,5290.8,2,2,37,4,628.72340,591.86416,486.40088,3682.04408,3629.84152,3413.80052,0.00000,0.00000,3879.92000,740.71200,352.72000,546.71600,1,university
29998,14108.8,3,1,41,1,-290.11220,13822.92044,13456.97344,9307.22264,2090.74780,8631.76384,15149.32400,601.21124,207.75208,339.66936,9340.73104,318.15344,1,high school


## Separação dos modelos de treino e teste

A separação dos dados em **treino** e **teste** é essencial para avaliar o desempenho real de um modelo de Machine Learning. Se usarmos os mesmos dados para treinar e testar, o modelo pode simplesmente **memorizar** os padrões, sem aprender a **generalizar** para novos dados. Ao dividir os dados, garantimos que o modelo seja avaliado com exemplos que ele nunca viu, ajudando a identificar **overfitting** (quando ele se adapta demais aos dados de treino) e garantindo previsões mais confiáveis no mundo real.

In [16]:
# Separação das features (X) e target (y)
X = df.drop(['default payment next month'], axis=1)
dummies = pd.get_dummies(df['EDUCATION_CAT'], dtype=int, prefix='EDUCATION')
X = pd.concat([X, dummies], axis=1)
X = X.drop('EDUCATION_CAT', axis=1)

y = df['default payment next month']

selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X,y)
best_features = selector.get_feature_names_out().tolist()
X = X[best_features]

print('Shape de X:', X.shape)
print('Shape de y:', y.shape)

# Divisão dos conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    stratify=y)

# Verificação a distribuição do target nas bases de treino e teste
print('Taxa de resposta da base de treino:', y_train.mean())
print('Taxa de resposta da base de teste:', y_test.mean())

Shape de X: (29685, 10)
Shape de y: (29685,)
Taxa de resposta da base de treino: 0.2210712481051036
Taxa de resposta da base de teste: 0.22098703048677784


## Experimentos

### Pré-processamento

O **pré-processamento dos dados** é fundamental para garantir que um modelo de Machine Learning funcione de maneira eficaz. Dados brutos geralmente contêm **valores ausentes, outliers, formatos inconsistentes** e **escalas diferentes**, o que pode impactar negativamente o aprendizado do modelo. Técnicas como **normalização, padronização, remoção de duplicatas e tratamento de valores nulos** ajudam a melhorar a qualidade dos dados, tornando o treinamento mais eficiente e aumentando a precisão das previsões.

In [17]:
# Encoder
encoder = OneHotFeatureEncoder()

# Escalonamento de dados
min_max_sc = MinMaxScaler()
std_sc = StandardScaler()

### Experimentos

#### Grid de parâmetros

Um grid de parâmetros é uma estratégia para encontrar a melhor combinação de hiperparâmetros em um modelo de Machine Learning. Ele consiste em um conjunto predefinido de valores para cada hiperparâmetro, e o modelo é treinado com todas as combinações possíveis desses valores.

In [18]:
params = {
    'learning_rate': [0.01, 0.05, 0.1],  # Taxa de aprendizado
    'n_estimators': [500, 600, 700],  # Número de estimadores
    'max_depth': [6, 7, 8],  # Profundidade máxima das árvores
    'min_samples_split': [10, 15, 20],  # Número mínimo de amostras para dividir um nó
    'min_samples_leaf': [6, 7, 8],  # Número mínimo de amostras em um nó folha
    'max_features': ['log2', 'sqrt'],  # Número máximo de features a considerar
    'subsample': [0.8, 0.9, 1.0],  # Proporção de amostras a ser usada para cada árvore
    'loss': ['deviance', 'exponential']  # Função de perda a ser otimizada
}


#### Runs

Uma **run** no MLflow representa a execução de um experimento de Machine Learning, registrando informações como **parâmetros, métricas, artefatos (arquivos gerados)** e o código utilizado.  

No desenvolvimento do **modelo de risco de crédito**, foram realizadas diversas **runs**, permitindo comparar os resultados e selecionar a configuração mais eficiente para o nosso modelo.

In [19]:
# with mlflow.start_run():

#         mlflow.sklearn.autolog()

#         model = GradientBoostingClassifier(random_state=42)

#         random_search = RandomizedSearchCV(
#                 model, param_distributions=params, 
#                 n_iter=300,  # Número de combinações a testar
#                 scoring='roc_auc',  # Otimizando a AUC-ROC
#                 cv=5,  # Validação cruzada de 3 folds
#                 verbose=3,  # Mostrar progresso
#                 n_jobs=-1  # Usar todos os núcleos disponíveis
#                 )
        
#         # Pipeline
#         pipeline = Pipeline([
#                 ('OneHotEnconder', encoder),
#                 ('Scaler', min_max_sc),
#                 ('Selector', selector),
#                 ('Model', random_search)
#                 ])
        
#         pipeline.fit(X_train, y_train)
        
#         y_train_predict_proba = pipeline.predict_proba(X_train)
#         y_test_predict_proba = pipeline.predict_proba(X_test)

#         train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
#         test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

#         mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})

#### Melhor modelo

A melhor ***run***  foi selecionada na plataforma do mlflow e o código abaixo automatiza a tarefa de seleção no nosso notebook

In [20]:
# Cria uma instância do cliente MLflow, que permite interagir com o servidor do MLflow.
client = mlflow.client.MlflowClient()

# Obbtem a última versão do modelo registrado
version = max([int(i.version) for i in client.get_latest_versions('CreditRisk')])

# O modelo mais recente é carregado
model = mlflow.sklearn.load_model(f'models:/CreditRisk/{version}')
model

  version = max([int(i.version) for i in client.get_latest_versions('CreditRisk')])


In [44]:
teste = pd.read_csv('../artifacts/train.csv')
teste = DataWrangling().fit_transform(teste)
teste

Unnamed: 0,LIMIT_BAL,EDUCATION,MARRIAGE,AGE,PAY_1,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT
0,3527.2,2,1,34,1,3169.18920,3066.72404,3613.79276,3492.45708,3480.28824,3458.94868,0.00000,652.53200,0.00000,176.36000,352.72000,125.21560,0,university
1,3527.2,3,2,26,-1,3664.76080,3557.53392,3421.38400,3527.20000,3527.20000,0.00000,0.00000,0.00000,176.36000,0.00000,0.00000,0.00000,0,high school
2,5290.8,1,2,24,-1,538.77980,3243.61312,112.87040,0.00000,455.53788,820.60308,3246.25852,112.87040,0.00000,455.53788,820.60308,455.53788,0,graduate school
3,26454.0,2,1,36,0,59847.94236,23575.98116,17756.45388,25947.31772,14098.57112,12693.68736,1234.52000,617.26000,529.08000,924.47912,530.66724,705.44000,0,university
4,75834.8,2,2,27,-2,46.38268,46.38268,726.42684,774.22040,3695.09472,2744.51432,311.09904,726.60320,94.35260,3701.79640,441.95816,88.18000,1,university
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23994,29981.2,2,1,49,2,4760.13276,5306.67240,3192.82144,3359.65800,3312.56988,1418.28712,708.79084,0.00000,186.94160,0.00000,142.32252,0.00000,1,university
23996,5290.8,2,2,37,0,5538.40944,5696.78072,4929.43836,5021.85100,5129.43060,4728.21160,264.54000,260.30736,176.36000,5467.16000,176.36000,175.47820,1,university
23997,8818.0,2,1,43,2,8654.33792,8916.58524,8652.22160,7919.97488,5285.33284,5331.36280,705.44000,0.00000,352.72000,183.23804,193.99600,193.99600,0,university
23998,88180.0,1,2,35,-1,422.91128,808.25788,2384.21084,7627.74636,6045.44444,1526.74852,808.25788,2384.21084,7627.74636,0.00000,1526.74852,1878.05764,0,graduate school


In [39]:
X_train

Unnamed: 0,LIMIT_BAL,PAY_1,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,EDUCATION_graduate school,EDUCATION_others
2458,14108.8,1,528.72728,208.63388,246.90400,200.87404,0.00000,207.75208,0,0
8040,12345.2,0,414.79872,358.01080,317.44800,352.72000,352.72000,352.72000,0,0
2086,19399.6,0,0.00000,847.05708,0.00000,1763.60000,0.00000,1268.91020,0,0
11023,12345.2,-1,33.15568,52.73164,146.20244,59.60968,152.55140,146.90788,0,0
28741,8818.0,0,241.26048,234.55880,204.57760,58.19880,55.37704,64.19504,1,0
...,...,...,...,...,...,...,...,...,...,...
28395,14108.8,0,617.26000,617.26000,599.62400,705.44000,705.44000,529.08000,0,0
18554,12345.2,1,0.00000,212.69016,529.08000,32.62660,35.27200,0.00000,0,0
23874,15872.4,2,187.99976,176.36000,0.00000,0.00000,0.00000,0.00000,0,0
23942,3527.2,0,231.56068,372.11960,705.44000,1058.16000,705.44000,30.68664,0,0


In [38]:
X_train['LIMIT_BAL'].mean()

np.float64(29512.523225736906)