## Importação das bibliotecas

In [64]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import roc_auc_score

import mlflow

sys.path.append(os.path.abspath(".."))
from src.utils import DataWrangling
from src.utils import OneHotFeatureEncoder

### Configurações

In [2]:
# Notebook-wide display configuration.

# Plots: use seaborn's "darkgrid" theme.
sns.set_style('darkgrid')

# pandas: show up to 99 columns when rendering a DataFrame, and opt in to the
# future behaviour that disables silent downcasting.
pd.options.display.max_columns = 99
pd.options.future.no_silent_downcasting = True

## MLflow

In [17]:
# Point the MLflow client at the tracking server running on this machine.
mlflow.set_tracking_uri('http://127.0.0.1:5000')

# Activate the experiment that the runs in this notebook are logged to.
# The ID is passed as a string, which is the type the MLflow API documents
# for `experiment_id` (and the type shown in the returned Experiment).
mlflow.set_experiment(experiment_id='654000327895154401')

<Experiment: artifact_location='mlflow-artifacts:/654000327895154401', creation_time=1741352105343, experiment_id='654000327895154401', last_update_time=1741352105343, lifecycle_stage='active', name='CreditRisk-Matheus', tags={}>

## Leitura dos dados

In [3]:
# Location of the raw Excel workbook, relative to the notebook's working directory.
raw_data_path = '../data/raw/default_of_credit_card_clients__courseware_version_1_21_19.xls'

# Read the workbook into a DataFrame and preview its first five rows.
df = pd.read_excel(io=raw_data_path)
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,798fc410-45c1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,8a8c8f3b-8eb4,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,85698822-43f5,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,0737c11b-be42,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,3b7f77cc-dbc0,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
# DataWrangling is a project transformer imported from src.utils; the details
# of what it does to the frame are defined there, not in this notebook.
wrangling = DataWrangling()

# Fit and apply the transformer through the instance created above (instead of
# building a second, throwaway DataWrangling()), so `wrangling` holds the
# fitted transformer afterwards.
# NOTE: `df` is overwritten in place, so re-running this cell without first
# re-running the load cell applies the wrangling to already-wrangled data.
df = wrangling.fit_transform(df)
df

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT
0,798fc410-45c1,20000,2,2,1,24,2,3913,3102,689,0,0,0,0,689,0,0,0,0,1,university
1,8a8c8f3b-8eb4,120000,2,2,2,26,-1,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1,university
2,85698822-43f5,90000,2,2,2,34,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0,university
3,0737c11b-be42,50000,2,2,1,37,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0,university
4,3b7f77cc-dbc0,50000,1,2,1,57,-1,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0,university
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,ecff42d0-bdc6,220000,1,3,1,39,0,188948,192815,208365,88004,31237,15980,8500,20000,5003,3047,5000,1000,0,high school
29996,99d1fa0e-222b,150000,1,3,2,43,-1,1683,1828,3502,8979,5190,0,1837,3526,8998,129,0,0,0,high school
29997,95cdd3e7-4f24,30000,1,2,2,37,4,3565,3356,2758,20878,20582,19357,0,0,22000,4200,2000,3100,1,university
29998,00d03f02-04cd,80000,1,3,1,41,1,-1645,78379,76304,52774,11855,48944,85900,3409,1178,1926,52964,1804,1,high school


## Separação das bases de treino e teste

In [5]:
# Name of the response column in the wrangled frame.
target_col = 'default payment next month'

# Target vector, and feature matrix without the row identifier and the target.
y = df[target_col]
X = df.drop(columns=['ID', target_col])

# Report the dimensions of both objects.
print(f'Shape de X: {X.shape}')
print(f'Shape de y: {y.shape}')

Shape de X: (29685, 19)
Shape de y: (29685,)


In [6]:
# Hold out 20% of the rows for testing; the split is stratified on the target
# and seeded so it is reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Mean of the target in each partition (the response rate of each split).
print('Taxa de resposta da base de treino:', y_train.mean())
print('Taxa de resposta da base de teste:', y_test.mean())

Taxa de resposta da base de treino: 0.2210712481051036
Taxa de resposta da base de teste: 0.22098703048677784


## Modelos

In [32]:
# Encoder
encoder = OneHotFeatureEncoder()

# Escalonamento de dados
min_max_sc = MinMaxScaler()
std_sc = StandardScaler()

### Regressão Logística

#### Parâmetros

In [23]:
# Primeiro modelo a ser utilizado
C_vals = [80, 90, 100 , 110, 120]
param_C = {'C': C_vals}
param_C

{'C': [80, 90, 100, 110, 120]}

#### Runs

In [None]:
# Logistic-regression MLflow run -- currently DISABLED: every line of this cell
# is commented out, so executing the cell does nothing.
# Kept for reference. When enabled it tunes an L1-penalised LogisticRegression
# (saga solver) over `param_C` with a 5-fold GridSearchCV scored by ROC AUC,
# inside a pipeline of `encoder` -> `min_max_sc`, and logs train/test ROC AUC.
# NOTE(review): pre_dispatch='None' below is the string 'None', not the None
# object -- confirm this is intended before re-enabling the cell.
# NOTE(review): the "Melhor modelo" summary that follows this cell lists a
# PolynomialFeatures step that does not appear in this pipeline.

# with mlflow.start_run():

#         mlflow.sklearn.autolog()

#         lr = LogisticRegression(penalty='l1', 
#                                 solver='saga',
#                                 max_iter=1000)

#         grid_lr = GridSearchCV(lr, param_grid=param_C,
#                         scoring='roc_auc',
#                         n_jobs=None,
#                         refit=True,
#                         cv=5,
#                         pre_dispatch='None',
#                         error_score=np.nan,
#                         return_train_score=True)

#         # Pipeline
#         pipeline = Pipeline([
#                 ('OneHotEnconder', encoder),
#                 ('Scaler', min_max_sc),
#                 ('Model', grid_lr)
#                 ])

#         pipeline.fit(X_train, y_train)

#         y_train_predict_proba = pipeline.predict_proba(X_train)
#         y_test_predict_proba = pipeline.predict_proba(X_test)

#         train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
#         test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

#         mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})



🏃 View run learned-shrew-431 at: http://127.0.0.1:5000/#/experiments/654000327895154401/runs/329393233da9466c95cdee6622bd3f72
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654000327895154401


#### Melhor modelo

Pipeline:

1. OneHotFeatureEncoder()
2. PolynomialFeatures(include_bias=False, interaction_only=True)
3. MinMaxScaler()
4. LogisticRegression(C=100, max_iter=1000, penalty='l1', solver='saga')

### Random Forest

In [51]:
# Hyper-parameter grid for the Random Forest search.
# `max_features` is given as fractions of the available features. The last
# value is written as the float 1.0 on purpose: scikit-learn interprets an
# int 1 as "consider exactly one feature per split", whereas the float 1.0
# means "consider all features", which is what the 0.7 / 0.8 / 0.9 / 1
# progression calls for.
param_grid = {
    'n_estimators': [500],
    'min_samples_leaf': [25],
    'max_depth': [15],
    'max_features': [0.7, 0.8, 0.9, 1.0]
    }

In [None]:
# Random Forest MLflow run -- currently DISABLED: every line of this cell is
# commented out, so executing the cell does nothing.
# Kept for reference. When enabled it tunes a class-weight-balanced
# RandomForestClassifier (random_state=42) over `param_grid` with a 5-fold
# GridSearchCV scored by ROC AUC, inside a pipeline of `encoder` -> `std_sc`,
# and logs train/test ROC AUC.
# NOTE(review): `param_grid` is reassigned by the AdaBoost and XGBoost cells
# later in this notebook; run the Random Forest grid cell immediately before
# re-enabling this one so the intended grid is in scope.

# with mlflow.start_run():

#         mlflow.sklearn.autolog()

#         rfc = RandomForestClassifier(random_state=42, 
#                                      class_weight='balanced',
#                                      n_jobs=-1)

#         grid_rfc = GridSearchCV(rfc, param_grid=param_grid,
#                                 scoring='roc_auc',
#                                 n_jobs=None,
#                                 refit=True,
#                                 cv=5,
#                                 error_score=np.nan,
#                                 return_train_score=True)

#         # Pipeline
#         pipeline = Pipeline([
#                 ('OneHotEnconder', encoder),
#                 ('Scaler', std_sc),
#                 ('Model', grid_rfc)
#                 ])

#         pipeline.fit(X_train, y_train)

#         y_train_predict_proba = pipeline.predict_proba(X_train)
#         y_test_predict_proba = pipeline.predict_proba(X_test)

#         train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
#         test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

#         mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})



🏃 View run gaudy-grub-262 at: http://127.0.0.1:5000/#/experiments/654000327895154401/runs/cc172fd35dfd4f279b178d753e04cb2d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654000327895154401


### AdaBoost

In [58]:
# Hyper-parameter grid for the AdaBoost search: number of boosting rounds
# crossed with the learning rate.
# NOTE: this rebinds the notebook-level name `param_grid`.
param_grid = dict(
    n_estimators=[90, 100, 150, 200],
    learning_rate=[0.8, 0.9, 1, 1.1, 1.2, 1.5],
)

In [59]:
# AdaBoost experiment: tune the classifier with a 5-fold grid search and
# record the train/test ROC AUC in a fresh MLflow run.
with mlflow.start_run():
    # Turn on scikit-learn autologging before any estimator is fitted.
    mlflow.sklearn.autolog()

    ada = AdaBoostClassifier(random_state=42)

    # Exhaustive search over `param_grid`, ranked by ROC AUC; refit=True
    # retrains the best candidate so the search object can predict.
    grid_ada = GridSearchCV(
        estimator=ada,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=5,
        refit=True,
        n_jobs=None,
        error_score=np.nan,
        return_train_score=True,
    )

    # Pipeline: project encoder -> standard scaler -> grid-searched model.
    pipeline = Pipeline(steps=[
        ('OneHotEnconder', encoder),
        ('Scaler', std_sc),
        ('Model', grid_ada),
    ])
    pipeline.fit(X_train, y_train)

    # Predicted class probabilities for both partitions.
    y_train_predict_proba = pipeline.predict_proba(X_train)
    y_test_predict_proba = pipeline.predict_proba(X_test)

    # ROC AUC computed from the second probability column.
    train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:, 1])
    test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:, 1])

    mlflow.log_metrics({
        'roc_auc_train': train_roc_auc,
        'roc_auc_test': test_roc_auc,
    })



🏃 View run popular-chimp-351 at: http://127.0.0.1:5000/#/experiments/654000327895154401/runs/bb6aab4748ba4b56b928fbd6222bcf8f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654000327895154401


### XGBoost

In [67]:
# Hyper-parameter search space for XGBoost: three candidate values for each
# of seven parameters.
# NOTE: this rebinds the notebook-level name `param_grid`.
param_grid = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.3],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

In [70]:
# XGBoost experiment: sample hyper-parameter combinations with a 5-fold
# randomized search and record the train/test ROC AUC in a fresh MLflow run.
with mlflow.start_run():

        # NOTE(review): the other runs in this notebook enable
        # mlflow.sklearn.autolog(); confirm the xgboost flavour is the one
        # intended for a model wrapped in a scikit-learn search and pipeline.
        mlflow.xgboost.autolog()

        # Seed the booster so that row/column subsampling is reproducible,
        # matching the random_state=42 used by the other models here.
        xgb = XGBClassifier(random_state=42)

        # Seed the search as well: without random_state every execution draws
        # a different set of n_iter candidates from `param_grid`, so the
        # selected model and the logged metrics could not be reproduced.
        grid_xgb = RandomizedSearchCV(xgb, param_distributions=param_grid,
                                      scoring='roc_auc',
                                      n_iter= 1000,
                                      n_jobs=-1,
                                      refit=True,
                                      cv=5,
                                      random_state=42,
                                      error_score=np.nan,
                                      return_train_score=True)

        # Pipeline: project encoder -> standard scaler -> randomized search.
        pipeline = Pipeline([
                ('OneHotEnconder', encoder),
                ('Scaler', std_sc),
                ('Model', grid_xgb)
                ])

        pipeline.fit(X_train, y_train)

        # Predicted class probabilities for both partitions.
        y_train_predict_proba = pipeline.predict_proba(X_train)
        y_test_predict_proba = pipeline.predict_proba(X_test)

        # ROC AUC computed from the second probability column.
        train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
        test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

        mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})





🏃 View run masked-duck-539 at: http://127.0.0.1:5000/#/experiments/654000327895154401/runs/e080e0a1958944d68fa038c07fd410f5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654000327895154401
