## Importação das bibliotecas

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from sklearn.pipeline import Pipeline 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import roc_auc_score

import mlflow

sys.path.append(os.path.abspath(".."))
from src.utils import DataWrangling
from src.utils import OneHotFeatureEncoder

### Configurações

In [3]:
pd.set_option('display.max_columns', 99)
pd.set_option('future.no_silent_downcasting', True)

sns.set_style('darkgrid')

## MLflow

In [4]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment(experiment_id=654000327895154401)

<Experiment: artifact_location='mlflow-artifacts:/654000327895154401', creation_time=1741352105343, experiment_id='654000327895154401', last_update_time=1741352105343, lifecycle_stage='active', name='CreditRisk-Matheus', tags={}>

## Leitura dos dados

In [66]:
raw_data_path = '../data/raw/default_of_credit_card_clients__courseware_version_1_21_19.xls'

df = pd.read_excel(raw_data_path)
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,798fc410-45c1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,8a8c8f3b-8eb4,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,85698822-43f5,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,0737c11b-be42,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,3b7f77cc-dbc0,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [67]:
wrangling = DataWrangling()

df = DataWrangling().fit_transform(df)
df

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month,EDUCATION_CAT
0,20000,2,2,1,24,2,690.09668,547.06872,121.51204,0.00000,0.00000,0.00000,0.00000,121.51204,0.00000,0.00000,0.00000,0.00000,1,university
1,120000,2,2,2,26,-1,472.99752,304.22100,472.99752,577.04992,609.32380,575.10996,0.00000,176.36000,176.36000,176.36000,0.00000,352.72000,1,university
2,90000,2,2,2,34,0,5156.59004,2473.80172,2391.26524,2527.41516,2636.22928,2742.22164,267.71448,264.54000,176.36000,176.36000,176.36000,881.80000,0,university
3,50000,2,2,1,37,0,8287.15640,8506.37188,8692.96076,4993.45704,5107.20924,5210.90892,352.72000,356.07084,211.63200,193.99600,188.52884,176.36000,0,university
4,50000,1,2,1,57,-1,1519.69412,999.96120,6319.86060,3692.97840,3376.58856,3373.94316,352.72000,6469.06116,1763.60000,1587.24000,121.51204,119.74844,0,university
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,1,3,1,39,0,33322.86928,34004.85340,36747.25140,15520.38544,5508.95732,2818.23280,1499.06000,3527.20000,882.32908,537.36892,881.80000,176.36000,0,high school
29996,150000,1,3,2,43,-1,296.81388,322.38608,617.61272,1583.53644,915.30840,0.00000,323.97332,621.84536,1586.88728,22.75044,0.00000,0.00000,0,high school
29997,30000,1,2,2,37,4,628.72340,591.86416,486.40088,3682.04408,3629.84152,3413.80052,0.00000,0.00000,3879.92000,740.71200,352.72000,546.71600,1,university
29998,80000,1,3,1,41,1,-290.11220,13822.92044,13456.97344,9307.22264,2090.74780,8631.76384,15149.32400,601.21124,207.75208,339.66936,9340.73104,318.15344,1,high school


## Separação dos modelos de treino e teste

In [69]:
X = df.drop(['default payment next month', 'SEX'], axis=1)
y = df['default payment next month']

print('Shape de X:', X.shape)
print('Shape de y:', y.shape)

Shape de X: (29685, 18)
Shape de y: (29685,)


In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42,
                                                    stratify=y)

print('Taxa de resposta da base de treino:', y_train.mean())
print('Taxa de resposta da base de teste:', y_test.mean())

Taxa de resposta da base de treino: 0.2210712481051036
Taxa de resposta da base de teste: 0.22098703048677784


## Modelos

In [None]:
# Encoder
encoder = OneHotFeatureEncoder()

# Escalonamento de dados
min_max_sc = MinMaxScaler()
std_sc = StandardScaler()

selector = SelectKBest(score_func=f_classif, k=10)

### Regressão Logística

#### Paramentros

In [13]:
# Primeiro modelo a ser utilizado
C_vals = [80, 90, 100 , 110, 120]
param_C = {'C': C_vals}
param_C

{'C': [80, 90, 100, 110, 120]}

#### Runs

In [14]:
# with mlflow.start_run():

#         mlflow.sklearn.autolog()

#         lr = LogisticRegression(penalty='l1', 
#                                 solver='saga',
#                                 max_iter=1000)

#         grid_lr = GridSearchCV(lr, param_grid=param_C,
#                         scoring='roc_auc',
#                         n_jobs=None,
#                         refit=True,
#                         cv=5,
#                         pre_dispatch='None',
#                         error_score=np.nan,
#                         return_train_score=True)

#         # Pipeline
#         pipeline = Pipeline([
#                 ('OneHotEnconder', encoder),
#                 ('Scaler', min_max_sc),
#                 ('Model', grid_lr)
#                 ])

#         pipeline.fit(X_train, y_train)

#         y_train_predict_proba = pipeline.predict_proba(X_train)
#         y_test_predict_proba = pipeline.predict_proba(X_test)

#         train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
#         test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

#         mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})

#### Melhor modelo

Pipeline:

1. OneHotFeatureEncoder()
2. PolynomialFeatures(include_bias=False, interaction_only=True)
3. MinMaxScaler()
4. LogisticRegression(C=100, max_iter=1000, penalty='l1', solver='saga')

## Random Forest

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300, 500, 700, 800, 900],
    'min_samples_leaf': [2, 5, 9, 10],
    'max_depth': [14, 15, 16, 20, None],
    'max_features': ['sqrt', 'log2', 0.3, 0.5, 1],
    'bootstrap'=[False, True]
    }

In [77]:
with mlflow.start_run():

        mlflow.sklearn.autolog()

        rfc = RandomForestClassifier(random_state=42, 
                                     class_weight='balanced',
                                     n_jobs=-1)

        grid_rfc = RandomizedSearchCV(rfc, param_distributions=param_grid,
                                      n_iter= 500,
                                scoring='roc_auc',
                                n_jobs=None,
                                refit=True,
                                cv=5,
                                error_score=np.nan,
                                return_train_score=True)

        # Pipeline
        pipeline = Pipeline([
                ('OneHotEnconder', encoder),
                ('Scaler', std_sc),
                ('Selector', selector),
                ('Model', grid_rfc)
                ])

        pipeline.fit(X_train, y_train)

        y_train_predict_proba = pipeline.predict_proba(X_train)
        y_test_predict_proba = pipeline.predict_proba(X_test)

        train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
        test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

        mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})



🏃 View run serious-koi-996 at: http://127.0.0.1:5000/#/experiments/654000327895154401/runs/6e0cd5c49daf45c4a9413ecd2dee8a65
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654000327895154401


KeyboardInterrupt: 

## AdaBoost

In [17]:
param_grid = {
    'n_estimators': [90, 100, 150, 200],
    'learning_rate': [0.8, 0.9, 1, 1.1, 1.2, 1.5]
}

In [18]:
# with mlflow.start_run():

#         mlflow.sklearn.autolog()

#         ada = AdaBoostClassifier(random_state=42)

#         grid_ada = GridSearchCV(ada, param_grid=param_grid,
#                                 scoring='roc_auc',
#                                 n_jobs=None,
#                                 refit=True,
#                                 cv=5,
#                                 error_score=np.nan,
#                                 return_train_score=True)

#         # Pipeline
#         pipeline = Pipeline([
#                 ('OneHotEnconder', encoder),
#                 ('Scaler', std_sc),
#                 ('Model', grid_ada)
#                 ])

#         pipeline.fit(X_train, y_train)

#         y_train_predict_proba = pipeline.predict_proba(X_train)
#         y_test_predict_proba = pipeline.predict_proba(X_test)

#         train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
#         test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

#         mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})

## XGBoost

In [19]:
param_grid = {
    'n_estimators': [450, 500, 550],  # Testando valores próximos de 500
    'learning_rate': [0.005, 0.01, 0.02],  # Ajuste fino da taxa de aprendizado
    'max_depth': [6, 7, 8],  # Testando variações em torno de 7
    'min_child_weight': [4, 5, 6],  # Ajuste fino do peso mínimo das folhas
    'gamma': [0.05, 0.1, 0.15],  # Pequena variação no Gamma
    'colsample_bytree': [0.75, 0.8, 0.85]  # Pequena variação na amostragem de colunas
}

In [20]:
# with mlflow.start_run():

#         mlflow.xgboost.autolog()

#         xgb = XGBClassifier()

#         grid_xgb = GridSearchCV(xgb, param_grid=param_grid,
#                                      scoring='roc_auc',
#                                      n_jobs=-1,
#                                      refit=True,
#                                      cv=5,
#                                      error_score=np.nan,
#                                      return_train_score=True)

#         # Pipeline
#         pipeline = Pipeline([
#                 ('OneHotEnconder', encoder),
#                 ('Scaler', std_sc),
#                 ('Model', grid_xgb)
#                 ])

#         pipeline.fit(X_train, y_train)

#         y_train_predict_proba = pipeline.predict_proba(X_train)
#         y_test_predict_proba = pipeline.predict_proba(X_test)

#         train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
#         test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])

#         mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})

In [22]:
X_train

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,EDUCATION_CAT
2458,80000,2,2,2,25,1,5077.22804,5471.74536,5537.88036,5641.05096,5726.05648,5607.18984,528.72728,208.63388,246.90400,200.87404,0.00000,207.75208,university
8040,70000,1,3,2,32,0,11544.52560,11590.55556,8022.96912,8021.20552,8177.98956,8395.26508,414.79872,358.01080,317.44800,352.72000,352.72000,352.72000,high school
2086,110000,1,2,1,36,0,10076.32860,9814.08128,10496.59448,10222.53104,11815.23820,11597.78632,0.00000,847.05708,0.00000,1763.60000,0.00000,1268.91020,university
11023,70000,1,2,2,48,-1,34.03748,33.15568,52.73164,146.20244,59.60968,152.55140,33.15568,52.73164,146.20244,59.60968,152.55140,146.90788,university
28741,50000,2,1,2,29,0,5094.86404,3845.17708,3701.79640,2863.38096,1721.09724,1550.02804,241.26048,234.55880,204.57760,58.19880,55.37704,64.19504,graduate school
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28395,80000,2,3,1,43,0,12628.08144,12930.00976,13233.87804,13508.64692,13881.64832,14349.53140,617.26000,617.26000,599.62400,705.44000,705.44000,529.08000,high school
18554,70000,2,2,2,25,1,954.46032,913.01572,1109.48076,1602.23060,1634.85720,1202.95156,0.00000,212.69016,529.08000,32.62660,35.27200,0.00000,university
23874,90000,1,2,2,27,2,448.30712,627.13616,791.85640,0.00000,0.00000,0.00000,187.99976,176.36000,0.00000,0.00000,0.00000,0.00000,university
23942,20000,1,2,2,24,0,2774.14280,2958.61536,6178.94896,2591.43384,2982.95304,2482.09064,231.56068,372.11960,705.44000,1058.16000,705.44000,30.68664,university


In [28]:
X_train = X_train[['LIMIT_BAL', 'PAY_1']]
X_test = X_test[['LIMIT_BAL', 'PAY_1']]


with mlflow.start_run():
 
        mlflow.xgboost.autolog()
 
        xgb = XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=5, missing=np.nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=500, n_jobs=None,
              num_parallel_tree=None, random_state=None)
 
 
        # Pipeline
        pipeline = Pipeline([
                ('Scaler', std_sc),
                ('Model', xgb)
                ])
 
        pipeline.fit(X_train, y_train)
 
        y_train_predict_proba = pipeline.predict_proba(X_train)
        y_test_predict_proba = pipeline.predict_proba(X_test)
 
        train_roc_auc = roc_auc_score(y_train, y_train_predict_proba[:,1])
        test_roc_auc = roc_auc_score(y_test, y_test_predict_proba[:,1])
 
        mlflow.log_metrics({'roc_auc_train': train_roc_auc, 'roc_auc_test': test_roc_auc})



🏃 View run bold-kit-313 at: http://127.0.0.1:5000/#/experiments/654000327895154401/runs/aaca8be5551b43a7870d50be397b4cc4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654000327895154401


In [None]:
client = mlflow.client.MlflowClient()
version = max([int(i.version) for i in client.get_latest_versions('CreditRisk')])
model = mlflow.sklearn.load_model(f'models:/CreditRisk/{version}')
model.

  version = max([int(i.version) for i in client.get_latest_versions('CreditRisk')])
