# Segunda seção de modelos

Variáveis criadas e originais com IV até 0.5 e que não são correlacionadas

# Pacotes e funções

In [1]:
import pandas as pd 
import numpy as np
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, average_precision_score, brier_score_loss, confusion_matrix, classification_report, ConfusionMatrixDisplay, precision_recall_curve, log_loss
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost
from catboost import CatBoostClassifier
import lightgbm
from hyperopt import fmin, tpe, Trials, hp
from functools import partial
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Bases pré-processadas

In [2]:
treino_full = pd.read_parquet('../3.Pre_processamento/treino_pp.parquet')
val_full = pd.read_parquet('../3.Pre_processamento/validacao_pp.parquet')

In [3]:
# Fase amostral: 10% da base de treino
treino = treino_full.sample(frac=.1, random_state=123)

In [4]:
# Segunda seção de modelos
# Variáveis criadas e originais com IV até 0.5 e que não são correlacionadas
colunas = ['used_pin_number', 'log_DFH_padro', 'log_DFLT_padro', 'RROO', 'used_chip']

X_treino = treino[colunas]
y_treino = treino['fraud']

X_val = val_full[colunas]
y_val = val_full['fraud']

In [6]:
y_treino.value_counts(normalize=True)

fraud
0    0.91424
1    0.08576
Name: proportion, dtype: float64

# Set do experimento

In [5]:
experiment = mlflow.set_experiment(experiment_id = '189457669412080989')
experiment.experiment_id

'189457669412080989'

# Modelos

## Regressão logística

In [7]:
LR = sm.GLM(y_treino, sm.add_constant(X_treino), family=sm.families.Binomial())
LR_results = LR.fit()

In [8]:
LR_results.summary()

0,1,2,3
Dep. Variable:,fraud,No. Observations:,78300.0
Model:,GLM,Df Residuals:,78294.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-20496.0
Date:,"Tue, 29 Apr 2025",Deviance:,40991.0
Time:,19:40:28,Pearson chi2:,93000.0
No. Iterations:,9,Pseudo R-squ. (CS):,0.05985
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.1605,0.033,-97.073,0.000,-3.224,-3.097
used_pin_number,-3.6261,0.209,-17.315,0.000,-4.037,-3.216
log_DFH_padro,0.3313,0.015,22.826,0.000,0.303,0.360
log_DFLT_padro,0.2085,0.013,15.814,0.000,0.183,0.234
RROO,1.3823,0.035,39.484,0.000,1.314,1.451
used_chip,-0.4942,0.030,-16.702,0.000,-0.552,-0.436


In [9]:
if __name__ == '__main__':
    
    with mlflow.start_run(run_name = 'LR_Sample_VarOrigCria', experiment_id = experiment.experiment_id) as run: 
        
        signature = infer_signature(X_treino, LR_results.predict(sm.add_constant(X_treino)))
        mlflow.statsmodels.log_model(LR_results, signature=signature, artifact_path='modelo')
        
        mlflow.log_params(LR_results.params)

        # Log das métricas na base de TREINO
        mlflow.log_metric('AUC_PR_Treino', average_precision_score(y_treino, LR_results.predict(sm.add_constant(X_treino))))
        mlflow.log_metric('AUC_ROC_Treino', roc_auc_score(y_treino, LR_results.predict(sm.add_constant(X_treino))))
        mlflow.log_metric('BS_Treino', brier_score_loss(y_treino, LR_results.predict(sm.add_constant(X_treino))))
        mlflow.log_metric('Log_Loss_Treino', log_loss(y_treino, LR_results.predict(sm.add_constant(X_treino)), normalize=True))

        # Log das métricas na base de VALIDAÇÃO
        mlflow.log_metric('AUC_PR_Val', average_precision_score(y_val, LR_results.predict(sm.add_constant(X_val))))
        mlflow.log_metric('AUC_ROC_Val', roc_auc_score(y_val, LR_results.predict(sm.add_constant(X_val))))
        mlflow.log_metric('BS_Val', brier_score_loss(y_val, LR_results.predict(sm.add_constant(X_val))))
        mlflow.log_metric('Log_Loss_Val', log_loss(y_val, LR_results.predict(sm.add_constant(X_val)), normalize=True))



## Função objetivo

In [10]:
def func_objetivo_CV(parametros, modelo, folds, expr, X, y):
    # função objetivo para "minimizar", mas dependendo da métrica de interesse, na realidade, é maximizar 
    # parametros é o espaço paramétrico a ser explorado
    # expr é uma string que representa o id do experimento que foi criado
    # modelo é uma string de qual modelo será rodado: Random Forest ou XGBoost
    # folds é um int que diz quantos folds de validação serão usados
    # X e y são as bases que serão aplicadas o cross-validation

    # O output é o valor do score a ser minimizado/maximizado
    
    with mlflow.start_run(nested = True, experiment_id=expr) as run:

        SKF = StratifiedKFold(n_splits = folds, shuffle=True, random_state=1234)

        if modelo == 'RF':
            clf = RandomForestClassifier(**parametros) 
            clf.fit(X, y)
        elif modelo == 'XGB':
            clf = xgboost.XGBClassifier(**parametros)
            clf.fit(X, y)
        elif modelo == 'CAT':
            clf = CatBoostClassifier(**parametros, verbose=False, cat_features = ['used_chip', 'repeat_retailer', 'used_pin_number', 'online_order'], loss_function = 'Logloss')
            clf.fit(X, y)
        elif modelo == 'LGBM':
            clf = lightgbm.LGBMClassifier(**parametros, verbose=-1)
            clf.fit(X, y)
        
        score = cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='neg_log_loss').mean()

        # Log de parâmetros e métricas

        mlflow.log_params(clf.get_params())
        mlflow.log_metric('average_precision_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='average_precision').mean())
        mlflow.log_metric('roc_auc_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='roc_auc').mean())
        mlflow.log_metric('neg_brier_score_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='neg_brier_score').mean())
        mlflow.log_metric('neg_log_loss_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='neg_log_loss').mean())
        
        signature = infer_signature(X, clf.predict_proba(X))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

    return -score

## Random Forest

In [11]:
# Com tunning de hiperparâmetros

if __name__ == '__main__':
    
    #test_imbalanced = [{0: len(y_treino2)/(2*np.bincount(y_treino2))[0], 1:len(y_teste2)/(2*np.bincount(y_teste2))[1]}, {0: 1, 1:1}]
    
    space = {
        "n_estimators": hp.choice('n_estimators', np.arange(10, 100, dtype=int)),
        "max_depth": hp.choice('max_depth', np.arange(10, 50, dtype=int)),
        "min_samples_leaf": hp.choice('min_samples_leaf', np.arange(100, 500, dtype=int)),
        "min_samples_split": hp.choice('min_samples_split', np.arange(100, 500, dtype=int)),
        "criterion": hp.choice("criterion", ['gini', 'entropy', 'log_loss']),
        "class_weight": hp.choice("class_weight", ['balanced', 'balanced_subsample', None]),
        "max_features": hp.uniform("max_features", 0.1, 1),
        "max_samples": hp.uniform("max_samples", 0.1, 1)
    }
    
    with mlflow.start_run(run_name = 'RF_Sample_VarOrigCria_CV', experiment_id=experiment.experiment_id) as run:
        best_params = fmin(
            fn = partial(
                func_objetivo_CV,
                expr = experiment.experiment_id,
                modelo = 'RF',
                X = X_treino,
                y = y_treino,
                folds = 5
            ),
            space = space,
            algo = tpe.suggest,
            max_evals = 25,
            trials = Trials(),
            timeout = 10
        )

        if best_params['criterion'] == 0:
            best_params['criterion'] = 'gini'
        elif best_params['criterion'] == 1:
            best_params['criterion'] = 'entropy'
        else:
            best_params['criterion'] = 'log_loss'
            

        if best_params['class_weight'] == 0:
            best_params['class_weight'] = 'balanced'
        elif best_params['class_weight'] == 1:
            best_params['class_weight'] = 'balanced_subsample'
        else:
            best_params['class_weight'] = None
        
        # Identificado o melhor conjunto de hiperparâmetros, treina o modelo com toda a base de treino e metrifica os escores na base de validação

        clf = RandomForestClassifier(**best_params)
        clf.fit(X_treino, y_treino)
                   
        mlflow.log_params(clf.get_params())
        mlflow.log_metric('AUC_PR_Val', average_precision_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('Log_Loss_Val', log_loss(y_val, clf.predict_proba(X_val)[:,1], normalize=True))
        mlflow.log_metric('AUC_ROC_Val', roc_auc_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('BS_Val', brier_score_loss(y_val, clf.predict_proba(X_val)[:,1]))

        # Log das métricas na base de TREINO
        mlflow.log_metric('AUC_PR_Treino', average_precision_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('AUC_ROC_Treino', roc_auc_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('BS_Treino', brier_score_loss(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('Log_Loss_Treino', log_loss(y_treino, clf.predict_proba(X_treino)[:,1], normalize=True))

        signature = infer_signature(X_treino, clf.predict_proba(X_treino))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]




  4%|▍         | 1/25 [01:13<29:32, 73.86s/trial, best loss: 0.22249764798969957]




## XGB

In [12]:
# Com tunning de hiperparâmetros

if __name__ == '__main__':
    
    space = {
        "n_estimators": hp.choice('n_estimators', np.arange(10, 150, dtype=int)),
        "max_depth": hp.choice('max_depth', np.arange(10, 150, dtype=int)),
        "min_child_weight": hp.choice('min_child_weight', np.arange(100, 500, dtype=int)),
        "colsample_bytree": hp.quniform('colsample_bytree', 0.2, 1, 0.05),
        "colsample_bylevel": hp.quniform('colsample_bylevel', 0.2, 1, 0.05),
        "subsample": hp.quniform('subsample', 0.2, 1, 0.05),
        "colsample_bynode": hp.quniform('colsample_bynode', 0.2, 1, 0.05),
        "base_score": hp.quniform('base_score', 0.1, 1, 0.05),
        "learning_rate": hp.quniform('learning_rate', 0.0025, 0.5, 0.025),
        "gamma": hp.choice('gamma', np.arange(1, 20, dtype=int)),
        "lambda": hp.choice('lambda', np.arange(1, 20, dtype=int)),
        "alpha": hp.choice('alpha', np.arange(1, 20, dtype=int)),

    }
    
    with mlflow.start_run(run_name = 'XGB_Sample_VarOrigCria_CV', experiment_id=experiment.experiment_id) as run:
        best_params = fmin(
            fn = partial(
                func_objetivo_CV,
                expr = experiment.experiment_id,
                modelo = 'XGB',
                X = X_treino,
                y = y_treino,
                folds = 5
            ),
            space = space,
            algo = tpe.suggest,
            max_evals = 25,
            trials = Trials(),
            timeout = 10
        )
        
        # Identificado o melhor conjunto de hiperparâmetros, treina o modelo com toda a base de treino e metrifica os escores na base de validação

        clf = xgboost.XGBClassifier(**best_params)
        clf.fit(X_treino, y_treino)
                   
        mlflow.log_params(clf.get_params())
        mlflow.log_metric('AUC_PR_Val', average_precision_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('Log_Loss_Val', log_loss(y_val, clf.predict_proba(X_val)[:,1], normalize=True))
        mlflow.log_metric('AUC_ROC_Val', roc_auc_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('BS_Val', brier_score_loss(y_val, clf.predict_proba(X_val)[:,1]))

        # Log das métricas na base de TREINO
        mlflow.log_metric('AUC_PR_Treino', average_precision_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('AUC_ROC_Treino', roc_auc_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('BS_Treino', brier_score_loss(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('Log_Loss_Treino', log_loss(y_treino, clf.predict_proba(X_treino)[:,1], normalize=True))

        signature = infer_signature(X_treino, clf.predict_proba(X_treino))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]




  4%|▍         | 1/25 [00:13<05:16, 13.20s/trial, best loss: 0.24764082426350761]




## LGBM

In [13]:
# Com tunning de hiperparâmetros

if __name__ == '__main__':
    
    space = {
        "n_estimators": hp.choice('n_estimators', np.arange(10, 150, dtype=int)), 
        "max_depth": hp.choice('max_depth', np.arange(10, 50, dtype=int)), 
        "min_child_weight": hp.choice('min_child_weight', np.arange(100, 500, dtype=int)),
        "colsample_bytree": hp.quniform('colsample_bytree', 0.2, 1, 0.05),
        "subsample": hp.quniform('subsample', 0.2, 1, 0.05), 
        "colsample_bytree": hp.quniform('colsample_bynode', 0.2, 1, 0.05),
        "learning_rate": hp.quniform('learning_rate', 0.0025, 0.5, 0.025), 
        "reg_lambda": hp.choice('reg_lambda', np.arange(1, 20, dtype=float)),
        "reg_alpha": hp.choice('reg_alpha', np.arange(1, 20, dtype=float))

    }
    
    with mlflow.start_run(run_name = 'LGBM_Sample_VarOrigCria_CV', experiment_id=experiment.experiment_id) as run:
        best_params = fmin(
            fn = partial(
                func_objetivo_CV,
                expr = experiment.experiment_id,
                modelo = 'LGBM',
                X = X_treino,
                y = y_treino,
                folds = 5
            ),
            space = space,
            algo = tpe.suggest,
            max_evals = 25,
            trials = Trials(),
            timeout = 10
        )
        
        # Identificado o melhor conjunto de hiperparâmetros, treina o modelo com toda a base de treino e metrifica os escores na base de validação

        clf = lightgbm.LGBMClassifier(**best_params)
        clf.fit(X_treino, y_treino)
                   
        mlflow.log_params(clf.get_params())
        mlflow.log_metric('AUC_PR_Val', average_precision_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('Log_Loss_Val', log_loss(y_val, clf.predict_proba(X_val)[:,1], normalize=True))
        mlflow.log_metric('AUC_ROC_Val', roc_auc_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('BS_Val', brier_score_loss(y_val, clf.predict_proba(X_val)[:,1]))

        # Log das métricas na base de TREINO
        mlflow.log_metric('AUC_PR_Treino', average_precision_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('AUC_ROC_Treino', roc_auc_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('BS_Treino', brier_score_loss(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('Log_Loss_Treino', log_loss(y_treino, clf.predict_proba(X_treino)[:,1], normalize=True))

        signature = infer_signature(X_treino, clf.predict_proba(X_treino))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]




  4%|▍         | 1/25 [00:06<02:33,  6.39s/trial, best loss: 0.24362677219584725]




  8%|▊         | 2/25 [00:14<02:45,  7.20s/trial, best loss: 0.2130010421570317] 


