# Objetivo

Identificar as relações entre as variáveis explicativas e a variável resposta, usando diferentes modelos. No mundo real, a detecção de fraude precisa ser feita rapidamente, de modo que haja equilíbrio entre os falsos positivos e os falsos negativos.

Avaliação de alguns testes:

1. Modelo aplicado na base completa (desbalanceado);

2. Modelo aplicado em folds balanceados (undersampling e oversampling) e validado em folds originais (desbalanceados).

OBS: será usada uma amostra para este estudo.

# Pacotes e funções

In [1]:
import pandas as pd 
import numpy as np
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, average_precision_score, brier_score_loss, confusion_matrix, classification_report, ConfusionMatrixDisplay, precision_recall_curve, log_loss
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost
from catboost import CatBoostClassifier
import lightgbm
from hyperopt import fmin, tpe, Trials, hp
from functools import partial
import statsmodels.api as sm

In [8]:
#!pip install xgboost
#!pip install catboost
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension
#!pip install hyperopt
#!pip install lightgbm

# Bases pré-processadas

In [2]:
# Load the pre-processed training and validation sets from parquet files.
treino_full = pd.read_parquet('../3.Pre_processamento/treino_pp.parquet')
val_full = pd.read_parquet('../3.Pre_processamento/validacao_pp.parquet')

In [3]:
# (rows, columns) of the full training set.
treino_full.shape

(783000, 14)

In [4]:
# Draw a 10% random sample of the training set; the fixed seed keeps it reproducible.
treino = treino_full.sample(frac=.1, random_state=123)

In [5]:
# (rows, columns) of the sampled training set.
treino.shape

(78300, 14)

In [6]:
# Relative frequency of each value of the 'fraud' target in the sample.
treino['fraud'].value_counts()/treino.shape[0]

fraud
0    0.91424
1    0.08576
Name: count, dtype: float64

In [13]:
# Log-odds of fraud in the sample: log(P(fraud=1) / P(fraud=0)), using the
# class proportions printed by the previous cell.
np.log(0.08576/0.91424)

-2.3665404217199164

In [7]:
# Preview the first rows of the sampled training set.
treino.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,log_DFH,log_DFLT,log_RTMPP,log_DFH_padro,log_DFLT_padro,log_RTMPP_padro
275847,3.132982,0.136709,0.905399,1,0,0,1,0,1.141985,-1.989898,-0.09938,-0.82667,-1.104029,-0.08909
287479,20.395077,0.805684,2.582759,1,0,1,1,0,3.015294,-0.216063,0.948858,0.510707,-0.118208,0.865273
737603,55.31753,2.933958,0.573614,1,0,0,1,0,4.01309,1.076352,-0.555799,1.223046,0.60006,-0.504634
136843,378.405321,1.208453,2.18268,1,0,0,1,1,5.935966,0.189341,0.780553,2.595811,0.107098,0.712041
313895,9.967356,1.027505,0.781084,1,0,0,1,0,2.299315,0.027133,-0.247073,-0.000438,0.01695,-0.223556


In [8]:
# Only the variables with IV up to 0.5, since the dataset has few variables
colunas = ['repeat_retailer','used_chip','used_pin_number','online_order','log_DFH_padro','log_DFLT_padro']

# Training features/target come from the sampled training set.
X_treino = treino[colunas]
y_treino = treino['fraud']

# Validation features/target come from the full (unsampled) validation set.
X_val = val_full[colunas]
y_val = val_full['fraud']

In [9]:
# Fraud rate (share of fraud == 1) in the training sample and in the validation set.
y_treino.sum()/len(y_treino), y_val.sum()/len(y_val)

(0.08575989782886334, 0.0874017094017094)

# Criação do experimento

In [None]:
#experiment = mlflow.create_experiment(name = 'Modelos',
#                                      artifact_location = 'Artf_Modelos',
#                                      tags = {'Environment': 'Development', 'Version': '1.0.0'})

In [None]:
#experiment

'189457669412080989'

In [10]:
# Activate the existing MLflow experiment by its id and keep the returned
# experiment object; its id is reused by the runs started further below.
experiment = mlflow.set_experiment(experiment_id = '189457669412080989')
experiment.experiment_id

'189457669412080989'

# Modelos

## Regressão logística

In [11]:
# References on interpreting the (null / residual) deviance of a GLM:
# https://bookdown.org/egarpor/PM-UC3M/glm-deviance.html
# https://stats.stackexchange.com/questions/108995/interpreting-residual-and-null-deviance-in-glm-r

# Binomial-family GLM of the fraud flag on the selected features;
# add_constant appends the intercept column to the design matrix.
LR = sm.GLM(y_treino, sm.add_constant(X_treino), family=sm.families.Binomial())
LR_results = LR.fit()

In [12]:
# Show the fitted GLM's summary table (coefficients, standard errors, deviance).
LR_results.summary()

0,1,2,3
Dep. Variable:,fraud,No. Observations:,78300.0
Model:,GLM,Df Residuals:,78293.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-19206.0
Date:,"Wed, 19 Mar 2025",Deviance:,38413.0
Time:,21:16:38,Pearson chi2:,71100.0
No. Iterations:,9,Pseudo R-squ. (CS):,0.09031
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.8151,0.068,-41.415,0.000,-2.948,-2.682
repeat_retailer,-1.5955,0.054,-29.725,0.000,-1.701,-1.490
used_chip,-0.5194,0.030,-17.221,0.000,-0.579,-0.460
used_pin_number,-3.6776,0.210,-17.543,0.000,-4.089,-3.267
online_order,2.4200,0.054,44.405,0.000,2.313,2.527
log_DFH_padro,0.7188,0.017,43.251,0.000,0.686,0.751
log_DFLT_padro,0.2127,0.013,15.791,0.000,0.186,0.239


## Função objetivo

Função objetivo a ser minimizada para encontrar o melhor conjunto de hiperparâmetros por meio de validação cruzada.

In [17]:
def func_objetivo_CV(parametros, modelo, folds, expr, X, y):
    """Hyperopt objective: mean cross-validated log loss of one candidate model.

    Each call opens a nested MLflow run in which the candidate's parameters,
    its cross-validated metrics and the model fitted on all of ``X``/``y``
    are logged.

    Parameters
    ----------
    parametros : dict
        Hyperparameters of the candidate (one point of the search space),
        passed as keyword arguments to the classifier constructor.
    modelo : str
        Which classifier to build: ``'RF'`` (RandomForestClassifier),
        ``'XGB'`` (XGBClassifier), ``'CAT'`` (CatBoostClassifier) or
        ``'LGBM'`` (LGBMClassifier).
    folds : int
        Number of stratified folds used in the cross-validation.
    expr : str
        Id of the MLflow experiment in which the nested run is created.
    X, y
        Features and target on which the cross-validation is applied.

    Returns
    -------
    float
        The negated mean ``neg_log_loss`` over the folds, i.e. the mean
        log loss, which is the quantity to be minimised.

    Raises
    ------
    ValueError
        If ``modelo`` is not one of the supported identifiers.
    """
    # Fail fast, before any MLflow run is opened, instead of hitting an
    # unbound ``clf`` further down.
    if modelo not in ('RF', 'XGB', 'CAT', 'LGBM'):
        raise ValueError(
            f"Unsupported modelo {modelo!r}; expected 'RF', 'XGB', 'CAT' or 'LGBM'."
        )

    # Local import: only cross_val_score is imported at the top of the notebook.
    from sklearn.model_selection import cross_validate

    with mlflow.start_run(nested = True, experiment_id=expr) as run:

        SKF = StratifiedKFold(n_splits = folds, shuffle=True, random_state=1234)

        if modelo == 'RF':
            clf = RandomForestClassifier(**parametros)
        elif modelo == 'XGB':
            clf = xgboost.XGBClassifier(**parametros)
        elif modelo == 'CAT':
            clf = CatBoostClassifier(**parametros)
        else:
            clf = lightgbm.LGBMClassifier(**parametros)

        # Fit on the whole data set: this fitted model is the one used for
        # the signature and logged as an artifact. The cross-validation
        # below works on unfitted clones of the estimator.
        clf.fit(X, y)

        # One cross-validation pass evaluates every metric on the same fold
        # models, instead of refitting all folds once per metric.
        cv_scores = cross_validate(
            estimator = clf,
            X = X,
            y = y,
            cv = SKF,
            scoring = ['neg_log_loss', 'average_precision', 'roc_auc', 'neg_brier_score'],
        )
        score = cv_scores['test_neg_log_loss'].mean()

        # Log parameters and metrics
        mlflow.log_params(clf.get_params())
        mlflow.log_metric('average_precision_cv', cv_scores['test_average_precision'].mean())
        mlflow.log_metric('roc_auc_cv', cv_scores['test_roc_auc'].mean())
        mlflow.log_metric('neg_brier_score_cv', cv_scores['test_neg_brier_score'].mean())
        mlflow.log_metric('neg_log_loss_cv', score)

        signature = infer_signature(X, clf.predict_proba(X))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

    # neg_log_loss is "higher is better"; the optimiser minimises, so flip the sign.
    return -score

## Random Forest

### Base Full

In [18]:
if __name__ == '__main__':

    # Baseline: Random Forest with default hyperparameters, fitted on the
    # training sample and evaluated on both the training and validation sets.
    with mlflow.start_run(run_name = 'RF_AllVars_Amostra', experiment_id = experiment.experiment_id) as run:

        rf = RandomForestClassifier()
        rf.fit(X_treino, y_treino)

        # Log the model parameters
        mlflow.log_params(rf.get_params())

        # Predict once per data set and reuse the result for every metric
        # (and for the signature) instead of re-running inference each time.
        proba_treino = rf.predict_proba(X_treino)
        proba_val = rf.predict_proba(X_val)

        # Threshold-free metrics on the TRAINING and VALIDATION sets.
        # Threshold-dependent metrics (F1, precision, recall, accuracy) are
        # not logged here: they would use the 0.5 cut-off, which has not
        # been optimised yet.
        for sufixo, y_real, proba in (('Treino', y_treino, proba_treino),
                                      ('Val', y_val, proba_val)):
            proba_fraude = proba[:, 1]
            mlflow.log_metric(f'AUC_PR_{sufixo}', average_precision_score(y_real, proba_fraude))
            mlflow.log_metric(f'AUC_ROC_{sufixo}', roc_auc_score(y_real, proba_fraude))
            mlflow.log_metric(f'BS_{sufixo}', brier_score_loss(y_real, proba_fraude))
            mlflow.log_metric(f'Log_Loss_{sufixo}', log_loss(y_real, proba_fraude, normalize=True))

        # Log the model together with its input/output schema
        signature = infer_signature(X_treino, proba_treino)
        mlflow.sklearn.log_model(rf, signature=signature, artifact_path='modelo')



In [19]:
if __name__ == '__main__':

    # Local import: space_eval is not part of the notebook's import cell.
    from hyperopt import space_eval

    # Search space for the Random Forest hyperparameters.
    space = {
        "n_estimators": hp.choice('n_estimators', np.arange(10, 500, dtype=int)),
        "max_depth": hp.choice('max_depth', np.arange(10, 300, dtype=int)),
        "min_samples_leaf": hp.choice('min_samples_leaf', np.arange(200, 500, dtype=int)),
        "min_samples_split": hp.choice('min_samples_split', np.arange(200, 500, dtype=int)),
        "criterion": hp.choice("criterion", ['gini', 'entropy', 'log_loss']),
        "class_weight": hp.choice("class_weight", ['balanced', 'balanced_subsample', None])
    }

    with mlflow.start_run(run_name = 'RF_AllVars_Amostra_CV', experiment_id=experiment.experiment_id) as run:
        # NOTE(review): `timeout` is expressed in seconds. With 10 s the
        # search stops as soon as the first trial finishes -- the recorded
        # output shows only 1 of 25 trials completed. Confirm whether this
        # limit is intentional.
        best_indices = fmin(
            fn = partial(
                func_objetivo_CV,
                expr = experiment.experiment_id,
                modelo = 'RF',
                X = X_treino,
                y = y_treino,
                folds = 3
            ),
            space = space,
            algo = tpe.suggest,
            max_evals = 25,
            trials = Trials(),
            timeout = 10
        )

        # For hp.choice, fmin returns the *index* of the selected option,
        # not the option itself. space_eval maps every index back to the
        # real hyperparameter value (including the integer ranges, which
        # would otherwise be passed to the model as raw indices).
        best_params = {
            nome: int(valor) if isinstance(valor, np.integer) else valor
            for nome, valor in space_eval(space, best_indices).items()
        }

        # With the best hyperparameters identified, train the model on the
        # whole training set and measure the scores on the validation set.
        clf = RandomForestClassifier(**best_params)
        clf.fit(X_treino, y_treino)

        mlflow.log_params(clf.get_params())

        # Predict once per data set and reuse the result for every metric.
        proba_val = clf.predict_proba(X_val)
        proba_treino = clf.predict_proba(X_treino)

        # Metrics on the VALIDATION set first, then on the TRAINING set.
        for sufixo, y_real, proba in (('Val', y_val, proba_val),
                                      ('Treino', y_treino, proba_treino)):
            proba_fraude = proba[:, 1]
            mlflow.log_metric(f'AUC_PR_{sufixo}', average_precision_score(y_real, proba_fraude))
            mlflow.log_metric(f'Log_Loss_{sufixo}', log_loss(y_real, proba_fraude, normalize=True))
            mlflow.log_metric(f'AUC_ROC_{sufixo}', roc_auc_score(y_real, proba_fraude))
            mlflow.log_metric(f'BS_{sufixo}', brier_score_loss(y_real, proba_fraude))

        signature = infer_signature(X_treino, proba_treino)
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]




  4%|▍         | 1/25 [28:45<11:30:13, 1725.54s/trial, best loss: 0.04148710138585226]




### Base parcial IV

In [20]:
if __name__ == '__main__':

    # Local import: space_eval is not part of the notebook's import cell.
    from hyperopt import space_eval

    # NOTE(review): X_treino_IV and X_val_IV are not defined in the cells
    # visible here -- presumably they were created elsewhere in the session.
    # Confirm they exist before re-running this cell.

    # Search space for the Random Forest hyperparameters.
    space = {
        "n_estimators": hp.choice('n_estimators', np.arange(10, 500, dtype=int)),
        "max_depth": hp.choice('max_depth', np.arange(10, 300, dtype=int)),
        "min_samples_leaf": hp.choice('min_samples_leaf', np.arange(200, 500, dtype=int)),
        "min_samples_split": hp.choice('min_samples_split', np.arange(200, 500, dtype=int)),
        "criterion": hp.choice("criterion", ['gini', 'entropy', 'log_loss']),
        "class_weight": hp.choice("class_weight", ['balanced', 'balanced_subsample', None])
    }

    with mlflow.start_run(run_name = 'RF_IVVars_Amostra_CV', experiment_id=experiment.experiment_id) as run:
        # NOTE(review): `timeout` is expressed in seconds. With 10 s the
        # search stops as soon as the first trial finishes -- the recorded
        # output shows only 1 of 25 trials completed. Confirm whether this
        # limit is intentional.
        best_indices = fmin(
            fn = partial(
                func_objetivo_CV,
                expr = experiment.experiment_id,
                modelo = 'RF',
                X = X_treino_IV,
                y = y_treino,
                folds = 3
            ),
            space = space,
            algo = tpe.suggest,
            max_evals = 25,
            trials = Trials(),
            timeout = 10
        )

        # For hp.choice, fmin returns the *index* of the selected option,
        # not the option itself. space_eval maps every index back to the
        # real hyperparameter value (including the integer ranges, which
        # would otherwise be passed to the model as raw indices).
        best_params = {
            nome: int(valor) if isinstance(valor, np.integer) else valor
            for nome, valor in space_eval(space, best_indices).items()
        }

        # With the best hyperparameters identified, train the model on the
        # whole training set and measure the scores on the validation set.
        clf = RandomForestClassifier(**best_params)
        clf.fit(X_treino_IV, y_treino)

        mlflow.log_params(clf.get_params())

        # Predict once per data set and reuse the result for every metric.
        proba_val = clf.predict_proba(X_val_IV)
        proba_treino = clf.predict_proba(X_treino_IV)

        # Metrics on the VALIDATION set first, then on the TRAINING set.
        for sufixo, y_real, proba in (('Val', y_val, proba_val),
                                      ('Treino', y_treino, proba_treino)):
            proba_fraude = proba[:, 1]
            mlflow.log_metric(f'AUC_PR_{sufixo}', average_precision_score(y_real, proba_fraude))
            mlflow.log_metric(f'Log_Loss_{sufixo}', log_loss(y_real, proba_fraude, normalize=True))
            mlflow.log_metric(f'AUC_ROC_{sufixo}', roc_auc_score(y_real, proba_fraude))
            mlflow.log_metric(f'BS_{sufixo}', brier_score_loss(y_real, proba_fraude))

        signature = infer_signature(X_treino_IV, proba_treino)
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]




  4%|▍         | 1/25 [09:06<3:38:37, 546.55s/trial, best loss: 0.23410350369495644]


