# Objetivo

Identificar as relações entre as variáveis explicativa e resposta, usando diferentes modelos. No mundo real, a detecção de fraude precisa ser feita rapidamente de modo que haja equilíbrio entre os falsos positivos e falsos negativos.

Avaliação de alguns testes:

1. Modelo aplicado na base completa (desbalanceado);

2. Modelo aplicado em folds balanceados (undersampling e oversampling) e validados em folds originais (desbalanceados).

# Pacotes e funções

In [9]:
import pandas as pd 
import numpy as np
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, average_precision_score, brier_score_loss, confusion_matrix, classification_report, ConfusionMatrixDisplay, precision_recall_curve, log_loss
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost
from catboost import CatBoostClassifier
import lightgbm
from hyperopt import fmin, tpe, Trials, hp
from functools import partial

In [8]:
#!pip install xgboost
#!pip install catboost
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension
#!pip install hyperopt
#!pip install lightgbm

# Bases pré-processadas

In [2]:
treino = pd.read_parquet('../3.Pre_processamento/treino_pp.parquet')
val = pd.read_parquet('../3.Pre_processamento/validacao_pp.parquet')

In [3]:
treino.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud,log_DFH,log_DFLT,log_RTMPP,log_DFH_padro,log_DFLT_padro,log_RTMPP_padro
0,5.689623,0.3472,1.105109,1,0,0,1,0,1.738644,-1.057854,0.099944,-0.400708,-0.586039,0.092383
1,58.380209,0.124193,0.334676,1,0,0,1,0,4.066977,-2.085922,-1.094593,1.261517,-1.157395,-0.995176
2,21.729568,2.809812,0.352684,1,0,0,0,0,3.078674,1.033118,-1.042184,0.555955,0.576032,-0.947461
3,49.862629,1.040213,0.943036,1,0,0,1,0,3.909272,0.039426,-0.058651,1.14893,0.023781,-0.052008
4,13.585701,3.446598,1.715601,1,0,0,0,0,2.609018,1.237388,0.539764,0.220662,0.689556,0.492815


In [3]:
colunas = ['repeat_retailer','used_chip','used_pin_number','online_order','log_DFH_padro','log_DFLT_padro','log_RTMPP_padro']

X_treino = treino[colunas]
y_treino = treino['fraud']

X_val = val[colunas]
y_val = val['fraud']

In [5]:
y_treino.sum()/len(y_treino), y_val.sum()/len(y_val)

(0.08740357598978289, 0.0874017094017094)

Consideração da base parcial, com o uso do filtro de IV.

In [4]:
# Somente as variáveis com IV entre 0.1 e 0.5
colunas_IV = ['used_pin_number','online_order','log_DFH_padro']

X_treino_IV = treino[colunas_IV]
X_val_IV = val[colunas_IV]

# Criação do experimento

In [None]:
#experiment = mlflow.create_experiment(name = 'Modelos',
#                                      artifact_location = 'Artf_Modelos',
#                                      tags = {'Environment': 'Development', 'Version': '1.0.0'})

In [None]:
#experiment

'189457669412080989'

In [5]:
experiment = mlflow.set_experiment(experiment_id = '189457669412080989')
experiment.experiment_id

'189457669412080989'

# Modelos

Função objetivo para ser minimizada e encontrar o melhor conjunto de hiperparâmetros com o uso de validação cruzada.

In [None]:
def func_objetivo_CV(parametros, modelo, folds, expr, X, y):
    # função objetivo para "minimizar", mas dependendo da métrica de interesse, na realidade, é maximizar 
    # parametros é o espaço paramétrico a ser explorado
    # expr é uma string que representa o id do experimento que foi criado
    # modelo é uma string de qual modelo será rodado: Random Forest ou XGBoost
    # folds é um int que diz quantos folds de validação serão usados
    # X e y são as bases que serão aplicadas o cross-validation

    # O output é o valor do score a ser minimizado/maximizado
    
    with mlflow.start_run(nested = True, experiment_id=expr) as run:

        SKF = StratifiedKFold(n_splits = folds, shuffle=True, random_state=1234)

        if modelo == 'RF':
            clf = RandomForestClassifier(**parametros) 
            clf.fit(X, y)
        elif modelo == 'XGB':
            clf = xgboost.XGBClassifier(**parametros)
            clf.fit(X, y)
        elif modelo == 'CAT':
            clf = CatBoostClassifier(**parametros)
            clf.fit(X, y)
        elif modelo == 'LGBM':
            clf = lightgbm.LGBMClassifier(**parametros)
            clf.fit(X, y)
        
        score = cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='neg_log_loss').mean()

        # Log de parâmetros e métricas

        mlflow.log_params(clf.get_params())
        mlflow.log_metric('average_precision_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='average_precision').mean())
        mlflow.log_metric('roc_auc_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='roc_auc').mean())
        mlflow.log_metric('neg_brier_score_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='neg_brier_score').mean())
        mlflow.log_metric('neg_log_loss_cv', cross_val_score(estimator = clf, X = X, y = y, cv = SKF, scoring='neg_log_loss').mean())
        
        signature = infer_signature(X, clf.predict_proba(X))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

    return -score

## Random Forest

### Base Full

In [17]:
if __name__ == '__main__':
    
    with mlflow.start_run(run_name = 'Base_RF_Base_Full', experiment_id = experiment.experiment_id) as run: 

        rf = RandomForestClassifier()
        rf.fit(X_treino, y_treino)

        # Log dos parâmetros do modelo
        mlflow.log_params(rf.get_params())

        # Log das métricas na base de TREINO
        mlflow.log_metric('AUC_PR_Treino', average_precision_score(y_treino, rf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('AUC_ROC_Treino', roc_auc_score(y_treino, rf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('BS_Treino', brier_score_loss(y_treino, rf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('Log_Loss_Treino', log_loss(y_treino, rf.predict_proba(X_treino)[:,1], normalize=True))

        # OBS: o ponto de corte utilizado é o 0.5 (ainda não otimizamos esse ponto)
        #mlflow.log_metric('F1_Treino', f1_score(y_treino, rf.predict(X_treino)))
        #mlflow.log_metric('Precision_Treino', precision_score(y_treino, rf.predict(X_treino)))
        #mlflow.log_metric('Recall_Treino', recall_score(y_treino, rf.predict(X_treino)))
        #mlflow.log_metric('Acuracia_Treino', accuracy_score(y_treino, rf.predict(X_treino)))

        # Log das métricas na base de VALIDAÇÃO
        mlflow.log_metric('AUC_PR_Val', average_precision_score(y_val, rf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('AUC_ROC_Val', roc_auc_score(y_val, rf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('BS_Val', brier_score_loss(y_val, rf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('Log_Loss_Val', log_loss(y_val, rf.predict_proba(X_val)[:,1], normalize=True))

        # OBS: o ponto de corte utilizado é o 0.5 (ainda não otimizamos esse ponto)
        #mlflow.log_metric('F1_Val', f1_score(y_val, rf.predict(X_val)))
        #mlflow.log_metric('Precision_Val', precision_score(y_val, rf.predict(X_val)))
        #mlflow.log_metric('Recall_Val', recall_score(y_val, rf.predict(X_val)))
        #mlflow.log_metric('Acuracia_Val', accuracy_score(y_val, rf.predict(X_val)))
        
        # Log do schema das variáveis do modelo e do modelo
        signature = infer_signature(X_treino, rf.predict_proba(X_treino))
        mlflow.sklearn.log_model(rf, signature=signature, artifact_path='modelo')



In [19]:
if __name__ == '__main__':
    
    #test_imbalanced = [{0: len(y_treino2)/(2*np.bincount(y_treino2))[0], 1:len(y_teste2)/(2*np.bincount(y_teste2))[1]}, {0: 1, 1:1}]
    
    space = {
        "n_estimators": hp.choice('n_estimators', np.arange(10, 500, dtype=int)),
        "max_depth": hp.choice('max_depth', np.arange(10, 300, dtype=int)),
        "min_samples_leaf": hp.choice('min_samples_leaf', np.arange(200, 500, dtype=int)),
        "min_samples_split": hp.choice('min_samples_split', np.arange(200, 500, dtype=int)),
        "criterion": hp.choice("criterion", ['gini', 'entropy', 'log_loss']),
        "class_weight": hp.choice("class_weight", ['balanced', 'balanced_subsample', None]) 
    }
    
    with mlflow.start_run(run_name = 'Base_RF_Base_Full_CV', experiment_id=experiment.experiment_id) as run:
        best_params = fmin(
            fn = partial(
                func_objetivo_CV,
                expr = experiment.experiment_id,
                modelo = 'RF',
                X = X_treino,
                y = y_treino,
                folds = 3
            ),
            space = space,
            algo = tpe.suggest,
            max_evals = 25,
            trials = Trials(),
            timeout = 10
        )

        if best_params['criterion'] == 0:
            best_params['criterion'] = 'gini'
        elif best_params['criterion'] == 1:
            best_params['criterion'] = 'entropy'
        else:
            best_params['criterion'] = 'log_loss'
            

        if best_params['class_weight'] == 0:
            best_params['class_weight'] = 'balanced'
        elif best_params['class_weight'] == 1:
            best_params['class_weight'] = 'balanced_subsample'
        else:
            best_params['class_weight'] = None
        
        # Identificado o melhor conjunto de hiperparâmetros, treina o modelo com toda a base de treino e metrifica os escores na base de validação

        clf = RandomForestClassifier(**best_params)
        clf.fit(X_treino, y_treino)
                   
        mlflow.log_params(clf.get_params())
        mlflow.log_metric('AUC_PR_Val', average_precision_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('Log_Loss_Val', log_loss(y_val, clf.predict_proba(X_val)[:,1], normalize=True))
        mlflow.log_metric('AUC_ROC_Val', roc_auc_score(y_val, clf.predict_proba(X_val)[:,1]))
        mlflow.log_metric('BS_Val', brier_score_loss(y_val, clf.predict_proba(X_val)[:,1]))

        # Log das métricas na base de TREINO
        mlflow.log_metric('AUC_PR_Treino', average_precision_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('AUC_ROC_Treino', roc_auc_score(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('BS_Treino', brier_score_loss(y_treino, clf.predict_proba(X_treino)[:,1]))
        mlflow.log_metric('Log_Loss_Treino', log_loss(y_treino, clf.predict_proba(X_treino)[:,1], normalize=True))

        signature = infer_signature(X_treino, clf.predict_proba(X_treino))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]




  4%|▍         | 1/25 [1:13:20<29:20:16, 4400.70s/trial, best loss: 0.023602231224237203]




### Base parcial IV

In [13]:
if __name__ == '__main__':
    
    #test_imbalanced = [{0: len(y_treino2)/(2*np.bincount(y_treino2))[0], 1:len(y_teste2)/(2*np.bincount(y_teste2))[1]}, {0: 1, 1:1}]
    
    space = {
        "n_estimators": hp.choice('n_estimators', np.arange(10, 500, dtype=int)),
        "max_depth": hp.choice('max_depth', np.arange(10, 300, dtype=int)),
        "min_samples_leaf": hp.choice('min_samples_leaf', np.arange(200, 500, dtype=int)),
        "min_samples_split": hp.choice('min_samples_split', np.arange(200, 500, dtype=int)),
        "criterion": hp.choice("criterion", ['gini', 'entropy', 'log_loss']),
        "class_weight": hp.choice("class_weight", ['balanced', 'balanced_subsample', None]) 
    }
    
    with mlflow.start_run(run_name = 'Base_RF_Base_IV_CV', experiment_id=experiment.experiment_id) as run:
        best_params = fmin(
            fn = partial(
                func_objetivo_CV,
                expr = experiment.experiment_id,
                modelo = 'RF',
                X = X_treino_IV,
                y = y_treino,
                folds = 3
            ),
            space = space,
            algo = tpe.suggest,
            max_evals = 25,
            trials = Trials(),
            timeout = 10
        )

        if best_params['criterion'] == 0:
            best_params['criterion'] = 'gini'
        elif best_params['criterion'] == 1:
            best_params['criterion'] = 'entropy'
        else:
            best_params['criterion'] = 'log_loss'
            

        if best_params['class_weight'] == 0:
            best_params['class_weight'] = 'balanced'
        elif best_params['class_weight'] == 1:
            best_params['class_weight'] = 'balanced_subsample'
        else:
            best_params['class_weight'] = None
        
        # Identificado o melhor conjunto de hiperparâmetros, treina o modelo com toda a base de treino e metrifica os escores na base de validação

        clf = RandomForestClassifier(**best_params)
        clf.fit(X_treino_IV, y_treino)
                   
        mlflow.log_params(clf.get_params())
        mlflow.log_metric('AUC_PR_Val', average_precision_score(y_val, clf.predict_proba(X_val_IV)[:,1]))
        mlflow.log_metric('Log_Loss_Val', log_loss(y_val, clf.predict_proba(X_val_IV)[:,1], normalize=True))
        mlflow.log_metric('AUC_ROC_Val', roc_auc_score(y_val, clf.predict_proba(X_val_IV)[:,1]))
        mlflow.log_metric('BS_Val', brier_score_loss(y_val, clf.predict_proba(X_val_IV)[:,1]))

        # Log das métricas na base de TREINO
        mlflow.log_metric('AUC_PR_Treino', average_precision_score(y_treino, clf.predict_proba(X_treino_IV)[:,1]))
        mlflow.log_metric('AUC_ROC_Treino', roc_auc_score(y_treino, clf.predict_proba(X_treino_IV)[:,1]))
        mlflow.log_metric('BS_Treino', brier_score_loss(y_treino, clf.predict_proba(X_treino_IV)[:,1]))
        mlflow.log_metric('Log_Loss_Treino', log_loss(y_treino, clf.predict_proba(X_treino_IV)[:,1], normalize=True))

        signature = infer_signature(X_treino_IV, clf.predict_proba(X_treino_IV))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')

  0%|          | 0/25 [1:57:40<?, ?trial/s, best loss=?]


KeyboardInterrupt: 