# Terceira seção de modelos

Construção dos modelos em bases balanceadas, com as variáveis criadas e originais com IV até 0.5 e que não são correlacionadas (idem as utilizadas na PT2).

- https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html

# Pacotes e funções

In [54]:
# !pip install imbalanced-learn

In [8]:
import pandas as pd 
import numpy as np
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, average_precision_score, brier_score_loss, confusion_matrix, classification_report, ConfusionMatrixDisplay, precision_recall_curve, log_loss
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from hyperopt import fmin, tpe, Trials, hp
from functools import partial
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Bases pré-processadas

In [2]:
treino_full = pd.read_parquet('../3.Pre_processamento/treino_pp.parquet')
val_full = pd.read_parquet('../3.Pre_processamento/validacao_pp.parquet')

In [3]:
treino_full['fraud'].value_counts()

fraud
0    714563
1     68437
Name: count, dtype: int64

In [4]:
treino_full['fraud'].value_counts(normalize=True)

fraud
0    0.912596
1    0.087404
Name: proportion, dtype: float64

In [14]:
colunas = ['used_pin_number', 'log_DFH_padro', 'log_DFLT_padro', 'RROO', 'used_chip']

X_treino = treino_full[colunas]
y_treino = treino_full['fraud']

X_val = val_full[colunas]
y_val = val_full['fraud']

In [6]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for i, (train_index, test_index) in enumerate(skf.split(X_treino, y_treino)):
    print(f"Fold {i}:")
    print(f"Treino: index={train_index}")
    print(f"Teste: index={test_index}")

Fold 0:
Treino: index=[     0      1      2 ... 782995 782996 782998]
Teste: index=[     4      9     11 ... 782993 782997 782999]
Fold 1:
Treino: index=[     4      5      9 ... 782997 782998 782999]
Teste: index=[     0      1      2 ... 782985 782995 782996]
Fold 2:
Treino: index=[     0      1      2 ... 782996 782997 782999]
Teste: index=[     5     10     12 ... 782990 782994 782998]


In [7]:
# Definindo o modelo
def treinar_modelo(X, y):
    
    modelo = RandomForestClassifier(random_state=42)
    modelo.fit(X, y)

    return modelo

# Definindo a função de avaliação
def avaliar_modelo(modelo, X, y):
    y_pred = modelo.predict(X)
    y_pred_proba = modelo.predict_proba(X)[:, 1]

    f1 = f1_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_pred_proba)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    average_precision = average_precision_score(y, y_pred_proba)
    brier = brier_score_loss(y, y_pred_proba)
    logloss = log_loss(y, y_pred_proba)

    return {
        'f1': f1,
        'roc_auc': roc_auc,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'average_precision': average_precision,
        'brier': brier,
        'logloss': logloss
    }

In [8]:
print("Iniciando o treinamento e avaliação do modelo, SEM balanceamento de classes...")

aux = []

for i, (train_index, test_index) in enumerate(skf.split(X_treino, y_treino)):

    print(f"Fold {i}:")

    X_train, X_test = X_treino.iloc[train_index], X_treino.iloc[test_index]
    y_train, y_test = y_treino.iloc[train_index], y_treino.iloc[test_index]

    modelo = treinar_modelo(X_train, y_train)
    resultado = avaliar_modelo(modelo, X_test, y_test)
    aux.append(resultado)

Iniciando o treinamento e avaliação do modelo, SEM balanceamento de classes...
Fold 0:
Fold 1:
Fold 2:


In [9]:
aux

[{'f1': 0.43022607211995934,
  'roc_auc': 0.7704703760697316,
  'precision': 0.8553163731245923,
  'recall': 0.28739260038576186,
  'accuracy': 0.9334674329501915,
  'average_precision': 0.43403973763227005,
  'brier': 0.06344435249042145,
  'logloss': 0.42614934152592165},
 {'f1': 0.4343776460626588,
  'roc_auc': 0.7702597862848963,
  'precision': 0.8448188497593109,
  'recall': 0.29234613361388745,
  'accuracy': 0.933455938697318,
  'average_precision': 0.4375998573709957,
  'brier': 0.06349063563218392,
  'logloss': 0.4189416185938012},
 {'f1': 0.4299586641296503,
  'roc_auc': 0.7700048415148718,
  'precision': 0.8544790715869083,
  'recall': 0.28724849866304303,
  'accuracy': 0.9334252873563218,
  'average_precision': 0.43457218806478715,
  'brier': 0.06352058007662835,
  'logloss': 0.4235845192922809}]

In [None]:
print("Iniciando o treinamento e avaliação do modelo, COM balanceamento de classes...")

aux_rus = []

for i, (train_index, test_index) in enumerate(skf.split(X_treino, y_treino)):

    print(f"Fold {i}:")

    X_train, X_test = X_treino.iloc[train_index], X_treino.iloc[test_index]
    y_train, y_test = y_treino.iloc[train_index], y_treino.iloc[test_index]

    # Undersampling
    rus = RandomUnderSampler(random_state=42, sampling_strategy='not minority')
    X_train, y_train = rus.fit_resample(X_train, y_train)
    
    print(y_train.value_counts(normalize=True))
    print(y_train.value_counts())

    modelo = treinar_modelo(X_train, y_train)
    resultado = avaliar_modelo(modelo, X_test, y_test)
    aux_rus.append(resultado)

Iniciando o treinamento e avaliação do modelo, COM balanceamento de classes...
Fold 0:
fraud
0    0.5
1    0.5
Name: proportion, dtype: float64
fraud
0    45625
1    45625
Name: count, dtype: int64
Fold 1:
fraud
0    0.5
1    0.5
Name: proportion, dtype: float64
fraud
0    45625
1    45625
Name: count, dtype: int64
Fold 2:
fraud
0    0.5
1    0.5
Name: proportion, dtype: float64
fraud
0    45624
1    45624
Name: count, dtype: int64


In [15]:
aux_rus

[{'f1': 0.26230215349902114,
  'roc_auc': 0.7847124535530825,
  'precision': 0.16176229843660472,
  'recall': 0.6930562861651762,
  'accuracy': 0.65927969348659,
  'average_precision': 0.437928970583582,
  'brier': 0.19785536436781606,
  'logloss': 0.548058886640538},
 {'f1': 0.26081094412838307,
  'roc_auc': 0.7834335017012143,
  'precision': 0.16050756901157615,
  'recall': 0.6953357881816588,
  'accuracy': 0.6555095785440613,
  'average_precision': 0.4381461583338535,
  'brier': 0.19917187701149425,
  'logloss': 0.5483706898848205},
 {'f1': 0.2607796810395747,
  'roc_auc': 0.7824662295985738,
  'precision': 0.160410968581896,
  'recall': 0.6967080173585236,
  'accuracy': 0.6547586206896552,
  'average_precision': 0.433611749644622,
  'brier': 0.19991629540229888,
  'logloss': 0.5521478064515876}]

In [None]:
print("Iniciando o treinamento e avaliação do modelo, COM balanceamento de classes e procura de hiperparâmetros (RSCV)...")

aux_rus = []

for i, (train_index, test_index) in enumerate(skf.split(X_treino, y_treino)):

    print(f"Fold {i}:")

    X_train, X_test = X_treino.iloc[train_index], X_treino.iloc[test_index]
    y_train, y_test = y_treino.iloc[train_index], y_treino.iloc[test_index]

    # Undersampling
    rus = RandomUnderSampler(random_state=42, sampling_strategy='not minority')
    X_train, y_train = rus.fit_resample(X_train, y_train)
    
    print(y_train.value_counts(normalize=True))
    print(y_train.value_counts())

    modelo = treinar_modelo(X_train, y_train)
    resultado = avaliar_modelo(modelo, X_test, y_test)
    aux_rus.append(resultado)

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ParameterSampler.html#sklearn.model_selection.ParameterSampler

In [15]:
for j in range(0, 5):
    print(j)
    modelo = RandomForestClassifier()
    clf = RandomizedSearchCV(
            estimator=modelo,
            param_distributions={
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['auto', 'sqrt'],
                'class_weight': ['balanced', None]
            },
            n_iter=1,
            cv=1,
            verbose=2,
            random_state=42,
            n_jobs=-1)
    search = clf.fit(X_treino, y_treino)
    print(search.get_params)
    

0


InvalidParameterError: The 'cv' parameter of RandomizedSearchCV must be an int in the range [2, inf), an object implementing 'split' and 'get_n_splits', an iterable or None. Got 1 instead.