# Detecção de Fraude em Cartões de Crédito — Pipeline *leakage-safe*

- **Split correto**: `train` / `cal` / `test (holdout)`
- **Feature selection** ajustada **somente no treino**
- **Calibração + threshold** escolhidos **somente no CAL**
- **TEST** usado **apenas na avaliação final**


In [0]:
# 1) Imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_curve, roc_auc_score, average_precision_score,
    precision_recall_curve, precision_score, recall_score, confusion_matrix
)
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier


In [0]:
# 2) Funções utilitárias (métricas e gráficos)

def calcular_ks_statistic(y_true, y_score):
    """Return the Kolmogorov-Smirnov statistic of a binary scorer.

    The value is the largest gap between the true-positive rate and the
    false-positive rate over the thresholds produced by ``roc_curve``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    gaps = true_pos_rate - false_pos_rate
    return float(gaps.max())

def get_scores(model, X):
    """Return a continuous score per row of *X*.

    ``predict_proba`` is preferred (its second column is returned). When the
    model only offers ``decision_function``, its output is min-max scaled
    using the minimum and maximum of that same batch.

    Raises:
        ValueError: if the model exposes neither method.
    """
    has_proba = hasattr(model, "predict_proba")
    if not has_proba and not hasattr(model, "decision_function"):
        raise ValueError("Modelo não possui predict_proba nem decision_function.")

    if has_proba:
        probabilities = model.predict_proba(X)
        return probabilities[:, 1]

    raw = model.decision_function(X)
    lowest = raw.min()
    # The small constant keeps the division defined when every score is equal.
    span = raw.max() - lowest + 1e-12
    return (raw - lowest) / span

def avaliar_modelo(X_train, y_train, X_test, y_test, model, nm_modelo="Modelo"):
    """Score *model* on two datasets and collect ranking metrics for both.

    Scores come from ``get_scores``. For each dataset the ROC AUC, Gini
    (``2 * AUC - 1``), KS statistic and average precision are computed.

    Returns:
        A dict with the model label under ``"modelo"`` and one float per
        metric and dataset (``*_train`` / ``*_test`` keys).
    """
    scores_train = get_scores(model, X_train)
    scores_test = get_scores(model, X_test)

    roc_train = roc_auc_score(y_train, scores_train)
    roc_test = roc_auc_score(y_test, scores_test)

    max_gap_train = calcular_ks_statistic(y_train, scores_train)
    max_gap_test = calcular_ks_statistic(y_test, scores_test)

    avg_prec_train = average_precision_score(y_train, scores_train)
    avg_prec_test = average_precision_score(y_test, scores_test)

    return {
        "modelo": nm_modelo,
        "roc_auc_train": float(roc_train),
        "roc_auc_test": float(roc_test),
        # Gini is a linear rescaling of the ROC AUC.
        "gini_train": float(2 * roc_train - 1),
        "gini_test": float(2 * roc_test - 1),
        "ks_train": float(max_gap_train),
        "ks_test": float(max_gap_test),
        "pr_auc_train": float(avg_prec_train),
        "pr_auc_test": float(avg_prec_test),
    }


## 3) Carregar dados



In [0]:
# Absolute path of the transactions CSV; it only resolves where this
# "/Volumes/..." location is mounted.
DATA_PATH = "/Volumes/workspace/default/fraudes_em_cartões_de_crédito/creditcard.csv"
# Load the whole file into memory as a DataFrame.
df = pd.read_csv(DATA_PATH)

# Last expression of the cell: the notebook renders the first five rows.
df.head()


## 4) Checagens rápidas


In [0]:
# Dataset dimensions as (rows, columns).
print("Shape:", df.shape)
# Mean of the target column; presumably "Class" is 0/1 so this is the fraud
# rate — TODO confirm the encoding.
print("Target balance (Class=1):", df["Class"].mean())
# Summary statistics transposed to one row per column; only the first five
# rows of that table are displayed.
df.describe().T.head()


In [0]:
def missing_values_table(df):
    """Build a per-column data-quality summary of *df*.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame with one row per column of *df* and the columns
        ``nome_variavel``, ``tipos_de_dados``, ``numero_de_missing``,
        ``percentual_de_missing``, ``unknown``, ``percentual_unknown`` and
        ``numero_de_unicos``. Rows are ordered by missing count, descending
        (ties keep the column order of *df*), numeric values are rounded to
        one decimal place and the index is reset to 0..n-1.
    """
    n_rows = len(df)
    # Each full-frame scan runs once and is reused for the count and the
    # percentage columns.
    missing_count = df.isnull().sum()
    unknown_count = df.isin(['unknown']).sum()

    metadados = pd.DataFrame({
        'nome_variavel': df.columns,
        'tipos_de_dados': df.dtypes,
        'numero_de_missing': missing_count,
        'percentual_de_missing': missing_count * 100 / n_rows,
        'unknown': unknown_count,
        'percentual_unknown': unknown_count * 100 / n_rows,
        'numero_de_unicos': df.nunique(),
    })

    # A stable sort keeps columns with equal missing counts in a
    # deterministic order (their order in df).
    metadados = metadados.sort_values(
        by='numero_de_missing', ascending=False, kind='stable'
    ).round(1)

    return metadados.reset_index(drop=True)

In [0]:
# Per-column summary (dtype, missing, "unknown" and unique counts) of the raw data.
metadados = missing_values_table(df)
# Display up to 31 rows of the summary.
metadados.head(31)

## 5) Split leakage-safe: Train / Cal / Test (Holdout)
- `TRAIN` = treino (≈56% dos dados)
- `TEST` = holdout final (30% dos dados)
- `CAL` = calibração + threshold (≈14% dos dados)


In [0]:
# Name of the label column.
TARGET = "Class"

# Features are every column except the label; the label is cast to int.
X_all = df.drop(columns=[TARGET])
y_all = df[TARGET].astype(int)

# Final holdout: 30% of the rows, stratified on the label.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_all, y_all, test_size=0.30, random_state=42, stratify=y_all
)

# Calibration split carved out of the remaining rows (20% of them),
# again stratified on the label.
X_train, X_cal, y_train, y_cal = train_test_split(
    X_train_full, y_train_full, test_size=0.20, random_state=42, stratify=y_train_full
)

# Label mean per split, to check that stratification kept them comparable.
print("Taxa de fraude (train):", y_train.mean())
print("Taxa de fraude (cal):  ", y_cal.mean())
print("Taxa de fraude (test): ", y_test.mean())


## 6) Data Prep mínimo (imputação)
O dataset original costuma não ter nulos, mas mantemos o passo para robustez.


In [0]:
# Median imputation: fitted on TRAIN only, then applied unchanged to CAL and
# TEST so no statistic from those splits reaches the model.
imputer = SimpleImputer(strategy="median")

# The imputer output is wrapped back into DataFrames with the original column
# names AND the original row index. Keeping the index means the feature
# frames stay aligned with y_train / y_cal / y_test (which still carry the
# shuffled index from the split) in any index-based operation such as
# boolean masking, joins or concatenation.
X_train_imp = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_cal_imp = pd.DataFrame(
    imputer.transform(X_cal), columns=X_cal.columns, index=X_cal.index
)
X_test_imp = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)


## 7) Feature Selection (somente no TREINO)
### 7.1 Filtro de correlação (opcional)


In [0]:
def correlation_filter(X_df: pd.DataFrame, threshold: float = 0.85):
    """Drop columns that are highly correlated with an earlier column.

    Args:
        X_df: feature frame to filter.
        threshold: absolute correlation above which the later column of a
            pair is removed.

    Returns:
        A tuple ``(filtered_frame, dropped_columns)``.
    """
    abs_corr = X_df.corr().abs()
    # Strict upper triangle only, so each pair is inspected once and a column
    # is compared solely against the columns that precede it.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)
    exceeds = (pairwise > threshold).any(axis=0)
    to_drop = exceeds[exceeds].index.tolist()
    return X_df.drop(columns=to_drop), to_drop

# The filter is applied to the imputed TRAIN features only; dropped_corr
# lists the removed column names.
X_corr, dropped_corr = correlation_filter(X_train_imp, threshold=0.85)
# Report the column counts before the filter, removed by it, and remaining.
print(f"Originais: {X_train_imp.shape[1]} | Removidas por correlação: {len(dropped_corr)} | Restantes: {X_corr.shape[1]}")


### 7.2 RandomForest Importance (somente no treino)
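Ranqueia as features por importância e pré-seleciona as top-N. A célula seguinte substitui essa lista por um corte relativo (15% da maior importância), que é a seleção efetivamente usada no modelo.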


In [0]:
# Random Forest fitted only to rank features (it is not the final model).
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample"
)
# Fitted on the correlation-filtered TRAIN features only.
rf.fit(X_corr, y_train)

# Importances labelled by column name, most important first.
imp = pd.Series(rf.feature_importances_, index=X_corr.columns).sort_values(ascending=False)

TOP_N = 25  # adjust here
# NOTE: selected_features is assigned again by the next cell (relative
# importance cutoff), so this TOP_N list is not the one used downstream.
selected_features = imp.head(TOP_N).index.tolist()

# Only the first ten names are printed.
print("Top features selecionadas:", selected_features[:10], "...")


In [0]:
# Select the features whose Random Forest importance exceeds a cutoff
# relative to the most important one, then plot them.
# NOTE: this reassigns `selected_features`, replacing the TOP_N list built in
# the previous cell; the list produced here is the one used downstream.
# (pyplot is already imported in the imports cell; the import is kept so the
# cell also runs on its own.)
import matplotlib.pyplot as plt


feature_importances = rf.feature_importances_

# One row per candidate feature with its importance
feature = pd.DataFrame({
    'feature': X_corr.columns,
    'importance': feature_importances})

# Relative cutoff: keep features whose importance is above 15% of the
# largest importance
cutoff_max = 0.15

cut_off = cutoff_max * feature_importances.max()

# Features above the cutoff, in the column order of X_corr
selected_features = X_corr.columns[feature_importances > cut_off].tolist()

print(f'Numero de features importances selecionadas {len(selected_features)}')
print(f'Features selecionadas {selected_features}')

# Ascending order so the most important feature ends up at the top of the
# horizontal bar chart
features = feature.sort_values(by='importance', ascending=True)

# Keep only the rows above the cutoff for plotting
selected_features_df = features[features['importance'] > cut_off]

# Figure height grows with the number of bars; the floor keeps the chart
# readable (and the height positive) when very few features are selected
plt.figure(figsize=(10, max(2.0, len(selected_features_df) * 0.4)))

# Plot the selected features
plt.barh(selected_features_df['feature'], selected_features_df['importance'], color=(0.25, 0.5, 1))
plt.xlabel("Feature Importance")
plt.title("Variáveis Selecionadas - Random Forest")
plt.grid(axis='x', linestyle='--', alpha=0.7)

plt.show()

### 7.3 Aplicar seleção no Train / Cal / Test


In [0]:
# Restrict every split to the same feature list, which was derived from TRAIN only.
X_train_fs = X_train_imp[selected_features]
X_cal_fs   = X_cal_imp[selected_features]
X_test_fs  = X_test_imp[selected_features]


## 8) Modelo base (XGBoost)
Parâmetros conservadores orientados a **precision**.


In [0]:
# Negative-to-positive count ratio in TRAIN, capped at 100 before being
# passed as the positive-class weight.
ratio = (y_train == 0).sum() / (y_train == 1).sum()
scale_pos_weight = min(ratio, 100)

# Base classifier: many shallow trees (depth 2) with a small learning rate,
# row/column subsampling and L1/L2 regularisation.
xgb_prec = XGBClassifier(
    objective="binary:logistic",
    eval_metric="aucpr",
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    n_estimators=800,
    learning_rate=0.03,
    max_depth=2,
    min_child_weight=20,
    gamma=0.4,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_lambda=10.0,
    reg_alpha=0.2,
    scale_pos_weight=scale_pos_weight
)

# Fitted on the selected TRAIN features only; CAL and TEST are not seen here.
xgb_prec.fit(X_train_fs, y_train)


## 9) Calibração + Threshold (somente no CAL)
⚠️ Importante: **não escolher threshold olhando o TEST**.


In [0]:
# Sigmoid (Platt) calibrator fitted on the CAL split on top of the already
# trained model; cv="prefit" means xgb_prec itself is not refitted.
# NOTE(review): recent scikit-learn releases deprecate cv="prefit" in favour
# of wrapping the model in FrozenEstimator — confirm against the installed
# version before upgrading.
calibrated = CalibratedClassifierCV(estimator=xgb_prec, method="sigmoid", cv="prefit")
calibrated.fit(X_cal_fs, y_cal)

proba_cal = calibrated.predict_proba(X_cal_fs)[:, 1]
prec, rec, thr = precision_recall_curve(y_cal, proba_cal)

target_precision = 0.40  # adjust here
# prec/rec carry one more entry than thr (the final point has no threshold),
# so the last precision value is dropped to align the arrays.
valid = np.where(prec[:-1] >= target_precision)[0]

if len(valid) == 0:
    # No threshold reaches the target precision on CAL: say so instead of
    # falling back silently.
    print(f"Aviso: nenhum threshold no CAL atinge precision >= {target_precision}; usando fallback 0.99")
    best_thr = 0.99
else:
    # Highest recall among the thresholds that meet the precision target.
    # When several thresholds tie on recall, take the largest one: the recall
    # is the same and fewer negatives are flagged, so precision is at least
    # as high.
    max_recall = rec[valid].max()
    best_idx = valid[rec[valid] == max_recall][-1]
    best_thr = float(thr[best_idx])

print("Threshold escolhido no CAL:", best_thr)


# ✅ Seção Final — Avaliação *sem leakage* (Holdout)
O conjunto `TEST` (holdout) é usado **uma única vez** aqui, para reportar o resultado final.


In [0]:
# Calibrated positive-class probabilities on the holdout, turned into labels
# with the threshold chosen on CAL.
proba_test = calibrated.predict_proba(X_test_fs)[:, 1]
y_pred = (proba_test >= best_thr).astype(int)

# Threshold-dependent metrics (precision, recall, confusion matrix) use
# y_pred; ranking metrics (ROC-AUC, PR-AUC, KS) use the probabilities.
print("Precision (TEST):", precision_score(y_test, y_pred, zero_division=0))
print("Recall (TEST):   ", recall_score(y_test, y_pred, zero_division=0))
print("ROC-AUC (TEST):  ", roc_auc_score(y_test, proba_test))
print("PR-AUC (TEST):   ", average_precision_score(y_test, proba_test))
print("KS (TEST):       ", calcular_ks_statistic(y_test, proba_test))
print("CM (TEST):\n", confusion_matrix(y_test, y_pred))

# Train-vs-test ranking metrics of the calibrated model, collected in one dict.
resultados = avaliar_modelo(X_train_fs, y_train, X_test_fs, y_test, calibrated, nm_modelo="XGBoost Calibrado (leakage-safe)")
# Last expression of the cell: the notebook renders the dict.
resultados
