In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score

from face import FACE, run_face_experiment, get_path_probabilities

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "mstz/heloc" dataset through `datasets.load_dataset` and keep its "train" split.
heloc_splits = load_dataset("mstz/heloc")
dataset = heloc_splits["train"]

# Convert to pandas for the exploratory steps, then summarise columns and dtypes.
df_heloc = dataset.to_pandas()
df_heloc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10459 entries, 0 to 10458
Data columns (total 24 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   estimate_of_risk                          10459 non-null  int64
 1   months_since_first_trade                  10459 non-null  int64
 2   months_since_last_trade                   10459 non-null  int64
 3   average_duration_of_resolution            10459 non-null  int64
 4   number_of_satisfactory_trades             10459 non-null  int64
 5   nr_trades_insolvent_for_over_60_days      10459 non-null  int64
 6   nr_trades_insolvent_for_over_90_days      10459 non-null  int64
 7   percentage_of_legal_trades                10459 non-null  int64
 8   months_since_last_illegal_trade           10459 non-null  int64
 9   maximum_illegal_trades_over_last_year     10459 non-null  int64
 10  maximum_illegal_trades                    10459 non-null  

In [3]:
# Number of missing values in each column.
df_heloc.isnull().sum()

estimate_of_risk                            0
months_since_first_trade                    0
months_since_last_trade                     0
average_duration_of_resolution              0
number_of_satisfactory_trades               0
nr_trades_insolvent_for_over_60_days        0
nr_trades_insolvent_for_over_90_days        0
percentage_of_legal_trades                  0
months_since_last_illegal_trade             0
maximum_illegal_trades_over_last_year       0
maximum_illegal_trades                      0
nr_total_trades                             0
nr_trades_initiated_in_last_year            0
percentage_of_installment_trades            0
months_since_last_inquiry_not_recent        0
nr_inquiries_in_last_6_months               0
nr_inquiries_in_last_6_months_not_recent    0
net_fraction_of_revolving_burden            0
net_fraction_of_installment_burden          0
nr_revolving_trades_with_balance            0
nr_installment_trades_with_balance          0
nr_banks_with_high_ratio          

In [4]:
# Preview the first five rows.
df_heloc.head(n=5)

Unnamed: 0,estimate_of_risk,months_since_first_trade,months_since_last_trade,average_duration_of_resolution,number_of_satisfactory_trades,nr_trades_insolvent_for_over_60_days,nr_trades_insolvent_for_over_90_days,percentage_of_legal_trades,months_since_last_illegal_trade,maximum_illegal_trades_over_last_year,...,months_since_last_inquiry_not_recent,nr_inquiries_in_last_6_months,nr_inquiries_in_last_6_months_not_recent,net_fraction_of_revolving_burden,net_fraction_of_installment_burden,nr_revolving_trades_with_balance,nr_installment_trades_with_balance,nr_banks_with_high_ratio,percentage_trades_with_balance,is_at_risk
0,55,144,4,84,20,3,0,83,2,3,...,0,0,0,33,-8,8,1,1,69,1
1,61,58,15,41,2,4,4,100,-7,0,...,0,0,0,0,-8,0,-8,-8,0,1
2,67,66,5,24,9,0,0,100,-7,7,...,0,4,4,53,66,4,2,1,86,1
3,66,169,1,73,28,1,1,93,76,6,...,0,5,4,72,83,6,4,3,91,1
4,81,333,27,132,12,0,0,100,-7,7,...,0,1,1,51,89,3,1,0,80,1


In [5]:
# Per-column count of strictly negative entries, restricted to the columns that
# have at least one, ordered from the most affected column to the least.
neg_counts = (
    df_heloc.lt(0)
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)

print("Quantidade de valores negativos por feature:")
print(neg_counts)

Quantidade de valores negativos por feature:
months_since_last_illegal_trade             5428
net_fraction_of_installment_burden          4007
months_since_last_inquiry_not_recent        2919
nr_installment_trades_with_balance          1449
nr_banks_with_high_ratio                    1171
months_since_first_trade                     827
net_fraction_of_revolving_burden             774
nr_revolving_trades_with_balance             744
percentage_trades_with_balance               606
estimate_of_risk                             598
maximum_illegal_trades                       588
maximum_illegal_trades_over_last_year        588
nr_trades_initiated_in_last_year             588
percentage_of_installment_trades             588
percentage_of_legal_trades                   588
nr_inquiries_in_last_6_months                588
nr_inquiries_in_last_6_months_not_recent     588
nr_trades_insolvent_for_over_90_days         588
nr_trades_insolvent_for_over_60_days         588
number_of_satisfactory_t

In [6]:
# Every column except the target is checked for negative values.
features = df_heloc.columns.drop('is_at_risk')

# Row mask: True when AT LEAST ONE checked column is negative in that row (`any`).
# NOTE(review): despite its name, `mask_all_negative` does not require all features
# to be negative. An earlier comment expected roughly 588 flagged rows, but the
# recorded run flags 7957 of 10459 — confirm whether `.all(axis=1)` was intended.
mask_all_negative = df_heloc[features].lt(0).any(axis=1)

print(f"Linhas identificadas com alguma feature negativa: {mask_all_negative.sum()}")

# Keep only the rows that were not flagged; `.copy()` detaches the result from df_heloc.
df_heloc_clean = df_heloc.loc[~mask_all_negative].copy()

# Report the shape before and after filtering.
print(f"Shape original: {df_heloc.shape}")
print(f"Novo shape: {df_heloc_clean.shape}")

Linhas identificadas com alguma feature negativa: 7957
Shape original: (10459, 24)
Novo shape: (2502, 24)


In [7]:
# Class balance of the target after filtering.
df_heloc_clean.loc[:, 'is_at_risk'].value_counts()

is_at_risk
1    1560
0     942
Name: count, dtype: int64

In [8]:
# Feature matrix and target; `estimate_of_risk` is removed together with the target.
X = df_heloc_clean.drop(columns=['is_at_risk', 'estimate_of_risk'])
y = df_heloc_clean['is_at_risk']

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Min-max scaling (flagged by the author as crucial for FACE), fitted on the
# training split only. The fitted scaler is kept so the transformation can be
# inverted on the counterfactuals later.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Fit a seeded 100-tree random forest on the scaled training features.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Evaluate on the held-out test split.
y_pred = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia: {test_accuracy:.2f}")
print("\nRelatório de Classificação:")
print(classification_report(y_test, y_pred))

Acurácia: 0.72

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.71      0.44      0.55       189
           1       0.73      0.89      0.80       312

    accuracy                           0.72       501
   macro avg       0.72      0.67      0.67       501
weighted avg       0.72      0.72      0.70       501



In [10]:
# Analysis frame for the test split: unscaled features plus true and predicted labels.
test_results = X_test.assign(true_label=y_test.values, predicted_label=y_pred)

# Keep the instances the model predicted as at-risk (1); these are the ones
# we want to move to class 0.
predicted_at_risk = test_results['predicted_label'].eq(1)
candidates_for_explanation = test_results.loc[predicted_at_risk]

print(f"Total de candidatos para gerar contrafatuais: {len(candidates_for_explanation)}")

Total de candidatos para gerar contrafatuais: 383


In [11]:
# Strip the two label columns and rescale the candidates with the already-fitted
# scaler; the result is the scaled array handed to the FACE experiments.
clients_at_risk = scaler.transform(
    candidates_for_explanation.drop(columns=['true_label', 'predicted_label'])
)

In [12]:
# Stack the scaled train and test features row-wise (train rows first).
X_full_scaled = np.concatenate((X_train_scaled, X_test_scaled), axis=0)

# Ground-truth labels in the same train-then-test order.
y_full = np.concatenate((y_train.to_numpy(), y_test.to_numpy()))

## Experimentos com o FACE

In [13]:
# Build the FACE explainer over the combined train+test scaled features and their true labels.
face_explainer = FACE(model, X_full_scaled, y_full, n_neighbors=20)

# Run the experiment for the candidate at position 1 of `clients_at_risk`.
# NOTE(review): judging by how the results are used afterwards, this returns a per-step
# table and the path in scaled space — confirm against `face.run_face_experiment`.
df_steps, path_scaled = run_face_experiment(face_explainer, clients_at_risk, scaler, X.columns, instance_idx=1)

--- Experimento com Instância Index 1 ---
Sucesso! Caminho encontrado com 3 passos.
                                          Diferença (Δ)
months_since_first_trade                          -18.0
months_since_last_trade                            -6.0
average_duration_of_resolution                     16.0
number_of_satisfactory_trades                       9.0
percentage_of_legal_trades                          7.0
months_since_last_illegal_trade                     2.0
maximum_illegal_trades_over_last_year               2.0
maximum_illegal_trades                              2.0
nr_total_trades                                     9.0
nr_trades_initiated_in_last_year                    3.0
percentage_of_installment_trades                  -14.0
nr_inquiries_in_last_6_months                      -4.0
nr_inquiries_in_last_6_months_not_recent           -4.0
net_fraction_of_revolving_burden                  -34.0
net_fraction_of_installment_burden                  7.0
nr_revolving_trades_

In [14]:
# Model class probabilities at each step of the path stored in `path_scaled`.
# NOTE(review): `print` produces a plain-text table; ending the cell with the bare
# name would presumably give a rich table — confirm the helper's return type first.
probas_caminho = get_path_probabilities(model, path_scaled)
print(probas_caminho)

   Passo  Prob_Aprovado (0)  Prob_Risco (1)
0      0               0.16            0.84
1      1               0.12            0.88
2      2               0.75            0.25


In [15]:
# Display the per-step feature values of the most recent experiment
# (the values shown appear to be in original, unscaled units).
df_steps

Unnamed: 0,months_since_first_trade,months_since_last_trade,average_duration_of_resolution,number_of_satisfactory_trades,nr_trades_insolvent_for_over_60_days,nr_trades_insolvent_for_over_90_days,percentage_of_legal_trades,months_since_last_illegal_trade,maximum_illegal_trades_over_last_year,maximum_illegal_trades,...,percentage_of_installment_trades,months_since_last_inquiry_not_recent,nr_inquiries_in_last_6_months,nr_inquiries_in_last_6_months_not_recent,net_fraction_of_revolving_burden,net_fraction_of_installment_burden,nr_revolving_trades_with_balance,nr_installment_trades_with_balance,nr_banks_with_high_ratio,percentage_trades_with_balance
0,336.0,13.0,93.0,17.0,1.0,1.0,89.0,4.0,2.0,4.0,...,33.0,0.0,4.0,4.0,65.0,53.0,4.0,2.0,2.0,75.0
1,340.0,3.0,86.0,21.0,1.0,0.0,91.0,2.0,3.0,5.0,...,17.0,0.0,2.0,2.0,50.0,64.0,6.0,2.0,3.0,67.0
2,318.0,7.0,109.0,26.0,1.0,1.0,96.0,6.0,4.0,6.0,...,19.0,0.0,0.0,0.0,31.0,60.0,8.0,3.0,1.0,58.0


### Adicionar restrições

In [16]:
# Constraints are keyed by column position in X, so keep the ordered name list.
feature_names = X.columns.tolist()

# Features the user is expected to INCREASE.
increasing_features = [
    'months_since_first_trade',
    'months_since_last_trade',
    'number_of_satisfactory_trades',
    'percentage_of_legal_trades',
    'months_since_last_illegal_trade',
    'nr_total_trades',
    'months_since_last_inquiry_not_recent',
]

# Features the user is expected to DECREASE.
decreasing_features = [
    'nr_trades_insolvent_for_over_60_days',
    'nr_trades_insolvent_for_over_90_days',
    'maximum_illegal_trades_over_last_year',
    'maximum_illegal_trades',
    'nr_trades_initiated_in_last_year',
    'nr_inquiries_in_last_6_months',
    'nr_inquiries_in_last_6_months_not_recent',
    'net_fraction_of_revolving_burden',
    'net_fraction_of_installment_burden',
    'nr_revolving_trades_with_balance',
    'nr_installment_trades_with_balance',
    'nr_banks_with_high_ratio',
    'percentage_trades_with_balance',
    'average_duration_of_resolution',
]

# Map each listed feature's column index to its allowed direction of change.
# Features that appear in neither list get no entry.
constraints = {}
for direction, names in (
    ("increasing", increasing_features),
    ("decreasing", decreasing_features),
):
    for name in names:
        constraints[feature_names.index(name)] = direction

In [17]:
# Constrained FACE explainer (mode='edge') using the direction constraints in `constraints`.
# Labels are passed as a plain NumPy array, as for the unconstrained explainer, so that
# they line up positionally with the rows of `X_train_scaled`. `y_train` itself is a
# pandas Series that still carries the shuffled, non-contiguous row labels of
# `df_heloc_clean`, so integer lookups on it would be label-based rather than positional.
# NOTE(review): the reference data here is the training split only, whereas the
# unconstrained explainer is built on train + test — confirm this is intentional.
face_explainer_restricted = FACE(
    model,
    X_train_scaled,
    y_train.values,
    constraints=constraints,
    n_neighbors=100,
    mode='edge',
)

# Run the constrained experiment for the candidate at position 10 of `clients_at_risk`.
df_steps, path_scaled = run_face_experiment(face_explainer_restricted, clients_at_risk, scaler, X.columns, instance_idx=10)

--- Experimento com Instância Index 10 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.


In [18]:
# Scan the candidates in order and stop at the first one for which the constrained
# (mode='edge') explainer returns a path with more than one point.
for i in range(len(clients_at_risk)):
    res, path = run_face_experiment(face_explainer_restricted, clients_at_risk, scaler, X.columns, instance_idx=i)
    if path is not None and len(path) > 1:
        print(f"Sucesso encontrado no índice {i}!")
        break
else:
    # Runs only when the loop finishes without `break`, i.e. no candidate produced
    # a path; say so explicitly instead of ending silently.
    print(f"Nenhum caminho encontrado para nenhum dos {len(clients_at_risk)} candidatos.")

--- Experimento com Instância Index 0 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 1 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 2 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 3 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 4 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 5 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 6 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 7 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 8 ---
Resultado: Nenhum caminho encontrado d

In [19]:
# Constrained FACE explainer (mode='endpoint') using the direction constraints in `constraints`.
# Labels are passed as a plain NumPy array, as for the unconstrained explainer, so that
# they line up positionally with the rows of `X_train_scaled`. `y_train` itself is a
# pandas Series that still carries the shuffled, non-contiguous row labels of
# `df_heloc_clean`, so integer lookups on it would be label-based rather than positional.
# NOTE(review): the reference data here is the training split only, whereas the
# unconstrained explainer is built on train + test — confirm this is intentional.
face_explainer_endpointRestricted = FACE(
    model,
    X_train_scaled,
    y_train.values,
    n_neighbors=100,
    constraints=constraints,
    mode='endpoint'
)

# Run the constrained experiment for the candidate at position 100 of `clients_at_risk`.
df_steps, path_scaled = run_face_experiment(
    face_explainer_endpointRestricted,
    clients_at_risk,
    scaler,
    X.columns,
    instance_idx=100
)

--- Experimento com Instância Index 100 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.


In [20]:
# Scan the candidates in order and stop at the first one for which the constrained
# (mode='endpoint') explainer returns a path with more than one point.
for i in range(len(clients_at_risk)):
    res, path = run_face_experiment(face_explainer_endpointRestricted, clients_at_risk, scaler, X.columns, instance_idx=i)
    if path is not None and len(path) > 1:
        print(f"Sucesso encontrado no índice {i}!")
        # Publish the successful result under the names the follow-up cells read,
        # so they describe this path rather than an earlier failed experiment.
        df_steps, path_scaled = res, path
        break
else:
    # Runs only when the loop finishes without `break`, i.e. no candidate produced
    # a path; say so explicitly instead of ending silently.
    print(f"Nenhum caminho encontrado para nenhum dos {len(clients_at_risk)} candidatos.")

--- Experimento com Instância Index 0 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 1 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 2 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 3 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 4 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 5 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 6 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 7 ---
Resultado: Nenhum caminho encontrado dentro das restrições de densidade.
--- Experimento com Instância Index 8 ---
Resultado: Nenhum caminho encontrado d

In [21]:
# Model class probabilities along the path currently stored in `path_scaled`.
# NOTE(review): `path_scaled` is not assigned in this cell. If the last experiment
# that set it found no path there is nothing to score — the recorded run printed
# "Nenhum caminho fornecido." here.
probas_caminho = get_path_probabilities(model, path_scaled)
print(probas_caminho)

Nenhum caminho fornecido.
