In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import dice_ml
import joblib

## Setup

In [16]:
model = joblib.load('./data/credit_scoring_model.joblib')
scaler = joblib.load('./data/credit_scaler.joblib')
feature_names = joblib.load('./data/feature_names.joblib')

clients_at_risk = pd.read_csv('./data/clients_at_risk.csv')
train_scaled_df = pd.read_csv('./data/train_scaled.csv')

X_train_scaled_df = train_scaled_df.drop('SeriousDlqin2yrs', axis=1)
y_train = train_scaled_df['SeriousDlqin2yrs']

## Experimento com DICE

In [17]:
def translate_dice_deltas(exp_results, query_instance, scaler, feature_names):
    """
    Exibe a diferença exata (+/-) necessária para atingir o contrafatual.
    """

    # 1. Extrair e desescalonar os contrafatuais
    cf_df = exp_results.cf_examples_list[0].final_cfs_df
    features_scaled = cf_df.drop('SeriousDlqin2yrs', axis=1)
    features_raw = scaler.inverse_transform(features_scaled)
    df_cfs = pd.DataFrame(features_raw, columns=feature_names)
    
    # 2. Desescalonar a query_instance original
    query_raw = scaler.inverse_transform(query_instance)
    df_query = pd.DataFrame(query_raw, columns=feature_names)
    
    # 3. Criar DataFrame de Deltas (Diferenças)
    df_diff = df_cfs.copy().astype(object)
    
    for col in feature_names:
        original_val = df_query[col].values[0]
        cf_val = df_cfs[col].values
        
        # Calcula a diferença
        diff = cf_val - original_val
        
        # Aplica a formatação baseada na mudança
        formatted_col = []
        for d in diff:
            if np.isclose(d, 0, atol=1e-5):
                formatted_col.append("-") # Sem mudança
            elif d > 0:
                formatted_col.append(f"+{d:.2f}") # Aumento
            else:
                formatted_col.append(f"{d:.2f}") # Diminuição (o sinal de - já vem no float)
        
        df_diff[col] = formatted_col
        
    # Adicionar o status de aprovação
    df_diff['Inadimplente'] = cf_df['SeriousDlqin2yrs'].values
    
    return df_diff

def color_deltas(val):
    if isinstance(val, str):
        if '+' in val: return 'color: green'
        if '-' in val and val != '-': return 'color: red'
    return ''

In [None]:
# --- PASSO 1: Preparação dos Dados Escalonados ---
dice_data = dice_ml.Data(
    dataframe=train_scaled_df, 
    continuous_features=feature_names,
    outcome_name='SeriousDlqin2yrs'
)

# --- PASSO 2: Configuração do Modelo ---
dice_model = dice_ml.Model(model=model, backend="sklearn")

In [19]:
to_remove = ['age', 'NumberOfDependents', 'NumberOfTimes90DaysLate']
features_to_vary = [f for f in feature_names if f not in to_remove]

print(features_to_vary)

['RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse']


In [32]:
import logging

def generate_batch_with_dynamic_range(
    exp, 
    query_instances, 
    scaler, 
    feature_names, 
    features_to_vary, 
    num_cfs=2, 
    apply_constraints=True
):
    all_explanations = []
    failed_count = 0
    income_idx = feature_names.index('MonthlyIncome')
    
    # Silencia avisos desnecessários do DiCE durante o processamento em lote
    logging.getLogger().setLevel(logging.ERROR)

    for i in range(len(query_instances)):
        instance = query_instances.iloc[i:i+1]
        p_range = None
        
        try:
            if apply_constraints:
                # 1. Cálculo dinâmico para Renda (+10%)
                query_real = scaler.inverse_transform(instance)
                current_income = query_real[0][income_idx]
                max_income_real = current_income * 1.1
                
                dummy_point = query_real.copy()
                dummy_point[0][income_idx] = max_income_real
                max_income_scaled = scaler.transform(dummy_point)[0][income_idx]

                val_30_59 = instance['NumberOfTime30-59DaysPastDueNotWorse'].values[0]
                val_60_89 = instance['NumberOfTime60-89DaysPastDueNotWorse'].values[0]
                
                p_range = {
                    'MonthlyIncome': [instance['MonthlyIncome'].values[0], max_income_scaled],
                    'NumberOfTime30-59DaysPastDueNotWorse': [0, val_30_59],
                    'NumberOfTime60-89DaysPastDueNotWorse': [0, val_60_89]
                }
            
            # 2. Tentativa de geração
            dice_exp = exp.generate_counterfactuals(
                instance,
                total_CFs=num_cfs,
                desired_class=0,
                features_to_vary=features_to_vary,
                permitted_range=p_range
            )
            
            # 3. Verificação de validade do resultado
            # O DiCE às vezes retorna um objeto sem CFs em vez de lançar erro
            if dice_exp is None or dice_exp.cf_examples_list[0].final_cfs_df is None:
                failed_count += 1
                all_explanations.append(None)
            else:
                all_explanations.append(dice_exp)

        except Exception as e:
            # Captura erros como UserConfigValidationException (comum no KDTree)
            failed_count += 1
            all_explanations.append(None)
            print(f"Aviso: Falha na instância {i}. Motivo: {type(e).__name__}")
    
    return all_explanations, failed_count

## Análise Quantitativa

In [39]:
def perform_quantitative_analysis(exp_results_list, query_instances, scaler, feature_names):
    """
    Analisa os resultados ignorando falhas (None) e calculando métricas de esforço absoluto.
    """
    metrics = []
    total_queries = len(query_instances)
    
    for i, exp in enumerate(exp_results_list):
        # 1. Obter a query original (essencial para o cálculo de distância)
        query_scaled = query_instances.iloc[i:i+1]
        
        # --- VERIFICAÇÃO DE ROBUSTEZ ---
        # Pula se o resultado for None (falha no KDTree ou exceção capturada)
        if exp is None:
            continue
            
        # Tenta acessar a lista de exemplos; pula se estiver vazia ou malformada
        if not hasattr(exp, 'cf_examples_list') or not exp.cf_examples_list:
            continue
            
        cf_example = exp.cf_examples_list[0]
        
        # Verifica se o DataFrame de contrafatuais existe e não está vazio
        if cf_example.final_cfs_df is None or cf_example.final_cfs_df.empty:
            continue 
            
        # 2. Extração de Dados (Escalonados e Reais)
        query_raw = scaler.inverse_transform(query_scaled)
        
        # Remove a coluna target para calcular a distância apenas nas features
        cf_df_scaled = cf_example.final_cfs_df.drop('SeriousDlqin2yrs', axis=1).iloc[0:1]
        cf_raw = scaler.inverse_transform(cf_df_scaled)
        
        # 3. CÁLCULO DAS MÉTRICAS
        # Proximidade L1 (Distância Manhattan)
        l1_dist = np.sum(np.abs(query_scaled.values - cf_df_scaled.values))
        
        # Sparsity (Número de colunas que sofreram alteração significativa)
        sparsity = np.sum(np.abs(query_scaled.values - cf_df_scaled.values) > 1e-5)
        
        # Delta de Renda (Valor Real)
        income_idx = feature_names.index('MonthlyIncome')
        delta_income = cf_raw[0][income_idx] - query_raw[0][income_idx]
        
        metrics.append({
            'ID': i,
            'Success': 1,
            'L1_Distance': l1_dist,
            'Sparsity': sparsity,
            'Delta_Income_Real': delta_income
        })
    
    # Se nenhuma métrica foi coletada (falha total do batch)
    if not metrics:
        return pd.DataFrame(), {
            'Success_Rate': 0, 
            'Avg_L1_Dist': 0, 
            'Avg_Sparsity': 0, 
            'Avg_Magnitude_Income': 0
        }

    df_metrics = pd.DataFrame(metrics)
    
    # 4. RESUMO ESTATÍSTICO
    summary = {
        # Sucesso relativo ao total de tentativas (batch original)
        'Success_Rate': (len(df_metrics) / total_queries) * 100,
        
        # Médias calculadas apenas sobre os casos de sucesso
        'Avg_L1_Dist': df_metrics['L1_Distance'].mean(),
        'Avg_Sparsity': df_metrics['Sparsity'].mean(),
        
        # Magnitude absoluta: esforço médio de mudança na renda
        'Avg_Magnitude_Income': df_metrics['Delta_Income_Real'].abs().mean()
    }
    
    return df_metrics, summary

In [27]:
to_remove = ['age', 'NumberOfDependents', 'NumberOfTimes90DaysLate']
features_to_vary = [f for f in feature_names if f not in to_remove]

batch_risk = pd.DataFrame(
    clients_at_risk.iloc[0:50], 
    columns=feature_names
).astype(X_train_scaled_df.dtypes.to_dict())

In [None]:
dice_exp = dice_ml.Dice(dice_data, dice_model, method="genetic")

# permitir variação em todas as features
results_raw, n_fails_raw = generate_batch_with_dynamic_range(dice_exp, batch_risk,scaler, 
                                                feature_names, features_to_vary=feature_names, num_cfs=1, apply_constraints=False)

100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
100%|██████████| 1/1 [00:00<00:00,  4.27it/s]
100%|██████████| 1/1 [00:00<00:00,  4.36it/s]
100%|██████████| 1/1 [00:00<00:00,  4.43it/s]
100%|██████████| 1/1 [00:00<00:00,  4.37it/s]
100%|██████████| 1/1 [00:00<00:00,  4.45it/s]
100%|██████████| 1/1 [00:00<00:00,  4.36it/s]
100%|██████████| 1/1 [00:00<00:00,  4.30it/s]
100%|██████████| 1/1 [00:33<00:00, 33.74s/it]
100%|██████████| 1/1 [00:00<00:00,  4.67it/s]
100%|██████████| 1/1 [00:00<00:00,  4.59it/s]
100%|██████████| 1/1 [00:00<00:00,  4.62it/s]
100%|██████████| 1/1 [00:00<00:00,  4.58it/s]
100%|██████████| 1/1 [00:00<00:00,  4.71it/s]
100%|██████████| 1/1 [00:00<00:00,  4.62it/s]
100%|██████████| 1/1 [00:00<00:00,  4.70it/s]
100%|██████████| 1/1 [00:00<00:00,  4.57it/s]
100%|██████████| 1/1 [00:00<00:00,  4.51it/s]
100%|██████████| 1/1 [00:00<00:00,  4.60it/s]
100%|██████████| 1/1 [00:00<00:00,  4.63it/s]
100%|██████████| 1/1 [00:00<00:00,  4.69it/s]
100%|██████████| 1/1 [00:00<00:00,

In [None]:
dice_exp = dice_ml.Dice(dice_data, dice_model, method="random")

results_constrained, n_fails_constrained = generate_batch_with_dynamic_range(dice_exp, batch_risk,scaler, 
                                                feature_names, features_to_vary, num_cfs=1, apply_constraints=True)

100%|██████████| 1/1 [00:00<00:00,  4.60it/s]
100%|██████████| 1/1 [00:00<00:00,  4.54it/s]
100%|██████████| 1/1 [00:00<00:00,  4.42it/s]
100%|██████████| 1/1 [00:00<00:00,  4.16it/s]
100%|██████████| 1/1 [00:00<00:00,  4.75it/s]
100%|██████████| 1/1 [00:00<00:00,  4.75it/s]
100%|██████████| 1/1 [00:00<00:00,  4.73it/s]
100%|██████████| 1/1 [00:00<00:00,  4.78it/s]
100%|██████████| 1/1 [00:00<00:00,  4.81it/s]
100%|██████████| 1/1 [00:00<00:00,  4.78it/s]
100%|██████████| 1/1 [00:00<00:00,  4.76it/s]
100%|██████████| 1/1 [00:00<00:00,  4.69it/s]
100%|██████████| 1/1 [00:00<00:00,  4.32it/s]
100%|██████████| 1/1 [00:00<00:00,  4.45it/s]
100%|██████████| 1/1 [00:00<00:00,  4.39it/s]
100%|██████████| 1/1 [00:00<00:00,  4.48it/s]
100%|██████████| 1/1 [00:34<00:00, 34.01s/it]
100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
100%|██████████| 1/1 [00:00<00:00,  4.41it/s]
100%|██████████| 1/1 [00:00<00:00,  4.71it/s]
100%|██████████| 1/1 [00:00<00:00,  4.58it/s]
100%|██████████| 1/1 [00:00<00:00,

In [None]:
dice_exp = dice_ml.Dice(dice_data, dice_model, method="kdtree")

results_kdtree, n_fails_kdtree = generate_batch_with_dynamic_range(dice_exp, batch_risk,scaler, 
                                                feature_names, features_to_vary, num_cfs=1, apply_constraints=True)

In [None]:
# 1. Analisar Cenário Sem Restrições
metrics_raw, summary_raw = perform_quantitative_analysis(results_raw, batch_risk, scaler, feature_names)

# 2. Analisar Cenário Com Restrições (Sua função generate_batch_with_dynamic_range)
metrics_constr, summary_constr = perform_quantitative_analysis(results_constrained, batch_risk, scaler, feature_names)

# 1. Analisar Cenário Com Kdtree
metrics_kdtree, summary_kdtree = perform_quantitative_analysis(results_kdtree, batch_risk, scaler, feature_names)

comparison_df = pd.DataFrame({
    'Métrica': ['Taxa de Sucesso (%)', 'Distância Média (L1)', 'Mudanças Médias (Sparsity)', 'Mudança Média de Renda (R$)'],
    'Sem Restrições': [summary_raw['Success_Rate'], summary_raw['Avg_L1_Dist'], summary_raw['Avg_Sparsity'], summary_raw['Avg_Magnitude_Income']],
    'Com Restrições': [summary_constr['Success_Rate'], summary_constr['Avg_L1_Dist'], summary_constr['Avg_Sparsity'], summary_constr['Avg_Magnitude_Income']],
    'kdtree': [summary_kdtree['Success_Rate'], summary_kdtree['Avg_L1_Dist'], summary_kdtree['Avg_Sparsity'], summary_kdtree['Avg_Magnitude_Income']]
})

display(comparison_df)

Unnamed: 0,Métrica,Sem Restrições,Com Restrições,kdtree
0,Taxa de Sucesso (%),100.0,100.0,46.0
1,Distância Média (L1),0.902326,0.429109,0.127025
2,Mudanças Médias (Sparsity),1.78,1.48,5.0
3,Mudança Média de Renda (R$),545497.842782,86.837298,3146.695629
