In [15]:
!pip install Optuna
import numpy as np
import pandas as pd
import optuna
from sklearn.preprocessing import MinMaxScaler
import xgboost
from sklearn.preprocessing import LabelEncoder

from utils import encode_features, get_train_test_data, train_model, evaluate_model, generate_individual, epsilon_rounding, get_relevant_candidates

optuna.logging.set_verbosity(optuna.logging.WARNING)

Collecting Optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from Optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, Optuna
Successfully installed Optuna-4.6.0 colorlog-6.10.1


ModuleNotFoundError: No module named 'utils'

## Data

In [8]:
def load_data(data_filepath="../data/Loan_data_extracted.csv"):
    """
    Read the loan data and build the feature configuration used by the
    counterfactual search.

    Input: path (or URL) to the .csv data file

    The 'Loan_ID' column is removed and every row containing a missing value
    is discarded. The original row index is kept as-is (it is not reset).

    TODO: specify in feature_info whether features are of type:
        fixed, meaning cannot change for the counterfactual
        unique, meaning can only take existing categorical values
        increase, meaning their value can only increase and not decrease
        range, meaning their new value can take a range of values

    Returns:
        dataframe and feature configuration dictionary with the keys
            "categorical": categorical columns including the 'Loan_Status' column
            "feature_info": one (feature name, change type) tuple per feature
            "categorical_features": categorical columns without 'Loan_Status'
    """
    # Drop the identifier first, then remove incomplete rows.
    loans = pd.read_csv(data_filepath).drop(columns='Loan_ID').dropna()

    # One entry per feature; replace each 'todo' with the allowed change type.
    feature_info = [
        ('Gender', 'todo'),
        ('Married', 'todo'),
        ('Dependents', 'todo'),
        ('Education', 'todo'),
        ('Self_Employed', 'todo'),
        ('ApplicantIncome', 'todo'),
        ('CoapplicantIncome', 'todo'),
        ('LoanAmount', 'todo'),
        ('Loan_Amount_Term', 'todo'),
        ('Credit_History', 'todo'),
        ('Property_Area', 'todo'),
    ]
    categorical_features = ["Gender", "Married", "Education", "Self_Employed", "Property_Area"]

    feature_config = {
        # Same list as "categorical_features" plus the Loan_Status column.
        "categorical": categorical_features + ["Loan_Status"],
        "feature_info": feature_info,
        "categorical_features": categorical_features,
    }

    return loans, feature_config

## Model

In [None]:
# Load the model from the saved file
# NOTE: "xgboost_model.json" is a relative path, so it is resolved against the
# directory the notebook kernel is running in.
model = xgboost.XGBClassifier()
model.load_model("xgboost_model.json")

## TODO: Code for counterfactual search

In [None]:
def misfit(x_prime, y_target, model):
    """
    Optimisation criterion 1
    Absolute gap between the desired prediction and the model's prediction
    for the candidate.

    Input:
        x_prime: candidate passed unchanged to the model; only the first
                 prediction is used
        y_target: desired model output
        model: object exposing predict_proba (second column is used) or,
               failing that, predict

    Returns:
        |y_target - prediction for the first row|
    """
    if hasattr(model, 'predict_proba'):
        # Second column of predict_proba (index 1).
        scores = model.predict_proba(x_prime)[:, 1]
    else:
        scores = model.predict(x_prime)
    return np.abs(y_target - scores[0])

In [1]:
def distance(X, x, x_prime, numerical, categorical):
    """
    Optimisation criterion 2
    Distance between the candidate x_prime and the original point x.

    Input:
        X: observed data, used only to fit the min-max scaling of the
           numerical columns
        x, x_prime: dataframes holding the original point and the candidate
        numerical: names of the numerical columns
        categorical: names of the categorical columns

    Returns:
        Euclidean norm of the min-max scaled numerical difference plus the
        number of categorical entries that differ.
    """
    # Scale with the ranges seen in X so every numerical feature is comparable.
    scaler = MinMaxScaler().fit(X[numerical])
    scaled_gap = scaler.transform(x[numerical]) - scaler.transform(x_prime[numerical])
    euclidean_part = np.linalg.norm(scaled_gap)

    # Count of categorical positions whose values differ (Hamming distance).
    mismatches = x[categorical].values != x_prime[categorical].values
    hamming_part = np.sum(mismatches)

    return euclidean_part + hamming_part

In [2]:
def sparsity(x, x_prime):
    """
    Optimisation criterion 3
    Return the number of features that are unchanged between x and x_prime.

    A larger count means a sparser counterfactual. The count is returned with
    a positive sign because get_counterfactuals declares this objective with
    direction 'maximize'; handing it the negated count would make the search
    reward candidates that change as many features as possible.

    Input:
        x, x_prime: dataframes with the same layout; compared element-wise
                    through their .values

    Returns:
        int, number of entries that are equal in x and x_prime
    """
    unchanged = (x.values == x_prime.values).sum()
    # Plain int keeps the objective value independent of the numpy scalar type.
    return int(unchanged)

In [3]:
def closest_real(X, x_prime, categorical, numerical):
    """
    Optimisation criterion 4
    Smallest distance between the candidate x_prime and any row of X.

    Input:
        X: observed data
        x_prime: candidate dataframe (compared against every row of X)
        categorical: names of the categorical columns
        numerical: names of the numerical columns

    Returns:
        minimum over the rows of X of (Euclidean distance on the min-max
        scaled numerical columns + number of differing categorical values)
    """
    scaler = MinMaxScaler()
    observed_scaled = scaler.fit_transform(X[numerical])
    candidate_scaled = scaler.transform(x_prime[numerical])

    # One distance per observed row: numerical part plus Hamming part.
    per_row = (
        np.linalg.norm(observed_scaled - candidate_scaled, axis=1)
        + (X[categorical].values != x_prime[categorical].values).sum(axis=1)
    )
    return np.min(per_row)

In [4]:
def objective(trial, X, x, features, model, y_target, numerical, categorical):
    """
    Build one candidate counterfactual for an Optuna trial and score it.

    Input:
        trial: Optuna trial handed to each feature's sample method
        X: observed data
        x: original data point (left untouched; the candidate is a copy)
        features: objects exposing sample(trial), name and value
        model: model queried by misfit
        y_target: desired model prediction
        numerical, categorical: column name lists for the distance measures

    Returns:
        tuple (misfit, distance to x, sparsity, distance to closest real point)
    """
    candidate = x.copy()
    for feat in features:
        feat.sample(trial)
        candidate[feat.name] = feat.value
    # Return value is not used, so this presumably adjusts the candidate in
    # place — confirm against utils.epsilon_rounding.
    epsilon_rounding(x, candidate, 1e-1)

    # Evaluated in the order in which the study declares its directions.
    return (
        misfit(candidate, y_target, model),
        distance(X, x, candidate, numerical, categorical),
        sparsity(x, candidate),
        closest_real(X, candidate, categorical, numerical),
    )

In [5]:
def get_counterfactuals(X, x, y_target, model,
                        numerical, categorical, features,
                        tol, optimization_steps, timeout):
    """
    Run a multi-objective Optuna study (NSGA-II sampler, seed 42) and return
    the candidates selected by get_relevant_candidates.

    Input:
        X: observed data
        x: original data point
        y_target: desired model prediction
        model: model to explain
        numerical, categorical: column name lists for the distance measures
        features: feature objects sampled in every trial
        tol: forwarded to get_relevant_candidates
        optimization_steps: number of trials
        timeout: time limit forwarded to study.optimize (None for no limit)

    The four directions match the order of the values returned by objective:
    the first, second and fourth are minimised and the third (sparsity) is
    maximised, so sparsity must yield a larger-is-better value.

    Returns:
        whatever get_relevant_candidates returns for the finished study
    """

    def _score(trial):
        return objective(trial, X, x, features, model,
                         y_target, numerical, categorical)

    nsga2 = optuna.samplers.NSGAIISampler(seed=42)
    study = optuna.create_study(
        directions=['minimize', 'minimize', 'maximize', 'minimize'],
        sampler=nsga2,
    )
    study.optimize(_score, n_trials=optimization_steps, timeout=timeout)

    return get_relevant_candidates(study, x, model, y_target, tol)

## Provided datapoint and data

In [12]:
def encode_features(df, categorical):
    """
    Input:
        df: dataframe containing feature names in columns
        categorical: list of all categorical feature names.

    Custom encoding function.
    Education is encoded with a fixed mapping (Not Graduate -> 0,
    Graduate -> 1) to ensure correct boundaries when constructing
    counterfactuals. The remaining categorical features are encoded using
    LabelEncoder. The chosen encodings are printed.

    The columns are overwritten on the dataframe that was passed in.

    Returns:
        modified dataframe (the same object as df)
    """

    print("--- \nEncoded categorical features as follows:")
    encoders = {}

    for column in categorical:
        if column != "Education":
            encoder = LabelEncoder()
            df[column] = encoder.fit_transform(df[column])
            encoders[column] = encoder
            continue
        # Fixed order so that a higher code means a higher education level.
        education_codes = {"Not Graduate": 0, "Graduate": 1}
        df[column] = df[column].map(education_codes)
        encoders[column] = education_codes

    # Report the label -> code mapping of every encoded column.
    for column in categorical:
        if column == "Education":
            shown = encoders[column]
        else:
            shown = {label: idx for idx, label in enumerate(encoders[column].classes_)}
        print(column, ": ", shown)

    print('---')
    return df

In [13]:
# Read the loan data from the GitHub-hosted CSV and get the feature configuration.
X_obs, feat_conf = load_data("https://raw.githubusercontent.com/HayBeeCoder/AIMS_XAI_Inga/refs/heads/main/data/Loan_data_extracted.csv")
# Alternative: read the same file from a local checkout instead.
# X_obs, feat_conf = load_data("../data/Loan_data_extracted.csv")
# Encode every column listed under "categorical" (that list includes Loan_Status).
X_obs = encode_features(X_obs, feat_conf["categorical"])

--- 
Encoded categorical features as follows:


NameError: name 'LabelEncoder' is not defined

In [6]:
# Already-encoded values for a single applicant, one per column of X_obs
# except the last one.
customer = np.array([0,1,0,0,0,2000,1500,1000,480,0,1])
# columns[:-1] drops the final column of X_obs — presumably the Loan_Status
# target; confirm against the column order of the CSV.
x = pd.DataFrame([customer], columns=X_obs.columns[:-1].tolist())

In [None]:
# Inspect the single-row customer dataframe built above.
x

In [None]:
# Check that our customer x did not get the loan.
# Use the second column of predict_proba when the model offers it, otherwise
# fall back to the plain prediction.
if hasattr(model, 'predict_proba'):
    loan_pred = model.predict_proba(x)[:, 1]
else:
    loan_pred = model.predict(x)

print(f"Loan approval probability: {loan_pred[0]:.2f}")
# 0.5 is the decision threshold between rejection and approval.
print("Customer did NOT get the loan." if loan_pred[0] < 0.5 else "Customer got the loan.")

# and help her find out what she has to do in order to get the loan
# If you have implemented everything above correctly, the code below
# will find the counterfactuals

## Search for counterfactuals

In [None]:
# Make a list of Feature objects containing information about how
# each feature is allowed to change when generating counterfactuals.
# NOTE(review): the entries of feat_conf["feature_info"] built in load_data are
# still the 'todo' placeholders; fill them in before running this cell.
change_features = generate_individual(X_obs, x, feat_conf["feature_info"])

In [None]:
# Set the desired new model prediction
y_CF = 0.7
print(f"Searching for counterfactuals with y_CF = {y_CF}...\n")
# Numerical features are all columns of the observed data that are not listed
# as categorical. feat_conf["categorical"] also contains Loan_Status, so that
# column is excluded here as well. (The loop variable is not called `x` to
# avoid confusion with the customer dataframe.)
numerical_features = [col for col in X_obs.columns if col not in feat_conf["categorical"]]
CFS = get_counterfactuals(X_obs, x, y_CF, model,
                          numerical_features,
                          feat_conf["categorical_features"],
                          change_features,
                          tol=0.05,
                          optimization_steps=500,
                          timeout=None)

In [None]:
# Original customer, shown again for comparison with the counterfactuals.
x

In [None]:
# Candidates returned by get_counterfactuals.
CFS