# Counterfactual explanations with one-hot encoded categorical variables

https://docs.seldon.io/projects/alibi/en/latest/examples/cfproto_cat_adult_ohe.html

In [None]:
def check_class( class_name, instance, label ):
  if( instance[class_name] == 1):
    prediction = "<span style='color:red'>" + label + "</span>"
  else:
    prediction = "<span style='color:green'>NOT " + label + "</span>"

  return prediction

# Patients features
def print_patients_data( data, indx ):
  
  instance = data[indx]

  prediction = check_class( "predictions", instance, "DIABETES" )
  ground_truth = check_class( "ground_truth", instance, "DIABETES" ) 

  result = "<h3>Analysing PATIENT = <span style='color:green'>" + str(indx) + "</span></h3><br/>" 
  result = result + "<p><b>Prediction: </b>" + prediction + "</p>&#9;&#9;<p><b>Correct Diagnosis: </b>" + ground_truth + "</p><br/>"

  temp = "<table>"
  temp = temp + "<tr><td><b>Data</b></td><td>&nbsp;&nbsp;&nbsp;&nbsp;</td><td><b>Values</b></td></tr><tr><td><hr></td><td></td><td><hr></td></tr>"
  for i in range(0, len(instance['original_vector'])):
    temp = temp + "<tr><td>" + feature_names[i] + "</td><td>&nbsp;&nbsp;&nbsp;&nbsp;</td><td>" + str(instance['original_vector'][i]) + "</td></td></tr>"
  temp = temp + "</table>"

  return HTML(result+temp+"<br/" )

In [3]:
# Install tensorflow
try:
    # tensorflow_version only exists in Colab
    %tensorflow_version 2.2.0
except Exception:
    pass

In [4]:
# install required libraries
!pip install pyagrum
!pip install lime
!pip install shap



In [6]:
pip install alibi

Collecting alibi
  Using cached alibi-0.5.8-py3-none-any.whl (312 kB)
Collecting spacy[lookups]<4.0.0,>=2.0.0
  Using cached spacy-3.0.6-cp38-cp38-manylinux2014_x86_64.whl (13.0 MB)
Collecting catalogue<2.1.0,>=2.0.3
  Using cached catalogue-2.0.4-py3-none-any.whl (16 kB)
Collecting blis<0.8.0,>=0.4.0
  Using cached blis-0.7.4-cp38-cp38-manylinux2014_x86_64.whl (9.8 MB)
Collecting typer<0.4.0,>=0.3.0
  Using cached typer-0.3.2-py3-none-any.whl (21 kB)
Collecting pathy>=0.3.5
  Using cached pathy-0.5.2-py3-none-any.whl (42 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Using cached pydantic-1.7.4-cp38-cp38-manylinux2014_x86_64.whl (12.3 MB)
Collecting cymem<2.1.0,>=2.0.2
  Using cached cymem-2.0.5-cp38-cp38-manylinux2014_x86_64.whl (35 kB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.5-cp38-cp38-manylinux2014_x86_64.whl (130 kB)
Collecting spacy-legacy<3.1.0,>=3.0.4
  Using cached spacy_legacy-3.0.5-py2.py3-none-any.whl (12 kB)
Collecting wasabi<1.1.0,>=0.8.1
  Using cached wasa

In [36]:
import tensorflow as tf
tf.get_logger().setLevel(40) # suppress deprecation messages
tf.compat.v1.disable_v2_behavior() # disable TF2 behaviour as alibi code still relies on TF1 constructs
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder
from time import time
from alibi.datasets import fetch_adult
from alibi.explainers import CounterFactualProto
from alibi.utils.mapping import ohe_to_ord, ord_to_ohe

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False

TF version:  2.4.1
Eager execution enabled:  False


In [37]:
from IPython.core.display import HTML
import numpy as np
import pandas as pd
import random as rn
import time

# current explanable algorithms
import lime
import shap
from lime import lime_tabular

# import auxiliary functions
from learning import *

%matplotlib inline

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [38]:
# name of dataset
#DATASET_NAME = "diabetes.csv"
# load dataset into a dataframe
dataset_path0 = "diabetes.csv"
data = pd.read_csv( dataset_path0 )
# variable containing the class labels in this case the dataset contains:
# 0 - if not diabetes
# 1 - if diabetes
class_var = "Outcome"

# load dataset
#dataset_path = PATH + "datasets/" + DATASET_NAME
#data = pd.read_csv( dataset_path )

# features
feature_names = data.drop([class_var], axis=1).columns.to_list()
print("Features")
print(feature_names)

# balance dataset
sampled_data = data.sample(frac=1)
sampled_data = sampled_data[ sampled_data["Outcome"] == 0]

no_data = sampled_data.sample(frac=1)[0:268]
yes_data = data[ data["Outcome"] == 1]

balanced_data = [no_data,yes_data]
balanced_data = pd.concat(balanced_data)
balanced_data

Features
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
204,6,103,72,32,190,37.7,0.324,55,0
717,10,94,72,18,0,23.1,0.595,56,0
720,4,83,86,19,0,29.3,0.317,34,0
315,2,112,68,22,94,34.1,0.315,26,0
505,10,75,82,0,0,33.3,0.263,38,0
...,...,...,...,...,...,...,...,...,...
755,1,128,88,39,110,36.5,1.057,37,1
757,0,123,72,0,0,36.3,0.258,52,1
759,6,190,92,0,0,35.5,0.278,66,1
761,9,170,74,31,0,44.0,0.403,43,1


In [39]:
pip install 'h5py==2.10.0' --force-reinstall

Collecting h5py==2.10.0
  Using cached h5py-2.10.0-cp38-cp38-manylinux1_x86_64.whl (2.9 MB)
Collecting six
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Collecting numpy>=1.7
  Using cached numpy-1.20.3-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.4 MB)
Installing collected packages: six, numpy, h5py
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Uninstalling six-1.16.0:
      Successfully uninstalled six-1.16.0
  Attempting uninstall: numpy
    Found existing installation: numpy 1.20.3
    Uninstalling numpy-1.20.3:
      Successfully uninstalled numpy-1.20.3
  Attempting uninstall: h5py
    Found existing installation: h5py 2.10.0
    Uninstalling h5py-2.10.0:
      Successfully uninstalled h5py-2.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires numpy~=1.19.2, but you hav

In [41]:
# creates a dictionary with the following information:
# ground_truth
# index
# original_vector
# scaled_vector
# prediction_type
#local_data_dict = generate_local_predictions( X_test, Y_test, model, scaler, encoder )

# separates vectors into true positives, true negatives
# false positives and false negatives
#true_positives,true_negatives, false_positives, false_negatives = wrap_information( local_data_dict )

# add class variable to the feature list
#feature_names.append("Diabetes?")

In [42]:
def generate_report_template():
  rep = HTML("<h2>REPORT</h2>" + 
      "<p style='font-size:120%;'>Knowing about the patient's <span style='color:red'>Glucose</span> levels makes <span style='color:red'>SkinThickness</span>, <span style='color:red'>Insulin</span>, " +
      "<span style='color:red'>BMI</span>, <span style='color:red'>Age</span> and <span style='color:red'>DiabetesPedigreeFunction</span><br>"+
      "NOT RELEVANT to assess <span style='color:red'>Diabetes</span>.</p>"+
      "<br/><p style='font-size:120%;'>The features CONTRIBUTING for <span style='color:red'>Diabetes</span> are <span style='color:red'>BloodPressure</span>" +
     " and <span style='color:red'>Pregnancies</span>.</p>"+
     "<br/><p style='font-size:120%;'><b>Recommendation:</b> Given the confidence of the explanation (100%), the model is <span style='color:red'><b>certain that the patient has Diabetes</b></span></p>")
  return rep

In [43]:
def generate_explanations(data_class, indx, highlight_class = "No"):
  instance = data_class[indx]

  [bn, inference, infoBN, markov_blanket] = generate_BN_explanationsMB(instance, label_lst, feature_names, class_var, 
                                                                       encoder, scaler, model, PATH, DATASET_NAME, variance = 0.1)
  if( highlight_class == "Yes"):
    inf1 = gnb.getInference(bn, evs={class_var : "Yes"}, targets=feature_names_cp)
    gnb.sideBySide(*[inf1, markov_blanket, infoBN ], captions=[ "Inference", "Markov Blanket", "Information BN" ])
  else:
    gnb.sideBySide(*[inference, markov_blanket, infoBN ], captions=[ "Inference", "Markov Blanket", "Information BN" ])

## Generating Explanations with Counterfactual

In [44]:
# variable we want to predict
class_var = "Outcome"

# load dataset
data = pd.read_csv( dataset_path0 )
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [45]:
# how many features and how many datapoints does this dataset have?
num_rows = data.shape[0]
num_columns = data.shape[1]
print("There are a total of %d data records with %d features\n" %(num_rows,num_columns))

# what are the features in this dataset?
feature_names = data.columns.to_list()
feature_names.remove('Outcome')
#feature_names = data.columns[:13].tolist()
print(feature_names)
class_var = "Outcome"
# what are the different heart diseases that we have in this dataset?
labels = data[class_var].unique().tolist()
print(labels)

There are a total of 768 data records with 9 features

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
[1, 0]


In [46]:
# check the distribution of the different heart diseases in the dataset
data.groupby(class_var).count()
#Note1: Groupby "target"
#Note2: Why is it important? Ensure there is less bias from the data set(example: 50/50 for heart disease or non-heart disease)

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,500,500,500,500,500,500,500,500
1,268,268,268,268,268,268,268,268


In [47]:
# balance dataset
sampled_data = data.sample(frac=1)
sampled_data = sampled_data[ sampled_data["Outcome"] == 0]
no_data = sampled_data.sample(frac=1)[0:268]

yes_data = data[ data["Outcome"] == 1]

balanced_data = [no_data,yes_data]
balanced_data = pd.concat(balanced_data)

# check how balanced the classes are
balanced_data.groupby(class_var).count()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,268,268,268,268,268,268,268,268
1,268,268,268,268,268,268,268,268


## Train a Model for the Balanced Dataset

In [50]:
# apply one hot encoder to data
# standardize the input between 0 and 1
# normalize features
enc = OneHotEncoder()
X, Y, encoder, scaler = encode_data( data, class_var)

n_features = X.shape[1]
n_classes = len(data[class_var].unique())
 
#flag = False  # DO NOT CHANGE! Data has already been generated. 
#if flag:
    # save training, test and validation data
#    generate_save_training_data( dataset_path, X, Y)
    # load data
#    X_train, Y_train, X_test, Y_test, X_validation, Y_validation= load_training_data( dataset_path )
#else:
    # load existing training data
#    X_train, Y_train, X_test, Y_test, X_validation, Y_validation= load_training_data( dataset_path )
    

In [62]:
encode_data

<function learning.encode_data(data, class_var)>

## Generate counterfactual

Initialize counterfactual parameters. The feature perturbations are applied in the numerical feature space, after transforming the categorical variables to numerical features. As a result, the dimensionality and values of feature_range are defined in the numerical space.

In [56]:
shape = X.shape
beta = .01
c_init = 1.
c_steps = 5
max_iterations = 500
rng = (-1., 1.)  # scale features between -1 and 1
rng_shape = (1,) + data.shape[1:]
feature_range = ((np.ones(rng_shape) * rng[0]).astype(np.float32),
                 (np.ones(rng_shape) * rng[1]).astype(np.float32))

Initialize explainer:

In [57]:
def set_seed(s=0):
    np.random.seed(s)
    tf.random.set_seed(s)

In [64]:
def describe_instance(X, explanation, eps=1e-2):
    print('Original instance: {}  -- proba: {}'.format(target_names[explanation.orig_class],
                                                       explanation.orig_proba[0]))
    print('Counterfactual instance: {}  -- proba: {}'.format(target_names[explanation.cf['class']],
                                                             explanation.cf['proba'][0]))
    print('\nCounterfactual perturbations...')
    print('\nCategorical:')
    X_orig_ord = ohe_to_ord(X, cat_vars_ohe)[0]
    X_cf_ord = ohe_to_ord(explanation.cf['X'], cat_vars_ohe)[0]
    delta_cat = {}
    for i, (_, v) in enumerate(category_map.items()):
        cat_orig = v[int(X_orig_ord[0, i])]
        cat_cf = v[int(X_cf_ord[0, i])]
        if cat_orig != cat_cf:
            delta_cat[feature_names[i]] = [cat_orig, cat_cf]
    if delta_cat:
        for k, v in delta_cat.items():
            print('{}: {}  -->   {}'.format(k, v[0], v[1]))
    print('\nNumerical:')
    delta_num = X_cf_ord[0, -4:] - X_orig_ord[0, -4:]
    n_keys = len(list(cat_vars_ord.keys()))
    for i in range(delta_num.shape[1]):
        if np.abs(delta_num[0, i]) > eps:
            print('{}: {:.2f}  -->   {:.2f}'.format(feature_names[i+n_keys],
                                            X_orig_ord[0,i+n_keys],
                                            X_cf_ord[0,i+n_keys]))

In [65]:
set_seed()
cf = CounterFactualProto(encode_data,
                         shape,
                         beta=beta,
                         cat_vars=cat_vars_ohe,
                         ohe=True,  # OHE flag
                         max_iterations=max_iterations,
                         feature_range=feature_range,
                         c_init=c_init,
                         c_steps=c_steps
                        )

NameError: name 'cat_vars_ohe' is not defined

In [168]:
def describe_instance(X, explanation, eps=1e-2):
    print('Original instance: {}  -- proba: {}'.format(target_names[explanation.orig_class],
                                                       explanation.orig_proba[0]))
    print('Counterfactual instance: {}  -- proba: {}'.format(target_names[explanation.cf['class']],
                                                             explanation.cf['proba'][0]))
    print('\nCounterfactual perturbations...')
    print('\nCategorical:')
    X_orig_ord = ohe_to_ord(X, cat_vars_ohe)[0]
    X_cf_ord = ohe_to_ord(explanation.cf['X'], cat_vars_ohe)[0]
    delta_cat = {}
    for i, (_, v) in enumerate(category_map.items()):
        cat_orig = v[int(X_orig_ord[0, i])]
        cat_cf = v[int(X_cf_ord[0, i])]
        if cat_orig != cat_cf:
            delta_cat[feature_names[i]] = [cat_orig, cat_cf]
    if delta_cat:
        for k, v in delta_cat.items():
            print('{}: {}  -->   {}'.format(k, v[0], v[1]))
    print('\nNumerical:')
    delta_num = X_cf_ord[0, -4:] - X_orig_ord[0, -4:]
    n_keys = len(list(cat_vars_ord.keys()))
    for i in range(delta_num.shape[1]):
        if np.abs(delta_num[0, i]) > eps:
            print('{}: {:.2f}  -->   {:.2f}'.format(feature_names[i+n_keys],
                                            X_orig_ord[0,i+n_keys],
                                            X_cf_ord[0,i+n_keys]))

In [174]:
def set_seed(s=0):
    np.random.seed(s)
    tf.random.set_seed(s)

In [179]:
set_seed()
cf = CounterFactualProto(nn,
                         shape,
                         beta=beta,
                         cat_vars=cat_vars_ohe,
                         ohe=True,  # OHE flag
                         max_iterations=max_iterations,
                         feature_range=feature_range,
                         c_init=c_init,
                         c_steps=c_steps
                        )

NameError: name 'cat_vars_ohe' is not defined

In [171]:
describe_instance(X, explanation)

NameError: name 'explanation' is not defined