# Counterfactuals

In [1]:
import tensorflow as tf
tf.get_logger().setLevel(40) # suppress deprecation messages
tf.compat.v1.disable_v2_behavior() # disable TF2 behaviour as alibi code still relies on TF1 constructs
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import to_categorical
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.datasets import load_boston
from alibi.explainers import CounterFactualProto, CounterFactual
import pandas as pd
from learning import *
from time import time

# Report the TensorFlow version and confirm that eager execution is switched off.
print(f'TF version:  {tf.__version__}')
print(f'Eager execution enabled:  {tf.executing_eagerly()}')  # False

# Seed the NumPy and TensorFlow global random generators with one shared value.
seed = 123
np.random.seed(seed)
tf.random.set_seed(seed)

TF version:  2.4.1
Eager execution enabled:  False


## Install Deps 

In [2]:
pip install alibi

Collecting six
  Using cached six-1.15.0-py2.py3-none-any.whl (10 kB)
Collecting numpy<2.0.0,>=1.16.2
  Using cached numpy-1.19.5-cp38-cp38-manylinux2010_x86_64.whl (14.9 MB)
Installing collected packages: six, numpy
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Uninstalling six-1.16.0:
      Successfully uninstalled six-1.16.0
  Attempting uninstall: numpy
    Found existing installation: numpy 1.20.3
    Uninstalling numpy-1.20.3:
      Successfully uninstalled numpy-1.20.3
Successfully installed numpy-1.19.5 six-1.15.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
## Removing str encoding error.
!python3 -m pip install 'h5py==2.10.0' --force-reinstall

Collecting h5py==2.10.0
  Using cached h5py-2.10.0-cp38-cp38-manylinux1_x86_64.whl (2.9 MB)
Collecting six
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Collecting numpy>=1.7
  Using cached numpy-1.20.3-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.4 MB)
Installing collected packages: six, numpy, h5py
  Attempting uninstall: six
    Found existing installation: six 1.15.0
    Uninstalling six-1.15.0:
      Successfully uninstalled six-1.15.0
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.5
    Uninstalling numpy-1.19.5:
      Successfully uninstalled numpy-1.19.5
  Attempting uninstall: h5py
    Found existing installation: h5py 2.10.0
    Uninstalling h5py-2.10.0:
      Successfully uninstalled h5py-2.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.4.1 requires numpy~=1.19.2, but you hav

## Load Diabetes dataset

In [4]:
# Root path of the project (current directory)
PATH = "./"

# name of dataset
DATASET_NAME = "diabetes.csv"

# column containing the class labels; in this dataset:
# 0 - if not diabetes
# 1 - if diabetes
class_var = "Outcome"

# load dataset
dataset_path = PATH + "datasets/" + DATASET_NAME
data = pd.read_csv(dataset_path)

# features: every column except the class label
feature_names = data.drop([class_var], axis=1).columns.to_list()

# balance dataset: keep every positive row and draw at most as many
# shuffled negative rows (the count is derived from the data, not hard-coded)
yes_data = data[data[class_var] == 1]

sampled_data = data.sample(frac=1)
sampled_data = sampled_data[sampled_data[class_var] == 0]
no_data = sampled_data.sample(frac=1).iloc[:len(yes_data)]

balanced_data = pd.concat([no_data, yes_data])

# encode_data comes from the local `learning` module; per the original notes it
# applies a one hot encoder and scales the inputs between 0 and 1
X, Y, encoder, scaler = encode_data(balanced_data, class_var)

n_features = X.shape[1]
n_classes = len(data[class_var].unique())

# load existing training data
print("Loading training data...")
X_train, Y_train, X_test, Y_test, X_validation, Y_validation = load_training_data(dataset_path)

print("====================Features====================")
print(feature_names)
print("================================================")

Loading training data...
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


## Load trained model

In [5]:
# Name of the saved black-box model (per the original note: the best performing
# configuration, 5 hidden layers with 12 neurons each).
model_name = "model_h5_N12"

# Directories where the black-box model and its training history were serialised.
path_serialisation_model = f"{PATH}training/{DATASET_NAME.replace('.csv', '')}/model/"
path_serialisation_histr = f"{PATH}training/{DATASET_NAME.replace('.csv', '')}/history/"

# Load the training history first, then the model itself.
# NOTE(review): `load_model` is called with (name, directory), so it presumably
# resolves to a helper star-imported from `learning` that shadows the Keras
# `load_model` imported earlier — confirm.
print("Loading Blackbox model...")
model_history = load_model_history(model_name, path_serialisation_histr)
model = load_model(model_name, path_serialisation_model)

# check model
model.summary()

Loading Blackbox model...
Loaded model from disk
Model: "model_h5_N12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_293 (Dense)            (None, 12)                108       
_________________________________________________________________
dense_294 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_295 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_296 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_297 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_298 (Dense)            (None, 12)                156       
_________________________________________________________________
dense

In [6]:
## Determine the feature range from the training set: per-column (min, max).
diabetes_feature_range = (
    np.min(X_train, axis=0),
    np.max(X_train, axis=0),
)

In [7]:
## Pick one instance from the test set and give it a leading batch axis.
example_idx = 5 ## Change me!!
example_data = X_test[example_idx][np.newaxis, ...]

In [8]:
def log_cf_found(data, scaler, explanation, feature_names=None):
    '''
    Print a summary of a counterfactual explanation and the per-feature
    difference between the original instance and the counterfactual.

    Parameters
    ----------
    data : array-like
        The instance that was explained; passed to ``scaler.inverse_transform``.
    scaler : object
        Object exposing ``inverse_transform``; applied to both ``data`` and the
        counterfactual before they are compared.
    explanation : object
        Explanation exposing ``orig_class`` and ``cf``. ``cf`` is either
        ``None`` (no counterfactual was found) or a mapping with the keys
        ``'class'``, ``'proba'`` and ``'X'``.
    feature_names : sequence of str, optional
        Feature labels in column order. When omitted, the notebook-level
        ``feature_names`` variable is used.

    Returns
    -------
    None
        Everything is reported through ``print``.
    '''
    separator = "================================================"
    if feature_names is None:
        # Backward compatible default: the global built when the dataset was loaded.
        feature_names = globals()['feature_names']

    print(separator)
    print(f'Original prediction: {explanation.orig_class}')

    if explanation.cf is None:
        # A failed search leaves `cf` unset; report it instead of raising a
        # TypeError when subscripting None.
        print('No counterfactual found.')
        print(separator)
        return

    pred_class = explanation.cf['class']
    print('Counterfactual prediction: {}'.format(pred_class))
    print(separator)
    proba = explanation.cf['proba'][0][pred_class]
    print(f'Counterfactual prediction: {pred_class} with probability {proba}')
    print(separator)

    # Compare both points after undoing the scaler's transform.
    orig = scaler.inverse_transform(data)
    counterfactual = scaler.inverse_transform(explanation.cf['X'])
    delta = counterfactual - orig

    # Only report features whose change exceeds the 1e-4 threshold.
    for name, change in zip(feature_names, delta[0]):
        if np.abs(change) > 1e-4:
            print('{}: {}'.format(name, change))
    print(separator)

In [9]:
# Prototype-guided counterfactual explainer; with use_kdtree=True and no encoder,
# class prototypes are represented with k-d trees built during fit.
cf = CounterFactualProto(model, example_data.shape, use_kdtree=True, theta=10., max_iterations=1000,
                         feature_range=diabetes_feature_range,
                         c_init=1., c_steps=10)

# The trailing semicolon suppresses the cell's output (the explainer repr)
# without needing a dummy "" expression.
cf.fit(X_train);

`Model.state_updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.
No encoder specified. Using k-d trees to represent class prototypes.


''

In [10]:
# Time the prototype-guided counterfactual search for the example instance.
start_time = time()
explanation = cf.explain(example_data)
print(f'Explanation took {time() - start_time:.3f} sec')

Explanation took 23.089 sec


In [11]:
# Summarise the prototype-guided counterfactual against the original instance.
log_cf_found(data=example_data, scaler=scaler, explanation=explanation)

Original prediction: 0
Counterfactual prediction: 1
Counterfactual prediction: 1 with probability 0.5011819005012512
Glucose: 7.383453369140625
DiabetesPedigreeFunction: 0.020378300310457398
Age: 1.9643936157226598


### b-Counterfactual

In [14]:
# Baseline counterfactual explainer, searching for any class other than the
# original prediction within the training-set feature range.
cf_b = CounterFactual(
    model,
    example_data.shape,
    distance_fn='l1',
    target_proba=1.0,
    target_class='other',
    max_iter=1000,
    early_stop=50,
    lam_init=1e-1,
    max_lam_steps=10,
    tol=0.05,
    learning_rate_init=0.1,
    feature_range=diabetes_feature_range,
    eps=0.01,
    init='identity',
    decay=True,
    write_dir=None,
    debug=False,
)



In [15]:
# Time the baseline counterfactual search for the same example instance.
start_time = time()
explanation_b = cf_b.explain(example_data)
print(f'Explanation took {time() - start_time:.3f} sec')

Explanation took 9.992 sec


In [16]:
# Summarise the baseline counterfactual against the original instance.
log_cf_found(data=example_data, scaler=scaler, explanation=explanation_b)

Original prediction: 0
Counterfactual prediction: 1
Counterfactual prediction: 1 with probability 0.9502203464508057
Pregnancies: -0.04968643188476474
Glucose: 89.0
BloodPressure: -0.07953431176358094
SkinThickness: -0.1350555419921875
Insulin: -1.71661376953125
BMI: 0.3655010223388686
DiabetesPedigreeFunction: 1.785336026312197
Age: 0.34562492370605824
