In [1]:
import tensorflow as tf
import dice_ml
import pandas as pd
from learning import *
from time import time

In [2]:
# Root path of the project; every other path below is built from it.
PATH = "./"

# File name of the dataset (read from PATH + "datasets/").
DATASET_NAME = "diabetes.csv"

# Column containing the class labels. In this dataset:
# 0 - if not diabetes
# 1 - if diabetes
class_var = "Outcome"

# Seed for the undersampling below, so that Restart & Run All reproduces the
# same balanced subset instead of a fresh random one on every run.
RANDOM_SEED = 42

# load dataset
dataset_path = PATH + "datasets/" + DATASET_NAME
data = pd.read_csv(dataset_path)

# features: every column except the class label
feature_names = data.drop([class_var], axis=1).columns.to_list()

# balance dataset by undersampling class 0 down to the size of class 1.
# The size is taken from the data rather than hard-coded, and the label column
# is addressed through class_var so it is defined in exactly one place.
yes_data = data[data[class_var] == 1]
sampled_data = data[data[class_var] == 0].sample(frac=1, random_state=RANDOM_SEED)
no_data = sampled_data.iloc[:len(yes_data)]

balanced_data = pd.concat([no_data, yes_data])

# apply one hot encoder to data
# standardize the input between 0 and 1
# (encode_data comes from the project's `learning` module)
X, Y, encoder, scaler = encode_data(balanced_data, class_var)

n_features = X.shape[1]
n_classes = len(data[class_var].unique())

# load existing training data (train / test / validation splits)
print("Loading training data...")
X_train, Y_train, X_test, Y_test, X_validation, Y_validation = load_training_data(dataset_path)

print("====================Features====================")
print(feature_names)
print("================================================")

Loading training data...
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


In [3]:
# The best performing model was obtained with 5 hidden layers with 12 neurons each.
model_name = "model_h5_N12"

# Directories where the black-box model and its training history were saved.
dataset_stem = DATASET_NAME.replace(".csv", "")
training_root = PATH + "training/" + dataset_stem
path_serialisation_model = training_root + "/model/"
path_serialisation_histr = training_root + "/history/"

# Load the training history first, then the model itself.
print("Loading Blackbox model...")
model_history = load_model_history(model_name, path_serialisation_histr)
model = load_model(model_name, path_serialisation_model)

# Print the layer-by-layer summary as a sanity check of what was loaded.
model.summary()

Loading Blackbox model...
Loaded model from disk
Model: "model_h5_N12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_293 (Dense)            (None, 12)                108       
_________________________________________________________________
dense_294 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_295 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_296 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_297 (Dense)            (None, 12)                156       
_________________________________________________________________
dense_298 (Dense)            (None, 12)                156       
_________________________________________________________________
dense

In [4]:
class KeepOneValue(tf.keras.layers.Layer):
    """Keras layer that keeps only the column at index 1 of its input.

    The layer has no weights: ``call`` returns ``inputs[:, 1:2]``, so the
    second axis is preserved with length 1 instead of being dropped.

    Args:
        **kwargs: Standard ``tf.keras.layers.Layer`` options (for example
            ``name`` or ``dtype``), forwarded unchanged to the base class.
    """

    def __init__(self, **kwargs):
        # Accepting **kwargs lets callers name the layer and lets Keras rebuild
        # it from its config, which passes the base options back as keyword
        # arguments; a no-argument __init__ would reject them.
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return the slice ``inputs[:, 1:2]``."""
        # A 1:2 slice (not a scalar index) keeps the result two-dimensional.
        return inputs[:, 1:2]

In [5]:
# Chain the loaded black-box model with KeepOneValue so that the combined
# model outputs only the column that KeepOneValue retains.
seq = tf.keras.Sequential([model, KeepOneValue()])

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [6]:
# Name of the label column that is handed to DiCE as the outcome.
target_name = "Outcome"

# Every other column of the balanced frame is treated as a feature.
feature_names = balanced_data.columns.tolist()
feature_names.remove(target_name)

In [7]:
# DiCE data interface built from the balanced frame; all features are declared
# continuous and `target_name` is the outcome column.
d = dice_ml.Data(
    dataframe=balanced_data,
    continuous_features=feature_names,
    outcome_name=target_name,
)

In [8]:
# DiCE model interface around the wrapped model, using the TensorFlow 2 backend.
m = dice_ml.Model(
    model=seq,
    backend="TF2",
)

In [9]:
# Position (0-based) of the test row to explain.
example_idx = 2 ## Change me!!

# Test split as a labelled frame so the row can be passed around with column names.
test_df = pd.DataFrame(X_test, columns=feature_names)

# A one-element slice (rather than a scalar index) keeps the result a DataFrame.
example_data = test_df.iloc[example_idx : example_idx + 1]

In [10]:
# Display the selected query row (one-row DataFrame taken from the test split).
example_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
2,0.0,0.527638,0.737705,0.0,0.0,0.441133,0.047966,0.490196


In [11]:
### Output of the original model for the query instance: one value per class.
### Category 0 has the higher value, which means the instance has about a 60.7% probability
### of "not having diabetes" and about a 39.3% probability of "having diabetes".
### DataFrame.to_numpy() is used so the cell does not depend on `np`, which this notebook never imports explicitly.
model(example_data.to_numpy())

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.6069678, 0.3930322]], dtype=float32)>

In [12]:
### The seq model (with the KeepOneValue layer added) returns only the probability of "having diabetes";
### the probability of "not having diabetes" can still be obtained as (1 - "having diabetes").
### A higher value from the seq model therefore means a higher probability of "having diabetes".
### This single-output architecture is what DiCE expects.
### DataFrame.to_numpy() is used so the cell does not depend on `np`, which this notebook never imports explicitly.
seq(example_data.to_numpy())

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.3930322]], dtype=float32)>

In [13]:
# DiCE explainer built from the data interface `d` and the model interface `m`.
exp = dice_ml.Dice(d, m)

In [14]:
# Ask DiCE for 4 counterfactuals of the opposite class for the query row.
dice_exp = exp.generate_counterfactuals(
    example_data,
    total_CFs=4,
    desired_class="opposite",
    proximity_weight=0.1,
    diversity_weight=2,
)

Diverse Counterfactuals found! total time taken: 00 min 44 sec


In [15]:
###### Open issue: the counterfactual set is reported with "new outcome: 1.0", yet its Outcome column still shows 0.
###### NOTE(review): the query row comes from X_test with values in [0, 1], while the DiCE data interface
###### was built from the unscaled balanced_data -- a possible scale mismatch; confirm before trusting these rows.
dice_exp.visualize_as_dataframe(
    show_only_changes=True,
    display_sparse_df=True,
)

Query instance (original outcome : 0)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.0,1.0,1.0,0.0,0.0,0.4,0.048,0.0,0.081



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,-,79.0,-,39.0,815.0,6.9,0.0489999999999999,50.0,0.0
1,-,125.0,41.0,-,424.0,-,0.0489999999999999,45.0,0.0
2,-,121.0,10.0,9.0,73.0,46.5,0.636,30.0,0.0
3,-,199.0,-,-,-,-,0.0489999999999999,21.0,0.0
