In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.preprocessing import preprocess_df
from sklearn.model_selection import train_test_split
from utils.dice import generate_dice_result, process_results
from utils.models import train_three_models, evaluation_test, save_three_models, load_three_models
from utils.save import save_result_as_csv

pd.options.mode.chained_assignment = None 

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False

seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


TF version:  2.0.0
Eager execution enabled:  True


In [2]:
#### Select dataset ####'

dataset_name = 'breast_cancer' # [adult, german, compas]

if dataset_name == 'adult':
    dataset_loading_fn = load_adult_df
elif dataset_name == 'german':
    dataset_loading_fn = load_german_df
elif dataset_name == 'compas':
    dataset_loading_fn = load_compas_df
elif dataset_name == 'diabetes':
    dataset_loading_fn = load_diabetes_df
elif dataset_name == 'breast_cancer':
    dataset_loading_fn = load_breast_cancer_df
else:
    raise Exception("Unsupported dataset")

In [3]:
#### Load datafram info.
df_info = preprocess_df(dataset_loading_fn)

In [4]:
### Seperate to train and test set.
train_df, test_df = train_test_split(df_info.dummy_df, train_size=.8, random_state=seed, shuffle=True)

In [5]:
test_df

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
333,0.202045,0.171458,0.190657,0.104560,0.274713,0.077296,0.002281,0.014617,0.360101,0.228517,...,0.267591,0.157727,0.075575,0.300007,0.068545,0.004407,0.057285,0.246403,0.125541,0
273,0.130673,0.201556,0.122383,0.062100,0.340706,0.084381,0.025843,0.069930,0.515657,0.277169,...,0.236141,0.088052,0.041781,0.498778,0.064431,0.032292,0.177285,0.251528,0.175193,0
201,0.499740,0.324992,0.492779,0.342778,0.334477,0.308018,0.242737,0.372167,0.225253,0.104254,...,0.368337,0.443697,0.258995,0.441986,0.305333,0.280192,0.666323,0.268677,0.154991,1
178,0.285342,0.423064,0.264114,0.162418,0.089194,0.000000,0.003737,0.009205,0.169192,0.050126,...,0.453092,0.188107,0.104109,0.066565,0.006821,0.006371,0.031818,0.143899,0.022235,0
85,0.543282,0.297937,0.534241,0.395122,0.416268,0.263542,0.312793,0.437127,0.541414,0.216091,...,0.417377,0.506948,0.348457,0.453213,0.176199,0.252157,0.564261,0.419870,0.201692,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,0.275877,0.113290,0.259139,0.154952,0.313803,0.056316,0.021539,0.066103,0.205051,0.239469,...,0.110075,0.180736,0.094745,0.297365,0.026351,0.022029,0.137113,0.042973,0.119048,0
192,0.129632,0.287792,0.117062,0.061336,0.152298,0.012453,0.000000,0.000000,0.299495,0.305602,...,0.234808,0.058967,0.029149,0.000000,0.000000,0.000000,0.000000,0.067810,0.069198,0
246,0.294335,0.261075,0.278764,0.168865,0.176221,0.079320,0.101593,0.054920,0.215657,0.134583,...,0.421109,0.188605,0.102438,0.257082,0.119830,0.183546,0.170790,0.236941,0.111111,0
211,0.229968,0.312141,0.219197,0.120679,0.325720,0.152199,0.062535,0.069235,0.238889,0.223463,...,0.345682,0.173365,0.088749,0.375289,0.155922,0.117492,0.237560,0.191208,0.163256,0


In [6]:
### Get training and testing array.
X_train = np.array(train_df[df_info.ohe_feature_names])
y_train = np.array(train_df[df_info.target_name])
X_test = np.array(test_df[df_info.ohe_feature_names])
y_test = np.array(test_df[df_info.target_name])

In [7]:
### Train models.
# models = train_three_models(X_train, y_train)

### Save models.
# save_three_models(models, dataset_name)

In [8]:
### Load models.
models = load_three_models(X_train.shape[-1], dataset_name)



2022-07-20 22:48:05.521745: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-20 22:48:05.522177: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [9]:
### Print out accuracy on testset.
evaluation_test(models, X_test, y_test)

DT: [0.9649] | RF [0.9912] | NN [0.9737]


# DiCE

In [10]:
### Setting up the CF generating amount.
num_instances = 20
num_cf_per_instance = 1

In [11]:
import dice_ml

In [12]:
from utils.dice import Recorder

In [13]:
# Generate CF
results = generate_dice_result(
    df_info,
    test_df,
    models,
    num_instances,
    num_cf_per_instance,
    sample_size=50,
    models_to_run=['nn']
)

## 1.  diabetes - freeze at 4th instance.
## 2.  breast cancer - freeze at 3rd instance.

result_dfs = process_results(df_info, results)


Finding counterfactual for nn
instance 0
CF 0
Before generating cf
Ground Truth is N


  0%|          | 0/1 [00:00<?, ?it/s]

Counterfactaul generating initial phase in the 3rd package lib.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

cp1
cp2
cp3
cp4
cp5
cp5.1
cp5.2
cp5.3
cp5.4
cp5.5
cp5.6
cp5 - sparsity true


100%|██████████| 1/1 [00:00<00:00,  1.63it/s]


cp6
Diverse Counterfactuals found! total time taken: 00 min 00 sec
cp7
After generating cf
instance 1
CF 0
Before generating cf
Ground Truth is N


100%|██████████| 1/1 [00:00<00:00,  8.48it/s]


Counterfactaul generating initial phase in the 3rd package lib.
cp1
cp2
cp3
cp4
cp5
cp5.1
cp5.2
cp5.3
cp5.4
cp5.5
cp5.6
cp5 - sparsity true
cp6
Diverse Counterfactuals found! total time taken: 00 min 00 sec
cp7
After generating cf
instance 2
CF 0
Before generating cf
Ground Truth is Y


  0%|          | 0/1 [00:00<?, ?it/s]

Counterfactaul generating initial phase in the 3rd package lib.
cp1
cp2
cp3
cp4
cp5
cp5.1
cp5.2
cp5.3
cp5.4
cp5.5
cp5.6
cp5 - sparsity true


100%|██████████| 1/1 [00:00<00:00,  8.64it/s]


cp6
Diverse Counterfactuals found! total time taken: 00 min 00 sec
cp7
After generating cf
instance 3
CF 0
Before generating cf
Ground Truth is N


  0%|          | 0/1 [00:00<?, ?it/s]

Counterfactaul generating initial phase in the 3rd package lib.
cp1
cp2
cp3
cp4
cp5
cp5.1
cp5.2
cp5.3
cp5.4
cp5.5
cp5.6
cp5 - sparsity true


  0%|          | 0/1 [00:54<?, ?it/s]


KeyboardInterrupt: 

In [14]:
dice_cfs = Recorder.dice_cfs
input_query = Recorder.input_query
sample_size = 50
k="nn"

In [15]:
input_query.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [17]:
# 2nd instance.
input_query[df_info.feature_names]

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0.285342,0.423064,0.264114,0.162418,0.089194,0.0,0.003737,0.009205,0.169192,0.050126,...,0.215937,0.453092,0.188107,0.104109,0.066565,0.006821,0.006371,0.031818,0.143899,0.022235


In [27]:
exp = dice_cfs[k].generate_counterfactuals(
    input_query[df_info.feature_names], total_CFs=2, sample_size=sample_size, desired_class="opposite", verbose=True, posthoc_sparsity_param=1e-2)

100%|██████████| 1/1 [00:00<00:00,  9.33it/s]

Counterfactaul generating initial phase in the 3rd package lib.
cp1
cp2
cp3
cp4
cp5
cp5.1
cp5.2
cp5.3
cp5.4
cp5.5
cp5.6
cp5 - sparsity true
cp6
Diverse Counterfactuals found! total time taken: 00 min 00 sec
cp7





In [None]:
exp.cf_examples_list[0].final_cfs_df

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,0.727737,0.324992,0.492779,0.342778,0.334477,0.308018,0.242737,0.372167,0.225253,0.104254,...,0.359238,0.443697,0.258995,0.441986,0.305333,0.280192,0.666323,0.268677,0.154991,0
1,0.727737,0.324992,0.492779,0.342778,0.654119,0.308018,0.242737,0.372167,0.225253,0.104254,...,0.359238,0.443697,0.258995,0.441986,0.305333,0.280192,0.666323,0.268677,0.154991,0


In [None]:
exp.cf_examples_list[0].final_cfs_df

<dice_ml.diverse_counterfactuals.CounterfactualExamples at 0x7fb7b040a890>

In [None]:
i = 0
example_input = df_info.scaled_df.iloc[test_df[i:i+1].index].iloc[0:1]

In [None]:
example_input

Unnamed: 0,default,account_check_status,duration_in_month,credit_history,purpose,credit_amount,savings,present_emp_since,installment_as_income_perc,personal_status_sex,...,present_res_since,property,age,other_installment_plans,housing,credits_this_bank,job,people_under_maintenance,telephone,foreign_worker
131,Y,< 0 DM,0.470588,delay in paying off in the past,(vacation - does not exist?),0.365192,... < 100 DM,1 <= ... < 4 years,1.0,male : single,...,0.666667,if not A121 : building society savings agreeme...,0.178571,stores,own,1,skilled employee / official,1,"yes, registered under the customers name",yes


In [None]:
print(Recorder.wrapped_models['nn'].predict(example_input))
print(Recorder.wrapped_msodels['dt'].predict(example_input))
print(Recorder.wrapped_models['rfc'].predict(example_input))

[1]


AttributeError: type object 'Recorder' has no attribute 'wrapped_msodels'

In [None]:
print(Recorder.wrapped_models['nn'].predict_proba(example_input))
print(Recorder.wrapped_models['dt'].predict_proba(example_input))
print(Recorder.wrapped_models['rfc'].predict_proba(example_input))

[[0.35511744 0.64488256]]
[[0. 1.]]
[[0.14 0.86]]


In [None]:
### Save result as file.
save_result_as_csv("dice", dataset_name, result_dfs)

Result has been saved to ./results/dice_diabetes


In [None]:
import pandas as pd

In [None]:
dice_dt = pd.read_csv(r'./datasets/eval_dice_diabetes_dt_result.csv')

In [None]:
dice_dt.L2.max()

1.303840481

In [None]:
dice_dt.L2.min()

0.066562646

In [None]:
dice_dt_highl2_row73 = dice_dt.iloc[72]
dice_dt_highl2_row73

Unnamed: 0                                     72.0
Unnamed: 0.1                                   14.0
Unnamed: 0.1.1                                  0.0
scaled_input_Pregnancies                   0.352941
scaled_input_Glucose                         0.9799
scaled_input_BloodPressure                  0.57377
scaled_input_SkinThickness                      0.0
scaled_input_Insulin                            0.0
scaled_input_BMI                           0.460507
scaled_input_DiabetesPedigreeFunction      0.106746
scaled_input_Age                           0.166667
scaled_input_Outcome                              Y
origin_input_Pregnancies                        6.0
origin_input_Glucose                     195.000006
origin_input_BloodPressure                69.999997
origin_input_SkinThickness                      0.0
origin_input_Insulin                            0.0
origin_input_BMI                               30.9
origin_input_DiabetesPedigreeFunction         0.328
origin_input

In [None]:
dice_dt_highl2_row73

Unnamed: 0                                     72.0
Unnamed: 0.1                                   14.0
Unnamed: 0.1.1                                  0.0
scaled_input_Pregnancies                   0.352941
scaled_input_Glucose                         0.9799
scaled_input_BloodPressure                  0.57377
scaled_input_SkinThickness                      0.0
scaled_input_Insulin                            0.0
scaled_input_BMI                           0.460507
scaled_input_DiabetesPedigreeFunction      0.106746
scaled_input_Age                           0.166667
scaled_input_Outcome                              Y
origin_input_Pregnancies                        6.0
origin_input_Glucose                     195.000006
origin_input_BloodPressure                69.999997
origin_input_SkinThickness                      0.0
origin_input_Insulin                            0.0
origin_input_BMI                               30.9
origin_input_DiabetesPedigreeFunction         0.328
origin_input

In [None]:
dice_dt_highl2_row97 = dice_dt.iloc[97]
dice_dt_highl2_row97

Unnamed: 0                                     97.0
Unnamed: 0.1                                   19.0
Unnamed: 0.1.1                                  0.0
scaled_input_Pregnancies                        0.0
scaled_input_Glucose                       0.502513
scaled_input_BloodPressure                 0.721311
scaled_input_SkinThickness                 0.606061
scaled_input_Insulin                       0.130024
scaled_input_BMI                           0.697466
scaled_input_DiabetesPedigreeFunction      0.377455
scaled_input_Age                           0.166667
scaled_input_Outcome                              N
origin_input_Pregnancies                        0.0
origin_input_Glucose                     100.000002
origin_input_BloodPressure                87.999997
origin_input_SkinThickness                60.000002
origin_input_Insulin                     110.000002
origin_input_BMI                          46.800002
origin_input_DiabetesPedigreeFunction         0.962
origin_input