In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.preprocessing import preprocess_df
from sklearn.model_selection import train_test_split
from utils.dice import generate_dice_result, process_results
from utils.models import train_three_models, evaluation_test, save_three_models, load_three_models
from utils.save import save_result_as_csv

pd.options.mode.chained_assignment = None  # turn off pandas' chained-assignment (SettingWithCopy) warning

print('TF version: ', tf.__version__)
# Eager execution is the default in TF 2.x; the recorded run of this cell printed True.
print('Eager execution enabled: ', tf.executing_eagerly())

# One shared seed for TensorFlow and NumPy; the same value is also passed to
# train_test_split as random_state further down.
seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


TF version:  2.0.0
Eager execution enabled:  True


In [2]:
#### Select dataset ####

dataset_name = 'diabetes'  # one of: adult, german, compas, diabetes, breast_cancer

# Map every supported dataset name to its loader function so the list of
# options lives in exactly one place (and the error below can report it).
DATASET_LOADERS = {
    'adult': load_adult_df,
    'german': load_german_df,
    'compas': load_compas_df,
    'diabetes': load_diabetes_df,
    'breast_cancer': load_breast_cancer_df,
}

if dataset_name not in DATASET_LOADERS:
    # ValueError is still an Exception, but names the offending value and the valid choices.
    raise ValueError(
        f"Unsupported dataset {dataset_name!r}; expected one of {sorted(DATASET_LOADERS)}"
    )

# The loader itself (not its result) is handed to preprocess_df in the next cell.
dataset_loading_fn = DATASET_LOADERS[dataset_name]

In [3]:
#### Load dataframe info.
# preprocess_df receives the loader function itself (not an already-loaded frame).
# Later cells read df_info.dummy_df, .scaled_df, .ohe_feature_names and .target_name.
df_info = preprocess_df(dataset_loading_fn)

In [4]:
### Separate into train and test sets (80% train, shuffled with the fixed seed).
train_df, test_df = train_test_split(df_info.dummy_df, train_size=.8, random_state=seed, shuffle=True)

In [5]:
test_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
236,0.411765,0.909548,0.688525,0.212121,0.226950,0.535022,0.216909,0.500000,1
395,0.117647,0.638191,0.475410,0.242424,0.325059,0.412817,0.649872,0.066667,0
36,0.647059,0.693467,0.622951,0.000000,0.000000,0.494784,0.146029,0.233333,0
210,0.117647,0.407035,0.491803,0.222222,0.000000,0.412817,0.090521,0.066667,0
483,0.000000,0.422111,0.672131,0.313131,0.147754,0.569300,0.066183,0.033333,0
...,...,...,...,...,...,...,...,...,...
650,0.058824,0.457286,0.442623,0.252525,0.118203,0.375559,0.066610,0.033333,0
579,0.117647,0.989950,0.573770,1.000000,0.000000,0.517139,0.212212,0.683333,1
119,0.235294,0.497487,0.622951,0.151515,0.060284,0.345753,0.061913,0.000000,0
593,0.117647,0.412060,0.426230,0.222222,0.135934,0.424739,0.692143,0.066667,0


In [6]:
### Build NumPy feature matrices and label vectors for both splits.
X_train, y_train = (
    np.array(train_df[df_info.ohe_feature_names]),
    np.array(train_df[df_info.target_name]),
)
X_test, y_test = (
    np.array(test_df[df_info.ohe_feature_names]),
    np.array(test_df[df_info.target_name]),
)

In [7]:
### Train models.
# NOTE: training and saving are commented out in this run; `models` is obtained
# from load_three_models(...) in a later cell instead. Uncomment both calls to retrain.
# models = train_three_models(X_train, y_train)

### Save models.
# save_three_models(models, dataset_name)

In [8]:
### Load models.
# X_train.shape[-1] is the number of feature columns in the training matrix.
# NOTE(review): presumably needed to rebuild the NN's input layer — confirm in utils.models.
models = load_three_models(X_train.shape[-1], dataset_name)



https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
2022-03-07 16:12:55.985293: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-07 16:12:55.985671: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [9]:
### Print out accuracy on the test set (the recorded output reports DT, RF and NN).
evaluation_test(models, X_test, y_test)

DT: [0.7727] | RF [0.7987] | NN [0.7662]


# DiCE

In [10]:
### Set how many counterfactuals (CFs) to generate.
num_instances = 3  # number of instances to explain (the recorded run logs "instance 0" to "instance 2")
num_cf_per_instance = 1  # counterfactuals requested per instance (the run logs only "CF 0")

In [11]:
# Generate counterfactuals (CFs) with DiCE and post-process them into result_dfs.
results = generate_dice_result(
    df_info,
    test_df,
    models,
    num_instances,
    num_cf_per_instance,
    sample_size=50,  # NOTE(review): meaning is defined in utils.dice — confirm there
    models_to_run=['nn']  # only the neural network is explained in this run
)
# result_dfs is what gets written out by save_result_as_csv later on.
result_dfs = process_results(df_info, results)


Finding counterfactual for nn
instance 0
CF 0


  0%|          | 0/1 [00:00<?, ?it/s]



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



100%|██████████| 1/1 [00:00<00:00, 16.95it/s]


instance 1
CF 0


100%|██████████| 1/1 [00:00<00:00, 19.18it/s]


instance 2
CF 0


100%|██████████| 1/1 [00:00<00:00, 18.55it/s]


In [12]:
from utils.dice import Recorder

In [13]:
# Pick the i-th test instance and fetch the matching row from the scaled frame
# as a one-row DataFrame.
# test_df.index holds the original row *labels* (shuffled by the split), so the
# lookup must be label-based (.loc). The previous positional .iloc lookup only
# returned the right row when scaled_df had a default 0..n-1 index.
# NOTE(review): assumes scaled_df shares its index labels with dummy_df — confirm
# in utils.preprocessing.
i = 0
example_input = df_info.scaled_df.loc[test_df.index[i:i+1]]

In [14]:
example_input

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
236,0.411765,0.909548,0.688525,0.212121,0.22695,0.535022,0.216909,0.5,Y


In [15]:
# Print the predicted class for the example row from each wrapped model,
# in the order nn, dt, rfc.
for model_key in ('nn', 'dt', 'rfc'):
    print(Recorder.wrapped_models[model_key].predict(example_input))

[1]
[1]
[1]


In [16]:
# Print the class probabilities for the example row from each wrapped model,
# in the order nn, dt, rfc.
for model_key in ('nn', 'dt', 'rfc'):
    print(Recorder.wrapped_models[model_key].predict_proba(example_input))

[[0.2315318 0.7684682]]
[[0. 1.]]
[[0.16 0.84]]


In [17]:
### Save result as file.
# The recorded run reported the output location as ./results/dice_diabetes.
save_result_as_csv("dice", dataset_name, result_dfs)

Result has been saved to ./results/dice_diabetes


In [3]:
import pandas as pd

In [4]:
# Load a DiCE evaluation result file (decision tree / diabetes, going by the file name).
# NOTE(review): this path is under ./datasets, not the ./results location reported by
# save_result_as_csv — confirm which step produces this file.
# NOTE(review): the loaded frame shows 'Unnamed: 0', 'Unnamed: 0.1' and 'Unnamed: 0.1.1'
# columns, which usually means an index was written to CSV repeatedly; consider
# saving with index=False upstream.
dice_dt = pd.read_csv(r'./datasets/eval_dice_diabetes_dt_result.csv')

In [6]:
dice_dt.L2.max()

1.303840481

In [7]:
dice_dt.L2.min()

0.066562646

In [13]:
# .iloc is 0-based, so position 72 is the 73rd row of dice_dt (hence "row73").
# NOTE(review): "highl2" presumably marks a row with a large L2 value — confirm.
dice_dt_highl2_row73 = dice_dt.iloc[72]
dice_dt_highl2_row73

Unnamed: 0                                     72.0
Unnamed: 0.1                                   14.0
Unnamed: 0.1.1                                  0.0
scaled_input_Pregnancies                   0.352941
scaled_input_Glucose                         0.9799
scaled_input_BloodPressure                  0.57377
scaled_input_SkinThickness                      0.0
scaled_input_Insulin                            0.0
scaled_input_BMI                           0.460507
scaled_input_DiabetesPedigreeFunction      0.106746
scaled_input_Age                           0.166667
scaled_input_Outcome                              Y
origin_input_Pregnancies                        6.0
origin_input_Glucose                     195.000006
origin_input_BloodPressure                69.999997
origin_input_SkinThickness                      0.0
origin_input_Insulin                            0.0
origin_input_BMI                               30.9
origin_input_DiabetesPedigreeFunction         0.328
origin_input

In [14]:
dice_dt_highl2_row73

Unnamed: 0                                     72.0
Unnamed: 0.1                                   14.0
Unnamed: 0.1.1                                  0.0
scaled_input_Pregnancies                   0.352941
scaled_input_Glucose                         0.9799
scaled_input_BloodPressure                  0.57377
scaled_input_SkinThickness                      0.0
scaled_input_Insulin                            0.0
scaled_input_BMI                           0.460507
scaled_input_DiabetesPedigreeFunction      0.106746
scaled_input_Age                           0.166667
scaled_input_Outcome                              Y
origin_input_Pregnancies                        6.0
origin_input_Glucose                     195.000006
origin_input_BloodPressure                69.999997
origin_input_SkinThickness                      0.0
origin_input_Insulin                            0.0
origin_input_BMI                               30.9
origin_input_DiabetesPedigreeFunction         0.328
origin_input

In [11]:
# NOTE(review): naming is inconsistent with dice_dt_highl2_row73 (= dice_dt.iloc[72]):
# iloc[97] is the 98th row by 0-based position. Confirm whether iloc[96] was intended
# or whether the variable name should change.
dice_dt_highl2_row97 = dice_dt.iloc[97]
dice_dt_highl2_row97

Unnamed: 0                                     97.0
Unnamed: 0.1                                   19.0
Unnamed: 0.1.1                                  0.0
scaled_input_Pregnancies                        0.0
scaled_input_Glucose                       0.502513
scaled_input_BloodPressure                 0.721311
scaled_input_SkinThickness                 0.606061
scaled_input_Insulin                       0.130024
scaled_input_BMI                           0.697466
scaled_input_DiabetesPedigreeFunction      0.377455
scaled_input_Age                           0.166667
scaled_input_Outcome                              N
origin_input_Pregnancies                        0.0
origin_input_Glucose                     100.000002
origin_input_BloodPressure                87.999997
origin_input_SkinThickness                60.000002
origin_input_Insulin                     110.000002
origin_input_BMI                          46.800002
origin_input_DiabetesPedigreeFunction         0.962
origin_input

In [None]:
from sklearn import tree

In [15]:
# NOTE(review): as recorded, this cell fails with NameError: y_test is not defined.
# y_test is only created in the earlier train/test array cell, and the execution
# counters restart before the CSV analysis, so that cell was not run in this kernel.
# NOTE(review): even with y_test available this fit looks unlikely to work:
# X is a single row (a Series from dice_dt.iloc[72]) that includes the string 'Y',
# while scikit-learn's fit expects a 2-D numeric X with one row per label in y.
# Confirm which data the tree is meant to be trained on.
X,y = dice_dt_highl2_row73, y_test
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X,y)

NameError: name 'y_test' is not defined