In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

from utils.cf_proto import generate_cf_proto_result, process_result
from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.models import train_three_models, evaluation_test, save_three_models, load_three_models
from utils.preprocessing import inverse_dummy, preprocess_df
from utils.save import save_result_as_csv

### Disable TF2 behaviour and enable TF1 behaviour for alibi.
# These calls change process-wide TensorFlow / pandas settings; keep them in this order
# and run this cell before any cell that builds or loads a model.
tf.get_logger().setLevel(40)  # 40 is the numeric value of logging.ERROR
tf.compat.v1.disable_v2_behavior()
tf.keras.backend.clear_session()
pd.options.mode.chained_assignment = None  # switch off pandas' chained-assignment warning

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False


### Seed TensorFlow and NumPy; `seed` is also passed to train_test_split further down.
seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


TF version:  2.4.0-rc0
Eager execution enabled:  False


In [6]:
#### Select dataset ####

# Single source of truth for the supported datasets: each name maps to the loader
# function imported at the top of the notebook.
DATASET_LOADERS = {
    'adult': load_adult_df,
    'german': load_german_df,
    'compas': load_compas_df,
    'diabetes': load_diabetes_df,
    'breast_cancer': load_breast_cancer_df,
}

dataset_name = 'adult' # must be one of the keys of DATASET_LOADERS

# Fail early, naming the rejected value and the accepted ones, instead of raising a bare
# "Unsupported dataset". ValueError is still an Exception, so existing handlers keep working.
if dataset_name not in DATASET_LOADERS:
    raise ValueError(
        f"Unsupported dataset {dataset_name!r}; expected one of {sorted(DATASET_LOADERS)}"
    )

dataset_loading_fn = DATASET_LOADERS[dataset_name]

In [7]:
#### Load dataframe info.
# preprocess_df is handed the loader function itself. Later cells read the attributes
# dummy_df, ohe_feature_names, target_name and cat_to_ohe_cat from the returned object.
df_info = preprocess_df(dataset_loading_fn)

In [8]:
### Separate into train and test sets (80% of the rows go to train; shuffled with `seed`).
train_df, test_df = train_test_split(
    df_info.dummy_df,
    train_size=0.8,
    shuffle=True,
    random_state=seed,
)

In [10]:
### Preview the first 20 test rows after mapping them back through df_info.cat_to_ohe_cat.
# inverse_dummy is imported in the import cell at the top of the notebook.
inverse_dummy(test_df.iloc[:20], df_info.cat_to_ohe_cat)

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class,workclass,education,marital-status,occupation,relationship,race,sex,native-country
20713,0.520548,0.0,0.0,0.142857,0,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
13495,0.657534,0.0,0.0,0.193878,0,Private,Bachelors,Married-civ-spouse,Transport-moving,Husband,White,Male,United-States
12367,0.164384,0.0,0.0,0.397959,0,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,Male,Guatemala
22402,0.493151,0.0,0.0,0.397959,1,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
18338,0.205479,0.0,0.0,0.397959,0,Private,Bachelors,Never-married,Adm-clerical,Not-in-family,Asian-Pac-Islander,Male,United-States
1476,0.287671,0.0,0.0,0.479592,0,Private,HS-grad,Divorced,Other-service,Unmarried,White,Female,United-States
26209,0.205479,0.0,0.0,0.428571,1,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
18442,0.246575,0.0,0.0,0.397959,0,Private,Some-college,Married-civ-spouse,Prof-specialty,Husband,Amer-Indian-Eskimo,Male,United-States
28842,0.479452,0.0,0.0,0.5,0,Private,5th-6th,Never-married,Handlers-cleaners,Not-in-family,White,Male,United-States
13180,0.260274,0.0,0.0,0.397959,0,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States


In [5]:
### Get training and testing arrays.
# Column selectors are taken from df_info once and reused for both frames.
feature_columns = df_info.ohe_feature_names
target_column = df_info.target_name

X_train, y_train = np.array(train_df[feature_columns]), np.array(train_df[target_column])
X_test, y_test = np.array(test_df[feature_columns]), np.array(test_df[target_column])

In [6]:
### Train and save models (opt-in).
# Set RETRAIN_MODELS to True to fit the three models on the training arrays and store
# them with save_three_models. While it is False this cell does nothing, and the models
# are obtained from load_three_models in a later cell.
RETRAIN_MODELS = False

if RETRAIN_MODELS:
    ### Train models.
    models = train_three_models(X_train, y_train)

    ### Save models.
    save_three_models(models, dataset_name)

Train on 614 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [7]:
### Load the saved models for the selected dataset.
# The first argument is the size of the last axis of X_train.
n_input_features = X_train.shape[-1]
models = load_three_models(n_input_features, dataset_name)



In [8]:
### Print out accuracy on the test set.
# Scores the loaded `models` on the X_test / y_test arrays built from test_df.
evaluation_test(models, X_test, y_test)

DT: [0.7727] | RF [0.7987] | NN [0.7662]


# Alibi

## 1. Counterfactual Prototype

In [9]:
### Setting up the CF generating amount.
# Both values are passed positionally to generate_cf_proto_result in the next cell.
num_instances = 5  # presumably how many test instances get a CF search per model (confirm in utils.cf_proto)
num_cf_per_instance = 1  # presumably how many CFs are requested per instance (confirm in utils.cf_proto)

In [10]:
### Generate counterfactuals (CF) with the loaded models, then post-process the raw results.
results = generate_cf_proto_result(
    df_info,
    train_df,
    models,
    num_instances,
    num_cf_per_instance,
    X_train,
    X_test,
    y_test,
    max_iters=500,
)
result_dfs = process_result(results, df_info)

Finding counterfactual for dt
instance 0
CF 0
Found CF
instance 1
CF 0
Found CF
instance 2
CF 0
Found CF
instance 3
CF 0
CF not found
instance 4
CF 0
CF not found
Finding counterfactual for rfc
instance 0
CF 0
Found CF
instance 1
CF 0
CF not found
instance 2
CF 0
Found CF
instance 3
CF 0
CF not found
instance 4
CF 0
CF not found
Finding counterfactual for nn
instance 0
CF 0
Found CF
instance 1
CF 0
CF not found
instance 2
CF 0
CF not found
instance 3
CF 0
CF not found
instance 4
CF 0
CF not found


In [11]:
### Save result as file.
# "proto" and dataset_name are passed alongside the processed result frames; judging by
# the recorded output, they form the destination name (./results/proto_<dataset_name>).
save_result_as_csv("proto", dataset_name, result_dfs)

Result has been saved to ./results/proto_diabetes
