In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle

from time import time
from utils.df_loader import load_adult_df
from utils.preprocessing import remove_missing_values
from utils.preprocessing import label_encode, min_max_scale_numerical
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from alibi.explainers import CounterFactualProto, CounterFactual
from alibi_cf.utils import get_cat_vars_dict

# Global notebook configuration: logging, TF1 compatibility mode, and RNG seeds.
# Level 40 is logging.ERROR, so only errors and above are emitted by TensorFlow.
tf.get_logger().setLevel(40) # suppress deprecation messages
tf.compat.v1.disable_v2_behavior() # disable TF2 behaviour as alibi code still relies on TF1 constructs
# Reset Keras' global state so re-running this cell starts from a clean slate.
tf.keras.backend.clear_session()
# Silence pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None 

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False


# One seed shared by TensorFlow, NumPy and (below) the train/test split.
seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


TF version:  2.4.0-rc0
Eager execution enabled:  False


In [2]:
# Load the dataset through the project helper. Besides the frame it returns the
# column metadata used by later cells (feature/target names and column groups).
# NOTE(review): presumably the UCI Adult dataset, judging by the helper's name - confirm in utils.df_loader.
df, feature_names, numerical_cols, categorical_cols, columns_type, target_name, possible_outcomes = load_adult_df()

In [3]:
# Scale the numerical columns (min-max, per the helper's name) and keep the fitted
# `scaler`: its inverse_transform is needed later to map results back to original units.
scaled_df, scaler = min_max_scale_numerical(df, numerical_cols)

In [4]:
# Label-encode the categorical columns. `encoder_dict` is keyed by column name and its
# encoders' inverse_transform is used later to decode inputs and counterfactuals.
# NOTE(review): `enconded_df` is a misspelling of `encoded_df`; it is referenced by
# other cells, so rename it everywhere in one go rather than here alone.
enconded_df, encoder_dict = label_encode(scaled_df, categorical_cols)

In [5]:
# Shuffled 80/20 train/test split, seeded so the same rows land in each part on every run.
train_df, test_df = train_test_split(
    enconded_df,
    train_size=0.8,
    shuffle=True,
    random_state=seed,
)

In [6]:
# Convert each split into plain NumPy arrays: a feature matrix (X) and a target vector (y).
X_train, y_train = np.array(train_df[feature_names]), np.array(train_df[target_name])
X_test, y_test = np.array(test_df[feature_names]), np.array(test_df[target_name])

In [7]:
### Train
# Feed-forward binary classifier: five ReLU hidden layers followed by a single
# sigmoid unit, so predict() returns one probability per row.
# `model` is kept as an alias of `nn` in case other cells refer to it.
nn = model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(1),
        tf.keras.layers.Activation(tf.nn.sigmoid),
    ]
)
nn.compile(optimizer="Adam", loss='binary_crossentropy', metrics=['accuracy'])
nn.fit(X_train, y_train, batch_size=64, epochs=20, shuffle=True)

# The three classifiers that are explained later, keyed by a short name.
models = {
    "dt": DecisionTreeClassifier().fit(X_train, y_train),
    "rfc": RandomForestClassifier().fit(X_train, y_train),
    "nn": nn,
}

# Persist the fitted models. The scikit-learn models are pickled inside `with`
# blocks so each file is flushed and closed before the next cell reads it back.
for model_name in ('dt', 'rfc'):
    with open(f'./saved_models/{model_name}.p', 'wb') as model_file:
        pickle.dump(models[model_name], model_file)
models['nn'].save('./saved_models/nn.h5', overwrite=True)


Train on 26048 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
### Load
# Reload the models saved under ./saved_models so the rest of the notebook runs
# against the persisted versions.
models = {}
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook's training cell wrote itself.
for model_name in ('dt', 'rfc'):
    with open(f'./saved_models/{model_name}.p', 'rb') as model_file:
        models[model_name] = pickle.load(model_file)
models['nn'] = tf.keras.models.load_model('./saved_models/nn.h5')

## Initialise NN output shape as (None, 1) for tensorflow.v1
# The dummy batch takes its width from the training data instead of a hard-coded
# feature count, so it stays valid if the feature set changes.
models['nn'].predict(np.zeros((2, X_train.shape[1])))



array([[0.2731029],
       [0.2731029]], dtype=float32)

In [9]:
# Sanity check: run the first test row (as a 1 x n_features batch) through every model.
example_data = X_test[0].reshape(1, -1)

# The scikit-learn models return a class label; the network returns a sigmoid probability.
dt_pred, rfc_pred = (models[name].predict(example_data)[0] for name in ('dt', 'rfc'))
nn_pred = models['nn'].predict(example_data)[0][0]

print(f"DT [{dt_pred}], RFC [{rfc_pred}], NN [{nn_pred}]")

DT [0], RFC [0], NN [0.18025292456150055]


# Alibi

## 1. Counterfactuals Guided by Prototypes (`CounterFactualProto`)

In [10]:
# Build the value passed as `cat_vars` to CounterFactualProto.
# NOTE(review): presumably maps each categorical feature's column index to its number
# of categories (alibi's format when ohe=False) - confirm in alibi_cf.utils.
cat_vars_dict = get_cat_vars_dict(enconded_df, categorical_cols, feature_names, target_name)

In [11]:
from alibi_cf import AlibiBinaryPredictWrapper

In [12]:
# Wrap every model in the project's adapter; the wrapper's `predict` is what gets
# handed to alibi as the prediction function.
alibi_wrapped = {
    model_name: AlibiBinaryPredictWrapper(models[model_name])
    for model_name in ('dt', 'rfc', 'nn')
}

In [13]:
# Per-feature bounds that counterfactuals must stay within, each shaped (1, n_features).
# alibi expects the tuple ordered as (min, max); passing (max, min) inverts the
# bounds used when perturbed instances are clipped.
feature_range = (np.amin(X_train, 0).reshape(1, -1), np.amax(X_train, 0).reshape(1, -1))

In [14]:
# One prototype-guided counterfactual explainer per wrapped model, fitted on the training set.
cf_p_dict = {}

for model_name, wrapped_model in alibi_wrapped.items():
    explainer = CounterFactualProto(
        wrapped_model.predict,
        example_data.shape,
        use_kdtree=True,
        theta=10.,
        max_iterations=1000,
        cat_vars=cat_vars_dict,
        feature_range=feature_range,
        ohe=False,
        c_init=1.,
        c_steps=10,
    )
    cf_p_dict[model_name] = explainer
    explainer.fit(X_train)

# Trailing empty string keeps the cell output to a bare ''.
""

''

In [15]:
# Number of rows, taken from the start of X_test, that get explained.
num_instances = 20
# Number of times explain() is run for each of those rows.
num_cf_per_instance = 5

In [16]:
# For every model, explain the first `num_instances` test rows `num_cf_per_instance`
# times each, recording the input, the counterfactual (None when none was found)
# and the wall-clock time of the explain() call.
results = {}
for model_name, explainer in cf_p_dict.items():
    model_results = []
    results[model_name] = model_results
    print(f"Finding counterfactual for {model_name}")
    for idx, instance in enumerate(X_test[:num_instances]):
        print(f"instance {idx}")
        example = instance.reshape(1, -1)
        for num_cf in range(num_cf_per_instance):
            print(f"CF {num_cf}")
            started_at = time()
            explanation = explainer.explain(example)
            elapsed = time() - started_at
            model_results.append({
                "input": example,
                "cf": explanation.cf,
                "running_time": elapsed,
            })

Finding counterfactual for dt
instance 0
instance 1
instance 2
instance 3
instance 4
Finding counterfactual for rfc
instance 0
instance 1
instance 2
instance 3
instance 4
Finding counterfactual for nn
instance 0
instance 1
instance 2
instance 3
instance 4


In [75]:
# Flatten the raw explainer results into one DataFrame per model. Each row holds the
# scaled input features, the scaled counterfactual features (NaN when no
# counterfactual was found), a "Found" flag ("Y"/"N") and the running time.
all_df = {}

for model_name, model_results in results.items():
    rows = []

    for result in model_results:
        row = {}

        for feature, value in zip(feature_names, result['input'].flatten().tolist()):
            row[f"scaled_input_{feature}"] = value

        cf = result['cf']
        if cf is not None:
            for feature, value in zip(feature_names, cf['X'].flatten().tolist()):
                row[f"scaled_cf_{feature}"] = value
            row["Found"] = "Y"
        else:
            # Keep the same columns for misses so every row has an identical schema.
            for feature in feature_names:
                row[f"scaled_cf_{feature}"] = float("nan")
            row["Found"] = "N"

        row['running_time'] = result['running_time']
        rows.append(row)

    all_df[model_name] = pd.DataFrame(rows)
    

In [76]:
# Add human-readable "origin_*" columns next to the scaled ones by inverting the label
# encoding (categorical) and the scaling (numerical), then write one CSV per model.
all_complete_df = {}

# Column groups for the numerical features; they are the same for every model.
scaled_input_num_cols = [f"scaled_input_{col}" for col in numerical_cols]
scaled_cf_num_cols = [f"scaled_cf_{col}" for col in numerical_cols]
origin_input_num_cols = [f"origin_input_{col}" for col in numerical_cols]
origin_cf_num_cols = [f"origin_cf_{col}" for col in numerical_cols]

for df_k in all_df.keys():
    temp_df = all_df[df_k].copy(deep=True)

    # Rows where a counterfactual exists; only those can be decoded.
    found_mask = temp_df['Found'] == 'Y'
    any_found = found_mask.any()

    ### Categorical data
    for col_name in encoder_dict.keys():
        if col_name == target_name:
            continue
        encoder = encoder_dict[col_name]

        ### Do it for input
        # Builtin `int` replaces `np.int`, an alias of it that NumPy 1.24 removed.
        temp_df[f'origin_input_{col_name}'] = encoder.inverse_transform(
            temp_df[f'scaled_input_{col_name}'].astype(int)
        )

        ### Do it for cf
        if any_found:
            temp_df.loc[found_mask, f'origin_cf_{col_name}'] = encoder.inverse_transform(
                temp_df.loc[found_mask, f'scaled_cf_{col_name}'].astype(int)
            )
        else:
            temp_df[f'origin_cf_{col_name}'] = [float('nan')] * len(temp_df)

    ### Numerical data
    temp_df[origin_input_num_cols] = scaler.inverse_transform(temp_df[scaled_input_num_cols])

    if any_found:
        temp_df.loc[found_mask, origin_cf_num_cols] = scaler.inverse_transform(
            temp_df.loc[found_mask, scaled_cf_num_cols]
        )
    else:
        temp_df[origin_cf_num_cols] = np.ones_like(temp_df[scaled_input_num_cols]) * float('nan')

    all_complete_df[df_k] = temp_df
    all_complete_df[df_k].to_csv(f"{df_k}_result.csv")


In [None]:
# Positions of the numerical and categorical features within `feature_names`.
# The target column is not a feature, so it is left out of the categorical list.
numerical_col_idxs = list(map(feature_names.index, numerical_cols))
categorical_col_idxs = list(
    map(feature_names.index, (col for col in categorical_cols if col != target_name))
)