In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle

from time import time
from utils.df_loader import load_adult_df
from utils.preprocessing import remove_missing_values
from utils.preprocessing import label_encode, min_max_scale_numerical
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from alibi.explainers import CounterFactualProto, CounterFactual
from alibi_cf.utils import get_cat_vars_dict

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None 

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # prints True in the recorded run below


# One seed shared by TensorFlow, NumPy's global RNG and the train/test split further down.
seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


TF version:  2.4.0-rc0
Eager execution enabled:  True


In [2]:
# Load the data frame together with its column metadata (feature names, numerical and
# categorical column lists, column types, target column name, possible outcomes).
df, feature_names, numerical_cols, categorical_cols, columns_type, target_name, possible_outcomes = load_adult_df()

In [3]:
# Scale the numerical columns with the project helper; the returned scaler is kept because
# scaler.inverse_transform is used later to map values back to the original units.
scaled_df, scaler = min_max_scale_numerical(df, numerical_cols)

In [4]:
# Display the scaled frame (categorical columns are still raw strings at this point).
scaled_df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,0.301370,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,0.021740,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.000000,0.0,0.122449,United-States,<=50K
2,0.287671,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.000000,0.0,0.397959,United-States,<=50K
3,0.493151,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.000000,0.0,0.397959,United-States,<=50K
4,0.150685,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.000000,0.0,0.397959,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0.000000,0.0,0.377551,United-States,<=50K
32557,0.315068,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.000000,0.0,0.397959,United-States,>50K
32558,0.561644,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0.000000,0.0,0.397959,United-States,<=50K
32559,0.068493,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0.000000,0.0,0.193878,United-States,<=50K


In [5]:
# One-hot encode every categorical feature; the target column is left as-is.
feature_cat_cols = [col for col in categorical_cols if col != target_name]
dummy_df = pd.get_dummies(scaled_df, columns=feature_cat_cols)

In [6]:
### Expected number of model inputs: one column per distinct value of every
### categorical feature, plus one column per numerical feature.
n_category_columns = sum(
    len(scaled_df[col].unique())
    for col in categorical_cols
    if col != target_name
)
n_category_columns + len(numerical_cols)

103

In [7]:
# label_enconded_df, encoder_dict = label_encode(scaled_df, categorical_cols)

In [8]:
# Map each categorical feature to the one-hot columns pd.get_dummies created for it.
# Columns are matched on the full "<feature>_" prefix ("_" is get_dummies' default
# separator) so that a feature whose name is merely the beginning of another column's
# name does not claim that column as well.
all_cat_ohe_cols = {}
for c_col in categorical_cols:
    if c_col == target_name:
        continue
    ohe_prefix = f"{c_col}_"
    all_cat_ohe_cols[c_col] = [
        ohe_col
        for ohe_col in dummy_df.columns
        if ohe_col != target_name and ohe_col.startswith(ohe_prefix)
    ]

In [9]:
# Model input columns, in dummy_df column order: every column except the target.
ohe_feature_names = [ col for col in dummy_df.columns if col != target_name]

In [10]:
# Display the one-hot encoded frame.
dummy_df

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.301370,0.021740,0.0,0.397959,<=50K,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.452055,0.000000,0.0,0.122449,<=50K,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.287671,0.000000,0.0,0.397959,<=50K,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,0.493151,0.000000,0.0,0.397959,<=50K,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0.150685,0.000000,0.0,0.397959,<=50K,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.000000,0.0,0.377551,<=50K,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32557,0.315068,0.000000,0.0,0.397959,>50K,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32558,0.561644,0.000000,0.0,0.397959,<=50K,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
32559,0.068493,0.000000,0.0,0.193878,<=50K,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
def inverse_dummy(dummy_df, all_cat_ohe_cols):
    """Collapse one-hot columns back into a single categorical column per feature.

    Parameters
    ----------
    dummy_df : pd.DataFrame
        Frame containing the one-hot columns listed in ``all_cat_ohe_cols``.
    all_cat_ohe_cols : dict
        Maps each categorical feature name to the list of its one-hot column
        names, each named ``"<feature>_<category>"``.

    Returns
    -------
    pd.DataFrame
        A new frame in which every group of one-hot columns is replaced by one
        column (named after the feature, appended in dict order) holding the
        category of the largest entry in the group. ``dummy_df`` is not modified.

    Notes
    -----
    The category is picked with ``idxmax``, so a row whose group is all zeros is
    assigned the first column of that group.
    """
    not_dummy_df = dummy_df.copy(deep=True)
    for feature, ohe_cols in all_cat_ohe_cols.items():
        prefix = f"{feature}_"
        hot_col = dummy_df[ohe_cols].idxmax(axis=1)
        # Strip only the leading "<feature>_". str.replace would also delete that
        # text wherever it re-appears inside the category name itself.
        not_dummy_df[feature] = hot_col.map(
            lambda name: name[len(prefix):] if name.startswith(prefix) else name
        )
        not_dummy_df = not_dummy_df.drop(columns=ohe_cols)
    return not_dummy_df

In [12]:
# Decode the one-hot frame back to categorical columns and display the result
# (for comparison with the scaled frame shown earlier).
inverse_dummy(dummy_df, all_cat_ohe_cols)

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,0.301370,0.021740,0.0,0.397959,<=50K,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,0.452055,0.000000,0.0,0.122449,<=50K,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,0.287671,0.000000,0.0,0.397959,<=50K,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,0.493151,0.000000,0.0,0.397959,<=50K,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,0.150685,0.000000,0.0,0.397959,<=50K,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.000000,0.0,0.377551,<=50K,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,0.315068,0.000000,0.0,0.397959,>50K,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,0.561644,0.000000,0.0,0.397959,<=50K,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,0.068493,0.000000,0.0,0.193878,<=50K,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [13]:
# enconded_df, encoder_dict = label_encode(scaled_df, categorical_cols)

In [14]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers.
# The encoder is fitted on the untouched labels in `scaled_df` (same rows, same order as
# `dummy_df`) instead of on `dummy_df[target_name]`, which this cell overwrites. Re-running
# the cell therefore no longer re-fits the encoder on already-encoded integers, which would
# make `inverse_transform` return ints instead of the original label strings.
target_label_encoder = LabelEncoder()
dummy_df[target_name] = target_label_encoder.fit_transform(scaled_df[target_name])

In [15]:
# 80/20 shuffled split of the encoded frame, seeded with the notebook-wide seed.
train_df, test_df = train_test_split(dummy_df, train_size=.8, random_state=seed, shuffle=True)

In [16]:
# Features and labels of both splits as NumPy arrays.
X_train, X_test = (np.array(split[ohe_feature_names]) for split in (train_df, test_df))
y_train, y_test = (np.array(split[target_name]) for split in (train_df, test_df))

In [17]:
### Train
# Feed-forward binary classifier: ReLU layers of 24, 12, 12, 12, 12 units followed by a
# single sigmoid output.
nn = model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(12, activation='relu'),
        tf.keras.layers.Dense(1),
        tf.keras.layers.Activation(tf.nn.sigmoid),
    ]
)
nn.compile(optimizer="Adam", loss='binary_crossentropy', metrics=['accuracy'])
nn.fit(X_train, y_train, batch_size=64, epochs=20, shuffle=True)

# random_state is passed explicitly so the tree models do not depend on how much of the
# global NumPy RNG stream was consumed before this cell ran (re-running the cell gives
# the same trees).
models = {
    "dt": DecisionTreeClassifier(random_state=seed).fit(X_train, y_train),
    "rfc": RandomForestClassifier(random_state=seed).fit(X_train, y_train),
    "nn": nn,
}

# Persist the models. The context manager guarantees each pickle file is flushed and
# closed; a bare open(...) passed to pickle.dump leaves the handle to the garbage collector.
for model_key in ('dt', 'rfc'):
    with open(f'./saved_models/{model_key}.p', 'wb') as model_file:
        pickle.dump(models[model_key], model_file)
models['nn'].save('./saved_models/nn.h5', overwrite=True)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
### Load
# NOTE: pickle.load can execute arbitrary code contained in the file; only load model
# files that this notebook (or another trusted source) wrote.
models = {}
for model_key in ('dt', 'rfc'):
    # Context manager so the file handle is closed as soon as the model is loaded.
    with open(f'./saved_models/{model_key}.p', 'rb') as model_file:
        models[model_key] = pickle.load(model_file)
models['nn'] = tf.keras.models.load_model('./saved_models/nn.h5')

## Initialise NN output shape as (None, 1) for tensorflow.v1
# The dummy batch width follows the feature list instead of a hard-coded 103, so it stays
# correct if the set of one-hot columns changes.
models['nn'].predict(np.zeros((2, len(ohe_feature_names))))

array([[0.21752243],
       [0.21752243]], dtype=float32)

In [19]:
# Quick check of all three models on the first test row, kept 2-D with shape (1, n_features).
example_data = X_test[0][np.newaxis, :]

dt_pred, rfc_pred = (models[name].predict(example_data)[0] for name in ('dt', 'rfc'))
nn_pred = models['nn'].predict(example_data)[0][0]

print(f"DT [{dt_pred}], RFC [{rfc_pred}], NN [{nn_pred}]")

DT [0], RFC [0], NN [0.46238642930984497]


# DiCE

In [20]:
import dice_ml

In [21]:
# Display the frame that is handed to DiCE in the next cell.
scaled_df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,0.301370,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,0.021740,0.0,0.397959,United-States,<=50K
1,0.452055,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.000000,0.0,0.122449,United-States,<=50K
2,0.287671,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.000000,0.0,0.397959,United-States,<=50K
3,0.493151,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.000000,0.0,0.397959,United-States,<=50K
4,0.150685,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.000000,0.0,0.397959,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0.000000,0.0,0.377551,United-States,<=50K
32557,0.315068,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.000000,0.0,0.397959,United-States,>50K
32558,0.561644,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0.000000,0.0,0.397959,United-States,<=50K
32559,0.068493,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0.000000,0.0,0.193878,United-States,<=50K


In [22]:
# DiCE data description built from the scaled (not one-hot encoded) frame: the numerical
# columns are declared as continuous features and `target_name` as the outcome column.
d = dice_ml.Data(dataframe=scaled_df, continuous_features=numerical_cols, outcome_name=target_name)

In [23]:
# m = dice_ml.Model(model=models['dt'], backend="sklearn")
# exp = dice_ml.Dice(d,m)

In [24]:
class RecordWrapper():
    """Adapter between frames with raw categorical columns and a model trained on one-hot features.

    The wrapper one-hot encodes incoming frames into the column layout given by
    ``ohe_feature_names`` before delegating to the wrapped model's ``predict`` /
    ``predict_proba``. Every frame received by either method is appended to
    ``all_inputs``.

    Parameters
    ----------
    model
        Object exposing ``predict`` and ``predict_proba`` on a 2-D array.
    all_cat_ohe_cols : dict
        Maps each categorical feature name to the list of its one-hot column
        names (``"<feature>_<category>"``).
    ohe_feature_names : list
        Column order of the array expected by ``model``.
    """

    def __init__(self, model, all_cat_ohe_cols, ohe_feature_names):
        self.all_inputs = []
        self.model = model
        self.all_cat_ohe_cols = all_cat_ohe_cols
        self.ohe_feature_names = ohe_feature_names

    def dice_to_input(self, input_df):
        """Return ``input_df`` one-hot encoded as an array with columns in ``ohe_feature_names`` order.

        ``input_df`` is not modified. The instance's own ``all_cat_ohe_cols`` and
        ``ohe_feature_names`` are used (not same-named notebook globals), so the
        wrapper keeps working if those globals are rebound or absent.
        """
        x = input_df.copy(deep=True)

        for feature, ohe_cols in self.all_cat_ohe_cols.items():
            # One-hot column name each row's category maps to ("<feature>_<category>").
            expected_col = f"{feature}_" + x[feature].astype(str)
            for ohe_col in ohe_cols:
                # Exact match on the column name. A substring test (`value in ohe_col`)
                # would also switch on every column whose name merely contains the value.
                x[ohe_col] = (expected_col == ohe_col).astype(int)
            x = x.drop(columns=[feature])

        return np.array(x[self.ohe_feature_names])

    def predict_proba(self, x):
        """Record ``x``, encode it and return the wrapped model's ``predict_proba`` output."""
        self.all_inputs.append(x)
        cf_input = self.dice_to_input(x)
        return self.model.predict_proba(cf_input)

    def predict(self, x):
        """Record ``x``, encode it and return the wrapped model's ``predict`` output."""
        self.all_inputs.append(x)
        cf_input = self.dice_to_input(x)
        return self.model.predict(cf_input)

class NNRecordWrapper():
    """Adapter between frames with raw categorical columns and a Keras model trained on one-hot features.

    Incoming frames are one-hot encoded into the column layout given by
    ``ohe_feature_names`` and passed to the wrapped model's ``predict``. Every
    frame received by ``predict_proba`` is appended to ``all_inputs``.

    Parameters
    ----------
    model
        Object exposing ``predict`` on a float tensor.
    all_cat_ohe_cols : dict
        Maps each categorical feature name to the list of its one-hot column
        names (``"<feature>_<category>"``).
    ohe_feature_names : list
        Column order of the input expected by ``model``.
    """

    def __init__(self, model, all_cat_ohe_cols, ohe_feature_names):
        self.all_inputs = []
        self.model = model
        self.all_cat_ohe_cols = all_cat_ohe_cols
        self.ohe_feature_names = ohe_feature_names

    def dice_to_input(self, input_df):
        """Return ``input_df`` one-hot encoded as an array with columns in ``ohe_feature_names`` order.

        ``input_df`` is not modified. The instance's own ``all_cat_ohe_cols`` and
        ``ohe_feature_names`` are used (not same-named notebook globals).
        """
        x = input_df.copy(deep=True)

        for feature, ohe_cols in self.all_cat_ohe_cols.items():
            # One-hot column name each row's category maps to ("<feature>_<category>").
            expected_col = f"{feature}_" + x[feature].astype(str)
            for ohe_col in ohe_cols:
                # Exact match on the column name. A substring test (`value in ohe_col`)
                # would also switch on every column whose name merely contains the value.
                x[ohe_col] = (expected_col == ohe_col).astype(int)
            x = x.drop(columns=[feature])

        return np.array(x[self.ohe_feature_names])

    def predict_proba(self, x):
        """Record ``x``, encode it and return the wrapped model's ``predict`` output unchanged.

        NOTE(review): the output shape is whatever the wrapped model produces. If the
        model has a single sigmoid output unit this is one column per row, whereas an
        sklearn-style ``predict_proba`` returns one column per class - this may be
        related to the "No Counterfactuals found" runs for the NN; confirm against the
        DiCE version in use before changing the shape.
        """
        self.all_inputs.append(x)
        cf_input = self.dice_to_input(x)
        return self.model.predict(tf.constant(cf_input.astype(float)))
        

In [25]:
# Wrap every trained model so it can be queried with raw (not one-hot encoded) frames.
wrapper_by_model = (
    ('dt', RecordWrapper),
    ('rfc', RecordWrapper),
    ('nn', NNRecordWrapper),
)
all_wrapped_models = {
    model_key: wrapper_cls(models[model_key], all_cat_ohe_cols, ohe_feature_names)
    for model_key, wrapper_cls in wrapper_by_model
}

In [26]:
# dt_record = RecordWrapper(models['dt'], all_cat_ohe_cols, ohe_feature_names)
# m = dice_ml.Model(model=dt_record, backend="sklearn")
# exp = dice_ml.Dice(d,m)
# dice_exp = exp.generate_counterfactuals(scaled_df.iloc[1:2], total_CFs=5, desired_class="opposite")
# dice_exp.cf_examples_list[0].final_cfs_df.iloc[0][:-1]

In [27]:
# One DiCE explainer per wrapped model, all built on the same data description `d`.
dice_cfs = {
    model_key: dice_ml.Dice(d, dice_ml.Model(model=wrapped_model, backend="sklearn"))
    for model_key, wrapped_model in all_wrapped_models.items()
}

In [28]:
# Number of test instances to explain (the first rows of test_df are used).
num_instances = 5
# Number of times the counterfactual search is repeated for each instance.
num_cf_per_instance = 1

In [29]:
results = {}

# Rows of scaled_df behind the first `num_instances` test instances. test_df carries index
# *labels* inherited from the original frame, so they are looked up with .loc; using them
# as positions (.iloc) is only correct while the index happens to be a 0..n-1 range.
query_rows = scaled_df.loc[test_df.index[:num_instances]]

for k in dice_cfs.keys():
    results[k] = []
    wrapped_model = all_wrapped_models[k]
    print(f"Finding counterfactual for {k}")
    for idx, (_, instance) in enumerate(query_rows.iterrows()):
        print(f"instance {idx}")
        for num_cf in range(num_cf_per_instance):
            print(f"CF {num_cf}")
            start_t = time()

            # Single-row frame with a fresh 0-based index.
            input_query = pd.DataFrame([instance.to_dict()])
            ground_truth = input_query[target_name][0]
            exp = dice_cfs[k].generate_counterfactuals(input_query, total_CFs=1, sample_size=200, desired_class="opposite")

            end_t = time ()
            running_time = end_t - start_t

            # Prediction of the model currently being explained (previously the decision
            # tree's prediction was stored for every model).
            if hasattr(wrapped_model, "predict"):
                predicted_labels = wrapped_model.predict(input_query)
            else:
                # Wrapper exposes probabilities only: take the most likely class when
                # there is one column per class, otherwise threshold the single
                # positive-class probability at 0.5.
                proba = np.asarray(wrapped_model.predict_proba(input_query))
                if proba.ndim == 2 and proba.shape[1] > 1:
                    predicted_labels = proba.argmax(axis=1)
                else:
                    predicted_labels = (proba.ravel() >= 0.5).astype(int)

            results[k].append({
                "input": input_query,
                "cf": exp.cf_examples_list[0].final_cfs_df,
                "running_time": running_time,
                "ground_truth": ground_truth,
                "prediction": target_label_encoder.inverse_transform(predicted_labels)[0],
            })

  0%|          | 0/1 [00:00<?, ?it/s]

Finding counterfactual for dt
instance 0
CF 0


100%|██████████| 1/1 [00:00<00:00,  4.60it/s]
100%|██████████| 1/1 [00:00<00:00,  5.95it/s]

instance 1
CF 0
instance 2


100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


CF 0
instance 3
CF 0


100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
100%|██████████| 1/1 [00:00<00:00,  5.13it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

instance 4
CF 0
Finding counterfactual for rfc
instance 0
CF 0


100%|██████████| 1/1 [00:00<00:00,  4.97it/s]
100%|██████████| 1/1 [00:00<00:00,  5.56it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

instance 1
CF 0
instance 2
CF 0


100%|██████████| 1/1 [00:00<00:00,  5.19it/s]
100%|██████████| 1/1 [00:00<00:00,  5.36it/s]


instance 3
CF 0
instance 4

100%|██████████| 1/1 [00:00<00:00,  5.73it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


CF 0
Finding counterfactual for nn
instance 0
CF 0


100%|██████████| 1/1 [00:00<00:00,  1.73it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
instance 1
CF 0


100%|██████████| 1/1 [00:00<00:00,  1.64it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
instance 2
CF 0


100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
instance 3
CF 0


100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec
instance 4
CF 0


100%|██████████| 1/1 [00:00<00:00,  1.87it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec





In [30]:
# scaled_input_df = results['dt'][0]['input'].copy(deep=True)
# origin_columns = [f"origin_input_{col}"  for col in scaled_input_df.columns]
# origin_input_df = scaled_input_df.copy(deep=True)
# scaled_input_df.columns = [f"scaled_input_{col}"  for col in scaled_input_df.columns]

# origin_input_df[numerical_cols] = scaler.inverse_transform(origin_input_df[numerical_cols])
# origin_input_df.columns = origin_columns

# scaled_cf_df = results['dt'][0]['cf'].copy(deep=True)
# scaled_cf_df.loc[0, target_name] = target_label_encoder.inverse_transform([scaled_cf_df.loc[0, target_name]])[0]
# origin_cf_columns = [f"origin_cf_{col}"  for col in scaled_cf_df.columns]
# origin_cf_df = scaled_cf_df.copy(deep=True)
# scaled_cf_df.columns = [f"scaled_cf_{col}"  for col in scaled_cf_df.columns]

# origin_cf_df[numerical_cols] = scaler.inverse_transform(origin_cf_df[numerical_cols])
# origin_cf_df.columns = origin_cf_columns

# final_df = pd.DataFrame([{}])
# final_df = final_df.join([scaled_input_df, origin_input_df, scaled_cf_df, origin_cf_df])
# final_df['running_time'] = results['dt'][0]['running_time']
# final_df['Found'] = "Y" if not results['dt'][0]['cf'] is None else "N"


In [31]:
# Build one result table per model: for every explained instance a single row holding the
# query and (when one was found) the counterfactual, each in scaled and original units.
all_df = {}

for model_key, model_results in results.items():

    instance_frames = []

    for record in model_results:
        cf_found = record['cf'] is not None

        # Query instance: scaled values and values mapped back through the scaler.
        input_scaled = record['input'].copy(deep=True)
        input_origin = input_scaled.copy(deep=True)
        input_origin[numerical_cols] = scaler.inverse_transform(input_origin[numerical_cols])

        row_df = pd.DataFrame([{}]).join([
            input_scaled.add_prefix("scaled_input_"),
            input_origin.add_prefix("origin_input_"),
        ])

        if cf_found:
            # Counterfactual: decode the target label first, then derive both views.
            cf_scaled = record['cf'].copy(deep=True)
            encoded_label = cf_scaled.loc[0, target_name]
            cf_scaled.loc[0, target_name] = target_label_encoder.inverse_transform([encoded_label])[0]

            cf_origin = cf_scaled.copy(deep=True)
            cf_origin[numerical_cols] = scaler.inverse_transform(cf_origin[numerical_cols])

            row_df = row_df.join([
                cf_scaled.add_prefix("scaled_cf_"),
                cf_origin.add_prefix("origin_cf_"),
            ])

        row_df['running_time'] = record['running_time']
        row_df['Found'] = "Y" if cf_found else "N"
        row_df['ground_truth'] = record['ground_truth']
        row_df['prediction'] = record['prediction']

        instance_frames.append(row_df)

    all_df[model_key] = pd.concat(instance_frames)

In [32]:
# Write one CSV file per model into the working directory.
for df_k, result_df in all_df.items():
    result_df.to_csv(f"dice_{df_k}_result.csv")