In [1]:
import os
import pickle
from time import time

import numpy as np
import pandas as pd
import tensorflow as tf
from alibi.explainers import CounterFactualProto, CounterFactual
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from alibi_cf.utils import get_cat_vars_dict
from utils.df_loader import load_adult_df
from utils.preprocessing import min_max_scale_numerical, remove_missing_values, inverse_dummy

# --- Global configuration: runs once, before any model or graph is built. ---
# 40 is the numeric value of logging.ERROR, so only errors get through.
tf.get_logger().setLevel(40) # suppress deprecation messages
tf.compat.v1.disable_v2_behavior() # disable TF2 behaviour as alibi code still relies on TF1 constructs
# Start from a clean Keras state in case this cell is re-run in a live kernel.
tf.keras.backend.clear_session()
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None 

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False


# Single seed shared by TensorFlow, NumPy and the train/test split.
seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


ModuleNotFoundError: No module named 'utils'

In [None]:
# Load the Adult dataset together with its column metadata (helper lives in utils.df_loader).
(
    df,
    feature_names,
    numerical_cols,
    categorical_cols,
    columns_type,
    target_name,
    possible_outcomes,
) = load_adult_df()

In [None]:
# Scale the numerical columns (presumably min-max scaling, going by the helper's name; its
# implementation is not visible here). The returned scaler is kept because its
# `inverse_transform` is used later to map values back to the original units.
scaled_df, scaler = min_max_scale_numerical(df, numerical_cols)

In [None]:
# Preview the first five rows of the scaled frame.
scaled_df.head(5)

In [None]:
# One-hot encode every categorical feature; the target column is deliberately left untouched.
dummy_df = pd.get_dummies(
    scaled_df,
    columns=[name for name in categorical_cols if name != target_name],
)

In [None]:
### We should have this amount of input features.
# `nunique()` ignores NaN, exactly as `pd.get_dummies` does by default (dummy_na=False).
# Counting with `len(unique())` would treat NaN as one extra category and over-report the
# expected width whenever a categorical column contains missing values.
sum(scaled_df[col].nunique() for col in categorical_cols if col != target_name) + len(numerical_cols)

In [None]:
# enconded_df, encoder_dict = label_encode(scaled_df, categorical_cols)

In [None]:
# Map each categorical feature to the one-hot columns `pd.get_dummies` generated for it.
# Generated columns are named "<feature>_<value>" (default prefix_sep="_"), so the match is
# made on "<feature>_" rather than on the bare feature name. Matching the bare name would
# also capture any other column whose name merely starts with the same characters.
cat_to_ohe_cat = {
    c_col: [
        ohe_col
        for ohe_col in dummy_df.columns
        if ohe_col.startswith(f"{c_col}_") and ohe_col != target_name
    ]
    for c_col in categorical_cols
    if c_col != target_name
}

In [None]:
# Model input columns: everything in the one-hot frame except the target.
ohe_feature_names = [name for name in dummy_df.columns if name != target_name]

In [None]:
# Preview the first five rows of the one-hot encoded frame.
dummy_df.head(5)

In [None]:
# Sanity check: run the project helper that is meant to undo the one-hot encoding and preview
# the result (helper implementation is not visible here — behaviour assumed from its name).
inverse_dummy(dummy_df, cat_to_ohe_cat).head(5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers.
# The encoder is fitted on the untouched `scaled_df` column rather than on
# `dummy_df[target_name]`, which this cell overwrites. That keeps the cell idempotent:
# re-running it no longer re-fits the encoder on already-encoded integers, which would lose
# the original class names that `inverse_transform` has to recover later in the notebook.
# `pd.get_dummies` leaves the target column and the row order unchanged, so both columns
# hold the same values on a first run.
target_label_encoder = LabelEncoder()
dummy_df[target_name] = target_label_encoder.fit_transform(scaled_df[target_name])

# Features first, target last, so column positions line up with `ohe_feature_names`.
dummy_df = dummy_df[ohe_feature_names + [target_name]]

In [None]:
# Hold out 20% of the rows for testing; the split is shuffled but reproducible through `seed`.
train_df, test_df = train_test_split(
    dummy_df,
    train_size=0.8,
    shuffle=True,
    random_state=seed,
)

In [None]:
# Split features and target into NumPy arrays for the models.
# The feature matrices are cast to float64 explicitly. With pandas >= 2.0 `get_dummies`
# emits boolean columns, and a frame mixing float and bool columns converts to an
# object-dtype array that the Keras model cannot consume. With older pandas (uint8 dummies)
# the result was already float64, so this does not change existing behaviour.
X_train = np.array(train_df[ohe_feature_names], dtype=np.float64)
y_train = np.array(train_df[target_name])
X_test = np.array(test_df[ohe_feature_names], dtype=np.float64)
y_test = np.array(test_df[target_name])

In [None]:
### Train
# Feed-forward binary classifier: five ReLU hidden layers followed by one sigmoid output unit.
nn = model= tf.keras.models.Sequential(
            [
                tf.keras.layers.Dense(24,activation='relu'),
                tf.keras.layers.Dense(12,activation='relu'),
                tf.keras.layers.Dense(12,activation='relu'),
                tf.keras.layers.Dense(12,activation='relu'),
                tf.keras.layers.Dense(12,activation='relu'),
                tf.keras.layers.Dense(1),
                tf.keras.layers.Activation(tf.nn.sigmoid),
            ]
        )
nn.compile(optimizer="Adam", loss='binary_crossentropy', metrics=['accuracy'])
nn.fit(X_train, y_train, batch_size=64, epochs=20, shuffle=True)

# The two scikit-learn baselines are fitted with default hyper-parameters.
models = {
    "dt": DecisionTreeClassifier().fit(X_train,y_train),
    "rfc": RandomForestClassifier().fit(X_train,y_train),
    "nn": nn,
}

# Persist all three models. The directory is created first so a fresh checkout does not fail,
# and the pickle files are written through context managers so the handles are always closed
# (the previous bare `open(...)` calls leaked them).
os.makedirs('./saved_models', exist_ok=True)
for sk_model_name in ('dt', 'rfc'):
    with open(f'./saved_models/{sk_model_name}.p', 'wb') as model_file:
        pickle.dump(models[sk_model_name], model_file)
models['nn'].save('./saved_models/nn.h5',overwrite=True)


In [None]:
### Load
# NOTE: unpickling executes arbitrary code stored in the file. Only load model files that
# were produced by this notebook, never files from an untrusted source.
models = {}
for sk_model_name in ('dt', 'rfc'):
    # Context manager guarantees the file handle is closed (the bare `open(...)` leaked it).
    with open(f'./saved_models/{sk_model_name}.p', 'rb') as model_file:
        models[sk_model_name] = pickle.load(model_file)
models['nn'] = tf.keras.models.load_model('./saved_models/nn.h5')

## Initialise NN output shape as (None, 1) for tensorflow.v1
models['nn'].predict(np.zeros((2, X_train.shape[-1])))

In [None]:
# Use the first test row, kept as a (1, n_features) batch, to smoke-test the three models.
example_data = X_test[:1]

# The scikit-learn models return one label per row; the Keras model returns a (1, 1) array.
dt_pred, rfc_pred = (models[name].predict(example_data)[0] for name in ('dt', 'rfc'))
nn_pred = models['nn'].predict(example_data)[0][0]

print(f"DT [{dt_pred}], RFC [{rfc_pred}], NN [{nn_pred}]")

# Alibi

## 1. Counterfactual Prototype

In [None]:
# Categorical-variable metadata built by the project helper `get_cat_vars_dict`
# (its exact structure is defined in alibi_cf.utils and is not visible from this notebook).
cat_vars_dict = get_cat_vars_dict(scaled_df, categorical_cols, feature_names, target_name)

In [None]:
# Display the helper's output for inspection.
cat_vars_dict

In [None]:
# Categorical input features, i.e. every categorical column except the target.
cat_feature_names = [name for name in categorical_cols if name != target_name]

In [None]:
# For every categorical feature, record how many one-hot columns it expands to and the
# position of the first of those columns within `train_df`.
train_columns = list(train_df.columns)
cat_vars_idx_info = []

for cat_col in cat_feature_names:
    ohe_prefix = f"{cat_col}_"
    # Positions of all one-hot columns belonging to this feature.
    ohe_positions = [train_columns.index(col) for col in train_columns if col.startswith(ohe_prefix)]
    num_unique_v = len(ohe_positions)
    first_index = min(ohe_positions)

    cat_vars_idx_info.append(
        dict(col=cat_col, num_unique_v=num_unique_v, first_index=first_index)
    )

In [None]:
# Mapping handed to the explainer as `cat_vars`:
# {index of a feature's first one-hot column: number of one-hot columns for that feature}.
cat_vars_ohe = {
    info['first_index']: info['num_unique_v']
    for info in cat_vars_idx_info
}


In [None]:
# Display the {first one-hot column index: number of one-hot columns} mapping for inspection.
cat_vars_ohe

In [None]:
from alibi_cf import AlibiBinaryPredictWrapper

In [None]:
# Wrap every model in the project's AlibiBinaryPredictWrapper; the wrapper's `.predict`
# is what gets handed to the alibi explainer.
alibi_wrapped = {
    model_key: AlibiBinaryPredictWrapper(models[model_key])
    for model_key in ('dt', 'rfc', 'nn')
}

In [None]:
# Bounds for the perturbed instances, in the (min, max) order that alibi expects.
# The tuple used to be (ones, zeros), i.e. min=1 and max=0, an inverted and therefore empty
# range. Numerical features are scaled upstream, so the valid range is taken to be [0, 1].
# NOTE(review): the width is len(feature_names), as in the original — confirm it matches the
# feature count alibi expects for `ohe=True`.
feature_range = (np.zeros((1, len(feature_names))), np.ones((1, len(feature_names))))

In [None]:
# Build and fit one prototype-guided counterfactual explainer per wrapped model.
cf_p_dict = {}

for k, wrapped_model in alibi_wrapped.items():
    proto_explainer = CounterFactualProto(
        wrapped_model.predict,
        example_data.shape,
        cat_vars=cat_vars_ohe,
        feature_range=feature_range,
        max_iterations=500,
        ohe=True,
    )
    # Register the explainer first, then fit it on the training features.
    cf_p_dict[k] = proto_explainer
    proto_explainer.fit(X_train)

# Trailing empty string is the cell's final expression.
""

In [None]:
# Number of rows, taken from the start of X_test, to generate counterfactuals for.
num_instances = 5
# Number of counterfactual searches to run for each of those rows.
num_cf_per_instance = 1

In [None]:
# Run the counterfactual search for every model on the first `num_instances` test rows.
# For each search we keep the decoded input, the decoded counterfactual (or None), the raw
# alibi explanation, the wall-clock running time, the ground truth and the model prediction.
results = {}
for k in cf_p_dict.keys():
    results[k] = []
    print(f"Finding counterfactual for {k}")
    for idx, instance in enumerate(X_test[0:num_instances]):
        print(f"instance {idx}")
        example = instance.reshape(1, -1)
        for num_cf in range(num_cf_per_instance):
            print(f"CF {num_cf}")
            start_t = time()
            exp = cf_p_dict[k].explain(example)
            end_t = time()
            running_time = end_t - start_t

            raw_prediction = models[k].predict(example)
            if k == 'nn':
                # The network outputs a sigmoid score; threshold it at 0.5 to get a class index.
                raw_prediction = (raw_prediction[0] > 0.5).astype(int)
            prediction = target_label_encoder.inverse_transform(raw_prediction)[0]

            if exp.cf is not None and len(exp.cf) > 0:
                print("Found CF")
                # Decoding is the same for every model type; the former `if k == 'nn'` / `else`
                # split had two byte-identical branches and has been collapsed.
                cf = inverse_dummy(pd.DataFrame(exp.cf['X'], columns=ohe_feature_names), cat_to_ohe_cat)
                cf.loc[0, target_name] = target_label_encoder.inverse_transform([exp.cf['class']])[0]
            else:
                print("CF not found")
                cf = None

            input_df = inverse_dummy(pd.DataFrame(example, columns=ohe_feature_names), cat_to_ohe_cat)
            input_df.loc[0, target_name] = prediction

            results[k].append({
                "input": input_df,
                "cf": cf,
                'exp': exp,
                "running_time": running_time,
                "ground_truth": target_label_encoder.inverse_transform([y_test[idx]])[0],
                "prediction": prediction,
            })

In [None]:
def _scaled_and_origin_frames(frame, kind):
    """Return two renamed copies of `frame`: scaled values and original-unit values.

    Parameters
    ----------
    frame : pd.DataFrame
        A decoded input or counterfactual frame whose numerical columns are still scaled.
    kind : str
        Tag inserted into the column names ("input" or "cf").

    Returns
    -------
    tuple of pd.DataFrame
        `(scaled, origin)`: `scaled` keeps the values as they are with columns renamed to
        "scaled_<kind>_<col>"; `origin` has the numerical columns passed through
        `scaler.inverse_transform` and columns renamed to "origin_<kind>_<col>".
    """
    scaled = frame.copy(deep=True)
    origin = frame.copy(deep=True)
    origin[numerical_cols] = scaler.inverse_transform(origin[numerical_cols])
    origin.columns = [f"origin_{kind}_{col}" for col in frame.columns]
    scaled.columns = [f"scaled_{kind}_{col}" for col in frame.columns]
    return scaled, origin


# Flatten the per-search results into one wide table per model.
all_df = {}

for k, model_results in results.items():

    all_data = []

    for result in model_results:
        # One-row, zero-column frame used as the join base (all joined frames have index 0).
        final_df = pd.DataFrame([{}])
        final_df = final_df.join(list(_scaled_and_origin_frames(result['input'], 'input')))

        cf_found = result['cf'] is not None
        if cf_found:
            final_df = final_df.join(list(_scaled_and_origin_frames(result['cf'], 'cf')))

        final_df['running_time'] = result['running_time']
        final_df['Found'] = "Y" if cf_found else "N"
        final_df['ground_truth'] = result['ground_truth']
        final_df['prediction'] = result['prediction']

        all_data.append(final_df)

    # `ignore_index=True` gives each row its own index; before, every row carried index 0.
    # An empty result list yields an empty frame instead of making `pd.concat` raise.
    all_df[k] = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()

In [None]:
# Write one CSV per model. The output directory is created first so the export does not
# fail on a fresh checkout where ./results does not exist yet.
os.makedirs("./results", exist_ok=True)
for df_k, result_df in all_df.items():
    result_df.to_csv(f"./results/proto_adult_{df_k}_result.csv")