In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from surrogate import rules

from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.preprocessing import preprocess_df
from sklearn.model_selection import train_test_split
from utils.dice import generate_dice_result, process_results
from utils.models import train_three_models, evaluation_test, save_three_models, load_three_models
from utils.save import save_result_as_csv
from IPython.display import Image
import PIL
import pydotplus
from six import StringIO
from sklearn.tree import export_graphviz
from pydotplus import *


pd.options.mode.chained_assignment = None 

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False

seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


TF version:  2.0.0
Eager execution enabled:  True


In [2]:
def get_location(scaler, col):
    if col in scaler.feature_names_in_:
        return np.where(scaler.feature_names_in_ == col)[0]
    
    else:
        raise Exception(f"Column [{col}] not a feature in this scaler, scaler features: {scaler.feature_names_in_}")


def get_scaled_value(scaler, col, X):
    loc=get_location(scaler, col)

    X *= scaler.scale_[loc]
    X += scaler.min_[loc]

    if scaler.clip:
        np.clip(X, scaler.feature_range[0][loc], scaler.feature_range[1][loc], out=X)
    
    return X[0]


def get_original_value(scaler, col, X):
    loc = get_location(scaler, col)

    X -= scaler.min_[loc]
    X /= scaler.scale_[loc]

    return X[0]

In [3]:
#### Select dataset ####'

dataset_name = 'adult' # [adult, german, compas]

if dataset_name == 'adult':
    dataset_loading_fn = load_adult_df
elif dataset_name == 'german':
    dataset_loading_fn = load_german_df
elif dataset_name == 'compas':
    dataset_loading_fn = load_compas_df
elif dataset_name == 'diabetes':
    dataset_loading_fn = load_diabetes_df
elif dataset_name == 'breast_cancer':
    dataset_loading_fn = load_breast_cancer_df
else:
    raise Exception("Unsupported dataset")

In [4]:
#### Load datafram info.
df_info = preprocess_df(dataset_loading_fn)
### Seperate to train and test set.
train_df, test_df = train_test_split(df_info.dummy_df, train_size=.8, random_state=seed, shuffle=True)
### Get training and testing array.
X_train = np.array(train_df[df_info.ohe_feature_names])
y_train = np.array(train_df[df_info.target_name])
X_test = np.array(test_df[df_info.ohe_feature_names])
y_test = np.array(test_df[df_info.target_name])
### Load models.
models = load_three_models(X_train.shape[-1], dataset_name)



https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
2022-04-21 23:14:26.459296: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-21 23:14:26.459711: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [5]:
#pip install pillow
#pip install dtreeviz
#pip install Graphviz
from dtreeviz.trees import *
from sklearn.tree import DecisionTreeClassifier
proto_dt = pd.read_csv(r'./datasets/eval_proto_adult_dt_result.csv')


In [10]:

proto_dt


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_age,scaled_input_capital-gain,scaled_input_capital-loss,scaled_input_hours-per-week,scaled_input_workclass,scaled_input_education,scaled_input_marital-status,scaled_input_occupation,...,origin_cf_race,origin_cf_sex,origin_cf_native-country,origin_cf_class,L1,L2,Sparsity,Realistic,MAD,Mahalanobis
0,0.0,0.0,0.520548,0.0,0.0,0.142857,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,...,,,,,,,,,,
1,1.0,0.0,0.520548,0.0,0.0,0.142857,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,...,,,,,,,,,,
2,2.0,0.0,0.520548,0.0,0.0,0.142857,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,...,,,,,,,,,,
3,3.0,0.0,0.520548,0.0,0.0,0.142857,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,...,,,,,,,,,,
4,4.0,0.0,0.520548,0.0,0.0,0.142857,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96.0,0.0,0.068493,0.0,0.0,0.397959,Private,HS-grad,Widowed,Adm-clerical,...,,,,,,,,,,
97,97.0,0.0,0.068493,0.0,0.0,0.397959,Private,HS-grad,Widowed,Adm-clerical,...,,,,,,,,,,
98,98.0,0.0,0.068493,0.0,0.0,0.397959,Private,HS-grad,Widowed,Adm-clerical,...,,,,,,,,,,
99,99.0,0.0,0.068493,0.0,0.0,0.397959,Private,HS-grad,Widowed,Adm-clerical,...,,,,,,,,,,


In [6]:

# VISUALISE_DECISION_TREE_PATH
# 
def visualize_decision_tree_path( clf, graph, instance, indx, exp_type, path ):  

  for i, node in enumerate(graph.get_node_list()):
    if node.get_attributes().get('label') is None:
        continue

    if 'samples = ' in node.get_attributes()['label']:
        labels = node.get_attributes()['label'].split('<br/>')
        for i, label in enumerate(labels):
            if label.startswith('samples = '):
                labels[i] = 'samples = 0'
        node.set('label', '<br/>'.join(labels))
        node.set_fillcolor('white')
    
    lable_str = node.get_attributes()['label']  
    feature_value_str = (lable_str.split("<br/>")[0]).replace('<','')
    feature_name = feature_value_str.split(" ")[0]
    if feature_name in df_info.scaler.feature_names_in_:
      feature_value = float(feature_value_str.split(" ")[-1])
      original_value = get_original_value(df_info.scaler, feature_name , feature_value)
      #print(f"name: {feature_name}. value: {feature_value}, original_value: {original_value}")
      original_lable_str = lable_str.replace(feature_value_str, ' '.join([*(feature_value_str.split(' '))[:-1],f'{original_value:.2f}']))
      node.set('label', original_lable_str)
    #raise StopIteration()

  samples = instance
  
  decision_paths = clf.decision_path( [samples] )

  for decision_path in decision_paths:
    for n, node_value in enumerate(decision_path.toarray()[0]):
      if node_value == 0:
        continue
      node = graph.get_node(str(n))[0]            
      node.set_fillcolor('green')
      labels = node.get_attributes()['label'].split('<br/>')
      for i, label in enumerate(labels):
        if label.startswith('samples = '):
          labels[i] = 'samples = {}'.format(int(label.split('=')[1]) + 1)
      
      node.set('label', '<br/>'.join(labels))

  filename = path
  graph.write_png(filename)

  im = PIL.Image.open(filename)
  return im


# EXTRACT_DECISION_PATH
# 
def extract_decision_path(clf, graph, instance ):

  #samples = instance['scaled_vector']
  #decision_paths = clf.decision_path( [samples] )
  samples = instance
  decision_paths = clf.decision_path( [samples] )

  for decision_path in decision_paths:
    path = []
    for n, node_value in enumerate(decision_path.toarray()[0]):
      if node_value == 0:
        continue

      node = graph.get_node(str(n))[0]
      labels = node.get_attributes()['label'].split('<br/>')
      labels.pop(1)
      try:
        labels.remove("samples = 1")
        labels.remove("samples = 0")
        labels.remove("gini = 0.0")
      except ValueError:
        labels
      path.append(labels)
  return path

def export_decision_tree(clf, class_names, DATASET_NAME, path):
  dot_data = StringIO()
  export_graphviz(clf, out_file=dot_data,  
                  filled=True, rounded=True,
                  special_characters=True, feature_names = feature_names,
                  class_names=class_names)

  # convert to png format
  graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
  graph.write_png(path)
  return graph


In [7]:
test1 = proto_dt.nsmallest(1,['L2'])
test1

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_age,scaled_input_capital-gain,scaled_input_capital-loss,scaled_input_hours-per-week,scaled_input_workclass,scaled_input_education,scaled_input_marital-status,scaled_input_occupation,...,origin_cf_race,origin_cf_sex,origin_cf_native-country,origin_cf_class,L1,L2,Sparsity,Realistic,MAD,Mahalanobis
15,15.0,0.0,0.493151,0.0,0.0,0.397959,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,...,White,Female,Haiti,<=50K,10.89111,3.225146,12.0,True,2.19109,1.486033


In [8]:
test = proto_dt.nlargest(1,['L2'])
test

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_age,scaled_input_capital-gain,scaled_input_capital-loss,scaled_input_hours-per-week,scaled_input_workclass,scaled_input_education,scaled_input_marital-status,scaled_input_occupation,...,origin_cf_race,origin_cf_sex,origin_cf_native-country,origin_cf_class,L1,L2,Sparsity,Realistic,MAD,Mahalanobis
70,70.0,0.0,0.424658,0.076881,0.0,0.397959,Self-emp-not-inc,Some-college,Married-civ-spouse,Craft-repair,...,White,Female,Haiti,<=50K,14.899497,3.787429,17.0,True,3.091775,1.637083


In [9]:
#### import evaluation function.
from utils.evaluation import prepare_evaluation_dict
input_and_cf = prepare_evaluation_dict(proto_dt, df_info)


clf=models['dt']
#clf = DecisionTreeClassifier()
class_names = ["No", "Yes"]



In [10]:
#!pip install pydot

In [11]:
#filepath = PATH + 'Whitebox_Model/explanations/' + DATASET_NAME.replace(".csv", "")+ '/Decision_Tree/' + exp_type + '/decision_tree_general_' + str(INDX) + '.png'
PATH = "GitHub/Counterfactual-benchmark/"
dataset_name = 'compas' # [adult, german, compas]
filepath = "/Users/yu-liangchou/Desktop/cf_experiment/tree.png"
#feature_names=df_info.feature_names
feature_names=df_info.ohe_feature_names
graph = export_decision_tree(clf, class_names, dataset_name, filepath) 


#import pydot
#graph = pydot.Dot('"class dependency"', graph_type='digraph')
#graph.add_node(pydot.Node('"graph"'))
#graph.write_png('output.png')


#Image(graph.create_png())

InvocationException: Program terminated with status: 1. stderr follows: Error: not well-formed (invalid token) in line 1 
... <HTML>native-country_Trinadad&Tobago &le; 0.5 ...
in label of node 2293


In [11]:
#### Plot the dt tree.
#from sklearn import tree
#import matplotlib.pyplot as plt
#plt.figure(figsize=(80,40))
#tree.plot_tree(models['dt'], fontsize=10, feature_names=df_info.ohe_feature_names, node_ids=True,rounded=True )
##If you need to colored the class >> filled=True


In [12]:
#### retrieve the input and cf vectors.
proto_input = input_and_cf['input'].loc[15]
proto_cf = input_and_cf['cf'].loc[15]







In [11]:
INDX = 15
exp_type = "true_positives"
#instance = new_min_l2_instance_instance_arr

img = visualize_decision_tree_path( clf, export_decision_tree(clf, class_names, dataset_name, filepath), proto_input, INDX, 'true_positives', filepath)
img


NameError: name 'filepath' is not defined

In [None]:

#exp_type = "L2_max"
INDX = 5
#instance = instance
#exp_type = "true_positives"
#instance = new_min_l2_instance_instance_arr

img = visualize_decision_tree_path( clf,  export_decision_tree(clf, class_names, dataset_name, filepath), proto_cf, INDX, 'true_positives', filepath)
img