In [7]:
import tensorflow as tf
import pandas as pd
import numpy as np
from surrogate import rules

from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.preprocessing import preprocess_df
from sklearn.model_selection import train_test_split
from utils.dice import generate_dice_result, process_results
from utils.models import train_three_models, evaluation_test, save_three_models, load_three_models
from utils.save import save_result_as_csv
from IPython.display import Image
import PIL
import pydotplus
from six import StringIO
from sklearn.tree import export_graphviz
from pydotplus import *


pd.options.mode.chained_assignment = None 

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False

seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)


TF version:  2.0.0
Eager execution enabled:  True


In [8]:
def get_location(scaler, col):
    if col in scaler.feature_names_in_:
        return np.where(scaler.feature_names_in_ == col)[0]
    
    else:
        raise Exception(f"Column [{col}] not a feature in this scaler, scaler features: {scaler.feature_names_in_}")


def get_scaled_value(scaler, col, X):
    loc=get_location(scaler, col)

    X *= scaler.scale_[loc]
    X += scaler.min_[loc]

    if scaler.clip:
        np.clip(X, scaler.feature_range[0][loc], scaler.feature_range[1][loc], out=X)
    
    return X[0]


def get_original_value(scaler, col, X):
    loc = get_location(scaler, col)

    X -= scaler.min_[loc]
    X /= scaler.scale_[loc]

    return X[0]

In [9]:
#### Select dataset ####'

dataset_name = 'german' # [adult, german, compas]

if dataset_name == 'adult':
    dataset_loading_fn = load_adult_df
elif dataset_name == 'german':
    dataset_loading_fn = load_german_df
elif dataset_name == 'compas':
    dataset_loading_fn = load_compas_df
elif dataset_name == 'diabetes':
    dataset_loading_fn = load_diabetes_df
elif dataset_name == 'breast_cancer':
    dataset_loading_fn = load_breast_cancer_df
else:
    raise Exception("Unsupported dataset")

In [10]:
#### Load datafram info.
df_info = preprocess_df(dataset_loading_fn)
### Seperate to train and test set.
train_df, test_df = train_test_split(df_info.dummy_df, train_size=.8, random_state=seed, shuffle=True)
### Get training and testing array.
X_train = np.array(train_df[df_info.ohe_feature_names])
y_train = np.array(train_df[df_info.target_name])
X_test = np.array(test_df[df_info.ohe_feature_names])
y_test = np.array(test_df[df_info.target_name])
### Load models.
models = load_three_models(X_train.shape[-1], dataset_name)



https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [15]:
feature_names=df_info.ohe_feature_names
totoal_ohe_feature_names = len(feature_names)
totoal_ohe_feature_names
totoal_ohe_feature_names

61

In [16]:
feature_names

['duration_in_month',
 'credit_amount',
 'installment_as_income_perc',
 'present_res_since',
 'age',
 'credits_this_bank',
 'people_under_maintenance',
 'account_check_status_0 <= ... < 200 DM',
 'account_check_status_< 0 DM',
 'account_check_status_>= 200 DM / salary assignments for at least 1 year',
 'account_check_status_no checking account',
 'credit_history_all credits at this bank paid back duly',
 'credit_history_critical account/ other credits existing (not at this bank)',
 'credit_history_delay in paying off in the past',
 'credit_history_existing credits paid back duly till now',
 'credit_history_no credits taken/ all credits paid back duly',
 'purpose_(vacation - does not exist?)',
 'purpose_business',
 'purpose_car (new)',
 'purpose_car (used)',
 'purpose_domestic appliances',
 'purpose_education',
 'purpose_furniture/equipment',
 'purpose_radio/television',
 'purpose_repairs',
 'purpose_retraining',
 'savings_.. >= 1000 DM ',
 'savings_... < 100 DM',
 'savings_100 <= ... <

In [11]:
#pip install pillow
#pip install dtreeviz
#pip install Graphviz
from dtreeviz.trees import *
from sklearn.tree import DecisionTreeClassifier
proto_dt = pd.read_csv(r'./datasets/eval_proto_german_dt_result.csv')


In [12]:
proto_dt

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_duration_in_month,scaled_input_credit_amount,scaled_input_installment_as_income_perc,scaled_input_present_res_since,scaled_input_age,scaled_input_credits_this_bank,scaled_input_people_under_maintenance,scaled_input_account_check_status,...,origin_cf_job,origin_cf_telephone,origin_cf_foreign_worker,origin_cf_default,L1,L2,Sparsity,Realistic,MAD,Mahalanobis
0,0,0,0.470588,0.365192,1.0,0.666667,0.178571,0.000000,0,< 0 DM,...,,,,,,,,,,
1,1,0,0.470588,0.365192,1.0,0.666667,0.178571,0.000000,0,< 0 DM,...,,,,,,,,,,
2,2,0,0.470588,0.365192,1.0,0.666667,0.178571,0.000000,0,< 0 DM,...,,,,,,,,,,
3,3,0,0.470588,0.365192,1.0,0.666667,0.178571,0.000000,0,< 0 DM,...,,,,,,,,,,
4,4,0,0.470588,0.365192,1.0,0.666667,0.178571,0.000000,0,< 0 DM,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0,0.205882,0.452350,0.0,0.333333,0.071429,0.333333,0,< 0 DM,...,,,,,17.995708,4.149781,21.0,#DIV/0!,1.163964,1.939782
96,96,0,0.205882,0.452350,0.0,0.333333,0.071429,0.333333,0,< 0 DM,...,,,,,,,,,,
97,97,0,0.205882,0.452350,0.0,0.333333,0.071429,0.333333,0,< 0 DM,...,,,,,,,,,,
98,98,0,0.205882,0.452350,0.0,0.333333,0.071429,0.333333,0,< 0 DM,...,,,,,,,,,,


In [6]:

# VISUALISE_DECISION_TREE_PATH
# 
def visualize_decision_tree_path( clf, graph, instance, indx, exp_type, path ):  

  for i, node in enumerate(graph.get_node_list()):
    if node.get_attributes().get('label') is None:
        continue

    if 'samples = ' in node.get_attributes()['label']:
        labels = node.get_attributes()['label'].split('<br/>')
        for i, label in enumerate(labels):
            if label.startswith('samples = '):
                labels[i] = 'samples = 0'
        node.set('label', '<br/>'.join(labels))
        node.set_fillcolor('white')
    
    lable_str = node.get_attributes()['label']  
    feature_value_str = (lable_str.split("<br/>")[0]).replace('<','')
    feature_name = feature_value_str.split(" ")[0]
    if feature_name in df_info.scaler.feature_names_in_:
      feature_value = float(feature_value_str.split(" ")[-1])
      original_value = get_original_value(df_info.scaler, feature_name , feature_value)
      #print(f"name: {feature_name}. value: {feature_value}, original_value: {original_value}")
      original_lable_str = lable_str.replace(feature_value_str, ' '.join([*(feature_value_str.split(' '))[:-1],f'{original_value:.2f}']))
      node.set('label', original_lable_str)
    #raise StopIteration()

  samples = instance
  
  decision_paths = clf.decision_path( [samples] )

  for decision_path in decision_paths:
    for n, node_value in enumerate(decision_path.toarray()[0]):
      if node_value == 0:
        continue
      node = graph.get_node(str(n))[0]            
      node.set_fillcolor('green')
      labels = node.get_attributes()['label'].split('<br/>')
      for i, label in enumerate(labels):
        if label.startswith('samples = '):
          labels[i] = 'samples = {}'.format(int(label.split('=')[1]) + 1)
      
      node.set('label', '<br/>'.join(labels))

  filename = path
  graph.write_png(filename)

  im = PIL.Image.open(filename)
  return im


# EXTRACT_DECISION_PATH
# 
def extract_decision_path(clf, graph, instance ):

  #samples = instance['scaled_vector']
  #decision_paths = clf.decision_path( [samples] )
  samples = instance
  decision_paths = clf.decision_path( [samples] )

  for decision_path in decision_paths:
    path = []
    for n, node_value in enumerate(decision_path.toarray()[0]):
      if node_value == 0:
        continue

      node = graph.get_node(str(n))[0]
      labels = node.get_attributes()['label'].split('<br/>')
      labels.pop(1)
      try:
        labels.remove("samples = 1")
        labels.remove("samples = 0")
        labels.remove("gini = 0.0")
      except ValueError:
        labels
      path.append(labels)
  return path

def export_decision_tree(clf, class_names, DATASET_NAME, path):
  dot_data = StringIO()
  export_graphviz(clf, out_file=dot_data,  
                  filled=True, rounded=True,
                  special_characters=True, feature_names = feature_names,
                  class_names=class_names)

  # convert to png format
  graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
  graph.write_png(path)
  return graph


In [7]:
test1 = proto_dt.nsmallest(1,['L2'])
test1

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_duration_in_month,scaled_input_credit_amount,scaled_input_installment_as_income_perc,scaled_input_present_res_since,scaled_input_age,scaled_input_credits_this_bank,scaled_input_people_under_maintenance,scaled_input_account_check_status,...,origin_cf_job,origin_cf_telephone,origin_cf_foreign_worker,origin_cf_default,L1,L2,Sparsity,Realistic,MAD,Mahalanobis
10,10,0,0.294118,0.114614,1.0,0.333333,0.178571,0.0,0,0 <= ... < 200 DM,...,unskilled - resident,none,yes,N,12.920637,3.498948,16.0,True,0.969441,1.542674


In [8]:
test = proto_dt.nlargest(1,['L2'])
test

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_duration_in_month,scaled_input_credit_amount,scaled_input_installment_as_income_perc,scaled_input_present_res_since,scaled_input_age,scaled_input_credits_this_bank,scaled_input_people_under_maintenance,scaled_input_account_check_status,...,origin_cf_job,origin_cf_telephone,origin_cf_foreign_worker,origin_cf_default,L1,L2,Sparsity,Realistic,MAD,Mahalanobis
55,55,0,0.117647,0.052272,1.0,1.0,0.071429,0.0,0,< 0 DM,...,unskilled - resident,none,yes,N,24.241348,4.901191,27.0,True,0.694347,2.468314


In [9]:
#### import evaluation function.
from utils.evaluation import prepare_evaluation_dict
input_and_cf = prepare_evaluation_dict(proto_dt, df_info)


clf=models['dt']
#clf = DecisionTreeClassifier()
class_names = ["No", "Yes"]



In [12]:
#filepath = PATH + 'Whitebox_Model/explanations/' + DATASET_NAME.replace(".csv", "")+ '/Decision_Tree/' + exp_type + '/decision_tree_general_' + str(INDX) + '.png'
PATH = "GitHub/Counterfactual-benchmark/"
dataset_name = 'german' # [adult, german, compas]
filepath = "/Users/yu-liangchou/Desktop/cf_experiment/tree.png"
#feature_names=df_info.feature_names
feature_names=df_info.ohe_feature_names
graph = export_decision_tree(clf, class_names, dataset_name, filepath) 



#Image(graph.create_png())

34 [label=<account_check_status_>= 200 DM / salary assignments for at least 1 year &le; 0.5<br/>gini = 0.198<br/>samples = 18<br/>value = [16, 2]<br/>class = No>, fillcolor="#e89152"] ;
   ^
Expected '}', found '['  (at char 4712), (line:71, col:4)


AttributeError: 'NoneType' object has no attribute 'write_png'

In [None]:
#### Plot the dt tree.
from sklearn import tree
import matplotlib.pyplot as plt
plt.figure(figsize=(80,40))
tree.plot_tree(models['dt'], fontsize=10, feature_names=df_info.ohe_feature_names, node_ids=True,rounded=True )
#If you need to colored the class >> filled=True


In [12]:
#### retrieve the input and cf vectors.
proto_input = input_and_cf['input'].loc[10]
proto_cf = input_and_cf['cf'].loc[10]







In [13]:
INDX = 5
exp_type = "true_positives"
#instance = new_min_l2_instance_instance_arr

img = visualize_decision_tree_path( clf, export_decision_tree(clf, class_names, dataset_name, filepath), proto_input, INDX, 'true_positives', filepath)
img



34 [label=<account_check_status_>= 200 DM / salary assignments for at least 1 year &le; 0.5<br/>gini = 0.198<br/>samples = 18<br/>value = [16, 2]<br/>class = No>, fillcolor="#e89152"] ;
   ^
Expected '}', found '['  (at char 4712), (line:71, col:4)


AttributeError: 'NoneType' object has no attribute 'write_png'

In [None]:
#exp_type = "L2_max"
INDX = 5
#instance = instance
#exp_type = "true_positives"
#instance = new_min_l2_instance_instance_arr

img = visualize_decision_tree_path( clf,  export_decision_tree(clf, class_names, dataset_name, filepath), proto_cf, INDX, 'true_positives', filepath)
img