In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from surrogate import rules

from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.preprocessing import preprocess_df
from sklearn.model_selection import train_test_split
from utils.dice import generate_dice_result, process_results
from utils.models import train_three_models, evaluation_test, save_three_models, load_lp_three_models
from utils.save import save_result_as_csv
from IPython.display import Image
import PIL
import pydotplus
from six import StringIO
from sklearn.tree import export_graphviz
from pydotplus import *


# Fix the seed first so every later cell that reads `seed` or draws from the
# NumPy / TensorFlow global generators starts from the same state.
seed = 123
np.random.seed(seed)
tf.random.set_seed(seed)

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

# Report the runtime. In the recorded run (TF 2.0.0) the eager check printed True.
print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly())


TF version:  2.0.0
Eager execution enabled:  True


In [2]:
#### Select dataset ####

# Supported names: adult, german, compas, diabetes, breast_cancer.
dataset_name = 'compas'

# Resolve the loader through a lookup table keyed by dataset name.
# An unknown name raises the same generic Exception as before and leaves
# `dataset_loading_fn` unassigned.
try:
    dataset_loading_fn = {
        'adult': load_adult_df,
        'german': load_german_df,
        'compas': load_compas_df,
        'diabetes': load_diabetes_df,
        'breast_cancer': load_breast_cancer_df,
    }[dataset_name]
except KeyError:
    raise Exception("Unsupported dataset") from None

In [3]:
from alibi.datasets import fetch_adult

In [4]:
# Fetch the Adult dataset bundle and expose its parts as notebook-level names.
# NOTE: `feature_names` is rebound later by the COMPAS loading cell.
adult = fetch_adult()
data, target = adult.data, adult.target
feature_names, target_names = adult.feature_names, adult.target_names
category_map_tmp = adult.category_map

In [5]:
# Dimensions of the Adult feature matrix fetched above.
data.shape

(32561, 12)

In [6]:
# Column names of the Adult feature matrix (at this point `feature_names`
# still refers to the Adult dataset; a later cell rebinds it).
feature_names

['Age',
 'Workclass',
 'Education',
 'Marital Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Capital Gain',
 'Capital Loss',
 'Hours per week',
 'Country']

In [7]:
# Keys of the Adult category map.
# NOTE(review): the integer keys appear to be the positions of the categorical
# columns within `feature_names` — confirm against the alibi documentation.
category_map_tmp.keys()

dict_keys([1, 2, 3, 4, 5, 6, 7, 11])

In [8]:
# Load the COMPAS frame together with its column metadata.
# NOTE: this calls the COMPAS loader directly instead of `dataset_loading_fn`,
# and it rebinds `feature_names`, which previously held the Adult column names.
(
    df,
    feature_names,
    numerical_cols,
    categorical_cols,
    columns_type,
    target_name,
    possible_outcomes,
) = load_compas_df()


In [9]:
# scaled_input_days_b_screening_arrest

In [10]:
# Names of the numerical COMPAS columns returned by the loader.
numerical_cols

['age', 'priors_count', 'days_b_screening_arrest', 'length_of_stay']

In [11]:
#### Load dataframe info for the selected dataset.
df_info = preprocess_df(dataset_loading_fn)

### Separate `df_info.dummy_df` into an 80% train / 20% test split (seeded shuffle).
train_df, test_df = train_test_split(
    df_info.dummy_df,
    train_size=.8,
    random_state=seed,
    shuffle=True,
)

### Turn each split into a feature array and a target array.
X_train, y_train = (
    np.array(train_df[df_info.ohe_feature_names]),
    np.array(train_df[df_info.target_name]),
)
X_test, y_test = (
    np.array(test_df[df_info.ohe_feature_names]),
    np.array(test_df[df_info.target_name]),
)

### Load the models, passing the number of input features and the dataset name.
models = load_lp_three_models(X_train.shape[-1], dataset_name)



2022-07-07 15:37:57.835789: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-07 15:37:57.836920: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [12]:
from utils.evaluation import prepare_evaluation_dict

# Alternative result file kept for reference:
# proto_dt = pd.read_csv(r'./datasets/eval_proto_compas_dt_result.csv')

# Read the stored decision-tree result table for COMPAS.
# NOTE: both paths below are COMPAS-specific and do not follow `dataset_name`.
proto_dt = pd.read_csv(r'./results/proto_compas/proto_compas_dt_result.csv')
# Raw COMPAS csv; not referenced again in the visible part of the notebook.
original_compas = pd.read_csv(r'./datasets/COMPAS.csv')
# Build the evaluation dict; later cells read its 'input' and 'cf' entries.
input_and_cf = prepare_evaluation_dict(proto_dt, df_info)

In [13]:
# proto_dt['scaled_cf_days_b_screening_arrest'].value_counts()
# df_info.scaler.inverse_transform(
#     input_instance[df_info.numerical_cols]
# ).max()

In [14]:
# Display the result table read from the csv (scaled_input_* / origin_cf_* columns,
# plus running_time, Found, ground_truth and prediction).
proto_dt

Unnamed: 0.1,Unnamed: 0,scaled_input_age,scaled_input_priors_count,scaled_input_days_b_screening_arrest,scaled_input_length_of_stay,scaled_input_age_cat,scaled_input_sex,scaled_input_race,scaled_input_c_charge_degree,scaled_input_is_recid,...,origin_cf_race,origin_cf_c_charge_degree,origin_cf_is_recid,origin_cf_is_violent_recid,origin_cf_two_year_recid,origin_cf_class,running_time,Found,ground_truth,prediction
0,0,0.128205,0.026316,0.0,0.0,25 - 45,Female,Other,F,0,...,African-American,F,1.0,0.0,1.0,High,23.306272,Y,Medium-Low,Medium-Low
1,0,0.051282,0.157895,0.15894,0.077597,Less than 25,Male,Caucasian,F,1,...,,,,,,,28.059915,N,High,High
2,0,0.333333,0.0,0.000946,0.002503,25 - 45,Male,African-American,M,0,...,African-American,F,1.0,0.0,1.0,High,22.860507,Y,Medium-Low,Medium-Low


In [15]:
# Display the 'input' frame of the evaluation dict (one row per query instance).
input_and_cf['input']

Unnamed: 0,age,priors_count,days_b_screening_arrest,length_of_stay,age_cat_25 - 45,age_cat_Greater than 45,age_cat_Less than 25,sex_Female,sex_Male,race_African-American,...,race_Native American,race_Other,c_charge_degree_F,c_charge_degree_M,is_recid_0,is_recid_1,is_violent_recid_0,is_violent_recid_1,two_year_recid_0,two_year_recid_1
0,0.128205,0.026316,0.0,0.0,1,0,0,1,0,0,...,0,1,1,0,1,0,1,0,1,0
1,0.051282,0.157895,0.15894,0.077597,0,0,1,0,1,0,...,0,0,1,0,0,1,1,0,0,1
2,0.333333,0.0,0.000946,0.002503,1,0,0,0,1,1,...,0,0,0,1,1,0,1,0,1,0


In [16]:
# Row label of the instance to inspect.
INDX = 0

# Pull the query instance and its counterfactual for that row, in that order.
proto_input, proto_cf = (
    input_and_cf[part].loc[INDX] for part in ('input', 'cf')
)

In [17]:
from dtreeviz.trees import *

In [18]:
# Display the selected query instance.
# TODO: binary categoricals are encoded as two complementary columns
# (e.g. sex_Female / sex_Male); one column of each such pair should be dropped.
proto_input

age                        0.128205
priors_count               0.026316
days_b_screening_arrest    0.000000
length_of_stay             0.000000
age_cat_25 - 45            1.000000
age_cat_Greater than 45    0.000000
age_cat_Less than 25       0.000000
sex_Female                 1.000000
sex_Male                   0.000000
race_African-American      0.000000
race_Asian                 0.000000
race_Caucasian             0.000000
race_Hispanic              0.000000
race_Native American       0.000000
race_Other                 1.000000
c_charge_degree_F          1.000000
c_charge_degree_M          0.000000
is_recid_0                 1.000000
is_recid_1                 0.000000
is_violent_recid_0         1.000000
is_violent_recid_1         0.000000
two_year_recid_0           1.000000
two_year_recid_1           0.000000
Name: 0, dtype: float64

In [19]:
# Display the counterfactual that belongs to the selected query instance.
proto_cf

age                        0.0
priors_count               0.0
days_b_screening_arrest    0.0
length_of_stay             0.0
age_cat_25 - 45            1.0
age_cat_Greater than 45    0.0
age_cat_Less than 25       0.0
sex_Female                 1.0
sex_Male                   0.0
race_African-American      1.0
race_Asian                 0.0
race_Caucasian             0.0
race_Hispanic              0.0
race_Native American       0.0
race_Other                 0.0
c_charge_degree_F          1.0
c_charge_degree_M          0.0
is_recid_0                 0.0
is_recid_1                 1.0
is_violent_recid_0         1.0
is_violent_recid_1         0.0
two_year_recid_0           0.0
two_year_recid_1           1.0
Name: 0, dtype: float64

In [20]:
# Render the decision tree and highlight only the path followed by the
# selected counterfactual, then save the figure as an SVG.
clf = models['dt']

# NOTE(review): the result table labels outcomes as "Medium-Low" / "High",
# while the figure uses "No" / "Yes" — confirm these names (and their order)
# match the encoded target classes.
class_names = ["No", "Yes"]

# A row without a counterfactual would carry missing values; drawing a
# decision path for it would be meaningless, so fail loudly instead.
if proto_cf.isna().any():
    raise ValueError(
        f"Counterfactual at index {INDX} contains missing values; "
        "pick a row for which a counterfactual was found."
    )

# Pass the instance as a plain NumPy vector laid out in the same column order
# as X_train, rather than as a label-indexed pandas Series. This removes any
# dependence on how integer lookups into a Series are resolved and guarantees
# that position i of the instance is feature i of the model.
cf_instance = np.asarray(proto_cf[df_info.ohe_feature_names], dtype=float)

viz = dtreeviz(
    clf,
    x_data=X_train,
    y_data=y_train,
    target_name='target',
    feature_names=df_info.ohe_feature_names,
    class_names=class_names,
    title="Decision Tree - COMPAS with decision path",
    X=cf_instance,
    show_just_path=True,   # draw only the nodes on the instance's path
    max_X_features_TD=1,
    max_X_features_LR=1,
)

viz.save("./testing-1.svg")

In [21]:
"/"

'/'