In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from surrogate import rules

from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.preprocessing import preprocess_df
from sklearn.model_selection import train_test_split
from utils.dice import generate_dice_result, process_results
from utils.models import train_three_models, evaluation_test, save_three_models, load_three_models
from utils.save import save_result_as_csv
from IPython.display import Image
import PIL
import pydotplus
from six import StringIO
from sklearn.tree import export_graphviz
from pydotplus import *


# Turn off pandas' chained-assignment check for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# Report the TensorFlow build in use; the recorded run shows eager execution enabled (True).
print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly())

# One shared seed for NumPy and TensorFlow so reruns are repeatable.
seed = 123
np.random.seed(seed)
tf.random.set_seed(seed)


TF version:  2.0.0
Eager execution enabled:  True


In [2]:
#### Select dataset ####
# Supported names: adult, german, compas, diabetes, breast_cancer.
dataset_name = 'compas'

# Map the chosen name to its loader; any other name is rejected.
try:
    dataset_loading_fn = {
        'adult': load_adult_df,
        'german': load_german_df,
        'compas': load_compas_df,
        'diabetes': load_diabetes_df,
        'breast_cancer': load_breast_cancer_df,
    }[dataset_name]
except KeyError:
    raise Exception("Unsupported dataset") from None

In [66]:
# Load the raw frame and its column metadata through the loader chosen in the
# dataset-selection cell, so these variables follow `dataset_name` instead of
# always being COMPAS.
# NOTE(review): assumes every loader returns the same 7-tuple as load_compas_df — TODO confirm.
(
    df,
    feature_names,
    numerical_cols,
    categorical_cols,
    columns_type,
    target_name,
    possible_outcomes,
) = dataset_loading_fn()


In [None]:
# Column to check: scaled_input_days_b_screening_arrest

In [67]:
# Show the numerical column names returned by the dataset loader.
numerical_cols

['age',
 'priors_count',
 'days_b_screening_arrest',
 'is_recid',
 'is_violent_recid',
 'two_year_recid',
 'length_of_stay']

In [3]:
#### Load dataframe info for the selected dataset.
df_info = preprocess_df(dataset_loading_fn)

### Separate into train and test sets: 80% train, shuffled with the shared seed.
train_df, test_df = train_test_split(
    df_info.dummy_df,
    train_size=0.8,
    shuffle=True,
    random_state=seed,
)

### Get training and testing arrays (features first, then targets).
X_train, X_test = (
    np.array(part[df_info.ohe_feature_names]) for part in (train_df, test_df)
)
y_train, y_test = (
    np.array(part[df_info.target_name]) for part in (train_df, test_df)
)

### Load models; the first argument is the width of the feature matrix.
models = load_three_models(X_train.shape[-1], dataset_name)



https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
2022-07-01 15:39:41.013150: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-01 15:39:41.014232: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [4]:
from utils.evaluation import prepare_evaluation_dict

# Read the COMPAS CSV and the stored result table for the 'dt' model.
original_compas = pd.read_csv(r'./datasets/COMPAS.csv')
proto_dt = pd.read_csv(r'./datasets/eval_proto_compas_dt_result.csv')

# Build the evaluation dictionary from the result table; later cells index it
# with the 'input' and 'cf' keys.
input_and_cf = prepare_evaluation_dict(proto_dt, df_info)

In [34]:
# Largest value in the scaled_input_age column of the result table.
proto_dt['scaled_input_age'].max()

0.448717949

In [35]:
# Smallest value in the scaled_input_age column of the result table.
proto_dt['scaled_input_age'].min()

0.051282051

In [37]:
# List the columns available in the loaded result table.
proto_dt.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'scaled_input_age',
       'scaled_input_priors_count', 'scaled_input_days_b_screening_arrest',
       'scaled_input_is_recid', 'scaled_input_is_violent_recid',
       'scaled_input_two_year_recid', 'scaled_input_length_of_stay',
       'scaled_input_age_cat', 'scaled_input_sex', 'scaled_input_race',
       'scaled_input_c_charge_degree', 'scaled_input_class',
       'origin_input_age', 'origin_input_priors_count',
       'origin_input_days_b_screening_arrest', 'origin_input_is_recid',
       'origin_input_is_violent_recid', 'origin_input_two_year_recid',
       'origin_input_length_of_stay', 'origin_input_age_cat',
       'origin_input_sex', 'origin_input_race', 'origin_input_c_charge_degree',
       'origin_input_class', 'running_time', 'running_time.1', 'Found',
       'ground_truth', 'prediction', 'scaled_cf_age', 'scaled_cf_priors_count',
       'scaled_cf_days_b_screening_arrest', 'scaled_cf_is_recid',
       'scaled_cf_is_violent_recid', 'sca

In [39]:
# Show the origin_input_age column (integer ages such as 28 and 50 in the recorded output).
proto_dt['origin_input_age']

0     28
1     28
2     28
3     28
4     28
      ..
95    50
96    50
97    50
98    50
99    50
Name: origin_input_age, Length: 100, dtype: int64

In [55]:
# List the result-table columns again.
# NOTE(review): this repeats an earlier `proto_dt.columns` inspection cell.
proto_dt.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'scaled_input_age',
       'scaled_input_priors_count', 'scaled_input_days_b_screening_arrest',
       'scaled_input_is_recid', 'scaled_input_is_violent_recid',
       'scaled_input_two_year_recid', 'scaled_input_length_of_stay',
       'scaled_input_age_cat', 'scaled_input_sex', 'scaled_input_race',
       'scaled_input_c_charge_degree', 'scaled_input_class',
       'origin_input_age', 'origin_input_priors_count',
       'origin_input_days_b_screening_arrest', 'origin_input_is_recid',
       'origin_input_is_violent_recid', 'origin_input_two_year_recid',
       'origin_input_length_of_stay', 'origin_input_age_cat',
       'origin_input_sex', 'origin_input_race', 'origin_input_c_charge_degree',
       'origin_input_class', 'running_time', 'running_time.1', 'Found',
       'ground_truth', 'prediction', 'scaled_cf_age', 'scaled_cf_priors_count',
       'scaled_cf_days_b_screening_arrest', 'scaled_cf_is_recid',
       'scaled_cf_is_violent_recid', 'sca

In [70]:
# Largest value in scaled_input_days_b_screening_arrest.
# NOTE(review): the recorded output is 300, while scaled_input_age stays below 1 —
# this column may not actually be scaled; TODO confirm.
proto_dt['scaled_input_days_b_screening_arrest'].max()

300

In [61]:
# Show the values of the scaled_input_days_b_screening_arrest column.
proto_dt['scaled_input_days_b_screening_arrest']

0     0
1     0
2     0
3     0
4     0
     ..
95    1
96    1
97    1
98    1
99    1
Name: scaled_input_days_b_screening_arrest, Length: 100, dtype: int64

In [71]:
# Numerical column names as recorded on df_info.
df_info.numerical_cols

['age',
 'priors_count',
 'days_b_screening_arrest',
 'is_recid',
 'is_violent_recid',
 'two_year_recid',
 'length_of_stay']

In [74]:
# Smallest 'age' value among the prepared inputs.
input_and_cf['input']['age'].min()

0.051282051

In [81]:
# Smallest 'length_of_stay' value among the prepared inputs.
input_and_cf['input']['length_of_stay'].min()

0

In [53]:
# Largest 'days_b_screening_arrest' value among the prepared inputs.
# NOTE(review): the recorded output is 300, so this feature looks unscaled here — TODO confirm.
input_and_cf['input']['days_b_screening_arrest'].max()

300

In [5]:
# Row label of the instance to examine.
INDX = 5

# Pull that row from both the 'input' and the 'cf' tables.
proto_input, proto_cf = (
    input_and_cf[part].loc[INDX] for part in ('input', 'cf')
)

In [6]:
# Display the loaded models dictionary.
models

{'dt': DecisionTreeClassifier(),
 'rfc': RandomForestClassifier(),
 'nn': <tensorflow.python.keras.engine.sequential.Sequential at 0x7f97c3b9bb50>}

In [7]:
from dtreeviz.trees import *

In [23]:
# Largest value anywhere in the training feature matrix.
X_train.max()

1.0

In [20]:
# Visualise the 'dt' model with dtreeviz, showing only the path taken by one instance.
clf = models['dt']
class_names = ["No", "Yes"]

# Options tried earlier and left disabled: feature_names=df_info.feature_names,
# orientation="LR", fancy=False, X=arr, X=X_test[0], scale=.3,
# max_X_features_TD=1, max_X_features_LR=1, depth_range_to_display=(0, 3).
# NOTE(review): "Decison" in the title is misspelled; the string is left unchanged here.
viz = dtreeviz(
    clf,
    x_data=X_train,
    y_data=y_train,
    feature_names=df_info.ohe_feature_names,
    class_names=class_names,
    target_name='target',
    title="Decison Tree - COMPAS with decision path",
    # NOTE(review): the author's note "B1" says this value is not accepted by
    # dtreeviz, so this cell may fail as written — TODO confirm the expected form of X.
    X=proto_cf,
    show_just_path=True,
)

# Write the rendering next to the notebook.
viz.save("./testing-1.svg")

In [None]:
"/"