In [31]:
import numpy as np 
import pandas as pd 
import os
from utils.preprocessing import preprocess_df
from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.evaluation import get_evaluations, EvaluationMatrix

In [70]:
# Experiment selection: these three names pick the dataset loader and the results file read below.
dataset_name = 'breast_cancer'  # must be a name handled by get_loading_fn: adult, german, compas, diabetes, breast_cancer
model_name = 'dt'  # model identifier embedded in the results file name
algorithm_name = 'proto' # algorithm whose results are loaded; one of [dice, proto]

In [71]:
def get_loading_fn(dataset_name):
    """Return the dataset loader registered under `dataset_name`.

    Supported names are 'adult', 'german', 'compas', 'diabetes' and
    'breast_cancer'. The loader function itself is returned; it is not called.

    Raises:
        Exception: with the message "Unsupported dataset" when `dataset_name`
            matches none of the supported names.
    """
    # Ordered (name, loader) pairs, checked with `==` in the same order as before.
    known_loaders = (
        ('adult', load_adult_df),
        ('german', load_german_df),
        ('compas', load_compas_df),
        ('diabetes', load_diabetes_df),
        ('breast_cancer', load_breast_cancer_df),
    )
    for supported_name, loading_fn in known_loaders:
        if dataset_name == supported_name:
            return loading_fn
    raise Exception("Unsupported dataset")

In [72]:
# Preprocess the selected dataset; `df_info` carries the feature metadata used throughout the notebook.
df_info = preprocess_df(get_loading_fn(dataset_name))

# One-line summary of the feature counts (the target column is excluded from the categorical count).
n_features = len(df_info.feature_names)
n_numerical = len(df_info.numerical_cols)
n_categorical = sum(1 for col in df_info.categorical_cols if col != df_info.target_name)
n_ohe_features = len(df_info.ohe_feature_names)
print(
    f"[{dataset_name}] | #Features: [{n_features}]| #Numerical: [{n_numerical}] | "
    f"#Categorical: [{n_categorical}] | #OHE Features: [{n_ohe_features}] |"
)

[breast_cancer] | #Features: [30]| #Numerical: [30] | #Categorical: [0] | #OHE Features: [30] |


In [73]:
# Results are read from ./results/<algorithm>_<dataset>/<algorithm>_<dataset>_<model>_result.csv
folder_name = f'{algorithm_name}_{dataset_name}'
file_name = f'{folder_name}_{model_name}_result.csv'
result_path = f'./results/{folder_name}/{file_name}'  # relative to the notebook's working directory

In [74]:
# Sanity check: the results CSV must exist before it is loaded in the next cell.
os.path.isfile(result_path)

True

In [75]:
# Load the saved results table for the selected algorithm / dataset / model.
# NOTE(review): the row displayed further down contains `Unnamed: 0*` columns, which looks like
# index columns written out by earlier saves - confirm whether they should be dropped on load.
result_df = pd.read_csv(result_path)

In [76]:
# Work on a deep copy so the loaded results frame is left untouched.
evaluation_df = result_df.copy(deep=True)
# Index labels of the rows whose 'Found' flag equals "Y".
found_mask = evaluation_df['Found'] == "Y"
found_idx = evaluation_df.loc[found_mask].index

In [77]:
# Pick the first row flagged as found. The double brackets keep the selection a
# one-row DataFrame (not a Series), and the deep copy detaches it from evaluation_df.
if len(found_idx) == 0:
    # Without this guard, found_idx[0] fails with a bare IndexError that hides the real cause.
    raise ValueError(
        "No rows with Found == 'Y' in evaluation_df; there is no counterfactual to evaluate."
    )
cf_found_eaval_df = evaluation_df.loc[[found_idx[0]]].copy(deep=True)

In [78]:
# Inspect the single selected row.
cf_found_eaval_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_radius_mean,scaled_input_texture_mean,scaled_input_perimeter_mean,scaled_input_area_mean,scaled_input_smoothness_mean,scaled_input_compactness_mean,scaled_input_concavity_mean,scaled_input_concave points_mean,...,origin_cf_texture_worst,origin_cf_perimeter_worst,origin_cf_area_worst,origin_cf_smoothness_worst,origin_cf_compactness_worst,origin_cf_concavity_worst,origin_cf_concave points_worst,origin_cf_symmetry_worst,origin_cf_fractal_dimension_worst,origin_cf_diagnosis
10,2,0,0.49974,0.324992,0.492779,0.342778,0.334477,0.308018,0.242737,0.372167,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,N


In [79]:
from utils.evaluation import prepare_evaluation_dict

In [80]:
# Build the keyword-argument dict passed to the evaluation metrics below. This notebook reads
# its 'input', 'cf', 'not_dummy_input' and 'not_dummy_cf' entries.
input_and_cf = prepare_evaluation_dict(cf_found_eaval_df, df_info)

In [81]:
# NOTE(review): this rebinds the name of Python's built-in `input()` for the rest of the session.
# Later cells use this name, so renaming it needs a coordinated change across those cells.
input = input_and_cf['input']

In [82]:
from utils import evaluation as evaluator

In [83]:
# Library metric on the prepared input / counterfactual pair (presumably an L1 distance, per its name).
evaluator.get_L1(**input_and_cf)

array([8.08252142])

In [84]:
def get_sparsity(**kwargs):
    """Count, per row, how many features differ between input and counterfactual.

    Keyword Args:
        not_dummy_input: array-like (e.g. DataFrame) with one row per instance.
        not_dummy_cf: array-like of a shape compatible with `not_dummy_input`;
            rows and columns are compared by position, not by label.
        Any other keyword arguments are accepted and ignored.

    Returns:
        Integer numpy array with one entry per row: the number of changed columns.
        Numeric values count as changed when they differ by more than 1e-10;
        values that cannot be converted to float (e.g. string categories) count
        as changed when they are not equal.

    Note:
        The original author flagged that the target column should be removed
        before calling; this function does not drop any column itself.
    """
    tolerance = 1e-10
    input_array = np.array(kwargs['not_dummy_input'])
    cf_array = np.array(kwargs['not_dummy_cf'])

    both_numeric = all(
        np.issubdtype(arr.dtype, np.number) for arr in (input_array, cf_array)
    )
    if both_numeric:
        # Same vectorised comparison as before for purely numeric data.
        changed = np.abs(input_array - cf_array) > tolerance
    else:
        # Mixed / object data: subtracting strings raises, so decide column by column.
        left, right = np.broadcast_arrays(
            input_array.astype(object), cf_array.astype(object)
        )
        changed = np.empty(left.shape, dtype=bool)
        for col in range(left.shape[-1]):
            left_col, right_col = left[..., col], right[..., col]
            try:
                changed[..., col] = (
                    np.abs(left_col.astype(float) - right_col.astype(float)) > tolerance
                )
            except (TypeError, ValueError):
                # Non-numeric column: any inequality counts as a change.
                changed[..., col] = left_col != right_col

    return changed.astype(int).sum(axis=1)

In [85]:
# Reference value from the library implementation of the sparsity metric.
evaluator.get_sparsity(**input_and_cf)

array([30])

In [86]:
# Same inputs through the notebook-local get_sparsity defined in this notebook, for comparison with the library value.
get_sparsity(**input_and_cf)

array([30])

In [14]:
# Counterfactual counterpart of `input`, taken from the same prepared dict.
cf = input_and_cf['cf']

In [15]:
# Row-wise L1 norm of (input - cf), computed by hand from the prepared frames.
encoded_delta = np.array(input) - np.array(cf)
np.linalg.norm(encoded_delta, ord=1, axis=1)

array([0.26882547])

In [16]:
# Column groups and per-column scale factors used to build the MAD-normalised distance below.
ohe_cat_cols = df_info.get_ohe_cat_cols()  # columns compared by equality when mad_df is built
ohe_num_cols = df_info.get_ohe_num_cols()  # columns compared by scaled absolute difference
numerical_mads = df_info.get_numerical_mads()  # mapping: numerical column name -> MAD value

In [17]:
# NOTE(review): the output recorded for this cell lists adult-dataset columns (age, capital-gain, ...)
# while dataset_name is 'breast_cancer', and the execution counts are out of order - this section
# looks like it was last run in a different session state. Re-run from a fresh kernel to confirm.
numerical_mads

{'age': 0.15327646057129196,
 'capital-gain': 0.01977393211073296,
 'capital-loss': 0.03821442946900573,
 'hours-per-week': 0.07737987255391114}

In [18]:
# Small constant added to each MAD in the denominator when mad_df is built, so a MAD of 0 cannot divide by zero.
eps = 1e-8

In [19]:
# Per-feature distance table with one column per one-hot-encoded feature name.
# It starts empty; the column assignments below fill it in.
mad_df = pd.DataFrame({}, columns= df_info.ohe_feature_names)
# Categorical one-hot columns: 1 where input and counterfactual differ, 0 where they match.
mad_df[ohe_cat_cols] = (input[ohe_cat_cols] != cf[ohe_cat_cols]).astype(int)
# Numerical columns: absolute difference divided by that column's MAD (+ eps to avoid dividing by zero).
for num_col in ohe_num_cols: 
    mad_df[num_col] = abs(cf[num_col] - input[num_col]) / (numerical_mads[num_col] + eps)



In [24]:
# Mean MAD-scaled distance over the numerical columns only, per row.
mad_df[ohe_num_cols].mean(axis=1)

0    2.09008
dtype: float64

In [25]:
# Mean mismatch indicator over the categorical one-hot columns only, per row.
mad_df[ohe_cat_cols].mean(axis=1)

0    0.10101
dtype: float64

In [20]:
# Variant A: numerical mean plus categorical mean, so each group contributes equally
# regardless of how many columns it has.
(mad_df[ohe_num_cols].mean(axis=1) + mad_df[ohe_cat_cols].mean(axis=1)).tolist()

[2.1910904801656854]

In [21]:
# Variant B: a single mean over every column, so each column (not each group) has equal weight.
# This generally differs from variant A.
mad_df.mean(axis=1).tolist()

[0.1782555487050712]

In [22]:
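# Draft return logic for a MAD-distance helper, kept commented out because `return`
# is only valid inside a function body; it is not executed by this notebook.
# if len(ohe_cat_cols) > 0 and len(ohe_num_cols) > 0: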
#     return (mad_df[ohe_num_cols].mean(axis=1) + mad_df[ohe_cat_cols].mean(axis=1)).tolist()
# elif len(ohe_num_cols) > 0:
#     return mad_df[ohe_num_cols].mean(axis=1).tolist()
# elif len(ohe_cat_cols) > 0:
#     return mad_df[ohe_cat_cols].mean(axis=1).tolist()
# else:
#     raise Exception("No columns provided for MAD.")

In [23]:
# mad_df[ohe_num_cols].mean(axis=1)