In [1]:
import numpy as np 
import pandas as pd 
import os
from utils.preprocessing import preprocess_df
from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.evaluation import get_evaluations, EvaluationMatrix

In [2]:
# Run configuration: every later cell derives its inputs from these three names.
# Counterfactual algorithm whose results are inspected; options: [dice, proto]
algorithm_name = "proto"
# Must be one of the names handled by get_loading_fn:
# adult, german, compas, diabetes, breast_cancer
dataset_name = "adult"
# Model tag used in the results file name ("dt" presumably = decision tree — TODO confirm)
model_name = "dt"

In [3]:
def get_loading_fn(dataset_name):
    """Return the loader function registered for ``dataset_name``.

    Parameters
    ----------
    dataset_name : str
        One of ``'adult'``, ``'german'``, ``'compas'``, ``'diabetes'`` or
        ``'breast_cancer'``.

    Returns
    -------
    callable
        The matching ``load_*_df`` function. The function itself is returned;
        it is not called here.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the supported names. ``ValueError``
        is a subclass of ``Exception``, so existing ``except Exception``
        handlers keep working.
    """
    # Built inside the function so the loader names are only resolved when
    # the function is actually called.
    loaders = {
        'adult': load_adult_df,
        'german': load_german_df,
        'compas': load_compas_df,
        'diabetes': load_diabetes_df,
        'breast_cancer': load_breast_cancer_df,
    }
    try:
        return loaders[dataset_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments (e.g. a list), which the
        # dict lookup cannot handle but which are just as unsupported.
        raise ValueError(
            f"Unsupported dataset: {dataset_name!r}. "
            f"Supported datasets: {sorted(loaders)}"
        ) from None

In [4]:
# Run preprocess_df on the loader selected for the configured dataset.
df_info = preprocess_df(get_loading_fn(dataset_name))

# Feature counts for the one-line summary; the target column is left out of
# the categorical count.
n_features = len(df_info.feature_names)
n_numerical = len(df_info.numerical_cols)
n_categorical = sum(1 for col in df_info.categorical_cols if col != df_info.target_name)
n_ohe_features = len(df_info.ohe_feature_names)
print(f"[{dataset_name}] | #Features: [{n_features}]| #Numerical: [{n_numerical}] | #Categorical: [{n_categorical}] | #OHE Features: [{n_ohe_features}] |")

[adult] | #Features: [12]| #Numerical: [4] | #Categorical: [8] | #OHE Features: [103] |


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df[col] == '?'] = df[col].value_counts().index[0]


In [5]:
# Results are laid out as
#   ./results/<algorithm>_<dataset>/<algorithm>_<dataset>_<model>_result.csv
folder_name = '{}_{}'.format(algorithm_name, dataset_name)
file_name = '{}_{}_result.csv'.format(folder_name, model_name)
result_path = './results/{}/{}'.format(folder_name, file_name)

In [6]:
# Sanity check before reading: the cell output shows whether the results file exists.
os.path.isfile(result_path)

True

In [7]:
# Load the results CSV for the configured algorithm / dataset / model.
# NOTE(review): the row displayed further down carries 'Unnamed: 0*' columns,
# which suggests the CSV was saved with its index more than once — confirm
# where the file is written before dropping them.
result_df = pd.read_csv(result_path)

In [8]:
# Work on a separate deep copy so result_df itself is never modified.
evaluation_df = result_df.copy()

# Index labels of the rows whose 'Found' column equals "Y".
found_mask = evaluation_df['Found'].eq("Y")
found_idx = evaluation_df.loc[found_mask].index

In [9]:
# Keep only the first row flagged as found, as a one-row DataFrame (the list
# inside .loc keeps it two-dimensional). found_idx[0] raises IndexError when
# no row was flagged.
first_found_label = found_idx[0]
cf_found_eaval_df = evaluation_df.loc[[first_found_label]].copy()

In [10]:
# Display the selected row (the first one whose 'Found' column is "Y").
cf_found_eaval_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,scaled_input_age,scaled_input_capital-gain,scaled_input_capital-loss,scaled_input_hours-per-week,scaled_input_workclass,scaled_input_education,scaled_input_marital-status,scaled_input_occupation,...,origin_cf_hours-per-week,origin_cf_workclass,origin_cf_education,origin_cf_marital-status,origin_cf_occupation,origin_cf_relationship,origin_cf_race,origin_cf_sex,origin_cf_native-country,origin_cf_class
15,3,0,0.493151,0.0,0.0,0.397959,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,...,1.0,State-gov,9th,Married-civ-spouse,Protective-serv,Wife,White,Female,Haiti,<=50K


In [11]:
# NOTE(review): mid-notebook import; utils.evaluation is already imported in
# the first cell, so this name could be listed there with the other imports.
from utils.evaluation import prepare_evaluation_dict

In [12]:
# Build the evaluation inputs from the selected row; the result is indexed
# with the keys 'input' and 'cf' in the following cells.
input_and_cf = prepare_evaluation_dict(cf_found_eaval_df, df_info)

In [13]:
# The 'input' entry — presumably the original instance; TODO confirm.
# NOTE(review): this name shadows the builtin input(). Later cells reference
# it, so a rename has to be applied to all of them at once.
input = input_and_cf['input']

In [14]:
# The 'cf' entry — presumably the counterfactual for that instance; TODO confirm.
cf = input_and_cf['cf']

In [15]:
# Row-wise L1 norm (ord=1 along axis 1) of the difference between input and cf.
input_values = np.array(input)
cf_values = np.array(cf)
np.linalg.norm(input_values - cf_values, ord=1, axis=1)

array([10.89110987])

In [16]:
# Column groups and per-column scale factors used by the distance cells below.
# The names suggest one-hot-encoded categorical columns and numerical columns
# respectively — TODO confirm against df_info.
ohe_cat_cols = df_info.get_ohe_cat_cols()
ohe_num_cols = df_info.get_ohe_num_cols()
# Displayed in the next cell as a dict keyed by numerical column name.
# "MAD" presumably stands for median absolute deviation — TODO confirm.
numerical_mads = df_info.get_numerical_mads()

In [17]:
# Inspect the per-column MAD values.
numerical_mads

{'age': 0.15327646057129196,
 'capital-gain': 0.01977393211073296,
 'capital-loss': 0.03821442946900573,
 'hours-per-week': 0.07737987255391114}

In [18]:
# Small constant added to each MAD in the denominator of the next cell, so the
# denominator stays non-zero even when a column's MAD is exactly 0.
eps = 1e-8

In [19]:
# Per-feature distance table between input and cf, one column per name in
# df_info.ohe_feature_names.
mad_df = pd.DataFrame({}, columns=df_info.ohe_feature_names)

# Columns in ohe_cat_cols: 1 where input and cf differ, 0 where they agree.
cat_mismatch = input[ohe_cat_cols] != cf[ohe_cat_cols]
mad_df[ohe_cat_cols] = cat_mismatch.astype(int)

# Columns in ohe_num_cols: absolute difference divided by that column's MAD
# (plus eps so the denominator is never exactly zero).
for num_col in ohe_num_cols:
    mad_scale = numerical_mads[num_col] + eps
    mad_df[num_col] = (cf[num_col] - input[num_col]).abs() / mad_scale



In [24]:
# Row-wise mean of the MAD-scaled differences over the ohe_num_cols columns.
mad_df[ohe_num_cols].mean(axis=1)

0    2.09008
dtype: float64

In [25]:
# Row-wise mean of the 0/1 mismatch indicators over the ohe_cat_cols columns.
mad_df[ohe_cat_cols].mean(axis=1)

0    0.10101
dtype: float64

In [20]:
# Distance as the sum of two row-wise means: one over the ohe_num_cols columns,
# one over the ohe_cat_cols columns.
num_part = mad_df[ohe_num_cols].mean(axis=1)
cat_part = mad_df[ohe_cat_cols].mean(axis=1)
(num_part + cat_part).tolist()

[2.1910904801656854]

In [21]:
# Alternative aggregation: a single row-wise mean over all columns of mad_df.
# This is not the same quantity as the sum of the two per-group means: in the
# recorded run it gives ~0.178 versus ~2.191.
mad_df.mean(axis=1).tolist()

[0.1782555487050712]

In [22]:
# Sketch (not executed): body of a helper that would return the distance and
# fall back to a single group when the dataset has only numerical or only
# categorical columns.
# if len(ohe_cat_cols) > 0 and len(ohe_num_cols) > 0:
#     return (mad_df[ohe_num_cols].mean(axis=1) + mad_df[ohe_cat_cols].mean(axis=1)).tolist()
# elif len(ohe_num_cols) > 0:
#     return mad_df[ohe_num_cols].mean(axis=1).tolist()
# elif len(ohe_cat_cols) > 0:
#     return mad_df[ohe_cat_cols].mean(axis=1).tolist()
# else:
#     raise Exception("No columns provided for MAD.")

In [23]:
# mad_df[ohe_num_cols].mean(axis=1)