In [1]:
import pandas as pd 
import os
from utils.preprocessing import preprocess_df
from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df, load_breast_cancer_df
from utils.evaluation import get_evaluations, EvaluationMatrix

In [2]:
# Dataset identifiers; each one is resolved to a loader via get_loading_fn()
# and also forms part of the result folder/file names built further down.
all_dataset_names = ["adult", "german", "compas", "diabetes", "breast_cancer"]

# Algorithm identifiers (the `cf_algorithm` loop variable below); used as the
# prefix of each result folder name.
all_algorithm_names = [
    "dice",
    "GS",
    "proto",
    "watcher",
]

# Model identifiers (the `model_name` loop variable below); used in result
# file names.
all_models = [
    "dt",
    "rfc",
    "nn",
]

In [3]:
def get_loading_fn(dataset_name):
    """Return the dataset loader function that matches `dataset_name`.

    Parameters
    ----------
    dataset_name : str
        One of 'adult', 'german', 'compas', 'diabetes' or 'breast_cancer'.

    Returns
    -------
    callable
        The corresponding `load_*_df` function imported from
        `utils.df_loader`. The function is returned, not called.

    Raises
    ------
    ValueError
        If `dataset_name` is not one of the supported names.
    """
    if dataset_name == 'adult':
        return load_adult_df
    if dataset_name == 'german':
        return load_german_df
    if dataset_name == 'compas':
        return load_compas_df
    if dataset_name == 'diabetes':
        return load_diabetes_df
    if dataset_name == 'breast_cancer':
        return load_breast_cancer_df

    # ValueError is a subclass of Exception, so callers that catch the
    # previously raised generic Exception keep working; the message now also
    # names the rejected value to make typos easy to spot.
    raise ValueError(f"Unsupported dataset: {dataset_name!r}")


In [4]:
# Print one summary line per dataset: total features, numerical columns,
# categorical columns (the target column is excluded from that count) and the
# number of one-hot-encoded feature names.
for dataset_name in all_dataset_names:
    loading_fn = get_loading_fn(dataset_name)
    df_info = preprocess_df(loading_fn)
    print(
        f"[{dataset_name}] | #Features: [{len(df_info.feature_names)}]| #Numerical: [{len(df_info.numerical_cols)}] | #Categorical: [{len( [c for c in df_info.categorical_cols if c != df_info.target_name])}] | #OHE Features: [{len(df_info.ohe_feature_names)}] |"
    )



# Because the tree size is huge, it is not generated here.
# One reason for the huge tree size could be the number of features.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df[col] == '?'] = df[col].value_counts().index[0]


[adult] | #Features: [12]| #Numerical: [4] | #Categorical: [8] | #OHE Features: [103] |
[german] | #Features: [20]| #Numerical: [5] | #Categorical: [15] | #OHE Features: [65] |
[compas] | #Features: [11]| #Numerical: [4] | #Categorical: [7] | #OHE Features: [23] |
[diabetes] | #Features: [8]| #Numerical: [8] | #Categorical: [0] | #OHE Features: [8] |
[breast_cancer] | #Features: [30]| #Numerical: [30] | #Categorical: [0] | #OHE Features: [30] |


In [5]:
#### Select dataset ####
# dataset_name = 'adult' # [adult, german, compas, breast_cancer, diabetes]
# cf_algorithm= 'proto' # ["dice", "GS", "proto", "watcher"]
# model_name = 'dt' # ["dt", "rfc", "nn"]

# Metrics requested for every result file. Stored as a tuple and copied into
# a new list per call so get_evaluations() always receives its own list.
evaluation_metrics = (
    EvaluationMatrix.L1,
    EvaluationMatrix.L2,
    EvaluationMatrix.Sparsity,
    EvaluationMatrix.Realistic,
    EvaluationMatrix.MAD,
    EvaluationMatrix.Mahalanobis,
)

# Walk every dataset x algorithm x model combination, evaluate the result CSV
# when it exists, and write the evaluation next to it as `eval_<file_name>`.
for dataset_name in all_dataset_names:
    # Preprocess once per dataset; reused for all algorithm/model pairs.
    df_info = preprocess_df(get_loading_fn(dataset_name))
    for cf_algorithm in all_algorithm_names:
        folder_name = f'{cf_algorithm}_{dataset_name}'
        for model_name in all_models:
            file_name = f'{folder_name}_{model_name}_result.csv'
            result_path = f'./results/{folder_name}/{file_name}'

            # Combinations without a result file are skipped silently.
            if not os.path.isfile(result_path):
                continue

            result_df = pd.read_csv(result_path)
            evaluation_df = get_evaluations(
                result_df,
                df_info,
                matrix=list(evaluation_metrics),
            )

            csv_save_result_path = f'results/{folder_name}/eval_{file_name}'
            evaluation_df.to_csv(csv_save_result_path)
            print(f"Have saved file to {csv_save_result_path}")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df[col] == '?'] = df[col].value_counts().index[0]


Have saved file to results/dice_adult/eval_dice_adult_dt_result.csv
Have saved file to results/dice_adult/eval_dice_adult_rfc_result.csv
Have saved file to results/dice_adult/eval_dice_adult_nn_result.csv
Have saved file to results/proto_adult/eval_proto_adult_dt_result.csv
Have saved file to results/proto_adult/eval_proto_adult_rfc_result.csv
Have saved file to results/proto_adult/eval_proto_adult_nn_result.csv
Have saved file to results/dice_german/eval_dice_german_dt_result.csv
Have saved file to results/dice_german/eval_dice_german_rfc_result.csv
Have saved file to results/dice_german/eval_dice_german_nn_result.csv
Have saved file to results/proto_german/eval_proto_german_dt_result.csv
Have saved file to results/proto_german/eval_proto_german_rfc_result.csv
Have saved file to results/proto_german/eval_proto_german_nn_result.csv
Have saved file to results/dice_compas/eval_dice_compas_dt_result.csv
Have saved file to results/dice_compas/eval_dice_compas_rfc_result.csv
Have saved file

In [6]:
# #### Select dataset ####
# dataset_name = 'german' # [adult, german, compas, breast_cancer, diabetes]
# cf_algorithm= 'dice' # ["dice", "GS", "proto", "watcher"]
# model_name = 'dt' # ["dt", "rfc", "nn"]

# df_info = preprocess_df(get_loading_fn(dataset_name))
# folder_name = f'{cf_algorithm}_{dataset_name}'
# file_name = f'{folder_name}_{model_name}_result.csv'
# result_path = f'./results/{folder_name}/{file_name}'
# if  os.path.isfile(result_path):
#     result_df = pd.read_csv(result_path)
#     evaluation_df = get_evaluations(result_df, df_info, matrix = [EvaluationMatrix.L1, EvaluationMatrix.L2, EvaluationMatrix.Sparsity, EvaluationMatrix.Realistic, EvaluationMatrix.MAD, EvaluationMatrix.Mahalanobis])

#     csv_save_result_path = f'results/{folder_name}/eval_{file_name}'
#     evaluation_df.to_csv(csv_save_result_path)
#     print(f"Have saved file to {csv_save_result_path}")


In [7]:
# from utils.evaluation import prepare_evaluation_dict
# found_idx = evaluation_df[evaluation_df['Found']=="Y"].index
# cf_found_eaval_df = evaluation_df.loc[found_idx].copy(deep=True)
# input_and_cf = prepare_evaluation_dict(cf_found_eaval_df, df_info)

In [8]:
# evaluation_df[evaluation_df['Found']=="Y"]

In [9]:
# for col in input_and_cf['not_dummy_input'].columns:
#     if not col in df_info.feature_names:
#         print(col)