In [1]:
import pandas as pd 

from utils.preprocessing import preprocess_df
from utils.df_loader import load_adult_df, load_compas_df, load_german_df, load_diabetes_df

In [2]:
#### Select dataset ####
dataset_name = 'adult' # [adult, german, compas]

if dataset_name == 'adult':
    dataset_loading_fn = load_adult_df
elif dataset_name == 'german':
    dataset_loading_fn = load_german_df
elif dataset_name == 'compas':
    dataset_loading_fn = load_compas_df
elif dataset_name == 'diabetes':
    dataset_loading_fn = load_diabetes_df
else:
    raise Exception("Unsupported dataset")

df_info = preprocess_df(dataset_loading_fn)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df[col] == '?'] = df[col].value_counts().index[0]


In [3]:
folder_name = 'dice_adult'
file_name = 'dice_adult_dt_result.csv'
result_path = f'results/{folder_name}/{file_name}'
result_df = pd.read_csv(result_path)

In [4]:
evaluation_df = result_df.copy(deep=True)

In [5]:
from utils.evaluation import prepare_evaluation_dict

In [6]:
input_and_cf = prepare_evaluation_dict(evaluation_df, df_info)

In [7]:
input_df = input_and_cf['input']
cf_df = input_and_cf['cf']

In [8]:
input_df

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.520548,0.0,0.0,0.142857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.657534,0.0,0.0,0.193878,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.164384,0.0,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.493151,0.0,0.0,0.397959,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.205479,0.0,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
import numpy as np

In [10]:

realistic = np.all(np.logical_and(np.array(input_df[df_info.numerical_cols]) >= 0, np.array(input_df[df_info.numerical_cols]) <= 1 ), axis=1)

In [11]:
evaluation_df['realistic'] = realistic

In [12]:
evaluation_df

Unnamed: 0.1,Unnamed: 0,scaled_input_age,scaled_input_workclass,scaled_input_education,scaled_input_marital-status,scaled_input_occupation,scaled_input_relationship,scaled_input_race,scaled_input_sex,scaled_input_capital-gain,...,origin_cf_capital-gain,origin_cf_capital-loss,origin_cf_hours-per-week,origin_cf_native-country,origin_cf_class,running_time,Found,ground_truth,prediction,realistic
0,0,0.520548,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,0.0,0.0,15.000001,United-States,>50K,0.184975,Y,<=50K,<=50K,True
1,0,0.657534,Private,Bachelors,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,0.0,0.0,20.0,United-States,>50K,0.167884,Y,<=50K,<=50K,True
2,0,0.164384,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,Male,0.0,...,89999.1,0.0,39.999999,Guatemala,>50K,0.168001,Y,<=50K,<=50K,True
3,0,0.493151,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,0.0,2178.0,19.919363,United-States,<=50K,0.169464,Y,>50K,>50K,True
4,0,0.205479,Private,Bachelors,Never-married,Adm-clerical,Not-in-family,Asian-Pac-Islander,Male,0.0,...,0.0,0.0,39.999999,United-States,>50K,0.166596,Y,<=50K,<=50K,True


In [13]:
input_df[df_info.numerical_cols]

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,0.520548,0.0,0.0,0.142857
1,0.657534,0.0,0.0,0.193878
2,0.164384,0.0,0.0,0.397959
3,0.493151,0.0,0.0,0.397959
4,0.205479,0.0,0.0,0.397959


In [14]:
numerical_mads = df_info.get_numerical_mads()

In [30]:
eps = 1e-8

ohe_cat_cols = df_info.get_ohe_cat_cols()
ohe_num_cols = df_info.get_ohe_num_cols()

numerical_mads = df_info.get_numerical_mads()

mad_df = pd.DataFrame({}, columns= df_info.ohe_feature_names)
mad_df[ohe_cat_cols] = (input_df[ohe_cat_cols] != cf_df[ohe_cat_cols]).astype(int)
for num_col in ohe_num_cols: 
    mad_df[num_col] = abs(cf_df[num_col] - input_df[num_col]) / (numerical_mads[num_col] + eps)

mad = mad_df[ohe_num_cols].mean(axis=1) + mad_df[ohe_cat_cols].mean(axis=1) 

In [31]:
evaluation_df['MAD'] = mad

In [32]:
evaluation_df

Unnamed: 0.1,Unnamed: 0,scaled_input_age,scaled_input_workclass,scaled_input_education,scaled_input_marital-status,scaled_input_occupation,scaled_input_relationship,scaled_input_race,scaled_input_sex,scaled_input_capital-gain,...,origin_cf_capital-loss,origin_cf_hours-per-week,origin_cf_native-country,origin_cf_class,running_time,Found,ground_truth,prediction,realistic,MAD
0,0,0.520548,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,0.0,15.000001,United-States,>50K,0.184975,Y,<=50K,<=50K,True,0.040404
1,0,0.657534,Private,Bachelors,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,0.0,20.0,United-States,>50K,0.167884,Y,<=50K,<=50K,True,0.346676
2,0,0.164384,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,Male,0.0,...,0.0,39.999999,Guatemala,>50K,0.168001,Y,<=50K,<=50K,True,11.398813
3,0,0.493151,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,2178.0,19.919363,United-States,<=50K,0.169464,Y,>50K,>50K,True,3.933023
4,0,0.205479,Private,Bachelors,Never-married,Adm-clerical,Not-in-family,Asian-Pac-Islander,Male,0.0,...,0.0,39.999999,United-States,>50K,0.166596,Y,<=50K,<=50K,True,0.020202


In [24]:
eps = 1e-8
for num_col in ohe_num_cols: 
    mad_df[num_col] = abs(cf_df[num_col] - input_df[num_col]) / numerical_mads[num_col]

mad_df[ohe_num_cols].mean(axis=1) + mad_df[ohe_cat_cols].mean(axis=1) 
    

0     0.040404
1     0.346676
2    11.398819
3     3.933024
4     0.020202
dtype: float64

0     0.000000
1     0.346676
2    11.378617
3     3.933024
4     0.000000
dtype: float64

In [33]:
from scipy.spatial import distance

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.520548,0.0,0.0,0.142857,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.657534,0.0,0.0,0.193878,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.164384,0.0,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.493151,0.0,0.0,0.397959,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.205479,0.0,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.301370,0.021740,0.0,0.397959,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.452055,0.000000,0.0,0.122449,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.287671,0.000000,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.493151,0.000000,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.150685,0.000000,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.136986,0.000000,0.0,0.377551,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,0.315068,0.000000,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,0.561644,0.000000,0.0,0.397959,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,0.068493,0.000000,0.0,0.193878,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [41]:

evaluation_df['Mahalanobis'] =  [distance.mahalanobis(input_df[df_info.ohe_feature_names].iloc[i].to_numpy(),
                                cf_df[df_info.ohe_feature_names].iloc[i].to_numpy(),
                                df_info.dummy_df[df_info.ohe_feature_names].cov().to_numpy()) for i in range(len(input_df))]

In [42]:
evaluation_df

Unnamed: 0.1,Unnamed: 0,scaled_input_age,scaled_input_workclass,scaled_input_education,scaled_input_marital-status,scaled_input_occupation,scaled_input_relationship,scaled_input_race,scaled_input_sex,scaled_input_capital-gain,...,origin_cf_hours-per-week,origin_cf_native-country,origin_cf_class,running_time,Found,ground_truth,prediction,realistic,MAD,Mahalanobis
0,0,0.520548,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,15.000001,United-States,>50K,0.184975,Y,<=50K,<=50K,True,0.040404,0.711427
1,0,0.657534,Private,Bachelors,Married-civ-spouse,Transport-moving,Husband,White,Male,0.0,...,20.0,United-States,>50K,0.167884,Y,<=50K,<=50K,True,0.346676,0.039716
2,0,0.164384,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,Male,0.0,...,39.999999,Guatemala,>50K,0.168001,Y,<=50K,<=50K,True,11.398813,0.940138
3,0,0.493151,State-gov,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,...,19.919363,United-States,<=50K,0.169464,Y,>50K,>50K,True,3.933023,0.051733
4,0,0.205479,Private,Bachelors,Never-married,Adm-clerical,Not-in-family,Asian-Pac-Islander,Male,0.0,...,39.999999,United-States,>50K,0.166596,Y,<=50K,<=50K,True,0.020202,0.877878


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./datasets/breast_cancer.csv',
                     delimiter=',', skipinitialspace=True)

In [3]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [None]:
def load_diabetes_df():
    
    target_name = 'diagnosis'

    df = pd.read_csv('./datasets/breast_cancer.csv',
                     delimiter=',', skipinitialspace=True)

    del df['id']
    del df['Unnamed: 32"']        

    df[target_name] = df[target_name].apply(lambda x: "Y" if x==1 else "N")

    feature_names = [col for col in df.columns if col != target_name]

    df = remove_missing_values(df)

    possible_outcomes = list(df[target_name].unique())

    numerical_cols, categorical_cols, columns_type = get_columns_type(df)

    return df, feature_names, numerical_cols, categorical_cols, columns_type, target_name, possible_outcomes