In [3]:
import warnings
import numpy as np
import pandas as pd
import os, json
from sklearn.neighbors import KernelDensity
import prose.datainsights as di
warnings.filterwarnings('ignore')

In [4]:
def read_json(file_name_with_path):
    if os.path.isfile(file_name_with_path):
        with open(file_name_with_path) as f:
            res = json.load(f)
        return res
    else:
        raise ValueError('Not exist', file_name_with_path)

def combine_violation(x):
    idx_map = {'00': 0, '01': 1, '10': 2, '11': 3}
    cur = '{}{}'.format(int(x.iloc[0]), int(x.iloc[1]))
    return x.iloc[2 + idx_map[cur]]


def learn_cc_models(data_name, seed,
                    res_path='intermediate/models/',
                    data_path='data/processed/',
                    set_suffix='S_1',
                    n_groups=2, n_labels=2, sensi_col='A', y_col='Y',
                    dense_n=0.2, dense_h=1.0, dense_kernal='tophat', algorithm='auto'):

    cur_dir = res_path + data_name + '/'

    train_df = pd.read_csv(cur_dir + '-'.join(['train', str(seed), set_suffix]) + '.csv')
    test_df = pd.read_csv(cur_dir + '-'.join(['test', str(seed), set_suffix]) + '.csv')
    meta_info = read_json(data_path + data_name + '.json')
    n_cond_features = len(meta_info['continuous_features'])

    cc_cols = ['X{}'.format(i) for i in range(1, n_cond_features+1)]

    train_df[cc_cols] = (train_df[cc_cols] - train_df[cc_cols].mean()) / train_df[cc_cols].std()

    for group_i in range(n_groups):
        for label_i in range(n_labels):
            group_input = train_df[(train_df[sensi_col] == group_i) & (train_df[y_col] == label_i)]

            group_X = group_input[cc_cols].to_numpy()
            kde = KernelDensity(bandwidth=dense_h, kernel=dense_kernal, algorithm=algorithm)
            kde.fit(group_X)

            group_input['density'] = kde.score_samples(group_X)
            group_input.sort_values(by=['density'], ascending=False, inplace=True)
            
            cc_input = group_input.head(int(dense_n*group_input.shape[0]))

            group_cc_rules = di.learn_assertions(cc_input[cc_cols], max_self_violation=1.0)
            train_cc_res = group_cc_rules.evaluate(train_df[cc_cols], explanation=True, normalizeViolation=True)
            train_df['vio_G{}_L{}'.format(group_i, label_i)] = train_cc_res.row_wise_violation_summary['violation']

            test_cc_res = group_cc_rules.evaluate(test_df[cc_cols], explanation=True, normalizeViolation=True)
            test_df['vio_G{}_L{}'.format(group_i, label_i)] = test_cc_res.row_wise_violation_summary['violation']
            train_pert = sum(train_df['vio_G{}_L{}'.format(group_i, label_i)]>0)/train_df.shape[0]
            test_pert = sum(test_df['vio_G{}_L{}'.format(group_i, label_i)]>0)/test_df.shape[0]
            print(group_i, label_i, round(train_pert, 3), round(test_pert, 3))
    return train_df, test_df

## Parameters
- Bandwidth: 0.1 for all the datasets, kernel as Gaussian
- Kernel: Other datasets with Guassian
   - guassian: Cardio, ACSM, ACSI
   - exponential: Bank, LSAC, MEPS, ACSP
   - tophat: Credit, ACSE
   - epanechnikov: ACSH

In [None]:
# datasets = ['lsac']
datasets = ['cardio', 'bank', 'meps16', 'lsac', 'credit', 'ACSE', 'ACSP', 'ACSH', 'ACSM', 'ACSI']
dense_h_values = [x/10 for x in range(1, 10)]
dense_alg_values = ['tophat', 'gaussian', 'epanechnikov', 'exponential', 'linear', 'cosine']
for data_name in datasets:
    print(data_name, '----'*8)
    for kernel_i in dense_alg_values:
        print(kernel_i, '==='*8)
        cur_train, cur_test = learn_cc_models(data_name, 1, dense_kernal=kernel_i)
        
    print(data_name, '----'*8, '\n')

cardio --------------------------------
0 0 0.826 1.0
0 1 0.79 1.0
1 0 0.805 1.0
1 1 0.792 1.0
0 0 0.843 1.0
0 1 0.779 1.0
1 0 0.815 1.0
1 1 0.783 1.0
0 0 0.849 1.0
0 1 0.766 1.0
1 0 0.816 1.0
1 1 0.774 1.0
0 0 0.848 1.0
0 1 0.766 1.0
1 0 0.815 1.0
1 1 0.776 1.0
0 0 0.852 1.0
0 1 0.748 1.0
1 0 0.815 1.0
1 1 0.767 1.0
0 0 0.841 1.0
0 1 0.762 1.0
1 0 0.818 1.0
1 1 0.771 1.0
cardio -------------------------------- 

bank --------------------------------
0 0 0.813 1.0
0 1 0.949 1.0
1 0 0.803 1.0
1 1 0.511 1.0
0 0 0.837 1.0
0 1 0.971 1.0
1 0 0.776 1.0
1 1 0.666 1.0
0 0 0.841 1.0
0 1 0.978 1.0
1 0 0.813 1.0
1 1 0.65 1.0
0 0 0.87 1.0
0 1 0.968 1.0
1 0 0.78 1.0
1 1 0.729 1.0
0 0 0.846 1.0
0 1 0.958 1.0
1 0 0.815 1.0
1 1 0.67 1.0
0 0 0.842 1.0
0 1 0.978 1.0
1 0 0.815 1.0
1 1 0.65 1.0
bank -------------------------------- 

meps16 --------------------------------
0 0 0.865 1.0
0 1 0.736 1.0
1 0 0.836 1.0
1 1 0.893 1.0
0 0 0.876 1.0
0 1 0.84 1.0
1 0 0.867 1.0
1 1 0.893 1.0
0 0 0.877 1.0
0 1 0.694

In [24]:
cur_train

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y,A,vio_G0_L0,vio_G0_L1,vio_G1_L0,vio_G1_L1
0,22693,1.440022,0.204875,0.321595,1.151710,-0.203008,-0.047300,2,0.0,4,0.0,1,1,0.103412,0.126986,0.282402,0.000000
1,12427,-0.003916,-0.772776,0.321595,-1.205643,0.583770,0.162542,4,1.0,3,0.0,0,1,0.083874,0.138994,0.142099,0.120669
2,25845,0.936832,0.400405,-0.910791,-0.000774,-1.147141,-0.661564,3,0.0,3,1.0,0,1,0.122649,0.185233,0.208303,0.266671
3,13301,-0.113305,-1.163837,-0.602695,0.365926,0.741126,0.132020,3,1.0,4,0.0,1,1,0.168788,0.252065,0.082458,0.231663
4,23088,-0.003916,1.378055,0.629691,-0.000774,0.426415,0.845482,4,0.0,3,0.0,0,1,0.240700,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19225,4446,0.488336,-0.381716,0.629691,-0.136976,-0.203008,-0.264772,4,1.0,3,0.0,1,0,0.000366,0.000000,0.000000,0.000000
19226,21946,-0.003916,1.378055,0.475643,-0.000774,-0.989785,-0.184651,3,1.0,2,0.0,0,1,0.105768,0.101732,0.027923,0.111416
19227,22418,0.772748,-0.381716,0.321595,1.277436,-0.675074,-0.608150,4,1.0,1,0.0,1,0,0.066951,0.084800,0.135316,0.081338
19228,11225,-0.003916,-1.750427,-4.453900,-0.000774,0.741126,-0.085453,5,0.0,3,0.0,1,1,0.429153,0.364160,0.300865,0.416882


In [18]:
mi_df = cur_train[(cur_train['A'] == 0) & (cur_train['Y'] == 0)]

In [19]:
mi_df.shape[0]

1126