In [1]:
import warnings
import sys
sys.path.append('../')
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import os
from PrepareData import read_json, make_folder

# EXP 1: Compare MCC and SCC over Synthetic Data

## Read results from disc

In [3]:
repo_dir = sys.path[0].replace('notebooks', '')
eval_path=repo_dir+ 'eval/'

In [4]:
# for visualization change the values to be consistent with the order that higher is better
def normalize_fairness_measures(x):
    if 'Diff' in x.iloc[0]: # difference change to 1-abs(x)
        return 1-abs(x.iloc[1])
    
    elif x.iloc[0] == 'DI':
        if x.iloc[1] > 1:
            return min(x.iloc[1], 1/x.iloc[1])
        else:
            return x.iloc[1]
    else:# other metrics
        return x.iloc[1]
def add_vis_flag(x):
    if 'Diff' in x.iloc[0]: # difference change to 1-abs(x)
        if x.iloc[0] in ['ERRDiff', 'FNRDiff', 'FPRDiff']: # measures with lower value means better
            if x.iloc[1] > 0: 
                return 0
            else: # G0 has better outcome
                return 1
        else: # for measures like eqdiff, avgoddsdiff with higher value means better
            if x.iloc[1] < 0: 
                return 0
            else: # G0 has better outcome
                return 1
    
    elif x.iloc[0] == 'DI':
        if x.iloc[1] > 1:
            return 1
        else:
            return 0
    else:# other metrics
        return 0


In [5]:
seeds = [1, 12345, 6, 2211, 15]
# seeds = [1, 12345, 6, 2211, 15, 88, 121, 433, 500, 1121, 50, 583, 5278, 100000, 0xbeef, 0xcafe, 0xdead, 7777, 100, 923]
# seeds = [88, 121, 433, 500, 1121, 50, 583, 5278, 100000, 0xbeef, 0xcafe, 0xdead, 7777, 100, 923]

models = ['LR', 'TR']

datasets = ['syn{}'.format(x) for x in seeds]

In [6]:
# eval_suffix = '-min_g0-0.5'
eval_suffix = 'res'
# eval_suffix = 'res-min'
eval_file = 'syn_{}_n{}_{}_noerror_flip2_center.csv'.format(len(datasets), len(seeds), eval_suffix)
if os.path.exists(eval_path+eval_file):
    eval_df = pd.read_csv(eval_path+eval_file)
    print('Read evaluation results at {}'.format(eval_path+eval_file))
else:
    eval_df = pd.DataFrame()
    for data_name in datasets:
        cur_eval_df = pd.read_csv(eval_path+'{}-{}.csv'.format(eval_suffix, data_name))
#         print(cur_eval_df.head())
        cur_eval_df['norm_value'] = cur_eval_df[['metric', 'value']].apply(lambda x: normalize_fairness_measures(x), axis=1)
        cur_eval_df['norm_flag'] = cur_eval_df[['metric', 'value']].apply(lambda x: add_vis_flag(x), axis=1)

        eval_df = pd.concat([eval_df, cur_eval_df])
    
    eval_df.to_csv(eval_path+eval_file, index=False)
    print('Save evaluation results at {}'.format(eval_path+eval_file))

Save evaluation results at /Users/keyang/Projects/PubRepo/NonInvasiveTool4FairML/eval/syn_5_n5_res_noerror_flip2_center.csv


In [7]:
eval_df.head()

Unnamed: 0,data,model,seed,method,group,metric,value,norm_value,norm_flag
0,syn1,LR,1,MCC-MIN,all,AUC,0.677,0.677,0
1,syn1,LR,1,MCC-MIN,all,ACC,0.801,0.801,0
2,syn1,LR,1,MCC-MIN,all,SR,0.112667,0.112667,0
3,syn1,LR,1,MCC-MIN,all,BalAcc,0.67664,0.67664,0
4,syn1,LR,1,MCC-MIN,G0,AUC,0.992,0.992,0


In [8]:
eval_df.query('data == "syn6" and model=="LR" and seed==1 and group == "all" and metric == "DI"')

Unnamed: 0,data,model,seed,method,group,metric,value,norm_value,norm_flag
13,syn6,LR,1,MCC-MIN,all,DI,1.231061,0.812308,1
33,syn6,LR,1,MCC-W1,all,DI,0.450918,0.450918,0
53,syn6,LR,1,MCC-W2,all,DI,0.979625,0.979625,0
73,syn6,LR,1,SEP,all,DI,0.178899,0.178899,0
93,syn6,LR,1,ORIG,all,DI,0.193388,0.193388,0
113,syn6,LR,1,SCC-KAM,all,DI,0.275218,0.275218,0


In [14]:
eval_df.query('data == "syn6" and model=="LR" and seed==1 and group == "G0" and metric == "SR"')

Unnamed: 0,data,model,seed,method,group,metric,value,norm_value,norm_flag
6,syn6,LR,1,MCC-MIN,G0,SR,0.548485,0.548485,0
26,syn6,LR,1,MCC-W1,G0,SR,1.0,1.0,0
46,syn6,LR,1,MCC-W2,G0,SR,0.484848,0.484848,0
66,syn6,LR,1,SEP,G0,SR,0.548485,0.548485,0
86,syn6,LR,1,ORIG,G0,SR,0.584848,0.584848,0
106,syn6,LR,1,SCC-KAM,G0,SR,0.566667,0.566667,0


In [15]:
eval_df.query('data == "syn6" and model=="LR" and seed==1 and group == "G1" and metric == "SR"')

Unnamed: 0,data,model,seed,method,group,metric,value,norm_value,norm_flag
10,syn6,LR,1,MCC-MIN,G1,SR,0.182051,0.182051,0
30,syn6,LR,1,MCC-W1,G1,SR,1.0,1.0,0
50,syn6,LR,1,MCC-W2,G1,SR,0.765812,0.765812,0
70,syn6,LR,1,SEP,G1,SR,0.37265,0.37265,0
90,syn6,LR,1,ORIG,G1,SR,0.371795,0.371795,0
110,syn6,LR,1,SCC-KAM,G1,SR,0.375214,0.375214,0


## Draw barplots

In [9]:
def bar_plots(df, output_name, vis_datasets, vis_metric, vis_settings, group_input=None,
              legend_names=None, font_label=26, font_legend=18, 
              colors=['#ffffff', '#fffacd', '#3cb371','#20603d', '#0e6670'], bg_color = '#f3f3f3', x_tick_offset=6.3,
              x_ticks=None, y_label=None, x_label=None, legend=True, legend_col=5, save_to_disc=True):
    
    fig, ax = plt.subplots(1, figsize=(10, 4), dpi=200)
    input_df = df.copy()
    bar_mean = []
    bar_std = []
    dash_filling = []
    line_styles = []
    x_bars = []
    ind = 0
    
    for off_i, name in enumerate(vis_datasets):
        vis_df = input_df[(input_df['data']==name) & (input_df['metric']==vis_metric) & (input_df['group']==group_input)].copy()
        for setting_i in vis_settings:
            set_df = vis_df[vis_df['method']==setting_i]
            
            if set_df.shape[0] > 0:
                y_values = np.array(set_df['norm_value'])
                n_reverse = sum(np.array(set_df['norm_flag']))
                if n_reverse > int(len(y_values) * 0.9): # majoirty of cases in which G0 has better outcomes
                    dash_filling.append(True)
                else:
                    dash_filling.append(False)
                
                cur_mean = np.mean(y_values)
                cur_std = np.std(y_values)
#                 if data_name == 'lsac' and setting_i == 'OMN-ONE':
#                     print(data_name, cur_mean, cur_std)
                if cur_mean == 0:
#                     print('++', name, setting_i, cur_mean)
                    cur_mean = 0.01 # for visualization purpose so that the bar exists in the plot
                    line_styles.append('solid')
                elif vis_metric == 'BalAcc' and cur_std < 0.01 and abs(cur_mean-0.5) < 0.1:
                    # dashed border
                    line_styles.append('dashed')
                elif vis_metric in ['DI', 'AvgOddsDiff', 'EQDiff'] and cur_std < 0.01 and abs(1-cur_mean) < 0.0001:
                    line_styles.append('dashed')
                    cur_mean = 0.001
                else:
                    line_styles.append('solid')
                    
                bar_mean.append(cur_mean)
                bar_std.append(cur_std)
            else: # no model is returned
                dash_filling.append(False)
                line_styles.append('dashed')
                bar_mean.append(0)
                bar_std.append(0)
                
            x_bars.append(ind+off_i*2)
                
            ind += 0.83
    bplot = ax.bar(x_bars, bar_mean, yerr=bar_std)
#     print('-->', vis_metric, bar_mean)
    
    n_bars = len(vis_settings)
    for idx, patch in enumerate(bplot):
        patch.set_facecolor(colors[idx % n_bars])
        
        if dash_filling[idx]:
            patch.set_hatch('//')
            patch.set_edgecolor("#cb4154")
        else:
            patch.set_edgecolor("black")
        patch.set_linestyle(line_styles[idx])
        
    if legend_names:
        legends = legend_names
    else:
        legends = vis_settings
    # add labels for settings 
    for idx, setting_i, color_i, legend_i in zip(range(len(vis_settings)), vis_settings, colors, legends):
        ax.bar(-2, 1, ec='black', fc=color_i, label=legend_i)

    ax.set_facecolor(bg_color)
    ax.yaxis.grid(True)
    plt.xlim([-1, max(x_bars)+1])
    plt.xticks([(x-1)*x_tick_offset+0.8 for x in range(1, len(vis_datasets)+1)])
    if x_ticks:
        locs, labels=plt.xticks();
        plt.xticks(locs, x_ticks, horizontalalignment='center', fontsize=font_label-10, rotation=0);

    plt.ylim([0.0, 1.0])
    plt.yticks(fontsize=font_label);

    if y_label:
        plt.ylabel(y_label, fontsize=font_label)

    if x_label:
        plt.xlabel(x_label, fontsize=font_label)

    if legend:
        plt.legend(bbox_to_anchor=(0, 1, 1, 0), loc="lower center", mode="expand", ncol=legend_col, frameon=False, borderaxespad=0, handlelength=0.9, handletextpad=0.3, fontsize=font_label-7)

    if save_to_disc:
        plt.savefig(output_name, bbox_inches="tight")
        print('Bar plot is saved at ', output_name)
        plt.close()

In [10]:
plot_path = repo_dir+ 'intermediate/plots/syn/'
make_folder(plot_path)

In [11]:
plot_path

'/Users/keyang/Projects/PubRepo/NonInvasiveTool4FairML/intermediate/plots/syn/'

In [23]:
exp_datasets = datasets
exp_ticks = ['Syn1', 'Syn2', 'Syn3', 'Syn4', 'Syn5']

mcc_settings = ['ORIG', 'SEP', 'MCC-MIN', 'MCC-W1', 'MCC-W2', 'SCC-KAM']
mcc_legends = ['ORIG', 'SEP', 'MCC-MIN', 'MCC-W1', 'MCC-W2', 'SCC+K']
mcc_colors = ['#ffffff', '#fffacd', '#3cb371','#20603d', '#0e6670', '#2e8b57']

# mcc_settings = ['ORIG', 'SEP', 'MCC-W2', 'SCC-KAM']
# mcc_legends = ['ORIG', 'SEP', 'MCC-W2', 'SCC+K']
# mcc_colors = ['#ffffff', '#fffacd', '#0e6670', '#2e8b57']

eval_metrics = ['BalAcc', 'DI', 'AvgOddsDiff', 'EQDiff', 'FPRDiff', 'FNRDiff', 'ERRDiff'][:2]

x_tick_set = 7.4
# x_tick_set = 5.6
for model_name in models:
        
    vis_df = eval_df.query('model=="{}"'.format(model_name))
    
    for exp_metric in eval_metrics:
        output_name = '{}{}-{}-{}-DI-all.png'.format(plot_path, model_name, 'mcc', exp_metric)
        bar_plots(vis_df, output_name, exp_datasets, exp_metric, mcc_settings, group_input='all', x_ticks=exp_ticks, colors=mcc_colors, 
                  legend_names=mcc_legends, x_tick_offset=x_tick_set,
                  legend_col=len(mcc_settings),
                  save_to_disc=True)

            

Bar plot is saved at  /Users/keyang/Projects/PubRepo/NonInvasiveTool4FairML/intermediate/plots/syn/LR-mcc-BalAcc-DI-all.png
Bar plot is saved at  /Users/keyang/Projects/PubRepo/NonInvasiveTool4FairML/intermediate/plots/syn/LR-mcc-DI-DI-all.png
Bar plot is saved at  /Users/keyang/Projects/PubRepo/NonInvasiveTool4FairML/intermediate/plots/syn/TR-mcc-BalAcc-DI-all.png
Bar plot is saved at  /Users/keyang/Projects/PubRepo/NonInvasiveTool4FairML/intermediate/plots/syn/TR-mcc-DI-DI-all.png
