In [1]:
import json
from typing import List, Dict

import pandas as pd
import numpy as np
from pathlib import Path
# import plotnine as pn
from typeguard import typechecked

In [2]:
def read_data_baseline_new(path):
    """
    Flatten the nested results stored in baseline_splits.json into one dict.

    Every kept (dataset, preprocessing, classifier) combination becomes a single
    entry keyed "<dataset>-<preprocessing>-<classifier>".
            Parameters:
                    path : path of the file baseline_splits.json
            Returns:
                    flattened (dict): maps each key to a dict with the fields
                    'dataset', 'preprocessing', 'classifier' and 'classifier_value'
    """
    with open(path, 'r') as handle:
        payload = json.load(handle)

    flattened = {}
    for dataset_name, per_preprocessing in payload["results"].items():
        for preprocessing_name, entry in per_preprocessing.items():
            if preprocessing_name == "DisparateImpactRemover":
                # This preprocessing is left out of the flattened output.
                continue

            if preprocessing_name == 'baseline':
                # The baseline entry maps classifier names to their values directly.
                per_classifier = entry
            else:
                # Any other entry is indexed by position; only the "results" of
                # its first element are read.
                per_classifier = entry[0]["results"]

            for classifier_name, classifier_value in per_classifier.items():
                flat_key = "-".join((dataset_name, preprocessing_name, classifier_name))
                flattened[flat_key] = {
                    'dataset': dataset_name,
                    'preprocessing': preprocessing_name,
                    'classifier': classifier_name,
                    'classifier_value': classifier_value,
                }
    return flattened

In [3]:
def read_data_fairboost_new(path):
    """
    Flatten the "fairboost" results stored in fairboost_splits.json into one dict.

    Only the entries found under the "fairboost" key of each dataset are read,
    and of those only the ones at positions 6, 13, 20, ... Each classifier of a
    selected entry is stored under the key
    "Fairboost : <dataset>-fairboost-<classifier>-<bootstrap_type>".
            Parameters:
                    path : path of the file fairboost_splits.json
            Returns:
                    flattened (dict): maps each key to a dict with the fields
                    'dataset', 'preprocessing' (set to the entry's bootstrap type),
                    'classifier' and 'classifier_value'
    """
    with open(path, 'r') as handle:
        payload = json.load(handle)

    flattened = {}
    for dataset_name, per_preprocessing in payload["results"].items():
        for preprocessing_name, runs in per_preprocessing.items():
            if preprocessing_name != "fairboost":
                # Every other preprocessing in this file is ignored.
                continue

            # NOTE(review): start 6 / step 7 presumably picks one hyperparameter
            # configuration out of each group of seven -- confirm against the
            # code that produced the file.
            for run in runs[6::7]:
                for classifier_name, classifier_value in run["results"].items():
                    bootstrap_type = run["hyperparameters"]["init"]['bootstrap_type']
                    flat_key = "Fairboost : " + "-".join(
                        (dataset_name, preprocessing_name, classifier_name, bootstrap_type)
                    )
                    flattened[flat_key] = {
                        'dataset': dataset_name,
                        'preprocessing': bootstrap_type,
                        'classifier': classifier_name,
                        'classifier_value': classifier_value,
                    }
    return flattened

In [4]:
@typechecked
def to_dataframe(data: Dict, dataset_name="", classifier_name=""):
    """
    Build the DataFrame used for plotting: one row per selected entry of `data`.

    An entry is selected when both `dataset_name` and `classifier_name` occur as
    substrings of its key (the empty-string defaults select every entry).
            Parameters:
                    data : dict whose values provide "accuracy" and
                           "disparate_impact" sequences
                    dataset_name : substring the entry key must contain
                    classifier_name : substring the entry key must contain
            Returns:
                    d (DataFrame): columns x1/x2 (mean accuracy minus/plus half its
                    standard deviation), y1/y2 (same for disparate impact), and
                    t and r (both hold the entry key)

    NOTE(review): the dicts built by read_data_baseline_new and
    read_data_fairboost_new nest their metrics under 'classifier_value', so they
    do not expose "accuracy" at this level -- confirm which shape callers pass.
    """
    columns = {"x1": [], "x2": [], "y1": [], "y2": [], "t": []}
    for label, metrics in data.items():
        if dataset_name not in label or classifier_name not in label:
            continue

        accuracy_mean = np.mean(metrics["accuracy"])
        fairness_mean = np.mean(metrics["disparate_impact"])
        accuracy_half_std = np.std(metrics["accuracy"]) / 2
        fairness_half_std = np.std(metrics["disparate_impact"]) / 2

        # Each interval is one standard deviation wide, centred on the mean.
        columns["x1"].append(accuracy_mean - accuracy_half_std)
        columns["x2"].append(accuracy_mean + accuracy_half_std)
        columns["y1"].append(fairness_mean - fairness_half_std)
        columns["y2"].append(fairness_mean + fairness_half_std)
        columns["t"].append(label)

    d = pd.DataFrame({**columns, "r": columns["t"]})
    return d

In [5]:
@typechecked
def read_data() -> Dict:
    """
    Read both result files from the "raw_data" directory and merge their contents.
            Returns:
                    data: the flattened baseline entries followed by the flattened
                    Fairboost entries; on a key collision the Fairboost entry wins
    """
    raw_dir = Path("raw_data")
    baseline_entries = read_data_baseline_new(raw_dir / 'baseline_splits.json')
    fairboost_entries = read_data_fairboost_new(raw_dir / 'fairboost_splits.json')

    merged = {}
    merged.update(baseline_entries)
    merged.update(fairboost_entries)
    return merged

In [6]:
# Load the baseline and Fairboost results into one dict keyed by configuration.
data = read_data()

In [7]:
import statistics

def caclculate_accuracy_fairness_h_mean(f1_scores, DI):
    """
    Average the harmonic mean of a performance score and a fairness score.

    Each disparate-impact value is first turned into a fairness score
    1 - |1 - DI| (rounded to 6 decimals), which is 1 when DI == 1 and falls to 0
    at DI == 0 and DI == 2. The harmonic mean of that score and the matching
    entry of `f1_scores` is computed per pair, and the pairs are averaged.

    A disparate impact above 2 would give a negative fairness score, which
    statistics.harmonic_mean rejects with StatisticsError; such scores are
    floored at 0, the same score a DI of exactly 0 or 2 receives.
            Parameters:
                    f1_scores : sequence of non-negative performance scores
                    DI : sequence of disparate-impact values, paired by position
                         with f1_scores (extra items of the longer one are ignored)
            Returns:
                    the arithmetic mean of the per-pair harmonic means
            Raises:
                    ZeroDivisionError: if there is no pair to average
    """
    harmonic_means = []
    for f1_score, di_score in zip(f1_scores, DI):
        fairness = round(1 - abs(1 - di_score), 6)
        # Written as a comparison (not max()) so a NaN score stays NaN.
        if fairness < 0:
            fairness = 0.0
        harmonic_means.append(statistics.harmonic_mean([f1_score, fairness]))
    return sum(harmonic_means) / len(harmonic_means)

In [8]:
# One (dataset, preprocessing, classifier, h_mean) record per configuration;
# h_mean combines the configuration's 'f1-score' and 'disparate_impact' lists.
result_tuples = [
    (
        entry['dataset'],
        entry['preprocessing'],
        entry['classifier'],
        caclculate_accuracy_fairness_h_mean(
            entry['classifier_value']['f1-score'],
            entry['classifier_value']['disparate_impact'],
        ),
    )
    for entry in data.values()
]

df = pd.DataFrame(
    result_tuples,
    columns=['dataset', 'preprocessing', 'classifier', 'h_mean'],
)

In [9]:
# Save the per-(dataset, preprocessing, classifier) scores; the row index is not written.
df.to_csv('best_method.csv', index=False)

In [22]:
# Average h_mean over classifiers for each (dataset, preprocessing) pair.
# h_mean is selected explicitly before aggregating: `classifier` holds strings,
# and asking for its mean raises TypeError on pandas >= 2.0 (older versions
# silently dropped the column with a FutureWarning). The resulting two-level
# column layout is the same as before.
group_1 = df.groupby(['dataset', 'preprocessing'])[['h_mean']].agg(['mean']).reset_index()
group_1.to_csv('best_method_combined_classifiers.csv', index=False)

  df.groupby(['dataset', 'preprocessing',]).agg(['mean']).reset_index()


Unnamed: 0_level_0,dataset,preprocessing,h_mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean
0,adult,CUSTOM,0.44215
1,adult,DEFAULT,0.469858
2,adult,LFR,0.534773
3,adult,NONE,0.45045
4,adult,OptimPreproc,0.498712
5,adult,Reweighing,0.54763
6,adult,baseline,0.0
7,compas,CUSTOM,0.739312
8,compas,DEFAULT,0.745459
9,compas,LFR,0.715122


In [None]:
# Average h_mean over datasets and classifiers for each preprocessing method.
# h_mean is selected explicitly before aggregating: `dataset` and `classifier`
# hold strings, and asking for their mean raises TypeError on pandas >= 2.0
# (older versions silently dropped them with a FutureWarning).
group_2 = df.groupby(['preprocessing'])[['h_mean']].agg(['mean']).reset_index()
group_2.to_csv('best_method_overall.csv', index=False)

In [None]:
# NOTE(review): this cell repeats the preceding one and can be removed.
# Average h_mean over datasets and classifiers for each preprocessing method.
# h_mean is selected explicitly before aggregating: `dataset` and `classifier`
# hold strings, and asking for their mean raises TypeError on pandas >= 2.0
# (older versions silently dropped them with a FutureWarning).
group_2 = df.groupby(['preprocessing'])[['h_mean']].agg(['mean']).reset_index()
group_2.to_csv('best_method_overall.csv', index=False)