# Needed imports

In [1]:
# General imports
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy

import warnings
import os
import sys
import itertools

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

# Sklearn imports
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from GroupFairnessNotions import confusion_matrix_scorer, Statistical_parity, Metric_disparity, Equal_opportunity,Predictive_equality,Treatment_equality, Overall_accuracy,Predictive_rate_parity

## Static parameters

In [2]:
nb_seed = 20

n_splits = 10 # cross validation 

# Reading dataset
dataset = 'S1_0.6' # ['S1_0.44','S1_0.52', 'S1_0.6']

# Target attribute
target = 'Y' 

# Protected attribute 
protected_att = 'A'

# Privacy settings
dict_setting = {"indsLDP": [protected_att,'C','M'],
                "combsLDP":[protected_att,'C','M']}

# Folder where results are saved as csv files
results_path = 'Results/'
# Fariness metrics and info no privacy results
header_nldp = ["Seed_num","SP_maj", "SP_min", "SPD","EO_maj","EO_min","EOD","PE_maj", "PE_min", "PED","TE_maj","TE_min","TED","OA_maj", "OA_min", "OAD","PRP_maj","PRP_min","PRPD","CSP_B0_maj", "CSP_B0_min", "CSD_B0","CSP_B1_maj", "CSP_B1_min", "CSD_B1","Acc"]
# Fariness metrics and info for privacy settings
header_ldp = ["Seed_num","epsilon","SP_maj", "SP_min", "SPD","EO_maj","EO_min","EOD","PE_maj", "PE_min", "PED","TE_maj","TE_min","TED","OA_maj", "OA_min", "OAD","PRP_maj","PRP_min","PRPD","CSP_B0_maj", "CSP_B0_min", "CSD_B0","CSP_B1_maj", "CSP_B1_min", "CSD_B1","Acc"]

# Epsilon values for privacy
lst_eps =  [16,8,5,4,3,2,1,0.5,0.1]


## Writing functions

In [3]:
# Writing to csv function for saving the results
def write_to_csv(setting,dataset,header):
    with open(results_path + dataset + '_'+ setting +'_results.csv', mode='a', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)    
        writer.writerow(header)   
    file.close()


## Reading preprocessed dataset

In [4]:
df = pd.read_csv('Datasets/' + dataset + '.csv')
df


Unnamed: 0,C,A,M,Y
0,1,1,1,0
1,1,0,0,0
2,0,0,1,1
3,0,0,0,1
4,0,0,0,0
...,...,...,...,...
99995,0,1,1,1
99996,0,1,0,0
99997,0,0,0,0
99998,1,1,2,1


## Results for no privacy setting (baseline setting)

In [5]:
#folder_name of the results
setting = 'NoLDP'

starttime = time.time()

# write head of csv file
write_to_csv(setting,dataset,header_nldp)

df_cp = copy.deepcopy(df)

# Evaluating nb_seed times due to randomness
np_acc = []
np_sp_min, np_sp_maj = [], []
np_tv = []
np_eo_min, np_eo_maj = [], []
np_eod = []
np_pe_min, np_pe_maj = [], []
np_ped = []
np_te_min, np_te_maj = [], []
np_ted = []
np_oa_min, np_oa_maj = [], []
np_oad = []
np_prp_min, np_prp_maj = [], []
np_prpd = []
np_csp_min_B0, np_csp_maj_B0 = [], []
np_csd_B0 = []
np_csp_min_B1, np_csp_maj_B1 = [], []
np_csd_B1 = []

for seed in range(nb_seed):

    # Preparing X and y using pandas
    X = copy.deepcopy(df_cp.drop(target, axis=1))
    y = copy.deepcopy(df_cp[target])

    
    kf = StratifiedKFold(n_splits = n_splits,shuffle=True)

    pred = []
    pred_min = []
    pred_maj = []
    pred_A0_B0 = []
    pred_A0_B1 = []
    pred_A1_B0 = []
    pred_A1_B1 = []
    pred_proba_F = []
    pred_proba_M = []
    matrix_min = []
    matrix_maj = []
    test_min = []
    test_maj = []
    acc = []

    for train_index, test_index in kf.split(X,y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # instantiate and train the model
        model = RandomForestClassifier(n_jobs=-1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test) # prediction of the actual samples
        pred.append(y_pred)
        acc.append(accuracy_score(y_test, y_pred))

        # retrieving minority, majority from the test set
        X_test_min, X_test_maj = X_test[X_test[protected_att] == 0], X_test[X_test[protected_att] == 1]

        # predicted outcomes for minority, majority
        y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)
        pred_min.append(y_pred_min)
        pred_maj.append(y_pred_maj)

        # predicted probabilities for minority, majority
        y_pred_proba_min, y_pred_proba_maj = model.predict_proba(X_test_min)[:,1], model.predict_proba(X_test_maj)[:,1]
        pred_proba_F.append(y_pred_proba_min)
        pred_proba_M.append(y_pred_proba_maj)

        indices_min, indices_maj = X_test_min.index, X_test_maj.index
        y_test_min, y_test_maj = y_test.get(key = indices_min), y_test.get(key = indices_maj)
        test_min.append(y_test_min)
        test_maj.append(y_test_maj)

        # confusion matrix for minority, majority
        conf_matrix_min, conf_matrix_maj = confusion_matrix_scorer(y_test_min,y_pred_min), confusion_matrix_scorer(y_test_maj,y_pred_maj)
        matrix_min.append(conf_matrix_min)
        matrix_maj.append(conf_matrix_maj)

        # Needed for the computation of Cond.Stat.Disp
        # retrieving four groups: A=0_B=0, A0_B=1, A1_B=0, A1_B=1 from the test set
        X_test_min_B0, X_test_min_B1, X_test_maj_B0, X_test_maj_B1 = X_test[(X_test[protected_att] == 0) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 0) & (X_test['M'] == 1)],X_test[(X_test[protected_att] == 1) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 1) & (X_test['M'] == 1)]


# predicted outcomes for the four groups
    y_pred_min_B0, y_pred_min_B1, y_pred_maj_B0, y_pred_maj_B1 = model.predict(X_test_min_B0), model.predict(X_test_min_B1), model.predict(X_test_maj_B0), model.predict(X_test_maj_B1)  
    pred_A0_B0.append(y_pred_min_B0)
    pred_A0_B1.append(y_pred_min_B1)
    pred_A1_B0.append(y_pred_maj_B0)
    pred_A1_B1.append(y_pred_maj_B1)

    # fairness metrics
    np_sp_min.append(Statistical_parity(pred_min))
    np_sp_maj.append(Statistical_parity(pred_maj))
    np_tv.append(Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)))
    np_eo_min.append(Equal_opportunity(matrix_min))
    np_eo_maj.append(Equal_opportunity(matrix_maj))
    np_eod.append(Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)))
    np_pe_min.append(Predictive_equality(matrix_min))
    np_pe_maj.append(Predictive_equality(matrix_maj))
    np_ped.append(Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)))
    np_te_min.append(Treatment_equality(matrix_min))
    np_te_maj.append(Treatment_equality(matrix_maj))
    np_ted.append(Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)))
    np_oa_min.append(Overall_accuracy(test_min, pred_min))
    np_oa_maj.append(Overall_accuracy(test_maj, pred_maj))
    np_oad.append(Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)))
    np_prp_min.append(Predictive_rate_parity(test_min, pred_min))
    np_prp_maj.append(Predictive_rate_parity(test_maj, pred_maj))
    np_prpd.append(Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min))) 
    np_acc.append(np.mean(acc))
    np_csp_min_B0 = Statistical_parity(pred_A0_B0)
    np_csp_maj_B0 = Statistical_parity(pred_A1_B0)
    np_csd_B0.append(Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)))
    np_csp_min_B1 = Statistical_parity(pred_A0_B1)
    np_csp_maj_B1 = Statistical_parity(pred_A1_B1)
    np_csd_B1.append(Metric_disparity(Statistical_parity(pred_A1_B1), Statistical_parity(pred_A0_B1)))
    #writing the results to the csv file
    write_to_csv(setting,dataset,[str(seed), Statistical_parity(pred_maj),Statistical_parity(pred_min),Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)),Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min), Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)),Predictive_equality(matrix_maj), Predictive_equality(matrix_min),Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)),Treatment_equality(matrix_maj), Treatment_equality(matrix_min), Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)),Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min),Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)),Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min), Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min)),Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0),Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)), Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1),Metric_disparity(Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1)),np.mean(acc)])
write_to_csv(setting,dataset,[20,np.std(np_sp_maj),np.std(np_sp_min),np.std(np_tv),np.std(np_eo_maj),np.std(np_eo_min),np.std(np_eod),np.std(np_pe_maj),np.std(np_pe_min),np.std(np_ped),np.std(np_te_maj),np.std(np_te_min),np.std(np_ted),np.std(np_oa_maj),np.std(np_oa_min),np.std(np_oad),np.std(np_prp_maj),np.std(np_prp_min),np.std(np_prpd),np.std(np_csp_maj_B0),np.std(np_csp_min_B0),np.std(np_csd_B0),np.std(np_csp_maj_B1),np.std(np_csp_min_B1),np.std(np_csd_B1),np.std(np_acc)])
print('That took {} seconds'.format(time.time() - starttime)) 

That took 483.92775678634644 seconds


## LDP setting 1: Only the protected attribute is obfuscated

In [6]:
#folder_name of the results
setting = 'sLDP'

starttime = time.time()

# write head of csv file
write_to_csv(setting,dataset,header_ldp)

df_cp = copy.deepcopy(df)

for epsilon in lst_eps:
    print(epsilon)

    ldp_acc = []
    ldp_sp_min, ldp_sp_maj = [], []
    ldp_tv = []
    ldp_eo_min, ldp_eo_maj = [], []
    ldp_eod = []
    ldp_pe_min, ldp_pe_maj = [], []
    ldp_ped = []
    ldp_te_min, ldp_te_maj = [], []
    ldp_ted = []
    ldp_oa_min, ldp_oa_maj = [], []
    ldp_oad = []
    ldp_prp_min, ldp_prp_maj = [], []
    ldp_prpd = []
    ldp_csp_min_B0, ldp_csp_maj_B0 = [], []
    ldp_csd_B0 = []
    ldp_csp_min_B1, ldp_csp_maj_B1 = [], []
    ldp_csd_B1 = []

    for seed in range(nb_seed):

        # Preparing X and y using original datasets
        X = copy.deepcopy(df_cp.drop(target, axis=1))
        y = copy.deepcopy(df_cp[target])

        # Attribute's domain size
        k = len(set(X[protected_att]))

        # Cross validation using random forest
        kf = StratifiedKFold(n_splits = n_splits, shuffle=True)

        pred = []
        pred_min = []
        pred_maj = []
        pred_A0_B0 = []
        pred_A0_B1 = []
        pred_A1_B0 = []
        pred_A1_B1 = []
        pred_proba_F = []
        pred_proba_M = []
        matrix_min = []
        matrix_maj = []
        test_min = []
        test_maj = []
        acc = []

        for train_index, test_index in kf.split(X,y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]

            # Applying GRR to the protected attribute of the training set
            X_train[protected_att] = X_train[protected_att].apply(lambda x: GRR_Client(x, k, epsilon))

            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # instantiate and train model
            model = RandomForestClassifier(n_jobs=-1)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test) # prediction of the actual samples
            pred.append(y_pred)
            acc.append(accuracy_score(y_test, y_pred))

            # retrieving minority, majority from the test set
            X_test_min, X_test_maj = X_test[X_test[protected_att] == 0], X_test[X_test[protected_att] == 1]

            # predicted outcomes for minority, majority
            y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)
            pred_min.append(y_pred_min)
            pred_maj.append(y_pred_maj)

            # predicted probabilities for minority, majority
            y_pred_proba_min, y_pred_proba_maj = model.predict_proba(X_test_min)[:,1], model.predict_proba(X_test_maj)[:,1]
            pred_proba_F.append(y_pred_proba_min)
            pred_proba_M.append(y_pred_proba_maj)

            indices_min, indices_maj = X_test_min.index, X_test_maj.index
            y_test_min, y_test_maj = y_test.get(key = indices_min), y_test.get(key = indices_maj)
            test_min.append(y_test_min)
            test_maj.append(y_test_maj)

            # confusion matrix for minority, majority
            conf_matrix_min, conf_matrix_maj = confusion_matrix_scorer(y_test_min,y_pred_min), confusion_matrix_scorer(y_test_maj,y_pred_maj)
            matrix_min.append(conf_matrix_min)
            matrix_maj.append(conf_matrix_maj)

            # retrieving the four groups: A0B0, A0B1, A1B0, A1B1 from the test set

            X_test_min_B0, X_test_min_B1, X_test_maj_B0, X_test_maj_B1 = X_test[(X_test[protected_att] == 0) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 0) & (X_test['M'] == 1)],X_test[(X_test[protected_att] == 1) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 1) & (X_test['M'] == 1)]

    # predicted outcomes for the four groups
            y_pred_min_B0, y_pred_min_B1, y_pred_maj_B0, y_pred_maj_B1 = model.predict(X_test_min_B0), model.predict(X_test_min_B1), model.predict(X_test_maj_B0), model.predict(X_test_maj_B1)  
            pred_A0_B0.append(y_pred_min_B0)
            pred_A0_B1.append(y_pred_min_B1)
            pred_A1_B0.append(y_pred_maj_B0)
            pred_A1_B1.append(y_pred_maj_B1)

        # sldp fairness metrics
        ldp_sp_min.append(Statistical_parity(pred_min))
        ldp_sp_maj.append(Statistical_parity(pred_maj))
        ldp_tv.append(Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)))
        ldp_eo_min.append(Equal_opportunity(matrix_min))
        ldp_eo_maj.append(Equal_opportunity(matrix_maj))
        ldp_eod.append(Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)))
        ldp_pe_min.append(Predictive_equality(matrix_min))
        ldp_pe_maj.append(Predictive_equality(matrix_maj))
        ldp_ped.append(Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)))
        ldp_te_min.append(Treatment_equality(matrix_min))
        ldp_te_maj.append(Treatment_equality(matrix_maj))
        ldp_ted.append(Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)))
        ldp_oa_min.append(Overall_accuracy(test_min, pred_min))
        ldp_oa_maj.append(Overall_accuracy(test_maj, pred_maj))
        ldp_oad.append(Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)))
        ldp_prp_min.append(Predictive_rate_parity(test_min, pred_min))
        ldp_prp_maj.append(Predictive_rate_parity(test_maj, pred_maj))
        ldp_prpd.append(Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min))) 
        ldp_acc.append(np.mean(acc))
        ldp_csp_min_B0 = Statistical_parity(pred_A0_B0)
        ldp_csp_maj_B0 = Statistical_parity(pred_A1_B0)
        ldp_csd_B0.append(Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)))
        ldp_csp_min_B1 = Statistical_parity(pred_A0_B1)
        ldp_csp_maj_B1 = Statistical_parity(pred_A1_B1)
        ldp_csd_B1.append(Metric_disparity(Statistical_parity(pred_A1_B1), Statistical_parity(pred_A0_B1)))
        #writing the results to the csv file
        write_to_csv(setting,dataset,[str(seed), str(epsilon), Statistical_parity(pred_maj),Statistical_parity(pred_min),Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)),Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min), Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)),Predictive_equality(matrix_maj), Predictive_equality(matrix_min),Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)),Treatment_equality(matrix_maj), Treatment_equality(matrix_min), Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)),Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min),Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)),Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min), Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min)),Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0),Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)), Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1),Metric_disparity(Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1)),np.mean(acc)])
    write_to_csv(setting,dataset,[20,str(epsilon),np.std(ldp_sp_maj),np.std(ldp_sp_min),np.std(ldp_tv),np.std(ldp_eo_maj),np.std(ldp_eo_min),np.std(ldp_eod),np.std(ldp_pe_maj),np.std(ldp_pe_min),np.std(ldp_ped),np.std(ldp_te_maj),np.std(ldp_te_min),np.std(ldp_ted),np.std(ldp_oa_maj),np.std(ldp_oa_min),np.std(ldp_oad),np.std(ldp_prp_maj),np.std(ldp_prp_min),np.std(ldp_prpd),np.std(ldp_csp_maj_B0),np.std(ldp_csp_min_B0),np.std(ldp_csd_B0),np.std(ldp_csp_maj_B1),np.std(ldp_csp_min_B1),np.std(ldp_csd_B1),np.std(ldp_acc)])        
print('That took {} seconds'.format(time.time() - starttime)) 

16
8
5
4
3
2
1
0.5
0.1
That took 4718.981489896774 seconds


## LDP setting 2: all the sensitive attributes are obfuscated using the combined krr setting

In [7]:
#folder_name of the results
setting = 'combsLDP'

starttime = time.time()

# write head of csv file
write_to_csv(setting,dataset,header_ldp)

df_cp = copy.deepcopy(df)

lst_sensitive = dict_setting[setting]


# New sensitive attribute (marginal of all in lst_sensitive)
new_protected_att = '_'.join(lst_sensitive)

# to compute all possible permutations
all_list = [list(df_cp[att].unique()) for att in lst_sensitive]
all_perm = list(itertools.product(*all_list))
k = len(all_perm)  # new domain size

df_cp[new_protected_att] = df_cp[lst_sensitive].astype(str).T.agg(', '.join)

for epsilon in lst_eps:
    print(epsilon)
    
    # Evaluating nb_seed times due to randomness
    ldp_sp_min, ldp_sp_maj = [], []
    ldp_tv = []
    ldp_eo_min, ldp_eo_maj = [], []
    ldp_pe_min, ldp_pe_maj = [], []
    ldp_te_min, ldp_te_maj = [], []
    ldp_oa_min, ldp_oa_maj = [], []
    ldp_prp_min, ldp_prp_maj = [], []
    ldp_acc = []
    ldp_csp_min_B0, ldp_csp_maj_B0, ldp_csp_min_B1, ldp_csp_maj_B1 = [], [], [], []
    ldp_csd_B1,ldp_csd_B0  = [], []
    ldp_eod = []
    ldp_ped = []
    ldp_ted = []
    ldp_oad = []
    ldp_prpd = []


    for seed in range(nb_seed):
        # Preparing X and y using pandas
        X = copy.deepcopy(df_cp.drop(target, axis=1))
        y = copy.deepcopy(df_cp[target])

        LE = LabelEncoder()
        LE.fit([str(val).replace('(', '').replace(')', '') for val in all_perm])


        # Cross validation (using StratifiedKFold)
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True)

        pred = []
        pred_min = []
        pred_maj = []
        pred_proba_F = []
        pred_proba_M = []
        pred_A0_B0 = []
        pred_A0_B1 = []
        pred_A1_B0 = []
        pred_A1_B1 = []
        matrix_min = []
        matrix_maj = []
        test_min = []
        test_maj = []
        acc = []

        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            if target not in lst_sensitive:
                X_train[new_protected_att] = LE.transform(X_train[new_protected_att].astype(str))
                # applying GRR to all sensitive attributes
                X_train[new_protected_att] = X_train[new_protected_att].apply(lambda x: GRR_Client(x, k, epsilon))  # Applying GRR
                X_train[new_protected_att] = LE.inverse_transform(X_train[new_protected_att])

                for idx in range(len(lst_sensitive)):
                    X_train[lst_sensitive[idx]] = X_train[new_protected_att].apply(lambda x: x.split(',')[idx]).astype(int)

                del X_train[new_protected_att]  # deleting marginal
                del X_test[new_protected_att]  # deleting marginal

            else:
                df_train = pd.concat([X_train, y_train], axis=1)
                df_train[new_protected_att] = LE.transform(df_train[new_protected_att].astype(str))
                df_train[new_protected_att] = df_train[new_protected_att].apply(lambda x: GRR_Client(x, k, epsilon))  # Applying GRR
                df_train[new_protected_att] = LE.inverse_transform(df_train[new_protected_att])

                for idx in range(len(lst_sensitive)):
                    df_train[lst_sensitive[idx]] = df_train[new_protected_att].apply(lambda x: x.split(',')[idx]).astype(int)

                del df_train[new_protected_att]  # deleting marginal
                del X_test[new_protected_att]  # deleting marginal

                X_train = df_train.drop(target, axis=1)
                y_train = df_train[target]

            # instantiate and train the model
            model = RandomForestClassifier(n_jobs=-1)#, random_state=seed            
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)  # prediction of the actual samples
            pred.append(y_pred)
            acc.append(accuracy_score(y_test, y_pred))

            # retrieving minority, majority from the test set
            X_test_min, X_test_maj = X_test[X_test[protected_att].astype(int) == 0], X_test[
                X_test[protected_att].astype(int) == 1]

            # predicted outcomes for minority, majority
            y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)
            pred_min.append(y_pred_min)
            pred_maj.append(y_pred_maj)

            # predicted probabilities for minority, majority
            y_pred_proba_min, y_pred_proba_maj = model.predict_proba(X_test_min)[:, 1], model.predict_proba(X_test_maj)[
                                                                                        :, 1]
            pred_proba_F.append(y_pred_proba_min)
            pred_proba_M.append(y_pred_proba_maj)

            indices_min, indices_maj = X_test_min.index, X_test_maj.index
            y_test_min, y_test_maj = y_test.get(key=indices_min), y_test.get(key=indices_maj)
            test_min.append(y_test_min)
            test_maj.append(y_test_maj)

            # confusion matrix for minority, majority
            conf_matrix_min, conf_matrix_maj = confusion_matrix_scorer(y_test_min, y_pred_min), confusion_matrix_scorer(
                y_test_maj, y_pred_maj)
            matrix_min.append(conf_matrix_min)
            matrix_maj.append(conf_matrix_maj)

            # retrieving the four groups: A0B0, A0B1, A1B0, A1B1 from the test set
            X_test_min_B0, X_test_min_B1, X_test_maj_B0, X_test_maj_B1 = X_test[(X_test[protected_att] == 0) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 0) & (X_test['M'] == 1)],X_test[(X_test[protected_att] == 1) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 1) & (X_test['M'] == 1)]
            # predicted outcomes for the four groups
            y_pred_min_B0, y_pred_min_B1, y_pred_maj_B0, y_pred_maj_B1 = model.predict(X_test_min_B0), model.predict(X_test_min_B1), model.predict(X_test_maj_B0), model.predict(X_test_maj_B1)  
            pred_A0_B0.append(y_pred_min_B0)
            pred_A0_B1.append(y_pred_min_B1)
            pred_A1_B0.append(y_pred_maj_B0)
            pred_A1_B1.append(y_pred_maj_B1)

        # comldp fairness metrics
        ldp_sp_min.append(Statistical_parity(pred_min))
        ldp_sp_maj.append(Statistical_parity(pred_maj))
        ldp_tv.append(Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)))
        ldp_eo_min.append(Equal_opportunity(matrix_min))
        ldp_eo_maj.append(Equal_opportunity(matrix_maj))
        ldp_eod.append(Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)))
        ldp_pe_min.append(Predictive_equality(matrix_min))
        ldp_pe_maj.append(Predictive_equality(matrix_maj))
        ldp_ped.append(Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)))
        ldp_te_min.append(Treatment_equality(matrix_min))
        ldp_te_maj.append(Treatment_equality(matrix_maj))
        ldp_ted.append(Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)))
        ldp_oa_min.append(Overall_accuracy(test_min, pred_min))
        ldp_oa_maj.append(Overall_accuracy(test_maj, pred_maj))
        ldp_oad.append(Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)))
        ldp_prp_min.append(Predictive_rate_parity(test_min, pred_min))
        ldp_prp_maj.append(Predictive_rate_parity(test_maj, pred_maj))
        ldp_prpd.append(Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min))) 
        ldp_acc.append(np.mean(acc))
        ldp_csp_min_B0.append(Statistical_parity(pred_A0_B0))
        ldp_csp_maj_B0.append(Statistical_parity(pred_A1_B0))
        ldp_csd_B0.append(Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)))
        ldp_csp_min_B1.append(Statistical_parity(pred_A0_B1))
        ldp_csp_maj_B1.append(Statistical_parity(pred_A1_B1))
        ldp_csd_B1.append(Metric_disparity(Statistical_parity(pred_A1_B1), Statistical_parity(pred_A0_B1)))
        #writing the results to the csv file
        write_to_csv(setting,dataset,[str(seed), str(epsilon), Statistical_parity(pred_maj),Statistical_parity(pred_min),Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)),Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min), Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)),Predictive_equality(matrix_maj), Predictive_equality(matrix_min),Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)),Treatment_equality(matrix_maj), Treatment_equality(matrix_min), Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)),Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min),Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)),Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min), Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min)),Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0),Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)), Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1),Metric_disparity(Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1)),np.mean(acc)])
    write_to_csv(setting,dataset,[20,str(epsilon),np.std(ldp_sp_maj),np.std(ldp_sp_min),np.std(ldp_tv),np.std(ldp_eo_maj),np.std(ldp_eo_min),np.std(ldp_eod),np.std(ldp_pe_maj),np.std(ldp_pe_min),np.std(ldp_ped),np.std(ldp_te_maj),np.std(ldp_te_min),np.std(ldp_ted),np.std(ldp_oa_maj),np.std(ldp_oa_min),np.std(ldp_oad),np.std(ldp_prp_maj),np.std(ldp_prp_min),np.std(ldp_prpd),np.std(ldp_csp_maj_B0),np.std(ldp_csp_min_B0),np.std(ldp_csd_B0),np.std(ldp_csp_maj_B1),np.std(ldp_csp_min_B1),np.std(ldp_csd_B1),np.std(ldp_acc)])        
print('That took {} seconds'.format(time.time() - starttime)) 

16
8
5
4
3
2
1
0.5
0.1
That took 3609.4293472766876 seconds


## LDP setting 3: all the sensitive attributes are obfuscated using the krr independent setting

In [8]:
#folder_name of the results
setting = 'indsLDP'

starttime = time.time()

# write head of csv file
write_to_csv(setting,dataset,header_ldp)

df_cp = copy.deepcopy(df)

lst_sensitive = dict_setting[setting]

#d = len(lst_sensitive)  # number of attributes
#eps_spl = epsilon / d


# domain size (total) of sensitive attributes
total_k = sum([len(set(df_cp[att])) for att in lst_sensitive])


protected_att = lst_sensitive[0]


for epsilon in lst_eps:
    print(epsilon)
    
    # Evaluating nb_seed times due to randomness
    ldp_sp_min, ldp_sp_maj = [], []
    ldp_tv = []
    ldp_eo_min, ldp_eo_maj = [], []
    ldp_pe_min, ldp_pe_maj = [], []
    ldp_te_min, ldp_te_maj = [], []
    ldp_oa_min, ldp_oa_maj = [], []
    ldp_prp_min, ldp_prp_maj = [], []
    ldp_acc = []
    ldp_csp_min_B0, ldp_csp_maj_B0, ldp_csp_min_B1, ldp_csp_maj_B1 = [], [], [], []
    ldp_csd_B1,ldp_csd_B0  = [], []
    ldp_eod = []
    ldp_ped = []
    ldp_ted = []
    ldp_oad = []
    ldp_prpd = []


    for seed in range(nb_seed):
        #np.random.seed(seed)

        # Preparing X and y using pandas
        X = copy.deepcopy(df_cp.drop(target, axis=1))
        y = copy.deepcopy(df_cp[target])

        kf = StratifiedKFold(n_splits=n_splits, shuffle=True)
        
        pred = []
        pred_inv = []
        pred_min = []
        pred_maj = []
        pred_proba_F = []
        pred_proba_M = []
        pred_A0_B0 = []
        pred_A0_B1 = []
        pred_A1_B0 = []
        pred_A1_B1 = []
        matrix_min = []
        matrix_maj = []
        test_min = []
        test_maj = []
        acc = []

        for train_index, test_index in kf.split(X,y):

            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            if target not in lst_sensitive:           

                # Applying GRR to all sensitive attributes except y of training set (reminder: change how the privacy budget is distributed among vars (not even distribution)

                for att in lst_sensitive:
                    k = len(set(df_cp[att]))
                    #X_train[att] = X_train[att].apply(lambda x: GRR_Client(x, k, eps_spl))
                    X_train[att] = X_train[att].apply(lambda x: GRR_Client(x, k, (epsilon*k)/ total_k ))
            else:

                # Applying GRR to all sensitive attributes including y of training set
                df_train = pandas.concat([X_train, y_train], axis=1)
                for att in lst_sensitive:
                    k = len(set(df_cp[att]))
                    #df_train[att] = df_train[att].apply(lambda x: GRR_Client(x, k, eps_spl))
                    df_train[att] = df_train[att].apply(lambda x: GRR_Client(x, k, (epsilon*k)/ total_k ))

                X_train = df_train.drop(target, axis=1)
                y_train = df_train[target]        


            model = RandomForestClassifier(n_jobs=-1)#, random_state=seed
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test) # prediction of the actual samples
            pred.append(y_pred)
            acc.append(accuracy_score(y_test, y_pred))

    #         # Generating synthetic samples by inverting the sensitive feature only
    #         X_test_inv_sensitive = X_test.copy()
    #         # inverting the sensitive feature value for all samples
    #         X_test_inv_sensitive[protected_att].replace([1,0],[0,1],inplace=True)
    #         # prediction of the synthetic samples 
    #         y_pred_inv = model.predict(X_test_inv_sensitive)
    #         pred_inv.append(y_pred_inv)

            # retrieving minority, majority from the test set
            X_test_min, X_test_maj = X_test[X_test[protected_att] == 0], X_test[X_test[protected_att] == 1]

            # predicted outcomes for minority, majority
            y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)
            pred_min.append(y_pred_min)
            pred_maj.append(y_pred_maj)

            # predicted probabilities for minority, majority
            y_pred_proba_min, y_pred_proba_maj = model.predict_proba(X_test_min)[:,1], model.predict_proba(X_test_maj)[:,1]
            pred_proba_F.append(y_pred_proba_min)
            pred_proba_M.append(y_pred_proba_maj)

            indices_min, indices_maj = X_test_min.index, X_test_maj.index
            y_test_min, y_test_maj = y_test.get(key = indices_min), y_test.get(key = indices_maj)
            test_min.append(y_test_min)
            test_maj.append(y_test_maj)

            # confusion matrix for minority, majority
            conf_matrix_min, conf_matrix_maj = confusion_matrix_scorer(y_test_min,y_pred_min), confusion_matrix_scorer(y_test_maj,y_pred_maj)
            matrix_min.append(conf_matrix_min)
            matrix_maj.append(conf_matrix_maj)
            # retrieving the four groups: A0B0, A0B1, A1B0, A1B1 from the test set
            X_test_min_B0, X_test_min_B1, X_test_maj_B0, X_test_maj_B1 = X_test[(X_test[protected_att] == 0) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 0) & (X_test['M'] == 1)],X_test[(X_test[protected_att] == 1) & (X_test['M'] == 0)], X_test[(X_test[protected_att] == 1) & (X_test['M'] == 1)]
            # predicted outcomes for the four groups
            y_pred_min_B0, y_pred_min_B1, y_pred_maj_B0, y_pred_maj_B1 = model.predict(X_test_min_B0), model.predict(X_test_min_B1), model.predict(X_test_maj_B0), model.predict(X_test_maj_B1)  
            pred_A0_B0.append(y_pred_min_B0)
            pred_A0_B1.append(y_pred_min_B1)
            pred_A1_B0.append(y_pred_maj_B0)
            pred_A1_B1.append(y_pred_maj_B1)

        #********************
        # indldp fairness metrics
        ldp_sp_min.append(Statistical_parity(pred_min))
        ldp_sp_maj.append(Statistical_parity(pred_maj))
        ldp_tv.append(Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)))

        ldp_eo_min.append(Equal_opportunity(matrix_min))
        ldp_eo_maj.append(Equal_opportunity(matrix_maj))
        ldp_eod.append(Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)))

        ldp_pe_min.append(Predictive_equality(matrix_min))
        ldp_pe_maj.append(Predictive_equality(matrix_maj))
        ldp_ped.append(Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)))

        ldp_te_min.append(Treatment_equality(matrix_min))
        ldp_te_maj.append(Treatment_equality(matrix_maj))
        ldp_ted.append(Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)))

        ldp_oa_min.append(Overall_accuracy(test_min, pred_min))
        ldp_oa_maj.append(Overall_accuracy(test_maj, pred_maj))
        ldp_oad.append(Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)))

        ldp_prp_min.append(Predictive_rate_parity(test_min, pred_min))
        ldp_prp_maj.append(Predictive_rate_parity(test_maj, pred_maj))
        ldp_prpd.append(Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min))) 

        ldp_acc.append(np.mean(acc))
        ldp_csp_min_B0.append(Statistical_parity(pred_A0_B0))
        ldp_csp_maj_B0.append(Statistical_parity(pred_A1_B0))
        ldp_csd_B0.append(Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)))

        ldp_csp_min_B1.append(Statistical_parity(pred_A0_B1))
        ldp_csp_maj_B1.append(Statistical_parity(pred_A1_B1))
        ldp_csd_B1.append(Metric_disparity(Statistical_parity(pred_A1_B1), Statistical_parity(pred_A0_B1)))

        #writing the results to the csv file
        write_to_csv(setting,dataset,[str(seed), str(epsilon), Statistical_parity(pred_maj),Statistical_parity(pred_min),Metric_disparity(Statistical_parity(pred_maj), Statistical_parity(pred_min)),Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min), Metric_disparity(Equal_opportunity(matrix_maj), Equal_opportunity(matrix_min)),Predictive_equality(matrix_maj), Predictive_equality(matrix_min),Metric_disparity(Predictive_equality(matrix_maj), Predictive_equality(matrix_min)),Treatment_equality(matrix_maj), Treatment_equality(matrix_min), Metric_disparity(Treatment_equality(matrix_maj), Treatment_equality(matrix_min)),Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min),Metric_disparity(Overall_accuracy(test_maj, pred_maj), Overall_accuracy(test_min, pred_min)),Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min), Metric_disparity(Predictive_rate_parity(test_maj, pred_maj), Predictive_rate_parity(test_min, pred_min)),Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0),Metric_disparity(Statistical_parity(pred_A1_B0), Statistical_parity(pred_A0_B0)), Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1),Metric_disparity(Statistical_parity(pred_A1_B1),Statistical_parity(pred_A0_B1)),np.mean(acc)])
    write_to_csv(setting,dataset,[20,str(epsilon),np.std(ldp_sp_maj),np.std(ldp_sp_min),np.std(ldp_tv),np.std(ldp_eo_maj),np.std(ldp_eo_min),np.std(ldp_eod),np.std(ldp_pe_maj),np.std(ldp_pe_min),np.std(ldp_ped),np.std(ldp_te_maj),np.std(ldp_te_min),np.std(ldp_ted),np.std(ldp_oa_maj),np.std(ldp_oa_min),np.std(ldp_oad),np.std(ldp_prp_maj),np.std(ldp_prp_min),np.std(ldp_prpd),np.std(ldp_csp_maj_B0),np.std(ldp_csp_min_B0),np.std(ldp_csd_B0),np.std(ldp_csp_maj_B1),np.std(ldp_csp_min_B1),np.std(ldp_csd_B1),np.std(ldp_acc)])        
print('That took {} seconds'.format(time.time() - starttime)) 


    #return np.mean(ldp_sp_min), np.mean(ldp_sp_maj), np.mean(ldp_tv), np.mean(ldp_eo_min), np.mean(ldp_eo_maj), np.mean(ldp_pe_min), np.mean(ldp_pe_maj), np.mean(ldp_te_min), np.mean(ldp_te_maj), np.mean(ldp_oa_min), np.mean(ldp_oa_maj), np.mean(ldp_prp_min), np.mean(ldp_prp_maj), np.mean(ldp_acc)

16
8
5
4
3
2
1
0.5
0.1
That took 3765.864130973816 seconds
