# Needed imports

In [23]:
# General imports
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import warnings
import os
import sys
import itertools

# Silence warnings unless the interpreter was started with explicit -W options
# (sys.warnoptions is empty when none were given).
if not sys.warnoptions:
    # Also exported through the environment, presumably so that worker
    # processes spawned later (e.g. with n_jobs=-1) stay quiet too -- TODO confirm.
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")



# Sklearn imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
# LDP import
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from GRR import GRR_Client
# Fairness imports
from GroupFairnessNotions import confusion_matrix_scorer, Statistical_parity, Metric_disparity, Equal_opportunity, Predictive_equality, Treatment_equality, Overall_accuracy, Predictive_rate_parity, CSD

## Static parameters

In [24]:
# Number of seeds, i.e. repetitions run for each configuration
nb_seed = 20

# Folder containing the dataset csv files
dataset_path = 'Datasets/Synthetic/'

# Folder where results are saved as csv files
results_path = 'Results/'

# List of epsilon values (privacy budgets) evaluated in the LDP setting
lst_eps = [16, 8, 2, 1, 0.85, 0.5, 0.4, 0.3, 0.2, 0.1]

# Target attribute
target = 'Y'

# Protected attribute
protected_att = 'A'

# Fraction of the data held out as test set
test_size = 0.2

# Name of the dataset to load (alternatives noted by the author:
# ['S1', 'S3', 'S4', 'S5'])
dataset = 'S2'

# Fairness metrics and info for the no-privacy results
header_nldp = [
    "Seed_num",
    "SP_maj", "SP_min", "SPD",
    "EO_maj", "EO_min", "EOD",
    "CSP_X0_maj", "CSP_X0_min", "CSD_X0",
    "CSP_X1_maj", "CSP_X1_min", "CSD_X1",
]

# Fairness metrics and info for the privacy settings: same columns as above
# with an extra "epsilon" column right after the seed number. Derived from
# header_nldp so the two headers cannot drift apart.
header_ldp = [header_nldp[0], "epsilon", *header_nldp[1:]]


In [25]:
# Writing to csv function for saving the results
def write_to_csv(setting,dataset,header):
    """Append one row to the results csv file of a dataset/setting pair.

    The target file is ``results_path + dataset + '_' + setting + '_results.csv'``
    (``results_path`` is a module-level variable). The file is opened in
    append mode, so existing content is kept and rows accumulate across
    calls and across runs.

    Args:
        setting: Name of the experimental setting, used in the file name.
        dataset: Name of the dataset, used in the file name.
        header: Sequence of values written as a single csv row. Despite its
            name it is used for both the header row and the data rows.
    """
    csv_path = results_path + dataset + '_' + setting + '_results.csv'

    # Create the results folder on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The with-statement closes the file on exit; no explicit close() needed.
    with open(csv_path, mode='a', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)

## Reading dataset

In [26]:
# Load the selected dataset from its csv file; the trailing bare `df`
# makes the notebook display the frame as the cell output.
dataset_file = dataset_path + dataset + '.csv'
df = pd.read_csv(dataset_file)
df


Unnamed: 0,X,A,Y
0,1,0,1
1,0,1,1
2,0,1,1
3,1,1,1
4,0,1,1
...,...,...,...
99995,1,0,1
99996,0,1,1
99997,0,1,1
99998,0,1,1


## Results for no privacy setting (baseline setting)

In [27]:
# Name of the setting, used in the name of the results file
setting = 'NoLDP'

starttime = time.time()

# Write the header row of the csv file
write_to_csv(setting,dataset,header_nldp)

df_cp = copy.deepcopy(df)

# Per-seed values of every metric. "min" is the group with
# protected_att == 0 and "maj" the group with protected_att == 1;
# the X0 / X1 lists are restricted to the rows with X == 0 / X == 1.
np_sp_min1, np_sp_maj1, np_sd = [], [], []
ndp_csp_minX0, ndp_csp_majX0, np_csd_X0, ndp_csp_minX1, ndp_csp_majX1, np_csd_X1 = [], [], [], [], [], []
np_eo_min1, np_eo_maj1, np_eod = [], [], []


for seed in range(nb_seed):
    print(seed)
    np.random.seed(seed) # for reproducibility

    # Use original datasets
    X = copy.deepcopy(df_cp.drop(target, axis=1))
    y = copy.deepcopy(df_cp[target])

    # Train test splitting
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed)

    # Instantiate and train model
    model = RandomForestClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Boolean masks over the test set, built once and combined below
    is_min, is_maj = X_test[protected_att] == 0, X_test[protected_att] == 1
    is_X0, is_X1 = X_test['X'] == 0, X_test['X'] == 1

    # Retrieving minority, majority from the test set
    X_test_min, X_test_maj = X_test[is_min], X_test[is_maj]

    # Predicted outcomes for minority, majority
    y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)

    # True outcomes for minority, majority (looked up by row index)
    indices_min, indices_maj = X_test_min.index, X_test_maj.index
    y_test_min, y_test_maj = y_test.get(key = indices_min), y_test.get(key = indices_maj)

    # Needed for the computation of Cond.Stat.Disp:
    # retrieving four groups: A=0_X=0, A0_X=1, A1_X=0, A1_X=1 from the test set
    X_test_min_X0, X_test_min_X1 = X_test[is_min & is_X0], X_test[is_min & is_X1]
    X_test_maj_X0, X_test_maj_X1 = X_test[is_maj & is_X0], X_test[is_maj & is_X1]

    # Confusion matrix for minority, majority
    conf_matrix_min, conf_matrix_maj = confusion_matrix_scorer(y_test_min,y_pred_min), confusion_matrix_scorer(y_test_maj,y_pred_maj)

    # Predicted outcomes for the four groups
    y_pred_A0_X0, y_pred_A0_X1 = model.predict(X_test_min_X0), model.predict(X_test_min_X1)
    y_pred_A1_X0, y_pred_A1_X1 = model.predict(X_test_maj_X0), model.predict(X_test_maj_X1)

    # Computing fairness metrics: each value is computed once and reused for
    # the in-memory lists and for the csv row.
    sp_maj, sp_min = Statistical_parity(y_pred_maj), Statistical_parity(y_pred_min)
    sp_disp = Metric_disparity(sp_maj, sp_min)

    eo_maj, eo_min = Equal_opportunity(conf_matrix_maj), Equal_opportunity(conf_matrix_min)
    eo_disp = Metric_disparity(eo_maj, eo_min)

    csp_maj_X0, csp_min_X0 = Statistical_parity(y_pred_A1_X0), Statistical_parity(y_pred_A0_X0)
    cs_disp_X0 = Metric_disparity(csp_maj_X0, csp_min_X0)

    csp_maj_X1, csp_min_X1 = Statistical_parity(y_pred_A1_X1), Statistical_parity(y_pred_A0_X1)
    cs_disp_X1 = Metric_disparity(csp_maj_X1, csp_min_X1)

    np_sp_min1.append(sp_min)
    np_sp_maj1.append(sp_maj)
    np_sd.append(sp_disp)
    np_eo_min1.append(eo_min)
    np_eo_maj1.append(eo_maj)
    np_eod.append(eo_disp)
    ndp_csp_minX0.append(csp_min_X0)
    ndp_csp_majX0.append(csp_maj_X0)
    np_csd_X0.append(cs_disp_X0)
    ndp_csp_minX1.append(csp_min_X1)
    ndp_csp_majX1.append(csp_maj_X1)
    np_csd_X1.append(cs_disp_X1)

    # Writing the results to the csv file (same column order as header_nldp)
    write_to_csv(setting,dataset,[
        str(seed),
        sp_maj, sp_min, sp_disp,
        eo_maj, eo_min, eo_disp,
        csp_maj_X0, csp_min_X0, cs_disp_X0,
        csp_maj_X1, csp_min_X1, cs_disp_X1,
    ])

print('That took {} seconds'.format(time.time() - starttime)) 


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
That took 26.859607934951782 seconds


## LDP setting 

In [None]:
print("\n========= Setting 1: Local DP: only the sensitive feature is obfuscated =========\n")

# Name of the setting, used in the name of the results file
setting = 'sLDP'

starttime = time.time()

# Write the header row of the csv file
write_to_csv(setting,dataset,header_ldp)

df_cp = copy.deepcopy(df)

# Per-epsilon aggregates over the seeds: mean of every metric, plus the
# standard deviation of every disparity (the *_std lists).
# The sldp_oa_* / oad_std lists are created but not filled in this cell.
sldp_sp_min1, sldp_sp_maj1, sldp_sd, sd_std  = [], [], [], []
sldp_csp_minX0, sldp_csp_majX0, sldp_csd_X0, csd_X0_std, sldp_csp_minX1, sldp_csp_majX1, sldp_csd_X1, csd_X1_std= [], [] , [], [], [],[], [], []
sldp_oa_min1, sldp_oa_maj1, sldp_oad, oad_std = [], [], [], []
sldp_eo_min1, sldp_eo_maj1, sldp_eod, eod_std = [], [], [], []

# Attribute's domain size; it does not depend on the seed or on epsilon,
# so it is computed once here.
k = len(set(df_cp[protected_att]))

for epsilon in lst_eps:
    print(epsilon)

    # Per-seed values of every metric for the current epsilon
    ldp_sp_min, ldp_sp_maj, ldp_sd = [], [], []
    ldp_csp_minX0, ldp_csp_majX0, ldp_csd_X0, ldp_csp_minX1, ldp_csp_majX1,ldp_csd_X1 = [], [],[], [], [], []
    ldp_oa_min, ldp_oa_maj, ldp_oad  = [], [], []
    ldp_eo_min, ldp_eo_maj, ldp_eod  = [], [], []

    for seed in range(nb_seed):
        # NOTE(review): the global NumPy seed is not set in this setting (the
        # call below is commented out), so only the train/test split is tied
        # to `seed` -- confirm this is intended.
        #np.random.seed(seed)

        # Preparing X and y using pandas
        X = copy.deepcopy(df_cp.drop(target, axis=1))
        y = copy.deepcopy(df_cp[target])

        # Train test splitting
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed)

        # Applying GRR to the protected attribute of the training set only;
        # the test set keeps the original values.
        X_train[protected_att] = X_train[protected_att].apply(lambda value: GRR_Client(value, k, epsilon))

        # Instantiate and train model
        model = RandomForestClassifier(n_jobs=-1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test) # prediction of the actual samples

        acc = accuracy_score(y_test, y_pred)

        # Boolean masks over the test set, built once and combined below
        is_min, is_maj = X_test[protected_att] == 0, X_test[protected_att] == 1
        is_X0, is_X1 = X_test['X'] == 0, X_test['X'] == 1

        # Retrieving minority, majority from the test set
        X_test_min, X_test_maj = X_test[is_min], X_test[is_maj]

        # Predicted outcomes for minority, majority
        y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)

        # True outcomes for minority, majority (looked up by row index)
        indices_min, indices_maj = X_test_min.index, X_test_maj.index
        y_test_min, y_test_maj = y_test.get(key = indices_min), y_test.get(key = indices_maj)

        # Needed for the computation of Cond.Stat.Disp:
        # retrieving four groups: A=0_X=0, A0_X=1, A1_X=0, A1_X=1 from the test set
        X_test_min_X0, X_test_min_X1 = X_test[is_min & is_X0], X_test[is_min & is_X1]
        X_test_maj_X0, X_test_maj_X1 = X_test[is_maj & is_X0], X_test[is_maj & is_X1]

        # Confusion matrix for minority, majority
        conf_matrix_min, conf_matrix_maj = confusion_matrix_scorer(y_test_min,y_pred_min), confusion_matrix_scorer(y_test_maj,y_pred_maj)

        # Predicted outcomes for the four groups
        y_pred_A0_X0, y_pred_A0_X1 = model.predict(X_test_min_X0), model.predict(X_test_min_X1)
        y_pred_A1_X0, y_pred_A1_X1 = model.predict(X_test_maj_X0), model.predict(X_test_maj_X1)

        # Computing fairness metrics with obfuscated A: each value is computed
        # once and reused for the in-memory lists and for the csv row.
        sp_maj, sp_min = Statistical_parity(y_pred_maj), Statistical_parity(y_pred_min)
        sp_disp = Metric_disparity(sp_maj, sp_min)

        eo_maj, eo_min = Equal_opportunity(conf_matrix_maj), Equal_opportunity(conf_matrix_min)
        eo_disp = Metric_disparity(eo_maj, eo_min)

        csp_maj_X0, csp_min_X0 = Statistical_parity(y_pred_A1_X0), Statistical_parity(y_pred_A0_X0)
        cs_disp_X0 = Metric_disparity(csp_maj_X0, csp_min_X0)

        csp_maj_X1, csp_min_X1 = Statistical_parity(y_pred_A1_X1), Statistical_parity(y_pred_A0_X1)
        cs_disp_X1 = Metric_disparity(csp_maj_X1, csp_min_X1)

        ldp_sp_min.append(sp_min)
        ldp_sp_maj.append(sp_maj)
        ldp_sd.append(sp_disp)

        ldp_eo_min.append(eo_min)
        ldp_eo_maj.append(eo_maj)
        ldp_eod.append(eo_disp)

        ldp_csp_minX0.append(csp_min_X0)
        ldp_csp_majX0.append(csp_maj_X0)
        ldp_csd_X0.append(cs_disp_X0)

        ldp_csp_minX1.append(csp_min_X1)
        ldp_csp_majX1.append(csp_maj_X1)
        ldp_csd_X1.append(cs_disp_X1)

        # Writing the results to the csv file (same column order as header_ldp)
        write_to_csv(setting,dataset,[
            str(seed), str(epsilon),
            sp_maj, sp_min, sp_disp,
            eo_maj, eo_min, eo_disp,
            csp_maj_X0, csp_min_X0, cs_disp_X0,
            csp_maj_X1, csp_min_X1, cs_disp_X1,
        ])

    # Aggregate over the seeds for this epsilon. The standard deviations are
    # taken over the per-seed values (ldp_*), not over the running list of
    # per-epsilon means (sldp_*), which would mix different epsilons and
    # always give 0 for the first epsilon.
    sldp_sp_min1.append(np.mean(ldp_sp_min))
    sldp_sp_maj1.append(np.mean(ldp_sp_maj))
    sldp_sd.append(np.mean(ldp_sd))
    sd_std.append(np.std(ldp_sd))
    sldp_csp_minX0.append(np.mean(ldp_csp_minX0))
    sldp_csp_majX0.append(np.mean(ldp_csp_majX0))
    sldp_csd_X0.append(np.mean(ldp_csd_X0))
    csd_X0_std.append(np.std(ldp_csd_X0))
    sldp_csp_minX1.append(np.mean(ldp_csp_minX1))
    sldp_csp_majX1.append(np.mean(ldp_csp_majX1))
    sldp_csd_X1.append(np.mean(ldp_csd_X1))
    csd_X1_std.append(np.std(ldp_csd_X1))
    sldp_eo_min1.append(np.mean(ldp_eo_min))
    sldp_eo_maj1.append(np.mean(ldp_eo_maj))
    sldp_eod.append(np.mean(ldp_eod))
    eod_std.append(np.std(ldp_eod))
print('That took {} seconds'.format(time.time() - starttime)) 



16
8
2
1
0.85
0.5
0.4
0.3
0.2
