# Needed imports

In [9]:
# General imports
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import warnings
import os
import sys
import itertools

# sys.warnoptions holds the -W options given on the command line; when there are
# none, silence every warning raised in this process.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    # Environment variables are inherited by child processes, so exporting
    # PYTHONWARNINGS also applies the "ignore" filter to Python processes started later.
    os.environ["PYTHONWARNINGS"] = "ignore"



# Sklearn imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
# LDP import
from multi_freq_ldpy.pure_frequency_oracles.GRR import GRR_Client
from GRR import GRR_Client
# Fairness imports
from GroupFairnessNotions import confusion_matrix_scorer, Statistical_parity, Metric_disparity, Equal_opportunity, Predictive_equality, Treatment_equality, Overall_accuracy, Predictive_rate_parity, CSD

## Static parameters

In [10]:
# Number of repetitions: the experiment loops run once per seed in range(nb_seed)
nb_seed = 100

# Value passed as `test_size` to train_test_split
test_size = 0.2

# Name of the preprocessed dataset file (without the .csv extension)
dataset = 'compas_good_outcome'

# Column the classifier is trained to predict
target = 'v_decile_score'

# Column used to split the test set into the two groups that are compared
protected_att = 'race'

# Folder where results are saved as csv files
results_path = 'Results/Accuracy/'

# Columns of the results file for the no-privacy setting: seed, fairness metrics, accuracy
header_nldp = [
    "Seed_num",
    "SP_maj", "SP_min", "SPD",
    "EO_maj", "EO_min", "EOD",
    "CSP_X0_maj", "CSP_X0_min", "CSD_X0",
    "CSP_X1_maj", "CSP_X1_min", "CSD_X1",
    "Accuracy",
]

# Columns for the privacy settings: same as above with "epsilon" right after the seed
header_ldp = [header_nldp[0], "epsilon", *header_nldp[1:]]

# Epsilon values for privacy (each one is passed to GRR_Client)
lst_eps = [50, 16, 8, 5, 4, 3, 2, 1, 0.5]


## Needed functions

In [11]:
# Writing to csv function for saving the results
def write_to_csv(setting, dataset, header):
    """Append one row to the results CSV file of an experiment.

    The file is ``results_path + dataset + '_' + setting + '_results.csv'``, where
    ``results_path`` is the module-level results folder. The file is opened in
    append mode, so existing content is kept and the new row is added at the end.

    Parameters
    ----------
    setting : str
        Name of the experimental setting; part of the file name.
    dataset : str
        Name of the dataset; part of the file name.
    header : sequence
        Values of the row to write. Despite its name this is used both for the
        header row and for the result rows.
    """
    file_path = results_path + dataset + '_' + setting + '_results.csv'

    # open(..., mode='a') creates a missing file but not a missing folder,
    # so make sure the results folder exists before the first write.
    folder = os.path.dirname(file_path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, mode='a', newline='') as file:
        writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(header)


## Reading preprocessed dataset

In [12]:
# Build the path of the preprocessed file selected by `dataset`, load it, and display it
dataset_csv = 'Datasets/Real_data/Compas/' + dataset + '.csv'
df = pd.read_csv(dataset_csv)
df


Unnamed: 0,sex,race,age_cat,priors_count,v_decile_score
0,1,0,0,0,1
1,1,0,2,1,0
2,1,1,0,1,0
3,0,1,0,0,1
4,1,1,2,0,0
...,...,...,...,...,...
5910,1,0,0,0,0
5911,1,0,2,0,0
5912,1,0,2,0,0
5913,1,0,2,0,0


In [13]:
# Joint distribution of (race, priors_count, v_decile_score) in the dataset.
# Naming used below: A = race, X = priors_count, Y = v_decile_score.

def _group_size(a, x_val, y_val):
    """Return the number of rows of `df` with race == a, priors_count == x_val and v_decile_score == y_val."""
    mask = (df['race'] == a) & (df['priors_count'] == x_val) & (df['v_decile_score'] == y_val)
    return int(mask.sum())


A0_X0_Y0 = _group_size(0, 0, 0)
A0_X1_Y0 = _group_size(0, 1, 0)

A1_X0_Y0 = _group_size(1, 0, 0)
A1_X1_Y0 = _group_size(1, 1, 0)

A0_X0_Y1 = _group_size(0, 0, 1)
A0_X1_Y1 = _group_size(0, 1, 1)

A1_X0_Y1 = _group_size(1, 0, 1)
A1_X1_Y1 = _group_size(1, 1, 1)

# Take the denominator from the data instead of hard-coding the row count,
# so the proportions stay correct if the dataset changes.
n_rows = len(df)

group_names = ['A0_X0_Y0', 'A0_X1_Y0', 'A1_X0_Y0', 'A1_X1_Y0',
               'A0_X0_Y1', 'A0_X1_Y1', 'A1_X0_Y1', 'A1_X1_Y1']
l = [A0_X0_Y0, A0_X1_Y0, A1_X0_Y0, A1_X1_Y0, A0_X0_Y1, A0_X1_Y1, A1_X0_Y1, A1_X1_Y1]

for group_name, group_count in zip(group_names, l):
    print(f'{group_name}: {group_count/n_rows}\n')

# Total share of the rows covered by the eight groups
x = sum(l)/n_rows

A0_X0_Y0: 0.24970414201183433

A0_X1_Y0: 0.25528317836010145

A1_X0_Y0: 0.15316990701606087

A1_X1_Y0: 0.09873203719357565

A0_X0_Y1: 0.0610312764158918

A0_X1_Y1: 0.03195266272189349

A1_X0_Y1: 0.11868131868131868

A1_X1_Y1: 0.03144547759932375



In [14]:
# Sanity check: sum of the eight group proportions computed in the previous cell (expected: 1.0)
x

1.0

## Results for no privacy setting (baseline setting)

In [15]:
# Baseline experiment: for every seed, split the data, train a random forest on the
# original (non-obfuscated) data, evaluate the fairness metrics and the accuracy on
# the test set, and append one row per seed to the results csv file.

# Name of the setting; part of the results file name
setting = 'NoLDP3'

starttime = time.time()

# Write the header row of the csv file
write_to_csv(setting, dataset, header_nldp)

df_cp = copy.deepcopy(df)

# Per-seed values of every metric
np_sp_min1, np_sp_maj1, np_sd = [], [], []
ndp_csp_minX0, ndp_csp_majX0, np_csd_X0, ndp_csp_minX1, ndp_csp_majX1, np_csd_X1 = [], [], [], [], [], []
np_eo_min1, np_eo_maj1, np_eod, acc = [], [], [], []

for seed in range(nb_seed):
    print(seed)
    np.random.seed(seed)  # for reproducibility

    # Use original datasets
    X = copy.deepcopy(df_cp.drop(target, axis=1))
    y = copy.deepcopy(df_cp[target])

    # Train test splitting
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed
    )

    # Instantiate and train model
    model = RandomForestClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy (arguments in the documented order: y_true first, then y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # Group masks on the test set. In this notebook "min" is protected_att == 0
    # and "maj" is protected_att == 1; X0 / X1 is priors_count == 0 / 1.
    is_min, is_maj = X_test[protected_att] == 0, X_test[protected_att] == 1
    is_X0, is_X1 = X_test['priors_count'] == 0, X_test['priors_count'] == 1

    # Retrieving minority, majority from the test set
    X_test_min, X_test_maj = X_test[is_min], X_test[is_maj]

    # Predicted outcomes for minority, majority
    y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)

    # True outcomes for minority, majority (looked up by row index)
    indices_min, indices_maj = X_test_min.index, X_test_maj.index
    y_test_min, y_test_maj = y_test.get(key=indices_min), y_test.get(key=indices_maj)

    # Needed for the computation of Cond.Stat.Disp:
    # four groups A=0_X=0, A=0_X=1, A=1_X=0, A=1_X=1 from the test set
    X_test_min_X0 = X_test[is_min & is_X0]
    X_test_min_X1 = X_test[is_min & is_X1]
    X_test_maj_X0 = X_test[is_maj & is_X0]
    X_test_maj_X1 = X_test[is_maj & is_X1]

    # Confusion matrix for minority, majority
    conf_matrix_min = confusion_matrix_scorer(y_test_min, y_pred_min)
    conf_matrix_maj = confusion_matrix_scorer(y_test_maj, y_pred_maj)

    # Predicted outcomes for the four groups
    y_pred_A0_X0, y_pred_A0_X1 = model.predict(X_test_min_X0), model.predict(X_test_min_X1)
    y_pred_A1_X0, y_pred_A1_X1 = model.predict(X_test_maj_X0), model.predict(X_test_maj_X1)

    # Compute every fairness metric once and reuse the value for the lists and the csv row
    sp_min, sp_maj = Statistical_parity(y_pred_min), Statistical_parity(y_pred_maj)
    spd = Metric_disparity(sp_maj, sp_min)
    eo_min, eo_maj = Equal_opportunity(conf_matrix_min), Equal_opportunity(conf_matrix_maj)
    eod = Metric_disparity(eo_maj, eo_min)
    csp_min_X0, csp_maj_X0 = Statistical_parity(y_pred_A0_X0), Statistical_parity(y_pred_A1_X0)
    csd_X0 = Metric_disparity(csp_maj_X0, csp_min_X0)
    csp_min_X1, csp_maj_X1 = Statistical_parity(y_pred_A0_X1), Statistical_parity(y_pred_A1_X1)
    csd_X1 = Metric_disparity(csp_maj_X1, csp_min_X1)

    np_sp_min1.append(sp_min)
    np_sp_maj1.append(sp_maj)
    np_sd.append(spd)
    np_eo_min1.append(eo_min)
    np_eo_maj1.append(eo_maj)
    np_eod.append(eod)
    ndp_csp_minX0.append(csp_min_X0)
    ndp_csp_majX0.append(csp_maj_X0)
    np_csd_X0.append(csd_X0)
    ndp_csp_minX1.append(csp_min_X1)
    ndp_csp_majX1.append(csp_maj_X1)
    np_csd_X1.append(csd_X1)
    acc.append(accuracy)

    # Writing the results to the csv file (same column order as header_nldp)
    write_to_csv(setting, dataset, [
        str(seed),
        sp_maj, sp_min, spd,
        eo_maj, eo_min, eod,
        csp_maj_X0, csp_min_X0, csd_X0,
        csp_maj_X1, csp_min_X1, csd_X1,
        accuracy,
    ])

print('That took {} seconds'.format(time.time() - starttime)) 


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
That took 107.86753416061401 seconds


## LDP setting 1: Only the protected attribute is obfuscated

In [16]:
# LDP setting 1: for every epsilon and every seed, split the data, obfuscate only the
# protected attribute of the TRAINING set with GRR, train a random forest, evaluate the
# fairness metrics and the accuracy on the untouched test set, and append one row per
# (seed, epsilon) to the results csv file.
print("\n========= Setting 1: Local DP: only the sensitive feature is obfuscated =========\n")

# Name of the setting; part of the results file name
setting = 'sLDP3'

starttime = time.time()

# Write the header row of the csv file
write_to_csv(setting, dataset, header_ldp)

df_cp = copy.deepcopy(df)

# Per-epsilon aggregates: mean over the seeds, and standard deviation over the seeds
# for the *_std lists. The *_oa_* lists are created but not populated in this cell.
sldp_sp_min1, sldp_sp_maj1, sldp_sd, sd_std = [], [], [], []
sldp_csp_minX0, sldp_csp_majX0, sldp_csd_X0, csd_X0_std, sldp_csp_minX1, sldp_csp_majX1, sldp_csd_X1, csd_X1_std = [], [], [], [], [], [], [], []
sldp_oa_min1, sldp_oa_maj1, sldp_oad, oad_std = [], [], [], []
sldp_eo_min1, sldp_eo_maj1, sldp_eod, eod_std, acc = [], [], [], [], []

# Attribute's domain size (does not depend on the seed or on epsilon)
k = len(set(df_cp[protected_att]))

for epsilon in lst_eps:
    print(epsilon)

    # Per-seed values for the current epsilon
    ldp_sp_min, ldp_sp_maj, ldp_sd = [], [], []
    ldp_csp_minX0, ldp_csp_majX0, ldp_csd_X0, ldp_csp_minX1, ldp_csp_majX1, ldp_csd_X1 = [], [], [], [], [], []
    ldp_oa_min, ldp_oa_maj, ldp_oad = [], [], []
    ldp_eo_min, ldp_eo_maj, ldp_eod, ldp_acc = [], [], [], []

    for seed in range(nb_seed):
        # NOTE(review): unlike the baseline cell, NumPy's global seed is not set here
        # (the np.random.seed(seed) call was commented out), so anything drawing from
        # the global generator is not reproducible across runs. Confirm this is intended.

        # Preparing X and y using pandas
        X = copy.deepcopy(df_cp.drop(target, axis=1))
        y = copy.deepcopy(df_cp[target])

        # Train test splitting
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, shuffle=True, stratify=y, random_state=seed
        )

        # Applying GRR to the protected attribute of the training set
        X_train[protected_att] = X_train[protected_att].apply(lambda x: GRR_Client(x, k, epsilon))

        # Instantiate and train model
        model = RandomForestClassifier(n_jobs=-1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)  # prediction of the actual samples

        accuracy = accuracy_score(y_test, y_pred)

        # Group masks on the test set. In this notebook "min" is protected_att == 0
        # and "maj" is protected_att == 1; X0 / X1 is priors_count == 0 / 1.
        is_min, is_maj = X_test[protected_att] == 0, X_test[protected_att] == 1
        is_X0, is_X1 = X_test['priors_count'] == 0, X_test['priors_count'] == 1

        # Retrieving minority, majority from the test set
        X_test_min, X_test_maj = X_test[is_min], X_test[is_maj]

        # Predicted outcomes for minority, majority
        y_pred_min, y_pred_maj = model.predict(X_test_min), model.predict(X_test_maj)

        # True outcomes for minority, majority (looked up by row index)
        indices_min, indices_maj = X_test_min.index, X_test_maj.index
        y_test_min, y_test_maj = y_test.get(key=indices_min), y_test.get(key=indices_maj)

        # Needed for the computation of Cond.Stat.Disp:
        # four groups A=0_X=0, A=0_X=1, A=1_X=0, A=1_X=1 from the test set
        X_test_min_X0 = X_test[is_min & is_X0]
        X_test_min_X1 = X_test[is_min & is_X1]
        X_test_maj_X0 = X_test[is_maj & is_X0]
        X_test_maj_X1 = X_test[is_maj & is_X1]

        # Confusion matrix for minority, majority
        conf_matrix_min = confusion_matrix_scorer(y_test_min, y_pred_min)
        conf_matrix_maj = confusion_matrix_scorer(y_test_maj, y_pred_maj)

        # Predicted outcomes for the four groups
        y_pred_A0_X0, y_pred_A0_X1 = model.predict(X_test_min_X0), model.predict(X_test_min_X1)
        y_pred_A1_X0, y_pred_A1_X1 = model.predict(X_test_maj_X0), model.predict(X_test_maj_X1)

        # Compute every fairness metric once and reuse the value for the lists and the csv row
        sp_min, sp_maj = Statistical_parity(y_pred_min), Statistical_parity(y_pred_maj)
        spd = Metric_disparity(sp_maj, sp_min)
        eo_min, eo_maj = Equal_opportunity(conf_matrix_min), Equal_opportunity(conf_matrix_maj)
        eod = Metric_disparity(eo_maj, eo_min)
        csp_min_X0, csp_maj_X0 = Statistical_parity(y_pred_A0_X0), Statistical_parity(y_pred_A1_X0)
        csd_X0 = Metric_disparity(csp_maj_X0, csp_min_X0)
        csp_min_X1, csp_maj_X1 = Statistical_parity(y_pred_A0_X1), Statistical_parity(y_pred_A1_X1)
        csd_X1 = Metric_disparity(csp_maj_X1, csp_min_X1)

        ldp_sp_min.append(sp_min)
        ldp_sp_maj.append(sp_maj)
        ldp_sd.append(spd)

        ldp_eo_min.append(eo_min)
        ldp_eo_maj.append(eo_maj)
        ldp_eod.append(eod)

        ldp_csp_minX0.append(csp_min_X0)
        ldp_csp_majX0.append(csp_maj_X0)
        ldp_csd_X0.append(csd_X0)

        ldp_csp_minX1.append(csp_min_X1)
        ldp_csp_majX1.append(csp_maj_X1)
        ldp_csd_X1.append(csd_X1)

        ldp_acc.append(accuracy)

        # Writing the results to the csv file (same column order as header_ldp)
        write_to_csv(setting, dataset, [
            str(seed), str(epsilon),
            sp_maj, sp_min, spd,
            eo_maj, eo_min, eod,
            csp_maj_X0, csp_min_X0, csd_X0,
            csp_maj_X1, csp_min_X1, csd_X1,
            accuracy,
        ])

    # Aggregate over the seeds for this epsilon. The standard deviations are taken
    # over the per-seed values (ldp_*), not over the running list of means (sldp_*).
    sldp_sp_min1.append(np.mean(ldp_sp_min))
    sldp_sp_maj1.append(np.mean(ldp_sp_maj))
    sldp_sd.append(np.mean(ldp_sd))
    sd_std.append(np.std(ldp_sd))
    sldp_csp_minX0.append(np.mean(ldp_csp_minX0))
    sldp_csp_majX0.append(np.mean(ldp_csp_majX0))
    sldp_csd_X0.append(np.mean(ldp_csd_X0))
    csd_X0_std.append(np.std(ldp_csd_X0))
    sldp_csp_minX1.append(np.mean(ldp_csp_minX1))
    sldp_csp_majX1.append(np.mean(ldp_csp_majX1))
    sldp_csd_X1.append(np.mean(ldp_csd_X1))
    csd_X1_std.append(np.std(ldp_csd_X1))
    sldp_eo_min1.append(np.mean(ldp_eo_min))
    sldp_eo_maj1.append(np.mean(ldp_eo_maj))
    sldp_eod.append(np.mean(ldp_eod))
    eod_std.append(np.std(ldp_eod))
    # Mean accuracy over the seeds for this epsilon
    acc.append(np.mean(ldp_acc))

print('That took {} seconds'.format(time.time() - starttime)) 



50
16
8
5
4
3
2
1
0.5
That took 1025.6451888084412 seconds
