# About this notebook

This notebook extracts the initial biased model (the human decision-maker) and gets its predictions ($Y_{DM}$) for all data.

For each of the four datasets, we first build the training set (by concatenating the original train and validation sets) and the test set.

Then we split the training set into two parts (train/valid), with 80% used for training and 20% for validation, as the authors describe in Appendix C (page 17) of "Predict Responsibly: Improving Fairness and Accuracy by
Learning to Defer".

We extract "attr", the protected column, but do not drop it from X.

Even though the flag ["use_attr": false] is set in the model_name.json file, our X still includes the protected feature, so in practice the model does use "attr". We set the flag to false and keep the column so that X is the same for all the models.

We then save the new data as an npz file, which is the data format required by predict-responsibly.



In [1]:
import pandas as pd
import numpy as np
import pickle
from random import sample,random

# Adult Sex

In [3]:
# Build one predict-responsibly .npz file per fold (K = 1..5) for Adult / sex.

# Fixed base seed so the 80/20 split is reproducible across kernel restarts.
SPLIT_SEED = 0

# Squashes the decision-maker's prediction into a soft score: an input of 0
# maps to 1/(1+e^2) ~= 0.12 and an input of 1 maps to 1/(1+e^-2) ~= 0.88.
# Loop-invariant, so it is defined once.
sigmoid = lambda x: 1 / (1 + np.exp(2-4*x))

for K in range(1,6):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    # NOTE(review): the other dataset cells read from 'experiments/' (plural) -- confirm this path.
    with open('experiment/adult{}_sex.pkl'.format(K),'rb') as f:
        db = pickle.load(f)
    Ytrain = db['Ytrain']
    Yval = db['Yval']
    Ytest = db['Ytest']
    clf = db['clf']
    Ybtrain = db['Ybtrain'] # this is the predicted label of the biased decision-maker on training data
    Ybtest = db['Ybtest']
    Ybval = db['Ybval']
    Xtrain = db['Xtrain']
    Xtest = db['Xtest']
    Xval = db['Xval']

    # Soft decision-maker scores for every split.
    ydm_train = sigmoid(clf.predict(Xtrain))
    ydm_test = sigmoid(clf.predict(Xtest))
    ydm_val = sigmoid(clf.predict(Xval))

    new_data = {}

    # The saved training arrays are train+val concatenated, so the fresh 80/20
    # split has to index that whole range; otherwise the appended validation
    # rows can never be selected.
    n_total = len(Ytrain) + len(Yval)
    rng = np.random.RandomState(SPLIT_SEED + K)
    train_inds = rng.choice(n_total,size=int(n_total*0.8),replace=False)
    # Sorted complement of train_inds (linear-time replacement for the
    # per-element membership test).
    valid_inds = np.setdiff1d(np.arange(n_total),train_inds)
    #We cannot give IDK the same train,valid inds because IDK performs super poorly on the validation set

    new_data['y_train'] = np.concatenate([Ytrain,Yval]).reshape(-1,1)
    new_data['y_test'] = Ytest.reshape(-1,1)
    new_data['ydm_train'] = np.concatenate([ydm_train,ydm_val]).reshape(-1,1)
    new_data['ydm_test'] = ydm_test.reshape(-1,1)
    new_data["train_inds"] = train_inds
    new_data["valid_inds"] = valid_inds

    # Protected attribute, kept as its own column vector (it also stays inside X).
    attr_train = np.concatenate([Xtrain["sex_Female"],Xval["sex_Female"]]).reshape(-1,1)
    attr_test = np.array(Xtest["sex_Female"]).reshape(-1,1)
    xtrain = Xtrain.to_numpy()
    xval = Xval.to_numpy()
    xtest = Xtest.to_numpy()
    # Train features/attributes are train+val, row-aligned with y_train and
    # ydm_train. Test rows must not be appended here.
    new_data["x_train"] = np.concatenate([xtrain,xval])
    new_data["x_test"] = xtest
    new_data["attr_train"] = attr_train
    new_data["attr_test"] = attr_test

    # Every training array must describe the same rows.
    n_rows = {k: len(new_data[k]) for k in ("x_train","y_train","ydm_train","attr_train")}
    assert set(n_rows.values()) == {n_total}, n_rows

    #save new data as npz file
    np.savez("data/adult/adult{}_sex.npz".format(K),**new_data)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# COMPAS Sex

recidivism4_sex_prob_query

In [3]:
# Build one predict-responsibly .npz file per fold (K = 1..5) for COMPAS / sex.
# Labels and decision-maker scores are flipped (1 - value) for this dataset.

# Fixed base seed so the 80/20 split is reproducible across kernel restarts.
SPLIT_SEED = 0

# Squashes the decision-maker's prediction into a soft score: an input of 0
# maps to 1/(1+e^2) ~= 0.12 and an input of 1 maps to 1/(1+e^-2) ~= 0.88.
# Loop-invariant, so it is defined once.
sigmoid = lambda x: 1 / (1 + np.exp(2-4*x))

for K in range(1,6):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open('experiments/recidivism{}_sex.pkl'.format(K),'rb') as f:
        db = pickle.load(f)
    #flip labels
    Ytrain = 1 - db['Ytrain']
    Yval = 1 - db['Yval']
    Ytest = 1 - db['Ytest']
    clf = db['clf']
    Ybtrain = db['Ybtrain'] # this is the predicted label of the biased decision-maker on training data
    Ybtest = db['Ybtest']
    Ybval = db['Ybval']
    Xtrain = db['Xtrain']
    Xtest = db['Xtest']
    Xval = db['Xval']

    #flip predictions
    ydm_train = 1 - sigmoid(clf.predict(Xtrain))
    ydm_test = 1 - sigmoid(clf.predict(Xtest))
    ydm_val = 1 - sigmoid(clf.predict(Xval))

    new_data = {}

    # The saved training arrays are train+val concatenated, so the fresh 80/20
    # split has to index that whole range; otherwise the appended validation
    # rows can never be selected.
    n_total = len(Ytrain) + len(Yval)
    rng = np.random.RandomState(SPLIT_SEED + K)
    train_inds = rng.choice(n_total,size=int(n_total*0.8),replace=False)
    # Sorted complement of train_inds (linear-time replacement for the
    # per-element membership test).
    valid_inds = np.setdiff1d(np.arange(n_total),train_inds)
    #We cannot give IDK the same train,valid inds because IDK performs super poorly on the validation set

    new_data['y_train'] = np.concatenate([Ytrain,Yval]).reshape(-1,1)
    new_data['y_test'] = Ytest.reshape(-1,1)
    new_data['ydm_train'] = np.concatenate([ydm_train,ydm_val]).reshape(-1,1)
    new_data['ydm_test'] = ydm_test.reshape(-1,1)
    new_data["train_inds"] = train_inds
    new_data["valid_inds"] = valid_inds

    # Protected attribute, kept as its own column vector (it also stays inside X).
    attr_train = np.concatenate([Xtrain["Probationerssex_Female"],Xval["Probationerssex_Female"]]).reshape(-1,1)
    attr_test = np.array(Xtest["Probationerssex_Female"]).reshape(-1,1)
    xtrain = Xtrain.to_numpy()
    xval = Xval.to_numpy()
    xtest = Xtest.to_numpy()
    # Train features/attributes are train+val, row-aligned with y_train and
    # ydm_train. Test rows must not be appended here.
    new_data["x_train"] = np.concatenate([xtrain,xval])
    new_data["x_test"] = xtest
    new_data["attr_train"] = attr_train
    new_data["attr_test"] = attr_test

    # Every training array must describe the same rows.
    n_rows = {k: len(new_data[k]) for k in ("x_train","y_train","ydm_train","attr_train")}
    assert set(n_rows.values()) == {n_total}, n_rows

    #save new data as npz file
    np.savez("data/compas/recidivism{}_sex.npz".format(K),**new_data)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


# Zindi Sex

In [4]:
# Build one predict-responsibly .npz file per fold (K = 1..5) for Zindi / sex.

# Fixed base seed so the 80/20 split is reproducible across kernel restarts.
SPLIT_SEED = 0

# Squashes the decision-maker's prediction into a soft score: an input of 0
# maps to 1/(1+e^2) ~= 0.12 and an input of 1 maps to 1/(1+e^-2) ~= 0.88.
# Loop-invariant, so it is defined once.
sigmoid = lambda x: 1 / (1 + np.exp(2-4*x))

for K in range(1,6):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open('experiments/zindi{}_sex.pkl'.format(K),'rb') as f:
        db = pickle.load(f)
    Ytrain = db['Ytrain']
    Yval = db['Yval']
    Ytest = db['Ytest']
    clf = db['clf']
    Ybtrain = db['Ybtrain'] # this is the predicted label of the biased decision-maker on training data
    Ybtest = db['Ybtest']
    Ybval = db['Ybval']
    Xtrain = db['Xtrain']
    Xtest = db['Xtest']
    Xval = db['Xval']

    # Soft decision-maker scores for every split.
    ydm_train = sigmoid(clf.predict(Xtrain))
    ydm_test = sigmoid(clf.predict(Xtest))
    ydm_val = sigmoid(clf.predict(Xval))

    new_data = {}

    # The saved training arrays are train+val concatenated, so the fresh 80/20
    # split has to index that whole range; otherwise the appended validation
    # rows can never be selected.
    n_total = len(Ytrain) + len(Yval)
    rng = np.random.RandomState(SPLIT_SEED + K)
    train_inds = rng.choice(n_total,size=int(n_total*0.8),replace=False)
    # Sorted complement of train_inds (linear-time replacement for the
    # per-element membership test).
    valid_inds = np.setdiff1d(np.arange(n_total),train_inds)
    #We cannot give IDK the same train,valid inds because IDK performs super poorly on the validation set

    new_data['y_train'] = np.concatenate([Ytrain,Yval]).reshape(-1,1)
    new_data['y_test'] = Ytest.reshape(-1,1)
    new_data['ydm_train'] = np.concatenate([ydm_train,ydm_val]).reshape(-1,1)
    new_data['ydm_test'] = ydm_test.reshape(-1,1)
    new_data["train_inds"] = train_inds
    new_data["valid_inds"] = valid_inds

    # Protected attribute, kept as its own column vector (it also stays inside X).
    attr_train = np.concatenate([Xtrain["sex"],Xval["sex"]]).reshape(-1,1)
    attr_test = np.array(Xtest["sex"]).reshape(-1,1)
    xtrain = Xtrain.to_numpy()
    xval = Xval.to_numpy()
    xtest = Xtest.to_numpy()
    # Train features/attributes are train+val, row-aligned with y_train and
    # ydm_train. Test rows must not be appended here.
    new_data["x_train"] = np.concatenate([xtrain,xval])
    new_data["x_test"] = xtest
    new_data["attr_train"] = attr_train
    new_data["attr_test"] = attr_test

    # Every training array must describe the same rows.
    n_rows = {k: len(new_data[k]) for k in ("x_train","y_train","ydm_train","attr_train")}
    assert set(n_rows.values()) == {n_total}, n_rows

    #save new data as npz file
    np.savez("data/zindi/zindi{}_sex.npz".format(K),**new_data)



In [5]:
# Sanity check: (rows, columns) of whichever Xtrain the most recently executed
# loop cell left in the kernel (its last fold) -- this relies on leaked loop state.
Xtrain.shape

(15055, 47)

# Bank Age

In [5]:
# Build one predict-responsibly .npz file per fold (K = 1..5) for Bank / age.

# Fixed base seed so the 80/20 split is reproducible across kernel restarts.
SPLIT_SEED = 0

# Squashes the decision-maker's prediction into a soft score: an input of 0
# maps to 1/(1+e^2) ~= 0.12 and an input of 1 maps to 1/(1+e^-2) ~= 0.88.
# Loop-invariant, so it is defined once.
sigmoid = lambda x: 1 / (1 + np.exp(2-4*x))

for K in range(1,6):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open('experiments/bank{}_age.pkl'.format(K),'rb') as f:
        db = pickle.load(f)
    Ytrain = db['Ytrain']
    Yval = db['Yval']
    Ytest = db['Ytest']
    clf = db['clf']
    Ybtrain = db['Ybtrain'] # this is the predicted label of the biased decision-maker on training data
    Ybtest = db['Ybtest']
    Ybval = db['Ybval']
    Xtrain = db['Xtrain']
    Xtest = db['Xtest']
    Xval = db['Xval']

    # Soft decision-maker scores for every split.
    ydm_train = sigmoid(clf.predict(Xtrain))
    ydm_test = sigmoid(clf.predict(Xtest))
    ydm_val = sigmoid(clf.predict(Xval))

    new_data = {}

    # The saved training arrays are train+val concatenated, so the fresh 80/20
    # split has to index that whole range; otherwise the appended validation
    # rows can never be selected.
    n_total = len(Ytrain) + len(Yval)
    rng = np.random.RandomState(SPLIT_SEED + K)
    train_inds = rng.choice(n_total,size=int(n_total*0.8),replace=False)
    # Sorted complement of train_inds (linear-time replacement for the
    # per-element membership test).
    valid_inds = np.setdiff1d(np.arange(n_total),train_inds)
    #We cannot give IDK the same train,valid inds because IDK performs super poorly on the validation set

    new_data['y_train'] = np.concatenate([Ytrain,Yval]).reshape(-1,1)
    new_data['y_test'] = Ytest.reshape(-1,1)
    new_data['ydm_train'] = np.concatenate([ydm_train,ydm_val]).reshape(-1,1)
    new_data['ydm_test'] = ydm_test.reshape(-1,1)
    new_data["train_inds"] = train_inds
    new_data["valid_inds"] = valid_inds

    # Protected attribute, kept as its own column vector (it also stays inside X).
    attr_train = np.concatenate([Xtrain["age"],Xval["age"]]).reshape(-1,1)
    attr_test = np.array(Xtest["age"]).reshape(-1,1)
    xtrain = Xtrain.to_numpy()
    xval = Xval.to_numpy()
    xtest = Xtest.to_numpy()
    # Train features/attributes are train+val, row-aligned with y_train and
    # ydm_train. Test rows must not be appended here.
    new_data["x_train"] = np.concatenate([xtrain,xval])
    new_data["x_test"] = xtest
    new_data["attr_train"] = attr_train
    new_data["attr_test"] = attr_test

    # Every training array must describe the same rows.
    n_rows = {k: len(new_data[k]) for k in ("x_train","y_train","ydm_train","attr_train")}
    assert set(n_rows.values()) == {n_total}, n_rows

    #save new data as npz file
    np.savez("data/bank/bank{}_age.npz".format(K),**new_data)



In [4]:
# Sanity check: (rows, columns) of whichever Xtrain the most recently executed
# loop cell left in the kernel (its last fold) -- this relies on leaked loop state.
# NOTE(review): this cell's execution count is lower than the preceding cell's;
# re-run top to bottom to confirm which dataset this shape belongs to.
Xtrain.shape

(19512, 84)