# FAIR POST-PROCESSING

This notebook implements the Reject Option Classification post-processor [(Kamiran et al. 2012)](https://ieeexplore.ieee.org/abstract/document/6413831).

The notebook applies the reject option classification post-processor to classifier predictions. It loads the data exported in `code_00_partitioning.ipynb` and the predictions of base classifiers produced in `code_08_postprocess1.R`. The post-processed predictions are exported as CSV files. A further analysis of the processor outputs is performed in `code_12_postprocess5.R`.

## 1. Parameters and preparations

In [1]:
##### PARAMETERS

# working paths
# (runs the shared path script inside this kernel; later cells read `data_path`,
#  `res_path` and `func_path`, which are not assigned anywhere in this notebook,
#  so they are presumably defined by this script -- TODO confirm)
%run 'code_00_working_paths.py'

# specify data set
# one of ['bene', 'german', 'uk', 'taiwan', 'pkdd', 'gmsc', 'homecredit']
# (the value is used as the file-name prefix of every pickle / CSV read or written below)
data = 'taiwan' 

# partitioning
# num_folds: number of fold-specific data / prediction files looped over during modeling
# seed:      value handed to np.random.seed
num_folds = 5
seed      = 1

In [2]:
##### POST-PROCESSOR PARAMS

# fairness bounds to try: each value b is passed to the post-processor
# as metric_ub = b and metric_lb = -b
all_bound = [0.1, 0.2, 0.3]

# fairness metric constrained by the post-processor
metric_name = 'Statistical parity difference'

# search-grid sizes forwarded to RejectOptionClassification
num_class_thresh = 100
num_ROC_margin = 50

In [None]:
##### PACKAGES

import sys
sys.path.append(func_path)

import pickle
import numpy as np
import pandas as pd  # used as `pd` in the modeling cell; imported explicitly instead of relying on the star import below
import time

from load_data import *

from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.postprocessing.reject_option_classification import RejectOptionClassification

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MaxAbsScaler

import matplotlib.pyplot as plt

## 2. Data import

In [4]:
##### RANDOM SEED

# seed NumPy's global random number generator with the `seed` parameter
np.random.seed(seed)

In [5]:
##### LOAD PARTITIONING

# Load the pickled original test partition of the selected data set.
# The file is opened in a `with` block so the handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code -- only load files written by this project's own pipeline.
with open(data_path + 'prepared/' + data + '_orig_test.pkl', 'rb') as test_file:
    dataset_orig_test = pickle.load(test_file)

# keep the first element returned by convert_to_dataframe() and report its shape
te = dataset_orig_test.convert_to_dataframe()[0]
print(te.shape)

(15000, 186)


In [6]:
##### DATA PREP

# name of the protected attribute column
protected = 'AGE'

# group definitions handed to the post-processor, keyed by the protected attribute:
# value 0 marks the unprivileged group, value 1 the privileged group
unprivileged_groups = [{protected: 0}]
privileged_groups   = [{protected: 1}]

## 3. Fair processing

In [7]:
##### MODELING
#
# For every fold, every fairness bound and every base classifier:
#   1. write the base-classifier predictions (imported from R) into copies of the
#      validation and test datasets,
#   2. fit RejectOptionClassification on the validation data,
#   3. post-process the test predictions and export them as one CSV per fold and bound.


def _load_pickle(path):
    """Load a single pickled object from `path` and close the file afterwards.

    NOTE: unpickling can execute arbitrary code -- only use this on files
    written by this project's own pipeline.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _extract_preds(results, model, expected_rows):
    """Return base-classifier predictions as column vectors.

    Parameters
    ----------
    results : pandas.DataFrame
        Prediction results holding the columns `<model>_scores` and `<model>_class`.
    model : str
        Name of the base classifier (column prefix).
    expected_rows : int
        Number of rows of the dataset the predictions will be written to.

    Returns
    -------
    (scores, labels) : tuple of numpy arrays, each of shape (n_rows, 1)
        `labels` is 1.0 where the predicted class is 'Good' and 2.0 otherwise.

    Raises
    ------
    ValueError
        If the number of predictions differs from `expected_rows`.
    """
    scores = np.array(results[model + '_scores']).reshape(-1, 1)
    labels = np.where(results[model + '_class'] == 'Good', 1.0, 2.0).reshape(-1, 1)
    if scores.shape[0] != expected_rows:
        raise ValueError('Predictions of "' + model + '" have ' + str(scores.shape[0]) +
                         ' rows, but the dataset has ' + str(expected_rows) + ' rows')
    return scores, labels


# timer
cv_start = time.time()

# base models
model_names = ['glm', 
               "rf", 
               "xgbTree", 
               "nnet"]

# loop through folds
for fold in range(num_folds):
    
    ##### LOAD DATA
    
    # feedback
    print('-'*30)
    print('- FOLD ' + str(fold) + '...')
    print('-'*30)

    # import data (file handles are closed inside _load_pickle)
    # data_train is not used further in this cell; it is still loaded so that the
    # variable stays available in the notebook namespace as before
    fold_data_prefix = data_path + 'prepared/' + data + '_scaled_' + str(fold)
    data_train = _load_pickle(fold_data_prefix + '_train.pkl')
    data_valid = _load_pickle(fold_data_prefix + '_valid.pkl')
    data_test  = _load_pickle(fold_data_prefix + '_test.pkl')


    ##### MODELING
    
    # import prediction results from R
    fold_res_prefix = res_path + 'intermediate/' + data + '_' + str(fold)
    dataset_trainResults_valid = pd.read_csv(fold_res_prefix + '_POST_training_results_dval.csv')
    dataset_trainResults_test  = pd.read_csv(fold_res_prefix + '_POST_training_results_dtest.csv')
    
    # copy preds
    dataset_orig_valid_pred = data_valid.copy(deepcopy = True)
    dataset_orig_test_pred  = data_test.copy(deepcopy  = True)
    
    # row counts the imported predictions must match
    num_valid_rows = data_valid.labels.shape[0]
    num_test_rows  = data_test.labels.shape[0]
    
    
    # loop through bound values
    for bound in all_bound:
        
        # feedback
        print('-- BOUND ' + str(bound) + '...')
    
        # placeholder
        ROC_test = pd.DataFrame()

        # loop through base classifiers
        for m in model_names:

            # extract validation and test preds (fresh arrays for every fit)
            scores_valid, labels_valid = _extract_preds(dataset_trainResults_valid, m, num_valid_rows)
            scores_test,  labels_test  = _extract_preds(dataset_trainResults_test,  m, num_test_rows)

            # write predictions
            dataset_orig_valid_pred.scores = scores_valid
            dataset_orig_valid_pred.labels = labels_valid
            dataset_orig_test_pred.scores  = scores_test
            dataset_orig_test_pred.labels  = labels_test

            # fit ROC: the fairness metric is constrained to [-bound, bound]
            ROC = RejectOptionClassification(unprivileged_groups = unprivileged_groups, 
                                             privileged_groups   = privileged_groups, 
                                             num_class_thresh    = num_class_thresh, 
                                             num_ROC_margin      = num_ROC_margin,
                                             metric_name         = metric_name,
                                             metric_ub           = bound, 
                                             metric_lb           = -bound)
            ROC = ROC.fit(data_valid, dataset_orig_valid_pred)

            # predict test scores
            dataset_transf_test_pred    = ROC.predict(dataset_orig_test_pred)
            ROC_test[m + "_fairScores"] = dataset_transf_test_pred.scores.flatten()
            # labels are flattened like the scores so that a 1-D column is assigned
            label_names                 = np.where(dataset_transf_test_pred.labels == 1, 'Good', 'Bad').flatten()
            ROC_test[m + "_fairLabels"] = label_names

        # export CSV (index = False: do not write the row index)
        ROC_test.to_csv(fold_res_prefix + '_ROC_' + str(bound) + '_predictions_test.csv', index = False, header = True)
        
    # feedback
    print('')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

------------------------------
- FOLD 0...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 1...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 2...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 3...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...

------------------------------
- FOLD 4...
------------------------------
-- BOUND 0.1...
-- BOUND 0.2...
-- BOUND 0.3...


Finished in 60.05 minutes
