#### This notebook demonstrates the use of the equalized-odds post-processing (EOP) algorithm for bias mitigation.


In [37]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm
from warnings import warn

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric, BinaryLabelDatasetMetric
from eq_odds_postprocessing import EqOddsPostprocessing
from common_utils import compute_metrics

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider
import pandas as pd
import pickle
from sklearn.linear_model import Lasso
from random import sample

## Huangrui's Dataset 

In [38]:
def code_continuous(df,collist,Nlevel):
    """Replace continuous columns with quantile-threshold indicator columns, in place.

    For every column named in ``collist`` and every ``q`` in ``1 .. Nlevel-1``,
    the ``q/Nlevel`` quantile of the column's non-NaN values is used as a
    threshold, and a float 0/1 column ``<col>_geq<q>q<threshold>`` is added that
    marks rows whose value is ``>=`` that threshold. The source columns are then
    dropped from ``df``.

    Args:
        df: DataFrame to modify in place.
        collist: Names of the columns to encode.
        Nlevel: Number of quantile levels; ``Nlevel - 1`` indicators are
            created per column.

    Returns:
        None. ``df`` is mutated.
    """
    for name in collist:
        column = df[name]
        # Quantiles are taken over the observed (non-NaN) values only.
        observed = column[~np.isnan(column)]
        for level in range(1, Nlevel):
            cut = observed.quantile(float(level)/Nlevel)
            indicator_name = name+'_geq'+str(int(level))+'q'+str(cut)
            # NaN entries compare as False and therefore become 0.0.
            df[indicator_name] = (column >= cut).astype(float)
    df.drop(collist,axis = 1, inplace = True)
    
class Zindi(StandardDataset):
    """Financial-inclusion-in-Africa (Zindi) dataset loaded from a CSV file.

    Columns with more than two distinct values and a maximum above 1 are
    treated as continuous and replaced by quantile indicator columns
    (``code_continuous`` with 5 levels) before the frame is handed to
    ``StandardDataset``.
    """

    def __init__(self, path, label_name='Y', favorable_classes=[1],
                 protected_attribute_names=['sex'],
                 privileged_classes=[[1]],
                 instance_weights_name=None,
                 categorical_features=[],
                 features_to_drop=[],
                 features_to_keep=[],
                 na_values=[], custom_preprocessing=None,
                 metadata=None):
        """Read the CSV at ``path``, encode continuous columns and build the dataset.

        Args:
            path: Location of the CSV file, passed to ``pd.read_csv``.

        All remaining arguments are forwarded unchanged to
        ``StandardDataset.__init__``; see :obj:`StandardDataset` for their
        meaning.
        """
        frame = pd.read_csv(path)

        # Collect the columns to discretise: non-binary columns whose largest value exceeds 1.
        continuous_columns = []
        for column in frame.columns:
            values = frame[column]
            if len(values.unique()) > 2 and max(values) > 1:
                continuous_columns.append(column)
        code_continuous(frame, continuous_columns, 5)

        super(Zindi, self).__init__(
            df=frame,
            label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop,
            na_values=na_values,
            custom_preprocessing=custom_preprocessing,
            metadata=metadata)

#### Load dataset and specify options

In [39]:
# Protected-attribute values identifying the two groups that are compared.
unprivileged_groups = [{'sex': 0}]
privileged_groups = [{'sex': 1}]

# Random seed for the equalized-odds post-processing; also seeds NumPy's global generator.
random_seed = 12345679
np.random.seed(random_seed)

# Fairness metrics that may be selected through `metric_name`.
allowed_metrics = [
    "Statistical parity difference",
    "Average odds difference",
    "Equal opportunity difference",
]

# Metric used; rejected below unless it is listed in `allowed_metrics`.
metric_name = "Equal opportunity difference"
if not (metric_name in allowed_metrics):
    raise ValueError("Metric name should be one of allowed metrics")

#### Load each train/test fold, subsample the training data, then fit and evaluate EOP

In [47]:
# Results keyed by "K = <fold>, budget = <budget>":
#   experiments_info      -> metrics after EOP post-processing
#   bef_experiments_info  -> metrics of the base classifier before post-processing
experiments_info = {}
bef_experiments_info = {}
# Fraction of each training fold passed to `split` below (0.01 == 1%, not 0.01%).
budget = 0.01


def sigmoid(x):
    """Squash raw model outputs into (0, 1); shifted so an output of 0.5 maps to a score of 0.5."""
    return 1 / (1 + np.exp(0.5-x))


for K in range(1, 6):
    dataset_orig_train = Zindi(path="./Huangrui/zindi/zindi_train{}.csv".format(K),
                               protected_attribute_names=['sex'],
                               privileged_classes=[[1]])
    dataset_orig_test = Zindi(path="./Huangrui/zindi/zindi_test{}.csv".format(K),
                              protected_attribute_names=['sex'],
                              privileged_classes=[[1]])
    # Split the training fold at the `budget` fraction (unshuffled) and keep only the first part.
    dataset_orig_train, _ = dataset_orig_train.split([budget], shuffle=False)

    # Pre-trained base model for this fold and its predictions on the training subset.
    X_train = dataset_orig_train.features
    y_train = dataset_orig_train.labels.ravel()
    # NOTE(review): pickle.load can execute arbitrary code; only load model files from a trusted source.
    with open('experiments/zindi'+str(K)+'_sex_bmodel.pkl','rb') as model_file:
        lmod = pickle.load(model_file)["clf"]

    y_train_pred = lmod.predict(X_train)

    dataset_orig_train_pred = dataset_orig_train.copy(deepcopy=True)
    # Threshold the raw outputs at 0.5; labels are stored as an (n, 1) column,
    # the same layout used for the test predictions below.
    dataset_orig_train_pred.labels = (y_train_pred>0.5).reshape(-1,1)
    dataset_orig_train_pred.scores = sigmoid(y_train_pred).reshape(-1,1)

    # Base-model predictions on the test fold (the model is evaluated once and reused).
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = dataset_orig_test.features
    y_test = dataset_orig_test.labels
    y_test_pred = lmod.predict(X_test)
    dataset_orig_test_pred.scores = sigmoid(y_test_pred).reshape(-1,1)
    dataset_orig_test_pred.labels = (y_test_pred>0.5).reshape(-1,1)

    # Create the EOP model and fit it on the (true, predicted) training subset.
    EOP = EqOddsPostprocessing(unprivileged_groups=unprivileged_groups,
                               privileged_groups=privileged_groups,
                               seed=random_seed)
    EOP = EOP.fit(dataset_orig_train, dataset_orig_train_pred)
    print("Fitted the EOP model")
    # Get the EOP predictions for the test fold (transform the test predictions).
    dataset_transf_test_pred = EOP.predict(dataset_orig_test_pred)

    # Metrics before and after post-processing.
    metric_test_bef = compute_metrics(dataset_orig_test, dataset_orig_test_pred,
                                      unprivileged_groups, privileged_groups)
    metric_test_aft = compute_metrics(dataset_orig_test, dataset_transf_test_pred,
                                      unprivileged_groups, privileged_groups)

    # The error is computed here directly as the plain misclassification rate;
    # it is NOT the balanced accuracy reported by compute_metrics.
    run_key = "K = {}, budget = {}".format(K, budget)
    error_bef = np.mean(dataset_orig_test.labels!=dataset_orig_test_pred.labels)
    error_aft = np.mean(dataset_orig_test.labels!=dataset_transf_test_pred.labels)

    print(run_key)
    print("The Error for the test dataset is {:.4}".format(error_aft))
    print("The Equal opportunity difference for the test dataset is {:.4}".format(metric_test_aft["Equal opportunity difference"]))
    experiments_info[run_key] = {"Error": error_aft,
                                 "Equal opportunity difference": metric_test_aft["Equal opportunity difference"]}
    bef_experiments_info[run_key] = {"Error": error_bef,
                                     "Equal opportunity difference": metric_test_bef["Equal opportunity difference"]}

Fitted the EOP model
Balanced accuracy = 0.5726
Statistical parity difference = -0.0372
Disparate impact = 0.3022
Average odds difference = -0.0625
Equal opportunity difference = -0.1179
Theil index = 0.1327
Balanced accuracy = 0.5631
Statistical parity difference = -0.0488
Disparate impact = 0.1631
Average odds difference = -0.0886
Equal opportunity difference = -0.1612
Theil index = 0.1360
K = 1, budget = 0.01
The Error for the test dataset is 0.1335
The Equal opportunity difference for the test dataset is -0.1612
Fitted the EOP model
Balanced accuracy = 0.6481
Statistical parity difference = -0.0485
Disparate impact = 0.4949
Average odds difference = -0.0426
Equal opportunity difference = -0.0600
Theil index = 0.1091
Balanced accuracy = 0.6292
Statistical parity difference = -0.0317
Disparate impact = 0.5998
Average odds difference = -0.0023
Equal opportunity difference = 0.0167
Theil index = 0.1148
K = 2, budget = 0.01
The Error for the test dataset is 0.1216
The Equal opportunity 

In [48]:
# Per-fold test error and equal opportunity difference of the base predictions, before EOP post-processing.
bef_experiments_info

{'K = 1, budget = 0.01': {'Error': 0.13010204081632654,
  'Equal opportunity difference': -0.11790285526747622},
 'K = 2, budget = 0.01': {'Error': 0.11732199787460149,
  'Equal opportunity difference': -0.06001077655095577},
 'K = 3, budget = 0.01': {'Error': 0.1179596174282678,
  'Equal opportunity difference': -0.08358662613981763},
 'K = 4, budget = 0.01': {'Error': 0.1179596174282678,
  'Equal opportunity difference': -0.11075930342077311},
 'K = 5, budget = 0.01': {'Error': 0.1226354941551541,
  'Equal opportunity difference': -0.07465328001361354}}

In [49]:
# Per-fold test error and equal opportunity difference after EOP post-processing.
experiments_info

{'K = 1, budget = 0.01': {'Error': 0.13350340136054423,
  'Equal opportunity difference': -0.16122415490646538},
 'K = 2, budget = 0.01': {'Error': 0.12157279489904357,
  'Equal opportunity difference': 0.01668538864078467},
 'K = 3, budget = 0.01': {'Error': 0.798087141339001,
  'Equal opportunity difference': -0.17885339603525074},
 'K = 4, budget = 0.01': {'Error': 0.13177470775770456,
  'Equal opportunity difference': -0.05934750806597061},
 'K = 5, budget = 0.01': {'Error': 0.13007438894792775,
  'Equal opportunity difference': -0.13676508125584957}}