In [5]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm
from warnings import warn

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric, BinaryLabelDatasetMetric
from aif360.algorithms.postprocessing.reject_option_classification\
        import RejectOptionClassification
from common_utils import compute_metrics

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider
import pandas as pd
import pickle
from sklearn.linear_model import Lasso

## Huangrui's Dataset 

In [6]:
def code_continuous(df,collist,Nlevel):
    """Replace continuous columns by quantile-threshold indicator columns, in place.

    For each column in ``collist`` and each ``q`` in ``1 .. Nlevel-1`` the
    ``q/Nlevel`` quantile of the column's non-NaN values is used as a
    threshold, and a float column named ``<col>_geq<q>q<threshold>`` is added
    that holds 1.0 where the value is >= the threshold and 0.0 elsewhere
    (a NaN entry compares False and therefore becomes 0.0).

    Once all indicator columns exist, the columns listed in ``collist`` are
    dropped. ``df`` itself is modified; the function returns nothing.
    """
    for name in collist:
        values = df[name]
        # Thresholds are computed from the observed (non-NaN) entries only.
        observed = values[~np.isnan(values)]
        for level in range(1, Nlevel):
            cutoff = observed.quantile(level / float(Nlevel))
            indicator_name = ''.join([name, '_geq', str(int(level)), 'q', str(cutoff)])
            df[indicator_name] = (values >= cutoff).astype(float)
    df.drop(collist, axis=1, inplace=True)
    
class BankDataset(StandardDataset):
    """Bank dataset read from a CSV file and wrapped as a ``StandardDataset``.

    NOTE(review): the earlier docstring called this the
    financial-inclusion-in-africa dataset, but the code requires a
    ``cons.conf.idx`` column and the notebook feeds it ``bank_*.csv`` files --
    confirm which dataset is intended.
    """

    def __init__(self, path, label_name='y', favorable_classes=[1],
                 protected_attribute_names=['age'],
                 privileged_classes=[[1]],
                 instance_weights_name=None,
                 categorical_features=[],
                 features_to_drop=[],
                 features_to_keep=[],
                 na_values=[], custom_preprocessing=None,
                 metadata=None):
        """Read ``path``, discretize its continuous columns and build the dataset.

        ``path`` is the CSV file to load. Every other argument is forwarded
        unchanged to ``StandardDataset.__init__``; see that class for their
        meaning.
        """
        frame = pd.read_csv(path)

        # Flip the sign of this column before the continuous-column test.
        # NOTE(review): presumably done so its values are positive and the
        # column passes the ``max > 1`` check below -- confirm.
        frame["cons.conf.idx"] = -frame["cons.conf.idx"]

        # A column counts as continuous when it has more than two distinct
        # values and its maximum exceeds 1.
        continuous_columns = []
        for column in frame.columns:
            if len(frame[column].unique()) > 2 and max(frame[column]) > 1:
                continuous_columns.append(column)

        # Replace each continuous column by indicators at its 1/5 .. 4/5
        # quantiles (modifies ``frame`` in place).
        code_continuous(frame, continuous_columns, 5)

        super(BankDataset, self).__init__(
            df=frame,
            label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop,
            na_values=na_values,
            custom_preprocessing=custom_preprocessing,
            metadata=metadata)

#### Load dataset and specify options

In [7]:
# Metrics this notebook accepts.
allowed_metrics = ["Statistical parity difference",
                   "Average odds difference",
                   "Equal opportunity difference"]

# Metric used (must be one of allowed_metrics; checked at the end of this cell)
metric_name = "Equal opportunity difference"

# Group definitions, keyed on the 'age' attribute.
unprivileged_groups = [{"age": 0}]
privileged_groups = [{"age": 1}]

# Seed NumPy's global random number generator.
random_seed = 12345679
np.random.seed(random_seed)

# Verify metric name
if not (metric_name in allowed_metrics):
    raise ValueError("Metric name should be one of allowed metrics")

#### Split into train, test and validation

In [8]:

for K in range(1, 6):
    # Fold K: load the pre-split train(+validation) and test CSV files.
    dataset_orig_train_valid = BankDataset(path="./Huangrui/bank/bank_train{}.csv".format(K))
    dataset_orig_test = BankDataset(path="./Huangrui/bank/bank_test{}.csv".format(K))
    # NOTE(review): BankDataset computes quantile thresholds per file and
    # embeds them in the generated column names, so the test file's columns
    # may be named (and cut) differently from the train file's -- confirm
    # that applying the train feature names to X_test below is intended.

    # 80% train, 20% valid
    dataset_orig_train, dataset_orig_valid = dataset_orig_train_valid.split([0.8], shuffle=True)

    # Lasso linear classifier and predictions
    # SECURITY: pickle.load can execute arbitrary code stored in the file;
    # only load model files from a trusted source.
    # The file is opened in a `with` block so the handle is closed promptly.
    with open('experiments/bank'+str(K)+'_age_bmodel.pkl','rb') as model_file:
        lmod = pickle.load(model_file)["clf"]

    # training data
    X_train = dataset_orig_train.features
    y_train = dataset_orig_train.labels.ravel()
    y_train_pred = lmod.predict(X_train)>0.5
    # validation data
    X_valid = dataset_orig_valid.features
    y_valid = dataset_orig_valid.labels.ravel()
    y_valid_pred = lmod.predict(X_valid)>0.5
    # test data
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = dataset_orig_test_pred.features
    # Flattened like y_train / y_valid. Previously this was left unraveled,
    # which made elementwise comparisons against the 1-D y_test_pred
    # broadcast to a 2-D array instead of comparing label by label.
    y_test = dataset_orig_test_pred.labels.ravel()
    y_test_pred = lmod.predict(X_test)>0.5

    # make X_train, X_valid, X_test dataframe (all labelled with the train
    # split's feature names)
    X_train = pd.DataFrame(X_train, columns = dataset_orig_train.feature_names)
    X_valid = pd.DataFrame(X_valid, columns = dataset_orig_train.feature_names)
    X_test = pd.DataFrame(X_test, columns = dataset_orig_train.feature_names)

    # save all the data
    db = {}
    db['Xtrain'] = X_train
    db['Ytrain'] = y_train
    db['Xval'] = X_valid
    db['Yval'] = y_valid
    db['Xtest'] = X_test
    db['Ytest'] = y_test
    db['clf'] = lmod
    db['Ybtrain'] = y_train_pred
    db['Ybval'] = y_valid_pred
    db['Ybtest'] = y_test_pred
    # `with` guarantees the pickle is flushed and closed before the next fold.
    with open('experiments/bank'+str(K)+'_age.pkl','wb') as out_file:
        pickle.dump(db, out_file)
   
    

In [9]:
# Load the bundle saved for fold 1 and unpack it into notebook-level names.
# SECURITY: pickle.load can execute arbitrary code stored in the file; only
# load files produced by a trusted run of this notebook.
# The `with` block closes the file handle as soon as the bundle is read.
with open('experiments/'+'bank1_age.pkl','rb') as db_file:
    db = pickle.load(db_file)
Ytrain = db['Ytrain']
Yval = db['Yval']
Ytest = db['Ytest']
clf = db['clf']
Ybtrain = db['Ybtrain'] # this is the predicted label of the biased decision-maker on training data
Ybtest = db['Ybtest']
Ybval = db['Ybval']
Xtrain = db['Xtrain']
Xtest = db['Xtest']
Xval = db['Xval']

In [10]:
# Fraction of validation examples whose stored prediction (Ybval) differs from the label (Yval).
(Ybval != Yval).mean()

0.11190817790530846

In [11]:
# Display the training feature frame taken from the 'Xtrain' entry of the loaded pickle.
Xtrain

Unnamed: 0,age,job=admin.,job=blue-collar,job=entrepreneur,job=housemaid,job=management,job=retired,job=self-employed,job=services,job=student,...,cons.conf.idx_geq3q42.0,cons.conf.idx_geq4q46.2,euribor3m_geq1q1.281,euribor3m_geq2q4.0760000000000005,euribor3m_geq3q4.859,euribor3m_geq4q4.962,nr.employed_geq1q5099.1,nr.employed_geq2q5191.0,nr.employed_geq3q5195.8,nr.employed_geq4q5228.1
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19507,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
19508,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
19509,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
19510,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Xtrain

In [12]:
# Raw feature array of the train split still bound from the fold loop.
# NOTE(review): if the cells ran in order this is the split from the loop's
# last iteration (K=5), whereas Xtrain was loaded from the bank1 pickle, so
# the two are not expected to show the same rows -- confirm this is intended.
dataset_orig_train.features

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 1., 0.]])

In [13]:
# Display the validation feature frame taken from the 'Xval' entry of the loaded pickle.
Xval

Unnamed: 0,age,job=admin.,job=blue-collar,job=entrepreneur,job=housemaid,job=management,job=retired,job=self-employed,job=services,job=student,...,cons.conf.idx_geq3q42.0,cons.conf.idx_geq4q46.2,euribor3m_geq1q1.281,euribor3m_geq2q4.0760000000000005,euribor3m_geq3q4.859,euribor3m_geq4q4.962,nr.employed_geq1q5099.1,nr.employed_geq2q5191.0,nr.employed_geq3q5195.8,nr.employed_geq4q5228.1
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4874,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4875,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4876,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4877,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
