In [17]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm
from warnings import warn

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric, BinaryLabelDatasetMetric
from aif360.algorithms.postprocessing.reject_option_classification\
        import RejectOptionClassification
from common_utils import compute_metrics

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider
import pandas as pd
import pickle
from sklearn.linear_model import Lasso

## Huangrui's Dataset 

In [18]:
class GermanDataset(StandardDataset):
    """German credit data read from a caller-supplied CSV file.

    The file at ``path`` is loaded with ``pandas.read_csv`` and the resulting
    frame is handed, together with every other constructor argument, to
    ``StandardDataset.__init__`` without modification.

    NOTE(review): the stock AIF360 German dataset is described in
    :file:`aif360/data/raw/german/README.md`; confirm whether that description
    also applies to the CSV files loaded here.
    """

    def __init__(self, path, label_name='Y', favorable_classes=[1],
                 protected_attribute_names=['sex', 'age'],
                 privileged_classes=[[1],[1]],
                 instance_weights_name=None,
                 categorical_features=[],
                 features_to_keep=[], features_to_drop=[],
                 na_values=[], custom_preprocessing=None,
                 metadata=None):
        # Collect everything the parent constructor needs in one place; the
        # only value computed here is the data frame itself.
        parent_kwargs = dict(
            df=pd.read_csv(path),
            label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop,
            na_values=na_values,
            custom_preprocessing=custom_preprocessing,
            metadata=metadata,
        )
        super(GermanDataset, self).__init__(**parent_kwargs)

#### Load dataset and specify options

In [19]:
## Experiment options
# Dataset label; nothing in this cell reads it.
dataset_used = "german"
# Protected-attribute selector: 1 selects 'sex', any other value selects 'age'.
protected_attribute_used = 1

# Group definitions for the selected attribute: value 1 is the privileged
# group, value 0 the unprivileged one.
if protected_attribute_used != 1:
    privileged_groups, unprivileged_groups = [{'age': 1}], [{'age': 0}]
else:
    privileged_groups, unprivileged_groups = [{'sex': 1}], [{'sex': 0}]

# Fairness metric to use (must appear in allowed_metrics below) and the
# lower/upper bounds placed on it.
metric_name = "Equal opportunity difference"
metric_lb, metric_ub = -0.05, 0.05

# Seed NumPy's global random generator so later random operations repeat.
random_seed = 12345679
np.random.seed(random_seed)

# Reject any metric name that is not in the supported list.
allowed_metrics = [
    "Statistical parity difference",
    "Average odds difference",
    "Equal opportunity difference",
]
if not (metric_name in allowed_metrics):
    raise ValueError("Metric name should be one of allowed metrics")

#### Split into train, test and validation

In [20]:
# For each of the five pre-split folds: load the train/test CSVs, split the
# training part 80/20 into train/validation, score all three parts with the
# stored base model and save everything to one pickle per fold.

# NOTE: this reassignment overrides the value set in the configuration cell
# (1 there), so every iteration below uses the 'age' attribute.
protected_attribute_used = 2
# Column name of the selected protected attribute; it is also the suffix used
# in the pickle file names. Any selector other than 1 means 'age'.
protected_attribute_name = 'sex' if protected_attribute_used == 1 else 'age'

for K in range(1, 6):
    # Only the selected protected attribute is declared on the datasets.
    dataset_orig_train_valid = GermanDataset(
        path="./Huangrui/german/german_train{}.csv".format(K),
        protected_attribute_names=[protected_attribute_name],
        privileged_classes=[[1]])
    dataset_orig_test = GermanDataset(
        path="./Huangrui/german/german_test{}.csv".format(K),
        protected_attribute_names=[protected_attribute_name],
        privileged_classes=[[1]])

    # 80% train, 20% valid
    dataset_orig_train, dataset_orig_valid = dataset_orig_train_valid.split([0.8], shuffle=True)

    # Stored base model for this fold and attribute (presumably a Lasso linear
    # model, since Lasso is imported at the top of the notebook; confirm).
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    bmodel_path = 'experiments/german{}_{}_bmodel.pkl'.format(K, protected_attribute_name)
    with open(bmodel_path, 'rb') as bmodel_file:
        lmod = pickle.load(bmodel_file)["clf"]

    # Predictions are thresholded at 0.5 to obtain boolean labels.
    # training data
    X_train = dataset_orig_train.features
    y_train = dataset_orig_train.labels.ravel()
    y_train_pred = lmod.predict(X_train) > 0.5
    # validation data
    X_valid = dataset_orig_valid.features
    y_valid = dataset_orig_valid.labels.ravel()
    y_valid_pred = lmod.predict(X_valid) > 0.5
    # test data (taken from a deep copy of the test dataset)
    dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
    X_test = dataset_orig_test_pred.features
    # NOTE(review): unlike y_train / y_valid this is not ravel()-ed; left
    # unchanged because readers of the saved pickle may rely on its shape.
    y_test = dataset_orig_test_pred.labels
    y_test_pred = lmod.predict(X_test) > 0.5

    # save all the data
    db = {
        'Xtrain': X_train,
        'Ytrain': y_train,
        'Xval': X_valid,
        'Yval': y_valid,
        'Xtest': X_test,
        'Ytest': y_test,
        'clf': lmod,
        'Ybtrain': y_train_pred,
        'Ybval': y_valid_pred,
        'Ybtest': y_test_pred,
    }
    # Context manager guarantees the file is flushed and closed.
    output_path = 'experiments/german{}_{}.pkl'.format(K, protected_attribute_name)
    with open(output_path, 'wb') as output_file:
        pickle.dump(db, output_file)
    

In [21]:
# Reload the saved data for fold 1 with 'age' as the protected attribute and
# unpack it into individual variables.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('experiments/'+'german1_age.pkl','rb') as db_file:
    db = pickle.load(db_file)  # handle is closed as soon as loading finishes
Ytrain = db['Ytrain']
Yval = db['Yval']
Ytest = db['Ytest']
clf = db['clf']
Ybtrain = db['Ybtrain'] # this is the predicted label of the biased decision-maker on training data
Ybtest = db['Ybtest']
Ybval = db['Ybval']
Xtrain = db['Xtrain']
Xtest = db['Xtest']
Xval = db['Xval']

In [22]:
# Show the feature matrix of dataset_orig_train. When cells run in order this
# is the object left over from the last iteration of the fold loop (K=5), not
# the fold-1 data reloaded from the pickle.
dataset_orig_train.features

array([[2.100e+01, 5.248e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.500e+01, 1.478e+03, 4.000e+00, ..., 1.000e+00, 1.000e+00,
        0.000e+00],
       [1.200e+01, 1.123e+03, 4.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [2.100e+01, 3.414e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.200e+01, 6.260e+02, 4.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [9.000e+00, 1.422e+03, 3.000e+00, ..., 1.000e+00, 1.000e+00,
        0.000e+00]])

In [23]:
# Show the validation feature matrix loaded from experiments/german1_age.pkl.
Xval

array([[1.300e+01, 1.409e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.400e+01, 2.828e+03, 4.000e+00, ..., 1.000e+00, 1.000e+00,
        0.000e+00],
       [6.000e+00, 2.108e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [1.000e+01, 3.949e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.400e+01, 1.851e+03, 4.000e+00, ..., 1.000e+00, 1.000e+00,
        0.000e+00],
       [1.800e+01, 1.239e+03, 4.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])