In [14]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm
from warnings import warn

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric, BinaryLabelDatasetMetric
from aif360.algorithms.postprocessing.reject_option_classification\
        import RejectOptionClassification
from common_utils import compute_metrics

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider
import pandas as pd
import pickle
from sklearn.linear_model import Lasso

## Huangrui's Dataset 

In [15]:
def code_continuous(df, collist, Nlevel):
    """Replace continuous columns with quantile-threshold indicator columns.

    For each column in ``collist`` and each level ``q`` in ``1 .. Nlevel-1``,
    the ``q/Nlevel`` quantile of the column's non-NaN values is computed and a
    new float column named ``<col>_geq<q>q<threshold>`` is added. It holds 1.0
    where the original value is >= the threshold and 0.0 otherwise (NaN
    entries compare as False and therefore become 0.0). Afterwards the
    original columns listed in ``collist`` are removed.

    Args:
        df: pandas DataFrame; it is modified in place.
        collist: list of column names to discretize.
        Nlevel: number of quantile levels; ``Nlevel - 1`` indicator columns
            are produced per input column.

    Returns:
        None. All changes are applied to ``df`` directly.
    """
    for col in collist:
        # Thresholds are computed from the observed (non-NaN) entries only.
        observed = df.loc[~np.isnan(df[col]), col]
        for level in range(1, Nlevel):
            cutoff = observed.quantile(float(level) / Nlevel)
            indicator_name = ''.join([col, '_geq', str(int(level)), 'q', str(cutoff)])
            df[indicator_name] = (df[col] >= cutoff).astype(float)
    # The raw continuous columns are superseded by their indicators.
    df.drop(columns=collist, inplace=True)
    
class BankDataset(StandardDataset):
    """Bank dataset loaded from a CSV file and discretized for AIF360.

    NOTE(review): judging by the column names (``cons.conf.idx``, ``job=...``,
    ``poutcome=...``) this is presumably the UCI Bank Marketing data with the
    categorical columns already one-hot encoded -- confirm against the CSVs.

    On construction the CSV at ``path`` is read, the sign of the
    ``cons.conf.idx`` column is flipped, and every numeric feature column
    with more than two distinct values and a maximum above 1 is replaced by
    quantile indicator columns via ``code_continuous`` (5 levels). The
    resulting frame is then passed to :obj:`StandardDataset`.
    """

    def __init__(self, path, label_name='y', favorable_classes=[1],  
                 protected_attribute_names=['age'],
                 privileged_classes=[[1]],
                 instance_weights_name=None,
                 categorical_features=[],
                 features_to_drop=[],
                 features_to_keep=[],
                 na_values=[], custom_preprocessing=None,
                 metadata=None):
        """Read ``path`` and build the dataset.

        Args:
            path: location of the CSV file to load. It must contain a
                ``cons.conf.idx`` column.

        All remaining arguments are forwarded unchanged; see
        :obj:`StandardDataset` for a description of them.
        """
        df = pd.read_csv(path)
        # NOTE(review): presumably negated so the (originally negative) index
        # becomes positive and passes the ``max > 1`` test below -- confirm.
        df["cons.conf.idx"] = -df["cons.conf.idx"]

        # Columns the parent constructor looks up by name must survive
        # discretization, otherwise they would be dropped and the lookup in
        # StandardDataset would fail.
        reserved = set([label_name, instance_weights_name])
        reserved.update(protected_attribute_names)

        # Series.max() skips NaN, unlike the builtin max(), whose result
        # depends on where NaN happens to sit in the column. Non-numeric
        # columns are left untouched instead of raising on ``str > 1``.
        numericals = [col for col in df.columns
                      if col not in reserved
                      and pd.api.types.is_numeric_dtype(df[col])
                      and df[col].nunique(dropna=False) > 2
                      and df[col].max() > 1]
        code_continuous(df, numericals, 5)

        super(BankDataset, self).__init__(
            df=df, label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop, na_values=na_values,
            custom_preprocessing=custom_preprocessing, metadata=metadata)

#### Load dataset and specify options

In [17]:
# Groups are defined on the protected attribute 'age':
# value 1 is the privileged group, value 0 the unprivileged one.
privileged_groups = [dict(age=1)]
unprivileged_groups = [dict(age=0)]

# Fairness metric to use (validated against allowed_metrics below)
metric_name = "Equal opportunity difference"

# Fix NumPy's global random seed so randomized steps are repeatable
random_seed = 12345679
np.random.seed(random_seed)

# Fail early if the chosen metric is not one of the supported names
allowed_metrics = [
    "Statistical parity difference",
    "Average odds difference",
    "Equal opportunity difference",
]
if allowed_metrics.count(metric_name) == 0:
    raise ValueError("Metric name should be one of allowed metrics")

#### Load the train and test sets and fit the biased baseline model

In [40]:
# Index of the pre-generated train/test CSV pair to load
K = 4

dataset_orig_train = BankDataset(path="./Huangrui/bank/bank_train{}.csv".format(K))
dataset_orig_test = BankDataset(path="./Huangrui/bank/bank_test{}.csv".format(K))

# Train the biased model on only part of the data (4% of the training set).
# NOTE(review): no seed is passed to split(), so which rows are selected
# depends on the random state at the time this cell runs.
selected_dataset = dataset_orig_train.split([0.04], shuffle=True)[0]

# Features and flattened labels for fitting and evaluation
X_train = selected_dataset.features
y_train = selected_dataset.labels.ravel()
X_test = dataset_orig_test.features
y_test = dataset_orig_test.labels.ravel()

# Lasso regression used as a classifier: outputs above 0.5 are predicted positive
lmod = Lasso(alpha = 0.001)
lmod.fit(X_train, y_train)
y_train_pred = lmod.predict(X_train)>0.5
y_test_pred = lmod.predict(X_test)>0.5

# Copies of the datasets carrying the predicted labels. The boolean
# predictions are stored as an (n, 1) float column so the copies keep the
# same label layout as the datasets they were copied from.
dataset_orig_train_pred = selected_dataset.copy(deepcopy=True)
dataset_orig_train_pred.labels = y_train_pred.astype(float).reshape(-1, 1)
dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
dataset_orig_test_pred.labels = y_test_pred.astype(float).reshape(-1, 1)

# Misclassification rate on the training subset and on the test set
print(np.mean(y_train_pred!=y_train))
print(np.mean(y_test_pred!=y_test))
metric_test = compute_metrics(dataset_orig_test, dataset_orig_test_pred, 
                unprivileged_groups, privileged_groups)
metric_train = compute_metrics(selected_dataset, dataset_orig_train_pred,unprivileged_groups, privileged_groups)

# NOTE(review): the original cell left `_` bound to the test set; the binding
# is kept only in case a later cell still reads it. Remove once confirmed unused.
_ = dataset_orig_test


0.11076923076923077
0.10888816005247622
Balanced accuracy = 0.6373
Statistical parity difference = 0.1785
Disparate impact = 4.3871
Average odds difference = 0.1871
Equal opportunity difference = 0.2429
Theil index = 0.1014
Balanced accuracy = 0.6516
Statistical parity difference = 0.1743
Disparate impact = 4.2129
Average odds difference = 0.1302
Equal opportunity difference = 0.2787
Theil index = 0.1066


In [41]:
# Save the fitted model to disk, wrapped in a dict under the key 'clf'
bmodels = {}
bmodels['clf'] = lmod
# The context manager closes the file handle even if pickling fails
with open("experiments/bank"+str(K)+'_age_bmodel.pkl','wb') as model_file:
    pickle.dump(bmodels, model_file)

In [20]:
# Show basic facts about the training dataset: each section is a Markdown
# heading followed by the printed value(s).
def _show_section(heading, *values):
    """Render ``heading`` as Markdown, then print ``values`` on one line."""
    display(Markdown(heading))
    print(*values)

_show_section("#### Training Dataset shape",
              dataset_orig_train.features.shape)
_show_section("#### Favorable and unfavorable labels",
              dataset_orig_train.favorable_label,
              dataset_orig_train.unfavorable_label)
_show_section("#### Protected attribute names",
              dataset_orig_train.protected_attribute_names)
_show_section("#### Privileged and unprivileged protected attribute values",
              dataset_orig_train.privileged_protected_attributes,
              dataset_orig_train.unprivileged_protected_attributes)
_show_section("#### Dataset feature names",
              dataset_orig_train.feature_names)

#### Training Dataset shape

(24391, 84)


#### Favorable and unfavorable labels

1.0 0.0


#### Protected attribute names

['age']


#### Privileged and unprivileged protected attribute values

[array([1.])] [array([0.])]


#### Dataset feature names

['age', 'job=admin.', 'job=blue-collar', 'job=entrepreneur', 'job=housemaid', 'job=management', 'job=retired', 'job=self-employed', 'job=services', 'job=student', 'job=technician', 'job=unemployed', 'marital=divorced', 'marital=married', 'marital=single', 'education=basic.4y', 'education=basic.6y', 'education=basic.9y', 'education=high.school', 'education=illiterate', 'education=professional.course', 'education=university.degree', 'default=no', 'default=yes', 'housing=no', 'housing=yes', 'loan=no', 'loan=yes', 'contact=cellular', 'contact=telephone', 'month=apr', 'month=aug', 'month=dec', 'month=jul', 'month=jun', 'month=mar', 'month=may', 'month=nov', 'month=oct', 'month=sep', 'day_of_week=fri', 'day_of_week=mon', 'day_of_week=thu', 'day_of_week=tue', 'day_of_week=wed', 'poutcome=failure', 'poutcome=nonexistent', 'poutcome=success', 'duration_geq1q89.0', 'duration_geq2q147.0', 'duration_geq3q223.0', 'duration_geq4q369.0', 'campaign_geq1q1.0', 'campaign_geq2q1.0', 'campaign_geq3q2.0', 