In [17]:
%matplotlib inline
# Load all necessary packages
import sys
sys.path.append("../")
import numpy as np
from tqdm import tqdm
from warnings import warn

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric, BinaryLabelDatasetMetric
from aif360.algorithms.postprocessing.reject_option_classification\
        import RejectOptionClassification
from common_utils import compute_metrics

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from IPython.display import Markdown, display
import matplotlib.pyplot as plt
from ipywidgets import interactive, FloatSlider
import pandas as pd
import pickle
from sklearn.linear_model import Lasso

## Huangrui's Dataset 

In [23]:
def code_continuous(df,collist,Nlevel):
    """Replace continuous columns by binary quantile-threshold indicators, in place.

    For each column in ``collist`` and each ``q`` in ``1 .. Nlevel - 1`` the
    ``q / Nlevel`` quantile of the column's non-NaN values is used as a
    threshold, and a new column named ``<col>_geq<q>q<threshold>`` is added.
    The new column holds ``1.0`` where the value is ``>= threshold`` and
    ``0.0`` elsewhere (NaN compares as False, so missing values map to 0.0).
    After all indicators are created, the columns in ``collist`` are dropped.

    Args:
        df: pandas DataFrame; it is modified in place.
        collist: labels of the columns to encode.
        Nlevel: number of quantile levels; ``Nlevel - 1`` indicator columns
            are produced per encoded column.

    Returns:
        None. The result is the mutated ``df``.

    Note:
        Thresholds are computed from ``df`` itself and are embedded in the
        generated column names, so frames encoded separately can end up with
        different thresholds and different column names.
    """
    for col in collist:
        values = df[col]
        # The non-NaN subset only depends on the column, so compute it once
        # per column instead of re-filtering the whole (growing) frame for
        # every quantile level.
        observed = values[~np.isnan(values)]
        for q in range(1, Nlevel):
            threshold = observed.quantile(float(q) / Nlevel)
            indicator_name = col + '_geq' + str(int(q)) + 'q' + str(threshold)
            df[indicator_name] = (values >= threshold).astype(float)
    df.drop(collist, axis=1, inplace=True)
    
class LawSchoolPassBarDataset(StandardDataset):
    """Law School PassBar dataset.

    See https://github.com/microsoft/tempeh for details.

    The CSV file is read with pandas, every non-reserved column with more
    than two distinct values is replaced by quantile-threshold indicator
    columns (see ``code_continuous``), and the resulting frame is handed to
    :obj:`StandardDataset`.
    """

    def __init__(self, path, label_name='pass_bar', favorable_classes=[1],
                 protected_attribute_names=['race'],
                 privileged_classes=[[1]],
                 instance_weights_name=None,
                 categorical_features=[],
                 features_to_drop=[],
                 features_to_keep=[],
                 na_values=[], custom_preprocessing=None,
                 metadata=None):
        """Load the CSV at ``path`` and build the dataset.

        Args:
            path: location of the CSV file, passed to ``pd.read_csv``.

        All remaining arguments are forwarded unchanged to the parent
        constructor; see :obj:`StandardDataset` for their description.
        """
        df = pd.read_csv(path)

        # Columns with a dedicated role must keep their original name and
        # values: quantile-coding them would drop the column the parent
        # constructor later looks up by name.
        reserved = {label_name}
        reserved.update(protected_attribute_names)
        reserved.update(categorical_features)
        if instance_weights_name is not None:
            reserved.add(instance_weights_name)

        # Anything else with more than two distinct values is treated as
        # continuous and encoded with 5 quantile levels (4 indicators each).
        numericals = [col for col in df.columns
                      if col not in reserved and len(df[col].unique()) > 2]
        code_continuous(df, numericals, 5)

        super(LawSchoolPassBarDataset, self).__init__(
            df=df, label_name=label_name,
            favorable_classes=favorable_classes,
            protected_attribute_names=protected_attribute_names,
            privileged_classes=privileged_classes,
            instance_weights_name=instance_weights_name,
            categorical_features=categorical_features,
            features_to_keep=features_to_keep,
            features_to_drop=features_to_drop, na_values=na_values,
            custom_preprocessing=custom_preprocessing, metadata=metadata)

#### Load dataset and specify options

In [24]:

# Group definitions keyed on the protected attribute 'race'.
unprivileged_groups = [{'race': 0}]
privileged_groups = [{'race': 1}]

# Fairness metric names accepted by this notebook.
allowed_metrics = [
    "Statistical parity difference",
    "Average odds difference",
    "Equal opportunity difference",
]

# Metric used (must be one of allowed_metrics).
metric_name = "Equal opportunity difference"

# Seed NumPy's global random number generator so runs are repeatable.
random_seed = 12345679
np.random.seed(random_seed)

# Fail early when the selected metric is not supported.
if not (metric_name in allowed_metrics):
    raise ValueError("Metric name should be one of allowed metrics")

#### Load the train/test splits and fit the baseline (biased) model

In [26]:
# Index of the pre-generated train/test split to load.
K = 2

dataset_orig_train = LawSchoolPassBarDataset(path="./Huangrui/law/law_train{}.csv".format(K))
dataset_orig_test = LawSchoolPassBarDataset(path="./Huangrui/law/law_test{}.csv".format(K))

# Train the biased model on only part of the training data: the first piece
# of a 0.8 split taken without shuffling.
selected_dataset = dataset_orig_train.split([0.8], shuffle=False)[0]

# `_` used to be the working handle for the test set in this cell. It is kept
# bound to the test set only for backward compatibility with later cells;
# `_` is normally IPython's "last output" variable, so prefer
# `dataset_orig_test`.
_ = dataset_orig_test

# Feature matrices and flattened label vectors for fitting and evaluation.
X_train = selected_dataset.features
y_train = selected_dataset.labels.ravel()
X_test = dataset_orig_test.features
y_test = dataset_orig_test.labels.ravel()

# Lasso is a linear regressor, not a classifier: its continuous output is
# thresholded at 0.5 to obtain binary predictions.
lmod = Lasso(alpha=0.001)
lmod.fit(X_train, y_train)
y_train_pred = lmod.predict(X_train) > 0.5
y_test_pred = lmod.predict(X_test) > 0.5

# Copies of the datasets whose labels are replaced by the model predictions.
dataset_orig_train_pred = selected_dataset.copy(deepcopy=True)
dataset_orig_train_pred.labels = y_train_pred
dataset_orig_test_pred = dataset_orig_test.copy(deepcopy=True)
dataset_orig_test_pred.labels = y_test_pred

# Misclassification rate on the training subset and on the test set.
print(np.mean(y_train_pred != y_train))
print(np.mean(y_test_pred != y_test))

# Metrics of the predictions against the ground truth (test set first,
# then the training subset).
metric_test = compute_metrics(dataset_orig_test, dataset_orig_test_pred,
                unprivileged_groups, privileged_groups)
metric_train = compute_metrics(selected_dataset, dataset_orig_train_pred,
                unprivileged_groups, privileged_groups)

0.0491891064871481
0.05163974547234459
Balanced accuracy = 0.5000
Statistical parity difference = 0.0000
Disparate impact = 1.0000
Average odds difference = 0.0000
Equal opportunity difference = 0.0000
Theil index = 0.0177
Balanced accuracy = 0.5000
Statistical parity difference = 0.0000
Disparate impact = 1.0000
Average odds difference = 0.0000
Equal opportunity difference = 0.0000
Theil index = 0.0170


In [21]:
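# Save the trained model (currently disabled).
# NOTE(review): the path below still names the German/age experiment; confirm the target file before re-enabling.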
# bmodels = {}
# bmodels['clf'] = lmod
# pickle.dump(bmodels, open("experiments/german"+str(K)+'_age_bmodel.pkl','wb'))

In [22]:
# Show a short overview of the training dataset: one Markdown heading per
# section, followed by the corresponding value(s) on a single printed line.
dataset_overview = (
    ("#### Training Dataset shape",
     (dataset_orig_train.features.shape,)),
    ("#### Favorable and unfavorable labels",
     (dataset_orig_train.favorable_label,
      dataset_orig_train.unfavorable_label)),
    ("#### Protected attribute names",
     (dataset_orig_train.protected_attribute_names,)),
    ("#### Privileged and unprivileged protected attribute values",
     (dataset_orig_train.privileged_protected_attributes,
      dataset_orig_train.unprivileged_protected_attributes)),
    ("#### Dataset feature names",
     (dataset_orig_train.feature_names,)),
)
for heading, values in dataset_overview:
    display(Markdown(heading))
    print(*values)

#### Training Dataset shape

(16341, 57)


#### Favorable and unfavorable labels

1.0 0.0


#### Protected attribute names

['race']


#### Privileged and unprivileged protected attribute values

[array([1.])] [array([0.])]


#### Dataset feature names

['race', 'fulltime', 'parttime', 'sex', 'dropout', 'decile1b_geq1q3.0', 'decile1b_geq2q5.0', 'decile1b_geq3q7.0', 'decile1b_geq4q9.0', 'decile3_geq1q3.0', 'decile3_geq2q5.0', 'decile3_geq3q7.0', 'decile3_geq4q9.0', 'decile1_geq1q3.0', 'decile1_geq2q5.0', 'decile1_geq3q7.0', 'decile1_geq4q9.0', 'cluster_geq1q1.0', 'cluster_geq2q3.0', 'cluster_geq3q3.0', 'cluster_geq4q4.0', 'lsat_geq1q32.5', 'lsat_geq2q36.0', 'lsat_geq3q38.0', 'lsat_geq4q41.0', 'ugpa_geq1q2.9', 'ugpa_geq2q3.1', 'ugpa_geq3q3.4', 'ugpa_geq4q3.6', 'zfygpa_geq1q-0.7', 'zfygpa_geq2q-0.16', 'zfygpa_geq3q0.32', 'zfygpa_geq4q0.89', 'DOB_yr_geq1q63.0', 'DOB_yr_geq2q67.0', 'DOB_yr_geq3q68.0', 'DOB_yr_geq4q69.0', 'zgpa_geq1q-0.85', 'zgpa_geq2q-0.25', 'zgpa_geq3q0.28', 'zgpa_geq4q0.87', 'fam_inc_geq1q3.0', 'fam_inc_geq2q3.0', 'fam_inc_geq3q4.0', 'fam_inc_geq4q4.0', 'age_geq1q-62.0', 'age_geq2q-61.0', 'age_geq3q-60.0', 'age_geq4q-57.0', 'tier_geq1q3.0', 'tier_geq2q3.0', 'tier_geq3q4.0', 'tier_geq4q5.0', 'index6040_geq1q663.157879', '