In [1]:
# Install the notebook's third-party dependencies via shell commands.
# numba and aif360 are pinned to exact versions; keep the pins so re-runs
# resolve the same packages.
!pip install numba==0.48
!pip install aif360==0.3.0rc0
# NOTE(review): BlackBoxAuditing is not pinned — consider pinning a known-good
# version so a fresh environment installs the same release.
!pip install BlackBoxAuditing



In [2]:
#import packages
import numpy as np
import pandas as pd

import BlackBoxAuditing

from aif360.algorithms.preprocessing import Reweighing, DisparateImpactRemover
from aif360.datasets import AdultDataset, StandardDataset, BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.postprocessing import EqOddsPostprocessing, RejectOptionClassification
from aif360.algorithms.preprocessing.optim_preproc_helpers.data_preproc_functions import load_preproc_data_adult

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from google.colab import files
import io

Matplotlib Error, comment out matplotlib.use('TkAgg')


In [3]:
# Load the dataset: the CSV is uploaded through Colab's file picker and the
# raw bytes stored under the 'heart.csv' key are parsed into a DataFrame.
uploaded = files.upload()
heart_csv_bytes = uploaded['heart.csv']
heart = pd.read_csv(io.BytesIO(heart_csv_bytes))

# preview the first rows
heart.head()

Saving heart.csv to heart (2).csv


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Separate the features (every column except the last) from the label column.
X = heart.iloc[:, :-1]
y = heart.loc[:, 'target']

# Rescale each sample (row) to unit L2 norm. Note this is a per-row rescaling,
# so every feature value in a row -- including binary ones -- gets divided by
# that row's norm.
X_norm = normalize(X, norm='l2', axis=1)

In [6]:
# Two-stage split using the same seeds as the ADS: first carve off a 33%
# hold-out, then cut that hold-out in half to get the validation and test sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_norm, y, test_size=0.33, random_state=101
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [32]:
# PROBLEM: the way the ADS author normalized the data changed the value range
# of the features, including sex, which is therefore no longer a 0/1 column.
#   We need to map that rescaled column back to a binary 0/1 column before
#   building the fairness datasets.

# Re-split the un-normalized features with the same seeds (used to validate the
# hypothesis above).
X_check_train, X_thing = train_test_split(
    X,
    test_size=0.33,
    random_state=101,
)
X_check_val, X_check_test = train_test_split(
    X_thing,
    test_size=0.5,
    random_state=42,
)

In [35]:
# Rebuild one DataFrame per split (features + target) so each can be wrapped
# in a BinaryLabelDataset.

def _rebuild_split_frame(features, labels):
    """Return a DataFrame of ``features`` with ``labels`` appended as the last column.

    The frame takes its column names from ``heart``. The ``sex`` column is
    restored to 0/1 and both ``sex`` and ``target`` are cast to int.
    """
    # join the X and y data together into a single array
    label_column = labels.to_numpy().reshape(-1, 1)
    frame = pd.DataFrame(
        data=np.hstack((features, label_column)),
        columns=heart.columns,
    )

    # scaling changed the non-zero sex values but left zeros at zero,
    # so any non-zero entry is mapped back to 1
    frame.loc[frame.sex != 0, 'sex'] = 1

    # convert sex and target back to int
    return frame.astype({'sex': int, 'target': int})


train_df = _rebuild_split_frame(X_train, y_train)
val_df = _rebuild_split_frame(X_val, y_val)
test_df = _rebuild_split_frame(X_test, y_test)

In [36]:
# Wrap each split in a BinaryLabelDataset that carries the true target values.

def _make_truth_dataset(split_df):
    """Build a BinaryLabelDataset from ``split_df``.

    ``target`` is the label (1 favorable, 0 unfavorable) and ``sex`` is the
    protected attribute.
    """
    return BinaryLabelDataset(
        df=split_df,
        label_names=['target'],
        protected_attribute_names=['sex'],
        favorable_label=1,
        unfavorable_label=0,
    )


heart_train_dataset_truth = _make_truth_dataset(train_df)
heart_val_dataset_truth = _make_truth_dataset(val_df)
heart_test_dataset_truth = _make_truth_dataset(test_df)

# Keep a copy of every dataset so model predictions can be swapped in for the labels.
heart_train_dataset_preds = heart_train_dataset_truth.copy()
heart_val_dataset_preds = heart_val_dataset_truth.copy()
heart_test_dataset_preds = heart_test_dataset_truth.copy()

In [9]:
# Fit an MLP configured with the same hyperparameters as the ADS.
model = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5000, 10),
    random_state=1,
)
model.fit(X_train, y_train)

# The validation and test scores should reproduce the ADS results.
val_score = model.score(X_val, y_val)
test_score = model.score(X_test, y_test)
print('Val score:', val_score)
print('Test score:', test_score)

# save the predictions on the test data
y_preds = model.predict(X_test)

Val score: 0.78
Test score: 0.92


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [41]:
# Swap the model's predictions into the copy of the test dataset.
# aif360 datasets hold labels as a column vector of shape (n_samples, 1), while
# model.predict returns a flat (n_samples,) array. Reshape before assigning so
# the dataset keeps a consistent label shape instead of relying on downstream
# code to flatten it.
heart_test_dataset_preds.labels = np.asarray(y_preds).reshape(-1, 1)

# sex == 1 is treated as the privileged group, sex == 0 as the unprivileged group
privileged_groups = [{'sex': 1}]
unprivileged_groups = [{'sex': 0}]

# Classification metrics comparing the true test labels with the predicted ones.
metrics = ClassificationMetric(
    heart_test_dataset_truth,
    heart_test_dataset_preds,
    unprivileged_groups=unprivileged_groups,
    privileged_groups=privileged_groups,
)

In [59]:
# Print the comparison metrics in groups, each group followed by a blank line.
metric_groups = [
    [
        ("Overall Test Accuracy:", metrics.accuracy()),
        ("Male Test Accuracy:", metrics.accuracy(privileged=True)),
        ("Female Test Accuracy:", metrics.accuracy(privileged=False)),
    ],
    [
        ("Test Disparate Impact:", metrics.disparate_impact()),
    ],
    [
        ("Test FPR:", metrics.false_positive_rate()),
        ("Test FPR Difference:", metrics.false_positive_rate_difference()),
    ],
    [
        ("Test FNR:", metrics.false_negative_rate()),
        ("Test FNR Difference:", metrics.false_negative_rate_difference()),
    ],
]
for group in metric_groups:
    for label, value in group:
        print(label, value)
    print('')

# Confusion matrix of the true test labels against the predictions.
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_preds))

Overall Test Accuracy: 0.92
Male Test Accuracy: 0.9166666666666666
Female Test Accuracy: 0.9285714285714286

Test Disparate Impact: 1.9285714285714286

Test FPR: 0.15384615384615385
Test FPR Difference: 0.20289855072463767

Test FNR: 0.0
Test FNR Difference: 0.0

Confusion Matrix:
[[22  4]
 [ 0 24]]


In [None]:
#NEXT STEPS:
  #1) Work out what each of the metrics printed above means.
    #Disparate impact > 1 implies more "favorable" outcomes for women than for men.
    #But in this case, "favorable" means more likely to be diagnosed with heart disease.
    #DI is Pr(predicted targ=1 | women) / Pr(predicted targ=1 | men), i.e. the ratio of predicted-positive rates.
  #2) Should we evaluate these metrics on the validation and test sets combined, given the small size of the dataset and the lack of hyperparameter tuning?