In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
sys.path.append("../scripts/")
import warnings
import numpy as np
import aif360
import eval_metrics as evaluation
import load_data as data_load
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.algorithms.preprocessing import DisparateImpactRemover
warnings.filterwarnings("ignore")


In [4]:
# Experiment configuration consumed by the data-loading cell below.
path_to_csv = '../dataset/dataset.csv'  # input CSV, relative to this notebook
random_seed = 1                          # forwarded as `random_state` to the loader
test_size = 0.3                          # forwarded as `test_size` to the loader

In [5]:
# Load the whole dataset without splitting it (split_data=False); in this mode
# the loader hands back the full frame plus X, y and the sensitive attribute.
# unawareness=False is used here; the frame returned is later indexed by its
# 'Gender' column, so that column is present in `df`.
#
# Alternative call kept for reference (not executed): with split_data=True and
# unawareness=True the same loader was unpacked as
#   X_train, X_test, y_train, y_test, sens_train, sens_test
df, X, y, sensitive_attribute = data_load.load_preprocessing_split(
    path_to_csv,
    split_data=False,
    unawareness=False,
    test_size=test_size,
    random_state=random_seed,
)

In [9]:

# Wrap the loaded frame in an AIF360 dataset: 'Target' is the label column and
# 'Gender' the protected attribute.
# NOTE(review): favorable_label=1, yet the per-gender cell further down names
# Target == 1 "drop out" — confirm that treating 1 as the favorable outcome is
# intended, since it determines the direction of every fairness metric below.
binaryLabelDataset = aif360.datasets.BinaryLabelDataset(
    df=df,
    label_names=['Target'],
    protected_attribute_names=['Gender'],
    favorable_label=1,
    unfavorable_label=0,
)

## Fairness metrics on the original dataset (before mitigation)

In [10]:
# Group fairness metrics on the untouched dataset.
# Gender == 1 is the privileged group, Gender == 0 the unprivileged one.
metric = BinaryLabelDatasetMetric(
    binaryLabelDataset,
    privileged_groups=[{'Gender': 1}],
    unprivileged_groups=[{'Gender': 0}],
)
print("Disparate Impact: ", metric.disparate_impact())
print("Statistical Parity Difference: ", metric.statistical_parity_difference())

Disparate Impact:  0.5572433881066498
Statistical Parity Difference:  -0.1994681137128782


In [20]:
# Per-gender outcome counts, as a manual cross-check of the AIF360 metrics.
# Encoding used in this notebook: Gender 0 = female, 1 = male;
# Target 1 is reported as "drop out" and Target 0 as "no drop out".
df_female = df[df['Gender'] == 0]  # females
df_male = df[df['Gender'] == 1]  # males

# Count with boolean masks instead of value_counts()[label]: if a class is
# absent from a group the count is simply 0 rather than a KeyError.
female_nodrop = (df_female['Target'] == 0).sum()
female_drop = (df_female['Target'] == 1).sum()

male_nodrop = (df_male['Target'] == 0).sum()
male_drop = (df_male['Target'] == 1).sum()

# NOTE(review): the number printed after "hence disparate (impact) is" is each
# group's no-drop-out rate (Target == 0 share), not a disparate-impact ratio.
# The Disparate Impact value reported by the metric cell corresponds to the
# ratio of the Target == 1 rates (female / male). The wording is left untouched
# so the recorded output stays valid; consider relabelling on the next re-run.
print(f'Number of females that drop out are {female_drop} and no drop out {female_nodrop}, hence disparate is {female_nodrop/len(df_female)}')
print(f'Number of males that drop out are {male_drop} and no drop out {male_nodrop}, hence disparate impact is {male_nodrop/len(df_male)}')

Number of females that drop out are 720 and no drop out 2148, hence disparate is 0.7489539748953975
Number of males that drop out are 701 and no drop out 855, hence disparate impact is 0.5494858611825193


### Reweighing (pre-processing method)

In [21]:
# Gender encoding used for the group definitions below:
#   1: Male
#   0: Female
from aif360.algorithms.preprocessing import Reweighing

# Fit the reweighing pre-processor on the original dataset and produce the
# transformed dataset in a single step. Group definitions match the metric
# cells: Gender == 1 privileged, Gender == 0 unprivileged.
RW = Reweighing(
    privileged_groups=[{'Gender': 1}],
    unprivileged_groups=[{'Gender': 0}],
)
dataset_transf = RW.fit_transform(binaryLabelDataset)

### Fairness metrics on the reweighed dataset (after mitigation)

In [22]:

# Same two group fairness metrics, now evaluated on the reweighed dataset.
# Note that this rebinds `metric`, replacing the object built for the
# original dataset.
metric = BinaryLabelDatasetMetric(
    dataset_transf,
    privileged_groups=[{'Gender': 1}],
    unprivileged_groups=[{'Gender': 0}],
)
print("Disparate Impact: ", metric.disparate_impact())
print("Statistical Parity Difference: ", metric.statistical_parity_difference())

Disparate Impact:  1.0
Statistical Parity Difference:  0.0


In [42]:
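# TODO: export the reweighed dataset for inspection, e.g.
# transformed_df, transformed_attrs = dataset_transf.convert_to_dataframe()
# NOTE(review): per the AIF360 docs, convert_to_dataframe() returns a
# (DataFrame, attributes dict) pair, so the result has to be unpacked. The
# earlier draft `convert_to_dataframe(dataset_transf=False)` does not look like
# a valid call — confirm and remove.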