In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# from keras.models import Sequential, Model
# from keras.layers import Dense, Input
# from keras.layers import Dropout, Reshape
# from keras.layers import Conv1D, Flatten, AveragePooling1D, MaxPooling1D
# from keras.utils import to_categorical
# from keras.wrappers.scikit_learn import KerasClassifier

# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import StratifiedKFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.cluster import KMeans

In [114]:
def discrimination(data, target, sens, expl, max_corr=.1):
    # target
    # sens: sensitive attribute
    # expl: explanatory attribute(s), str or list
    group_priv = data[data[sens]==2]
    group_prot = data[data[sens]==1]
    n_priv = group_priv.shape[0]
    n_prot = group_prot.shape[0]
    
    D_all = np.sum(group_priv[target]==1)/n_priv - np.sum(group_prot[target] == 1)/n_prot
    print('Total discrimination: %.2f' % D_all)
    
    # multiple explanatory attributes
    if isinstance(expl, list):
        high_corr = list(data.columns[np.abs(data.corr()[sens].sort_values()) > max_corr])
        for e in expl: 
            if e in high_corr: 
                print(e, 'is highly correlated with', sens)
        expl = [e for e in expl if e not in high_corr]
        data_expl = pd.Series(KMeans(n_clusters=6).fit(data[expl]).labels_)
    else:
        data_expl = data[expl]
    
    data_expl_priv = data_expl[data[sens]==2]
    data_expl_prot = data_expl[data[sens]==1]
        
    expl_values = data_expl.unique()
    D_expl = 0 
    
    for e_i in expl_values:
        P_star = (np.sum((group_priv[target]==1) & (data_expl_priv == e_i))/n_priv + 
                  np.sum((group_prot[target]==1) & (data_expl_prot == e_i))/n_prot)/2
        D_expl += (np.sum(data_expl_priv == e_i)/n_priv - np.sum(data_expl_prot == e_i)/n_prot) * P_star
        
    print('Discrimination explainable by %s: %.2f' % (', '.join(expl), D_expl))
    
    D_illegal = D_all - D_expl
    print('Unexplainable discrimination: %.2f' % D_illegal)
    
    return (D_all, D_expl, D_illegal)

In [115]:
credit_df = pd.read_csv('./dataset.csv')
credit_df.columns

Index(['Creditability', 'Account Balance', 'Duration of Credit (month)',
       'Payment Status of Previous Credit', 'Purpose', 'Credit Amount',
       'Value Savings/Stocks', 'Length of current employment',
       'Instalment per cent', 'Sex & Marital Status', 'Guarantors',
       'Duration in Current address', 'Most valuable available asset',
       'Age (years)', 'Concurrent Credits', 'Type of apartment',
       'No of Credits at this Bank', 'Occupation', 'No of dependents',
       'Telephone', 'Foreign Worker'],
      dtype='object')

Split the data into four groups based on gender and nationality: native male, native female, foreign male, foreign female. 

In [116]:
male_idx = (credit_df['Sex & Marital Status']==1) | (credit_df['Sex & Marital Status']==3) | \
                 (credit_df['Sex & Marital Status']==4)
female_idx = (credit_df['Sex & Marital Status']==2) | (credit_df['Sex & Marital Status']==5)
native_idx = (credit_df['Foreign Worker']==2)
foreign_idx = (credit_df['Foreign Worker']==1)

# insert a column of gender, 1 female, 2 female
credit_df.insert(loc=len(credit_df.columns), column='gender', value=1)
credit_df.loc[male_idx, 'gender'] = 2

native_male = credit_df[male_idx & native_idx]
native_female = credit_df[female_idx & native_idx]
foreign_male = credit_df[male_idx & foreign_idx]
foreign_female = credit_df[female_idx & foreign_idx]

print('Number of samples of different sub-gropups (native male, native female, foreign male, foreign female): ',
     native_male.shape[0], native_female.shape[0], foreign_male.shape[0], foreign_female.shape[0])
print('Credible samples of different sub-gropups (native male, native female, foreign male, foreign female): ',
     native_male[native_male['Creditability']==1].shape[0], 
      native_female[native_female['Creditability']==1].shape[0], 
      foreign_male[foreign_male['Creditability']==1].shape[0], 
      foreign_female[foreign_female['Creditability']==1].shape[0])

# # resample female data to have equal number of samples as male
# resampled_female = resample(female, n_samples=len(male))
# resampled_gender = pd.concat([male, resampled_female], axis=0, ignore_index=True)
# resampled_gender.to_csv('./resampled_gender.csv', index=False)

Number of samples of different sub-gropups (native male, native female, foreign male, foreign female):  32 5 658 305
Credible samples of different sub-gropups (native male, native female, foreign male, foreign female):  29 4 470 197


Then resample 250 samples from each sub-group with replacement. 

In [117]:
n_samp = 250
re_native_male = resample(native_male, n_samples=n_samp)
re_native_female = resample(native_female, n_samples=n_samp)
re_foreign_male = resample(foreign_male, n_samples=n_samp)
re_foreign_female = resample(foreign_female, n_samples=n_samp)

resampled_df = pd.concat([re_native_male, re_native_female, re_foreign_male, re_foreign_female], 
                             ignore_index=True, axis=0)
resampled_df = resampled_df.reset_index(drop=True)
resampled_df.to_csv('./resampled_nation_gender.csv', index=False)

Discrimination in resampled data

In [118]:
legal = resampled_df.columns[ [1, 3, 5, 6, 8, 10, 12, 14, 16 ] ]
maybe = resampled_df.columns[ [1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 14, 15, 16, 17 ] ]

discrimination(resampled_df, 'Creditability', 'Foreign Worker', list(legal))
print('************************************')
discrimination(resampled_df, 'Creditability', 'gender', list(legal))

Total discrimination: 0.17
Discrimination explainable by Account Balance, Payment Status of Previous Credit, Credit Amount, Value Savings/Stocks, Instalment per cent, Guarantors, Most valuable available asset, Concurrent Credits, No of Credits at this Bank: -0.03
Unexplainable discrimination: 0.19
************************************
Total discrimination: 0.14
Discrimination explainable by Account Balance, Payment Status of Previous Credit, Credit Amount, Value Savings/Stocks, Instalment per cent, Guarantors, Most valuable available asset, Concurrent Credits, No of Credits at this Bank: 0.05
Unexplainable discrimination: 0.09


(0.14200000000000002, 0.04892800000000001, 0.093072)