In [13]:
#OMSCS2024Spring-Final_Project

#Team member 1: Juejing Han – jhan446
#Team member 2: Qian Fang – qfang36

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)  # None means unlimited
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [14]:
from aif360.algorithms.preprocessing import Reweighing
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import BinaryLabelDatasetMetric

In [15]:
def TransformtoDF(data, label, csv_path='data_Credit_tsftodftest.csv'):
    """Flatten a dataset object into a single pandas DataFrame.

    Parameters
    ----------
    data : object
        Any object exposing ``features``, ``feature_names``, ``labels``,
        ``protected_attributes``, ``protected_attribute_names`` and
        ``instance_weights`` (the caller in this file passes the dataset
        returned by ``Step3``).
    label : str
        Column name under which ``data.labels`` is stored.
    csv_path : str or None, optional
        Where to dump the resulting frame as CSV. Defaults to the file name
        that used to be hard-coded here; pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        Feature columns, then ``label``, then any protected-attribute column
        not already among the features, then ``'instance_weights'``.
    """
    # Flatten to 1-D so each array maps onto exactly one DataFrame column.
    labels = np.asarray(data.labels).reshape(-1)
    instance_weights = np.asarray(data.instance_weights).reshape(-1)
    protected_attributes = data.protected_attributes

    df_transformed = pd.DataFrame(data=data.features, columns=data.feature_names)
    df_transformed[label] = labels
    # A protected attribute that is also a feature is overwritten in place;
    # otherwise it is appended as a new column.
    for i, attr in enumerate(data.protected_attribute_names):
        df_transformed[attr] = protected_attributes[:, i]
    df_transformed['instance_weights'] = instance_weights

    if csv_path is not None:
        df_transformed.to_csv(csv_path, index=False)

    return df_transformed

In [16]:
def plot_DI(DI, dependent, attribute, privilege, unprivilege, title):
    """Draw a single-bar disparate-impact chart and save it as a PNG.

    The chart shows reference lines at 0, 0.5, 1 and 1.5, a grey band between
    0.8 and 1.2 labelled 'Fair', and a bar of height ``DI``. When ``DI`` falls
    outside that band, the 0-0.8 region is tinted and the 'Bias' label turns red.

    Parameters
    ----------
    DI : float
        Value to plot.
    dependent : str
        Name of the dependent variable (used in the title and file name).
    attribute : str
        Name of the protected attribute (used in the title and file name).
    privilege, unprivilege : str
        Labels for the privileged / unprivileged group annotations.
    title : str
        Prefix of the output file ``{title}-{dependent}-{attribute}_DI.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    span = [0, 5]
    outside_fair_band = DI < 0.8 or DI > 1.2

    # Reference lines: solid at the ideal value 1, dashed elsewhere.
    ax.plot(span, [1, 1], color='black', linestyle='-', linewidth=1)
    for level in (1.5, 0.5, 0):
        ax.plot(span, [level, level], color='grey', linestyle='--', linewidth=1)

    ax.fill_between(span, 0.8, 1.2, color='lightgrey', alpha=0.5)
    if outside_fair_band:
        ax.fill_between(span, 0, 0.8, color='mistyrose', alpha=0.5)

    # The 'bar' is a filled region between x=2.5 and x=4, with its value on top.
    bar_edges = [2.5, 4]
    bar_centre = sum(bar_edges) / 2
    ax.fill_between(bar_edges, 0, DI, color='dimgray', step="pre")
    ax.text(bar_centre, DI, f'{DI:.2f}', ha='center', va='bottom', fontsize=14)

    ax.text(5.5, 0.95, 'Fair', verticalalignment='bottom', horizontalalignment='right',
            color='black', fontsize=14)
    ax.text(6, 1.1, f"Privileged Group: {privilege}", verticalalignment='bottom',
            horizontalalignment='left', color='black', fontsize=14)
    ax.text(6, 0.9, f"Unprivileged Group: {unprivilege}", verticalalignment='bottom',
            horizontalalignment='left', color='black', fontsize=14)
    bias_colour = 'red' if outside_fair_band else 'grey'
    ax.text(5.5, 0.35, 'Bias', verticalalignment='bottom', horizontalalignment='right',
            color=bias_colour, fontsize=14)

    ax.set_xlim(0, 10)
    plt.xticks([])
    ax.set_ylim(-0.2, 1.7)
    plt.yticks([0, 0.5, 1, 1.5], fontsize=12)

    for side in ('right', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    heading = ''.join(['Disparate Impact for ', dependent, ' (Protected Attribute: ', attribute, ')'])
    ax.set_title(heading, fontsize=16)
    ax.grid(False)

    fig.savefig(f"{title}-{dependent}-{attribute}_DI.png", bbox_inches='tight')
    plt.close(fig)

In [17]:
def plot_SPD(SPD, dependent, attribute, privilege, unprivilege, title):
    """Draw a single-bar statistical-parity-difference chart and save it as a PNG.

    The chart shows reference lines at -1, -0.5, 0, 0.5 and 1, a grey band
    between -0.1 and 0.1 labelled 'Fair', and a bar reaching from 0 to ``SPD``.
    When ``SPD`` falls outside that band, the region from -1 to -0.1 is tinted
    and the 'Bias' label turns red.

    Parameters
    ----------
    SPD : float
        Value to plot.
    dependent : str
        Name of the dependent variable (used in the title and file name).
    attribute : str
        Name of the protected attribute (used in the title and file name).
    privilege, unprivilege : str
        Labels for the privileged / unprivileged group annotations.
    title : str
        Prefix of the output file ``{title}-{dependent}-{attribute}_SPD.png``.
    """
    plt.figure(figsize=(10, 6))

    biased = SPD < -0.1 or SPD > 0.1

    plt.plot([0, 5], [0, 0], color='black', linestyle='-', linewidth=1)
    plt.plot([0, 5], [-1, -1], color='grey', linestyle='--', linewidth=1)
    plt.plot([0, 5], [-0.5, -0.5], color='grey', linestyle='--', linewidth=1)
    plt.plot([0, 5], [0.5, 0.5], color='grey', linestyle='--', linewidth=1)
    plt.plot([0, 5], [1, 1], color='grey', linestyle='--', linewidth=1)
    plt.fill_between([0, 5], -0.1, 0.1, color='lightgrey', alpha=0.5)
    if biased:
        plt.fill_between([0, 5], -1, -0.1, color='mistyrose', alpha=0.5)

    x_range = [2.5, 4]
    x_text = sum(x_range) / 2

    # Using fill_between to create a 'bar'
    plt.fill_between(x_range, 0, SPD, color='dimgray', step="pre")
    # Put the value just outside the bar: above a positive bar, below a negative one.
    va_align = 'bottom' if SPD >= 0 else 'top'
    plt.text(x_text, SPD, f'{SPD:.2f}', ha='center', va=va_align, fontsize=14)

    plt.text(x=5.5, y=-0.05, s='Fair', verticalalignment='bottom', horizontalalignment='right', color='black',
             fontsize=14)
    plt.text(x=5.5, y=-0.6, s='Bias', verticalalignment='bottom', horizontalalignment='right',
             color='red' if biased else 'grey', fontsize=14)

    plt.text(x=6, y=0.1, s=f"Privileged Group: {privilege}", verticalalignment='bottom', horizontalalignment='left',
             color='black', fontsize=14)
    # Fixed: this annotation used to read "Privileged Group:" for the unprivileged group.
    plt.text(x=6, y=-0.1, s=f"Unprivileged Group: {unprivilege}", verticalalignment='bottom', horizontalalignment='left',
             color='black', fontsize=14)

    plt.xlim(0, 10)
    plt.xticks([])
    plt.ylim(-1.1, 1.1)
    plt.yticks([-1, -0.5, 0, 0.5, 1], fontsize=12)

    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    plt.gca().spines['bottom'].set_visible(False)

    plt.title('Statistical Parity Difference for ' + dependent + ' (Protected Attribute: ' + attribute + ')', fontsize=16)
    plt.grid(False)

    plt.savefig(f"{title}-{dependent}-{attribute}_SPD.png", bbox_inches='tight')
    plt.close()

In [18]:
def plot_bar(dependent, proclass, subgroup, value):
    """Save a grouped bar chart of approved vs. rejected counts per subgroup.

    Parameters
    ----------
    dependent : str
        Name of the dependent variable (used in the title and file name).
    proclass : str
        Name of the protected class (x-axis label, title and file name).
    subgroup : sequence of str
        Subgroup names, in plotting order. Any number of subgroups is
        accepted (previously only 2 or 3 worked).
    value : mapping
        Must contain ``'approved_<name>'`` and ``'rejected_<name>'`` for every
        ``<name>`` in ``subgroup``; other keys are ignored.

    Raises
    ------
    KeyError
        If a required ``approved_``/``rejected_`` key is missing from ``value``.
    """
    N = len(subgroup)
    barWidth = 0.3
    # Left edge positions of the two bars in each group.
    r1 = np.arange(N)
    r2 = [x + barWidth for x in r1]

    # Look the counts up before creating the figure so a missing key does not
    # leave an orphaned open figure behind.
    approved_values = [value['approved_' + name] for name in subgroup]
    rejected_values = [value['rejected_' + name] for name in subgroup]

    plt.figure(figsize=(10, 6))
    plt.bar(r1, approved_values, color='blue', width=barWidth, edgecolor='grey', label='Approved')
    plt.bar(r2, rejected_values, color='red', width=barWidth, edgecolor='grey', label='Rejected')
    plt.xlabel(proclass, fontsize=18)
    # Centre each tick between the approved and rejected bars.
    plt.xticks([r + barWidth / 2 for r in range(N)], subgroup, fontsize=16)
    plt.yticks(fontsize=16)
    plt.ylabel('Frequency', fontsize=18)
    plt.title('Frequency of ' + proclass + ' with ' + dependent, fontsize=16)

    plt.legend(fontsize=14)
    plt.tight_layout()
    plt.savefig('Frequency of ' + proclass + ' with ' + dependent + '.png', bbox_inches='tight')
    plt.close()

In [19]:
def preprocess(filename):
    """Load the raw CSV, derive the second dependent variable and recode protected classes.

    Steps:
      1. Build a ``Credit`` score from ``Approved``, ``PriorDefault``,
         ``Income`` and ``CreditScore`` and threshold it at 60 to obtain the
         binary column ``Y``.
      2. Recode three protected classes to integers:
           * ``Age_new``:     0 if ``Age <= 40`` else 1
           * ``Race_new``:    0 for 'White', 1 for 'Black', 2 for anything else
           * ``Citizen_new``: 0 for 'ByBirth', 1 for anything else
         (``Gender`` and ``Married`` are used as-is by the later steps.)
      3. Drop ``Industry``, ``Age``, ``Ethnicity``, ``Citizen`` and ``Credit``.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by ``Y``, ``Age_new``,
        ``Race_new`` and ``Citizen_new``.
    """
    data = pd.read_csv(filename, low_memory=False)
    print('---------------------------------------------')
    print('                Explor the Data              ')
    print('---------------------------------------------')
    print('Data Shape    : ', data.shape)

    # ------------------------------------------------------------------
    # 1. Derive Y (dependent variable 2) from an additive credit score
    # ------------------------------------------------------------------
    data['Credit'] = 0
    data.loc[data['Approved'] == 1, 'Credit'] += 40
    data.loc[data['PriorDefault'] == 0, 'Credit'] += 20
    data.loc[(data['Income'] > 5) & (data['Income'] <= 50), 'Credit'] += 20
    data.loc[data['Income'] > 50, 'Credit'] += 30
    data.loc[data['CreditScore'] > 0, 'Credit'] += 10

    # Y = 1 when the score reaches 60, else 0.
    data['Y'] = (data['Credit'] >= 60).astype('int64')

    # ------------------------------------------------------------------
    # 2. Convert protected-class values to numeric codes
    # ------------------------------------------------------------------
    # Written as "not (<= 40)" so a missing Age still maps to 1, exactly as
    # the element-wise `0 if x <= 40 else 1` rule does.
    data['Age_new'] = (~(data['Age'] <= 40)).astype('int64')
    # Anything that is neither 'White' nor 'Black' (including missing) is 2.
    data['Race_new'] = data['Ethnicity'].map({'White': 0, 'Black': 1}).fillna(2).astype('int64')
    data['Citizen_new'] = (data['Citizen'] != 'ByBirth').astype('int64')

    data_original = data.drop(columns=['Industry', 'Age', 'Ethnicity', 'Citizen', 'Credit'])

    return data_original

In [20]:
def _count_where(data, column, code, var=None, outcome=None):
    """Count rows where ``data[column] == code`` (and, if given, ``data[var] == outcome``)."""
    mask = data[column] == code
    if var is not None:
        mask = mask & (data[var] == outcome)
    return int(mask.sum())


def Step2(data):
    """Print subgroup frequencies and save one frequency bar chart per protected class.

    First prints the size of every subgroup of the five protected classes
    (``Gender``, ``Age_new``, ``Married``, ``Race_new``, ``Citizen_new``).
    Then, for each dependent variable (``'Approved'`` and ``'Y'``), counts the
    rows with value 1 ('approved') and 0 ('rejected') in every subgroup,
    prints those counts and calls ``plot_bar`` once per protected class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five protected-class columns listed above plus
        ``'Approved'`` and ``'Y'``.

    Notes
    -----
    A subgroup that does not occur in ``data`` is reported as 0 instead of
    raising ``TypeError``.
    """
    # ---- overall subgroup sizes (printed before the Step 2 banner) ----
    Female = _count_where(data, 'Gender', 0)
    Male = _count_where(data, 'Gender', 1)
    print('[Sex]    ', Female + Male, '     Female Count:', Female, ',    Male Count:', Male)

    under = _count_where(data, 'Age_new', 0)
    above = _count_where(data, 'Age_new', 1)
    print('[Age]    ', under + above, '   40&Under Count:', under, ', 40Above Count:', above)

    noMarried = _count_where(data, 'Married', 0)
    Married = _count_where(data, 'Married', 1)
    print('[Marital]', noMarried + Married, '  noMarried Count:', noMarried, ', Married Count:', Married)

    White = _count_where(data, 'Race_new', 0)
    Black = _count_where(data, 'Race_new', 1)
    OtherRace = _count_where(data, 'Race_new', 2)
    print('[Race]   ', White + Black + OtherRace, '      White Count:', White, ',   Black Count:', Black,
          ',    Other Count:', OtherRace)

    ByBirth = _count_where(data, 'Citizen_new', 0)
    OtherCitizen = _count_where(data, 'Citizen_new', 1)
    print('[Citizen]', ByBirth + OtherCitizen, '    ByBirth Count:', ByBirth, ', Other Count:', OtherCitizen)

    print('---------------------------------------------')
    print('                    Step 2                   ')
    print('            Statistics for Report            ')
    print('---------------------------------------------')

    # One entry per protected class:
    #   (name used for plots, column, heading printed in the report,
    #    [(code, key suffix used by plot_bar, label printed in the report), ...],
    #    subgroup order on the bar chart)
    groups = [
        ('Sex', 'Gender', 'Sex',
         [(0, 'female', 'Female'), (1, 'male', 'Male')],
         ['male', 'female']),
        ('Age', 'Age_new', 'Age',
         [(0, '40&Under', 'Under40'), (1, 'Above40', 'Above40')],
         ['Above40', '40&Under']),
        ('Familial Status', 'Married', 'Marital',
         [(0, 'Single/Divorced/etc.', 'Nonmarr'), (1, 'Married', 'Married')],
         ['Married', 'Single/Divorced/etc.']),
        ('Race', 'Race_new', 'Race',
         [(0, 'White', 'White'), (1, 'Black', 'Black'), (2, 'Other', 'Other')],
         ['White', 'Black', 'Other']),
        # Report labels fixed: by-birth counts are 'Birth', other-means counts are 'Other'.
        ('Citizenship', 'Citizen_new', 'Citizen ',
         [(0, 'ByBirth', 'Birth'), (1, 'OtherMeans', 'Other')],
         ['ByBirth', 'OtherMeans']),
    ]
    outcomes = [('Approved', 'approved_', 1), ('Rejected', 'rejected_', 0)]
    var_names = {'Approved': 'Dependent Variable 1', 'Y': 'Dependent Variable 2'}

    for var in ['Approved', 'Y']:
        # Cross-tabulate every protected class against this dependent variable.
        tables = {}
        for plot_name, column, _heading, levels, _order in groups:
            tables[plot_name] = {
                prefix + key: _count_where(data, column, code, var, outcome)
                for _label, prefix, outcome in outcomes
                for code, key, _short in levels
            }

        print('----------------')
        print('Dependent Variable:', var)
        print('----------------')
        for idx, (plot_name, _column, heading, levels, _order) in enumerate(groups):
            counts = tables[plot_name]
            print(heading, sum(counts.values()))
            for outcome_label, prefix, _outcome in outcomes:
                parts = []
                for pos, (_code, key, short) in enumerate(levels):
                    lead = '' if pos == 0 else ', '
                    parts.extend([f'{lead}{outcome_label} {short}:', counts[prefix + key]])
                print(*parts)
            is_last = idx == len(groups) - 1
            print('---------------------------------------------' if is_last
                  else '****************************')

        # Step 2.5: frequency of subgroups, one chart per protected class.
        for plot_name, _column, _heading, _levels, order in groups:
            plot_bar(var_names[var], plot_name, order, tables[plot_name])

In [21]:
def Step3(data):
    """Report fairness metrics before and after reweighing on 'Approved'.

    Part 1 ("Original Dataset"): for each dependent variable ('Approved', 'Y')
    and each protected attribute ('Age_new', 'Married'), builds a
    ``BinaryLabelDataset``, computes disparate impact and statistical parity
    difference, plots both (file prefix '3.2') and prints them.

    Part 2 ("Transformed Dataset"): fits ``Reweighing`` on the dataset labelled
    by 'Approved' with both protected attributes, copies the resulting
    instance weights onto a second dataset labelled by 'Y', and repeats the
    metric/plot/print cycle for both (file prefix '3.4_trans_Approved').

    Parameters
    ----------
    data : pandas.DataFrame
        Frame passed to ``BinaryLabelDataset``; must contain 'Approved', 'Y',
        'Age_new' and 'Married'.

    Returns
    -------
    The reweighed dataset produced by ``Reweighing.fit_transform`` for the
    'Approved' label.
    """
    wide_rule = '---------------------------------------------'
    narrow_rule = '----------------'

    # Display names used on the plots.
    dependent = {'Approved': 'Dependent Variable 1', 'Y': 'Dependent Variable 2'}
    attribute = {'Age_new': 'Age', 'Married': 'Familial Status'}
    privilege = {'Age_new': 'Above 40', 'Married': 'Married'}
    unprivilege = {'Age_new': '40 & Under', 'Married': 'Single/Divorced/etc.'}
    protected_attributes = ['Age_new', 'Married']

    def measure_and_plot(dataset, attr_key, dep_label, tag):
        # Metrics for one protected attribute (0 = unprivileged, 1 = privileged),
        # followed by the two plots; returns (DI, SPD) for printing by the caller.
        metric = BinaryLabelDatasetMetric(dataset,
                                          unprivileged_groups=[{attr_key: 0}],
                                          privileged_groups=[{attr_key: 1}])
        di_value = metric.disparate_impact()
        spd_value = metric.statistical_parity_difference()
        plot_DI(di_value, dep_label, attribute[attr_key], privilege[attr_key], unprivilege[attr_key], tag)
        plot_SPD(spd_value, dep_label, attribute[attr_key], privilege[attr_key], unprivilege[attr_key], tag)
        return di_value, spd_value

    # ---------------- Part 1: original dataset ----------------
    print(wide_rule)
    print('                    Step 3                   ')
    print('              Original Dataset               ')
    print(wide_rule)

    for var in ('Approved', 'Y'):
        print(narrow_rule)
        print('Dependent Variable:', var)
        print(narrow_rule)
        for attr_key in protected_attributes:
            single_attr_dataset = BinaryLabelDataset(df=data, label_names=[var],
                                                     protected_attribute_names=[attr_key])
            DI, SPD = measure_and_plot(single_attr_dataset, attr_key, dependent[var], '3.2')
            print(f"Disparate Impact for {attr_key}: {DI:.2f}")
            print(f"Statistical Parity Difference: {SPD:.2f}")

    # ---------------- Part 2 (Step 3.3-3.4): transformed dataset ----------------
    print(wide_rule)
    print('                    Step 3                   ')
    print('             Transformed Dataset             ')
    print(wide_rule)

    privileged_groups = [{name: 1} for name in protected_attributes]
    unprivileged_groups = [{name: 0} for name in protected_attributes]

    main_var, second_var = 'Approved', 'Y'

    # Reweigh on the main dependent variable.
    data_trans = BinaryLabelDataset(df=data, label_names=[main_var],
                                    protected_attribute_names=protected_attributes,
                                    favorable_label=1, unfavorable_label=0)
    reweigher = Reweighing(unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    data_reweighed = reweigher.fit_transform(data_trans)

    # The second dependent variable reuses the weights learned on the main one.
    data_trans_Y = BinaryLabelDataset(df=data, label_names=[second_var],
                                      protected_attribute_names=protected_attributes,
                                      favorable_label=1, unfavorable_label=0)
    data_trans_Y.instance_weights = data_reweighed.instance_weights

    file_tag = f"3.4_trans_{main_var}"
    for shown_var, dataset in ((main_var, data_reweighed), (second_var, data_trans_Y)):
        print(narrow_rule)
        print('Dependent Variable:', shown_var)
        print(narrow_rule)
        for attr_key in protected_attributes:
            DI_reweighed, SPD_reweighed = measure_and_plot(dataset, attr_key, dependent[shown_var], file_tag)
            print(f"Disparate Impact for {attr_key}: {DI_reweighed:.2f}")
            print(f"Statistical Parity Difference for {attr_key}: {SPD_reweighed:.2f}")

    return data_reweighed

In [22]:
def Step4(data_org, data_tsf, var0):
    """Train a random forest on the original and the reweighed data and report fairness.

    For each of the two frames, splits 80/20 (``random_state=42``), fits a
    ``RandomForestClassifier`` to predict ``var0``, replaces ``var0`` in the
    test split with the predictions, writes that frame to
    'data_Credit_S4p.csv' (overwritten on each pass), and then computes, plots
    and prints disparate impact and statistical parity difference for
    'Approved' and 'Y' against 'Age_new' and 'Married'.

    Parameters
    ----------
    data_org : pandas.DataFrame
        Original frame (no instance weights).
    data_tsf : pandas.DataFrame
        Transformed frame; must contain an ``'instance_weights'`` column.
    var0 : str
        Name of the outcome column the classifier predicts.

    Notes
    -----
    Two corrections relative to the previous implementation:

    * Sample weights are split together with the features, so each training
      row is paired with its own weight. Previously the weights were looked up
      with the already-reset training index, which selected the first
      ``n_train`` rows of the full frame instead.
    * ``'instance_weights'`` is no longer part of the feature matrix (and thus
      no longer appears in the exported test CSV). It is passed only as
      ``sample_weight``, since a reweighing weight depends on the label.
    """
    print('---------------------------------------------')
    print('                    Step 4                   ')
    print('                Mitigating Bias              ')
    print('---------------------------------------------')

    # Lookup tables for plotting; identical for both datasets.
    dependent = {'Approved': 'Dependent Variable 1', 'Y': 'Dependent Variable 2'}
    attribute = {'Age_new': 'Age', 'Married': 'Familial Status'}
    privilege = {'Age_new': 'Above 40', 'Married': 'Married'}
    unprivilege = {'Age_new': '40 & Under', 'Married': 'Single/Divorced/etc.'}
    Vars = ['Approved', 'Y']
    protected_attributes = ['Age_new', 'Married']

    for i, data in enumerate([data_org, data_tsf]):
        dataset_type = "Original Dataset" if i == 0 else "Transformed Dataset"
        print('----------------')
        print(dataset_type)
        print('----------------')

        features = data.drop(columns=[var0])
        target = data[var0]

        if i == 1:
            # Keep the weights out of the features and split them alongside
            # the rows so they stay aligned with the sampled training set.
            weights = features['instance_weights']
            features = features.drop(columns=['instance_weights'])
            X_train, X_test, y_train, y_test, w_train, _w_test = train_test_split(
                features, target, weights, test_size=0.2, random_state=42)
        else:
            X_train, X_test, y_train, y_test = train_test_split(
                features, target, test_size=0.2, random_state=42)
            w_train = None

        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_train, y_train, sample_weight=w_train)

        # Re-index the test rows 0..n-1 so the predictions join positionally.
        X_test = X_test.reset_index(drop=True)
        y_pred_df = pd.DataFrame(clf.predict(X_test), columns=[var0])
        X_test_with_pred = X_test.join(y_pred_df)
        X_test_with_pred.to_csv('data_Credit_S4p.csv', index=False)

        for var in Vars:
            print('----------------')
            print('Dependent Variable:', var)
            print('----------------')

            for protected_attribute in protected_attributes:
                binary_dataset = BinaryLabelDataset(df=X_test_with_pred,
                                                    label_names=[var],
                                                    protected_attribute_names=[protected_attribute],
                                                    favorable_label=1, unfavorable_label=0)

                unprivileged_groups = [{protected_attribute: 0}]
                privileged_groups = [{protected_attribute: 1}]

                metric = BinaryLabelDatasetMetric(binary_dataset,
                                                  unprivileged_groups=unprivileged_groups,
                                                  privileged_groups=privileged_groups)

                DI = metric.disparate_impact()
                SPD = metric.statistical_parity_difference()
                plot_DI(DI, dependent[var], attribute[protected_attribute], privilege[protected_attribute],
                        unprivilege[protected_attribute], f"4_{dataset_type}")
                plot_SPD(SPD, dependent[var], attribute[protected_attribute], privilege[protected_attribute],
                         unprivilege[protected_attribute], f"4_{dataset_type}")

                print(f"Disparate Impact for {protected_attribute}: {DI:.2f}")
                print(f"Statistical Parity Difference for {protected_attribute}: {SPD:.2f}")

            print('---------------------------------------------')

In [23]:
'''
#################################################
            Read Data and Run Experiments
#################################################
'''
# Driver: runs the whole pipeline on 'clean_dataset.csv' in order
# (preprocess -> Step2 -> Step3 -> TransformtoDF -> Step4).
if __name__ == "__main__":
    filename = 'clean_dataset.csv'
    # Load the CSV, derive 'Y' and the recoded protected-class columns.
    data_original = preprocess(filename)
    # Print subgroup frequencies and save the frequency bar charts.
    Step2(data_original)
    # data_transformed1: dataset reweighed on 'Approved' (Dependent Variable 1).
    data_transformed1 = Step3(data_original)
    # Step 4 predicts the same outcome variable that Step 3.3 reweighed on.
    data_transformed_df = TransformtoDF(data_transformed1, 'Approved')
    Step4(data_original, data_transformed_df, 'Approved')
    # Disabled variant for the second dependent variable.
    # NOTE(review): `data_transformed2` is not defined anywhere in the visible
    # code; Step3 returns a single dataset, so this cannot be re-enabled as is.
    # data_transformed_df = TransformtoDF(data_transformed2, 'Y')
    # Step4(data_original, data_transformed_df, 'Y')

---------------------------------------------
                Explor the Data              
---------------------------------------------
Data Shape    :  (690, 16)
[Sex]     690      Female Count: 210 ,    Male Count: 480
[Age]     690    40&Under Count: 548 , 40Above Count: 142
[Marital] 690   noMarried Count: 165 , Married Count: 525
[Race]    690       White Count: 408 ,   Black Count: 138 ,    Other Count: 144
[Citizen] 690     ByBirth Count: 625 , Other Count: 65
---------------------------------------------
                    Step 2                   
            Statistics for Report            
---------------------------------------------
----------------
Dependent Variable: Approved
----------------
Sex 690
Approved Female: 98 , Approved Male: 209
Rejected Female: 112 , Rejected Male: 271
****************************
Age 690
Approved Under40: 221 , Approved Above40: 86
Rejected Under40: 327 , Rejected Above40: 56
****************************
Marital 690
Approved Nonmarr: 47