### Swedish Fairness Assessment v.2

Author: Gabriel Geiger & Justin Braun <br>
Date: 09-02-2024

In [None]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
import numpy as np


BASE_PATH = os.getcwd() + "/"
RAW_DATA_PATH = BASE_PATH + "raw_data/"
PROCESSED_DATA_PATH = BASE_PATH + "processed_data/"

In [4]:
"""
Load processed data stored Excel file.
@param filename:
@param path:
"""
def load_data(filename, path = RAW_DATA_PATH, filter=False):
  print("Loading data from {f}... \n".format(f=path))
  tables = {}

  excel = pd.ExcelFile(path + filename)
  sheet_names = excel.sheet_names

  for sheet_name in sheet_names :

    # We only want to get the tables with 1 decimal (e.g. no rows removed)
    if "1" not in sheet_name :
      continue

    df = excel.parse(sheet_name)

    tables[sheet_name] = df
    print("Table '{t}' loaded with shape {s}".format(t=sheet_name,s=df.shape))

  return tables

raw_tables = load_data("data_english.xlsx")
raw_tables

Loading data from /Users/justin-casimirbraun/GitHub/Sweden_Fairness_v2/raw_data/... 

Table 'Gender 1 Decimal' loaded with shape (6129, 4)
Table 'Income 1 Decimal' loaded with shape (6129, 4)
Table 'Education 1 Decimal' loaded with shape (6129, 4)
Table 'Foreign 1 Decimal' loaded with shape (6129, 5)


{'Gender 1 Decimal':        Selection Method                 Result Risk Score Gender
 0             High Risk           Errors Found        0.2      M
 1     Follow-up Control           Errors Found                 M
 2        High High Risk  Control Investigation        0.1      K
 3                Random        No Errors Found                 M
 4                Random        No Errors Found                 K
 ...                 ...                    ...        ...    ...
 6124          High Risk        No Errors Found        0.1      K
 6125          High Risk           Errors Found        0.2      K
 6126             Random           Errors Found                 K
 6127          High Risk           Errors Found        0.1      K
 6128          High Risk        No Errors Found        0.2      M
 
 [6129 rows x 4 columns],
 'Income 1 Decimal':        Selection Method                    Result Risk Score  Income
 0             High Risk           No Errors Found        0.2  220000


In [6]:
def process_data(tables:dict) : 

    # Split education
    ed_table = tables["Education 1 Decimal"]
    ed_table["Education Level"] = ed_table["Education"].apply(
        lambda e: "Low Education" if e <= 3.0 else "High Education"
    )
    tables["Education 1 Decimal"] = ed_table

    # Split Income
    income_table = tables["Income 1 Decimal"]
    median_income = income_table["Income"].median()

    income_table["Income Level"] = income_table["Income"].apply(
        lambda i: "High Income" if i >= median_income else "Low Income"
    )
    tables["Income 1 Decimal"] = income_table

    # Merge labels
    for key,table in tables.items() : 

        table["Result"] = table["Result"].apply(
            lambda r : "No Errors Found" if r == "No Errors Found" else "Errors Found"
        )

        tables[key] = table
    
    return tables

tables = process_data(raw_tables) 
tables

{'Gender 1 Decimal':        Selection Method           Result Risk Score Gender
 0             High Risk     Errors Found        0.2      M
 1     Follow-up Control     Errors Found                 M
 2        High High Risk     Errors Found        0.1      K
 3                Random  No Errors Found                 M
 4                Random  No Errors Found                 K
 ...                 ...              ...        ...    ...
 6124          High Risk  No Errors Found        0.1      K
 6125          High Risk     Errors Found        0.2      K
 6126             Random     Errors Found                 K
 6127          High Risk     Errors Found        0.1      K
 6128          High Risk  No Errors Found        0.2      M
 
 [6129 rows x 4 columns],
 'Income 1 Decimal':        Selection Method           Result Risk Score  Income Income Level
 0             High Risk  No Errors Found        0.2  220000   Low Income
 1                Random  No Errors Found             310000  Hi

# TO:DO Create 

In [7]:

"""
Runs the first step for the agency's own fairness evaluation 
"""
def run_fk_test_1(table:pd.DataFrame,column) : 

    unique_values = table[column].unique()

    for category in unique_values : 
        algorithm_sample = table[table["Selection Method"] != "Random"]
        random_sample = table[table["Selection Method"] == "Random"]

        n_class_algo = algorithm_sample[algorithm_sample[column] == category]
        n_class_random = random_sample[random_sample[column] == category]

        proportion_random = (len(n_class_random) / len(random_sample)) * 100 
        proportion_algo =  (len(n_class_algo) / len(algorithm_sample)) * 100 

        pass_test = True 
        print("\n--- {col} : {c} ----".format(col=column,c=category))
        print(category,"proportion random:",round(proportion_random,2))
        print(category,"proportion algo:",round(proportion_algo,2))
        print("Percentage point difference:",round(proportion_algo - proportion_random,2))
        print("Over/under representation",round(proportion_algo / proportion_random,2))
        
        if proportion_algo > (2 * proportion_random) : 
            print("PROPORTION DOUBLED")
            pass_test = False 
        
        if proportion_algo < (0.5 * proportion_random) : 
            print("TEST FAILED: PROPORTION HALVED")
            pass_test = False 
        
        if abs(proportion_algo - proportion_random) >= 30 : 
            print("30 PERCENTAGE POINT DIFFERENCE")
            pass_test = False 
        
        if pass_test : 
            print("Test passed.")
        
        else : 
            print("Test failed.")


run_fk_test_1(tables["Gender 1 Decimal"],"Gender")
run_fk_test_1(tables["Foreign 1 Decimal"],"Foreign Background")
run_fk_test_1(tables["Foreign 1 Decimal"],"Born Abroad")
run_fk_test_1(tables["Education 1 Decimal"],"Education Level")
run_fk_test_1(tables["Income 1 Decimal"],"Income Level")


--- Gender : M ----
M proportion random: 43.94
M proportion algo: 32.49
Percentage point difference: -11.45
Over/under representation 0.74
Test passed.

--- Gender : K ----
K proportion random: 56.06
K proportion algo: 67.51
Percentage point difference: 11.45
Over/under representation 1.2
Test passed.

--- Foreign Background : 0 ----
0 proportion random: 76.31
0 proportion algo: 56.87
Percentage point difference: -19.45
Over/under representation 0.75
Test passed.

--- Foreign Background : 1 ----
1 proportion random: 23.69
1 proportion algo: 43.13
Percentage point difference: 19.45
Over/under representation 1.82
Test passed.

--- Born Abroad : 0 ----
0 proportion random: 79.85
0 proportion algo: 62.16
Percentage point difference: -17.69
Over/under representation 0.78
Test passed.

--- Born Abroad : 1 ----
1 proportion random: 20.15
1 proportion algo: 37.84
Percentage point difference: 17.69
Over/under representation 1.88
Test passed.

--- Education Level : Low Education ----
Low Educat

In [8]:
"""
Runs the second step of the agency's own fairness evaluation 
"""
def run_fk_test_2(table:pd.DataFrame,column) : 

    unique_values = table[column].unique()

    for category in unique_values : 
        algorithm_sample = table[table["Selection Method"] != "Random"]

        n_class_algo = len(algorithm_sample[algorithm_sample[column] == category])
        
        errors_algo = algorithm_sample[algorithm_sample["Result"] != "No Errors Found"]
        n_errors_class = len(errors_algo[errors_algo[column] == category])

        # Proportion of class in the algo sample 
        prop_class_algo = round((n_class_algo / len(algorithm_sample)) * 100,2)

        # Proportion of class in errors in the algo sample 
        prop_class_errors = round((n_errors_class / len(errors_algo)) * 100,2)

        pass_test = True 
        print("\n--- {col} : {c} ----".format(col=column,c=category))
        print(category,"proportion in algorithm sample:",prop_class_algo)
        print(category,"proportion of errors in algorithm sample:",prop_class_errors)
        print("Percentage point difference:",round(prop_class_errors - prop_class_algo,2))
        
        if abs(prop_class_algo - prop_class_errors) >= 10 : 
            print("BIGGER THAN 10 PERCENT DIFFERENCE")
            pass_test = False 
        
        if pass_test : 
            print("Test passed.")
        
        else : 
            print("Test failed.")

run_fk_test_2(tables["Gender 1 Decimal"],"Gender")
run_fk_test_2(tables["Foreign 1 Decimal"],"Foreign Background")
run_fk_test_2(tables["Foreign 1 Decimal"],"Born Abroad")
run_fk_test_2(tables["Education 1 Decimal"],"Education Level")
run_fk_test_2(tables["Income 1 Decimal"],"Income Level")


--- Gender : M ----
M proportion in algorithm sample: 32.49
M proportion of errors in algorithm sample: 34.31
Percentage point difference: 1.82
Test passed.

--- Gender : K ----
K proportion in algorithm sample: 67.51
K proportion of errors in algorithm sample: 65.69
Percentage point difference: -1.82
Test passed.

--- Foreign Background : 0 ----
0 proportion in algorithm sample: 56.87
0 proportion of errors in algorithm sample: 52.44
Percentage point difference: -4.43
Test passed.

--- Foreign Background : 1 ----
1 proportion in algorithm sample: 43.13
1 proportion of errors in algorithm sample: 47.56
Percentage point difference: 4.43
Test passed.

--- Born Abroad : 0 ----
0 proportion in algorithm sample: 62.16
0 proportion of errors in algorithm sample: 58.11
Percentage point difference: -4.05
Test passed.

--- Born Abroad : 1 ----
1 proportion in algorithm sample: 37.84
1 proportion of errors in algorithm sample: 41.89
Percentage point difference: 4.05
Test passed.

--- Education 

In [9]:
class ConfusionMatrix() : 
    def __init__(self,category : str, group : str, table : pd.DataFrame) : 
        self.category = category
        self.group = group 
       
        # Margins 
        self.predicted_positive_share = None   
        self.predicted_negative_share = None 
        self.actual_positive_share = None 
        self.actual_negative_share = None 

        # Cells
        self.true_positive_share = None
        self.false_positive_share = None
        self.true_negative_share = None
        self.false_negative_share = None

        self.calculate_margins(table)
        self.construct_matrix(table)
    
    def __str__(self) : 
        output_string = """
        Confusion Matrix: {category} {title}
        TP: {tp} 
        FP: {fp} 
        TN: {tn} 
        FN: {fn} 
        """.format(category = self.category,
                   title=self.group,
                   tp=self.true_positive_share,
                   fp=self.false_positive_share,
                   tn=self.true_negative_share,
                   fn=self.false_negative_share)
        
        return output_string


    def calculate_margins(self,table,total_population_count=850000) : 
        pass 

    def construct_matrix(self,table:pd.DataFrame,total_population_count=850000) : 

        # Split our table into the random sample and non-random sample. 
        # We'll need these to deduce certain numbers 
        algorithm_sample = table[table["Selection Method"] != "Random"]
        random_sample = table[table["Selection Method"] == "Random"]
        
        # Filter our algorithm and random samples for the category (e.g. women) that we are interested in  
        algorithm_filtered = algorithm_sample[algorithm_sample[self.category] == self.group]
        random_filtered = random_sample[random_sample[self.category] == self.group]

        # Calculate the number of x category (e.g. women) in the entire benefit applicant population 
        # First we get the share of that category in the random sample and then we multiply that by the total size of the benefit applicant population
        share_of_class_random = len(random_filtered) / len(random_sample)
        class_count_total = int(share_of_class_random * total_population_count)

        """
        In order to infer some of our missing shares, we need to calculate the margins of our confusion matrix 
        (1) Predicted positive share: The share of predicted positives for category x in the total benefit applicant population
        (2) Predicted negative share: The share of predicted negatives for category x in the total benefit applicant population 
        (3) Actual positive share: The share of actual positives (e.g. true rate of error) for category x 
        (4) Actual negative share: The share of actual negatives (e.g. true rate of error) for category x 
        """

        # Predicted Positive Share:
        predicted_positive_count = len(algorithm_filtered)
        pred_p_share = predicted_positive_count / class_count_total

        # Predicted Negative Share
        pred_n_share = 1 - pred_p_share

        # Actual P Share 
        actual_positive_count = len(random_filtered[random_filtered["Result"] == "Errors Found"])
        class_count_random = len(random_filtered)

        actual_p_share = actual_positive_count / class_count_random

        # Actual N Share
        actual_n_share = 1 - actual_p_share

        """
        Once we have our margins, we derive some of our missing inner cell values. 
        We can learn our true positive and false positive shares from the algorithm sample 
        We can then learn our true and false negative shares from subtracting our true and false positive shares
        from our actual positive and negative shares (ie. bottom margin)
        """

        # True Positive Share 
        true_positive_count = len(algorithm_filtered[algorithm_filtered["Result"] == "Errors Found"])
        true_p_share = true_positive_count / class_count_total 

        # False Positive Share 
        false_positive_count = len(algorithm_filtered[algorithm_filtered["Result"] == "No Errors Found"])
        false_p_share = false_positive_count / class_count_total 

        # True Negative Share 
        true_n_share = actual_n_share - false_p_share

        # False Negative Share
        false_n_share = actual_p_share - true_p_share 

        # Sanity Check: 
        assert round(false_n_share + true_n_share,8) == round(pred_n_share,8)

        self.predicted_positive_share = pred_p_share 
        self.predicted_negative_share = pred_n_share

        self.true_positive_share = true_p_share 
        self.false_positive_share = false_p_share
        self.true_negative_share = true_n_share
        self.false_negative_share = false_n_share



In [10]:
confusion_matrices = {}

confusion_matrices["Gender"] = {
    "Men": ConfusionMatrix("Gender","M",tables["Gender 1 Decimal"]),
    "Women": ConfusionMatrix("Gender","K",tables["Gender 1 Decimal"])
    }

confusion_matrices["Foreign Background"] = {
    "Swedish": ConfusionMatrix("Foreign Background",0,tables["Foreign 1 Decimal"]),
    "Foreign": ConfusionMatrix("Foreign Background",1,tables["Foreign 1 Decimal"])
    }

confusion_matrices["Income"] = {
    "High Income": ConfusionMatrix("Income Level","High Income",tables["Income 1 Decimal"]),
    "Low Income": ConfusionMatrix("Income Level","Low Income",tables["Income 1 Decimal"])
    }

confusion_matrices["Education"] = {
    "High Education": ConfusionMatrix("Education Level","High Education",tables["Education 1 Decimal"]),
    "Low Education": ConfusionMatrix("Education Level","Low Education",tables["Education 1 Decimal"])
    }

for c in confusion_matrices.keys() : 
    for m in confusion_matrices[c].keys() : 
        print(confusion_matrices[c][m])


        Confusion Matrix: Gender M
        TP: 0.0023001925306670025 
        FP: 0.002120782868787271 
        TN: 0.6978792171312127 
        FN: 0.297699807469333 
        

        Confusion Matrix: Gender K
        TP: 0.003451879333210227 
        FP: 0.0037477547046282462 
        TN: 0.7032369130977567 
        FN: 0.28956345286440477 
        

        Confusion Matrix: Foreign Background 0
        TP: 0.002024166669236058 
        FP: 0.002431158291991823 
        TN: 0.7197215325715877 
        FN: 0.2758231424671845 
        

        Confusion Matrix: Foreign Background 1
        TP: 0.00591545518210761 
        FP: 0.004971763759269285 
        TN: 0.6401895265633113 
        FN: 0.34892325449531175 
        

        Confusion Matrix: Income Level High Income
        TP: 0.0018650631556615274 
        FP: 0.0020914672569005755 
        TN: 0.7254149594783179 
        FN: 0.2706285101091199 
        

        Confusion Matrix: Income Level Low Income
        TP: 0.006071

In [12]:
def test_predictive_parity(category:dict) :   
    keys = list(category.keys())
    accuracy_rates = []  

    print("\n======= Predictive Parity {c} =======".format(c = category[keys[0]].category))

    for group_name,conf_matrix in category.items() : 
        accuracy_rate = round(conf_matrix.true_positive_share / conf_matrix.predicted_positive_share,2)
        print(group_name,":",accuracy_rate)
        
        accuracy_rates.append(accuracy_rate) 
    
    print("The model is {i} times less / more accurate for {c1} than {c2}".format(
        i = accuracy_rates[0] / accuracy_rates[1],
        c1 = keys[0],
        c2 = keys[1]
    ))

test_predictive_parity(confusion_matrices["Gender"])
test_predictive_parity(confusion_matrices["Foreign Background"])
test_predictive_parity(confusion_matrices["Income"])
test_predictive_parity(confusion_matrices["Education"])



Men : 0.52
Women : 0.48
The model is 1.0833333333333335 times less / more accurate for Men than Women

Swedish : 0.45
Foreign : 0.54
The model is 0.8333333333333333 times less / more accurate for Swedish than Foreign

High Income : 0.47
Low Income : 0.51
The model is 0.9215686274509803 times less / more accurate for High Income than Low Income

High Education : 0.43
Low Education : 0.51
The model is 0.8431372549019608 times less / more accurate for High Education than Low Education


In [13]:
def test_false_positive_error_rate(category:dict) : 
    keys = list(category.keys())
    error_rates = []  

    print("\n======= False Positive Error Rate Balance {c} =======".format(c = category[keys[0]].category))

    for group_name,conf_matrix in category.items() : 
        error_rate = round(conf_matrix.false_positive_share / (conf_matrix.false_positive_share + conf_matrix.true_negative_share),4)
        
        print(group_name,":",error_rate)

        error_rates.append(error_rate) 
    
    print("An innocent {c1} is {i} times more / less likely to be wrongly flagged than an innocent {c2}".format(
        i = (error_rates[1] / error_rates[0]),
        c1 = keys[1],
        c2 = keys[0]
    ))

test_false_positive_error_rate(confusion_matrices["Gender"])
test_false_positive_error_rate(confusion_matrices["Foreign Background"])
test_false_positive_error_rate(confusion_matrices["Income"])
test_false_positive_error_rate(confusion_matrices["Education"])


Men : 0.003
Women : 0.0053
An innocent Women is 1.7666666666666666 times more / less likely to be wrongly flagged than an innocent Men

Swedish : 0.0034
Foreign : 0.0077
An innocent Foreign is 2.2647058823529416 times more / less likely to be wrongly flagged than an innocent Swedish

High Income : 0.0029
Low Income : 0.0091
An innocent Low Income is 3.137931034482759 times more / less likely to be wrongly flagged than an innocent High Income

High Education : 0.002
Low Education : 0.0067
An innocent Low Education is 3.35 times more / less likely to be wrongly flagged than an innocent High Education


In [15]:
def false_negative_balance(category:dict) : 
    keys = list(category.keys())
    accuracy_rates = []  

    print("\n======= False Negative Balance {c} =======".format(c = category[keys[0]].category))

    fnr_rates = []
    for group_name,conf_matrix in category.items() : 

        fnr = conf_matrix.false_negative_share / (conf_matrix.false_negative_share + conf_matrix.true_positive_share)

        print("FNR",fnr,group_name)

        fnr_rates.append(fnr)
    
    print("The model is {i} times less / more accurate for {c1} than {c2}".format(
        i = fnr_rates[0] / fnr_rates[1],
        c1 = keys[0],
        c2 = keys[1]
    ))

false_negative_balance(confusion_matrices["Gender"])
false_negative_balance(confusion_matrices["Foreign Background"])
false_negative_balance(confusion_matrices["Income"])
false_negative_balance(confusion_matrices["Education"])

    


FNR 0.9923326915644434 Men
FNR 0.9882194583221255 Women
The model is 1.004162267002212 times less / more accurate for Men than Women

FNR 0.9927148235643262 Swedish
FNR 0.9833291717595148 Foreign
The model is 1.0095447710434717 times less / more accurate for Swedish than Foreign

FNR 0.9931555701174307 High Income
FNR 0.9833334034211254 Low Income
The model is 1.00998864338599 times less / more accurate for High Income than Low Income

FNR 0.9950938128773272 High Education
FNR 0.9869509770449663 Low Education
The model is 1.008250496753893 times less / more accurate for High Education than Low Education


In [17]:
def false_discovery_rate(category:dict) : 
    keys = list(category.keys())
    accuracy_rates = []  

    print("\n======= False Discovery Rate {c} =======".format(c = category[keys[0]].category))

    fnr_rates = []
    for group_name,conf_matrix in category.items() : 

        fnr = conf_matrix.true_positive_share / (conf_matrix.false_positive_share + conf_matrix.true_positive_share)

        print("False Discovery Rate",fnr,group_name)

        fnr_rates.append(fnr)
    
    print("The model is {i} times less / more accurate for {c1} than {c2}".format(
        i = fnr_rates[0] / fnr_rates[1],
        c1 = keys[0],
        c2 = keys[1]
    ))

false_discovery_rate(confusion_matrices["Gender"])
false_discovery_rate(confusion_matrices["Foreign Background"])
false_discovery_rate(confusion_matrices["Income"])
false_discovery_rate(confusion_matrices["Education"])



False Discovery Rate 0.5202907328891581 Men
False Discovery Rate 0.4794520547945205 Women
The model is 1.0851778143116726 times less / more accurate for Men than Women

False Discovery Rate 0.4543252595155709 Swedish
False Discovery Rate 0.5433394160583941 Foreign
The model is 0.8361720981176587 times less / more accurate for Swedish than Foreign

False Discovery Rate 0.47138855542216884 High Income
False Discovery Rate 0.5133565621370499 Low Income
The model is 0.9182478421232746 times less / more accurate for High Income than Low Income

False Discovery Rate 0.4315886134067952 High Education
False Discovery Rate 0.5093914350112697 Low Education
The model is 0.8472631923959357 times less / more accurate for High Education than Low Education


In [66]:
def calc_fd_diff(cm_group0, cm_group1):
    fdr_0 = cm_group0.false_positive_share/(cm_group0.false_positive_share+cm_group0.true_positive_share)
    fdr_1 = cm_group1.false_positive_share/(cm_group1.false_positive_share+cm_group1.true_positive_share)
    return fdr_0 - fdr_1

In [67]:
def calc_fn_diff(cm_group0, cm_group1):
    fnr_0 = cm_group0.false_negative_share/(cm_group0.false_negative_share + cm_group0.true_positive_share)
    fnr_1 = cm_group1.false_negative_share/(cm_group1.false_negative_share + cm_group1.true_positive_share)
    return fnr_0 - fnr_1

In [68]:
def calc_fp_diff(cm_group0, cm_group1):
    fpr_0 = cm_group0.false_positive_share/(cm_group0.false_positive_share + cm_group0.true_negative_share)
    fpr_1 = cm_group1.false_positive_share/(cm_group1.false_positive_share + cm_group1.true_negative_share)
    return fpr_0 - fpr_1

In [69]:
def calc_precision_diff(cm_group0, cm_group1):
    precision_0 = cm_group0.true_positive_share/(cm_group0.true_positive_share + cm_group0.false_positive_share)
    precision_1 = cm_group1.true_positive_share/(cm_group1.true_positive_share + cm_group1.false_positive_share)
    return precision_0 - precision_1

In [70]:
def calc_sp_diff(cm_group0, cm_group1):
    ppr_0 = cm_group0.true_positive_share + cm_group0.false_positive_share
    ppr_1 = cm_group1.true_positive_share + cm_group1.false_positive_share
    return ppr_0 - ppr_1

In [78]:
def bootstrap(cur_df, iterations = 10, metrics = [], seed = 42):
    np.random.seed(seed)
    cur_cols = cur_df.columns
    cur_group = cur_cols[-1]
    print(cur_group)
    
    cur_group_values = cur_df[cur_group].unique()
    print(cur_group_values)

    #set up metrics data frame
    groups_list = ['category', 'group0', 'group1']
    metrics_df = pd.DataFrame(columns=groups_list + metrics)
    
    sample_size = cur_df.shape[0]
    #loop over num iterations
    for i in range(iterations):
        #sample with replacement from cur_df
        sample_df = cur_df.sample(n = sample_size, replace = True)
        
        #subset sample_df by cur_group_values
        sample_df_group0 = sample_df[sample_df[cur_group] == cur_group_values[0]]
        sample_df_group1 = sample_df[sample_df[cur_group] == cur_group_values[1]]
        
        #get confusion matrices for both dfs
        cm_group0 = ConfusionMatrix(cur_group, cur_group_values[0], sample_df_group0)
        cm_group1 = ConfusionMatrix(cur_group, cur_group_values[1], sample_df_group1)
        
        new_metrics_row = {}
        new_metrics_row['category'] = cur_group
        new_metrics_row['group0'] = cur_group_values[0]
        new_metrics_row['group1'] = cur_group_values[1]
        
        #calculate metrics
        for cur_metric in metrics:
            if cur_metric == 'false_discovery_rate':
                new_metrics_row[cur_metric] = calc_fd_diff(cm_group0, cm_group1)
            elif cur_metric == 'false_negative_balance':
                new_metrics_row[cur_metric] = calc_fn_diff(cm_group0, cm_group1)
            elif cur_metric == 'false_positive_error_rate':
                new_metrics_row[cur_metric] = calc_fp_diff(cm_group0, cm_group1)
            elif cur_metric == 'test_predictive_parity':
                new_metrics_row[cur_metric] = calc_precision_diff(cm_group0, cm_group1)
            elif cur_metric == 'statistical_parity':
                new_metrics_row[cur_metric] = calc_sp_diff(cm_group0, cm_group1)
        
        #add metrics to metrics data frame
        metrics_df.loc[len(metrics_df)] = new_metrics_row
    
    #set up results df
    results_df = pd.DataFrame(columns = ['metric', 'mean', 'se', 'gt_zero', 'st_zero', 'conf_low', 'conf_high'])
    
    #calculate mean, sd, and p.values for each metric
    for cur_metric in metrics:
        new_results_row = {}
        new_results_row['metric'] = cur_metric
        new_results_row['mean'] = metrics_df.loc[:,cur_metric].mean(skipna = True)
        new_results_row['se'] = metrics_df.loc[:,cur_metric].std(skipna = True)
        new_results_row['gt_zero'] = (metrics_df.loc[:,cur_metric] > 0).mean(skipna = True)
        new_results_row['st_zero'] = (metrics_df.loc[:,cur_metric] < 0).mean(skipna = True)
        new_results_row['conf_low'] = metrics_df.loc[:,cur_metric].quantile(0.05) #CHECK
        new_results_row['conf_high'] = metrics_df.loc[:,cur_metric].quantile(0.95) #CHECL
        results_df.loc[len(results_df)] = new_results_row
    
    
    return cur_group, metrics_df, results_df

mymetrics = ['false_discovery_rate', 'false_negative_balance', 'false_positive_error_rate', 
             'test_predictive_parity', 'statistical_parity']

n_samples = 10000
for key in tables:
    cur_group, metrics_df, results_df = bootstrap(tables[key], n_samples, mymetrics, 12345)
    metrics_df.to_excel('bootstrapping_results/resampling_results_' + cur_group + '.xlsx', index = False)
    results_df.to_excel('bootstrapping_results/summary_stats_' + cur_group + '.xlsx', index = False)
    
    

Gender
['M' 'K']
Income Level
['Low Income' 'High Income']
Education Level
['Low Education' 'High Education']
Foreign Background
[0 1]
