### Swedish Fairness Assessment

This notebook applies various fairness definitions to a 2017 dataset containing outcomes of randomly and algorithmically selected investigations carried out by the Swedish Social Security Agency.

Author: Gabriel Geiger <br>

In [15]:
import pandas as pd
import os

from util.ConfusionMatrix import ConfusionMatrix
from util.bootstrap import bootstrap

# Directory layout, anchored at the directory the notebook is started from.
# Every path keeps a trailing slash so file names can be appended directly.
BASE_PATH = f"{os.getcwd()}/"
RAW_DATA_PATH = f"{BASE_PATH}data/"
PROCESSED_DATA_PATH = f"{BASE_PATH}processed_data/"

### Load Data

Loads data from an Excel file where each sheet is a demographic category. To comply with GDPR, the ISF disclosed two versions of the dataset, one with 2 decimals for the risk score with a small number of rows removed and another with 1 decimal for the risk score with no rows removed. We use the version with 1 decimal because the risk score is not necessary for this analysis. 

In [16]:
"""
Load processed data stored Excel file.
@param filename: The name of the file to load (data_english or data_swedish)
@param path: The path to the file (default is RAW_DATA_PATH)

@return tables: A dictionary where each key is the category (e.g. gender) and each value is a Dataframe of the corresponding table. 
"""
def load_data(filename, path = RAW_DATA_PATH) -> dict[str, pd.DataFrame] :
  """
  Load the tables stored in an Excel workbook.

  Only sheets whose name contains "1" are kept (the 1-decimal versions of the
  tables); every other sheet is skipped. Progress is printed for the source
  path and for each loaded table.

  @param filename: File name of the workbook, appended to `path` as-is
                   (e.g. "data_english.xlsx")
  @param path: Directory prefix, including the trailing slash (default is RAW_DATA_PATH)

  @return tables: A dictionary where each key is a sheet name and each value
                  is the DataFrame parsed from that sheet.
  """
  print("Loading data from {f}... \n".format(f=path))
  tables = {}

  # The context manager closes the workbook handle once all sheets are parsed,
  # also when parsing a sheet raises.
  with pd.ExcelFile(path + filename) as excel :

    for sheet_name in excel.sheet_names :

      # We only want to get the tables with 1 decimal (i.e. no rows removed)
      if "1" not in sheet_name :
        continue

      df = excel.parse(sheet_name)

      tables[sheet_name] = df
      print("Table '{t}' loaded with shape {s}".format(t=sheet_name,s=df.shape))

  return tables

# Load the English workbook; load_data keeps only the sheets whose name contains "1"
raw_tables = load_data("data_english.xlsx")

# Preview the gender table
raw_tables['Gender 1 Decimal']

Loading data from /Users/gsgeiger/Sweden_Fairness_v2/data/... 

Table 'Gender 1 Decimal' loaded with shape (6129, 4)
Table 'Income 1 Decimal' loaded with shape (6129, 4)
Table 'Education 1 Decimal' loaded with shape (6129, 4)
Table 'Foreign 1 Decimal' loaded with shape (6129, 5)


Unnamed: 0,Selection Method,Result,Risk Score,Gender
0,High Risk,Errors Found,0.2,M
1,Follow-up Control,Errors Found,,M
2,High High Risk,Control Investigation,0.1,K
3,Random,No Errors Found,,M
4,Random,No Errors Found,,K
...,...,...,...,...
6124,High Risk,No Errors Found,0.1,K
6125,High Risk,Errors Found,0.2,K
6126,Random,Errors Found,,K
6127,High Risk,Errors Found,0.1,K


### Data Processing 

We conduct a few small processing steps to make our analysis easier. 

- We convert education into a binary categorical variable: low (no university degree) or high (has university degree)
- We make a split between lower and higher income based on the median income. 
- We merge labels into "Errors found" and "No errors found." In practice, there are a few other smaller labels. 
- We merge selection methods 'high risk', 'high high risk' and 'follow-up control' into 'algorithm.' All of these selection methods involved being selected by the algorithm, but 'high high risk' selection defines the highest risk scores and 'follow-up control' are people who were previously selected by the algorithm who are checked again.  

In [17]:

"""
This function runs some basic preprocessing steps on the data. 

@input tables: A dictionary where each key is a category and each value is a dataframe
@return tables: The same as input, but with all dataframe correctly formatted
"""

def process_data(tables:dict) -> dict[str, pd.DataFrame] : 
    """
    Run the basic preprocessing steps on a set of category tables.

    The input is left untouched: every table is copied before any column is
    added or relabelled, so the caller's raw tables stay raw and the function
    can be re-run on them safely.

    Steps applied:
    - "Education 1 Decimal" gains an "Education Level" column: "Low Education"
      when Education <= 3.0, otherwise "High Education".
    - "Income 1 Decimal" gains an "Income Level" column: ">Median Income" when
      Income >= the table's median income, otherwise "<Median Income".
    - In every table, any "Result" other than "No Errors Found" becomes
      "Errors Found".
    - In every table, any "Selection Method" other than "Random" becomes
      "Algorithm".

    NaN values compare as False, so they fall into the "otherwise" branch of
    each rule above.

    @input tables: A dictionary where each key is a category and each value is a dataframe.
                   Must contain the keys "Education 1 Decimal" and "Income 1 Decimal".
    @return tables: A new dictionary with the same keys (same order) holding the processed copies
    """

    # Copy first: the steps below add and overwrite columns
    tables = {category: table.copy() for category, table in tables.items()}

    # Split education into a high and low education
    ed_table = tables["Education 1 Decimal"]
    ed_table["Education Level"] = ed_table["Education"].apply(
        lambda e: "Low Education" if e <= 3.0 else "High Education"
    )

    # Split Income into a high and low income based on the median. 
    income_table = tables["Income 1 Decimal"]
    median_income = income_table["Income"].median()

    income_table["Income Level"] = income_table["Income"].apply(
        lambda i: ">Median Income" if i >= median_income else "<Median Income"
    )

    for table in tables.values() : 

        # Merge labels into "No Errors Found" and "Errors Found"
        table["Result"] = table["Result"].apply(
            lambda r : "No Errors Found" if r == "No Errors Found" else "Errors Found"
        )

        # Merge Selection method into algorithm (flatten high risk and high high risk) and random 
        table["Selection Method"] = table["Selection Method"].apply(
            lambda s : "Random" if s == "Random" else "Algorithm"
        )

    return tables

# Apply the preprocessing steps to every loaded table
tables = process_data(raw_tables) 

# Print Random / Algorithm Split 
tables["Gender 1 Decimal"]["Selection Method"].value_counts()


Selection Method
Algorithm    5082
Random       1047
Name: count, dtype: int64

### Compare model to random sample.

In [18]:
# The gender table is used here; only its Selection Method and Result columns are read
df = tables['Gender 1 Decimal']

# normalize='index' turns each row into shares: the result split within each selection method
pd.crosstab(df['Selection Method'], df['Result'], normalize='index')

Result,Errors Found,No Errors Found
Selection Method,Unnamed: 1_level_1,Unnamed: 2_level_1
Algorithm,0.492719,0.507281
Random,0.296084,0.703916


### Building our Confusion Matrices 

We use our ConfusionMatrix utility class to build ConfusionMatrix objects for each group (e.g. men, women). 

In [19]:
# One ConfusionMatrix per group, organised as {category: {group label: matrix}}.
# Each matrix is built from a column name, the value selecting the group in
# that column, and the table holding that column.
confusion_matrices = {
    "Gender": {
        "Men": ConfusionMatrix("Gender","M",tables["Gender 1 Decimal"]),
        "Women": ConfusionMatrix("Gender","K",tables["Gender 1 Decimal"]),
    },
    "Foreign Background": {
        "Swedish": ConfusionMatrix("Foreign Background",0,tables["Foreign 1 Decimal"]),
        "Foreign": ConfusionMatrix("Foreign Background",1,tables["Foreign 1 Decimal"]),
    },
    "Income": {
        "High Income": ConfusionMatrix("Income Level",">Median Income",tables["Income 1 Decimal"]),
        "Low Income": ConfusionMatrix("Income Level","<Median Income",tables["Income 1 Decimal"]),
    },
    "Education": {
        "High Education": ConfusionMatrix("Education Level","High Education",tables["Education 1 Decimal"]),
        "Low Education": ConfusionMatrix("Education Level","Low Education",tables["Education 1 Decimal"]),
    },
}

# Print every matrix, category by category
for group_matrices in confusion_matrices.values() : 
    for confusion_matrix in group_matrices.values() : 
        print(confusion_matrix)


        Confusion Matrix: Gender M
        TP: 0.0019996927131104418 
        FP: 0.0018437213373497902 
        TN: 0.6981562786626502 
        FN: 0.29800030728688953 
        

        Confusion Matrix: Gender K
        TP: 0.0030009322044720274 
        FP: 0.003258154964855344 
        TN: 0.7037265128375296 
        FN: 0.290014399993143 
        

        Confusion Matrix: Foreign Background 0
        TP: 0.0017597304516462793 
        FP: 0.0021135528729978544 
        TN: 0.7200391379905816 
        FN: 0.27608757868477424 
        

        Confusion Matrix: Foreign Background 1
        TP: 0.0051426646861722335 
        FP: 0.0043222563819130195 
        TN: 0.6408390339406677 
        FN: 0.3496960449912471 
        

        Confusion Matrix: Income Level >Median Income
        TP: 0.0016214125558995055 
        FP: 0.0018182393772013979 
        TN: 0.7256881873580171 
        FN: 0.270872160708882 
        

        Confusion Matrix: Income Level <Median Income
        

### 1. Statistical Parity

Statistical parity checks whether a group is overrepresented in the algorithm's predictions compared to its share in the population. 

In [20]:
def calculate_statistical_parity(table : pd.DataFrame) -> pd.DataFrame : 
    """
    Compare each class's share of the random sample with its share of the
    algorithm-selected sample.

    The category is taken from the table's last column. Rows whose
    "Selection Method" is "Random" form the random sample; all other rows form
    the algorithm sample.

    @param table: DataFrame with a "Selection Method" column and the category as its last column
    @return: DataFrame with one row per class and the columns
             Category, Class, random_share, algo_share (shares are fractions rounded to 3 decimals)
    """
    group_column = table.columns[-1]

    # Split the rows by how the case was selected
    drawn_randomly = table[table["Selection Method"] == "Random"]
    drawn_by_algorithm = table[table["Selection Method"] != "Random"]

    def share_of(group, sample) :
        # Fraction of `sample` that belongs to `group`
        members = sample[sample[group_column] == group]
        return round(len(members) / len(sample),3)

    # Classes in order of first appearance (e.g. men and women)
    groups = table[group_column].unique()

    return pd.DataFrame({
        "Category": [group_column] * len(groups),
        "Class": list(groups),
        "random_share": [share_of(group, drawn_randomly) for group in groups],
        "algo_share": [share_of(group, drawn_by_algorithm) for group in groups],
    })

# Evaluate statistical parity for every demographic table
stat_parity_results = [
    calculate_statistical_parity(table) for table in tables.values()
]

# Stack the per-category frames under one fresh 0..n-1 index
results_df = pd.concat(stat_parity_results, ignore_index=True)

results_df

Unnamed: 0,Category,Class,random_share,algo_share
0,Gender,M,0.439,0.325
1,Gender,K,0.561,0.675
2,Income Level,<Median Income,0.257,0.508
3,Income Level,>Median Income,0.743,0.492
4,Education Level,Low Education,0.527,0.786
5,Education Level,High Education,0.473,0.214
6,Foreign Background,0,0.763,0.569
7,Foreign Background,1,0.237,0.431


### Baseline Mistake Rates  

We want to check whether case workers find more mistakes in certain groups' applications in the random sample. 

In [21]:

# Column-wise accumulator for the result table
results_dict = {"Category": [], "Class": [], "Percentage of mistakes": []}

for table in tables.values() : 
    # The category (e.g. Gender) is the table's last column
    category_name = table.columns[-1]

    # The baseline is measured on the randomly selected cases only
    random_rows = table[table['Selection Method'] == 'Random']

    for class_ in table[category_name].unique() : 

        # Randomly selected cases belonging to this class
        class_random = random_rows[random_rows[category_name] == class_]

        # Anything other than "No Errors Found" counts as a mistake
        has_mistake = class_random['Result'] != "No Errors Found"

        # Stored as a fraction between 0 and 1, despite the column name
        mistake_percentage = round(int(has_mistake.sum()) / len(class_random),2)

        results_dict['Category'].append(category_name)
        results_dict['Class'].append(class_)
        results_dict["Percentage of mistakes"].append(mistake_percentage)

results_df = pd.DataFrame(results_dict)
results_df


Unnamed: 0,Category,Class,Percentage of mistakes
0,Gender,M,0.3
1,Gender,K,0.29
2,Income Level,<Median Income,0.36
3,Income Level,>Median Income,0.27
4,Education Level,Low Education,0.35
5,Education Level,High Education,0.24
6,Foreign Background,0,0.28
7,Foreign Background,1,0.35


### Predictive Parity

Predictive parity checks whether the algorithm's precision (true positives over all positives) is equal across groups. 

In [22]:
# Bootstrap the predictive parity metric separately for every demographic table.
# NOTE(review): if bootstrap draws random resamples, it should be seeded so the
# numbers below are reproducible; this cannot be confirmed from this cell.
pp_results = [
    bootstrap(table, cur_metric="predictive_parity") for table in tables.values()
]

# Stack the per-category frames under one fresh 0..n-1 index
results_df = pd.concat(pp_results, ignore_index=True)

results_df

Bootstraping metrics predictive_parity for category Gender and values ['M' 'K']
Bootstraping metrics predictive_parity for category Income Level and values ['<Median Income' '>Median Income']
Bootstraping metrics predictive_parity for category Education Level and values ['Low Education' 'High Education']
Bootstraping metrics predictive_parity for category Foreign Background and values [0 1]


Unnamed: 0,metric,category_name,group0,group1,conf_low_group0,mean_group0,conf_high_group0,conf_low_group1,mean_group1,conf_high_group1,mean_conf_low,mean_difference,mean_conf_high,sd_difference,se_difference,gt_difference_zero,st_difference_zero
0,predictive_parity,Gender,M,K,0.4997,0.5202,0.5403,0.4655,0.4795,0.4933,0.0161,0.0407,0.0652,0.0149,0.0003,0.9969,0.0031
1,predictive_parity,Income Level,<Median Income,>Median Income,0.4972,0.5133,0.5293,0.4551,0.4715,0.4882,0.0189,0.0417,0.0647,0.0139,0.0003,0.9987,0.0013
2,predictive_parity,Education Level,Low Education,High Education,0.4963,0.5094,0.5227,0.4073,0.4318,0.4564,0.0499,0.0777,0.1052,0.0168,0.0003,1.0,0.0
3,predictive_parity,Foreign Background,0,1,0.4393,0.4542,0.469,0.5254,0.5433,0.561,-0.1124,-0.0891,-0.0657,0.0143,0.0003,0.0,1.0


### False Positive Error Rate 

False positive error rate compares, across groups, the rate at which people who did NOT make a mistake are flagged by the algorithm. 

In [23]:
# Bootstrap the false positive error rate for each demographic table in turn
fp_results = [
    bootstrap(table, cur_metric="false_positive_error_rate")
    for table in tables.values()
]

# Combine the per-category results into a single frame with a clean index
results_df = pd.concat(fp_results, ignore_index=True)

results_df

Bootstraping metrics false_positive_error_rate for category Gender and values ['M' 'K']
Bootstraping metrics false_positive_error_rate for category Income Level and values ['<Median Income' '>Median Income']
Bootstraping metrics false_positive_error_rate for category Education Level and values ['Low Education' 'High Education']
Bootstraping metrics false_positive_error_rate for category Foreign Background and values [0 1]


Unnamed: 0,metric,category_name,group0,group1,conf_low_group0,mean_group0,conf_high_group0,conf_low_group1,mean_group1,conf_high_group1,mean_conf_low,mean_difference,mean_conf_high,sd_difference,se_difference,gt_difference_zero,st_difference_zero
0,false_positive_error_rate,Gender,M,K,0.0024,0.0026,0.0029,0.0043,0.0046,0.005,-0.0025,-0.002,-0.0015,0.0003,0.0,0.0,1.0
1,false_positive_error_rate,Income Level,<Median Income,>Median Income,0.007,0.0079,0.0089,0.0024,0.0025,0.0027,0.0044,0.0054,0.0065,0.0006,0.0,1.0,0.0
2,false_positive_error_rate,Education Level,Low Education,High Education,0.0054,0.0058,0.0063,0.0016,0.0018,0.0019,0.0036,0.0041,0.0046,0.0003,0.0,1.0,0.0
3,false_positive_error_rate,Foreign Background,0,1,0.0028,0.0029,0.0031,0.0059,0.0067,0.0077,-0.0048,-0.0038,-0.0029,0.0006,0.0,0.0,1.0


### False Negative Balance

False negative balance compares, between groups, the rate at which people who DID make a mistake are not flagged by the algorithm.

In [24]:
# Bootstrap the false negative balance for each demographic table in turn
fnr_results = [
    bootstrap(table, cur_metric="false_negative_balance")
    for table in tables.values()
]

# Combine the per-category results into a single frame with a clean index
results_df = pd.concat(fnr_results, ignore_index=True)

results_df

Bootstraping metrics false_negative_balance for category Gender and values ['M' 'K']
Bootstraping metrics false_negative_balance for category Income Level and values ['<Median Income' '>Median Income']
Bootstraping metrics false_negative_balance for category Education Level and values ['Low Education' 'High Education']
Bootstraping metrics false_negative_balance for category Foreign Background and values [0 1]


Unnamed: 0,metric,category_name,group0,group1,conf_low_group0,mean_group0,conf_high_group0,conf_low_group1,mean_group1,conf_high_group1,mean_conf_low,mean_difference,mean_conf_high,sd_difference,se_difference,gt_difference_zero,st_difference_zero
0,false_negative_balance,Gender,M,K,0.9923,0.9933,0.9942,0.9884,0.9897,0.9909,0.0019,0.0036,0.0053,0.001,0.0,0.9994,0.0006
1,false_negative_balance,Income Level,<Median Income,>Median Income,0.9828,0.9854,0.9876,0.9933,0.994,0.9946,-0.0114,-0.0087,-0.0063,0.0016,0.0,0.0,1.0
2,false_negative_balance,Education Level,Low Education,High Education,0.9873,0.9886,0.9898,0.9949,0.9957,0.9963,-0.0087,-0.0071,-0.0056,0.001,0.0,0.0,1.0
3,false_negative_balance,Foreign Background,0,1,0.9929,0.9936,0.9943,0.9826,0.9853,0.9877,0.0058,0.0083,0.0113,0.0017,0.0,1.0,0.0


### Equal Burden

Equal burden compares a group's share of mistakes in the random sample to its share in the algorithm's predictions. 

In [25]:
def calculate_equal_burden(table : pd.DataFrame) -> pd.DataFrame : 
    """
    Compare each class's share of the mistakes found in the random sample with
    its share of the algorithm-selected sample.

    The category is taken from the table's last column. Rows whose
    "Selection Method" is "Random" form the random sample; all other rows form
    the algorithm sample. A mistake is a row whose "Result" is "Errors Found".

    @param table: DataFrame with "Selection Method" and "Result" columns and
                  the category as its last column
    @return: DataFrame with one row per class and the columns
             Category, Class, share_random_mistakes, algo_share
             (shares are percentages rounded to 2 decimals)
    @raise ZeroDivisionError: if the table has rows but the random sample holds
                              no mistakes, or the algorithm sample is empty
    """
    
    category_name = table.columns[-1]

    # Results Dict 
    results = {
        "Category": [],
        "Class": [],
        "share_random_mistakes": [],
        "algo_share":[],
    }
    
    # Get unique classes 
    classes = table[category_name].unique()

    # Split into the random sample & the algorithm sample
    random_sample = table[table["Selection Method"] == "Random"]
    algorithm_sample = table[table["Selection Method"] != "Random"]

    # Mistakes in the random sample: the same for every class, so filtered once
    random_mistakes = random_sample[random_sample["Result"] == "Errors Found"]

    for class_ in classes : 
        # Mistakes in the random sample made by this class
        class_random_mistakes = random_mistakes[random_mistakes[category_name] == class_]

        # Filter algorithmic sample for class 
        class_algorithm = algorithm_sample[algorithm_sample[category_name] == class_]

        # Scale to percent BEFORE rounding: rounding the fraction and then
        # multiplying by 100 reintroduces float noise (e.g. 56.99999999999999)

        # Get the share of that class in randomly detected mistakes  
        share_random_mistakes = round(len(class_random_mistakes) / len(random_mistakes) * 100,2)

        # Get the share of that class in the algorithm 
        share_algorithm = round(len(class_algorithm) / len(algorithm_sample) * 100,2)

        results["Category"].append(category_name)
        results["Class"].append(class_)
        results["share_random_mistakes"].append(share_random_mistakes)
        results["algo_share"].append(share_algorithm)
    
    return pd.DataFrame(results)

# Evaluate equal burden for every demographic table
# (the list keeps the name used by the statistical parity cell)
stat_parity_results = [
    calculate_equal_burden(table) for table in tables.values()
]

# Stack the per-category frames under one fresh 0..n-1 index
results_df = pd.concat(stat_parity_results, ignore_index=True)

# Calculate the difference between a classes share in randomly detected mistakes and share in the algorithm 
results_df["difference"] = results_df["algo_share"] - results_df['share_random_mistakes']

results_df

Unnamed: 0,Category,Class,share_random_mistakes,algo_share,difference
0,Gender,M,44.52,32.49,-12.03
1,Gender,K,55.48,67.51,12.03
2,Income Level,<Median Income,31.61,50.83,19.22
3,Income Level,>Median Income,68.39,49.17,-19.22
4,Education Level,Low Education,61.94,78.57,16.63
5,Education Level,High Education,38.06,21.43,-16.63
6,Foreign Background,0,71.61,56.87,-14.74
7,Foreign Background,1,28.39,43.13,14.74


### ISF Definition 

The fairness metric used by the ISF is similar to equal burden. It compares a group's share of mistakes in the random sample to its share of mistakes in the algorithm sample. 

In [26]:
def calculate_isf(table : pd.DataFrame) -> pd.DataFrame : 
    """
    Compare each class's share of the mistakes found in the random sample with
    its share of the mistakes found in the algorithm-selected sample.

    The category is taken from the table's last column. Rows whose
    "Selection Method" is "Random" form the random sample; all other rows form
    the algorithm sample. A mistake is a row whose "Result" is "Errors Found".

    @param table: DataFrame with "Selection Method" and "Result" columns and
                  the category as its last column
    @return: DataFrame with one row per class and the columns
             Category, Class, share_random_mistakes, share_algo_mistakes
             (shares are percentages rounded to 2 decimals)
    @raise ZeroDivisionError: if the table has rows but either sample holds no mistakes
    """
    
    category_name = table.columns[-1]

    # Results Dict 
    results = {
        "Category": [],
        "Class": [],
        "share_random_mistakes": [],
        "share_algo_mistakes":[],
    }
    
    # Get unique classes 
    classes = table[category_name].unique()

    # Split into the random sample & the algorithm sample
    random_sample = table[table["Selection Method"] == "Random"]
    algorithm_sample = table[table["Selection Method"] != "Random"]

    # Mistakes in each sample: the same for every class, so filtered once
    random_mistakes = random_sample[random_sample["Result"] == "Errors Found"]
    algorithm_mistakes = algorithm_sample[algorithm_sample["Result"] == "Errors Found"]

    for class_ in classes : 
        # Mistakes made by this class in each sample
        class_random_mistakes = random_mistakes[random_mistakes[category_name] == class_]
        class_algorithm_mistakes = algorithm_mistakes[algorithm_mistakes[category_name] == class_]

        # Get share of mistakes for that class in both algorithm and random samples.
        # Scale to percent BEFORE rounding: rounding the fraction and then
        # multiplying by 100 reintroduces float noise (e.g. 56.99999999999999)
        share_random_mistakes = round(len(class_random_mistakes) / len(random_mistakes) * 100,2)
        share_algorithm_mistakes = round(len(class_algorithm_mistakes) / len(algorithm_mistakes) * 100,2)

        results["Category"].append(category_name)
        results["Class"].append(class_)
        results["share_random_mistakes"].append(share_random_mistakes)
        results["share_algo_mistakes"].append(share_algorithm_mistakes)
    
    return pd.DataFrame(results)

# Evaluate the ISF definition for every demographic table
# (the list keeps the name used by the statistical parity cell)
stat_parity_results = [
    calculate_isf(table) for table in tables.values()
]

# Stack the per-category frames under one fresh 0..n-1 index
results_df = pd.concat(stat_parity_results, ignore_index=True)

# Calculate difference between share of mistakes in random sample and algorithmic sample 
results_df['difference'] = results_df['share_algo_mistakes'] - results_df['share_random_mistakes']

results_df

Unnamed: 0,Category,Class,share_random_mistakes,share_algo_mistakes,difference
0,Gender,M,44.52,34.31,-10.21
1,Gender,K,55.48,65.69,10.21
2,Income Level,<Median Income,31.61,52.96,21.35
3,Income Level,>Median Income,68.39,47.04,-21.35
4,Education Level,Low Education,61.94,81.23,19.29
5,Education Level,High Education,38.06,18.77,-19.29
6,Foreign Background,0,71.61,52.44,-19.17
7,Foreign Background,1,28.39,47.56,19.17
