In [1]:
import itertools
import pandas as pd
import numpy as np

def create_applicant(counter, school, gpa, degree, location, gender, vs, wa, dis, race):
    """
    Build one applicant record as a flat dictionary.

    Parameters:
    - counter: Value stored under 'Applicant ID'.
    - school: Value stored under 'School Name'.
    - gpa: Value stored under 'GPA'.
    - degree: Value stored under 'Degree'.
    - location: Value stored under 'Location'.
    - gender: Value stored under 'Gender'.
    - vs: Value stored under 'Veteran status'.
    - wa: Value stored under 'Work authorization'.
    - dis: Value stored under 'Disability'.
    - race: Value stored under 'Ethnicity'.

    Returns:
    - dict: The ten fields above followed by three work-experience slots
      (Role/Start/End 1..3), every one of which is set to the string 'N/A'.
    """
    record = {
        'Applicant ID': counter,
        'School Name': school,
        'GPA': gpa,
        'Degree': degree,
        'Location': location,
        'Gender': gender,
        'Veteran status': vs,
        'Work authorization': wa,
        'Disability': dis,
        'Ethnicity': race,
    }
    # Work-history columns are always present but never populated.
    experience_columns = (
        'Role 1', 'Start 1', 'End 1',
        'Role 2', 'Start 2', 'End 2',
        'Role 3', 'Start 3', 'End 3',
    )
    record.update(dict.fromkeys(experience_columns, 'N/A'))
    return record

def generate_applicants():
    """
    Generate the full synthetic applicant pool.

    Every combination of GPA, school, location, degree, ethnicity flag,
    disability flag and gender is emitted exactly once. Veteran status and
    work authorization are held constant at 0 for all applicants.

    Returns:
    - list[dict]: One record per combination, built by `create_applicant`,
      with 'Applicant ID' numbered consecutively starting at 1.
    """
    schools = ['State Providence College', 'Providence State University', 'Providence School', 'Providence University']
    locations = ['Miami', 'Chicago', 'Boston', 'Providence']
    degrees = ['Phd', 'Bachelors', 'Masters']
    genders = ['M', 'F']
    # GPA grid from 2.0 to 4.0 in steps of 0.2. Rounding to one decimal removes
    # binary floating-point artefacts (2.0 + 7 * 0.2 evaluates to
    # 3.4000000000000004), which would otherwise be written verbatim to the CSV.
    gpa_increments = [round(2.0 + i * 0.2, 1) for i in range(11)]

    # The positional order here must match the unpacking order in the
    # comprehension below: gpa, school, vs, wa, location, degree, race, dis, gender.
    attributes = itertools.product(
        gpa_increments, schools, [0], [0], locations, degrees, [0, 1], [0, 1], genders
    )

    return [
        create_applicant(counter, school, gpa, degree, location, gender, vs, wa, dis, race)
        for counter, (gpa, school, vs, wa, location, degree, race, dis, gender)
        in enumerate(attributes, start=1)
    ]


In [2]:
# Build the synthetic applicant pool as a DataFrame and write it to
# applicants.csv (without the index column).
applicants = pd.DataFrame(generate_applicants())
applicants.to_csv('applicants.csv', index=False)

## We then submit this csv to the resume scorer API

In [3]:
# Load the resume scorer output and attach its 'score' column to the applicant
# pool as 'Resume score', then save the combined table as candidates.csv.
# NOTE(review): the scores are matched to applicants by row index, so this
# assumes resume_score_result.csv has one row per applicant in the same order
# as applicants.csv — confirm against the API output.
resume_score_result = pd.read_csv('resume_score_result.csv')
applicants = applicants.assign(**{'Resume score': resume_score_result['score']})
applicants.to_csv('candidates.csv', index=False)

## We then submit this csv to the candidate prediction API

In [4]:
# Load the candidate prediction output and attach its 'prediction' column to a
# separate copy of the scored applicants, leaving `applicants` itself unchanged.
# NOTE(review): predictions are matched to applicants by row index — assumes
# candidate_eval_result.csv keeps the row order of candidates.csv; confirm.
candidate_prediction_result = pd.read_csv('candidate_eval_result.csv')
candidates = applicants.assign(prediction=candidate_prediction_result['prediction'])

In [5]:
#define evaluation metrics
def spd(sensitive_attribute, dataset, predicted_labels, majority_class, minority_class):
    """
    Calculate the Statistical Parity Difference (SPD) between minority and majority classes.

    SPD = mean(predictions | minority) - mean(predictions | majority)

    Parameters:
    - sensitive_attribute (pd.Series or str): The sensitive attribute values, one per
      row of `predicted_labels`, or the name of the column in `dataset` that holds them.
    - dataset (pd.DataFrame): Only consulted when `sensitive_attribute` is a column name.
    - predicted_labels (pd.Series): Predicted labels or scores for the outcome variable.
    - majority_class: Value representing the majority class in the sensitive attribute.
    - minority_class: Value representing the minority class in the sensitive attribute.

    Returns:
    - spd (float): Mean prediction of the minority class minus that of the majority class.
    """
    # Accept a column name as well as a Series of values. A bare string would
    # otherwise be compared to the class labels as a scalar, yielding a single
    # bool that cannot be used as a row mask.
    if isinstance(sensitive_attribute, str):
        sensitive_attribute = dataset[sensitive_attribute]

    minority_mean = np.mean(predicted_labels[sensitive_attribute == minority_class])
    majority_mean = np.mean(predicted_labels[sensitive_attribute == majority_class])
    return minority_mean - majority_mean

def di(sensitive_attribute, dataset, predicted_labels, majority_class, minority_class):
    """
    Calculate the Disparate Impact (DI) between minority and majority classes.

    DI = mean(predictions | minority) / mean(predictions | majority)

    Parameters:
    - sensitive_attribute (pd.Series or str): The sensitive attribute values, one per
      row of `predicted_labels`, or the name of the column in `dataset` that holds them.
    - dataset (pd.DataFrame): Only consulted when `sensitive_attribute` is a column name.
    - predicted_labels (pd.Series): Predicted labels or scores for the outcome variable.
    - majority_class: Value representing the majority class in the sensitive attribute.
    - minority_class: Value representing the minority class in the sensitive attribute.

    Returns:
    - di (float): Mean prediction of the minority class divided by that of the
      majority class. There is no guard against a zero majority-class mean.
    """
    # Accept a column name as well as a Series of values. A bare string would
    # otherwise be compared to the class labels as a scalar, yielding a single
    # bool that cannot be used as a row mask.
    if isinstance(sensitive_attribute, str):
        sensitive_attribute = dataset[sensitive_attribute]

    minority_mean = np.mean(predicted_labels[sensitive_attribute == minority_class])
    majority_mean = np.mean(predicted_labels[sensitive_attribute == majority_class])
    return minority_mean / majority_mean

In [6]:
# Fairness metrics for the resume scorer model on gender
# ('M' is treated as the majority class, 'F' as the minority class).
rs_spd_gender = spd(applicants['Gender'], applicants, applicants['Resume score'], 'M', 'F')
rs_di_gender = di(applicants['Gender'], applicants, applicants['Resume score'], 'M', 'F')
# Fairness metrics for the resume scorer model on disability
# (0 is treated as the majority class, 1 as the minority class).
rs_spd_disability = spd(applicants['Disability'], applicants, applicants['Resume score'], 0, 1)
rs_di_disability = di(applicants['Disability'], applicants, applicants['Resume score'], 0, 1)
# Legacy aliases: these names say "race" but have always held the disability
# metrics. They are kept so that cells reading the old names keep working.
rs_spd_race = rs_spd_disability
rs_di_race = rs_di_disability

In [7]:
# Report the resume scorer fairness metrics, one per line.
# rs_spd_race / rs_di_race hold the disability metrics despite their names.
print(
    f'The statistical parity difference on gender for the resume scorer model is {rs_spd_gender}',
    f'The disparate impact on gender for the resume scorer model is {rs_di_gender}',
    f'The statistical parity difference on disability for the resume scorer model is {rs_spd_race}',
    f'The disparate impact on disability for the resume scorer model is {rs_di_race}',
    sep='\n',
)

The statistical parity difference on gender for the resume scorer model is 0.15704071969696898
The disparate impact on gender for the resume scorer model is 1.0320354440454274
The statistical parity difference on disability for the resume scorer model is 0.09684185606060591
The disparate impact on disability for the resume scorer model is 1.0196346475257734


In [8]:
# Fairness metrics for the candidates evaluation model on gender
# ('M' is treated as the majority class, 'F' as the minority class).
ce_spd_gender = spd(candidates['Gender'], candidates, candidates['prediction'], 'M', 'F')
ce_di_gender = di(candidates['Gender'], candidates, candidates['prediction'], 'M', 'F')
# Fairness metrics for the candidates evaluation model on disability
# (0 is treated as the majority class, 1 as the minority class).
ce_spd_disability = spd(candidates['Disability'], candidates, candidates['prediction'], 0, 1)
ce_di_disability = di(candidates['Disability'], candidates, candidates['prediction'], 0, 1)
# Legacy aliases: these names say "race" but have always held the disability
# metrics. They are kept so that cells reading the old names keep working.
ce_spd_race = ce_spd_disability
ce_di_race = ce_di_disability

In [9]:
# Report the candidates evaluation fairness metrics, one per line.
# ce_spd_race / ce_di_race hold the disability metrics despite their names.
print(
    f'The statistical parity difference on gender for the candidates evaluation model is {ce_spd_gender}',
    f'The disparate impact on gender for the candidates evaluation model is {ce_di_gender}',
    f'The statistical parity difference on disability for the candidates evaluation model is {ce_spd_race}',
    f'The disparate impact on disability for the candidates evaluation model is {ce_di_race}',
    sep='\n',
)

The statistical parity difference on gender for the candidates evaluation model is -0.2277462121212121
The disparate impact on gender for the candidates evaluation model is 0.6179507545671168
The statistical parity difference on disability for the candidates evaluation model is -0.007102272727272707
The disparate impact on disability for the candidates evaluation model is 0.9853801169590644
