In [42]:
import numpy as np
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

In [43]:
schools = ["Brown University", "Providence University", "Providence College", "University of Rhode Island", "Providence State University", "State Providence College", "Providence School", "Bryant University", "Boston College", "Boston University", "Northeastern University", "Columbia University", "Harvard University", "MIT", "UC Berkeley", "UCLA", "UCSB"]
GPA = np.linspace(1, 4, 100)
Degree = ["Bachelors", "Masters", "PhD"]
Location = ["Boston", "Providence", "Los Angeles", "Philadelphia", "Miami", "Chicago", "New York", "San Francisco", "Las Vegas", "Washington"]
Gender = ["M", "F", "N/A"]
veteran = [1, 0, "N/A"]
work_auth = [1, 0]
disable = [0, 1, "N/A"]
ethnicity = [0, 1, 2, 3, 4]
role = ["Software Engineer", "Machine Learning Engineer", "Data Scientist", "Hardware Engineer", "Data Analyst", "Research Scientist", "Quantitative Analyst", "Software Tester"]

In [44]:
def diff_month(d1, d2):
    return (d2.year - d1.year) * 12 + d2.month - d1.month

def random_date(start, end):
    date1 = datetime.datetime.strptime(start, "%m/%Y")
    date2 = datetime.datetime.strptime(end, "%m/%Y")

    total_months = diff_month(date1, date2)
    random_months = np.random.randint(0, total_months)
    rand_date = date1 + relativedelta(months=random_months)
    
    rand_date = rand_date.strftime("%m/%Y")
    
    month, year = rand_date.split("/")
    return f"{month}/{year[2:]}"

def generate_a_sample(applicant_id: int) -> dict:
    fields = {
        "Applicant ID" : applicant_id,
        "School Name": np.random.choice(schools),
        "GPA": np.round(np.random.uniform(1.0, 4.0, 1), 2)[0],
        "Degree": np.random.choice(Degree),
        "Location": np.random.choice(Location),
        "Gender": np.random.choice(Gender),
        "Veteran status": np.random.choice(veteran),
        "Work authorization": np.random.choice(work_auth),
        "Disability": np.random.choice(disable),
        "Ethnicity": np.random.choice(ethnicity),
        "Role 1": np.random.choice(role),
        "Start 1": random_date("12/2018", "12/2020"),
        "End 1": random_date("01/2021", "12/2023"),
        "Role 2": np.random.choice(role),
        "Start 2": random_date("12/2016", "12/2018"),
        "End 2": random_date("01/2019", "12/2021"),
        "Role 3": np.random.choice(role),
        "Start 3": random_date("12/2013", "12/2016"),
        "End 3": random_date("01/2017", "12/2019")
    }
    
    return fields

In [49]:
np.random.seed(1951)
samples = []
for i in range(3000):
    samples.append(generate_a_sample(i))

samples = pd.DataFrame(samples)

In [51]:
samples.to_csv("resume_without_bias/resume.csv", index=False)
samples

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,0,University of Rhode Island,2.34,Bachelors,Las Vegas,,1,1,1,4,Quantitative Analyst,06/19,11/22,Machine Learning Engineer,10/17,02/21,Hardware Engineer,03/15,10/19
1,1,Bryant University,3.71,PhD,San Francisco,M,,0,,0,Software Tester,04/19,03/23,Hardware Engineer,05/17,05/20,Data Scientist,12/13,06/17
2,2,Brown University,3.06,Masters,New York,,0,0,,1,Hardware Engineer,10/20,05/22,Software Engineer,04/17,04/21,Research Scientist,07/14,06/18
3,3,University of Rhode Island,1.04,Bachelors,Chicago,M,1,1,,0,Data Scientist,04/20,11/23,Software Tester,07/18,06/20,Data Scientist,04/15,06/19
4,4,University of Rhode Island,1.35,Masters,Philadelphia,,1,0,0,0,Research Scientist,05/20,10/22,Data Scientist,05/18,06/21,Research Scientist,04/15,05/18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,State Providence College,3.90,Masters,Boston,M,0,1,0,1,Machine Learning Engineer,06/19,02/22,Software Engineer,04/18,08/19,Hardware Engineer,06/16,04/19
2996,2996,Boston University,2.65,Bachelors,Philadelphia,M,,0,0,2,Machine Learning Engineer,03/19,10/21,Hardware Engineer,03/18,12/20,Data Scientist,10/15,03/17
2997,2997,Columbia University,3.75,PhD,Philadelphia,,1,0,1,1,Software Engineer,05/19,11/21,Software Engineer,03/18,09/20,Software Tester,11/15,10/19
2998,2998,Providence School,1.77,Bachelors,Las Vegas,M,1,0,,1,Quantitative Analyst,06/19,05/23,Machine Learning Engineer,08/18,11/19,Quantitative Analyst,03/15,10/18


### Modify the generated data to add bias into data

In [53]:
original = pd.read_csv("resume_without_bias/resume.csv")

for col in original.columns:
    if original[col].dtype != 'object':
        original[col] = original[col].astype('object')
original.fillna("N/A", inplace=True)
original

Unnamed: 0,Applicant ID,School Name,GPA,Degree,Location,Gender,Veteran status,Work authorization,Disability,Ethnicity,Role 1,Start 1,End 1,Role 2,Start 2,End 2,Role 3,Start 3,End 3
0,0,University of Rhode Island,2.34,Bachelors,Las Vegas,,1.0,1,1.0,4,Quantitative Analyst,06/19,11/22,Machine Learning Engineer,10/17,02/21,Hardware Engineer,03/15,10/19
1,1,Bryant University,3.71,PhD,San Francisco,M,,0,,0,Software Tester,04/19,03/23,Hardware Engineer,05/17,05/20,Data Scientist,12/13,06/17
2,2,Brown University,3.06,Masters,New York,,0.0,0,,1,Hardware Engineer,10/20,05/22,Software Engineer,04/17,04/21,Research Scientist,07/14,06/18
3,3,University of Rhode Island,1.04,Bachelors,Chicago,M,1.0,1,,0,Data Scientist,04/20,11/23,Software Tester,07/18,06/20,Data Scientist,04/15,06/19
4,4,University of Rhode Island,1.35,Masters,Philadelphia,,1.0,0,0.0,0,Research Scientist,05/20,10/22,Data Scientist,05/18,06/21,Research Scientist,04/15,05/18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2995,State Providence College,3.9,Masters,Boston,M,0.0,1,0.0,1,Machine Learning Engineer,06/19,02/22,Software Engineer,04/18,08/19,Hardware Engineer,06/16,04/19
2996,2996,Boston University,2.65,Bachelors,Philadelphia,M,,0,0.0,2,Machine Learning Engineer,03/19,10/21,Hardware Engineer,03/18,12/20,Data Scientist,10/15,03/17
2997,2997,Columbia University,3.75,PhD,Philadelphia,,1.0,0,1.0,1,Software Engineer,05/19,11/21,Software Engineer,03/18,09/20,Software Tester,11/15,10/19
2998,2998,Providence School,1.77,Bachelors,Las Vegas,M,1.0,0,,1,Quantitative Analyst,06/19,05/23,Machine Learning Engineer,08/18,11/19,Quantitative Analyst,03/15,10/18


### Does the Resume-Scorer prefer applicants of certain gender?

In [54]:
original["Gender"].value_counts()

Gender
F      1042
N/A     993
M       965
Name: count, dtype: int64

In [66]:
# Modify original to make 90% of individuals male
np.random.seed(1951)
non_male_indices = list(original[original["Gender"] != "M"].index)
indices_to_modify = np.random.choice(non_male_indices, 1735, replace=False)

male_bias = original.copy()
male_bias.loc[indices_to_modify, "Gender"] = "M"

print(male_bias["Gender"].value_counts())
male_bias.to_csv("resume_with_bias/gender_bias/male.csv", index=False)

Gender
M      2700
N/A     151
F       149
Name: count, dtype: int64


In [69]:
# Modify original to make 90% of individuals female
np.random.seed(1951)
non_female_indices = list(original[original["Gender"] != "F"].index)
indices_to_modify = np.random.choice(non_female_indices, 1658, replace=False)

female_bias = original.copy()
female_bias.loc[indices_to_modify, "Gender"] = "F"

print(female_bias["Gender"].value_counts())
female_bias.to_csv("resume_with_bias/gender_bias/female.csv", index=False)

Gender
F      2700
M       151
N/A     149
Name: count, dtype: int64


In [73]:
# Modify original to make 90% of individuals not declaring gender
np.random.seed(1951)
nan_indices = list(original[original["Gender"] != "N/A"].index)
indices_to_modify = np.random.choice(nan_indices, 1707, replace=False)

nan_bias = original.copy()
nan_bias.loc[indices_to_modify, "Gender"] = "N/A"

print(nan_bias["Gender"].value_counts())
nan_bias.to_csv("resume_with_bias/gender_bias/na.csv", index=False)

Gender
N/A    2700
F       159
M       141
Name: count, dtype: int64


### Does the Resume-Scorer prefer applicants who do not need work authorization?

In [76]:
original["Work authorization"].value_counts()

Work authorization
0    1537
1    1463
Name: count, dtype: int64

In [77]:
# Modify original to make 90% of applicants require work authorization
np.random.seed(1951)
work_1_indices = list(original[original["Work authorization"] != 1].index)
indices_to_modify = np.random.choice(work_1_indices, 1237, replace=False)

work_1_bias = original.copy()
work_1_bias.loc[indices_to_modify, "Work authorization"] = 1

print(work_1_bias["Work authorization"].value_counts())
work_1_bias.to_csv("resume_with_bias/authorization_bias/auth_1.csv", index=False)

Work authorization
1    2700
0     300
Name: count, dtype: int64


In [78]:
# Modify original to make 90% of applicants not require work authorization
np.random.seed(1951)
work_0_indices = list(original[original["Work authorization"] != 0].index)
indices_to_modify = np.random.choice(work_0_indices, 1163, replace=False)

work_0_bias = original.copy()
work_0_bias.loc[indices_to_modify, "Work authorization"] = 0

print(work_0_bias["Work authorization"].value_counts())
work_0_bias.to_csv("resume_with_bias/authorization_bias/auth_0.csv", index=False)

Work authorization
0    2700
1     300
Name: count, dtype: int64


### Does the Resume-Scorer prefer applicants in certain ethnicity groups?

In [82]:
original["Ethnicity"].value_counts()

Ethnicity
1    623
2    622
3    596
4    592
0    567
Name: count, dtype: int64

In [85]:
# Modify original to make 90% of applicants White
np.random.seed(1951)
eth_0_indices = list(original[original["Ethnicity"] != 0].index)
indices_to_modify = np.random.choice(eth_0_indices, 2133, replace=False)
eth_0_bias = original.copy()
eth_0_bias.loc[indices_to_modify, "Ethnicity"] = 0

print(eth_0_bias["Ethnicity"].value_counts())
eth_0_bias.to_csv("resume_with_bias/ethnicity_bias/eth_0.csv", index=False)

Ethnicity
0    2700
3      97
4      75
1      74
2      54
Name: count, dtype: int64


In [86]:
# Modify original to make 90% of applicants Black
np.random.seed(1951)
eth_1_indices = list(original[original["Ethnicity"] != 1].index)
indices_to_modify = np.random.choice(eth_1_indices, 2077, replace=False)
eth_1_bias = original.copy()
eth_1_bias.loc[indices_to_modify, "Ethnicity"] = 1

print(eth_1_bias["Ethnicity"].value_counts())
eth_1_bias.to_csv("resume_with_bias/ethnicity_bias/eth_1.csv", index=False)

Ethnicity
1    2700
4      83
2      81
3      70
0      66
Name: count, dtype: int64


In [87]:
# Modify original to make 90% of applicants Native American
np.random.seed(1951)
eth_2_indices = list(original[original["Ethnicity"] != 2].index)
indices_to_modify = np.random.choice(eth_2_indices, 2078, replace=False)
eth_2_bias = original.copy()
eth_2_bias.loc[indices_to_modify, "Ethnicity"] = 2

print(eth_2_bias["Ethnicity"].value_counts())
eth_2_bias.to_csv("resume_with_bias/ethnicity_bias/eth_2.csv", index=False)

Ethnicity
2    2700
3      85
4      80
1      76
0      59
Name: count, dtype: int64


In [88]:
# Modify original to make 90% of applicants Asian American & Pacific Islander
np.random.seed(1951)
eth_3_indices = list(original[original["Ethnicity"] != 3].index)
indices_to_modify = np.random.choice(eth_3_indices, 2104, replace=False)
eth_3_bias = original.copy()
eth_3_bias.loc[indices_to_modify, "Ethnicity"] = 3

print(eth_3_bias["Ethnicity"].value_counts())
eth_3_bias.to_csv("resume_with_bias/ethnicity_bias/eth_3.csv", index=False)

Ethnicity
3    2700
1      77
4      77
2      76
0      70
Name: count, dtype: int64


In [89]:
# Modify original to make 90% applicants with ethnicity "Other"
np.random.seed(1951)
eth_4_indices = list(original[original["Ethnicity"] != 4].index)
indices_to_modify = np.random.choice(eth_4_indices, 2108, replace=False)
eth_4_bias = original.copy()
eth_4_bias.loc[indices_to_modify, "Ethnicity"] = 4

print(eth_4_bias["Ethnicity"].value_counts())
eth_4_bias.to_csv("resume_with_bias/ethnicity_bias/eth_4.csv", index=False)

Ethnicity
4    2700
1      79
2      77
3      72
0      72
Name: count, dtype: int64


### Uneven Distribution in Veteran Status

In [95]:
original["Veteran status"].value_counts()

Veteran status
N/A    1066
0.0     971
1.0     963
Name: count, dtype: int64

In [96]:
# Modify original to make 90% applicants veterans
np.random.seed(1951)
no_vet_indices = list(original[original["Veteran status"] != 1].index)
indices_to_modify = np.random.choice(no_vet_indices, 1737, replace=False)
vet_bias = original.copy()
vet_bias.loc[indices_to_modify, "Veteran status"] = 1

print(vet_bias["Veteran status"].value_counts())
vet_bias.to_csv("resume_with_bias/veteran_bias/vet_1.csv", index=False)

Veteran status
1.0    2700
N/A     156
0.0     144
Name: count, dtype: int64


In [97]:
# Modify original to make 90% applicants non-veterans
np.random.seed(1951)
vet_indices = list(original[original["Veteran status"] != 0].index)
indices_to_modify = np.random.choice(vet_indices, 1729, replace=False)
nvet_bias = original.copy()
nvet_bias.loc[indices_to_modify, "Veteran status"] = 0

print(nvet_bias["Veteran status"].value_counts())
nvet_bias.to_csv("resume_with_bias/veteran_bias/vet_0.csv", index=False)

Veteran status
0      2700
N/A     171
1.0     129
Name: count, dtype: int64


In [98]:
# Modify original to make 90% applicants not providing veteran status information
np.random.seed(1951)
na_indices = list(original[original["Veteran status"] != "N/A"].index)
indices_to_modify = np.random.choice(na_indices, 1634, replace=False)
na_vet_bias = original.copy()
na_vet_bias.loc[indices_to_modify, "Veteran status"] = "N/A"

print(na_vet_bias["Veteran status"].value_counts())
na_vet_bias.to_csv("resume_with_bias/veteran_bias/vet_na.csv", index=False)

Veteran status
N/A    2700
1.0     171
0.0     129
Name: count, dtype: int64


### Make Disability Status Distribution Uneven

In [99]:
original["Disability"].value_counts()

Disability
N/A    1030
0.0    1004
1.0     966
Name: count, dtype: int64

In [100]:
# Modify original to make 90% of applicants have disability
np.random.seed(1951)
dis_indices = list(original[original["Disability"] != 1].index)
indices_to_modify = np.random.choice(dis_indices, 1734, replace=False)
dis_bias = original.copy()
dis_bias.loc[indices_to_modify, "Disability"] = 1

print(dis_bias["Disability"].value_counts())
dis_bias.to_csv("resume_with_bias/disability_bias/dis_1.csv", index=False)

Disability
1.0    2700
N/A     150
0.0     150
Name: count, dtype: int64


In [101]:
# Modify original to make 90% of applicants not have disability
np.random.seed(1951)
ndis_indices = list(original[original["Disability"] != 0].index)
indices_to_modify = np.random.choice(ndis_indices, 1696, replace=False)
ndis_bias = original.copy()
ndis_bias.loc[indices_to_modify, "Disability"] = 0

print(ndis_bias["Disability"].value_counts())
ndis_bias.to_csv("resume_with_bias/disability_bias/dis_0.csv", index=False)

Disability
0      2700
N/A     162
1.0     138
Name: count, dtype: int64


In [102]:
# Modify original to make 90% of applicants not reporting disability status
np.random.seed(1951)
na_dis_indices = list(original[original["Disability"] != "N/A"].index)
indices_to_modify = np.random.choice(na_dis_indices, 1670, replace=False)
na_dis_bias = original.copy()
na_dis_bias.loc[indices_to_modify, "Disability"] = "N/A"

print(na_dis_bias["Disability"].value_counts())
na_dis_bias.to_csv("resume_with_bias/disability_bias/dis_na.csv", index=False)

Disability
N/A    2700
1.0     155
0.0     145
Name: count, dtype: int64
