Goal: Predict the probability of getting hired at a company, broken down by race

Evaluation criteria: Prioritise fairness and avoid bias when estimating the probability of getting hired for each race

We will train a simple logistic regression on the Adult income dataset, using
income >50K as a proxy for being hired. Race is excluded from the model inputs
to avoid explicit bias, and the remaining categorical features are one-hot
encoded. We split the data into training and hold-out sets stratified by income,
train the model, and predict probabilities on the hold-out set. We compute the
overall ROC AUC and then calculate per-race ROC AUCs to assess fairness by
measuring the gap between the highest and lowest group AUC. A smaller gap
indicates more equitable performance across races.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Load and prepare data.
# The binary target is 1 when income exceeds 50K (the hiring proxy), else 0.
df = pd.read_csv("./input/adult_reconstruction.csv")
income = df["income"]
if pd.api.types.is_numeric_dtype(income):
    # NOTE(review): assumes a numeric income column is expressed in dollars,
    # so ">50K" means > 50_000 -- confirm against the dataset documentation.
    df["target"] = (income > 50_000).astype(int)
else:
    df["target"] = (income == ">50K").astype(int)
if df["target"].nunique() < 2:
    # Fail early with a clear message instead of an opaque error from fit().
    raise ValueError(
        "Target has a single class; check how the 'income' column is encoded."
    )

X = df.drop(["income", "target"], axis=1)
y = df["target"]
# Keep race aside for the per-group evaluation, but never feed it to the model.
races = X["race"]
X = X.drop("race", axis=1)

# Encode categorical features (one-hot) and pass numeric features through.
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(exclude="object").columns.tolist()
try:
    # scikit-learn >= 1.2 renamed `sparse` to `sparse_output`; the old keyword
    # was removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # Older scikit-learn releases only know the `sparse` keyword.
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
X_cat = ohe.fit_transform(X[cat_cols])
X_num = X[num_cols].values
X_enc = np.hstack([X_num, X_cat])

# Split into train and hold-out sets, stratified on the target so both sets
# keep the same positive rate. `races` is split alongside to stay row-aligned.
X_train, X_test, y_train, y_test, race_train, race_test = train_test_split(
    X_enc, y, races, test_size=0.2, random_state=42, stratify=y
)

# Train logistic regression and score the hold-out set with P(target == 1).
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)
y_prob = clf.predict_proba(X_test)[:, 1]

# Evaluate overall AUC
overall_auc = roc_auc_score(y_test, y_prob)

# Compute per-race AUCs for fairness assessment.
# Work on plain NumPy arrays so the boolean masks are positional and line up
# with `y_prob` regardless of the pandas index left over from the split.
y_true = y_test.to_numpy()
race_values = race_test.to_numpy()
group_aucs = []
for race in pd.unique(race_values):
    mask = race_values == race
    # ROC AUC is undefined when a group contains only one class; skip it.
    if len(np.unique(y_true[mask])) > 1:
        group_aucs.append(roc_auc_score(y_true[mask], y_prob[mask]))

# With no scorable group the gap is undefined; report NaN instead of crashing
# on max()/min() of an empty list.
if group_aucs:
    fairness_gap = max(group_aucs) - min(group_aucs)
else:
    fairness_gap = float("nan")

print(f"Overall ROC AUC: {overall_auc:.4f}")
print(f"Fairness gap (max−min group AUC): {fairness_gap:.4f}")

FileNotFoundError: [Errno 2] No such file or directory: './input/adult_reconstruction.csv'