Goal: Predict the probability by education of getting hired in a company

Evaluation Criteria: Prioritise fairness and be unbiased to determine the probability by education of getting hired in a company

Description: We train a logistic regression on the adult_reconstruction dataset, one-
hot‐encoding categorical features and scaling numerics. We split into
train/validation by stratified hold‐out. We then compute predicted hiring
probabilities for each education group and measure demographic parity difference
(max minus min average predicted probability) as our fairness metric. Finally,
we print that metric.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Load data
df = pd.read_csv("./input/adult_reconstruction.csv")

# Define target and features
df = df.dropna(subset=["income"], how="any")
y = (df["income"].str.strip() == ">50K").astype(int)
X = df.drop(columns=["income", "education-num"])  # drop redundant

# Split into train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Identify columns
num_cols = ["age", "hours-per-week", "capital-gain", "capital-loss"]
cat_cols = [c for c in X.columns if c not in num_cols]

# Build preprocessing pipeline
num_pipeline = Pipeline(
    [("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())]
)
cat_pipeline = Pipeline(
    [
        ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    [("num", num_pipeline, num_cols), ("cat", cat_pipeline, cat_cols)]
)

# Build full pipeline
model = Pipeline([("pre", preprocessor), ("clf", LogisticRegression(max_iter=1000))])

# Train
model.fit(X_train, y_train)

# Predict probabilities
probs = model.predict_proba(X_val)[:, 1]
X_val2 = X_val.copy()
X_val2["prob"] = probs

# Compute demographic parity difference across education levels
group_means = X_val2.groupby("education")["prob"].mean()
dp_diff = group_means.max() - group_means.min()

print(f"Demographic parity difference across education groups: {dp_diff:.4f}")

AttributeError: Can only use .str accessor with string values!