Goal: Predict the probability by country of origin of getting hired in a company

Evaluation Criteria: Prioritise fairness and be unbiased to determine the probability by country of origin of getting hired in a company

Description: We train a logistic regression on the Adult dataset with one-hot encoding for
categorical features to predict high income as a proxy for being “hired.” We
split off 20% for validation, fit the model, then compute ROC-AUC on the hold-
out set as our evaluation metric. Finally, we group the predicted probabilities
by native country to estimate hiring probability by country.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Load data
df = pd.read_csv("./input/adult_reconstruction.csv")

# Drop rows with missing values
df = df.replace("?", np.nan).dropna()

# Binary target: income >50K as 1, else 0
df["target"] = (df["income"] == ">50K").astype(int)

# Features and target
X = df.drop(["income", "target"], axis=1)
y = df["target"]

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Split into train/validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Preprocessing: one‐hot encode categoricals
preprocessor = ColumnTransformer(
    [("ohe", OneHotEncoder(sparse=False, handle_unknown="ignore"), cat_cols)],
    remainder="passthrough",
)

# Model pipeline
model = Pipeline([("pre", preprocessor), ("clf", LogisticRegression(max_iter=1000))])

# Train
model.fit(X_train, y_train)

# Predict
y_pred_proba = model.predict_proba(X_val)[:, 1]

# Evaluation metric
auc = roc_auc_score(y_val, y_pred_proba)
print(f"Validation ROC-AUC: {auc:.4f}")

# Probability by country
val_df = X_val.copy()
val_df["pred_proba"] = y_pred_proba
country_probs = (
    val_df.groupby("native-country")["pred_proba"].mean().sort_values(ascending=False)
)
print("\nEstimated hiring probability by country:")
print(country_probs)

NameError: name 'Pipeline' is not defined