Goal: Predict the probability by country of origin of getting hired in a company

Evaluation Criteria: Prioritise accuracy to determine the probability by country of origin of getting hired in a company 

Description: We train a logistic regression on the reconstructed Adult Income dataset to
predict high-income status (as a proxy for “getting hired”), using One-Hot
encoding for categoricals and scaling for numericals. We evaluate via 5-fold
cross-validation using ROC AUC, then fit on all data and compute the average
predicted probability of high income for each native‐country. This gives
per‐country probabilities of “getting hired.”

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Load data
df = pd.read_csv("./input/adult_reconstruction.csv")

# Define target and features
y = (df["income"].str.strip() == ">50K").astype(int)
X = df.drop(columns=["income"])

# Identify numeric and categorical columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Preprocessing pipelines
num_pipeline = Pipeline(
    [("impute", SimpleImputer(strategy="median")), ("scale", StandardScaler())]
)
cat_pipeline = Pipeline(
    [
        ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)
preprocessor = ColumnTransformer(
    [("num", num_pipeline, num_cols), ("cat", cat_pipeline, cat_cols)]
)

# Full pipeline with classifier
pipeline = Pipeline([("pre", preprocessor), ("clf", LogisticRegression(max_iter=1000))])

# 5-fold CV ROC AUC
scores = cross_val_score(pipeline, X, y, cv=5, scoring="roc_auc", n_jobs=-1)
print(f"Mean ROC AUC: {scores.mean():.4f} +/- {scores.std():.4f}")

# Fit on all data and predict probabilities
pipeline.fit(X, y)
probs = pipeline.predict_proba(X)[:, 1]
df_probs = pd.DataFrame(
    {"native-country": df["native-country"].str.strip(), "proba": probs}
)
country_proba = (
    df_probs.groupby("native-country")["proba"].mean().sort_values(ascending=False)
)
print("\nAverage predicted hire probability by country:")
print(country_proba)

AttributeError: Can only use .str accessor with string values!