## Setup
- Setup: paths, environment, imports

In [1]:
# Subgroup predictive performance (test set): Accuracy / F1 / ROC-AUC
# Exports: ../results/subgroup_performance_summary.csv and .tex


import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# ---- Paths / imports (match your other notebooks) ----
# The notebook is expected to run from a sub-folder of the repository, so the
# parent of the working directory is treated as the repo root and its "src"
# folder is put first on the import path (only once, so re-running is safe).
repo_root = Path.cwd().resolve().parents[0]
src_path = repo_root / "src"
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

# Output folder for the exported tables (relative to the working directory).
os.makedirs("../results", exist_ok=True)

# Set local data path (DO NOT COMMIT dataset)
# NOTE(review): the fallback below is a machine-specific absolute path; it is
# only used when DATA_PATH is not already set in the environment, so export
# DATA_PATH before launching the kernel on any other machine.
data_path = os.environ.setdefault("DATA_PATH", "/Users/munaugas/Desktop/Thesis/adult_reconstruction.csv")
print("Using DATA_PATH:", data_path)

Using DATA_PATH: /Users/munaugas/Desktop/Thesis/adult_reconstruction.csv


## Pipeline and model training
- Load data, preprocess, split, and train models

In [2]:
# ---- Pipeline: load, engineer, encode, split, train ----
from thesis_pipeline.preprocessing.clean_data import load_data
from thesis_pipeline.preprocessing.feature_engineering import engineer_features_and_target
from thesis_pipeline.preprocessing.encode_features import encode_features
from thesis_pipeline.splitting.split_data import stratified_train_val_test_split

from thesis_pipeline.model_training.train_rf import train_random_forest
from thesis_pipeline.model_training.train_gbdt import train_gbdt
from thesis_pipeline.model_training.train_xgboost import train_xgboost

# Seed passed to the train/val/test split so the partition is reproducible.
RANDOM_STATE = 42

# Load the raw table, then derive the feature frame, the target and a copy of
# the data that still carries the human-readable columns (used later for
# subgroup metadata).
df_raw = load_data()
X_raw, y, df_with_target = engineer_features_and_target(df_raw)

X, encoder, categorical_cols, numeric_cols = encode_features(X_raw)

splits = stratified_train_val_test_split(X, y, random_state=RANDOM_STATE)
X_train, y_train = splits.X_train, splits.y_train
X_val, y_val     = splits.X_val, splits.y_val
X_test, y_test   = splits.X_test, splits.y_test

# Each trainer returns the fitted model first; the remaining return values are
# not needed in this notebook and are discarded.
rf_model, *_   = train_random_forest(X_train, y_train, X_val, y_val, X_test, y_test)
gbdt_model, *_ = train_gbdt(X_train, y_train, X_val, y_val, X_test, y_test)

# XGBoost is treated as optional: if training fails for any reason the
# notebook continues with the other two models and reports the error.
try:
    xgb_model, *_ = train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test)
except Exception as e:
    xgb_model = None
    print("WARNING: XGBoost training failed. Skipping XGBoost.\n", repr(e))

best_models = {
    "RandomForest": rf_model,
    "GBDT": gbdt_model,
}
if xgb_model is not None:
    best_models["XGBoost"] = xgb_model

print("Models available:", list(best_models.keys()))

# Index.equals checks length and element order in one call, so a mismatch
# always surfaces as this AssertionError (an element-wise `==` would raise a
# ValueError on differing lengths before the assert message could be shown).
assert X_test.index.equals(y_test.index), "X_test and y_test indices must align."


Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Models available: ['RandomForest', 'GBDT', 'XGBoost']


## Subgroup definitions
- Create subgroup metadata (gender, race_binary, age_group)

In [3]:
# Subgroup metadata for the test rows (matches SHAP/LIME notebooks)

# pd.cut uses right-closed intervals by default, so these edges give
# (17, 30], (30, 45], (45, 60], (60, 90]; ages outside that range end up as
# NaN in `age_group` and are dropped by the later `.dropna()` calls.
age_bins   = [17, 30, 45, 60, 90]
age_labels = ["18-30", "31-45", "46-60", "61+"]

meta_test = (
    df_with_target.loc[X_test.index, ["gender", "race", "age"]]
    .copy()
    .assign(
        # Collapse race to a two-level attribute.
        race_binary=lambda frame: np.where(frame["race"] == "White", "White", "Non-White"),
        # Bucket age into the labelled bands defined above.
        age_group=lambda frame: pd.cut(frame["age"], bins=age_bins, labels=age_labels),
    )
)

display(meta_test.head())

Unnamed: 0,gender,race,age,race_binary,age_group
21460,Male,White,21,White,18-30
35060,Male,Amer-Indian-Eskimo,51,Non-White,46-60
1633,Male,White,34,White,31-45
22480,Female,White,26,White,18-30
47104,Female,White,28,White,18-30


## Subgroup performance evaluation
- Helpers: probability extraction and metric computation
- Compute subgroup performance table (model × subgroup)

In [4]:
# Metric helpers + subgroup evaluation

def get_proba_pos_class(model, X_df: pd.DataFrame) -> np.ndarray:
    """Return a score in [0, 1] for the positive class of every row in ``X_df``.

    Resolution order:

    1. ``model.predict_proba`` -- column 1 of an ``(n, 2+)`` result; a 1-D or
       single-column result is returned as is (assumed to already be P(y=1)).
    2. ``model.decision_function`` -- raw scores squashed with a sigmoid.
    3. ``model.predict`` -- hard predictions cast to float (last resort).

    Outputs of the model are coerced with ``np.asarray`` first, so estimators
    or wrappers that return plain Python lists are handled as well.

    Parameters
    ----------
    model : fitted estimator exposing at least one of the methods above.
    X_df : feature rows to score.

    Returns
    -------
    np.ndarray
        1-D array with one score per row.
    """
    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(X_df))
        # shape (n, 2) expected; (n,) or (n, 1) are handled defensively
        if proba.ndim == 1:
            return proba
        if proba.shape[1] == 1:
            return proba[:, 0]
        return proba[:, 1]

    # fallback: decision function -> sigmoid
    if hasattr(model, "decision_function"):
        scores = np.asarray(model.decision_function(X_df), dtype=float).reshape(-1)
        # Numerically stable sigmoid: exp() is only ever evaluated on
        # non-positive arguments, so very large |score| cannot overflow.
        result = np.empty_like(scores)
        non_negative = scores >= 0
        result[non_negative] = 1.0 / (1.0 + np.exp(-scores[non_negative]))
        exp_neg = np.exp(scores[~non_negative])
        result[~non_negative] = exp_neg / (1.0 + exp_neg)
        return result

    # last resort: hard predictions as "probabilities"
    return np.asarray(model.predict(X_df)).astype(float)


def compute_metrics(model, X_sub: pd.DataFrame, y_sub: pd.Series) -> dict:
    """Score ``model`` on one slice of the data.

    Accuracy and F1 are computed from ``model.predict``; ROC-AUC is computed
    from the positive-class scores returned by ``get_proba_pos_class``.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``. ``"roc_auc"`` is NaN
        when the slice contains fewer than two distinct true labels, because
        the metric is undefined in that case. F1 is reported as 0 whenever it
        would otherwise involve a division by zero.
    """
    labels = y_sub.astype(int).values
    hard_preds = model.predict(X_sub).astype(int)

    def _auc_or_nan() -> float:
        # ROC-AUC requires both classes to be present
        if np.unique(labels).size < 2:
            return np.nan
        pos_scores = get_proba_pos_class(model, X_sub)
        return float(roc_auc_score(labels, pos_scores))

    return {
        "accuracy": float(accuracy_score(labels, hard_preds)),
        "f1": float(f1_score(labels, hard_preds, zero_division=0)),
        "roc_auc": _auc_or_nan(),
    }


group_cols = ["gender", "race_binary", "age_group"]

# One record per (model, subgroup attribute, subgroup level).
perf_records = []
for model_name, model in best_models.items():
    print(f"\n=== Subgroup performance: {model_name} ===")

    for group_col in group_cols:
        attribute = meta_test[group_col]

        # Missing values are dropped so no "nan" subgroup is created.
        for level in attribute.dropna().unique():
            member_idx = meta_test.index[attribute == level]
            X_grp, y_grp = X_test.loc[member_idx], y_test.loc[member_idx]

            # Size and share of positive labels in this subgroup.
            size = int(len(y_grp))
            positives = int((y_grp == 1).sum())
            positive_share = float(positives / size) if size > 0 else np.nan

            scores = compute_metrics(model, X_grp, y_grp)

            perf_records.append({
                "model": model_name,
                "group_col": group_col,
                "group_value": str(level),
                "n_samples": size,
                "pos_rate": positive_share,
                **{key: scores[key] for key in ("accuracy", "f1", "roc_auc")},
            })

# Assemble the table; `group_col` becomes an ordered categorical so sorting
# follows gender -> race_binary -> age_group instead of alphabetical order.
subgroup_perf_df = (
    pd.DataFrame(perf_records)
    .assign(
        group_col=lambda frame: pd.Categorical(
            frame["group_col"],
            categories=["gender", "race_binary", "age_group"],
            ordered=True,
        )
    )
    .sort_values(["group_col", "group_value", "model"])
    .reset_index(drop=True)
)

display(subgroup_perf_df.head(20))




=== Subgroup performance: RandomForest ===

=== Subgroup performance: GBDT ===

=== Subgroup performance: XGBoost ===


Unnamed: 0,model,group_col,group_value,n_samples,pos_rate,accuracy,f1,roc_auc
0,GBDT,gender,Female,2457,0.11396,0.938543,0.68866,0.944492
1,RandomForest,gender,Female,2457,0.11396,0.932438,0.651261,0.941531
2,XGBoost,gender,Female,2457,0.11396,0.93895,0.691358,0.94474
3,GBDT,gender,Male,4973,0.299216,0.84818,0.728905,0.912442
4,RandomForest,gender,Male,4973,0.299216,0.837523,0.699851,0.902507
5,XGBoost,gender,Male,4973,0.299216,0.847979,0.728253,0.914598
6,GBDT,race_binary,Non-White,1080,0.14537,0.919444,0.688172,0.94124
7,RandomForest,race_binary,Non-White,1080,0.14537,0.915741,0.67148,0.937631
8,XGBoost,race_binary,Non-White,1080,0.14537,0.92037,0.695035,0.942206
9,GBDT,race_binary,White,6350,0.253701,0.871024,0.726179,0.925513


## Visualising subgroup performance
- Plot subgroup performance (grouped bar charts)

In [5]:
# Plot subgroup performance (grouped bars) and save figures

# `os` and `numpy` are already imported in the setup cell; matplotlib is first
# imported here.
import os
import numpy as np
import matplotlib.pyplot as plt

# Figures are written next to the results folder, relative to the working
# directory; create the folder if it does not exist yet.
os.makedirs("../figures", exist_ok=True)

def plot_grouped_bars(df, metric: str, outpath: str, title: str):
    """Draw one grouped bar chart of ``metric`` and write it to ``outpath``.

    The x-axis has one slot per subgroup (labelled ``group_col=group_value``);
    inside each slot there is one bar per model. The figure is saved at
    300 dpi and closed afterwards, so nothing is rendered inline.

    Parameters
    ----------
    df : table with ``model``, ``group_col``, ``group_value`` and ``metric``
        columns.
    metric : name of the column to plot.
    outpath : destination image file.
    title : figure title.
    """
    # Readable subgroup label, then a stable order: group_col, group_value, model.
    ordered = df.assign(
        subgroup=df["group_col"].astype(str) + "=" + df["group_value"].astype(str)
    ).sort_values(["group_col", "group_value", "model"])

    subgroup_labels = ordered["subgroup"].unique().tolist()
    model_names = ordered["model"].unique().tolist()
    n_models = len(model_names)

    slots = np.arange(len(subgroup_labels))
    bar_width = 0.8 / max(n_models, 1)

    fig, ax = plt.subplots(figsize=(12, 5))

    for offset, name in enumerate(model_names):
        per_model = ordered[ordered["model"] == name].set_index("subgroup")
        # Align to the common subgroup order; a subgroup without a value for
        # this model (or with an undefined metric) stays NaN and is passed to
        # the bar call unchanged.
        heights = per_model.reindex(subgroup_labels)[metric].values.astype(float)
        ax.bar(slots + offset * bar_width, heights, width=bar_width, label=name)

    # Centre each tick under its cluster of bars.
    ax.set_xticks(slots + (n_models - 1) * bar_width / 2)
    ax.set_xticklabels(subgroup_labels, rotation=30, ha="right")
    ax.set_ylabel(metric)
    ax.set_title(title)
    ax.legend()
    fig.tight_layout()
    fig.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved: {outpath}")

# One figure per metric: (metric column, output file, figure title).
# ROC-AUC may contain NaNs if a subgroup has only one class.
figure_specs = [
    (
        "accuracy",
        "../figures/subgroup_performance_accuracy.png",
        "Subgroup performance (test set): Accuracy",
    ),
    (
        "f1",
        "../figures/subgroup_performance_f1.png",
        "Subgroup performance (test set): F1-score",
    ),
    (
        "roc_auc",
        "../figures/subgroup_performance_roc_auc.png",
        "Subgroup performance (test set): ROC-AUC",
    ),
]

for metric_name, figure_path, figure_title in figure_specs:
    plot_grouped_bars(
        subgroup_perf_df,
        metric=metric_name,
        outpath=figure_path,
        title=figure_title,
    )


Saved: ../figures/subgroup_performance_accuracy.png
Saved: ../figures/subgroup_performance_f1.png
Saved: ../figures/subgroup_performance_roc_auc.png


## Subgroup error analysis
- Error-type breakdown at threshold 0.5 (TP/FP/TN/FN + FPR/FNR/precision/recall)

In [6]:
# --- Error analysis: subgroup FP/FN breakdown (CSV + LaTeX) ---

# `os`, `numpy` and `pandas` repeat imports from earlier cells;
# `confusion_matrix` is first imported here.
import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# Make sure the folder for the CSV/LaTeX exports exists.
os.makedirs("../results", exist_ok=True)

# Same value as in the pipeline cell; not referenced again in the visible part
# of this cell.
RANDOM_STATE = 42
# Subgroups with fewer test rows than this are skipped in the breakdown.
MIN_GROUP_SIZE = 50
# Probability cut-off handed to predict_labels() for every model.
THRESHOLD = 0.5

# Build subgroup metadata aligned to X_test.
# This is only a fallback: when the notebook already has `meta_test` (it is
# created in the subgroup-definitions cell) the block below is skipped.
if "meta_test" not in globals():
    # `df_with_target` is the frame produced by engineer_features_and_target()
    # in the pipeline cell; it is the only frame in this notebook that still
    # carries the raw gender/race/age columns.
    meta_test = df_with_target.loc[X_test.index, ["gender", "race", "age"]].copy()
    meta_test["race_binary"] = np.where(meta_test["race"] == "White", "White", "Non-White")
    age_bins   = [17, 30, 45, 60, 90]
    age_labels = ["18-30", "31-45", "46-60", "61+"]
    meta_test["age_group"] = pd.cut(meta_test["age"], bins=age_bins, labels=age_labels)

# Index.equals checks length and order in one call, so a mismatch always
# surfaces as the AssertionError below rather than a ValueError from an
# element-wise comparison of differently sized indexes.
assert X_test.index.equals(y_test.index), "X_test and y_test indices must align."
assert meta_test.index.equals(X_test.index), "meta_test must align with X_test indices."

group_cols = ["gender", "race_binary", "age_group"]

def predict_labels(model, X: pd.DataFrame, threshold: float = 0.5) -> np.ndarray:
    """Return hard 0/1 labels for every row of ``X``.

    When the model exposes ``predict_proba`` the positive-class probability is
    thresholded (``>= threshold`` -> 1) so that every model is cut at the same
    point; otherwise ``model.predict`` is used and cast to int.

    The probability output is coerced with ``np.asarray`` and its shape is
    handled the same way as in ``get_proba_pos_class``: column 1 of an
    ``(n, 2+)`` array, or the values themselves for a 1-D / single-column
    result (assumed to already be P(y=1)).

    Parameters
    ----------
    model : fitted estimator with ``predict_proba`` or ``predict``.
    X : feature rows to label.
    threshold : probability cut-off for the positive class.

    Returns
    -------
    np.ndarray
        1-D integer array of predicted labels.
    """
    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(X))
        if proba.ndim == 1:
            p1 = proba
        elif proba.shape[1] == 1:
            p1 = proba[:, 0]
        else:
            p1 = proba[:, 1]
        return (p1 >= threshold).astype(int)
    # No probabilities available: fall back to the model's own hard labels,
    # cast to int so both branches return the same dtype kind.
    return np.asarray(model.predict(X)).astype(int)

def _error_breakdown_row(model_name, group_col, group_value, n_samples, y_true, y_pred) -> dict:
    """Build one output row: confusion counts plus FPR/FNR/precision/recall.

    A rate whose denominator is zero is reported as NaN.
    """
    # labels=[0, 1] pins the matrix to 2x2 even if a class is absent.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    actual_neg = fp + tn
    actual_pos = fn + tp
    predicted_pos = tp + fp

    return {
        "model": model_name,
        "group_col": group_col,
        "group_value": group_value,
        "n_samples": n_samples,
        "tp": int(tp), "fp": int(fp), "tn": int(tn), "fn": int(fn),
        "fpr": float(fp / actual_neg if actual_neg > 0 else np.nan),
        "fnr": float(fn / actual_pos if actual_pos > 0 else np.nan),
        "precision": float(tp / predicted_pos if predicted_pos > 0 else np.nan),
        "recall": float(tp / actual_pos if actual_pos > 0 else np.nan),
    }


error_rows = []

for model_name, model in best_models.items():
    # Reference row: the whole test set, not split by subgroup.
    error_rows.append(
        _error_breakdown_row(
            model_name,
            "overall",
            "all",
            int(len(X_test)),
            y_test.loc[X_test.index].values,
            predict_labels(model, X_test, threshold=THRESHOLD),
        )
    )

    # One row per subgroup level that is large enough.
    for group_col in group_cols:
        attribute = meta_test[group_col]
        for level in attribute.dropna().unique():
            in_group = attribute == level
            group_size = int(in_group.sum())
            if group_size >= MIN_GROUP_SIZE:
                X_grp = X_test.loc[in_group]
                y_grp = y_test.loc[in_group].values
                error_rows.append(
                    _error_breakdown_row(
                        model_name,
                        group_col,
                        str(level),
                        group_size,
                        y_grp,
                        predict_labels(model, X_grp, threshold=THRESHOLD),
                    )
                )

subgroup_error_df = pd.DataFrame(error_rows)

# Save CSV (full precision)
out_csv = "../results/subgroup_error_breakdown.csv"
subgroup_error_df.to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")

# Save LaTeX: work on a copy so the in-memory table keeps full precision,
# and round only the rate columns.
rate_cols = ["fpr", "fnr", "precision", "recall"]
df_tex = subgroup_error_df.copy()
df_tex[rate_cols] = df_tex[rate_cols].round(3)

out_tex = "../results/subgroup_error_breakdown.tex"
latex_options = dict(
    longtable=True,
    index=False,
    float_format="%.3f",
    caption=(
        "Subgroup error-type breakdown at a fixed decision threshold (0.5): "
        "TP/FP/TN/FN counts and derived rates (FPR, FNR, precision, recall) for each model."
    ),
    label="tab:subgroup-error-breakdown",
    escape=True,
)
df_tex.to_latex(out_tex, **latex_options)
print(f"Saved: {out_tex}")

# Last expression of the cell: preview of the unrounded table.
subgroup_error_df.head(10)


Saved: ../results/subgroup_error_breakdown.csv
Saved: ../results/subgroup_error_breakdown.tex


Unnamed: 0,model,group_col,group_value,n_samples,tp,fp,tn,fn,fpr,fnr,precision,recall
0,RandomForest,overall,all,7430,1097,303,5359,671,0.053515,0.379525,0.783571,0.620475
1,RandomForest,gender,Male,4973,942,262,3223,546,0.075179,0.366935,0.782392,0.633065
2,RandomForest,gender,Female,2457,155,41,2136,125,0.018833,0.446429,0.790816,0.553571
3,RandomForest,race_binary,White,6350,1004,276,4463,607,0.05824,0.376785,0.784375,0.623215
4,RandomForest,race_binary,Non-White,1080,93,27,896,64,0.029252,0.407643,0.775,0.592357
5,RandomForest,age_group,18-30,2317,64,18,2147,88,0.008314,0.578947,0.780488,0.421053
6,RandomForest,age_group,46-60,1595,416,104,869,206,0.106886,0.33119,0.8,0.66881
7,RandomForest,age_group,31-45,2866,556,162,1832,316,0.081244,0.362385,0.774373,0.637615
8,RandomForest,age_group,61+,537,61,19,396,61,0.045783,0.5,0.7625,0.5
9,GBDT,overall,all,7430,1182,320,5342,586,0.056517,0.331448,0.786951,0.668552


## Export results
- Export subgroup performance summary (CSV + LaTeX)

In [7]:
# Export CSV + LaTeX for Overleaf

# CSV keeps full precision.
out_csv = "../results/subgroup_performance_summary.csv"
subgroup_perf_df.to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")

# For LaTeX: round the numeric columns on a copy, for readability.
rounded_cols = ["pos_rate", "accuracy", "f1", "roc_auc"]
df_tex = subgroup_perf_df.copy()
df_tex[rounded_cols] = df_tex[rounded_cols].round(3)

out_tex = "../results/subgroup_performance_summary.tex"
latex_options = dict(
    index=False,
    escape=True,
    float_format="%.3f",
    caption="Subgroup predictive performance on the clean test set (accuracy, F1, ROC-AUC).",
    label="tab:subgroup-performance",
)
df_tex.to_latex(out_tex, **latex_options)
print(f"Saved: {out_tex}")

Saved: ../results/subgroup_performance_summary.csv
Saved: ../results/subgroup_performance_summary.tex
