In [12]:
import pandas as pd
import numpy as np
import polars as pl
import random
import joblib
import pathlib
import shap
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
)

# Display options: passing None lifts pandas' row/column truncation so printed
# frames are shown in full
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Seed Python's and NumPy's global RNGs with the same fixed value
random.seed(42)
np.random.seed(42)

## Step 1 — Load artifacts

In [13]:
# Load path and get all the required files
def get_paths(base="fairness_artifacts"):
    """
    Map each artifact name to its location under the artifact directory.

    Parameters
    ----------
    base : str or path-like
        Directory that holds the exported artifacts.

    Returns
    -------
    dict
        Keys "X_train", "X_test", "y_train", "y_test", "df_meta", "ids_test"
        and "model", each mapped to a ``pathlib.Path`` inside ``base``.
    """
    root = pathlib.Path(base)
    file_names = (
        ("X_train", "X_train.parquet"),
        ("X_test", "X_test.parquet"),
        ("y_train", "y_train.parquet"),
        ("y_test", "y_test.parquet"),
        ("df_meta", "df_meta.parquet"),
        ("ids_test", "ids_test.parquet"),
        ("model", "final_lightgbm_model.pkl"),
    )
    return {key: root / file_name for key, file_name in file_names}


def load_artifacts(base="fairness_artifacts"):
    """
    Read every artifact listed by ``get_paths(base)`` from disk.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, df_meta, ids_test, model)``.
        The two target objects are the "hospitalized" column of their parquet
        files and ``ids_test`` is the "person_id" column of its file.
    """
    locations = get_paths(base)

    def read_frame(key):
        # Every tabular artifact is stored as parquet
        return pd.read_parquet(locations[key])

    features_train = read_frame("X_train")
    features_test = read_frame("X_test")
    target_train = read_frame("y_train")["hospitalized"]
    target_test = read_frame("y_test")["hospitalized"]
    demographics = read_frame("df_meta")
    test_ids = read_frame("ids_test")["person_id"]

    # SECURITY NOTE: joblib.load unpickles the file, which can execute
    # arbitrary code; only load model files from a trusted source.
    fitted_model = joblib.load(locations["model"])

    return (
        features_train,
        features_test,
        target_train,
        target_test,
        demographics,
        test_ids,
        fitted_model,
    )

In [14]:
# Unpack the artifacts in the order load_artifacts returns them; the fitted
# model is bound to final_lgb
X_train, X_test, y_train, y_test, df_meta, ids_test, final_lgb = load_artifacts()

In [15]:
# Report the dimensions of every loaded artifact, one per line
print(
    "Shape:",
    f"X Train: {X_train.shape}",
    f"X Test: {X_test.shape}",
    f"y Train: {y_train.shape}",
    f"y Test: {y_test.shape}",
    f"ids_test: {ids_test.shape}",
    f"df_meta: {df_meta.shape}",
    sep="\n",
)

Shape:
X Train: (15135, 65)
X Test: (3784, 65)
y Train: (15135,)
y Test: (3784,)
ids_test: (3784,)
df_meta: (18919, 10)


## Step 2 — Build base evaluation table

For the fairness playground, I need a clean table that captures what the model actually did on the test set. Thus, I will now build a base evaluation table on the test set that includes person_id, the true label, the predicted probability, and the final 0/1 prediction. This is the core model-output layer I’ll use when joining with demographics and computing group-wise fairness metrics.

In [16]:
def make_base_eval_table(X_test, y_test, ids_test, model, threshold=0.5):
    """
    Assemble one row per test record with the model's outputs.

    Parameters
    ----------
    X_test : feature matrix passed to ``model.predict_proba``.
    y_test, ids_test : objects exposing ``.values``; they supply the true
        labels and person identifiers, taken positionally.
    model : fitted classifier with ``predict_proba``; column 1 of its output
        is used as the positive-class score.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``person_id``, ``y_true``, ``y_pred_proba``, ``y_pred``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    hard_labels = (positive_scores >= threshold).astype(int)

    identity_and_truth = pd.DataFrame(
        {"person_id": ids_test.values, "y_true": y_test.values}
    )
    return identity_and_truth.assign(
        y_pred_proba=positive_scores,
        y_pred=hard_labels,
    )

In [17]:
# Score the test set (threshold left at the function default of 0.5) and
# preview the first rows
df_eval = make_base_eval_table(X_test, y_test, ids_test, final_lgb)
display(df_eval.head())

Unnamed: 0,person_id,y_true,y_pred_proba,y_pred
0,2795536102,0,0.000125,0
1,2796668104,0,0.000609,0
2,2815945102,0,0.000196,0
3,2813232102,0,0.002306,0
4,2810968102,0,0.000353,0


## Step 3 — Merge with demographics

In [18]:
def merge_eval_and_meta(df_eval, df_meta):
    """
    Left-join the demographic columns onto the evaluation table.

    Every row of ``df_eval`` is kept; rows without a match in ``df_meta`` get
    missing values in the demographic columns.

    Raises
    ------
    pandas.errors.MergeError
        If ``person_id`` is not unique in ``df_meta``. Without this check a
        duplicated id would silently multiply evaluation rows and distort
        every group metric computed afterwards.
    """
    df_merged = df_eval.merge(
        df_meta,
        on="person_id",
        how="left",
        validate="many_to_one",  # df_meta must hold at most one row per person
    )
    return df_merged

In [19]:
# merge model outputs with demographics (joined on person_id; all evaluation
# rows are kept)
df_fair = merge_eval_and_meta(df_eval, df_meta)
print(df_fair.head())

    person_id  y_true  y_pred_proba  y_pred  age  sex  race_ethnicity  \
0  2795536102       0      0.000125       0   44    1               2   
1  2796668104       0      0.000609       0    4    1               1   
2  2815945102       0      0.000196       0   60    2               1   
3  2813232102       0      0.002306       0   59    1               2   
4  2810968102       0      0.000353       0   30    1               2   

   hispanic  poverty_category  insurance_coverage  family_income  \
0         2                 4                   1         142202   
1         1                 1                   2              0   
2         1                 4                   3          64010   
3         2                 5                   1         335489   
4         2                 4                   3          47840   

   self_rated_health  self_rated_mental_health  
0                2.0                       2.0  
1                4.0                       3.0  
2    

## Step 4 — Recode demographic columns with human-readable labels

The demographic variables in MEPS are all encoded as numbers, so before I can compute fairness metrics or show any results in the playground, I need them in human-readable form. This step converts all the MEPS-coded fields — sex, race/ethnicity, Hispanic status, poverty category, insurance coverage, and self-rated health — into clear labels. This makes the fairness results interpretable and avoids exposing raw codes in the playground.

In [20]:
# Recode sex
def recode_sex(df):
    """
    Return a copy of ``df`` whose ``sex`` codes are replaced by labels.

    Code 1 becomes "Male" and 2 becomes "Female"; any other value becomes NaN.
    The input frame is not modified, so calling this on the same frame twice
    is safe.
    """
    sex_map = {1: "Male", 2: "Female"}
    # assign builds a new frame instead of writing into the caller's object
    return df.assign(sex=df["sex"].map(sex_map))


# Recode Race/Ethnicity (RACETHX)
def recode_race_ethnicity(df):
    """
    Return a copy of ``df`` whose ``race_ethnicity`` codes are replaced by labels.

    Codes 1-5 map to "Hispanic", "White", "Black", "Asian" and
    "Other OR Multiple"; any other value becomes NaN. The input frame is not
    modified.
    """
    race_map = {
        1: "Hispanic",
        2: "White",
        3: "Black",
        4: "Asian",
        5: "Other OR Multiple",
    }
    # assign builds a new frame instead of writing into the caller's object
    return df.assign(race_ethnicity=df["race_ethnicity"].map(race_map))


# Recode Hispanic Flag (HISPANX)
def recode_hispanic(df):
    """
    Return a copy of ``df`` whose ``hispanic`` flag is replaced by labels.

    Code 1 becomes "Hispanic" and 2 becomes "Not Hispanic"; any other value
    becomes NaN. The input frame is not modified.
    """
    hisp_map = {1: "Hispanic", 2: "Not Hispanic"}
    # assign builds a new frame instead of writing into the caller's object
    return df.assign(hispanic=df["hispanic"].map(hisp_map))


# Poverty Category (POVCAT23)
def recode_poverty(df):
    """
    Return a copy of ``df`` whose ``poverty_category`` codes are replaced by labels.

    Codes 1-5 are translated with the table below; any other value becomes
    NaN. The input frame is not modified.
    """
    # NOTE(review): 5 -> "Unclassifiable" should be checked against the survey
    # codebook; this function only applies the table as written.
    pov_map = {
        1: "Poor OR negative",
        2: "Low income",
        3: "Middle income",
        4: "High income",
        5: "Unclassifiable",
    }
    # assign builds a new frame instead of writing into the caller's object
    return df.assign(poverty_category=df["poverty_category"].map(pov_map))


# Insurance Coverage (INSCOV23)
def recode_insurance(df):
    """
    Return a copy of ``df`` whose ``insurance_coverage`` codes are replaced by labels.

    Codes 1, 2, 3 become "Any private", "Public only" and "Uninsured"; any
    other value becomes NaN. The input frame is not modified.
    """
    ins_map = {1: "Any private", 2: "Public only", 3: "Uninsured"}
    # assign builds a new frame instead of writing into the caller's object
    return df.assign(insurance_coverage=df["insurance_coverage"].map(ins_map))


# Recode Self-Rated Health (RTHLTH53)
def recode_self_rated_health(df):
    """
    Return a copy of ``df`` whose ``self_rated_health`` scores are replaced by labels.

    Values are rounded and cast to a nullable integer before the lookup, so
    float-typed scores such as 2.0 are matched. Scores 1-5 map to "Excellent",
    "Very good", "Good", "Fair" and "Poor"; missing or other values become
    NaN. The input frame is not modified.
    """
    health_map = {
        1: "Excellent",
        2: "Very good",
        3: "Good",
        4: "Fair",
        5: "Poor",
    }
    labels = df["self_rated_health"].round().astype("Int64").map(health_map)
    # assign builds a new frame instead of writing into the caller's object
    return df.assign(self_rated_health=labels)


# Recode Self-Rated Mental Health (MNHLTH53)
def recode_self_rated_mental(df):
    """
    Return a copy of ``df`` whose ``self_rated_mental_health`` scores are replaced by labels.

    Values are rounded and cast to a nullable integer before the lookup, so
    float-typed scores such as 2.0 are matched. Scores 1-5 map to "Excellent",
    "Very good", "Good", "Fair" and "Poor"; missing or other values become
    NaN. The input frame is not modified.
    """
    mental_map = {
        1: "Excellent",
        2: "Very good",
        3: "Good",
        4: "Fair",
        5: "Poor",
    }
    labels = df["self_rated_mental_health"].round().astype("Int64").map(mental_map)
    # assign builds a new frame instead of writing into the caller's object
    return df.assign(self_rated_mental_health=labels)


# Apply all recodings
def apply_all_recodings(df):
    """
    Run every demographic recoder over ``df`` in a fixed order.

    The recoders are applied one after another, each receiving the result of
    the previous one; the frame produced by the last recoder is returned.
    """
    recoders = (
        recode_sex,
        recode_race_ethnicity,
        recode_hispanic,
        recode_poverty,
        recode_insurance,
        recode_self_rated_health,
        recode_self_rated_mental,
    )
    recoded = df
    for recode in recoders:
        recoded = recoded.pipe(recode)
    return recoded

In [21]:
# NOTE: this rebinds df_fair to the recoded frame, so the cell is not
# idempotent: running it a second time looks the label strings up in the
# numeric code tables and turns every recoded column into NaN. Re-run the
# merge cell first if this cell has to be executed again.
df_fair = apply_all_recodings(df_fair)
print(df_fair.head())

    person_id  y_true  y_pred_proba  y_pred  age     sex race_ethnicity  \
0  2795536102       0      0.000125       0   44    Male          White   
1  2796668104       0      0.000609       0    4    Male       Hispanic   
2  2815945102       0      0.000196       0   60  Female       Hispanic   
3  2813232102       0      0.002306       0   59    Male          White   
4  2810968102       0      0.000353       0   30    Male          White   

       hispanic  poverty_category insurance_coverage  family_income  \
0  Not Hispanic       High income        Any private         142202   
1      Hispanic  Poor OR negative        Public only              0   
2      Hispanic       High income          Uninsured          64010   
3  Not Hispanic    Unclassifiable        Any private         335489   
4  Not Hispanic       High income          Uninsured          47840   

  self_rated_health self_rated_mental_health  
0         Very good                Very good  
1              Fair         

## Step 5 — Group-level fairness metrics

Now that the fairness dataset is fully assembled, the next step is to compute standard group-level performance metrics. This helps quantify how the model behaves for different demographic groups. For example, I can compare recall, false negative rate, or positive prediction rate across race, sex, or income levels. These metrics form the backbone of the fairness playground and make it easy to see where the model treats groups differently.

In [22]:
def compute_group_metrics(df, group_col):
    """
    Compute standard model performance metrics for each demographic group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``y_true``, ``y_pred``, ``y_pred_proba`` and ``group_col``.
    group_col : str
        Column whose distinct values define the groups. Rows with a missing
        value in this column are dropped by ``groupby``.

    Returns
    -------
    pandas.DataFrame
        One row per group with accuracy, precision, recall, false negative
        rate, false positive rate, positive prediction rate, mean predicted
        probability and the group size (``count``).

    Notes
    -----
    Precision and recall are reported as 0 when undefined
    (``zero_division=0``). A group without any positive case therefore gets
    recall 0 and a false negative rate of 1.0, and a group without negatives
    gets a false positive rate of 0; read these together with ``count``.
    """
    results = []

    # observed=True: when group_col is categorical (e.g. bins from pd.cut),
    # skip categories that have no rows instead of scoring empty groups.
    for group, g in df.groupby(group_col, observed=True):
        y_true = g["y_true"]
        y_pred = g["y_pred"]

        # Core metrics
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred, zero_division=0)
        rec = recall_score(y_true, y_pred, zero_division=0)

        # Derived metrics
        fnr = 1 - rec
        # max(..., 1) keeps the denominator non-zero for groups without negatives
        fpr = ((y_pred.eq(1) & y_true.eq(0)).sum()) / max((y_true == 0).sum(), 1)
        pos_rate = y_pred.mean()
        avg_proba = g["y_pred_proba"].mean()

        results.append(
            {
                group_col: group,
                "accuracy": acc,
                "precision": prec,
                "recall": rec,
                "false_negative_rate": fnr,
                "false_positive_rate": fpr,
                "positive_prediction_rate": pos_rate,
                "avg_pred_probability": avg_proba,
                "count": len(g),
            }
        )

    return pd.DataFrame(results)

In [23]:
# Compute metrics for each demographic group
# (each call returns one row per distinct value of the named column)
race_metrics = compute_group_metrics(df_fair, "race_ethnicity")
sex_metrics = compute_group_metrics(df_fair, "sex")
poverty_metrics = compute_group_metrics(df_fair, "poverty_category")
insurance_metrics = compute_group_metrics(df_fair, "insurance_coverage")

## Step 6 — Disparity gap metrics

Group metrics tell me how the model performs within each demographic group, but they don’t directly show how far apart the groups are. In this step, I compute simple disparity metrics by picking a reference group and measuring gaps in key quantities like positive prediction rate, recall, and false negative rate. These gaps are what the fairness playground will display when highlighting where the model is more or less sensitive for different groups.

In [None]:
def compute_disparities(group_df, group_col, reference_group=None):
    """
    Compute gaps in metrics versus a reference group.

    Parameters
    ----------
    group_df : pandas.DataFrame
        Per-group metrics table; must contain ``group_col``, ``count`` and the
        five metric columns listed in ``gap_metrics`` below.
    group_col : str
        Column holding the group labels.
    reference_group : optional
        Label of the group to compare against. If None, the group with the
        largest ``count`` is used (the first one when counts tie).

    Returns
    -------
    pandas.DataFrame
        A copy of ``group_df`` with one ``<metric>_gap_vs_ref`` column per
        metric (group value minus reference value) and a ``reference_group``
        column. ``group_df`` itself is not modified.

    Raises
    ------
    ValueError
        If ``group_df`` is empty or ``reference_group`` does not occur in
        ``group_col``.
    """
    df = group_df.copy()

    if df.empty:
        raise ValueError("group_df is empty; there are no groups to compare")

    # Choose reference: positional argmax works for any index and returns the
    # first row on ties
    if reference_group is None:
        reference_group = df[group_col].iloc[df["count"].to_numpy().argmax()]

    ref_rows = df[df[group_col] == reference_group]
    if ref_rows.empty:
        raise ValueError(
            f"reference_group {reference_group!r} not found in column {group_col!r}"
        )
    ref_row = ref_rows.iloc[0]

    # Metrics we want gaps for
    gap_metrics = [
        "positive_prediction_rate",
        "recall",
        "precision",
        "false_negative_rate",
        "false_positive_rate",
    ]

    for m in gap_metrics:
        gap_col = f"{m}_gap_vs_ref"
        df[gap_col] = df[m] - ref_row[m]

    df["reference_group"] = reference_group
    return df

In [None]:
# Compute disparities (gaps) for each demographic
# (no reference_group is passed, so each table is compared against its
# largest group by count)
race_disparities = compute_disparities(race_metrics, "race_ethnicity")
sex_disparities = compute_disparities(sex_metrics, "sex")
poverty_disparities = compute_disparities(poverty_metrics, "poverty_category")
insurance_disparities = compute_disparities(insurance_metrics, "insurance_coverage")

In [None]:
def compute_global_shap(model, X_test):
    """
    Compute SHAP values for the LightGBM model on the test set.

    Parameters
    ----------
    model : fitted tree model accepted by ``shap.TreeExplainer``.
    X_test : pandas.DataFrame
        Rows to explain.

    Returns
    -------
    tuple
        ``(explainer, shap_values)`` where ``shap_values`` has shape
        ``(len(X_test), number of columns in X_test)`` and refers to the
        positive class (index 1) when the explainer reports one set of values
        per class.

    Raises
    ------
    ValueError
        If the selected SHAP values do not have the expected 2-D shape.
    """
    explainer = shap.TreeExplainer(model)

    shap_values_raw = explainer.shap_values(X_test)

    # The output layout depends on the installed SHAP version: a list with one
    # array per class, a single 3-D array (samples, features, classes), or a
    # plain 2-D array. Reduce the per-class layouts to class 1 (hospitalized).
    if isinstance(shap_values_raw, list):
        shap_values = shap_values_raw[1]  # Positive class
    elif getattr(shap_values_raw, "ndim", 2) == 3:
        shap_values = shap_values_raw[:, :, 1]  # Positive class
    else:
        shap_values = shap_values_raw

    # Verify shape
    expected_shape = (len(X_test), len(X_test.columns))
    actual_shape = shap_values.shape

    print(f"Expected SHAP shape: {expected_shape}")
    print(f"Actual SHAP shape: {actual_shape}")

    if actual_shape != expected_shape:
        raise ValueError(
            f"SHAP values shape mismatch! Expected {expected_shape}, got {actual_shape}"
        )

    return explainer, shap_values


# ============================================================================
# CELL 16: Compute SHAP Values
# ============================================================================

# Compute SHAP values
# (the explainer is kept because it is reused for the local explanation and
# written to disk in the save step)
explainer, shap_values_test = compute_global_shap(final_lgb, X_test)
print("✓ SHAP computation successful")


# ============================================================================
# CELL 17: Global SHAP Importance Function
# ============================================================================


def global_shap_importance(shap_values, feature_names, top_n=15):
    """
    Rank features by their mean absolute SHAP value.

    Parameters
    ----------
    shap_values : 2-D array, one row per sample and one column per feature.
    feature_names : sequence of names, in the column order of ``shap_values``.
    top_n : int, default 15
        Number of highest-ranked features to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``mean_abs_shap``, sorted in descending order
        of importance with a fresh 0..n-1 index.
    """
    importance = pd.DataFrame(
        {
            "feature": feature_names,
            "mean_abs_shap": np.mean(np.abs(shap_values), axis=0),
        }
    )
    ranked = importance.sort_values("mean_abs_shap", ascending=False)
    return ranked.iloc[:top_n].reset_index(drop=True)


# ============================================================================
# CELL 18: Compute Global Importance
# ============================================================================

# Compute and display global importance
# (top_n=15 matches the "TOP 15" banner printed below)
global_importance = global_shap_importance(shap_values_test, X_test.columns, top_n=15)

print("\n" + "=" * 70)
print("TOP 15 MOST IMPORTANT FEATURES (GLOBAL)")
print("=" * 70)
print(global_importance.to_string(index=False))


# ============================================================================
# CELL 19: Attach SHAP to DataFrame Function
# ============================================================================


def attach_shap_to_df(df_fair, shap_values, feature_names):
    """
    Attach SHAP values to the fairness dataframe.

    Row ``i`` of ``shap_values`` is attached to row ``i`` of ``df_fair``
    (by position). Joining on ``person_id`` instead would multiply rows
    whenever an id occurs more than once, so no key-based merge is used.

    Parameters
    ----------
    df_fair : pandas.DataFrame
        Evaluation table, in the same row order as ``shap_values``.
    shap_values : 2-D array of shape ``(len(df_fair), len(feature_names))``.
    feature_names : sequence of feature names, in SHAP column order.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df_fair`` followed by one ``shap_<feature>`` column
        per feature, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the number of rows or the number of features does not match.
    """
    # Verify shapes match
    if len(df_fair) != shap_values.shape[0]:
        raise ValueError(
            f"Length mismatch: df_fair has {len(df_fair)} rows, "
            f"but shap_values has {shap_values.shape[0]} rows"
        )

    if len(feature_names) != shap_values.shape[1]:
        raise ValueError(
            f"Feature mismatch: {len(feature_names)} feature names provided, "
            f"but shap_values has {shap_values.shape[1]} columns"
        )

    # Create SHAP dataframe with proper shape
    shap_df = pd.DataFrame(
        shap_values, columns=[f"shap_{col}" for col in feature_names]
    )

    # Both frames get a 0..n-1 index so the column-wise concat pairs rows
    # strictly by position
    df_with_shap = pd.concat(
        [df_fair.reset_index(drop=True), shap_df.reset_index(drop=True)], axis=1
    )

    print(f"Successfully attached {shap_values.shape[1]} SHAP features")
    print(f"Final shape: {df_with_shap.shape}")

    return df_with_shap


# ============================================================================
# CELL 20: Attach SHAP Values
# ============================================================================

# Attach SHAP values to fairness dataframe
df_fair_shap = attach_shap_to_df(df_fair, shap_values_test, X_test.columns)
print("✓ SHAP values attached to dataframe")

# Show sample (only the first three SHAP columns, to keep the preview narrow)
print("\nSample of data with SHAP values:")
shap_cols = [col for col in df_fair_shap.columns if col.startswith("shap_")]
print(df_fair_shap[["person_id", "y_pred_proba"] + shap_cols[:3]].head())


# ============================================================================
# CELL 21: Group SHAP Importance Function
# ============================================================================


def group_shap_importance(df_with_shap, group_col, feature_names, top_n=10):
    """
    Compute SHAP-based feature importance for each demographic group.

    Parameters
    ----------
    df_with_shap : pandas.DataFrame
        Must contain ``group_col`` and one ``shap_<feature>`` column for every
        entry of ``feature_names``.
    group_col : str
        Column whose distinct values define the groups.
    feature_names : sequence of feature names (without the ``shap_`` prefix).
    top_n : int, default 10
        Number of features kept per group.

    Returns
    -------
    dict
        Maps each group label to a Series of mean absolute SHAP values,
        indexed by the original feature name and sorted in descending order.

    Raises
    ------
    ValueError
        If any expected ``shap_<feature>`` column is missing.
    """
    feature_names = list(feature_names)

    # Get SHAP column names
    shap_cols = [f"shap_{col}" for col in feature_names]

    # Verify all SHAP columns exist
    missing = [col for col in shap_cols if col not in df_with_shap.columns]
    if missing:
        raise ValueError(f"Missing SHAP columns: {missing[:5]}...")

    results = {}

    for group, g in df_with_shap.groupby(group_col):
        # Calculate mean absolute SHAP for this group
        avg_abs = g[shap_cols].abs().mean()

        # Relabel with the original names. shap_cols was built from
        # feature_names in the same order, so this is exact even when a
        # feature name itself contains "shap_" (a str.replace would strip it).
        avg_abs.index = feature_names

        # Sort and get top N
        results[group] = avg_abs.sort_values(ascending=False).head(top_n)

    return results


# ============================================================================
# CELL 22: Compute Group SHAP Importance
# ============================================================================

# Compute group-level SHAP importance
# (top_n is left at the function default of 10 features per group)
race_shap = group_shap_importance(df_fair_shap, "race_ethnicity", X_test.columns)
sex_shap = group_shap_importance(df_fair_shap, "sex", X_test.columns)
poverty_shap = group_shap_importance(df_fair_shap, "poverty_category", X_test.columns)

print("\n" + "=" * 70)
print("SHAP FEATURE IMPORTANCE BY RACE/ETHNICITY")
print("=" * 70)
for group, importance in race_shap.items():
    print(f"\n{group}:")
    print(importance.to_string())

print("\n" + "=" * 70)
print("SHAP FEATURE IMPORTANCE BY SEX")
print("=" * 70)
for group, importance in sex_shap.items():
    print(f"\n{group}:")
    print(importance.to_string())


# ============================================================================
# CELL 23: Compare Group Feature Importance Function
# ============================================================================


def compare_group_feature_importance(group_shap_dict, top_k=5):
    """
    Compare which features are most important for each group.
    Helps identify if model relies on different features for different groups.

    Parameters
    ----------
    group_shap_dict : dict
        Maps a group label to a Series of importances indexed by feature name
        and already sorted in descending order.
    top_k : int, default 5
        Number of leading features considered per group.

    Returns
    -------
    list of (feature, groups) tuples
        Features that are in the top ``top_k`` of some groups but not all,
        each with the list of groups where it does appear. The order is
        deterministic: features are listed in the order they are first seen
        while walking the groups and their rankings.
    """
    print("\nTop Features by Group (for fairness analysis):")
    print("=" * 70)

    group_top_features = {}
    # Dict keys keep first-seen order; iterating a set here would make the
    # order of the printed and returned features vary between runs.
    all_top_features = {}

    for group, importance in group_shap_dict.items():
        top_features = importance.head(top_k).index.tolist()
        print(f"{group:25} -> {', '.join(top_features)}")
        group_top_features[group] = set(top_features)
        all_top_features.update(dict.fromkeys(top_features))

    print(
        f"\nTotal unique features in any group's top {top_k}: {len(all_top_features)}"
    )

    # Features that don't appear in all groups' top K
    divergent_features = []
    for feature in all_top_features:
        groups_with_feature = [
            g for g, tops in group_top_features.items() if feature in tops
        ]
        if len(groups_with_feature) < len(group_shap_dict):
            divergent_features.append((feature, groups_with_feature))

    if divergent_features:
        print("\nFeatures with divergent importance across groups:")
        for feature, groups in divergent_features:
            print(f"  - {feature}: important for {groups}")
    else:
        print("\nAll top features are consistent across groups")

    return divergent_features


# ============================================================================
# CELL 24: Compare Feature Importance
# ============================================================================

# Compare feature importance across groups
# (each call prints its own report and returns the features that are not in
# every group's top 5)
print("\n" + "=" * 70)
print("FEATURE IMPORTANCE COMPARISON BY RACE/ETHNICITY")
print("=" * 70)
race_divergent = compare_group_feature_importance(race_shap, top_k=5)

print("\n" + "=" * 70)
print("FEATURE IMPORTANCE COMPARISON BY SEX")
print("=" * 70)
sex_divergent = compare_group_feature_importance(sex_shap, top_k=5)


# ============================================================================
# CELL 25: Local Explanation Function
# ============================================================================


def local_explanation(explainer, model, X_row, top_k=5):
    """
    Generate a local SHAP explanation for a single individual.
    Shows which features most increased/decreased their predicted risk.

    Parameters
    ----------
    explainer : object exposing ``shap_values(x)`` and ``expected_value``.
    model : classifier exposing ``predict_proba``.
    X_row : pandas.Series
        One observation, indexed by feature name.
    top_k : int, default 5
        Maximum number of features returned in each direction.

    Returns
    -------
    dict
        ``prediction_proba`` (positive-class probability), ``base_value``
        (the explainer's expected value for the positive class),
        ``top_positive`` and ``top_negative`` (lists of ``(feature, shap)``
        pairs, strongest first).

    Notes
    -----
    The SHAP output may be a list with one array per class, a 3-D array
    (samples, features, classes) or a plain 2-D array, and ``expected_value``
    may be a scalar or hold one entry per class; all of these are accepted and
    class index 1 is used whenever there is one entry per class.
    """
    x = X_row.values.reshape(1, -1)

    # Get SHAP values
    shap_vals_raw = explainer.shap_values(x)

    if isinstance(shap_vals_raw, list):
        shap_vals = np.asarray(shap_vals_raw[1])[0]  # Class 1, first sample
    else:
        shap_arr = np.asarray(shap_vals_raw)
        if shap_arr.ndim == 3:
            shap_vals = shap_arr[0, :, 1]  # First sample, class 1
        else:
            shap_vals = shap_arr[0]

    # expected_value is a scalar for single-output explainers and a sequence
    # with one entry per class otherwise
    expected = np.atleast_1d(explainer.expected_value)
    base = expected[1] if expected.size > 1 else expected[0]

    # Prediction
    proba = model.predict_proba(x)[0, 1]

    # Get top contributing features
    contrib = pd.Series(shap_vals, index=X_row.index)
    pos = contrib[contrib > 0].sort_values(ascending=False).head(top_k)
    neg = contrib[contrib < 0].sort_values(ascending=True).head(top_k)

    return {
        "prediction_proba": float(proba),
        "base_value": float(base),
        "top_positive": list(pos.items()),
        "top_negative": list(neg.items()),
    }


# ============================================================================
# CELL 26: Generate Local Explanation Example
# ============================================================================

# Example: Get local explanation for first person in test set
idx = X_test.index[0]
row = X_test.loc[idx]
local_exp = local_explanation(explainer, final_lgb, row)

print("\n" + "=" * 70)
print("LOCAL EXPLANATION FOR INDIVIDUAL")
print("=" * 70)
# ids_test is indexed by position here; this assumes it is in the same row
# order as X_test — TODO confirm
print(f"Person ID: {ids_test.iloc[0]}")
print(f"Predicted probability: {local_exp['prediction_proba']:.3f}")
# NOTE(review): the base value is taken from explainer.expected_value, which is
# presumably on the model's raw-output scale rather than a probability —
# confirm before comparing it with the predicted probability above
print(f"Base value (avg): {local_exp['base_value']:.3f}")
print("\nTop features INCREASING risk:")
for feat, val in local_exp["top_positive"]:
    print(f"  {feat:30} +{val:.4f}")
print("\nTop features DECREASING risk:")
for feat, val in local_exp["top_negative"]:
    print(f"  {feat:30} {val:.4f}")


# ============================================================================
# CELL 27: Summary Function
# ============================================================================


def summarize_fairness_findings(race_metrics, sex_metrics, df_fair):
    """
    Print a four-part text report of the headline fairness numbers.

    Sections: overall test-set statistics from ``df_fair``; spread of recall
    and false negative rate across the rows of ``race_metrics``; recall of the
    "Male" and "Female" rows of ``sex_metrics``; and the size of every
    race/ethnicity group. Nothing is returned.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("FAIRNESS ANALYSIS SUMMARY")
    print(rule)

    # Section 1: whole test set
    print("\n1. OVERALL MODEL PERFORMANCE:")
    print(f"   - Test set size: {len(df_fair)}")
    labels = df_fair["y_true"]
    print(f"   - Positive cases: {labels.sum()} ({labels.mean():.1%})")
    print(f"   - Mean predicted probability: {df_fair['y_pred_proba'].mean():.3f}")
    print(f"   - Positive prediction rate: {df_fair['y_pred'].mean():.3f}")

    # Section 2: spread across race/ethnicity groups
    print("\n2. RACE/ETHNICITY DISPARITIES:")
    recall_by_race = race_metrics["recall"]
    recall_spread = recall_by_race.max() - recall_by_race.min()
    fnr_by_race = race_metrics["false_negative_rate"]
    fnr_spread = fnr_by_race.max() - fnr_by_race.min()
    print(f"   - Recall range: {recall_spread:.3f} (max-min across groups)")
    print(f"   - False negative rate range: {fnr_spread:.3f}")
    best_group = race_metrics.loc[recall_by_race.idxmax(), "race_ethnicity"]
    print(f"   - Highest recall: {best_group} ({recall_by_race.max():.3f})")
    worst_group = race_metrics.loc[recall_by_race.idxmin(), "race_ethnicity"]
    print(f"   - Lowest recall: {worst_group} ({recall_by_race.min():.3f})")

    # Section 3: male versus female recall
    print("\n3. SEX DISPARITIES:")

    def recall_for(label):
        # First recall value among the rows whose sex equals `label`
        return sex_metrics.loc[sex_metrics["sex"] == label, "recall"].values[0]

    male_recall = recall_for("Male")
    female_recall = recall_for("Female")
    recall_gap = male_recall - female_recall
    print(f"   - Male recall: {male_recall:.3f}")
    print(f"   - Female recall: {female_recall:.3f}")
    print(f"   - Difference: {recall_gap:.3f}")

    # Section 4: sample counts, so small groups can be read with caution
    print("\n4. GROUP SIZES:")
    print("   Race/Ethnicity:")
    for _, record in race_metrics.iterrows():
        print(f"     - {record['race_ethnicity']:20} {record['count']:5} samples")


# ============================================================================
# CELL 28: Generate Summary
# ============================================================================

# Generate summary (printed to the cell output; the function returns None)
summarize_fairness_findings(race_metrics, sex_metrics, df_fair)


# ============================================================================
# CELL 29: Save Results
# ============================================================================

# Make sure the results folder exists (an existing folder is reused)
output_dir = pathlib.Path("fairness_results")
output_dir.mkdir(exist_ok=True)

# Write every result table as parquet without its index: group metrics first,
# then the disparity tables, the global SHAP ranking, and finally the
# row-level frame with SHAP columns used by the interactive playground
for file_name, frame in (
    ("race_metrics.parquet", race_metrics),
    ("sex_metrics.parquet", sex_metrics),
    ("poverty_metrics.parquet", poverty_metrics),
    ("insurance_metrics.parquet", insurance_metrics),
    ("race_disparities.parquet", race_disparities),
    ("sex_disparities.parquet", sex_disparities),
    ("poverty_disparities.parquet", poverty_disparities),
    ("insurance_disparities.parquet", insurance_disparities),
    ("global_shap_importance.parquet", global_importance),
    ("df_fair_with_shap.parquet", df_fair_shap),
):
    frame.to_parquet(output_dir / file_name, index=False)

# Persist the explainer so explanations can be produced later without refitting
joblib.dump(explainer, output_dir / "shap_explainer.pkl")

# Report what is in the folder now (this includes files left by earlier runs)
banner = "=" * 70
print("\n" + banner)
print("RESULTS SAVED")
print(banner)
print(f"Output directory: {output_dir}")
print(f"Files saved: {len(list(output_dir.glob('*')))}")
print("\nSaved files:")
for f in sorted(output_dir.glob("*")):
    print(f"  - {f.name}")


# ============================================================================
# CELL 30: Age Group Analysis
# ============================================================================

# Create age groups for additional analysis.
# pd.cut builds right-closed bins, i.e. (0, 18], (18, 35], ...; without
# include_lowest=True an age of exactly 0 falls outside the first bin, gets a
# missing age_group and is silently dropped from the metrics below.
df_fair_shap["age_group"] = pd.cut(
    df_fair_shap["age"],
    bins=[0, 18, 35, 50, 65, 100],
    labels=["0-18", "19-35", "36-50", "51-65", "65+"],
    include_lowest=True,
)

# Compute metrics by age group
age_metrics = compute_group_metrics(df_fair_shap, "age_group")

print("\n" + "=" * 70)
print("FAIRNESS METRICS BY AGE GROUP")
print("=" * 70)
print(age_metrics.to_string(index=False))

# Save age metrics
age_metrics.to_parquet(output_dir / "age_metrics.parquet", index=False)


# ============================================================================
# CELL 31: Final Summary
# ============================================================================

# Closing banner, checklist of what the notebook produced, and follow-up ideas;
# emitted with one print call, one entry per output line
print(
    "\n" + "=" * 70,
    "NOTEBOOK COMPLETE",
    "=" * 70,
    "\nThis notebook has:",
    "1. ✓ Loaded model artifacts and test data",
    "2. ✓ Merged predictions with demographics",
    "3. ✓ Computed fairness metrics across multiple demographic groups",
    "4. ✓ Calculated disparity gaps versus reference groups",
    "5. ✓ Generated global SHAP feature importance",
    "6. ✓ Computed group-specific SHAP importance",
    "7. ✓ Identified divergent features across groups",
    "8. ✓ Created local explanation framework",
    "9. ✓ Saved all results for the fairness playground",
    "\nNext steps:",
    "- Build interactive playground using saved results",
    "- Implement counterfactual explanations",
    "- Create visualizations for fairness metrics",
    "- Add threshold optimization analysis",
    sep="\n",
)

NameError: name 'df_fair' is not defined