In [1]:
# Adult Income — extract hidden-layer activations from the saved DNN
# Inputs :
#   data/adult_model.csv
#   models/adult/dnn_model.keras
# Outputs:
#   results/activations/adult_dnn_layer1.npy     (Dense(64) activations, X_test)
#   results/activations/adult_dnn_layer2.npy     (Dense(32) activations, X_test)
#   results/activations/adult_dnn_metadata.csv   (index mapping + sens attrs + y + y_prob + y_pred)
#   results/activations/summary.txt              (basic shapes and checks)
#   (optional) results/activations/adult_dnn_layer2_corr_sex.csv
#   (optional) results/activations/adult_dnn_layer2_corr_race.csv

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

import tensorflow as tf
from tensorflow import keras

# --------------------------------------------------------------------
# Project layout (the notebook is expected to be launched from notebooks/)
# --------------------------------------------------------------------
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().resolve().parent

# Inputs: the prepared dataset and the trained network.
data_dir   = project_root.joinpath("data")
models_dir = project_root.joinpath("models", "adult")
src_csv    = data_dir.joinpath("adult_model.csv")
model_path = models_dir.joinpath("dnn_model.keras")

# Outputs: everything this cell writes lands in results/activations/,
# which is created on demand (including missing parents).
results_dir = project_root.joinpath("results", "activations")
results_dir.mkdir(parents=True, exist_ok=True)

# --------------------------------------------------------------------
# Load data and define splits/features exactly as training
# --------------------------------------------------------------------
df = pd.read_csv(src_csv)

# Target as an int array; every remaining column is treated as a feature.
y = df["label"].astype(int).values
X = df.drop(columns=["label"]).copy()

# Keep sensitive attributes for later analysis. They are copied out of X,
# not removed from it, so they still feed the preprocessor below.
sensitive_cols = [c for c in ["sex", "race"] if c in X.columns]
sens_all = X[sensitive_cols].copy() if sensitive_cols else pd.DataFrame(index=X.index)

# Train/test split (same seed and stratify as before to reproduce).
# sens_all is split alongside X so its rows stay aligned with X_test.
X_train, X_test, y_train, y_test, sens_train, sens_test = train_test_split(
    X, y, sens_all, test_size=0.25, random_state=42, stratify=y
)

# Numeric vs categorical columns (anything non-numeric is one-hot encoded)
num_cols = [c for c in X_train.columns if np.issubdtype(X_train[c].dtype, np.number)]
cat_cols = [c for c in X_train.columns if c not in num_cols]

# OneHotEncoder API compatibility (sklearn >=1.2 uses sparse_output)
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

# NOTE(review): the preprocessor is re-fit on the training split here rather
# than loaded from disk. The activations only match the trained model's inputs
# if the training notebook used this same split and these same transformer
# settings — confirm against the training code.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), num_cols),
        ("cat", ohe, cat_cols),
    ],
    remainder="drop",
    sparse_threshold=1.0,
)

X_train_m = preprocess.fit_transform(X_train)
X_test_m  = preprocess.transform(X_test)

# Convert sparse to dense for Keras. ColumnTransformer only returns a sparse
# matrix when at least one transformer produced sparse output; with no
# categorical columns it already returns a dense ndarray, and an unconditional
# .toarray() would raise AttributeError.
if hasattr(X_train_m, "toarray"):
    X_train_m = X_train_m.toarray()
if hasattr(X_test_m, "toarray"):
    X_test_m = X_test_m.toarray()

# --------------------------------------------------------------------
# Restore the trained DNN and expose its hidden layers (with warm-up)
# --------------------------------------------------------------------
model = keras.models.load_model(model_path)

# One single-row prediction so the model defines its input/output tensors
_ = model.predict(X_train_m[:1], verbose=0)

# Collect every Dense layer in model order and insist on at least two
dense_layers = [layer for layer in model.layers if isinstance(layer, keras.layers.Dense)]
if not len(dense_layers) >= 2:
    found_names = [layer.name for layer in dense_layers]
    raise RuntimeError(f"Expected at least 2 Dense layers; found: {found_names}")

# First and second Dense layers; assumed to be the 64- and 32-unit hidden
# layers of the saved model (not checked here).
dense1, dense2 = dense_layers[0], dense_layers[1]

# A functional model sharing the trained weights that emits, in order:
# hidden layer 1, hidden layer 2, and the network's first output.
act_model = keras.Model(
    inputs=model.inputs,
    outputs=[
        dense1.output,
        dense2.output,
        model.outputs[0],
    ],
)

# --------------------------------------------------------------------
# Run the test set through the activation model
# --------------------------------------------------------------------
# Results come back in the order the outputs were declared on act_model:
# first hidden layer, second hidden layer, final model output.
layer1_act, layer2_act, y_prob = act_model.predict(X_test_m, verbose=0)

# Flatten the model output to 1-D, then threshold at 0.5 for hard labels
y_prob = np.ravel(y_prob)
y_pred = np.greater_equal(y_prob, 0.5).astype(int)

# --------------------------------------------------------------------
# Save activations and metadata
# --------------------------------------------------------------------
# Row i of each .npy array and row i of the metadata CSV describe the same
# test sample (all are in X_test order).
np.save(results_dir / "adult_dnn_layer1.npy", layer1_act)
np.save(results_dir / "adult_dnn_layer2.npy", layer2_act)

meta = sens_test.copy().reset_index(drop=True)
# Index mapping: keep each test row's original index label from the source
# DataFrame so an activation row can be traced back to its record.
# (reset_index(drop=True) alone would discard it.)
meta.insert(0, "orig_index", sens_test.index.to_numpy())
meta["y_true"] = y_test
meta["y_prob"] = y_prob
meta["y_pred"] = y_pred
meta.to_csv(results_dir / "adult_dnn_metadata.csv", index=False)

# Basic summary file: shapes plus headline test-set metrics
with open(results_dir / "summary.txt", "w", encoding="utf-8") as f:
    f.write("Adult DNN activations (test set)\n")
    f.write(f"layer1 shape: {layer1_act.shape}\n")
    f.write(f"layer2 shape: {layer2_act.shape}\n")
    f.write(f"metadata rows: {len(meta)}\n")
    f.write("\nPerformance on test (using saved DNN):\n")
    f.write(f"accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
    try:
        f.write(f"roc_auc: {roc_auc_score(y_test, y_prob):.4f}\n")
    except ValueError as exc:
        # roc_auc_score rejects inputs it cannot score (e.g. a single class
        # in y_test). Record why instead of silently dropping the line.
        f.write(f"roc_auc: n/a ({exc})\n")
    f.write(f"f1: {f1_score(y_test, y_pred):.4f}\n")

print("Saved:")
print(" ", results_dir / "adult_dnn_layer1.npy")
print(" ", results_dir / "adult_dnn_layer2.npy")
print(" ", results_dir / "adult_dnn_metadata.csv")
print(" ", results_dir / "summary.txt")

# --------------------------------------------------------------------
# Optional: quick association preview between hidden units and sensitive attrs
# --------------------------------------------------------------------
# Best-effort step: any failure (including SciPy being unavailable) is reported
# and skipped so the main outputs above are unaffected.
#
# NOTE(review): attributes are turned into integer category codes, whose order
# follows the sorted category values. For an attribute with more than two
# unordered categories, a rank correlation against those codes depends on that
# arbitrary ordering — treat the numbers as a rough preview only.
try:
    import scipy.stats as stats

    meta_num = meta.copy()
    for c in sensitive_cols:
        meta_num[c] = meta_num[c].astype("category").cat.codes

    n_units = layer2_act.shape[1]

    # sensitive_cols is the subset of ["sex", "race"] present in the data, so
    # one loop writes the same per-attribute files the header lists.
    for attr in sensitive_cols:
        codes = meta_num[attr]
        # cat.codes marks missing values with -1. Turn those into NaN so that
        # nan_policy="omit" actually drops them instead of ranking -1 as a
        # real category.
        attr_codes = codes.astype(float).where(codes >= 0).to_numpy()

        # One Spearman correlation per layer-2 unit. A unit whose activation is
        # constant over the test set has no defined rank correlation; SciPy
        # returns NaN for it (and may warn about constant input).
        corrs = [
            stats.spearmanr(layer2_act[:, j], attr_codes, nan_policy="omit").correlation
            for j in range(n_units)
        ]

        out_path = results_dir / f"adult_dnn_layer2_corr_{attr}.csv"
        pd.DataFrame(
            {"unit": np.arange(n_units), f"spearman_with_{attr}": corrs}
        ).to_csv(out_path, index=False)
        print(" ", out_path)

except Exception as e:
    print("Correlation preview skipped:", e)


Saved:
  C:\Users\hana1\Documents\iva-bias-project\results\activations\adult_dnn_layer1.npy
  C:\Users\hana1\Documents\iva-bias-project\results\activations\adult_dnn_layer2.npy
  C:\Users\hana1\Documents\iva-bias-project\results\activations\adult_dnn_metadata.csv
  C:\Users\hana1\Documents\iva-bias-project\results\activations\summary.txt
  C:\Users\hana1\Documents\iva-bias-project\results\activations\adult_dnn_layer2_corr_sex.csv
  C:\Users\hana1\Documents\iva-bias-project\results\activations\adult_dnn_layer2_corr_race.csv


  corrs = [stats.spearmanr(layer2_act[:, j], sex_codes, nan_policy="omit").correlation
  corrs = [stats.spearmanr(layer2_act[:, j], race_codes, nan_policy="omit").correlation


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# Directory holding the activation artefacts: <parent of cwd>/results/activations
results_dir = Path.cwd().resolve().parent.joinpath("results", "activations")

# Per-unit layer-2 correlation tables, one file per sensitive attribute
corr_sex_path  = results_dir.joinpath("adult_dnn_layer2_corr_sex.csv")
corr_race_path = results_dir.joinpath("adult_dnn_layer2_corr_race.csv")

def top_abs(df, col, k=10):
    """Return the k rows of ``df`` with the largest absolute value in ``col``.

    Infinite values anywhere in the frame are converted to NaN, and rows whose
    ``col`` is NaN are dropped before ranking. The result carries an extra
    ``abs_corr`` column and is sorted by it in descending order. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to rank on.
    col : str
        Name of the column whose magnitude drives the ranking.
    k : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``k`` rows, original index labels preserved.
    """
    finite_rows = df.copy().replace([np.inf, -np.inf], np.nan).dropna(subset=[col])
    with_magnitude = finite_rows.assign(abs_corr=finite_rows[col].abs())
    ranked = with_magnitude.sort_values("abs_corr", ascending=False)
    return ranked.head(k)

# Read both correlation tables up front so a missing file fails before anything is shown
corr_sex  = pd.read_csv(corr_sex_path)
corr_race = pd.read_csv(corr_race_path)

# For each attribute, show the ten layer-2 units with the largest |Spearman|
for heading, table, column in (
    ("Top units linked to SEX:", corr_sex, "spearman_with_sex"),
    ("\nTop units linked to RACE:", corr_race, "spearman_with_race"),
):
    print(heading)
    display(top_abs(table, column, k=10))


Top units linked to SEX:


Unnamed: 0,unit,spearman_with_sex,abs_corr
4,4,-0.49221,0.49221
20,20,-0.491613,0.491613
9,9,-0.456258,0.456258
1,1,-0.444235,0.444235
29,29,-0.438859,0.438859
2,2,-0.412508,0.412508
10,10,-0.391908,0.391908
30,30,-0.387147,0.387147
12,12,-0.37598,0.37598
13,13,-0.366654,0.366654



Top units linked to RACE:


Unnamed: 0,unit,spearman_with_race,abs_corr
4,4,-0.214673,0.214673
12,12,-0.197983,0.197983
20,20,-0.185326,0.185326
6,6,-0.18309,0.18309
10,10,-0.181577,0.181577
21,21,0.181281,0.181281
29,29,-0.172227,0.172227
1,1,-0.166187,0.166187
9,9,-0.155239,0.155239
30,30,-0.150143,0.150143
