## Setup
- Setup: paths, environment, imports

In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lime.lime_tabular import LimeTabularExplainer

# --- Paths + environment ---
import os, sys
from pathlib import Path

# Repo root: assumes the notebook runs from a first-level subfolder (e.g., 04_/05_/06_),
# so the parent of the current working directory is taken as the repository root.
repo_root = Path.cwd().resolve().parents[0]
src_path = repo_root / "src"
if str(src_path) not in sys.path:
    # Prepend so packages living under src/ are importable from the notebook.
    sys.path.insert(0, str(src_path))

# Optional: load .env locally (recommended).
# Loading stays best-effort, but a failure is now reported instead of being
# silently swallowed, which makes a missing DATA_PATH much easier to diagnose.
try:
    from dotenv import load_dotenv
    load_dotenv(repo_root / ".env")
except ImportError:
    print("NOTE: python-dotenv is not installed; skipping .env loading.")
except Exception as exc:
    print("WARNING: could not load .env file:", repr(exc))

# DATA_PATH must come from the environment (possibly populated by .env above).
DATA_PATH = os.environ.get("DATA_PATH", "")
if not DATA_PATH:
    raise RuntimeError(
        "DATA_PATH is not set. Create a .env file (not committed) with:\n"
        "DATA_PATH=/absolute/path/to/adult_reconstruction.csv"
    )

print("Repo root:", repo_root)
print("Using DATA_PATH:", DATA_PATH)




Working directory: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/06_interpretability
Repo root: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis
Added to sys.path: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/src
Using DATA_PATH: /Users/munaugas/Desktop/Thesis/adult_reconstruction.csv


## Pipeline and model training
- Load data, preprocess, split, and train models

In [13]:
# --- Pipeline (re)run: create df, X_train/y_train, X_test/y_test, best_models, numeric_cols, categorical_cols ---

import os

from thesis_pipeline.preprocessing.clean_data import load_data
from thesis_pipeline.preprocessing.feature_engineering import engineer_features_and_target
from thesis_pipeline.preprocessing.encode_features import encode_features
from thesis_pipeline.splitting.split_data import stratified_train_val_test_split

from thesis_pipeline.model_training.train_rf import train_random_forest
from thesis_pipeline.model_training.train_gbdt import train_gbdt
from thesis_pipeline.model_training.train_xgboost import train_xgboost

RANDOM_STATE = 42

# Step 1 - read the dataset. load_data() is called without arguments
# (presumably it reads the DATA_PATH environment variable - confirm in clean_data).
df = load_data()
print("Using DATA_PATH:", os.environ.get("DATA_PATH"))
print("Loaded df shape:", df.shape)

# Step 2 - derive the target and separate the feature frame from it.
X_raw, y, df_with_target = engineer_features_and_target(df)
# Rebind df to the frame returned with the target so that later subgroup
# lookups (df.loc[X_test.index, ...]) use the same index as X/y.
df = df_with_target

# Step 3 - encode the raw feature frame (X_raw, not df).
X, encoder, categorical_cols, numeric_cols = encode_features(X_raw)
print(f"#categorical cols: {len(categorical_cols)} | #numeric cols: {len(numeric_cols)}")
print("X shape:", X.shape, "| y shape:", y.shape)

# Step 4 - stratified train/validation/test split (70/15/15 per the original note).
splits = stratified_train_val_test_split(X, y, random_state=RANDOM_STATE)

X_train, y_train = splits.X_train, splits.y_train
X_val, y_val = splits.X_val, splits.y_val
X_test, y_test = splits.X_test, splits.y_test

print("Split sizes:", *(len(part) for part in (X_train, X_val, X_test)))

# 5) Train models
# Each train_* call is unpacked as "first element + rest": only the first
# returned element (used below as the fitted estimator) is kept.
_split_args = (X_train, y_train, X_val, y_val, X_test, y_test)

rf_model, *_ = train_random_forest(*_split_args)
gbdt_model, *_ = train_gbdt(*_split_args)

# XGBoost can fail on macOS if libomp isn't installed; in that case the model
# is skipped and the notebook continues with the remaining estimators.
try:
    xgb_model, *_ = train_xgboost(*_split_args)
except Exception as e:
    xgb_model = None
    print("WARNING: XGBoost training failed. Skipping XGBoost.\n", repr(e))

# Registry of fitted models; XGBoost is only included when training succeeded.
best_models = {
    "RandomForest": rf_model,
    "GBDT": gbdt_model,
    **({"XGBoost": xgb_model} if xgb_model is not None else {}),
}

print("Models available:", list(best_models))

# Sanity: features and labels of the test split must share the same index.
assert (X_test.index == y_test.index).all(), "X_test and y_test indices must align."


Using DATA_PATH: /Users/munaugas/Desktop/Thesis/adult_reconstruction.csv
Loaded df shape: (49531, 14)
#categorical cols: 8 | #numeric cols: 5
X shape: (49531, 13) | y shape: (49531,)
Split sizes: 34671 7430 7430
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Models available: ['RandomForest', 'GBDT', 'XGBoost']


- Prerequisites and sanity checks

In [14]:
# Names that the cells below read from the notebook namespace.
required = [
    "X_train", "y_train",
    "X_test", "y_test",
    "best_models",
    "numeric_cols", "categorical_cols",
    "df",
]
missing = list(filter(lambda name: name not in globals(), required))
if missing:
    raise RuntimeError(
        f"Missing required objects: {missing}. "
        "Run the pipeline cell above (or load objects) before continuing."
    )

# Subgroup analysis joins metadata on the test index, so X_test and y_test
# must be index-aligned.
assert (X_test.index == y_test.index).all(), "X_test and y_test indices must align."


## Subgroup setup 
- Build subgroup metadata (gender, race_binary, age_group) (same as SHAP)

In [15]:
# Subgroup metadata for the test rows, looked up in df by the test index.
meta_test = df.loc[X_test.index, ["gender", "race", "age"]].copy()

# Race: White vs Non-White.
# Note: every value other than the exact string "White" (including missing
# values) ends up in "Non-White".
meta_test["race_binary"] = np.where(meta_test["race"] == "White", "White", "Non-White")

# Age groups (right-closed intervals): (17, 30], (30, 45], (45, 60], (60, inf).
# The last edge is open-ended so that the "61+" label really covers every age
# above 60; with a finite upper edge of 90, ages above 90 would silently become
# NaN and drop out of the subgroup analysis. Ages <= 90 are binned exactly as before.
age_bins   = [17, 30, 45, 60, np.inf]
age_labels = ["18-30", "31-45", "46-60", "61+"]
meta_test["age_group"] = pd.cut(meta_test["age"], bins=age_bins, labels=age_labels)

meta_test.head()


Unnamed: 0,gender,race,age,race_binary,age_group
21460,Male,White,21,White,18-30
35060,Male,Amer-Indian-Eskimo,51,Non-White,46-60
1633,Male,White,34,White,31-45
22480,Female,White,26,White,18-30
47104,Female,White,28,White,18-30


## LIME explainer and helpers
- Configure LIME explainer (training distribution)
- Prediction wrapper and repeated-explanation helper

In [16]:
RANDOM_STATE = 42

# Column order of the encoded training matrix; LIME weights are reported
# against these positions.
feature_names = X_train.columns.tolist()

# Positions of the categorical columns inside the encoded matrix
# (columns listed in categorical_cols but absent from X_train are ignored).
categorical_feature_indices = [
    feature_names.index(col) for col in categorical_cols if col in feature_names
]

# The explainer is fitted on the training matrix and shared by every cell below.
explainer = LimeTabularExplainer(
    training_data=X_train.to_numpy(),
    mode="classification",
    feature_names=feature_names,
    categorical_features=categorical_feature_indices,
    class_names=["<=50K", ">50K"],
    discretize_continuous=False,
    random_state=RANDOM_STATE,
)

def make_lime_predict_fn(model):
    """
    Build the probability callback that LIME calls on perturbed samples.

    LIME needs predict_fn(X) -> array of shape (n_samples, n_classes). The
    returned closure forwards to ``model.predict_proba`` and, when the model
    yields only the positive-class probability (shape (n,) or (n, 1)),
    expands it into two columns ``[1 - p, p]``.
    """
    def _predict_fn(X_np):
        scores = model.predict_proba(X_np)
        # Flat vector of positive-class probabilities -> two columns.
        if scores.ndim == 1:
            scores = np.column_stack((1 - scores, scores))
        # Single-column matrix -> two columns.
        if scores.shape[1] == 1:
            positive = scores[:, 0]
            scores = np.column_stack((1 - positive, positive))
        return scores
    return _predict_fn

def explain_instance_repeated(model, X_row: pd.Series, *, n_runs: int, num_features: int):
    """
    Explain the same instance ``n_runs`` times with the shared LIME explainer.

    Returns a tuple ``(weights_matrix, unique_features_count)``:
    - weights_matrix: array of shape (n_runs, len(feature_names)); row i holds
      the positive-class LIME weights of run i, with zeros for features that
      the explanation did not include.
    - unique_features_count: how many distinct feature positions showed up in
      at least one run.
    """
    predict_fn = make_lime_predict_fn(model)
    row_values = X_row.values

    weights = np.zeros((n_runs, len(feature_names)), dtype=float)
    seen_features = set()

    for run in range(n_runs):
        explanation = explainer.explain_instance(
            data_row=row_values,
            predict_fn=predict_fn,
            num_features=num_features,
        )
        # as_map() is keyed by class label; 1 is the positive class.
        for feat_idx, weight in explanation.as_map()[1]:
            weights[run, feat_idx] = weight
            seen_features.add(feat_idx)

    return weights, len(seen_features)


## LIME robustness experiment
- Select test instances for LIME robustness

In [17]:
# LIME is expensive, so only a small, reproducible sample of test rows is explained.
RANDOM_STATE = 42
n_instances = 50

# Never ask for more rows than the test set contains.
_n_draw = min(n_instances, len(X_test))
instance_idx = X_test.sample(n=_n_draw, random_state=RANDOM_STATE).index
X_lime, y_lime = X_test.loc[instance_idx], y_test.loc[instance_idx]

X_lime.shape



(50, 13)

- Run LIME robustness across perturbation conditions

In [18]:
# Run LIME robustness across perturbation conditions and collect a summary table
# (the table is written to disk by the next cell).

# Fail fast: the perturbation helpers are not defined in the visible cells of this
# notebook. Without this check a fresh kernel would first spend time on the "clean"
# condition and only then die with a NameError.
_missing_helpers = [
    helper for helper in ("add_gaussian_noise", "apply_simple_shift")
    if helper not in globals()
]
if _missing_helpers:
    raise RuntimeError(
        f"Missing perturbation helpers: {_missing_helpers}. "
        "Define or import them before running the LIME robustness experiment."
    )

noise_sigmas = [0.1, 0.5, 1.0]
# Each condition is (name, parameter): None for clean, sigma for Gaussian noise,
# and the literal "shifted" for the shift condition.
conditions = [("clean", None)] + [(f"gauss_sigma_{s}", s) for s in noise_sigmas] + [("shifted", "shifted")]

n_runs = 20
# NOTE(review): the encoded matrix printed above has 13 columns, so with
# num_features=20 LIME can return every feature in every run and the
# "unique features" metric saturates at the column count. Pick a value below
# the number of columns if feature-selection variability should be informative.
num_features = 20

rows = []

for model_name, model in best_models.items():
    print(f"\n=== LIME robustness for {model_name} ===")

    for cond_name, cond_param in conditions:
        # Build perturbed version of X_lime
        if cond_name == "clean":
            X_cond = X_lime
        elif cond_name.startswith("gauss_sigma_"):
            X_cond = add_gaussian_noise(X_lime, numeric_cols, sigma=float(cond_param), random_state=RANDOM_STATE)
        elif cond_name == "shifted":
            X_cond = apply_simple_shift(X_lime, numeric_cols)
        else:
            raise ValueError(cond_name)

        coeff_stds = []
        unique_feats = []

        for idx in X_cond.index:
            weights_matrix, unique_count = explain_instance_repeated(
                model, X_cond.loc[idx], n_runs=n_runs, num_features=num_features
            )
            # Per-feature std across runs, averaged over features: one
            # variability score per instance.
            coeff_stds.append(weights_matrix.std(axis=0).mean())
            unique_feats.append(unique_count)

        # One summary row per (model, condition), averaged over instances.
        rows.append({
            "model": model_name,
            "condition": cond_name,
            "n_instances": int(len(X_cond)),
            "n_runs": int(n_runs),
            "num_features": int(num_features),
            "mean_coeff_std": float(np.mean(coeff_stds)),
            "mean_unique_features": float(np.mean(unique_feats)),
        })

lime_robustness_df = pd.DataFrame(rows)
lime_robustness_df




=== LIME robustness for RandomForest ===







=== LIME robustness for GBDT ===





=== LIME robustness for XGBoost ===


Unnamed: 0,model,condition,n_instances,n_runs,num_features,mean_coeff_std,mean_unique_features
0,RandomForest,clean,50,20,20,0.006243,13.0
1,RandomForest,gauss_sigma_0.1,50,20,20,0.006199,13.0
2,RandomForest,gauss_sigma_0.5,50,20,20,0.006258,13.0
3,RandomForest,gauss_sigma_1.0,50,20,20,0.006261,13.0
4,RandomForest,shifted,50,20,20,0.006286,13.0
5,GBDT,clean,50,20,20,0.008609,13.0
6,GBDT,gauss_sigma_0.1,50,20,20,0.008514,13.0
7,GBDT,gauss_sigma_0.5,50,20,20,0.008494,13.0
8,GBDT,gauss_sigma_1.0,50,20,20,0.008596,13.0
9,GBDT,shifted,50,20,20,0.008588,13.0


- Save robustness outputs (CSV + figures)

In [19]:
# Save LIME robustness CSV + figures

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Output folders are relative to the notebook's working directory.
for _out_dir in ("../results", "../figures"):
    os.makedirs(_out_dir, exist_ok=True)

# --- Save CSV ---
out_csv = "../results/lime_robustness_summary.csv"
lime_robustness_df.to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")

# Fixed display order for models and conditions.
# The condition names must match those generated in the robustness cell.
models = ["RandomForest", "GBDT", "XGBoost"]
conds  = ["clean", "gauss_sigma_0.1", "gauss_sigma_0.5", "gauss_sigma_1.0", "shifted"]

# Plotting copy whose "condition" column is an ordered categorical, so that
# sort_values("condition") follows the order of conds.
dfp = lime_robustness_df.assign(
    condition=lambda frame: pd.Categorical(frame["condition"], categories=conds, ordered=True)
)

# --- Plot coefficient std ---
# One line per model; x is the condition name, y the mean coefficient std.
fig, ax = plt.subplots()
for model in models:
    sub = dfp.loc[dfp["model"] == model].sort_values("condition")
    if not sub.empty:
        ax.plot(sub["condition"].astype(str), sub["mean_coeff_std"], marker="o", label=model)

ax.set_ylabel("Mean coefficient std (across repeated LIME runs)")
ax.set_title("LIME robustness: coefficient variability")
plt.xticks(rotation=30, ha="right")
ax.legend()

out_fig = "../figures/lime_robustness_coeff_std.png"
fig.savefig(out_fig, dpi=300, bbox_inches="tight")
plt.close(fig)
print(f"Saved: {out_fig}")

# --- Plot unique features (with tiny x-offsets so overlapping lines are visible) ---
plt.figure()
x = np.arange(len(conds))
offsets = np.linspace(-0.06, 0.06, num=len(models))

for off, model in zip(offsets, models):
    # Rows whose condition is not listed in conds are NaN in the categorical
    # column and have no x position, so they are left out.
    sub = (
        dfp[dfp["model"] == model]
        .dropna(subset=["condition"])
        .sort_values("condition")
    )
    if sub.empty:
        continue
    # Take x positions from the categorical codes rather than assuming every
    # model has one row per condition: a model with a missing condition used
    # to raise a shape mismatch between x and y here.
    x_pos = sub["condition"].cat.codes.to_numpy() + off
    plt.plot(
        x_pos,
        sub["mean_unique_features"].values,
        marker="o",
        linewidth=2,
        alpha=0.8,
        label=model
    )

plt.xticks(x, conds, rotation=30, ha="right")
plt.ylabel("Mean # unique features (across runs)")
plt.title("LIME robustness: feature selection variability")
plt.legend()

out_fig = "../figures/lime_robustness_unique_features.png"
plt.savefig(out_fig, dpi=300, bbox_inches="tight")
plt.close()
print(f"Saved: {out_fig}")



Saved: ../results/lime_robustness_summary.csv
Saved: ../figures/lime_robustness_coeff_std.png
Saved: ../figures/lime_robustness_unique_features.png


## Local LIME examples
- Save example LIME explanation plot per model (clean)

In [20]:
# One unperturbed ("clean") example explanation figure per model.
# The same test row is used for every model so the figures are comparable.

RANDOM_STATE = 42
example_idx = X_test.sample(n=1, random_state=RANDOM_STATE).index[0]
x0 = X_test.loc[example_idx]

for model_name, model in best_models.items():
    predict_fn = make_lime_predict_fn(model)
    exp = explainer.explain_instance(
        data_row=x0.values,
        predict_fn=predict_fn,
        num_features=20,
    )

    # as_pyplot_figure() returns the figure that is titled and saved below.
    fig = exp.as_pyplot_figure()
    plt.title(f"LIME explanation (clean) - {model_name}")

    out_fig = f"../figures/lime_example_{model_name}_clean.png".replace(" ", "")
    fig.savefig(out_fig, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved: {out_fig}")




Saved: ../figures/lime_example_RandomForest_clean.png
Saved: ../figures/lime_example_GBDT_clean.png
Saved: ../figures/lime_example_XGBoost_clean.png


- Compute subgroup LIME stability (clean)

In [21]:
group_cols = ["gender", "race_binary", "age_group"]
min_group_size = 50

rows = []

for model_name, model in best_models.items():
    print(f"\n=== Subgroup LIME stability for {model_name} ===")
    for group_col in group_cols:
        for group_value in meta_test[group_col].dropna().unique():

            # Boolean membership mask on the test index; small groups are skipped.
            mask = (meta_test[group_col] == group_value)
            n_members = int(mask.sum())
            if n_members < min_group_size:
                continue

            # Up to 30 rows per subgroup; the fixed seed gives every model the same rows.
            idxs = X_test.loc[mask].sample(
                n=min(30, n_members), random_state=RANDOM_STATE
            ).index

            coeff_stds, unique_feats = [], []

            for idx in idxs:
                weights_matrix, unique_count = explain_instance_repeated(
                    model, X_test.loc[idx], n_runs=20, num_features=20
                )
                # Per-feature std across runs, averaged over features.
                coeff_stds.append(weights_matrix.std(axis=0).mean())
                unique_feats.append(unique_count)

            rows.append(
                dict(
                    model=model_name,
                    group_col=group_col,
                    group_value=str(group_value),
                    n_instances=int(len(idxs)),
                    mean_coeff_std=float(np.mean(coeff_stds)),
                    mean_unique_features=float(np.mean(unique_feats)),
                )
            )

# One row per (model, subgroup column, subgroup value).
lime_subgroup_df = pd.DataFrame(rows)

out_csv = "../results/lime_subgroup_stability_clean.csv"
lime_subgroup_df.to_csv(out_csv, index=False)
print(f"Saved: {out_csv}")

lime_subgroup_df.head()




=== Subgroup LIME stability for RandomForest ===





=== Subgroup LIME stability for GBDT ===





=== Subgroup LIME stability for XGBoost ===
Saved: ../results/lime_subgroup_stability_clean.csv


Unnamed: 0,model,group_col,group_value,n_instances,mean_coeff_std,mean_unique_features
0,RandomForest,gender,Male,30,0.006436,13.0
1,RandomForest,gender,Female,30,0.006514,13.0
2,RandomForest,race_binary,White,30,0.006125,13.0
3,RandomForest,race_binary,Non-White,30,0.007796,13.0
4,RandomForest,age_group,18-30,30,0.006122,13.0


- Generate subgroup LIME explanation plots

In [22]:
# Generate subgroup LIME explanation plots (one representative instance per subgroup)
# This cell expects the earlier pipeline cells to have created:
# - best_models (dict of fitted models)
# - X_test (feature matrix as DataFrame)
# - meta_test (DataFrame with subgroup columns)
# - explainer (lime.lime_tabular.LimeTabularExplainer)
# - make_lime_predict_fn(model) -> callable for LIME

import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def _find_repo_root(start: Path) -> Path:
    """Walk upwards until we find a typical repo marker."""
    cur = start.resolve()
    for _ in range(10):
        if (cur / "requirements.txt").exists() or (cur / ".git").exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    return start.resolve()

# Note: this rebinds repo_root using marker-file discovery from the current directory.
repo_root = _find_repo_root(Path.cwd())

# Output folder: the SUBGROUP_LIME_DIR environment variable wins, otherwise
# <repo_root>/figures/subgroup_lime is used.
_default_fig_dir = repo_root / "figures" / "subgroup_lime"
FIG_SUBDIR = Path(os.environ.get("SUBGROUP_LIME_DIR", str(_default_fig_dir)))
FIG_SUBDIR.mkdir(parents=True, exist_ok=True)
print("Saving subgroup LIME plots to:", str(FIG_SUBDIR))

# Settings fall back in this order: upper-case name already in the notebook
# namespace, then the lower-case variant from earlier cells, then the default.
_ns = globals()
GROUP_COLS = _ns.get("GROUP_COLS", _ns.get("group_cols", ["gender", "race_binary", "age_group"]))
MIN_GROUP_SIZE = int(_ns.get("MIN_GROUP_SIZE", _ns.get("min_group_size", 50)))
N_PLOTS_PER_SUBGROUP = int(_ns.get("N_PLOTS_PER_SUBGROUP", 1))
NUM_FEATURES_PLOT = int(_ns.get("NUM_FEATURES_PLOT", 10))
RANDOM_STATE = int(_ns.get("RANDOM_STATE", 42))

def safe_name(x) -> str:
    """
    Turn any value into a string that is safe to embed in a file name.

    Spaces are removed, "/" becomes "_", "+" becomes "plus", and every
    remaining character outside ``A-Z a-z 0-9 _ - .`` is replaced by "_".
    """
    substitutions = str.maketrans({" ": "", "/": "_", "+": "plus"})
    text = str(x).translate(substitutions)
    return re.sub(r"[^A-Za-z0-9_\-\.]", "_", text)

def explain_once(model, X_row: pd.Series, *, num_features: int = 10):
    """
    Run the shared LIME explainer once for a single row.

    The row is passed as a NumPy array and the model is wrapped with
    ``make_lime_predict_fn``; the LIME explanation object is returned as-is.
    """
    lime_kwargs = {
        "predict_fn": make_lime_predict_fn(model),
        "data_row": X_row.values,
        "num_features": num_features,
    }
    return explainer.explain_instance(**lime_kwargs)

# Guard: the plotting loop below reads these objects from earlier cells.
required = ["best_models", "X_test", "meta_test", "explainer", "make_lime_predict_fn"]
missing = list(filter(lambda name: name not in globals(), required))
if missing:
    _missing_list = ", ".join(missing)
    raise RuntimeError(
        "Missing required objects before plotting subgroup LIME: "
        + _missing_list
        + ". Run the setup/training cells above first."
    )

# One explanation figure per (model, subgroup value, sampled instance).
for model_name, model in best_models.items():
    print(f"\n--- Model: {model_name} ---")

    for group_col in GROUP_COLS:
        if group_col not in meta_test.columns:
            print(f"Skipping {group_col}: not found in meta_test")
            continue

        group_series = meta_test[group_col]
        for group_value in group_series.dropna().unique():
            mask = (group_series == group_value)
            n_group = int(mask.sum())
            # Subgroups below the minimum size get no figure.
            if n_group < MIN_GROUP_SIZE:
                continue

            # Seeded draw of the representative instance(s) for this subgroup.
            n_draw = min(N_PLOTS_PER_SUBGROUP, n_group)
            idxs = X_test.loc[mask].sample(n=n_draw, random_state=RANDOM_STATE).index

            for j, idx in enumerate(idxs, start=1):
                exp = explain_once(model, X_test.loc[idx], num_features=NUM_FEATURES_PLOT)
                fig = exp.as_pyplot_figure()
                plt.title(f"LIME ({model_name}) - {group_col}={group_value}")

                # File name parts are sanitised so values such as "61+" are path-safe.
                fname = f"lime_{safe_name(model_name)}_{safe_name(group_col)}_{safe_name(group_value)}_{j}.png"
                outpath = FIG_SUBDIR / fname
                fig.savefig(outpath, dpi=300, bbox_inches="tight")
                # Close each figure so the loop does not accumulate open figures.
                plt.close(fig)

                print(f"Saved: {outpath}")


Saving subgroup LIME plots to: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime

--- Model: RandomForest ---




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_gender_Male_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_gender_Female_1.png




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_race_binary_White_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_race_binary_Non-White_1.png




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_age_group_18-30_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_age_group_46-60_1.png




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_age_group_31-45_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_RandomForest_age_group_61plus_1.png

--- Model: GBDT ---
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_gender_Male_1.png




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_gender_Female_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_race_binary_White_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_race_binary_Non-White_1.png




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_age_group_18-30_1.png




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_age_group_46-60_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_age_group_31-45_1.png




Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_GBDT_age_group_61plus_1.png

--- Model: XGBoost ---
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_XGBoost_gender_Male_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_XGBoost_gender_Female_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_XGBoost_race_binary_White_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_XGBoost_race_binary_Non-White_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_XGBoost_age_group_18-30_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/subgroup_lime/lime_XGBoost_age_group_46-60_1.png
Saved: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/figures/s

- Export LIME robustness summary table to LaTeX

In [23]:
# Write the LIME robustness summary as a LaTeX table (e.g. for Overleaf).
out_tex = "../results/lime_robustness_summary.tex"

cols = ["model", "condition", "mean_coeff_std", "mean_unique_features"]
df_tex = lime_robustness_df[cols].copy()

# Render the numeric columns as fixed-precision strings before export.
# NOTE(review): three decimals collapse values such as 0.006243 and 0.006199
# to the same "0.006" - confirm this precision is intended for the table.
_number_formats = {"mean_coeff_std": "{:.3f}", "mean_unique_features": "{:.2f}"}
for _col, _fmt in _number_formats.items():
    df_tex[_col] = df_tex[_col].map(_fmt.format)

latex = df_tex.to_latex(
    index=False,
    escape=True,
    label="tab:lime-robustness",
    caption="LIME robustness summary (mean coefficient variability and feature selection variability) across perturbation conditions.",
)

with open(out_tex, "w") as f:
    f.write(latex)

print(f"Saved: {out_tex}")


Saved: ../results/lime_robustness_summary.tex


- Export subgroup LIME stability table to LaTeX

In [24]:
# Write the subgroup LIME stability results as a LaTeX table (e.g. for Overleaf).
out_tex = "../results/lime_subgroup_stability_clean.tex"

cols = ["model", "group_col", "group_value", "mean_coeff_std", "mean_unique_features"]
df_tex = lime_subgroup_df[cols].copy()

# Render the numeric columns as fixed-precision strings before export.
_number_formats = {"mean_coeff_std": "{:.3f}", "mean_unique_features": "{:.2f}"}
for _col, _fmt in _number_formats.items():
    df_tex[_col] = df_tex[_col].map(_fmt.format)

latex = df_tex.to_latex(
    index=False,
    escape=True,
    label="tab:lime-subgroup-stability",
    caption="LIME subgroup stability (clean test set).",
)

with open(out_tex, "w") as f:
    f.write(latex)

print(f"Saved: {out_tex}")


Saved: ../results/lime_subgroup_stability_clean.tex
