## Setup
- Setup: paths, environment, imports

In [None]:
# --- Path + environment setup ---
import os
import sys
from pathlib import Path

# Repo root is taken as the parent of the current working directory, which
# works when the notebook is run from a first-level subfolder (e.g., 04_/05_/06_).
repo_root = Path.cwd().resolve().parents[0]
src_path = repo_root / "src"
# Put <repo_root>/src at the front of sys.path (only once, so re-running the
# cell does not add duplicates) so packages under src/ can be imported.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Optional: load .env locally (recommended).
# Only a missing python-dotenv package is tolerated; any other failure while
# loading the file is allowed to surface instead of being silently swallowed.
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is not installed: rely on variables already in the environment.
    pass
else:
    load_dotenv(repo_root / ".env")

# DATA_PATH must come from the environment (or the .env file loaded above);
# stop early with instructions when it is missing or empty.
DATA_PATH = os.environ.get("DATA_PATH", "")
if not DATA_PATH:
    raise RuntimeError(
        "DATA_PATH is not set. Create a .env file (not committed) with:\n"
        "DATA_PATH=/absolute/path/to/adult_reconstruction.csv"
    )

print("Repo root:", repo_root)
print("Using DATA_PATH:", DATA_PATH)



Using DATA_PATH: /Users/munaugas/Desktop/Thesis/adult_reconstruction.csv


## Load pipeline artifacts and train models
- Load data, preprocess, split, and train models

In [2]:
# --- Rebuild the baseline pipeline: load -> engineer features -> encode -> split -> train ---

from thesis_pipeline.preprocessing.clean_data import load_data
from thesis_pipeline.preprocessing.feature_engineering import engineer_features_and_target
from thesis_pipeline.preprocessing.encode_features import encode_features
from thesis_pipeline.splitting.split_data import stratified_train_val_test_split

from thesis_pipeline.model_training.train_rf import train_random_forest
from thesis_pipeline.model_training.train_gbdt import train_gbdt
from thesis_pipeline.model_training.train_xgboost import train_xgboost

# Raw data -> (features, target) -> encoded feature matrix.
df = load_data()
X_raw, y, df_with_target = engineer_features_and_target(df)
X, encoder, categorical_cols, numeric_cols = encode_features(X_raw)

# Train / validation / test partitions are exposed as attributes of the split object.
splits_obj = stratified_train_val_test_split(X, y)
X_train, X_val, X_test = splits_obj.X_train, splits_obj.X_val, splits_obj.X_test
y_train, y_val, y_test = splits_obj.y_train, splits_obj.y_val, splits_obj.y_test

# Every trainer receives the same six arrays and returns a 3-tuple
# (model, evaluation table, best parameters).
split_args = (X_train, y_train, X_val, y_val, X_test, y_test)
rf_model, rf_eval, rf_params = train_random_forest(*split_args)
gbdt_model, gbdt_eval, gbdt_params = train_gbdt(*split_args)
xgb_model, xgb_eval, xgb_params = train_xgboost(*split_args)

# Display name -> fitted model; later cells iterate over this mapping.
best_models = dict(RandomForest=rf_model, GBDT=gbdt_model, XGBoost=xgb_model)

print("Models trained:", list(best_models.keys()))
print("Numeric columns used for perturbations:", len(numeric_cols))




Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Models trained: ['RandomForest', 'GBDT', 'XGBoost']
Numeric columns used for perturbations: 5


## Evaluation helpers
- Metric helpers and unified evaluation function

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def _scores_for_auc(model, X):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        s = model.decision_function(X)
        denom = s.max() - s.min()
        return np.zeros_like(s, dtype=float) if denom == 0 else (s - s.min()) / denom
    return model.predict(X)

def evaluate_model(model, X, y, model_name, condition):
    """Score ``model`` on ``(X, y)`` and return one result row as a dict.

    Args:
        model: Object with ``predict`` (and optionally ``predict_proba`` or
            ``decision_function``, used for the ROC-AUC scores).
        X: Features passed to the model.
        y: True labels compared against the predictions.
        model_name: Stored under the ``"model"`` key.
        condition: Stored under the ``"condition"`` key.

    Returns:
        Dict with keys ``model``, ``condition``, ``accuracy``, ``f1`` and
        ``roc_auc``; the three metrics are plain Python floats.
    """
    predictions = model.predict(X)
    scores = _scores_for_auc(model, X)

    # ROC-AUC is undefined when only one class is present in y -> report NaN.
    if len(set(y)) > 1:
        auc = float(roc_auc_score(y, scores))
    else:
        auc = float("nan")

    row = {"model": model_name, "condition": condition}
    row["accuracy"] = float(accuracy_score(y, predictions))
    row["f1"] = float(f1_score(y, predictions))
    row["roc_auc"] = auc
    return row


## Robustness experiment design
- Perturbation settings (Gaussian noise levels, distribution shift)
- Run robustness evaluation (clean + noise + shift)

In [4]:
# pandas builds the summary frame at the end of this cell; no earlier cell
# imports it, so import it here to keep the cell runnable on a fresh kernel.
import pandas as pd

# NOTE(review): add_gaussian_noise and apply_simple_shift are not imported or
# defined in any cell visible in this notebook - confirm they are brought into
# scope before running Restart & Run All.

noise_sigmas = [0.1, 0.5, 1.0]

# Build each evaluation set once instead of once per model. Insertion order
# (clean, noise levels, shift) fixes the row order within every model.
# Assumes add_gaussian_noise with a fixed random_state is deterministic and
# that neither helper modifies X_test in place - TODO confirm.
test_conditions = {"test_clean": X_test}
for sigma in noise_sigmas:
    test_conditions[f"test_gauss_sigma_{sigma}"] = add_gaussian_noise(
        X_test, numeric_cols, sigma=sigma, random_state=42
    )

# Simple structured shift (no random_state assumed; add only if your function supports it)
X_shifted = apply_simple_shift(X_test, numeric_cols)
test_conditions["test_shifted"] = X_shifted

# One result row per (model, condition), grouped by model.
robustness_results = [
    evaluate_model(model, X_condition, y_test, model_name, condition)
    for model_name, model in best_models.items()
    for condition, X_condition in test_conditions.items()
]

robustness_df = pd.DataFrame(robustness_results)
robustness_df


Unnamed: 0,model,condition,accuracy,f1,roc_auc
0,RandomForest,test_clean,0.86891,0.692551,0.921222
1,RandomForest,test_gauss_sigma_0.1,0.869044,0.693157,0.921293
2,RandomForest,test_gauss_sigma_0.5,0.867699,0.689022,0.920764
3,RandomForest,test_gauss_sigma_1.0,0.867429,0.686206,0.91846
4,RandomForest,test_shifted,0.859219,0.682646,0.913281
5,GBDT,test_clean,0.878062,0.722936,0.928638
6,GBDT,test_gauss_sigma_0.1,0.878062,0.722936,0.928638
7,GBDT,test_gauss_sigma_0.5,0.877389,0.721662,0.927206
8,GBDT,test_gauss_sigma_1.0,0.87214,0.707152,0.923909
9,GBDT,test_shifted,0.864065,0.700119,0.912594


## Export results

- Save robustness summary (CSV)

In [5]:
import os

# Write the robustness table as CSV; the target folder is created when missing
# and index=False keeps the row index out of the file.
csv_path = "../results/robustness_performance_summary.csv"

os.makedirs(os.path.dirname(csv_path), exist_ok=True)
robustness_df.to_csv(csv_path, index=False)
print(f"Saved: {csv_path}")


Saved: ../results/robustness_performance_summary.csv


- Prepare data for the robustness figures (output folder, parsed noise level)

In [6]:
import os
import matplotlib.pyplot as plt
import numpy as np
import re

# Plot from a copy so columns added for plotting never touch robustness_df.
# Note: this rebinds `df`, which earlier held the raw data from load_data().
df = robustness_df.copy()

# Figures are written to ../figures; create the folder if it does not exist yet.
os.makedirs("../figures", exist_ok=True)

def extract_sigma(cond):
    """Parse the noise level out of a ``test_gauss_sigma_<number>`` label.

    Returns the number as a float, or NaN when ``cond`` does not contain
    that pattern (for example ``test_clean``).
    """
    found = re.search(r"test_gauss_sigma_(\d+(\.\d+)?)", cond)
    if found is None:
        return np.nan
    return float(found.group(1))

# Numeric noise level per row; NaN for conditions that are not Gaussian noise.
df["sigma"] = [extract_sigma(label) for label in df["condition"]]



## Visualisations

- Gaussian noise figures

In [7]:
metrics = ["accuracy", "f1", "roc_auc"]
noise_levels = [0.1, 0.5, 1.0]

# Keep only the Gaussian-noise rows whose parsed sigma is one of the plotted levels.
df_noise = df[df["condition"].str.contains("test_gauss_sigma")].copy()
df_noise = df_noise[df_noise["sigma"].isin(noise_levels)]

# One figure per metric: metric value against noise level, one line per model.
for metric in metrics:
    fig, ax = plt.subplots()
    for model_name in df_noise["model"].unique():
        sub = df_noise[df_noise["model"] == model_name].sort_values("sigma")
        ax.plot(sub["sigma"], sub[metric], marker="o", label=model_name)

    # The sigma symbol is written as mathtext so the source stays ASCII-only;
    # the previous label contained a mis-encoded character sequence.
    ax.set_xlabel(r"Gaussian noise level $\sigma$")
    ax.set_ylabel(metric.upper())
    ax.set_title(f"Robustness under Gaussian noise ({metric.upper()})")
    ax.legend()

    outpath = f"../figures/robustness_noise_{metric}.png"
    fig.savefig(outpath, dpi=300, bbox_inches="tight")
    # Close explicitly so figures do not accumulate in memory across the loop.
    plt.close(fig)

    print(f"Saved: {outpath}")



Saved: ../figures/robustness_noise_accuracy.png
Saved: ../figures/robustness_noise_f1.png
Saved: ../figures/robustness_noise_roc_auc.png


- Clean versus shifted comparison

In [8]:
# Rows for the unperturbed and the shifted test set only.
df_shift = df[df["condition"].isin(["test_clean", "test_shifted"])].copy()

bar_width = 0.35

# One grouped bar chart per metric: clean and shifted bars side by side per model.
for metric in metrics:
    fig, ax = plt.subplots()

    # Reshape to one row per model with one column per condition.
    wide = df_shift.pivot(index="model", columns="condition", values=metric)
    model_labels = wide.index.tolist()
    positions = np.arange(len(model_labels))

    # Offset each pair by half a bar width around the model's tick position.
    ax.bar(positions - bar_width / 2, wide["test_clean"].values, bar_width, label="Clean")
    ax.bar(positions + bar_width / 2, wide["test_shifted"].values, bar_width, label="Shifted")

    ax.set_xticks(positions)
    ax.set_xticklabels(model_labels)
    ax.set_ylabel(metric.upper())
    ax.set_title(f"Clean vs shifted performance ({metric.upper()})")
    ax.legend()

    outpath = f"../figures/robustness_shift_{metric}.png"
    fig.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close(fig)

    print(f"Saved: {outpath}")

Saved: ../figures/robustness_shift_accuracy.png
Saved: ../figures/robustness_shift_f1.png
Saved: ../figures/robustness_shift_roc_auc.png


- Export robustness summary (LaTeX)

In [9]:
# Write the robustness table as a LaTeX file (Overleaf-ready)
import os

out_tex = "../results/robustness_performance_summary.tex"
os.makedirs("../results", exist_ok=True)

# Only the numeric metric columns are rounded; model/condition strings stay as they are.
metric_cols = ["accuracy", "f1", "roc_auc"]
robustness_fmt = robustness_df.copy()
robustness_fmt[metric_cols] = robustness_fmt[metric_cols].round(3)

latex_options = {
    "index": False,
    "caption": "Predictive performance under Gaussian noise and a simple distribution shift on the test set.",
    "label": "tab:robustness-performance",
    "float_format": "%.3f",
    # Escaping keeps underscores and similar characters in the labels valid in LaTeX.
    "escape": True,
}
robustness_fmt.to_latex(out_tex, **latex_options)

print(f"Saved: {out_tex}")


Saved: ../results/robustness_performance_summary.tex
