# Baseline performance evaluation (clean data)

This notebook reproduces the baseline predictive performance results reported in the thesis.
It executes the end-to-end pipeline on the Adult Reconstruction dataset:

1) Load raw data (via `DATA_PATH`)  
2) Preprocess + feature engineering + encoding  
3) Stratified split (70/15/15)  
4) Train/tune models (RF, GBDT, XGBoost)  
5) Evaluate baseline performance on train/validation/test  
6) Export tables to `results/` and figures to `figures/`


## Pipeline setup: data loading, preprocessing, and model training

In [1]:
# --- Path setup: make src/thesis_pipeline importable ---
import sys
from pathlib import Path

# Locate the repo root by walking upward from the working directory until a
# folder containing src/thesis_pipeline is found. This works whether the
# notebook is launched from the repo root or from a subfolder
# (e.g., 04_baseline_performance). If no ancestor matches, fall back to the
# previous assumption that the repo root is one level above the notebook.
notebook_dir = Path.cwd()
repo_root = next(
    (
        candidate
        for candidate in (notebook_dir, *notebook_dir.parents)
        if (candidate / "src" / "thesis_pipeline").is_dir()
    ),
    notebook_dir.parent,
)
src_path = repo_root / "src"

# Insert only once so re-running this cell does not duplicate the entry.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

print("Working directory:", notebook_dir)
print("Repo root:", repo_root)
print("Added to sys.path:", src_path)


Working directory: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/04_baseline_performance
Repo root: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis
Added to sys.path: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/src


In [2]:
# Sanity check: import the package and report which file it was loaded from,
# so a copy picked up from an unexpected location is easy to spot.
import thesis_pipeline

pipeline_location = thesis_pipeline.__file__
print("thesis_pipeline loaded from:", pipeline_location)


thesis_pipeline loaded from: /Users/munaugas/MSc_Data_Science_Thesis/MSc_Data_Science_Thesis/src/thesis_pipeline/__init__.py


In [3]:
import os

from thesis_pipeline.baseline_performance.evaluate_baseline import evaluate_baseline_models

# Data loading, preprocessing, and splitting helpers
from thesis_pipeline.preprocessing.clean_data import load_data
from thesis_pipeline.preprocessing.encode_features import encode_features
from thesis_pipeline.preprocessing.feature_engineering import engineer_features_and_target
from thesis_pipeline.splitting.split_data import stratified_train_val_test_split


from thesis_pipeline.model_training.train_rf import train_random_forest
from thesis_pipeline.model_training.train_gbdt import train_gbdt
from thesis_pipeline.model_training.train_xgboost import train_xgboost

# Load raw data.
# load_data() is called without arguments; per the notebook header the dataset
# location is supplied through the DATA_PATH environment variable.
# setdefault() keeps a DATA_PATH that is already set in the environment and
# only falls back to the author's local copy when none is set, so the notebook
# can be pointed at another location without editing this cell.
os.environ.setdefault(
    "DATA_PATH", "/Users/munaugas/Desktop/Thesis/adult_reconstruction.csv"
)
print("Using DATA_PATH:", os.environ["DATA_PATH"])

df = load_data()

# Preprocessing: derive features and target, then encode the feature matrix.
X_raw, y, df_with_target = engineer_features_and_target(df)
X, encoder, categorical_cols, numeric_cols = encode_features(X_raw)


# Splitting.
# The result is kept under its own name (`split_result`) because a later cell
# binds `splits` to a plain dict; reusing one name for two different kinds of
# object makes out-of-order re-runs fail in confusing ways.
split_result = stratified_train_val_test_split(X, y)

X_train = split_result.X_train
X_val   = split_result.X_val
X_test  = split_result.X_test
y_train = split_result.y_train
y_val   = split_result.y_val
y_test  = split_result.y_test


# Model training (each training function returns: model, eval_table, best_params)
rf_model, rf_eval, rf_params = train_random_forest(X_train, y_train, X_val, y_val, X_test, y_test)
gbdt_model, gbdt_eval, gbdt_params = train_gbdt(X_train, y_train, X_val, y_val, X_test, y_test)
xgb_model, xgb_eval, xgb_params = train_xgboost(X_train, y_train, X_val, y_val, X_test, y_test)

# Fitted models keyed by the display name used in the results table and figures.
best_models = {
    "RandomForest": rf_model,
    "GBDT": gbdt_model,
    "XGBoost": xgb_model,
}


Using DATA_PATH: /Users/munaugas/Desktop/Thesis/adult_reconstruction.csv
Using DATA_PATH: /Users/munaugas/Desktop/Thesis/adult_reconstruction.csv
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Fitting 3 folds for each of 15 candidates, totalling 45 fits


Assemble the train/validation/test splits into the dictionary passed to `evaluate_baseline_models`

In [4]:
# Bundle each partition's features and labels under its split name; the
# resulting dict is what gets handed to the evaluation step.
splits = {
    split_name: {"X": features, "y": labels}
    for split_name, features, labels in (
        ("train", X_train, y_train),
        ("val", X_val, y_val),
        ("test", X_test, y_test),
    )
}


Evaluate each trained model on the train, validation, and test splits

In [5]:
# Score the trained models in `best_models` on the partitions in `splits`.
# Judging by the rendered output, the result has one row per (model, split)
# pair with accuracy, f1 and roc_auc columns; later cells filter it by the
# "split" column and export it to CSV and LaTeX.
baseline_results = evaluate_baseline_models(best_models, splits)
# Bare last expression so the notebook renders the table as the cell output.
baseline_results



Unnamed: 0,model,split,accuracy,f1,roc_auc
0,RandomForest,train,0.920654,0.819571,0.976762
1,RandomForest,val,0.862046,0.680785,0.912252
2,RandomForest,test,0.86891,0.692551,0.921222
3,GBDT,train,0.883678,0.736112,0.94226
4,GBDT,val,0.870256,0.704838,0.924899
5,GBDT,test,0.878062,0.722936,0.928638
6,XGBoost,train,0.892994,0.757801,0.951456
7,XGBoost,val,0.873217,0.712805,0.925909
8,XGBoost,test,0.878062,0.722766,0.930265


Save baseline performance figures

In [6]:
import matplotlib.pyplot as plt

# Make sure the output folder exists before any figure is written.
os.makedirs("../figures", exist_ok=True)

# Only the test-split rows are plotted.
is_test_row = baseline_results["split"] == "test"
df_test = baseline_results.loc[is_test_row].copy()
metrics = ["accuracy", "f1", "roc_auc"]

# One bar chart per metric, with one bar per model.
for metric in metrics:
    outpath = f"../figures/baseline_test_{metric}.png"

    fig, ax = plt.subplots()
    ax.bar(df_test["model"], df_test[metric])
    ax.set_ylabel(metric.upper())
    ax.set_title(f"Baseline test performance ({metric.upper()})")

    fig.savefig(outpath, dpi=300, bbox_inches="tight")
    # Close the figure explicitly so open figures do not pile up across iterations.
    plt.close(fig)
    print(f"Saved: {outpath}")


Saved: ../figures/baseline_test_accuracy.png
Saved: ../figures/baseline_test_f1.png
Saved: ../figures/baseline_test_roc_auc.png


Save results (CSV and LaTeX table)

In [7]:
# Write the unrounded metrics table to disk, creating the target folder first
# if it does not exist yet.
results_csv = "../results/baseline_metrics.csv"
os.makedirs(os.path.dirname(results_csv), exist_ok=True)

baseline_results.to_csv(results_csv, index=False)
print("Saved: ../results/baseline_metrics.csv")

Saved: ../results/baseline_metrics.csv


In [8]:
# Work on a copy so the full-precision table in `baseline_results` is untouched.
baseline_results_fmt = baseline_results.copy()

# Round numeric columns
metric_cols = ["accuracy", "f1", "roc_auc"]
baseline_results_fmt[metric_cols] = baseline_results_fmt[metric_cols].round(3)

# Create the output folder here as well, so this cell does not depend on the
# CSV export cell having run first.
os.makedirs("../results", exist_ok=True)

latex_path = "../results/baseline_metrics.tex"
baseline_results_fmt.to_latex(
    latex_path,
    index=False,
    caption="Baseline predictive performance on train, validation, and test sets.",
    label="tab:baseline-performance",
    float_format="%.3f",
    # Escape LaTeX special characters explicitly: the `roc_auc` header contains
    # an underscore, and pandas >= 2.0 no longer escapes by default, which
    # would produce a table that fails to compile.
    escape=True,
)
print(f"Saved: {latex_path}")
