## Imports & Setup

In [1]:
# === 0. Imports & Global Config ===
import os
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier
from tabpfn import TabPFNClassifier  # adjust import if needed
import torch, numpy as np, random

RANDOM_STATE = 42
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

np.random.seed(RANDOM_STATE)



## Data Loading

`stress_binary_personal-full` pickle:

- `D-2`: `/var/nfs_share/Overfitting/D-2/Intermediate/stress_binary_personal-full.pkl`
- `D-3`: `/var/nfs_share/Overfitting/D-3/Intermediate/stress_binary_personal-full_D#3.pkl`
- `D-4`: `/var/nfs_share/Overfitting/D-4/Intermediate/stress_binary_personal-full.pkl`

In [2]:
data_path = '/var/nfs_share/Overfitting/'

FILES = {
    'D-2': os.path.join(data_path, 'D-2', 'Intermediate', 'stress_binary_personal-full.pkl'),
    'D-3': os.path.join(data_path, 'D-3', 'Intermediate', 'stress_binary_personal-full_D#3.pkl'),
    'D-4': os.path.join(data_path, 'D-4', 'Intermediate', 'stress_binary_personal-full.pkl'),
}

def load_and_attach(path, dataset_tag):
    df, y, groups, t, datetimes = pd.read_pickle(path)

    # meta (including dataset tag) but we will NOT keep label inside X
    meta = pd.DataFrame({
        'META#dataset': dataset_tag,
        'PIF#participantID': groups,
        'PIF#time_offset': t,
        'PIF#timestamp': datetimes,
    })

    assert len(df) == len(meta), f"Row mismatch in {dataset_tag}"
    out = pd.concat(
        [meta.reset_index(drop=True), df.reset_index(drop=True)],
        axis=1
    )

    # meta first, then sorted feature columns
    meta_cols = ['META#dataset', 'PIF#participantID', 'PIF#time_offset', 'PIF#timestamp']

    feature_cols = sorted(
        c for c in out.columns
        if c not in meta_cols)

    X = out[meta_cols + feature_cols]
    y = pd.Series(y, name='PIF#stress_label')
    return X, y


# ---- Load datasets ----
df_1, df_y_1 = load_and_attach(FILES['D-2'], 'D-2')
df_2, df_y_2 = load_and_attach(FILES['D-3'], 'D-3')
df_3, df_y_3 = load_and_attach(FILES['D-4'], 'D-4')


In [3]:
# Get the overlap columns
META_COLS = ['META#dataset', 'PIF#participantID', 'PIF#time_offset', 'PIF#timestamp']

def get_common_cols(dfs):
    feature_sets = [
        set(df.columns) - set(META_COLS)
        for df in dfs
    ]
    common = sorted(set.intersection(*feature_sets))
    return META_COLS + common

# get overlap
COMMON_COLS = get_common_cols([df_1, df_2, df_3])

# aligned datasets (same columns, same order)
df_1_over = df_1[COMMON_COLS]
df_2_over = df_2[COMMON_COLS]
df_3_over = df_3[COMMON_COLS]


In [4]:
df_1_over.shape, df_2_over.shape, df_3_over.shape

((10075, 5021), (20831, 5021), (21619, 5021))

In [5]:
df_1_over

Unnamed: 0,META#dataset,PIF#participantID,PIF#time_offset,PIF#timestamp,ACE_BCC#ASC#ImmediatePast_15,ACE_BCC#ASC#ImmediatePast_30,ACE_BCC#ASC#YesterdayAfternoon,ACE_BCC#ASC#YesterdayDawn,ACE_BCC#ASC#YesterdayEvening,ACE_BCC#ASC#YesterdayLateAfternoon,...,WLS#ETP##YesterdayMorning,WLS#ETP##YesterdayNight,WLS#ETP#_TodayAfternoon,WLS#ETP#_TodayDawn,WLS#ETP#_TodayEvening,WLS#ETP#_TodayLateAfternoon,WLS#ETP#_TodayMorning,WLS#ETP#_TodayNight,WLS#VAL=BT_OFF,WLS#VAL=BT_ON
0,D-2,P033,40609.415,2020-02-08 11:16:49.415000+09:00,3.06,3.86,0.000000,0.0,16.949999,9.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True
1,D-2,P033,47486.022,2020-02-08 13:11:26.022000+09:00,0.34,1.05,0.000000,0.0,16.949999,9.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True
2,D-2,P033,50006.386,2020-02-08 13:53:26.386000+09:00,1.19,2.90,0.000000,0.0,16.949999,9.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True
3,D-2,P033,58937.042,2020-02-08 16:22:17.042000+09:00,1.35,1.64,0.000000,0.0,16.949999,9.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True
4,D-2,P033,62263.538,2020-02-08 17:17:43.538000+09:00,0.40,1.44,0.000000,0.0,16.949999,9.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10070,D-2,P138,4310332.717,2020-03-28 21:18:52.717000+09:00,0.00,0.28,8.879999,0.0,1.160000,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False
10071,D-2,P138,4364048.818,2020-03-29 12:14:08.818000+09:00,0.16,4.14,2.680000,0.0,4.059999,4.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False
10072,D-2,P138,4379381.637,2020-03-29 16:29:41.637000+09:00,0.00,0.00,2.680000,0.0,4.059999,4.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False
10073,D-2,P138,4443941.837,2020-03-30 10:25:41.837000+09:00,0.88,0.88,0.700000,0.0,5.740000,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False


##  Baseline models benchmarking in D3

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tabpfn import TabPFNClassifier

In [7]:
# Define the columns that are identifiers or targets
drop_cols = [
    'PIF#participantID', 
    'PIF#timestamp', 
    'PIF#participationStartTimestamp', 
    'PIF#time_offset', 
    'META#dataset', 
    '__src',
    'PIF#stress_label'
]

# Create your Feature set (X) and Target (y)
X = df_3_over.drop(columns=drop_cols, errors='ignore') # errors='ignore' prevents crash if col is missing
y = df_y_3

In [8]:
# === 1. Train–Test Split ===

# If it's classification, strongly prefer stratified splitting
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=RANDOM_STATE,
    stratify=y  # remove if regression, but for your case it's prob. classification
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

Train shape: (14484, 5016)
Test shape : (7135, 5016)


In [9]:
# === 2. Define Models ===
def get_models():
    models = {}

    # 1) TabPFN
    models["TabPFN"] = TabPFNClassifier(
        device="cuda",
        ignore_pretraining_limits=True,
    )

    # 2) MLP (with scaling in a pipeline)
    mlp = MLPClassifier(
        hidden_layer_sizes=(256, 128),
        activation="relu",
        solver="adam",
        batch_size=256,
        learning_rate_init=1e-3,
        alpha=1e-4,
        random_state=RANDOM_STATE,
    )


    models["MLP"] = make_pipeline(
        StandardScaler(),  # with_mean=False if X is sparse
        mlp
    )

    # 3) XGBoost
    models["XGBoost"] = XGBClassifier(
        objective="binary:logistic",  # change if multi-class
        eval_metric="logloss",
        random_state=RANDOM_STATE,
    )

    return models

models = get_models()
models


{'TabPFN': TabPFNClassifier(device='cuda', ignore_pretraining_limits=True),
 'MLP': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('mlpclassifier',
                  MLPClassifier(batch_size=256, hidden_layer_sizes=(256, 128),
                                random_state=42))]),
 'XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_est

In [10]:
# === 3. Evaluation Utilities ===

def compute_metrics(y_true, y_pred, y_proba=None):
    """Return a dict of key metrics for classification."""
    metrics = {}
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["f1_macro"] = f1_score(y_true, y_pred, average="macro")

    # ROC-AUC only if we have probabilities
    if y_proba is not None:
        n_classes = len(np.unique(y_true))
        try:
            if n_classes == 2:
                # y_proba is (N,2) or (N,)
                if y_proba.ndim == 2:
                    y_score = y_proba[:, 1]
                else:
                    y_score = y_proba
                metrics["roc_auc"] = roc_auc_score(y_true, y_score)
            else:
                # multiclass
                metrics["roc_auc_ovr"] = roc_auc_score(
                    y_true, y_proba, multi_class="ovr"
                )
        except Exception as e:
            # Don't crash experiment because of ROC-AUC issues
            metrics["roc_auc_error"] = str(e)

    return metrics

In [11]:

def evaluate_model(name, model, X_train, y_train, X_test, y_test, verbose=False):
    """Fit model, predict, compute metrics, and (optionally) print a report."""
    print(f"\n=== Training {name} ===")
    model.fit(X_train, y_train)

    print(f"Predicting with {name}...")
    y_pred = model.predict(X_test)

    # Probabilities if available
    y_proba = None
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)
    elif hasattr(model, "decision_function"):
        # some models use decision_function instead of predict_proba
        y_proba = model.decision_function(X_test)

    metrics = compute_metrics(y_test, y_pred, y_proba=y_proba)

    if verbose:
        print(f"\n{name} – classification report")
        print(classification_report(y_test, y_pred))
        print(f"{name} – confusion matrix")
        print(confusion_matrix(y_test, y_pred))

    return metrics


In [12]:
# === 4. Run Benchmark (Single Train/Test Split) ===

results = []

for name, model in models.items():
    metrics = evaluate_model(
        name,
        model,
        X_train, y_train,
        X_test, y_test,
        verbose=True   # set False if you don’t want full report each time
    )
    row = {"model": name}
    row.update(metrics)
    results.append(row)

results_df = pd.DataFrame(results).set_index("model")
results_df

# Optional: sort by accuracy or any metric
results_df.sort_values("accuracy", ascending=False)


=== Training TabPFN ===
Predicting with TabPFN...


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.62 GiB. GPU 0 has a total capacity of 23.64 GiB of which 1.04 GiB is free. Process 256521 has 21.71 GiB memory in use. Including non-PyTorch memory, this process has 904.00 MiB memory in use. Of the allocated memory 368.77 MiB is allocated by PyTorch, and 73.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# === 5. Optional: K-Fold Cross-Validation Benchmark ===

def crossval_benchmark(models, X, y, n_splits=5):
    skf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=RANDOM_STATE
    )

    all_rows = []

    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        print(f"\n===== Fold {fold_idx}/{n_splits} =====")

        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]

        for name, base_model in models.items():
            # Important: clone models per fold to avoid state leakage
            from sklearn.base import clone
            model = clone(base_model)

            metrics = evaluate_model(
                f"{name} (fold {fold_idx})",
                model,
                X_tr, y_tr,
                X_te, y_te,
                verbose=False
            )

            row = {
                "model": name,
                "fold": fold_idx,
            }
            row.update(metrics)
            all_rows.append(row)

    cv_df = pd.DataFrame(all_rows)
    summary = (
        cv_df
        .groupby("model")
        .agg(["mean", "std"])
        .round(4)
    )

    return cv_df, summary


# Run CV benchmark (can be expensive, esp. TabPFN)
cv_results, cv_summary = crossval_benchmark(models, X, y, n_splits=5)

cv_summary


===== Fold 1/5 =====

=== Training TabPFN (fold 1) ===
Predicting with TabPFN (fold 1)...

=== Training MLP (fold 1) ===
Predicting with MLP (fold 1)...

=== Training XGBoost (fold 1) ===
Predicting with XGBoost (fold 1)...

===== Fold 2/5 =====

=== Training TabPFN (fold 2) ===
Predicting with TabPFN (fold 2)...

=== Training MLP (fold 2) ===
Predicting with MLP (fold 2)...

=== Training XGBoost (fold 2) ===
Predicting with XGBoost (fold 2)...

===== Fold 3/5 =====

=== Training TabPFN (fold 3) ===
Predicting with TabPFN (fold 3)...


In [None]:
cv_results