### Imports

In [None]:
import os
import numpy as np
import pandas as pd
from typing import Tuple, Optional, Dict, Any, List

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import ClassifierMixin

from joblib import Parallel, delayed

# Optional progress support
try:
    from tqdm.autonotebook import tqdm
    from tqdm_joblib import tqdm_joblib
    _HAS_TQDM = True
except Exception:
    _HAS_TQDM = False

### Data Loading

In [None]:
# --- 1) Data loading

def load_fcc_dataset(folder_path: str) -> Tuple[np.ndarray, np.ndarray, List[str], Dict[int, str]]:
    """
    Load a class-per-subfolder CSV dataset from a given root folder.

    Structure expected:
        folder_path/
            classA/
                file1.csv
                file2.csv
                ...
            classB/
                file1.csv
                ...

    Each CSV is read with ``pandas.read_csv`` (first row is the header), its values are
    transposed, and the result becomes one sample. Subfolders and files are visited in
    sorted order; entries directly under ``folder_path`` that are not directories and
    files whose name does not end with ".csv" are ignored.

    Parameters
    ----------
    folder_path : str
        Root folder that contains one subfolder per class.

    Returns
    -------
    X : np.ndarray, shape (n_samples, n_csv_columns, n_csv_rows), dtype float
        All transposed CSVs stacked along axis 0.
    y : np.ndarray, shape (n_samples,), dtype int
        Integer labels (0..n_classes-1) following the sorted subfolder order.
    files : List[str]
        File paths in the order they were added to X/y.
    label_map : Dict[int, str]
        Mapping from integer label -> class (subfolder) name.

    Raises
    ------
    ValueError
        If there are no class subfolders, no CSV files, or the CSV files do not all
        produce the same sample shape (the message names the offending file).
    """
    subfolders = [
        d for d in sorted(os.listdir(folder_path))
        if os.path.isdir(os.path.join(folder_path, d))
    ]
    if not subfolders:
        raise ValueError(f"No class subfolders found under: {folder_path}")

    data: List[np.ndarray] = []
    labels: List[int] = []
    files: List[str] = []

    for label, sub in enumerate(subfolders):
        sub_path = os.path.join(folder_path, sub)
        for fname in sorted(os.listdir(sub_path)):
            if not fname.endswith(".csv"):
                continue
            fpath = os.path.join(sub_path, fname)
            sample = pd.read_csv(fpath).values.transpose()
            # Fail early with the file name instead of letting np.array() raise an
            # opaque "inhomogeneous shape" error after everything has been read.
            if data and sample.shape != data[0].shape:
                raise ValueError(
                    f"Inconsistent sample shape in '{fpath}': {sample.shape} "
                    f"!= {data[0].shape} (shape of '{files[0]}')"
                )
            data.append(sample)
            labels.append(label)
            files.append(fpath)

    if not data:
        raise ValueError(f"No CSV files found under class subfolders in: {folder_path}")

    X = np.array(data, dtype=float)
    y = np.array(labels, dtype=int)
    label_map = dict(enumerate(subfolders))

    print(f"[load_fcc_dataset] Loaded from: {folder_path}")

    return X, y, files, label_map

### Data Splits and Training/Testing Suite

In [3]:
# --- 2) Single-split train/test training + optional model return

def _predict_labels_generic(
    clf: ClassifierMixin, X: np.ndarray, use_argmin: bool = False
) -> np.ndarray:
    """
    Predict labels from per-class scores, optionally picking the smallest score.

    Resolution order:
      - ``predict_proba`` -> argmin over columns if ``use_argmin`` else argmax
      - ``decision_function`` -> argmax (a 1-D binary score s is expanded to [-s, s])
      - ``predict`` -> returned as-is

    Parameters
    ----------
    clf : fitted classifier
        Object exposing at least one of the three methods above.
    X : np.ndarray
        Samples to predict; forwarded unchanged to the classifier.
    use_argmin : bool
        Only affects the ``predict_proba`` branch.

    Returns
    -------
    np.ndarray
        One predicted label per sample. The winning column index is translated through
        ``clf.classes_`` when that attribute exists and has one entry per score column;
        otherwise the column index itself is returned (the previous behaviour, which is
        only correct when the training labels are exactly 0..n_classes-1).
    """
    if hasattr(clf, "predict_proba"):
        scores = np.asarray(clf.predict_proba(X))
        pick = np.argmin if use_argmin else np.argmax
        winners = pick(scores, axis=1)
    elif hasattr(clf, "decision_function"):
        scores = np.asarray(clf.decision_function(X))
        if scores.ndim == 1:  # binary case: positive score means the second class
            scores = np.column_stack([-scores, scores])
        winners = np.argmax(scores, axis=1)
    else:
        return clf.predict(X)

    # Column j of the score matrix belongs to classes_[j] in sklearn-style estimators,
    # so the index is only a valid label after this lookup.
    classes = getattr(clf, "classes_", None)
    if classes is not None and len(classes) == scores.shape[1]:
        return np.asarray(classes)[winners]
    return winners

In [4]:
def train_test_once(
    classifier: ClassifierMixin,
    X: np.ndarray,
    y: np.ndarray,
    test_size: float = 0.2,
    random_state: int = 42,
    use_argmin: bool = False,
    return_model: bool = False,
    predefined_indices: Optional[Tuple[np.ndarray, np.ndarray]] = None,
) -> Dict[str, Any]:
    """
    Fit ``classifier`` on one train split and score it on the matching test split.

    Parameters
    ----------
    classifier : ClassifierMixin
        Classifier to fit; it is fitted in place.
    X, y : np.ndarray
        Samples and labels, indexed along axis 0.
    test_size : float
        Test fraction used when a new split has to be drawn.
    random_state : int
        Seed for the new split.
    use_argmin : bool
        Forwarded to ``_predict_labels_generic``.
    return_model : bool
        When True the fitted classifier is added to the result under 'model'.
    predefined_indices : (train_idx, test_idx) or None
        Ready-made index pair. When None, a split stratified on ``y`` is drawn with
        ``train_test_split``.

    Returns
    -------
    result : dict
        Keys 'accuracy', 'y_true', 'y_pred', 'train_idx', 'test_idx', plus 'model'
        when ``return_model`` is True.
    """
    if predefined_indices is not None:
        train_idx, test_idx = predefined_indices
    else:
        all_rows = np.arange(len(y))
        train_idx, test_idx = train_test_split(
            all_rows,
            test_size=test_size,
            random_state=random_state,
            stratify=y,
        )

    # Materialise all four subsets before fitting, as indexing errors should surface first.
    X_fit = X[train_idx]
    X_eval = X[test_idx]
    y_fit = y[train_idx]
    y_eval = y[test_idx]

    classifier.fit(X_fit, y_fit)
    predicted = _predict_labels_generic(classifier, X_eval, use_argmin=use_argmin)

    result: Dict[str, Any] = {
        "accuracy": accuracy_score(y_eval, predicted),
        "y_true": y_eval,
        "y_pred": predicted,
        "train_idx": train_idx,
        "test_idx": test_idx,
    }
    if return_model:
        result["model"] = classifier
    return result

In [None]:
# ---------- helper: dataset distance ----------
def _dataset_distance(X_ref: np.ndarray, X_new: np.ndarray) -> float:
    """Return the fraction of elements where ``X_new - X_ref`` is non-zero (0.0 if empty)."""
    delta = X_new - X_ref
    changed = np.count_nonzero(delta)
    n_elements = delta.size
    if n_elements <= 0:
        return 0.0
    return float(changed) / float(n_elements)

# ---------- helper: two-level traversal ----------
def _discover_two_level_versions(versions_root: str) -> list[tuple[str, str]]:
    """
    List every directory that sits exactly two levels below ``versions_root``.

    Returns (version_name, version_path) pairs where version_name is
    '<level1>/<level2>' (e.g. 'chance_xx/change_yy'). Both levels are walked in sorted
    order and non-directory entries are ignored. Raises ValueError when nothing is found.
    """
    def _sorted_subdirs(parent: str):
        """Yield (name, path) for each directory directly under ``parent``, sorted by name."""
        for entry in sorted(os.listdir(parent)):
            entry_path = os.path.join(parent, entry)
            if os.path.isdir(entry_path):
                yield entry, entry_path

    found = [
        (f"{outer}/{inner}", inner_path)
        for outer, outer_path in _sorted_subdirs(versions_root)
        for inner, inner_path in _sorted_subdirs(outer_path)
    ]
    if len(found) == 0:
        raise ValueError(f"No two-level version folders found under: {versions_root}")
    return found

# ---------- main: CV experiment with train/val/test (parallelize VERSIONS within each fold) ----------
def run_experiment_cv(
    benchmark_folder: str,
    versions_root: str,
    model: ClassifierMixin,
    model_params: Any,
    *,
    time_slice: Optional[int] = 60,
    n_splits: int = 5,
    val_size: float = 0.2,   # fraction of TRAIN+VAL used as VAL within each fold
    random_state: int = 42,
    use_argmin_benchmark: bool = True,
    use_argmin_eval: bool = True,
    enforce_shape_match: bool = True,
    n_jobs: int = -1,        # parallelizes the versions within each fold
    verbose: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, np.ndarray]]:
    """
    Stratified ``n_splits``-fold CV on the benchmark dataset with train/val/test per fold.

    For every fold a benchmark model is fitted on the benchmark train split and scored
    on the benchmark val/test splits. Then, for each version dataset found under
    ``versions_root``, the SAME fold indices are reused to
      - score the benchmark-trained model on the version's val & test splits
        ("benchmark_model"), and
      - fit a fresh model on the version's train split and score it on the version's
        val & test splits ("alt_model").
    Folds run sequentially; the versions inside one fold run through joblib.

    Parameters
    ----------
    benchmark_folder : str
        Root folder of the benchmark dataset (read with ``load_fcc_dataset``).
    versions_root : str
        Folder whose two-level subfolders each hold one version dataset.
    model : callable
        Invoked as ``model(params=model_params)`` to build each fresh, unfitted
        classifier, so in practice this is a class or factory rather than an instance.
    model_params : Any
        Passed through as the ``params`` keyword of ``model``.
    time_slice : int or None
        When not None, benchmark and versions are cut to ``[:, :, :time_slice]``.
    n_splits : int
        Number of stratified folds.
    val_size : float
        Fraction of each fold's train+val pool that becomes the validation split.
    random_state : int
        Seed of the fold splitter; the val split of fold i uses ``random_state + i``.
    use_argmin_benchmark : bool
        ``use_argmin`` flag when scoring the benchmark model on benchmark data.
    use_argmin_eval : bool
        ``use_argmin`` flag when scoring either model on version data.
    enforce_shape_match : bool
        When True, versions whose sample count or (sliced) shape differs from the
        benchmark are skipped. When False they are only warned about and processing
        continues. NOTE(review): a version with a different sample count is then still
        indexed with the benchmark's fold indices - confirm this is intended.
    n_jobs : int
        ``n_jobs`` of the joblib ``Parallel`` call that processes the versions.
    verbose : bool
        When True, print the one-line per-version summary produced by each worker.

    Returns
    -------
    fold_results : pd.DataFrame
        One row per (fold, dataset_version, evaluation, split) with columns 'fold',
        'dataset_version', 'evaluation', 'split', 'accuracy',
        'dataset_distance_to_benchmark'. Benchmark rows use the dataset_version
        '__benchmark__' and distance 0.0.
    summary : pd.DataFrame
        ``fold_results`` aggregated over folds: 'accuracy_mean', 'accuracy_std' and the
        first 'dataset_distance_to_benchmark' per (dataset_version, evaluation, split).
    indices_meta : Dict[str, np.ndarray]
        Keys 'fold_{i}_train_idx', 'fold_{i}_val_idx', 'fold_{i}_test_idx'.
    """
    # 1) Load & optionally time-slice BENCHMARK dataset
    X_bench, y_bench, _, _ = load_fcc_dataset(benchmark_folder)
    X_bench_used = X_bench if time_slice is None else X_bench[:, :, :time_slice]
    print(f"[Info] Benchmark loaded: X={X_bench.shape}, sliced={X_bench_used.shape}, y={y_bench.shape}")

    # 2) Prepare version discovery and shape checks
    versions = _discover_two_level_versions(versions_root)
    total_samples = len(y_bench)
    print(f"[Info] Discovered {len(versions)} version(s) under '{versions_root}'")

    def _load_version_or_skip(vname: str, vpath: str):
        """Load one version; return (X, y), or None when it must be skipped."""
        try:
            Xv, yv, _, _ = load_fcc_dataset(vpath)
        except Exception as e:
            # Deliberately broad: one unreadable version must not abort the whole run.
            print(f"[WARN] Skipping version '{vname}' due to load error: {e}")
            return None
        if Xv.shape[0] != total_samples:
            msg = (f"[WARN] Version '{vname}' has #samples {Xv.shape[0]} "
                   f"!= benchmark {total_samples}.")
            if enforce_shape_match:
                print(msg + " Skipping.")
                return None
            print(msg + " Continuing.")
        if time_slice is not None:
            Xv = Xv[:, :, :time_slice]
        # shape full check (including time dimension)
        if Xv.shape != X_bench_used.shape:
            msg = (f"[WARN] Version '{vname}' shape {Xv.shape} "
                   f"!= benchmark slice shape {X_bench_used.shape}.")
            if enforce_shape_match:
                print(msg + " Skipping.")
                return None
            print(msg + " Continuing.")
        return Xv, yv

    def _score_splits(
        clf: Any,
        splits,
        *,
        use_argmin: bool,
        fold_id: int,
        version_name: str,
        evaluation: str,
        distance: float,
    ) -> List[Dict[str, Any]]:
        """Return one result row per (split_name, X, y) triple for the fitted ``clf``."""
        rows: List[Dict[str, Any]] = []
        for split_name, X_, y_ in splits:
            y_pred = _predict_labels_generic(clf, X_, use_argmin=use_argmin)
            rows.append({
                "fold": fold_id,
                "dataset_version": version_name,
                "evaluation": evaluation,
                "split": split_name,
                "accuracy": float((y_pred == y_).mean()),
                "dataset_distance_to_benchmark": float(distance),
            })
        return rows

    # 3) CV splitting (folds stay sequential; versions inside each fold are parallelized)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_splits = list(enumerate(skf.split(np.arange(total_samples), y_bench)))
    print(f"[Info] Prepared {n_splits}-fold stratified CV")

    fold_entries: List[Dict[str, Any]] = []
    indices_meta: Dict[str, np.ndarray] = {}

    # --- worker for a single VERSION inside a given fold ---
    def _process_version(
        fold_id: int,
        version_name: str,
        version_path: str,
        tr_idx_local: np.ndarray,
        val_idx_local: np.ndarray,
        test_idx: np.ndarray,
        bench_model_fold: Any,   # fitted on benchmark train of this fold
    ):
        """Return (result rows, one-line log) for one version inside one fold."""
        loaded = _load_version_or_skip(version_name, version_path)
        if loaded is None:
            # The detailed warning has already been printed by the loader.
            return [], f"[Fold {fold_id}] Skipping version '{version_name}' due to load/shape issues."
        Xv, yv = loaded

        # Distance is computed on the full (sliced) arrays, not per split.
        ds_distance = _dataset_distance(X_bench_used, Xv)

        # Slice version with SAME indices as the fold
        Xtr_v, ytr_v = Xv[tr_idx_local], yv[tr_idx_local]
        eval_splits = (
            ("val", Xv[val_idx_local], yv[val_idx_local]),
            ("test", Xv[test_idx], yv[test_idx]),
        )

        # (A) Evaluate BENCHMARK model (trained on benchmark train) on version val/test
        out_entries = _score_splits(
            bench_model_fold, eval_splits,
            use_argmin=use_argmin_eval, fold_id=fold_id, version_name=version_name,
            evaluation="benchmark_model", distance=ds_distance,
        )

        # (B) Train ALT model on version train, evaluate on version val/test
        alt_clf = model(params=model_params)
        alt_clf.fit(Xtr_v, ytr_v)
        out_entries.extend(_score_splits(
            alt_clf, eval_splits,
            use_argmin=use_argmin_eval, fold_id=fold_id, version_name=version_name,
            evaluation="alt_model", distance=ds_distance,
        ))

        acc = {(row["evaluation"], row["split"]): row["accuracy"] for row in out_entries}
        log_line = (
            f"[Fold {fold_id}] Version '{version_name}': "
            f"bench_val_acc={acc[('benchmark_model', 'val')]:.4f}, "
            f"bench_test_acc={acc[('benchmark_model', 'test')]:.4f}, "
            f"alt_val_acc={acc[('alt_model', 'val')]:.4f}, "
            f"alt_test_acc={acc[('alt_model', 'test')]:.4f}, "
            f"ds_distance={float(ds_distance):.4f}"
        )
        return out_entries, log_line

    # --- process folds sequentially; parallelize versions within each fold ---
    for fold_id, (trainval_idx, test_idx) in fold_splits:
        print(f"[Info] Processing fold {fold_id+1}/{n_splits} …")

        # Stratified VAL split inside the train+val pool
        tr_idx_local, val_idx_local = train_test_split(
            trainval_idx,
            test_size=val_size,
            random_state=random_state + fold_id,  # vary seed per fold
            stratify=y_bench[trainval_idx],
        )

        # Train a fresh BENCHMARK model on the benchmark train split (per fold)
        bench_model_fold = model(params=model_params)
        bench_model_fold.fit(X_bench_used[tr_idx_local], y_bench[tr_idx_local])

        # Evaluate benchmark model on benchmark val/test
        fold_entries.extend(_score_splits(
            bench_model_fold,
            (
                ("val", X_bench_used[val_idx_local], y_bench[val_idx_local]),
                ("test", X_bench_used[test_idx], y_bench[test_idx]),
            ),
            use_argmin=use_argmin_benchmark, fold_id=fold_id,
            version_name="__benchmark__", evaluation="benchmark_model", distance=0.0,
        ))

        # Cache indices so callers can reproduce the exact splits
        indices_meta[f"fold_{fold_id}_train_idx"] = tr_idx_local
        indices_meta[f"fold_{fold_id}_val_idx"]   = val_idx_local
        indices_meta[f"fold_{fold_id}_test_idx"]  = test_idx

        # One joblib task per version; built once and shared by both branches below.
        tasks = [
            delayed(_process_version)(
                fold_id, vname, vpath, tr_idx_local, val_idx_local, test_idx, bench_model_fold
            )
            for (vname, vpath) in versions
        ]
        if _HAS_TQDM:
            print(f"[Info]  └─ Evaluating {len(versions)} version(s) in parallel …")
            with tqdm_joblib(tqdm(total=len(versions), desc=f"Fold {fold_id} versions", leave=True)):
                results = Parallel(n_jobs=n_jobs, prefer="processes")(tasks)
        else:
            print(f"[Info]  └─ Evaluating {len(versions)} version(s) in parallel (tqdm not available) …")
            results = Parallel(n_jobs=n_jobs, prefer="processes")(tasks)

        # Collect per-version results for this fold
        for out_entries, log_line in results:
            fold_entries.extend(out_entries)
            if verbose and log_line:
                print(log_line)

    # Build per-fold results
    fold_results = pd.DataFrame(fold_entries).sort_values(
        ["dataset_version", "evaluation", "split", "fold"]
    ).reset_index(drop=True)

    # Aggregate across folds
    summary = (
        fold_results
        .groupby(["dataset_version", "evaluation", "split"], as_index=False)
        .agg(accuracy_mean=("accuracy", "mean"),
             accuracy_std=("accuracy", "std"),
             dataset_distance_to_benchmark=("dataset_distance_to_benchmark", "first"))
        .sort_values(["dataset_version", "evaluation", "split"])
        .reset_index(drop=True)
    )

    print("[Info] Completed. Rows -> fold_results:", len(fold_results), " | summary:", len(summary))
    return fold_results, summary, indices_meta

  from tqdm.autonotebook import tqdm


In [None]:
def save_results(
    df: pd.DataFrame,
    *,
    split: str = "test",   # 'val' or 'test'
    save_csv_path: Optional[str] = None,
):
    """
    Write a results table to CSV, provided it contains rows for the requested split.

    Parameters
    ----------
    df : pd.DataFrame
        Per-fold or summary results; must have 'split' and 'evaluation' columns.
    split : str
        Split that must be present ('val' or 'test'). It is only used as a sanity
        check: the WHOLE ``df`` is written, not just the rows of this split.
    save_csv_path : str or None
        Destination file. Missing parent directories are created. When None, nothing
        is written.

    Returns
    -------
    None
        Nothing is written when ``df`` is None/empty, when no 'benchmark_model' or
        'alt_model' row exists for ``split``, or when ``save_csv_path`` is None; a short
        message is printed in each case.
    """
    if df is None or df.empty:
        print("No results to plot.")
        return

    # Only the presence of matching rows matters, so no aggregation is needed here.
    rows_for_split = df[df["split"] == split]
    has_scores = rows_for_split["evaluation"].isin(["benchmark_model", "alt_model"]).any()
    if not has_scores:
        print(f"No data to plot for split='{split}'.")
        return

    if save_csv_path is None:
        # df.to_csv(None) would only build a string and then claim a file was saved.
        print("No save_csv_path given; results were not written.")
        return

    # to_csv does not create missing folders; do it here for path-like targets.
    if isinstance(save_csv_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(save_csv_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(save_csv_path, index=False)
    print(f"Saved results to: {save_csv_path}")

### Experiments

In [None]:
# classifiers
from classifiers.WDI_1NN import WDI_1NN
from classifiers.ACM_SVM import ACM_SVM
from classifiers.CASIM import CASIM
from classifiers.EAC_1NN import EAC_1NN
from classifiers.MBW_LR import MBW_LR

# ---------------- Params 
# Each *_params object below is handed to its classifier unchanged, as
# `model(params=model_params)` inside run_experiment_cv. What the individual keys
# mean is defined by the classes in the `classifiers` package, not in this file.
wdi_1nn_params = {"template_threshold": 0.5}
# ACM_SVM is constructed with params=None - presumably it falls back to its own
# defaults; verify in classifiers.ACM_SVM.
acm_svm_params = None
casim_params = {
    "num_features": 672,
    "n_estimators": 1,
    "n_jobs_multirocket": 1,
    "random_state": 42,
    # 10 logarithmically spaced values from 1e-3 to 1e3
    # (presumably candidate regularisation strengths - TODO confirm in CASIM).
    "alphas": np.logspace(-3, 3, 10),
}
eac_1nn_params = {"attenuation_coefficient_per_min": 0.001}  # alternative value noted by the author: 0.0667
mbw_lr_params = {
    "penalty": None,
    "fit_intercept": False,
    "solver": "lbfgs",
    # NOTE(review): if MBW_LR forwards this to sklearn's LogisticRegression, the
    # `multi_class` argument is deprecated in recent scikit-learn releases - confirm.
    "multi_class": "ovr",
    "decision_bounds": True,
    "confidence_interval": 1.96,
}

# list of models and model_params to evaluate
# Each entry is (model class, params, display name, use_argmin).
#   - the display name is printed and, lower-cased with '-' -> '_', becomes part of
#     the result CSV file name;
#   - use_argmin is passed as both use_argmin_benchmark and use_argmin_eval, i.e. the
#     predicted class is the column with the SMALLEST predict_proba value when True
#     (presumably because the 1-NN models return distances - TODO confirm).
models_to_evaluate = [
    (WDI_1NN, wdi_1nn_params, "WDI-1NN", True),
    (CASIM, casim_params, "CASIM", False),
    (EAC_1NN, eac_1nn_params, "EAC-1NN", True),
    (MBW_LR, mbw_lr_params, "MBW-LR", False),
    (ACM_SVM, acm_svm_params, "ACM-SVM", False),
]

#### FCC Evaluation

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists BEFORE the (potentially long) experiments start; otherwise a
# missing folder would only surface after a full cross-validation run.
os.makedirs("results", exist_ok=True)

# Run the cross-validated robustness experiment on the FCC data for every model
# and store the per-fold results in one CSV per model.
for model, model_params, model_name, use_argmin in models_to_evaluate:
    print(f"\n=== Running experiment for model: {model_name} ===")
    fold_results, summary, indices_meta = run_experiment_cv(
        benchmark_folder="data/fcc/benchmark",
        versions_root="data/fcc/perturbed_data",
        model=model,
        model_params=model_params,
        time_slice=60,
        n_splits=5,
        val_size=0.2,
        random_state=42,
        use_argmin_benchmark=use_argmin,
        use_argmin_eval=use_argmin,
        enforce_shape_match=True,
    )
    # save results
    save_results(fold_results, split="test",
                 save_csv_path=f"results/fcc_results_{model_name.lower().replace('-', '_')}.csv")

#### TEP Evaluation

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists BEFORE the (potentially long) experiments start; otherwise a
# missing folder would only surface after a full cross-validation run.
os.makedirs("results", exist_ok=True)

# Run the cross-validated robustness experiment on the TEP data for every model
# and store the per-fold results in one CSV per model.
for model, model_params, model_name, use_argmin in models_to_evaluate:
    print(f"\n=== Running experiment for model: {model_name} ===")
    fold_results, summary, indices_meta = run_experiment_cv(
        benchmark_folder="data/tep/benchmark",
        versions_root="data/tep/perturbed_data",
        model=model,
        model_params=model_params,
        time_slice=60,
        n_splits=5,
        val_size=0.2,
        random_state=42,
        use_argmin_benchmark=use_argmin,
        use_argmin_eval=use_argmin,
        enforce_shape_match=True,
    )
    # save results
    save_results(fold_results, split="test",
                 save_csv_path=f"results/tep_results_{model_name.lower().replace('-', '_')}.csv")