In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Test, Training, and Validation Sets
- A completely randomly sampled split
 							
- A stratified split
 							
- A split chosen in a non-random way, so that the test and/or validation sets more accurately represent the data the system will see once it is deployed


In [3]:
# 1. randomly sampled split
def train_test_index_split_random(n, k, seed=42):
    """Produce k random train/test index splits over ``range(n)``.

    The indices ``0..n-1`` are shuffled once with a generator seeded by
    ``seed`` and cut into ``k`` consecutive chunks via ``np.array_split``.
    Chunk ``i`` is the test set of split ``i``; the other chunks, kept in
    their original chunk order, are concatenated into its training set.

    Args:
        n: Number of samples to index.
        k: Number of folds.
        seed: Seed passed to ``np.random.default_rng``.

    Returns:
        A list of ``k`` ``(train_idx, test_idx)`` tuples of index arrays.
    """
    generator = np.random.default_rng(seed)
    shuffled = np.arange(n)
    generator.shuffle(shuffled)
    chunks = np.array_split(shuffled, k)

    pairs = []
    for held_out, test_idx in enumerate(chunks):
        # Everything except the held-out chunk, preserving chunk order.
        remaining = [chunk for pos, chunk in enumerate(chunks) if pos != held_out]
        pairs.append((np.concatenate(remaining), test_idx))
    return pairs

In [9]:
#2. stratified split
def train_test_index_split_stratified(y, k, seed=42):
    """Build k stratified train/test index splits from the labels ``y``.

    For every distinct label, the matching positions are shuffled and cut
    into ``k`` nearly equal pieces with ``np.array_split``; piece ``i`` of
    each class is assigned to fold ``i``. Fold ``i`` is the test set of
    split ``i`` and the remaining folds, in order, form its training set.

    Note: ``np.array_split`` hands leftover elements to the earliest pieces,
    so low-numbered folds receive the remainder of every class.

    Args:
        y: Array-like of class labels, one per sample.
        k: Number of folds.
        seed: Seed passed to ``np.random.default_rng``.

    Returns:
        A list of ``k`` ``(train_idx, test_idx)`` tuples. Both entries are
        always integer index arrays, including when a fold is empty.
    """
    rng = np.random.default_rng(seed)
    y = np.asarray(y)
    pieces_per_fold = [[] for _ in range(k)]
    # distribute classes evenly across folds
    for cls in np.unique(y):
        indexes = rng.permutation(np.where(y == cls)[0])
        for pieces, piece in zip(pieces_per_fold, np.array_split(indexes, k)):
            pieces.append(piece)

    # Keep the pieces as integer arrays: building folds from Python lists
    # turns an empty fold into a float64 array, which cannot be used to index.
    folds = [
        np.concatenate(pieces) if pieces else np.empty(0, dtype=np.intp)
        for pieces in pieces_per_fold
    ]

    splits = []
    for i in range(k):
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        splits.append((train_idx, folds[i]))
    return splits

In [16]:
import numpy as np

def cross_validate(X, y, k=5, split="stratified", seed=42, model_fn=None):
    """Run k-fold cross-validation over ``X`` and ``y``.

    Args:
        X: Array-like of samples; indexed along its first axis.
        y: Array-like of labels, one per sample.
        k: Number of folds.
        split: ``"stratified"`` to split with
            ``train_test_index_split_stratified`` or ``"random"`` to split
            with ``train_test_index_split_random``.
        seed: Seed forwarded to the chosen split function.
        model_fn: Optional callable invoked as
            ``model_fn(X_train, y_train, X_test, y_test)``. Its return value
            must support item assignment, because the fold number is stored
            in it under the key ``"fold"``. When ``None``, the raw fold data
            is returned instead.

    Returns:
        A list with one entry per fold (folds are numbered from 1). Without
        ``model_fn`` each entry is a dict holding the fold number, the
        train/test indices and the corresponding slices of ``X`` and ``y``.
        With ``model_fn`` each entry is the object it returned, with
        ``"fold"`` added.

    Raises:
        ValueError: If ``split`` is neither ``"stratified"`` nor ``"random"``.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # split method
    if split == "stratified":
        splits = train_test_index_split_stratified(y, k, seed)
    elif split == "random":
        splits = train_test_index_split_random(len(y), k, seed)
    else:
        raise ValueError("split type not found")

    results = []
    for fold, (train_idx, test_idx) in enumerate(splits, 1):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        if model_fn is None:
            results.append({
                "fold": fold,
                "train_idx": train_idx, "test_idx": test_idx,
                "X_train": X_train, "y_train": y_train,
                "X_test": X_test, "y_test": y_test,
            })
        else:
            # TODO: create model_fn for the three methods
            metrics = model_fn(X_train, y_train, X_test, y_test)
            metrics["fold"] = fold
            results.append(metrics)
    return results