In [1]:
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold


def get_folds(
    X,
    y,
    cv_type="kfold",
    n_splits=5,
    shuffle=True,
    random_state=42,
    groups=None,
):
    """
    Unified CV builder for Kaggle workflows.

    Parameters
    ----------
    X : array-like
        Samples to split; forwarded unchanged to the splitter's ``split``.
    y : array-like or None
        Targets. Converted to a NumPy array when given. Required for
        ``cv_type="stratified"``; may be None for the other modes.
    cv_type : {"kfold", "stratified", "group"}
        Which splitter to build (matched case-insensitively).
    n_splits : int
        Number of folds.
    shuffle : bool
        Forwarded to KFold / StratifiedKFold. Not used for "group",
        where GroupKFold is built from ``n_splits`` only.
    random_state : int or None
        Seed forwarded to KFold / StratifiedKFold when ``shuffle`` is True.
        Ignored when ``shuffle`` is False.
    groups : array-like or None
        Group label per sample; required for ``cv_type="group"``.

    Returns
    -------
    list of (train_idx, valid_idx) pairs, one per fold.

    Raises
    ------
    ValueError
        If ``cv_type`` is unknown, if ``y`` is missing or any class has
        fewer than ``n_splits`` samples in stratified mode, or if ``groups``
        is missing in group mode.
    """

    cv_type = cv_type.lower()

    if cv_type not in ("kfold", "stratified", "group"):
        raise ValueError(f"Unknown cv_type: {cv_type}")

    # Convert to numpy for safe indexing. None is kept as-is: wrapping it
    # would produce a 0-d object array that the splitters reject.
    if y is not None:
        y = np.asarray(y)

    # scikit-learn raises ValueError when random_state is set while
    # shuffle=False, so the default seed must be dropped in that case.
    seed = random_state if shuffle else None

    # -------------------------
    # Stratified KFold
    # -------------------------
    if cv_type == "stratified":
        if y is None:
            raise ValueError("y must be provided for StratifiedKFold.")

        # Safety: each class must appear at least n_splits times
        _, counts = np.unique(y, return_counts=True)
        if np.any(counts < n_splits):
            raise ValueError(
                "Some classes have fewer samples than n_splits. "
                "StratifiedKFold would be invalid."
            )

        splitter = StratifiedKFold(
            n_splits=n_splits,
            shuffle=shuffle,
            random_state=seed,
        )
        return list(splitter.split(X, y))

    # -------------------------
    # Group KFold
    # -------------------------
    if cv_type == "group":
        if groups is None:
            raise ValueError("groups must be provided for GroupKFold.")

        splitter = GroupKFold(n_splits=n_splits)
        return list(splitter.split(X, y, groups))

    # -------------------------
    # Standard KFold
    # -------------------------
    splitter = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=seed,
    )
    return list(splitter.split(X, y))


In [3]:
import numpy as np

# Toy data: 12 rows, an alternating binary target, and five groups of
# sizes 2/3/3/2/2, so every splitter can produce five folds.
X = np.zeros(shape=(12, 3))
y = np.tile([0, 1], 6)
groups = np.repeat([1, 2, 3, 4, 5], [2, 3, 3, 2, 2])

# Plain K-fold over row positions.
print("KFold:")
print(get_folds(X, y, cv_type="kfold"))

# Class-balanced folds.
print("\nStratified:")
print(get_folds(X, y, cv_type="stratified"))

# Folds that keep every group on one side of the split.
print("\nGroup:")
print(get_folds(X, y, cv_type="group", groups=groups))

KFold:
[(array([ 1,  2,  3,  4,  5,  6,  7,  8, 11]), array([ 0,  9, 10])), (array([ 0,  1,  3,  4,  6,  7,  9, 10, 11]), array([2, 5, 8])), (array([ 0,  2,  3,  4,  5,  6,  7,  8,  9, 10]), array([ 1, 11])), (array([ 0,  1,  2,  3,  5,  6,  8,  9, 10, 11]), array([4, 7])), (array([ 0,  1,  2,  4,  5,  7,  8,  9, 10, 11]), array([3, 6]))]

Stratified:
[(array([ 1,  4,  5,  6,  7,  8,  9, 10, 11]), array([0, 2, 3])), (array([ 0,  1,  2,  3,  4,  8,  9, 10, 11]), array([5, 6, 7])), (array([ 0,  2,  3,  4,  5,  6,  7,  8,  9, 11]), array([ 1, 10])), (array([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 10]), array([ 8, 11])), (array([ 0,  1,  2,  3,  5,  6,  7,  8, 10, 11]), array([4, 9]))]

Group:
[(array([ 0,  1,  2,  3,  4,  8,  9, 10, 11]), array([5, 6, 7])), (array([ 0,  1,  5,  6,  7,  8,  9, 10, 11]), array([2, 3, 4])), (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([10, 11])), (array([ 0,  1,  2,  3,  4,  5,  6,  7, 10, 11]), array([8, 9])), (array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11]), 