# Stratified Cross Validation

In many real-world scenarios, data is imbalanced (e.g., 10% “positive” vs. 90% “negative”). In standard K-fold cross validation the split is made at random, so individual folds may not reflect the true class distribution—some folds might contain very few or no positive samples, which can lead to misleading performance metrics. Stratified K-fold cross validation avoids this by splitting each class separately, so that every fold keeps approximately the same class proportions as the full dataset.

In [4]:
import numpy as np

from sklearn.datasets import load_iris

In [5]:
# Load the Iris dataset and unpack it into the feature matrix X and the
# class-label vector y.
iris = load_iris()
X, y = iris.data, iris.target


In [None]:
# Distinct class labels present in y (np.unique returns them sorted).
labels = np.unique(y)


array([0, 1, 2])

In [13]:
# Append the labels as the last column so that each row carries both its
# features and its class.
X_with_y = np.column_stack((X, y))

In [22]:
# Select only the rows whose label (last column) equals 2.
X_with_y[X_with_y[:, -1] == 2]

array([[6.3, 3.3, 6. , 2.5, 2. ],
       [5.8, 2.7, 5.1, 1.9, 2. ],
       [7.1, 3. , 5.9, 2.1, 2. ],
       [6.3, 2.9, 5.6, 1.8, 2. ],
       [6.5, 3. , 5.8, 2.2, 2. ],
       [7.6, 3. , 6.6, 2.1, 2. ],
       [4.9, 2.5, 4.5, 1.7, 2. ],
       [7.3, 2.9, 6.3, 1.8, 2. ],
       [6.7, 2.5, 5.8, 1.8, 2. ],
       [7.2, 3.6, 6.1, 2.5, 2. ],
       [6.5, 3.2, 5.1, 2. , 2. ],
       [6.4, 2.7, 5.3, 1.9, 2. ],
       [6.8, 3. , 5.5, 2.1, 2. ],
       [5.7, 2.5, 5. , 2. , 2. ],
       [5.8, 2.8, 5.1, 2.4, 2. ],
       [6.4, 3.2, 5.3, 2.3, 2. ],
       [6.5, 3. , 5.5, 1.8, 2. ],
       [7.7, 3.8, 6.7, 2.2, 2. ],
       [7.7, 2.6, 6.9, 2.3, 2. ],
       [6. , 2.2, 5. , 1.5, 2. ],
       [6.9, 3.2, 5.7, 2.3, 2. ],
       [5.6, 2.8, 4.9, 2. , 2. ],
       [7.7, 2.8, 6.7, 2. , 2. ],
       [6.3, 2.7, 4.9, 1.8, 2. ],
       [6.7, 3.3, 5.7, 2.1, 2. ],
       [7.2, 3.2, 6. , 1.8, 2. ],
       [6.2, 2.8, 4.8, 1.8, 2. ],
       [6.1, 3. , 4.9, 1.8, 2. ],
       [6.4, 2.8, 5.6, 2.1, 2. ],
       [7.2, 3

In [None]:
import numpy as np


def k_fold_cv(data, k):
    """Split ``data`` into ``k`` shuffled (train, test) cross-validation folds.

    The rows are shuffled once using NumPy's global random state and then
    partitioned into ``k`` contiguous test blocks whose sizes differ by at
    most one. Each fold uses one block as its test set and all remaining
    rows as its training set. The caller's ``data`` is never modified.

    Args:
        data: Array-like whose first axis indexes the samples.
        k: Number of folds; must satisfy ``2 <= k <= number of samples``.

    Returns:
        A list of ``k`` tuples ``(train_data, test_data)``.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number
            of samples.
    """
    data = np.asarray(data)
    n = data.shape[0]
    if not 2 <= k <= n:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n}); got {k}"
        )

    # Index with a permutation rather than calling np.random.shuffle:
    # shuffle works in place and returns None, so assigning its result
    # would discard the data, and it would also reorder the caller's array.
    shuffled = data[np.random.permutation(n)]

    totals = []
    # array_split yields exactly k blocks even when n is not divisible by k,
    # whereas stepping by n // k would create an extra, smaller fold.
    for test_idx in np.array_split(np.arange(n), k):
        mask = np.ones(n, dtype=bool)
        mask[test_idx] = False
        # list.append takes a single argument, so store the pair as a tuple.
        totals.append((shuffled[mask], shuffled[~mask]))
    return totals
        

def k_fold(data, k, shuffle=True):
    """Split ``data`` into ``k`` (train, test) cross-validation folds.

    The samples are partitioned into ``k`` contiguous test blocks whose
    sizes differ by at most one. Each fold uses one block as its test set
    and all remaining rows as its training set, so every sample appears in
    exactly one test set. The caller's ``data`` is never modified.

    Args:
        data: Array-like whose first axis indexes the samples.
        k: Number of folds; must satisfy ``2 <= k <= number of samples``.
        shuffle: When true (the default), the rows are randomly permuted
            using NumPy's global random state before splitting. When
            false, the folds follow the original row order.

    Returns:
        A list of ``k`` tuples ``(train_data, test_data)``.

    Raises:
        ValueError: If ``k`` is smaller than 2 or larger than the number
            of samples.
    """
    data = np.asarray(data)
    n = data.shape[0]
    if not 2 <= k <= n:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n}); got {k}"
        )

    # Permute a copy via fancy indexing so the caller's array keeps its order
    # (np.random.shuffle would reorder it in place).
    rows = data[np.random.permutation(n)] if shuffle else data

    folds = []
    # array_split yields exactly k blocks even when n is not divisible by k.
    for test_idx in np.array_split(np.arange(n), k):
        mask = np.ones(n, dtype=bool)
        mask[test_idx] = False
        # mask marks the training rows; its complement is the held-out block.
        folds.append((rows[mask], rows[~mask]))
    return folds
    

def stratified_k_fold(data, k, shuffle=True, random_state=None):
    """Split ``data`` into ``k`` (train, test) folds that preserve class ratios.

    The rows of each class are divided into ``k`` nearly equal chunks, and
    chunk ``i`` of every class goes to the test set of fold ``i``. Each
    fold's test set therefore has approximately the same class proportions
    as the full dataset, and every row appears in exactly one test set.
    When a class size is not divisible by ``k``, the extra rows go to the
    earlier folds; a class with fewer than ``k`` rows is simply absent from
    some test sets. Rows inside each returned array keep their original
    relative order, and ``data`` itself is never modified.

    Args:
        data: 2D array-like where the last column is the label,
            shape ``(num_samples, num_features + 1)``.
        k: Number of folds; must satisfy ``2 <= k <= num_samples``.
        shuffle: When true (the default), the rows of each class are
            randomly permuted before being divided among the folds. When
            false, they are divided in their original order.
        random_state: Optional seed that makes the shuffling reproducible.
            When ``None``, NumPy's global random state is used.

    Returns:
        A list of ``k`` tuples ``(train_data, test_data)``.

    Raises:
        ValueError: If ``data`` is not 2-dimensional, or if ``k`` is
            smaller than 2 or larger than the number of samples.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(
            f"data must be 2-dimensional with the label in the last column; "
            f"got {data.ndim} dimension(s)"
        )
    n = data.shape[0]
    if not 2 <= k <= n:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n}); got {k}"
        )

    # A dedicated generator keeps a seeded call reproducible without
    # reseeding the global NumPy state as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Encode each row's label as an integer code 0..num_classes-1.
    _, label_codes = np.unique(data[:, -1], return_inverse=True)
    label_codes = np.ravel(label_codes)

    # fold_of[row] is the index of the fold whose test set holds that row.
    # Every row belongs to exactly one class, so every entry gets assigned.
    fold_of = np.empty(n, dtype=int)
    for code in range(int(label_codes.max()) + 1):
        member_idx = np.flatnonzero(label_codes == code)
        if shuffle:
            member_idx = rng.permutation(member_idx)
        # array_split always yields exactly k chunks (possibly empty ones),
        # so each class contributes to each fold in a consistent way.
        for fold, chunk in enumerate(np.array_split(member_idx, k)):
            fold_of[chunk] = fold

    return [(data[fold_of != fold], data[fold_of == fold]) for fold in range(k)]

        
    
    
    
    
    
    
        
        
    