In [None]:
import numpy as np

#### Split data into random train and test subsets.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Toy dataset: 7 samples with 2 features each.
# Equivalent construction: np.arange(14).reshape((7, 2)) + 1
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]])

# Binary labels: five samples of class 0 followed by two of class 1.
y = np.array([0, 0, 0, 0, 0, 1, 1])

# Hold out 20% of the samples for testing. The split is shuffled, so a fixed
# random_state is passed to make the output identical on every
# "Restart Kernel -> Run All"; change the seed to see a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Show the training features followed by their labels, one object per line.
print(X_train, y_train, sep="\n")

In [None]:
# Show the held-out features followed by their labels, one object per line.
print(X_test, y_test, sep="\n")

#### Stratified train-test split
Ensures that the relative class frequencies are approximately preserved in both the training and the test datasets. This matters when the target classes are heavily imbalanced: for instance, there could be several times more negative samples than positive samples.

In [None]:
# Toy dataset: 9 samples with 2 features each, holding the values 1..18.
X = np.arange(18).reshape((9, 2)) + 1
print('X:', X)

# Imbalanced labels: six samples of class 0 and three of class 1.
y = [0,0,0,0,0,0,1,1,1]

# stratify=y asks train_test_split to keep the class proportions of y
# (approximately) the same in both subsets. A fixed random_state makes the
# shuffled split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=0
)

print("=== Training dataset ===")
print(X_train)
print(y_train)
print("=== Test dataset ===")
print(X_test)
print(y_test)

---
#### ShuffleSplit: random permutation cross-validator

Yields indices to split data into training and test sets.

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html

In [None]:
from sklearn.model_selection import ShuffleSplit

In [None]:
# Toy dataset: 8 samples with 2 features each, i.e. the rows
# [1, 2], [3, 4], ..., [15, 16].
X = np.arange(1, 17).reshape(8, 2)
# One binary label per sample.
y = np.array([0, 1, 0, 1] + [0, 1, 1, 1])

In [None]:
# Five independent shuffled splits, each holding out 25% of the samples;
# the fixed seed makes the index sets reproducible.
rs = ShuffleSplit(
    n_splits=5,
    test_size=0.25,
    random_state=0,
)
print(rs)

In [None]:
# Number of splitting iterations the cross-validator `rs` will produce
# (displayed as the cell output).
rs.get_n_splits(X)

In [None]:
# split() returns a generator object: the (train_index, test_index) pairs are
# only produced when it is iterated, so this cell just displays the generator.
rs.split(X)

In [None]:
# Iterate over the shuffled splits and report which row indices fall into the
# training and test subsets of each one.
for i, split in enumerate(rs.split(X)):
    train_index, test_index = split
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}",
        f"  Test:  index={test_index}",
        sep="\n",
    )

In [None]:
# Specify both train and test size. The two fractions sum to less than 1, so in
# every split some samples are left out of both the training and the test set.
# (The original `test_size =. 25` was a SyntaxError: a float literal cannot
# contain a space between the dot and the digits.)
rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=0.25, random_state=0)
for i, (train_index, test_index) in enumerate(rs.split(X)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

#### Classical K-fold cross-validation

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Toy dataset: 7 samples with 2 features each, i.e. the rows
# [1, 2], [3, 4], ..., [13, 14].
X = np.arange(1, 15).reshape(7, 2)
# Four samples of class 0 followed by three of class 1.
y = np.array([0] * 4 + [1] * 3)
# Three consecutive folds, taken in the original sample order (no shuffling).
kf = KFold(
    n_splits=3,
    shuffle=False,
)
print(kf)

Number of splitting iterations in the cross-validator

In [None]:
# Ask the cross-validator how many train/test iterations it will produce and
# keep the value for the fold-size arithmetic further down.
n_splits = kf.get_n_splits(X=X)
print(n_splits)

In [None]:
# The number of samples is the length of the first axis of X.
n_samples = X.shape[0]
print(n_samples)

In [None]:
# split() returns a generator object; the (train_index, test_index) pairs are
# only produced when it is iterated, so this cell just displays the generator.
kf.split(X)

In [None]:
# Iterate over the K-fold splits and report which row indices fall into the
# training and test subsets of each fold.
for i, split in enumerate(kf.split(X)):
    train_index, test_index = split
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}",
        f"  Test:  index={test_index}",
        sep="\n",
    )
    # To inspect the actual rows rather than the indices, additionally print
    # X[train_index], y[train_index] and X[test_index], y[test_index].

The first `n_samples % n_splits` folds have size `n_samples // n_splits + 1`; the other folds have size `n_samples // n_splits`, where `n_samples` is the number of samples.

In [None]:
# Number of leading folds that receive one extra sample
# (the remainder of dividing the samples evenly among the folds).
n_samples % n_splits

In [None]:
# Size of the larger folds, i.e. the first `n_samples % n_splits` folds.
n_samples // n_splits + 1

In [None]:
# Size of the remaining (smaller) folds.
n_samples // n_splits

---
#### StratifiedKFold ensures a data split that preserves the proportion of classes

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Toy dataset: 6 samples with 2 features each, i.e. the rows
# [1, 2], [3, 4], ..., [11, 12].
X = np.arange(1, 13).reshape(6, 2)
# Imbalanced labels: four samples of class 0 followed by two of class 1.
y = np.array([0] * 4 + [1] * 2)

In [None]:
# Two stratified folds, taken without shuffling the samples first.
skf = StratifiedKFold(
    n_splits=2,
    shuffle=False,
)
print(skf)

In [None]:
# Number of splitting iterations the cross-validator `skf` will produce
# (displayed as the cell output).
skf.get_n_splits(X, y)

In [None]:
# Iterate over the stratified folds; the labels y are passed to split() along
# with X. Report the row indices of each fold's training and test subsets.
for i, split in enumerate(skf.split(X, y)):
    train_index, test_index = split
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}",
        f"  Test:  index={test_index}",
        sep="\n",
    )

---
#### RepeatedStratifiedKFold repeats stratified K-Fold n times with different randomization in each repetition.

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedStratifiedKFold.html

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Toy dataset: 4 samples with 2 features each, i.e. the rows
# [1, 2], [3, 4], [5, 6], [7, 8].
X = np.arange(1, 9).reshape(4, 2)
# Balanced labels: two samples per class.
y = np.array([0] * 2 + [1] * 2)

In [None]:
# Stratified 2-fold cross-validation repeated 3 times; the fixed seed makes
# the randomization of every repetition reproducible.
rskf = RepeatedStratifiedKFold(
    n_splits=2,
    n_repeats=3,
    random_state=36_851_234,
)
print(rskf)

In [None]:
# Total number of splitting iterations across all repetitions
# (expected to be n_splits * n_repeats for the `rskf` configured above).
rskf.get_n_splits(X, y)

In [None]:
# Iterate over every fold of every repetition and report the row indices of
# its training and test subsets.
for i, split in enumerate(rskf.split(X, y)):
    train_index, test_index = split
    print(
        f"Fold {i}:",
        f"  Train: index={train_index}",
        f"  Test:  index={test_index}",
        sep="\n",
    )