# Cross Validation
- How do we evaluate the quality of an estimator?

In [13]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np

# Fetch the iris dataset and pull out the feature matrix and the class labels
data = load_iris()
X, y = data.data, data.target

# Optional step: standardize the features.
# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split is made, so rows that are later held out still contribute
# to the scaling statistics. Wrap scaler + model in a Pipeline if a strictly
# leakage-free evaluation is required.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

# A single estimator instance, shared by the cross-validation cells below
model = LogisticRegression(max_iter=200)

## Complementary Subsets
- Random complementary subsets 
- Train on a larger subset, test on a smaller one
- Multiple rounds to decrease variance

In [14]:
def manual_complementary_subsets_cv(X, y, model, subset_size=20, repeats=5, random_state=None):
    """Score ``model`` over repeated random subset / complement splits.

    In every round ``subset_size`` distinct row indices are drawn at random,
    ``model`` is fitted on those rows and then scored on all remaining rows.

    NOTE(review): the model is fitted on the *sampled subset* and evaluated
    on its complement, so with the defaults the training set is the smaller
    of the two. If the usual "train on the larger part, test on the smaller
    part" hold-out is intended, swap the two index sets.

    Parameters
    ----------
    X, y : array-like
        Features and targets. Both must support indexing with an integer
        index array (e.g. NumPy arrays) and have the same length.
    model : object
        Anything exposing ``fit(X, y)`` and ``score(X, y)``. The same object
        is fitted again in every round.
    subset_size : int, default 20
        Number of rows drawn (without replacement) for fitting.
    repeats : int, default 5
        Number of independent rounds.
    random_state : int or None, default None
        ``None`` draws from NumPy's global random state, exactly like the
        unseeded version of this function did. Any other value seeds a
        private ``np.random.RandomState`` so the splits are reproducible.

    Returns
    -------
    list
        One score per round, in the order the rounds were run.

    Raises
    ------
    ValueError
        If ``subset_size`` does not leave at least one row on each side of
        the split.
    """
    n = len(X)
    if not 0 < subset_size < n:
        raise ValueError(
            f"subset_size must be between 1 and {n - 1}, got {subset_size}"
        )

    # Fall back to the module-level functions so np.random.seed() still works.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    all_idx = np.arange(n)
    scores = []

    for _ in range(repeats):
        subset_idx = rng.choice(n, subset_size, replace=False)

        # A boolean mask yields the (sorted) complement in O(n) instead of
        # scanning subset_idx once for every candidate index.
        in_subset = np.zeros(n, dtype=bool)
        in_subset[subset_idx] = True
        complement_idx = all_idx[~in_subset]

        model.fit(X[subset_idx], y[subset_idx])
        scores.append(model.score(X[complement_idx], y[complement_idx]))

    return scores

# Run the repeated subset/complement evaluation with its default settings
scores_comp = manual_complementary_subsets_cv(X=X, y=y, model=model)
print("Manual Complementary Subsets CV Scores:", scores_comp)

Manual Complementary Subsets CV Scores: [0.7461538461538462, 0.9153846153846154, 0.8692307692307693, 0.9230769230769231, 0.8846153846153846]


In [17]:
from sklearn.model_selection import PredefinedSplit, cross_val_score
from sklearn.utils import shuffle

# Permute features and labels in unison so the leading rows are a random sample
X_shuf, y_shuf = shuffle(X, y, random_state=42)

subset_size = 20

# PredefinedSplit convention: an entry of -1 keeps that sample out of every
# test fold (it is only ever trained on); any other entry is the index of the
# test fold the sample belongs to. Here the first `subset_size` shuffled rows
# are training-only and all remaining rows form the single test fold 1.
test_fold = np.where(np.arange(len(X_shuf)) < subset_size, -1.0, 1.0)

ps = PredefinedSplit(test_fold)

scores_ps = cross_val_score(estimator=model, X=X_shuf, y=y_shuf, cv=ps)

print("Sklearn Complementary Subsets (PredefinedSplit) Scores:", scores_ps)

Sklearn Complementary Subsets (PredefinedSplit) Scores: [0.86153846]


## Leave-One-Out
- For each point, we train on all the other points and test on that one
- Testing on $n$ points requires $n$ trainings
- Expensive!

In [18]:
def manual_leave_one_out(X, y, model):
    """Leave-one-out evaluation of ``model``.

    Every row is held out once: ``model`` is fitted on all other rows and
    then scored on the single held-out row.

    Parameters
    ----------
    X, y : array-like
        Features and targets; both must support indexing with an integer
        index array (e.g. NumPy arrays).
    model : object
        Anything exposing ``fit(X, y)`` and ``score(X, y)``. The same object
        is fitted again for every held-out row.

    Returns
    -------
    list
        One score per row, in row order.
    """
    sample_count = len(X)
    per_sample_scores = []

    for held_out in range(sample_count):
        # Everything before the held-out row, followed by everything after it
        keep = np.array([*range(held_out), *range(held_out + 1, sample_count)])
        single = np.array([held_out])

        model.fit(X[keep], y[keep])
        per_sample_scores.append(model.score(X[single], y[single]))

    return per_sample_scores

# One score per held-out sample; their mean is the leave-one-out accuracy
scores_loo = manual_leave_one_out(X=X, y=y, model=model)
print("Manual LOO CV Accuracy:", np.mean(scores_loo))

Manual LOO CV Accuracy: 0.9533333333333334


In [19]:
from sklearn.model_selection import LeaveOneOut

# Same procedure via scikit-learn: the splitter produces one split per sample
loo = LeaveOneOut()
scores_loo = cross_val_score(estimator=model, X=X, y=y, cv=loo)

print("Sklearn LOO CV Accuracy:", np.mean(scores_loo))

Sklearn LOO CV Accuracy: 0.9533333333333334


## $k$-Folds
- Create $k$ partitions of (roughly) equal size, e.g., $k=2$ yields two subsets
- Hold out a single partition for testing and train on the other $(k\!-\!1)$ partitions
- Repeat for all $k$ partitions - requires $k$ trainings
- Leave-One-Out is a special case with $k=n$

In [20]:
def manual_kfold_cv(X, y, model, k=5, random_state=None):
    """Shuffled k-fold cross-validation of ``model``.

    The row indices are shuffled and split into ``k`` folds whose sizes
    differ by at most one. Each fold is used once as the test set while the
    model is fitted on the remaining ``k - 1`` folds.

    Parameters
    ----------
    X, y : array-like
        Features and targets; both must support indexing with an integer
        index array (e.g. NumPy arrays).
    model : object
        Anything exposing ``fit(X, y)`` and ``score(X, y)``. The same object
        is fitted again for every fold.
    k : int, default 5
        Number of folds; must satisfy ``2 <= k <= len(X)``.
    random_state : int or None, default None
        ``None`` shuffles with NumPy's global random state, exactly like the
        unseeded version of this function did. Any other value seeds a
        private ``np.random.RandomState`` so the folds are reproducible.

    Returns
    -------
    list
        One score per fold, in fold order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of rows (which
        would leave an empty training set or empty test folds).
    """
    n = len(X)
    if not 2 <= k <= n:
        raise ValueError(f"k must be between 2 and {n}, got {k}")

    # Fall back to the module-level functions so np.random.seed() still works.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    indices = np.arange(n)
    rng.shuffle(indices)
    folds = np.array_split(indices, k)

    scores = []
    for i, test_idx in enumerate(folds):
        # All folds except the i-th one, kept in their original fold order
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])

        model.fit(X[train_idx], y[train_idx])
        scores.append(model.score(X[test_idx], y[test_idx]))

    return scores

# Five shuffled folds, each used once as the held-out test set
scores_kfold = manual_kfold_cv(X=X, y=y, model=model, k=5)
print("Manual K-Fold Scores:", scores_kfold)

Manual K-Fold Scores: [0.9, 1.0, 0.9333333333333333, 1.0, 0.9666666666666667]


In [21]:
from sklearn.model_selection import KFold, cross_val_score

# scikit-learn equivalent: shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores_kf = cross_val_score(estimator=model, X=X, y=y, cv=kf)

print("Sklearn K-Fold Scores:", scores_kf)

Sklearn K-Fold Scores: [1.         0.96666667 0.93333333 0.93333333 0.96666667]
