# Data Splitting and Cross-Validation

We use `scikit-learn` to manage our data splits correctly.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, KFold

# 1. Generate dummy data: 50 samples with 2 features each (100 values in total).
N_SAMPLES = 50
N_FEATURES = 2

# Row i of X is [2*i, 2*i + 1]. Deriving both shapes from the same constants
# keeps X and y aligned if the sizes are ever changed.
X = np.arange(N_SAMPLES * N_FEATURES).reshape((N_SAMPLES, N_FEATURES))
y = np.arange(N_SAMPLES)  # one integer label per sample: 0..N_SAMPLES-1

print(f"X shape: {X.shape}, y shape: {y.shape}")

## 1. Simple Train-Test Split

In [None]:
# Hold out 20% of the samples for testing; the remaining 80% are used for training.
# A fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Train size: {len(X_train)} samples")
print(f"Test size: {len(X_test)} samples")

# The samples are shuffled before splitting, so these labels are not in
# their original ascending order.
print(f"First 5 Test Labels: {y_test[:5]}")

## 2. K-Fold Cross Validation

In [None]:
# 5-fold cross-validation: the data is partitioned into 5 folds and each fold
# serves as the validation set once. shuffle=True with a fixed random_state
# gives a randomised but repeatable assignment of samples to folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# enumerate(..., start=1) numbers the folds without a hand-maintained counter.
for fold_n, (train_index, val_index) in enumerate(kf.split(X), start=1):
    print(f"Fold {fold_n}:")
    print(f"  Train Indices: {len(train_index)}")  # count of training rows
    print(f"  Val Indices: {val_index}")  # row positions held out in this fold
    # In practice, you would fit the model on X[train_index], y[train_index]
    # and score it on X[val_index], y[val_index].

## 3. Avoiding Leakage (Preprocessing)
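**Wrong way:** scale the whole dataset, then split — the scaler's mean and standard deviation are then computed partly from test rows, so information leaks from the test set into training.

**Right way:** split first, fit the scaler on the training set only, then use that fitted scaler to transform both the training and test sets.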

In [None]:
from sklearn.preprocessing import StandardScaler

# 1. Split first, so the test rows never influence any preprocessing step.
#    A fixed random_state keeps the split (and the means printed below)
#    the same on every run, like the other splits in this file.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Initialize Scaler
scaler = StandardScaler()

# 3. Fit ONLY on training data: the per-feature mean and standard deviation
#    are learned from X_train alone.
scaler.fit(X_train)

# 4. Transform both sets with the statistics learned from the training set.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The training mean is 0 up to floating-point rounding. The test mean is
# generally non-zero because the scaler was not fitted on the test rows.
print("Mean of Train (should be 0):", np.mean(X_train_scaled))
print("Mean of Test (will NOT be exactly 0):", np.mean(X_test_scaled))