# TP: Techniques to Avoid Overfitting

In this practical, we explore three fundamental techniques to mitigate overfitting:

- **Early Stopping** — stop training before the model memorizes the training data
- **Batch Normalization** — normalize intermediate activations to stabilize training
- **Regularization** — constrain the model via L1/L2 penalties and Dropout

We use Fashion-MNIST with a small training subset and a simple MLP as our baseline, then progressively add each technique and compare results.

## Setup

In [None]:
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import copy

# Seed both PyTorch and NumPy so that the notebook is reproducible.
np.random.seed(42)
torch.manual_seed(42)

# Pick the compute device: CUDA GPU first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f'Using device: {device}')

## Load Fashion-MNIST

We deliberately use a **small subset** of the training data (5,000 out of 60,000 images) to make overfitting clearly visible.

We split this subset into a **training set** and a **validation set** in a 2-to-1 proportion. The validation set will be used for early stopping.

In [None]:
# Convert each image to a tensor, then standardize it with a fixed mean/std
# (presumably the Fashion-MNIST training-set statistics).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.2860,), (0.3530,)),
])

full_train_set = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
test_set = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)

# Use a small subset to encourage overfitting
subset_size = 5000
small_set = torch.utils.data.Subset(full_train_set, range(subset_size))

# Split into training (2/3) and validation (1/3)
n_train = 2 * subset_size // 3
n_val = subset_size - n_train
# A dedicated, seeded generator keeps the split identical every time this cell
# is executed, independently of how much the global RNG has been used before.
split_generator = torch.Generator().manual_seed(42)
train_set, val_set = torch.utils.data.random_split(
    small_set, [n_train, n_val], generator=split_generator
)

# Only the training batches are shuffled; evaluation order does not matter.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=256, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=256, shuffle=False)

print(f'Training samples: {len(train_set)}')
print(f'Validation samples: {len(val_set)}')
print(f'Test samples: {len(test_set)}')

Fashion-MNIST (Xiao et al., 2017) is a drop-in replacement for MNIST with 10 clothing categories: T-shirt, Trouser, Pullover, Dress, Coat, Sandal, Shirt, Sneaker, Bag, and Ankle boot. Each image is 28×28 grayscale. It is harder than MNIST, making overfitting easier to observe.

In [None]:
# Human-readable label for each of the 10 class indices.
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]

# Display the first ten training images on a 2x5 grid, titled with their class.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(12, 5))
for idx, panel in enumerate(axes.flat):
    image, target = full_train_set[idx]
    panel.imshow(image.squeeze(), cmap='gray')
    panel.set_title(class_names[target])
    panel.axis('off')
plt.tight_layout()
plt.show()

## Utility Functions

The following helpers are provided. No modification is needed.

In [None]:
def evaluate(model, data_loader):
    """Return the average loss and the accuracy of `model` over `data_loader`.

    The model is switched to evaluation mode and gradients are disabled.
    Each batch's mean cross-entropy is weighted by the batch size, so the
    result is a per-sample average even when batches have different sizes.

    Returns:
        A `(loss, accuracy)` tuple of floats.
    """
    criterion = nn.CrossEntropyLoss()
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            batch_size = batch_x.size(0)
            logits = model(batch_x)
            # The criterion averages over the batch; scale back to a sum.
            loss_sum += criterion(logits, batch_y).item() * batch_size
            n_correct += (logits.argmax(dim=1) == batch_y).sum().item()
            n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen


def plot_history(histories, title=''):
    """Draw loss and accuracy curves, side by side, for one or more runs.

    Args:
        histories: mapping from a run name to its history dict, which must
            provide the 'train_loss', 'test_loss', 'train_acc' and 'test_acc'
            sequences.
        title: optional figure-level title; nothing is added when it is empty.
    """
    fig, panels = plt.subplots(1, 2, figsize=(14, 5))
    layout = (
        ('train_loss', 'test_loss', 'Loss'),
        ('train_acc', 'test_acc', 'Accuracy'),
    )
    for panel, (train_key, test_key, caption) in zip(panels, layout):
        for run_name, run in histories.items():
            # Solid line for the training curve, dashed line for the test curve.
            panel.plot(run[train_key], label=f'{run_name} (train)')
            panel.plot(run[test_key], '--', label=f'{run_name} (test)')
        panel.set_xlabel('Epoch')
        panel.set_ylabel(caption)
        panel.set_title(caption)
        panel.legend()
    if title:
        fig.suptitle(title, fontsize=14)
    plt.tight_layout()
    plt.show()

## Baseline: A Simple MLP that Overfits

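**TODO:** Implement a simple MLP for classification over 10 classes with the following architecture. Store the first linear layer in an attribute named `fc1`: the weight-histogram cell of the L1 section reads `baseline_model.fc1.weight`.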

- Flatten
- Linear(?, 512)
- ReLU
- Linear(512, 256)
- ReLU
- Linear(?, ?)

In [None]:
class BaselineMLP(nn.Module):
    """Baseline MLP classifier for 10 classes (exercise scaffold).

    Target architecture: Flatten -> Linear(?, 512) -> ReLU ->
    Linear(512, 256) -> ReLU -> Linear(?, ?).
    """

    def __init__(self):
        super().__init__()
        # TODO: create the layers listed in the class docstring.
        # Name the first Linear layer `fc1`: the weight-histogram cell of the
        # L1 section reads `baseline_model.fc1.weight`.

    def forward(self, x):
        # TODO: apply the layers to `x`. As provided, the input is returned
        # unchanged.
        return x

**TODO:** Implement the Training Loop

**Hints:**
- Set model to training mode with `model.train()`
- For each batch: zero gradients, forward pass, compute loss, backward pass, optimizer step
- Use `outputs.argmax(dim=1)` to get predictions
- At the end of each epoch, call `evaluate(model, test_loader)` to get test metrics

In [None]:
def train(model, train_loader, test_loader, optimizer, n_epochs):
    """Train `model` for `n_epochs` epochs and record per-epoch metrics.

    Exercise scaffold: the training step and the statistics tracking inside
    the batch loop are left as TODOs.

    Args:
        model: network to train; put in training mode at the start of every
            epoch.
        train_loader: iterable of `(inputs, labels)` batches used for training.
        test_loader: data loader passed to `evaluate` after every epoch.
        optimizer: optimizer to use in the training step.
        n_epochs: number of passes over `train_loader`.

    Returns:
        A dict with keys 'train_loss', 'test_loss', 'train_acc' and
        'test_acc', each a list with one value per epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    history = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}

    for epoch in range(n_epochs):
        # evaluate() leaves the model in eval mode, so training mode is
        # re-enabled at the top of every epoch.
        model.train()
        running_loss = 0.0
        running_corrects = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # TODO: Implement the training step

            # TODO: Track statistics
            # running_loss += ...
            # running_corrects += ...
            total += inputs.size(0)

        # Per-sample averages over the whole epoch.
        train_loss = running_loss / total
        train_acc = running_corrects / total
        test_loss, test_acc = evaluate(model, test_loader)

        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)

        # Log the first epoch and then every 10th epoch.
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f'Epoch {epoch+1:3d}/{n_epochs} — '
                  f'Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} — '
                  f'Test Loss: {test_loss:.4f} Acc: {test_acc:.4f}')

    return history

In [None]:
# Number of epochs shared by all the training runs of this notebook.
N_EPOCHS = 60

# Re-seed right before building the model, as every other experiment does,
# so that the runs are comparable.
torch.manual_seed(42)
baseline_model = BaselineMLP().to(device)
optimizer = torch.optim.Adam(baseline_model.parameters(), lr=1e-3)
baseline_history = train(baseline_model, train_loader, test_loader, optimizer, N_EPOCHS)

plot_history({'Baseline': baseline_history})

The training loss keeps decreasing while the test loss starts increasing after a few epochs. The gap between train and test accuracy is the *generalization gap* — this is overfitting.

## Early Stopping

### Idea

Training a neural network too long lets it memorize the training data. **Early stopping** monitors the validation error and stops training when it starts increasing.

The procedure is:
1. Split the training data into a training set and a validation set (2-to-1 proportion)
2. Train only on the training set and evaluate the per-example error on the validation set once in a while, e.g. **every 5 epochs**
3. **Stop** training as soon as the validation error is higher than it was the last time it was checked
4. Use the weights the network had at that previous step as the result of the training run

**TODO:** Reimplement the training loop with early stopping.

1. Every 5 epochs, evaluate on `val_loader` using the `evaluate` function
2. If `val_loss` is higher than the previous check: restore the saved weights with `model.load_state_dict(...)` and `break`
3. Otherwise: save the current weights with `copy.deepcopy(model.state_dict())`

In [None]:
def train_early_stopping(model, train_loader, val_loader, test_loader, optimizer, n_epochs, check_every=5):
    """Train `model` with early stopping based on the validation loss.

    Exercise scaffold: the training step, the statistics tracking and the
    periodic validation check are left as TODOs.

    Args:
        model: network to train; put in training mode at the start of every
            epoch.
        train_loader: iterable of `(inputs, labels)` batches used for training.
        val_loader: data loader intended for the early-stopping check.
        test_loader: data loader passed to `evaluate` after every epoch.
        optimizer: optimizer to use in the training step.
        n_epochs: maximum number of epochs.
        check_every: interval, in epochs, between validation checks; also
            the logging interval.

    Returns:
        A dict with keys 'train_loss', 'test_loss', 'train_acc' and
        'test_acc', each a list with one value per completed epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    history = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}

    # Starting from +inf means that, with the comparison described in the TODO
    # below, the first validation check cannot trigger a stop.
    last_val_loss = float('inf')
    # Snapshot (deep copy) of the weights to fall back to.
    best_weights = copy.deepcopy(model.state_dict())

    for epoch in range(n_epochs):
        # evaluate() leaves the model in eval mode, so training mode is
        # re-enabled at the top of every epoch.
        model.train()
        running_loss = 0.0
        running_corrects = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # TODO: Implement the training step

            # TODO: Track statistics
            # running_loss += ...
            # running_corrects += ...
            total += inputs.size(0)

        # Per-sample averages over the whole epoch.
        train_loss = running_loss / total
        train_acc = running_corrects / total
        test_loss, test_acc = evaluate(model, test_loader)

        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)

        # TODO: Every check_every epochs, evaluate on val_loader
        # - If val_loss > last_val_loss: restore best_weights and break
        # - Otherwise: update last_val_loss and save best_weights

        # Log the first epoch and then every `check_every`-th epoch.
        if (epoch + 1) % check_every == 0 or epoch == 0:
            print(f'Epoch {epoch+1:3d}/{n_epochs} — '
                  f'Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} — '
                  f'Test Loss: {test_loss:.4f} Acc: {test_acc:.4f}')

    return history

In [None]:
# Same seed and same architecture as the baseline run, so that the two
# experiments are comparable.
torch.manual_seed(42)
es_model = BaselineMLP().to(device)
optimizer = torch.optim.Adam(es_model.parameters(), lr=1e-3)
# N_EPOCHS is only an upper bound here: training may stop earlier.
es_history = train_early_stopping(es_model, train_loader, val_loader, test_loader, optimizer, N_EPOCHS)

plot_history({'Baseline': baseline_history, 'Early Stopping': es_history})

How does the test accuracy of the early-stopped model compare to the baseline at its last epoch? What happens if you change `check_every`?

## Batch Normalization

A common first step for data analysis: normalize the input data, for example to have mean $0$ and standard deviation $1$: `X = (X - X.mean()) / X.std()`

Batch normalization applies the same transform to the outputs of a layer, using the mean and standard deviation computed over the current mini-batch. It then rescales the normalized values with two new learnable parameters: a scale $\gamma$ and a shift $\beta$.

The noise introduced by computing statistics over mini-batches acts as a mild form of regularization.

**TODO:** Adapt the baseline network to add a batch normalization layer after each hidden linear layer (the output layer is left without one):

- Flatten
- Linear
- BatchNorm1d
- ReLU
- Linear
- BatchNorm1d
- ReLU
- Linear

**Documentation:** [nn.BatchNorm1d](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html)

In [None]:
class BatchNormMLP(nn.Module):
    """Baseline MLP with batch normalization (exercise scaffold).

    Target architecture: Flatten -> Linear -> BatchNorm1d -> ReLU ->
    Linear -> BatchNorm1d -> ReLU -> Linear.
    """

    def __init__(self):
        super().__init__()
        # TODO: add batch norm layers

    def forward(self, x):
        # TODO: apply Linear -> BatchNorm -> ReLU
        # As provided, the input is returned unchanged.
        return x

In [None]:
# Re-seed before building the model, as in the other experiments.
torch.manual_seed(42)
bn_model = BatchNormMLP().to(device)
optimizer = torch.optim.Adam(bn_model.parameters(), lr=1e-3)
bn_history = train(bn_model, train_loader, test_loader, optimizer, N_EPOCHS)

# Compare against the baseline curves.
plot_history({'Baseline': baseline_history, 'BatchNorm': bn_history})

Does batch normalization reduce the generalization gap? Does the model train faster (reach a given accuracy in fewer epochs)?

## Regularization

### L1 Regularization

L1 regularization adds a penalty proportional to the absolute value of the weights:

$$\mathcal{L}_{\text{reg}} = \mathcal{L} + \lambda \sum_i |w_i|$$

L1 tends to produce **sparse** weights (many weights driven to zero, or very close to it when training with a gradient-based optimizer such as Adam), effectively performing feature selection.

**TODO:** Reimplement the training loop with L1 regularization.

In [None]:
def train_l1(model, train_loader, test_loader, optimizer, n_epochs, l1_lambda=1e-4):
    """Train `model` with an L1 penalty on its weights and record metrics.

    Exercise scaffold: the training step (including the L1 penalty) and the
    statistics tracking are left as TODOs.

    Args:
        model: network to train; put in training mode at the start of every
            epoch.
        train_loader: iterable of `(inputs, labels)` batches used for training.
        test_loader: data loader passed to `evaluate` after every epoch.
        optimizer: optimizer to use in the training step.
        n_epochs: number of passes over `train_loader`.
        l1_lambda: coefficient of the L1 penalty to add to the loss.

    Returns:
        A dict with keys 'train_loss', 'test_loss', 'train_acc' and
        'test_acc', each a list with one value per epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    history = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}

    for epoch in range(n_epochs):
        # evaluate() leaves the model in eval mode, so training mode is
        # re-enabled at the top of every epoch.
        model.train()
        running_loss = 0.0
        running_corrects = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # TODO: Implement the training step

            # TODO: Track statistics
            # running_loss += ...
            # running_corrects += ...
            total += inputs.size(0)

        # Per-sample averages over the whole epoch.
        train_loss = running_loss / total
        train_acc = running_corrects / total
        test_loss, test_acc = evaluate(model, test_loader)

        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)

        # Log the first epoch and then every 10th epoch.
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f'Epoch {epoch+1:3d}/{n_epochs} — '
                  f'Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} — '
                  f'Test Loss: {test_loss:.4f} Acc: {test_acc:.4f}')

    return history

In [None]:
# Same seed and same architecture as the baseline run, so that the two
# experiments are comparable.
torch.manual_seed(42)
l1_model = BaselineMLP().to(device)
optimizer = torch.optim.Adam(l1_model.parameters(), lr=1e-3)
# Uses the default l1_lambda of train_l1.
l1_history = train_l1(l1_model, train_loader, test_loader, optimizer, N_EPOCHS)

plot_history({'Baseline': baseline_history, 'L1': l1_history})

Let's visualize how L1 regularization promotes sparsity by comparing the weight distributions.

In [None]:
# First-layer weights of both trained models, flattened to 1-D NumPy arrays.
baseline_weights = baseline_model.fc1.weight.detach().cpu().numpy().ravel()
l1_weights = l1_model.fc1.weight.detach().cpu().numpy().ravel()

# One histogram per model, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))
panels = (
    (ax1, baseline_weights, 'Baseline — fc1 weights', {}),
    (ax2, l1_weights, 'L1 — fc1 weights', {'color': 'orange'}),
)
for panel, weights, heading, hist_style in panels:
    panel.hist(weights, bins=50, alpha=0.7, **hist_style)
    panel.set_title(heading)
    panel.set_xlabel('Weight value')

plt.tight_layout()
plt.show()

# Share of weights whose magnitude is below 0.01, for each model.
print(f'Baseline — weights near zero (<0.01): {(np.abs(baseline_weights) < 0.01).mean():.1%}')
print(f'L1       — weights near zero (<0.01): {(np.abs(l1_weights) < 0.01).mean():.1%}')

### L2 Regularization (Weight Decay)

L2 regularization adds a penalty proportional to the squared magnitude of the weights:

$$\mathcal{L}_{\text{reg}} = \mathcal{L} + \lambda \sum_i w_i^2$$

This discourages large weights, pushing the model toward simpler solutions.

We can implement it manually, just like L1.

**TODO:** Write a `train_l2` function, similar to `train_l1`, that adds an L2 penalty to the loss.

**Hints:**
- Compute the squared L2 norm of the parameters: `l2_norm = sum(p.pow(2).sum() for p in model.parameters())`
- Add it to the loss: `loss = loss + l2_lambda * l2_norm`
- Try `l2_lambda` values like `1e-3`, `1e-4`

In [None]:
def train_l2(model, train_loader, test_loader, optimizer, n_epochs, l2_lambda=1e-3):
    """Train `model` with an L2 penalty on its weights and record metrics.

    Exercise scaffold: the training step (including the L2 penalty) and the
    statistics tracking are left as TODOs.

    Args:
        model: network to train; put in training mode at the start of every
            epoch.
        train_loader: iterable of `(inputs, labels)` batches used for training.
        test_loader: data loader passed to `evaluate` after every epoch.
        optimizer: optimizer to use in the training step.
        n_epochs: number of passes over `train_loader`.
        l2_lambda: coefficient of the L2 penalty to add to the loss.

    Returns:
        A dict with keys 'train_loss', 'test_loss', 'train_acc' and
        'test_acc', each a list with one value per epoch.
    """
    loss_fn = nn.CrossEntropyLoss()
    history = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}

    for epoch in range(n_epochs):
        # evaluate() leaves the model in eval mode, so training mode is
        # re-enabled at the top of every epoch.
        model.train()
        running_loss = 0.0
        running_corrects = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # TODO: Implement the training step

            # TODO: Track statistics
            # running_loss += ...
            # running_corrects += ...
            total += inputs.size(0)

        # Per-sample averages over the whole epoch.
        train_loss = running_loss / total
        train_acc = running_corrects / total
        test_loss, test_acc = evaluate(model, test_loader)

        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)

        # Log the first epoch and then every 10th epoch.
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f'Epoch {epoch+1:3d}/{n_epochs} — '
                  f'Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} — '
                  f'Test Loss: {test_loss:.4f} Acc: {test_acc:.4f}')

    return history

In [None]:
# Same seed and same architecture as the baseline run, so that the two
# experiments are comparable.
torch.manual_seed(42)
l2_model = BaselineMLP().to(device)
optimizer = torch.optim.Adam(l2_model.parameters(), lr=1e-3)
# Uses the default l2_lambda of train_l2.
l2_history = train_l2(l2_model, train_loader, test_loader, optimizer, N_EPOCHS)

plot_history({'Baseline': baseline_history, 'L2': l2_history})

#### L2 with `weight_decay`

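Since L2 regularization is so common, PyTorch provides a shortcut: the `weight_decay` parameter of the optimizer. With `torch.optim.Adam`, it adds `weight_decay * w` to the gradient of every parameter, which is the gradient of the penalty $\frac{\texttt{weight\_decay}}{2} \sum_i w_i^2$. It is therefore equivalent to the manual L2 penalty above with $\lambda = \texttt{weight\_decay} / 2$.

Train the baseline model with `weight_decay=1e-3` using the standard `train` function and compare it with the manual version. The test curves should be close, and they match (up to floating-point noise) when `train_l2` is run with `l2_lambda=5e-4`.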

In [None]:
# Same seed and same architecture as the baseline run, so that the
# experiments are comparable.
torch.manual_seed(42)
l2_wd_model = BaselineMLP().to(device)

# TODO: create optimizer with weight_decay=1e-3
# (placeholder: the call below does not pass `weight_decay` yet)
optimizer = torch.optim.Adam(l2_wd_model.parameters(), lr=1e-3)

# The plain `train` loop is used here: no penalty is added to the loss by hand.
l2_wd_history = train(l2_wd_model, train_loader, test_loader, optimizer, N_EPOCHS)

plot_history({'Baseline': baseline_history, 'L2 (manual)': l2_history, 'L2 (weight_decay)': l2_wd_history})