# Sesión 12 — CNN completas para clasificación (PyTorch)

En esta sesión entrenaremos una **CNN completa** para MNIST y la compararemos con un MLP.

**Objetivos**
- Construir una CNN end-to-end
- Entrenarla para clasificación
- Comparar parámetros y rendimiento con un MLP

Pregunta guía: **¿Por qué una CNN generaliza mejor con menos parámetros?**


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader

# Seed PyTorch's global RNG so repeated runs start from the same state.
torch.manual_seed(0)

# Use the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device


In [None]:
# Dataset: MNIST (downloaded into ./data on first use)
try:
    from torchvision import datasets, transforms

    # The same transform is applied to both splits: convert images to tensors.
    tfm = transforms.Compose([transforms.ToTensor()])
    train_ds = datasets.MNIST(root="./data", train=True, download=True, transform=tfm)
    test_ds = datasets.MNIST(root="./data", train=False, download=True, transform=tfm)
except Exception as e:
    # Chain explicitly so the traceback reports the original failure as the
    # direct cause instead of as an unrelated error raised inside the handler.
    raise RuntimeError(f"Error cargando MNIST: {e}") from e

# Shuffle only the training split; evaluation order does not matter.
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False)

# Last expression of the cell: number of training and test samples.
len(train_ds), len(test_ds)


## Modelo CNN

Arquitectura:
- 2 bloques Conv → ReLU → Pool
- Flatten
- 2 capas densas


In [None]:
class CNN(nn.Module):
    """Small convolutional classifier for 1-channel 28x28 images.

    ``features`` stacks two Conv(3x3, padding=1) -> ReLU -> MaxPool(2) stages
    (1 -> 16 -> 32 channels), halving the spatial size twice (28 -> 14 -> 7).
    ``classifier`` maps the flattened 32*7*7 features to 10 class logits
    through one hidden layer of 128 units.
    """

    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 28x28 -> 14x14
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 14x14 -> 7x7
        )
        self.classifier = nn.Sequential(
            nn.Linear(32 * 7 * 7, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        )

    def forward(self, x):
        """Return logits of shape (batch, 10) for input of shape (batch, 1, 28, 28)."""
        x = self.features(x)
        # torch.flatten copies when it has to, whereas .view() raises on
        # non-contiguous activations (e.g. a model converted to channels_last).
        x = torch.flatten(x, start_dim=1)
        return self.classifier(x)

# Build the CNN, move it to the selected device, and show its parameter count.
cnn = CNN()
cnn = cnn.to(device)
sum(weights.numel() for weights in cnn.parameters())


In [None]:
# MLP used as the comparison baseline
class MLP(nn.Module):
    """Fully connected classifier: flatten -> 784 -> 256 -> 128 -> 10 logits."""

    def __init__(self):
        super().__init__()
        widths = (28 * 28, 256, 128, 10)
        stack = [nn.Flatten()]
        # One Linear + ReLU pair per consecutive pair of widths.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # the output layer emits raw logits, so drop the last ReLU
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten each input sample and return its 10 class logits."""
        logits = self.net(x)
        return logits

# Build the MLP, move it to the selected device, and show its parameter count.
mlp = MLP()
mlp = mlp.to(device)
sum(weights.numel() for weights in mlp.parameters())


In [None]:
# Cross-entropy loss applied to the model's logits; used by both
# train_epoch and eval_epoch below.
criterion = nn.CrossEntropyLoss()

def train_epoch(model, loader, optimizer, *, loss_fn=None, target_device=None):
    """Train ``model`` for one pass over ``loader`` and return mean metrics.

    Args:
        model: Network to optimise; it is switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Optional callable ``(logits, targets) -> scalar loss``.
            Defaults to the module-level ``criterion``.
        target_device: Optional device the batches are moved to.
            Defaults to the module-level ``device``.

    Returns:
        ``(mean_loss, accuracy)`` averaged over every sample seen.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    # Fall back to the notebook-wide defaults only when nothing was passed, so
    # explicit arguments make the function usable without those globals.
    loss_fn = criterion if loss_fn is None else loss_fn
    target_device = device if target_device is None else target_device

    model.train()
    total_loss, correct, n = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(target_device), y.to(target_device)
        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()

        batch_size = x.size(0)
        # Weight the batch loss by its size so the epoch value is a per-sample
        # average even when the last batch is smaller (assumes loss_fn returns
        # a batch mean, as the default CrossEntropyLoss does).
        total_loss += loss.item() * batch_size
        correct += (logits.argmax(dim=1) == y).sum().item()
        n += batch_size
    return total_loss / n, correct / n

@torch.no_grad()
def eval_epoch(model, loader, *, loss_fn=None, target_device=None):
    """Evaluate ``model`` over ``loader`` without tracking gradients.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        loss_fn: Optional callable ``(logits, targets) -> scalar loss``.
            Defaults to the module-level ``criterion``.
        target_device: Optional device the batches are moved to.
            Defaults to the module-level ``device``.

    Returns:
        ``(mean_loss, accuracy)`` averaged over every sample seen.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    # Fall back to the notebook-wide defaults only when nothing was passed, so
    # explicit arguments make the function usable without those globals.
    loss_fn = criterion if loss_fn is None else loss_fn
    target_device = device if target_device is None else target_device

    model.eval()
    total_loss, correct, n = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(target_device), y.to(target_device)
        logits = model(x)
        loss = loss_fn(logits, y)

        batch_size = x.size(0)
        # Weight the batch loss by its size so the result is a per-sample
        # average even when the last batch is smaller (assumes loss_fn returns
        # a batch mean, as the default CrossEntropyLoss does).
        total_loss += loss.item() * batch_size
        correct += (logits.argmax(dim=1) == y).sum().item()
        n += batch_size
    return total_loss / n, correct / n


In [None]:
# Train both models briefly
def run(model, epochs=5, lr=1e-3):
    """Train ``model`` with Adam, printing train/test accuracy after each epoch.

    Args:
        model: Network to train; its parameters are updated in place.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate given to Adam.

    Returns:
        Test accuracy measured after the last epoch. When ``epochs`` is less
        than 1 no training happens and the accuracy of the model as given is
        returned.
    """
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    # With no epochs the loop below never binds te_acc, so evaluate once here
    # instead of failing on the final return.
    if epochs < 1:
        _, te_acc = eval_epoch(model, test_loader)
        return te_acc

    for ep in range(1, epochs + 1):
        # Only the accuracies are reported; the losses are discarded.
        _, tr_acc = train_epoch(model, train_loader, opt)
        _, te_acc = eval_epoch(model, test_loader)
        print(f"Epoch {ep:02d} | train acc={tr_acc:.3f} | test acc={te_acc:.3f}")
    return te_acc

# Train the CNN first; run() returns its test accuracy after the last epoch.
print("CNN")
acc_cnn = run(cnn)

# Same procedure for the MLP (run() creates a fresh optimizer on each call).
print("\nMLP")
acc_mlp = run(mlp)

# Last expression of the cell: both final test accuracies side by side.
acc_cnn, acc_mlp


## Discusión

Observa:
- precisión
- velocidad de convergencia
- número de parámetros

Pregunta final: **¿Por qué la CNN logra mejor desempeño con menos parámetros?**
