In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

In [5]:
def load_data(data_root, train_batch_size, test_batch_size):
    mean, std_dev = [0.1307], [0.3081]
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std_dev)
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std_dev)
    ])
    trainset = MNIST(
        data_root, train=True, download=True,
        transform=transform_train
    )
    testset = MNIST(
        data_root, train=False, download=True,
        transform=transform_test
    )
    train_loader = DataLoader(
        trainset, train_batch_size, shuffle=False,
        num_workers=2, pin_memory=True
    )
    test_loader = DataLoader(
        testset, test_batch_size, shuffle=False,
        num_workers=2, pin_memory=True
    )
    return train_loader, test_loader


class LeNet_300_100(nn.Module):
    def __init__(self):
        super(LeNet_300_100, self).__init__()
        self.linear0 = nn.Linear(28*28, 300)
        self.relu0 = nn.ReLU(inplace=True)
        self.linear1 = nn.Linear(300, 100)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(100, 10)

    def forward(self, x):
        x = torch.flatten(x, start_dim=1)
        x = self.relu0(self.linear0(x))
        x = self.relu1(self.linear1(x))
        x = self.linear2(x)
        return x

train_batch_size, test_batch_size = 32, 16
data_root = "../../../../data/torchvision"
train_loader, test_loader = load_data(
    data_root, train_batch_size, test_batch_size
)
model = LeNet_300_100()
x, y = next(iter(train_loader))
x.shape, y.shape

(torch.Size([32, 1, 32, 32]), torch.Size([32]))

# Train a Neural Network
训练神经网络的步骤包括：
1. 获得数据集，并对其进行预处理；
2. 定义模型；
3. 在训练集上训练模型：
    1. 将数据送入网络得到输出，并根据其计算损失；
    2. 根据损失进行反向传播以更新网络参数；
    3. 回到步骤 A. 直至遍历整个数据集；
4. 在验证集上进行验证（可省略）；
5. 回到步骤 3. 直至遍历所有训练 epoch；
6. 在测试集上对模型进行测试，



## Load data and define a model
前两步的相关教程参见[数据预处理教程](http://localhost:8888/notebooks/Help_Viewer_Python/PyTorch/pytorch/Tutorial/01.Data_Preprocessing.ipynb)和[模型定义教程](http://localhost:8888/notebooks/Help_Viewer_Python/PyTorch/pytorch/Tutorial/02.Define_a_Model.ipynb)；下面仍基于 MNIST 数据集演示训练神经网络的过程；数据预处理和模型定义如下：

```python
def load_data(data_root, train_batch_size, test_batch_size):
    mean, std_dev = [0.1307], [0.3081]
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std_dev)
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std_dev)
    ])
    trainset = MNIST(
        data_root, train=True, download=True,
        transform=transform_train
    )
    testset = MNIST(
        data_root, train=False, download=True,
        transform=transform_test
    )
    train_loader = DataLoader(
        trainset, train_batch_size, shuffle=False,
        num_workers=2, pin_memory=True
    )
    test_loader = DataLoader(
        testset, test_batch_size, shuffle=False,
        num_workers=2, pin_memory=True
    )
    return train_loader, test_loader


class LeNet_300_100(nn.Module):
    def __init__(self):
        super(LeNet_300_100, self).__init__()
        self.linear0 = nn.Linear(28*28, 300)
        self.relu0 = nn.ReLU(inplace=True)
        self.linear1 = nn.Linear(300, 100)
        self.relu1 = nn.ReLU(inplace=True)
        self.linear2 = nn.Linear(100, 10)

    def forward(self, x):
        x = torch.flatten(x, start_dim=1)
        x = self.relu0(self.linear0(x))
        x = self.relu1(self.linear1(x))
        x = self.linear2(x)
        return x

train_batch_size, test_batch_size = 32, 16
data_root = "../../../../data/torchvision"
train_loader, test_loader = load_data(
    data_root, train_batch_size, test_batch_size
)
model = LeNet_300_100()
```

## Define a loss function and metrics
PyTorch 中损失函数统一以`loss_fn(outputs, targets)`形式接收的参数，`torch.nn` API 中定义了多种内置的损失函数的接口，例如`MSELoss`、`nn.CrossEntropyLoss`、`nn.KLDivLoss`等；
```python
class AvgMetric(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.counter = 0
        self.accumulator = 0.0
        self.value = 0.0

    def update(self, value, n):
        self.value = value
        self.counter += n
        self.accumulator += value

    def result():
        return self.accumulator / self.counter
```


## Back propagate and update the params
得到损失`loss`后，调用`loss.backward()`便可以实现自动求导，详见自动求导机制；更新参数的一种经典方法是 SGD 算法，其实现方式可以是手动实现；或实例化一个`torch.optim` API 中的优化函数，再调用`optimizer.step()`即可；示例如下：

```python
# manually
for param in model.parameters():
    param.data.sub_(param.grad.data * 0.01)

# use the built-in method
optimizer.zero_grad()
optimizer = optim.SGD(model.parameters(), lr=0.01)
outputs = net(inputs)
loss = loss_fn(outputs, targets)
loss.backward()
optimizer.step()
```

将对数据集的遍历、训练 epoch 的遍历、模型验证和测试考虑进去，便有以下代码：
```python
def train(epochs):
    loss_metric = AvgMetric()
    for epoch in range(epochs):
        for step, (inputs, targets) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            loss.backward()  # backprop
            optimizer.step() # update params
            loss_metric.update(loss, inputs.shape[0])

            if (step+1) % 200 == 0:
                print("Epoch[{}/{}], step[{}/{}]:\tLoss={:.4f}".format(
                    epoch+1, epochs, step+1,
                    len(train_loader.dataset) // train_batch_size,
                    loss_metric.result()
                ))


model = LeNet_300_100()
optimizer = optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
train(2)
```