# Optimizer 优化器

In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# Configure how tensors are rendered when they are printed in cell outputs.
torch.set_printoptions(edgeitems=2, threshold=50)

In [3]:
# Eleven paired measurements. Later cells feed t_u to the model as input and
# compare the predictions against t_c as the target.
t_c = torch.tensor(
    [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]
)
t_u = torch.tensor(
    [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]
)

In [4]:
# Rescaled copy of the inputs (t_u multiplied by 0.1); later cells train on it.
t_un = 0.1 * t_u

In [5]:
def model(x, w, b):
    """Linear model: scale the input ``x`` by ``w`` and shift it by ``b``.

    Args:
        x: Input value(s).
        w: Multiplicative weight.
        b: Additive bias.

    Returns:
        ``w * x + b``.
    """
    scaled = w * x
    return scaled + b

In [6]:
def loss_fn(p, y):
    """Mean squared error between predictions ``p`` and targets ``y``.

    Args:
        p: Predicted values.
        y: Target values.

    Returns:
        The mean of the element-wise squared differences.
    """
    residual = p - y
    return (residual ** 2).mean()

In [7]:
# Learnable parameter vector; it is unpacked as (w, b) when passed to model(x, *params).
# requires_grad=True asks autograd to track operations on it so gradients can be computed.
params = torch.tensor([1.0, 0.0], requires_grad=True)

In [8]:
# Check the gradient attribute before any backward pass has been run on this tensor.
params.grad is None

True

In [9]:
# Display the parameter tensor together with its shape.
params, params.shape

(tensor([1., 0.], requires_grad=True), torch.Size([2]))

`*params` 中的 `*` 是解包操作符：它会沿第 0 维把张量拆成各个元素，并依次作为位置参数传入函数。例如 `model(t_u, *params)` 等价于 `model(t_u, params[0], params[1])`。

In [10]:
# Forward pass on the raw inputs: *params unpacks the tensor into the w and b arguments of model.
loss = loss_fn(model(t_u, *params), t_c)

In [11]:
# Backpropagate from the scalar loss; this populates params.grad.
loss.backward()

In [12]:
# Inspect the gradient of the loss with respect to params.
params.grad

tensor([4517.2969,   82.6000])

In [13]:
# Reset the stored gradient in place so that a later backward() does not add onto it.
if params.grad is not None:
    params.grad.zero_()

In [14]:
# Confirm that the gradient buffer now holds zeros.
params.grad

tensor([0., 0.])

In [15]:
def training_loop(n_epochs, lr, params, x, y):
    """Fit ``params`` with hand-written gradient descent.

    Each epoch clears any stored gradient, runs ``model`` and ``loss_fn`` on
    ``(x, y)``, backpropagates, and then updates ``params`` in place by
    subtracting ``lr * params.grad``. The loss is printed every 500 epochs.

    Args:
        n_epochs: Number of update steps to perform.
        lr: Step size multiplied with the gradient.
        params: Tensor unpacked into the ``w`` and ``b`` arguments of
            ``model``; it is modified in place.
        x: Model input.
        y: Target passed to ``loss_fn``.

    Returns:
        The same ``params`` tensor after the final update.
    """
    for epoch in range(1, n_epochs + 1):
        # Clear whatever the previous backward pass left behind.
        stale_grad = params.grad
        if stale_grad is not None:
            stale_grad.zero_()

        loss = loss_fn(model(x, *params), y)
        loss.backward()

        # The parameter update itself must not be recorded by autograd.
        with torch.no_grad():
            params.sub_(lr * params.grad)

        if epoch % 500 != 0:
            continue
        print(f"Epoch: {epoch}, Loss: {loss.item():.4f}")
    return params

In [16]:
# Train on the rescaled inputs t_un, starting from w=1.0 and b=0.0, and rebind
# params to the tensor returned by the loop.
params = training_loop(
    n_epochs=5000,
    lr=1e-2,
    params=torch.tensor([1.0, 0.0], requires_grad=True),
    x=t_un,
    y=t_c,
)

Epoch: 500, Loss: 7.8601
Epoch: 1000, Loss: 3.8285
Epoch: 1500, Loss: 3.0922
Epoch: 2000, Loss: 2.9577
Epoch: 2500, Loss: 2.9331
Epoch: 3000, Loss: 2.9286
Epoch: 3500, Loss: 2.9278
Epoch: 4000, Loss: 2.9277
Epoch: 4500, Loss: 2.9277
Epoch: 5000, Loss: 2.9276


In [17]:
import torch.optim as optim

# List the names exposed by the torch.optim module.
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_functional',
 '_multi_tensor',
 'lr_scheduler',
 'swa_utils']

In [18]:
# Fresh parameters plus an SGD optimizer that manages them with learning rate lr.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-5
optimizer = optim.SGD([params], lr=lr)

In [19]:
# One forward and one backward pass on the raw inputs t_u, filling params.grad.
p = model(t_u, *params)
loss = loss_fn(p, t_c)
loss.backward()

In [20]:
# optimizer.step() takes the place of the manual update `params -= lr * params.grad`.
optimizer.step()
params

tensor([ 9.5483e-01, -8.2600e-04], requires_grad=True)

In [21]:
# A single optimizer-driven step from fresh parameters, this time with lr = 1e-2
# and an explicit zero_grad() before the backward pass.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-2
optimizer = optim.SGD([params], lr=lr)

p = model(t_u, *params)
loss = loss_fn(p, t_c)

# Remember to reset the gradients before calling backward().
optimizer.zero_grad()
loss.backward()

optimizer.step()
params

tensor([-44.1730,  -0.8260], requires_grad=True)

In [22]:
def training_loop(n_epochs, lr, x, y, params, optimizer):
    """Fit ``params`` by delegating the update rule to ``optimizer``.

    Each epoch runs ``model`` and ``loss_fn`` on ``(x, y)``, resets the
    gradients through the optimizer, backpropagates, and calls
    ``optimizer.step()``. The loss is printed every 500 epochs.

    Args:
        n_epochs: Number of optimisation steps to perform.
        lr: Not read inside this function; the step size in effect is the one
            the ``optimizer`` was constructed with.
        x: Model input.
        y: Target passed to ``loss_fn``.
        params: Tensor unpacked into the ``w`` and ``b`` arguments of ``model``.
        optimizer: Object providing ``zero_grad()`` and ``step()``.

    Returns:
        The ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        loss = loss_fn(model(x, *params), y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        should_report = epoch % 500 == 0
        if should_report:
            print(f"Epoch: {epoch}, Loss: {loss.item():.4f}")
    return params

In [23]:
# Start again from w=1.0, b=0.0 and hand the parameters to an SGD optimizer.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-2
optimizer = optim.SGD([params], lr=lr)

In [24]:
# Run the optimizer-based loop on the rescaled inputs; the returned tensor is shown as the cell output.
training_loop(n_epochs=5000, lr=lr, x=t_un, y=t_c, params=params, optimizer=optimizer)

Epoch: 500, Loss: 7.8601
Epoch: 1000, Loss: 3.8285
Epoch: 1500, Loss: 3.0922
Epoch: 2000, Loss: 2.9577
Epoch: 2500, Loss: 2.9331
Epoch: 3000, Loss: 2.9286
Epoch: 3500, Loss: 2.9278
Epoch: 4000, Loss: 2.9277
Epoch: 4500, Loss: 2.9277
Epoch: 5000, Loss: 2.9276


tensor([  5.3671, -17.3012], requires_grad=True)

In [25]:
# SGD with lr = 1e-1, fed the raw (unscaled) inputs t_u.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-1
optimizer = optim.SGD([params], lr=lr)
training_loop(n_epochs=2000, lr=lr, x=t_u, y=t_c, params=params, optimizer=optimizer)

Epoch: 500, Loss: nan
Epoch: 1000, Loss: nan
Epoch: 1500, Loss: nan
Epoch: 2000, Loss: nan


tensor([nan, nan], requires_grad=True)

视频中在尝试增大 lr 时不小心改用了 Adam 优化器，因此训练能够正常收敛。而从上面的结果可以看到：继续使用 SGD 优化器、把 lr 增大到 1e-1（且输入为未缩放的 `t_u`）时，损失变为 nan，训练完全无法收敛。

而改用 Adam 后，即使输入数据未经缩放、lr 同样为 1e-1，也能得到很好的收敛效果，可见 Adam 优化器的强大之处。（这主要是因为 Adam 会为每个参数自适应地调整更新步长，因此对输入数据是否缩放并不敏感。）

In [26]:
# Adam with lr = 1e-1, fed the raw (unscaled) inputs t_u.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-1
optimizer = optim.Adam([params], lr=lr)
training_loop(n_epochs=2000, lr=lr, x=t_u, y=t_c, params=params, optimizer=optimizer)

Epoch: 500, Loss: 7.6129
Epoch: 1000, Loss: 3.0867
Epoch: 1500, Loss: 2.9286
Epoch: 2000, Loss: 2.9276


tensor([  0.5367, -17.3021], requires_grad=True)

# 训练、验证与过拟合

In [27]:
# Reserve 20% of the samples for validation; int() truncates the product towards zero.
n_samples = t_u.shape[0]
n_val = int(0.2 * n_samples)

In [28]:
# Display the total sample count and the validation-set size.
n_samples, n_val

(11, 2)

In [29]:
# Randomly partition the sample indices into a training part and a validation part.
# torch.randperm is unseeded here, so the split differs from run to run.
shuffled_indices = torch.randperm(n_samples)

# Slice at an explicit non-negative boundary. Negative-offset slices such as
# `[:-n_val]` and `[-n_val:]` misbehave when n_val == 0, because -0 == 0: the
# training slice would be empty and the validation slice would be everything.
n_train = n_samples - n_val
train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]
train_indices, val_indices

(tensor([4, 2, 6, 3, 5, 7, 9, 0, 1]), tensor([ 8, 10]))

In [30]:
# Select the training subset of inputs and targets using the shuffled training indices.
train_t_u = t_u[train_indices]
train_t_c = t_c[train_indices]
train_t_u, train_t_c

(tensor([56.3000, 58.2000, 33.9000, 81.9000, 48.9000, 21.8000, 60.4000, 35.7000,
         55.9000]),
 tensor([11.0000, 15.0000,  3.0000, 28.0000,  8.0000, -4.0000, 13.0000,  0.5000,
         14.0000]))

In [31]:
# Select the validation subset of inputs and targets using the held-out indices.
val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]
val_t_u, val_t_c

(tensor([48.4000, 68.4000]), tensor([ 6., 21.]))

In [32]:
# Apply the same 0.1 rescaling to both subsets of the inputs.
train_t_un = 0.1 * train_t_u
val_t_un = 0.1 * val_t_u

In [33]:
def training_loop(n_epochs, optimizer, params, train_x, train_y, val_x, val_y):
    """Train on one data split while reporting the loss on a second split.

    Every epoch computes the training and validation losses with the current
    parameters, then backpropagates the training loss only and lets
    ``optimizer`` update the parameters. Both losses are printed for the first
    three epochs and for every 500th epoch.

    Args:
        n_epochs: Number of optimisation steps to perform.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        params: Tensor unpacked into the ``w`` and ``b`` arguments of ``model``.
        train_x, train_y: Inputs and targets used for the gradient step.
        val_x, val_y: Inputs and targets used for reporting only.

    Returns:
        The ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        train_loss = loss_fn(model(train_x, *params), train_y)
        # The validation loss is only reported; backward() is never called on it.
        val_loss = loss_fn(model(val_x, *params), val_y)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch > 3 and epoch % 500 != 0:
            continue
        print(
            f"Epoch {epoch}, Training loss {train_loss.item():.4f},"
            f" Validation loss {val_loss.item():.4f}"
        )
    return params

In [34]:
# Fresh parameters and SGD optimizer, then train on the rescaled training split
# while reporting the loss on the rescaled validation split.
params = torch.tensor([1.0, 0.0], requires_grad=True)
lr = 1e-2
optimizer = optim.SGD([params], lr=lr)

training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_x=train_t_un,
    val_x=val_t_un,
    train_y=train_t_c,
    val_y=val_t_c,
)

Epoch 1, Training loss 75.7952, Validation loss 100.9256
Epoch 2, Training loss 38.2342, Validation loss 44.6465
Epoch 3, Training loss 31.5485, Validation loss 32.3576
Epoch 500, Training loss 6.4860, Validation loss 11.6878
Epoch 1000, Training loss 3.0221, Validation loss 7.4872
Epoch 1500, Training loss 2.5218, Validation loss 6.1733
Epoch 2000, Training loss 2.4496, Validation loss 5.7147
Epoch 2500, Training loss 2.4391, Validation loss 5.5463
Epoch 3000, Training loss 2.4376, Validation loss 5.4832


tensor([  5.2245, -16.4587], requires_grad=True)

评估（验证）时既不需要梯度，也不会用验证损失去更新参数，因此可以把这部分前向计算放进 `torch.no_grad()` 上下文中，避免构建对应的计算图，从而节省内存和运行时间。

In [35]:
def training_loop(n_epochs, optimizer, params, train_x, train_y, val_x, val_y):
    """Train on one split and evaluate on another without tracking gradients.

    The validation forward pass runs inside ``torch.no_grad()``, and an
    assertion checks that the resulting loss does not require grad. Only the
    training loss is backpropagated. Both losses are printed for the first
    three epochs and for every 500th epoch.

    Args:
        n_epochs: Number of optimisation steps to perform.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        params: Tensor unpacked into the ``w`` and ``b`` arguments of ``model``.
        train_x, train_y: Inputs and targets used for the gradient step.
        val_x, val_y: Inputs and targets used for reporting only.

    Returns:
        The ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        train_loss = loss_fn(model(train_x, *params), train_y)

        # Evaluation only: keep autograd from recording this forward pass.
        with torch.no_grad():
            val_loss = loss_fn(model(val_x, *params), val_y)
            assert not val_loss.requires_grad

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch > 3 and epoch % 500 != 0:
            continue
        print(
            f"Epoch {epoch}, Training loss {train_loss.item():.4f},"
            f" Validation loss {val_loss.item():.4f}"
        )
    return params

还可以使用 `torch.set_grad_enabled(bool_expression)`，根据一个布尔表达式在运行时显式地启用或禁用自动求导。
在此样例中，我们定义一个 `calc_forward()` 函数，通过 `is_train` 参数来控制本次前向计算是否记录自动求导信息。

In [38]:
def calc_forward(x, y, is_train, model_params=None):
    """Run the model on ``x`` and return the loss against ``y``.

    Args:
        x: Model input.
        y: Target passed to ``loss_fn``.
        is_train: Passed to ``torch.set_grad_enabled``; when true the forward
            pass is recorded by autograd, when false it is not.
        model_params: Optional tensor unpacked into the ``w`` and ``b``
            arguments of ``model``. When omitted, the notebook-level ``params``
            variable is used, so three-argument calls keep working.

    Returns:
        The loss computed by ``loss_fn``.
    """
    # Prefer the explicitly supplied parameters over the notebook-level global,
    # so the result does not silently depend on hidden kernel state.
    active_params = params if model_params is None else model_params
    with torch.set_grad_enabled(is_train):
        p = model(x, *active_params)
        loss = loss_fn(p, y)
    return loss

In [37]:
def training_loop(n_epochs, optimizer, params, train_x, train_y, val_x, val_y):
    """Train and validate using ``calc_forward`` for both forward passes.

    The training loss is computed with autograd enabled and the validation
    loss with autograd disabled, both through ``calc_forward``. Only the
    training loss is backpropagated. Both losses are printed for the first
    three epochs and for every 500th epoch.

    Note:
        ``params`` is not forwarded to ``calc_forward``; the forward passes use
        whichever parameter tensor ``calc_forward`` resolves on its own. The
        ``params`` argument here is only returned, so callers should pass the
        same tensor that ``optimizer`` manages.

    Args:
        n_epochs: Number of optimisation steps to perform.
        optimizer: Object providing ``zero_grad()`` and ``step()``.
        params: Parameter tensor returned to the caller.
        train_x, train_y: Inputs and targets used for the gradient step.
        val_x, val_y: Inputs and targets used for reporting only.

    Returns:
        The ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        train_loss = calc_forward(train_x, train_y, is_train=True)
        # Evaluation pass: autograd is switched off inside calc_forward.
        val_loss = calc_forward(val_x, val_y, is_train=False)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch > 3 and epoch % 500 != 0:
            continue
        print(
            f"Epoch {epoch}, Training loss {train_loss.item():.4f},"
            f" Validation loss {val_loss.item():.4f}"
        )
    return params