In [2]:
import torch
from torch import optim
import numpy
# When a tensor repr is summarized, show only 2 items at each edge of every dimension.
torch.set_printoptions(edgeitems=2)

Using vanilla gradient descent for optimization is possible, but there are now many other optimization algorithms that are better suited to more complicated models.

`torch` has a submodule called `optim`, where we can find many different optimizers already implemented.

In [22]:
# Targets (t_c) and the matching inputs (t_u); the inputs are in unknown units.
t_c = torch.tensor([
    0.5, 14.0, 15.0, 28.0, 11.0, 8.0,
    3.0, -4.0, 6.0, 13.0, 21.0,
])
t_u = torch.tensor([
    35.7, 55.9, 58.2, 81.9, 56.3, 48.9,
    33.9, 21.8, 48.4, 60.4, 68.4,
])
# Rescaled copy of the inputs (one tenth of the raw values).
t_un = t_u * 0.1


In [13]:
# applying autograd
def model(t_u, w, b):
    """Linear model: scale the input ``t_u`` by weight ``w``, then shift by bias ``b``."""
    scaled = w * t_u
    return scaled + b

In [14]:
def loss_fn(t_p, t_c):
    """Mean squared error between the predictions ``t_p`` and the targets ``t_c``."""
    return ((t_p - t_c) ** 2).mean()

In [15]:
# Initial parameters [w, b] = [1.0, 0.0]; requires_grad=True lets autograd track them.
params = torch.tensor([1.0, 0.0], requires_grad=True)

In [16]:
# List every name exposed by torch.optim (optimizer classes as well as module internals).
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_functional',
 '_multi_tensor',
 'lr_scheduler',
 'swa_utils']

In [17]:
# Let's use stochastic gradient descent (SGD).
# With momentum = 0.0 it is equivalent to plain gradient descent.
learning_rate = 1e-5
params = torch.tensor([1.0, 0.0], requires_grad=True)
optimizer = optim.SGD(params=[params], lr=learning_rate)

In [18]:
# Forward pass on the raw (unscaled) inputs; *params unpacks into w and b.
t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)
# Backpropagate the loss to obtain its gradient with respect to params.
loss.backward()

In [19]:
# Apply one optimizer update to params using the gradients computed by backward().
optimizer.step()

In [20]:
# Display the parameters after the optimizer step.
params

tensor([ 9.5483e-01, -8.2600e-04], requires_grad=True)

In [26]:
# our new training loop, now with the optimizer
def training_loop(n_epochs, optimizer, params, t_u, t_c, print_every=500):
    """Fit ``params`` of the linear model by repeated optimizer steps.

    Each epoch runs a forward pass with ``model``, evaluates ``loss_fn``,
    clears the previous gradients, backpropagates the loss and lets
    ``optimizer`` update the parameters.

    Args:
        n_epochs: number of epochs to run (epochs are counted from 1).
        optimizer: object exposing ``zero_grad()`` and ``step()``; it is
            expected to have been constructed over ``params``.
        params: tensor that is unpacked as ``(w, b)`` into ``model``.
        t_u: model inputs.
        t_c: targets the predictions are compared against.
        print_every: print the loss every this many epochs. Defaults to 500,
            the interval that used to be hard-coded. ``0`` or ``None``
            disables the progress output.

    Returns:
        The same ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        t_p = model(t_u, *params)
        loss = loss_fn(t_p, t_c)

        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # A falsy print_every short-circuits, which also avoids a modulo by zero.
        if print_every and epoch % print_every == 0:
            print('Epoch {}, Loss {}'.format(epoch, loss))
    return params

In [27]:
# Let's use stochastic gradient descent (SGD) again, starting from fresh parameters.
# With momentum = 0.0 it is equivalent to plain gradient descent.
learning_rate = 1e-2
params = torch.tensor([1.0, 0.0], requires_grad=True)
optimizer = optim.SGD(params=[params], lr=learning_rate)

In [28]:
# Train with the SGD optimizer on the rescaled inputs (t_un = 0.1 * t_u).
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,
    t_c=t_c
)

Epoch 500, Loss 7.860119819641113
Epoch 1000, Loss 3.828537940979004
Epoch 1500, Loss 3.092191219329834
Epoch 2000, Loss 2.957697868347168
Epoch 2500, Loss 2.933133840560913
Epoch 3000, Loss 2.9286484718322754
Epoch 3500, Loss 2.9278297424316406
Epoch 4000, Loss 2.9276793003082275
Epoch 4500, Loss 2.927651882171631
Epoch 5000, Loss 2.9276468753814697


tensor([  5.3671, -17.3012], requires_grad=True)

In [30]:
# Experiment with another optimizer: Adam, starting from fresh parameters.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-1
optimizer = optim.Adam(params=[params], lr=learning_rate)

In [31]:
# Train with the Adam optimizer, this time on the raw t_u instead of the rescaled t_un.
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,
    t_c=t_c
)

Epoch 500, Loss 7.612900257110596
Epoch 1000, Loss 3.086700439453125
Epoch 1500, Loss 2.928579092025757
Epoch 2000, Loss 2.9276442527770996
Epoch 2500, Loss 2.927645206451416
Epoch 3000, Loss 2.9276459217071533
Epoch 3500, Loss 2.927644968032837
Epoch 4000, Loss 2.927645683288574
Epoch 4500, Loss 2.9276463985443115
Epoch 5000, Loss 2.927645206451416


tensor([  0.5368, -17.3048], requires_grad=True)

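Basically the same result: Adam reaches the same final loss (about 2.9276) without normalizing the inputs and with the same convergence. The learned weight is ten times smaller (0.5368 instead of 5.3671) only because the raw `t_u` is ten times larger than the rescaled `t_un`.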