In [1]:
# Import the torch module
import torch
import torch.optim as optim

# torch.optim

The torch module has an **optim** submodule where we can find classes implementing different optimization algorithms.

> - Every optimizer constructor takes a list of parameters as its first input. All parameters passed to the optimizer are retained inside the optimizer object, so the optimizer can update their values and access their `grad` attribute.
> - Each optimizer exposes two methods: `zero_grad` and `step`. `zero_grad` zeroes the `grad` attribute of all the parameters passed to the optimizer upon construction; `step` performs a single optimization step, updating the values of those parameters according to the strategy implemented by the specific optimizer.

In [2]:
# List every name exposed by torch.optim: the optimizer classes plus the module's own internals.
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_functional',
 '_multi_tensor',
 'lr_scheduler',
 'swa_utils']

## Getting data and Defining model function and loss function

In [3]:
# Temperatures in good old Celsius (t_c) and the matching measurements from a new thermometer (t_u).
t_c = torch.tensor([
    0.5, 14.0, 15.0, 28.0, 11.0, 8.0,
    3.0, -4.0, 6.0, 13.0, 21.0,
])
t_u = torch.tensor([
    35.7, 55.9, 58.2, 81.9, 56.3, 48.9,
    33.9, 21.8, 48.4, 60.4, 68.4,
])

# Feature scaling: shrink the thermometer readings by a factor of 10.
t_un = 0.1 * t_u

# linear model function
def model(t_u, w, b):
    """Linear model: scale the input ``t_u`` by weight ``w``, then shift by bias ``b``."""
    scaled = w * t_u
    return scaled + b

# mean square error as loss function
def loss_fn(t_p, t_c):
    """Mean squared error between predictions ``t_p`` and targets ``t_c``."""
    residuals = t_p - t_c
    return (residuals ** 2).mean()

## Defining Training Loop Using Optimizer and Autograd

In [4]:
# training loop
def training_loop(n_epochs, optimizer, params, t_u, t_c, print_every=500):
    """Fit the linear model by running ``n_epochs`` optimizer updates.

    Every epoch runs a forward pass through ``model``, evaluates ``loss_fn``,
    back-propagates the loss, and asks ``optimizer`` to take one step.

    Args:
        n_epochs: Number of epochs (optimizer steps) to run.
        optimizer: Object exposing ``zero_grad()`` and ``step()``. It is expected
            to have been constructed over ``params`` so that ``step()`` updates them.
        params: Tensor that is unpacked into the ``w`` and ``b`` arguments of ``model``.
        t_u: Inputs fed to the model.
        t_c: Targets the predictions are compared against.
        print_every: Print the loss every ``print_every`` epochs (default 500).
            Pass ``0`` or ``None`` to disable progress output.

    Returns:
        The ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        # Forward pass and loss on the data passed in.
        t_p = model(t_u, *params)
        loss = loss_fn(t_p, t_c)

        # Clear gradients left over from the previous epoch before back-propagating,
        # otherwise they would accumulate across iterations.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()  # performs a single optimization step (parameter update)

        # The truthiness check avoids a modulo-by-zero when reporting is disabled.
        if print_every and epoch % print_every == 0:
            print('Epoch %d, Loss %f' % (epoch, float(loss)))

    return params

## Using stochastic gradient descent (SGD)
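The term *stochastic* comes from the fact that the gradient is typically obtained by averaging over a random subset of all input samples, called a minibatch. The optimizer itself does not know whether the loss was evaluated on all the samples or on a random subset of them. In this notebook the loss is computed on the full dataset at every step, so the updates below are those of plain (full-batch) gradient descent.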

In [5]:
# Start from w = 1.0, b = 0.0 and hand the parameter tensor to a plain gradient descent optimizer.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

# Train on the rescaled inputs (t_un); the returned params are displayed as the cell output.
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,
    t_c=t_c,
)

Epoch 500, Loss 7.860120
Epoch 1000, Loss 3.828538
Epoch 1500, Loss 3.092191
Epoch 2000, Loss 2.957698
Epoch 2500, Loss 2.933134
Epoch 3000, Loss 2.928648
Epoch 3500, Loss 2.927830
Epoch 4000, Loss 2.927679
Epoch 4500, Loss 2.927652
Epoch 5000, Loss 2.927647


tensor([  5.3671, -17.3012], requires_grad=True)

## Using the Adam optimizer

In [6]:
# Reset the parameters and swap in the Adam optimizer with a larger learning rate.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-1
optimizer = optim.Adam([params], lr=learning_rate)

# NOTE: this run is fed the raw t_u rather than the rescaled t_un used in the SGD cell,
# so the fitted weight comes out roughly 10x smaller than in the SGD run.
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,
    t_c=t_c,
)

Epoch 500, Loss 7.612900
Epoch 1000, Loss 3.086698
Epoch 1500, Loss 2.928578
Epoch 2000, Loss 2.927646
Epoch 2500, Loss 2.927645
Epoch 3000, Loss 2.927646
Epoch 3500, Loss 2.927645
Epoch 4000, Loss 2.927646
Epoch 4500, Loss 2.927646
Epoch 5000, Loss 2.927645


tensor([  0.5368, -17.3048], requires_grad=True)