# Testing framework for comparing PyTorch and Candle results

<a target="_blank" href="https://colab.research.google.com/github/KGrewal1/optimisers/blob/master/pytorch_test.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Imports

In [1]:
import torch
from torch import optim

# Record the environment the reference values below were produced with.
print(torch.__version__)
print(torch.version.cuda)
# Only query the GPU when one is present, so this cell also runs unchanged on
# CPU-only hosts (e.g. a default Colab runtime) instead of raising.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

2.1.0
12.1
NVIDIA GeForce RTX 3070 Ti


# SGD tests

## nesterov_sgd_test

In [2]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# SGD with Nesterov momentum, no weight decay.
optimiser = optim.SGD(
    m.parameters(),
    lr=0.004,
    momentum=0.1,
    nesterov=True,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[ 1.0750, -9.9042]], requires_grad=True)
Parameter containing:
tensor([-1.8961], requires_grad=True)


## nesterov_decay_sgd_test

In [3]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# SGD with Nesterov momentum and weight decay.
optimiser = optim.SGD(
    m.parameters(),
    lr=0.004,
    momentum=0.1,
    nesterov=True,
    weight_decay=0.1,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[  0.9921, -10.3803]], requires_grad=True)
Parameter containing:
tensor([-1.9331], requires_grad=True)


## momentum_sgd_test

In [4]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# SGD with classical (non-Nesterov) momentum, weight decay explicitly off.
optimiser = optim.SGD(
    m.parameters(),
    lr=0.004,
    momentum=0.1,
    nesterov=False,
    weight_decay=0.0,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.8870, 0.8589]], requires_grad=True)
Parameter containing:
tensor([-0.6341], requires_grad=True)


## momentum_sgd_decay_test

In [5]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# SGD with classical (non-Nesterov) momentum and weight decay.
optimiser = optim.SGD(
    m.parameters(),
    lr=0.004,
    momentum=0.1,
    nesterov=False,
    weight_decay=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.8751, 0.8514]], requires_grad=True)
Parameter containing:
tensor([-0.5626], requires_grad=True)


## momentum_sgd_dampened_test

In [6]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# SGD with dampened classical momentum, weight decay explicitly off.
optimiser = optim.SGD(
    m.parameters(),
    lr=0.004,
    momentum=0.1,
    nesterov=False,
    weight_decay=0.0,
    dampening=0.2,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.8746, 0.8434]], requires_grad=True)
Parameter containing:
tensor([-0.4838], requires_grad=True)


## sgd_test

In [7]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Plain SGD: no momentum argument, weight decay explicitly off.
optimiser = optim.SGD(
    m.parameters(),
    lr=0.004,
    weight_decay=0.0,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.8809, 0.8513]], requires_grad=True)
Parameter containing:
tensor([-0.5606], requires_grad=True)


## sgd_decay_test

In [8]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Plain SGD (no momentum argument) with weight decay.
optimiser = optim.SGD(
    m.parameters(),
    lr=0.004,
    weight_decay=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.8700, 0.8450]], requires_grad=True)
Parameter containing:
tensor([-0.5003], requires_grad=True)


# AdaGrad tests

## adagrad_test

In [9]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adagrad with weight decay explicitly off.
optimiser = optim.Adagrad(
    m.parameters(),
    lr=0.004,
    weight_decay=0.0,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.2424, 0.2341]], requires_grad=True)
Parameter containing:
tensor([0.2379], requires_grad=True)


## adagrad_lr_decay_test

In [10]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adagrad with learning-rate decay.
optimiser = optim.Adagrad(
    m.parameters(),
    lr=0.004,
    lr_decay=0.2,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.0231, 0.0230]], requires_grad=True)
Parameter containing:
tensor([0.0230], requires_grad=True)


## adagrad_weight_decay_test

In [11]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adagrad with weight decay.
optimiser = optim.Adagrad(
    m.parameters(),
    lr=0.004,
    weight_decay=0.2,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.2424, 0.2341]], requires_grad=True)
Parameter containing:
tensor([0.2378], requires_grad=True)


# AdaDelta Tests

## adadelta_test

In [12]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adadelta; only the learning rate is overridden.
optimiser = optim.Adadelta(
    m.parameters(),
    lr=0.004,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.0016, 0.0016]], requires_grad=True)
Parameter containing:
tensor([0.0016], requires_grad=True)


## adadelta_weight_decay_test

In [13]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adadelta with weight decay.
optimiser = optim.Adadelta(
    m.parameters(),
    lr=0.004,
    weight_decay=0.8,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.0016, 0.0016]], requires_grad=True)
Parameter containing:
tensor([0.0016], requires_grad=True)


# AdaMax Tests

## adamax_test

In [14]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adamax; only the learning rate is overridden.
optimiser = optim.Adamax(
    m.parameters(),
    lr=0.004,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.3895, 0.3450]], requires_grad=True)
Parameter containing:
tensor([0.3643], requires_grad=True)


## adamax_weight_decay_test

In [15]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adamax with weight decay.
optimiser = optim.Adamax(
    m.parameters(),
    lr=0.004,
    weight_decay=0.6,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.3894, 0.3450]], requires_grad=True)
Parameter containing:
tensor([0.3639], requires_grad=True)


# NAdam Tests

## nadam_test

In [16]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# NAdam with the library's default hyper-parameters.
optimiser = optim.NAdam(m.parameters())
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.1897, 0.1837]], requires_grad=True)
Parameter containing:
tensor([0.1864], requires_grad=True)


## nadam_weight_decay_test

In [17]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# NAdam with weight decay; the decoupled flag is left at its default.
optimiser = optim.NAdam(
    m.parameters(),
    weight_decay=0.6,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.1897, 0.1837]], requires_grad=True)
Parameter containing:
tensor([0.1863], requires_grad=True)


## nadam_decoupled_weight_decay_test

In [18]:
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs.matmul(w_gen.t()) + b_gen

m = torch.nn.Linear(2, 1)
with torch.no_grad():
    m.weight.zero_()
    m.bias.zero_()
optimiser = optim.NAdam(m.parameters(), weight_decay = 0.6, decoupled_weight_decay=True)
for _step in range(100):
    optimiser.zero_grad()
    ys = m(sample_xs)
    loss = ((ys - sample_ys)**2).sum()
    loss.backward()
    optimiser.step()
print(m.weight)
print(m.bias)

Parameter containing:
tensor([[0.1792, 0.1737]], requires_grad=True)
Parameter containing:
tensor([0.1762], requires_grad=True)


# RAdam Tests

## radam_test

In [19]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# RAdam with the library's default hyper-parameters.
optimiser = optim.RAdam(m.parameters())
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.2128, 1.2819]], requires_grad=True)
Parameter containing:
tensor([0.2923], requires_grad=True)


## radam_weight_decay_test

In [20]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# RAdam with weight decay.
optimiser = optim.RAdam(
    m.parameters(),
    weight_decay=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.2117, 1.2812]], requires_grad=True)
Parameter containing:
tensor([0.2921], requires_grad=True)


# RMSprop Tests

## rmsprop_test

In [21]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# RMSprop with the library's default hyper-parameters.
optimiser = optim.RMSprop(m.parameters())
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[1.6650, 0.7867]], requires_grad=True)
Parameter containing:
tensor([1.3012], requires_grad=True)


## rmsprop_weight_decay_test

In [22]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# RMSprop with weight decay.
optimiser = optim.RMSprop(
    m.parameters(),
    weight_decay=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[1.6643, 0.7867]], requires_grad=True)
Parameter containing:
tensor([1.2926], requires_grad=True)


## rmsprop_centered_test

In [23]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Centered RMSprop.
optimiser = optim.RMSprop(
    m.parameters(),
    centered=True,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[1.8892, 0.7617]], requires_grad=True)
Parameter containing:
tensor([1.3688], requires_grad=True)


## rmsprop_centered_decay_test

In [24]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Centered RMSprop with weight decay.
optimiser = optim.RMSprop(
    m.parameters(),
    centered=True,
    weight_decay=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[1.8883, 0.7621]], requires_grad=True)
Parameter containing:
tensor([1.3558], requires_grad=True)


## rmsprop_momentum_test

In [25]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# RMSprop with momentum.
optimiser = optim.RMSprop(
    m.parameters(),
    momentum=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.3042, 0.6835]], requires_grad=True)
Parameter containing:
tensor([1.5441], requires_grad=True)


## rmsprop_momentum_decay_test

In [26]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# RMSprop with momentum and weight decay.
optimiser = optim.RMSprop(
    m.parameters(),
    momentum=0.4,
    weight_decay=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.3028, 0.6858]], requires_grad=True)
Parameter containing:
tensor([1.5149], requires_grad=True)


## rmsprop_centered_momentum_test

In [27]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Centered RMSprop with momentum.
optimiser = optim.RMSprop(
    m.parameters(),
    centered=True,
    momentum=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.4486, 0.6715]], requires_grad=True)
Parameter containing:
tensor([1.5045], requires_grad=True)


## rmsprop_centered_momentum_decay_test

In [28]:
# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Centered RMSprop with momentum and weight decay.
optimiser = optim.RMSprop(
    m.parameters(),
    centered=True,
    momentum=0.4,
    weight_decay=0.4,
)
for _step in range(100):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[2.4468, 0.6744]], requires_grad=True)
Parameter containing:
tensor([1.4695], requires_grad=True)


# Adam Tests

## adam_test

In [29]:
import torch
from torch import optim

# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adam with the library's default hyper-parameters.
optimiser = optim.Adam(m.parameters())
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.9000, 0.6967]], requires_grad=True)
Parameter containing:
tensor([0.7996], requires_grad=True)


## adam_weight_decay_test

In [30]:
import torch
from torch import optim

# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adam with weight decay passed to the Adam optimiser (not AdamW).
optimiser = optim.Adam(
    m.parameters(),
    weight_decay=0.6,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.8997, 0.6964]], requires_grad=True)
Parameter containing:
tensor([0.7975], requires_grad=True)


## adamw_weight_decay_test

In [31]:
import torch
from torch import optim

# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# AdamW with weight decay.
optimiser = optim.AdamW(
    m.parameters(),
    weight_decay=0.6,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.6901, 0.5677]], requires_grad=True)
Parameter containing:
tensor([0.6287], requires_grad=True)


## adam_amsgrad_test

In [32]:
import torch
from torch import optim

# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adam with the AMSGrad variant enabled.
optimiser = optim.Adam(
    m.parameters(),
    amsgrad=True,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.9001, 0.6904]], requires_grad=True)
Parameter containing:
tensor([0.7978], requires_grad=True)


## adam_amsgrad_decay_test

In [33]:
import torch
from torch import optim

# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# Adam with the AMSGrad variant and weight decay.
optimiser = optim.Adam(
    m.parameters(),
    amsgrad=True,
    weight_decay=0.6,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.8998, 0.6901]], requires_grad=True)
Parameter containing:
tensor([0.7955], requires_grad=True)


## adamw_amsgrad_decay_test

In [34]:
import torch
from torch import optim

# Targets come from the known affine map y = x @ w_gen.T + b_gen.
w_gen = torch.tensor([[3., 1.]])
b_gen = torch.tensor([-2.])

sample_xs = torch.tensor([[2., 1.], [7., 4.], [-4., 12.], [5., 8.]])
sample_ys = sample_xs @ w_gen.T + b_gen

# Zero the parameters so the run does not depend on Linear's random init.
m = torch.nn.Linear(2, 1)
torch.nn.init.zeros_(m.weight)
torch.nn.init.zeros_(m.bias)

# AdamW with the AMSGrad variant and weight decay.
optimiser = optim.AdamW(
    m.parameters(),
    amsgrad=True,
    weight_decay=0.6,
)
for _step in range(1000):
    ys = m(sample_xs)
    loss = (ys - sample_ys).pow(2).sum()  # sum of squared errors
    optimiser.zero_grad()
    loss.backward()
    optimiser.step()
print(m.weight, m.bias, sep="\n")

Parameter containing:
tensor([[0.6901, 0.5648]], requires_grad=True)
Parameter containing:
tensor([0.6287], requires_grad=True)
