In [1]:
import torch
import torch.nn as nn

#### Input (x) and Output (y)

In [2]:
x = torch.randn(10, 3)
y = torch.randn(10, 2)

#### Model: $ M(x) = W x + b $

In [3]:
M = nn.Linear(3,2)
print('W: ',M.weight)
print('b: ',M.bias)

W:  Parameter containing:
tensor([[ 0.3345,  0.2204,  0.2693],
        [ 0.0158, -0.1395,  0.5564]], requires_grad=True)
b:  Parameter containing:
tensor([0.1030, 0.2012], requires_grad=True)


#### Loss function (criterion) and optimization approach (optimizer)

In [4]:
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(M.parameters(), lr=0.01)

#### Computing loss function
Pytorch uses a computation graph and reverse mode to compute the gradients numerically. They are added and have to be reset at each iteration.

.zero_grad() is needed to reset the gradients that are accumulated

In [5]:
optimizer.zero_grad() # IMPORTANT!
pred = M(x)
loss = criterion(pred,y)
print(loss)

tensor(0.8316, grad_fn=<MseLossBackward>)


#### Backpropagation

Model weights are optimized iteratively using stochastic gradient descent.

In [6]:
loss.backward() # Automatic differentiation
print ('dL/dW: ', M.weight.grad) 
print ('dL/db: ', M.bias.grad)
optimizer.step() # Updates weights

dL/dW:  tensor([[0.5653, 0.1845, 0.1116],
        [0.0474, 0.4121, 0.5427]])
dL/db:  tensor([0.1811, 0.4516])


#### Loss decreases

In [7]:
pred = M(x)
loss = criterion(pred,y)
print(loss)

tensor(0.8210, grad_fn=<MseLossBackward>)


#### Arbitrary loss functions thanks to automatic differentiation

In [8]:
criterion = lambda x, y: torch.pow(x-y, 2).mean() # Equivalenet to nn.MSELoss()
criterion = lambda x, y: torch.abs(x-y).mean()
pred = M(x)
loss = criterion(pred,y)
print(loss)
loss.backward()
print ('dL/dW: ', M.weight.grad) 
print ('dL/db: ', M.bias.grad)

tensor(0.7569, grad_fn=<MeanBackward0>)
dL/dW:  tensor([[0.8170, 0.1694, 0.1238],
        [0.1314, 0.6274, 0.8551]])
dL/db:  tensor([0.2811, 0.5516])
