In [1]:
import torch 
import numpy as np
import torch.nn.functional as F
from torch import nn

In [2]:
class TestModel(nn.Module):
    def __init__(self):
        super(TestModel, self).__init__()
        # fully connected layers
        self.fc_layers = nn.ModuleList()
        self.fc_layers.append(nn.Linear(1, 2))
        self.fc_layers.append(nn.Linear(2, 2))
        self.fc_layers.append(nn.Linear(2, 1))
        
        for layer in self.fc_layers:
            layer.weight.data.fill_(1)
            layer.bias.data.fill_(1)


    def forward(self, x):
        for k in range(len(self.fc_layers) - 1):
            x = F.relu(self.fc_layers[k](x))
        y = self.fc_layers[-1](x)
        return y


In [3]:
x = torch.arange(4)+1
x = x.view(1, -1, 1)
y = x**2
print(x.shape, x, x.type())
print(y, y.type())

torch.Size([1, 4, 1]) tensor([[[1],
         [2],
         [3],
         [4]]]) torch.LongTensor
tensor([[[ 1],
         [ 4],
         [ 9],
         [16]]]) torch.LongTensor


In [4]:
# forward pass through network 
model = TestModel()
yhat = model(x.float())
print(yhat)
print(y)

tensor([[[11.],
         [15.],
         [19.],
         [23.]]], grad_fn=<AddBackward0>)
tensor([[[ 1],
         [ 4],
         [ 9],
         [16]]])


In [5]:
loss = F.mse_loss(y.float(), yhat)
print(loss)

tensor(92.5000, grad_fn=<MeanBackward0>)


In [6]:
loss.backward()

In [7]:
for name, param in model.named_parameters():
    print(name, param.grad)

fc_layers.0.weight tensor([[90.],
        [90.]])
fc_layers.0.bias tensor([38., 38.])
fc_layers.1.weight tensor([[64., 64.],
        [64., 64.]])
fc_layers.1.bias tensor([19., 19.])
fc_layers.2.weight tensor([[147., 147.]])
fc_layers.2.bias tensor([19.])


In [8]:
# now what is gradient if we just do the same thing?
# forward pass through network 
yhat = model(x.float())
loss = F.mse_loss(y.float(), yhat)
print(f'loss: {loss}')
loss.backward()
for name, param in model.named_parameters():
    print(name, param.grad)

loss: 92.5
fc_layers.0.weight tensor([[180.],
        [180.]])
fc_layers.0.bias tensor([76., 76.])
fc_layers.1.weight tensor([[128., 128.],
        [128., 128.]])
fc_layers.1.bias tensor([38., 38.])
fc_layers.2.weight tensor([[294., 294.]])
fc_layers.2.bias tensor([38.])


In [9]:
# now do again, but keep the graph
# but first zero the gradient
model.zero_grad()
for name, param in model.named_parameters():
    print(name, param.grad)
yhat = model(x.float())
loss = F.mse_loss(y.float(), yhat)
print(f'loss: {loss}')
loss.backward(create_graph=False)
print('first pass--------------------------')
for name, param in model.named_parameters():
    print(name, param.grad)
loss.backward(create_graph=False)
print('second pass-------------------------')
for name, param in model.named_parameters():
    print(name, param.grad)

fc_layers.0.weight tensor([[0.],
        [0.]])
fc_layers.0.bias tensor([0., 0.])
fc_layers.1.weight tensor([[0., 0.],
        [0., 0.]])
fc_layers.1.bias tensor([0., 0.])
fc_layers.2.weight tensor([[0., 0.]])
fc_layers.2.bias tensor([0.])
loss: 92.5
first pass--------------------------
fc_layers.0.weight tensor([[90.],
        [90.]])
fc_layers.0.bias tensor([38., 38.])
fc_layers.1.weight tensor([[64., 64.],
        [64., 64.]])
fc_layers.1.bias tensor([19., 19.])
fc_layers.2.weight tensor([[147., 147.]])
fc_layers.2.bias tensor([19.])


RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.