In [None]:
"""
This note shows how to use torch.no_grad(), .requires_grad and .detach() through case studies.
To sum up: torch.no_grad() affects the output variables computed inside it (they have .requires_grad = False)
while keeping the parameters of the model unchanged;
.requires_grad directly controls whether parameters require grad or not; however, on its own it doesn't stop the
output variables of the model from being tracked when their inputs are tracked;
.detach() is used to obtain, from one or several specific variables, new variables with .requires_grad = False
(they share the same data rather than being deep copies) without affecting the whole graph.

References: https://pytorch.org/docs/master/notes/autograd.html
https://discuss.pytorch.org/t/clone-and-detach-in-v0-4-0/16861
https://stackoverflow.com/questions/51748138/pytorch-how-to-set-requires-grad-false
"""

In [1]:
import torch
from torch import nn

In [2]:
# A tensor created directly (here with torch.randn) does not track gradients:
# its .requires_grad is False by default.
x1 = torch.randn(2, 2)
print(x1.requires_grad)
x1

False


tensor([[-0.1278, -2.2626],
        [ 0.2183, -1.5492]])

In [3]:
# Define a variable with .requires_grad True by requesting it at construction time.
x2 = torch.randn(2, 2, requires_grad=True)
x2

tensor([[ 1.1053, -1.5058],
        [ 0.1517, -1.2914]], requires_grad=True)

In [6]:
# Define several models (three independent 2->2 linear layers, built in order lin0, lin1, lin2).
# By default their parameters have .requires_grad=True.
lin0, lin1, lin2 = (nn.Linear(2, 2) for _ in range(3))
vars(lin0)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x109d87390>,
 '_parameters': OrderedDict([('weight', Parameter containing:
               tensor([[ 0.4102, -0.0866],
                       [ 0.2055,  0.6210]], requires_grad=True)),
              ('bias', Parameter containing:
               tensor([-0.2167,  0.4468], requires_grad=True))]),
 '_buffers': OrderedDict(),
 '_backward_hooks': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 'training': True,
 'in_features': 2,
 'out_features': 2}

In [7]:
# Forward x2 through lin0 and lin1 only, then backpropagate from the summed output.
# lin2 is not part of this graph, so its weight.grad stays None.
x3 = lin0(x2)
x4 = lin1(x3)
x4.sum().backward()
print(*(layer.weight.grad for layer in (lin0, lin1, lin2)))

tensor([[ 0.8470, -1.8848],
        [ 0.5216, -1.1608]]) tensor([[ 0.3245, -0.5851],
        [ 0.3245, -0.5851]]) None


In [8]:
# Use torch.no_grad().
# The result shows that the input variable x4 is not changed and the parameters of model lin2
# still have .requires_grad=True. However, the output variable x5 has .requires_grad=False.
with torch.no_grad():
    x5 = lin2(x4)
    print(x4.requires_grad, lin2.weight.requires_grad, x5.requires_grad, sep="\n")

True
True
False


In [9]:
# Notice that once the gradient tracking is cut off by torch.no_grad(), you won't be able to
# backpropagate the gradient to the layer run inside the no_grad block (lin1) or to the layers
# before it (lin0). Only lin2, which runs after the block, receives a gradient.
lin0, lin1, lin2 = (nn.Linear(2, 2) for _ in range(3))
x3 = lin0(x2)
with torch.no_grad():
    x4 = lin1(x3)
x5 = lin2(x4)
x5.sum().backward()
print(*(layer.weight.grad for layer in (lin0, lin1, lin2)))

None None tensor([[-0.3956, -1.6145],
        [-0.3956, -1.6145]])


In [10]:
# Use .requires_grad to freeze some parameters of a model. This is very useful if you want to use
# the output of a trained model as features for a downstream task.
for param in lin1.parameters():
    param.requires_grad_(False)
print(vars(lin1))

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend object at 0x109d87390>, '_parameters': OrderedDict([('weight', Parameter containing:
tensor([[-0.3327, -0.0667],
        [-0.2523,  0.0855]])), ('bias', Parameter containing:
tensor([ 0.1044, -0.6160]))]), '_buffers': OrderedDict(), '_backward_hooks': OrderedDict(), '_forward_hooks': OrderedDict(), '_forward_pre_hooks': OrderedDict(), '_state_dict_hooks': OrderedDict(), '_load_state_dict_pre_hooks': OrderedDict(), '_modules': OrderedDict(), 'training': True, 'in_features': 2, 'out_features': 2}


In [11]:
# The output variable of a frozen model keeps the gradient tracking, so you are able to
# backpropagate through it: lin0 (before the frozen layer) and lin2 (after it) both get gradients.
# Only the frozen layer's own parameters receive none (lin1.weight.grad is None).
# This is a good property that we want.
lin0, lin1, lin2 = (nn.Linear(2, 2) for _ in range(3))
for param in lin1.parameters():
    param.requires_grad = False
x3 = lin0(x2)
x4 = lin1(x3)
x5 = lin2(x4)
x5.sum().backward()
print(*(layer.weight.grad for layer in (lin0, lin1, lin2)))

tensor([[-0.7582,  1.6871],
        [ 0.4078, -0.9074]]) None tensor([[-0.2020,  0.2053],
        [-0.2020,  0.2053]])


In [12]:
# Same chain with a ReLU in between and lin1 frozen. Every intermediate result still carries a
# grad_fn, including x4, the output of the frozen layer, because its input is tracked.
# NOTE: this cell rebinds x2 to the output of lin0(x1); it no longer refers to the leaf tensor
# defined in an earlier cell.
lin0, lin1, lin2 = (nn.Linear(2, 2) for _ in range(3))
nonlin0 = nn.ReLU()
for param in lin1.parameters():
    param.requires_grad = False
x2 = lin0(x1)
x3 = nonlin0(x2)
x4 = lin1(x3)
x5 = lin2(x4)
print(x2, x3, x4, x5, sep="\n")

tensor([[0.5972, 1.3182],
        [0.4222, 0.8155]], grad_fn=<AddmmBackward>)
tensor([[0.5972, 1.3182],
        [0.4222, 0.8155]], grad_fn=<ThresholdBackward0>)
tensor([[-0.6268,  0.4045],
        [-0.5410,  0.3039]], grad_fn=<AddmmBackward>)
tensor([[-0.5108, -0.0270],
        [-0.4217, -0.0467]], grad_fn=<AddmmBackward>)


In [26]:
# A frozen layer (all parameters have .requires_grad = False) fed an input that doesn't require
# grad produces an output that doesn't require grad either (x1 below).
# A trainable layer fed the same input still produces an output with requires_grad=True,
# because its own parameters require grad (x2 below).
# The layers are created and frozen inside this cell, like in the other demo cells, so the result
# does not depend on state left behind by a previously executed cell.
lin1 = nn.Linear(2, 2)
lin2 = nn.Linear(2, 2)
for param in lin1.parameters():
    param.requires_grad = False
x0 = torch.randn(2, 2)
x1 = lin1(x0)
x2 = lin2(x0)
print(x0.requires_grad)
print(x1.requires_grad)
print(x2.requires_grad)

False
False
True


In [18]:
# Use .detach() to get x2 without grad: `output` holds the same values as x2 (it shares x2's data)
# but has no grad_fn, while the whole computing graph (x2 -> x3) stays the same.
lin0, lin1 = nn.Linear(2, 2), nn.Linear(2, 2)
x1 = torch.randn(2, 2)
x2 = lin0(x1)
x3 = lin1(x2)
output = x2.detach()
print(x2, output, x3, sep="\n")

tensor([[ 0.3828, -0.1423],
        [ 0.4588, -0.2701]], grad_fn=<AddmmBackward>)
tensor([[ 0.3828, -0.1423],
        [ 0.4588, -0.2701]])
tensor([[-0.5769, -0.3604],
        [-0.6140, -0.2757]], grad_fn=<AddmmBackward>)


In [19]:
# You can use output as part of other graphs.
# The backward of this new graph doesn't affect the initial graph: it starts from the detached
# tensor, so it never reaches lin0. That's why lin0.weight.grad is None while lin1 gets a gradient.
x4 = lin1(output)
x4.sum().backward()
print(*(layer.weight.grad for layer in (lin0, lin1)))

None tensor([[ 0.8416, -0.4124],
        [ 0.8416, -0.4124]])
