In [1]:

import torch


In [2]:
# Toy one-layer model: input x, target y, weights w and bias b.
# w and b are the only tensors created with requires_grad=True, so they are
# the leaves autograd will accumulate gradients into.
x = torch.ones(5)
y = torch.zeros(3)
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# Forward pass: logits z, then binary cross-entropy computed on those logits.
z = x @ w + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(input=z, target=y)

# Show every tensor, in the order it was created.
print("x is ", x)
print("y is ", y)
print("w is ", w)
print("b is ", b)
print("z is ", z)
print("loss is ", loss)


x is  tensor([1., 1., 1., 1., 1.])
y is  tensor([0., 0., 0.])
w is  tensor([[-0.0829, -0.5782, -0.1361],
        [-0.7981,  0.7377, -0.9458],
        [-0.1041, -0.5958, -0.9834],
        [ 0.9404, -1.1530, -0.5993],
        [ 0.8876, -0.7987,  0.1743]], requires_grad=True)
b is  tensor([-0.9466,  0.1541, -1.0284], requires_grad=True)
z is  tensor([-0.1037, -2.2339, -3.5187], grad_fn=<AddBackward0>)
loss is  tensor(0.2579, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [3]:
# grad_fn is the backward-function node autograd recorded for each result.
print(
    f"Gradient function for z = {z.grad_fn}",
    f"Gradient function for loss = {loss.grad_fn}",
    sep="\n",
)


Gradient function for z = <AddBackward0 object at 0x000001ECA5812B60>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x000001ECFC8A9660>


In [4]:
# Backpropagate from the loss; the gradients are stored on w.grad and b.grad.
loss.backward()
print(w.grad, b.grad, sep="\n")


tensor([[0.1580, 0.0323, 0.0096],
        [0.1580, 0.0323, 0.0096],
        [0.1580, 0.0323, 0.0096],
        [0.1580, 0.0323, 0.0096],
        [0.1580, 0.0323, 0.0096]])
tensor([0.1580, 0.0323, 0.0096])


- We can only obtain the grad properties for the leaf nodes of the computational graph, which have requires_grad property set to True. For all other nodes in our graph, gradients will not be available.

- We can only perform gradient calculations using backward once on a given graph, for performance reasons. If we need to do several backward calls on the same graph, we need to pass retain_graph=True to the backward call.

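If only the forward computation is needed, gradient tracking can be turned off so that the result has `requires_grad` set to `False`, for example by wrapping the computation in a `torch.no_grad()` block: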

In [5]:
# Computed normally: the result is built from tensors that require gradients.
z = x @ w + b
print(z.requires_grad)

# Same computation inside no_grad(): the result is not tracked by autograd.
with torch.no_grad():
    z = x @ w + b
print(z.requires_grad)

True
False


In [6]:
# detach() gives a tensor that no longer requires gradients, unlike z itself.
z = x @ w + b
z_det = z.detach()
print(z_det.requires_grad)

False



There are reasons you might want to disable gradient tracking:

- To mark some parameters in your neural network as frozen parameters.

- To speed up computations when you are only doing a forward pass, because computations on tensors that do not track gradients are more efficient.



**DAGs are dynamic in PyTorch.** An important thing to note is that the graph is recreated from scratch; after each `.backward()` call, autograd starts populating a new graph. This is exactly what allows you to use control flow statements in your model; **you can change the shape, size and operations at every iteration if needed.**

In [8]:
# Demonstrates that backward() ADDS into a leaf's .grad instead of overwriting
# it: calling it twice doubles the stored gradient until .grad is zeroed.
inp = torch.eye(4, 5, requires_grad=True)
print(inp)
out = (inp+1).pow(2).t()

# out is not a scalar, so backward() needs an explicit gradient tensor of the
# same shape; retain_graph=True keeps the graph usable for the repeated calls.
out.backward(torch.ones_like(out), retain_graph=True)
print(f"First call\n{inp.grad}")

# Second pass over the same graph: the new gradient is added to the first.
out.backward(torch.ones_like(out), retain_graph=True)
print(f"Second call\n{inp.grad}")

# Reset the accumulated gradient in place; the next call matches the first.
inp.grad.zero_()
out.backward(torch.ones_like(out), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.]], requires_grad=True)
First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
First call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])
