In [None]:
#AUTOMATIC DIFFERENTIATION WITH TORCH.AUTOGRAD
#Backpropagation is used to train the network:
#parameters are adjusted according to the gradient of the loss
#with respect to the given parameter

# PyTorch has a built-in differentiation engine called torch.autograd.
# It supports automatic computation of gradients for
# any computational graph.


In [1]:
# Simplest possible one-layer network:
# input x, trainable parameters w and b, and a scalar loss.
import torch

x = torch.ones(5)   # input tensor
y = torch.zeros(3)  # expected output

# w and b are the parameters we want to optimize, so autograd must
# track them: they need requires_grad=True.
# The flag can be set when the tensor is created (as for w) or
# afterwards with the in-place requires_grad_() method (as for b).
w = torch.randn((5, 3), requires_grad=True)
b = torch.randn(3).requires_grad_()

# z is the prediction ("y hat") expressed as raw logits
z = x @ w + b

# binary cross-entropy computed directly on the logits
loss = torch.nn.functional.binary_cross_entropy_with_logits(input=z, target=y)

In [2]:
# Every operation applied to tracked tensors is recorded as an object
# of class Function. The function used for the backward pass is stored
# in the grad_fn property of the resulting tensor.
for label, node in (
    ('Gradient function for z =', z.grad_fn),
    ('Gradient function for loss =', loss.grad_fn),
):
    print(label, node)

Gradient function for z = <AddBackward0 object at 0x7f01f45c2f90>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x7f01f45c2f50>


In [3]:
# Calling loss.backward() computes d(loss)/d(param) for every tracked
# leaf and stores the result in that tensor's .grad attribute.
loss.backward()
for param in (w, b):
    print(param.grad)

# Note 1: the .grad property is only available for leaf nodes of the
# computational graph that have requires_grad set to True.
#
# Note 2: for performance reasons, backward can only be run once on a
# given graph. To make several backward calls on the same graph,
# pass retain_graph=True to the backward call.

tensor([[0.2241, 0.1198, 0.0173],
        [0.2241, 0.1198, 0.0173],
        [0.2241, 0.1198, 0.0173],
        [0.2241, 0.1198, 0.0173],
        [0.2241, 0.1198, 0.0173]])
tensor([0.2241, 0.1198, 0.0173])


In [5]:
# By default, every tensor with requires_grad=True tracks its
# computational history and supports gradient computation.
# Once the model is trained and we only want to apply it to input data
# (forward computations only), that tracking is not needed.

# Baseline: the result of a tracked computation requires grad.
z = x @ w + b
print(z.requires_grad)

# Option 1: wrap the computation in a torch.no_grad() block.
with torch.no_grad():
    z = x @ w + b
print(z.requires_grad)

# Option 2: call detach() on the result of a tracked computation.
z = x @ w + b
z_det = z.detach()
print(z_det.requires_grad)


True
False
False


### More on Computational Graph
Autograd keeps a record of data (tensors) and all executed operations (along with the resulting new tensors) in a directed acyclic graph (DAG) consisting of Function objects.

In this DAG, leaves are the input tensors, roots are the output tensors. By tracing this graph from roots to leaves, you can automatically compute the gradients using the chain rule.

In a forward pass, autograd does two things simultaneously:

- runs the requested operation to compute a resulting tensor
- maintains the operation’s gradient function in the DAG.

The backward pass kicks off when `.backward()` is called on the DAG root. Autograd then:

- computes the gradients from each `.grad_fn`,
- accumulates them in the respective tensor’s `.grad` attribute,
- using the chain rule, propagates all the way to the leaf tensors.
    
DAGs are dynamic in PyTorch: the graph is recreated from scratch, so after each .backward() call, autograd starts populating a new graph. This is exactly what allows you to use control flow statements in your model; you can change the shape, size and operations at every iteration if needed.

In [6]:
# Tensor gradients and Jacobian products
# When the output is an arbitrary (non-scalar) tensor, PyTorch does not
# build the full Jacobian matrix J. Instead it computes the
# vector-Jacobian product v^T . J for a given vector v.
# This is done by passing v as the argument to backward(); v must have
# the same shape as the tensor backward() is called on.
inp = torch.eye(5, requires_grad=True)
out = (inp + 1) ** 2

# the vector v of the vector-Jacobian product: all ones, shaped like out
v = torch.ones_like(inp)

out.backward(v, retain_graph=True)
print("First call\n", inp.grad)

# Gradients accumulate in .grad, so a second call adds to the first.
out.backward(v, retain_graph=True)
print("\nSecond call\n", inp.grad)

# To get the proper gradients again, zero out .grad before calling
# backward. In real-life training an optimizer helps us do this.
inp.grad.zero_()
out.backward(v, retain_graph=True)
print("\nCall after zeroing gradients\n", inp.grad)

# Note: earlier, backward() was called without arguments. That is
# essentially equivalent to backward(torch.tensor(1.0)), which is a
# convenient way to compute gradients of a scalar-valued function
# such as the loss during neural network training.

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]], requires_grad=True)
First call
 tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])

Second call
 tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.],
        [4., 4., 4., 4., 8.]])

Call after zeroing gradients
 tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.],
        [2., 2., 2., 2., 4.]])
