# Gradient Calculatiosn with Pytorch

This is based on https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html

In [1]:
import torch

In [16]:
# N - Batch size
# D_in - Dimension input layer
# H - Hidden layer dimension
# D_out - Dimension output layer
N, D_in, H, D_out = 64, 1000, 100, 10

In [18]:
# Randonly initialize Input and target tensors
x = torch.randn(N, D_in, dtype=torch.float)
y = torch.randn(N, D_out, dtype=torch.float)

In [17]:
# Randonly initialize Weights
w1 = torch.randn(D_in, H, dtype=torch.float, requires_grad=True)
w2 = torch.randn(H, D_out, dtype=torch.float, requires_grad=True)

In [19]:
# Take a look at the weights.
print("w1: \n", w1)
print("w1 grad: \n", w1.grad)

w1: 
 tensor([[ 0.5707,  1.6557, -1.9721,  ..., -0.1041, -1.1784,  1.3585],
        [ 0.1208,  0.2422, -1.3279,  ...,  0.7340,  0.1400,  0.3644],
        [ 1.2201,  0.5085, -0.5272,  ..., -1.1704, -1.4522,  0.4710],
        ...,
        [-0.8088, -0.3297, -0.2810,  ..., -0.9297, -0.7952,  1.0705],
        [-0.4711,  0.6621,  0.6727,  ..., -1.4358,  1.4293, -0.9453],
        [ 0.6332, -1.3893, -0.6308,  ..., -0.8311, -0.4963,  0.0880]],
       requires_grad=True)
w1 grad: 
 None


### Observation
Notice no gradient is calculated for w1 - Expected

## Simulate One Epoch

In [27]:
# Simulate an epoch
y_pred_temp = x.mm(w1)
y_pred = y_pred_temp.mm(w2)

print("y_pred_temp's 0th element:\n ", y_pred[0])
print("\ny_pred (Final Result)'s 0th element:\n ", y_pred[0])

y_pred_temp's 0th element:
  tensor([  76.9237,  390.6676,  411.7830,  165.1798, 1183.3716,  147.9824,
        -513.4570,   82.7567,    2.8554, -453.8504], grad_fn=<SelectBackward>)

y_pred (Final Result)'s 0th element:
  tensor([  76.9237,  390.6676,  411.7830,  165.1798, 1183.3716,  147.9824,
        -513.4570,   82.7567,    2.8554, -453.8504], grad_fn=<SelectBackward>)


In [31]:
# Gradient Function and Gradients for y_pred and y_pred_temp
print ("Gradient Function for y_pred:\n", y_pred.grad_fn)
print ("Gradient for y_pred:\n", y_pred.grad)

print("\n===\n")

print ("Gradient Function for y_pred_temp:\n", y_pred_temp.grad_fn)
print ("Gradient for y_pred_temp:\n", y_pred_temp.grad)

Gradient Function for y_pred:
 <MmBackward object at 0x10cfec5c0>
Gradient for y_pred:
 None

===

Gradient Function for y_pred_temp:
 <MmBackward object at 0x10cfec390>
Gradient for y_pred_temp:
 None


## Observation:
We only expect gradient functions to be defined. <br>
Because right when a tensor is created due to an operation, the gradient function is saved to create gradients at a future time.<br>
That future time is when <b>"backward()"</b> is called. <br><u>At that time, he gradients are calculated based on the gradient functions.</u>

In [33]:
# Let's check for grad function and gradient for weights.
# Expectation: Since these no operation done to create the weights (we just assigned values), we dont expect to see any gradient function.
#           Hence, gradients can also be created when due to a backward() function or back propagation from associated values 
print ("Gradient function for w1:\n", w1.grad_fn)
print ("No Gradient calculated for w1:\n", w1.grad)

Gradient function for w1:
 None
No Gradient calculated for w1:
 None


# Can we do back propagation before we calculate loss ?
Can I say I want to reduce all the values in the output vector?

In [34]:
y_pred.backward() # Expect to error out. Reason provided below.

RuntimeError: grad can be implicitly created only for scalar outputs

## Reason:

when you do loss.backward(), it is a shortcut for loss.backward(torch.Tensor([1])). This in only valid if loss is a tensor containing a single element.
DataParallel returns to you the partial loss that was computed on each gpu, so you usually want to do loss.backward(torch.Tensor([1, 1])) or loss.sum().backward(). Both will have the exact same behaviour.

source: https://discuss.pytorch.org/t/loss-backward-raises-error-grad-can-be-implicitly-created-only-for-scalar-outputs/12152

# Side Question's Answer

We need to calculate loss and then call backward() to start calculating gradients for each layer in computation graph.

In [35]:
loss = (y_pred - y).pow(2).sum()
print(loss)

tensor(64490872., grad_fn=<SumBackward0>)


In [36]:
w1.grad

In [37]:
loss.backward()

In [38]:
w1.grad

tensor([[  4420.6182,  13158.3086, -17610.3320,  ..., -11544.2861,
          -3195.0935,   7218.5684],
        [ 10877.6670,  -4241.4419, -11119.0107,  ...,   5404.6465,
           4060.0625,   1923.6958],
        [ 12294.5137, -10381.8799,  23304.0098,  ...,  -9609.0498,
          -5106.9688,  29908.2461],
        ...,
        [ 12295.5039, -14284.3213,   5180.2925,  ...,   8553.3086,
          -8581.9355,   2029.7241],
        [ 15299.3740,  11083.9980, -10776.3535,  ...,  -7479.1006,
           9648.5254,  -1739.0972],
        [ 14823.6123,   7228.9502,  16855.3223,  ...,  -1710.9236,
          28023.5820, -10215.4346]])

## Computation Graph Disposal
Once the backward() is called and gradient calculated, the computation graph is disposed, to save memory!

In [40]:
loss.backward()

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

# How to use the calculated gradients?

Using Optimizers, the gradients will be used to modify the weights.

`optimizer = optim.Adam(model.parameters(), lr=0.003)`