In [1]:
import torch

In [2]:
x = torch.randn(3, requires_grad=True)
x

tensor([-0.4654, -0.4551,  1.0197], requires_grad=True)

In [3]:
# Now we want to calculate gradient of some function with respect to x
y = x+2
print(y)

tensor([1.5346, 1.5449, 3.0197], grad_fn=<AddBackward0>)


![image-2.png](attachment:image-2.png)

In [4]:
z = y*y*2
print(z)

tensor([ 4.7098,  4.7737, 18.2373], grad_fn=<MulBackward0>)


In [5]:
z = z.mean()
print(z)

tensor(9.2403, grad_fn=<MeanBackward0>)


In [8]:
z.backward() # This will calculate gradients of "z" with respect to "x" (dz/dx = dz/dy . dy/dx)
print(x.grad) # Printing gradients of x

tensor([2.0461, 2.0599, 4.0263])


In [12]:
# gradients were calculated because "z" was scalar.
# We'll get an error if "z" is not scalar but vector
z = y*y*2
print(z)
z.backward()
print(x.grad)

tensor([15.5707,  0.7798,  0.0228], grad_fn=<MulBackward0>)


RuntimeError: grad can be implicitly created only for scalar outputs

In [13]:
# To resolve this problem
v = torch.tensor([0.1, 1.0, 0.001], dtype=torch.float32)
z.backward(v) # dz/dx
print(x.grad)

tensor([5.9525, 5.8278, 0.1431])


In [15]:
# When we train our network and updating our weights in each loop, we have to prevent
# PyTorch to calculate the history of grad_fn function.

# First Option
# x.requires_grad_(False) # _ means inplace modification

# Second Option
# x = x.detach()

# Third Option
# with torch.no_grad(): Operations

In [12]:
x = torch.randn(3, requires_grad=True)
print(x)
x.requires_grad_(False)
print(x)

tensor([-0.9276, -0.2752, -0.9621], requires_grad=True)
tensor([-0.9276, -0.2752, -0.9621])


In [14]:
x = torch.randn(3, requires_grad=True)
print(x)
x = x.detach()
print(x)

tensor([-0.3202,  0.4847,  0.9530], requires_grad=True)
tensor([-0.3202,  0.4847,  0.9530])


In [20]:
x = torch.randn(3, requires_grad=True)
print(x)
with torch.no_grad():
    y = x+2
    print(y)

tensor([ 1.2196, -2.1850,  1.2200], requires_grad=True)
tensor([ 3.2196, -0.1850,  3.2200])


### Dummy Training Example

In [22]:
weights =  torch.ones(4, requires_grad=True)

# All the values are being accumulated in iterative process
for epoch in range(3):
    model_output = (weights * 3).sum()
    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [24]:
# Solution
weights =  torch.ones(4, requires_grad=True)

# All the values are being accumulated in iterative process
for epoch in range(3):
    model_output = (weights * 3).sum()
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


### When you'll use optimizer, you will zero the gradients of optimizer since your weights will be updated/used by optimizer

In [30]:
W =  torch.ones(4, requires_grad=True)
optimizer = torch.optim.SGD([W], lr=0.01)
optimizer.step()
optimizer.zero_grad()