# Gradients in PyTorch



In [None]:
import torch

# Leaf tensor: requires_grad=True asks autograd to track operations on it.
x = torch.tensor([2, 1.2, 0, 4], requires_grad=True)

# Forward pass: square element-wise, then reduce to a scalar.
y = x * x
z = torch.sum(y)

print("x =", x)
print("x.grad = ", x.grad)  # still None: backward() has not been called yet

# Backward pass from the scalar z.
z.backward()
# y is an intermediate (non-leaf) tensor, so reading y.grad triggers a warning.
print("y: ", y, "y.grad = ", y.grad)
print("x: ", x, "x.grad = ", x.grad)


loss.backward() computes dloss/dx for every parameter x which has requires_grad=True.

These are accumulated into x.grad for every parameter x.
In pseudo-code:  x.grad += dloss/dx

z = loss

We want dz/dx

dz/dx = dz/dy * dy/dx

z = y0 + y1 + y2 + y3

dz/dy = [1, 1, 1, 1], because z (the loss) is the plain sum of the elements of y

y = x^2

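dy/dx = 2x

Therefore dz/dx = dz/dy * dy/dx = 1 * 2x = 2x, which is what x.grad shows above: for x = [2, 1.2, 0, 4] the gradient is [4, 2.4, 0, 8].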

In [None]:
# Install torchviz, the package that `make_dot` is imported from further below.
# The leading "!" is an IPython shell escape, so this line only works in a notebook.
!pip install torchviz

In [None]:
from torchviz import make_dot

# Rebuild the forward pass so that z carries a fresh autograd graph to draw.
y = x * x
z = torch.sum(y)

# Label the graph nodes with the tensor names used in this notebook.
_node_labels = {'x': x, 'y': y, 'z': z}
# Kept as the last expression of the cell so the notebook renders the graph.
make_dot(z, params=_node_labels, show_saved=True)

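## Let's call backward a second time

Calling `backward()` again on the same graph produces an error: the buffers saved for the backward pass are freed after the first call unless `retain_graph=True` is passed.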


In [None]:
# Demonstrate that a graph can be back-propagated only once by default.
# z may have been rebuilt since the last backward() call, in which case a
# single call would succeed silently. Calling it twice guarantees that the
# second traversal hits the already-freed graph.
try:
    z.backward()  # succeeds on a fresh graph and frees its saved tensors
    z.backward()  # fails: the saved tensors are gone
except RuntimeError as e:
    # PyTorch reports a second traversal of a freed graph as RuntimeError.
    print("ERROR")
    print(e)

In [None]:
x = torch.tensor([2, 1.2, 0, 4], requires_grad=True)
print("X = ", x)

y = x * x
# Ask autograd to keep the gradient of this intermediate tensor after backward.
y.retain_grad()
z = torch.sum(y)

print("z (loss) =", z)
print("y =", y)
print("x =", x)
print("dz/dx ", x.grad)  # None: nothing has been back-propagated yet

# Two passes over the same graph. Only the first one has to retain it;
# the second one is allowed to free it.
for pass_no, keep_graph in ((1, True), (2, False)):
    print(f"========== Backprop {pass_no} ==============")
    z.backward(retain_graph=keep_graph)
    # Gradients are accumulated, so the second pass adds to the first.
    print("dz/dy ", y.grad)
    print("dz/dx ", x.grad)


# Optimization in PyTorch

## Manual

https://discuss.pytorch.org/t/leaf-variable-was-used-in-an-inplace-operation/308

In [None]:
# Manual gradient descent on w so that x * w approaches g_true (L1 loss).
x = torch.tensor([2, 1.2, -3, 4], requires_grad=True)
w = torch.tensor([0.1, -0.2, -0.1, 0.4], requires_grad=True)
g_true = torch.tensor([1, 1, 2, 3])
print("X = ", x)
print("W = ", w)

lr = 0.01  # step size of the update

for i in range(100):
    y = x * w
    loss = torch.sum(torch.abs(y - g_true))  # L1 loss; backward() needs a scalar
    loss.backward()
    if i == 0:
        print("dL/dW", w.grad)
    # The update itself must not be recorded by autograd. Inside no_grad the
    # leaf tensor can be modified in place, so w stays the same leaf object.
    with torch.no_grad():
        w -= lr * w.grad
        # Reset the gradient; otherwise the next backward() would add to it.
        w.grad.zero_()
    if i % 10 == 0:
        # loss and y were computed before this iteration's update; w is printed after it.
        print(f"Loss {loss.item()} W {w} y {y}")

## Via optimizer

In [None]:
import torch.optim as optim

# One optimizer step, printed stage by stage: backward -> step -> zero_grad.
x = torch.tensor([2, 1.2, -3, 4], requires_grad=True)
w = torch.tensor([0.1, -0.2, -0.1, 0.4], requires_grad=True)

print("X = ", x)
print("W = ", w)

y = x * w
L = y.sum()  # loss stub

# Before backward() there are no gradients to show yet.
print("dL/dy ", y.grad)
print("dL/dx ", x.grad)

optimizer = optim.SGD([w], lr=0.01)  # only w is handed to the optimizer
L.backward()
print("========== Backprop 1 ==============")
print("dL/dy ", y.grad, "  - because y is not a leaf")
print("dL/dx ", x.grad)
print("dL/dw ", w.grad)
print("========== Optimize with LR = 0.01 ==============")
optimizer.step()
# x was not passed to the optimizer, so only w changes here.
print("X = ", x)
print("W = W-lr*(dL/dw) = ", w)
print("========== Zero grad ==============")
optimizer.zero_grad()
# zero_grad() only touches tensors the optimizer was given, i.e. w and not x.
print("dL/dx ", x.grad)
print("dL/dw ", w.grad, " after zero_grad call")



## Typical training loop

In [None]:
# Canonical loop: forward -> loss -> backward -> optimizer.step -> zero_grad.
x = torch.tensor([2, 1.2, -3, 4], requires_grad=True)
w = torch.tensor([0.1, -0.2, -0.01, 0.4], requires_grad=True)
# Target values, defined here so the cell does not depend on state left over
# from an earlier cell.
g_true = torch.tensor([1, 1, 2, 3])
print("X = ", x)
print("W = ", w)

optimizer = optim.SGD([w], lr=0.01)  # only w is optimized

for i in range(100):
    y = x * w
    loss = torch.sum(torch.abs(y - g_true))  # L1 loss; backward() needs a scalar
    loss.backward()
    optimizer.step()
    # Clear the gradient so the next backward() starts from scratch.
    optimizer.zero_grad()
    if i % 10 == 0:
        # loss and y were computed before this iteration's step; w is printed after it.
        print(f"Loss {loss.item()} W {w} y {y}")
