In [1]:
import torch 
import string
import sympy as sp

In [2]:
" ".join(string.ascii_letters[:13]),",".join(string.ascii_letters[:13])

('a b c d e f g h i j k l m', 'a,b,c,d,e,f,g,h,i,j,k,l,m')

In [3]:
# Bind thirteen SymPy symbols to the Python names a..m.
# NOTE(review): later cells rebind some of these names (e.g. `e = c * d`), so
# this cell has to be re-run to get the bare symbols back.
a,b,c,d,e,f,g,h,i,j,k,l,m = sp.symbols('a b c d e f g h i j k l m')

In [4]:
def print_grad_fn(a: torch.Tensor, name: str):
    """Print the name of the autograd node that produced ``a``.

    Args:
        a: Tensor whose ``grad_fn`` is inspected.
        name: Label used for the tensor in the printed line.

    The printed line has the form ``grad fn of <name> : <node name>``. When
    ``a.grad_fn`` is falsy (``None`` in the outputs of this notebook), the
    ``grad_fn`` value itself is printed instead of a node name.
    """
    node = a.grad_fn
    shown = node.name() if node else node
    print(f"grad fn of {name} : {shown}")

In [5]:
# Two scalar tensors built straight from Python floats. No requires_grad is
# passed, and the output below shows that neither has a grad_fn.
a_tensor, b_tensor = torch.tensor(2.0), torch.tensor(2.0)

print_grad_fn(a_tensor, name="a")
print_grad_fn(b_tensor, name="b")

grad fn of a : None
grad fn of b : None


In [6]:
# c is computed from two tensors that do not track gradients, then flagged as
# requiring grad afterwards (in place). As the output shows, its grad_fn is
# still None.
c_tensor = (a_tensor * b_tensor).requires_grad_(True)
print_grad_fn(c_tensor, name="c")

grad fn of c : None


In [7]:
# Tensor side: d is another plain tensor; e is the product of c_tensor (which
# requires grad) and d_tensor, and the output shows it carries a grad_fn.
d_tensor = torch.tensor(2.0)
e_tensor = c_tensor * d_tensor

# Symbolic side: rebinds the name `e` from a bare symbol to the expression c*d.
e = c * d

print_grad_fn(d_tensor, name="d")
print_grad_fn(e_tensor, name="e")

grad fn of d : None
grad fn of e : MulBackward0


In [8]:
# Display the SymPy expression bound to `e` in the previous cell.
e

c*d

# Why is only the value of `d` saved, and not `c`?
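1. Only `C` has `requires_grad = True`, and it is also a leaf.
2. By default, gradients are accumulated only on leaf tensors.
3. To keep the gradient of a non-leaf node of the graph, call `retain_grad()` on it.
4. Differentiating `E` with respect to `C` gives `D` (see the SymPy check below).
5. That is why `D` is stored in `_saved_other`: it is exactly the value needed to compute the gradient for `C`. `C` itself would only be needed for the gradient with respect to `D`, and `D` does not require grad.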

In [9]:
# Identity check: the multiplication node keeps the very same tensor object as
# d_tensor in `_saved_other`. `is` compares the two objects directly while both
# are alive, which is the idiomatic form of comparing their id() values.
e_tensor.grad_fn._saved_other is d_tensor

True

In [10]:
# Symbolic check: derivative of e = c*d with respect to c.
sp.diff(e,c)

d

When we call `backward()` on `E`, the gradient is accumulated in `C.grad`, which is nothing but the value of `D`.

In [11]:
# One entry per input of the multiplication (c_tensor, d_tensor): the output
# below shows an AccumulateGrad node for the first and None for the second.
e_tensor.grad_fn.next_functions

((<AccumulateGrad at 0x7ff25823d120>, 0), (None, 0))

In [12]:
# c_tensor.grad is None until backward() runs; afterwards it holds de/dc,
# which equals the value of d_tensor (2.0).
print("Before e.backward() : ",c_tensor.grad)
e_tensor.backward()
print("After  e.backward() : ",c_tensor.grad)

Before e.backward() :  None
After  e.backward() :  tensor(2.)


# Why do only leaf tensors get a gradient by default?

All parameters in a model are initialized directly; they are **NOT** created by operations on other tensors, so they are the leaves of the graph. That is why gradients are accumulated only on leaf tensors by default.

In [13]:
# f is another plain tensor; g is computed from e_tensor, which already has a
# grad_fn, and the output shows that g gets a grad_fn of its own.
f_tensor = torch.tensor(2.0)
g_tensor = e_tensor * f_tensor

print_grad_fn(f_tensor, name="f")
print_grad_fn(g_tensor, name="g")

grad fn of f : None
grad fn of g : MulBackward0


In [14]:
g_tensor.grad_fn.next_functions

# Entry 0 (input e) -> a MulBackward0 node, the one that produced e_tensor.
# Entry 1 (input f) -> None: there is no function to call for f.

((<MulBackward0 at 0x7ff1652bb340>, 0), (None, 0))

In [15]:
# `is` keeps both node objects alive while they are compared. Comparing the
# id() values of two temporaries can report a match merely because the first
# object was freed and its address reused by the second.
g_tensor.grad_fn.next_functions[0][0] is e_tensor.grad_fn

# Why is there no AccumulateGrad node for e, even though it requires grad?
# Because by default only leaf tensors get their gradient accumulated in .grad.

True

# Grad Accumulation

In [49]:
# Fresh leaves that track gradients this time, and their sum.
a_tensor = torch.tensor(2.0, requires_grad=True)
b_tensor = torch.tensor(2.0, requires_grad=True)
c_tensor = torch.add(a_tensor, b_tensor)


In [50]:
# c_tensor was produced by an addition; its repr shows the AddBackward0 node.
c_tensor

tensor(4., grad_fn=<AddBackward0>)

In [51]:
# `_saved_alpha` is the alpha value stored on the addition node (the scalar
# that torch.add multiplies its second operand by); it is 1 for this plain sum.
c_tensor.grad_fn._saved_alpha

1

In [52]:
# Gradients are None before the first backward pass and 1 for both inputs of
# the sum afterwards. retain_graph=True keeps the graph for the next cells.
print(a_tensor.grad, b_tensor.grad)
c_tensor.backward(retain_graph=True)
print(a_tensor.grad, b_tensor.grad)

# Symbolic cross-check of c = a + b. The symbols are re-declared so the cell
# does not depend on earlier rebinding, and the sum gets its own name so the
# symbol `c` is not overwritten (other cells use `c` as a bare symbol).
a, b, c = sp.symbols("a b c")
c_expr = a + b
print(sp.diff(c_expr, a), sp.diff(c_expr, b))

None None
tensor(1.) tensor(1.)
1 1


In [53]:
# d = 5 * c = 5 * (a + b), so its gradient with respect to a and to b is 5.
d_tensor = 5 * c_tensor

print(a_tensor.grad,b_tensor.grad)
d_tensor.backward(retain_graph=True)
print(a_tensor.grad,b_tensor.grad)
# The new gradient (5) is added to the existing .grad (1) instead of replacing it: 1 + 5 = 6.

tensor(1.) tensor(1.)
tensor(6.) tensor(6.)


In [21]:
# Values stored on the two graph nodes: alpha of the addition, and the constant
# operand of the multiplication (the 5 in 5 * c_tensor).
c_tensor.grad_fn._saved_alpha,d_tensor.grad_fn._saved_other

(1, tensor(5))

In [22]:
# Symbolic cross-check of d = 5 * (a + b). The expression gets its own name so
# the symbol `d` is not overwritten (other cells use `d` as a bare symbol).
a, b, c, d = sp.symbols("a b c d")
d_expr = 5 * (a + b)
sp.diff(d_expr, a), sp.diff(d_expr, b)

(5, 5)

## Custom AutoGrad Function

In [23]:
class Exp(torch.autograd.Function):
    """Exponential implemented as a custom autograd function.

    Both passes also print the non-callable attributes of the context object
    so that its state can be compared before and after ``save_for_backward``.
    """

    @staticmethod
    def _print_ctx_state(ctx):
        """Print each non-dunder, non-callable attribute of ``ctx``.

        One line per attribute, formatted as the attribute name left-aligned
        in 40 columns, a colon, then the value.

        NOTE(review): the recorded outputs end with an echoed source line of
        the attribute filter, which looks like a warning raised while reading
        one of the attributes (possibly the ``saved_variables`` alias) — confirm.
        """
        for attr in dir(ctx):
            if attr.startswith("__"):
                continue
            value = getattr(ctx, attr, None)
            if not callable(value):
                print(f"{attr:<40s}:{value}")

    @staticmethod
    def forward(ctx, i):
        """Return ``i.exp()`` and save the result for the backward pass."""
        print("Calling forward")
        Exp._print_ctx_state(ctx)
        result = i.exp()
        # d/dx exp(x) = exp(x), so the output is all backward needs.
        ctx.save_for_backward(result)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        """Return the incoming gradient scaled by the saved forward result."""
        print("Calling backward")
        Exp._print_ctx_state(ctx)
        result, = ctx.saved_tensors
        return grad_output * result

In [24]:
# The custom Function is invoked through .apply(); the forward pass runs here
# and prints the context attributes listed below.
in_tensor = torch.tensor(2.0,requires_grad=True)
out_tensor = Exp.apply(in_tensor)

Calling forward
_materialize_non_diff_grads             :True
_raw_saved_tensors                      :()
dirty_tensors                           :None
materialize_grads                       :None
metadata                                :{}
needs_input_grad                        :(True,)
next_functions                          :((<AccumulateGrad object at 0x7ff1652ba9e0>, 0),)
non_differentiable                      :None
requires_grad                           :True
saved_for_forward                       :None
saved_tensors                           :()
saved_variables                         :()
to_save                                 :None


  if not attr.startswith("__") and not callable((a:=getattr(ctx,attr,None))):


In [25]:
# backward() runs Exp.backward (see "Calling backward" in the output); the
# gradient stored on in_tensor is 7.3891, the same value as the saved forward result.
print("Before out.backward() : ",in_tensor.grad)
out_tensor.backward()
print("After  out.backward() : ",in_tensor.grad)

Before out.backward() :  None
Calling backward
_materialize_non_diff_grads             :True
_raw_saved_tensors                      :(<torch._C._autograd.SavedTensor object at 0x7ff25a3c72f0>,)
dirty_tensors                           :None
materialize_grads                       :None
metadata                                :{}
needs_input_grad                        :(True,)
next_functions                          :((<AccumulateGrad object at 0x7ff25823cc10>, 0),)
non_differentiable                      :None
requires_grad                           :True
saved_for_forward                       :None
saved_tensors                           :(tensor(7.3891, grad_fn=<ExpBackward>),)
saved_variables                         :(tensor(7.3891, grad_fn=<ExpBackward>),)
to_save                                 :None
After  out.backward() :  tensor(7.3891)


  if not attr.startswith("__") and not callable((a:=getattr(ctx,attr,None))):


In [61]:
# Fresh leaves, so the gradients start from None again.
a_tensor = torch.tensor(2.0, requires_grad=True)
b_tensor = torch.tensor(2.0, requires_grad=True)


class Addition(torch.autograd.Function):
    """Custom autograd function that adds its two inputs.

    The backward pass hands the incoming gradient unchanged to both inputs.
    """

    @staticmethod
    def forward(ctx, a, b):
        """Return the sum of ``a`` and ``b``; nothing is saved on ``ctx``."""
        total = a + b
        return total

    @staticmethod
    def backward(ctx, grad):
        """Return one gradient per forward input, in the order (a, b)."""
        return (grad, grad)


c_tensor = Addition.apply(a_tensor, b_tensor)

In [62]:
# The sum computed by the custom Function; its repr shows the AdditionBackward node.
c_tensor

tensor(4., grad_fn=<AdditionBackward>)

In [63]:
# Gradients are None before the first backward pass; afterwards each input of
# the custom addition holds 1. retain_graph=True keeps the graph for the next cell.
print(a_tensor.grad,b_tensor.grad)
c_tensor.backward(retain_graph=True)
print(a_tensor.grad,b_tensor.grad)

None None
tensor(1.) tensor(1.)


In [64]:
# d = 5 * c, so backward adds 5 to each existing gradient of 1, giving 6.
d_tensor = 5 * c_tensor

print(a_tensor.grad,b_tensor.grad)
d_tensor.backward(retain_graph=True)
print(a_tensor.grad,b_tensor.grad)

tensor(1.) tensor(1.)
tensor(6.) tensor(6.)
