### What is the difference between `nn.ModuleList()` and `nn.Sequential()`?
- nn.Module/nn.ModuleList/nn.Sequential are all containers that we could add module in it 

### `nn.Module`
- torch docs: https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module
- Base class for all neural network modules 


In [19]:
# Construct a basic Network and add hook at forward
import torch 
import torch.nn as nn 
import torch.nn.functional as F 

class NetSample(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20,20,5)
        self.max_pool = nn.MaxPool2d(3,stride=1,padding=1)
    
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.max_pool(x)
        return x 

def hook_fn(module, input, output):
    """Ref:https://blog.paperspace.com/pytorch-hooks-gradient-clipping-debugging/"""
    print(module)
    print("-------------Module parameters----------------")
    neles = sum( [ param.nelement() for param in module.parameters()])
    print(neles)
    print("-------------Input Grad ----------------")

    for grad in input:
        try:
            print(grad.shape)
        except AttributeError:
            print("None found for Gradient")
    print("-------------Output Grad ----------------")
    for grad in output:
        try: 
            print(grad.shape)
        except AttributeError:
            print("None found for Gradient")
    print("\n")

net = NetSample()
# Iterate the childern to register the forward hook
for i in net.children():
    print(i.register_forward_hook(hook_fn))

input = torch.randn(1,1,224,224)
net(input).shape


<torch.utils.hooks.RemovableHandle object at 0x7fea613df990>
<torch.utils.hooks.RemovableHandle object at 0x7fea613df990>
<torch.utils.hooks.RemovableHandle object at 0x7fea613df990>
Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
-------------Module parameters----------------
520
-------------Input Grad ----------------
torch.Size([1, 1, 224, 224])
-------------Output Grad ----------------
torch.Size([20, 220, 220])


Conv2d(20, 20, kernel_size=(5, 5), stride=(1, 1))
-------------Module parameters----------------
10020
-------------Input Grad ----------------
torch.Size([1, 20, 220, 220])
-------------Output Grad ----------------
torch.Size([20, 216, 216])


MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
-------------Module parameters----------------
0
-------------Input Grad ----------------
torch.Size([1, 20, 216, 216])
-------------Output Grad ----------------
torch.Size([20, 216, 216])




torch.Size([1, 20, 216, 216])