In [1]:
import torch
from torch import nn

## LeNet-5
> 99.2% acc, 5/6 layers

* INPUT (32*32) -\[convolutions\]-> C1: feature maps (6@28*28) -\[subsampling\]-> S2: feature maps (6@14*14) -\[convolutions\]-> C3: feature maps (16@10*10) -\[subsampling\]-> S4: feature maps (16@5*5) -\[full connection\]-> C5: layer(120) -\[full connection\]-> F6: layer (84) -\[Gaussian connection\]-> OUTPUT (10)

## AlexNet
> GTX 580 (3GB*2), 11*11, 8 layers

* Similar framework to LeNet but:
    * Max pooling, ReLU nonlinearity
    * More data and bigger model (7 hidden layers, 650K units, 60M parameters)
    * GPU implementation (50x speedup over CPU) - Trained on two GPUs for a week
    * Dropout regularization
    > A. Krizhevsky, I. Sutskever, and G. Hinton

## VGG
> 3x3, 1x1, 11-19 layers

* Sequence of deeper networks trained progressively
* Large receptive fields replaced by successive layers of 3*3 convolutions (with ReLU in between)
* One 7*7 convolution layer with C feature maps needs 49C^2 weights, whereas three stacked 3*3 convolution layers need only 27C^2 weights
* Experimented with 1*1 convolutions
>> K. Simonyan and A. Zisserman

## GoogLeNet
> 1st in 2014 ILSVRC, 22 layers

* Previous layer -{1*1 convolutions, 3*3 convolutions, 5*5 convolutions, 3*3 max pooling}-> Filter concatenation
>> C.Szegedy et al.

### Stack more layers?
> CIFAR-10 experiments

## ResNet
> 152 layers, ILSVRC 2015

* The residual module
    * Introduce skip or shortcut connections (existing before in various forms in literature)
    * Make it easy for network layers to represent the identity mapping
    * For some reason, need to skip at least two layers
    
>> Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun

* Deeper residual module (bottleneck)
    * Directly performing 3*3 convolutions with 256 feature maps at input and output: 256*256*3*3 ~ 600K operations
    * Using 1*1 convolutions to reduce 256 to 64 feature maps, followed by 3*3 convolutions, followed by 1*1 convolutions to expand back to 256 maps: 256*64*1*1 ~ 16K, 64*64*3*3 ~ 36K, 64*256*1*1 ~ 16K, total: ~70K

### DenseNet
* More complicated version of ResNet
* Introduce skip or shortcut connections between every layer

In [None]:
# ResNet
class ResBlk(nn.Module):
    """Residual block: two 3x3 conv + batch-norm stages plus a shortcut branch.

    The main branch computes ``bn2(conv2(relu(bn1(conv1(x)))))``. The shortcut
    branch (``self.extra``) is an empty ``nn.Sequential`` (i.e. the identity)
    when ``ch_in == ch_out``; otherwise it is a 1x1 convolution followed by
    batch norm so that the channel count matches before the two branches are
    summed.

    Args:
        ch_in: number of channels of the input feature map.
        ch_out: number of channels of the output feature map.
    """

    def __init__(self, ch_in, ch_out):
        # nn.Module must be initialised before any submodule is assigned,
        # otherwise attribute registration fails.
        super().__init__()

        # kernel 3 / stride 1 / padding 1 keeps the spatial size unchanged.
        self.conv1 = nn.Conv2d(ch_in, ch_out, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(ch_out)
        self.conv2 = nn.Conv2d(ch_out, ch_out, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(ch_out)

        # An empty Sequential returns its input unchanged (identity shortcut).
        self.extra = nn.Sequential()
        if ch_out != ch_in:
            # [b, ch_in, h, w] => [b, ch_out, h, w]
            self.extra = nn.Sequential(
                nn.Conv2d(ch_in, ch_out, kernel_size=1, stride=1),
                nn.BatchNorm2d(ch_out),
            )

    def forward(self, x):
        """Return ``shortcut(x) + main_branch(x)``.

        Args:
            x: input tensor of shape ``[b, ch_in, h, w]``.

        Returns:
            Tensor of shape ``[b, ch_out, h, w]``.
        """
        out = nn.functional.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Element-wise sum of the shortcut branch and the main branch.
        out = self.extra(x) + out
        return out


### nn.Module
* Magic
    * Every layer is nn.Module
    > nn.Linear, nn.BatchNorm2d, nn.Conv2d

    * nn.Module nested in nn.Module
* 1. built-in layers
> Linear, ReLU, Sigmoid, Conv2d, ConvTranspose2d, Dropout, etc

* 2. container
> net(x), nn.Sequential()

* 3. parameters
> .parameters()

* 4. modules
    * modules: all nodes
    * children: direct children

* 5. to(device)

In [None]:
# nn.Module
class MyLinear(nn.Module):
    """Hand-written affine layer computing ``x @ w.T + b``.

    Args:
        inp: size of the last dimension of the input.
        outp: size of the last dimension of the output.
    """

    def __init__(self, inp, outp):
        super().__init__()

        # Wrapping a tensor in nn.Parameter registers it with the module and
        # sets requires_grad=True, so it is returned by .parameters().
        initial_weight = torch.randn(outp, inp)
        initial_bias = torch.randn(outp)
        self.w = nn.Parameter(initial_weight)
        self.b = nn.Parameter(initial_bias)

    def forward(self, x):
        """Apply the affine map to ``x`` and return the result."""
        projected = x @ self.w.t()
        return projected + self.b

In [None]:
# container
# NOTE(review): `self` only exists inside a method, so this fragment presumably
# belongs in an nn.Module.__init__ -- confirm before running the cell as-is.
self.net = nn.Sequential(
    # Stage 1: 1 -> 32 channels, then 2x2 max-pooling.
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=5, stride=1, padding=1),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.ReLU(inplace=True),
    nn.BatchNorm2d(num_features=32),

    # Stage 2: 32 -> 64 channels, no pooling.
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
    nn.ReLU(inplace=True),
    nn.BatchNorm2d(num_features=64),

    # Stage 3: 64 -> 64 channels, then 2x2 max-pooling.
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.ReLU(inplace=True),
    nn.BatchNorm2d(num_features=64),

    # Stage 4: 64 -> 128 channels, no pooling.
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
    nn.ReLU(inplace=True),
    nn.BatchNorm2d(num_features=128),
)

In [2]:
# parameters: .parameters() yields the learnable tensors in registration order
net = torch.nn.Sequential(
    torch.nn.Linear(4, 2),
    torch.nn.Linear(2, 2),
)
# The first parameter is the weight of the first Linear layer.
next(net.parameters()).shape

torch.Size([2, 4])

In [4]:
# First (name, parameter) pair yielded by the container.
next(net.named_parameters())

('0.weight',
 Parameter containing:
 tensor([[-0.0526,  0.1836, -0.2847,  0.4368],
         [ 0.4340, -0.0786,  0.3293,  0.2472]], requires_grad=True))

In [6]:
# Collect every (name, parameter) pair into a dict and show its items.
named_params = dict(net.named_parameters())
named_params.items()

dict_items([('0.weight', Parameter containing:
tensor([[-0.0526,  0.1836, -0.2847,  0.4368],
        [ 0.4340, -0.0786,  0.3293,  0.2472]], requires_grad=True)), ('0.bias', Parameter containing:
tensor([-0.3175,  0.2888], requires_grad=True)), ('1.weight', Parameter containing:
tensor([[ 0.1183, -0.6117],
        [-0.2555,  0.4316]], requires_grad=True)), ('1.bias', Parameter containing:
tensor([0.0416, 0.2182], requires_grad=True))])

In [2]:
# modules
class BasicNet(nn.Module):
    """Minimal module wrapping a single ``nn.Linear(4, 3)`` layer."""

    def __init__(self):
        super().__init__()
        # Assigning a module as an attribute registers it as a child named "net".
        self.net = nn.Linear(4, 3)

    def forward(self, x):
        """Pass ``x`` through the wrapped linear layer."""
        result = self.net(x)
        return result

class Net(nn.Module):
    """Container module: ``BasicNet`` -> ``ReLU`` -> ``Linear(3, 2)``."""

    def __init__(self):
        super().__init__()
        # Build the stages first, then hand them to Sequential in order.
        stages = [BasicNet(), nn.ReLU(), nn.Linear(3, 2)]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the sequential pipeline."""
        result = self.net(x)
        return result

In [29]:
m = Net()
every_module = list(m.modules())      # m itself plus every nested submodule
direct_children = list(m.children())  # only the immediate submodules of m
print(every_module)
print('--------------------------------------------')
print(direct_children)

[Net(
  (net): Sequential(
    (0): BasicNet(
      (net): Linear(in_features=4, out_features=3, bias=True)
    )
    (1): ReLU()
    (2): Linear(in_features=3, out_features=2, bias=True)
  )
), Sequential(
  (0): BasicNet(
    (net): Linear(in_features=4, out_features=3, bias=True)
  )
  (1): ReLU()
  (2): Linear(in_features=3, out_features=2, bias=True)
), BasicNet(
  (net): Linear(in_features=4, out_features=3, bias=True)
), Linear(in_features=4, out_features=3, bias=True), ReLU(), Linear(in_features=3, out_features=2, bias=True)]
--------------------------------------------
[Sequential(
  (0): BasicNet(
    (net): Linear(in_features=4, out_features=3, bias=True)
  )
  (1): ReLU()
  (2): Linear(in_features=3, out_features=2, bias=True)
)]
