In [1]:
import torch
from torch import nn

# A small MLP: 4 input features -> 8 hidden units (ReLU) -> 1 output.
net = nn.Sequential(
    nn.Linear(in_features=4, out_features=8),
    nn.ReLU(),
    nn.Linear(in_features=8, out_features=1),
)
# A minibatch of two random examples with four features each.
X = torch.rand(2, 4)
net(X)

tensor([[0.0271],
        [0.1779]], grad_fn=<AddmmBackward>)

In [7]:
# Parameters of a single layer can be accessed explicitly: here, the state
# dict (weight and bias) of the output layer at index 2 of the Sequential.
print(net[2].state_dict())

OrderedDict([('weight', tensor([[-0.1011,  0.1472, -0.2475, -0.2094,  0.0992, -0.2100, -0.0397, -0.2418]])), ('bias', tensor([0.2059]))])


In [3]:
# One per line: the class of the bias, the Parameter itself, and the plain
# tensor exposed through its .data attribute.
print(type(net[2].bias), net[2].bias, net[2].bias.data, sep='\n')

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.2059], requires_grad=True)
tensor([0.2059])


In [5]:
# The initial gradient value is None.
# Use the identity check `is None`: tensors overload `==`, so an equality
# comparison against None is not a reliable singleton test.
net[2].weight.grad is None

True

In [8]:
# (name, shape) pairs, first for the input layer alone and then for the whole
# network, where each name is prefixed with the index of its layer.
print(*((name, param.shape) for name, param in net[0].named_parameters()))
print(*((name, param.shape) for name, param in net.named_parameters()))

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [9]:
# Every parameter tensor of the network, keyed by "<layer index>.<name>".
net.state_dict()

OrderedDict([('0.weight',
              tensor([[ 0.4940, -0.2946,  0.2286,  0.0901],
                      [ 0.2484, -0.4816,  0.1595, -0.4982],
                      [ 0.0220, -0.1336,  0.0839,  0.3234],
                      [-0.4448, -0.2009,  0.1759,  0.3437],
                      [-0.0507, -0.1753,  0.0928, -0.4410],
                      [-0.3591, -0.1541,  0.4844, -0.2122],
                      [-0.2660, -0.3265, -0.4585, -0.4336],
                      [-0.0945, -0.2040, -0.0870, -0.4333]])),
             ('0.bias',
              tensor([ 0.4415,  0.3722, -0.4617,  0.2411, -0.1614, -0.0087,  0.0983,  0.2475])),
             ('2.weight',
              tensor([[-0.1011,  0.1472, -0.2475, -0.2094,  0.0992, -0.2100, -0.0397, -0.2418]])),
             ('2.bias', tensor([0.2059]))])

In [15]:
# Look up a single tensor by its qualified key; '0.weight' is the weight
# matrix of the layer at index 0.
net.state_dict()['0.weight'].data

tensor([[ 0.4940, -0.2946,  0.2286,  0.0901],
        [ 0.2484, -0.4816,  0.1595, -0.4982],
        [ 0.0220, -0.1336,  0.0839,  0.3234],
        [-0.4448, -0.2009,  0.1759,  0.3437],
        [-0.0507, -0.1753,  0.0928, -0.4410],
        [-0.3591, -0.1541,  0.4844, -0.2122],
        [-0.2660, -0.3265, -0.4585, -0.4336],
        [-0.0945, -0.2040, -0.0870, -0.4333]])

## 5.2.1.3. Collecting Parameters from Nested Blocks
Let us see how the parameter naming conventions work if we nest multiple blocks inside each other. For that we first define a function that produces blocks (a block factory, so to speak) and then combine these inside yet larger blocks.

In [17]:
def block1():
    """Build one MLP block: Linear(4 -> 8), ReLU, Linear(8 -> 4), ReLU.

    The block maps 4 features back to 4 features, so several copies can be
    chained one after another.

    Returns:
        A new ``nn.Sequential`` holding freshly constructed layers.
    """
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2():
    """Stack four separately constructed ``block1`` modules in one Sequential.

    The children are registered under the names 'block 0' through 'block 3'.

    Returns:
        The ``nn.Sequential`` container holding the four blocks.
    """
    stacked = nn.Sequential()
    for i in range(4):
        # Nested here: every iteration calls block1() again, so each child
        # gets its own layers.
        stacked.add_module(f'block {i}', block1())
    return stacked

# Put the four-block stack from block2() in front of a final 4 -> 1 linear
# layer, then run the batch X through the combined network.
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[0.2672],
        [0.2672]], grad_fn=<AddmmBackward>)

In [18]:
# Displaying the module shows its nested structure and the child names.
rgnet

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)

In [19]:
# Access nested parameters by chained indexing:
# rgnet[0] is the block2 stack, [1] its second child ('block 1'), and [0]
# that block's first Linear layer, whose bias we read here.
rgnet[0][1][0].bias.data

tensor([-0.2207, -0.1982,  0.2267,  0.0655,  0.2137,  0.4453, -0.0392,  0.0495])

## 5.2.2. Parameter Initialization
By default, PyTorch initializes weight and bias matrices uniformly by drawing from a range that is computed according to the input and output dimension. PyTorch’s nn.init module provides a variety of preset initialization methods.



In [24]:
def init_normal(m):
    """Re-initialize a fully connected layer in place.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 0.01; the bias is set to zero. A module whose exact type is
    not ``nn.Linear`` is left untouched.

    Args:
        m: the module to (possibly) initialize.
    """
    if type(m) == nn.Linear:
        weight, bias = m.weight, m.bias
        nn.init.normal_(weight, mean=0, std=0.01)
        nn.init.zeros_(bias)

# Fix the RNG seed so the random draw is repeatable, run init_normal over the
# network, then show the first weight row and first bias entry of layer 0.
torch.manual_seed(0)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0113, -0.0115, -0.0025, -0.0043]), tensor(0.))

Question: Why do we use `if type(m) == nn.Linear` rather than `if isinstance(m, nn.Linear)`?

In [26]:
def init_normal_(m):
    """Initialize linear layers in place, matching them with ``isinstance``.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation 0.01; the bias is set to zero. Because the check is
    ``isinstance``, subclasses of ``nn.Linear`` are initialized as well.
    Modules that are not linear layers are left untouched.

    Args:
        m: the module to (possibly) initialize.
    """
    if isinstance(m, nn.Linear):
        # NOTICE: we use in-place functions (trailing underscore) to
        # initialize the parameters
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
        
# Re-seed so the random draw matches the earlier init_normal run, then apply
# the isinstance-based initializer so the two versions are actually compared.
torch.manual_seed(0)
net.apply(init_normal_)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0113, -0.0115, -0.0025, -0.0043]), tensor(0.))

We can get the same result!

In [27]:
def init_constant(m):
    """Set every weight of an ``nn.Linear`` layer to 1 and its bias to 0.

    Modules of any other exact type are ignored.

    Args:
        m: the module to (possibly) initialize.
    """
    if type(m) != nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
    nn.init.zeros_(m.bias)

# Run the constant initializer over the network and inspect layer 0.
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

We can also apply different initializers for certain blocks. For example, below we initialize the first layer with the Xavier initializer and initialize the second layer to a constant value of 42.

In [28]:
def xavier(m):
    """Apply Xavier uniform initialization to the weight of an ``nn.Linear``.

    The bias is not modified, and modules of any other exact type are ignored.

    Args:
        m: the module to (possibly) initialize.
    """
    if type(m) != nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)

def init_42(m):
    """Fill the weight of an ``nn.Linear`` layer with the constant 42.

    The bias is not modified, and modules of any other exact type are ignored.

    Args:
        m: the module to (possibly) initialize.
    """
    if type(m) != nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

# apply() can also be called on a single layer: module.apply(function).
# Layer 0 gets Xavier-uniform weights; layer 2 gets the constant 42.
net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([-0.3136, -0.0255,  0.4522,  0.7030])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


## 5.2.2.2. Custom Initialization
Sometimes, the initialization methods we need are not provided by the deep learning framework. In the example below, we define an initializer for any weight parameter $w$ using the following strange distribution:

$$
w \sim 
\begin{cases}
    U(5, 10) & \text{with probability} \ \frac{1}{4} \\
    0 & \text{with probability} \ \frac{1}{2} \\
    U(-10, -5) & \text{with probability} \ \frac{1}{4}
  \end{cases}
$$

In [35]:
# Self-implemented Customized Initialization
def custom_init(m):
    """Initialize an ``nn.Linear`` weight from the piecewise distribution.

    Each weight is first drawn from U(-10, 10); entries whose absolute value
    is below 5 are then multiplied by zero. The bias is not modified, and
    modules of any other exact type are ignored.

    Args:
        m: the module to (possibly) initialize.
    """
    if type(m) != nn.Linear:
        return
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the drawn value is kept.
    keep = m.weight.data.abs() >= 5
    m.weight.data *= keep

In [31]:
nn.init.uniform_?

Signature: nn.init.uniform_(tensor: torch.Tensor, a: float = 0.0, b: float = 1.0) -> torch.Tensor
Docstring:
Fills the input Tensor with values drawn from the uniform
distribution :math:`\mathcal{U}(a, b)`.

Args:
    tensor: an n-dimensional `torch.Tensor`
    a: the lower bound of the uniform distribution
    b: the upper bound of the uniform distribution

Examples:
    >>> w = torch.empty(3, 5)
    >>> nn.init.uniform_(w)
File:      ~/anaconda3/envs/torch182_py38/lib/python3.8/site-packages/torch/nn/init.py
Type:      function


In [36]:
# Book-Provided Customized Initialization
def my_init(m):
    """Log and initialize an ``nn.Linear`` layer with the custom distribution.

    Prints "Init" followed by the name and shape of the layer's first
    parameter, draws every weight from U(-10, 10), and then multiplies by
    zero the entries whose absolute value is below 5. The bias is not
    modified, and modules of any other exact type are ignored.

    Args:
        m: the module to (possibly) initialize.
    """
    if type(m) != nn.Linear:
        return
    first_name, first_param = list(m.named_parameters())[0]
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the drawn value is kept.
    large_enough = m.weight.data.abs() >= 5
    m.weight.data *= large_enough

# Run the custom initializer over the network, then show the first two weight
# rows of layer 0.
net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[-9.3195,  8.8849,  7.6036, -9.9753],
        [ 0.0000, -0.0000, -0.0000, -0.0000]], grad_fn=<SliceBackward>)

## 5.2.3. Tied Parameters
Often, we want to share parameters across multiple layers. Let us see how to do this elegantly. In the following we allocate a dense layer and then use its parameters specifically to set those of another layer.

In [44]:
# We need to give the shared layer a name so that we can refer to its
# parameters
shared = nn.Linear(8, 8)
# The same `shared` instance is placed at both index 2 and index 4.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared,
                    nn.ReLU(), nn.Linear(8, 1))
# Forward pass; the result is neither printed nor assigned.
net(X)

# Check whether the parameters are the same
print(net[2].weight.data[0] == net[4].weight.data[0])
# Modify one weight entry and one bias entry through index 2 only.
net[2].weight.data[0, 0] = 100
net[2].bias.data[7] = 100

# Make sure that they are actually the same object rather than just having the
# same value: the edit made through net[2] must also be visible through net[4].
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [45]:
# State of the shared layer after the in-place edits made through net[2].
shared.state_dict()

OrderedDict([('weight',
              tensor([[ 1.0000e+02, -1.6142e-01,  1.8523e-01, -1.6355e-01, -1.7413e-01,
                       -3.0926e-02, -3.3983e-02, -2.7541e-01],
                      [ 2.9475e-01, -1.5596e-01,  1.2541e-01,  3.0751e-01,  1.7830e-01,
                        5.0056e-02,  3.0082e-01,  4.7527e-02],
                      [-1.6357e-01,  3.3446e-01,  8.3679e-02, -3.4496e-01, -1.0065e-01,
                       -2.4083e-01,  3.0998e-01, -5.8388e-02],
                      [-3.2227e-01, -2.2226e-02,  2.2204e-01,  9.1853e-02,  1.1180e-01,
                        3.2836e-02,  1.3181e-01, -8.6144e-02],
                      [-1.4064e-01, -3.3047e-01, -2.6634e-01,  1.5323e-01, -2.0934e-01,
                        5.0723e-02,  1.1285e-01,  2.5033e-02],
                      [-2.2923e-01,  3.3811e-01, -2.0559e-01,  2.9080e-01, -2.8125e-01,
                       -8.5047e-02,  1.9232e-01, -1.4446e-01],
                      [ 2.9700e-01, -2.4330e-01, -2.9690e-01, -1.5942e

**Question**: Construct an MLP containing a shared parameter layer and train it. During the training process, observe the model parameters and gradients of each layer.