# 第2节-参数管理
本节的主题是：
1. 如何访问模型的参数，以便读取和存储，以及当遇到复杂的嵌套结构时应该怎么做？
2. 如何初始化模型的参数？
3. 层间的参数共享是什么？有什么意义？

---
参数访问

In [15]:
import torch
from torch import nn
from torch.nn import functional as F
# A multilayer perceptron with a single hidden layer; the cells below use it
# as the running example for accessing parameters.
class MyMLPa(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        ni: number of input features.
        no: number of output features.
        nh: width of the hidden layer (default 32).
    """

    def __init__(self, ni, no, nh=32):
        super().__init__()
        # The two layers are stored under the attribute names ``h`` and ``o``.
        self.h = nn.Linear(in_features=ni, out_features=nh)
        self.o = nn.Linear(in_features=nh, out_features=no)

    def forward(self, X):
        """Return ``o(relu(h(X)))`` for the input tensor ``X``."""
        hidden = F.relu(self.h(X))
        return self.o(hidden)

# Instantiate the network with 3 input features, 4 hidden units and 3 outputs.
mlpA = MyMLPa(3, 3, nh=4)
# A batch of four samples with three features each.
iX = torch.tensor([[1, 2, 3], [2.0, 1, 3], [2.1, 1, 3], [3.2, 2.1, 1.3]])
# Display the module structure together with the shape of the input batch.
mlpA, iX.shape

(MyMLPa(
   (h): Linear(in_features=3, out_features=4, bias=True)
   (o): Linear(in_features=4, out_features=3, bias=True)
 ),
 torch.Size([4, 3]))

In [16]:
# Forward pass: feed the whole batch through the network.
mlpA(iX)

tensor([[-0.2523,  0.6922,  0.5877],
        [ 0.0197,  0.9616,  0.9022],
        [ 0.0315,  0.9780,  0.9096],
        [-0.2641,  0.9209,  0.2576]], grad_fn=<AddmmBackward0>)

`state_dict`参数字典

In [17]:
# state_dict() gives a mapping from parameter names to their tensors.
mlpAparamDict = mlpA.state_dict()
mlpAparamDict # an OrderedDict

OrderedDict([('h.weight',
              tensor([[ 0.0006,  0.0582, -0.4792],
                      [-0.1380, -0.2754, -0.4258],
                      [ 0.2937, -0.3733,  0.3648],
                      [ 0.1505,  0.1432, -0.2363]])),
             ('h.bias', tensor([-0.4956, -0.4062, -0.2216,  0.5224])),
             ('o.weight',
              tensor([[ 0.1579, -0.0702,  0.4078, -0.0096],
                      [ 0.1069, -0.1516,  0.4004,  0.3122],
                      [ 0.2654, -0.0182,  0.4762, -0.4345]])),
             ('o.bias', tensor([-0.4211,  0.4459,  0.4965]))])

In [18]:
# Keys have the form '<layer attribute>.<parameter name>'.
mlpAparamDict.keys()

odict_keys(['h.weight', 'h.bias', 'o.weight', 'o.bias'])

In [19]:
mlpAparamDict['h.bias'] # this is the bias tensor of mlpA's layer h

tensor([-0.4956, -0.4062, -0.2216,  0.5224])

In [20]:
# Alternative: walk down the attribute chain with the dot operator.
mlpA.h.bias.data

tensor([-0.4956, -0.4062, -0.2216,  0.5224])

In [21]:
# Inspect the gradient: no backward pass has been run in the cells above, so
# .grad is still None and this expression evaluates to True.
mlpA.o.weight.grad is None

True

In [22]:
# Walk over every parameter and print its name together with its shape.
print(*((pname, tensor.shape) for pname, tensor in mlpA.named_parameters()))

('h.weight', torch.Size([4, 3])) ('h.bias', torch.Size([4])) ('o.weight', torch.Size([3, 4])) ('o.bias', torch.Size([3]))


----
参数初始化, 使用`init_func` + `module.apply(init_func)`模式: `apply`会对`module`自身及其所有子模块递归地调用`init_func`, 其中
```
def init_func(m):
    if type(m) == XModule:
        xxx
    if ...
```

In [23]:
def init_normal(m):
    """Initialise an ``nn.Linear`` layer in place; ignore any other module.

    Weights are drawn from a normal distribution with mean 0 and std 0.01.
    The bias, when the layer has one, is filled with the constant 0.01.
    Intended to be passed to ``module.apply``.

    Args:
        m: the module to initialise.
    """
    # Exact type match (not isinstance), so subclasses of nn.Linear are
    # left untouched, exactly as before.
    if type(m) is not nn.Linear:
        return
    nn.init.normal_(m.weight, mean=0, std=0.01)
    # A layer built with bias=False has m.bias set to None; filling it
    # would raise, so the bias step is skipped in that case.
    if m.bias is not None:
        # The constant 0.01 is used here; zero initialisation
        # (nn.init.zeros_) is the more common choice for biases.
        nn.init.constant_(m.bias, 0.01)
# apply() calls init_normal on mlpA and on each of its submodules.
mlpA.apply(init_normal)
# Check the result on one layer: the bias of h now holds the constant value.
mlpA.h.bias.data

tensor([0.0100, 0.0100, 0.0100, 0.0100])

In [25]:
def init_xvaier(m):
    """Apply Xavier (Glorot) normal initialisation to an ``nn.Linear`` weight.

    Modules of any other type are left unchanged, and the bias is not touched.
    Intended to be passed to ``module.apply``.

    Args:
        m: the module to initialise.
    """
    is_linear = type(m) == nn.Linear
    if not is_linear:
        return
    nn.init.xavier_normal_(m.weight)
# Different initialisers can be applied to individual layers:
# Xavier-normal weights for h, small-normal weights (and constant bias) for o.
mlpA.h.apply(init_xvaier)
mlpA.o.apply(init_normal)
# Show both weight matrices to compare the scale of the two schemes.
mlpA.h.weight, mlpA.o.weight.data

(Parameter containing:
 tensor([[-0.7708,  0.1171,  0.7692],
         [ 1.1842, -0.2726, -0.2774],
         [-0.1519,  0.5423, -0.1488],
         [ 0.4153,  0.4740,  0.4213]], requires_grad=True),
 tensor([[-0.0163,  0.0007,  0.0018,  0.0189],
         [ 0.0045,  0.0079, -0.0093,  0.0057],
         [ 0.0024, -0.0155, -0.0043,  0.0111]]))

当然你可以直接访问m.weight.data并对其修改(初始化)

---
参数共享, 方法是在不同的层共享同一个组件, 这样做带来什么影响？

In [27]:
# mlpA maps 3 -> 4 -> 3 features, so it can be chained with itself.
# The very same mlpA object is placed at positions 0 and 2 of the Sequential.
net = nn.Sequential(mlpA, nn.ReLU(), mlpA, nn.ReLU(), nn.Linear(3, 4))
print(net)

Sequential(
  (0): MyMLPa(
    (h): Linear(in_features=3, out_features=4, bias=True)
    (o): Linear(in_features=4, out_features=3, bias=True)
  )
  (1): ReLU()
  (2): MyMLPa(
    (h): Linear(in_features=3, out_features=4, bias=True)
    (o): Linear(in_features=4, out_features=3, bias=True)
  )
  (3): ReLU()
  (4): Linear(in_features=3, out_features=4, bias=True)
)


In [28]:
# net[0] and net[2] are the same component, so their weights compare equal
# element by element.
net[0].h.weight.data == net[2].h.weight.data

tensor([[True, True, True],
        [True, True, True],
        [True, True, True],
        [True, True, True]])

In [29]:
# Show net[2].h.weight.data before anything is modified.
net[2].h.weight.data

tensor([[-0.7708,  0.1171,  0.7692],
        [ 1.1842, -0.2726, -0.2774],
        [-0.1519,  0.5423, -0.1488],
        [ 0.4153,  0.4740,  0.4213]])

In [30]:
# Re-initialise net[0]: apply() runs init_xvaier on every Linear layer inside
# it (both h and o), not only on h.weight.
net[0].apply(init_xvaier)
# net[2] shows the new values as well, and the two still compare equal.
net[2].h.weight.data, net[0].h.weight.data == net[2].h.weight.data

(tensor([[ 0.1038, -0.3065, -1.0523],
         [-1.0332,  0.8989,  0.2749],
         [-0.2488, -0.2872,  0.3212],
         [ 0.2005, -0.7189,  0.5312]]),
 tensor([[True, True, True],
         [True, True, True],
         [True, True, True],
         [True, True, True]]))

In [None]:
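# 以上结果说明了参数共享: net[0]和net[2]是同一个模块对象, 修改其中一个, 另一个随之改变。
# 这一现象有什么意义或者影响呢?
# 1. 共享的层只保存一份参数, 可以减少模型的参数量;
# 2. 反向传播时, 共享参数的梯度是它在各个使用位置上梯度的累加。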