# 参数管理

In [1]:
import torch
from torch import nn

net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8,1)
)

X = torch.ones((1,4), requires_grad=True)
y = net(X)
y.backward()  # 这里只有一个标量输出，backward()无需括号参数

In [2]:
print(net[0].state_dict())

OrderedDict([('weight', tensor([[ 0.0475, -0.1706,  0.2993, -0.2022],
        [-0.1910, -0.3215,  0.1190, -0.0941],
        [-0.4170,  0.4206,  0.1179, -0.3234],
        [-0.1484,  0.4251,  0.2853,  0.0606],
        [-0.4066, -0.3449,  0.3275, -0.3547],
        [-0.2267, -0.4553,  0.3136,  0.3734],
        [-0.2732,  0.4342,  0.0323,  0.3913],
        [-0.2383, -0.3003, -0.0323,  0.3576]])), ('bias', tensor([ 0.0903,  0.3464,  0.0816,  0.1354,  0.3603,  0.0726, -0.3984,  0.3422]))])


In [3]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2086], requires_grad=True)
tensor([-0.2086])


In [4]:
net[2].weight.grad == None

False

In [5]:
print(*[(name, parameter) for name,parameter in net[0].named_parameters()])

('weight', Parameter containing:
tensor([[ 0.0475, -0.1706,  0.2993, -0.2022],
        [-0.1910, -0.3215,  0.1190, -0.0941],
        [-0.4170,  0.4206,  0.1179, -0.3234],
        [-0.1484,  0.4251,  0.2853,  0.0606],
        [-0.4066, -0.3449,  0.3275, -0.3547],
        [-0.2267, -0.4553,  0.3136,  0.3734],
        [-0.2732,  0.4342,  0.0323,  0.3913],
        [-0.2383, -0.3003, -0.0323,  0.3576]], requires_grad=True)) ('bias', Parameter containing:
tensor([ 0.0903,  0.3464,  0.0816,  0.1354,  0.3603,  0.0726, -0.3984,  0.3422],
       requires_grad=True))


In [7]:
print(*[(name, parameter) for name,parameter in net.named_parameters()])

('0.weight', Parameter containing:
tensor([[ 0.0475, -0.1706,  0.2993, -0.2022],
        [-0.1910, -0.3215,  0.1190, -0.0941],
        [-0.4170,  0.4206,  0.1179, -0.3234],
        [-0.1484,  0.4251,  0.2853,  0.0606],
        [-0.4066, -0.3449,  0.3275, -0.3547],
        [-0.2267, -0.4553,  0.3136,  0.3734],
        [-0.2732,  0.4342,  0.0323,  0.3913],
        [-0.2383, -0.3003, -0.0323,  0.3576]], requires_grad=True)) ('0.bias', Parameter containing:
tensor([ 0.0903,  0.3464,  0.0816,  0.1354,  0.3603,  0.0726, -0.3984,  0.3422],
       requires_grad=True)) ('2.weight', Parameter containing:
tensor([[ 0.2127,  0.1428,  0.1384, -0.3014,  0.3300,  0.1692,  0.2517, -0.0556]],
       requires_grad=True)) ('2.bias', Parameter containing:
tensor([-0.2086], requires_grad=True))


In [9]:
net.state_dict()['2.weight'].data

tensor([[ 0.2127,  0.1428,  0.1384, -0.3014,  0.3300,  0.1692,  0.2517, -0.0556]])

---

In [10]:
def block1():
    return nn.Sequential(
        nn.Linear(4,8),
        nn.ReLU(),
        nn.Linear(8,4),
        nn.ReLU()
    )

def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f"block {i}", block1())
    return net

rgent = nn.Sequential(
    block2(),
    nn.Linear(4,1)
)

rgent(X)

tensor([[0.2388]], grad_fn=<AddmmBackward0>)

In [11]:
print(rgent)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [12]:
print(rgent[0][1][0].bias.data)

tensor([-0.4743, -0.4158,  0.2748, -0.1860, -0.0487,  0.4243,  0.1309, -0.4488])


In [13]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
    
net.apply(init_normal)
net[0].weight.data,net[0].bias.data

(tensor([[ 0.0072, -0.0088, -0.0068,  0.0016],
         [ 0.0136,  0.0096, -0.0068, -0.0138],
         [-0.0048, -0.0132, -0.0069, -0.0009],
         [ 0.0055,  0.0089,  0.0026,  0.0003],
         [-0.0047, -0.0170, -0.0062, -0.0026],
         [ 0.0114,  0.0018, -0.0056, -0.0036],
         [-0.0083,  0.0003,  0.0021, -0.0216],
         [ 0.0047,  0.0138, -0.0059, -0.0027]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [14]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)

net.apply(init_constant)
net[0].weight.data,net[0].bias.data

(tensor([[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [16]:
def init_xavier(m):
    if type(m) == nn.Linear:
        # xavier_uniform_ 带下划线，代表 in-place 操作(直接修改原 weight 参数)；
        # xavier_uniform 不带下划线，返回新 tensor，不会直接作用到权重参数上。
        nn.init.xavier_uniform_(m.weight)  # 推荐用带下划线的，直接初始化权重
        nn.init.zeros_(m.bias)
        
def init_constant_42(m):
    if type(m) == nn.Linear:
        # 42是宇宙尽头的答案
        nn.init.constant_(m.weight, 42)
        
net[0].apply(init_xavier)
net[2].apply(init_constant_42)
print(net[0].weight.data)
print(net[0].weight.data)

tensor([[ 0.1262, -0.3036,  0.1812,  0.4696],
        [ 0.2484,  0.1389,  0.2626,  0.1821],
        [ 0.1872, -0.1950,  0.1701,  0.4226],
        [-0.6716, -0.4889, -0.3134,  0.0070],
        [ 0.4150,  0.2262,  0.3947, -0.3614],
        [ 0.1524,  0.4352,  0.4573, -0.1606],
        [-0.0343,  0.4763,  0.4649,  0.0077],
        [-0.6116, -0.5317,  0.6218, -0.0092]])
tensor([[ 0.1262, -0.3036,  0.1812,  0.4696],
        [ 0.2484,  0.1389,  0.2626,  0.1821],
        [ 0.1872, -0.1950,  0.1701,  0.4226],
        [-0.6716, -0.4889, -0.3134,  0.0070],
        [ 0.4150,  0.2262,  0.3947, -0.3614],
        [ 0.1524,  0.4352,  0.4573, -0.1606],
        [-0.0343,  0.4763,  0.4649,  0.0077],
        [-0.6116, -0.5317,  0.6218, -0.0092]])


In [27]:
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", *[(name, param.shape) for name, param in m.named_parameters()][0])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >=5

net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 0.0000,  0.0000,  5.3444, -9.5099],
        [ 9.2501,  6.3876, -6.4802,  7.0457]], grad_fn=<SliceBackward0>)

In [28]:
net[0].weight.data[:]+=1
net[0].weight.data[0,0]=42
net[0].weight.data[0]

tensor([42.0000,  1.0000,  6.3444, -8.5099])

In [29]:
shared = nn.Linear(8,8)

net = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    shared,
    nn.ReLU(),
    shared,
    nn.ReLU(),
    nn.Linear(8,1)
)

net(X)

tensor([[-0.3711]], grad_fn=<AddmmBackward0>)

In [30]:
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])


In [31]:
net[2].weight.data[0,0]=100
print(net[2].weight.data[0] == net[4].weight.data[0])


tensor([True, True, True, True, True, True, True, True])
