In [2]:
import torch 
from torch import nn

In [4]:
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
x = torch.rand(size=(2, 4))
net(x)

tensor([[-0.0196],
        [-0.1803]], grad_fn=<AddmmBackward0>)

In [19]:
print(net[0].state_dict)

<bound method Module.state_dict of Linear(in_features=4, out_features=8, bias=True)>


In [21]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2612], requires_grad=True)
tensor([-0.2612])


In [23]:
net[2].weight.grad == None

True

In [27]:
print([(name, param.shape) for name, param in net[0].named_parameters()])
print([(name, param.shape) for name, param in net.named_parameters()])

[('weight', torch.Size([8, 4])), ('bias', torch.Size([8]))]
[('0.weight', torch.Size([8, 4])), ('0.bias', torch.Size([8])), ('2.weight', torch.Size([1, 8])), ('2.bias', torch.Size([1]))]


In [30]:
print(net.state_dict())
net.state_dict()['2.bias'].data

OrderedDict([('0.weight', tensor([[-0.1115, -0.0548, -0.3203,  0.0338],
        [ 0.3456,  0.2929, -0.0348,  0.2992],
        [ 0.2257, -0.3613,  0.3305,  0.1293],
        [-0.2265, -0.1625, -0.2178, -0.1091],
        [ 0.0683,  0.0042, -0.0490,  0.3198],
        [ 0.1971, -0.0112, -0.2553,  0.3559],
        [ 0.1646,  0.0532,  0.1180, -0.1032],
        [ 0.3577,  0.0629,  0.4466,  0.2542]])), ('0.bias', tensor([-0.3251,  0.2230, -0.0684, -0.4151,  0.3510, -0.0971, -0.0960, -0.1066])), ('2.weight', tensor([[-0.2856, -0.1472, -0.2369, -0.1530,  0.1169, -0.0036,  0.2675, -0.0023]])), ('2.bias', tensor([-0.2612]))])


tensor([-0.2612])

In [32]:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())
def block2():
    net = nn.Sequential()
    for i in range(4):
        net.add_module(f'block{i}', block1())
    return net

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(x)

tensor([[-0.0208],
        [-0.0208]], grad_fn=<AddmmBackward0>)

In [33]:
rgnet.__len__

<bound method Sequential.__len__ of Sequential(
  (0): Sequential(
    (block0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)>

In [36]:
rgnet[0][1][0].named_modules

<bound method Module.named_modules of Linear(in_features=4, out_features=8, bias=True)>

In [38]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)
net.apply(init_constant)
net[0].weight.data, net[0].bias.data

(tensor([[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]),
 tensor([0., 0., 0., 0., 0., 0., 0., 0.]))

In [39]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([-0.0062,  0.0034, -0.0088,  0.0122]), tensor(0.))

In [40]:
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.2404,  0.7015, -0.2141, -0.0624])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])


In [46]:
def my_init(m):
    if type(m) == nn.Linear:
        print("Init", [(name, param.shape)
                        for name, param in m.named_parameters()])
        nn.init.uniform_(m.weight, -10, 10)
        m.weight.data *= m.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]

Init [('weight', torch.Size([8, 4])), ('bias', torch.Size([8]))]
Init [('weight', torch.Size([1, 8])), ('bias', torch.Size([1]))]


tensor([[ 6.8203,  8.7349, -8.4275, -0.0000],
        [ 0.0000, -0.0000,  7.5113, -6.0408]], grad_fn=<SliceBackward0>)

In [6]:
# 我们需要给共享层一个名称，以便可以引用它的参数
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(x)
# 检查参数是否相同
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# 确保它们实际上是同一个对象，而不只是有相同的值
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [8]:
net[0], net[1]

(Linear(in_features=4, out_features=8, bias=True), ReLU())