In [1]:
import math
import numpy as np
import torch
import torch.nn as nn

In [2]:
w = torch.nn.Parameter(torch.empty(2, 3))

nn.init.uniform_(w , a=-1, b=1)
nn.init.normal_(w, mean=0, std=1)
nn.init.ones_(w)
nn.init.xavier_uniform_(w, gain=1)
nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu')

Parameter containing:
tensor([[ 0.4423,  1.2361,  0.6878],
        [-0.2776, -0.7045, -0.8003]], requires_grad=True)

In [3]:
with torch.no_grad():
    w.uniform_(2,3)

In [4]:
conv = nn.Conv2d(1, 3, kernel_size=1)
nn.init.kaiming_normal_(conv.weight, mode='fan_in')
nn.init.constant_(conv.bias, 0.)

Parameter containing:
tensor([0., 0., 0.], requires_grad=True)

In [5]:
rnn = nn.LSTM(input_size=12, hidden_size=128, num_layers=1, bidirectional=False)
for name, param in rnn.named_parameters():
    print(name)
    if name.startswith("weight"):
        nn.init.xavier_normal_(param)
    else:
        nn.init.zeros_(param)

weight_ih_l0
weight_hh_l0
bias_ih_l0
bias_hh_l0


In [6]:
def reset_parameters(self):
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is not None:
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -bound, bound)

def reset_parameters(self):
    n = self.in_channels
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is not None:
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -bound, bound)
    
def reset_parameters(self):
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for weight in self.parameters():
        init.uniform_(weight, -stdv, stdv)

The full set of parameters registered by the module can be iterated through \
via a call to parameters() or named_parameters(),\
where the latter includes each parameter’s name:\

In [7]:
class MyLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_features, out_features))
        self.bias = nn.Parameter(torch.randn(out_features))
        self.fc = nn.Linear(3, 3)

    def forward(self, input):
        return self.fc((input @ self.weight) + self.bias)
    
    
m = MyLinear(4, 3)
sample_input = torch.randn(4)
m(sample_input)
for name,m in m.named_parameters():
    print(name)

weight
bias
fc.weight
fc.bias


### Modules as Building Blocks

In [8]:
# using modules to building blocks
net = nn.Sequential()
net.add_module('1', MyLinear(4, 3))
net.add_module('3', nn.Linear(3, 1))

In [9]:
sample_input = torch.ones(4)*1
y = net(sample_input)
y.backward()
for m in net.parameters():
    print(m.grad)

    
optimizer = torch.optim.Adam(net.parameters())  ## 清除梯度

tensor([[ 0.2592, -0.0934,  0.3342],
        [ 0.2592, -0.0934,  0.3342],
        [ 0.2592, -0.0934,  0.3342],
        [ 0.2592, -0.0934,  0.3342]])
tensor([ 0.2592, -0.0934,  0.3342])
tensor([[ 1.1968,  1.1262, -0.4764],
        [ 0.2241,  0.2109, -0.0892],
        [-1.0052, -0.9459,  0.4002]])
tensor([-0.4934, -0.0924,  0.4144])
tensor([[1.1604, 0.6113, 0.8149]])
tensor([1.])


### 初始化过程相当于p.grad置为None

In [10]:
for group in optimizer.param_groups:
    print(group)
    for p in group['params']:
        print(p.grad)
        if p.grad is not None:
            p.grad = None

{'params': [Parameter containing:
tensor([[ 0.4334, -2.0267,  0.2222],
        [-2.9178,  1.7465, -0.3632],
        [ 0.6030, -1.7102, -0.6228],
        [-1.3348,  0.6843,  0.6407]], requires_grad=True), Parameter containing:
tensor([ 0.7906, -0.9763,  1.0886], requires_grad=True), Parameter containing:
tensor([[-0.2988, -0.2928, -0.2925],
        [-0.4478,  0.1074,  0.2529],
        [ 0.1699, -0.5501,  0.5145]], requires_grad=True), Parameter containing:
tensor([ 0.0498, -0.4739, -0.5253], requires_grad=True), Parameter containing:
tensor([[-0.4934, -0.0924,  0.4144]], requires_grad=True), Parameter containing:
tensor([-0.5047], requires_grad=True)], 'lr': 0.001, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False, 'maximize': False, 'foreach': None, 'capturable': False, 'differentiable': False, 'fused': None}
tensor([[ 0.2592, -0.0934,  0.3342],
        [ 0.2592, -0.0934,  0.3342],
        [ 0.2592, -0.0934,  0.3342],
        [ 0.2592, -0.0934,  0.3342]])
tensor(

In [11]:
for name,parameter in net.named_parameters():
    print(name)
    print(parameter.grad)
    with torch.no_grad():
        if parameter.grad is not None:
            parameter.grad = None

1.weight
None
1.bias
None
1.fc.weight
None
1.fc.bias
None
3.weight
None
3.bias
None


### Simple submodule children() named_children()

In [12]:
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.l0 = nn.Linear(4, 3)
        self.l1 = nn.Linear(3, 1)
    def forward(self, x):
        x = self.l0(x)
        x = F.relu(x)
        x = self.l1(x)
        return x
net = Net()
for name, child in net.named_children():
    print(name)

l0
l1


In [13]:
class BigNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = MyLinear(5, 4)
        self.net = Net()
    def forward(self, x):
        return self.net(self.l1(x))

big_net = BigNet()
for module in big_net.named_modules():
    print(module)

('', BigNet(
  (l1): MyLinear(
    (fc): Linear(in_features=3, out_features=3, bias=True)
  )
  (net): Net(
    (l0): Linear(in_features=4, out_features=3, bias=True)
    (l1): Linear(in_features=3, out_features=1, bias=True)
  )
))
('l1', MyLinear(
  (fc): Linear(in_features=3, out_features=3, bias=True)
))
('l1.fc', Linear(in_features=3, out_features=3, bias=True))
('net', Net(
  (l0): Linear(in_features=4, out_features=3, bias=True)
  (l1): Linear(in_features=3, out_features=1, bias=True)
))
('net.l0', Linear(in_features=4, out_features=3, bias=True))
('net.l1', Linear(in_features=3, out_features=1, bias=True))


### 动态定网络submodules. The ModuleList and ModuleDict modules

In [14]:
class DynamicNet(nn.Module):
    def __init__(self, num_layers):
        super().__init__()
        self.linears = nn.ModuleList(
        [nn.Linear(4, 4) for _ in range(num_layers)])
        self.activations = nn.ModuleDict({
        'relu': nn.ReLU(),
        'lrelu': nn.LeakyReLU()
        })
        self.final = nn.Linear(4, 1)
        
    def forward(self, x, act):
        for linear in self.linears:
            x = linear(x)
            x = self.activations[act](x)
        x = self.final(x)
        return x

dynamic_net = DynamicNet(3)
sample_input = torch.randn(4)
output = dynamic_net(sample_input, 'relu')

###  its parameters consist of its direct parameters as well as the parameters of all submodules.
使用parameters 和 named_parameters() 将会递归的调用子参数

In [15]:
# parameter_registers = []
for parameters in dynamic_net.named_parameters():
    print(parameters)
#     with torch.no_grad():
#         parameter_registers.append(parameters[1].detach())

('linears.0.weight', Parameter containing:
tensor([[ 0.4857,  0.2158, -0.3233, -0.4697],
        [ 0.3397,  0.4831,  0.0161,  0.4129],
        [ 0.4010,  0.2304,  0.3120,  0.2003],
        [-0.3562,  0.0602, -0.0277,  0.0270]], requires_grad=True))
('linears.0.bias', Parameter containing:
tensor([ 0.4645,  0.0422, -0.2372, -0.2482], requires_grad=True))
('linears.1.weight', Parameter containing:
tensor([[ 0.3702,  0.1347,  0.2715, -0.0403],
        [ 0.1754, -0.0897,  0.4280,  0.4035],
        [-0.1338, -0.3020, -0.1539,  0.3424],
        [ 0.2116,  0.1039,  0.3017,  0.0621]], requires_grad=True))
('linears.1.bias', Parameter containing:
tensor([ 0.1498,  0.0089, -0.0198,  0.2049], requires_grad=True))
('linears.2.weight', Parameter containing:
tensor([[-2.2388e-01,  1.9924e-01, -3.7828e-01,  3.1292e-04],
        [-2.2334e-02, -8.1629e-02,  4.8860e-01, -3.2840e-01],
        [ 1.5269e-01,  1.7929e-01, -2.4909e-01, -5.4804e-03],
        [ 7.5740e-02, -3.4095e-02, -1.7930e-01, -4.2932e-01

In [16]:
parameter_registers

NameError: name 'parameter_registers' is not defined

In [17]:
device = next(dynamic_net.parameters()).device
dynamic_net.to(device=device, dtype=torch.float64)

DynamicNet(
  (linears): ModuleList(
    (0-2): 3 x Linear(in_features=4, out_features=4, bias=True)
  )
  (activations): ModuleDict(
    (relu): ReLU()
    (lrelu): LeakyReLU(negative_slope=0.01)
  )
  (final): Linear(in_features=4, out_features=1, bias=True)
)

In [18]:
torch.manual_seed(4)
dynamic_net.to(device='cuda' if torch.cuda.is_available() else 'cpu')
result = dynamic_net(torch.randn(4, device='cuda', dtype=torch.float32), act='relu')
result.backward()

RuntimeError: expected scalar type Float but found Double

In [19]:
#grad_register = []
for param in dynamic_net.named_parameters():
    print(param[0],param[1].grad)
    #with torch.no_grad():
     #   grad_register.append(param[1].grad.detach().clone())

linears.0.weight None
linears.0.bias None
linears.1.weight None
linears.1.bias None
linears.2.weight None
linears.2.bias None
final.weight None
final.bias None


In [20]:
for param in dynamic_net.named_parameters():
    with torch.no_grad():
        if param[1].grad is not None:
            param[1].grad = None
        

In [None]:
grad_register

In [21]:
def reset_parameters(self):
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is not None:
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -bound, bound)

def reset_parameters(self):
    n = self.in_channels
    init.kaiming_uniform_(self.weight, a=math.sqrt(5))
    if self.bias is not None:
        fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
        bound = 1 / math.sqrt(fan_in)
        init.uniform_(self.bias, -bound, bound)
    
def reset_parameters(self):
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for weight in self.parameters():
        init.uniform_(weight, -stdv, stdv)

### 网络初始化函数及apply

In [24]:
class DynamicNet(nn.Module):
    def __init__(self, num_layers):
        super().__init__()
        self.linears = nn.ModuleList(
        [nn.Linear(4, 4) for _ in range(num_layers)])
        self.activations = nn.ModuleDict({
        'relu': nn.ReLU(),
        'lrelu': nn.LeakyReLU()
        })
        self.cov = nn.Conv2d(2,2,kernel_size=1)
        self.rnn = nn.LSTM(input_size=2, hidden_size=2, num_layers=2, dropout=0.3)
        self.final = nn.Linear(4, 1)
        
    def forward(self, x, act):
        for linear in self.linears:
            x = linear(x)
            x = self.activations[act](x)
        x = self.final(x)
        return x

dynamic_net = DynamicNet(3)

### 初始化函数
# Define a function to initialize Linear weights.
# Note that no_grad() is used here to avoid tracking this computation in the autograd graph.
# @torch.no_grad()
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5))
        if m.bias is not None:
            fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight)
            bound = 1 / math.sqrt(fan_in)
            torch.nn.init.uniform_(m.bias, -bound, bound)
    elif isinstance(m, nn.Conv2d):
        n = m.in_channels
        print(n)
        torch.nn.init.kaiming_uniform_(m.weight, a=math.sqrt(5))
        if m.bias is not None:
            fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(m.weight)
            bound = 1 / math.sqrt(fan_in)
            torch.nn.init.uniform_(m.bias, -bound, bound)
    elif isinstance(m, nn.LSTM):
        stdv = 1.0 / math.sqrt(m.hidden_size)
        for weight in m.parameters():
            print('w')
            torch.nn.init.uniform_(weight, -stdv, stdv)


In [23]:
dynamic_net.apply(init_weights)
for p in dynamic_net.named_parameters():
    print(p)

2
w
w
w
w
w
w
w
w
('linears.0.weight', Parameter containing:
tensor([[ 0.4184, -0.1598, -0.4926,  0.4890],
        [ 0.2707,  0.3608, -0.0912, -0.1030],
        [ 0.0489,  0.1224,  0.0129, -0.2595],
        [ 0.4466,  0.2335, -0.0452, -0.4259]], requires_grad=True))
('linears.0.bias', Parameter containing:
tensor([-0.2463, -0.1744, -0.0834,  0.1384], requires_grad=True))
('linears.1.weight', Parameter containing:
tensor([[-0.0836,  0.1730, -0.3248, -0.1094],
        [ 0.2260,  0.4911, -0.4224,  0.0520],
        [ 0.2641, -0.0539,  0.1165,  0.3122],
        [-0.4968,  0.2961, -0.4470,  0.0929]], requires_grad=True))
('linears.1.bias', Parameter containing:
tensor([ 0.3026, -0.2273,  0.0045, -0.4800], requires_grad=True))
('linears.2.weight', Parameter containing:
tensor([[-0.0308,  0.2818,  0.2492,  0.1336],
        [-0.0410, -0.2049,  0.4064,  0.1477],
        [ 0.1779,  0.2435, -0.1413, -0.0381],
        [ 0.4718,  0.4475, -0.4963, -0.4267]], requires_grad=True))
('linears.2.bias', Pa

In [25]:
rnn = nn.LSTM(input_size=2, hidden_size=2, num_layers=2, dropout=0.3)
for m in rnn.named_parameters():
    print(m)

('weight_ih_l0', Parameter containing:
tensor([[ 0.5181, -0.0161],
        [ 0.6578,  0.0111],
        [-0.1733, -0.2237],
        [ 0.3685,  0.6207],
        [ 0.4737, -0.2573],
        [-0.2658, -0.5630],
        [-0.6517,  0.2302],
        [ 0.2767,  0.2644]], requires_grad=True))
('weight_hh_l0', Parameter containing:
tensor([[-0.3235, -0.4423],
        [ 0.1226,  0.1008],
        [-0.5240, -0.6215],
        [-0.0507,  0.0563],
        [ 0.6154, -0.0559],
        [ 0.2897, -0.0234],
        [ 0.2582, -0.6910],
        [-0.2747,  0.0940]], requires_grad=True))
('bias_ih_l0', Parameter containing:
tensor([ 0.3661,  0.6481,  0.5172,  0.4149,  0.2209,  0.5209, -0.4222, -0.0090],
       requires_grad=True))
('bias_hh_l0', Parameter containing:
tensor([ 0.5096,  0.6593,  0.6949,  0.1772,  0.6886,  0.3372,  0.4052, -0.4551],
       requires_grad=True))
('weight_ih_l1', Parameter containing:
tensor([[ 0.4503,  0.4315],
        [ 0.1963,  0.4462],
        [-0.4434, -0.0621],
        [ 0.287

In [70]:
### 
net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-4, weight_decay=1e-2, momentum=0.1)
for i in range(10000):
    torch.manual_seed(i)
    input = torch.randn(4)
    output = net(input)
    loss = torch.abs(output)
    net.zero_grad()
    loss.backward()
    optimizer.step()

In [71]:
# optimizer.zero_grad()  # net.zero_grad()
for name,param in net.named_parameters():
    print(param)

Parameter containing:
tensor([[-0.1464, -0.1097, -0.2943, -0.3205],
        [ 0.4544,  0.2091, -0.3286,  0.1542],
        [ 0.2800,  0.1145, -0.3014, -0.0013]], requires_grad=True)
Parameter containing:
tensor([-0.2541,  0.3595,  0.3480], requires_grad=True)
Parameter containing:
tensor([[-0.2148, -0.2534,  0.3346]], requires_grad=True)
Parameter containing:
tensor([-3.2252e-05], requires_grad=True)


In [87]:
net.train()
print(net.training)
net.eval()
print(net.training)

True
False


### 保持模型 model state
state_dict 保存的参数包括：\
Parameters: learnable aspects of computation; contained within the state_dict\
Persistent buffers: contained within the state_dict (i.e. serialized when saving & loading)\

In [89]:
# Save the module
torch.save(net.state_dict(), 'net.pt')

In [91]:
# Load the module later on
new_net = Net()
new_net.load_state_dict(torch.load('net.pt'))

<All keys matched successfully>

In [100]:
class RunningMean(nn.Module):
    def __init__(self, num_features, momentum=0.9):
        super().__init__()
        self.momentum = momentum
        self.register_buffer('mean_register', torch.zeros(num_features))
        self.mean = torch.zeros(num_features)
    def forward(self, x):
        self.mean = self.momentum * self.mean + (1.0 - self.momentum) * x
        self.mean_register = self.momentum * self.mean_register + (1.0 - self.momentum) * x
        return self.mean
    
m = RunningMean(4)
for _ in range(10):
    input = torch.randn(4)
    m(input)


In [98]:
m.state_dict()

OrderedDict([('mean_register', tensor([-0.1269, -0.0470,  0.0229, -0.5168]))])

### to 方法将移动 参数 与 Buffers  to the specified device / dtype

In [104]:
# Moves all module parameters and buffers to the specified device / dtype
m.to(device='cuda', dtype=torch.float64)

RunningMean()