In [1]:
import torch
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


## 优化器
**torch.optim**：所有优化器的基类是 `optim.Optimizer`。

In [13]:
class Net(nn.Module):
    """LeNet-style CNN: a convolutional feature extractor followed by an MLP head.

    Shape walk-through for a 3x32x32 input (5x5 convolutions without padding,
    2x2 max-pooling with stride 2): 32 -> 28 -> 14 -> 10 -> 5, so the extractor
    emits 16 feature maps of size 5x5, i.e. 16*5*5 values per sample.
    """

    def __init__(self):
        super().__init__()

        # Convolutional part: (conv -> ReLU -> max-pool) twice.
        conv_stack = [
            nn.Conv2d(3, 6, 5),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(6, 16, 5),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        ]
        self.features = nn.Sequential(*conv_stack)

        # Fully connected part: 400 -> 120 -> 84 -> 10 outputs.
        fc_stack = [
            nn.Linear(16 * 5 * 5, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, 10),
        ]
        self.classifier = nn.Sequential(*fc_stack)

    def forward(self, x):
        """Run the extractor, flatten each sample to 16*5*5 values, then classify."""
        feature_maps = self.features(x)
        flattened = feature_maps.view(-1, 16 * 5 * 5)
        return self.classifier(flattened)
        

In [14]:
# Instantiate the network; the optimizer examples below operate on this instance.
net = Net()

In [4]:
from torch import optim

In [11]:
# A bare module expression as the last line of a cell displays its layer hierarchy.
net

Net(
  (features): Sequential(
    (0): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=400, out_features=120, bias=True)
    (1): ReLU()
    (2): Linear(in_features=120, out_features=84, bias=True)
    (3): ReLU()
    (4): Linear(in_features=84, out_features=10, bias=True)
  )
)

### 随机梯度下降为例

In [7]:
# One full optimization step with plain SGD: zero grads -> forward -> backward -> step.
optimizer = optim.SGD(params = net.parameters(),lr=1)
# params: the parameters to be optimized
# lr: learning rate
# weight_decay: not set in this call

# Clear the gradients before the backward pass
optimizer.zero_grad()

input = torch.randn(1,3,32,32)
output = net(input)
print(output)
output.backward(output) # backpropagate, passing `output` itself as the upstream gradient
print(output) # prints the same values as above: backward() does not modify `output`

# Let the optimizer apply the update using the gradients computed above
optimizer.step()

tensor([[ 0.0341,  0.1830,  0.2576, -0.1983,  0.0186,  0.0848, -0.0482, -0.0031,
         -0.0067,  0.0926]], grad_fn=<AddmmBackward0>)
tensor([[ 0.0341,  0.1830,  0.2576, -0.1983,  0.0186,  0.0848, -0.0482, -0.0031,
         -0.0067,  0.0926]], grad_fn=<AddmmBackward0>)


In [8]:
# Different sub-networks can be trained with different learning rates.
# How: describe each sub-network as a dict holding its parameters (and,
# optionally, its own 'lr'), then pass the list of dicts to the optimizer.
feature_group = {'params': net.features.parameters()}  # no 'lr' key -> uses the default lr given below
classifier_group = {'params': net.classifier.parameters(), 'lr': 1e-2}

optimizer = optim.SGD([feature_group, classifier_group], lr=1e-5)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    foreach: None
    lr: 1e-05
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    foreach: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [10]:
len(optimizer.param_groups) # param_groups is how the optimized parameter groups are accessed; this counts them

2

### 新函数
* id() 函数返回指定对象的唯一 id。
* map(function, iterable) 对序列中的每一个元素调用 function；在 Python 3 中返回的是惰性迭代器（而不是列表），需要用 list() 才能得到包含每次 function 返回值的列表。
* filter(function, iterable) 接收两个参数：第一个为函数，第二个为序列。序列的每个元素作为参数传递给函数进行判断（返回 True 或 False），只保留返回 True 的元素；在 Python 3 中同样返回惰性迭代器，只能遍历一次，需要用 list() 才能得到列表。

In [16]:
# Builtins used here: map, filter, id.
# Goal: split the network's parameters into the "special" layers (which will get
# their own learning rate) and everything else.
special_layers = nn.ModuleList([net.classifier[0], net.classifier[2]])
# ids of the special layers' parameters; a set gives constant-time membership tests
special_layers_params = set(map(id, special_layers.parameters()))
# Parameters of all other layers, materialized as a list: a bare filter() object is
# a one-shot iterator, so building an optimizer from it a second time (e.g. when the
# optimizer cell is re-run on its own) would see it already exhausted.
base_params = list(filter(lambda p: id(p) not in special_layers_params, net.parameters()))

In [17]:
# Group 0 (all remaining parameters) uses the optimizer-wide lr;
# group 1 (the special layers) overrides it with its own lr.
param_groups = [
    {'params': base_params},
    {'params': special_layers.parameters(), 'lr': 0.01},
]
optimizer = torch.optim.SGD(param_groups, lr=0.001)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    foreach: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

### 调整学习率
* 新建一个优化器，lr变成调整以后的lr，但是这样对使用动量的优化器可能会丢失状态信息
* 直接修改优化器中的lr

In [20]:
# Option 1: build a fresh optimizer that carries the adjusted learning rate.
old_lr = 0.1
classifier_lr = old_lr*0.1  # classifier group runs at one tenth of old_lr
optimizer1 = optim.SGD(
    [
        {'params': net.features.parameters()},  # falls back to the default lr below
        {'params': net.classifier.parameters(), 'lr': classifier_lr},
    ],
    lr=1e-5,
)
optimizer1

SGD (
Parameter Group 0
    dampening: 0
    foreach: None
    lr: 1e-05
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    foreach: None
    lr: 0.010000000000000002
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [21]:
# Option 2: edit the learning rate directly on the existing optimizer.
# Every run of this cell multiplies each group's lr by 0.1 again, so it is not idempotent.
for param_group in optimizer.param_groups:
    param_group['lr'] *= 0.1
optimizer

SGD (
Parameter Group 0
    dampening: 0
    foreach: None
    lr: 0.0001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0

Parameter Group 1
    dampening: 0
    foreach: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

## nn.functional
用nn.Module实现的layers是一个特殊的类，都是由`class layer(nn.Module)`定义，会自动提取可学习的参数。而`nn.functional`中的函数更像是纯函数，由`def function(input)`定义。

In [2]:
# A Linear layer and the functional call fed with that layer's own weight and bias
# are compared element by element.
input = torch.randn(2, 3)
model = nn.Linear(3, 4)

output1 = model(input)  # module form: the layer holds its parameters
output2 = nn.functional.linear(input, model.weight, model.bias)  # functional form: parameters passed explicitly

torch.eq(output1, output2)

tensor([[True, True, True, True],
        [True, True, True, True]])

In [4]:
# Functional and module forms of ReLU applied to the same input;
# the last line compares the two results elementwise.
b = nn.functional.relu(input)
b2 = nn.ReLU()(input)
b == b2

tensor([[True, True, True],
        [True, True, True]])

In [5]:
%%markdown
在模型中搭配使用`nn.Module`和`nn.functional`

在模型中搭配使用`nn.Module`和`nn.functional`


In [20]:
from torch.nn import functional as F
class Net(nn.Module):
    """LeNet-style CNN that mixes nn.Module layers with nn.functional calls.

    Layers that own learnable parameters (convolutions, linear layers) are
    registered as modules in __init__; the parameter-free operations (ReLU,
    max-pooling) are applied through torch.nn.functional inside forward().
    """

    def __init__(self):
        super(Net,self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3,out_channels=6,kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6,out_channels=16,kernel_size=5)

        self.fc1 = nn.Linear(16*5*5,120)
        self.fc2 = nn.Linear(120,84)
        self.fc3 = nn.Linear(84,10)

    def forward(self,x):
        """Map a batch of 3x32x32 inputs to 10 output scores per sample."""
        # relu and pooling are taken straight from torch.nn.functional here.
        x = self.conv1(x)
        x = F.relu(x)
        # torch.nn.functional has no `pool`; 2x2 max-pooling is `max_pool2d`
        # (stride defaults to the kernel size, i.e. 2).
        x = F.max_pool2d(x,2)

        # Same conv -> relu -> pool sequence, written on one line.
        x = F.max_pool2d(F.relu(self.conv2(x)),2)
        # Flatten each sample's 16 feature maps of 5x5 into a 400-vector.
        x = x.view(-1,16*5*5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [7]:
# 手撕Linear层（x
class MyLinear(nn.Module):
    """Hand-written linear layer computing F.linear(x, weight, bias).

    Args:
        in_features: size of each input sample's last dimension (default 4).
        out_features: size of each output sample's last dimension (default 3).

    The defaults reproduce the original fixed shapes: weight is
    (out_features, in_features) = (3, 4) and bias is (out_features,) = (3,).
    """

    def __init__(self, in_features=4, out_features=3):
        super(MyLinear,self).__init__()
        # Wrapping the tensors in nn.Parameter registers them as learnable parameters.
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self,x):
        """Apply the affine map to x, whose last dimension must be in_features."""
        return F.linear(x,self.weight,self.bias)

In [9]:
%%markdown
## 初始化策略
PyTorch中`nn.init`模块就是专门为初始化而设计，如果某种初始化策略`nn.init`不提供，用户也可以自己直接初始化。

## 初始化策略
PyTorch中`nn.init`模块就是专门为初始化而设计，如果某种初始化策略`nn.init`不提供，用户也可以自己直接初始化。


In [10]:
# Initialization through nn.init
from torch.nn import init
linear = nn.Linear(3,4)

# Seed after constructing the layer so the draw below starts from a known RNG state
torch.manual_seed(1)
# Initialize the weight
init.xavier_normal_(linear.weight) # weights are drawn from a zero-mean Gaussian N(0, std)

Parameter containing:
tensor([[ 0.3535,  0.1427,  0.0330],
        [ 0.3321, -0.2416, -0.0888],
        [-0.8140,  0.2040, -0.5493],
        [-0.3010, -0.4769, -0.0311]], requires_grad=True)

In [11]:
# Manual version of the same initialization: with the same seed, the draw below
# reproduces the values shown by the nn.init cell.
import math
torch.manual_seed(1)

std = math.sqrt(2) / math.sqrt(3+4.) # denominator is in_features + out_features
linear.weight.data.normal_(0,std)

tensor([[ 0.3535,  0.1427,  0.0330],
        [ 0.3321, -0.2416, -0.0888],
        [-0.8140,  0.2040, -0.5493],
        [-0.3010, -0.4769, -0.0311]])

In [21]:
# Re-instantiate with the most recently defined Net (the conv1/conv2/fc1..fc3 variant),
# replacing the earlier instance bound to `net`.
net = Net()

In [25]:
# Model parameter initialization: walk the named parameters and pick out the
# fully connected ones (those whose name contains 'fc').
for name, params in net.named_parameters():
    if 'fc' not in name:
        continue
    # Placeholder for a Linear-specific init; for now only the name and the
    # first slice of the parameter are printed.
    print(name)
    print(params[0])

fc1.weight
tensor([-2.9677e-02,  2.5925e-02, -4.4038e-02, -3.5723e-02,  4.9460e-02,
        -2.7446e-02, -4.2176e-02,  4.3130e-02,  3.3042e-02, -2.8350e-02,
         1.0480e-02,  2.3439e-03,  2.9416e-02, -3.9736e-02, -3.2993e-02,
        -4.0635e-02,  7.9559e-03, -4.8766e-02, -3.7929e-02,  5.7842e-03,
         1.7950e-02,  5.9884e-03,  3.1195e-02, -1.0225e-02, -3.3703e-02,
        -2.6772e-02,  1.7335e-03, -4.6715e-02, -3.0527e-02, -4.2052e-02,
        -3.3746e-02, -1.0058e-02,  2.7784e-02,  1.1201e-02, -3.6287e-02,
        -3.8815e-02,  3.1654e-02, -4.1199e-02,  1.4068e-02, -4.2106e-02,
         2.3294e-02, -2.4933e-02, -2.9120e-02, -4.0588e-02, -2.5668e-02,
        -1.2819e-02,  2.2915e-03, -1.7900e-02, -2.7965e-02, -3.0340e-02,
        -4.3396e-02,  2.2986e-03,  2.6125e-02,  2.0004e-02,  3.4494e-03,
        -6.9220e-03, -5.9373e-03,  4.8400e-02, -1.9492e-02, -7.2676e-04,
         4.6170e-02,  5.2375e-05,  3.0104e-02, -3.4700e-02,  4.2568e-02,
         1.2869e-02, -9.9212e-03, -8.344