In [15]:
import torch
import torch.nn as nn
# 这里的functional 相当于 tf.nn，相当于是一些不需要construct的基本操作
import torch.nn.functional as F

class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        # 在TF2中定义model无需定义in_channel，但是这里需要定义
        # 三个参数为in_channel, out_channel, kernel_size
        # 此处默认stride为1, padding为0
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # spatial resoultion 为6*6, 但是因为没有padding，实际输出应该为5*5?, channel数为16
        self.fc1 = nn.Linear(16*6*6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
    def forward(self, x):
#         用一个2*2的kernel作卷积
        x = F.max_pool2d(F.relu(self.conv1(x)), (2,2))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
#         相当于TF2中的flatten layer，但是这里需要手动reshape
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return x
#     相当于手动创建一个TF2 中的 flatten layer
    def num_flat_features(self, x):
#       注意: 这里size是一个method, 是size(), 并非参数
        size = x.size()[1:]
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [16]:
# net.parameters() 会返回一个generator，不能被print，只能用list()以后才能被print，这是一个method，而并非是一个property
# 这个parameters 有点像TF2 中model.trainable_variables()
params = list(net.parameters())
# params = net.parameters()
print(len(params))
print([i.size() for i in params])
# 之所以有10个大的parameters,是因为有2个conv,他们有2个kernel以及对应的bias,一共四个参数,
# 3层MLP,他们有3个weights以及对应的bias,共3个参数
print(type(params[0]))


10
[torch.Size([6, 1, 3, 3]), torch.Size([6]), torch.Size([16, 6, 3, 3]), torch.Size([16]), torch.Size([120, 576]), torch.Size([120]), torch.Size([84, 120]), torch.Size([84]), torch.Size([10, 84]), torch.Size([10])]
<class 'torch.nn.parameter.Parameter'>


In [19]:
# PyTorch 中 4D Tensor (batch_size, channels, height, width)
# TF2 中的 4D Tensor (batch_size, height, width, channels)
# 两个必须要区分
rand_in = torch.randn(1, 1, 32, 32)
# nn.Module和TF2 model 一样,都仅支持mini batch operations, i.e 所有的I/O都必须包含batch_size这个dimension
# single sample 中增加batch_size 的方法 Tensor.unsequeeze(0)
out = net(rand_in)
print(out)
print(out.size())

tensor([[0.0000, 0.0000, 0.0860, 0.0350, 0.0938, 0.0043, 0.0000, 0.0000, 0.1006,
         0.0465]], grad_fn=<ReluBackward0>)
torch.Size([1, 10])


In [18]:
# nn.Module.zero_grad()方法能够使network中清除network之前的gradients
net.zero_grad()
# 假设一个随机的gradient,和网络输出一样的dimension,便能够开始backpropagation
out.backward(torch.randn(1,10))

In [20]:
output = net(rand_in)
target = torch.randn(10)
target = target.view(1, -1)
# 相当于是新建MSELoss的一个instance
criterion = nn.MSELoss() 

loss = criterion(output, target)
print(loss)

tensor(0.9075, grad_fn=<MseLossBackward>)


In [39]:
# 查看gradient function 组成的computation graph，每个grad_fn都有next_functions这个功能，以查看下一个grad_fn
print(loss.grad_fn)
print(loss.grad_fn.next_functions[0][0])
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])
# 利用recursive method 查看computation graph的方法
def print_graph(g, level=0):
    if g == None: return
    print('{}'.format(level), g)
    for subg in g.next_functions:
        print_graph(subg[0], level+1)

print_graph(loss.grad_fn, 0)

0 <MseLossBackward object at 0x7ff6056affd0>
1 <ReluBackward0 object at 0x7ff60df0c748>
2 <AddmmBackward object at 0x7ff6132ee780>
3 <AccumulateGrad object at 0x7ff6132eec50>
3 <ReluBackward0 object at 0x7ff605541160>
4 <AddmmBackward object at 0x7ff605541dd8>
5 <AccumulateGrad object at 0x7ff605541f60>
5 <ReluBackward0 object at 0x7ff605541e80>
6 <AddmmBackward object at 0x7ff6055410f0>
7 <AccumulateGrad object at 0x7ff605541d68>
7 <ViewBackward object at 0x7ff605541780>
8 <MaxPool2DWithIndicesBackward object at 0x7ff60553c780>
9 <ReluBackward0 object at 0x7ff60553c978>
10 <MkldnnConvolutionBackward object at 0x7ff60553c828>
11 <MaxPool2DWithIndicesBackward object at 0x7ff60553c4e0>
12 <ReluBackward0 object at 0x7ff60553ce80>
13 <MkldnnConvolutionBackward object at 0x7ff6055593c8>
14 <AccumulateGrad object at 0x7ff6055596d8>
14 <AccumulateGrad object at 0x7ff6055592e8>
11 <AccumulateGrad object at 0x7ff60553c710>
11 <AccumulateGrad object at 0x7ff60553c400>
7 <TBackward object at 0x7f

In [37]:
net.zero_grad()
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0019,  0.0018, -0.0090,  0.0042, -0.0016,  0.0006])


In [41]:
# 这里演示的是不采用任何optimizer，直接进行SGD的操作
learning_rate = 0.01
# 这里.parameters() 返回的是一个generator，而f就是tensor.nn.parameter.Parameter，是Tensor的subclass
# sub_() 代表Tensor 的 in-place subtraction 操作
# 既然f是Tensor的subclass，自然也具有Tensor.grad求出grad的操作
# Tensor.data 只包含 array 本身，不然还可能包括grad_fn
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)

In [None]:
import torch.optim as optim

optimizer = optim.SGD(net.parameters(), lr=0.01)
# gradients会accumulate的，所以需要清零
optimizer.zero_grad()
output = net(input)
loss = criterion(output, target)
# 求每个parameter的gradient
loss.backward()
# optimize network parameters
optimizer.step()