In [22]:
import torch
from torch import nn
from d2l import torch as d2l
from torch.nn import functional as F

In [23]:
# Plain composition of modules: nn.Sequential chains them in order, much like a list.
net = nn.Sequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,10))
X = torch.rand(2,20) # batch of 2 samples, 20 features each
net(X)

tensor([[-0.0526,  0.2071,  0.1756,  0.3949, -0.1839,  0.2518, -0.1591, -0.0287,
          0.1354, -0.2595],
        [-0.1383,  0.1457,  0.0283,  0.4037, -0.0487,  0.1715, -0.1284,  0.0828,
         -0.0184, -0.1523]], grad_fn=<AddmmBackward0>)

In [24]:
# Custom MLP: subclass nn.Module to control which layers exist and how they are applied.
class MLP(nn.Module):
    """Two-layer perceptron: Linear(20, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self):
        super().__init__()
        # Hidden fully connected layer first, then the output layer.
        self.hidden = nn.Linear(in_features=20, out_features=256)
        self.out = nn.Linear(in_features=256, out_features=10)

    def forward(self, X):
        """Apply the hidden layer, ReLU, then the output layer to ``X``."""
        activated = F.relu(self.hidden(X))
        return self.out(activated)

# Instantiate the custom MLP and run it on the batch X created in an earlier cell.
net = MLP()
net(X)

tensor([[-0.2662,  0.1794,  0.1804, -0.0403, -0.1575,  0.0947,  0.0807, -0.0978,
         -0.0828, -0.1585],
        [-0.2209,  0.1481,  0.0659, -0.0177, -0.1130,  0.0661, -0.0120, -0.0919,
          0.0173, -0.0893]], grad_fn=<AddmmBackward0>)

In [25]:
# A minimal re-implementation of nn.Sequential.
class MySequential(nn.Module):
    """Run the given modules one after another, like ``nn.Sequential``.

    Args:
        *args: ``nn.Module`` instances, applied in the order given.
    """

    def __init__(self,*args):
        super().__init__()
        for idx, block in enumerate(args):
            # Register each child under its position as a *string* key.
            # nn.Module builds dotted names ('0.weight', ...) by concatenating
            # these keys, so using the module object itself as the key makes
            # state_dict(), parameters() and named_modules() raise TypeError.
            self.add_module(str(idx), block)

    def forward(self,X):
        """Feed ``X`` through every registered child in insertion order."""
        for block in self._modules.values():
            X = block(X)
        return X
# Build the same 20 -> 256 -> 10 network with MySequential and run it on X.
net = MySequential(nn.Linear(20,256),nn.ReLU(),nn.Linear(256,10))
net(X)

tensor([[-0.0501,  0.0885,  0.1277, -0.1917, -0.0720, -0.1948, -0.0222,  0.0510,
         -0.0804,  0.0568],
        [ 0.0276,  0.2202,  0.1867, -0.2715, -0.1660, -0.0482,  0.0097, -0.0312,
         -0.1331, -0.0053]], grad_fn=<AddmmBackward0>)

In [26]:
class FixedHiddenMLP(nn.Module):
    """Demo block mixing a trainable layer, a fixed random matrix and Python control flow.

    ``forward`` returns a 0-dim tensor (the sum of the final activations).
    """

    def __init__(self):
        super().__init__()
        # Constant random matrix that takes part in the forward pass but is
        # never trained. Registering it as a buffer (instead of a plain tensor
        # attribute) makes it follow .to() / .cuda() / .double() together with
        # the rest of the module; persistent=False keeps it out of state_dict,
        # so saved checkpoints look exactly as before.
        self.register_buffer('rand_weight', torch.rand((20,20)), persistent=False)
        self.linear = nn.Linear(20,20)

    def forward(self,X):
        X = self.linear(X)
        # Arbitrary computation to show that forward may contain any code;
        # the fixed matrix carries no meaning of its own.
        X = F.relu(torch.mm(X,self.rand_weight)+1)
        # The same Linear layer is reused, i.e. its parameters are shared.
        X = self.linear(X)
        # Halve X until the sum of absolute values is at most 1.
        while X.abs().sum()>1:
            X /= 2
        return X.sum()
# Instantiate the block and run it on X; forward returns X.sum(), a 0-dim tensor.
net = FixedHiddenMLP()
net(X)

tensor(0.1669, grad_fn=<SumBackward0>)

In [27]:
# Blocks can be nested freely.
class NestMLP(nn.Module):
    """Sequential feature extractor (20 -> 64 -> 32) followed by Linear(32, 16)."""

    def __init__(self):
        super().__init__()
        # Inner block: two Linear+ReLU stages, created in order.
        stages = [
            nn.Linear(20, 64), nn.ReLU(),
            nn.Linear(64, 32), nn.ReLU(),
        ]
        self.net = nn.Sequential(*stages)
        # Final fully connected projection.
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        """Run ``X`` through the inner block, then through the projection."""
        features = self.net(X)
        return self.linear(features)
# Custom nested blocks and built-in layers can be combined inside one Sequential.
chimera = nn.Sequential(NestMLP(),nn.Linear(16,20),FixedHiddenMLP())
chimera(X)   # mix custom nested blocks with ordinary layers

tensor(0.0204, grad_fn=<SumBackward0>)

-----
# 参数管理

模型的参数保存和访问方式

In [28]:
# Small 4 -> 8 -> 1 network and a 2-sample input used by the parameter-management cells below.
net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),nn.Linear(8,1))
X = torch.rand(size=(2,4))
net(X)

tensor([[-0.3146],
        [-0.2588]], grad_fn=<AddmmBackward0>)

In [29]:
# All parameters of the last layer (index 2), as an ordered name -> tensor mapping.
print(net[2].state_dict())
print('\n')
# Both bias and weight are of type nn.Parameter.
print(type(net[2].bias),type(net[2].weight))

print('\n')
# .data exposes the underlying values as plain tensors.
print(net[2].weight.data,net[2].bias.data)

# Gradients of the parameters; no backward() has been called on this net,
# so both print as None.
print('\n')
print(net[2].weight.grad,net[2].bias.grad)

OrderedDict([('weight', tensor([[ 0.3289, -0.0961,  0.1278, -0.1124, -0.3463,  0.3372, -0.3152,  0.1197]])), ('bias', tensor([-0.1152]))])


<class 'torch.nn.parameter.Parameter'> <class 'torch.nn.parameter.Parameter'>


tensor([[ 0.3289, -0.0961,  0.1278, -0.1124, -0.3463,  0.3372, -0.3152,  0.1197]]) tensor([-0.1152])


None None


In [30]:
# List every parameter (name, shape) of a single layer in one go.
datas = [(name,param.shape) for name,param in net[0].named_parameters()]
print(*datas)  # * unpacks the list into separate print arguments

# List every parameter of every layer in the composed model; the names carry
# the layer index as prefix and match the keys of net.state_dict().
datas = [(name,param.shape) for name,param in net.named_parameters()]
print(*datas)

# Look a parameter up by its dotted name in the state dict.
print(net.state_dict()['2.weight'])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))
tensor([[ 0.3289, -0.0961,  0.1278, -0.1124, -0.3463,  0.3372, -0.3152,  0.1197]])


In [31]:
# Naming rules for the parameters of nested blocks.
def block1():
    """Return a fresh Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU block."""
    layers = (nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU())
    return nn.Sequential(*layers)
def block2():
    """Stack four ``block1()`` instances under the names 'block 0' .. 'block 3'."""
    stacked = nn.Sequential()
    i = 0
    while i < 4:
        # add_module takes the child's name and the child module itself.
        stacked.add_module(f'block {i}',block1())
        i += 1
    return stacked

# Wrap the nested block2() and a final Linear(4, 1) head, then run the result on X.
rgnet = nn.Sequential(block2(),nn.Linear(4,1))
rgnet(X)

tensor([[-0.2176],
        [-0.2177]], grad_fn=<AddmmBackward0>)

In [32]:
# Print the nested structure.
print(rgnet)
# Nested blocks can be indexed like nested lists:
# first child -> its second sub-block -> that block's third layer -> bias values.
rgnet[0][1][2].bias.data

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


tensor([-0.3511, -0.1420,  0.3054, -0.0990])

In [33]:
# Parameter initialization.
def init_normal(m):
    """Initialize an ``nn.Linear``: weight ~ N(0, 0.01**2), bias = 0.

    Meant to be passed to ``Module.apply``; any module whose type is not
    exactly ``nn.Linear`` is left untouched.

    Args:
        m: the module currently visited by ``apply``.
    """
    if type(m) == nn.Linear:
        # std is the standard deviation (0.01), not the variance.
        nn.init.normal_(m.weight,mean=0,std=0.01)
        # nn.Linear(..., bias=False) stores bias as None; skip it instead of
        # crashing inside nn.init.zeros_.
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Module.apply calls init_normal on every submodule of net (and on net itself).
net.apply(init_normal)
net[0].weight.data

tensor([[ 0.0024,  0.0006, -0.0174, -0.0061],
        [ 0.0213,  0.0039,  0.0084,  0.0004],
        [ 0.0087,  0.0153, -0.0107,  0.0029],
        [-0.0076,  0.0026,  0.0274,  0.0053],
        [-0.0008,  0.0248,  0.0031,  0.0070],
        [-0.0174,  0.0061,  0.0236, -0.0080],
        [-0.0005, -0.0086, -0.0030, -0.0001],
        [ 0.0065,  0.0171, -0.0044, -0.0057]])

In [34]:
# Initialize weights to a constant.
def init_constant(m):
    """Fill the weight of an exact ``nn.Linear`` with ones; ignore other modules."""
    if type(m) is not nn.Linear:
        return
    nn.init.constant_(m.weight, 1)
# Xavier initialization.
def init_xavier(m):
    """Apply Xavier-normal init to the weight of an exact ``nn.Linear``; ignore other modules."""
    if type(m) is not nn.Linear:
        return
    nn.init.xavier_normal_(m.weight)
# Different initializers can be applied to different layers of the same network:
# Xavier for the first layer, constant ones for the last.
net[0].apply(init_xavier)
net[2].apply(init_constant)
print(net[0].weight.data)

tensor([[ 0.0663,  0.0467, -0.6499,  1.0888],
        [-0.4460,  0.1132, -0.4972,  0.2118],
        [-0.1786, -0.1659,  0.2795,  0.9941],
        [ 0.0941,  0.0828,  0.0097, -0.4063],
        [-0.4029, -0.3692, -0.1537,  0.0472],
        [-0.4940, -0.4170,  0.5301,  0.3784],
        [ 0.0917,  0.0374,  0.5043,  0.3448],
        [-0.7164, -0.9209,  0.1347, -0.1637]])


In [35]:
# Custom initialization.
def my_init(m):
    """Custom initializer for an exact ``nn.Linear``; other modules are ignored.

    Prints the name and shape of the module's first parameter, draws the
    weight from U(-10, 10), then zeroes every entry whose magnitude is
    below 5.
    """
    if type(m) is not nn.Linear:
        return
    first_name, first_param = next(iter(m.named_parameters()))
    print('init', first_name, first_param.shape)
    # Uniform draw on [-10, 10].
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: keep entries with |w| >= 5, multiply the rest by 0.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask

# Run the custom initializer on every submodule, then show the first two weight rows of layer 0.
net.apply(my_init)
net[0].weight[:2]

init weight torch.Size([8, 4])
init weight torch.Size([1, 8])


tensor([[ 0.0000,  0.0000,  8.9851,  0.0000],
        [-7.5785, -0.0000,  9.1425, -0.0000]], grad_fn=<SliceBackward0>)

In [44]:
# Parameters can also be overwritten by hand through .data.
# NOTE(review): the in-place `+= 1` is not idempotent - every re-run of this cell
# adds another 1. The recorded output below is shifted by 6 relative to the
# previous cell, which suggests the cell was executed several times.
net[0].weight.data[:]+=1
net[0].weight.data[0,0]=42
net[0].weight.data

tensor([[42.0000,  6.0000, 14.9851,  6.0000],
        [-1.5785,  6.0000, 15.1425,  6.0000],
        [14.9646, -0.6480,  0.3961,  6.0000],
        [-1.5388, 11.4065, 14.7135, 15.1823],
        [-1.0243, 12.0140, 11.5659, 13.7363],
        [-0.9213,  6.0000,  6.0000, 14.5679],
        [ 6.0000,  6.0000,  6.0000,  6.0000],
        [-0.9396, 13.7712, -3.4796,  6.0000]])

In [46]:
# Parameter tying: reuse one layer object at several positions so they share weights.
shared = nn.Linear(8,8)
net = nn.Sequential(nn.Linear(4,8),nn.ReLU(),
                    shared,nn.ReLU(),
                    shared,nn.ReLU(),
                    nn.Linear(8,1))
net(X)
# net[2] and net[4] are the same `shared` object, so their weight rows compare equal ...
print(net[2].weight.data[0]==net[4].weight.data[0])
net[2].weight.data[0,0]=100
# ... and still do after one of them has been modified in place.
print(net[2].weight.data[0]==net[4].weight.data[0])


tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


----
构建自定义层（模型由多个层组成，每个层在前向传播中执行各自的计算，例如矩阵乘法）

层和模块都是 nn.Module 的子类；nn.Module 的 `__call__` 会自动调用 forward 方法，因此使用时直接写 `layer(x)` 即可，不必写 `layer.forward(x)`。

In [50]:
import torch
import torch.nn.functional as F
from torch import nn

class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the overall mean from its input."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Return ``X`` shifted so that its mean over all elements is zero."""
        offset = X.mean()
        return X - offset

# Quick check on 1..5: the mean is 3, so the result is centered around 0.
layer = CenteredLayer()
layer(torch.FloatTensor([1,2,3,4,5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [53]:
# A custom layer can be composed with built-in layers like any other module.
net = nn.Sequential(nn.Linear(8,128),CenteredLayer())
Y = net(torch.rand(4,8))
# The mean of the centered output is zero up to floating-point rounding.
print(Y.mean())

tensor(-9.3132e-10, grad_fn=<MeanBackward0>)


In [56]:
# Custom layer with learnable parameters.
class MyLinear(nn.Module):
    """Fully connected layer followed by ReLU, with hand-declared parameters.

    Args:
        in_units: size of each input sample (rows of the weight matrix).
        units: size of each output sample (columns of the weight matrix).
    """

    def __init__(self,in_units,units):
        super().__init__()
        # Wrapping the tensors in nn.Parameter registers them with the module,
        # so they appear in parameters()/state_dict() and can receive gradients.
        # torch.randn samples a standard normal; torch.rand would be uniform on [0, 1).
        self.weight = nn.Parameter(torch.randn(in_units,units))
        self.bias = nn.Parameter(torch.randn(units))

    def forward(self,X):
        """Return ``relu(X @ weight + bias)``."""
        # Use the Parameters themselves rather than ``.data``: ``.data`` is
        # detached from autograd, so gradients would never reach weight/bias
        # and the layer could not be trained.
        linear = torch.matmul(X,self.weight)+self.bias
        return F.relu(linear)

# Instantiate a 5 -> 3 MyLinear, show its weight matrix, and run it on a random 2-sample batch.
linear = MyLinear(5,3)
print(linear.weight.data)
linear(torch.rand(2,5))


tensor([[-0.1618,  0.2227,  0.1049],
        [ 1.3816, -0.5240, -1.2592],
        [ 0.1696, -2.5408, -0.2076],
        [ 1.8466,  2.0874,  0.3024],
        [-1.5016, -0.3834, -1.0029]])


tensor([[0.0000, 0.5458, 0.0000],
        [1.5721, 0.0217, 0.0000]])

In [63]:
# Build a model that combines a built-in Linear with the custom MyLinear layer.
net = nn.Sequential(nn.Linear(64,8),MyLinear(8,1))
net(torch.rand(3,64))

tensor([[0.5153],
        [1.9762],
        [0.6330]], grad_fn=<ReluBackward0>)

---- 

# 模型保存在本地如何实现

In [71]:
import os

# torch.save does not create missing parent directories, so make sure the
# target folder exists before anything is written into it.
os.makedirs('./weights', exist_ok=True)

# 1. Save a single tensor
x = torch.arange(4)
torch.save(x,'./weights/x-file')  # local file name: x-file
# 1.1 Load the tensor back
x2 = torch.load('./weights/x-file')
print(x2)

# 2. Save and load a list of tensors
y = torch.zeros(4)
torch.save([x,y],'./weights/x-files')

x2,y2 = torch.load('./weights/x-files')
print(x2,y2)

# 3. Save and load a dict of tensors
mydict = {'x':x,'y':y}
torch.save(mydict,'./weights/mydict')
mydict2 = torch.load('./weights/mydict')
print(mydict2)


tensor([0, 1, 2, 3])
tensor([0, 1, 2, 3]) tensor([0., 0., 0., 0.])
{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}


In [87]:
# 4. Saving and loading a model
class MLP(nn.Module):
    """Two-layer perceptron used for the save/load demo: Linear(20, 256) -> ReLU -> Linear(256, 10)."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(in_features=20, out_features=256)
        self.output = nn.Linear(in_features=256, out_features=10)

    def forward(self, x):
        """Apply the hidden layer, ReLU, then the output layer to ``x``."""
        activated = F.relu(self.hidden(x))
        return self.output(activated)

# Create a model and record its output on a random batch for later comparison.
net = MLP()
X = torch.randn(2,20)
Y = net(X)
print(Y)

# Persist only the parameters (the state dict), not the whole module object.
# NOTE(review): assumes the ./weights directory already exists - confirm.
torch.save(net.state_dict(),'./weights/mlp.params')

# Load: build a fresh instance of the same architecture and restore the saved parameters into it.
clone = MLP()
clone.load_state_dict(torch.load('./weights/mlp.params'))
clone.eval()  # switch to evaluation mode for inference: dropout is disabled and normalization layers use the running statistics collected during training
Y_clone = clone(X)
# Same parameters and same input, so every element compares equal.
print(Y_clone==Y)

tensor([[-0.4940, -0.0014,  0.2467,  0.1463, -0.1547, -0.2169, -0.0428, -0.1436,
         -0.5019, -0.2251],
        [-0.1169,  0.0587,  0.0408,  0.2293,  0.1631,  0.0645,  0.1849, -0.1220,
         -0.1051, -0.1488]], grad_fn=<AddmmBackward0>)
tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])


----
GPU

In [88]:
# IPython shell escape: run nvidia-smi to print the driver version and GPU status table.
!nvidia-smi

Tue Sep 24 16:17:30 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 512.15       Driver Version: 512.15       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   35C    P8     3W /  N/A |      0MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces