In [1]:
import torch
from torch import nn
from torch.nn import functional as F

# 层和块

In [2]:
net = nn.Sequential(nn.Linear(20, 256),
                    nn.ReLU(),
                    nn.Linear(256, 10))

X = torch.rand(2, 20)
net(X)

tensor([[ 0.0846, -0.0449,  0.0950, -0.0617, -0.1398, -0.0039,  0.0050, -0.0146,
          0.0903, -0.0608],
        [ 0.0366, -0.0043,  0.1464,  0.0157, -0.1390,  0.0325,  0.0327, -0.0012,
          0.0570, -0.0447]], grad_fn=<AddmmBackward0>)

## 自定义块

In [3]:
class MLP(nn.Module):
    # 用模型参数声明层。这里声明2个全连接层
    def __init__(self):
        # 调用MLP的父类Module的构造函数来执行必要的初始化
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    # 定义模型的前向计算，即如何根据输入x计算返回的预测值y
    def forward(self, x):
        return self.out(F.relu(self.hidden(x)))

In [4]:
net = MLP()
net(X)

tensor([[ 0.1179,  0.2446,  0.2538,  0.0369, -0.2304,  0.2575,  0.1000, -0.1030,
          0.2115, -0.0753],
        [ 0.1553,  0.2019,  0.1728,  0.0670, -0.0829,  0.4435,  0.0793, -0.1056,
          0.2301,  0.0200]], grad_fn=<AddmmBackward0>)

## 在前向传播函数中执行代码

In [10]:
class FixedHiddenMLP(nn.Module):
    def __init__(self):
        super().__init__()
        # 不计算梯度的随机权重参数，因此在训练期间不变
        self.rand_weight = torch.rand(20, 20, requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        X = self.linear(X)
        # 使用创建的常量参数
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # 复用全连接层，相当于两个层共享参数
        X = self.linear(X)
        # 控制流
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

In [11]:
net = FixedHiddenMLP()
net(X)

tensor(-0.3027, grad_fn=<SumBackward0>)

In [14]:
class NestMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(20, 64), nn.ReLU(),
                                 nn.Linear(64, 32), nn.ReLU())
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        return self.linear(self.net(X))


chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
chimera(X)

tensor(0.1169, grad_fn=<SumBackward0>)

# 参数管理

In [16]:
net = nn.Sequential(nn.Linear(4, 8),
                    nn.ReLU(),
                    nn.Linear(8, 1))

X = torch.rand(size=(2, 4))
net(X)

tensor([[0.2800],
        [0.2369]], grad_fn=<AddmmBackward0>)

## 参数访问

In [17]:
# 检查第2个全连接层的参数
print(net[2].state_dict())

OrderedDict([('weight', tensor([[ 0.0242, -0.1177, -0.0145,  0.1872,  0.2512,  0.0126,  0.3535, -0.0684]])), ('bias', tensor([0.0005]))])


### 目标参数

In [19]:
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([0.0005], requires_grad=True)
tensor([0.0005])


In [20]:
net[2].weight.grad == None

True

### 一次性访问所有参数

In [21]:
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [22]:
net.state_dict()['2.bias'].data

tensor([0.0005])

### 从嵌套块收集参数

In [23]:
def block1():
    return nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                         nn.Linear(8, 4), nn.ReLU())


def block2():
    net = nn.Sequential()
    for i in range(4):
        # 在这里嵌套
        net.add_module(f'block {i}', block1())
    return net


rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[-0.5493],
        [-0.5492]], grad_fn=<AddmmBackward0>)

In [24]:
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [25]:
rgnet[0][1][0].bias.data

tensor([ 0.2643,  0.1869,  0.0360, -0.4075,  0.0611, -0.3521, -0.3877,  0.4724])

## 参数初始化

### 内置初始化

In [26]:
def init_normal(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        nn.init.zeros_(m.bias)


net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0100,  0.0189, -0.0030, -0.0057]), tensor(0.))

In [27]:
def init_constant(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        nn.init.zeros_(m.bias)


net[0].apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([1., 1., 1., 1.]), tensor(0.))

In [29]:
# 用xavier方法初始化
def init_xavier(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)


# 自定义初始化
def init_42(m):
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 42)
        nn.init.zeros_(m.bias)


net[0].apply(init_xavier)
net[2].apply(init_42)

print(net[0].weight.data[0], net[0].bias.data[0])
print(net[2].weight.data[0], net[2].bias.data[0])

tensor([ 0.3289, -0.5094, -0.5570, -0.1416]) tensor(0.)
tensor([42., 42., 42., 42., 42., 42., 42., 42.]) tensor(0.)


### 参数共享

In [32]:
# 定义共享层，以便引用参数
shared = nn.Linear(8, 8)

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),
                    nn.Linear(8, 1))
net(X)

# 检查参数是否共享
print(net[2].weight.data[0] == net[4].weight.data[0])

# 改变共享层的参数
net[2].weight.data[0, 0] = 100

# 检查是否真的共享
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


# 自定义层

## 不带参数的层

In [33]:
class CenteredLayer(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, X):
        return X - X.mean()

In [34]:
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [38]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
Y.mean()

tensor(0., grad_fn=<MeanBackward0>)

## 带参数的层

In [39]:
class MyLinear(nn.Module):
    def __init__(self, in_units, out_units):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, out_units))
        self.bias = nn.Parameter(torch.randn(out_units, ))

    def forward(self, X):
        return F.relu(X.mm(self.weight.data) + self.bias.data)

In [40]:
linear = MyLinear(5, 3)
linear.weight.data, linear.bias.data

(tensor([[-7.5904e-01, -4.4376e-01, -5.2586e-02],
         [ 2.7010e-01,  1.6096e+00,  1.1526e+00],
         [-1.3138e-03,  4.5547e-01, -6.8279e-01],
         [ 2.6027e+00,  1.6051e+00, -8.4308e-01],
         [ 3.2514e-01,  9.1211e-01, -8.2289e-02]]),
 tensor([-0.6037, -0.1418,  0.3063]))

In [41]:
linear(torch.rand(4, 5))

tensor([[0.8062, 1.1851, 0.0000],
        [0.3634, 2.1871, 0.8922],
        [0.0000, 0.3780, 0.3825],
        [1.4830, 2.8557, 0.3157]])

In [42]:
net = nn.Sequential(MyLinear(64, 8), MyLinear(8, 1))
net(torch.rand(3, 64))

tensor([[6.6480],
        [8.1332],
        [1.9611]])

# 读写文件

## 加载和保存张量

In [44]:
x = torch.arange(4)
torch.save(x, 'x-file')  # 保存张量到文件

In [46]:
x2 = torch.load('x-file')  # 从文件中加载张量
x2

tensor([0, 1, 2, 3])

In [47]:
y = torch.zeros(4)
torch.save([x, y], 'x-files')  # 保存两个张量到文件

In [48]:
x2, y2 = torch.load('x-files')  # 从文件中加载两个张量
x2, y2

(tensor([0, 1, 2, 3]), tensor([0., 0., 0., 0.]))

In [49]:
mydict = {'x': x, 'y': y}
torch.save(mydict, 'mydict')  # 保存字典到文件
mydict2 = torch.load('mydict')  # 从文件中加载字典
mydict2

{'x': tensor([0, 1, 2, 3]), 'y': tensor([0., 0., 0., 0.])}

## 加载和保存模型参数

In [50]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(20, 256)
        self.output = nn.Linear(256, 10)

    def forward(self, X):
        return self.output(F.relu(self.hidden(X)))

In [51]:
net = MLP()
X = torch.randn(3, 20)
y = net(X)

In [52]:
# 存储模型参数
torch.save(net.state_dict(), 'mlp.params')

In [53]:
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))  # 加载模型参数
clone.eval()  # 设置为评估模式

MLP(
  (hidden): Linear(in_features=20, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
)

In [54]:
y_clone = clone(X)
print(y_clone == y)

tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])


# GPU

## 计算设备

In [55]:
! nvidia-smi

Thu May 23 22:53:06 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 517.47       Driver Version: 517.47       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   43C    P0    N/A /  N/A |      0MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [56]:
torch.device('cpu'), torch.device('cuda')

(device(type='cpu'), device(type='cuda'))

In [57]:
torch.cuda.device_count()

1

## 张量与GPU

In [58]:
x = torch.tensor([1, 2, 3])
x.device

device(type='cpu')

### 存储在GPU上

In [59]:
X = torch.randn(2, 3, device='cuda')
X

tensor([[-0.3782,  0.9844,  0.2949],
        [ 1.0046,  1.0984,  1.1872]], device='cuda:0')

### 复制

In [60]:
Y = torch.randn(2, 3)  # 默认存储在CPU上
Y.device

device(type='cpu')

In [61]:
Z = Y.cuda()  # 复制到GPU
print(X)
print(Z)

tensor([[-0.3782,  0.9844,  0.2949],
        [ 1.0046,  1.0984,  1.1872]], device='cuda:0')
tensor([[ 0.3086, -0.5646, -1.6916],
        [-1.2670, -0.1028, -0.8035]], device='cuda:0')


In [62]:
X + Z

tensor([[-0.0696,  0.4198, -1.3967],
        [-0.2624,  0.9956,  0.3837]], device='cuda:0')

In [64]:
Z.cuda() is Z

True

## 神经网络与GPU

In [65]:
net = nn.Sequential(nn.Linear(3, 1))
net = net.to('cuda')

In [68]:
net(X)

tensor([[0.0805],
        [0.1600]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [69]:
net[0].weight.data.device

device(type='cuda', index=0)