In [10]:
# 通过 Nvidia System management interface 查看GPU信息
! nvidia-smi

Fri Jul 21 04:31:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 511.69       Driver Version: 511.69       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:03:00.0 Off |                  N/A |
| N/A   52C    P8    N/A /  N/A |    469MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
from torch import nn

In [14]:
# torch.cuda.device('cuda')
print(torch.cuda.device_count())
torch.device('cpu')
torch.cuda.device('cuda')
torch.device('cuda:0')

1


device(type='cuda', index=0)

In [4]:
# 定义一般化设备辅助函数
def try_gpu(i=0):
    if torch.cuda.device_count() >= i+1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')

def try_all_gpus():
    devices = [torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())]
    return devices if devices else [torch.device('cpu')]

In [22]:
# 测试上述函数
try_gpu(0),try_gpu(3),try_all_gpus()

(device(type='cuda', index=0),
 device(type='cpu'),
 [device(type='cuda', index=0)])

In [23]:
# 张量与GPU
x = torch.tensor([1, 2, 3])
x.device # 默认是在CPU上

device(type='cpu')

In [26]:
# 存储在GPU上，注意创建的数据不要超过显存，可通过nvidia-smi观察 GPU 运行时情况
X = torch.ones(2, 3, device=try_gpu())
X

tensor([[1., 1., 1.],
        [1., 1., 1.]], device='cuda:0')

In [27]:
# 若存在多个GPU，在不同GPU上执行时要复制到同一设备进行运算
Y = torch.rand(2, 3, device=try_gpu(1)) # 至少有2个GPU下运行
Y
Z = X.cuda(1)
print(X)
print(Z)
Y + Z
# 如果设备相同，cuda函数不会复制，验证如下
Z.cuda(1) is Z

RuntimeError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [30]:
X
Z = X.cuda('cuda')
X is Z

True

In [9]:
# 神经网络与GPU
# 注意：张量与网络都需要在GPU上，且cuda不支持整数的矩阵乘法
X = torch.tensor([1.0, 2, 4], device=try_gpu())
net = nn.Sequential(nn.Linear(3, 1))
net.to(try_gpu())
net(X)

tensor([1.0295], device='cuda:0', grad_fn=<AddBackward0>)

In [47]:
net[0].weight.data

tensor([[-0.2700, -0.3359, -0.5395]], device='cuda:0')

In [57]:
X = torch.tensor([1, 2])
Y = torch.tensor([2, 3], device=try_gpu())
# X + Y # 将失败
# 转换同一设备
# 转换到CPU
Y1 = Y.to('cpu')
print(X + Y1)
# 转换到GPU
X1 = X.cuda(0)
X2 = X.to('cuda:0')
print(X1 + Y)
print(X2 + Y)
X3 = X1.cuda(0)
X4 = X1.to('cuda:0')
print(X1 is X3, X1 is X4) # 由此得知to与cuda函数只跨设备才发生拷贝

# 注意：张量转换时通过返回新结果方式获得拷贝后结果，网咯调用to可以不通过返回获得拷贝后结果，综上：建议都用 A = A.to(...) A = A.cuda(...)的方式

tensor([3, 5])
tensor([3, 5], device='cuda:0')
tensor([3, 5], device='cuda:0')
True True
