In [2]:
import torch

## 张量运算

In [3]:
# Build a 2x2 integer tensor from a nested Python list; the dtype is fixed
# explicitly to 32-bit integers instead of the int64 default.
a = torch.tensor(
    [[1, 2],
     [3, 4]],
    dtype=torch.int32,
)
print(a)

tensor([[1, 2],
        [3, 4]], dtype=torch.int32)


对应位置相乘：

In [4]:
# `*` between two tensors is the element-wise product: each entry of `a` is
# multiplied by the entry at the same position (here: every entry is squared).
a*a

tensor([[ 1,  4],
        [ 9, 16]], dtype=torch.int32)

矩阵乘法：

In [5]:
# `@` is the matrix product: entry (i, j) of the result is the dot product of
# row i of the left operand with column j of the right operand.
a@a

tensor([[ 7, 10],
        [15, 22]], dtype=torch.int32)

创建 torch 张量的时候声明数据类型

In [6]:
# Same nested-list data as before, but stored as half-precision floats by
# declaring the dtype at construction time.
x = torch.tensor(
    [[1, 2],
     [3, 4]],
    dtype=torch.float16,
)
print(x)

tensor([[1., 2.],
        [3., 4.]], dtype=torch.float16)


torch 张量支持反向传播计算梯度

In [7]:
# Reverse-mode autodiff demo: y = sum(x ** 2), so dy/dx should equal 2 * x.
x.requires_grad_(True)  # in-place switch: start recording operations on x

# backward() ADDS into x.grad instead of overwriting it. Clearing the stored
# gradient first keeps this cell idempotent: without it, running the cell a
# second time would print 4 * x, a third time 6 * x, and so on.
x.grad = None

y = (x ** 2).sum()
print("y = ")
print(y)
print()

y.backward()  # populates x.grad with dy/dx
print("dy/dx =")
print(x.grad)

y = 
tensor(30., dtype=torch.float16, grad_fn=<SumBackward0>)

dy/dx =
tensor([[2., 4.],
        [6., 8.]], dtype=torch.float16)


## 大规模张量运算

创建一个 10000 × 10000 的矩阵，元素服从标准正态分布，并开启梯度追踪（`requires_grad=True`），便于后续进行反向传播和自动求导。

In [None]:
# 10,000 x 10,000 standard-normal float32 matrix (about 400 MB) that autograd
# tracks as a leaf tensor.
x = torch.randn((10_000, 10_000), requires_grad=True)

CPU 矩阵乘法已经被优化得非常快

In [9]:
# Square the matrix with the `@` operator (equivalent to x.matmul(x)), then
# collapse the product to a scalar so that backward() can be called on it.
y = x @ x
sum_y = y.sum()

In [10]:
# Back-propagate from the scalar sum_y through the matmul; because x was
# created with requires_grad=True, the result is stored in x.grad.
sum_y.backward()

In [11]:
# Gradient of sum(x @ x) with respect to x. Analytically,
# grad[i, j] = (sum of row j of x) + (sum of column i of x),
# which is why any two rows of the printed gradient differ by a constant.
x.grad

tensor([[  -7.3104,   77.4198,   34.9493,  ...,  -94.5424,   65.5811,
           85.6551],
        [  23.4142,  108.1444,   65.6739,  ...,  -63.8178,   96.3057,
          116.3797],
        [-105.5612,  -20.8310,  -63.3016,  ..., -192.7932,  -32.6698,
          -12.5957],
        ...,
        [  14.9111,   99.6413,   57.1707,  ...,  -72.3209,   87.8026,
          107.8766],
        [ 119.9319,  204.6621,  162.1916,  ...,   32.6999,  192.8234,
          212.8974],
        [ -98.3320,  -13.6018,  -56.0723,  ..., -185.5640,  -25.4405,
           -5.3665]])

创建大张量并将其分配到 GPU（cuda 设备）上

In [12]:
# Allocate the matrix directly on the GPU so that x_cuda itself is the leaf of
# the autograd graph. Creating it on the CPU with requires_grad=True and then
# chaining .to('cuda') yields a non-leaf copy: backward() would send the
# gradient to the discarded CPU tensor and x_cuda.grad would stay None.
# Creating on-device also avoids a 400 MB host-to-device transfer.
x_cuda = torch.randn(10000, 10000, device='cuda', requires_grad=True)

在 GPU 上进行大规模矩阵乘法只需约 0.2 秒，远快于 CPU 计算。这展示了 GPU 在处理大规模张量运算时的强大性能优势，尤其适用于深度学习等高性能计算场景。注意：CUDA 运算是异步执行的，若要精确计时，应在读取时间之前调用 `torch.cuda.synchronize()`。

In [13]:
# Same computation as the CPU version, executed on the GPU: matrix-square
# x_cuda with `@`, then reduce the product to a scalar for backward().
y_cuda = x_cuda @ x_cuda
sum_y_cuda = y_cuda.sum()

In [14]:
# Back-propagate from the scalar sum_y_cuda through the GPU matmul graph;
# gradients accumulate on the leaf tensor(s) of that graph.
# CUDA work is queued asynchronously, so call torch.cuda.synchronize()
# before reading a timer if this cell is being benchmarked.
sum_y_cuda.backward()