# torch train basics

使用 torch 进行训练需要四个要素:数据集、网络结构定义、损失(Loss)函数和优化器。



In [11]:

# linear regression
import torch
import os
from torch import nn

# Restrict this process to GPU 0.
# NOTE(review): this is assumed to take effect only if it runs before CUDA is
# first initialised in the process — confirm if CUDA is touched earlier.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

# Seed the global RNG so the synthetic data (and any parameters initialised
# afterwards) are identical on every Restart & Run All.
torch.manual_seed(0)

# Prefer the GPU, but fall back to the CPU so the cell still runs without one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

hidden_size = 64    # number of input features per sample
num_samples = 1024  # number of rows in the synthetic dataset

# Synthetic linear-regression problem: random inputs and random ground-truth
# parameters, with targets computed exactly as data @ weight + bias.
# Tensors are drawn on the host and then moved, so the values do not depend on
# which device is selected.
data = torch.rand(num_samples, hidden_size, dtype=torch.float32).to(device)
weight = torch.rand(hidden_size, 1, dtype=torch.float32).to(device)
bias = torch.rand(1, dtype=torch.float32).to(device)
expect = data @ weight + bias

class LinearRegression(nn.Module):
    """Linear-regression model consisting of a single ``nn.Linear`` layer with bias.

    Args:
        in_features: Width of each input row. ``None`` (the default) uses the
            notebook-level ``hidden_size``, which is what the zero-argument
            constructor has always done.
        out_features: Number of outputs per row. Defaults to 1.
    """

    def __init__(self, in_features=None, out_features=1):
        super().__init__()
        if in_features is None:
            # Looked up at construction time so LinearRegression() keeps
            # following whatever ``hidden_size`` is currently defined.
            in_features = hidden_size
        self.linear = nn.Linear(in_features=in_features, out_features=out_features, bias=True)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)


# Hyper-parameters.
lr = 1e-3
epochs = 1  # one pass is enough to trace a single update; raise it to actually fit
batch_size = 64

# Keep the model on the same device as the training data.
model = LinearRegression().train().to(data.device)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)


def _show_bias(enabled):
    """Print the model's bias parameter when ``enabled`` is true."""
    if enabled:
        print(model.linear.bias)


for epoch in range(epochs):
    # Step through the dataset in mini-batches; slicing handles a short last batch.
    for idx in range(0, data.shape[0], batch_size):
        x = data[idx: idx + batch_size]
        y = expect[idx: idx + batch_size]

        # The bias is printed after every stage of the first mini-batch to show
        # which call modifies the parameters: only the print that follows
        # optimizer.step() shows a different value.
        trace = idx == 0

        preds = model(x)
        _show_bias(trace)
        loss = loss_fn(preds, y)  # MSELoss takes (input, target)
        _show_bias(trace)
        optimizer.zero_grad()
        _show_bias(trace)
        loss.backward()
        _show_bias(trace)
        optimizer.step()
        _show_bias(trace)
    # Report the loss of the last mini-batch every 100 epochs.
    if epoch % 100 == 0:
        print(loss)



Parameter containing:
tensor([-0.0410], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.0410], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.0410], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.0410], device='cuda:0', requires_grad=True)
Parameter containing:
tensor([-0.0105], device='cuda:0', requires_grad=True)
tensor(82.7912, device='cuda:0', grad_fn=<MseLossBackward0>)


In [8]:
model.eval()
# Optional checks: uncomment to compare the learned parameters with the
# ground-truth weight and bias used to generate the targets.
# print(weight.view(-1))
# print(model.linear.weight.view(-1))

# print(bias)
# print(model.linear.bias)

# Inference only: disable autograd so the forward pass does not record a graph.
with torch.no_grad():
    # Targets and predictions for the first mini-batch, flattened for side-by-side reading.
    print(expect[:batch_size].view(-1))
    print(model(data[:batch_size]).view(-1))

tensor([15.9450, 14.9724, 15.9223, 17.0857, 15.0906, 14.8759, 14.2641, 16.3343,
        15.6017, 17.0765, 12.9845, 15.5586, 16.9820, 15.9542, 17.7072, 16.3956,
        17.0560, 15.6756, 15.6948, 16.3338, 17.4296, 15.3655, 17.1667, 15.1330,
        15.4936, 15.1647, 16.7156, 13.6955, 14.7442, 16.0632, 12.8069, 15.1517,
        16.1933, 14.7765, 14.2440, 14.2172, 14.4803, 15.4032, 14.9916, 14.3205,
        12.7070, 13.8750, 13.9180, 16.8788, 16.4735, 16.1670, 13.4830, 14.4689,
        13.8179, 15.6851, 14.7772, 14.4882, 16.2031, 14.0884, 13.5313, 17.5115,
        17.2383, 16.0091, 16.2571, 17.0073, 15.2144, 14.9747, 13.7560, 15.4398],
       device='cuda:0')
tensor([15.9539, 14.9790, 15.9161, 17.0165, 15.1091, 14.8780, 14.2719, 16.3147,
        15.5802, 17.0503, 13.0557, 15.5458, 16.9319, 15.9438, 17.6500, 16.3750,
        17.0072, 15.6966, 15.6925, 16.3340, 17.3589, 15.3501, 17.0987, 15.1292,
        15.4643, 15.1721, 16.6982, 13.7601, 14.7692, 16.0673, 12.8510, 15.1519,
        16.1572

## profiling torch module

torch 提供了专门的 profiler API,用于分析代码的耗时以及内存消耗。

In [9]:
# profiler

import torch
import numpy as np
from torch import nn
import torch.autograd.profiler as profiler

class MyModule(nn.Module):
    """Linear layer followed by a mask-thresholding step, labelled for the profiler.

    Args:
        in_features: Input width passed to ``nn.Linear``.
        out_features: Output width passed to ``nn.Linear``.
        bias: Whether the linear layer has a bias term.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)

    def forward(self, input, mask):
        """Run the linear layer and locate mask entries above a data-dependent threshold.

        Args:
            input: Tensor fed to the linear layer.
            mask: Tensor whose elements are compared with the threshold.

        Returns:
            Tuple ``(out, hi_idx)``: ``out`` is the linear layer's output and
            ``hi_idx`` holds the indices (one row per hit, as produced by
            ``np.argwhere``) of ``mask`` elements greater than the mean of the
            row sums of ``out``. ``hi_idx`` is placed on ``mask``'s device.
        """
        with profiler.record_function("LINEAR PASS"):
            out = self.linear(input)

        with profiler.record_function("MASK INDICES"):
            # .item() pulls the scalar threshold back to Python.
            threshold = out.sum(dim=1).mean().item()
            # The comparison is deliberately done in numpy on the host: this
            # round trip is the cost the "MASK INDICES" label exists to measure.
            hi_idx = np.argwhere(mask.cpu().numpy() > threshold)
            # Return the indices on the mask's own device rather than assuming CUDA.
            hi_idx = torch.from_numpy(hi_idx).to(mask.device)

        return out, hi_idx

# Build the module and its inputs on the host, then move each one to the GPU.
model = MyModule(in_features=500, out_features=10).to("cuda")
input = torch.rand(128, 500).to("cuda")
mask = torch.rand(500, 500, 500, dtype=torch.double).to("cuda")

# Warm-up call, executed once outside the profiler.
model(input, mask)

# Profile one forward pass, recording call stacks and memory usage.
with profiler.profile(profile_memory=True, with_stack=True) as prof:
    out, idx = model(input, mask)

STAGE:2024-01-17 08:19:56 40283:40283 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-01-17 08:20:00 40283:40283 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-01-17 08:20:00 40283:40283 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


In [10]:
# Average the recorded events (grouped with group_by_stack_n=5), then print
# the five rows with the largest self CPU time.
averaged_events = prof.key_averages(group_by_stack_n=5)
report = averaged_events.table(sort_by='self_cpu_time_total', row_limit=5)
print(report)

-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                     MASK INDICES        69.08%        2.764s        99.87%        3.996s        3.996s           0 b    -953.67 Mb       2.79 Gb      -1.00 Kb             1  
                                  cudaMemcpyAsync        30.76%        1.231s        30.76%        1.231s     410.328ms           0 b           0 b           0 b           0 b             3  
                                      L