# Pytorch基本训练框架

## 基本组件1: 神经网络
1. 所有的Pytorch神经网络都必须继承自一个基类: nn.Module
2. 两个最重要的函数:
    1. 构造函数 __init__ : 定义所有成员变量, 也就是网络结构
    2. forward()函数: 定义网络的前向过程

In [49]:
import torch
import torch.nn as nn

# 定义网络
class TestNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(784,256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256,256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256,10)
        )
        
    def forward(self,x):
        B,C,W,H = x.shape
        x = x.view(B,W*H)
        return self.net(x)

# 初始化网络
model = TestNet().cuda()

In [52]:
list(model.parameters())

[Parameter containing:
 tensor([[ 5.0412e-03, -5.2985e-03, -3.3600e-02,  ...,  2.6739e-02,
          -1.2767e-02, -2.6027e-02],
         [-3.4408e-03, -9.6523e-03, -1.8345e-02,  ...,  3.0322e-02,
           1.9581e-02, -1.0461e-02],
         [ 1.7202e-02,  3.0045e-02, -2.5928e-02,  ..., -2.4950e-02,
           3.2066e-02,  1.7404e-02],
         ...,
         [ 1.2372e-02,  2.6555e-02,  1.5035e-02,  ..., -1.3219e-02,
           7.5606e-03,  2.5888e-02],
         [ 5.9575e-05, -1.2969e-03,  2.0245e-02,  ..., -1.2663e-02,
          -1.1087e-04,  4.9757e-03],
         [ 2.2977e-02, -1.6914e-02,  1.5477e-02,  ...,  3.2303e-02,
           2.7043e-02, -1.9879e-02]], device='cuda:0', requires_grad=True),
 Parameter containing:
 tensor([-0.0061,  0.0253, -0.0056,  0.0139,  0.0051,  0.0211,  0.0136,  0.0338,
         -0.0059,  0.0026, -0.0187,  0.0212, -0.0073,  0.0172, -0.0085, -0.0333,
          0.0143, -0.0036, -0.0168, -0.0331, -0.0320, -0.0275, -0.0053,  0.0060,
         -0.0155, -0.0129, -

## 基本组件2: 优化器
1. 优化器一般不用自己写, 通常继承自: torch.optim.Optimizer
2. 优化器的构造函数中必须定义该优化器对应的可优化参数
3. 例如learning rate, weight decay, momentum 之类的都是不同优化器的可调节参数，这些参数对最终模型的性能影响非常大

In [53]:
from torch.optim import Adam
optimizer = Adam(model.parameters(),lr=1e-3)

## 基本组件3: 损失函数
1. 损失函数可以自己任意定义，一般来说输出是一个标量的损失值

In [13]:
import torch.nn.functional as F
loss_func = F.cross_entropy

## 定义训练函数
有了上述三个组件，我们就可以定义训练过程了，在很多框架中，下面的这个函数被称为trainer

trainer 定义了训练中一个完整的正向、反向传播过程

trainer 不一定有返回值,一般会返回loss的数值进行可视化，在这个过程中，重要的是model这个对象的所有参数获得了更新

In [31]:
# 注意loss_func不一定通过传参形式给到trainer, 可以直接import
# device 是CPU或CUDA, 或者特定编号的GPU
def trainer(batch, model, optimizer, loss_func, device):
    # 将模型参数设为训练模式
    model.train()
    # 从batch中获取输入数据和标签(不一定有标签)
    x, y = batch
    # 将数据存入对应设备中
    x = x.to(device)
    y = y.to(device)
    # 梯度清零
    optimizer.zero_grad()
    # 前向传播
    y_hat = model(x)
    # 计算loss
    loss = loss_func(y_hat,y)
    # 反向传播获取梯度
    loss.backward()
    # 更新参数
    optimizer.step()

    # 计算准确率
    predictions = torch.argmax(y_hat, dim=1)  # 获取模型的预测结果
    correct_predictions = (predictions == y).sum().item()  # 统计正确的预测数量
    return loss.item() / y.shape[0], correct_predictions

In [61]:
y = torch.randint(0,10,(32,1))

In [102]:
optimizer.zero_grad()

In [103]:
x = torch.randn(32,1,28,28).cuda()
y_hat = model(x)
loss = y_hat.sum()

In [104]:
model.net[0].weight

Parameter containing:
tensor([[ 5.0412e-03, -5.2985e-03, -3.3600e-02,  ...,  2.6739e-02,
         -1.2767e-02, -2.6027e-02],
        [-3.4408e-03, -9.6523e-03, -1.8345e-02,  ...,  3.0322e-02,
          1.9581e-02, -1.0461e-02],
        [ 1.7202e-02,  3.0045e-02, -2.5928e-02,  ..., -2.4950e-02,
          3.2066e-02,  1.7404e-02],
        ...,
        [ 1.2372e-02,  2.6555e-02,  1.5035e-02,  ..., -1.3219e-02,
          7.5606e-03,  2.5888e-02],
        [ 5.9575e-05, -1.2969e-03,  2.0245e-02,  ..., -1.2663e-02,
         -1.1087e-04,  4.9757e-03],
        [ 2.2977e-02, -1.6914e-02,  1.5477e-02,  ...,  3.2303e-02,
          2.7043e-02, -1.9879e-02]], device='cuda:0', requires_grad=True)

In [105]:
loss.backward()

In [106]:
optimizer.step()

In [107]:
model.net[0].weight

Parameter containing:
tensor([[ 0.0040, -0.0043, -0.0326,  ...,  0.0257, -0.0118, -0.0250],
        [-0.0044, -0.0107, -0.0193,  ...,  0.0313,  0.0186, -0.0095],
        [ 0.0162,  0.0310, -0.0269,  ..., -0.0239,  0.0331,  0.0164],
        ...,
        [ 0.0134,  0.0256,  0.0160,  ..., -0.0122,  0.0086,  0.0249],
        [ 0.0011, -0.0003,  0.0212,  ..., -0.0117,  0.0009,  0.0040],
        [ 0.0240, -0.0159,  0.0145,  ...,  0.0313,  0.0260, -0.0189]],
       device='cuda:0', requires_grad=True)

## 定义验证函数
1. 验证函数需要将模型设置为预测模式
2. 需要输入validation loader

In [117]:
def validate(val_loader, model, loss_func, device):
    # 将模型参数设为评估模式
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():  # 禁用梯度计算
        for batch in val_loader:
            x, y = batch
            x = x.to(device)
            y = y.to(device)

            # 前向传播
            y_hat = model(x)
            # 计算loss
            loss = loss_func(y_hat, y)

            total_loss += loss.item() / y.size(0)

            # 计算准确率
            predictions = torch.argmax(y_hat, dim=1)
            correct_predictions += (predictions == y).sum().item()
            total_samples += y.size(0)

    # 计算平均损失和准确率
    average_loss = total_loss / len(val_loader)
    accuracy = correct_predictions / total_samples

    print(f"Validation Loss: {average_loss:.4f} | Accuracy: {accuracy * 100:.2f}%")

    return average_loss, accuracy

## 输入数据

## 数据集
1. 数据集通常继承自: torch.utils.data.Dataset 基类
2. 对于图像中的常用数据集, 一般会在torchvision库中有定义好的Dataset类
3. 对于不常用的数据集，往往需要手写Dataset类, 手写Dataset类时一定需要的是__getitem__方法

In [118]:
from torch.utils.data import Dataset
from torchvision.datasets import MNIST
import torchvision.transforms as T

# 定义数据集的转换
my_transform = T.Compose([
    T.ToTensor(),  # 将图像转换为张量
    T.Normalize((0.5,), (0.5,))  # 标准化张量，使其范围在[-1, 1]之间
])

# 初始化训练集和测试集
train_dataset = MNIST(root='./data', train=True, transform=my_transform, download=True)
test_dataset = MNIST(root='./data', train=False, transform=my_transform, download=True)

# 自定义数据集
class MyDataset(Dataset):
    def __init__(self, transform):
        super().__init__()
        self.data = [0 for i in range(100)]
        self.transform = transform

    def __getitem__(self, index):
        return self.transform(self.data[index])

## 数据读取
1. DataLoader可以从数据集中读取不同batch的数据，可以通过定义num_workers来定义数据读取线程的数量, 通过pin_memory来决定数据是否要存放在内存条中

In [119]:
from torch.utils.data import DataLoader

# 定义 DataLoader 来加载数据
batch_size = 512
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=8, pin_memory=True)

## 开始训练

## 结果输出
1. 可以输出到txt文件
2. 直接使用print
3. 使用tensorboard等工具

# Tips
1. 代码需要遵循“高内聚，低耦合”的设计思路，既每个函数、每个类都只完成最少的任务，这样才方便修改，快速移植
2. 善于组合别人的代码，将别人的模块化用到自己的代码中

In [120]:
device = "cuda"
total_ep = 100
for ep in range(total_ep):
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    for batch in train_loader:
        loss, correct_predictions = trainer(batch, model, optimizer, loss_func, device)
        total_loss += loss
        total_correct += correct_predictions
        total_samples += batch[1].shape[0]
    average_loss = total_loss / len(train_loader)
    accuracy = total_correct / total_samples
    print(f"Epoch: {ep} Training Loss: {average_loss:.4f} | Accuracy: {accuracy * 100:.2f}%")
        
    if (ep+1) % 5 == 0:
        validate(test_loader, model, loss_func, device)

Epoch: 0 Training Loss: 0.0016 | Accuracy: 81.53%
Epoch: 1 Training Loss: 0.0007 | Accuracy: 93.25%
Epoch: 2 Training Loss: 0.0005 | Accuracy: 95.18%
Epoch: 3 Training Loss: 0.0004 | Accuracy: 96.37%
Epoch: 4 Training Loss: 0.0003 | Accuracy: 97.11%
Validation Loss: 0.0003 | Accuracy: 97.03%
Epoch: 5 Training Loss: 0.0003 | Accuracy: 97.66%
Epoch: 6 Training Loss: 0.0002 | Accuracy: 98.09%
Epoch: 7 Training Loss: 0.0002 | Accuracy: 98.44%
Epoch: 8 Training Loss: 0.0002 | Accuracy: 98.70%
Epoch: 9 Training Loss: 0.0001 | Accuracy: 98.90%
Validation Loss: 0.0002 | Accuracy: 98.00%
Epoch: 10 Training Loss: 0.0001 | Accuracy: 99.12%
Epoch: 11 Training Loss: 0.0001 | Accuracy: 99.27%
Epoch: 12 Training Loss: 0.0001 | Accuracy: 99.45%
Epoch: 13 Training Loss: 0.0001 | Accuracy: 99.59%
Epoch: 14 Training Loss: 0.0001 | Accuracy: 99.66%
Validation Loss: 0.0002 | Accuracy: 98.18%
Epoch: 15 Training Loss: 0.0001 | Accuracy: 99.77%
Epoch: 16 Training Loss: 0.0000 | Accuracy: 99.80%
Epoch: 17 Trai