In [2]:
from tqdm import tqdm

import torch
from torch import nn
from torch import optim
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# tensorboard --logdir=/opt/logs 
writer = SummaryWriter(log_dir="/opt/logs/quick", flush_secs=30)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


In [5]:
train_data = datasets.CIFAR10(
    root="/opt/data",
    train=True,
    download=True,
    transform=ToTensor(),
)

test_data = datasets.CIFAR10(
    root="/opt/data",
    train=False,
    download=True,
    transform=ToTensor(),
)

train_size = len(train_data)
valid_size = len(test_data)
print("训练样本总数:", train_size)
print("测试样本总数:", valid_size)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /opt/data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:16<00:00, 10424687.36it/s]


Extracting /opt/data/cifar-10-python.tar.gz to /opt/data
Files already downloaded and verified
训练样本总数: 50000
测试样本总数: 10000


In [None]:
batch_size = 64
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

print("训练批次数:", len(train_dataloader))
print("测试批次数:", len(test_dataloader))
for X, y in test_dataloader:
    print(X.shape, y.shape)
    break

训练批次数: 938
测试批次数: 157
torch.Size([64, 1, 28, 28]) torch.Size([64])


In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            # (1, 28, 28) -> (32, 26, 26)
            # (32, 26, 26) -> (32, 13, 13)
            nn.Conv2d(1, 32, 3, 1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # (32, 13, 13) -> (64, 11, 11)
            # (64, 11, 11) -> (64, 5, 5)
            nn.Conv2d(32, 64, 3, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(64 * 5 * 5, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 10),
        )

        # 权重初始化
        for m in self.net.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.net(x)

In [None]:
epoch_count = 0  # 总训练轮数
best_accuracy = 0  # 最佳准确率
model_path = "/opt/models/quick.pth"  # 模型保存路径

initial_lr = 1e-2  # 初始学习率
lr_patience = 10  # 学习率等待衰减次数
lr_factor = 0.5  # 学习率衰减因子

worse_count = 0  # 连续无增长计数
worse_tolerance = 20  # 无增长容忍次数

In [6]:
model = NeuralNetwork().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=initial_lr, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=lr_patience, factor=lr_factor)

In [None]:
max_epochs = 100
pbar = tqdm(range(max_epochs))
for i in pbar:
    train_loss, valid_loss, accuracy = 0, 0, 0

    model.train()
    for batch, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    with torch.no_grad():
        for X, y in test_dataloader:
            X, y = X.to(device), y.to(device)

            pred = model(X)
            valid_loss += loss_fn(pred, y).item()
            accuracy += (pred.argmax(1) == y).type(torch.float).sum().item()

    epoch_count += 1
    train_loss /= train_size
    valid_loss /= valid_size
    accuracy = accuracy / valid_size * 100
    scheduler.step(valid_loss)

    if accuracy > best_accuracy:
        worse_count = 0
        best_accuracy = accuracy
        torch.save(model.state_dict(), model_path)
    else:
        worse_count += 1
        if worse_count == worse_tolerance:
            print(f"模型连续{worse_tolerance}次无提升，提前终止训练")
            break

    writer.add_scalar("train_loss", train_loss, i)
    writer.add_scalar("valid_loss", valid_loss, i)
    writer.add_scalar("accuracy", accuracy, i)
    writer.add_scalar("learning_rate", optimizer.param_groups[0]["lr"], i)
    pbar.set_postfix(lr=f"{optimizer.param_groups[0]["lr"]:.0e}", train_loss=f"{train_loss:.2e}", valid_loss=f"{valid_loss:.2e}", accuracy=f"{accuracy:.2f}%", best_accuracy=f"{best_accuracy:.2f}%")

print("模型架构:", model)
print("模型参数:", sum(p.numel() for p in model.parameters()))
print("保存路径:", model_path)
print("最佳训练轮数:", epoch_count - worse_tolerance)
print("最佳准确率:", best_accuracy)

 71%|███████   | 71/100 [04:19<01:46,  3.66s/it, accuracy=90.83%, best_accuracy=91.04%, lr=3e-04, valid_loss=1.02e-02, train_loss=1.41e-03]

模型连续20次无提升，提前终止训练
模型架构: NeuralNetwork(
  (net): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Flatten(start_dim=1, end_dim=-1)
    (9): Linear(in_features=1600, out_features=256, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=256, out_features=10, bias=True)
  )
)
模型参数: 431434
保存路径: /opt/quick.pth
最佳训练轮数: 52
最佳准确率: 91.03999999999999



