In [1]:
# 先导入torch
import torch
import torchvision
from torch import nn
from torch.utils import data
from torchvision import transforms
from d2l import torch as d2l
from torch.nn import functional as F
import numpy as np
from tqdm import tqdm

# Show off the GPU: when CUDA is available, print the properties of CUDA device 0.
if torch.cuda.is_available():
    print(torch.cuda.get_device_properties(0))

_CudaDeviceProperties(name='NVIDIA GeForce RTX 3080', major=8, minor=6, total_memory=10239MB, multi_processor_count=68)


In [2]:
def load_data(batch_size, num_workers=4, root="data", download=False):
    """Build the CIFAR-10 training and test data loaders.

    The training split is augmented (random 32x32 crop after 4-pixel padding,
    random horizontal flip) and then normalised; the test split is only
    converted to a tensor and normalised with the same constants.

    Args:
        batch_size: Number of samples per batch for both loaders.
        num_workers: Worker processes used by each ``DataLoader``. This should
            be tuned to the local hardware; the default (4) is the value that
            was previously hard-coded.
        root: Directory handed to ``torchvision.datasets.CIFAR10``.
        download: Forwarded to ``CIFAR10``. The default ``False`` keeps the
            previous behaviour, i.e. the dataset must already be present
            under ``root``.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader shuffles,
        the test loader does not.
    """
    # Per-channel mean / std used for both splits.
    # NOTE(review): these look like the commonly quoted CIFAR-10 statistics;
    # the original author did not record where they came from -- confirm.
    # Normalised tensors are no longer directly displayable as images.
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

    train_augs = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    test_augs = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    data_train = torchvision.datasets.CIFAR10(
        root=root, train=True, transform=train_augs, download=download)
    data_test = torchvision.datasets.CIFAR10(
        root=root, train=False, transform=test_augs, download=download)

    # DataLoader objects are iterables over batches, so callers can simply
    # write ``for X, y in loader``.
    return (data.DataLoader(data_train, batch_size, shuffle=True, num_workers=num_workers),
            data.DataLoader(data_test, batch_size, shuffle=False, num_workers=num_workers))

In [3]:
class Residual(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers plus a skip connection.

    When the block changes the channel count (or the stride), the input can
    no longer be added to the output directly. Passing ``change=True`` inserts
    a 1x1 convolution on the skip path that maps the input to the output's
    channels and stride so the two can be summed.

    Args:
        input_channels: Channels of the incoming tensor.
        num_channels: Channels produced by the block.
        change: If true, use a 1x1 convolution on the skip path; otherwise
            the input is added unchanged.
        strides: Stride of the first 3x3 convolution and of the 1x1 skip
            convolution (when present).
    """

    def __init__(self, input_channels, num_channels, change=False, strides=1):
        super().__init__()
        # Submodules are created in the same order as before so that
        # state_dict keys and default initialisation order are unchanged.
        self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
        if change:
            self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(X))))) + skip(X))``."""
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        # Compare against None explicitly instead of relying on the
        # truthiness of an nn.Module instance.
        if self.conv3 is not None:
            X = self.conv3(X)
        Y += X
        return F.relu(Y)

In [4]:
# Stem: one bias-free 3x3 convolution, batch norm and ReLU.
# The max-pool that could follow the stem is intentionally left disabled:
#   nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
block1 = nn.Sequential(
    nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False),
    nn.BatchNorm2d(num_features=64),
    nn.ReLU(),
)

# Four stages of two residual blocks each. Every stage after the first
# doubles the channel count and uses strides=2 with a 1x1 skip convolution.
block2 = nn.Sequential(
    Residual(input_channels=64, num_channels=64),
    Residual(input_channels=64, num_channels=64),
)

block3 = nn.Sequential(
    Residual(input_channels=64, num_channels=128, change=True, strides=2),
    Residual(input_channels=128, num_channels=128),
)

block4 = nn.Sequential(
    Residual(input_channels=128, num_channels=256, change=True, strides=2),
    Residual(input_channels=256, num_channels=256),
)

block5 = nn.Sequential(
    Residual(input_channels=256, num_channels=512, change=True, strides=2),
    Residual(input_channels=512, num_channels=512),
)

# Full model: stem, residual stages, global average pooling and a
# 10-way linear classifier.
net = nn.Sequential(
    block1,
    block2,
    block3,
    block4,
    block5,
    nn.AdaptiveAvgPool2d(output_size=1),
    nn.Flatten(),
    nn.Linear(in_features=512, out_features=10),
)

In [5]:
# 直接把LeNet的搬过来
def test(net, test_iter, device=None):
    if isinstance(net, nn.Module):
        net.eval()  # 设置为评估模式，停止dropout和batchnorm
        if not device:
            device = next(iter(net.parameters())).device
    metric = d2l.Accumulator(2)
    # 同样要阻挡梯度的传播
    with torch.no_grad():
        for X, y in test_iter:
            X = X.to(device)
            y = y.to(device)
            # 后者为数组中元素的个数
            metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]

def train(net, train_iter, test_iter, num_epochs, lr, device):
    """Train ``net`` with Adam and cross-entropy loss, reporting progress.

    Training is displayed as up to ten consecutive tqdm progress bars
    ("Iteration 0" ... "Iteration 9"). ``num_epochs`` is spread over these
    bars as evenly as possible, so every requested epoch is run even when
    ``num_epochs`` is not a multiple of ten. After each epoch the model is
    evaluated on ``test_iter``; the bar's postfix (train loss, train accuracy,
    test accuracy) is refreshed every fifth epoch and on the last epoch of
    each bar. The final test accuracy is printed when training ends.

    Args:
        net: Model to train; it is moved to ``device`` in place.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` evaluation batches.
        num_epochs: Total number of epochs to run (must be at least 1).
        lr: Learning rate for ``torch.optim.Adam``.
        device: Device on which training runs.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.
    """
    num_stages = 10  # maximum number of progress bars
    num_epochs = int(num_epochs)
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1, got %d" % num_epochs)

    net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()

    # The first `extra` stages get one more epoch so the total is exact.
    base, extra = divmod(num_epochs, num_stages)
    epochs_done = 0
    for stage in range(num_stages):
        stage_epochs = base + (1 if stage < extra else 0)
        if stage_epochs == 0:
            break  # fewer than `num_stages` epochs requested
        with tqdm(total=stage_epochs, desc='Iteration %d' % stage) as pbar:
            for stage_epoch in range(stage_epochs):
                # Running sums: total loss, correct predictions, sample count.
                metric = d2l.Accumulator(3)
                net.train()
                for X, y in train_iter:
                    optimizer.zero_grad()
                    X, y = X.to(device), y.to(device)
                    y_hat = net(X)
                    l = loss(y_hat, y)
                    l.backward()
                    optimizer.step()
                    with torch.no_grad():
                        metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
                # Epoch averages, computed once per epoch.
                train_l = metric[0] / metric[2]
                train_acc = metric[1] / metric[2]
                # Accuracy on the evaluation set after this epoch.
                test_acc = test(net, test_iter)
                epochs_done += 1
                if epochs_done % 5 == 0 or stage_epoch == stage_epochs - 1:
                    pbar.set_postfix({
                        'train_loss': '%.3f' % train_l,
                        'train_acc': '%.3f' % train_acc,
                        'test_acc': '%.3f' % test_acc,
                    })
                pbar.update(1)
    print(test_acc)

In [6]:
# Hyper-parameters for this run.
lr = 0.01
num_epochs = 100
batch_size = 256

train_iter, test_iter = load_data(batch_size)

# Train on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

train(net, train_iter, test_iter, num_epochs, lr, device)

# Author's experiment notes:
# - The original paper reports an 8.75% error rate for ResNet-20.
# - Without explicit initialisation and with plain SGD: 75% accuracy.
# - Switching to Adam only gets to roughly 80%.
# - Re-reading the original paper: image augmentation is required.
#   It eases over-fitting but needs longer training.
# - Adding normalisation: 85%.
# - Adding padded random cropping: 88%.
# - Removing the max-pool layer: 90%.

# Persist the trained weights.
torch.save(net.state_dict(), './test.pth')

Iteration 0: 100%|██████████████████| 10/10 [04:28<00:00, 26.85s/it, train_loss=0.566, train_acc=0.801, test_acc=0.783]
Iteration 1: 100%|██████████████████| 10/10 [04:25<00:00, 26.57s/it, train_loss=0.324, train_acc=0.888, test_acc=0.833]
Iteration 2: 100%|██████████████████| 10/10 [04:26<00:00, 26.69s/it, train_loss=0.217, train_acc=0.923, test_acc=0.836]
Iteration 3: 100%|██████████████████| 10/10 [04:24<00:00, 26.45s/it, train_loss=0.157, train_acc=0.944, test_acc=0.873]
Iteration 4: 100%|██████████████████| 10/10 [04:25<00:00, 26.51s/it, train_loss=0.118, train_acc=0.958, test_acc=0.892]
Iteration 5: 100%|██████████████████| 10/10 [04:25<00:00, 26.50s/it, train_loss=0.091, train_acc=0.968, test_acc=0.897]
Iteration 6: 100%|██████████████████| 10/10 [04:23<00:00, 26.39s/it, train_loss=0.076, train_acc=0.973, test_acc=0.892]
Iteration 7: 100%|██████████████████| 10/10 [04:27<00:00, 26.72s/it, train_loss=0.067, train_acc=0.977, test_acc=0.894]
Iteration 8: 100%|██████████████████| 10

0.8948



