# 微调

In [None]:
import os

# Configure the local HTTP(S) proxy for this process. Both the lower-case and
# upper-case variable names are set, and loopback addresses bypass the proxy.
os.environ.update({
    'http_proxy': 'http://127.0.0.1:7893',
    'https_proxy': 'http://127.0.0.1:7893',
    'HTTP_PROXY': 'http://127.0.0.1:7893',
    'HTTPS_PROXY': 'http://127.0.0.1:7893',
    'no_proxy': '127.0.0.1,localhost',
    'NO_PROXY': '127.0.0.1,localhost',
})

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
import time
import numpy as np
import matplotlib.pyplot as plt
from IPython import display
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torchvision
import os
import hashlib
import requests
import zipfile

In [None]:
class Timer:
    """Record multiple running times.

    Every ``start()``/``stop()`` pair appends one elapsed interval, in
    seconds, to ``self.times``. (d2l reference: ``sec_minibatch_sgd``.)
    """
    def __init__(self):
        """Create an empty timer and begin timing immediately."""
        self.times = []
        self.start()

    def start(self):
        """Start (or restart) the timer."""
        # perf_counter() is monotonic, so an interval can never come out
        # negative when the system wall clock is adjusted mid-measurement.
        self.tik = time.perf_counter()

    def stop(self):
        """Stop the timer, record the elapsed time and return it."""
        self.times.append(time.perf_counter() - self.tik)
        return self.times[-1]

    def avg(self):
        """Return the average recorded time (0.0 if nothing was recorded)."""
        if not self.times:
            return 0.0
        return sum(self.times) / len(self.times)

    def sum(self):
        """Return the sum of all recorded times."""
        # Inside a method, `sum` resolves to the builtin, not to this method.
        return sum(self.times)

    def cumsum(self):
        """Return the running totals of the recorded times as a list."""
        return np.array(self.times).cumsum().tolist()

def try_gpu(i=0):
    """Return the i-th CUDA device if it exists, otherwise the CPU device."""
    has_gpu_i = torch.cuda.device_count() >= i + 1
    return torch.device(f'cuda:{i}' if has_gpu_i else 'cpu')

def try_all_gpus():
    """Return every visible CUDA device, or ``[cpu]`` when there is none."""
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        return [torch.device('cpu')]
    return [torch.device(f'cuda:{idx}') for idx in range(gpu_count)]

def load_data_fashion_mnist(batch_size, resize=None):
    """Download Fashion-MNIST (into ``./data``) and return its data loaders.

    Args:
        batch_size: Batch size used by both loaders.
        resize: Optional size handed to ``transforms.Resize``; when falsy the
            images are only converted to tensors.

    Returns:
        A ``(train_loader, test_loader)`` tuple. Only the training loader
        shuffles; both use 4 worker processes.
    """
    # Resize (if requested) has to run before the image becomes a tensor.
    steps = [transforms.Resize(resize)] if resize else []
    steps.append(transforms.ToTensor())
    pipeline = transforms.Compose(steps)

    def _split(is_train):
        return datasets.FashionMNIST(
            root='./data', train=is_train, transform=pipeline, download=True)

    train_set = _split(True)
    test_set = _split(False)
    train_loader = DataLoader(train_set, batch_size, shuffle=True,
                              num_workers=4)
    test_loader = DataLoader(test_set, batch_size, shuffle=False,
                             num_workers=4)
    return (train_loader, test_loader)

def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Compute the accuracy of ``net`` over every batch in ``data_iter``.

    Args:
        net: Callable model. If it is an ``nn.Module`` it is switched to
            evaluation mode, and when ``device`` is None the device of its
            first parameter is used.
        data_iter: Iterable of ``(X, y)`` batches; ``X`` may be a tensor or a
            list of tensors.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Number of correct argmax predictions divided by the number of labels.
    """
    if isinstance(net, nn.Module):
        net.eval()  # evaluation mode
        if device is None:
            device = next(iter(net.parameters())).device

    # Running totals: correct predictions and labels seen.
    num_correct, num_seen = 0.0, 0.0
    with torch.no_grad():
        for inputs, targets in data_iter:
            if isinstance(inputs, list):
                # List-valued inputs (needed for BERT fine-tuning).
                inputs = [part.to(device) for part in inputs]
            else:
                inputs = inputs.to(device)
            targets = targets.to(device)
            predicted = net(inputs).argmax(dim=1)
            num_correct += (predicted == targets).sum().item()
            num_seen += targets.numel()
    return num_correct / num_seen

class Accumulator:
    """Keep running sums of ``n`` quantities."""
    def __init__(self, n):
        """Start with ``n`` sums, all 0.0."""
        self.data = [0.0] * n

    def add(self, *args):
        """Add one value (coerced to float) to each running sum.

        Sums and values are paired positionally with ``zip``, so the result
        has as many entries as the shorter of the two.
        """
        updated = []
        for total, delta in zip(self.data, args):
            updated.append(total + float(delta))
        self.data = updated

    def reset(self):
        """Set every running sum back to 0.0."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running sum stored at ``idx``."""
        return self.data[idx]

class Animator:
    """Plot data incrementally, redrawing the figure after each update."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        """Create the figure and remember how its first axes is styled.

        ``fmts`` holds one matplotlib format string per plotted series; the
        remaining arguments are forwarded to ``plt.subplots`` or applied to
        the first axes on every redraw.
        """
        legend = [] if legend is None else legend
        self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            # Wrap the single axes so that `self.axes[0]` always works.
            self.axes = [self.axes]

        def _configure():
            # Closure capturing the styling arguments for later redraws.
            self._set_axes(self.axes[0], xlabel, ylabel, xlim, ylim,
                           xscale, yscale, legend)

        self.config_axes = _configure
        self.fmts = fmts
        self.X = None
        self.Y = None

    def _set_axes(self, axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
        """Apply labels, scales, limits, optional legend and grid to ``axes``."""
        # Labels
        axes.set_xlabel(xlabel)
        axes.set_ylabel(ylabel)
        # Scales
        axes.set_xscale(xscale)
        axes.set_yscale(yscale)
        # Limits
        axes.set_xlim(xlim)
        axes.set_ylim(ylim)
        if legend:
            axes.legend(legend)
        axes.grid()

    def add(self, x, y):
        """Append data points to the series and redraw the figure.

        ``y`` is a scalar or a sequence with one value per series; a scalar
        ``x`` is shared by all series. A point is skipped when either of its
        coordinates is None.
        """
        ys = y if hasattr(y, "__len__") else [y]
        n = len(ys)
        xs = x if hasattr(x, "__len__") else [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for idx, (px, py) in enumerate(zip(xs, ys)):
            if px is None or py is None:
                continue
            self.X[idx].append(px)
            self.Y[idx].append(py)

        # Redraw everything from scratch on the first axes.
        canvas = self.axes[0]
        canvas.cla()
        for series_x, series_y, fmt in zip(self.X, self.Y, self.fmts):
            canvas.plot(series_x, series_y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)
        
class Residual(nn.Module):
    """Residual block of ResNet: two 3x3 conv + batch-norm layers with a skip.

    Args:
        num_channels: Output channels of every convolution in the block.
        use_1x1conv: If True, the skip path goes through a 1x1 convolution
            (with stride ``strides``) instead of being the identity.
        strides: Stride of the first 3x3 convolution and of the 1x1
            convolution.
    """
    def __init__(self, num_channels, use_1x1conv=False, strides=1):
        super().__init__()
        conv3x3 = dict(kernel_size=3, padding=1)
        self.conv1 = nn.LazyConv2d(num_channels, stride=strides, **conv3x3)
        self.conv2 = nn.LazyConv2d(num_channels, **conv3x3)
        self.conv3 = (
            nn.LazyConv2d(num_channels, kernel_size=1, stride=strides)
            if use_1x1conv else None
        )
        self.bn1 = nn.LazyBatchNorm2d()
        self.bn2 = nn.LazyBatchNorm2d()

    def forward(self, X):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(X))))) + shortcut(X))``."""
        out = F.relu(self.bn1(self.conv1(X)))
        out = self.bn2(self.conv2(out))
        shortcut = X if self.conv3 is None else self.conv3(X)
        out += shortcut
        return F.relu(out)

def resnet18(num_classes, in_channels=1):
    """Build a ResNet-18 style network out of lazily-initialized layers.

    Args:
        num_classes: Size of the final linear layer's output.
        in_channels: Accepted for interface compatibility; the body never
            reads it.

    Returns:
        An ``nn.Sequential`` made of the stem (conv, batch norm, ReLU, max
        pool), four residual stages, global average pooling and a
        flatten + linear head.
    """
    def resnet_block(num_channels, num_residuals, first_block=False):
        # Every stage except the first starts with a stride-2 block that uses
        # a 1x1 convolution on the skip path.
        layers = []
        for idx in range(num_residuals):
            downsample = idx == 0 and not first_block
            if downsample:
                layers.append(
                    Residual(num_channels, use_1x1conv=True, strides=2))
            else:
                layers.append(Residual(num_channels))
        return nn.Sequential(*layers)

    # Stem
    stem = [
        nn.LazyConv2d(64, kernel_size=7, stride=2, padding=3),
        nn.LazyBatchNorm2d(),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    ]
    net = nn.Sequential(*stem)

    # Residual stages: (module name, channels, is first stage)
    stage_specs = (
        ("resnet_block1", 64, True),
        ("resnet_block2", 128, False),
        ("resnet_block3", 256, False),
        ("resnet_block4", 512, False),
    )
    for stage_name, channels, is_first in stage_specs:
        net.add_module(stage_name,
                       resnet_block(channels, 2, first_block=is_first))

    net.add_module("global_avg_pool", nn.AdaptiveAvgPool2d((1, 1)))
    head = nn.Sequential(nn.Flatten(), nn.LazyLinear(num_classes))
    net.add_module("fc", head)
    return net

def use_svg_display():
    """Display matplotlib figures in SVG format."""
    from matplotlib_inline.backend_inline import set_matplotlib_formats
    set_matplotlib_formats('svg')


def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set matplotlib's default figure size."""
    use_svg_display()
    plt.rcParams.update({'figure.figsize': figsize})


def accuracy(y_hat, y):
    """Return the number of correct predictions as a float.

    When ``y_hat`` has more than one dimension and more than one column, it
    is reduced with ``argmax`` along axis 1 first; otherwise it is compared
    to ``y`` as-is.
    """
    is_score_matrix = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    predicted = y_hat.argmax(axis=1) if is_score_matrix else y_hat
    matches = predicted.type(y.dtype) == y
    return float(matches.type(y.dtype).sum())


def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Plot a list of images on a ``num_rows`` x ``num_cols`` grid.

    Args:
        imgs: Images to draw. ``torch.Tensor`` items are converted to numpy
            arrays first; anything else is passed to ``imshow`` unchanged.
        num_rows: Number of grid rows.
        num_cols: Number of grid columns.
        titles: Optional sequence of titles, indexed like ``imgs``.
        scale: Size of each grid cell, used to compute the figure size.

    Returns:
        The flattened array of axes.
    """
    figsize = (num_cols * scale, num_rows * scale)
    # squeeze=False makes plt.subplots return a 2-D array of axes even for a
    # 1x1 grid, where it would otherwise return a bare Axes with no flatten().
    _, axes = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)
    axes = axes.flatten()
    for i, (ax, img) in enumerate(zip(axes, imgs)):
        if isinstance(img, torch.Tensor):
            img = img.detach().cpu().numpy()
            # (C, H, W) -> (H, W, C)
            if img.ndim == 3 and img.shape[0] in (1, 3):
                img = np.transpose(img, (1, 2, 0))
            # Drop the channel axis of single-channel images. The ndim check
            # keeps a 2-D image that is one pixel wide from being collapsed
            # to 1-D.
            if img.ndim == 3 and img.shape[-1] == 1:
                img = img.squeeze(-1)
        ax.imshow(img)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[i])
    return axes


# Registry of downloadable datasets: name -> (url, sha1 hex digest).
DATA_HUB = {}
# Base URL that dataset archive names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'


def download(name, cache_dir='../data'):
    """Download a file registered in ``DATA_HUB`` and return its local path.

    Args:
        name: Key in ``DATA_HUB``; its value is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory the file is stored in (created if missing).

    Returns:
        Path of the local file, named after the last URL component.

    Raises:
        AssertionError: If ``name`` is not registered in ``DATA_HUB``.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 DATA_HUB"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])

    # Reuse the cached file when its SHA-1 matches the registered digest.
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname

    print(f'正在从{url}下载{fname}...')
    # The response is a context manager, so the connection is always released.
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the file: otherwise an HTTP error page would
        # be saved under the archive's name.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Write chunk by chunk rather than buffering the whole body.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname


def download_extract(name, folder=None):
    """Download an archive registered in ``DATA_HUB`` and extract it.

    ``.zip`` files are opened with ``zipfile``; ``.tar`` and ``.gz`` files
    with ``tarfile``. Any other extension is left unextracted. Extraction
    goes into the directory that contains the downloaded file.

    Returns:
        ``<download dir>/<folder>`` when ``folder`` is given, otherwise the
        archive path with its last extension removed.
    """
    archive_path = download(name)
    target_dir = os.path.dirname(archive_path)
    stem, suffix = os.path.splitext(archive_path)

    # Pick the matching archive opener, if there is one.
    opener = None
    if suffix == '.zip':
        opener = lambda: zipfile.ZipFile(archive_path, 'r')
    elif suffix in ('.tar', '.gz'):
        import tarfile
        opener = lambda: tarfile.open(archive_path, 'r')

    if opener is not None:
        with opener() as archive:
            archive.extractall(target_dir)

    if folder:
        return os.path.join(target_dir, folder)
    return stem


def train_batch_ch13(net, X, y, loss, trainer, devices):
    """Run one optimization step on a minibatch.

    The batch is moved to ``devices[0]``, the network is put in training
    mode, and the mean of ``loss(pred, y)`` is back-propagated before the
    optimizer step.

    Returns:
        ``(loss_sum, num_correct)``: the sum of the loss values returned by
        ``loss`` (a tensor) and the number of correct predictions as computed
        by ``accuracy``.
    """
    main_device = devices[0]
    if isinstance(X, list):
        # List-valued inputs (needed for BERT fine-tuning).
        X = [part.to(main_device) for part in X]
    else:
        X = X.to(main_device)
    y = y.to(main_device)

    net.train()
    trainer.zero_grad()
    outputs = net(X)
    batch_loss = loss(outputs, y)
    # Back-propagate the mean rather than the sum.
    batch_loss.mean().backward()
    trainer.step()

    return batch_loss.sum(), accuracy(outputs, y)


def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices=None):
    """Train ``net`` and plot the training curves (defined for chapter 13).

    Args:
        net: Model to train. It is wrapped in ``nn.DataParallel`` only when
            more than one device is given and the first one is a CUDA device;
            otherwise it is simply moved to ``devices[0]``.
        train_iter: Training batches; must support ``len()``.
        test_iter: Batches used to measure test accuracy after every epoch.
        loss: Loss callable forwarded to ``train_batch_ch13``.
        trainer: Optimizer.
        num_epochs: Number of passes over ``train_iter``.
        devices: Devices to use; defaults to ``try_all_gpus()``.

    Prints the final loss, train/test accuracy and training throughput.
    """
    if devices is None:
        devices = try_all_gpus()

    timer, num_batches = Timer(), len(train_iter)
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
                       legend=['train loss', 'train acc', 'test acc'])
    # Plot about five times per epoch. The interval is clamped to 1 because
    # `num_batches // 5` is 0 for loaders with fewer than five batches, which
    # would make the modulo below raise ZeroDivisionError.
    plot_every = max(1, num_batches // 5)

    # Use DataParallel only when there are multiple GPUs.
    if len(devices) > 1 and str(devices[0]).startswith('cuda'):
        net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    else:
        net = net.to(devices[0])

    for epoch in range(num_epochs):
        # Sum of training loss, sum of correct predictions, number of samples.
        metric = Accumulator(3)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            l, acc = train_batch_ch13(net, features, labels, loss, trainer, devices)
            metric.add(l, acc, labels.shape[0])
            timer.stop()

            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                           (metric[0] / metric[2], metric[1] / metric[2], None))

        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))

    print(f'loss {metric[0] / metric[2]:.3f}, train acc '
          f'{metric[1] / metric[2]:.3f}, test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on '
          f'{str(devices)}')

In [None]:
# Register the hotdog dataset: archive URL plus the SHA-1 digest that
# download() compares against an already-cached copy.
DATA_HUB['hotdog'] = (DATA_URL + 'hotdog.zip', 'fba480ffa8aa7e0febbb511d181409f899b9baa5')

In [None]:
# Download (unless a valid cached copy exists) and extract the archive.
# With no `folder` argument, the returned path is the archive path without
# its extension.
data_dir = download_extract('hotdog')

In [4]:
# Wrap the extracted train/ and test/ folders as ImageFolder datasets
# (no transform yet, so items are returned as loaded by ImageFolder).
train_images, test_images = (
    torchvision.datasets.ImageFolder(os.path.join(data_dir, split))
    for split in ('train', 'test')
)

In [None]:
# Preview the first 8 and the last 8 training images on a 2x8 grid.
# NOTE(review): presumably the first samples are hot dogs and the last ones
# are not, as the variable names suggest - confirm against the folder layout.
hot_dogs = [train_images[idx][0] for idx in range(8)]
not_hot_dogs = [train_images[-(idx + 1)][0] for idx in range(8)]
show_images(hot_dogs + not_hot_dogs, num_rows=2, num_cols=8, scale=1.4)
plt.show()

In [6]:
# Prediction normally does not undo the normalization: training and inference
# both apply the same preprocessing, i.e. transforms.ToTensor() scales pixels
# to 0~1 and transforms.Normalize(...) standardizes them.
# To make a normalized tensor viewable (e.g. with plt.imshow), reverse the
# standardization step: img = img * std + mean. Remember to convert the tensor
# to a numpy array first and to clip the values to 0~1 or 0~255.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],  # per-channel (R, G, B) means after scaling to 0~1
    std=[0.229, 0.224, 0.225],   # per-channel (R, G, B) standard deviations after scaling to 0~1
)

# Training pipeline.
# RandomResizedCrop(224) crops a random region of the original image and then
# resizes it to 224x224. The crop is random so that the model sees the target
# at different positions, scales and aspect ratios - a form of data
# augmentation that makes the model more robust to input variations (in real
# photos a hot dog may be large or small, on the left or on the right). Always
# cropping the same region would generalize worse.
# A random crop may also miss part of the target or contain only background,
# which helps the model tell important features from distracting ones.
# Every image still ends up at the same size: whatever region is selected
# (within the size and aspect-ratio limits), it is resized to 224x224 before
# it reaches the network; only the visible content and its proportions vary.
train_augs = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

# Evaluation pipeline: deterministic resize followed by a center crop.
test_augs = transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

In [7]:
# Load ResNet-18 with pretrained weights, used below only to inspect the
# architecture. The cell output that follows shows the checkpoint being
# downloaded into the local torch hub cache.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favor of
# `weights=` - confirm against the installed version.
pretrain_net = torchvision.models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/yw.hao/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100.0%


In [None]:
# Show the original classification head, then every top-level child module.
print("pretrain_net.fc:", pretrain_net.fc)
print("ResNet18属性:", list(pretrain_net.named_children()), sep="\n")

In [None]:
# Network to fine-tune: a pretrained ResNet-18 whose final fully connected
# layer is replaced by a fresh 2-output linear layer with the same input
# width.
finetune_net = torchvision.models.resnet18(pretrained=True)
finetune_net.fc = nn.Linear(finetune_net.fc.in_features, 2)
# Only the new head's weight is re-initialized (Xavier uniform); its bias
# keeps the value nn.Linear gave it.
nn.init.xavier_uniform_(finetune_net.fc.weight)

Downloading: "https://download.pytorch.org/models/resnet152-394f9c45.pth" to /Users/yw.hao/.cache/torch/hub/checkpoints/resnet152-394f9c45.pth
0.5%

In [None]:
def train_fine_tuning(net, learning_rate, batch_size=128, num_epochs=5, param_group=True):
    """Train ``net`` on the image folders under the global ``data_dir``.

    Args:
        net: Model with an ``fc`` attribute (the output layer).
        learning_rate: Base SGD learning rate.
        batch_size: Batch size of both data loaders.
        num_epochs: Number of training epochs.
        param_group: If True, the ``fc`` parameters are trained with ten times
            the base learning rate and all other parameters with the base
            rate; if False, every parameter uses the base rate.

    The train loader uses ``train_augs`` and shuffles; the test loader uses
    ``test_augs``. Training itself is delegated to ``train_ch13``.
    """
    def _loader(split, augs, shuffle):
        dataset = torchvision.datasets.ImageFolder(
            os.path.join(data_dir, split), transform=augs)
        return torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle)

    train_iter = _loader('train', train_augs, True)
    test_iter = _loader('test', test_augs, False)
    devices = try_all_gpus()
    # Unreduced loss: train_batch_ch13 takes both its mean and its sum.
    loss = nn.CrossEntropyLoss(reduction='none')

    if not param_group:
        trainer = torch.optim.SGD(net.parameters(), lr=learning_rate,
                                  weight_decay=0.001)
    else:
        head_names = ["fc.weight", "fc.bias"]
        backbone_params = [
            param for name, param in net.named_parameters()
            if name not in head_names
        ]
        param_groups = [
            {'params': backbone_params},
            {'params': net.fc.parameters(), "lr": learning_rate * 10},
        ]
        trainer = torch.optim.SGD(param_groups, lr=learning_rate,
                                  weight_decay=0.001)

    train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)

In [None]:
# Fine-tune the pretrained network. `param_group` defaults to True, so the
# new fc head trains at 10x this base learning rate.
train_fine_tuning(finetune_net, learning_rate=5e-5)

In [None]:
# Baseline for comparison: the same architecture built without
# `pretrained=True`, with a 2-output head, trained with a single learning
# rate (5e-4) for all parameters.
scratch_res18 = torchvision.models.resnet18()
scratch_res18.fc = nn.Linear(scratch_res18.fc.in_features, 2)
train_fine_tuning(scratch_res18, 5e-4, param_group=False)