# 一、训练模型的模板

In [46]:
import os
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

In [47]:
# Dataset: CIFAR10 train/test splits stored under ./data, samples converted with ToTensor.
cifar_kwargs = dict(
    root='./data',
    download=True,
    transform=torchvision.transforms.ToTensor(),
)
train_data = torchvision.datasets.CIFAR10(train=True, **cifar_kwargs)
test_data = torchvision.datasets.CIFAR10(train=False, **cifar_kwargs)

# Sample counts; test_data_size is used later to turn a correct-count into an accuracy.
train_data_size, test_data_size = len(train_data), len(test_data)

print(f"训练数据集长度为：{train_data_size}")
print(f"测试数据集长度为：{test_data_size}")

Files already downloaded and verified
Files already downloaded and verified
训练数据集长度为：50000
测试数据集长度为：10000


In [48]:
# DataLoaders
BATCH_SIZE = 32  # samples per mini-batch, shared by both loaders

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation only sums loss / correct predictions over the whole split, so the
# sample order does not matter; keep it fixed so runs are repeatable.
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [49]:
# Model definition.
# This class could be moved into its own file and tested there.
class Net(nn.Module):
    """Small convolutional classifier producing 10 outputs per sample.

    Three (5x5 conv, padding 2) + (2x2 max-pool) stages are followed by a
    flatten and two linear layers. The first linear layer expects 1024
    features, which is what 64 channels at 4x4 give for a 3x32x32 input
    (32 halved three times by the pooling layers).
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor: channels 3 -> 32 -> 32 -> 64.
        feature_layers = [
            nn.Conv2d(3, 32, kernel_size=5, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(32, 32, kernel_size=5, padding=2),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(32, 64, kernel_size=5, padding=2),
            nn.MaxPool2d(kernel_size=2),
        ]
        # Classification head: 1024 -> 64 -> 10.
        head_layers = [
            nn.Flatten(),
            nn.Linear(in_features=1024, out_features=64),
            nn.Linear(in_features=64, out_features=10),
        ]
        # A single Sequential keeps the sub-module indices (model.0 ... model.8)
        # and therefore the state_dict keys unchanged.
        self.model = nn.Sequential(*feature_layers, *head_layers)

    def forward(self, x):
        """Run the batch ``x`` through the sequential stack and return its output."""
        stacked = self.model
        return stacked(x)

注意：在GPU上训练时，需要（也只需要）把模型、损失函数和数据（输入和标注信息）放到GPU上。

In [45]:
writer = SummaryWriter("train_log")

# Select the device once; valid specs include "cuda:0", "cuda:1", "cpu", ...
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model and move it to the device.
# The CUDA-only alternative is `net.cuda()` guarded by `torch.cuda.is_available()`.
# For modules, re-assigning the result of .to() is optional; for data tensors
# (see the loops below) the result MUST be assigned back.
net = Net().to(device)

# Loss function, moved to the same device (alternative: `loss.cuda()`).
loss = nn.CrossEntropyLoss().to(device)

# Optimizer
learning_rate = 1e-3
optimizer = optim.SGD(net.parameters(), lr=learning_rate)

# Checkpoints are written to model/; create it up front so torch.save cannot
# fail on a missing directory after a full epoch of training.
os.makedirs('model', exist_ok=True)

epoch_num = 3  # number of training epochs
# Global counters, initialised once so the TensorBoard x-axis keeps growing
# across epochs instead of restarting at 1 every epoch.
total_train_step = 0  # number of optimisation steps performed so far
total_test_step = 0  # number of evaluation passes performed so far
train_start_time = time.time()  # start of the whole training run
for epoch in range(epoch_num):
    print(f"----------第 {epoch+1} 轮训练开始----------")

    epoch_start_time = time.time()  # start of this epoch

    # ---- training phase ----
    # train() only changes behaviour for certain layers (e.g. Dropout).
    net.train()
    for imgs, labels in train_dataloader:
        step_start_time = time.time()
        # Tensors are not moved in place: keep the returned copies.
        imgs = imgs.to(device)
        labels = labels.to(device)

        outputs = net(imgs)
        res_loss = loss(outputs, labels)

        # Clear old gradients, back-propagate, update the parameters.
        optimizer.zero_grad()
        res_loss.backward()
        optimizer.step()

        # Log every step under one stable tag so all epochs form a single curve.
        total_train_step += 1
        writer.add_scalar('train_loss', res_loss.item(), total_train_step)
        if total_train_step % 100 == 0:
            step_end_time = time.time()
            print(f'Epoch[{epoch+1}/{epoch_num}], Step: {total_train_step}, Loss: {res_loss.item()}, Time: {step_end_time-step_start_time}')

    # ---- evaluation phase ----
    # eval() only changes behaviour for certain layers (e.g. Dropout).
    net.eval()
    total_test_loss = 0  # sum of the per-batch losses over the test set
    total_accuracy = 0  # number of correctly classified test samples
    with torch.no_grad():
        for imgs, labels in test_dataloader:
            # Use the same device selection as in training.
            imgs = imgs.to(device)
            labels = labels.to(device)
            outputs = net(imgs)
            res_loss = loss(outputs, labels)
            total_test_loss += res_loss.item()
            # .item() keeps the running count a plain Python int.
            total_accuracy += (outputs.argmax(1) == labels).sum().item()

    total_test_step += 1
    test_accuracy = total_accuracy / test_data_size
    print(f"整体测试集上的Loss: {total_test_loss}")
    print(f"整体测试集上的正确率: {test_accuracy}")
    # One point per epoch under a stable tag, so the metrics plot as curves.
    writer.add_scalar('test_loss', total_test_loss, total_test_step)
    writer.add_scalar('test_accuracy', test_accuracy, total_test_step)

    # Save only the parameters (state_dict); loading requires rebuilding Net first.
    torch.save(net.state_dict(), f'model/cifar_net_{epoch+1}.pth')
    print("模型已保存")
    print(f"第 {epoch+1} 轮耗时: {time.time() - epoch_start_time:.2f}s")

print(f"训练总耗时: {time.time() - train_start_time:.2f}s")
writer.close()
    

----------第 1 轮训练开始----------
Epoch[1/3], Step: 100, Loss: 2.311138153076172
Epoch[1/3], Step: 200, Loss: 2.2951126098632812
Epoch[1/3], Step: 300, Loss: 2.29854416847229
Epoch[1/3], Step: 400, Loss: 2.2941946983337402
Epoch[1/3], Step: 500, Loss: 2.2997050285339355
Epoch[1/3], Step: 600, Loss: 2.3097915649414062
Epoch[1/3], Step: 700, Loss: 2.2978742122650146
Epoch[1/3], Step: 800, Loss: 2.300922155380249
Epoch[1/3], Step: 900, Loss: 2.2940542697906494
Epoch[1/3], Step: 1000, Loss: 2.2936084270477295
Epoch[1/3], Step: 1100, Loss: 2.2953104972839355
Epoch[1/3], Step: 1200, Loss: 2.3037326335906982
Epoch[1/3], Step: 1300, Loss: 2.299428701400757
Epoch[1/3], Step: 1400, Loss: 2.282520294189453
Epoch[1/3], Step: 1500, Loss: 2.292264461517334
整体测试集上的Loss: 716.2407293319702
整体测试集上的正确率: 0.13529999554157257
模型已保存
----------第 2 轮训练开始----------
Epoch[2/3], Step: 1600, Loss: 2.2681667804718018
Epoch[2/3], Step: 1700, Loss: 2.2685704231262207
Epoch[2/3], Step: 1800, Loss: 2.2869765758514404
Epoch

# 二、验证模型的模板

In [ ]:
from PIL import Image

# Load a real image and bring it into the form the model expects.
image_path = r""  # placeholder: set this to the image file to classify
image = Image.open(image_path)
# convert() returns a new image, so the result must be kept.
# Keep only the colour channels (PNG files can carry a 4th channel).
image = image.convert("RGB")
print(image)

transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((32, 32)),
    torchvision.transforms.ToTensor()
])

image = transform(image)
# reshape() also returns a new tensor; add the leading batch dimension.
image = image.reshape(1, 3, 32, 32)
print(image.shape)

# The checkpoint was written with torch.save(net.state_dict(), ...), i.e. it
# holds parameters only: rebuild the network, then load the weights into it.
# map_location="cpu" lets a checkpoint saved from a GPU run load on any machine.
model = Net()
model.load_state_dict(torch.load("model/cifar_net_1.pth", map_location="cpu"))
model.eval()

# Predict on the real image without tracking gradients.
with torch.no_grad():
    output = model(image)
print(output)
print(output.argmax(1))


# 三、使用现有模型进行迁移学习，二次开发

In [26]:
# Build VGG16 twice: first without, then with pretrained weights.
vgg16_false, vgg16_true = (
    torchvision.models.vgg16(pretrained=use_pretrained)
    for use_pretrained in (False, True)
)
print(vgg16_true)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [29]:
# Two ways of attaching an extra layer with add_module:
# 1) on the VGG module itself, registered under the name 'fc1-modify';
# 2) on its `classifier` sub-module, registered under the name 'fc2-modify'.
# NOTE(review): VGG.forward is not visible here - a layer registered on the
# top-level module is presumably not called by forward; confirm before relying on it.
top_level_layer = nn.Linear(10, 10)
classifier_layer = nn.Linear(1000, 10)
vgg16_true.add_module('fc1-modify', top_level_layer)
vgg16_true.classifier.add_module('fc2-modify', classifier_layer)
print(vgg16_true)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [28]:
# Replace the layer at classifier[6] with a fresh 4096 -> 10 linear layer.
replacement_head = nn.Linear(in_features=4096, out_features=10)
vgg16_false.classifier[6] = replacement_head
print(vgg16_false)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

# 四、模型的保存与加载

In [30]:
# VGG16 without pretrained weights; used below to demonstrate saving and loading.
vgg16 = torchvision.models.vgg16(pretrained=False)

## 1. 保存模型结构+模型参数

注意：加载时要求模型对应的类定义在当前环境中能够被导入，否则会报错（使用自定义的模型类时尤其要注意）。

In [32]:
# Save the whole model object (structure + parameters) in one file.
# NOTE(review): assumes the model/ directory already exists - confirm.
torch.save(vgg16, 'model/vgg16.pth')

In [35]:
# Load the whole saved model object back; the model's class must be available here.
# NOTE(review): newer PyTorch releases may require weights_only=False to load a
# full pickled model - confirm against the installed version.
vgg16_model = torch.load('model/vgg16.pth')
# Print the loaded model next to the original for a visual comparison.
print(vgg16_model, vgg16)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

## 2. 仅保存模型参数（官方推荐）

In [33]:
# Save only the parameters (state_dict) of the model, not the model object itself.
# NOTE(review): assumes the model/ directory already exists - confirm.
torch.save(vgg16.state_dict(), 'model/vgg16_state_dict.pth')

In [36]:
# A state_dict holds parameters only, so build the architecture first.
load_vgg16 = torchvision.models.vgg16(pretrained=False)
# map_location is an argument of torch.load (not of load_state_dict): it remaps
# the saved tensors onto `device` while deserialising, which is what allows a
# checkpoint saved from a GPU run to be loaded on a CPU-only machine.
# The weights are loaded into the freshly built load_vgg16, not the original vgg16.
load_vgg16.load_state_dict(torch.load('model/vgg16_state_dict.pth', map_location=device))
print(vgg16, load_vgg16)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1