
docs: https://www.yuque.com/huangzhongqing/lightweight/lno6i7

video timeline: https://www.bilibili.com/video/BV1zP4y1F7g4/?spm_id_from=333.788&vd_source=617461d43c4542e4c5a3ed54434a0e55&t=297.3

## 导入工具包

In [31]:
import torch
import numpy as np
import pandas as pd
from torch import nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from torchsummary import summary # from torchinfo import summary #     from torchsummary import summary
from tqdm import tqdm


import matplotlib.pyplot as plt

In [32]:
#设置随机种子

torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
# 使用cuDNN加速卷积运算
torch.backends.cudnn.benchmark=True

## 载入MNIST数据集

In [34]:
# 载入MNIST数据集
train_dataset=torchvision.datasets.MNIST(root="dataset/",train=True,transform=transforms.ToTensor(),download=True)
test_dataset=torchvision.datasets.MNIST(root="dataset/",train=False,transform=transforms.ToTensor(),download=True)

# # 载入训练集
# train_dataset = torchvision.datasets.MNIST(
#     root="dataset/",
#     train=True,
#     transform=transforms.ToTensor(),
#     download=False # True
# )
# # 载入测试集
# test_dataset = torchvision.datasets.MNIST(
#     root="dataset/",
#     train=False,
#     transform=transforms.ToTensor(),
#     download=False # True
# )

train_loder = DataLoader(dataset=train_dataset,batch_size=32,shuffle=True)
test_loder  = DataLoader(dataset=test_dataset, batch_size=32,shuffle=False)

## 1 教师网络（3层隐含层 每层1200个神经元）

In [35]:
# 教师模型
class TeacherModel(nn.Module):
    def __init__(self,in_channels=1,num_classes=10):
        super(TeacherModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784,1200)
        self.fc2 = nn.Linear(1200,1200)
        self.fc3 = nn.Linear(1200,num_classes)
        self.dropout = nn.Dropout(p = 0.5)

    def forward(self,x):
        x = x.view(-1,784)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.dropout(x)
        x = self.relu(x)

        x = self.fc3(x)

        return x

### 从头训练教师模型（0.96） 并保存最新1epochs模型权重：teacher_model = model

In [36]:
model = TeacherModel()
model = model.to(device)

### summary信息（9.29MB）

In [37]:
summary(model, (32,1,28,28)) #     from torchsummary import summary
# Estimated Total Size (MB): 9.29

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1200]         942,000
           Dropout-2                 [-1, 1200]               0
              ReLU-3                 [-1, 1200]               0
            Linear-4                 [-1, 1200]       1,441,200
           Dropout-5                 [-1, 1200]               0
              ReLU-6                 [-1, 1200]               0
            Linear-7                   [-1, 10]          12,010
Total params: 2,395,210
Trainable params: 2,395,210
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.10
Forward/backward pass size (MB): 0.06
Params size (MB): 9.14
Estimated Total Size (MB): 9.29
----------------------------------------------------------------


In [38]:
criterion = nn.CrossEntropyLoss() # 设置使用交叉熵损失函数
optimizer = torch.optim.Adam(model.parameters(),lr=1e-4) # 使用Adam优化器，学习率为lr=1e-4

epochs = 1 # 训练6轮
for epoch in range(epochs):
    model.train()

    for data,targets in tqdm(train_loder):
        data = data.to(device)
        # print(data.shape) # torch.Size([32, 1, 28, 28])
        targets = targets.to(device)
        
        # 前向预测
        preds = model(data)
        loss = criterion(preds,targets)
        # loss = loss.to(device)

        # 反向传播，优化权重
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 测试集上评估性能
    model.eval()
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for x,y in test_loder:
            x = x.to(device)
            y = y.to(device)
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
    
    model.train()
    teacher_model = model # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 将模型（with参数）保存，用于最后的知识蒸馏  保存最新模型权重：teacher_model = model
    print(("Epoch:{}\t Accuracy:{:4f}").format(epoch+1,acc))

100%|██████████| 1875/1875 [00:03<00:00, 519.10it/s]


Epoch:1	 Accuracy:0.943100


## 2 学生模型（每一层20个神经元）并且从头训练学生模型

In [39]:
# 学生模型
class StudentModel(nn.Module):
    def __init__( self,inchannels=1,num_class=10):
        super(StudentModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784, 20) # 1200修改为20
        self.fc2 = nn.Linear(20, 20)
        self.fc3 = nn.Linear(20, num_class)
        #self.dropout = nn.Dropout(p = 0.5)

    def forward(self,x):
        x = x.view(-1, 784)
        x = self.fc1(x)
        #x = self.dropout(x)
        x = self.relu(x)

        x = self.fc2(x)
        #x = self.dropout(x)
        x = self.relu(x)
        
        x = self.fc3(x)

        return x


### 从头训练学生模型（0.88）

In [40]:
model = StudentModel() # 从头先训练一下学生模型
model = model.to(device)

# 设置交叉损失函数 和 激活函数
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)

epochs = 3
# 训练集上训练权重
for epoch in range(epochs):
    model.train()

    for data,targets in tqdm(train_loder):
        data = data.to(device)
        targets = targets.to(device)

        # 前向预测
        preds = model(data)
        loss = criterion(preds,targets)

        # 反向传播，优化权重
        optimizer.zero_grad() # 把梯度置为0
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        for x,y in  test_loder:
            x = x.to(device)
            y = y.to(device)
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions==y).sum()
            num_samples += predictions.size(0)
            acc = (num_correct / num_samples).item()

    model.train()
    print(("学生模型 Epoch:{}\t Accuracy:{:4f}").format(epoch+1,acc))

100%|██████████| 1875/1875 [00:03<00:00, 491.59it/s]


学生模型 Epoch:1	 Accuracy:0.893450


100%|██████████| 1875/1875 [00:03<00:00, 531.07it/s]


学生模型 Epoch:2	 Accuracy:0.891867


100%|██████████| 1875/1875 [00:03<00:00, 615.56it/s]


学生模型 Epoch:3	 Accuracy:0.893925


### 保存训练3个epochs的权重 student_model_scratch = model

In [41]:
student_model_scratch = model # 保存训练3个epochs的学生模型网络结构和权重

## 3 final: 知识蒸馏训练 预训练学生模型(0.90)

In [42]:
# 准备好预训练好的教师模型
teacher_model.eval()

# 准备新的学生模型
model = StudentModel()
model.train()

StudentModel(
  (relu): ReLU()
  (fc1): Linear(in_features=784, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=20, bias=True)
  (fc3): Linear(in_features=20, out_features=10, bias=True)
)

In [43]:
# 蒸馏温度>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>..
temp = 7

# hard_loss
hard_loss = nn.CrossEntropyLoss()
# hard_loss权重
alpha = 0.3

# soft_loss
soft_loss = nn.KLDivLoss(reduction="batchmean")

optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)

In [46]:
epochs = 3
for epoch in range(epochs):
    for data,targets in tqdm(train_loder):
        data = data.to(device)
        targets = targets.to(device)
        # 教师模型预测
        with torch.no_grad():
            teacher_preds = teacher_model(data) # 预训练好的教师模型

        # 学生模型预测
        student_preds = student_model_scratch(data) # 预训练好的学生模型

        # loss1: hard_loss
        student_loss = hard_loss(student_preds,targets)

        # loss2: soft_loss
        # 计算蒸馏后的预测结果及soft_loss
        distillation_loss = soft_loss(
            F.softmax(student_preds / temp, dim=1), # 蒸馏温度temp = 7
            F.softmax(teacher_preds / temp, dim=1)
        )

        # 将 hard_loss 和 soft_loss 加权求和（两个loss都是用预训练的模型计算？？？）
        loss = alpha * student_loss + (1-alpha) * distillation_loss # 0.900100????????????????
        # loss = 1.0 * student_loss  # >>>>>>0.900100

        # 反向传播,优化权重
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 测试集上评估性能
    model.eval()
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for x,y in test_loder:
            x = x.to(device)
            y = y.to(device)
            preds = student_model_scratch(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct/num_samples).item()

    model.train()
    print(("Epoch:{}\t Accuracy:{:4f}").format(epoch+1,acc))

100%|██████████| 1875/1875 [00:03<00:00, 595.81it/s]


Epoch:1	 Accuracy:0.900100


100%|██████████| 1875/1875 [00:02<00:00, 718.44it/s]


Epoch:2	 Accuracy:0.900100


100%|██████████| 1875/1875 [00:02<00:00, 715.85it/s]


Epoch:3	 Accuracy:0.900100
