In [None]:
pip install d2l

In [None]:
pip install torchinfo

In [None]:
import pandas as pd
from d2l import torch as d2l

In [None]:
# Read the Weibo CSV without a header row and name its two columns
# 'label' and 'review'.
raw_data = pd.read_csv('/kaggle/input/weiboedit/weibo_xiugaishuju.csv', names=['label','review'], header=None)
# Show the loaded frame as the cell output.
raw_data

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
raw_data

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 shuffle split (seed 42), stratified on the 'label' column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits is 1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(raw_data['review'], raw_data['label']))
train_set = raw_data.iloc[train_index, :]
test_set = raw_data.iloc[test_index, :]
train_set
test_set

In [None]:
# Random oversampling of the training split, balanced on 'label' (seed 42).
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
# The whole frame is resampled; the second return value (resampled labels) is discarded.
train_set_balanced_over, _ = ros.fit_resample(train_set, train_set['label'])
train_set_balanced_over
# Random undersampling of the training split, balanced on 'label' (seed 42).
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
train_set_balanced_under, _ = rus.fit_resample(train_set, train_set['label'])
train_set_balanced_under

In [None]:
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import torchvision
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torchvision import transforms
from torchinfo import summary
import collections
from transformers import BertTokenizer, DataCollatorWithPadding

In [None]:
# Hyperparameter: training batch size.
batch_size = 32
# Hyperparameter: maximum token length each review is padded/truncated to.
max_length = 64
# Load the tokenizer that matches the 'hfl/chinese-bert-wwm-ext' checkpoint.
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')

In [None]:
# Set `cuda` to False to force CPU execution.
cuda = True
# Use the GPU only when it is both requested and actually available; the CPU
# fallback must be the string "cpu" (a bare `cpu` is an undefined name).
device = "cuda" if cuda and torch.cuda.is_available() else "cpu"

In [None]:
def get_data_loaders(data, tokenizer, batch_size, max_length, shuffle = True):
    """
    Tokenize every row of `data` and wrap the encoded samples in a DataLoader.

    data: object whose 'review' and 'label' columns support .to_list()
    tokenizer: callable tokenizer; also handed to DataCollatorWithPadding
    batch_size: number of samples per batch
    max_length: every review is padded/truncated to this many tokens
    shuffle: whether the DataLoader shuffles the samples
    Returns the DataLoader.
    """
    reviews = data['review'].to_list()
    targets = data['label'].to_list()

    encoded_samples = []
    for review, target in tqdm(zip(reviews, targets), total = len(reviews)):
        # Pad to max_length and cut off anything longer.
        encoded = tokenizer(text = review, max_length = max_length, padding = 'max_length', truncation = True)
        encoded["labels"] = target
        encoded_samples.append(encoded)

    return DataLoader(
        encoded_samples,
        batch_size = batch_size,
        shuffle = shuffle,
        collate_fn = DataCollatorWithPadding(tokenizer),
    )

# Training loader (shuffled). Pass train_set for the unsampled data, or
# train_set_balanced_over / train_set_balanced_under to train on a resampled split.
train_loader = get_data_loaders(train_set, tokenizer, batch_size = batch_size, max_length = max_length)
# Test loader: twice the training batch size, original order preserved.
test_loader = get_data_loaders(test_set, tokenizer, batch_size * 2, max_length = max_length, shuffle = False)

In [None]:
from transformers import BertForSequenceClassification

In [None]:
class Bert_wwm_ext_ClassificationModel(nn.Module):
    """
    Wrapper around BertForSequenceClassification ('hfl/chinese-bert-wwm-ext')
    whose forward pass returns only the classification logits.
    """

    def __init__(self, hidden_size, num_classes):
        """
        hidden_size: accepted for interface compatibility; not used by this class
        num_classes: forwarded to the checkpoint loader as num_labels
        """
        super().__init__()
        classifier = BertForSequenceClassification.from_pretrained(
            'hfl/chinese-bert-wwm-ext', num_labels=num_classes
        )
        # The backbone is moved to the notebook-level `device` on construction.
        self.pretrained_bert = classifier.to(device)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the wrapped classifier on the three encoder inputs and return its logits."""
        return self.pretrained_bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
        ).logits

In [None]:
# Instantiate the teacher with two output classes and move it to `device`.
bert_wwm_ext_model = Bert_wwm_ext_ClassificationModel(hidden_size = 768, num_classes = 2).to(device)
# Print the model summary.
summary(bert_wwm_ext_model)
# Count every parameter element and report the total in millions.
total = sum ([param.nelement () for param in bert_wwm_ext_model.parameters ()]) 
print ("Number of parameters: %.2fM" % (total/1e6))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
# Suppress all Python warnings from here on.
warnings.filterwarnings("ignore")

In [None]:
def bert_wwm_ext_evaluate_model(net, test_iter, criterion, device = None):
    """
    Evaluate a BERT-style classifier on a test loader without gradient tracking.

    net: model called with input_ids, attention_mask and token_type_ids; returns logits
    test_iter: iterable of batches exposing those keys plus "labels"
    criterion: loss applied to (logits, labels)
    device: target device for each batch; when falsy and net is an nn.Module,
            the device of net's first parameter is used

    Returns (accuracy, precision, recall, f1, avg_loss); avg_loss is the summed
    per-batch criterion value divided by the number of batches.
    An nn.Module `net` is put into eval mode and left there.
    """
    if isinstance(net, nn.Module):
        net.eval()
        device = device or next(iter(net.parameters())).device

    labels_per_batch, preds_per_batch = [], []
    running_loss = 0.0
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for batch in test_iter:
            batch = batch.to(device)
            logits = net(
                input_ids = batch["input_ids"],
                attention_mask = batch["attention_mask"],
                token_type_ids = batch["token_type_ids"],
            )
            targets = batch["labels"]

            running_loss += criterion(logits, targets).sum().item()

            # Move predictions and labels to host memory as numpy arrays.
            preds_per_batch.append(logits.argmax(dim = 1).cpu().detach().numpy())
            labels_per_batch.append(targets.cpu().detach().numpy())

    y_true = np.concatenate(labels_per_batch, axis=0)
    y_pred = np.concatenate(preds_per_batch, axis=0)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        metric(y_true, y_pred, average = "binary")
        for metric in (precision_score, recall_score, f1_score)
    )
    mean_loss = running_loss / len(test_iter)

    return accuracy, precision, recall, f1, mean_loss

In [None]:
# Cross-entropy loss used for the teacher model.
teacher_loss = nn.CrossEntropyLoss()
# Evaluate the teacher on the test loader before any fine-tuning in this notebook.
test_acc, test_pre, test_rec, test_f1, test_loss = bert_wwm_ext_evaluate_model(bert_wwm_ext_model, test_loader, criterion = teacher_loss)

In [None]:
import time#输出单条数据的推理时间

In [None]:
def bert_wwm_ext_model_classification(net, train_iter, criterion, optimizer, num_epochs, device):
    """
    Fine-tune the BERT classifier and print train/test metrics after every epoch.

    net: model called with input_ids, attention_mask and token_type_ids; returns logits
    train_iter: iterable of training batches exposing those keys plus "labels"
    criterion: loss applied to (logits, labels); also used for the per-epoch evaluation
    optimizer: optimizer stepped once per batch
    num_epochs: number of passes over train_iter
    device: device each batch is moved to

    The per-epoch evaluation runs on the notebook-level `test_loader`.
    Returns None; all results are printed.
    """
    for epoch in range(num_epochs):
        # bert_wwm_ext_evaluate_model() switches the network to eval mode, so
        # training mode has to be restored at the start of every epoch.
        net.train()
        timer = d2l.Timer()
        # Labels and predictions collected over the current epoch.
        train_y_all, prediction_all = [], []
        for train_data in tqdm(train_iter):
            train_data = train_data.to(device)

            start_time = time.time()

            out = net(
                     input_ids = train_data["input_ids"],
                     attention_mask = train_data["attention_mask"],
                     token_type_ids = train_data["token_type_ids"]
                     )
            train_y = train_data["labels"]

            loss = criterion(out, train_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest logit.
            prediction = out.argmax(dim = 1)

            end_time = time.time()
            # Wall-clock milliseconds for this batch's forward/backward/update;
            # only the value from the last batch reaches the end-of-epoch print.
            inference_time = (end_time - start_time) * 1000

            prediction = prediction.cpu().detach()
            train_y = train_y.cpu().detach()

            train_y_all.append(train_y)
            prediction_all.append(prediction)

        train_y_all = torch.cat(train_y_all, dim = 0)
        prediction_all = torch.cat(prediction_all, dim = 0)

        acc_score = accuracy_score(train_y_all,  prediction_all)
        pre_score = precision_score(train_y_all, prediction_all, average = "binary")
        rec_score = recall_score(train_y_all, prediction_all, average = "binary")
        fscore = f1_score(train_y_all, prediction_all, average = "binary")

        # `loss` is the loss of the last batch of the epoch.
        print(f'epoch{epoch+1}, train_loss {loss:.4f}, train_acc {acc_score:.4f}, train_pre {pre_score:.4f}, train_rec {rec_score:.4f}, train_f1 {fscore:.4f}')

        # Evaluate the network being trained with the criterion it is trained with.
        test_acc, test_pre, test_rec, test_f1, test_loss = bert_wwm_ext_evaluate_model(net, test_loader, criterion = criterion, device = device)
        print(f'test loss {test_loss:.4f}', f'test acc {test_acc:.4f}', f'test pre {test_pre:.4f}', f'test rec {test_rec:.4f}', f'test f1 {test_f1:.4f}')

        print("Total time : {:.2f}".format(timer.stop()), "Single_data time: {:.2f} ms".format(inference_time))

In [None]:
# Hyperparameter: learning rate for teacher fine-tuning.
lr = 3e-5
# Hyperparameter: number of training epochs.
num_epochs = 3

In [None]:
# Fresh teacher model, loss and optimizer for the fine-tuning run.
bert_wwm_ext_model = Bert_wwm_ext_ClassificationModel(hidden_size = 768, num_classes = 2).to(device)
teacher_loss = nn.CrossEntropyLoss()
# Hyperparameter: choice of optimizer.
teacher_optimizer = AdamW(bert_wwm_ext_model.parameters(), lr=lr)

In [None]:
# Fine-tune the teacher on the training loader.
bert_wwm_ext_model_classification(net = bert_wwm_ext_model, train_iter = train_loader, criterion = teacher_loss, optimizer = teacher_optimizer, num_epochs = num_epochs, device = device)

In [None]:
# Save the teacher's state_dict. Mind the path: bert_wwm_ext_model is the model
# object defined above, and the file name is the name the weights are stored under.
torch.save(bert_wwm_ext_model.state_dict(), '/kaggle/working/bert_wwm_ext_model.pth')

In [None]:
# Rebuild the teacher with the class defined in this notebook and restore the
# saved weights. map_location keeps the load working when the checkpoint was
# written on a different device than the current `device`.
bert_wwm_ext_model = Bert_wwm_ext_ClassificationModel(hidden_size=768, num_classes=2).to(device)
bert_wwm_ext_model.load_state_dict(torch.load('/kaggle/working/bert_wwm_ext_model.pth', map_location=device))
# Some cells refer to the teacher as `bert_model`; bind that name to the same object.
bert_model = bert_wwm_ext_model

In [None]:
class BiLSTMClassification(nn.Module):
    """Embedding -> bidirectional LSTM -> linear layer producing two-class logits."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, **kwargs):
        """
        vocab_size: number of rows in the embedding table
        embed_size: embedding dimension
        hidden_size: hidden size of each LSTM direction
        num_layers: number of stacked LSTM layers
        **kwargs: forwarded to nn.Module.__init__
        """
        super(BiLSTMClassification, self).__init__(**kwargs)
        # Stored on the instance so forward() never depends on a notebook-level
        # variable that happens to be called `hidden_size`.
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, hidden_size, num_layers = num_layers, bidirectional = True, dropout = 0.5)
        # Forward and backward states are concatenated, hence 2 * hidden_size inputs.
        self.decoder = nn.Sequential(
                                      nn.Linear(2 * hidden_size, 2)
                                      )

    def forward(self, inputs):
        """
        inputs: token ids of shape (batch_size, max_length)
        Returns logits of shape (batch_size, 2).
        """
        # The LSTM is not batch_first, so put the time axis first:
        # (max_length, batch_size, embed_size).
        embeddings = self.embedding(inputs).transpose(0, 1)
        # Keep the per-step outputs; the final (h, c) states are not needed.
        outputs, _ = self.encoder(embeddings)
        # The last axis holds the forward direction followed by the backward one.
        outputs_fw = outputs[:, :, :self.hidden_size]
        outputs_bw = outputs[:, :, self.hidden_size:]
        # Summary vector: forward direction at the last time step concatenated
        # with the backward direction at the first time step.
        # NOTE(review): index -1 is the last position of the given sequence, which
        # may be a padding token when inputs are padded - confirm this is intended.
        logits = self.decoder(torch.cat((outputs_fw[-1], outputs_bw[0]), dim = 1))
        return logits

In [None]:
# NOTE: despite its name, `vocab_size` holds the object returned by
# tokenizer.get_vocab(); the size itself is len(vocab_size).
vocab_size = tokenizer.get_vocab()
len(vocab_size)

In [None]:
# Student model hyperparameters.
embed_size, hidden_size, num_layers = 64, 64, 2
# vocab_size comes from tokenizer.get_vocab(); its length sizes the embedding table.
bilstm_model = BiLSTMClassification(len(vocab_size), embed_size, hidden_size, num_layers).to(device)
summary(bilstm_model)
# Count every parameter element and report the total in millions.
total = sum([param.nelement() for param in bilstm_model.parameters()])
print("Number of parameters: %.2fM" % (total/1e6))

In [None]:
def bilstm_evaluate_model(net, test_iter, criterion, device = None):
    """
    Evaluate the BiLSTM classifier on a test loader without gradient tracking.

    net: model called as net(inputs=<input_ids>); returns logits
    test_iter: iterable of batches exposing "input_ids" and "labels"
    criterion: loss applied to (logits, labels)
    device: target device for each batch; when falsy and net is an nn.Module,
            the device of net's first parameter is used

    Returns (accuracy, precision, recall, f1, avg_loss); avg_loss is the summed
    per-batch criterion value divided by the number of batches.
    An nn.Module `net` is put into eval mode and left there.
    """
    if isinstance(net, nn.Module):
        net.eval()
        device = device or next(iter(net.parameters())).device

    labels_per_batch, preds_per_batch = [], []
    running_loss = 0.0
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for batch in test_iter:
            batch = batch.to(device)
            logits = net(inputs = batch["input_ids"])
            targets = batch["labels"]

            running_loss += criterion(logits, targets).sum().item()

            # Move predictions and labels to host memory as numpy arrays.
            preds_per_batch.append(logits.argmax(dim = 1).cpu().detach().numpy())
            labels_per_batch.append(targets.cpu().detach().numpy())

    y_true = np.concatenate(labels_per_batch, axis=0)
    y_pred = np.concatenate(preds_per_batch, axis=0)

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1 = (
        metric(y_true, y_pred, average = "binary")
        for metric in (precision_score, recall_score, f1_score)
    )
    mean_loss = running_loss / len(test_iter)

    return accuracy, precision, recall, f1, mean_loss

In [None]:
# Cross-entropy loss used for the student model.
student_loss = nn.CrossEntropyLoss()
# Evaluate the freshly initialised student on the test loader.
test_acc, test_pre, test_rec, test_f1, test_loss = bilstm_evaluate_model(bilstm_model, test_loader, criterion = student_loss)

In [None]:
def bilstmmodel_classification(net, train_iter, criterion, optimizer, num_epochs, device):
    """
    Train the BiLSTM student on hard labels and print train/test metrics per epoch.

    net: model called as net(inputs=<input_ids>); returns logits
    train_iter: iterable of training batches exposing "input_ids" and "labels"
    criterion: loss applied to (logits, labels); also used for the per-epoch evaluation
    optimizer: optimizer stepped once per batch
    num_epochs: number of passes over train_iter
    device: device each batch is moved to

    The per-epoch evaluation runs on the notebook-level `test_loader`.
    Returns None; all results are printed.
    """
    for epoch in range(num_epochs):
        # Evaluation at the end of an epoch leaves the net in eval mode.
        net.train()
        timer = d2l.Timer()
        # Labels and predictions collected over the current epoch.
        train_y_all, prediction_all = [], []
        for train_data in tqdm(train_iter):
            train_data = train_data.to(device)

            start_time = time.time()

            # Only the token ids produced by the tokenizer are fed to the student.
            out = net(
                     inputs = train_data["input_ids"]
                     )
            train_y = train_data["labels"]

            loss = criterion(out, train_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Predicted class = index of the largest logit.
            prediction = out.argmax(dim = 1)

            end_time = time.time()
            # Wall-clock milliseconds for this batch's forward/backward/update;
            # only the value from the last batch reaches the end-of-epoch print.
            inference_time = (end_time - start_time) * 1000

            prediction = prediction.cpu().detach()
            train_y = train_y.cpu().detach()

            train_y_all.append(train_y)
            prediction_all.append(prediction)

        train_y_all = torch.cat(train_y_all, dim = 0)
        prediction_all = torch.cat(prediction_all, dim = 0)

        acc_score = accuracy_score(train_y_all,  prediction_all)
        pre_score = precision_score(train_y_all, prediction_all, average = "binary")
        rec_score = recall_score(train_y_all, prediction_all, average = "binary")
        fscore = f1_score(train_y_all, prediction_all, average = "binary")

        # `loss` is the loss of the last batch of the epoch.
        print(f'epoch{epoch+1}, train_loss {loss:.4f}, train_acc {acc_score:.4f}, train_pre {pre_score:.4f}, train_rec {rec_score:.4f}, train_f1 {fscore:.4f}')

        # Evaluate the network that was passed in, with the criterion that was
        # passed in, rather than whatever the notebook globals currently hold.
        test_acc, test_pre, test_rec, test_f1, test_loss = bilstm_evaluate_model(net, test_loader, criterion = criterion, device = device)
        print(f'test loss {test_loss:.4f}', f'test acc {test_acc:.4f}', f'test pre {test_pre:.4f}', f'test rec {test_rec:.4f}', f'test f1 {test_f1:.4f}')

        print("Total time : {:.2f}".format(timer.stop()), "Single_data time: {:.2f} ms".format(inference_time))

In [None]:
# Hyperparameter: learning rate for the stand-alone student run.
lr = 0.0001
# Hyperparameter: number of training epochs.
num_epochs = 3

In [None]:
# Fresh student model, loss and optimizer for the stand-alone training run.
# vocab_size comes from tokenizer.get_vocab().
bilstm_model = BiLSTMClassification(len(vocab_size), embed_size, hidden_size, num_layers).to(device)
student_loss = nn.CrossEntropyLoss()
# Hyperparameter: choice of optimizer.
student_optimizer = AdamW(bilstm_model.parameters(), lr=lr)

In [None]:
# Train the student on hard labels only (no distillation).
bilstmmodel_classification(bilstm_model, train_loader, criterion = student_loss, optimizer = student_optimizer, num_epochs = num_epochs, device = device)

In [None]:
def kd_evaluate_model(teacher_net, student_net, test_iter, hard_loss, soft_loss, device = None):
    """
    Evaluate the student on a test loader and report the distillation loss.

    teacher_net: model called with input_ids, attention_mask and token_type_ids; returns logits
    student_net: model called as student_net(inputs=<input_ids>); returns logits
    test_iter: iterable of batches exposing those keys plus "labels"
    hard_loss: loss applied to (student logits, labels)
    soft_loss: loss applied to (log-softmax of student logits / T, softmax of teacher logits / T)
    device: target device for each batch; when falsy and student_net is an
            nn.Module, the device of the student's first parameter is used

    The temperature `T` and mixing weight `alpha` are read from the notebook-level
    variables of the same names.

    Returns (accuracy, precision, recall, f1, avg_loss) computed from the
    student's predictions; avg_loss is a float: the summed per-batch combined
    loss divided by the number of batches.
    Both networks are put into eval mode (when student_net is an nn.Module) and left there.
    """
    if isinstance(student_net, nn.Module):
        teacher_net.eval()
        student_net.eval()
        if not device:
            device = next(iter(student_net.parameters())).device
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        test_y_all, student_prediction_all = [], []
        total_loss = 0.0
        for test_data in test_iter:
            test_data = test_data.to(device)
            teacher_out = teacher_net(
                                     input_ids = test_data["input_ids"],
                                     attention_mask = test_data["attention_mask"],
                                     token_type_ids = test_data["token_type_ids"]
                                     )
            student_out = student_net(
                                     inputs = test_data["input_ids"]
                                     )
            test_y = test_data["labels"]
            student_loss = hard_loss(student_out, test_y)
            dist_loss = soft_loss(
                                 F.log_softmax(student_out / T, dim = -1),
                                 F.softmax(teacher_out / T, dim = -1)
                                 )
            # Keep the batch loss in its own variable so the running total is
            # accumulated instead of being overwritten on every batch.
            batch_loss = (1 - alpha) * student_loss + alpha * dist_loss
            total_loss += batch_loss.sum().item()

            student_prediction = student_out.argmax(dim = 1)
            # Move predictions and labels to host memory as numpy arrays.
            student_prediction = student_prediction.cpu().detach().numpy()
            test_y = test_y.cpu().detach().numpy()

            test_y_all.append(test_y)
            student_prediction_all.append(student_prediction)

        test_y_all = np.concatenate(test_y_all, axis=0)
        student_prediction_all = np.concatenate(student_prediction_all, axis=0)

        acc_score = accuracy_score(test_y_all, student_prediction_all)
        pre_score = precision_score(test_y_all, student_prediction_all, average = "binary")
        rec_score = recall_score(test_y_all, student_prediction_all, average = "binary")
        fscore = f1_score(test_y_all, student_prediction_all, average = "binary")

        avg_loss = total_loss / len(test_iter)

    return acc_score, pre_score, rec_score, fscore, avg_loss

In [None]:
# Distillation settings: learning rate and number of epochs.
kd_lr = 0.001
kd_num_epochs = 3
# Teacher and student are the models built earlier in the notebook.
teacher_model = bert_wwm_ext_model.to(device)
student_model = bilstm_model.to(device)
# Hard loss compares student logits with labels; soft loss compares the
# temperature-scaled student and teacher distributions.
kd_hard_loss = nn.CrossEntropyLoss()
kd_soft_loss = nn.KLDivLoss(reduction = 'batchmean')
# Hyperparameter: choice of optimizer (updates the student only).
kd_optimizer = torch.optim.Adam(student_model.parameters(), lr = kd_lr)
# T: distillation temperature; alpha: weight of the soft loss in the combined loss.
T, alpha = 2, 0.8

In [None]:
# Evaluate the student against the teacher on the test loader before distillation.
test_acc, test_pre, test_rec, test_f1, test_loss = kd_evaluate_model(teacher_net = bert_wwm_ext_model, student_net = student_model, test_iter = test_loader, hard_loss = kd_hard_loss, soft_loss = kd_soft_loss)

In [None]:
def kd_blk(teacher_net, student_net, train_iter, hard_loss, soft_loss, optimizer, T, alpha, num_epochs, device):
    """
    Distil the teacher into the student and print train/test metrics per epoch.

    teacher_net: teacher model, called with input_ids, attention_mask and token_type_ids
    student_net: student model, called as student_net(inputs=<input_ids>)
    train_iter: iterable of training batches exposing those keys plus "labels"
    hard_loss: loss applied to (student logits, labels)
    soft_loss: loss applied to (log-softmax of student logits / T, softmax of teacher logits / T)
    optimizer: optimizer stepped once per batch
    T: distillation temperature
    alpha: weight of the soft loss; the hard loss is weighted by (1 - alpha)
    num_epochs: number of passes over train_iter
    device: device the models and batches are moved to

    The per-epoch evaluation runs on the notebook-level `test_loader`;
    kd_evaluate_model reads its own temperature/alpha from notebook-level variables.
    Returns None; all results are printed.
    """
    # Wrap both networks in DataParallel and move them to `device`.
    student_net = nn.DataParallel(student_net).to(device)
    teacher_net = nn.DataParallel(teacher_net).to(device)
    # The teacher is never trained during distillation.
    teacher_net.eval()
    for epoch in range(num_epochs):
        timer = d2l.Timer()
        # Evaluation at the end of an epoch leaves the student in eval mode.
        student_net.train()
        # Labels and student predictions collected over the current epoch.
        train_y_all, student_prediction_all = [], []
        for train_data in tqdm(train_iter):
            train_data = train_data.to(device)

            start_time = time.time()

            # No gradients are tracked for the teacher's forward pass.
            with torch.no_grad():
                teacher_pre = teacher_net(
                                         input_ids = train_data["input_ids"],
                                         attention_mask = train_data["attention_mask"],
                                         token_type_ids = train_data["token_type_ids"]
                                         )
            # A single student forward pass per batch.
            student_pre = student_net(
                                     inputs = train_data["input_ids"]
                                     )
            train_y = train_data["labels"]

            # Hard (student) loss against the labels.
            student_loss = hard_loss(student_pre, train_y)
            # Soft (distillation) loss against the teacher's softened distribution.
            dist_loss = soft_loss(F.log_softmax(student_pre / T, dim = -1),
                                  F.softmax(teacher_pre / T, dim = -1)
                                 )
            total_loss = (1 - alpha) * student_loss + alpha * dist_loss

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            # Predicted class = index of the largest student logit.
            student_prediction = student_pre.argmax(dim = 1)

            end_time = time.time()
            # Wall-clock milliseconds for this batch's forward/backward/update;
            # only the value from the last batch reaches the end-of-epoch print.
            inference_time = (end_time - start_time) * 1000

            student_prediction = student_prediction.cpu().detach()
            train_y = train_y.cpu().detach()

            train_y_all.append(train_y)
            student_prediction_all.append(student_prediction)

        train_y_all = torch.cat(train_y_all, dim = 0)
        student_prediction_all = torch.cat(student_prediction_all, dim = 0)

        acc_score = accuracy_score(train_y_all,  student_prediction_all)
        pre_score = precision_score(train_y_all, student_prediction_all, average = "binary")
        rec_score = recall_score(train_y_all, student_prediction_all, average = "binary")
        fscore = f1_score(train_y_all, student_prediction_all, average = "binary")   

        # `total_loss` is the combined loss of the last batch of the epoch.
        print(f'epoch{epoch+1}, train_loss {total_loss:.4f}, train acc {acc_score:.4f}, train pre {pre_score:.4f}, train rec {rec_score:.4f}, train f1 {fscore:.4f}')

        # Evaluate the networks and losses handed to this function, not
        # whatever the notebook globals currently hold.
        test_acc, test_pre, test_rec, test_f1, test_loss = kd_evaluate_model(teacher_net = teacher_net, student_net = student_net, test_iter = test_loader, hard_loss = hard_loss, soft_loss = soft_loss, device = device)
        print(f'test loss {test_loss:.4f}', f'test acc {test_acc:.4f}', f'test pre {test_pre:.4f}', f'test rec {test_rec:.4f}', f'test f1 {test_f1:.4f}')

        print("Total time : {:.2f}".format(timer.stop()), "Single_data time: {:.2f} ms".format(inference_time))

In [None]:
# Distillation settings: learning rate and number of epochs.
kd_lr = 0.001
kd_num_epochs = 3

In [None]:
# Teacher and student are the models built earlier in the notebook.
teacher_model = bert_wwm_ext_model.to(device)
student_model = bilstm_model.to(device)
# Hard loss compares student logits with labels; soft loss compares the
# temperature-scaled student and teacher distributions.
kd_hard_loss = nn.CrossEntropyLoss()
kd_soft_loss = nn.KLDivLoss(reduction = 'batchmean')
# Hyperparameter: choice of optimizer (updates the student only).
kd_optimizer = torch.optim.Adam(student_model.parameters(), lr = kd_lr)
# T: distillation temperature; alpha: weight of the soft loss in the combined loss.
T, alpha = 2, 0.8

In [None]:
# Run knowledge distillation from the teacher into the student.
kd_blk(teacher_net = teacher_model, student_net = student_model, train_iter = train_loader, hard_loss = kd_hard_loss, soft_loss = kd_soft_loss, optimizer = kd_optimizer, T = T, alpha = alpha, num_epochs = kd_num_epochs, device = device)