In [1]:
import subprocess
import os
# Blank out CURL_CA_BUNDLE before anything is downloaded.
# NOTE(review): presumably a workaround for TLS verification behind the proxy — confirm it is intended.
os.environ['CURL_CA_BUNDLE'] = ''

# Source the host's /etc/network_turbo script in a bash subshell and mirror
# every proxy-related variable it exports into this process's environment.
proxy_dump = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
).stdout
os.environ.update(
    entry.split('=', 1) for entry in proxy_dump.splitlines() if '=' in entry
)

##  回顾：损失函数计算公式

- $q(x)$：学生模型（student）输出的概率分布；$p(x)$：教师模型（teacher）输出的概率分布
- 计算 $q(x)$ 和 $p(x)$ 时，logits 都要先除以温度 $T$ 再做 softmax，即 $\text{softmax}(z/T)$
$$
\begin{split}
L_{\text{student}}&=\alpha L_{\text{CE}} + (1-\alpha)L_{KD}\\
&=\alpha L_{\text{CE}} + (1-\alpha)T^2D_{KL}\\
&=\alpha L_{\text{CE}} + (1-\alpha)T^2\sum_ip_i(x)\log\frac{p_i(x)}{q_i(x)}
\end{split}
$$

- 其中，KL 散度可以通过 `nn.KLDivLoss()` 函数来计算
    - inputs ($q(x)$): log probabilities
    - labels ($p(x)$): normal probabilities

## TrainingArguments & Trainer

In [2]:
from transformers import TrainingArguments, Trainer
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
# import wandb

In [3]:
class DistillTrainingArguments(TrainingArguments):
    """``TrainingArguments`` extended with the two knowledge-distillation knobs.

    ``TrainingArguments`` is a dataclass; instead of declaring new fields,
    this subclass accepts the extra values as keyword-only arguments and
    stores them as plain attributes once the parent initialiser has run.

    Args:
        *args: forwarded unchanged to ``TrainingArguments``.
        alpha: stored as ``self.alpha`` (default 0.5).
        temperature: stored as ``self.temperature`` (default 2.0).
        **kwargs: forwarded unchanged to ``TrainingArguments``.
    """

    def __init__(self, *args, alpha=0.5, temperature=2., **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha, self.temperature = alpha, temperature

In [4]:
class DistillTrainer(Trainer):
    """``Trainer`` whose loss adds a knowledge-distillation term.

    The loss is ``alpha * CE + (1 - alpha) * T**2 * KL(teacher || student)``,
    with ``alpha`` and ``T`` read from ``self.args.alpha`` and
    ``self.args.temperature``.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        """Build a regular ``Trainer`` and remember the teacher.

        Args:
            *args: forwarded unchanged to ``Trainer.__init__``.
            teacher_model: model whose logits provide the soft targets.
            **kwargs: forwarded unchanged to ``Trainer.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        if self.teacher_model is not None:
            # The teacher only supplies targets; keep it in inference mode so
            # layers such as dropout do not perturb them.
            self.teacher_model.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the distillation loss for one batch.

        Args:
            model: the student being trained.
            inputs: batch of model inputs; it must contain labels, because the
                student's own ``loss`` is used as the cross-entropy term.
            return_outputs: when true, return ``(loss, student_outputs)``.
            num_items_in_batch: accepted so the override stays callable by
                ``Trainer`` versions that pass this keyword; unused here.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` when
            ``return_outputs`` is true.
        """
        alpha = self.args.alpha
        temperature = self.args.temperature

        s_output = model(**inputs)
        s_ce = s_output.loss
        s_logits = s_output.logits

        # Only the teacher's logits are used, so skip its label loss and do
        # not build an autograd graph for it.
        teacher_inputs = {k: v for k, v in inputs.items() if k != 'labels'}
        with torch.no_grad():
            t_logits = self.teacher_model(**teacher_inputs).logits

        # kl_div takes log-probabilities as input and probabilities as target;
        # the T**2 factor follows the formula in the class docstring.
        loss_kd = temperature ** 2 * F.kl_div(
            F.log_softmax(s_logits / temperature, dim=-1),
            F.softmax(t_logits / temperature, dim=-1),
            reduction='batchmean',
        )
        loss = alpha * s_ce + (1 - alpha) * loss_kd
        return (loss, s_output) if return_outputs else loss

## pipeline

### datasets

In [5]:
# import os
# os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'

In [6]:
from datasets import load_dataset

# Sequence-classification data: the "plus" configuration of the `clinc_oos` dataset.
clinc = load_dataset("clinc_oos", "plus")

In [7]:
# Show the available splits with their columns and row counts.
clinc

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
})

In [8]:
# Peek at the first ten training examples (raw text plus integer intent id).
clinc['train'][:10]

{'text': ['what expression would i use to say i love you if i were an italian',
  "can you tell me how to say 'i do not speak much spanish', in spanish",
  "what is the equivalent of, 'life is good' in french",
  "tell me how to say, 'it is a beautiful morning' in italian",
  'if i were mongolian, how would i say that i am a tourist',
  "how do i say 'hotel' in finnish",
  "i need you to translate the sentence, 'we will be there soon' into portuguese",
  'please tell me how to ask for a taxi in french',
  "can you tell me how i would say, 'more bread please' in french",
  "what is the correct way to say 'i am a visitor' in french"],
 'intent': [61, 61, 61, 61, 61, 61, 61, 61, 61, 61]}

In [9]:
# The 'intent' feature holds the label metadata; its class count is what the
# teacher and student classifiers are configured with below.
intents = clinc['train'].features['intent']
num_labels = intents.num_classes
num_labels

151

### Student model 初始化

In [10]:
from transformers import AutoConfig, AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [11]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Student: only its tokenizer is needed at this point.
s_ckpt = 'distilbert-base-uncased'
s_tokenizer = AutoTokenizer.from_pretrained(s_ckpt)

# Teacher: load the classifier checkpoint, then move it onto the device.
t_ckpt = 'transformersbook/bert-base-uncased-finetuned-clinc'
t_model = AutoModelForSequenceClassification.from_pretrained(
    t_ckpt,
    num_labels=num_labels,
)
t_model = t_model.to(device)

In [12]:
def _tokenize_text(batch):
    """Tokenize a batch of raw utterances with the student tokenizer (truncation on)."""
    return s_tokenizer(batch['text'], truncation=True)


# Encode every split in batches and drop the raw text column; the label
# column is then renamed from 'intent' to 'labels'.
clinc_tokenized = clinc.map(_tokenize_text, batched=True, remove_columns=["text"])
clinc_enc = clinc_tokenized.rename_columns({'intent': 'labels'})
clinc_enc

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 5500
    })
})

In [19]:
batch_size = 64

# Student config: classifier sized by num_labels, label maps copied from the teacher.
s_config = AutoConfig.from_pretrained(
    s_ckpt,
    num_labels=num_labels,
    id2label=t_model.config.id2label,
    label2id=t_model.config.label2id,
)

# Training / distillation hyper-parameters. `temperature` is not passed, so
# the DistillTrainingArguments default applies.
s_training_args = DistillTrainingArguments(
    output_dir='distilbert-base-uncased-ft-clinc',
    num_train_epochs=5,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    alpha=0.5,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    push_to_hub=False,
    report_to="none",  # so W&B is not enabled automatically
)



In [14]:
def student_init():
    """Build a fresh student classifier from ``s_ckpt`` with ``s_config`` and move it to ``device``."""
    student = AutoModelForSequenceClassification.from_pretrained(s_ckpt, config=s_config)
    return student.to(device)

## Student model 训练

In [15]:
import evaluate
# Load the 'accuracy' metric once; compute_metrics calls it.
accuracy_score = evaluate.load('accuracy')

In [16]:
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: a pair ``(predictions, labels)``; the predicted class is the
            arg-max of ``predictions`` along the last axis.

    Returns:
        Whatever ``accuracy_score.compute`` returns for those predictions.
    """
    scores, references = pred
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy_score.compute(predictions=predicted_ids, references=references)

In [20]:
# Passing model_init (a factory) instead of a model instance lets the Trainer
# build the student itself; teacher_model is the extra keyword that
# DistillTrainer accepts on top of the regular Trainer arguments.
distill_trainer = DistillTrainer(
    model_init=student_init, 
    teacher_model=t_model, 
    args=s_training_args, 
    train_dataset=clinc_enc['train'], 
    eval_dataset=clinc_enc['validation'], 
    compute_metrics=compute_metrics, 
    tokenizer=s_tokenizer,
)
distill_trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2396,0.446824,0.89
2,0.312,0.331388,0.932258
3,0.2236,0.299509,0.945806
4,0.1938,0.283754,0.950645
5,0.1822,0.27617,0.951613


TrainOutput(global_step=1195, training_loss=0.4302276834783195, metrics={'train_runtime': 142.9024, 'train_samples_per_second': 533.581, 'train_steps_per_second': 8.362, 'total_flos': 427022126020140.0, 'train_loss': 0.4302276834783195, 'epoch': 5.0})

In [23]:
distill_trainer.save_model("./distilbert-base-uncased-ft-clinc")  # save the trained student model

In [21]:
import math
# Sanity check of the global_step reported by train():
# ceil(train rows / (batch size * 1)) * epochs, with 15250 rows, batch size 64, 5 epochs.
# NOTE(review): the `* 1` is presumably the number of devices — confirm.
math.ceil(15250 / (64 * 1)) * 5

1195

## Student model 使用

In [26]:
from transformers import pipeline

# ft_ckpt = 'lanchunhui/distilbert-base-uncased-ft-clinc'
# distill_trainer.push_to_hub('finetune completed!')

# Load the saved student for inference. Use GPU 0 when CUDA is available and
# fall back to the CPU (-1) otherwise, instead of failing on CPU-only machines.
pipe = pipeline(
    'text-classification',
    model='./distilbert-base-uncased-ft-clinc/',
    device=0 if torch.cuda.is_available() else -1,
)

In [28]:
# Classify one free-form utterance; the result is a list with the predicted label and its score.
pipe(""" Hey, I'd like to rent a vehicle from Nov 1st to Nov 15th in Paris and I need a 15 passenger van """)

[{'label': 'car_rental', 'score': 0.8687736988067627}]