# 微调预训练模型

- 任务：微调一个句子对分类模型（保存验证集上的最好模型权重）
- Dataset：模型。我们选择蚂蚁金融语义相似度数据集 AFQMC 作为语料
    - `{"sentence1": "还款还清了，为什么花呗账单显示还要还款", "sentence2": "花呗全额还清怎么显示没有还款", "label": "1"}`

## Loading Dataset
### **首先继承 Dataset 类构造自定义数据集，以组织样本和标签。**
- 如果数据集非常巨大，难以一次性加载到内存中，我们也可以继承 IterableDataset 类构建迭代型数据集

In [1]:
from torch.utils.data import Dataset
import json

class AFQMC(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file, ):
        Data = {}
        with open (data_file, "rt", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                sample = json.loads(line.strip())
                Data[idx] = sample
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
        

In [2]:
import os
os.getcwd()

'C:\\Users\\hhm18\\Desktop\\course'

In [3]:
train_data = AFQMC("dataset/AFQMC/train.json")
valid_data = AFQMC("dataset/AFQMC/dev.json")
train_data[0], len(train_data)

({'sentence1': '蚂蚁借呗等额还款可以换成先息后本吗', 'sentence2': '借呗有先息到期还本吗', 'label': '0'},
 34334)

### DataLoader
按照批次进行数据加载，并且转化为需要的格式。(padding, trancated...)


In [4]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
def collote_fn(batch_samples):
    batch_samples_1, batch_samples_2 =[], []
    batch_label = []
    for sample in batch_samples:
        batch_samples_1.append(sample["sentence1"])
        batch_samples_2.append(sample["sentence2"])
        batch_label.append(int(sample['label']))
        
    # 转换为张量
    X = tokenizer(
        batch_samples_1, batch_samples_2, 
        padding=True, truncation=True, return_tensors="pt"
    )
    y = torch.tensor(batch_label)
    return X, y

In [7]:
train_dataloader = DataLoader(
    train_data, batch_size=16, shuffle=True, collate_fn=collote_fn
)
valid_dataloader = DataLoader(
    valid_data, batch_size=16, shuffle=True, collate_fn=collote_fn
)

In [8]:
len(train_dataloader), len(valid_dataloader)

(2146, 270)

In [9]:
batch_X, batch_y = next(iter(train_dataloader))
print('batch_X shape:', {k: v.shape for k, v in batch_X.items()})
print('batch_y shape:', batch_y.shape)
print(batch_X)
print(batch_y)

batch_X shape: {'input_ids': torch.Size([16, 65]), 'token_type_ids': torch.Size([16, 65]), 'attention_mask': torch.Size([16, 65])}
batch_y shape: torch.Size([16])
{'input_ids': tensor([[ 101,  711,  784,  ...,    0,    0,    0],
        [ 101, 2769,  679,  ...,    0,    0,    0],
        [ 101, 5709, 1446,  ...,    0,    0,    0],
        ...,
        [ 101, 1377,  809,  ...,    0,    0,    0],
        [ 101,  711,  784,  ...,    0,    0,    0],
        [ 101, 5709, 1446,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
tensor([1, 0, 0, 1, 1,

可以看到，DataLoader 按照我们设置的 batch size 每次对 4 个样本进行编码，并且通过设置 padding=True 和 truncation=True 来自动对每个 batch 中的样本进行补全和截断。
- [CLS]--101 [SPE]--102
- 按照BERT将样本处理成了[CLS] ...[SEP] ... [SEP]  

## 训练模型
可以直接使用HF的`AutoModelForSentenceClassification`，但是实际中需要一些自定义操作。
最简单的方法是加载BERT，然后加一个全连接层完成分类任务

```python
from torch import nn
from transformers import AutoModel

device = "cuda" if torch.cuda.is_available else "cpu"
print(f"Using {device} device")

class BertForPairewiseCLS(nn.Module):
    def __init__(self):
        super(BertForPairewiseCLS, self).__init__()
        self.bert_encoder = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2) # 这里的label只有两个1，0

    def forward(self, x):
        bert_output = self.bert_encoder(**x)
        cls_vecter = bert_output.last_hidden_state[:, 0, :] # 手动提取CLS
        cls_vecter = self.dropout(cls_vecter)  # 手动添加dropout
        logits = self.classifier(cls_vecter)   # 手动分类
        return logits

model = BertForPairewiseCLS().to(device)
print(model)
```

更为常见的写法是继承 Transformers 库中的预训练模型来创建自己的模型。例如这里我们可以继承 BERT 模型（BertPreTrainedModel 类）来创建一个与上面模型结构完全相同的分类器：
```python
from torch import nn
from transformers import AutoConfig
from transformers import BertPreTrainedModel, BertModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

class BertForPairwiseCLS(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(768, 2)
        self.post_init()
    
    def forward(self, x):
        bert_output = self.bert(**x)
        cls_vectors = bert_output.last_hidden_state[:, 0, :]
        cls_vectors = self.dropout(cls_vectors)
        logits = self.classifier(cls_vectors)
        return logits

config = AutoConfig.from_pretrained(checkpoint)
model = BertForPairwiseCLS.from_pretrained(checkpoint, config=config).to(device)
print(model)
```

In [10]:
from torch import nn
from transformers import AutoConfig
from transformers import BertPreTrainedModel, BertModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

class BertForPairwiseCLS(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(768, 2)
        self.post_init()
    
    def forward(self, x):
        bert_output = self.bert(**x)
        cls_vectors = bert_output.last_hidden_state[:, 0, :]
        cls_vectors = self.dropout(cls_vectors)
        logits = self.classifier(cls_vectors)
        return logits

config = AutoConfig.from_pretrained(checkpoint)
model = BertForPairwiseCLS.from_pretrained(checkpoint, config=config).to(device)
print(model)

Using cuda device


Some weights of BertForPairwiseCLS were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForPairwiseCLS(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [11]:
# 测试输出的shape
outputs = model(batch_X.to(device))
print(outputs.shape)

torch.Size([16, 2])


## 优化模型的参数
每一轮 Epoch 分为训练循环和验证/测试循环。在训练循环中计算损失、优化模型的参数，在验证/测试循环中评估模型的性能：

In [12]:
from tqdm.auto import tqdm

def train_loop(dataloader, model, loss_fn, optimizer, 
               lr_shedular, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f"loss: {0:>7f}")
    finish_step_num = (epoch-1) * len(dataloader)

    model.train()
    for step, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_shedular.step()

        total_loss += loss.item()
        progress_bar.set_description(f"loss{total_loss/(finish_step_num + step):>7f}")
        progress_bar.update(1)
    return total_loss

def test_loop(dataloader, model, mode="Test"):
    assert mode in ["Valid", "Test"] 
    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            # 对于[batch_size, num_classes]取出标签维度的最大索引得到预测标签，
            # 和真实标签进行比较 返回bool，在进行01转换求和出正确的总数
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    correct /=size
    print(f"{mode}Accuracy:{(100 * correct):>0.1f}%\n")


Transformers 库同样实现了很多的优化器，并且相比 Pytorch 固定学习率，Transformers 库的优化器会随着训练过程逐步减小学习率（通常会产生更好的效果）。

In [13]:
from transformers import get_scheduler
from torch.optim import AdamW

epoch =3
lr = 1e-5
loss_fn = nn.CrossEntropyLoss()
num_training_steps = epoch * len(train_dataloader)

optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

total_loss = 0.
for t in range(epoch):
    print(f"Epoch {t+1}/{epoch}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    test_loop(valid_dataloader, model, mode='Valid')
print("Done!")


Epoch 1/3
-------------------------------


loss0.610267: 100%|████████████████████████████████████████████████████████████████| 2146/2146 [04:36<00:00,  7.77it/s]


ValidAccuracy:69.0%

Epoch 2/3
-------------------------------


loss0.587256: 100%|████████████████████████████████████████████████████████████████| 2146/2146 [04:35<00:00,  7.80it/s]


ValidAccuracy:69.3%

Epoch 3/3
-------------------------------


loss0.556291: 100%|████████████████████████████████████████████████████████████████| 2146/2146 [04:31<00:00,  7.92it/s]


ValidAccuracy:68.4%

Done!


In [14]:
def test_loop(dataloader, model, mode="Test"):
    assert mode in ["Valid", "Test"] 
    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            # 对于[batch_size, num_classes]取出标签维度的最大索引得到预测标签，
            # 和真实标签进行比较 返回bool，在进行01转换求和出正确的总数
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    correct /=size
    print(f"{mode}Accuracy:{(100 * correct):>0.1f}%\n")
    return correct

total_loss = 0.
best_acc = 0.
for t in range(epoch):
    print(f"Epoch {t+1}/{epoch}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    valid_acc = test_loop(valid_dataloader, model, mode='Valid')
    if valid_acc > best_acc:
        best_acc = valid_acc
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_model_weights.bin')
print("Done!")

Epoch 1/3
-------------------------------


loss0.442641: 100%|████████████████████████████████████████████████████████████████| 2146/2146 [04:30<00:00,  7.92it/s]


ValidAccuracy:68.4%

saving new weights...

Epoch 2/3
-------------------------------


loss0.443197: 100%|████████████████████████████████████████████████████████████████| 2146/2146 [04:30<00:00,  7.95it/s]


ValidAccuracy:68.4%

Epoch 3/3
-------------------------------


loss0.443389: 100%|████████████████████████████████████████████████████████████████| 2146/2146 [04:34<00:00,  7.81it/s]


ValidAccuracy:68.4%

Done!


最后，我们加载验证集上最优的模型权重，汇报其在测试集上的性能。由于 AFQMC 公布的测试集上并没有标签，无法评估性能，这里我们暂且用验证集代替进行演示：

In [17]:
model.load_state_dict(torch.load('./model/epoch_1_valid_acc_68.4_model_weights.bin'))
print(test_loop(valid_dataloader, model, mode='Test'))

  model.load_state_dict(torch.load('./model/epoch_1_valid_acc_68.4_model_weights.bin'))


TestAccuracy:68.4%

0.6835032437442076
