# Transformers包使用教程
Author:Yuan Zheng

# Introduction
- 预训练模型介绍
- 安装Transformers
- 加载预训练模型
- fine tune预训练模型
- 训练自己的下游模型

# 预训练模型介绍
## Bert
英文模型、中文模型
最近更新了wwm和小参数模型
https://github.com/google-research/bert

### Whole-word-masking
Original Sentence: 我今天吃饭了。
Mask: 我今\[MASK\]吃饭了。
Whole-word-masking: 我\[MASK\]\[MASK\]吃饭了。

bert-wwm
roberta-wwm
https://github.com/ymcui/Chinese-BERT-wwm

### BioBert
用PubMed + ... 训练的英文bert
https://huggingface.co/monologg/biobert_v1.1_pubmed

## Roberta
改变了一些预训练细节，包括mask和batch等。
https://github.com/ymcui/Chinese-BERT-wwm

## Albert
参数共享，更小的模型文件，更好的表现。
https://github.com/google-research/albert

# 下载地址
/media/sdc/GanjinZero/pretraining_models/
https://huggingface.co/models

# 安装transformers
- 需要Tensorflow 2.0+或者Pytorch 1.0+
- 主要还是用PyTorch
https://github.com/huggingface/transformers

```
pip install transformers
```

In [1]:
import transformers
# As the last expression of the cell, this makes the notebook display the installed version.
transformers.__version__

I0327 12:16:26.964957 140463573395200 file_utils.py:41] PyTorch version 1.2.0 available.


'2.5.1'

# 加载预训练模型
预训练模型在transformers包中有三个组成部分：模型结构(config)、预训练文件(checkpoint)、词典(vocab)

In [4]:
# Load an officially released pretrained model by name; this requires downloading it online.

from transformers import BertModel, BertTokenizer
from transformers import AutoModel, AutoTokenizer  # Auto* classes detect the model type automatically

# The commented-out pairs below are alternatives: uncomment one pair to use it.

## Bert
# model = BertModel.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## BioBert
# tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")
# model = AutoModel.from_pretrained("monologg/biobert_v1.1_pubmed")

## Roberta
# tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
# model = AutoModel.from_pretrained("hfl/chinese-roberta-wwm-ext")

## Albert
# tokenizer = AutoTokenizer.from_pretrained("voidful/albert_chinese_base")
# model = AutoModel.from_pretrained("voidful/albert_chinese_base")

# Load a model from a local directory (possibly one you fine-tuned yourself).
# Here: the bert_wwm checkpoint. Tokenizer and model are read from the same directory.
LOCAL_MODEL_DIR = "/media/sdc/GanjinZero/pretraining_models/bert_wwm"
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIR)
model = AutoModel.from_pretrained(LOCAL_MODEL_DIR)


I0327 12:39:07.922970 140463573395200 configuration_utils.py:254] loading configuration file /media/sdc/GanjinZero/pretraining_models/bert_wwm/config.json
I0327 12:39:07.924646 140463573395200 configuration_utils.py:292] Model config BertConfig {
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "outpu

In [24]:
import torch
import numpy as np

sentence = "孙一峰二五仔。"
print(len(sentence))

# Encode the sentence with special tokens, then add a leading batch dimension.
token_ids = tokenizer.encode(sentence, add_special_tokens=True)
input_ids = torch.tensor(token_ids).unsqueeze(0)

# Forward pass only; no gradients are needed here.
with torch.no_grad():
    output = model(input_ids)
output_seq, output_cls = output[0], output[1]

print(output_seq.shape)  # used for per-character classification
print(output_cls.shape)  # used for sentence classification
# Compare the first position of the sequence output with the second model output.
(output_seq[0][0] == output_cls[0]).all()

7
torch.Size([1, 9, 768])
torch.Size([1, 768])


tensor(False)

# 文本分类模型
run_glue.py
https://github.com/huggingface/transformers/blob/master/examples/run_glue.py

```shell
# Location of the GLUE data and the name of the task to fine-tune on.
export GLUE_DIR=/path/to/glue
export TASK_NAME=MRPC

# Variable expansions are quoted so that a path containing spaces stays a single argument.
python ./examples/run_glue.py \
    --model_type bert \
    --model_name_or_path bert-base-uncased \
    --task_name "$TASK_NAME" \
    --do_train \
    --do_eval \
    --do_lower_case \
    --data_dir "$GLUE_DIR/$TASK_NAME" \
    --max_seq_length 128 \
    --per_gpu_eval_batch_size=8 \
    --per_gpu_train_batch_size=8 \
    --learning_rate 2e-5 \
    --num_train_epochs 3.0 \
    --output_dir "/tmp/$TASK_NAME/"
```
参数解释：
- model_type 模型类型
- model_name_or_path 模型地址
- task_name 任务名称
- do_train 是否训练
- do_eval 是否验证
- do_lower_case 是否小写（和模型有关系）
- data_dir 数据集文件夹
- max_seq_length 文本最长长度
- learning_rate 学习率：建议1e-5~5e-5
- num_train_epochs epoch：建议2~5
- batch_size 看如下表格

12GB显存
System       | Seq Length | Max Batch Size
------------ | ---------- | --------------
`BERT-Base`  | 64         | 64
...          | 128        | 32
...          | 256        | 16
...          | 320        | 14
...          | 384        | 12
...          | 512        | 6
`BERT-Large` | 64         | 12
...          | 128        | 6
...          | 256        | 2
...          | 320        | 1
...          | 384        | 0
...          | 512        | 0
据说batch size过小会影响效果

对于自己的任务，需要改写预处理数据的文件！或者将自己的数据弄得和他格式一致。
比如说你的任务是双句子二分类任务（比如句子相似度任务），可以参考MRPC数据集在transformers里的读入代码：

```python
class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # Builds an InputExample from the "idx", "sentence1", "sentence2" and "label"
        # entries; both sentences are decoded as UTF-8 and the label is converted to str.
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["sentence1"].numpy().decode("utf-8"),
            tensor_dict["sentence2"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # Training examples are read from <data_dir>/train.tsv.
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # Dev examples are read from <data_dir>/dev.tsv.
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        # Two classes, represented as strings.
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                # The first row is skipped (presumably the header row of the file).
                continue
            # The guid combines the split name and the row index, e.g. "train-1".
            guid = "%s-%s" % (set_type, i)
            # Column layout used here: label in column 0, the two sentences in columns 3 and 4.
            text_a = line[3]
            text_b = line[4]
            label = line[0]
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
```

你需要在data_dir中存有train.tsv、dev.tsv和test.tsv（上面的代码按.tsv文件名读取）。文件的第一行会被当作表头跳过，其余每行的格式如下（xxx表示任意内容，+代表tab）：
label + xxx + xxx + text_a + text_b

如果你是单句子二分类任务，可以表示成
label + xxx + xxx + text_a + ""

主要目的就是为了凑
```python
text_a = line[3]
text_b = line[4]
label = line[0]
```

对于需要完全改写预处理的文件格式，注释掉run_glue.py的这四行
```python
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
```
按照你自己的文件格式重写这4个函数，可以参照transformers里的源代码改写。
https://github.com/huggingface/transformers/tree/master/src/transformers/data

In [None]:
fine tune后的模型和预测结果都保存在output_dir指定的文件夹中。如果想要再次使用模型，读取该文件夹中保存的模型即可。

# 训练自己的下游模型
在这里举一个例子，如何利用Transformers搭建自己的模型。
模型目标：判断两个句子是否相似。
采用的模型结构：Siamese-Bert
完整的模型代码：/media/sdc/GanjinZero/bert-siamese
模型描述：用Bert作为句子的encoder，对于两个句子text_a, text_b；得到Bert(text_a), Bert(text_b)。用(Bert(text_a), Bert(text_b), |Bert(text_a) - Bert(text_b)|)连接一个全连接层预测两个句子的相似与否。Loss使用CrossEntropy(MSE当然也可以).

In [28]:
from transformers import BertConfig, BertPreTrainedModel, BertTokenizer, BertModel
from torch import nn
from torch.nn import MSELoss, CrossEntropyLoss
import torch


class BertSiamese(BertPreTrainedModel):
    """Siamese sentence-pair classifier built on one shared BERT encoder.

    Both sentences go through the same ``BertModel``. For each sentence the
    encoder output at index 1 is taken; the two vectors and their element-wise
    absolute difference are concatenated, passed through dropout and fed to a
    linear layer that produces two logits.
    """

    def __init__(self, config):
        """Build the shared encoder, the dropout layer and the classification head."""
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The head sees [u, v, |u - v|], hence 3 * hidden_size inputs.
        # Two outputs for classification (use 1 for regression).
        self.classifier = nn.Linear(3 * config.hidden_size, 2)

        # No manual init of the classifier before this call: in the transformers
        # implementation init_weights() initialises Linear layers as well, so any
        # earlier initialisation of self.classifier would simply be overwritten.
        self.init_weights()

    def _encode(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds):
        """Run the shared encoder on one sentence batch and return its output at index 1."""
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        return encoder_outputs[1]

    def forward(
        self,
        input_ids_0=None,
        attention_mask_0=None,
        token_type_ids_0=None,
        position_ids_0=None,
        head_mask_0=None,
        inputs_embeds_0=None,
        input_ids_1=None,
        attention_mask_1=None,
        token_type_ids_1=None,
        position_ids_1=None,
        head_mask_1=None,
        inputs_embeds_1=None,
        labels=None,
    ):
        """Score a batch of sentence pairs.

        Arguments suffixed ``_0`` describe the first sentence of each pair and
        those suffixed ``_1`` the second; each group is forwarded unchanged to
        the shared encoder.

        Returns:
            A tuple ``(loss, logits)``. ``loss`` is the cross-entropy between
            ``logits`` and ``labels`` when ``labels`` is given, otherwise the
            integer placeholder ``0``.
        """
        pooled_output_0 = self._encode(
            input_ids_0, attention_mask_0, token_type_ids_0,
            position_ids_0, head_mask_0, inputs_embeds_0,
        )
        pooled_output_1 = self._encode(
            input_ids_1, attention_mask_1, token_type_ids_1,
            position_ids_1, head_mask_1, inputs_embeds_1,
        )

        # Pair representation: both sentence vectors plus their absolute difference.
        minus = torch.abs(pooled_output_0 - pooled_output_1)
        h = torch.cat((pooled_output_0, pooled_output_1, minus), 1)
        logits = self.classifier(self.dropout(h))

        if labels is None:
            # Keep the two-element return shape even without a loss.
            return (0, logits)

        # Reshape by the head's own width so the class count is defined in one place.
        num_classes = self.classifier.out_features
        loss = nn.CrossEntropyLoss()(logits.view(-1, num_classes), labels.view(-1))
        return (loss, logits)


In [30]:
model = BertSiamese.from_pretrained('/media/sdc/GanjinZero/pretraining_models/bert_wwm')

# Encode each sentence with special tokens and give it a batch dimension of 1.
input_ids_0 = torch.tensor([tokenizer.encode("黄旭东扎色a", add_special_tokens=True)])
input_ids_1 = torch.tensor([tokenizer.encode("孙一峰二五仔", add_special_tokens=True)])

# Three pairs stacked along the batch axis: (0, 1), (1, 1) and (0, 0), labelled 0, 1, 1.
first_sentences = torch.cat((input_ids_0, input_ids_1, input_ids_0), 0)
second_sentences = torch.cat((input_ids_1, input_ids_1, input_ids_0), 0)
pair_labels = torch.tensor([0, 1, 1])

output_label = model(input_ids_0=first_sentences,
                     input_ids_1=second_sentences,
                     labels=pair_labels)
print(output_label)

I0327 13:29:11.108398 140463573395200 configuration_utils.py:254] loading configuration file /media/sdc/GanjinZero/pretraining_models/bert_wwm/config.json
I0327 13:29:11.110187 140463573395200 configuration_utils.py:292] Model config BertConfig {
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "outpu

(tensor(0.6920, grad_fn=<NllLossBackward>), tensor([[-1.0064, -0.1961],
        [-0.9117, -0.0814],
        [-0.7019, -0.3580]], grad_fn=<AddmmBackward>))


In [33]:
import json
import copy


class InputFeatures:
    """Encoded inputs for one sentence pair.

    Fields suffixed ``_0`` belong to the first sentence, fields suffixed ``_1``
    to the second; ``label`` is stored as given.
    """

    def __init__(
        self,
        input_ids_0,
        input_ids_1,
        attention_mask_0=None,
        attention_mask_1=None,
        token_type_ids_0=None,
        token_type_ids_1=None,
        label=None,
    ):
        # First-sentence fields, then second-sentence fields, then the label.
        self.input_ids_0, self.attention_mask_0, self.token_type_ids_0 = (
            input_ids_0,
            attention_mask_0,
            token_type_ids_0,
        )
        self.input_ids_1, self.attention_mask_1, self.token_type_ids_1 = (
            input_ids_1,
            attention_mask_1,
            token_type_ids_1,
        )
        self.label = label

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        body = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return body + "\n"

class InputExample:
    """One raw example: an id, one or two texts and an optional label."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        body = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return body + "\n"

In [34]:
import os
from transformers.data.processors import DataProcessor


class SiameseProcessor(DataProcessor):
    """Builds sentence-pair examples from files read with the base class's ``_read_tsv``."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Tensor dictionaries are not handled by this processor; always returns None."""
        return None

    def get_train_examples(self, data_dir):
        """Return the examples read from ``train.csv`` inside ``data_dir``."""
        rows = self._read_tsv(os.path.join(data_dir, "train.csv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """Return the examples read from ``dev.csv`` inside ``data_dir``."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.csv"))
        return self._create_examples(rows, "dev")

    def get_labels(self):
        """Return the label strings in the order used by this processor."""
        return ["1", "0"]

    def _create_examples(self, lines, set_type):
        """Turn every row except the first into an InputExample.

        Columns 1 and 2 hold the two texts, the last column holds the label,
        and the guid is built from the split name and columns 1 to 3.
        """
        return [
            InputExample(
                guid="%s-%s-%s-%s" % (set_type, line[1], line[2], line[3]),
                text_a=line[1],
                text_b=line[2],
                label=line[-1],
            )
            for row_number, line in enumerate(lines)
            if row_number != 0
        ]

# fine tune细节
fine tune时学习率一般不是固定的：先线性上升(warmup)，再逐步衰减到零。所以写训练代码时注意要包含这个步骤。详细训练代码见/media/sdc/GanjinZero/bert-siamese/train.py
核心训练代码见下面，不和上面接着！这是DuReader一个QA任务的train代码，写的比较短就拿过来了。这个任务的完整代码见https://github.com/basketballandlearn/Dureader-Bert

In [None]:
import os
import args
import torch
import random
import pickle
from tqdm import tqdm
from torch import nn, optim
import evaluate
from dataset.dataloader import Dureader
from transformers import BertForQuestionAnswering, BertConfig, AdamW


def train():
    """Fine-tune ``BertForQuestionAnswering`` on DuReader and keep the best checkpoint.

    Hyper-parameters are read from the ``args`` module: ``device``,
    ``learning_rate``, ``num_train_optimization_steps``, ``num_train_epochs``,
    ``gradient_accumulation_steps`` and ``log_step``. The learning rate warms
    up linearly over the first 10% of the optimisation steps and then decays
    linearly. Whenever the dev loss improves, the model weights are written to
    ``./model_dir/best_model``.
    """
    # Imported locally so this function does not depend on edits to the cell's import list.
    from transformers import get_linear_schedule_with_warmup

    # Load the pretrained BERT weights.
    model = BertForQuestionAnswering.from_pretrained('/media/sdc/GanjinZero/pretraining_models/bert_wwm')
    device = args.device
    model.to(device)

    # Prepare the optimizer: the pooler parameters are left out, and biases and
    # LayerNorm parameters get no weight decay.
    named_params = [(name, param) for name, param in model.named_parameters() if 'pooler' not in name]
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    # The original called BertAdam, which is never imported here (NameError).
    # AdamW is imported by this cell; correct_bias=False follows the transformers
    # guidance for reproducing BertAdam, and the warmup/decay that BertAdam did
    # internally is handled by the scheduler below.
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    total_steps = args.num_train_optimization_steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )

    # Prepare the data.
    data = Dureader()
    train_dataloader, dev_dataloader = data.train_iter, data.dev_iter

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs('./model_dir/', exist_ok=True)

    best_loss = 100000.0
    model.train()
    for _ in range(args.num_train_epochs):
        for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch")):
            input_ids = batch.input_ids.to(device)
            input_mask = batch.input_mask.to(device)
            segment_ids = batch.segment_ids.to(device)
            start_positions = batch.start_position.to(device)
            end_positions = batch.end_position.to(device)

            # Compute the loss, scaled so accumulated gradients average over the micro-batches.
            loss, _, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, start_positions=start_positions, end_positions=end_positions)
            loss = loss / args.gradient_accumulation_steps
            loss.backward()

            # Apply the accumulated gradients and advance the learning-rate schedule.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # Periodic validation.
            if step % args.log_step == 4:
                eval_loss = evaluate.evaluate(model, dev_dataloader)
                if eval_loss < best_loss:
                    best_loss = eval_loss
                    torch.save(model.state_dict(), './model_dir/' + "best_model")
                # Return to training mode after every evaluation, not only after
                # an improvement (evaluate presumably switches the model to eval
                # mode; confirm against evaluate.py).
                model.train()
