运行环境：Python 3.8 + PyTorch 1.8 + pytorch_transformers

# 目的：利用PyTorch-Transformers框架实现遮蔽语言模型及句对预测模型
- 输入：token id 序列（`input_ids`）
- 输出：遮蔽语言模型的词表预测得分（`prediction_scores`）及句对预测的二分类得分（`seq_relationship_scores`）
- 主要步骤：
 1. 遮蔽语言模型
    1. 数据预处理：句子分词（中文分字）并转换为 id 序列
    2. 读取预训练模型
    3. 模型预测
 2. 句对预测模型
    1. 数据预处理：分词，id化，构造分段id
    2. 读取预训练模型
    3. 模型预测

## 1. 遮蔽语言模型

### a. 数据预处理

In [None]:
import torch
from pytorch_transformers import BertTokenizer

model_name = 'bert-base-chinese'  # name of the pretrained checkpoint used throughout

# BERT pretraining marks the start of the input with [CLS] and the end of each
# sentence with [SEP]; [MASK] is the slot the model is asked to fill in.
samples = ['[CLS] 中国的首都是哪里？ [SEP] 北京是 [MASK] 国的首都。 [SEP]']  # sentence pair fed to the model
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenized_text = [tokenizer.tokenize(sentence) for sentence in samples]

# Locate the masked position from the tokens instead of hard-coding it, so the
# index stays correct if the sample text is edited (it is 14 for the sample
# above). Raises ValueError if the sample contains no [MASK] token.
MASKED_LOC = tokenized_text[0].index('[MASK]')

input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_text]
input_ids = torch.LongTensor(input_ids)
input_ids

### b. 读取预训练模型

In [None]:
from pytorch_transformers import BertForMaskedLM

# Load the pretrained masked-LM model; cache_dir="./" keeps the downloaded
# files in the current directory.
model = BertForMaskedLM.from_pretrained(
    model_name,
    cache_dir="./",
)
# Switch to evaluation mode (inference behaviour for layers such as dropout).
# NOTE: eval() alone does not turn off gradient tracking.
model.eval()

### c. 模型预测

In [None]:
# Inference only: run the forward pass without building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids)
# The first element of the output holds the prediction scores; per the library
# docs they are shaped (batch, sequence_length, vocab_size).
prediction_scores = outputs[0]
prediction_scores.shape

In [None]:
# Pick the highest-scoring vocabulary entry at every position of the sample.

import numpy as np

sample = prediction_scores[0].detach().numpy()
pred = np.argmax(sample, axis=1)

# This lookup must be the cell's final expression so the notebook displays the
# token predicted for the [MASK] slot. `outputs` / `prediction_scores` from the
# forward-pass cell are reused as-is; the model is not run a second time here.
# tolist() hands the tokenizer plain Python ints rather than numpy scalars.
tokenizer.convert_ids_to_tokens(pred.tolist())[MASKED_LOC]

## 2. 句对预测

### a. 数据预处理：分词，id化，构造分段id

In [None]:
samples = ["[CLS]今天天气怎么样？[SEP]今天天气很好。[SEP]", "[CLS]小明今年几岁了？[SEP]小明爱吃西瓜。[SEP]"]
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenized_text = [tokenizer.tokenize(pair) for pair in samples]
input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_text]
# torch.LongTensor needs rectangular input; no padding is applied here, so the
# samples are expected to tokenize to the same length.
input_ids = torch.LongTensor(input_ids)

# Build the segment (token type) ids from the tokens themselves rather than by
# hand: everything up to and including the first [SEP] belongs to sentence A
# (id 0), the remainder to sentence B (id 1). For the two samples above this
# yields ten 0s followed by eight 1s each.
segments_ids = []
for tokens in tokenized_text:
    first_sentence_len = tokens.index('[SEP]') + 1
    segments_ids.append(
        [0] * first_sentence_len + [1] * (len(tokens) - first_sentence_len)
    )

segments_tensors = torch.tensor(segments_ids)
segments_tensors

### b. 读取预训练模型

In [None]:
from pytorch_transformers import BertForNextSentencePrediction

# Load the pretrained next-sentence-prediction model; cache_dir="./" keeps the
# downloaded files in the current directory.
model = BertForNextSentencePrediction.from_pretrained(model_name, cache_dir="./")
# Switch to evaluation mode (inference behaviour for layers such as dropout).
# NOTE: eval() alone does not freeze parameters or turn off gradient tracking.
model.eval()

### c. 模型预测

In [None]:
# Inference only: run the forward pass without building the autograd graph.
# The segment ids built during preprocessing are passed as token_type_ids so
# the model can tell sentence A from sentence B; without them every token is
# treated as belonging to segment 0.
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=segments_tensors)
# The first element of the output holds the sentence-relationship scores,
# one pair of scores per input sample.
seq_relationship_scores = outputs[0]
seq_relationship_scores

In [None]:
# Turn the two scores per sample into a binary prediction by taking the index
# of the larger one. Per the library's documentation of next_sentence_label,
# 0 means sentence B continues sentence A and 1 means B is a random sentence.
sample = seq_relationship_scores.detach().numpy()
pred = sample.argmax(axis=1)
pred