# 2020语言与智能技术竞赛：机器阅读理解任务
https://aistudio.baidu.com/aistudio/competition/detail/28


平台提供的数据为JSON文件格式，样例如下:

    {
        "data": [
            {
                "paragraphs": [
                    {
                        "qas": [
                            {
                                "question": "非洲气候带", 
                                "id": "bd664cb57a602ae784ae24364a602674", 
                                "answers": [
                                    {
                                        "text": "热带气候", 
                                        "answer_start": 45
                                    }
                                ]
                            }
                        ], 
                        "context": "1、全年气温高，有热带大陆之称。主要原因在与赤道穿过大陆中部，位于南北纬30度之间，主要是热带气候，没有温带和寒带… "
                    }, 
                    {
                        "qas": [
                            {
                                "question": "韩国全称", 
                                "id": "a7eec8cf0c55077e667e0d85b45a6b34", 
                                "answers": [
                                    {
                                        "text": "大韩民国", 
                                        "answer_start": 5
                                    }
                                ]
                            }
                        ], 
                        "context": "韩国全称“大韩民国”，位于朝鲜半岛南部，隔“三八线”与朝鲜民主主义人民共和国相邻，面积9.93万平方公理… "
                    }
                ], 
                "title": ""
            }
        ]
    }





* 百度LIC2020的机器阅读理解赛道，非官方baseline
* 直接用RoBERTa+Softmax预测首尾
* BASE模型在第一期测试集上能达到0.69的F1，优于官方baseline
* 如果你显存足够，可以换用RoBERTa Large模型，F1可以到0.71

In [1]:
!pip install --upgrade pip
# !pip install bert4keras

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/5a/4a/39400ff9b36e719bdf8f31c99fe1fa7842a42fa77432e584f707a5080063/pip-20.2.2-py2.py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 15.1MB/s eta 0:00:01[K     |▍                               | 20kB 2.2MB/s eta 0:00:01[K     |▋                               | 30kB 2.8MB/s eta 0:00:01[K     |▉                               | 40kB 3.1MB/s eta 0:00:01[K     |█                               | 51kB 2.5MB/s eta 0:00:01[K     |█▎                              | 61kB 2.8MB/s eta 0:00:01[K     |█▌                              | 71kB 3.0MB/s eta 0:00:01[K     |█▊                              | 81kB 3.4MB/s eta 0:00:01[K     |██                              | 92kB 3.6MB/s eta 0:00:01[K     |██▏                             | 102kB 3.4MB/s eta 0:00:01[K     |██▍                             | 112kB 3.4MB/s eta 0:00:01[K     |██▋                             | 122kB 3.4MB/s eta 0:00:

In [2]:
# !nvidia-smi

In [3]:
!pip install bert4keras==0.7.7

Collecting bert4keras==0.7.7
  Downloading bert4keras-0.7.7.tar.gz (37 kB)
Building wheels for collected packages: bert4keras
  Building wheel for bert4keras (setup.py) ... [?25l[?25hdone
  Created wheel for bert4keras: filename=bert4keras-0.7.7-py3-none-any.whl size=36805 sha256=270222c573212aaf95c14908066493a2edfa775469017981f2ff0bc70a0d582b
  Stored in directory: /root/.cache/pip/wheels/fe/44/ad/947f4210d1d87fac2d67621c954ed556ecbd85cb374e346d4f
Successfully built bert4keras
Installing collected packages: bert4keras
Successfully installed bert4keras-0.7.7


In [4]:
import json, os
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Layer, Dense, Permute
from keras.models import Model
from tqdm import tqdm

# Basic hyperparameters
maxlen = 512  # maximum token length passed to tokenizer.encode for question + context
epochs = 20
batch_size = 4
learning_rate = 2e-5
# Misspelled alias kept because the compile cell still refers to `learing_rate`.
learing_rate = learning_rate

# 下载模型（挂载 Google Drive，以访问其中已下载的模型与数据）

In [5]:
# Colab-only step: mount Google Drive at /content/drive, the root used by the
# data and model paths defined in later cells.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## 解压（查看数据目录内容）

In [6]:
!ls /content/drive/"My Drive"/"Colab Notebooks"/"Machine Reading Comprehension"/data

demo	  dev.json.pred.json  evluation_utils.py  __pycache__  train.json
dev.json  evaluate.py	      License.docx	  README.md


## 设置数据路径

In [7]:
# Dataset root on the mounted Drive; model checkpoints are saved to its output/ subfolder.
data_dir = "/content/drive/My Drive/Colab Notebooks/Machine Reading Comprehension/data"
output_dir = data_dir + '/output/'

# 模型路径

### bert 预训练模型：https://github.com/google-research/bert#pre-trained-models

In [8]:
# Folder holding the pre-trained checkpoint; the three files below are read from it.
bert_dir = '/content/drive/My Drive/Colab Notebooks/Machine Reading Comprehension/baseline_model'
config_path, checkpoint_path, dict_path = (
    '{}/{}'.format(bert_dir, file_name)
    for file_name in ('bert_config.json', 'bert_model.ckpt', 'vocab.txt')
)

# 加载数据

In [9]:
def load_data(filename):
    """Read a SQuAD-style JSON file into a flat list of question samples.

    Args:
        filename: path to a JSON file shaped like
            {"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}.

    Returns:
        A list with one [id, context, question, answers] entry per question,
        where answers is a list of answer texts (empty when the question has
        no "answers" key).
    """
    # Explicit UTF-8 and a context manager: the text is Chinese and the handle
    # must be closed once the JSON has been parsed.
    with open(filename, encoding='utf-8') as f:
        articles = json.load(f)['data']
    D = []
    # Walk every article rather than only data[0], so files holding several
    # articles are not silently truncated.
    for article in articles:
        for d in article['paragraphs']:
            for qa in d['qas']:
                D.append([
                    qa['id'], d['context'], qa['question'],
                    [a['text'] for a in qa.get('answers', [])]
                ])
    return D

# 读取数据

In [10]:
# Load the training samples from the demo split; the commented-out path is
# presumably the full training set (train.json appears in the data folder listing).
train_data = load_data(
    # os.path.join(data_dir,'train.json')
    os.path.join(data_dir,'demo/demo_train.json')
)

# 建立分词器

In [12]:
# Tokenizer built from the vocabulary file at dict_path, with lower-casing enabled.
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# 子串搜索

In [13]:
def search(pattern, sequence):
    """Locate `pattern` as a contiguous slice of `sequence`.

    Returns the index of the first position whose slice of len(pattern)
    elements equals `pattern`, or -1 when no position matches.
    """
    width = len(pattern)
    matches = (
        pos for pos in range(len(sequence))
        if sequence[pos:pos + width] == pattern
    )
    return next(matches, -1)

# 数据生成器

In [14]:
class data_generator(DataGenerator):
    """Yield padded training batches of ([token_ids, segment_ids], labels).

    Each sample is encoded as question + context (capped at maxlen tokens), one
    of its answers is picked at random, and the answer's token span is located
    in the encoded sequence with search(). labels holds [[start], [end]] token
    indices. Samples with no answers, or whose answer tokens cannot be found in
    the encoded sequence, contribute nothing to the batch.
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            context, question, answers = item[1:]
            token_ids, segment_ids = tokenizer.encode(
                question, context, max_length=maxlen
            )
            start_index = -1
            # np.random.choice raises on an empty list, so unanswered samples
            # are treated like "answer not found" instead of crashing.
            if len(answers) > 0:
                a = np.random.choice(answers)
                a_token_ids = tokenizer.encode(a)[0][1:-1]
                # An empty token list would trivially "match" at index 0.
                if a_token_ids:
                    start_index = search(a_token_ids, token_ids)
            if start_index != -1:
                labels = [[start_index], [start_index + len(a_token_ids) - 1]]
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append(labels)
            # The flush check must run for skipped samples too: otherwise a
            # skipped final sample would drop the pending partial batch.
            if batch_token_ids and (
                len(batch_token_ids) == self.batch_size or is_end
            ):
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

# Mask

In [15]:
class MaskedSoftmax(Layer):
    """Softmax over axis 1 (the sequence axis) that suppresses masked positions.

    When a mask is supplied, 1e12 is subtracted from every masked-out position
    before the softmax so those positions receive (near) zero probability.
    """
    def compute_mask(self, inputs, mask=None):
        # This layer returns no mask of its own.
        return None

    def call(self, inputs, mask=None):
        if mask is None:
            return K.softmax(inputs, 1)
        # Cast the mask to float and add a trailing axis so it broadcasts
        # against the inputs.
        keep = K.expand_dims(K.cast(mask, K.floatx()), 2)
        masked_inputs = inputs - (1.0 - keep) * 1e12
        return K.softmax(masked_inputs, 1)

# 构建模型

In [16]:
# Load the pre-trained encoder, then stack the span-prediction head on top of it:
# Dense(2) gives two scores per token, MaskedSoftmax normalises them along the
# sequence axis, and Permute swaps the last two axes of the result.
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
)

output = model.output
for head_layer in (Dense(2), MaskedSoftmax(), Permute((2, 1))):
    output = head_layer(output)

model = Model(model.input, output)

In [17]:
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     (None, None, 1024)   21635072    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 1024)   2048        Input-Segment[0][0]              
_______________________________________________________________________________________

# 损失函数与评估指标

In [18]:
def sparse_categorical_crossentropy(y_true, y_pred):
    """Cross-entropy between integer targets and predicted distributions.

    y_true is reshaped to y_pred's shape minus its last axis, cast to int32 and
    one-hot encoded over the size of y_pred's third axis; the mean categorical
    cross-entropy against y_pred is returned.
    """
    # Keras may hand y_true over with a different shape/dtype, so pin both down.
    index_shape = K.shape(y_pred)[:-1]
    num_classes = K.shape(y_pred)[2]
    indices = K.cast(K.reshape(y_true, index_shape), 'int32')
    one_hot_targets = K.one_hot(indices, num_classes)
    per_item_loss = K.categorical_crossentropy(one_hot_targets, y_pred)
    return K.mean(per_item_loss)


def sparse_accuracy(y_true, y_pred):
    """Fraction of positions where argmax(y_pred, axis=2) equals the target index.

    y_true is reshaped to y_pred's shape minus its last axis and cast to int32
    before the comparison.
    """
    # Keras may hand y_true over with a different shape/dtype, so pin both down.
    expected = K.cast(K.reshape(y_true, K.shape(y_pred)[:-1]), 'int32')
    predicted = K.cast(K.argmax(y_pred, axis=2), 'int32')
    hits = K.cast(K.equal(expected, predicted), K.floatx())
    return K.mean(hits)

# 编译模型

In [19]:
# Compile with the notebook's own loss and accuracy functions and an Adam
# optimizer at the configured learning rate.
model.compile(
    optimizer=Adam(learing_rate),
    loss=sparse_categorical_crossentropy,
    metrics=[sparse_accuracy],
)

# 答案抽取

In [20]:
def extract_answer(question, context, max_a_len=16):
    """Predict the answer to `question` as a substring of `context`.

    Args:
        question: question text; its encoding is capped at 64 tokens.
        context: passage text; its encoding is truncated so that question and
            context together stay within maxlen tokens.
        max_a_len: maximum answer length, counted in tokens.

    Returns:
        The slice of `context` covered by the (start, end) token pair with the
        highest p_start * p_end, or '' when there is no candidate pair (for
        example when the context yields no tokens).
    """
    max_q_len = 64
    q_token_ids = tokenizer.encode(question, max_length=max_q_len)[0]
    c_token_ids = tokenizer.encode(
        context, max_length=maxlen - len(q_token_ids) + 1
    )[0]
    # The first token of the context encoding is dropped (presumably its
    # start-of-sequence marker) because the question encoding already has one.
    token_ids = q_token_ids + c_token_ids[1:]
    segment_ids = [0] * len(q_token_ids) + [1] * (len(c_token_ids) - 1)
    c_tokens = tokenizer.tokenize(context)[1:-1]
    # mapping[i] lists the character positions of context token i.
    mapping = tokenizer.rematch(context, c_tokens)
    probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
    # Keep only the context positions: skip the question and the final token.
    probas = probas[:, len(q_token_ids):-1]
    start_probas, end_probas = probas[0], probas[1]
    n_tokens = len(start_probas)
    best_span, best_score = None, -1
    for start in range(n_tokens):
        p_start = start_probas[start]
        # Only ends in [start, start + max_a_len) are valid, so scan just those
        # instead of every end position.
        for end in range(start, min(start + max_a_len, n_tokens)):
            score = p_start * end_probas[end]
            if score > best_score:
                best_span, best_score = (start, end), score
    if best_span is None:
        # No candidate span exists; return an empty answer instead of crashing.
        return ''
    start, end = best_span
    return context[mapping[start][0]:mapping[end][-1] + 1]

# 预测文件生成

In [21]:
def predict_to_file(infile, out_file):
    """Predict an answer for every question in `infile` and save them as JSON.

    Args:
        infile: path of the dataset file, read with load_data.
        out_file: path of the JSON file to write; it maps question id to the
            predicted answer text (UTF-8, non-ASCII kept, 4-space indent).
    """
    R = {}
    for d in tqdm(load_data(infile)):
        R[d[0]] = extract_answer(d[2], d[1])
    # Open the output only after every prediction succeeded, so a failure does
    # not leave a truncated file, and let the context manager close it.
    with open(out_file, 'w', encoding='utf-8') as fw:
        fw.write(json.dumps(R, ensure_ascii=False, indent=4))

# 官方评估函数

In [24]:
import sys
import io
import json
sys.path.append(data_dir)
from evluation_utils import evaluate as src_evaluate
from collections import OrderedDict

In [25]:
# Reference (gold answer) file used for evaluation: the demo dev split.
file_name_ref_ans = os.path.join(data_dir,'demo/demo_dev.json')

In [31]:
# Flat list of [id, context, question, answers] samples from the reference file.
dev_data = load_data(file_name_ref_ans)

In [32]:
len(dev_data)

100

In [29]:
# Predictions are written next to the reference file as <name>_pred.json.
# The name is derived from file_name_ref_ans so the two paths cannot drift
# apart, and splitext removes only the extension instead of every '.json'.
file_name_pred_ans = os.path.splitext(file_name_ref_ans)[0] + '_pred.json'

In [30]:
file_name_pred_ans

'/content/drive/My Drive/Colab Notebooks/Machine Reading Comprehension/data/demo/demo_dev_pred.json'

In [39]:
def evaluate(file_name_ref_ans, file_name_pred_ans=None):
    """Predict answers for a reference file and score them with the official script.

    Args:
        file_name_ref_ans: path of the reference dataset JSON.
        file_name_pred_ans: path where predictions are written and read back.
            Defaults to file_name_ref_ans + '.pred.json' when omitted.

    Returns:
        An OrderedDict with 'F1' and 'EM' formatted as strings with three
        decimals, plus the 'TOTAL' and 'SKIP' values returned by the official
        evaluate function.
    """
    if file_name_pred_ans is None:
        file_name_pred_ans = file_name_ref_ans + '.pred.json'
    predict_to_file(file_name_ref_ans, file_name_pred_ans)
    # Read both files as UTF-8 (predictions are written as UTF-8) and close the
    # handles as soon as the JSON has been parsed.
    with io.open(file_name_ref_ans, encoding='utf-8') as ref_file:
        ref_ans = json.load(ref_file)
    with io.open(file_name_pred_ans, encoding='utf-8') as pred_file:
        pred_ans = json.load(pred_file)
    F1, EM, TOTAL, SKIP = src_evaluate(ref_ans, pred_ans)
    output_result = OrderedDict()
    output_result['F1'] = '%.3f' % F1
    output_result['EM'] = '%.3f' % EM
    output_result['TOTAL'] = TOTAL
    output_result['SKIP'] = SKIP
    return output_result


class Evaluator(keras.callbacks.Callback):
    """Evaluate after each epoch and save the model whenever F1 matches or beats the best so far.

    Uses the module-level `model`, `file_name_ref_ans`, `file_name_pred_ans`
    and `output_dir`.
    """
    def __init__(self):
        # Let the Keras base class initialise its own state first.
        super().__init__()
        self.best_val_f1 = 0.

    def on_epoch_end(self, epoch, logs=None):
        metrics = evaluate(file_name_ref_ans,file_name_pred_ans)
        # evaluate() reports F1 as a formatted string, so convert it once.
        current_f1 = float(metrics['F1'])
        if current_f1 >= self.best_val_f1:
            self.best_val_f1 = current_f1
            # Create the checkpoint folder on demand so saving cannot fail on a
            # missing directory.
            os.makedirs(output_dir, exist_ok=True)
            model.save_weights(os.path.join(output_dir,'roberta_best_model.weights'))
            model.save(os.path.join(output_dir,'roberta_best_model.h5'))
        metrics['BEST_F1'] = self.best_val_f1
        print(metrics)

# 获取数据

In [40]:
# Batch generator over the training samples, and the per-epoch evaluation callback.
train_generator = data_generator(train_data, batch_size)
evaluator = Evaluator()

In [41]:
# Pull a single batch for inspection; these names are not referenced by the
# later cells shown in this notebook.
[batch_token_ids, batch_segment_ids], batch_labels = next(iter(train_generator))

# 模型训练

In [42]:
# NOTE: this rebinds `epochs`, overriding the value from the configuration cell,
# so only a single epoch is trained here.
epochs=1
# The evaluator callback runs after the epoch and saves the best model.
model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    callbacks=[evaluator]
)



100%|██████████| 100/100 [05:57<00:00,  3.57s/it]


OrderedDict([('F1', '53.141'), ('EM', '39.000'), ('TOTAL', 100), ('SKIP', 0), ('BEST_F1', 53.141)])


<tensorflow.python.keras.callbacks.History at 0x7f1d70ecf780>

# 加载最优模型

In [None]:
from keras.models import load_model
# Register every custom object the saved model was compiled with, including the
# notebook's own loss; otherwise the name 'sparse_categorical_crossentropy' is
# presumably resolved to the Keras built-in of the same name.
model = load_model(
    os.path.join(output_dir, 'roberta_best_model.h5'),
    custom_objects={
        'MaskedSoftmax': MaskedSoftmax,
        'sparse_accuracy': sparse_accuracy,
        'sparse_categorical_crossentropy': sparse_categorical_crossentropy,
    },
)
# evaluate() needs the reference file and a path for the predictions it writes.
dev_file = os.path.join(data_dir, 'dev.json')
print(evaluate(dev_file, dev_file + '.pred.json'))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
100%|██████████| 1417/1417 [00:46<00:00, 30.51it/s]

OrderedDict([('F1', '74.491'), ('EM', '63.232'), ('TOTAL', 1417), ('SKIP', 0)])





# 样例文章问题答案预测

In [113]:
# Pick one dev sample for a manual check; the index 89 is an arbitrary example choice.
test_data = dev_data[89]

In [114]:
# Field 1 of a load_data sample is the passage text.
context = test_data[1]

In [115]:
# Field 2 of a load_data sample is the question text.
question = test_data[2]

In [116]:
# Field 3 of a load_data sample is the list of reference answer texts.
real_answer = test_data[3]

In [117]:
# Run the current model on this single question/passage pair.
pred_answer = extract_answer(question,context)

In [118]:
context

'一般纳税人开具普通发票与开具增值税专用发票,不管是开给的是小规模纳税人还是一般纳税人,都是按17%计算交税的。|一般纳税人是指年应征增值税销售额(以下简称年应税销售额,包括一个公历年度内的全部应税销售额)超过财政部规定的小规模纳税人标准的企业和企业性单位。一般纳税人的特点是增值税进项税额可以抵扣销项税额。|会计从业 税务办税指南会计从业 税务知识科普会计从业 税票知识科普会计从业 税务行业问答会计从业 税务发票查询会计从业 税务网上申报'

In [119]:
question

'普通发票税点是多少'

In [120]:
real_answer

['17%']

In [121]:
pred_answer

'17%'