## 滑动窗口

In [2]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments

In [24]:
# Load the CMRC2018 dataset, caching the downloaded files under ./cmrc.
datasets = load_dataset("cmrc2018",cache_dir= "./cmrc")
# Peek at the answers of the first training example. Index the row first and
# the column second: `["answers"][0]` would materialise the entire answers
# column just to show one entry, while `[0]["answers"]` reads a single row.
datasets["train"][0]["answers"]

{'text': ['1963年'], 'answer_start': [30]}

In [6]:
# Load the tokenizer that belongs to the hfl/chinese-macbert-base checkpoint.
# The recorded output shows a BertTokenizerFast (is_fast=True); process_func
# below asks this tokenizer for offset mappings and overflowing tokens.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
# Display the tokenizer configuration.
tokenizer

BertTokenizerFast(name_or_path='hfl/chinese-macbert-base', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [9]:
def process_func(example):
    """Tokenize a batch of QA examples into sliding-window features.

    Every (question, context) pair is encoded to exactly 384 tokens. Only the
    context is truncated; a context that does not fit is split into several
    overlapping windows (stride 128), so one example may produce several
    features.

    Args:
        example: A batch of examples (the function indexes every column by
            position) with the columns ``"id"``, ``"question"``,
            ``"context"`` and ``"answers"``. Each answer is a dict holding
            ``"text"`` and ``"answer_start"`` lists; only their first entry
            is used for labelling.

    Returns:
        The tokenizer output, extended with:

        * ``"example_ids"`` - id of the source example of every feature.
        * ``"start_positions"`` / ``"end_positions"`` - token indices of the
          answer inside the feature, or ``0`` / ``0`` when the window does
          not contain any complete answer token.
        * ``"offset_mapping"`` - rewritten so that every token outside the
          context maps to ``None``.
    """
    tokenized_example = tokenizer(
        text=example["question"],
        text_pair=example["context"],
        padding="max_length",
        return_offsets_mapping=True,
        max_length=384,
        truncation="only_second",
        return_overflowing_tokens=True,
        stride=128,
    )

    # For each produced feature, the index of the example it was cut from.
    sample_mapping = tokenized_example.pop("overflow_to_sample_mapping")
    start_position = []
    end_position = []
    example_ids = []

    for idx, sample_idx in enumerate(sample_mapping):
        answer = example["answers"][sample_idx]
        start_char = answer["answer_start"][0]
        # Exclusive end: index of the first character after the answer.
        end_char = start_char + len(answer["text"][0])

        # Computed once per feature and reused below for masking the offsets.
        sequence_ids = tokenized_example.sequence_ids(idx)
        context_start = sequence_ids.index(1)
        context_end = sequence_ids.index(None, context_start) - 1

        offset = tokenized_example["offset_mapping"][idx]

        # Token offsets are half-open, and so is [start_char, end_char). A
        # window that merely touches the answer (it ends where the answer
        # starts, or starts where the answer ends) does not overlap it, hence
        # the inclusive comparisons.
        if offset[context_start][0] >= end_char or offset[context_end][1] <= start_char:
            start_token_pos = 0
            end_token_pos = 0
        else:
            # First context token that starts at or after the answer start.
            token_idx = context_start
            while token_idx <= context_end and offset[token_idx][0] < start_char:
                token_idx += 1
            start_token_pos = token_idx
            # Last context token that ends at or before the answer end.
            token_idx = context_end
            while token_idx >= context_start and offset[token_idx][1] > end_char:
                token_idx -= 1
            end_token_pos = token_idx
            # No token lies completely inside the answer span (for example
            # the answer boundary falls in the middle of a token). The two
            # scans then cross and would yield an inverted span that may
            # point at a special token, so label the feature as "no answer".
            if start_token_pos > end_token_pos:
                start_token_pos = 0
                end_token_pos = 0

        start_position.append(start_token_pos)
        end_position.append(end_token_pos)
        example_ids.append(example["id"][sample_idx])
        # Keep offsets only for context tokens so that post-processing can
        # recognise (and skip) question, special and padding tokens.
        tokenized_example["offset_mapping"][idx] = [
            (span if sequence_ids[k] == 1 else None)
            for k, span in enumerate(offset)
        ]

    tokenized_example["example_ids"] = example_ids
    tokenized_example["start_positions"] = start_position
    tokenized_example["end_positions"] = end_position

    return tokenized_example
            

In [10]:
# Apply process_func to every split in batches. The original columns are
# removed because the sliding window changes the number of rows (the recorded
# output shows 10142 training examples becoming 19189 features).
tokenized_datasets = datasets.map(process_func,batched= True, remove_columns= datasets["train"].column_names)
# Display the resulting splits and their feature columns.
tokenized_datasets

Map: 100%|██████████| 10142/10142 [00:49<00:00, 206.56 examples/s]
Map: 100%|██████████| 3219/3219 [00:14<00:00, 221.46 examples/s]
Map: 100%|██████████| 1002/1002 [00:08<00:00, 113.63 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_ids', 'start_positions', 'end_positions'],
        num_rows: 19189
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_ids', 'start_positions', 'end_positions'],
        num_rows: 6327
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_ids', 'start_positions', 'end_positions'],
        num_rows: 1988
    })
})

## 预测

In [16]:
import numpy as np
import collections

def get_result(start_logits, end_logits, examples, features, n_best=20, max_answer_length=30):
    """Turn per-feature start/end logits into one answer string per example.

    For every example, all of its features are scanned. The ``n_best``
    highest-scoring start and end tokens of each feature are paired, invalid
    pairs are dropped, and the text of the pair with the highest
    ``start_logit + end_logit`` becomes the prediction.

    Args:
        start_logits: Indexable by feature index; each item holds the start
            scores of every token of that feature.
        end_logits: Same layout as ``start_logits`` for the end scores.
        examples: Iterable of examples providing ``"id"``, ``"context"`` and
            ``"answers"`` (with a ``"text"`` entry).
        features: Container supporting ``features["example_ids"]`` and
            ``features[i]["offset_mapping"]``. Offsets are ``None`` for tokens
            that cannot be part of an answer.
        n_best: Number of top start and top end candidates paired per feature.
        max_answer_length: Maximum answer length, counted in tokens.

    Returns:
        ``(predictions, references)``: two dicts keyed by example id. The
        prediction is ``""`` when no valid candidate exists; the reference is
        the example's ``answers["text"]``.
    """
    predictions = {}
    references = {}

    # An example may have been split into several features.
    example_to_feature = collections.defaultdict(list)
    for idx, example_id in enumerate(features["example_ids"]):
        example_to_feature[example_id].append(idx)

    for example in examples:
        example_id = example["id"]
        context = example["context"]
        # Best-answer candidates collected over all features of this example.
        answers = []
        for feature_id in example_to_feature[example_id]:
            start_logit = start_logits[feature_id]
            end_logit = end_logits[feature_id]
            offset = features[feature_id]["offset_mapping"]
            # Indices of the highest-scoring tokens, best first.
            start_indexes = np.argsort(start_logit)[::-1][:n_best]
            end_indexes = np.argsort(end_logit)[::-1][:n_best]
            for start_index in start_indexes:
                # Tokens without an offset lie outside the context.
                if offset[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if offset[end_index] is None:
                        continue
                    # Reject reversed spans and spans that are too long.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    answers.append(
                        {
                            "text": context[offset[start_index][0]: offset[end_index][1]],
                            "score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        if answers:
            best_answer = max(answers, key=lambda candidate: candidate["score"])
            predictions[example_id] = best_answer["text"]
        else:
            predictions[example_id] = ""

        references[example_id] = example["answers"]["text"]

    return predictions, references

## 评估

In [17]:
from cmrc_eval import evaluate_cmrc

def metric(pred):
    """Compute the CMRC evaluation result for a set of predictions.

    The first element of ``pred`` is unpacked into start and end logits. The
    split is chosen by row count: if the number of rows equals the number of
    validation features, the validation split is used, otherwise the test
    split.

    Returns:
        Whatever ``evaluate_cmrc`` returns for the decoded predictions and
        references.
    """
    start_logit, end_logit = pred[0]
    is_validation = start_logit.shape[0] == len(tokenized_datasets["validation"])
    split = "validation" if is_validation else "test"
    p, r = get_result(start_logit, end_logit, datasets[split], tokenized_datasets[split])
    return evaluate_cmrc(p, r)

In [13]:
# Load the pretrained encoder with a question-answering head. According to
# the recorded warning, the head weights (qa_outputs.*) are newly initialised
# and therefore have to be trained before the model gives useful predictions.
model = AutoModelForQuestionAnswering.from_pretrained("hfl/chinese-macbert-base")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Training configuration passed to the Trainer below.
args = TrainingArguments(
    # Directory that receives checkpoints and other training outputs.
    output_dir= "model_for_qa",
    per_device_train_batch_size= 32,
    per_device_eval_batch_size= 32,
    # Evaluate on a step schedule, every 20 steps.
    eval_strategy= "steps",
    eval_steps= 20,
    # Checkpoints follow an epoch schedule.
    save_strategy= "epoch",
    # Logging interval, in steps.
    logging_steps= 50,
    num_train_epochs=1
)

In [27]:
from transformers import DefaultDataCollator
# Assemble the Trainer: train on the tokenized training features, evaluate on
# the tokenized validation features and score predictions with `metric`.
trainer = Trainer(
    model= model,
    args= args,
    train_dataset= tokenized_datasets["train"],
    eval_dataset= tokenized_datasets["validation"],
    # Features are already padded to a fixed length by process_func.
    data_collator= DefaultDataCollator(),
    compute_metrics= metric
)

In [28]:
# Start fine-tuning.
# NOTE(review): the recorded run of this cell aborted with an NLTK LookupError
# because the `punkt_tab` resource was missing - presumably raised while the
# metric was being computed (TODO confirm). The error message itself suggests
# running `nltk.download('punkt_tab')` once before this cell.
trainer.train()

Step,Training Loss,Validation Loss


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\32721/nltk_data'
    - 'c:\\Users\\32721\\anaconda3\\envs\\transformers\\nltk_data'
    - 'c:\\Users\\32721\\anaconda3\\envs\\transformers\\share\\nltk_data'
    - 'c:\\Users\\32721\\anaconda3\\envs\\transformers\\lib\\nltk_data'
    - 'C:\\Users\\32721\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [29]:
from transformers import pipeline

# Wrap the model and tokenizer in a question-answering pipeline for ad-hoc
# inference.
# NOTE(review): the recorded training cell above ended with an error, so the
# model used here may not be fully fine-tuned - confirm before trusting output.
pipe = pipeline("question-answering", model = model , tokenizer= tokenizer)

# Display the pipeline object.
pipe

Device set to use cpu


<transformers.pipelines.question_answering.QuestionAnsweringPipeline at 0x2aa3075a130>

In [30]:
# Smoke test: ask one question about a one-sentence context. The result is a
# dict with the answer text, its character span and a score.
pipe(question= "小明在哪里上班？", context= "小明在北京上班。")

{'score': 0.15639106929302216, 'start': 3, 'end': 8, 'answer': '北京上班。'}