# Fine Tune BERT For Question And Answering on SQUAD
Author: Nelson LIN (nelsonlin0321@outlook.com)

In [1]:
# import libraries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

In [2]:
import os
import json
import random
import pandas as pd
from sklearn.utils import shuffle
from collections import Counter

In [3]:
# !pip install transformers

In [4]:
from transformers import AutoTokenizer
from transformers import default_data_collator

In [5]:
device = "cuda" if cuda.is_available() else "cpu"

In [6]:
model_name =  "distilbert-base-uncased"

## 1) Import Data



In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
squad_v2_dir = "/content/drive/My Drive/Colab Notebooks/Data/SQUAD2" # data folder

In [9]:
os.listdir(squad_v2_dir)

['train-v2.0.json', 'dev-v2.0.json', 'evaluate.py', '__pycache__']

In [10]:
include_impossible = False
load_impossible_answer = False

In [11]:
# download from https://rajpurkar.github.io/SQuAD-explorer/
train_data_path = os.path.join(squad_v2_dir,"train-v2.0.json")
dev_data_path = os.path.join(squad_v2_dir,'dev-v2.0.json')

In [12]:
def read_json(file_path):
    """Load a SQuAD-style JSON file and return its top-level ``data`` entry.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value stored under the ``"data"`` key of the parsed document.

    Raises:
        KeyError: If the parsed document has no ``"data"`` key.
    """
    # JSON is UTF-8 by specification; relying on the platform default
    # encoding (e.g. cp1252 on Windows) fails on non-ASCII passages.
    with open(file_path, encoding="utf-8") as f:
        json_f = json.load(f)
    return json_f['data']


In [13]:
def get_random_index(List):
    """Return one randomly drawn valid index into ``List``.

    Raises ``ValueError`` when ``List`` is empty.
    """
    size = len(List)
    return random.randrange(size)

In [14]:
def load_data(data_path, load_impossible_answer = False):
    """Flatten a SQuAD JSON file into parallel, column-oriented lists.

    One row is produced per question. When a question carries several
    annotated answers, a single one is picked at random (via
    ``get_random_index``), so repeated calls may return different answers.

    Args:
        data_path: Path to the SQuAD JSON file (read with ``read_json``).
        load_impossible_answer: If True, questions flagged ``is_impossible``
            take their answer from ``plausible_answers``; otherwise they get
            the placeholder answer ``""`` with start ``-1``.

    Returns:
        dict mapping ``id``, ``title``, ``context``, ``question``,
        ``answer_text``, ``answer_start`` and ``is_impossible`` to lists of
        equal length (one entry per question).
    """

    def pick_answer(answer_list):
        # An empty (or missing) annotation list cannot be sampled from;
        # fall back to the same placeholder used for unanswerable questions.
        if not answer_list:
            return "", -1
        chosen = answer_list[get_random_index(answer_list)]
        return chosen['text'], chosen['answer_start']

    data = read_json(data_path)

    column_names = ('id', 'title', 'context', 'question',
                    'answer_text', 'answer_start', 'is_impossible')
    data_dict = {name: [] for name in column_names}

    for article in data:
        title = article['title']

        for paragraph in article['paragraphs']:
            context = paragraph['context']

            for qas in paragraph['qas']:
                # SQuAD v1.1 files have no 'is_impossible' flag: every
                # question there is answerable.
                is_impossible = qas.get('is_impossible', False)

                if not is_impossible:
                    answer_text, answer_start = pick_answer(qas['answers'])
                elif load_impossible_answer:
                    answer_text, answer_start = pick_answer(qas.get('plausible_answers', []))
                else:
                    answer_text, answer_start = "", -1

                data_dict['id'].append(qas['id'])
                data_dict['title'].append(title)
                data_dict['context'].append(context)
                data_dict['question'].append(qas['question'])
                data_dict['answer_text'].append(answer_text)
                data_dict['answer_start'].append(answer_start)
                data_dict['is_impossible'].append(is_impossible)

    return data_dict





In [15]:
sample_size = 8000
# number of shuffled training examples to keep; the dev set is also capped at this size below

In [16]:
train_data_dict = load_data(train_data_path,load_impossible_answer=False)
dev_data_dict = load_data(dev_data_path,load_impossible_answer=False)

In [17]:
train_data_df = pd.DataFrame(train_data_dict)
dev_data_df = pd.DataFrame(dev_data_dict)

In [18]:
if not include_impossible:
    train_data_df = train_data_df[train_data_df['is_impossible']==False]
    dev_data_df = dev_data_df[dev_data_df['is_impossible']==False]

train_data_df = shuffle(train_data_df).head(sample_size)
dev_data_df = dev_data_df.head(sample_size)

In [19]:
train_data_df.head()

Unnamed: 0,id,title,context,question,answer_text,answer_start,is_impossible
7819,56dc7e6714d3a41400c26922,Comprehensive_school,Scotland has a very different educational syst...,What has Scotland refused to adopt?,specialist schools,345,False
55195,5726420638643c19005ad3a1,Mammal,To maintain a high constant body temperature i...,What do the majority of mammals under 18 oz eat?,insects,1674,False
103995,572eb7eddfa6aa1500f8d309,Spanish_language_in_the_United_States,"Until the 20th century, there was no clear rec...",When did the Venezuelans emigrate to the unite...,"Until the 20th century, there was no clear rec...",0,False
104451,5730462ba23a5019007fd047,"Charleston,_South_Carolina",As many as five bands were on tour during the ...,When did Gershwin and Heyward write their folk...,summer of 1934,722,False
38964,570c347e6b8089140040fc36,Federal_Bureau_of_Investigation,The FBI has been frequently depicted in popula...,When did the FBI first appear in popular media?,1930s,64,False


## 2) Label Preparation / Feature Engineering

In [20]:
max_length = 512

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
# function to search the start and end position for labeling

def search_start_end_position(tokenized_inputs,start_char,answer_text):
    """Map a character-level answer span onto token indices.

    Side effect: ``"offset_mapping"`` is popped from ``tokenized_inputs``,
    so the encoding no longer contains it afterwards.

    Args:
        tokenized_inputs: Single-example tokenizer output that still holds
            ``"offset_mapping"`` (one ``(char_start, char_end)`` pair per
            token) and provides ``sequence_ids()``; tokens whose sequence id
            is 1 are treated as the context.
        start_char: Character offset of the answer inside the context.
        answer_text: Answer string; its length gives the end offset.

    Returns:
        ``(start_position, end_position)`` as inclusive token indices, or
        ``(-1, -1)`` when the answer is not fully covered by the context
        tokens (e.g. it was truncated away) or there are no context tokens.
    """
    offsets = tokenized_inputs.pop("offset_mapping")
    sequence_ids = tokenized_inputs.sequence_ids()

    end_char = start_char + len(answer_text)

    # Locate the context span from the sequence ids themselves. Note that
    # len(tokenized_inputs) is the number of encoding keys, not of tokens,
    # so it must not be used as a token index.
    context_indices = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_indices:
        return -1,-1
    context_start = context_indices[0]
    context_end = context_indices[-1]

    # The answer must lie entirely inside the (possibly truncated) context.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return -1,-1

    # Advance to the last context token starting at or before start_char.
    # The scan is bounded by the context: special and padding tokens carry
    # (0, 0) offsets and would otherwise always satisfy the comparison.
    token_start_index = context_start
    while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
        token_start_index += 1
    start_position = token_start_index - 1

    # Walk back to the first context token ending at or after end_char.
    token_end_index = context_end
    while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
        token_end_index -= 1
    end_position = token_end_index + 1

    return start_position,end_position



In [23]:
"""test"""
example_id = 0
example = train_data_df.iloc[example_id]

context = example['context']
question = example['question']
answer_text = example['answer_text']
start_char = example['answer_start']

In [24]:
tokenized_inputs = tokenizer(
    text = question,
    text_pair = context,
    truncation = "only_second",
    add_special_tokens = True,
    max_length = max_length,
    padding = "max_length",
    return_offsets_mapping = True,
)

In [25]:

tokenized_inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping'])

In [26]:
start_pos,end_pos = search_start_end_position(tokenized_inputs,start_char,answer_text)

In [27]:
start_pos,end_pos

(63, 64)

In [28]:
answer_text

'specialist schools'

In [29]:
print(tokenizer.decode(tokenized_inputs['input_ids'][start_pos:end_pos+1]))

specialist schools


In [30]:
# prepare feature for model feeding

def prepare_feature(example):
    """Tokenize one question/context row and attach its answer-span labels.

    The row must provide ``context``, ``question``, ``answer_text`` and
    ``answer_start``. The returned encoding has ``offset_mapping`` removed
    and gains ``start_positions`` / ``end_positions``. Rows without an answer
    (``answer_start == -1``) and rows whose answer cannot be located in the
    encoded context are labelled with the index of the [CLS] token.
    """
    context, question, answer_text, start_char = (
        example[field] for field in ('context', 'question', 'answer_text', 'answer_start')
    )

    encoding = tokenizer(
        text = question,
        text_pair = context,
        truncation = "only_second",
        add_special_tokens = True,
        max_length = max_length,
        padding = "max_length",
        return_offsets_mapping = True,
    )

    cls_index = encoding['input_ids'].index(tokenizer.cls_token_id)

    if start_char == -1:
        # nothing to search for; just discard the offsets
        encoding.pop("offset_mapping")
        start_pos,end_pos = -1,-1
    else:
        # the search helper pops "offset_mapping" itself
        start_pos,end_pos = search_start_end_position(encoding,start_char,answer_text)

    if start_pos == -1 or end_pos == -1:
        start_pos = end_pos = cls_index

    encoding['start_positions'] = start_pos
    encoding['end_positions'] = end_pos

    return encoding




In [31]:
prepare_feature(example)

{'input_ids': [101, 2054, 2038, 3885, 4188, 2000, 11092, 1029, 102, 3885, 2038, 1037, 2200, 2367, 4547, 2291, 2013, 2563, 1998, 3575, 1010, 2295, 2036, 2241, 2006, 7721, 2495, 1012, 2009, 2038, 2367, 5535, 1997, 4651, 1010, 2367, 14912, 1998, 1037, 2367, 4695, 1997, 3601, 1998, 9347, 1012, 2035, 7271, 6787, 3078, 1998, 3905, 2816, 2024, 7721, 1012, 1996, 4104, 2231, 2038, 5837, 3488, 2005, 8325, 2816, 2004, 1997, 2384, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [32]:
"""Test"""

'Test'

In [33]:
sample_train_data_df  = train_data_df.head(5)
train_features_temp = sample_train_data_df.apply(lambda df:prepare_feature(df),axis = 1)

In [34]:

sample = pd.DataFrame(list(train_features_temp))

In [35]:
sample

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",64,"[101, 2054, 2038, 3885, 4188, 2000, 11092, 102...",63
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",371,"[101, 2054, 2079, 1996, 3484, 1997, 11993, 210...",371
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",36,"[101, 2043, 2106, 1996, 15332, 2015, 12495, 22...",14
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",164,"[101, 2043, 2106, 25600, 1998, 4931, 7652, 433...",162
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",23,"[101, 2043, 2106, 1996, 8495, 2034, 3711, 1999...",23


In [36]:
def decode_answer(df):
    """Decode the labelled answer span of one feature row back into text.

    ``df`` is a single row holding ``input_ids``, ``start_positions`` and
    ``end_positions``; the end position is inclusive.
    """
    token_ids = df["input_ids"]
    first = df['start_positions']
    last = df['end_positions']
    return tokenizer.decode(token_ids[first:last + 1])

In [37]:
answer = sample.apply(lambda x:decode_answer(x),axis = 1)

In [38]:
answer

0                                   specialist schools
1                                              insects
2    until the 20th century, there was no clear rec...
3                                       summer of 1934
4                                                1930s
dtype: object

In [39]:
sample_train_data_df['answer_text']

7819                                     specialist schools
55195                                               insects
103995    Until the 20th century, there was no clear rec...
104451                                       summer of 1934
38964                                                 1930s
Name: answer_text, dtype: object

In [40]:
#### convert to data frame dataset

In [41]:
train_feature_df = train_data_df.apply(lambda df: prepare_feature(df),axis = 1)
train_feature_df = pd.DataFrame(list(train_feature_df))


dev_feature_df = dev_data_df.apply(lambda df: prepare_feature(df),axis = 1)
dev_feature_df = pd.DataFrame(list(dev_feature_df))

In [42]:

dev_feature_df.head()

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",49,"[101, 1999, 2054, 2406, 2003, 13298, 2284, 102...",49
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",40,"[101, 2043, 2020, 1996, 5879, 2015, 1999, 1329...",35
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",76,"[101, 2013, 2029, 3032, 2106, 1996, 15342, 217...",72
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",81,"[101, 2040, 2001, 1996, 15342, 3003, 1029, 102...",80
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",158,"[101, 2054, 2301, 2106, 1996, 5879, 2015, 2034...",157


In [43]:
#### create pytorch dataset

In [44]:
class SQUADTokenizedDataSet(Dataset):
    """Torch dataset view over a dataframe of tokenized QA features.

    Each item is a dict mapping column name to a ``torch.long`` tensor placed
    on ``device``. Integer indexing yields one example; slice indexing yields
    a batch with one stacked tensor per column.
    """

    def __init__(self,dataframe,device = "cpu"):
        self.dataframe = dataframe
        self.device = device
        self.len = len(dataframe)

    def __len__(self):
        return self.len

    def __getitem__(self,index):
        selection = self.dataframe.iloc[index]

        # a single row comes back as a Series, a slice as a DataFrame
        if isinstance(selection,pd.Series):
            columns = selection.to_dict()
        else:
            columns = selection.to_dict(orient = "list")

        tensors = {}
        for name,values in columns.items():
            tensors[name] = torch.tensor(values,dtype = torch.long).to(self.device)
        return tensors

In [45]:
TrainTokenizedDataset  = SQUADTokenizedDataSet(train_feature_df,"cpu")
DevTokenizedDataset  = SQUADTokenizedDataSet(dev_feature_df,"cpu")

## 3) Fine Tune Model

In [46]:
from transformers import DistilBertForQuestionAnswering,TrainingArguments,Trainer,BertForQuestionAnswering

In [47]:
model = DistilBertForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode



Source code of `BertForQuestionAnswering` from the `transformers` library, shown for reference:
```python
class BertForQuestionAnswering(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

```



In [48]:
# sample_data = TrainTokenizedDataset[:5]

In [49]:
# model(**sample_data)

QuestionAnsweringModelOutput([('loss', tensor(6.2595, grad_fn=<DivBackward0>)),
                              ('start_logits',
                               tensor([[-0.1945,  0.0443, -0.0266,  ...,  0.1642,  0.1491,  0.1103],
                                       [-0.1292,  0.1695,  0.1493,  ...,  0.1701,  0.2677,  0.1699],
                                       [-0.0270,  0.0928,  0.2349,  ...,  0.1643,  0.2820,  0.2661],
                                       [-0.1232,  0.0354,  0.0747,  ...,  0.1404,  0.1157,  0.1383],
                                       [-0.0149,  0.0599,  0.1335,  ...,  0.3116,  0.2690,  0.2174]],
                                      grad_fn=<CloneBackward0>)),
                              ('end_logits',
                               tensor([[ 0.0603,  0.2172,  0.0854,  ..., -0.1009, -0.0850, -0.0702],
                                       [ 0.1477,  0.2885,  0.2669,  ...,  0.1825,  0.3170,  0.1020],
                                       [ 0.0085, -0.22

In [50]:
model_save_path = "/content/drive/My Drive/Colab Notebooks/Models/SQUADModels" # model output folder

In [51]:
os.listdir(model_save_path)

['runs']

In [52]:
batch_size = 12

In [60]:
args = TrainingArguments(
    output_dir = model_save_path,
    overwrite_output_dir = True,
    do_train  = True,
    do_eval = True,
    evaluation_strategy = "epoch",
    num_train_epochs = 2,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    logging_steps = 64,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [61]:
data_collator = default_data_collator

In [62]:
trainer = Trainer(
model,
args,
train_dataset = TrainTokenizedDataset,
eval_dataset = DevTokenizedDataset,
data_collator = data_collator,
tokenizer = tokenizer,
)

In [63]:
trainer.train()

***** Running training *****
  Num examples = 8000
  Num Epochs = 2
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 1334


Epoch,Training Loss,Validation Loss
1,1.8727,1.763241
2,1.4647,1.66353


Saving model checkpoint to /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-500
Configuration saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-500/config.json
Model weights saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 5928
  Batch size = 12
Saving model checkpoint to /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-1000
Configuration saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-1000/config.json
Model weights saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/checkpoint-1000/pytorch_model.bin
tokenizer config file 

TrainOutput(global_step=1334, training_loss=1.9241173842857624, metrics={'train_runtime': 1009.3521, 'train_samples_per_second': 15.852, 'train_steps_per_second': 1.322, 'total_flos': 2090449600512000.0, 'train_loss': 1.9241173842857624, 'epoch': 2.0})

## 4) Evaluation

In [64]:
token_inputs = TrainTokenizedDataset[30:35]
token_inputs = {k:v.to(device) for k,v in token_inputs.items()}
max_answer_len = 32

In [65]:
def answer_question_from_tokenized_inputs(model,token_inputs,device = 'cuda',
                                          answer_tokenizer = None,max_span_len = None):
    """Run the QA model on a tokenized batch and extract one answer per row.

    For every row, each ``(start, end)`` pair with
    ``start <= end < start + max_span_len`` whose start logit and end logit
    are both strictly positive is scored by the product of the two logits.
    The highest-scoring pair wins; on ties the earliest pair is kept.

    Args:
        model: Question-answering model; called as ``model(**token_inputs)``
            and expected to return ``start_logits`` and ``end_logits``.
        token_inputs: Mapping from input name to batch tensor; must contain
            ``input_ids``.
        device: Device the model and the inputs are moved to.
        answer_tokenizer: Tokenizer used to decode the answer tokens. Falls
            back to the module-level ``tokenizer`` when None.
        max_span_len: Maximum answer length in tokens. Falls back to the
            module-level ``max_answer_len`` when None.

    Returns:
        One dict per row with the keys ``start_pos``, ``start_score``,
        ``end_pos``, ``end_score``, ``answer`` and ``score``. A row without
        any qualifying span reports positions ``0``, scores ``-1`` and an
        empty answer; the answer is also empty when the best span starts at
        token 0.
    """
    if answer_tokenizer is None:
        answer_tokenizer = tokenizer
    if max_span_len is None:
        max_span_len = max_answer_len

    token_inputs = {k:v.to(device) for k,v in token_inputs.items()}

    model = model.to(device)
    model.eval()
    with torch.no_grad():
        output = model(**token_inputs)

    start_logits = output.start_logits.detach().cpu().numpy()
    end_logits = output.end_logits.detach().cpu().numpy()
    input_ids = token_inputs['input_ids'].cpu()

    result_dict_list = []

    for row_ids,row_start,row_end in zip(input_ids,start_logits,end_logits):
        start_end = (0,0)
        start_end_score = (-1,-1)
        score = -1
        seq_len = len(row_end)

        for start,p_start in enumerate(row_start):
            if not p_start > 0:
                continue
            # only end positions inside the allowed window can qualify, so
            # there is no need to scan the whole sequence for every start
            for end in range(start,min(start + max_span_len,seq_len)):
                p_end = row_end[end]
                if p_end > 0 and p_start * p_end > score:
                    start_end = (start,end)
                    start_end_score = (p_start,p_end)
                    score = p_start * p_end

        start,end = start_end
        start_score,end_score = start_end_score

        pred_answer = ""
        if start != 0 and end != 0:
            pred_answer = answer_tokenizer.decode(row_ids[start:end+1])

        result_dict_list.append({
            'start_pos': start,
            'start_score': start_score,
            'end_pos': end,
            'end_score': end_score,
            'answer': pred_answer,
            'score': score,
        })

    return result_dict_list





    


In [66]:
# answer_question_from_tokenized_inputs(model,token_inputs,'cuda')

In [67]:
def answer_question_from_context(context_list,question_list,tokenizer,device):
    """Answer raw (context, question) pairs with the module-level ``model``.

    Each pair is tokenized on its own (question first, context second,
    padded and truncated to ``max_length``), the encodings are stacked into
    one tensor per input name, and the batch is handed to
    ``answer_question_from_tokenized_inputs``.

    Returns the list of result dicts produced by that function, one per pair.
    """
    encodings = [
        tokenizer(
            text = question,
            text_pair = context,
            truncation = "only_second",
            add_special_tokens = True,
            max_length = max_length,
            padding = "max_length",
            return_offsets_mapping = False,
        )
        for context,question in zip(context_list,question_list)
    ]

    # column name -> list of per-example values
    columns = pd.DataFrame(encodings).to_dict("list")

    token_inputs = {}
    for name,values in columns.items():
        token_inputs[name] = torch.tensor(values,dtype = torch.long).to(device)

    return answer_question_from_tokenized_inputs(model,token_inputs,device = device)

In [68]:
sample_df = train_data_df.head(5)
context_list = sample_df['context']
question_list = sample_df['question']
real_answer_list = sample_df['answer_text']

In [69]:
# use the device selected at the top of the notebook instead of assuming a GPU
predict_result = answer_question_from_context(context_list,question_list,tokenizer,device)

In [70]:
predict_result

[{'answer': 'specialist schools',
  'end_pos': 64,
  'end_score': 2.3163822,
  'score': 6.4470944,
  'start_pos': 63,
  'start_score': 2.7832603},
 {'answer': '500 g',
  'end_pos': 365,
  'end_score': 4.0800505,
  'score': 11.167816,
  'start_pos': 364,
  'start_score': 2.737176},
 {'answer': '18th and early 19th centuries',
  'end_pos': 43,
  'end_score': 2.041257,
  'score': 5.43066,
  'start_pos': 39,
  'start_score': 2.660449},
 {'answer': '1934',
  'end_pos': 164,
  'end_score': 6.2909193,
  'score': 30.055708,
  'start_pos': 164,
  'start_score': 4.7776337},
 {'answer': 'the 1930s',
  'end_pos': 23,
  'end_score': 6.638422,
  'score': 39.274616,
  'start_pos': 22,
  'start_score': 5.916258}]

In [71]:
real_answer_list.to_list()

['specialist schools',
 'insects',
 'Until the 20th century, there was no clear record of the number of Venezuelans who emigrated to the United States.',
 'summer of 1934',
 '1930s']

### Evaluate with the official function

In [72]:
from torch.utils.data import DataLoader
from tqdm import tqdm

In [73]:
# rebuild the dev features from the unfiltered dev set so that every
# question id receives a prediction
dev_data_df = pd.DataFrame(dev_data_dict)
dev_feature_df = dev_data_df.apply(lambda df: prepare_feature(df),axis = 1)
dev_feature_df = pd.DataFrame(list(dev_feature_df))
# use the device selected at the top of the notebook instead of assuming a GPU
DevTokenizedDataset  = SQUADTokenizedDataSet(dev_feature_df,device)

In [74]:
DevTokenizedLoader = DataLoader(DevTokenizedDataset,batch_size=32,shuffle=False)

In [75]:
all_result_dict_list = []

In [76]:
# predict batch by batch on the device selected at the top of the notebook
for token_inputs in tqdm(DevTokenizedLoader):
    result_dict_list = answer_question_from_tokenized_inputs(model,token_inputs,device = device)
    all_result_dict_list.extend(result_dict_list)

100%|██████████| 372/372 [05:01<00:00,  1.23it/s]


In [77]:
predict_answers = [dict_['answer'] for dict_ in all_result_dict_list]

In [78]:
dev_data_df['prection_answer'] = predict_answers

In [79]:
preds = dev_data_df[['id','prection_answer']].set_index('id').T.to_dict('records')[0]

In [80]:
dev_dataset = read_json(dev_data_path)

In [81]:
import sys
# import official evaluation function
sys.path.append(squad_v2_dir)

In [82]:
import evaluate as evaluate_utils 

In [83]:
exact_raw,f1_raw = evaluate_utils.get_raw_scores(dev_dataset,preds)

In [84]:
metrics = evaluate_utils.make_eval_dict(exact_raw,f1_raw)  

In [85]:
metrics
# increase the size of the training data to improve the metrics

OrderedDict([('exact', 27.170891939695107),
             ('f1', 33.267882531140515),
             ('total', 11873)])