# Fine-Tune ALBERT for Question Answering on SQuAD
Author: Nelson LIN (nelsonlin0321@outlook.com)

In [1]:
# import libraries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

In [2]:
import os
import json
import random
import pandas as pd
from sklearn.utils import shuffle
from collections import Counter

In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 11.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 37.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
 

In [4]:
from transformers import AutoTokenizer
from transformers import default_data_collator

In [5]:
device = "cuda" if cuda.is_available() else "cpu"

In [6]:
model_name =  "albert-base-v2"

## 1) Import Data



In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
squad_v2_dir = "/content/drive/My Drive/Colab Notebooks/Data/SQUAD2" # data folder

In [9]:
os.listdir(squad_v2_dir)

['train-v2.0.json', 'dev-v2.0.json', 'evaluate.py', '__pycache__']

In [10]:
include_impossible = False
load_impossible_answer = False

In [11]:
# download from https://rajpurkar.github.io/SQuAD-explorer/
train_data_path = os.path.join(squad_v2_dir,"train-v2.0.json")
dev_data_path = os.path.join(squad_v2_dir,'dev-v2.0.json')

In [12]:
def read_json(file_path):
    """Load a SQuAD-style JSON file and return its top-level ``data`` entry.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value stored under the ``'data'`` key of the parsed document.

    Raises:
        KeyError: If the parsed document has no ``'data'`` key.
    """
    # Decode explicitly as UTF-8 so that non-ASCII text in the dataset is read
    # the same way regardless of the platform's default locale encoding.
    with open(file_path, encoding="utf-8") as f:
        json_f = json.load(f)
    return json_f['data']


In [13]:
def get_random_index(List):
    """Return one randomly drawn valid index into ``List``.

    The draw is made with ``random.sample`` over ``range(len(List))``, so an
    empty ``List`` raises ``ValueError``.
    """
    valid_positions = range(len(List))
    (picked,) = random.sample(valid_positions, 1)
    return picked

In [14]:
def load_data(data_path, load_impossible_answer = False):
    """Flatten a SQuAD-format JSON file into a dict of equally long columns.

    One entry is produced per question. For every question a single answer is
    picked at random (via ``get_random_index``) from the available candidates.

    Args:
        data_path: Path of the SQuAD JSON file (read with ``read_json``).
        load_impossible_answer: When True, questions flagged as impossible take
            their answer from ``'plausible_answers'``; otherwise they get an
            empty answer text and an answer start of ``-1``.

    Returns:
        A dict with the keys ``'id'``, ``'title'``, ``'context'``,
        ``'question'``, ``'answer_text'``, ``'answer_start'`` and
        ``'is_impossible'``, each mapping to a list with one item per question.
    """
    data = read_json(data_path)

    id_list = []
    title_list = []
    context_list = []
    question_list = []
    answer_text_list = []
    answer_start_list = []
    is_impossible_list = []

    for article in data:
        title = article['title']

        for paragraph in article['paragraphs']:
            context = paragraph['context']

            for qas in paragraph['qas']:
                # Files without the flag (e.g. SQuAD v1.1 layout) are treated
                # as containing answerable questions only.
                is_impossible = qas.get('is_impossible', False)

                if not is_impossible:
                    candidates = qas['answers']
                elif load_impossible_answer:
                    candidates = qas.get('plausible_answers', [])
                else:
                    candidates = []

                if candidates:
                    chosen = candidates[get_random_index(candidates)]
                    answer_text = chosen['text']
                    answer_start = chosen['answer_start']
                else:
                    # No usable candidate: use the "no answer" placeholder
                    # instead of failing on a random draw from an empty list.
                    answer_text = ""
                    answer_start = -1

                id_list.append(qas['id'])
                title_list.append(title)
                context_list.append(context)
                question_list.append(qas['question'])
                answer_text_list.append(answer_text)
                answer_start_list.append(answer_start)
                is_impossible_list.append(is_impossible)

    return {
        'id': id_list,
        'title': title_list,
        'context': context_list,
        'question': question_list,
        'answer_text': answer_text_list,
        'answer_start': answer_start_list,
        'is_impossible': is_impossible_list,
    }





In [15]:
sample_size = 8000
# take the sample size for training only
sample_size = None

In [16]:
train_data_dict = load_data(train_data_path,load_impossible_answer=False)
dev_data_dict = load_data(dev_data_path,load_impossible_answer=False)

In [17]:
train_data_df = pd.DataFrame(train_data_dict)
dev_data_df = pd.DataFrame(dev_data_dict)

In [18]:
sample_ratio = 2

In [19]:
train_data_df = shuffle(train_data_df)
dev_data_df = shuffle(dev_data_df)

In [20]:
train_size = int(len(train_data_df)//sample_ratio)
dev_size = int(len(dev_data_df))

In [21]:
train_data_df = train_data_df[:train_size]
dev_data_df = dev_data_df[:dev_size]

In [22]:
len(train_data_df)

65159

In [23]:
# Keep only answerable questions unless impossible ones were requested.
if not include_impossible:
    train_data_df = train_data_df[train_data_df['is_impossible']==False]
    dev_data_df = dev_data_df[dev_data_df['is_impossible']==False]

# Down-sample only when a sample size is configured. The previous check was
# inverted (`if not sample_size`), so a configured size was never applied.
if sample_size:
    train_data_df = shuffle(train_data_df).head(sample_size)
    dev_data_df = dev_data_df.head(sample_size)

In [24]:
train_data_df.head()

Unnamed: 0,id,title,context,question,answer_text,answer_start,is_impossible
24526,56f8a3aa9b226e1400dd0d23,Alps,Scientists have been studying the impact of cl...,Who have been studying the impact of climate c...,Scientists,0,False
116520,573094c48ab72b1400f9c5c2,Airport,There are a number of aids available to pilots...,What instruments do pilots use to find the run...,instrument landing system,540,False
2353,56d1335f17492d1400aabc15,The_Legend_of_Zelda:_Twilight_Princess,"A high-definition remaster of the game, The Le...",What company is developing the remaster?,Tantalus Media,105,False
43043,570e25b30dc6ce1900204dfd,Eritrea,"Additionally, owing to its colonial history, c...",What is Eritrea's popular alcoholic beverage M...,honey,827,False
5276,56d09f06234ae51400d9c3cc,Buddhism,"In Buddhism, Karma (from Sanskrit: ""action, wo...",What is theavoidance of unwholesome actions an...,sīla,404,False


In [25]:
len(train_data_df)

43476

## 2) Label Preparation / Feature Engineering

In [26]:
max_length = 512

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

In [113]:
# function to search the start and end position for labeling

def search_start_end_position(tokenized_inputs,start_char,answer_text):
    """Map a character-level answer span to token start/end indices.

    Args:
        tokenized_inputs: Mapping holding ``'offset_mapping'`` (a sequence of
            ``(char_start, char_end)`` pairs, one per token) and
            ``'token_type_ids'``. ``'offset_mapping'`` is removed from the
            mapping as a side effect; callers in this file rely on that.
        start_char: Character offset of the answer inside the context.
        answer_text: Answer string; only its length is used.

    Returns:
        ``(start_token_index, end_token_index)``, both inclusive, or
        ``(-1, -1)`` when the answer is not fully covered by the context
        tokens (for example after truncation).

    Raises:
        ValueError: If ``token_type_ids`` contains no ``1``.
    """
    end_char = start_char + len(answer_text)

    offsets = tokenized_inputs.pop("offset_mapping")
    token_type_ids = tokenized_inputs['token_type_ids']

    # Tokens with type id 1 form the second segment. Its last token is skipped
    # here (assumed to be the closing special token — matches the tokenizer
    # call used in this notebook; confirm for other tokenizers).
    content_token_start_index = token_type_ids.index(1)
    content_token_end_index = len(token_type_ids) - 1 - token_type_ids[::-1].index(1)
    content_token_end_index -= 1

    answer_fits = (
        offsets[content_token_start_index][0] <= start_char
        and offsets[content_token_end_index][1] >= end_char
    )
    if not answer_fits:
        return -1,-1

    # Scan forward to the last context token that starts at or before the
    # answer. The scan is bounded by the context span: tokens after it
    # (closing special token, padding) may carry (0, 0) offsets that would
    # otherwise always satisfy the comparison and push the index to the end
    # of the padded sequence.
    answer_token_start_index = content_token_start_index
    while (answer_token_start_index <= content_token_end_index
           and offsets[answer_token_start_index][0] <= start_char):
        answer_token_start_index += 1
    answer_token_start_index -= 1

    # Scan backward to the first context token that ends at or after the
    # answer, again without leaving the context span.
    answer_token_end_index = content_token_end_index
    while (answer_token_end_index >= content_token_start_index
           and offsets[answer_token_end_index][1] >= end_char):
        answer_token_end_index -= 1
    answer_token_end_index += 1

    return answer_token_start_index,answer_token_end_index



In [114]:
"""test"""
example_id = 0
example = train_data_df.iloc[example_id]

context = example['context']
question = example['question']
answer_text = example['answer_text']
start_char = example['answer_start']

In [115]:
cls_token_id = tokenizer.cls_token_id

sep_token_id = tokenizer.sep_token_id

In [116]:
tokenized_inputs = tokenizer(
    text = question,
    text_pair = context,
    truncation = "only_second",
    add_special_tokens = True,
    max_length = max_length,
    padding = "max_length",
    return_offsets_mapping = True,
)

In [117]:
# tokenized_inputs['attention_mask']


In [118]:
start_pos,end_pos = search_start_end_position(tokenized_inputs,start_char,answer_text)

In [119]:
start_pos,end_pos

(15, 15)

In [120]:
answer_text

'Scientists'

In [121]:
print(tokenizer.decode(tokenized_inputs['input_ids'][start_pos:end_pos+1]))

scientists


In [122]:
# prepare feature for model feeding

def prepare_feature(example):
    """Tokenize one question/context pair and attach answer position labels.

    ``example`` is indexed by ``'context'``, ``'question'``, ``'answer_text'``
    and ``'answer_start'``. The returned encoding carries ``'start_positions'``
    and ``'end_positions'``; both point at the CLS token when the example has
    no answer (``answer_start == -1``) or when the span search returns -1.
    """
    encoded = tokenizer(
        text = example['question'],
        text_pair = example['context'],
        truncation = "only_second",
        add_special_tokens = True,
        max_length = max_length,
        padding = "max_length",
        return_offsets_mapping = True,
    )

    cls_index = encoded['input_ids'].index(tokenizer.cls_token_id)
    start_char = example['answer_start']

    if start_char == -1:
        # No answer: the offsets are not needed, drop them here.
        encoded.pop("offset_mapping")
        span = (-1, -1)
    else:
        # The search helper removes "offset_mapping" from the encoding itself.
        span = search_start_end_position(encoded, start_char, example['answer_text'])

    start_pos, end_pos = span
    if start_pos == -1 or end_pos == -1:
        start_pos = end_pos = cls_index

    encoded['start_positions'] = start_pos
    encoded['end_positions'] = end_pos
    return encoded




In [123]:
prepare_feature(example)

{'input_ids': [2, 72, 57, 74, 4493, 14, 2261, 16, 3045, 753, 17, 308, 275, 60, 3, 5432, 57, 74, 4493, 14, 2261, 16, 3045, 753, 17, 308, 275, 9, 26, 823, 15, 206, 159, 91, 308, 25, 21074, 37, 5117, 26, 2224, 5603, 19, 14, 7185, 4765, 18, 15, 14, 1590, 16, 56, 25, 768, 2562, 9, 6438, 15, 14, 9826, 16, 6677, 1892, 1669, 924, 2461, 29, 21, 7676, 16, 14502, 29, 987, 8, 7230, 8, 6899, 11557, 69, 15128, 123, 57, 21, 1022, 2261, 27, 14, 5117, 19, 14, 15085, 28, 134, 28, 40, 1590, 27, 14, 308, 11886, 20, 14, 13827, 18, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [124]:
"""Test"""

'Test'

In [125]:
sample_train_data_df  = train_data_df.head(5)
train_features_temp = sample_train_data_df.apply(lambda df:prepare_feature(df),axis = 1)

In [126]:

sample = pd.DataFrame(list(train_features_temp))

In [127]:
sample

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions,token_type_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",15,"[2, 72, 57, 74, 4493, 14, 2261, 16, 3045, 753,...",15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",142,"[2, 98, 4507, 107, 8114, 275, 20, 477, 14, 801...",140,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",36,"[2, 98, 237, 25, 3561, 14, 302, 4594, 60, 3, 2...",33,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",214,"[2, 98, 25, 24230, 22, 18, 844, 16360, 14513, ...",214,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",134,"[2, 98, 25, 14, 13884, 49, 11855, 16, 367, 192...",133,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [128]:
def decode_answer(df):
    """Decode the labelled answer tokens of one feature row back to text.

    Reads ``'input_ids'``, ``'start_positions'`` and ``'end_positions'`` from
    ``df``; the end position is inclusive.
    """
    first_token = df['start_positions']
    last_token = df['end_positions']
    answer_ids = df["input_ids"][first_token:last_token + 1]
    return tokenizer.decode(answer_ids)

In [129]:
answer = sample.apply(lambda x:decode_answer(x),axis = 1)

In [130]:
answer

0                   scientists
1    instrument landing system
2               tantalus media
3                        honey
4                         sila
dtype: object

In [131]:
sample_train_data_df['answer_text']

24526                    Scientists
116520    instrument landing system
2353                 Tantalus Media
43043                         honey
5276                           sīla
Name: answer_text, dtype: object

In [132]:
#### convert to data frame dataset

In [133]:
train_feature_df = train_data_df.apply(lambda df: prepare_feature(df),axis = 1)
train_feature_df = pd.DataFrame(list(train_feature_df))


dev_feature_df = dev_data_df.apply(lambda df: prepare_feature(df),axis = 1)
dev_feature_df = pd.DataFrame(list(dev_feature_df))

In [134]:
dev_feature_df.head()

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions,token_type_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",130,"[2, 483, 25, 32, 681, 20, 11628, 1231, 4860, 3...",126,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",105,"[2, 98, 23, 14, 5628, 1333, 1923, 16, 2611, 75...",105,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",89,"[2, 14, 5897, 16, 15987, 255, 7676, 23, 885, 8...",88,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",74,"[2, 98, 286, 23, 1121, 2743, 20, 14, 28807, 18...",72,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",142,"[2, 19, 369, 4083, 98, 2194, 144, 388, 3683, 6...",139,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ..."


In [135]:
#### create pytorch dataset

In [136]:
class SQUADTokenizedDataSet(Dataset):
    """Dataset view over a DataFrame of tokenized features.

    Indexing with an integer returns one example; indexing with a slice
    returns a batch. In both cases the result is a dict mapping each column
    name to a ``torch.long`` tensor placed on ``device``.
    """

    def __init__(self,dataframe,device = "cpu"):
        self.dataframe = dataframe
        self.device = device
        self.len = len(dataframe)

    def __len__(self):
        return self.len

    def __getitem__(self,index):
        selection = self.dataframe.iloc[index]

        # A single row comes back as a Series, several rows as a DataFrame.
        if isinstance(selection, pd.Series):
            columns = selection.to_dict()
        else:
            columns = selection.to_dict(orient = "list")

        tensors = {}
        for name, values in columns.items():
            tensors[name] = torch.tensor(values, dtype = torch.long).to(self.device)
        return tensors

In [137]:
TrainTokenizedDataset  = SQUADTokenizedDataSet(train_feature_df,"cpu")
DevTokenizedDataset  = SQUADTokenizedDataSet(dev_feature_df,"cpu")

## 3) Fine Tune Model

In [138]:
from transformers import AlbertForQuestionAnswering,TrainingArguments,Trainer

In [139]:
model = AlbertForQuestionAnswering.from_pretrained(model_name)

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForQuestionAnswering: ['predictions.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN t



Source code of `BertForQuestionAnswering` from the `transformers` library, shown for reference:
```python
class BertForQuestionAnswering(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

```



In [140]:
# sample_data = TrainTokenizedDataset[:5]

In [141]:
# model(**sample_data)

In [142]:
model_save_path = "/content/drive/My Drive/Colab Notebooks/Models/SQUADModels/Albert" # data folder

In [143]:
os.listdir(model_save_path)

['runs',
 'checkpoint-600',
 'checkpoint-1200',
 'checkpoint-1800',
 'checkpoint-2400']

In [144]:
batch_size = 12

In [145]:
steps = len(TrainTokenizedDataset)//batch_size

In [146]:
steps = 1200

In [147]:
evaluate_steps = steps//2

In [148]:
evaluate_steps

600

In [149]:
# Training configuration passed to the Trainer.
args = TrainingArguments(
    model_save_path,  # output directory
    overwrite_output_dir = True,
    # Evaluate, log and save on the same step interval (evaluate_steps).
    evaluation_strategy = "steps",
    eval_steps = evaluate_steps,
    logging_steps = evaluate_steps,
    save_steps = evaluate_steps,
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    # The same batch size is used for training and evaluation.
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = 2,
    seed = 0,
    # NOTE(review): presumably reloads the best checkpoint once training ends —
    # confirm against the TrainingArguments documentation.
    load_best_model_at_end = True,
)

In [150]:
data_collator = default_data_collator

In [151]:
trainer = Trainer(
model,
args,
train_dataset = TrainTokenizedDataset,
eval_dataset = DevTokenizedDataset,
data_collator = data_collator,
tokenizer = tokenizer,
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 43476
  Num Epochs = 2
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 7246


Step,Training Loss,Validation Loss
600,1.459,1.133949


***** Running Evaluation *****
  Num examples = 2989
  Batch size = 12
Saving model checkpoint to /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/Albert/checkpoint-600
Configuration saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/Albert/checkpoint-600/config.json
Model weights saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/Albert/checkpoint-600/pytorch_model.bin
tokenizer config file saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/Albert/checkpoint-600/tokenizer_config.json
Special tokens file saved in /content/drive/My Drive/Colab Notebooks/Models/SQUADModels/Albert/checkpoint-600/special_tokens_map.json


# Evaluation

In [None]:
token_inputs = TrainTokenizedDataset[30:35]
token_inputs = {k:v.to(device) for k,v in token_inputs.items()}
max_answer_len = 32

In [None]:
def answer_question_from_tokenized_inputs(model,token_inputs,device = 'cuda'):
    """Run the model on a tokenized batch and pick one answer span per example.

    Reads the module-level names ``max_answer_len`` (maximum span length in
    tokens) and ``tokenizer`` (used to decode the chosen span).

    Args:
        model: Model called as ``model(**token_inputs)``; its output must
            expose ``start_logits`` and ``end_logits``.
        token_inputs: Dict of tensors; must contain ``'input_ids'``.
        device: Device on which the forward pass runs.

    Returns:
        A list with one dict per example, with the keys ``'start_pos'``,
        ``'start_score'``, ``'end_pos'``, ``'end_score'``, ``'answer'`` and
        ``'score'``. When no span qualifies, the positions are ``0``, the
        scores are ``-1`` and the answer is an empty string.
    """
    token_inputs = {k:v.to(device) for k,v in token_inputs.items()}

    model = model.to(device)
    model.eval()
    with torch.no_grad():
        output = model(**token_inputs)

    start_logits = output.start_logits.cpu().detach().numpy()
    end_logits = output.end_logits.cpu().detach().numpy()

    # Only the token ids are needed on the CPU, for decoding.
    input_ids = token_inputs['input_ids'].to("cpu")

    result_dict_list = []

    for idx in range(len(start_logits)):
        sample_start_logits = start_logits[idx]
        sample_end_logits = end_logits[idx]
        seq_len = len(sample_end_logits)

        start_end = (0,0)
        start_end_score = (-1,-1)
        score = -1

        # NOTE(review): spans are ranked by the product of the raw start and
        # end logits, and only strictly positive logits are considered. This
        # keeps the original selection rule; summing logits is the more common
        # choice — confirm which one is intended.
        for start,p_start in enumerate(sample_start_logits):
            if not p_start > 0:
                continue
            # Only ends inside [start, start + max_answer_len) can qualify, so
            # scan just that window instead of the whole sequence.
            window_end = min(start + max_answer_len, seq_len)
            for end in range(start, window_end):
                p_end = sample_end_logits[end]
                if p_end > 0 and p_start * p_end > score:
                    start_end = (start,end)
                    start_end_score = (p_start,p_end)
                    score = p_start * p_end

        start,end = start_end
        start_score,end_score = start_end_score

        # A span touching index 0 is reported as "no answer".
        pred_answer = ""
        if start != 0 and end != 0:
            pred_answer = tokenizer.decode(
                input_ids[idx][start:end+1]
            )

        result_dict_list.append({
            'start_pos': start,
            'start_score': start_score,
            'end_pos': end,
            'end_score': end_score,
            'answer': pred_answer,
            'score': score,
        })

    return result_dict_list





    


In [None]:
# answer_question_from_tokenized_inputs(model,token_inputs,'cuda')

In [None]:
def answer_question_from_context(context_list,question_list,tokenizer,device):
    """Answer each question against its paired context.

    Every (context, question) pair is tokenized with ``tokenizer``, the
    encodings are stacked into ``torch.long`` tensors on ``device``, and the
    batch is passed to ``answer_question_from_tokenized_inputs`` together with
    the module-level ``model``. Returns that function's list of result dicts.
    """
    encodings = [
        tokenizer(
            text = question,
            text_pair = context,
            truncation = "only_second",
            add_special_tokens = True,
            max_length = max_length,
            padding = "max_length",
            return_offsets_mapping = False,
        )
        for context, question in zip(context_list, question_list)
    ]

    # Turn the per-example encodings into one list per field.
    columns = pd.DataFrame(encodings).to_dict("list")

    batch = {}
    for field, values in columns.items():
        batch[field] = torch.tensor(values, dtype = torch.long).to(device)

    return answer_question_from_tokenized_inputs(model, batch, device = device)

In [None]:
sample_df = train_data_df.head(5)
context_list = sample_df['context']
question_list = sample_df['question']
real_answer_list = sample_df['answer_text']

In [None]:
predict_result = answer_question_from_context(context_list,question_list,tokenizer,'cuda')

In [None]:
predict_result

In [None]:
real_answer_list.to_list()

### Evaluate with the official SQuAD evaluation script

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
dev_data_df = pd.DataFrame(dev_data_dict)
dev_feature_df = dev_data_df.apply(lambda df: prepare_feature(df),axis = 1)
dev_feature_df = pd.DataFrame(list(dev_feature_df))
DevTokenizedDataset  = SQUADTokenizedDataSet(dev_feature_df,"cuda")

In [None]:
DevTokenizedLoader = DataLoader(DevTokenizedDataset,batch_size=32,shuffle=False)

In [None]:
all_result_dict_list = []

In [None]:
for token_inputs in tqdm(DevTokenizedLoader):
    result_dict_list = answer_question_from_tokenized_inputs(model,token_inputs,device = 'cuda')
    all_result_dict_list.extend(result_dict_list)

In [None]:
predict_answers = [dict_['answer'] for dict_ in all_result_dict_list]

In [None]:
dev_data_df['prection_answer'] = predict_answers

In [None]:
preds = dev_data_df[['id','prection_answer']].set_index('id').T.to_dict('records')[0]

In [None]:
dev_dataset = read_json(dev_data_path)

In [None]:
import sys
# import official evaluation function
sys.path.append(squad_v2_dir)

In [None]:
import evaluate as evaluate_utils 

In [None]:
exact_raw,f1_raw = evaluate_utils.get_raw_scores(dev_dataset,preds)

In [None]:
metrics = evaluate_utils.make_eval_dict(exact_raw,f1_raw)  

In [None]:
metrics
# increase the size of the training data to improve the metrics