# Fine-Tune ALBERT for Question Answering on SQuAD
Author: Nelson LIN (nelsonlin0321@outlook.com)

In [1]:
# import libraries
import torch
from torch import cuda
from torch.utils.data import Dataset,DataLoader

In [2]:
import os
import json
import random
import pandas as pd
from sklearn.utils import shuffle
from collections import Counter

In [3]:
# !pip install transformers

In [4]:
from transformers import AutoTokenizer
from transformers import default_data_collator

In [5]:
device = "cuda" if cuda.is_available() else "cpu"

In [6]:
model_name =  "albert-base-v2"

## 1) Import Data



In [7]:
def read_qnli_data(file_name, data_dir):
    """Load a tab-separated QNLI file into a DataFrame of strings.

    The first line of the file supplies the column names; every following
    line is stripped and split on tabs to form one row. The file is read as
    ``utf-8-sig`` so a leading byte-order mark is discarded.

    Args:
        file_name: Name of the TSV file inside ``data_dir``.
        data_dir: Directory that contains the file.

    Returns:
        A ``pandas.DataFrame`` with one row per data line.
    """
    with open(os.path.join(data_dir, file_name), encoding='utf-8-sig') as handle:
        raw_lines = handle.readlines()

    column_names = raw_lines[0].strip().split("\t")

    rows = []
    for raw in raw_lines[1:]:
        rows.append(raw.strip().split("\t"))

    return pd.DataFrame(rows, columns=column_names)


def get_qnli_pandas_dataframe(data_dir):
    """Load the QNLI train/dev splits and normalise their columns.

    ``train.tsv`` and ``dev.tsv`` are read from ``data_dir``. In both frames
    the ``label`` column is converted to an integer flag (1 for
    ``'entailment'``, 0 for anything else) and surrounding whitespace is
    removed from the ``question`` and ``sentence`` columns.

    Args:
        data_dir: Directory containing ``train.tsv`` and ``dev.tsv``.

    Returns:
        A tuple ``(qnli_dev_df, qnli_train_df)`` -- note that the dev split
        comes first.
    """
    qnli_train_df = read_qnli_data("train.tsv", data_dir)
    qnli_dev_df = read_qnli_data("dev.tsv", data_dir)

    for frame in (qnli_train_df, qnli_dev_df):
        # Pure-pandas equivalent of np.where(label == 'entailment', 1, 0);
        # avoids depending on numpy, which this file never imports.
        frame['label'] = (frame['label'] == 'entailment').astype(int)
        for column in ('question', 'sentence'):
            frame[column] = frame[column].str.strip()

    return qnli_dev_df, qnli_train_df


def read_document_to_list(document_path):
    """Return the non-blank lines of a text file, each stripped of whitespace.

    The file is read as ``utf-8-sig``. Lines that are empty after stripping
    are dropped; the remaining lines keep their original order.
    """
    with open(document_path, encoding='utf-8-sig') as handle:
        stripped_lines = (raw.strip() for raw in handle.readlines())
        return [text for text in stripped_lines if text]


def read_document_dict(document_dir):
    """Read every ``.txt`` file in a directory into a name -> sentences dict.

    The key is the file name with ``".txt"`` removed and underscores replaced
    by spaces; the value is the list produced by ``read_document_to_list``.
    Files with other extensions are ignored.
    """
    text_file_names = [
        entry for entry in os.listdir(document_dir) if entry.endswith(".txt")
    ]

    return {
        entry.replace(".txt", "").replace("_", " "): read_document_to_list(
            os.path.join(document_dir, entry)
        )
        for entry in text_file_names
    }


def read_json(file_path):
    """Load a SQuAD-style JSON file and return its top-level ``data`` entry.

    The file is opened explicitly as UTF-8 so that non-ASCII text is decoded
    the same way regardless of the platform's default encoding.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The value stored under the ``'data'`` key.

    Raises:
        KeyError: If the JSON document has no ``'data'`` key.
    """
    with open(file_path, encoding='utf-8') as f:
        json_f = json.load(f)
    return json_f['data']


def get_random_index(List):
    """Return one uniformly sampled valid index into ``List``.

    Raises ``ValueError`` (from ``random.sample``) when ``List`` is empty.
    """
    candidate_positions = range(len(List))
    (picked,) = random.sample(candidate_positions, 1)
    return picked


def load_data(data_path, load_impossible_answer=False):
    """Flatten a SQuAD JSON file into parallel, column-oriented lists.

    One entry is produced per question. When a question offers several
    candidate answers, a single one is chosen at random via
    ``get_random_index``.

    Args:
        data_path: Path to the SQuAD JSON file.
        load_impossible_answer: If True, questions flagged ``is_impossible``
            take their answer from ``plausible_answers``; otherwise they get
            the sentinel answer ``""`` with start ``-1``.

    Returns:
        A dict with the keys ``id``, ``title``, ``context``, ``question``,
        ``answer_text``, ``answer_start`` and ``is_impossible``; every value
        is a list with one element per question.
    """
    data = read_json(data_path)

    id_list = []
    title_list = []
    context_list = []
    question_list = []
    answer_text_list = []
    answer_start_list = []
    is_impossible_list = []

    for article in data:
        title = article['title']

        for paragraph in article['paragraphs']:
            context = paragraph['context']

            for qas in paragraph['qas']:
                # Files without the flag are treated as fully answerable.
                is_impossible = qas.get('is_impossible', False)

                if not is_impossible:
                    candidates = qas['answers']
                elif load_impossible_answer:
                    candidates = qas.get('plausible_answers', [])
                else:
                    candidates = []

                if candidates:
                    chosen = candidates[get_random_index(candidates)]
                    answer_text = chosen['text']
                    answer_start = chosen['answer_start']
                else:
                    # No usable answer: use the "no answer" sentinel rather
                    # than sampling from an empty list (which would raise).
                    answer_text = ""
                    answer_start = -1

                id_list.append(qas['id'])
                title_list.append(title)
                context_list.append(context)
                question_list.append(qas['question'])
                answer_text_list.append(answer_text)
                answer_start_list.append(answer_start)
                is_impossible_list.append(is_impossible)

    return {
        'id': id_list,
        'title': title_list,
        'context': context_list,
        'question': question_list,
        'answer_text': answer_text_list,
        'answer_start': answer_start_list,
        'is_impossible': is_impossible_list,
    }


def get_squad_v2_pandas_dataframe(squad_v2_dir,include_impossible=False, load_impossible_answer=False):
    """Load the SQuAD v2.0 train and dev splits as pandas DataFrames.

    Expects ``train-v2.0.json`` and ``dev-v2.0.json`` inside ``squad_v2_dir``
    (download from https://rajpurkar.github.io/SQuAD-explorer/).

    Args:
        squad_v2_dir: Directory holding the two JSON files.
        include_impossible: If False, rows whose ``is_impossible`` flag is
            set are removed from both splits.
        load_impossible_answer: Forwarded to ``load_data``.

    Returns:
        A tuple ``(train_data_df, dev_data_df)`` whose ``question`` and
        ``context`` columns have surrounding whitespace stripped.
    """
    split_paths = (
        os.path.join(squad_v2_dir, "train-v2.0.json"),
        os.path.join(squad_v2_dir, 'dev-v2.0.json'),
    )

    frames = []
    for split_path in split_paths:
        frame = pd.DataFrame(load_data(split_path, load_impossible_answer))

        if not include_impossible:
            # Take an explicit copy so the column assignments below write to
            # an independent frame instead of a filtered slice.
            frame = frame[~frame['is_impossible'].astype(bool)].copy()

        for column in ('question', 'context'):
            frame[column] = frame[column].apply(lambda x: x.strip())

        frames.append(frame)

    train_data_df, dev_data_df = frames
    return train_data_df, dev_data_df


In [10]:
!ls /data/SQUAD2

dev-v2.0.json  evaluate.py  __pycache__  train-v2.0.json


In [11]:
squad_v2_dir = "/data/SQUAD2"

In [12]:
train_data_df, dev_data_df = get_squad_v2_pandas_dataframe(squad_v2_dir,include_impossible=True, load_impossible_answer=False)

In [13]:
train_data_df.head()

Unnamed: 0,id,title,context,question,answer_text,answer_start,is_impossible
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s,269,False
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing,207,False
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003,526,False
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas",166,False
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s,276,False


In [14]:
len(train_data_df)

130319

## 2) Label Preparation / Feature Engineering

In [15]:
max_length = 512

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [17]:
# function to search the start and end position for labeling

def search_start_end_position(tokenized_inputs,start_char,answer_text):
    """Convert a character-level answer span into token start/end indices.

    Args:
        tokenized_inputs: Mapping for a single (question, context) encoding
            that contains ``"offset_mapping"`` (a sequence of
            ``(char_start, char_end)`` pairs per token) and
            ``"token_type_ids"`` (a list in which the context tokens carry
            segment id 1). ``"offset_mapping"`` is removed from the mapping
            as a side effect.
        start_char: Character offset of the answer inside the context.
        answer_text: Answer string; its length determines the end offset.

    Returns:
        ``(start_token_index, end_token_index)``, both inclusive, or
        ``(-1, -1)`` when the answer is not fully covered by the context
        tokens present in the encoding.

    Raises:
        ValueError: If ``token_type_ids`` contains no segment id 1.
    """
    end_char = start_char + len(answer_text)

    # Popped on purpose: callers use the remaining mapping without offsets.
    offsets = tokenized_inputs.pop("offset_mapping")
    token_type_ids = tokenized_inputs['token_type_ids']

    content_token_start_index = token_type_ids.index(1)
    last_segment_one_index = len(token_type_ids) - 1 - token_type_ids[::-1].index(1)
    # The final segment-1 token is skipped; in the encodings built in this
    # file it is the separator that closes the context.
    content_token_end_index = last_segment_one_index - 1

    answer_is_covered = (
        offsets[content_token_start_index][0] <= start_char
        and offsets[content_token_end_index][1] >= end_char
    )
    if not answer_is_covered:
        return -1,-1

    # Advance to the first context token that begins after the answer start,
    # then step back one. The scan must stop at the end of the context:
    # tokens that follow it can carry offsets that still compare
    # <= start_char, which previously pushed the result to the last index of
    # the whole sequence.
    answer_token_start_index = content_token_start_index
    while (answer_token_start_index <= content_token_end_index
           and offsets[answer_token_start_index][0] <= start_char):
        answer_token_start_index += 1
    answer_token_start_index -= 1

    # Mirror image for the end: walk back to the last context token that
    # ends before the answer end, then step forward one. The lower bound
    # keeps the scan out of the question tokens.
    answer_token_end_index = content_token_end_index
    while (answer_token_end_index >= content_token_start_index
           and offsets[answer_token_end_index][1] >= end_char):
        answer_token_end_index -= 1
    answer_token_end_index += 1

    return answer_token_start_index,answer_token_end_index



In [18]:
"""test"""
example_id = 0
example = train_data_df.iloc[example_id]

context = example['context']
question = example['question']
answer_text = example['answer_text']
start_char = example['answer_start']

In [19]:
cls_token_id = tokenizer.cls_token_id

sep_token_id = tokenizer.sep_token_id

In [20]:
tokenized_inputs = tokenizer(
    text = question,
    text_pair = context,
    truncation = "only_second",
    add_special_tokens = True,
    max_length = max_length,
    padding = "max_length",
    return_offsets_mapping = True,
)

In [21]:
# tokenized_inputs['attention_mask']


In [22]:
start_pos,end_pos = search_start_end_position(tokenized_inputs,start_char,answer_text)

In [23]:
start_pos,end_pos

(77, 81)

In [24]:
answer_text

'in the late 1990s'

In [25]:
print(tokenizer.decode(tokenized_inputs['input_ids'][start_pos:end_pos+1]))

in the late 1990s


In [26]:
# prepare feature for model feeding

def prepare_feature(example):
    """Tokenize one example and attach its start/end token labels.

    ``example`` must provide ``context``, ``question``, ``answer_text`` and
    ``answer_start``. The question/context pair is encoded with the
    module-level ``tokenizer`` (context-only truncation, padded to
    ``max_length``). ``start_positions`` and ``end_positions`` point at the
    answer tokens; they both point at the CLS token when ``answer_start`` is
    ``-1`` or when ``search_start_end_position`` cannot locate the answer.
    The returned encoding no longer contains ``offset_mapping``.
    """
    encoded = tokenizer(
        text = example['question'],
        text_pair = example['context'],
        truncation = "only_second",
        add_special_tokens = True,
        max_length = max_length,
        padding = "max_length",
        return_offsets_mapping = True,
    )

    cls_position = encoded['input_ids'].index(tokenizer.cls_token_id)
    answer_start = example['answer_start']

    if answer_start == -1:
        # No answer: nothing to search for, just discard the offsets.
        encoded.pop("offset_mapping")
        first_token, last_token = -1, -1
    else:
        # The search helper removes "offset_mapping" itself.
        first_token, last_token = search_start_end_position(
            encoded, answer_start, example['answer_text']
        )

    if first_token == -1 or last_token == -1:
        first_token = last_token = cls_position

    encoded['start_positions'] = first_token
    encoded['end_positions'] = last_token

    return encoded




In [27]:
prepare_feature(example)

{'input_ids': [2, 76, 144, 24809, 799, 1535, 844, 60, 3, 24809, 16004, 3745, 143, 1355, 8, 1367, 815, 13, 5, 118, 2161, 1, 728, 1, 23157, 1, 118, 12092, 8, 7370, 8, 6366, 6, 13, 5, 381, 299, 268, 15, 2229, 6, 25, 40, 189, 1377, 15, 7815, 15, 571, 1421, 17, 2182, 9, 386, 17, 1127, 19, 4187, 15, 1338, 15, 39, 986, 19, 617, 3385, 17, 4626, 5868, 28, 21, 850, 15, 17, 1092, 20, 2720, 19, 14, 456, 961, 18, 28, 672, 1377, 16, 761, 1569, 220, 695, 8, 8024, 11271, 22, 18, 850, 9, 1471, 34, 36, 321, 15, 17677, 143, 1355, 15, 14, 214, 178, 53, 16, 14, 126, 22, 18, 246, 8, 10033, 695, 1170, 16, 65, 85, 9, 66, 16436, 441, 14, 830, 16, 24809, 22, 18, 893, 244, 15, 23853, 19, 339, 13, 5, 3325, 6, 15, 56, 613, 36, 28, 21, 2046, 1169, 3497, 15, 1931, 355, 8877, 1160, 17, 1070, 14, 3304, 1047, 808, 234, 8, 849, 2391, 13, 7, 23282, 19, 339, 7, 17, 13, 7, 12152, 883, 7, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [28]:
"""Test"""

'Test'

In [29]:
sample_train_data_df  = train_data_df.head(5)
train_features_temp = sample_train_data_df.apply(lambda df:prepare_feature(df),axis = 1)

In [30]:

sample = pd.DataFrame(list(train_features_temp))

In [31]:
sample

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions,token_type_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",81,"[2, 76, 144, 24809, 799, 1535, 844, 60, 3, 248...",77,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",72,"[2, 98, 924, 144, 24809, 3975, 19, 76, 39, 23,...",70,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",148,"[2, 76, 144, 24809, 767, 11271, 22, 18, 850, 1...",148,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",62,"[2, 19, 98, 136, 17, 146, 144, 24809, 3213, 71...",60,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",82,"[2, 19, 56, 3953, 144, 24809, 533, 1561, 60, 3...",80,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ..."


In [32]:
def decode_answer(df):
    """Decode the labelled answer span of one feature row back to text.

    Slices ``df["input_ids"]`` from ``start_positions`` to ``end_positions``
    (inclusive) and decodes it with the module-level ``tokenizer``.
    """
    first_token = df['start_positions']
    last_token = df['end_positions']
    answer_token_ids = df["input_ids"][first_token:last_token + 1]
    return tokenizer.decode(answer_token_ids)

In [33]:
answer = sample.apply(lambda x:decode_answer(x),axis = 1)

In [34]:
answer

0      in the late 1990s
1    singing and dancing
2                   2003
3         houston, texas
4             late 1990s
dtype: object

In [35]:
sample_train_data_df['answer_text']

0      in the late 1990s
1    singing and dancing
2                   2003
3         Houston, Texas
4             late 1990s
Name: answer_text, dtype: object

In [36]:
#### convert to data frame dataset

In [37]:
train_feature_df = train_data_df.apply(lambda df: prepare_feature(df),axis = 1)
train_feature_df = pd.DataFrame(list(train_feature_df))


dev_feature_df = dev_data_df.apply(lambda df: prepare_feature(df),axis = 1)
dev_feature_df = pd.DataFrame(list(dev_feature_df))

In [48]:
import pickle
def save_object(obj,save_path):
    """Pickle ``obj`` to ``save_path`` using the highest pickle protocol."""
    with open(save_path,mode='wb') as sink:
        writer = pickle.Pickler(sink,protocol = pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)
        
        
def open_object(file_name):
    """Load and return the object pickled in ``file_name``.

    NOTE: unpickling can execute arbitrary code; only use this on files
    written by ``save_object`` or another trusted source.
    """
    with open(file_name,mode = 'rb') as source:
        restored = pickle.load(source)
    return restored

In [49]:
save_object(train_feature_df,"./data/train_feature_df.pkl")

In [50]:
save_object(dev_feature_df,"./data/dev_feature_df.pkl")

In [51]:
# train_feature_df = open_object("./data/train_feature_df.pkl")
# dev_feature_df = open_object("./data/dev_feature_df.pkl")

In [52]:
dev_feature_df.head()

Unnamed: 0,attention_mask,end_positions,input_ids,start_positions,token_type_ids
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",54,"[2, 19, 98, 475, 25, 14650, 335, 60, 3, 14, 44...",54,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",45,"[2, 76, 46, 14, 4406, 18, 19, 14650, 60, 3, 14...",40,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ..."
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",84,"[2, 37, 56, 1166, 144, 14, 16773, 24170, 60, 3...",80,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ..."
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",89,"[2, 72, 23, 14, 16773, 1156, 60, 3, 14, 4406, ...",88,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",168,"[2, 98, 428, 144, 14, 4406, 18, 64, 3288, 66, ...",166,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [53]:
#### create pytorch dataset

In [54]:
class SQUADTokenizedDataSet(Dataset):
    """PyTorch dataset backed by a DataFrame of tokenized features.

    Each column of the frame becomes one entry of the returned dict, converted
    to a ``torch.long`` tensor and moved to ``device``. Integer indexing
    yields a single example; slice indexing yields a batch whose tensors stack
    the selected rows.
    """

    def __init__(self,dataframe,device = "cpu"):
        self.dataframe = dataframe
        self.device = device
        self.len = len(dataframe)

    def __len__(self):
        return self.len

    def __getitem__(self,index):
        selected = self.dataframe.iloc[index]

        # A single row comes back as a Series, several rows as a DataFrame.
        is_single_row = isinstance(selected,pd.core.series.Series)
        columns = selected.to_dict() if is_single_row else selected.to_dict(orient = "list")

        tensors = {}
        for name,values in columns.items():
            tensors[name] = torch.tensor(values,dtype = torch.long).to(self.device)
        return tensors

In [55]:
TrainTokenizedDataset  = SQUADTokenizedDataSet(train_feature_df,"cpu")
DevTokenizedDataset  = SQUADTokenizedDataSet(dev_feature_df,"cpu")

## 3) Fine Tune Model

In [56]:
from transformers import AlbertForQuestionAnswering,TrainingArguments,Trainer

In [57]:
model = AlbertForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForQuestionAnswering: ['predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN t



Source code from the Hugging Face `transformers` library, shown for reference:
```python
class BertForQuestionAnswering(BertPreTrainedModel):

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

```



In [58]:
# sample_data = TrainTokenizedDataset[:5]

In [59]:
# model(**sample_data)

In [62]:
model_save_path = "./single-albert-squad/" # data folder

In [63]:
os.listdir(model_save_path)

[]

In [64]:
batch_size = 24

In [65]:
steps = len(TrainTokenizedDataset)//batch_size

In [66]:
steps

5429

In [68]:
evaluate_steps = 600

In [69]:
# Training configuration: checkpoints are written to `model_save_path`, and
# evaluation, logging and checkpointing all share the `evaluate_steps` interval.
args = TrainingArguments(
    model_save_path,
    overwrite_output_dir = True,
    evaluation_strategy = "steps",
    eval_steps = evaluate_steps,
    logging_steps = evaluate_steps,
    save_steps = evaluate_steps,
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    # the same per-device batch size is used for training and evaluation
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = 2,
    seed = 0,
    load_best_model_at_end = True,
)

In [70]:
data_collator = default_data_collator

In [71]:
# Build the Trainer from the model, the training arguments above, the
# tokenized train/dev datasets, the default collator and the tokenizer.
trainer = Trainer(
model,
args,
train_dataset = TrainTokenizedDataset,
eval_dataset = DevTokenizedDataset,
data_collator = data_collator,
tokenizer = tokenizer,
)

In [72]:
trainer.train()

***** Running training *****
  Num examples = 130319
  Num Epochs = 2
  Instantaneous batch size per device = 24
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 1
  Total optimization steps = 10860


Step,Training Loss,Validation Loss
600,1.5115,1.08372
1200,1.1143,0.982566
1800,1.0148,0.9608
2400,1.006,1.01026
3000,0.9328,0.91213
3600,0.9067,0.917927
4200,0.9135,0.917354
4800,0.8988,0.882689
5400,0.8755,0.848747
6000,0.6898,0.876036


***** Running Evaluation *****
  Num examples = 11873
  Batch size = 24
Saving model checkpoint to ./single-albert-squad/checkpoint-600
Configuration saved in ./single-albert-squad/checkpoint-600/config.json
Model weights saved in ./single-albert-squad/checkpoint-600/pytorch_model.bin
tokenizer config file saved in ./single-albert-squad/checkpoint-600/tokenizer_config.json
Special tokens file saved in ./single-albert-squad/checkpoint-600/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 11873
  Batch size = 24
Saving model checkpoint to ./single-albert-squad/checkpoint-1200
Configuration saved in ./single-albert-squad/checkpoint-1200/config.json
Model weights saved in ./single-albert-squad/checkpoint-1200/pytorch_model.bin
tokenizer config file saved in ./single-albert-squad/checkpoint-1200/tokenizer_config.json
Special tokens file saved in ./single-albert-squad/checkpoint-1200/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 11873
  Batch s

TrainOutput(global_step=10860, training_loss=0.8386003431035669, metrics={'train_runtime': 6943.983, 'train_samples_per_second': 37.534, 'train_steps_per_second': 1.564, 'total_flos': 5755878281760768.0, 'train_loss': 0.8386003431035669, 'epoch': 2.0})

# Evaluation

In [73]:
token_inputs = TrainTokenizedDataset[30:35]
token_inputs = {k:v.to(device) for k,v in token_inputs.items()}
max_answer_len = 32

In [74]:
def _select_best_span(start_scores, end_scores, max_answer_len):
    """Pick the highest-scoring (start, end) token span for one example.

    Only positions whose logit is strictly positive are considered, ``end``
    must satisfy ``start <= end < start + max_answer_len``, and a span is
    scored by the product of its two logits. Ties keep the first span found
    when scanning starts, then ends, in ascending order. If no span
    qualifies, ``((0, 0), (-1, -1), -1)`` is returned.
    """
    best_span = (0, 0)
    best_span_scores = (-1, -1)
    best_score = -1
    sequence_length = len(end_scores)

    for start, p_start in enumerate(start_scores):
        if not p_start > 0:
            continue
        # Only ends inside the allowed window can qualify, so visit just
        # those instead of the whole sequence.
        window_end = min(start + max_answer_len, sequence_length)
        for end in range(start, window_end):
            p_end = end_scores[end]
            if p_end > 0 and p_start * p_end > best_score:
                best_span = (start, end)
                best_span_scores = (p_start, p_end)
                best_score = p_start * p_end

    return best_span, best_span_scores, best_score


def answer_question_from_tokenized_inputs(model,token_inputs,device = 'cuda',max_answer_len = 32):
    """Run the QA model on a tokenized batch and decode one answer per example.

    Args:
        model: Question-answering model; it is moved to ``device`` and put in
            eval mode. Its output must expose ``start_logits`` and
            ``end_logits``.
        token_inputs: Dict of tensors passed to the model as keyword
            arguments; must contain ``input_ids``.
        device: Device used for inference.
        max_answer_len: Maximum number of tokens in a predicted answer.

    Returns:
        A list with one dict per example holding ``start_pos``,
        ``start_score``, ``end_pos``, ``end_score``, ``answer`` and ``score``.
        ``answer`` is the empty string when the selected span starts at
        position 0. The answer text is decoded with the module-level
        ``tokenizer``.
    """
    token_inputs = {k:v.to(device) for k,v in token_inputs.items()}

    model = model.to(device)
    model.eval()
    with torch.no_grad():
        output = model(**token_inputs)

    start_logits = output.start_logits.cpu().detach().numpy()
    end_logits = output.end_logits.cpu().detach().numpy()
    input_ids = token_inputs['input_ids'].to("cpu")

    result_dict_list = []
    for idx, (example_start_logits, example_end_logits) in enumerate(zip(start_logits, end_logits)):
        (start, end), (start_score, end_score), score = _select_best_span(
            example_start_logits, example_end_logits, max_answer_len
        )

        pred_answer = ""
        # Position 0 is the fallback / "no answer" span and decodes to nothing.
        if start != 0 and end != 0:
            pred_answer = tokenizer.decode(input_ids[idx][start:end+1])

        result_dict_list.append({
            'start_pos': start,
            'start_score': start_score,
            'end_pos': end,
            'end_score': end_score,
            'answer': pred_answer,
            'score': score,
        })

    return result_dict_list





    


In [75]:
# answer_question_from_tokenized_inputs(model,token_inputs,'cuda')

In [76]:
def answer_question_from_context(context_list,question_list,tokenizer,device):
    """Answer each question against its paired context.

    Every (question, context) pair is tokenized with context-only truncation
    and padding to the module-level ``max_length``; the encodings are stacked
    into ``torch.long`` tensors on ``device`` and handed, together with the
    module-level ``model``, to ``answer_question_from_tokenized_inputs``.

    Returns:
        The list of result dicts produced by
        ``answer_question_from_tokenized_inputs``.
    """
    encodings = [
        tokenizer(
            text = question,
            text_pair = context,
            truncation = "only_second",
            add_special_tokens = True,
            max_length = max_length,
            padding = "max_length",
            return_offsets_mapping = False,
        )
        for context,question in zip(context_list,question_list)
    ]

    # Pivot the per-example encodings into one list per field.
    columns = pd.DataFrame(encodings).to_dict("list")

    batch = {}
    for name,values in columns.items():
        batch[name] = torch.tensor(values,dtype = torch.long).to(device)

    return answer_question_from_tokenized_inputs(model,batch,device = device)

In [105]:
sample_df = dev_data_df.head(10)
context_list = sample_df['context']
question_list = sample_df['question']
real_answer_list = sample_df['answer_text']

In [106]:
predict_result = answer_question_from_context(context_list,question_list,tokenizer,'cuda')

In [107]:
predict_result

[{'start_pos': 54,
  'start_score': 9.47926,
  'end_pos': 54,
  'end_score': 9.101431,
  'answer': 'france',
  'score': 86.27483},
 {'start_pos': 40,
  'start_score': 9.09523,
  'end_pos': 45,
  'end_score': 9.475946,
  'answer': '10th and 11th centuries',
  'score': 86.18591},
 {'start_pos': 0,
  'start_score': 8.076175,
  'end_pos': 0,
  'end_score': 8.088457,
  'answer': '',
  'score': 65.32379},
 {'start_pos': 0,
  'start_score': 8.994506,
  'end_pos': 0,
  'end_score': 8.857089,
  'answer': '',
  'score': 79.66514},
 {'start_pos': 44,
  'start_score': 8.427077,
  'end_pos': 45,
  'end_score': 8.577871,
  'answer': '10th',
  'score': 72.286385},
 {'start_pos': 0,
  'start_score': 10.277268,
  'end_pos': 0,
  'end_score': 9.69535,
  'answer': '',
  'score': 99.64171},
 {'start_pos': 0,
  'start_score': 9.1655035,
  'end_pos': 0,
  'end_score': 9.256594,
  'answer': '',
  'score': 84.84134},
 {'start_pos': 0,
  'start_score': 9.420945,
  'end_pos': 0,
  'end_score': 10.140012,
  'ans

In [108]:
real_answer_list.to_list()

['France',
 'in the 10th and 11th centuries',
 'Denmark, Iceland and Norway',
 'Rollo',
 'the first half of the 10th century',
 '',
 '',
 '',
 '',
 'William the Conqueror']

### Evaluate with the official function

In [81]:
from torch.utils.data import DataLoader
from tqdm import tqdm

In [87]:
# train_data_path = os.path.join(squad_v2_dir, "train-v2.0.json")
dev_data_path = os.path.join(squad_v2_dir, 'dev-v2.0.json')

# train_data_dict = load_data(train_data_path, True)
dev_data_dict = load_data(dev_data_path, load_impossible_answer=False)

In [88]:
dev_data_df = pd.DataFrame(dev_data_dict)
dev_feature_df = dev_data_df.apply(lambda df: prepare_feature(df),axis = 1)
dev_feature_df = pd.DataFrame(list(dev_feature_df))
DevTokenizedDataset  = SQUADTokenizedDataSet(dev_feature_df,"cuda")

In [89]:
DevTokenizedLoader = DataLoader(DevTokenizedDataset,batch_size=32,shuffle=False)

In [90]:
all_result_dict_list = []

In [91]:
# Run inference over the dev set batch by batch and collect the per-example
# result dicts in `all_result_dict_list`, in loader order.
for token_inputs in tqdm(DevTokenizedLoader):
    result_dict_list = answer_question_from_tokenized_inputs(model,token_inputs,device = 'cuda')
    all_result_dict_list.extend(result_dict_list)

100%|██████████| 372/372 [08:05<00:00,  1.30s/it]


In [92]:
predict_answers = [dict_['answer'] for dict_ in all_result_dict_list]

In [93]:
dev_data_df['prection_answer'] = predict_answers

In [94]:
preds = dev_data_df[['id','prection_answer']].set_index('id').T.to_dict('records')[0]

In [95]:
dev_dataset = read_json(dev_data_path)

In [96]:
import sys
# import official evaluation function
sys.path.append(squad_v2_dir)

In [97]:
import evaluate as evaluate_utils 

In [98]:
exact_raw,f1_raw = evaluate_utils.get_raw_scores(dev_dataset,preds)

In [99]:
metrics = evaluate_utils.make_eval_dict(exact_raw,f1_raw)  

In [100]:
metrics
# increase the size of the training data to improve the metrics

OrderedDict([('exact', 78.657458098206),
             ('f1', 81.93571504516356),
             ('total', 11873)])