<a href="https://colab.research.google.com/github/NLP613-Metaplexia/assignment3/blob/main/bert_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook Card: Fine-tuning a BERT model
* * *

This notebook describes how to fine-tune a pre-trained `BERT` model for the following downstream tasks:

1. Classification on SST-2
2. Question answering on SQuAD (v2)

## Pre-requisites

* Setup preferred encoding
* Installation of required libraries

In [1]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

In [2]:
# Install the training/evaluation libraries used by this notebook. No versions are pinned,
# so the resolved versions depend on when the cell is run.
# NOTE(review): prettytable and tqdm are imported later but are not installed here —
# presumably they ship with the Colab runtime; confirm.
!pip install evaluate transformers[torch] datasets rouge_score bert_score
# -U upgrades accelerate if an older version is already present
!pip install accelerate -U

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[torch]
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m72.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-

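**[Redacted]** A Hugging Face access token was pasted here in plain text. Never commit credentials to a notebook: revoke this token and enter a new one interactively at the `huggingface-cli login` prompt in the next cell.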

In [3]:
# Install the Hugging Face Hub client and log in interactively.
# The token is typed at the prompt, so it does not need to appear in the notebook source.
!python -m pip install huggingface_hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'stor

## Importing required *packages*

In [15]:
import os
import torch
import evaluate
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, AutoModelForQuestionAnswering, DefaultDataCollator
from datasets import load_dataset, DatasetDict
import collections
from tqdm import tqdm
from transformers import pipeline
from prettytable import PrettyTable


## Required parameters for finetuning

Fill the rendered form below (right side) and then run the cell below

In [5]:
# Take user inputs

class Arguments:
    """Plain attribute container for the run configuration filled in through the Colab form."""
    pass
args = Arguments()

args.epoch = 5 # @param {type:"number"}
args.model = "temporary0-0name/run_opt" # @param {type:"string"}
args.batch = 32 # @param {type:"number"}
# NOTE(review): 3e-4 is well above the 2e-5 to 5e-5 range usually used when fine-tuning BERT;
# the logged runs in this notebook stop improving after the first epoch — consider lowering it.
args.lr = 3e-4 # @param {type:"number"}
args.cuda_device = "0" # @param {type:"string"}
# Declared as an integer field: a "string" field would hand the tokenizer a str max_length
# as soon as the value is edited through the form.
args.max_context_length = 512 # @param {type:"integer"}
args.only_evaluate = False # @param {type:"boolean"}
args.random_seed = 1 # @param {type:"number"}
args.test_split = 0.2 # @param {type:"number"}

## Environment Setup

1. Setting seeds for reproducibility
2. Configurations for hugging face package
3. GPU/CPU device setup

In [6]:
# GPU/CPU device setup
# CUDA_VISIBLE_DEVICES only takes effect if it is exported before the first CUDA call,
# so it is set ahead of any torch.cuda usage below.
CUDA = args.cuda_device
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Setting seeds for reproducibility
random_seed = args.random_seed
torch.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)  # seeds every visible GPU, not only the current one
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)

# Configurations for training models
model_checkpoint = args.model # Huggingface model repo name
BATCH_SIZE = args.batch
EPOCH = args.epoch # Number of epochs
LR = args.lr # Learning rate. Needs hyperparameter tuning to improve
MAX_LENGTH = int(args.max_context_length) # Cast guards against the form delivering the value as a string

# Loading tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

(…)n_opt/resolve/main/tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

(…)ry0-0name/run_opt/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)name/run_opt/resolve/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

(…)opt/resolve/main/special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [14]:
def model_size_and_parameters(model):
    """Print the model size and a per-parameter table, and return the trainable count.

    Args:
        model: object exposing `parameters()` and `named_parameters()` (e.g. a torch module).

    Returns:
        Total number of elements across parameters whose `requires_grad` is true.
    """
    # Overall size counts every parameter, frozen or not
    model_size = 0
    for tensor in model.parameters():
        model_size += tensor.numel()
    print(f"bert-base-uncased size: {model_size/1000**2:.1f}M parameters")

    # The table and the returned total only cover trainable parameters
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if parameter.requires_grad:
            count = parameter.numel()
            table.add_row([name, count])
            total_params += count

    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params


## Fine-tune task (a): Classification


### Load SST-2 dataset and perform 80-20 split

The split is stratified on `label` so that the train and test sets keep the same class balance.

In [None]:
# Load the SST-2 training split and carve out a label-stratified hold-out set.
# The split seed comes from the notebook configuration instead of a hard-coded literal,
# so changing `args.random_seed` changes the split as well.
cls_dataset = load_dataset("sst2", split="train")
cls_dataset = cls_dataset.train_test_split(test_size=args.test_split, seed=args.random_seed, stratify_by_column="label")
print(cls_dataset)

Downloading builder script:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 53879
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 13470
    })
})


### Perform tokenization for classification

In [None]:
def cls_preprocess_function(sample):
    """Tokenize a batch of sentences and carry the class labels along.

    Args:
        sample: batch with a "sentence" column and a "label" column.

    Returns:
        The output of the module-level `tokenizer` (called with truncation enabled)
        with the labels stored under "labels".
    """
    encoded = tokenizer(sample["sentence"], truncation=True)
    encoded["labels"] = sample["label"]
    return encoded
# Tokenize both splits in batches; each row gains the tokenizer outputs and a "labels" column
tokenized_dataset = cls_dataset.map(cls_preprocess_function, batched=True) # This will take 10 min

Map:   0%|          | 0/53879 [00:00<?, ? examples/s]

Map:   0%|          | 0/13470 [00:00<?, ? examples/s]

### Function definition for metrics

1. Accuracy
2. F1
3. Recall
4. Precision

In [None]:
# Load the four classification metrics from the `evaluate` library; cls_compute_metrics reads these module-level objects
accuracy = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load('recall')
precision_metric = evaluate.load("precision")

def cls_compute_metrics(eval_pred):
    """Compute accuracy, F1, recall and precision for a batch of predictions.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is the argmax of
            the logits over the last axis.

    Returns:
        One dict merging the outputs of the four module-level metric objects,
        in the order accuracy, F1, recall, precision.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Start from the accuracy result and fold the remaining metrics into it
    result = accuracy.compute(predictions=predictions, references=labels)
    for metric in (f1_metric, recall_metric, precision_metric):
        result.update(metric.compute(predictions=predictions, references=labels))

    return result


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

### Starting finetuning & Evaluation of the model on all the metrics

In [18]:
# Collator that pads the examples of each batch using the notebook tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Class-id <-> class-name lookups for the two SST-2 labels
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# Load the checkpoint with a 2-way classification head, then move it to the selected device
cls_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)
cls_model = cls_model.to(device)


(…)0-0name/run_opt/resolve/main/config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at temporary0-0name/run_opt and are newly initialized: ['classifier.bias', 'bert.pooler.dense.weight', 'classifier.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Training configuration for the SST-2 classifier
training_args = TrainingArguments(
    num_train_epochs=EPOCH,
    output_dir="data/sst2",
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=1, # Number of models that need to be saved
    warmup_steps=500,
    fp16=torch.cuda.is_available(), # Mixed precision needs a GPU; fall back to fp32 on CPU
    seed=args.random_seed, # Without this the Trainer uses its own default seed instead of the configured one
    report_to = "tensorboard",
    push_to_hub=True, # Requires the Hugging Face login performed earlier in the notebook
)

trainer = Trainer(
    model=cls_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=cls_compute_metrics
)


# Either fine-tune (evaluating after every epoch) or only score the loaded weights
if not args.only_evaluate:
    trainer.train()
else:
    eval_results = trainer.evaluate()
    print(eval_results)

Downloading (…)lve/main/config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at temporary0-0name/run_opt and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1,0.4964,0.487948,0.804677,0.834289,0.881421,0.791941
2,0.6953,0.686381,0.557832,0.716165,1.0,0.557832
3,0.698,0.689114,0.557832,0.716165,1.0,0.557832
4,0.6944,0.68713,0.557832,0.716165,1.0,0.557832
5,0.6873,0.686492,0.557832,0.716165,1.0,0.557832


#### Parameter count re-check after fine-tuning

In [19]:
# Print the per-parameter table for the classification model; the returned trainable-parameter count becomes the cell output
model_size_and_parameters(cls_model)

bert-base-uncased size: 109.5M parameters
+---------------------------------------------------------+------------+
|                         Modules                         | Parameters |
+---------------------------------------------------------+------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |
|        bert.embeddings.position_embeddings.weight       |   393216   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |
|             bert.embeddings.LayerNorm.weight            |    768     |
|              bert.embeddings.LayerNorm.bias             |    768     |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |
|      bert.encoder.layer.0.attention.self.query.bias     |    768     |
|      bert.encoder.layer.0.attention.self.key.weight     |   589824   |
|       bert.encoder.layer.0.attention.self.key.bias      |    768     |
|     bert.encoder.layer.0.attention.self.value.weight    |   589824   |
|      be

109483778

### Evaluation on public model

Model is saved in the huggingface repository - `Hitesh1501/sst2`

In [None]:
# Score the published checkpoint on the held-out SST-2 split
task_evaluator = evaluate.evaluator("text-classification")
eval_results = task_evaluator.compute(
    # What to evaluate and with which metrics
    model_or_pipeline="Hitesh1501/sst2",
    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    # Which data to evaluate on, and how its columns and labels are named
    data=cls_dataset["test"],
    input_column="sentence",
    label_column="label",
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
)
print(eval_results)

{'accuracy': 0.5578322197475872, 'recall': 1.0, 'precision': 0.5578322197475872, 'f1': 0.716164696911933, 'total_time_in_seconds': 208.93013424499986, 'samples_per_second': 64.4713126168987, 'latency_in_seconds': 0.015510774628433548}


## Fine-tune task (b): Question Answering

### Load the SQuAD v2 dataset and perform an 80-20 split

In [8]:
# Load the SQuAD v2 training split and hold out a test portion.
# The split seed comes from the notebook configuration instead of a hard-coded literal.
qa_dataset = load_dataset("squad_v2", split="train")
qa_dataset = qa_dataset.train_test_split(test_size=args.test_split, seed=args.random_seed)
print(qa_dataset)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 104255
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 26064
    })
})


In this dataset, given a `question`, the model has to predict the start and end positions of the `answer` span inside the given `context`.

Sample data is as follows

In [9]:
# Show the first training example
qa_dataset["train"][0]

{'id': '57304755b2c2fd1400568b67',
 'title': 'The_Blitz',
 'context': 'From the German point of view, March 1941 saw an improvement. The Luftwaffe flew 4,000 sorties that month, including 12 major and three heavy attacks. The electronic war intensified but the Luftwaffe flew major inland missions only on moonlit nights. Ports were easier to find and made better targets. To confuse the British, radio silence was observed until the bombs fell. X- and Y-Gerät beams were placed over false targets and switched only at the last minute. Rapid frequency changes were introduced for X-Gerät, whose wider band of frequencies and greater tactical flexibility ensured it remained effective at a time when British selective jamming was degrading the effectiveness of Y-Gerät.',
 'question': 'Why were ports better targets?',
 'answers': {'text': ['Ports were easier to find'], 'answer_start': [251]}}

### Perform tokenization for Question Answering

In [None]:
# # Tentative
# qa_dataset = DatasetDict({
#     "train": qa_dataset["test"],
#     "test": qa_dataset["test"]
# })

In [10]:
def _context_token_span(sequence_ids):
    """Return the (first, last) token indices whose sequence id is 1, i.e. the context part."""
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    # The bound check keeps this safe if the context runs to the very last token
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    return context_start, idx - 1


def qa_preprocess_function(sample):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Uses the module-level `tokenizer` and `MAX_LENGTH`. Inputs are encoded as
    [question][SEP][context]; only the context (second sequence) is truncated.

    Args:
        sample: batch with "question", "context" and "answers" columns. Each entry
            of "answers" holds a "text" list and an "answer_start" list of character
            offsets into the context; only the first answer is used.

    Returns:
        The tokenizer output without "offset_mapping" and with two added lists,
        "start_positions" and "end_positions" (token indices). Unanswerable
        questions, and answers not fully contained in the kept context, are
        labelled (0, 0).
    """
    questions = [q.strip() for q in sample["question"]]
    # Plain Python lists are enough here: the offsets are only compared against ints,
    # so no tensor conversion is requested.
    model_inputs = tokenizer(
        questions,
        sample["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = model_inputs.pop("offset_mapping")
    answers = sample["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        if len(answer["answer_start"]) == 0: # Unanswerable question: label the span as (0, 0)
            start_positions.append(0)
            end_positions.append(0)
            continue
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0]) # The dataset has no end offset, so derive it from the answer length

        context_start, context_end = _context_token_span(model_inputs.sequence_ids(i))

        # If the answer is not fully inside the (possibly truncated) context, label it (0, 0).
        # Both ends are checked: a partially truncated answer would otherwise get an end
        # label pointing past the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) that ends at or after the answer end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    # Adding all the answer positions to the model input labels
    model_inputs["start_positions"] = start_positions
    model_inputs["end_positions"] = end_positions

    return model_inputs
# Tokenize both splits in batches and drop the raw columns, leaving only model inputs and span labels.
# NOTE(review): this rebinds `tokenized_dataset`, the same name the classification section uses —
# re-run the matching tokenization cell before training either task.
tokenized_dataset = qa_dataset.map(qa_preprocess_function, batched=True, remove_columns=qa_dataset["train"].column_names) # This will take 10 min

Map:   0%|          | 0/104255 [00:00<?, ? examples/s]

Map:   0%|          | 0/26064 [00:00<?, ? examples/s]

In [11]:
# Inspect the columns and row counts that remain after tokenization
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 104255
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 26064
    })
})

In [None]:
# test
# tokenized_dataset["train"][0]

### Function definition for metrics
1. SQuAD metric
2. F1
3. METEOR
4. BLEU
5. ROUGE

In [12]:
# Load the metric objects that qa_compute_metrics reads at module level
squad_metric = evaluate.load("squad_v2")
meteor = evaluate.load('meteor')
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge") # A metric in which the value is between 0 and 1; higher the better
# Exact Match and F1 are computed in the squad metric

# NOTE(review): these two module-level values are not referenced by the visible code;
# postprocess_qa_predictions declares its own defaults with the same values.
n_best_size = 20
max_answer_length = 30

def qa_compute_metrics(predicted_answers, references):
    """Score predicted answers with the SQuAD, ROUGE, BLEU and METEOR metrics.

    Args:
        predicted_answers: list of dicts carrying at least "prediction_text",
            passed as-is to the SQuAD metric.
        references: list of dicts carrying "answers" with a "text" list,
            passed as-is to the SQuAD metric.

    Returns:
        Dict with "exact_match" and "f1" from the SQuAD metric, the ROUGE result
        under "rouge", the BLEU score under "bleu", plus the METEOR output merged in.
    """
    # Exact match and F1 come straight from the SQuAD metric on the structured inputs
    squad_score = squad_metric.compute(predictions=predicted_answers, references=references)
    result = {"exact_match": squad_score["exact"], "f1": squad_score["f1"]}

    # The text-overlap metrics need plain strings: the predicted text and the first
    # gold answer (empty string when the question has no answer)
    predicted_texts = [pred["prediction_text"] for pred in predicted_answers]
    reference_texts = []
    for ref in references:
        gold_texts = ref["answers"]["text"]
        reference_texts.append(gold_texts[0] if len(gold_texts) > 0 else "")

    result["rouge"] = rouge.compute(predictions=predicted_texts, references=reference_texts, use_stemmer=True)
    result["bleu"] = bleu.compute(predictions=predicted_texts, references=reference_texts)['bleu']
    result.update(meteor.compute(predictions=predicted_texts, references=reference_texts))

    return result

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.93k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

### Starting finetuning

In [13]:
data_collator = DefaultDataCollator()
# Load model
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


# Training configuration for the SQuAD question-answering model
training_args = TrainingArguments(
    num_train_epochs=EPOCH,
    output_dir="data/squad",
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=16, # More than this leads to overflow
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=1, # Number of models that need to be saved
    warmup_steps=500,
    fp16=torch.cuda.is_available(), # Mixed precision needs a GPU; fall back to fp32 on CPU
    seed=args.random_seed, # Without this the Trainer uses its own default seed instead of the configured one
    report_to = "tensorboard",
    push_to_hub=True, # Requires the Hugging Face login performed earlier in the notebook
)

# No compute_metrics here: only the loss is reported per epoch; answer-level metrics
# are computed separately from the post-processed predictions.
trainer = Trainer(
    model=qa_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


# Either fine-tune (evaluating after every epoch) or only score the loaded weights
if not args.only_evaluate:
    trainer.train()
else:
    eval_results = trainer.evaluate()
    print(eval_results)


(…)0-0name/run_opt/resolve/main/config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at temporary0-0name/run_opt and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,6.2429,6.238281
2,6.2451,6.238281
3,6.2454,6.238281


KeyboardInterrupt: ignored

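Running 3 epochs took about 4 hours, so we stopped the run early (the `KeyboardInterrupt` above) because Colab times out its GPU resources. Note that the validation loss stays at ≈ 6.238 ≈ ln(512) in every epoch — the loss of a uniform guess over the 512 token positions — so the QA head has not learned anything under these settings; the learning rate (3e-4) is a likely cause and should be lowered before re-running.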

#### Parameter count re-check

In [17]:
# Print the per-parameter table for the QA model; the returned trainable-parameter count becomes the cell output
model_size_and_parameters(qa_model)

bert-base-uncased size: 108.9M parameters
+---------------------------------------------------------+------------+
|                         Modules                         | Parameters |
+---------------------------------------------------------+------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |
|        bert.embeddings.position_embeddings.weight       |   393216   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |
|             bert.embeddings.LayerNorm.weight            |    768     |
|              bert.embeddings.LayerNorm.bias             |    768     |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |
|      bert.encoder.layer.0.attention.self.query.bias     |    768     |
|      bert.encoder.layer.0.attention.self.key.weight     |   589824   |
|       bert.encoder.layer.0.attention.self.key.bias      |    768     |
|     bert.encoder.layer.0.attention.self.value.weight    |   589824   |
|      be

108893186

#### Evaluating the model on all the metrics

In [None]:
def prepare_validation_features(examples):
    """Tokenize evaluation examples and keep what is needed to map logits back to text.

    Uses the module-level `tokenizer` and `MAX_LENGTH`.

    Args:
        examples: batch with "id", "question" and "context" columns.

    Returns:
        The tokenizer output without "overflow_to_sample_mapping", with an added
        "example_id" list (source example id per feature) and with every
        "offset_mapping" entry outside the context replaced by None.
    """
    encodings = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=MAX_LENGTH,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    # feature index -> index of the example it was produced from
    feature_to_example = encodings.pop("overflow_to_sample_mapping")

    # Tokens of the context carry sequence id 1
    context_sequence_id = 1

    example_ids = []
    encodings["example_id"] = example_ids

    for feature_idx in range(len(encodings["input_ids"])):
        seq_ids = encodings.sequence_ids(feature_idx)
        example_ids.append(examples["id"][feature_to_example[feature_idx]])

        # Blank out offsets of non-context tokens so later code can tell, from the
        # offset alone, whether a token position belongs to the context
        masked_offsets = []
        for token_idx, span in enumerate(encodings["offset_mapping"][feature_idx]):
            masked_offsets.append(span if seq_ids[token_idx] == context_sequence_id else None)
        encodings["offset_mapping"][feature_idx] = masked_offsets

    return encodings

In [None]:
# Build evaluation features for the test split; the raw columns are removed so only
# the fields returned by prepare_validation_features remain
test_features = qa_dataset["test"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=qa_dataset["test"].column_names
)

Map:   0%|          | 0/26064 [00:00<?, ? examples/s]

In [None]:
def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn start/end logits into one answer string per example.

    Uses the module-level `tokenizer` (for its CLS token id) and `tqdm`.

    Args:
        examples: the original examples; must support `examples["id"]`, `len()` and
            iteration over rows carrying "id" and "context".
        features: tokenized features carrying "example_id", "offset_mapping"
            (None outside the context) and "input_ids".
        raw_predictions: pair `(all_start_logits, all_end_logits)`, indexed by feature.
        n_best_size: number of top start and top end positions combined per feature.
        max_answer_length: maximum answer length in tokens.

    Returns:
        OrderedDict mapping example id to the predicted answer text; the empty string
        when the null (CLS) score is at least as high as the best span score.
    """
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Lowest "no answer" (CLS) score over this example's features
        valid_answers = []

        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Fetch the feature row once instead of once per field.
            feature = features[feature_index]
            # Maps positions in the logits back to character spans of the original context.
            offset_mapping = feature["offset_mapping"]

            # Update minimum null prediction: keep the SMALLEST null score across features.
            cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first of equally scored candidates, like the stable sort it replaces.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}

        # Predict "no answer" when the null score is at least as high as the best span score.
        # An example without any feature has no null score; its best answer text is used as-is.
        null_wins = min_null_score is not None and best_answer["score"] <= min_null_score
        predictions[example["id"]] = "" if null_wins else best_answer["text"]

    return predictions

In [None]:
# Run the trainer's model over the tokenized test features; `.predictions` is consumed by the post-processing cell below
raw_predictions = trainer.predict(test_features)

In [None]:
# Convert the logits into one answer string per test example
final_predictions = postprocess_qa_predictions(qa_dataset["test"], test_features, raw_predictions.predictions)
# Wrap predictions and gold answers in the record layout handed to qa_compute_metrics;
# no_answer_probability is a constant 0.0 here, not a model output
formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in qa_dataset["test"]]
qa_compute_metrics(formatted_predictions, references)

### Evaluation on public model

Model is saved in the huggingface repository - `Hitesh1501/squad`

In [None]:
# Preprocessing QA Dataset
test_features = qa_dataset["test"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=qa_dataset["test"].column_names
)

data_collator = DefaultDataCollator()


# Loading model
qa_model = AutoModelForQuestionAnswering.from_pretrained("Hitesh1501/squad")

pipeline = Trainer(
    model=qa_model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

raw_predictions = pipeline.predict(test_features)

# Post Processing
final_predictions = postprocess_qa_predictions(qa_dataset["test"], test_features, raw_predictions.predictions)
formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in qa_dataset["test"]]

qa_compute_metrics(formatted_predictions, references)

Post-processing 200 example predictions split into 200 features.


100%|██████████| 200/200 [00:01<00:00, 153.96it/s]


{'exact_match': 0.0,
 'f1': 1.1297538528087727,
 'rouge': {'rouge1': 0.012463295914611705,
  'rouge2': 0.003759649122807017,
  'rougeL': 0.012401278059172795,
  'rougeLsum': 0.012560224035224034},
 'bleu': 0.009718256075953204,
 'meteor': 0.019486391429812042}