# The dataset

### The dataset is called SQuAD (the Stanford Question Answering Dataset).

### Each datapoint consists of a question, a context that may contain the answer to the question, the character index at which the answer starts in the context, and the answer text.


In [1]:
from datasets import load_dataset

# Load SQuAD by name and print the resulting DatasetDict so the available
# splits, their features and their row counts are visible.
dataset = load_dataset(path="squad")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [2]:
# Preview the first five training questions next to their raw answer records.
for question_text, answer_record in zip(
    dataset["train"]["question"][:5], dataset["train"]["answers"][:5]
):
    print(f"{question_text} => {answer_record} \n")

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? => {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]} 

What is in front of the Notre Dame Main Building? => {'text': ['a copper statue of Christ'], 'answer_start': [188]} 

The Basilica of the Sacred heart at Notre Dame is beside to which structure? => {'text': ['the Main Building'], 'answer_start': [279]} 

What is the Grotto at Notre Dame? => {'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]} 

What sits on top of the Main Building at Notre Dame? => {'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]} 



# This function adds the end index of each answer to the dataset and performs some sanity checks


In [3]:
def compute_end_index(answers, contexts):
    """Flatten answer records and add each answer's end character index.

    Every input record is read as ``answer["text"][0]`` and
    ``answer["answer_start"][0]``, i.e. only the first text/start pair of a
    record is used. The input records are NOT modified: a shallow copy of each
    record is returned with "text" and "answer_start" replaced by those scalar
    values and a new "answer_end" key (exclusive end offset) added. Leaving the
    inputs untouched keeps the function safe to call more than once on the
    same list.

    Args:
        answers: Iterable of answer dicts with list-valued "text" and
            "answer_start" entries.
        contexts: Iterable of context strings, aligned with ``answers``.

    Returns:
        A tuple ``(fixed_answers, contexts)`` where ``fixed_answers`` is a new
        list of flattened answer dicts and ``contexts`` is the argument,
        returned unchanged.

    Raises:
        AssertionError: If a start index is negative, an answer text is empty
            or whitespace-only, or the context slice
            ``context[answer_start:answer_end]`` does not equal the answer text.
    """
    fixed_answers = []
    for answer, context in zip(answers, contexts):
        gold_text = answer["text"][0]
        start_idx = answer["answer_start"][0]

        # Make sure the starting index is valid and there is an answer
        assert start_idx >= 0 and len(gold_text.strip()) > 0, (
            f"invalid answer record: start={start_idx!r}, text={gold_text!r}"
        )
        end_idx = start_idx + len(gold_text)

        # Make sure the corresponding context matches the actual answer
        assert context[start_idx:end_idx] == gold_text, (
            f"context[{start_idx}:{end_idx}] does not match answer {gold_text!r}"
        )

        # Copy instead of mutating the caller's record; updating existing keys
        # keeps their position, and "answer_end" is appended last.
        fixed = dict(answer)
        fixed["text"] = gold_text
        fixed["answer_start"] = start_idx
        fixed["answer_end"] = end_idx
        fixed_answers.append(fixed)
    return fixed_answers, contexts

In [4]:
# Print only the first raw answer record to inspect its structure.
for first_answer in dataset["train"]["answers"][:1]:
    print(first_answer)

{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}


In [13]:
# Training split: flatten the answers, add end indices and keep the questions.
print("Training data corrections")
train_answers, train_contexts = compute_end_index(
    answers=dataset["train"]["answers"],
    contexts=dataset["train"]["context"],
)
train_questions = dataset["train"]["question"]

# Validation split: same processing; it is used as the test set from here on.
print("\nValidation data correction")
test_answers, test_contexts = compute_end_index(
    answers=dataset["validation"]["answers"],
    contexts=dataset["validation"]["context"],
)
test_questions = dataset["validation"]["question"]

Training data corrections

Validation data correction


# Implementing BERT


In [8]:
from transformers import BertTokenizerFast

# Use the pretrained "uncased" BERT tokenizer, i.e. the variant that makes no
# distinction between uppercase and lowercase text.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Testing the tokenizer


In [10]:
context = "this is the context"
question = "this is the question"

# Arguments passed to the tokenizer below:
#   text      - a single text sequence or a batch of sequences to encode
#   text_pair - an optional second sequence (or batch) encoded together with
#               `text`; useful for paired inputs such as question and context
# (Written as real comments: a bare string literal here would be an expression
# statement that is evaluated and discarded, not documentation.)
token_ids = tokenizer(
    text=context, text_pair=question, padding=False, return_tensors="tf"
)
print(token_ids)

{'input_ids': <tf.Tensor: shape=(1, 11), dtype=int32, numpy=
array([[ 101, 2023, 2003, 1996, 6123,  102, 2023, 2003, 1996, 3160,  102]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [12]:
# Map the ids of the single encoded pair back to their token strings.
first_sequence_ids = token_ids["input_ids"].numpy()[0]
print(tokenizer.convert_ids_to_tokens(first_sequence_ids))

['[CLS]', 'this', 'is', 'the', 'context', '[SEP]', 'this', 'is', 'the', 'question', '[SEP]']


# Encoding the train and test dataset


In [14]:
# Encode the training (context, question) pairs, with truncation and padding
# enabled, returning TensorFlow tensors.
train_encodings = tokenizer(
    text=train_contexts,
    text_pair=train_questions,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# Encode the test pairs with the same settings.
test_encodings = tokenizer(
    text=test_contexts,
    text_pair=test_questions,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

train_input_ids_shape = train_encodings["input_ids"].shape
print(f"train_encodings.shape: {train_input_ids_shape}")

train_encodings.shape: (87599, 512)


In [16]:
# Convert the character-based indices to token-based indices


def replace_char_with_token_indices(encodings, answers, fallback_position=None):
    """Add token-based answer positions to ``encodings`` in place.

    For every answer ``i`` the character offsets ``answers[i]["answer_start"]``
    and ``answers[i]["answer_end"] - 1`` (the last character of the answer) are
    converted to token indices with ``encodings.char_to_token(i, offset)``.
    Whenever that call returns ``None`` the position is replaced by a fallback
    index. The collected lists are stored on ``encodings`` under the keys
    "start_positions" and "end_positions" via ``encodings.update``.

    Args:
        encodings: Object providing ``char_to_token(batch_index, char_index)``
            and ``update(mapping)``, e.g. the BatchEncoding returned by the
            tokenizer.
        answers: Sequence of dicts with integer "answer_start" and
            "answer_end" entries.
        fallback_position: Token index used when a character offset cannot be
            mapped to a token. When left as ``None`` it defaults to
            ``tokenizer.model_max_length - 1``, read from the notebook-level
            ``tokenizer`` only if a fallback is actually needed.

    Returns:
        None. ``encodings`` is updated in place, and a summary line with the
        number of answers that needed a fallback is printed.
    """
    start_positions = []
    end_positions = []
    n_updates = 0

    for i, answer in enumerate(answers):
        # Get the token position for both start and end char positions
        start_token = encodings.char_to_token(i, answer["answer_start"])
        end_token = encodings.char_to_token(i, answer["answer_end"] - 1)

        # A None position means the answer passage has been truncated
        # https://huggingface.co/transformers/custom_datasets.html#qa-squad
        if start_token is None or end_token is None:
            n_updates += 1
            if fallback_position is None:
                # Resolved lazily so callers that pass an explicit value (or
                # never hit this branch) do not depend on the global tokenizer.
                fallback_position = tokenizer.model_max_length - 1
            if start_token is None:
                start_token = fallback_position
            if end_token is None:
                end_token = fallback_position

        start_positions.append(start_token)
        end_positions.append(end_token)

    print("{}/{} had answers truncated".format(n_updates, len(answers)))
    encodings.update(
        {"start_positions": start_positions, "end_positions": end_positions}
    )

In [17]:
# Attach token-level start/end positions to both sets of encodings (in place).
replace_char_with_token_indices(encodings=train_encodings, answers=train_answers)
replace_char_with_token_indices(encodings=test_encodings, answers=test_answers)

10/87599 had answers truncated
8/10570 had answers truncated
