In [1]:
# !pip install datasets
import tensorflow as tf
import transformers
from transformers import AutoTokenizer
from transformers import TFAutoModelForQuestionAnswering
from transformers import pipeline

# Load dataset and metric for evaluation
from datasets import load_dataset, load_metric


from tqdm import tqdm
import pandas as pd
import time
import os
import json
import re
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Global configuration shared by the cells below.

squad_v2 = False # indicates if impossible answers are allowed; also selects which SQuAD dataset is loaded below
model_checkpoint = 'distilbert-base-uncased' # checkpoint name passed to the tokenizer and model `from_pretrained` calls
batch_size = 32 # batch size used when the processed datasets are converted to tf datasets

In [3]:
# Loading the SQuAD dataset

# Use a small slice for quick experiments; the commented numbers are the
# sizes of the full train / validation splits.
train_size = 200 #87599
valid_size = 50 #10570

split = [
    f"train[:{train_size}]",
    f"validation[:{valid_size}]"
]

# The Hub id of SQuAD 2.0 is 'squad_v2' (with an underscore); 'squadv2' is not a valid dataset id.
train, validation = load_dataset('squad_v2' if squad_v2 else 'squad', split=split)



Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
# Show the training split summary (feature names and number of rows)
train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 200
})

In [5]:
# Show the validation split summary (feature names and number of rows)
validation

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 50
})

In [6]:
# Inspect one raw example: id, title, context, question and the answers dict (text + answer_start)
train[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

### Text pre-processing

1. Tokenizer
- Convert text to tokens with ids
- Generate other inputs in the format the model requires

`from_pretrained`:
- gets the tokenizer corresponding to our model
- downloads and caches the vocabulary used when training this model

In [7]:
# AutoTokenizer is already imported in the first cell; repeated here so the cell reads on its own
from transformers import AutoTokenizer

# Initialize the tokenizer that matches `model_checkpoint`
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Check that the tokenizer is a fast tokenizer: the preprocessing below relies on
# features of fast tokenizers (offset mappings and sequence ids)
tokenizer.is_fast

True

In [9]:
# Test the tokenizer on a sample (question, context) pair.
# The result contains `input_ids` and an `attention_mask` for the combined pair.
tokenizer('What day is it today?', 'Today is a gloomy day, it most likely is a Wednesday')

{'input_ids': [101, 2054, 2154, 2003, 2009, 2651, 1029, 102, 2651, 2003, 1037, 24067, 2100, 2154, 1010, 2009, 2087, 3497, 2003, 1037, 9317, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# To handle very long contexts, split the context into multiple chunks with a sliding window between them

# Find the first training sample whose tokenized question + context is longer
# than the 300-token chunk size used for the demonstration in the next cells.
for i, sample in enumerate(train):
  # pass q, context to tokenizer & fetch input_ids
  inp = tokenizer(sample['question'], sample['context'])['input_ids']
  length = len(inp)
  if length > 300: # demo chunk size (the `max_length` passed to the tokenizer below)
    long_sample = train[i]
    break
else:
  # Without this guard the cell would fail below with an unhelpful NameError
  raise ValueError('No training sample is longer than 300 tokens; increase train_size')

long_sample, length

({'id': '5733b5df4776f41900661105',
  'title': 'University_of_Notre_Dame',
  'context': 'In 2014 the Notre Dame student body consisted of 12,179 students, with 8,448 undergraduates, 2,138 graduate and professional and 1,593 professional (Law, M.Div., Business, M.Ed.) students. Around 21–24% of students are children of alumni, and although 37% of students come from the Midwestern United States, the student body represents all 50 states and 100 countries. As of March 2007[update] The Princeton Review ranked the school as the fifth highest \'dream school\' for parents to send their children. As of March 2015[update] The Princeton Review ranked Notre Dame as the ninth highest. The school has been previously criticized for its lack of diversity, and The Princeton Review ranks the university highly among schools at which "Alternative Lifestyles [are] Not an Alternative." It has also been commended by some diversity oriented publications; Hispanic Magazine in 2004 ranked the university ninth 

In [12]:
# Tokenize the long sample into chunks of at most 300 tokens (question + context).
# truncation='only_second' means only the context is truncated; the overflow is
# returned as additional chunks that overlap the previous one according to `stride`.

tokenized_sample = tokenizer(
    long_sample['question'],   # use the named sample rather than the leaked loop variable
    long_sample['context'],
    max_length = 300,
    stride = 100,
    truncation = 'only_second',
    return_overflowing_tokens = True,
)
tokenized_sample


{'input_ids': [[101, 2129, 2116, 2104, 16307, 2015, 2020, 7052, 10289, 8214, 1999, 2297, 1029, 102, 1999, 2297, 1996, 10289, 8214, 3076, 2303, 5031, 1997, 2260, 1010, 20311, 2493, 1010, 2007, 1022, 1010, 4008, 2620, 8324, 2015, 1010, 1016, 1010, 15028, 4619, 1998, 2658, 1998, 1015, 1010, 5354, 2509, 2658, 1006, 2375, 1010, 1049, 1012, 4487, 2615, 1012, 1010, 2449, 1010, 1049, 1012, 3968, 1012, 1007, 2493, 1012, 2105, 2538, 1516, 2484, 1003, 1997, 2493, 2024, 2336, 1997, 9441, 1010, 1998, 2348, 4261, 1003, 1997, 2493, 2272, 2013, 1996, 13608, 11795, 2142, 2163, 1010, 1996, 3076, 2303, 5836, 2035, 2753, 2163, 1998, 2531, 3032, 1012, 2004, 1997, 2233, 2289, 1031, 10651, 1033, 1996, 9173, 3319, 4396, 1996, 2082, 2004, 1996, 3587, 3284, 1005, 3959, 2082, 1005, 2005, 3008, 2000, 4604, 2037, 2336, 1012, 2004, 1997, 2233, 2325, 1031, 10651, 1033, 1996, 9173, 3319, 4396, 10289, 8214, 2004, 1996, 6619, 3284, 1012, 1996, 2082, 2038, 2042, 3130, 6367, 2005, 2049, 3768, 1997, 8906, 1010, 1998, 1996

In [13]:
# 1 sample is broken down into multiple chunks of at most 300 tokens, with overlap.
# input_ids therefore contains one list of token ids per chunk

# checking that no chunk is longer than 300 tokens (the last chunk can be shorter)
[len(x) for x in tokenized_sample['input_ids']]

[300, 159]

In [14]:
# Decode every chunk back to text: the question is repeated at the start of each chunk,
# while the context of this one question is split across the chunks with overlap
for chunk in tokenized_sample['input_ids']:
  print(tokenizer.decode(chunk))

[CLS] how many undergrads were attending notre dame in 2014? [SEP] in 2014 the notre dame student body consisted of 12, 179 students, with 8, 448 undergraduates, 2, 138 graduate and professional and 1, 593 professional ( law, m. div., business, m. ed. ) students. around 21 – 24 % of students are children of alumni, and although 37 % of students come from the midwestern united states, the student body represents all 50 states and 100 countries. as of march 2007 [ update ] the princeton review ranked the school as the fifth highest'dream school'for parents to send their children. as of march 2015 [ update ] the princeton review ranked notre dame as the ninth highest. the school has been previously criticized for its lack of diversity, and the princeton review ranks the university highly among schools at which " alternative lifestyles [ are ] not an alternative. " it has also been commended by some diversity oriented publications ; hispanic magazine in 2004 ranked the university ninth on 

In [15]:
# use return_offsets_mapping to return the (start, end) character span of each token,
# so that token positions can be mapped back to the answer text
# The CLS token has the empty span (0, 0) and is followed by 'how' at (0, 3)

tokenized_sample = tokenizer(
    long_sample['question'],   # use the named sample rather than the leaked loop variable
    long_sample['context'],
    max_length = 300,
    stride = 100,
    truncation = 'only_second',
    return_overflowing_tokens = True,
    return_offsets_mapping = True,
)
tokenized_sample['offset_mapping'][0][:10]

[(0, 0),
 (0, 3),
 (4, 8),
 (9, 14),
 (14, 18),
 (18, 19),
 (20, 24),
 (25, 34),
 (35, 40),
 (41, 45)]

In [16]:
# Verifying that the offset mapping actually corresponds to a word in input_ids (q+context)
# 1st chunk, 1st word - how (skip CLS token)

first_token_id = tokenized_sample['input_ids'][0][1]
print('First input id: ', first_token_id)

print('First word using input_ids: ',tokenizer.convert_ids_to_tokens(first_token_id))

offset_id = tokenized_sample['offset_mapping'][0][1]
print('First offset id: ', offset_id)

# The offsets of question tokens index into the question string of the tokenized sample
print('First word using offset id: ', long_sample['question'][offset_id[0] : offset_id[1]])

First input id:  2129
First word using input_ids:  how
First offset id:  (0, 3)
First word using offset id:  How


In [17]:
# To distinguish between question and context tokens, use sequence_ids:
# None for special tokens (CLS and SEP)
# 0 for question tokens
# 1 for context tokens
# Called without an index, the printed list below describes the first chunk.

seq_id = tokenized_sample.sequence_ids()
print(seq_id)

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None]


In [18]:
# we will now find the first and last token of the answer within the context
# testing if ans is present in first chunk

answer = long_sample['answers']   # use the named sample rather than the leaked loop variable
input_ids = tokenized_sample['input_ids'][0]
offset = tokenized_sample['offset_mapping'][0]

# Character span of the answer inside the original context string
ans_start_char = answer['answer_start'][0]
ans_end_char   = ans_start_char + len(answer['text'][0])

# Token indices (despite the `_char` suffix) of the first and last context token in this chunk
context_start_char = 0
while seq_id[context_start_char] != 1: # 1 indicates start of context
  context_start_char += 1
context_end_char = len(input_ids) - 1
while seq_id[context_end_char] != 1: # read from end until first 1 is encountered -> end of context
  context_end_char -= 1

if offset[context_start_char][0] <= ans_start_char and  offset[context_end_char][1] >= ans_end_char : # if ans lies within current context
  # Walk forward over context tokens until one starts after the answer start.
  # The scan is bounded by the last context token: the special token that follows has
  # offset (0, 0) and would otherwise keep the loop running past the context.
  token_idx = context_start_char
  while token_idx <= context_end_char and offset[token_idx][0] <= ans_start_char :
    token_idx += 1
  chunk_ans_start_pos = token_idx - 1
  # Walk backward from the last context token until one ends before the answer end
  token_idx = context_end_char
  while token_idx >= context_start_char and offset[token_idx][1] >= ans_end_char :
    token_idx -= 1
  chunk_ans_end_pos = token_idx + 1
  print('Ans start and end positions: ', chunk_ans_start_pos,chunk_ans_end_pos)
else:
  print('Ans does not lie in current context (feature)')

Ans start and end positions:  29 32


In [19]:
# Decode the tokens between the computed start and end positions (inclusive, hence the +1)
# and compare with the reference answer. The decoded text can differ in spacing from the
# reference (see the output below) because it is rebuilt from individual tokens.

ans_pos_decoded = tokenizer.decode(
    tokenized_sample['input_ids'][0][chunk_ans_start_pos : chunk_ans_end_pos+1]
)
actual_ans = answer['text'][0]

print(ans_pos_decoded)
print(actual_ans)

8, 448
8,448


### Text pre-processing - Training data

In [20]:
def preprocess_training_data(train_ds, max_length=384, stride=128):
  """Tokenize a batch of SQuAD-style examples into fixed-length training features.

  Long contexts are split into several overlapping chunks ("features"). For each
  feature the token positions of the answer are computed; when the example has no
  answer, or the answer is not fully contained in the feature, both positions
  point at the CLS token.

  Relies on the module-level fast `tokenizer`.

  Args:
    train_ds: batch (mapping of column name -> list) with the columns
      'question', 'context' and 'answers' ({'text': [...], 'answer_start': [...]}).
    max_length: maximum number of tokens (question + context) per feature;
      every feature is padded to this length.
    stride: number of context tokens shared by two consecutive chunks.

  Returns:
    The tokenizer output with 'overflow_to_sample_mapping' and 'offset_mapping'
    removed and 'start_positions' / 'end_positions' added (one entry per feature).
  """
  question = [ q.strip() for q in train_ds['question']] # Cleanup questions - Removing extra spaces
  # Contexts are deliberately NOT stripped: 'answer_start' is a character index into the
  # original context, so removing leading whitespace would shift every answer span.
  context  = list(train_ds['context'])
  pad_on_right = tokenizer.padding_side == 'right' # True -> inputs are (question, context);
                                                   # False -> the order is swapped to (context, question)
  context_seq_id = 1 if pad_on_right else 0        # sequence id that marks context tokens

  # To handle very long contexts, split the context into multiple chunks with a sliding window between them
  inputs = tokenizer(
    question if pad_on_right else context,
    context if pad_on_right else question,
    max_length = max_length,          # max tokens of question + context per chunk
    stride = stride,                  # overlapping tokens between chunks
    truncation = 'only_second' if pad_on_right else 'only_first',       # if question + context is too long, truncate only the context
    return_overflowing_tokens = True, # return chunks that have been truncated as well
    return_offsets_mapping = True,    # return the (start, end) character span of every token
    padding = 'max_length',
    )

  # One context is broken into multiple chunks if it exceeds max_length.
  # sample_mapping[i] is the index of the example that chunk i came from.
  sample_mapping = inputs.pop('overflow_to_sample_mapping')

  # Character spans of every token, used to map the answer's characters to token positions
  offset_mapping = inputs.pop('offset_mapping')

  all_answers = train_ds['answers']

  chunk_ans_start_pos = []
  chunk_ans_end_pos = []

  for i, offset in enumerate(offset_mapping):
    # Fetch input ids and CLS index (used as the "no answer in this chunk" label)
    input_ids = inputs['input_ids'][i]
    cls_index = input_ids.index(tokenizer.cls_token_id)

    # sequence ids tell which tokens belong to the question and which to the context
    seq_id = inputs.sequence_ids(i)

    answer = all_answers[sample_mapping[i]] # answers of the example this chunk came from

    # if no answer is given, label the chunk with the CLS position
    if len(answer['answer_start']) == 0:
      chunk_ans_start_pos.append(cls_index)
      chunk_ans_end_pos.append(cls_index)
      continue

    # character span of the answer in the original context
    ans_start_char = answer['answer_start'][0]
    ans_end_char   = ans_start_char + len(answer['text'][0])

    # token indices of the first and last context token of this chunk
    context_start = 0
    while seq_id[context_start] != context_seq_id:
      context_start += 1

    context_end = len(input_ids) - 1
    while seq_id[context_end] != context_seq_id: # skips padding / special tokens (sequence id None)
      context_end -= 1

    # if the answer is not fully inside this chunk's context -> label with CLS
    if not (offset[context_start][0] <= ans_start_char and offset[context_end][1] >= ans_end_char):
      chunk_ans_start_pos.append(cls_index)
      chunk_ans_end_pos.append(cls_index)
      continue

    # Walk forward over context tokens until one starts after the answer start.
    # The scan is bounded by context_end: special and padding tokens have offset (0, 0),
    # so an unbounded scan would run to the end of the padded sequence whenever the
    # answer starts at the last context token.
    token_idx = context_start
    while token_idx <= context_end and offset[token_idx][0] <= ans_start_char:
      token_idx += 1
    chunk_ans_start_pos.append(token_idx - 1)

    # Walk backward from the last context token until one ends before the answer end
    token_idx = context_end
    while token_idx >= context_start and offset[token_idx][1] >= ans_end_char:
      token_idx -= 1
    chunk_ans_end_pos.append(token_idx + 1)

  # Add start and end positions to inputs
  inputs['start_positions'] = chunk_ans_start_pos
  inputs['end_positions'] = chunk_ans_end_pos

  return inputs


### Apply the function to the entire dataset

In [21]:
# Tokenize both splits. batched=True is required because one example can expand
# into several features, so the mapped function returns more rows than it receives.
processed_train = train.map(
    preprocess_training_data,
    batched=True, # 1 row is being split into multiple chunks/ features
    remove_columns = train.column_names,
)

processed_validation = validation.map(
    preprocess_training_data,
    batched=True, # 1 row is being split into multiple chunks/ features
    remove_columns = validation.column_names, # drop the validation split's own raw columns
)

print('\n')
print('Number of records in original training data: ', len(train))
print('Number of records in processed training data: ',len(processed_train))
print('\n')
print('Number of records in original validation data: ',len(validation))
print('Number of records in processed validation data: ', len(processed_validation))

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]



Number of records in original training data:  200
Number of records in processed training data:  200


Number of records in original validation data:  50
Number of records in processed training data:  50


In [22]:
# Processed training features: the raw columns were replaced by model inputs and answer positions
processed_train

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 200
})

In [23]:
# Processed validation features, built with the same function as the training features
processed_validation

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 50
})

 ## ------------------------------- Pre-processing Complete ------------------------------------

### Fine-tuning the model
Training the model

In [24]:
# Initialize the model

# Already imported in the first cell; repeated here so the cell reads on its own
from transformers import TFAutoModelForQuestionAnswering

# Using from_pretrained to download and cache the model.
# As the log below reports, the question-answering head (qa_outputs) is newly initialized,
# so the model needs fine-tuning (or fine-tuned weights) before its predictions are useful.
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

In [25]:
# Set up training parameters
model_name = model_checkpoint.split('/')[-1] # last path component of the checkpoint name (drops an 'org/' prefix if present)
huggingface_model_name = f'{model_name}-finetuned-squad' # used as the Hub model id by the push-to-hub callback
learning_rate = 2e-5 # initial learning rate passed to create_optimizer
num_epochs = 1 # used to compute the total number of training steps

In [26]:
# Convert the processed datasets to tf datasets that the Keras model can consume.
# Training data is shuffled; validation data keeps its order.
train_ds = model.prepare_tf_dataset(
    processed_train,
    shuffle = True,
    batch_size = batch_size
)

validation_ds = model.prepare_tf_dataset(
    processed_validation,
    shuffle = False,
    batch_size = batch_size
)



In [27]:
# create_optimizer from transformers builds an AdamW optimizer with weight decay
from transformers import create_optimizer

# Total number of optimizer steps: batches per epoch * number of epochs
training_steps = len(train_ds) * num_epochs

# The second return value is not needed here and is discarded
optimizer, _ = create_optimizer(
    init_lr = learning_rate,
    num_train_steps = training_steps,
    num_warmup_steps = 0 # no warmup phase
)


In [28]:
# compile the model
# no need to mention loss as model automatically handles it

# tensorflow is already imported in the first cell
import tensorflow as tf
keras = tf.keras # alias; not referenced by the cells shown in this notebook section

model.compile(optimizer = optimizer, metrics = ['accuracy'])

In [31]:
# Log in to the Hugging Face Hub through the interactive notebook widget
# (the callback defined in the next cell works against a Hub repository).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
# Define callbacks
from transformers.keras_callbacks import PushToHubCallback

# Creating the callback clones the Hub repository into output_dir (see the log below)
push_to_hub_callback = PushToHubCallback(
    output_dir="./qa_model_save",
    tokenizer=tokenizer,
    hub_model_id=huggingface_model_name,
)

# TensorBoard logs are written inside the same directory
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./qa_model_save/logs", histogram_freq=1)

# In the cells shown here, `callbacks` is only referenced by the commented-out model.fit call
callbacks = [push_to_hub_callback, tensorboard_callback]

Cloning https://huggingface.co/Mariah64/distilbert-base-uncased-finetuned-squad into local empty directory.


Download file tf_model.h5:   0%|          | 14.8k/253M [00:00<?, ?B/s]

Download file logs/validation/events.out.tfevents.1716449748.bd1ff78dbe5a.33.1.v2: 100%|##########| 374/374 [0…

Clean file logs/validation/events.out.tfevents.1716449748.bd1ff78dbe5a.33.1.v2: 100%|##########| 374/374 [00:0…

Download file logs/train/events.out.tfevents.1716447577.bd1ff78dbe5a.33.0.v2:   2%|2         | 32.0k/1.28M [00…

Clean file logs/train/events.out.tfevents.1716447577.bd1ff78dbe5a.33.0.v2:   0%|          | 1.00k/1.28M [00:00…

Clean file tf_model.h5:   0%|          | 1.00k/253M [00:00<?, ?B/s]

In [33]:
# Load the previously fine-tuned weights from the repository that PushToHubCallback
# cloned into ./qa_model_save. The path is kept relative to the working directory
# (like the callback's output_dir) instead of hard-coding the absolute '/content/...' path.
checkpoint_path = os.path.join('qa_model_save', 'tf_model.h5')
model.load_weights(checkpoint_path)


# Train the model on the entire dataset
# model.fit(train_ds,
#           validation_data = validation_ds,
#           epochs = num_epochs,
#           callbacks = callbacks,
#           )

# model.save_weights("full_model_trained.h5")

### Make predictions and evaluate metrics on validation dataset

In [34]:
# Checking model output for one batch of validation data

batch = next(iter(validation_ds))
predictions = model.predict_on_batch(batch)
predictions.keys()

# start_logits holds, for every feature in the batch, one raw score (logit, not a probability)
# per token for that token being the start of the answer; end_logits is the same for the end.
# The simplest prediction for a feature is therefore the argmax of its start and end logits.

odict_keys(['start_logits', 'end_logits'])

In [35]:
# One start logit and one end logit (raw scores, not probabilities) per feature in the batch
# and per token position; the output below shows shape (32, 384) for both
predictions.start_logits.shape, predictions.end_logits.shape

((32, 384), (32, 384))

In [36]:
# Take the highest-scoring token per feature as the predicted answer start and end index
import numpy as np
np.argmax(predictions.start_logits, -1), np.argmax(predictions.end_logits, -1)

(array([ 46,  57,  78,  43, 118, 107,  72,  35, 107,  34,  73,  41,  80,
         80, 156,  35,  83,  80,  80,  12,  77,  31,  42,  53,  41,  35,
         42,  77,  11,  44,  27, 133]),
 array([ 47,  58,  92,  44, 118, 109,  75,  37, 109,  36,  76,  42,  83,
         94, 158,  35,  83,  94,  83,  60,  80,  31,  43,  54,  42,  35,
         43,  80,  13,  45,  28, 133]))

In [37]:
# Taking the argmax of each side independently can pair a start position with an
# earlier end position, so instead we:
# - > take the n best start and the n best end candidates
# - > keep only the pairs where the start does not come after the end
# - > score every remaining pair as (start logit + end logit)

# Testing this for the 1st feature

import numpy as np

n_best_logits = 20

start_logit = predictions.start_logits[0]
end_logit   = predictions.end_logits[0]

# indices of the n best logits, best first
start_indexes = np.argsort(start_logit)[::-1][:n_best_logits].tolist()
end_indexes   = np.argsort(end_logit)[::-1][:n_best_logits].tolist()

# the text is left empty here; it is filled in once offsets are available
answers = [
    {'score': start_logit[start_idx] + end_logit[end_idx], 'text': ''}
    for start_idx in start_indexes
    for end_idx in end_indexes
    if start_idx <= end_idx
]

answers[:10]



[{'score': 13.411608, 'text': ''},
 {'score': 11.1234665, 'text': ''},
 {'score': 8.316137, 'text': ''},
 {'score': 7.921116, 'text': ''},
 {'score': 6.510167, 'text': ''},
 {'score': 6.3893547, 'text': ''},
 {'score': 5.5779505, 'text': ''},
 {'score': 5.2802835, 'text': ''},
 {'score': 5.0835423, 'text': ''},
 {'score': 4.6679964, 'text': ''}]

In [38]:
# to fetch the text, we need to re-process the validation_ds and add:
# 1. record_id to map each record to its corr features
# 2. offset map - gives start and end chars of each token

def process_validation_data(val_ds, max_length=384, stride=128):
  """Tokenize a batch of validation examples and keep what post-processing needs.

  Relies on the module-level fast `tokenizer`.

  Args:
    val_ds: batch (mapping of column name -> list) with the columns
      'id', 'question' and 'context'.
    max_length: maximum number of tokens (question + context) per feature;
      every feature is padded to this length.
    stride: number of context tokens shared by two consecutive chunks.

  Returns:
    The tokenizer output without 'overflow_to_sample_mapping', plus:
      'record_id'      - id of the example each feature was created from
      'offset_mapping' - per token, the (start, end) character span in the
                         context, or None for every token that is not part
                         of the context.
  """
  question = [ q.strip() for q in val_ds['question']] # Cleanup questions - Removing extra spaces
  # Contexts are deliberately NOT stripped: the offsets produced here are later used to
  # slice the original context string, so both must refer to the same characters.
  context  = list(val_ds['context'])
  pad_on_right = tokenizer.padding_side == 'right'
  context_index = 1 if pad_on_right else 0 # sequence id that marks context tokens

  # To handle very long contexts, split the context into multiple chunks with a sliding window between them
  inputs = tokenizer(
    question if pad_on_right else context,
    context if pad_on_right else question,
    max_length = max_length, # max tokens of question + context per chunk
    stride = stride, # overlapping tokens between chunks
    truncation = 'only_second' if pad_on_right else 'only_first',
    return_overflowing_tokens = True,
    return_offsets_mapping = True, # return the (start, end) character span of every token
    padding = 'max_length',  # pad every chunk to max_length
    )

  # One context is broken into multiple chunks if it exceeds max_length.
  # sample_mapping[i] is the index of the example that chunk i came from.
  sample_mapping = inputs.pop('overflow_to_sample_mapping')

  record_ids = val_ds['id']

  inputs['record_id']=[]
  for i in range(len(inputs['input_ids'])):
    # sequence ids distinguish question tokens, context tokens and special/padding tokens
    seq_id = inputs.sequence_ids(i)

    # tie the chunk back to its source example
    inputs['record_id'].append(record_ids[sample_mapping[i]])

    # keep offsets only for context tokens; everything else becomes None
    inputs['offset_mapping'][i] = [
        (offset if seq_id[j] == context_index else None)
        for j, offset in enumerate(inputs['offset_mapping'][i])
    ]

  return inputs



In [39]:
# Reprocess the validation data so every feature carries its record_id and offset_mapping
reprocessed_validation = validation.map(
    process_validation_data,
    batched = True,
    remove_columns = validation.column_names,
)

# Convert it to a tf dataset for prediction; order is preserved (shuffle = False) so that
# prediction i lines up with feature i of reprocessed_validation
reprocessed_validation_tf = model.prepare_tf_dataset(
    reprocessed_validation,
    shuffle = False,
    batch_size = batch_size,
)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [40]:
# Make predictions for every validation feature.
# The result exposes start_logits and end_logits arrays (see the output below).

pred_val = model.predict(reprocessed_validation_tf)
pred_val

TFQuestionAnsweringModelOutput(loss=None, start_logits=array([[-3.7142131 , -7.1585927 , -7.2458763 , ..., -8.192327  ,
        -8.149728  , -8.189176  ],
       [-3.760429  , -7.156919  , -7.23235   , ..., -8.190614  ,
        -8.147986  , -8.187328  ],
       [-3.5433943 , -5.931965  , -6.9411297 , ..., -8.259971  ,
        -8.243669  , -8.225153  ],
       ...,
       [-2.3122556 , -3.6793737 , -3.9274411 , ..., -8.197878  ,
        -8.203227  , -8.215396  ],
       [-3.453832  , -7.2457666 , -7.335319  , ..., -8.247725  ,
        -8.197778  , -8.244684  ],
       [-0.82100475, -6.6275935 , -6.7532244 , ..., -8.190688  ,
        -8.200349  , -8.208529  ]], dtype=float32), end_logits=array([[-3.3483777, -7.505767 , -7.072659 , ..., -7.94529  , -7.991425 ,
        -7.9546204],
       [-3.409004 , -7.5199513, -7.0881543, ..., -7.9484363, -7.9948864,
        -7.957849 ],
       [-3.5612319, -6.5223255, -7.293981 , ..., -7.868505 , -7.88835  ,
        -7.9063087],
       ...,
       [-2.

In [41]:
# Same n-best test as before for the 1st feature, now also recovering the answer text
import numpy as np

n_best_logits = 20
max_ans_length = 30 # longest answer (in tokens) that is still accepted

start_logit = pred_val.start_logits[0]
end_logit   = pred_val.end_logits[0]
offsets = reprocessed_validation[0]['offset_mapping']
context =  validation[0]['context'] # context of 1st feature

# indices of the n best logits, best first
start_indexes = np.argsort(start_logit)[::-1][:n_best_logits].tolist()
end_indexes   = np.argsort(end_logit)[::-1][:n_best_logits].tolist()

answers = []
for start_idx in start_indexes:
  for end_idx in end_indexes:
    # both indices must exist and point at context tokens (None = question / special token)
    in_bounds = start_idx < len(offsets) and end_idx < len(offsets)
    if not in_bounds or offsets[start_idx] is None or offsets[end_idx] is None:
      continue
    # the span must be non-empty and not longer than max_ans_length tokens
    span_length = end_idx - start_idx + 1
    if span_length < 1 or span_length > max_ans_length:
      continue
    # map the token span back to characters of the context
    ans_start_char = offsets[start_idx][0]
    ans_end_char   = offsets[end_idx][1]
    answers.append(
        {
            'score': start_logit[start_idx] + end_logit[end_idx],
            'text' : context[ans_start_char : ans_end_char]
        }
    )

# top 20 answers sorted by score
sorted_answers = sorted(answers, key = lambda candidate: candidate['score'], reverse = True)[:n_best_logits]
sorted_answers, len(sorted_answers)


([{'score': 13.411608, 'text': 'Denver Broncos'},
  {'score': 11.1234665,
   'text': 'Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers'},
  {'score': 10.468445,
   'text': 'American Football Conference (AFC) champion Denver Broncos'},
  {'score': 8.866593,
   'text': 'The American Football Conference (AFC) champion Denver Broncos'},
  {'score': 8.490751, 'text': 'Carolina Panthers'},
  {'score': 8.316137,
   'text': 'Denver Broncos defeated the National Football Conference'},
  {'score': 8.30167, 'text': 'Broncos'},
  {'score': 8.1803055,
   'text': 'American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers'},
  {'score': 7.921116, 'text': 'Denver'},
  {'score': 6.996997, 'text': 'American Football Conference'},
  {'score': 6.5839653, 'text': 'champion Denver Broncos'},
  {'score': 6.578453,
   'text': 'The American Football Conference (AFC) champion Denver Broncos defea

In [42]:
# Compare with the reference answers of the 1st validation example
# (the first feature was created from this example)

validation[0]['answers']

{'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],
 'answer_start': [177, 177, 177]}

The model's highest probability answer matches the actual answer!

#### Post-processing predictions

Each example can be split into multiple features. We map each example to its corresponding features so that the best answer can be extracted across all of them.

In [43]:
import collections

# example index -> list of indices of the features created from that example
features_per_example = collections.defaultdict(list)

# example id -> position of the example in the validation split
example_id_index_map = { k: i  for i, k in enumerate(validation['id']) }

# each feature carries the id of its source example in 'record_id'
for i, feature in enumerate(reprocessed_validation):
  features_per_example[example_id_index_map[feature['record_id']]].append(i)

In [44]:
# Track the progress of code execution with a progress bar
from tqdm.auto import tqdm

def postprocess_predictions(examples, features, pred_start_logits, pred_end_logits):
    # Mapping each example to its corresponding features
    features_per_example = collections.defaultdict(list)
    example_id_index_map = { k: i  for i, k in enumerate(examples['id']) }

    for i, feature in enumerate(features):
         features_per_example[example_id_index_map[feature['record_id']]].append(i)

    # Display number of examples and features
    print(f'Post processing: {len(examples)} examples split into {len(features)} features')

    # Create empty dict for storing predictions
    predicted_ans = collections.OrderedDict()

    # Looping over all examples
    for i, example in enumerate(tqdm(examples)):
        # fetch features associated to current example
        features_indices = features_per_example[i]
        context = example['context']
        answers = []
        # For squadv2 - handling impossible answers
        min_null_score = None

        # Looping over all features associated to current example
        for feature_index in features_indices:
            # get the model predictions and offsets for this feature
            start_logits = pred_start_logits[feature_index]
            end_logits   = pred_end_logits[feature_index]
            offsets      = features[feature_index]['offset_mapping']

            # squad_v2 Impossible answers: update minimum null prediction
            # for impossible ans, start and end index = CLS token index
            cls_index = features[feature_index]['input_ids'].index(tokenizer.cls_token_id)
            # compute the score for impossible ans for this feature:
            # 1. ans is simply not in the current feature => not an impossible ans
            # 2. it is truly an impossible ans
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            # get the best impossible ans score for an example
            # update the null score if impossible ans score for this feature > prev impossible ans score
            if min_null_score is None or feature_null_score > min_null_score:
                min_null_score = feature_null_score


            # list of n_best indexes
            n_best_size=20
            max_answer_length=30
            start_indexes = np.argsort(start_logits)[-1: -n_best_logits-1: -1].tolist()
            end_indexes   = np.argsort(end_logits)[-1: -n_best_logits-1: -1].tolist()

            for start_idx in start_indexes:
                for end_idx in end_indexes:
                    # check if indices are within bounds or ans is fully in the context else skip
                    if  (start_idx >= len(offsets) or end_idx >= len(offsets) or
                    offsets[start_idx] is None or offsets[end_idx] is None):
                        continue
                    # check if ans length is not < 0 or > maxlength then skip
                    if end_idx < start_idx or end_idx - start_idx + 1 > max_ans_length:
                        continue
                    start_char = offsets[start_idx][0]
                    end_char   = offsets[end_idx][1]
                    answers.append(
                    {
                        'text' : context[start_char : end_char],
                        'score': start_logits[start_idx] + end_logits[end_idx]
                    }
                    )

        if len(answers)>0:
            best_ans = max(answers, key = lambda x: x['score']) #selecting ans with highest logit score as best
        else:
            best_ans = {'text': '', score: 0.0}

        # calculate the final ans: best ans or null ans (for squad_v2)
        if not squad_v2:
            predicted_ans[example['id']] = best_ans['text']
        else: # if normal ans score > impossible ans score take normal ans else blank
            final_ans = best_ans['text'] if best_ans['score'] > min_null_score else ''
            predicted_ans[example['id']] = final_ans
    return predicted_ans

# Collapse the per-feature validation logits into one answer text per example
final_predictions = postprocess_predictions(
    examples=validation,
    features=reprocessed_validation,
    pred_start_logits=pred_val['start_logits'],
    pred_end_logits=pred_val['end_logits'],
)

  0%|          | 0/50 [00:00<?, ?it/s]

The result maps each example id to its predicted answer text; this is the information the SQuAD evaluation metric used below needs.

#### Load the squad evaluation metric

In [45]:
# Pick the metric variant that matches the dataset variant chosen by squad_v2
# NOTE(review): `load_metric` was deprecated in later `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version
metric = load_metric('squad_v2' if squad_v2 else 'squad')

Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Format the predicted answers as a list of dicts, which is the input format expected by the SQuAD evaluation metric.

In [46]:
# The squad_v2 metric needs one extra field per prediction; the probability is
# fixed at 0.0 because "no answer" cases were already turned into '' predictions
extra_fields = {'no_answer_probability': 0.0} if squad_v2 else {}

formatted_predictions = [
    {'id': example_id, 'prediction_text': answer_text, **extra_fields}
    for example_id, answer_text in final_predictions.items()
]

# Reference answers in the layout the metric expects: only id and answers
actual_ans = [
    {field: row[field] for field in ('id', 'answers')}
    for row in validation
]

In [52]:
# Show one prediction together with its reference answers as a sanity check
print(formatted_predictions[4], actual_ans[4], sep='\n')

In [48]:
# Evaluate the formatted predictions against the reference answers
metric.compute(predictions = formatted_predictions, references = actual_ans)

{'exact_match': 86.0, 'f1': 88.83076923076922}

#### Making Inferences

In [53]:
# Downloading our fine-tuned model and its tokenizer from the hub.
# NOTE: this rebinds the notebook-level names `model` and `tokenizer`
# to the objects loaded from `checkpoint`.

from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

checkpoint = "Mariah64/distilbert-base-uncased-finetuned-squad"
model      = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint)
tokenizer  = AutoTokenizer.from_pretrained(checkpoint)


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/266M [00:00<?, ?B/s]

Some layers from the model checkpoint at Mariah64/distilbert-base-uncased-finetuned-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at Mariah64/distilbert-base-uncased-finetuned-squad and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [54]:
# Ad-hoc context/question pair used to try out the downloaded model

context = """The dominant sequence transduction models are based on complex recurrent or convolutional
neural networks in an encoder-decoder configuration. The best performing models also connect the encoder
and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on
two machine translation tasks show these models to be superior in quality while being more parallelizable
and requiring significantly less time to train."""
question = "What kind of mechanisms is Transformer based on?"


In [55]:
import numpy as np
# Tokenize the single (context, question) pair into numpy arrays and run the model
# NOTE(review): the context is passed as the first sequence and the question as
# the second — confirm this matches the order used when the model was trained
inputs = tokenizer([context], [question], return_tensors='np')
outputs = model(inputs)

# Finding best possible ans: position of the highest start logit and of the
# highest end logit for the first (only) item in the batch
# NOTE(review): the two positions are chosen independently, so ans_end can be
# smaller than ans_start, which would make the slice below empty
ans_start = np.argmax(outputs.start_logits[0])
ans_end   = np.argmax(outputs.end_logits[0])
print('ans_start = ', ans_start)
print('ans_end = ', ans_end)

# Extract ans tokens between start and end positions (end position included)
ans = inputs['input_ids'][0, ans_start : ans_end + 1]
print('ans tokens = ', ans)

In [56]:
# Decoding the selected answer token ids back to text
tokenizer.decode(ans)

'attention mechanisms'

#### Using the Pipeline API for quick inference
Once the model is on the Hub, we can use the pipeline API to replace the steps above and get the answer directly.

In [57]:
from transformers import pipeline

# Question-answering pipeline backed by the fine-tuned checkpoint on the hub,
# forced to use the TensorFlow implementation
qa = pipeline(
    task='question-answering',
    model="Mariah64/distilbert-base-uncased-finetuned-squad",
    framework='tf',
)

Some layers from the model checkpoint at Mariah64/distilbert-base-uncased-finetuned-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at Mariah64/distilbert-base-uncased-finetuned-squad and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [58]:
# Second ad-hoc context/question pair, this time for the pipeline object
context = '''
The advent of the accordion is the subject of debate among researchers. Many credit C. Friedrich L. Buschmann, whose Handäoline was patented
in Berlin in 1822, as the inventor of the accordion, while others give the distinction to Cyril Demian of Vienna, who patented his Accordion
in 1829, thus coining the name. A modification of the Handäoline, Demian’s invention comprised a small manual bellows and five keys, although,
as Demian noted in a description of the instrument, extra keys could be incorporated into the design. Numerous variations of the device soon followed.
'''

question = 'Whats is the subject of debate among researchers'

In [59]:
# Ask the pipeline; the displayed result holds the answer text, its score and
# its start/end positions
qa(context = context, question = question)

{'score': 0.42739593982696533,
 'start': 5,
 'end': 28,
 'answer': 'advent of the accordion'}