In [None]:
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForQuestionAnswering, TFAutoModelForQuestionAnswering
from transformers import pipeline
import transformers
import torch
import os
from tqdm import tqdm
import json
import re
import pandas as pd
import torch
import time
import tensorflow as tf

In [None]:
# Setup initial parameters

squad_v2 = False # indicates if impossible answers are allowed
model_checkpoint = 'distilbert-base-uncased'
batch_size = 16

In [None]:
# Load dataset and metric for evaluation

from datasets import load_dataset, load_metric
import warnings
warnings.filterwarnings('ignore')

import os
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the SQuAD dataset (v1.1, or v2.0 when `squad_v2` is True).

# Use a small slice of each split for quick experiments.
# Full-size values noted by the author: 87599 (train) / 10570 (validation).
train_size = 300
valid_size = 50

split = [
    f"train[:{train_size}]",
    f"validation[:{valid_size}]",
]

# The dataset id must be 'squad_v2' (with underscore), the same id the
# evaluation metric is loaded with; 'squadv2' is not a valid dataset name.
train, validation = load_dataset('squad_v2' if squad_v2 else 'squad', split=split)



In [None]:
train

In [None]:
validation

In [None]:
train[0]

### Text pre-processing

1. Tokenizer
- Convert text to tokens with ids
- Generate other inputs in the format the model requires

from_pretrained:
- gets the tokenizer corresponding to our model
- downloads and caches the vocabulary used when training this model

In [None]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Checking that the tokenizer is a fast tokenizer as we will be using some of their features for preprocessing
tokenizer.is_fast

In [None]:
# test the tokenizer on a sample question, context
tokenizer('What day is it today?', 'Today is a gloomy day, it most likely is a Wednesday')

In [None]:
# To handle very long contexts, the context is split into multiple chunks with a sliding window between them.
# First, find one example whose tokenized question + context is longer than 300 tokens.
# NOTE: the loop variable `sample` stays bound to that example after `break`;
# the following cells tokenize `sample`, not `long_sample`.

# sample with very long context
for i, sample in enumerate(train):
  # pass q, context to tokenizer & fetch input_ids
  inp = tokenizer(sample['question'], sample['context'])['input_ids']
  length = len(inp)
  if length > 300: # 300 is the max_length used for chunking in the next cells
    long_sample = train[i]
    break

# long_sample, length

In [None]:
# Tokenize the long sample into chunks of at most 300 tokens (question + context).
# Only the context (second sequence) is truncated; the overflow is returned as
# additional chunks that overlap the previous chunk by `stride` tokens.

tokenized_sample = tokenizer(
    sample['question'],
    sample['context'],
    max_length = 300,
    stride = 100,
    truncation = 'only_second',
    return_overflowing_tokens = True,
)
tokenized_sample


In [None]:
# 1 sample is broken down into multiple chunks of length 300 with overlap.
# input_ids therefore contains multiple lists of individual chunks

# checkin that length is truncated to 300 for each chunk
[len(x) for x in tokenized_sample['input_ids']]

In [None]:
# let's view the decoded tokenized output - context for 1 question has been split into multiple chunks with overlap
for chunk in tokenized_sample['input_ids']:
  print(tokenizer.decode(chunk))

In [None]:
# Use return_offsets_mapping to also get the (start_char, end_char) span of every token,
# so that token positions can be mapped back to text when extracting the answer.
# The CLS token is an empty span at (0,0), followed by 'how' at (0,3).

tokenized_sample = tokenizer(
    sample['question'],
    sample['context'],
    max_length = 300,
    stride = 100,
    truncation = 'only_second',
    return_overflowing_tokens = True,
    return_offsets_mapping = True,
)
tokenized_sample['offset_mapping'][0][:10]

In [None]:
# Verifiying that offset mapping generated actually corresponds to a word in input_ids (q+context)
# 1st chunk, 1st word - how (skip CLS token)

first_token_id = tokenized_sample['input_ids'][0][1]
print('First input id: ', first_token_id)

print('First word using input_ids: ',tokenizer.convert_ids_to_tokens(first_token_id))

offset_id = tokenized_sample['offset_mapping'][0][1]
print('First offset id: ', offset_id)

print('First word using offset id: ', sample['question'][offset_id[0] : offset_id[1]])

In [None]:
# to distinguish between q & c in offset, use sequence_ids
# None for CLS and SEP tokens
# 0 for q
# 1 for c

seq_id = tokenized_sample.sequence_ids()
print(seq_id)

In [None]:
# Find the first and last token of the answer within the context,
# testing whether the answer is present in the first chunk.

answer = sample['answers']
input_ids = tokenized_sample['input_ids'][0]
offset = tokenized_sample['offset_mapping'][0]

# character span of the answer inside the context string
ans_start_char = answer['answer_start'][0]
ans_end_char   = ans_start_char + len(answer['text'][0])

# Token indices (despite the names) of the first and last context token in this chunk.
context_start_char = 0
while seq_id[context_start_char] != 1: # 1 indicates start of context
  context_start_char += 1
context_end_char = len(input_ids) - 1
while seq_id[context_end_char] != 1: # read from end until first 1 is encountered -> end of context
  context_end_char -= 1

if offset[context_start_char][0] <= ans_start_char and  offset[context_end_char][1] >= ans_end_char : # if ans lies within current context
  # Walk forward over the context tokens until one starts after the answer start.
  # The scan is bounded by the last context token: the special token that follows the
  # context is expected to carry a (0, 0) offset, which would otherwise keep matching
  # and push the start position past the context.
  idx = context_start_char
  while idx <= context_end_char and offset[idx][0] <= ans_start_char :
    idx += 1
  chunk_ans_start_pos = idx - 1
  # Walk backward from the end of the context until a token ends before the answer end,
  # never leaving the context.
  idx = context_end_char
  while idx >= context_start_char and offset[idx][1] >= ans_end_char :
    idx -= 1
  chunk_ans_end_pos = idx + 1
  print('Ans start and end positions: ', chunk_ans_start_pos,chunk_ans_end_pos)
else:
  print('Ans does not lie in current context (feature)')

In [None]:
# let's decode this output and validate with the actual answer

ans_pos_decoded = tokenizer.decode(
    tokenized_sample['input_ids'][0][chunk_ans_start_pos : chunk_ans_end_pos+1]
)
actual_ans = answer['text'][0]

print(ans_pos_decoded)
print(actual_ans)

### Text pre-processing - Training data

In [None]:
def preprocess_training_data(train_ds):
  """Tokenize a batch of SQuAD examples into fixed-length features with answer labels.

  Long contexts are split into overlapping chunks (features). For every feature the
  token positions of the answer are computed; when the example has no answer, or the
  answer is not fully inside the chunk, both positions are set to the CLS token index.

  Args:
    train_ds: batch with 'question', 'context' and 'answers' columns, where each answer
      holds 'answer_start' (character offsets into the context) and 'text' lists.

  Returns:
    The tokenizer output, without 'overflow_to_sample_mapping' and 'offset_mapping',
    extended with 'start_positions' and 'end_positions' (one entry per feature).
  """
  question = [q.strip() for q in train_ds['question']] # remove extra spaces around questions
  # The context is NOT stripped: 'answer_start' indexes the original string, so removing
  # leading whitespace would shift every answer offset.
  context = train_ds['context']
  # True if the question is followed by the context; for tokenizers that pad on the
  # left, the order of question and context is swapped.
  pad_on_right = tokenizer.padding_side == 'right'
  context_seq_id = 1 if pad_on_right else 0 # sequence id that marks context tokens

  # To handle very long contexts, split the context into multiple chunks with a sliding window between them
  inputs = tokenizer(
    question if pad_on_right else context,
    context if pad_on_right else question,
    max_length = 384,                 # max length of question + context per feature
    stride = 128,                     # overlapping tokens between chunks
    truncation = 'only_second' if pad_on_right else 'only_first', # truncate only the context
    return_overflowing_tokens = True, # return the truncated remainder as extra chunks
    return_offsets_mapping = True,    # return the character span of every token
    padding = 'max_length',
  )

  # Maps each feature (chunk) back to the index of the example it was cut from.
  sample_mapping = inputs.pop('overflow_to_sample_mapping')
  # Character spans of every token, used to locate the answer tokens.
  offset_mapping = inputs.pop('offset_mapping')
  answers = train_ds['answers']

  chunk_ans_start_pos = []
  chunk_ans_end_pos = []

  for i, offset in enumerate(offset_mapping):
    input_ids = inputs['input_ids'][i]
    cls_index = input_ids.index(tokenizer.cls_token_id)
    # sequence ids tell question tokens, context tokens and special tokens apart
    seq_id = inputs.sequence_ids(i)
    answer = answers[sample_mapping[i]]

    # no answer given -> label the feature with the CLS index
    if len(answer['answer_start']) == 0:
      chunk_ans_start_pos.append(cls_index)
      chunk_ans_end_pos.append(cls_index)
      continue

    # character span of the answer inside the context
    ans_start_char = answer['answer_start'][0]
    ans_end_char   = ans_start_char + len(answer['text'][0])

    # token indices of the first and last context token of this feature
    context_start = 0
    while seq_id[context_start] != context_seq_id:
      context_start += 1
    context_end = len(input_ids) - 1
    while seq_id[context_end] != context_seq_id:
      context_end -= 1

    # answer not fully inside this chunk -> label the feature with the CLS index
    if not (offset[context_start][0] <= ans_start_char and offset[context_end][1] >= ans_end_char):
      chunk_ans_start_pos.append(cls_index)
      chunk_ans_end_pos.append(cls_index)
      continue

    # Walk forward to the last context token that starts at or before the answer start.
    # The scan stops at the last context token: the special/padding tokens after it are
    # expected to carry a (0, 0) offset, which would otherwise keep matching and move
    # the label onto the padding.
    idx = context_start
    while idx <= context_end and offset[idx][0] <= ans_start_char:
      idx += 1
    chunk_ans_start_pos.append(idx - 1)

    # Walk backward to the first context token that ends at or after the answer end,
    # never leaving the context.
    idx = context_end
    while idx >= context_start and offset[idx][1] >= ans_end_char:
      idx -= 1
    chunk_ans_end_pos.append(idx + 1)

  # Add start and end positions to inputs
  inputs['start_positions'] = chunk_ans_start_pos
  inputs['end_positions'] = chunk_ans_end_pos

  return inputs


### Apply the function to the entire dataset

In [None]:
# Tokenize both splits. batched=True is required because one example can be
# split into several features, so the number of rows changes.
processed_train = train.map(
    preprocess_training_data,
    batched=True,
    remove_columns = train.column_names,
)

# Drop the validation split's own columns rather than relying on them
# matching the training split's.
processed_validation = validation.map(
    preprocess_training_data,
    batched=True,
    remove_columns = validation.column_names,
)

print('\n')
print('Number of records in original training data: ', len(train))
print('Number of records in processed training data: ',len(processed_train))
print('\n')
print('Number of records in original validation data: ',len(validation))
print('Number of records in processed validation data: ', len(processed_validation))

In [None]:
processed_train

In [None]:
processed_validation

 ## ------------------------------- Pre-processing Complete ------------------------------------

### Fine-tuning the model
Training the model

In [None]:
# Initialize the model

from transformers import TFAutoModelForQuestionAnswering

# Using from_pretrained to download and cache the model
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
# Set up training parameters
model_name = model_checkpoint.split('/')[-1]
huggingface_model_name = f'{model_name}-finetuned-squad'
learning_rate = 2e-5
num_epochs = 1

In [None]:
# convert dataset to tf datasets
train_ds = model.prepare_tf_dataset(
    processed_train,
    shuffle = True,
    batch_size = batch_size
)

validation_ds = model.prepare_tf_dataset(
    processed_validation,
    shuffle = False,
    batch_size = batch_size
)



In [None]:
# create_optimizer from transformers builds an AdamW optimizer with weight decay.
# It returns a pair; the second element is discarded here as `_`.
from transformers import create_optimizer

# total number of optimizer steps: batches per epoch * number of epochs
training_steps = len(train_ds) * num_epochs

optimizer, _ = create_optimizer(
    init_lr = learning_rate,
    num_train_steps = training_steps,
    num_warmup_steps = 0
)


In [None]:
# compile the model 
# no need to mention loss as model automatically handles it

import tensorflow as tf
keras = tf.keras

model.compile(optimizer = optimizer, metrics = ['accuracy'])

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Define callbacks
from transformers.keras_callbacks import PushToHubCallback

push_to_hub_callback = PushToHubCallback(
    output_dir="./qa_model_save",
    tokenizer=tokenizer,
    hub_model_id=huggingface_model_name,
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./qa_model_save/logs")

callbacks = [push_to_hub_callback, tensorboard_callback]

In [None]:
# Load previously saved fine-tuned weights instead of training again.
# NOTE(review): this is an absolute Kaggle path; on a fresh environment the file
# presumably only exists after the commented-out training below has been run once.
checkpoint_path = '/kaggle/working/qa_model_save/tf_model.h5'
model.load_weights(checkpoint_path)


# Train the model on the entire dataset
# model.fit(train_ds,
#           validation_data = validation_ds,
#           epochs = num_epochs,
#           callbacks = callbacks,
#           )

# model.save_weights("full_model_trained.h5")

### Make predictions and evaluate metrics on validation dataset

In [None]:
# Check the model output for one batch of validation data

batch = next(iter(validation_ds))
predictions = model.predict_on_batch(batch)
predictions.keys()

# start_logits holds, for every feature, one score (logit) per token for that token being the start of the answer.
# So to find the answer for each feature, take the argmax of start_logits as the starting position
# (and likewise the argmax of end_logits as the end position).

In [None]:
# We have start and end logit (probabilities) for each feature (16) and each token (384) 
predictions.start_logits.shape, predictions.end_logits.shape

In [None]:
# Taking max probability to find ans start and end indices
import numpy as np
np.argmax(predictions.start_logits, -1), np.argmax(predictions.end_logits, -1)

In [None]:
# Taking the plain argmax can yield a start position that lies after the end position,
# so instead we consider the 20 best start and end candidates:
# - > keep only the valid (start <= end) combinations
# - > rank them by their score (start logit + end logit) and keep the best one

# Trying this out on the 1st feature

import numpy as np

n_best_logits = 20

start_logit = predictions.start_logits[0]
end_logit   = predictions.end_logits[0]

# token indices of the n best logits, highest first
start_indexes = np.argsort(start_logit)[::-1][:n_best_logits].tolist()
end_indexes   = np.argsort(end_logit)[::-1][:n_best_logits].tolist()

# every valid start/end pair with its combined score; the text is filled in later
answers = [
    {
        'score': start_logit[start_idx] + end_logit[end_idx],
        'text' : ''
    }
    for start_idx in start_indexes
    for end_idx in end_indexes
    if start_idx <= end_idx
]

answers[:10]



In [None]:
# To fetch the answer text, the validation data is re-processed to add:
# 1. record_id - maps each feature back to the example it was cut from
# 2. offset map - start and end chars of each context token

def process_validation_data(val_ds):
  """Tokenize a batch of validation examples for answer extraction.

  Args:
    val_ds: batch with 'id', 'question' and 'context' columns.

  Returns:
    The tokenizer output, without 'overflow_to_sample_mapping', extended with
    'record_id' (id of the source example of each feature) and with
    'offset_mapping' entries set to None for every token outside the context.
  """
  question = [q.strip() for q in val_ds['question']] # remove extra spaces around questions
  # The context is NOT stripped: the offsets returned here are later used to slice the
  # original example['context'], so both must refer to the same string.
  context = val_ds['context']
  pad_on_right = tokenizer.padding_side == 'right'
  context_index = 1 if pad_on_right else 0 # sequence id that marks context tokens

  # To handle very long contexts, split the context into multiple chunks with a sliding window between them
  inputs = tokenizer(
    question if pad_on_right else context,
    context if pad_on_right else question,
    max_length = 384, # max length of question + context per feature
    stride = 128, # overlapping tokens between chunks
    truncation = 'only_second' if pad_on_right else 'only_first',
    return_overflowing_tokens = True,
    return_offsets_mapping = True, # return the character span of every token
    padding = 'max_length',  # pad to max length; contexts are long, so dynamic padding is not needed
    )

  # Maps each feature (chunk) back to the index of the example it was cut from.
  sample_mapping = inputs.pop('overflow_to_sample_mapping')
  example_ids = val_ds['id']

  record_ids = []
  for i in range(len(inputs['input_ids'])):
    # sequence ids distinguish question tokens from context tokens
    seq_id = inputs.sequence_ids(i)

    # tie the feature back to its source example
    record_ids.append(example_ids[sample_mapping[i]])

    # keep offsets only for context tokens; everything else becomes None so that
    # post-processing can reject answers that fall outside the context
    inputs['offset_mapping'][i] = [
        (offset if seq_id[j] == context_index else None)
        for j, offset in enumerate(inputs['offset_mapping'][i])
    ]

  inputs['record_id'] = record_ids
  return inputs



In [None]:
# Reprocess the validation data so every feature carries its record_id and
# context-only offset mapping (batched because one example yields several features).
reprocessed_validation = validation.map(
    process_validation_data,
    batched = True,
    remove_columns = validation.column_names,
)

# Convert it to a tf dataset; order is kept (no shuffling) so that prediction i
# lines up with feature i of reprocessed_validation.
reprocessed_validation_tf = model.prepare_tf_dataset(
    reprocessed_validation,
    shuffle = False,
    batch_size = batch_size,
)

In [None]:
# Make predictions

pred_val = model.predict(reprocessed_validation_tf)
pred_val

In [None]:
# Extending the earlier experiment on the 1st feature: now also extract the answer text
import numpy as np

n_best_logits = 20
max_ans_length = 30 # eliminate long ans

start_logit = predictions.start_logits[0]
end_logit   = predictions.end_logits[0]
offsets = reprocessed_validation[0]['offset_mapping']
context =  validation[0]['context'] # context of 1st feature

# token indices of the n best logits, highest first
start_indexes = np.argsort(start_logit)[::-1][:n_best_logits].tolist()
end_indexes   = np.argsort(end_logit)[::-1][:n_best_logits].tolist()

answers = []
for start_idx in start_indexes:
  for end_idx in end_indexes:
    # skip indices that are out of bounds or not part of the context
    # (offset None = question / special token)
    in_bounds = start_idx < len(offsets) and end_idx < len(offsets)
    if not in_bounds or offsets[start_idx] is None or offsets[end_idx] is None:
      continue
    # skip spans with negative length or longer than the allowed maximum
    span_length = end_idx - start_idx + 1
    if span_length < 1 or span_length > max_ans_length:
      continue
    ans_start_char = offsets[start_idx][0]
    ans_end_char   = offsets[end_idx][1]
    answers.append(
        {
            'score': start_logit[start_idx] + end_logit[end_idx],
            'text' : context[ans_start_char : ans_end_char]
        }
    )

# top 20 answers sorted by score
sorted_answers = sorted(answers, key = lambda x: x['score'], reverse = True)[:n_best_logits]
sorted_answers, len(sorted_answers)


In [None]:
# Let's check with the actual answer for 1st feature

validation[0]['answers']

The model's highest probability answer matches the actual answer!

#### Post-processing predictions

Each example has multiple features. Mapping each example to its corresponding features to extract the answer

In [None]:
import collections

# position of every example id within the validation split
example_id_index_map = {}
for position, example_id in enumerate(validation['id']):
  example_id_index_map[example_id] = position

# example position -> list of indices of the features cut from that example
features_per_example = collections.defaultdict(list)
for feature_index, record_id in enumerate(reprocessed_validation['record_id']):
  example_position = example_id_index_map[record_id]
  features_per_example[example_position].append(feature_index)

In [None]:
# Track the progress of code execution with a progress bar
from tqdm.auto import tqdm

def postprocess_predictions(examples, features, pred_start_logits, pred_end_logits,
                            n_best_size=20, max_answer_length=30):
    """Turn per-feature start/end logits into one predicted answer text per example.

    Args:
        examples: original examples; each provides 'id' and 'context'.
        features: tokenized features; each provides 'record_id' (id of its source
            example), 'input_ids' and 'offset_mapping' (None for non-context tokens).
        pred_start_logits: start logits, indexed by feature index.
        pred_end_logits: end logits, indexed by feature index.
        n_best_size: number of top start and top end candidates considered per feature.
        max_answer_length: maximum answer length in tokens; longer spans are skipped.

    Returns:
        An OrderedDict mapping example id to the predicted answer text. When the
        module-level `squad_v2` flag is set, the text is '' if the best span does not
        score higher than the best "no answer" (CLS) score.
    """
    # Map each example (by position) to the indices of its features
    features_per_example = collections.defaultdict(list)
    example_id_index_map = { k: i  for i, k in enumerate(examples['id']) }

    for i, feature in enumerate(features):
        features_per_example[example_id_index_map[feature['record_id']]].append(i)

    # Display number of examples and features
    print(f'Post processing: {len(examples)} examples split into {len(features)} features')

    # Predictions keyed by example id, in example order
    predicted_ans = collections.OrderedDict()

    # Looping over all examples
    for i, example in enumerate(tqdm(examples)):
        context = example['context']
        answers = []
        # For squad_v2: best "no answer" score seen over the features of this example.
        # Despite its name it tracks the highest CLS score.
        min_null_score = None

        # Looping over all features associated to current example
        for feature_index in features_per_example[i]:
            # model predictions and offsets for this feature
            start_logits = pred_start_logits[feature_index]
            end_logits   = pred_end_logits[feature_index]
            feature      = features[feature_index]
            offsets      = feature['offset_mapping']

            # For an impossible answer, start and end index both point at the CLS token,
            # so the CLS score is this feature's "no answer" score. It is high either
            # because the answer is in another feature or because there is no answer.
            cls_index = feature['input_ids'].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score > min_null_score:
                min_null_score = feature_null_score

            # token indices of the n best logits, highest first
            start_indexes = np.argsort(start_logits)[-1: -n_best_size-1: -1].tolist()
            end_indexes   = np.argsort(end_logits)[-1: -n_best_size-1: -1].tolist()

            for start_idx in start_indexes:
                for end_idx in end_indexes:
                    # skip indices that are out of bounds or not part of the context
                    if  (start_idx >= len(offsets) or end_idx >= len(offsets) or
                    offsets[start_idx] is None or offsets[end_idx] is None):
                        continue
                    # skip spans with negative length or longer than the allowed maximum
                    if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
                        continue
                    start_char = offsets[start_idx][0]
                    end_char   = offsets[end_idx][1]
                    answers.append(
                    {
                        'text' : context[start_char : end_char],
                        'score': start_logits[start_idx] + end_logits[end_idx]
                    }
                    )

        if len(answers)>0:
            best_ans = max(answers, key = lambda x: x['score']) # answer with the highest combined logit score
        else:
            # no valid span found: fall back to an empty answer
            best_ans = {'text': '', 'score': 0.0}

        # final answer: best span, or the null answer for squad_v2
        if not squad_v2:
            predicted_ans[example['id']] = best_ans['text']
        else:
            # keep the span only if it beats the "no answer" score; an example without
            # features has no null score and keeps its (empty) best answer
            beats_null = min_null_score is None or best_ans['score'] > min_null_score
            predicted_ans[example['id']] = best_ans['text'] if beats_null else ''
    return predicted_ans

# Apply the post-processing function to the validation predictions.
# The result maps each example id to its predicted answer text.
final_predictions = postprocess_predictions(
    validation,
    reprocessed_validation,
    pred_val['start_logits'],
    pred_val['end_logits'],
)

This format of predicted ans is expected by the squad evaluation metric we will use.

#### Load the squad evaluation metric

In [None]:
metric = load_metric('squad_v2' if squad_v2 else 'squad')

Formatting the predicted ans to a list of dict as expected by the squad evaluation metric we will use.

In [None]:
# One dict per prediction, in the layout the squad / squad_v2 metric expects.
formatted_predictions = []
for key, value in final_predictions.items():
    entry = {'id': key, 'prediction_text': value}
    if squad_v2:
        # no_answer_probability is 0.0 since "no answer" is already encoded as blank text
        entry['no_answer_probability'] = 0.0
    formatted_predictions.append(entry)

# Reference answers, one dict per validation example.
actual_ans = []
for example in validation:
    actual_ans.append({'id': example['id'], 'answers': example['answers']})

In [None]:
# View sample predicted and actual ans
print(formatted_predictions[4])
print(actual_ans[4])

In [None]:
# Evalute using the metric
metric.compute(predictions = formatted_predictions, references = actual_ans)

#### Making Inferences

In [None]:
# Downloading our model from the hub

from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

checkpoint = "Mariah64/distilbert-base-uncased-finetuned-squad"
model      = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint)
tokenizer  = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# testing with random data

context = """The dominant sequence transduction models are based on complex recurrent or convolutional 
neural networks in an encoder-decoder configuration. The best performing models also connect the encoder
and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, 
based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on 
two machine translation tasks show these models to be superior in quality while being more parallelizable 
and requiring significantly less time to train."""
question = "What kind of mechanisms is Transformer based on?"


In [None]:
import numpy as np

# Pass the question first and the context second: this is the order used when
# the training features were built (for a tokenizer that pads on the right).
inputs = tokenizer([question], [context], return_tensors='np')
outputs = model(inputs)

# Finding best possible ans: highest-scoring start and end token positions
ans_start = np.argmax(outputs.start_logits[0])
ans_end   = np.argmax(outputs.end_logits[0])
print('ans_start = ', ans_start)
print('ans_end = ', ans_end)

# Extract ans tokens between start and end positions (inclusive).
# If ans_end < ans_start the slice is empty, i.e. no valid span was found.
ans = inputs['input_ids'][0, ans_start : ans_end + 1]
print('ans tokens = ', ans)

In [None]:
# Decoding the tokens back to text
tokenizer.decode(ans)

#### Using Pipeline API for quick inferencing
Once the model is on the hub, we can use the pipeline API to replace the above steps and give us the answer directly

In [None]:
from transformers import pipeline

qa = pipeline('question-answering',  "Mariah64/distilbert-base-uncased-finetuned-squad", framework='tf')

In [None]:
context = '''
The advent of the accordion is the subject of debate among researchers. Many credit C. Friedrich L. Buschmann, whose Handäoline was patented 
in Berlin in 1822, as the inventor of the accordion, while others give the distinction to Cyril Demian of Vienna, who patented his Accordion 
in 1829, thus coining the name. A modification of the Handäoline, Demian’s invention comprised a small manual bellows and five keys, although, 
as Demian noted in a description of the instrument, extra keys could be incorporated into the design. Numerous variations of the device soon followed.
'''

question = 'Whats is the subject of debate among researchers'

In [None]:
qa(context = context, question = question)