In [1]:

from transformers import AutoTokenizer, AutoModel
from transformers import TFAutoModelForQuestionAnswering
from transformers import pipeline
import transformers
import os
from tqdm import tqdm
import pandas as pd
import time
import tensorflow as tf
keras = tf.keras
import warnings
warnings.filterwarnings('ignore')

# Load dataset and metric for evaluation
from datasets import load_dataset, load_metric


2024-05-27 14:55:08.146621: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-27 14:55:08.146718: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-27 14:55:08.285713: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Notebook-wide configuration

# Hugging Face checkpoint used for both the tokenizer and the model
model_checkpoint = "distilbert-base-uncased"

# When True, the SQuAD 2.0 variant (which includes unanswerable questions) is used
squad_v2 = False

In [3]:
# Loading the SQuAD dataset

# Number of examples to take from each split.
# These values equal the full SQuAD v1.1 split sizes; lower them for a quick test run.
train_size = 87599
valid_size = 10570

split = [
    f"train[:{train_size}]",
    f"validation[:{valid_size}]"
]

# The Hub identifier for SQuAD 2.0 is 'squad_v2' (with an underscore)
dataset_name = 'squad_v2' if squad_v2 else 'squad'
train, validation = load_dataset(dataset_name, split=split)


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 14.5M/14.5M [00:00<00:00, 43.7MB/s]
Downloading data: 100%|██████████| 1.82M/1.82M [00:00<00:00, 9.60MB/s]


Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
# Display the training split's feature names and row count
train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [5]:
# Display the validation split's feature names and row count
validation

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [6]:
# Look at one raw example: 'answers' holds the answer text and its character offset in 'context'
train[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

### Text pre-processing

1. Tokenizer
- Convert text to tokens with ids
- Generate other inputs in the format the model requires

from_pretrained:
- gets the tokenizer corresponding to our model checkpoint
- downloads and caches the vocabulary used when training this model

In [7]:
# Initialize the tokenizer that matches `model_checkpoint`
# (the progress bars below show the tokenizer/vocab files being downloaded on first use)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Check that the tokenizer is a "fast" tokenizer: the preprocessing below relies on
# offset mappings and sequence ids, which are requested from the tokenizer output.
tokenizer.is_fast

True

In [9]:
def preprocess_training_data(train_ds, max_length=384, stride=128):
  """Tokenize a batch of SQuAD-style examples and label every chunk with answer token positions.

  Long contexts are split into overlapping chunks (features). For each chunk the
  token index of the first and last answer token is computed; chunks that do not
  fully contain the answer (or examples without an answer) are labelled with the
  index of the CLS token for both positions.

  Uses the module-level `tokenizer`.

  Args:
    train_ds: batch (mapping of column name -> list) with the keys 'question',
      'context' and 'answers'; each answer is a dict with 'text' and
      'answer_start' lists.
    max_length: maximum number of tokens (question + context) per chunk.
    stride: number of overlapping tokens between consecutive chunks.

  Returns:
    The tokenizer output (without 'overflow_to_sample_mapping' and
    'offset_mapping') extended with 'start_positions' and 'end_positions',
    one entry per chunk.
  """
  # Clean up questions - remove surrounding whitespace.
  questions = [q.strip() for q in train_ds['question']]
  # The context is deliberately NOT stripped: 'answer_start' is a character offset
  # into the original context, so removing leading whitespace would shift the answer.
  contexts = list(train_ds['context'])

  # True if the model expects "question, context"; False for models that expect
  # "context, question" (the order of the two inputs is swapped accordingly).
  pad_on_right = tokenizer.padding_side == 'right'
  context_seq_id = 1 if pad_on_right else 0  # sequence id that marks context tokens

  # To handle very long contexts, split the context into multiple chunks with a sliding window
  inputs = tokenizer(
    questions if pad_on_right else contexts,
    contexts if pad_on_right else questions,
    max_length = max_length,          # max number of tokens for question + context
    stride = stride,                  # overlapping tokens between chunks
    truncation = 'only_second' if pad_on_right else 'only_first',  # only ever truncate the context
    return_overflowing_tokens = True, # also return the chunks created by truncation
    return_offsets_mapping = True,    # (start_char, end_char) of every token in its source text
    padding = 'max_length',
    )

  # One context is broken into multiple chunks if it exceeds max_length.
  # Maps each chunk back to the index of the example it came from.
  sample_mapping = inputs.pop('overflow_to_sample_mapping')

  # Per-chunk list of character offsets for every token
  offset_mapping = inputs.pop('offset_mapping')

  answers = train_ds['answers']

  chunk_ans_start_pos = []
  chunk_ans_end_pos = []

  for i, offset in enumerate(offset_mapping):
    input_ids = inputs['input_ids'][i]
    cls_index = input_ids.index(tokenizer.cls_token_id)

    # sequence ids tell which tokens belong to the question and which to the context
    seq_id = inputs.sequence_ids(i)

    # answers of the example this chunk was cut from
    answer = answers[sample_mapping[i]]

    # if no answer is given, label the chunk with the CLS index
    if len(answer['answer_start']) == 0:
      chunk_ans_start_pos.append(cls_index)
      chunk_ans_end_pos.append(cls_index)
      continue

    # character span of the answer inside the context
    ans_start_char = answer['answer_start'][0]
    ans_end_char   = ans_start_char + len(answer['text'][0])

    # first and last token index of the context inside this chunk
    context_start_tok = 0
    while seq_id[context_start_tok] != context_seq_id:
      context_start_tok += 1

    context_end_tok = len(input_ids) - 1
    while seq_id[context_end_tok] != context_seq_id:
      context_end_tok -= 1

    # if the answer is not fully inside this chunk's context -> label with CLS
    if offset[context_start_tok][0] > ans_start_char or offset[context_end_tok][1] < ans_end_char:
      chunk_ans_start_pos.append(cls_index)
      chunk_ans_end_pos.append(cls_index)
      continue

    # Walk forward over the context tokens up to the answer's first token.
    # The scan is bounded by the context end: special and padding tokens after the
    # context have offset (0, 0) and would otherwise keep the loop running to the
    # end of the chunk when the answer starts on the last context token.
    start_tok = context_start_tok
    while start_tok <= context_end_tok and offset[start_tok][0] <= ans_start_char:
      start_tok += 1
    chunk_ans_start_pos.append(start_tok - 1)

    # Walk backward from the context end to the answer's last token,
    # never leaving the context span.
    end_tok = context_end_tok
    while end_tok >= context_start_tok and offset[end_tok][1] >= ans_end_char:
      end_tok -= 1
    chunk_ans_end_pos.append(end_tok + 1)

  # Add start and end positions to inputs
  inputs['start_positions'] = chunk_ans_start_pos
  inputs['end_positions'] = chunk_ans_end_pos

  return inputs


### Apply the function to the entire dataset

In [10]:
# batched=True is required because one example can be split into several chunks/features,
# so the number of output rows differs from the number of input rows.
processed_train = train.map(
    preprocess_training_data,
    batched=True,
    remove_columns = train.column_names,
)

processed_validation = validation.map(
    preprocess_training_data,
    batched=True,
    remove_columns = validation.column_names,  # drop the validation split's own raw columns
)

print('\n')
print('Number of records in original training data: ', len(train))
print('Number of records in processed training data: ',len(processed_train))
print('\n')
print('Number of records in original validation data: ',len(validation))
print('Number of records in processed validation data: ', len(processed_validation))

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]



Number of records in original training data:  87599
Number of records in processed training data:  88524


Number of records in original validation data:  10570
Number of records in processed training data:  10784


In [11]:
processed_train

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 88524
})

In [12]:
processed_validation

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 10784
})

 ## ------------------------------- Pre-processing Complete ------------------------------------

### Fine-tuning the model
Training the model

In [13]:
# Initialize the model

# Using from_pretrained to download and cache the model
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

### Using Hyperopt for hyperparameter tuning

In [14]:
from hyperopt import hp

# 1. Define the search space
search_space = {
    # Fine-tuning a pretrained transformer needs a small learning rate;
    # values around 1e-3 and above typically make training diverge.
    'learning_rate': hp.uniform('learning_rate', 1e-5, 1e-4),
    # quniform(low, high, q) rounds to multiples of q -> 16 or 32 (returned as float)
    'batch_size': hp.quniform('batch_size', 16, 32, 16),
    # 2 or 3 epochs (returned as float)
    'epochs': hp.quniform('epochs', 2, 3, 1)
}

In [15]:
# 2. Define the objective function

from hyperopt import STATUS_OK
from transformers import create_optimizer # create optimizer from transformers uses AdamW optimizer with weight decay

def objective(params):
    """Fine-tune the global `model` for one hyperopt trial and report its validation loss.

    Args:
        params: dict sampled from the search space with the keys
            'learning_rate', 'batch_size' and 'epochs' ('batch_size' and
            'epochs' are cast to int).

    Returns:
        dict with 'loss' (validation loss after the last epoch) and
        'status' (STATUS_OK), as expected by hyperopt's fmin.
    """
    batch_size = int(params['batch_size'])
    epochs     = int(params['epochs'])

    # Restore the pretrained weights before every trial. Without this, each trial
    # would continue training from the weights left by the previous one, so the
    # losses of different hyperparameter settings would not be comparable.
    pretrained = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    model.set_weights(pretrained.get_weights())
    del pretrained

    # convert dataset to tf datasets (batching happens here)
    train_ds = model.prepare_tf_dataset(
        processed_train,
        shuffle = True,
        batch_size = batch_size
    )

    validation_ds = model.prepare_tf_dataset(
        processed_validation,
        shuffle = False,
        batch_size = batch_size
    )

    # create_optimizer returns (optimizer, schedule); only the optimizer is needed
    optimizer, _ = create_optimizer(
        init_lr = params['learning_rate'],
        num_train_steps = len(train_ds) * epochs,
        num_warmup_steps = 0
    )
    # compile the model
    # no need to mention loss as model automatically handles it
    model.compile(optimizer = optimizer, metrics = ['accuracy'])

    # batch_size is not passed to fit(): the tf.data datasets are already batched
    history = model.fit(
        train_ds,
        validation_data = validation_ds,
        epochs = epochs,
    )

    val_loss = history.history['val_loss'][-1]
    return {'loss': val_loss, 'status': STATUS_OK}

In [16]:
# 3. Run hyperopt

from hyperopt import fmin, tpe, Trials

# Keeps the result of every evaluated trial
trials = Trials()

# Minimise `objective` over `search_space` using the TPE suggestion algorithm
best = fmin(
    objective,
    search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=trials,
)

print('Best parameters: ', best)

Epoch 1/2

Cause: for/else statement not yet supported
  0%|          | 0/5 [00:03<?, ?trial/s, best loss=?]

I0000 00:00:1716821842.167196      88 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


   1/5532 [..............................] - ETA: 84:25:46 - loss: 5.9357 - end_logits_accuracy: 0.0625 - start_logits_accuracy: 0.0000e+00
   2/5532 [..............................] - ETA: 36:58 - loss: 5.9839 - end_logits_accuracy: 0.0312 - start_logits_accuracy: 0.0000e+00   
   3/5532 [..............................] - ETA: 36:16 - loss: 5.9745 - end_logits_accuracy: 0.0208 - start_logits_accuracy: 0.0000e+00
   4/5532 [..............................] - ETA: 36:09 - loss: 5.9738 - end_logits_accuracy: 0.0156 - start_logits_accuracy: 0.0000e+00
   5/5532 [..............................] - ETA: 35:56 - loss: 5.9766 - end_logits_accuracy: 0.0125 - start_logits_accuracy: 0.0000e+00
   6/5532 [..............................] - ETA: 35:50 - loss: 5.9807 - end_logits_accuracy: 0.0104 - start_logits_accuracy: 0.0000e+00
   7/5532 [..............................] - ETA: 37:07 - loss: 5.9730 - end_logits_accuracy: 0.0089 - start_logits_accuracy: 0.0000e+00
   8/5532 [........................