In [None]:
# run "pip install -r requirements.txt"

# Fine-Tuning DistilBERT for Question Answering

## Imports

In [1]:
from datasets import load_dataset, Dataset
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
)
from transformers.trainer_utils import get_last_checkpoint

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
2024-12-01 15:14:34.696483: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 15:14:34.833358: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733084074.888140    2009 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733084074.903257    2009 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 15:14:35.028370: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

## Loading dataset
### The dataset is a popular question-answering dataset called SQuAD. Each data point consists of:
* A question
* A context that may contain the answer to the question
* The start index (character offset) of the answer within the context
* The answer

In [2]:
# Load the SQuAD training split and hold out 20% of it for evaluation.
# The fixed seed makes the split (and therefore every row and metric shown
# below) reproducible across "Restart Kernel -> Run All".
squad = load_dataset("squad", split="train")
squad = squad.train_test_split(test_size=0.2, seed=42)

In [3]:
# Materialise the training split as a pandas DataFrame for inspection.
data_dict = squad["train"].to_dict()
df = pd.DataFrame(data_dict)

# One boolean per column: True when that column contains a missing value.
missing_by_column = df.isna().any()
print(missing_by_column)
df.head()

id          False
title       False
context     False
question    False
answers     False
dtype: bool


Unnamed: 0,id,title,context,question,answers
0,57325124e17f3d140042285f,The_Bronx,The development of the Bronx is directly conne...,Who owned Kingsbridge?,"{'text': ['Frederick Philipse'], 'answer_start..."
1,572eaab5cb0c0d14000f1428,Vacuum,"Almost two thousand years after Plato, René De...",What did Descartes believe about vacuums in na...,{'text': ['that a vacuum does not occur in nat...
2,5726f4505951b619008f835b,Crimean_War,The Russians evacuated Wallachia and Moldavia ...,Who voted to have a committee investigate the ...,"{'text': ['Parliament'], 'answer_start': [541]}"
3,570d5f41b3d812140066d77f,Valencia,During the 20th century Valencia remained the ...,Where did Valencia rank in size among Spanish ...,"{'text': ['third'], 'answer_start': [46]}"
4,56dd2d7d9a695914005b9533,Prime_minister,"Walpole always denied that he was ""prime minis...",During whose government did prime minister fir...,"{'text': ['Benjamin Disraeli'], 'answer_start'..."


In [4]:
# Answer record (text + start offset) of the first training row.
df.loc[0, "answers"]

{'text': ['Frederick Philipse'], 'answer_start': [313]}

In [5]:
# Slice the first five rows once (a dict of column name -> list of values)
# instead of materialising the full "question" and "answers" columns just to
# print five pairs.
preview = squad["train"][:5]
for q, a in zip(preview["question"], preview["answers"]):
    print(f"{q} -> {a}")

Who owned Kingsbridge? -> {'text': ['Frederick Philipse'], 'answer_start': [313]}
What did Descartes believe about vacuums in nature? -> {'text': ['that a vacuum does not occur in nature'], 'answer_start': [250]}
Who voted to have a committee investigate the mismanagement during the war? -> {'text': ['Parliament'], 'answer_start': [541]}
Where did Valencia rank in size among Spanish cities in the 1900s? -> {'text': ['third'], 'answer_start': [46]}
During whose government did prime minister first see use on official state documents? -> {'text': ['Benjamin Disraeli'], 'answer_start': [690]}


# Preprocessing

### Testing tokenizer

In [8]:
# Load the tokenizer in this cell: the demo runs before the cell that
# originally defined `tokenizer`, so on a fresh kernel it would otherwise fail
# with a NameError.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

context = "This is the context"
question = "This is the question"

# Encode the two strings as a single pair; no padding so the raw length of the
# encoded pair is visible in the printed tensors.
token_ids = tokenizer(
    text=context, text_pair=question, padding=False, return_tensors="tf"
)

print(token_ids)

{'input_ids': <tf.Tensor: shape=(1, 11), dtype=int32, numpy=
array([[ 101, 2023, 2003, 1996, 6123,  102, 2023, 2003, 1996, 3160,  102]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 11), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


In [10]:
# Turn the ids of the single encoded pair back into their token strings.
pair_ids = token_ids["input_ids"].numpy()[0]
print(tokenizer.convert_ids_to_tokens(pair_ids))

['[CLS]', 'this', 'is', 'the', 'context', '[SEP]', 'this', 'is', 'the', 'question', '[SEP]']


In [None]:
# Tokenizer for the DistilBERT checkpoint used throughout this notebook.
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

## This function tokenizes the data, computes the start and end token positions of each answer within the encoded input, and returns both the encoded DataFrame and the corresponding `Dataset`

In [None]:
def _answer_token_span(offset, sequence_ids, start_char, end_char):
    """Map an answer's character span to ``(start_token, end_token)`` indices.

    ``offset`` holds one ``(char_start, char_end)`` pair per token and
    ``sequence_ids`` marks context tokens with ``1``. Returns ``(0, 0)`` when
    the encoding has no context tokens, or when the answer is not completely
    covered by the (possibly truncated) context tokens.
    """
    total = len(sequence_ids)

    # Locate the first and last token that belong to the context.
    idx = 0
    while idx < total and sequence_ids[idx] != 1:
        idx += 1
    if idx == total:
        return 0, 0
    context_start = idx
    while idx < total and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside the surviving context. Checking only
    # for "no overlap at all" would let a partially truncated answer through
    # and put its end label on the special token that follows the context.
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer's first character.
    idx = context_start
    while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer's last character.
    idx = context_end
    while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess(df, type, max_length=384):
    """Tokenize question/context pairs and label each answer's token span.

    Uses the notebook-level ``tokenizer``. Questions and contexts are stripped
    of surrounding whitespace, encoded as (question, context) pairs padded to
    ``max_length``, and only the context is truncated when a pair is too long.

    Parameters
    ----------
    df : table-like with "question", "context" and "answers" columns. Each
        answer is a mapping with "text" and "answer_start" sequences; only the
        first entry of each is used.
    type : label used in the output file name (``encoding_<type>.csv``).
    max_length : padded/truncated length of every encoded pair (default 384).

    Returns
    -------
    (encodings, dataset) : a DataFrame with "input_ids", "attention_mask",
        "start_positions" and "end_positions", and the result of
        ``Dataset.from_pandas`` on that DataFrame.

    Side effects
    ------------
    * Writes the encodings to ``encoding_<type>.csv`` in the working directory.
    * Adds "start_positions" / "end_positions" columns to the caller's ``df``
      (kept for backward compatibility).

    Rows whose answer is missing, or is not fully contained in the encoded
    context, are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in df["question"]]
    raw_contexts = list(df["context"])
    contexts = [c.strip() for c in raw_contexts]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    # Plain list => positional access that does not depend on df's index labels.
    answers = list(df["answers"])
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Unanswerable / empty records have no span to locate.
        if len(answer["text"]) == 0 or len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # `answer_start` is relative to the unstripped context, while the token
        # offsets are relative to the stripped one: shift by the number of
        # leading whitespace characters that were removed.
        leading_trim = len(raw_contexts[i]) - len(raw_contexts[i].lstrip())
        start_char = answer["answer_start"][0] - leading_trim
        end_char = start_char + len(answer["text"][0])

        start_token, end_token = _answer_token_span(
            offset, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    # Preserved side effect: expose the labels on the caller's frame as well.
    df["start_positions"] = start_positions
    df["end_positions"] = end_positions

    encodings = pd.DataFrame(
        {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "start_positions": start_positions,
            "end_positions": end_positions,
        }
    )
    encodings.to_csv(f"encoding_{type}.csv", index=False)

    return encodings, Dataset.from_pandas(encodings)

In [7]:
# Encode the training frame. The encodings are bound to a descriptive name
# rather than `_`, which IPython uses for the previous cell's output.
train_encodings, train = preprocess(df, "train")
print(train)
train_encodings.head()

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 70079
})


Unnamed: 0,input_ids,attention_mask,start_positions,end_positions
0,"[101, 2054, 2024, 2788, 16578, 2011, 4632, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",32,33
1,"[101, 2054, 2597, 2001, 4789, 2805, 2012, 1367...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",19,29
2,"[101, 2054, 2003, 1996, 6557, 21534, 3698, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",35,37
3,"[101, 2054, 1005, 1055, 1996, 2095, 2008, 3933...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",56,56
4,"[101, 2040, 2515, 1996, 8434, 2012, 1996, 1613...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",76,78


# Pre-processing the test dataset

In [8]:
# Same conversion as for the training split, applied to the held-out split.
# Note: this rebinds `data_dict` and `df` to the test data.
data_dict = squad["test"].to_dict()
df = pd.DataFrame(data_dict)

In [9]:
# Encode the held-out frame. The encodings are bound to a descriptive name
# rather than `_`, which IPython uses for the previous cell's output.
test_encodings, test = preprocess(df, "test")
print(test)
test_encodings.head()

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 17520
})


Unnamed: 0,input_ids,attention_mask,start_positions,end_positions
0,"[101, 2043, 2001, 1996, 2645, 1997, 23136, 406...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",62,62
1,"[101, 15053, 1011, 13838, 2764, 3365, 16105, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",73,74
2,"[101, 2065, 4957, 2003, 2025, 3048, 1010, 2073...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",113,115
3,"[101, 2054, 2095, 2001, 1996, 2034, 3444, 2143...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",96,96
4,"[101, 2129, 2116, 3645, 1022, 15943, 2020, 415...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",85,86


# Fine-tuning DistilBERT on the dataset

In [10]:
# Pretrained DistilBERT with a question-answering head on top.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# The encoded examples are already padded to a fixed length, so the default
# collator is used to batch them.
data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    # where checkpoints are written; nothing is pushed to the Hub
    output_dir="question_answering_model",
    push_to_hub=False,
    # schedule: three passes over the data, evaluating after each one
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tune for the number of epochs configured in `training_args`,
# evaluating on the held-out encodings.

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
)

train_output = trainer.train()
train_output

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.2737,1.164262
2,0.965,1.089928
3,0.7899,1.115258


TrainOutput(global_step=13140, training_loss=1.1296380308665097, metrics={'train_runtime': 3481.4314, 'train_samples_per_second': 60.388, 'train_steps_per_second': 3.774, 'total_flos': 2.060108684357069e+16, 'train_loss': 1.1296380308665097, 'epoch': 3.0})

## Testing the model

In [None]:
question = "How many programming languages does BLOOM support?"

context = """BLOOM has 176 billion parameters and can generate 
            text in 46 languages natural languages and 13 programming
            languages."""

# Resolve the newest checkpoint on disk instead of hard-coding a step number:
# the step count depends on the number of epochs and the batch size, so a fixed
# "checkpoint-<N>" path goes stale as soon as the training configuration changes.
checkpoint_dir = get_last_checkpoint("./question_answering_model")
if checkpoint_dir is None:
    raise FileNotFoundError(
        "No checkpoint found in ./question_answering_model; run the training cell first."
    )

question_answerer = pipeline("question-answering", model=checkpoint_dir)
question_answerer(question=question, context=context)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'score': 0.8985372185707092, 'start': 106, 'end': 108, 'answer': '13'}