In [1]:
# !nvidia-smi

# Fine-Tuning DistilBERT for Question Answering

## Imports

In [2]:
from datasets import load_dataset
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DefaultDataCollator
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
2024-10-26 12:44:27.832465: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-26 12:44:27.841049: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1729961067.851882  173662 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1729961067.855023  173662 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-26 12:44:27.867195: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

## Loading dataset
### The dataset is SQuAD, a popular question-answering dataset. Each data point consists of:
* A question
* A context that may contain the answer to the question
* The start index of the answer (a character offset into the context)
* The answer text

In [3]:
from datasets import load_dataset

# Load the SQuAD training split and hold out 20% of it for evaluation.
# A fixed seed keeps the train/test partition identical across kernel
# restarts, so the losses reported further down can be reproduced.
squad = load_dataset("squad", split="train")
squad = squad.train_test_split(test_size=0.2, seed=42)

In [4]:
# Materialise the training split as a pandas DataFrame by going through a
# plain column dictionary.
df = pd.DataFrame.from_dict(squad["train"].to_dict())

# Report, per column, whether any value is missing, then preview the first rows.
print(df.isna().any())
df.head()

id          False
title       False
context     False
question    False
answers     False
dtype: bool


Unnamed: 0,id,title,context,question,answers
0,56e464f48c00841900fbaf85,Architecture,"In the early 19th century, Augustus Welby Nort...",What world did Pugin dislike?,"{'text': ['modern, industrial world'], 'answer..."
1,57278d03dd62a815002ea030,FA_Cup,"Almost 60 years later, 80 year old career crim...",How claims to have stolen the FA cup?,{'text': ['80 year old career criminal Henry (...
2,5725ba9aec44d21400f3d47d,Dutch_language,"Outside of the Low Countries, it is the native...","In what place with the word ""name"" in it do mo...","{'text': ['Suriname'], 'answer_start': [93]}"
3,56cd798262d2951400fa65fa,Sino-Tibetan_relations_during_the_Ming_dynasty,"Throughout the following month, the Yongle Emp...",Who did Deshin Shekpa persuade the Yongle Empe...,{'text': ['religious leaders of other Tibetan ...
4,56d23d72b329da140004ec4d,Buddhism,"Regarding the monastic rules, the Buddha const...",The rules themselves are designed to assure a ...,"{'text': ['life'], 'answer_start': [183]}"


# Preprocessing

In [5]:
# Tokenizer for the `distilbert-base-uncased` checkpoint; `preprocess` below
# reads it as a module-level global.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

## This function tokenizes the question/context pairs, computes the start and end token positions of each answer, and returns both the encoded DataFrame and the corresponding Dataset

In [6]:
def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Translate a character-level answer span into token indices.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for one encoded example.
        sequence_ids: per-token sequence ids for the same example; context
            tokens are the ones whose id is ``1``.
        start_char: character offset where the answer starts in the context.
        end_char: character offset just past the end of the answer.

    Returns:
        ``(start_token, end_token)``, or ``(0, 0)`` when the answer is not
        fully covered by the context tokens (e.g. it was cut off by truncation).
    """
    # Find the first and last context tokens.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside the context window. Comparing the
    # window start with `start_char` and the window end with `end_char`
    # rejects answers that are only partially present; without this, the end
    # label of a truncated answer would land one token past the context.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that begins at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess(df, type):
    """Tokenize question/context pairs and label each with its answer token span.

    Args:
        df: table with ``"question"``, ``"context"`` and ``"answers"`` columns.
            Each ``answers`` entry must provide ``"text"`` and
            ``"answer_start"`` sequences; only their first element is used.
        type: label for this split; the encoded table is written to
            ``encoding_{type}.csv``. (The name shadows the builtin but is kept
            so existing calls keep working.)

    Returns:
        A ``(encoded_df, dataset)`` pair: a DataFrame with ``input_ids``,
        ``attention_mask``, ``start_positions`` and ``end_positions`` columns,
        and a ``Dataset`` built from that DataFrame.

    Side effects:
        Adds ``start_positions`` / ``end_positions`` columns to the passed
        ``df`` and writes the CSV file described above.
    """
    questions = [q.strip() for q in df["question"]]
    # The context is passed through untouched: `answer_start` is a character
    # offset into the original context, so stripping leading whitespace here
    # would shift every label for that example.
    contexts = list(df["context"])

    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    # A plain list gives positional access even if `df` has a non-default index.
    answers = list(df["answers"])

    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        start_token, end_token = _answer_token_span(
            offsets, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    # Kept for backward compatibility: the caller's table also receives the labels.
    df["start_positions"] = start_positions
    df["end_positions"] = end_positions

    encoded_df = pd.DataFrame(
        {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "start_positions": start_positions,
            "end_positions": end_positions,
        }
    )
    encoded_df.to_csv(f"encoding_{type}.csv", index=False)

    dataset = Dataset.from_pandas(encoded_df)
    return encoded_df, dataset

In [7]:
# Encode the training split. The returned DataFrame gets a real name instead
# of `_`, which IPython uses for the previous cell's output.
train_encoded_df, train = preprocess(df, "train")
print(train)
train_encoded_df.head()

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 70079
})


Unnamed: 0,input_ids,attention_mask,start_positions,end_positions
0,"[101, 2054, 2088, 2106, 16405, 11528, 18959, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",36,39
1,"[101, 2129, 4447, 2000, 2031, 7376, 1996, 6904...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",16,33
2,"[101, 1999, 2054, 2173, 2007, 1996, 2773, 1000...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",37,37
3,"[101, 2040, 2106, 4078, 10606, 2016, 2243, 450...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",232,239
4,"[101, 1996, 3513, 3209, 2024, 2881, 2000, 1430...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",47,47


# Preprocessing the test dataset

In [8]:
# Rebuild `df` from the held-out "test" split, going through a plain column
# dictionary exactly as was done for the training split.
df = pd.DataFrame.from_dict(squad["test"].to_dict())

In [9]:
# Encode the held-out split. The returned DataFrame gets a real name instead
# of `_`, which IPython uses for the previous cell's output.
test_encoded_df, test = preprocess(df, "test")
print(test)
test_encoded_df.head()

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 17520
})


Unnamed: 0,input_ids,attention_mask,start_positions,end_positions
0,"[101, 2054, 2003, 3344, 2041, 2006, 3329, 2302...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",218,219
1,"[101, 2054, 2515, 1996, 2413, 4552, 2110, 2005...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",23,28
2,"[101, 2054, 2468, 4187, 2043, 1037, 2711, 2003...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",27,27
3,"[101, 2054, 2020, 2691, 1999, 2047, 5231, 1029...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",101,101
4,"[101, 2054, 3297, 2136, 3249, 9433, 2005, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",31,31


# Fine-tuning DistilBERT on the dataset

In [10]:
# Start from the pretrained `distilbert-base-uncased` weights; the
# question-answering head (`qa_outputs`) is newly initialised and is learned
# during fine-tuning.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

# `preprocess` already pads every example to a fixed length, so no dynamic
# padding is needed at batch time.
data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    output_dir="question_answering_model",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Train for 5 epochs (see `num_train_epochs`), evaluating on the held-out
# split after each one.
# NOTE(review): in the recorded run the validation loss is lowest at epoch 2
# and rises afterwards while the training loss keeps falling; consider fewer
# epochs or keeping the best checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.2684,1.147714
2,0.9966,1.091371
3,0.7679,1.119363
4,0.5983,1.188258
5,0.4893,1.269816


TrainOutput(global_step=21900, training_loss=0.8982335429431096, metrics={'train_runtime': 5288.9822, 'train_samples_per_second': 66.25, 'train_steps_per_second': 4.141, 'total_flos': 3.433514473928448e+16, 'train_loss': 0.8982335429431096, 'epoch': 5.0})

## Testing the model

In [2]:
from transformers import pipeline

# Build a question-answering pipeline from the checkpoint saved at the last
# training step (global step 21900).
question_answerer = pipeline(
    task="question-answering",
    model="./question_answering_model/checkpoint-21900",
)

# Smoke test: one question whose answer appears verbatim in the context.
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
question = "How many programming languages does BLOOM support?"

question_answerer(question=question, context=context)

  from .autonotebook import tqdm as notebook_tqdm
2024-10-27 17:28:11.980559: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-27 17:28:12.181136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730064492.253699    1423 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730064492.279453    1423 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-27 17:28:12.429089: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

{'score': 0.8985372185707092, 'start': 93, 'end': 95, 'answer': '13'}