In [1]:
#This example fine-tunes a model for question answering on SQuAD (the Stanford Question Answering Dataset).
#Adjust the batch size to the chosen model and to the GPU (if one is used) to avoid out-of-memory errors.
#The three parameters in the next cell must be set for the notebook to run smoothly.

In [2]:
squad_v2 = False #True loads the "squad_v2" dataset, False loads "squad"
model_checkpoint = "distilbert-base-uncased" #pretrained checkpoint used to load both the tokenizer and the model
batch_size = 16 #per-device batch size, used for both training and evaluation

In [3]:
import transformers
print(transformers.__version__)

4.41.1


In [4]:
from datasets import load_dataset, load_metric
#NOTE(review): load_metric is not used in any cell visible here - confirm whether a later evaluation step still needs it.
#Load the dataset variant selected by the squad_v2 flag.
datasets = load_dataset("squad_v2" if squad_v2 else "squad")
datasets #display the available splits with their columns and row counts

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [5]:
datasets["train"][0] #first training example: id, title, context, question and answers (text + character start)

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [6]:
from transformers import AutoTokenizer
#Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
import transformers
#Fail early if the tokenizer is not a fast tokenizer; later cells request offset mappings, which presumably rely on it - confirm.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast) #fast tokenizers are backed by a Rust implementation and are much faster

In [8]:
tokenizer("What is your name?", "My name is Kamalam.") #encode two sentences as a single input; the output holds input_ids and attention_mask

{'input_ids': [101, 2054, 2003, 2115, 2171, 1029, 102, 2026, 2171, 2003, 25353, 22144, 2378, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
max_length = 384 # The maximum length, in tokens, of a feature (question and context together)
doc_stride = 128 # The authorized overlap, in tokens, between two consecutive parts of the context when it has to be split.

In [10]:
#Find the first training example whose tokenized question + context is longer than
#max_length, i.e. one that has to be split into several features.
#The threshold is read from max_length so this search stays in sync with the config cell.
for i, example in enumerate(datasets["train"]):
    if len(tokenizer(example["question"], example["context"])["input_ids"]) > max_length:
        break
#If no example is longer than max_length the loop ends normally and this is the last example.
example = datasets["train"][i]

In [11]:
example #max_length context

{'id': '5733caf74776f4190066124c',
 'title': 'University_of_Notre_Dame',
 'context': "The men's basketball team has over 1,600 wins, one of only 12 schools who have reached that mark, and have appeared in 28 NCAA tournaments. Former player Austin Carr holds the record for most points scored in a single game of the tournament with 61. Although the team has never won the NCAA Tournament, they were named by the Helms Athletic Foundation as national champions twice. The team has orchestrated a number of upsets of number one ranked teams, the most notable of which was ending UCLA's record 88-game winning streak in 1974. The team has beaten an additional eight number-one teams, and those nine wins rank second, to UCLA's 10, all-time in wins against the top team. The team plays in newly renovated Purcell Pavilion (within the Edmund P. Joyce Center), which reopened for the beginning of the 2009–2010 season. The team is coached by Mike Brey, who, as of the 2014–15 season, his fifteenth at Notre

In [12]:
#Chunking demo on a short sentence: tokens that do not fit in max_length are
#returned as additional chunks, and stride sets how many tokens consecutive chunks share.
sentence = "This sentence is not too long but we are going to split it anyway."
inputs = tokenizer(
    sentence,
    max_length=6,
    stride=2,
    truncation=True,
    return_overflowing_tokens=True,
)

#[CLS] marks the beginning of every chunk and [SEP] is the separator tag that closes it.
for chunk_text in map(tokenizer.decode, inputs["input_ids"]):
    print(chunk_text)

[CLS] this sentence is not [SEP]
[CLS] is not too long [SEP]
[CLS] too long but we [SEP]
[CLS] but we are going [SEP]
[CLS] are going to split [SEP]
[CLS] to split it anyway [SEP]
[CLS] it anyway. [SEP]


In [13]:
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length, #max length of one feature (question and context together)
    truncation="only_second", #truncate only the second sequence, i.e. the context
    return_overflowing_tokens=True, #also return the parts that overflow, so a long context yields several features
    stride=doc_stride #number of tokens shared by consecutive parts, so an answer near a boundary doesn't get missed
)

In [14]:
for x in tokenized_example["input_ids"][:2]: #only the first two features
    print(tokenizer.decode(x)) #decode the token ids of the feature back to text
    print("\n") #blank lines between the features

[CLS] how many wins does the notre dame men's basketball team have? [SEP] the men's basketball team has over 1, 600 wins, one of only 12 schools who have reached that mark, and have appeared in 28 ncaa tournaments. former player austin carr holds the record for most points scored in a single game of the tournament with 61. although the team has never won the ncaa tournament, they were named by the helms athletic foundation as national champions twice. the team has orchestrated a number of upsets of number one ranked teams, the most notable of which was ending ucla's record 88 - game winning streak in 1974. the team has beaten an additional eight number - one teams, and those nine wins rank second, to ucla's 10, all - time in wins against the top team. the team plays in newly renovated purcell pavilion ( within the edmund p. joyce center ), which reopened for the beginning of the 2009 – 2010 season. the team is coached by mike brey, who, as of the 2014 – 15 season, his fifteenth at notr

In [15]:
#Same tokenization as before, now also asking for the character offsets of every token.
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length,
    truncation="only_second", 
    return_overflowing_tokens=True,
    return_offsets_mapping=True, #for every token, return its (start, end) character position in the text it came from; [CLS] gets (0, 0)
    stride=doc_stride
)
print(tokenized_example["offset_mapping"][0][1]) #offsets of token 1 of feature 0, i.e. "how"

(0, 3)


In [16]:
#Check that the offsets point back to the right piece of the question.
first_token_id = tokenized_example["input_ids"][0][1] #token right after [CLS] in the first feature
offsets = tokenized_example["offset_mapping"][0][1] #(start, end) character offsets of that token
print(tokenizer.convert_ids_to_tokens([first_token_id])[0], example["question"][offsets[0]:offsets[1]]) #the token next to the question substring its offsets select

how How


In [17]:
tokenized_example["offset_mapping"][0][1]

(0, 3)

In [18]:
#Decode the first feature one token id at a time to show how the text was split into (sub)word tokens.
for x in tokenized_example["input_ids"][0]:
    print(tokenizer.decode(x))

[CLS]
how
many
wins
does
the
notre
dame
men
'
s
basketball
team
have
?
[SEP]
the
men
'
s
basketball
team
has
over
1
,
600
wins
,
one
of
only
12
schools
who
have
reached
that
mark
,
and
have
appeared
in
28
ncaa
tournaments
.
former
player
austin
carr
holds
the
record
for
most
points
scored
in
a
single
game
of
the
tournament
with
61
.
although
the
team
has
never
won
the
ncaa
tournament
,
they
were
named
by
the
helm
##s
athletic
foundation
as
national
champions
twice
.
the
team
has
orchestrated
a
number
of
upset
##s
of
number
one
ranked
teams
,
the
most
notable
of
which
was
ending
ucla
'
s
record
88
-
game
winning
streak
in
1974
.
the
team
has
beaten
an
additional
eight
number
-
one
teams
,
and
those
nine
wins
rank
second
,
to
ucla
'
s
10
,
all
-
time
in
wins
against
the
top
team
.
the
team
plays
in
newly
renovated
purcell
pavilion
(
within
the
edmund
p
.
joyce
center
)
,
which
reopened
for
the
beginning
of
the
2009
–
2010
season
.
the
team
is
coached
by
mike
br
##ey
,
who
,
as
of
the
201

In [19]:
sequence_ids = tokenized_example.sequence_ids() #called without an index; later cells pair it with feature 0
print(sequence_ids) #None marks special tokens such as [CLS]/[SEP], 0 indicates question tokens, 1 indicates context tokens

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [20]:
#The sequence ids, together with the offset mapping, can be used to find the start and end tokens of the answer inside the context.

In [21]:
tokenized_example["input_ids"][0]

[101,
 2129,
 2116,
 5222,
 2515,
 1996,
 10289,
 8214,
 2273,
 1005,
 1055,
 3455,
 2136,
 2031,
 1029,
 102,
 1996,
 2273,
 1005,
 1055,
 3455,
 2136,
 2038,
 2058,
 1015,
 1010,
 5174,
 5222,
 1010,
 2028,
 1997,
 2069,
 2260,
 2816,
 2040,
 2031,
 2584,
 2008,
 2928,
 1010,
 1998,
 2031,
 2596,
 1999,
 2654,
 5803,
 8504,
 1012,
 2280,
 2447,
 5899,
 12385,
 4324,
 1996,
 2501,
 2005,
 2087,
 2685,
 3195,
 1999,
 1037,
 2309,
 2208,
 1997,
 1996,
 2977,
 2007,
 6079,
 1012,
 2348,
 1996,
 2136,
 2038,
 2196,
 2180,
 1996,
 5803,
 2977,
 1010,
 2027,
 2020,
 2315,
 2011,
 1996,
 16254,
 2015,
 5188,
 3192,
 2004,
 2120,
 3966,
 3807,
 1012,
 1996,
 2136,
 2038,
 23339,
 1037,
 2193,
 1997,
 6314,
 2015,
 1997,
 2193,
 2028,
 4396,
 2780,
 1010,
 1996,
 2087,
 3862,
 1997,
 2029,
 2001,
 4566,
 12389,
 1005,
 1055,
 2501,
 6070,
 1011,
 2208,
 3045,
 9039,
 1999,
 3326,
 1012,
 1996,
 2136,
 2038,
 7854,
 2019,
 3176,
 2809,
 2193,
 1011,
 2028,
 2780,
 1010,
 1998,
 2216,
 3157,
 52

In [23]:
#Locate the token span of the first answer inside the first feature of tokenized_example.
answers = example["answers"]
#Character span [start_char, end_char) of the first answer in the context.
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])

#Start token index of the current span in the text: first token whose sequence id is 1 (context).
token_start_index = 0
while sequence_ids[token_start_index] != 1:
    token_start_index += 1

#End token index of the current span in the text: last token whose sequence id is 1 (context).
token_end_index = len(tokenized_example["input_ids"][0]) - 1
while sequence_ids[token_end_index] != 1:
    token_end_index -= 1

#Detect if the answer is out of the span. In this cell that case only prints a message;
#prepare_train_features labels such a feature with the CLS index instead.
offsets = tokenized_example["offset_mapping"][0]
if (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
    # Move the token_start_index and token_end_index to the two ends of the answer.
    #Advance past every token that starts at or before start_char; the token just before the stop is the answer's first token.
    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
        token_start_index += 1
    start_position = token_start_index - 1
    #Walk back past every token that ends at or after end_char; the token just after the stop is the answer's last token.
    while offsets[token_end_index][1] >= end_char:
        token_end_index -= 1
    end_position = token_end_index + 1
    print(start_position, end_position)
else:
    #NOTE: start_position and end_position are not assigned on this branch, yet the next cell reads them.
    print("The answer is not in this feature.")

23 26


In [24]:
print(tokenizer.decode(tokenized_example["input_ids"][0][start_position: end_position+1])) #answer rebuilt from the token span (end_position is inclusive, hence the +1)
print(answers["text"][0]) #the answer text stored in the dataset, for comparison

over 1, 600
over 1,600


In [25]:
pad_on_right = tokenizer.padding_side == "right" 
#Boolean: True when the tokenizer pads on the right (the default).
#When it is False (the model expects padding on the left), prepare_train_features swaps the order of question and context.

In [26]:
def prepare_train_features(examples):
    """Tokenize a batch of question/context examples into labelled training features.

    Every question/context pair is tokenized with truncation applied only to the
    context. A context that does not fit in ``max_length`` is split into several
    overlapping features (overlap set by ``doc_stride``), so one example can
    produce more than one feature. Each feature is labelled with the token
    indices of the answer span, or with the index of the [CLS] token when the
    example has no answer or the answer is not fully inside that feature.

    Relies on the notebook-level names ``tokenizer``, ``pad_on_right``,
    ``max_length`` and ``doc_stride``. The ``examples`` batch is not modified.

    Args:
        examples: Batch mapping column names to lists. The ``"question"``,
            ``"context"`` and ``"answers"`` columns are read; each answers entry
            is a mapping with ``"text"`` and ``"answer_start"`` lists, of which
            only the first element is used.

    Returns:
        The tokenizer output for the batch, with ``"offset_mapping"`` and
        ``"overflow_to_sample_mapping"`` removed and two lists added,
        ``"start_positions"`` and ``"end_positions"``, one label per feature.
    """
    #Remove whitespace at the beginning of each question. A new list is built
    #instead of assigning back into `examples`, so the caller's batch is left untouched.
    questions = [q.lstrip() for q in examples["question"]]
    contexts = examples["context"]

    #The question goes first when padding is on the right, otherwise the context does;
    #the truncation mode is chosen so that only the context is ever truncated.
    if pad_on_right:
        first_texts, second_texts = questions, contexts
    else:
        first_texts, second_texts = contexts, questions

    tokenized_examples = tokenizer(
        first_texts,
        second_texts,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    #Map from a feature to its corresponding example
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    #Map from token to character position in the original context.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    #Sequence id that marks context tokens; constant for the whole batch.
    context_sequence_id = 1 if pad_on_right else 0

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        #Impossible answers are labelled with the [CLS] token
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        #Sequence ids of this feature, used to separate the question from the context
        sequence_ids = tokenized_examples.sequence_ids(i)

        #One example can give several spans; look up the example this feature came from.
        answers = examples["answers"][sample_mapping[i]]

        #If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        #Start/end character index of the answer in the text.
        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        #Start token index of the current span in the text.
        token_start_index = 0
        while sequence_ids[token_start_index] != context_sequence_id:
            token_start_index += 1

        #End token index of the current span in the text.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != context_sequence_id:
            token_end_index -= 1

        #Detect if the answer is out of the span (if so labeled with the CLS index).
        answer_in_span = (
            offsets[token_start_index][0] <= start_char
            and offsets[token_end_index][1] >= end_char
        )
        if not answer_in_span:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        #Otherwise move token_start_index and token_end_index to the two ends of the answer.
        #Advance past every token starting at or before start_char; the previous token is the answer's first.
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)
        #Walk back past every token ending at or after end_char; the next token is the answer's last.
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

In [27]:
features = prepare_train_features(datasets['train'][:5]) #try the preprocessing on the first five training examples

In [28]:
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names) #tokenize the dataset in batches; the original columns are removed so only the columns returned by prepare_train_features remain

In [29]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
#Load the checkpoint for question answering; the warning printed below reports that the qa_outputs weights are newly initialized.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
#Setting up all the attributes to customize the training using "TrainingArguments"
model_name = model_checkpoint.split("/")[-1] #keep only the part after the last "/" of the checkpoint name
args = TrainingArguments(
    f"{model_name}-finetuned-squad", #folder to save the model checkpoints
    eval_strategy = "epoch", #evaluation strategy, done at the end of each epoch
    learning_rate=2e-5, #defining the learning rate
    per_device_train_batch_size=batch_size, #batch size per device during training
    per_device_eval_batch_size=batch_size, #batch size per device during evaluation
    num_train_epochs=3, #number of training epochs
    weight_decay=0.01, #weight decay coefficient
)

In [34]:
from transformers import default_data_collator
#Features are already padded to max_length in prepare_train_features, so no padding-aware collator is configured here.
data_collator = default_data_collator #to batch the processed examples together

In [35]:
#passing the configurations and dataset to the Trainer
trainer = Trainer(
    model, #the question-answering model loaded from model_checkpoint
    args, #the TrainingArguments defined above
    train_dataset=tokenized_datasets["train"], #preprocessed training features
    eval_dataset=tokenized_datasets["validation"], #preprocessed validation features, evaluated at the end of each epoch
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
#NOTE(review): the recorded training output ends in KeyboardInterrupt, so this saves the model in whatever state training left it.
trainer.save_model("qa-bert-trained") #Saving the model locally