https://huggingface.co/course/chapter7/7?fw=pt

## Install libraries

In [None]:
! pip install datasets transformers datasets huggingface_hub

In [None]:
!apt install git-lfs

## Login to huggingface

In [None]:
from huggingface_hub import notebook_login

notebook_login()

# Fine-tuning a model on a question-answering task

## SELECT MODEL

In [None]:
# Select a model
# Candidate checkpoints; the trailing number on each line is that entry's index in the list.
model_names = [
    "bert-base-cased", # 0
    "prajjwal1/bert-tiny", # 1
    "prajjwal1/bert-small", # 2
    "distilbert-base-uncased", # 3
    "albert-base-v2", # 4
    "xlm-roberta-base", # 5
    "HooshvareLab/bert-fa-zwnj-base", # 6
    "HooshvareLab/distilbert-fa-zwnj-base", # 7
    "HooshvareLab/roberta-fa-zwnj-base", # 8
    # fine-tuned on squad
    "distilbert-base-uncased-distilled-squad", # 9
    "deepset/roberta-base-squad2", # 10
]
# Index into `model_names` of the checkpoint to fine-tune (7 -> "HooshvareLab/distilbert-fa-zwnj-base").
model_id = 7

# This flag distinguishes SQuAD v1 from v2 (for any other dataset, it indicates whether impossible
# answers are allowed or not).
# Since pquad is similar to squad_v2, we choose True here.
squad_v2 = True

# Checkpoint name used below to load both the tokenizer and the model.
model_checkpoint = model_names[model_id]
# Per-device batch size, used for both training and evaluation in the training arguments.
batch_size = 16

## Loading the dataset

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# datasets = load_dataset("squad_v2" if squad_v2 else "squad")
datasets = load_dataset("gholamreza/pquad")

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

Downloading and preparing dataset pquad/pquad to /root/.cache/huggingface/datasets/gholamreza___pquad/pquad/1.0.0/ce0e1bf6a7a67398a195ccb3a16263336e977abad6e67800390346e8d9530ab0...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/602k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/593k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/63994 [00:00<?, ? examples/s]

/root/.cache/huggingface/datasets/downloads/e49d5f650d69a5999fe6ceb4438a023cccdcf3e6519abc4dabce736f91595591


Generating validation split:   0%|          | 0/7976 [00:00<?, ? examples/s]

/root/.cache/huggingface/datasets/downloads/ea42ddfa9db6f39bc3249a878c853a6f6b466f6217a360bbb8afbac9410d84cc


Generating test split:   0%|          | 0/8002 [00:00<?, ? examples/s]

/root/.cache/huggingface/datasets/downloads/d6ba3b80ff2a6d0333454fac286694b5e777518ea141e0dcd7c0558b71624882
Dataset pquad downloaded and prepared to /root/.cache/huggingface/datasets/gholamreza___pquad/pquad/1.0.0/ce0e1bf6a7a67398a195ccb3a16263336e977abad6e67800390346e8d9530ab0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

The `datasets` object itself is a [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key each for the training, validation and test sets.

In [None]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 63994
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 7976
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 8002
    })
})

### Size of splits

In [None]:
print(f"Size of Train dataset: {len(datasets['train'])}")
print(f"Size of Valid dataset: {len(datasets['validation'])}")
print(f"Size of Test dataset:  {len(datasets['test'])}")

Size of Train dataset: 63994
Size of Valid dataset: 7976
Size of Test dataset:  8002


### A sample data

In [None]:
# datasets["train"][0]

### Show random samples

In [None]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Columns whose feature type is a `ClassLabel` (or a `Sequence` of `ClassLabel`) are shown
    with their label names instead of the integer ids.

    Args:
        dataset: a dataset supporting `len()`, indexing with a list of row indices, and `.features`.
        num_examples: how many distinct rows to show; must not exceed `len(dataset)`.

    Raises:
        AssertionError: if `num_examples` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the picks are distinct without a manual retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Replace integer class ids with their readable names.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Same, for columns holding a list of class ids per row.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [None]:
# show_random_elements(datasets["train"])

## Preprocessing the training data

In [None]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/500 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

### Check Tokenizer

In [None]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.

Let's find one long example in our dataset:

In [None]:
# Find the first training example whose tokenized question + context is longer than `max_length`,
# so the following cells can demonstrate how over-long contexts are split.
for i, example in enumerate(datasets["train"]):
    if len(tokenizer(example["question"], example["context"])["input_ids"]) > max_length:
        # `example` is already the row at index `i`; no need to index the dataset again.
        break
else:
    # Without this guard the loop would fall through and the cells below would silently
    # run on the last (short) example instead of a long one.
    raise ValueError(f"No training example is longer than max_length={max_length} tokens.")

Without any truncation, we get the following length for the input IDs:

In [None]:
len(tokenizer(example["question"], example["context"])["input_ids"])

395

Now, if we just truncate, we will lose information (and possibly the answer to our question):

In [None]:
len(tokenizer(example["question"], example["context"], max_length=max_length, truncation="only_second")["input_ids"])

384

In [None]:
# Truncate only the context (the second sequence) and ask the tokenizer to return the overflowing
# part as additional features, using `doc_stride` as the stride between consecutive features.
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length,
    truncation="only_second",
    return_overflowing_tokens=True,
    stride=doc_stride
)

Now we don't have one list of `input_ids`, but several: 

In [None]:
[len(x) for x in tokenized_example["input_ids"]]

[384, 166]

And if we decode them, we can see the overlap:

In [None]:
for x in tokenized_example["input_ids"][:2]:
    print(tokenizer.decode(x))

[CLS] بزرگ خاندان استارک که اعضای خانواده [ZWNJ] اش در بیشتر قسمت [ZWNJ] های مجموعه بازی تاج [ZWNJ] وتخت حضور دارند چه نام دارد ؟ [SEP] [UNK] [UNK] ند استارک [ و 8 ] ( شان بین ) بزرگ خاندان استارک [ و 6 ] است که اعضای خانواده [ZWNJ] اش در بیشتر قسمت [ZWNJ] های داستان حضور دارند. او و همسرش کتلین تالی [ و 9 ] ( میشل فرلی ) پنج فرزند دارند : راب [ و 10 ] ( ریچارد مدن ) فرزند بزرگ خانواده ، سانسا [ و 11 ] ( سوفی ترنر ) دختری زیبا ، [UNK] [ و 12 ] ( میسی ویلیامز ) دختری بازیگوش ، برن [ و 13 ] ( ایزاک همپستد رایت ) پسری ماجراجو و ریکن [ و 14 ] ( [UNK] پارکینسون ) کوچک [ZWNJ] ترین فرزند [UNK] [ZWNJ] هاست. گروگان و نگهبان ند استارک ، تیان گریجوی [ و 15 ] ( [UNK] [UNK] ) نیز تا پیش از مواجهه با رمزی اسنو [ و 16 ] دیوانه ( ایوان ریان ) در کنار استارک [ZWNJ] ها زندگی می [ZWNJ] کرد. همسر راب ، تالیسا مگیر [ و 17 ] ( اونا چاپلین ) شفابخش است. [UNK] نیز با شاگرد [UNK] به نام گندری [ و 18 ] ( جو دمپسی ) رابطه [ZWNJ] ای دوستانه دارد. پسر حرام [ZWNJ] [UNK] ند ، جان اسنو [ و 19 ] ( کیت هرینگتون ) و دوس

In [None]:
# Same call as above, additionally requesting the offset mapping: one (start, end) character
# span per token, used below to map tokens back to positions in the original text.
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    max_length=max_length,
    truncation="only_second",
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    stride=doc_stride
)
# Show the first 100 offsets of the first feature.
print(tokenized_example["offset_mapping"][0][:100])

[(0, 0), (0, 4), (5, 11), (12, 18), (19, 21), (22, 27), (28, 35), (35, 36), (36, 38), (39, 41), (42, 47), (48, 52), (52, 53), (53, 56), (57, 63), (64, 68), (69, 72), (72, 73), (73, 77), (78, 82), (83, 88), (89, 91), (92, 95), (96, 100), (100, 101), (0, 0), (0, 4), (5, 11), (12, 14), (15, 21), (21, 22), (22, 23), (24, 25), (25, 26), (27, 28), (28, 31), (32, 35), (35, 36), (37, 41), (42, 48), (49, 55), (55, 56), (56, 57), (58, 59), (59, 60), (61, 64), (65, 67), (68, 73), (74, 81), (81, 82), (82, 84), (85, 87), (88, 93), (94, 98), (98, 99), (99, 102), (103, 109), (110, 114), (115, 120), (120, 121), (122, 124), (125, 126), (127, 132), (133, 136), (136, 138), (139, 143), (143, 144), (144, 145), (146, 147), (147, 148), (149, 150), (150, 154), (155, 157), (157, 159), (159, 160), (161, 164), (165, 170), (171, 176), (176, 177), (178, 181), (181, 182), (182, 183), (184, 186), (186, 187), (188, 189), (189, 195), (196, 198), (198, 199), (199, 200), (201, 206), (207, 211), (212, 219), (219, 220), (

In [None]:
first_token_id = tokenized_example["input_ids"][0][1]
offsets = tokenized_example["offset_mapping"][0][1]
print(tokenizer.convert_ids_to_tokens([first_token_id])[0], example["question"][offsets[0]:offsets[1]])

بزرگ بزرگ


In [None]:
sequence_ids = tokenized_example.sequence_ids()
print(sequence_ids)

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
# Worked example: convert the character span of the first answer of `example` into start/end
# token indices inside the first feature of `tokenized_example`.
# NOTE: `start_position` / `end_position` are only assigned in the `if` branch below, and the next
# cell reads them; it will fail if the answer is not contained in this feature.
answers = example["answers"]
start_char = answers["answer_start"][0]
end_char = start_char + len(answers["text"][0])

# Start token index of the current span in the text.
token_start_index = 0
while sequence_ids[token_start_index] != 1:
    token_start_index += 1

# End token index of the current span in the text.
token_end_index = len(tokenized_example["input_ids"][0]) - 1
while sequence_ids[token_end_index] != 1:
    token_end_index -= 1

# Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
offsets = tokenized_example["offset_mapping"][0]
if (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
    # Move the token_start_index and token_end_index to the two ends of the answer.
    # Note: we could go after the last offset if the answer is the last word (edge case).
    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
        token_start_index += 1
    # The loop stops one token past the answer start, hence the -1.
    start_position = token_start_index - 1
    while offsets[token_end_index][1] >= end_char:
        token_end_index -= 1
    # The loop stops one token before the answer end, hence the +1.
    end_position = token_end_index + 1
    print(start_position, end_position)
else:
    print("The answer is not in this feature.")

26 29


In [None]:
print(tokenizer.decode(tokenized_example["input_ids"][0][start_position: end_position+1]))
print(answers["text"][0])

[UNK] [UNK] ند استارک
لُرد اِدارد ند استارک


In [None]:
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples into fixed-length training features with answer labels.

    A long context is split into several overlapping features (using `doc_stride`), each padded
    to `max_length`. Every feature is labelled with the token indices of the first answer, or with
    the index of the CLS token when the example has no answer or the answer is not fully contained
    in that feature.

    Relies on the notebook-level `tokenizer`, `pad_on_right`, `max_length` and `doc_stride`.

    Args:
        examples: a batch (mapping of column name to list) with "question", "context" and
            "answers" columns; each answers entry holds "answer_start" and "text" lists.
            The batch is not modified.

    Returns:
        The tokenizer output for all features, with "overflow_to_sample_mapping" and
        "offset_mapping" removed and "start_positions" / "end_positions" added
        (one entry per feature).
    """
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lot of space), so we strip it.
    # Python strings are stored in logical (reading) order, so for right-to-left text the leading
    # whitespace is still at the start of the string: lstrip() is the right call, not rstrip().
    # A local list is used so the caller's batch is left untouched.
    questions = [q.lstrip() for q in examples["question"]]
    contexts = examples["context"]

    # Sequence id of the context tokens: the context is the second sequence when padding on the right.
    context_index = 1 if pad_on_right else 0

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text (only the first answer is used).
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_index:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_index:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
features = prepare_train_features(datasets['train'][:5])

### Apply prepare_train_features on training set

In [None]:
# Build training features for every split of `datasets`, in batches. The original columns are dropped
# because one example can produce several features, so the row counts no longer match.
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)

  0%|          | 0/64 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

## Fine-tuning the model

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/303M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/distilbert-fa-zwnj-base were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at HooshvareLab/distilbert-fa-zwnj-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You shoul

### Training Config

In [None]:
# Drop the organisation prefix (everything up to the last "/") to get a short model name.
model_name = model_checkpoint.split("/")[-1]
# NOTE(review): the saved outputs further down show "Num Epochs = 2" and a repository named
# "...-finetuned-2epoch-pquad", while this cell configures 3 epochs and "...-finetuned-pquad".
# The outputs appear to come from an earlier version of this cell - confirm which is intended.
args = TrainingArguments(
    f"{model_name}-finetuned-pquad",  # first positional argument: output directory for this run
    evaluation_strategy = "epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,  # uploads to the Hugging Face Hub; presumably why the login cell above is needed
)

Then we will need a data collator that will batch our processed examples together; here the default one will work:

In [None]:
from transformers import default_data_collator

data_collator = default_data_collator

## ||| TRAIN

In [None]:
# Train on the tokenized training split and evaluate on the tokenized validation split.
# Note that `trainer.get_eval_dataloader()` therefore yields validation batches, whatever
# `target_dataset` is set to later on.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/Gholamreza/distilbert-fa-zwnj-base-finetuned-2epoch-pquad into local empty directory.


Download file pytorch_model.bin:   0%|          | 16.5k/287M [00:00<?, ?B/s]

Download file runs/Feb18_19-16-59_98a20cd19eb9/1676747831.136729/events.out.tfevents.1676747831.98a20cd19eb9.6…

Clean file runs/Feb18_19-16-59_98a20cd19eb9/1676747831.136729/events.out.tfevents.1676747831.98a20cd19eb9.683.…

Download file training_args.bin: 100%|##########| 3.43k/3.43k [00:00<?, ?B/s]

Clean file training_args.bin:  29%|##9       | 1.00k/3.43k [00:00<?, ?B/s]

Download file runs/Feb18_19-16-59_98a20cd19eb9/events.out.tfevents.1676747831.98a20cd19eb9.683.0: 100%|#######…

Clean file runs/Feb18_19-16-59_98a20cd19eb9/events.out.tfevents.1676747831.98a20cd19eb9.683.0:  21%|##        …

Download file runs/Feb19_12-22-31_e5992a9644a0/events.out.tfevents.1676809424.e5992a9644a0.223.0: 100%|#######…

Clean file runs/Feb19_12-22-31_e5992a9644a0/events.out.tfevents.1676809424.e5992a9644a0.223.0:  24%|##3       …

Download file runs/Feb19_12-22-31_e5992a9644a0/1676809424.9824603/events.out.tfevents.1676809424.e5992a9644a0.…

Clean file runs/Feb19_12-22-31_e5992a9644a0/1676809424.9824603/events.out.tfevents.1676809424.e5992a9644a0.223…

Clean file pytorch_model.bin:   0%|          | 1.00k/287M [00:00<?, ?B/s]

We can now finetune our model by just calling the `train` method:

In [None]:
trainer.train()

***** Running training *****
  Num examples = 64043
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 8006
  Number of trainable parameters = 75179522


Epoch,Training Loss,Validation Loss
1,1.1522,1.143524
2,0.8579,1.108949


Saving model checkpoint to distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-500
Configuration saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-500/config.json
Model weights saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-500/pytorch_model.bin
tokenizer config file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-500/tokenizer_config.json
Special tokens file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-500/special_tokens_map.json
tokenizer config file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/tokenizer_config.json
Special tokens file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/special_tokens_map.json
Saving model checkpoint to distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-1000
Configuration saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-1000/config.json
Model weights saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/checkpoint-1000/pytorch

TrainOutput(global_step=8006, training_loss=1.1742519172108117, metrics={'train_runtime': 5554.2697, 'train_samples_per_second': 23.061, 'train_steps_per_second': 1.441, 'total_flos': 1.2551124728024064e+16, 'train_loss': 1.1742519172108117, 'epoch': 2.0})

Since this training is particularly long, let's save the model just in case we need to restart.

In [None]:
trainer.save_model("test-pquad-trained")

Saving model checkpoint to test-pquad-trained
Configuration saved in test-pquad-trained/config.json
Model weights saved in test-pquad-trained/pytorch_model.bin
tokenizer config file saved in test-pquad-trained/tokenizer_config.json
Special tokens file saved in test-pquad-trained/special_tokens_map.json
Saving model checkpoint to distilbert-fa-zwnj-base-finetuned-2epoch-pquad
Configuration saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/config.json
Model weights saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/pytorch_model.bin
tokenizer config file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/tokenizer_config.json
Special tokens file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 32.0k/287M [00:00<?, ?B/s]

Upload file runs/Feb19_12-51-30_293d815ad4f9/events.out.tfevents.1676811180.293d815ad4f9.401.0: 100%|#########…

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/Gholamreza/distilbert-fa-zwnj-base-finetuned-2epoch-pquad
   2844b92..766195e  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/Gholamreza/distilbert-fa-zwnj-base-finetuned-2epoch-pquad
   2844b92..766195e  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Question Answering', 'type': 'question-answering'}, 'dataset': {'name': 'pquad', 'type': 'pquad', 'config': 'pquad', 'split': 'validation', 'args': 'pquad'}}
To https://huggingface.co/Gholamreza/distilbert-fa-zwnj-base-finetuned-2epoch-pquad
   766195e..17f6310  main -> main

   766195e..17f6310  main -> main



## Evaluation

In [None]:
# Choose which one to do evaluation on

# target_dataset = "validation"
target_dataset = "test"

In [None]:
import torch

# Sanity check: run the model once, without gradient tracking, on the first evaluation batch.
for batch in trainer.get_eval_dataloader():
    break  # only the first batch is needed
target_device = trainer.args.device
batch = {name: tensor.to(target_device) for name, tensor in batch.items()}
with torch.no_grad():
    output = trainer.model(**batch)
# Show which fields the model returned.
output.keys()

odict_keys(['loss', 'start_logits', 'end_logits'])

In [None]:
output.start_logits.shape, output.end_logits.shape

(torch.Size([16, 384]), torch.Size([16, 384]))

In [None]:
output.start_logits.argmax(dim=-1), output.end_logits.argmax(dim=-1)

(tensor([ 22,  43,  58,  64,  62,   0,  27,  29,  56,  62, 106, 123,   0,   0,
          90, 130], device='cuda:0'),
 tensor([ 28,  55,  60,  64,  62,   0,  29,  37,  58,  64, 106, 128,   0,  51,
         108, 154], device='cuda:0'))

In [None]:
n_best_size = 20

In [None]:
import numpy as np

# Logits of the first feature in the batch, as NumPy arrays.
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
# Indices of the `n_best_size` largest start/end logits, best first.
start_indexes = np.argsort(start_logits)[::-1][:n_best_size].tolist()
end_indexes = np.argsort(end_logits)[::-1][:n_best_size].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # We need to refine that test to check the answer is inside the context
        if start_index > end_index:
            continue
        candidate = {
            "score": start_logits[start_index] + end_logits[end_index],
            # We need to find a way to get back the original substring corresponding to the answer in the context
            "text": "",
        }
        valid_answers.append(candidate)

In [None]:
def prepare_validation_features(examples):
    """Tokenize a batch of QA examples into fixed-length features for evaluation/post-processing.

    A long context is split into several overlapping features (using `doc_stride`), each padded
    to `max_length`. No answer labels are produced; instead each feature records the id of the
    example it came from, and keeps its offset mapping so predicted token spans can be mapped
    back to text in the context.

    Relies on the notebook-level `tokenizer`, `pad_on_right`, `max_length` and `doc_stride`.

    Args:
        examples: a batch (mapping of column name to list) with "id", "question" and "context"
            columns. The batch is not modified.

    Returns:
        The tokenizer output for all features, with "overflow_to_sample_mapping" removed,
        an added "example_id" list, and "offset_mapping" entries set to None for every token
        that is not part of the context.
    """
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lot of space), so we strip it.
    # A local list is used so the caller's batch is left untouched.
    questions = [q.lstrip() for q in examples["question"]]
    contexts = examples["context"]

    # Sequence id of the context tokens; it is the same for every feature, so compute it once.
    context_index = 1 if pad_on_right else 0

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        questions if pad_on_right else contexts,
        contexts if pad_on_right else questions,
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [None]:
validation_features = datasets[target_dataset].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets[target_dataset].column_names
)

  0%|          | 0/9 [00:00<?, ?ba/s]

Now we can grab the predictions for all features by using the `Trainer.predict` method:

In [None]:
raw_predictions = trainer.predict(validation_features)

The following columns in the test set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 8026
  Batch size = 16


In [None]:
# Make every column of `validation_features` visible again while keeping the current format type.
# Presumably needed because `trainer.predict` hides the columns the model does not use (the log
# above reports `offset_mapping` and `example_id` as ignored), and post-processing needs them.
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [None]:
max_answer_length = 30

In [None]:
# Use the logits that `trainer.predict` computed on `validation_features`, so that feature 0 here is
# the same feature whose offsets and context are read below. The earlier `output` came from the
# trainer's eval dataloader (the validation split), which does not match `target_dataset` when that is
# set to "test" - its logits would be mapped onto the wrong text.
all_start_logits, all_end_logits = raw_predictions.predictions
start_logits = all_start_logits[0]
end_logits = all_end_logits[0]
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. For the more general case, we will need to match the
# example_id to an example index.
context = datasets[target_dataset][0]["context"]

# Gather the indices of the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
        # to part of the input_ids that are not in the context.
        if (
            start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or offset_mapping[start_index] is None
            or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        # Map the token span back to the corresponding substring of the context.
        start_char = offset_mapping[start_index][0]
        end_char = offset_mapping[end_index][1]
        valid_answers.append(
            {
                "score": start_logits[start_index] + end_logits[end_index],
                "text": context[start_char: end_char]
            }
        )

# Keep the `n_best_size` highest-scoring candidates, best first, and display them.
valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers

[{'score': 12.832811, 'text': 'صورت وحی از سوی او توسط جبرئیل'},
 {'score': 12.376301,
  'text': 'صورت وحی از سوی او توسط جبرئیل بر پیامبر اسلام، محمد'},
 {'score': 11.846052,
  'text': 'صورت وحی از سوی او توسط جبرئیل بر پیامبر اسلام، محمد بن'},
 {'score': 10.983219, 'text': 'صورت وحی از سوی او توسط جبرئیل بر'},
 {'score': 10.935022, 'text': 'سوی او توسط جبرئیل'},
 {'score': 10.478512, 'text': 'سوی او توسط جبرئیل بر پیامبر اسلام، محمد'},
 {'score': 9.948263, 'text': 'سوی او توسط جبرئیل بر پیامبر اسلام، محمد بن'},
 {'score': 9.770666, 'text': '، محمد'},
 {'score': 9.2404175, 'text': '، محمد بن'},
 {'score': 9.08543, 'text': 'سوی او توسط جبرئیل بر'},
 {'score': 8.245945, 'text': 'به صورت وحی از سوی او توسط جبرئیل'},
 {'score': 7.7894344,
  'text': 'به صورت وحی از سوی او توسط جبرئیل بر پیامبر اسلام، محمد'},
 {'score': 7.315167,
  'text': 'صورت وحی از سوی او توسط جبرئیل بر پیامبر اسلام، محمد بن عبدالله، نازل شده'},
 {'score': 7.259186,
  'text': 'به صورت وحی از سوی او توسط جبرئ

We can compare to the actual ground-truth answer:

In [None]:
datasets[target_dataset][0]["answers"]

{'text': ['قرآن'], 'answer_start': [0]}

In [None]:
import collections

# Short aliases for the split under evaluation and its tokenized features.
# Note: `features` was bound earlier to the output of `prepare_train_features`; it is rebound here.
examples = datasets[target_dataset]
features = validation_features

# Map each example id to its row index in `examples`.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
# For each example index, collect the indices of all features generated from that example
# (an example with a long context yields more than one feature).
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

In [None]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Turn raw start/end logits into one predicted answer text per example.

    Relies on the notebook-level names `tokenizer` (for `cls_token_id`) and
    `squad_v2` (whether an empty "no answer" prediction is allowed).

    Args:
        examples: The original examples. `examples["id"]` must yield all ids and
            iterating must yield items exposing "id" and "context".
        features: The tokenized features. Each item must expose "example_id",
            "offset_mapping" and "input_ids"; offset entries may be None for
            positions that are not part of the context.
        raw_predictions: A pair `(all_start_logits, all_end_logits)`, both indexed
            by feature position.
        n_best_size: Number of top start logits and top end logits that are
            combined into candidate spans for each feature.
        max_answer_length: Maximum length, in tokens, of a candidate span.

    Returns:
        A `collections.OrderedDict` mapping each example id to its predicted
        answer text. When `squad_v2` is true the text is "" unless the best span
        scores strictly above the example's null score.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Build a map from each example position to the indices of its features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionary we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None  # Only used if squad_v2 is True.
        valid_answers = []

        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # Fetch the feature once; indexing `features` may be costly and both
            # fields below come from the same item.
            current_feature = features[feature_index]
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what allows us to map positions in the logits back to spans of
            # text in the original context.
            offset_mapping = current_feature["offset_mapping"]

            # Update the minimum null prediction. The minimum (not the maximum) is
            # kept: an example split over several features should only be judged
            # unanswerable when *every* feature favours the null answer, so the
            # feature that is least confident in "no answer" sets the threshold.
            cls_index = current_feature["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of
                    # bounds or correspond to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # `max` returns the first highest-scoring candidate, i.e. the same element a
            # stable descending sort would put first, without sorting the whole list.
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a
            # fake prediction to avoid failure.
            best_answer = {"text": "", "score": 0.0}

        # Let's pick our final answer: the best one or the null answer (only for squad_v2).
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            # `min_null_score` stays None when the example has no features at all; treat
            # that as "no answer" instead of comparing a number against None.
            beats_null = min_null_score is not None and best_answer["score"] > min_null_score
            predictions[example["id"]] = best_answer["text"] if beats_null else ""

    return predictions

In [None]:
# Convert the raw logits of the evaluated split into one answer text per example id.
final_predictions = postprocess_qa_predictions(
    examples=datasets[target_dataset],
    features=validation_features,
    raw_predictions=raw_predictions.predictions,
)

Post-processing 8002 example predictions split into 8026 features.


  0%|          | 0/8002 [00:00<?, ?it/s]

Then we can load the metric from the datasets library.

In [None]:
# Pick the metric variant that matches the `squad_v2` flag set in the model-selection cell.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version before re-running.
metric = load_metric("squad_v2" if squad_v2 else "squad")

  metric = load_metric("squad_v2" if squad_v2 else "squad")


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

## ||| RESULTS

In [None]:
# Shape the predictions for the metric: one record per example id. When `squad_v2` is
# set, every record additionally carries a constant `no_answer_probability` of 0.0.
formatted_predictions = []
for example_id, answer_text in final_predictions.items():
    record = {"id": example_id, "prediction_text": answer_text}
    if squad_v2:
        record["no_answer_probability"] = 0.0
    formatted_predictions.append(record)

# Reference answers taken from the same split the predictions were computed on.
references = [
    {"id": example["id"], "answers": example["answers"]}
    for example in datasets[target_dataset]
]
metric.compute(predictions=formatted_predictions, references=references)

{'exact': 65.59610097475631,
 'f1': 79.38310121315844,
 'total': 8002,
 'HasAns_exact': 59.11629434954008,
 'HasAns_f1': 77.23777528050128,
 'HasAns_total': 6088,
 'NoAns_exact': 86.20689655172414,
 'NoAns_f1': 86.20689655172414,
 'NoAns_total': 1914,
 'best_exact': 65.59610097475631,
 'best_exact_thresh': 0.0,
 'best_f1': 79.3831012131585,
 'best_f1_thresh': 0.0}

In [None]:
# Upload the trained model to the Hugging Face Hub through the Trainer.
trainer.push_to_hub()

Saving model checkpoint to distilbert-fa-zwnj-base-finetuned-2epoch-pquad
Configuration saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/config.json
Model weights saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/pytorch_model.bin
tokenizer config file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/tokenizer_config.json
Special tokens file saved in distilbert-fa-zwnj-base-finetuned-2epoch-pquad/special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Question Answering', 'type': 'question-answering'}, 'dataset': {'name': 'pquad', 'type': 'pquad', 'config': 'pquad', 'split': 'validation', 'args': 'pquad'}}
