<a href="https://colab.research.google.com/github/MLFlexer/nlp-course/blob/malthe/bert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Made with the help of this guide: https://huggingface.co/docs/transformers/tasks/question_answering


In [1]:
!pip install datasets transformers evaluate
!pip install accelerate -U



In [2]:
from datasets import load_dataset

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

In [3]:
dataset["train"][0]

{'question_text': 'Milloin Charles Fort syntyi?',
 'document_title': 'Charles Fort',
 'language': 'finnish',
 'annotations': {'answer_start': [18],
  'answer_text': ['6. elokuuta (joidenkin lähteiden mukaan 9.) 1874']},
 'document_plaintext': 'Charles Hoy Fort (6. elokuuta (joidenkin lähteiden mukaan 9.) 1874 – 3. toukokuuta 1932) oli yhdysvaltalainen kirjailija ja paranormaalien ilmiöiden tutkija.',
 'document_url': 'https://fi.wikipedia.org/wiki/Charles%20Fort'}

In [4]:
filtered_dataset = dataset.filter(lambda entry: entry["language"] in ["bengali"])
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 4779
    })
    validation: Dataset({
        features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url'],
        num_rows: 224
    })
})

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [6]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
train_dataset = filtered_dataset["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=filtered_dataset["train"].column_names,
)
len(filtered_dataset["train"]), len(train_dataset)

(4779, 5856)

In [8]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["document_url"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids

    offset_mapping = inputs["offset_mapping"]
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [9]:
validation_dataset = filtered_dataset["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=filtered_dataset["validation"].column_names,
)
len(filtered_dataset["validation"]), len(validation_dataset)

(224, 267)

# training


In [10]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [11]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-multilingual-uncased")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1, # TODO: skift til mere end 1 epochs
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.908588


TrainOutput(global_step=366, training_loss=1.459712127518784, metrics={'train_runtime': 455.264, 'train_samples_per_second': 12.863, 'train_steps_per_second': 0.804, 'total_flos': 1147615355584512.0, 'train_loss': 1.459712127518784, 'epoch': 1.0})

# Evaluation

In [13]:
small_eval_set = filtered_dataset["validation"].select(range(100))

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=filtered_dataset["validation"].column_names,
)

In [14]:
import torch
from transformers import AutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping", 'start_positions', 'end_positions'])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}


with torch.no_grad():
    outputs = model(**batch)

In [15]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [16]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [17]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["document_url"]
    context = example["document_plaintext"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "indexes": (offsets[start_index][0], offsets[end_index][1]),
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"], "indexes": best_answer["indexes"]})

In [18]:
theoretical_answers = [
    {"id": ex["document_url"], "answers": ex["annotations"]["answer_text"][0], "indexes": ex["annotations"]} for ex in small_eval_set
]

In [19]:
print(predicted_answers[1])
print(theoretical_answers[1])

{'id': 'https://bn.wikipedia.org/wiki/%E0%A6%AE%E0%A6%B9%E0%A6%BE%E0%A6%B6%E0%A7%8D%E0%A6%AC%E0%A7%87%E0%A6%A4%E0%A6%BE%20%E0%A6%A6%E0%A7%87%E0%A6%AC%E0%A7%80', 'prediction_text': 'মহাশ্বেতা দেবী ১০০ট', 'indexes': (0, 19)}
{'id': 'https://bn.wikipedia.org/wiki/%E0%A6%AE%E0%A6%B9%E0%A6%BE%E0%A6%B6%E0%A7%8D%E0%A6%AC%E0%A7%87%E0%A6%A4%E0%A6%BE%20%E0%A6%A6%E0%A7%87%E0%A6%AC%E0%A7%80', 'answers': 'ঝাঁসির রানি', 'indexes': {'answer_start': [209], 'answer_text': ['ঝাঁসির রানি']}}


In [20]:
from sklearn.metrics import f1_score

def calculate_f1_score(predictions, references):
    # Extract the 'answer' values from the dictionaries
    predicted_answers = [item['prediction_text'] for item in predictions]
    reference_answers = [item['answers'] for item in references]
    # print(predicted_answers)
    # print(reference_answers)
    # Calculate the F1 score
    f1 = f1_score(reference_answers, predicted_answers, average='micro')

    return f1

In [21]:
f1_score = calculate_f1_score(predicted_answers, theoretical_answers)
print(f"F1 Score: {f1_score}")

F1 Score: 0.42999999999999994


In [22]:
def calculate_exact_match(predictions, references):
    exact_match_count = 0

    for pred, ref in zip(predictions, references):
      #print("")
      #print(pred['prediction_text'])
      #print(ref['answers'])
      if pred['prediction_text'] == ref['answers']:
            exact_match_count += 1

    exact_match_score = exact_match_count / len(predictions)

    return exact_match_score

In [23]:
exact_match_score = calculate_exact_match(predicted_answers, theoretical_answers)
print(f"Exact Match Score: {exact_match_score}")

Exact Match Score: 0.43
