<a href="https://colab.research.google.com/github/MLFlexer/nlp-course/blob/malthe/bert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Made with the help of this guide: https://huggingface.co/docs/transformers/tasks/question_answering


In [1]:
!pip install datasets transformers evaluate
!pip install accelerate -U



In [2]:
from datasets import load_dataset

dataset = load_dataset("copenlu/answerable_tydiqa")

filtered_dataset = dataset.filter(lambda entry: entry["language"] in ["bengali"])

train_set = filtered_dataset["train"]
validation_set = filtered_dataset["validation"]

In [3]:
from datasets import Dataset, DatasetDict
train_set_df = train_set.to_pandas()
train_set_df['id'] = range(len(train_set_df))
validation_set_df = validation_set.to_pandas()
validation_set_df['id'] = range(len(validation_set_df))

train_set = Dataset.from_pandas(train_set_df)
validation_set = Dataset.from_pandas(validation_set_df)

In [4]:
train_set[2]

{'question_text': 'কত সালে সর্বভারতীয় তৃণমূল কংগ্রেসের প্রতিষ্ঠা হয় ?',
 'document_title': 'সর্বভারতীয় তৃণমূল কংগ্রেস',
 'language': 'bengali',
 'annotations': {'answer_start': [114], 'answer_text': ['১৯৯৮']},
 'document_plaintext': 'সর্বভারতীয় তৃণমূল কংগ্রেস (সংক্ষেপে তৃণমূল কংগ্রেস; পূর্বনাম পশ্চিমবঙ্গ তৃণমূল কংগ্রেস) ভারতের একটি রাজনৈতিক দল। ১৯৯৮ সালের ১ জানুয়ারি পশ্চিমবঙ্গে ভারতীয় জাতীয় কংগ্রেস ভেঙে এই দল প্রতিষ্ঠিত হল। তৃণমূল কংগ্রেসের প্রধান নেত্রী হলেন মমতা বন্দ্যোপাধ্যায়।',
 'document_url': 'https://bn.wikipedia.org/wiki/%E0%A6%B8%E0%A6%B0%E0%A7%8D%E0%A6%AC%E0%A6%AD%E0%A6%BE%E0%A6%B0%E0%A6%A4%E0%A7%80%E0%A6%AF%E0%A6%BC%20%E0%A6%A4%E0%A7%83%E0%A6%A3%E0%A6%AE%E0%A7%82%E0%A6%B2%20%E0%A6%95%E0%A6%82%E0%A6%97%E0%A7%8D%E0%A6%B0%E0%A7%87%E0%A6%B8',
 'id': 2}

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [6]:
max_length = 512
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [7]:
train_dataset = train_set.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_set.column_names,
)
len(train_set), len(train_dataset)

Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

(4779, 5197)

In [8]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["document_url"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids

    offset_mapping = inputs["offset_mapping"]
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [9]:
validation_dataset = validation_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)
len(validation_set), len(validation_dataset)

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

(224, 241)

# training


In [10]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [11]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-multilingual-uncased")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1, # TODO: skift til mere end 1 epochs
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.977184


TrainOutput(global_step=325, training_loss=1.6341092623197115, metrics={'train_runtime': 542.4948, 'train_samples_per_second': 9.58, 'train_steps_per_second': 0.599, 'total_flos': 1357959244756992.0, 'train_loss': 1.6341092623197115, 'epoch': 1.0})

# Evaluation

In [13]:
from datasets import load_metric
compute_squad = load_metric("squad_v2")

  compute_squad = load_metric("squad_v2")


In [36]:
import evaluate

metric = evaluate.load("squad_v2")

Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

In [14]:
small_eval_set = validation_set.select(range(100))

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)

eval_tokens = small_eval_set.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)
eval_tokens

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 114
})

In [15]:
import torch
from transformers import AutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping", 'start_positions', 'end_positions'])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}


with torch.no_grad():
    outputs = model(**batch)

In [16]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [42]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    print(examples[0])
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["document_plaintext"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, 'answers': [{'text': best_answer["text"]}]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})
#[{'text': ['reference_answer1']}]
    theoretical_answers = [{"id": ex["id"], 'answers': [{'text': ex["annotations"]["answer_text"]}]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [43]:
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

{'question_text': 'পশ্চিম ভারতের মহারাষ্ট্র রাজ্যের মুম্বাই শহরে নির্মিত গেটওয়ে অব ইন্ডিয়া স্থাপত্যটির ভিত্তিপ্রস্তর স্থাপন করেন কে ?', 'document_title': 'গেটওয়ে অব ইন্ডিয়া', 'language': 'bengali', 'annotations': {'answer_start': [274], 'answer_text': ['স্যার জর্জ সিডেনহাম ক্লার্ক']}, 'document_plaintext': 'দিল্লী দর্বার তৈরীর পূর্বে, গেটওয়ে অব ইন্ডিয়া ১৯১১ সালে  কিং জর্জ ফাইভ এবং কুনি মেরি মুম্বাই আগমনের স্মৃতি রক্ষার্থে নির্মাণ করা হয়েছিল।  কিন্তু তারা এই স্থাপত্যটির শুধু কার্ডবোর্ড মডেল দেখে যেতে পেরেছিলেন, কেননা নির্মাণকাজ ১৯১৫ সালের পরে শুরু হয়েছিল।[11] বোম্বের সরকার স্যার জর্জ সিডেনহাম ক্লার্ক ১৯১১ সালের ৩১শে মার্চ স্থাপত্যটির ভিত্তিপ্রস্তর স্থাপন করেন। চুড়ান্ত নকশা ১৯১৩ সালের ৩১শে মার্চ অনুমোদিত হয়। গেটওয়েটি হলুদ ব্যাসল্ট এবং কংক্রিট দিয়ে বানানো হয়েছিল।[12]  ফাউন্ডেশনের কাজ ১৯২০ সালে এবং পুরো কাজ শেহ হয়েছিল ১৯২৪ সালে। [13] ভাইসরয় দি আর্ল অব রিডিং ১৯২৪ সালের ৪ই ডিসেম্বর গেটওয়েটি উন্মুক্ত করেন।[6]', 'document_url': 'https://bn.wikipedia.org/wiki/%E0%A6%97%E0%A7%87%

  0%|          | 0/100 [00:00<?, ?it/s]

ValueError: ignored

In [17]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["id"]].append(idx)

KeyError: ignored

In [None]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["document_plaintext"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "indexes": (offsets[start_index][0], offsets[end_index][1]),
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"], "indexes": best_answer["indexes"]})

In [None]:
theoretical_answers = [
    {"id": ex["document_url"], "answers": ex["annotations"]["answer_text"][0], "indexes": ex["annotations"]} for ex in small_eval_set
]

In [None]:
print(predicted_answers[1])
print(theoretical_answers[1])

In [None]:
from sklearn.metrics import f1_score

def calculate_f1_score(predictions, references):
    # Extract the 'answer' values from the dictionaries
    predicted_answers = [item['prediction_text'] for item in predictions]
    reference_answers = [item['answers'] for item in references]
    # print(predicted_answers)
    # print(reference_answers)
    # Calculate the F1 score
    f1 = f1_score(reference_answers, predicted_answers, average='micro')

    return f1

In [None]:
f1_score = calculate_f1_score(predicted_answers, theoretical_answers)
print(f"F1 Score: {f1_score}")

In [None]:
def calculate_exact_match(predictions, references):
    exact_match_count = 0

    for pred, ref in zip(predictions, references):
      #print("")
      #print(pred['prediction_text'])
      #print(ref['answers'])
      if pred['prediction_text'] == ref['answers']:
            exact_match_count += 1

    exact_match_score = exact_match_count / len(predictions)

    return exact_match_score

In [None]:
exact_match_score = calculate_exact_match(predicted_answers, theoretical_answers)
print(f"Exact Match Score: {exact_match_score}")