<a href="https://colab.research.google.com/github/MLFlexer/nlp-course/blob/malthe/w4_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Made with the help of this guide: https://huggingface.co/docs/transformers/tasks/question_answering


In [49]:
!pip install datasets transformers evaluate



In [50]:
from datasets import load_dataset

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

In [51]:
dataset["train"][0]

{'question_text': 'Milloin Charles Fort syntyi?',
 'document_title': 'Charles Fort',
 'language': 'finnish',
 'annotations': {'answer_start': [18],
  'answer_text': ['6. elokuuta (joidenkin lähteiden mukaan 9.) 1874']},
 'document_plaintext': 'Charles Hoy Fort (6. elokuuta (joidenkin lähteiden mukaan 9.) 1874 – 3. toukokuuta 1932) oli yhdysvaltalainen kirjailija ja paranormaalien ilmiöiden tutkija.',
 'document_url': 'https://fi.wikipedia.org/wiki/Charles%20Fort'}

In [79]:
filtered_dataset = dataset.filter(lambda entry: entry["language"] in ["english"])
filtered_dataset = filtered_dataset.head(500)

AttributeError: ignored

In [53]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [54]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [55]:
tokenized_data = filtered_dataset.map(preprocess_function, batched=True, remove_columns=filtered_dataset["train"].column_names)

Map:   0%|          | 0/990 [00:00<?, ? examples/s]

In [56]:
tokenized_data['train'][0]

{'input_ids': [101,
  10704,
  10140,
  32950,
  12270,
  14676,
  14906,
  136,
  102,
  32950,
  12270,
  14676,
  71070,
  12167,
  10171,
  10103,
  13498,
  10108,
  51900,
  10380,
  50418,
  11183,
  52898,
  117,
  10146,
  10103,
  51900,
  10380,
  50418,
  11183,
  12270,
  10140,
  10103,
  10902,
  11197,
  20429,
  12270,
  10146,
  10108,
  10103,
  43416,
  119,
  138,
  129,
  140,
  131,
  122,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

In [57]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [58]:
from transformers import create_optimizer

batch_size = 16

num_epochs = 2

total_train_steps = (len(tokenized_data["train"]) // batch_size) * num_epochs

optimizer, schedule = create_optimizer(

    init_lr=2e-5,

    num_warmup_steps=0,

    num_train_steps=total_train_steps,

)

In [60]:
from transformers import AutoConfig, TFAutoModelForQuestionAnswering

config = AutoConfig.from_pretrained("bert-base-multilingual-uncased")
model = TFAutoModelForQuestionAnswering.from_config(config)

In [61]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["validation"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [62]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [64]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=1) # TODO: change number of epochs



<keras.src.callbacks.History at 0x790331b15720>

In [73]:
filtered_dataset["validation"][0]

{'question_text': 'What is a way to increase your wound healing speed?',
 'document_title': 'Wound healing',
 'language': 'english',
 'annotations': {'answer_start': [51],
  'answer_text': ['cleaning and protection from reinjury or infection']},
 'document_plaintext': "Wound care encourages and speeds wound healing via cleaning and protection from reinjury or infection. Depending on each patient's needs, it can range from the simplest first aid to entire nursing specialties such as wound, ostomy, and continence nursing and burn center care.",
 'document_url': 'https://en.wikipedia.org/wiki/Wound%20healing'}

In [75]:
inputs_validation = tokenizer(filtered_dataset["validation"]["question_text"],
                              filtered_dataset["validation"]["document_plaintext"],
                              max_length=384,
                              truncation="only_second",
                              padding="max_length",
                              return_tensors="tf")

In [77]:
outputs = model(**inputs_validation)
start_scores = outputs.start_logits
end_scores = outputs.end_logits
print(start_scores)
print(end_scores)

ResourceExhaustedError: ignored

In [None]:
predicted_answers = []
predicted_answer_indexes = []

for context, question in zip(validation_data["document_plaintext"], validation_data["question_text"]):
    inputs = tokenizer(question, context, return_tensors="tf", padding=True, truncation=True)
    # Model expects input_ids and attention_mask
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Make predictions
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    start_index = tf.argmax(start_scores, axis=1).numpy()[0]
    end_index = tf.argmax(end_scores, axis=1).numpy()[0]
    #predicted_answer = tokenizer.decode(input_ids[0][start_index:end_index + 1])
    #predicted_answers.append(predicted_answer)

In [None]:
input_ids