<a href="https://colab.research.google.com/github/MLFlexer/nlp-course/blob/malthe/bert_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Made with the help of this guide: https://huggingface.co/docs/transformers/tasks/question_answering and lab 6

In [1]:
!pip install update transformers
!pip install datasets
!pip install evaluate



In [2]:
from datasets import load_dataset
from datasets import load_metric
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import AutoConfig
from functools import partial
import torch
import random
import numpy as np
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import LambdaLR
from torch import nn
from collections import defaultdict, OrderedDict
# MODEL_NAME = 'xlm-roberta-base'
MODEL_NAME = 'bert-base-multilingual-uncased'

In [3]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [4]:
from datasets import load_dataset

dataset = load_dataset("copenlu/answerable_tydiqa")

filtered_dataset = dataset.filter(lambda entry: entry["language"] in ["indonesian"])#, "arabic", "bengali"])
#filtered_dataset = filtered_dataset.filter(lambda entry: entry["annotations"]["answer_start"][0] == -1)

train_set = filtered_dataset["train"]
validation_set = filtered_dataset["validation"]

In [None]:
# dataset["train"]["language"]

In [5]:
from datasets import Dataset, DatasetDict
train_set_df = train_set.to_pandas()
train_set_df['id'] = range(len(train_set_df))
validation_set_df = validation_set.to_pandas()
validation_set_df['id'] = range(len(validation_set_df))

train_set = Dataset.from_pandas(train_set_df)
validation_set = Dataset.from_pandas(validation_set_df)

In [28]:
from datasets import Dataset, DatasetDict
def get_datasets(languages, dataset):
  filtered_dataset = dataset.filter(lambda entry: entry["language"] in languages)

  train_set = filtered_dataset["train"]
  validation_set = filtered_dataset["validation"]
  train_set_df = train_set.to_pandas()
  train_set_df['id'] = range(len(train_set_df))
  validation_set_df = validation_set.to_pandas()
  validation_set_df['id'] = range(len(validation_set_df))

  train_set = Dataset.from_pandas(train_set_df)
  validation_set = Dataset.from_pandas(validation_set_df)
  return train_set, validation_set

In [6]:
print(len(validation_set))
train_set[2]

1191


{'question_text': 'Kapan PBB mulai terbentuk ?',
 'document_title': 'Perserikatan Bangsa-Bangsa',
 'language': 'indonesian',
 'annotations': {'answer_start': [360], 'answer_text': ['24 Oktober 1945']},
 'document_plaintext': 'Sebagai tindak lanjut Atlantic Charter tersebut, pada tanggal 25 April 1945, Konferensi PBB tentang Organisasi Internasional diadakan di San Francisco, dengan dihadiri oleh 50 pemerintah negara, dan sejumlah organisasi non-pemerintah yang terlibat dalam penyusunan Piagam Perserikatan Bangsa-Bangsa (Declaration of the United Nations). PBB resmi dibentuk pada 24 Oktober 1945 atas ratifikasi Piagam oleh lima anggota tetap Dewan Keamanan -Perancis, Republik Tiongkok, Uni Soviet, Inggris dan Amerika Serikat- dan mayoritas dari 46 negara anggota lainnya.',
 'document_url': 'https://id.wikipedia.org/wiki/Perserikatan%20Bangsa-Bangsa',
 'id': 2}

In [7]:
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
def get_train_features(tk, samples):
  '''
  Tokenizes all of the text in the given samples, splittling inputs that are too long for our model
  across multiple features. Finds the token offsets of the answers, which serve as the labels for
  our inputs.
  '''
  batch = tk.batch_encode_plus(
        [[q,c] for q,c in zip(samples['question_text'], samples['document_plaintext'])],
        padding='max_length',
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

  # Get a list which maps the input features index to their original index in the
  # samples list (for split inputs). E.g. if our batch size is 4 and the second sample
  # is split into 3 inputs because it is very large, sample_mapping would look like
  # [0, 1, 1, 1, 2, 3]
  sample_mapping = batch.pop('overflow_to_sample_mapping')
  # Get all of the character offsets for each token
  offset_mapping = batch.pop('offset_mapping')

  # Store the start and end tokens
  batch['start_tokens'] = []
  batch['end_tokens'] = []

  # Iterate through all of the offsets
  for i, offsets in enumerate(offset_mapping):
    # Get the right sample by mapping it to its original index
    sample_idx = sample_mapping[i]
    # Get the sequence IDs to know where context starts so we can ignore question tokens
    sequence_ids = batch.sequence_ids(i)

    # Get the start and end character positions of the answer
    ans = samples['annotations'][sample_idx]
    start_char = ans['answer_start'][0]
    end_char = start_char + len(ans['answer_text'][0])
    # while end_char > 0 and (end_char >= len(samples['context'][sample_idx]) or samples['context'][sample_idx][end_char] == ' '):
    #   end_char -= 1

    # Start from the first token in the context, which can be found by going to the
    # first token where sequence_ids is 1
    start_token = 0
    while sequence_ids[start_token] != 1:
      start_token += 1

    end_token = len(offsets) - 1
    while sequence_ids[end_token] != 1:
      end_token -= 1

    # By default set it to the CLS token if the answer isn't in this input
    if start_char < offsets[start_token][0] or end_char > offsets[end_token][1]:
      start_token = 0
      end_token = 0
    # Otherwise find the correct token indices
    else:
      # Advance the start token index until we have passed the start character index
      while start_token < len(offsets) and offsets[start_token][0] <= start_char:
        start_token += 1
      start_token -= 1

      # Decrease the end token index until we have passed the end character index
      while end_token >= 0 and offsets[end_token][1] >= end_char:
        end_token -= 1
      end_token += 1

    batch['start_tokens'].append(start_token)
    batch['end_tokens'].append(end_token)

  #batch['start_tokens'] = np.array(batch['start_tokens'])
  #batch['end_tokens'] = np.array(batch['end_tokens'])

  return batch

def collate_fn(inputs):
  '''
  Defines how to combine different samples in a batch
  '''
  input_ids = torch.tensor([i['input_ids'] for i in inputs])
  attention_mask = torch.tensor([i['attention_mask'] for i in inputs])
  start_tokens = torch.tensor([i['start_tokens'] for i in inputs])
  end_tokens = torch.tensor([i['end_tokens'] for i in inputs])

  # Truncate to max length
  max_len = max(attention_mask.sum(-1))
  input_ids = input_ids[:,:max_len]
  attention_mask = attention_mask[:,:max_len]

  return {'input_ids': input_ids, 'attention_mask': attention_mask, 'start_tokens': start_tokens, 'end_tokens': end_tokens}

In [9]:
tokenized_dataset = train_set.map(partial(get_train_features, tk), batched=True, remove_columns=train_set.column_names)

Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_tokens', 'end_tokens'],
    num_rows: 11594
})

In [11]:
samples = random.sample(list(range(len(tokenized_dataset))), 1000) # TODO: magic number?
tokenized_dataset = tokenized_dataset.select(samples)
train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [12]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    schedule: LambdaLR,
    n_epochs: int,
    device: torch.device
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = {b: batch[b].to(device) for b in batch}

      # Pass the inputs through the model, get the current loss and logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask'],
          start_positions=batch['start_tokens'],
          end_positions=batch['end_tokens']
      )
      loss = outputs['loss']
      losses.append(loss.item())
      loss_epoch.append(loss.item())

      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model and advance the LR schedule
      optimizer.step()
      scheduler.step()
      #gc.collect()
  return losses

In [13]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Create the optimizer
lr=2e-5
n_epochs = 2
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl)
)



In [15]:
losses = train(
    model,
    train_dl,
    optimizer,
    scheduler,
    n_epochs,
    device
)

100%|██████████| 250/250 [00:52<00:00,  4.73it/s]
100%|██████████| 250/250 [00:52<00:00,  4.77it/s]


In [16]:
def get_validation_features(tk, samples):
  # First, tokenize the text. We get the offsets and return overflowing sequences in
  # order to break up long sequences into multiple inputs. The offsets will help us
  # determine the original answer text
  batch = tk.batch_encode_plus(
        [[q,c] for q,c in zip(samples['question_text'], samples['document_plaintext'])],
        padding='max_length',
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

  # We'll store the ID of the samples to calculate squad score
  batch['example_id'] = []
  # The overflow sample map tells us which input each sample corresponds to
  sample_map = batch.pop('overflow_to_sample_mapping')

  for i in range(len(batch['input_ids'])):
    # The sample index tells us which of the values in "samples" these features belong to
    sample_idx = sample_map[i]
    sequence_ids = batch.sequence_ids(i)

    # Add the ID to map these features back to the correct sample
    batch['example_id'].append(samples['id'][sample_idx])

    #Set offsets for non-context words to be None for ease of processing
    batch['offset_mapping'][i] = [o if sequence_ids[k] == 1 else None for k,o in enumerate(batch['offset_mapping'][i])]

  return batch

def val_collate_fn(inputs):
  input_ids = torch.tensor([i['input_ids'] for i in inputs])
  attention_mask = torch.tensor([i['attention_mask'] for i in inputs])

  # Truncate to max length
  max_len = max(attention_mask.sum(-1))
  input_ids = input_ids[:,:max_len]
  attention_mask = attention_mask[:,:max_len]

  return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [17]:
validation_dataset = validation_set.map(partial(get_validation_features, tk), batched=True, remove_columns=validation_set.column_names)

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

In [18]:
def predict(model: nn.Module, valid_dl: DataLoader):
  """
  Evaluates the model on the given dataset
  :param model: The model under evaluation
  :param valid_dl: A `DataLoader` reading validation data
  :return: The accuracy of the model on the dataset
  """
  # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like
  # layer normalization and dropout
  model.eval()
  start_logits_all = []
  end_logits_all = []

  # ALSO IMPORTANT: Don't accumulate gradients during this process
  with torch.no_grad():
    for batch in tqdm(valid_dl, desc='Evaluation'):
      batch = {b: batch[b].to(device) for b in batch}

      # Pass the inputs through the model, get the current loss and logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask']
      )
      # Store the "start" class logits and "end" class logits for every token in the input
      start_logits_all.extend(list(outputs['start_logits'].detach().cpu().numpy()))
      end_logits_all.extend(list(outputs['end_logits'].detach().cpu().numpy()))


    return start_logits_all,end_logits_all

def post_process_predictions(examples, dataset, logits, tokenizer, num_possible_answers = 20, max_answer_length = 30):
  all_start_logits, all_end_logits = logits
  # Build a map from example to its corresponding features. This will allow us to index from
  # sample ID to all of the features for that sample (in case they were split up due to long input)
  example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
  features_per_example = defaultdict(list)
  for i, feature in enumerate(dataset):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

  # Create somewhere to store our predictions
  predictions = OrderedDict()

  # Iterate through each sample in the dataset
  for j, sample in enumerate(tqdm(examples)):

    # Get the feature indices (all of the features split across the batch)
    feature_indices = features_per_example[j]
    # Get the original context which predumably has the answer text
    context = sample['document_plaintext']

    preds = []

    min_score_threshold = None

    # Iterate through all of the features
    for ft_idx in feature_indices:

      # Get the start and end answer logits for this input
      start_logits = all_start_logits[ft_idx]
      end_logits = all_end_logits[ft_idx]

      # Get the offsets to map token indices to character indices
      offset_mapping = dataset[ft_idx]['offset_mapping']


      # Update minimum null prediction.
      cls_index = dataset[ft_idx]["input_ids"].index(tokenizer.cls_token_id)
      feature_min_score_threshold = start_logits[cls_index] + end_logits[cls_index]
      if min_score_threshold is None or min_score_threshold < feature_min_score_threshold:
          min_score_threshold = feature_min_score_threshold

      # Sort the logits and take the top N
      start_indices = np.argsort(start_logits)[::-1][:num_possible_answers]
      end_indices = np.argsort(end_logits)[::-1][:num_possible_answers]

      # Iterate through start and end indices
      for start_index in start_indices:
        for end_index in end_indices:

          # Ignore this combination if either the indices are not in the context
          if start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or offset_mapping[start_index] is None or offset_mapping[end_index] is None:
            continue

          # Also ignore if the start index is greater than the end index of the number of tokens
          # is greater than some specified threshold
          if start_index > end_index or end_index - start_index + 1 > max_answer_length:
            continue

          ans_text = context[offset_mapping[start_index][0]:offset_mapping[end_index][1]]
          preds.append({
              'score': start_logits[start_index] + end_logits[end_index],
              'text': ans_text
          })

    if len(preds) > 0:
      # Sort by score to get the top answer
      best_answer = sorted(preds, key=lambda x: x['score'], reverse=True)[0]
    else:
      best_answer = {'score': 0.0, 'text': ""}

    # if the best answer is below the threshold for lowest score, give it the empty string

    answer = best_answer["text"] if best_answer["score"] > min_score_threshold else ""
    predictions[sample['id']] = answer
  return predictions

In [19]:
val_dl = DataLoader(validation_dataset, collate_fn=val_collate_fn, batch_size=32)
logits = predict(model, val_dl)

Evaluation: 100%|██████████| 38/38 [00:31<00:00,  1.22it/s]


In [20]:
validation_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 1210
})

In [21]:
predictions = post_process_predictions(validation_set, validation_dataset, logits, tk)
formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
gold = [{'id': example['id'], 'answers': example['annotations']["answer_text"][0]} for example in validation_set]

100%|██████████| 1191/1191 [00:04<00:00, 266.60it/s]


In [22]:
formatted_predictions

[{'id': 0, 'prediction_text': ''},
 {'id': 1, 'prediction_text': '4275,08km²'},
 {'id': 2, 'prediction_text': 'Frederick Winslow Taylor'},
 {'id': 3, 'prediction_text': '1815'},
 {'id': 4, 'prediction_text': '179,7 juta km²'},
 {'id': 5,
  'prediction_text': 'seniman-penghibur (entertainer) tradisional Jepang'},
 {'id': 6, 'prediction_text': ''},
 {'id': 7, 'prediction_text': 'Detroit, Michigan, Amerika Serikat'},
 {'id': 8, 'prediction_text': '16,1 km²'},
 {'id': 9, 'prediction_text': 'Hiroshi Takahashi'},
 {'id': 10, 'prediction_text': ''},
 {'id': 11, 'prediction_text': ''},
 {'id': 12, 'prediction_text': 'monarki konstitusional'},
 {'id': 13, 'prediction_text': '24 April 1934'},
 {'id': 14, 'prediction_text': 'Kim Stanley Robinson'},
 {'id': 15, 'prediction_text': ''},
 {'id': 16, 'prediction_text': '906,500km2'},
 {'id': 17, 'prediction_text': ''},
 {'id': 18, 'prediction_text': 'penulis dan produser Amerika Serikat'},
 {'id': 19,
  'prediction_text': 'istilah yang pertama kali di

In [23]:
""" Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    if len(prediction_tokens) == 0 and len(ground_truth_tokens) == 0:
      return 1
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    #print(prediction)
    #print(ground_truth)
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    #print(metric_fn)
    #print(prediction)
    #print(ground_truths)
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    #if len(scores_for_ground_truths) == 0:
    #  if len(prediction) == 0:
    #    return 1 # FIX: skal ændres, så at hvis der ikke er noget svar og prediction også siger det, så skal den have god score
     # else:
     #   return 0
    return max(scores_for_ground_truths)


def evaluate_squad(dataset, predictions):
    f1 = exact_match = total = 0
    for article in dataset:
        print(article)
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                #print(qa)
                #print(predictions[qa['id']])
                total += 1
                if qa['id'] not in predictions:
                    message = 'Unanswered question ' + qa['id'] + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x['text'], qa['answers']))
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

def compute_squad(predictions, references):
  pred_dict = {prediction["id"]: prediction["prediction_text"] for prediction in predictions}
  dataset = [
      {
          "paragraphs": [
              {
                  "qas": [
                      {
                          "answers": [{"text": ref["answers"]} ],
                          "id": ref["id"],
                      }
                      for ref in references
                  ]
              }
          ]
      }
  ]
  score = evaluate_squad(dataset=dataset, predictions=pred_dict)
  return score

In [24]:
"abc"[-1]

'c'

In [24]:
print(formatted_predictions)
print(len(gold))
print(len(validation_set))

[{'id': 0, 'prediction_text': ''}, {'id': 1, 'prediction_text': '4275,08km²'}, {'id': 2, 'prediction_text': 'Frederick Winslow Taylor'}, {'id': 3, 'prediction_text': '1815'}, {'id': 4, 'prediction_text': '179,7 juta km²'}, {'id': 5, 'prediction_text': 'seniman-penghibur (entertainer) tradisional Jepang'}, {'id': 6, 'prediction_text': ''}, {'id': 7, 'prediction_text': 'Detroit, Michigan, Amerika Serikat'}, {'id': 8, 'prediction_text': '16,1 km²'}, {'id': 9, 'prediction_text': 'Hiroshi Takahashi'}, {'id': 10, 'prediction_text': ''}, {'id': 11, 'prediction_text': ''}, {'id': 12, 'prediction_text': 'monarki konstitusional'}, {'id': 13, 'prediction_text': '24 April 1934'}, {'id': 14, 'prediction_text': 'Kim Stanley Robinson'}, {'id': 15, 'prediction_text': ''}, {'id': 16, 'prediction_text': '906,500km2'}, {'id': 17, 'prediction_text': ''}, {'id': 18, 'prediction_text': 'penulis dan produser Amerika Serikat'}, {'id': 19, 'prediction_text': 'istilah yang pertama kali diciptakan oleh tokoh yan

In [25]:
compute_squad(references=gold, predictions=formatted_predictions)

{'paragraphs': [{'qas': [{'answers': [{'text': 'orang-orang Viking'}], 'id': 0}, {'answers': [{'text': 'provinsi Sulawesi Tengah, Indonesia'}], 'id': 1}, {'answers': [{'text': 'Frederick Winslow Taylor'}], 'id': 2}, {'answers': [{'text': '15 Desember 1861'}], 'id': 3}, {'answers': [{'text': '179,7 juta km²'}], 'id': 4}, {'answers': [{'text': 'seniman-penghibur (entertainer) tradisional Jepang'}], 'id': 5}, {'answers': [{'text': '2000-an'}], 'id': 6}, {'answers': [{'text': 'Detroit, Michigan, Amerika Serikat'}], 'id': 7}, {'answers': [{'text': '32,58 km²'}], 'id': 8}, {'answers': [{'text': 'Hiroshi Takahashi'}], 'id': 9}, {'answers': [{'text': 'Sena'}], 'id': 10}, {'answers': [{'text': 'Tadashi Maeda, Tomegoro Yoshizumi, S. Nishijima, S. Miyoshi, Mohammad Hatta, Soekarno, dan Achmad Soebardjo'}], 'id': 11}, {'answers': [{'text': 'monarki konstitusional yang di dalamnya terdapat kuasa dari seorang Kaisar yang masih dibatasi dan hanya diturunkan terutama ketika melakukan tugas resmi'}], '

{'exact_match': 58.186397984886646, 'f1': 62.16008866513618}

# Now for each language:

In [None]:
def process_language(train_set, validation_set, MODEL_NAME):
  tk = AutoTokenizer.from_pretrained(MODEL_NAME)
  tokenized_dataset = train_set.map(partial(get_train_features, tk), batched=True, remove_columns=train_set.column_names)

  samples = random.sample(list(range(len(tokenized_dataset))), 4000) # TODO: magic number?
  tokenized_dataset = tokenized_dataset.select(samples)
  train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

  model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

  # Create the optimizer
  lr=2e-5
  n_epochs = 3
  weight_decay = 0.01
  warmup_steps = 200

  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
      {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': weight_decay},
      {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]
  # optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
  # scheduler = None
  optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      warmup_steps,
      n_epochs * len(train_dl)
  )

  losses = train(
      model,
      train_dl,
      optimizer,
      scheduler,
      n_epochs,
      device
  )

  validation_dataset = validation_set.map(partial(get_validation_features, tk), batched=True, remove_columns=validation_set.column_names)

  val_dl = DataLoader(validation_dataset, collate_fn=val_collate_fn, batch_size=32)
  logits = predict(model, val_dl)

  predictions = post_process_predictions(validation_set, validation_dataset, logits)
  formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
  gold = [{'id': example['id'], 'answers': example['annotations']["answer_text"][0]} for example in validation_set]

  return compute_squad(references=gold, predictions=formatted_predictions)

In [29]:
arab_train, arab_val = get_datasets(["arabic"], dataset)
ben_train, ben_val = get_datasets(["bengali"], dataset)
indo_train, indo_val = get_datasets(["indonesian"], dataset)

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [None]:
arab_evaluation = process_language(arab_train, arab_val, MODEL_NAME)
print(arab_evaluation)

Map:   0%|          | 0/29598 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 89%|████████▉ | 892/1000 [04:06<00:29,  3.62it/s]


KeyboardInterrupt: ignored

In [None]:
tk_arab = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenized_dataset_arab = arab_train.map(partial(get_train_features, tk_arab), batched=True, remove_columns=arab_train.column_names)

samples_arab = random.sample(list(range(len(tokenized_dataset_arab))), 4000) # TODO: magic number?
tokenized_dataset_arab = tokenized_dataset_arab.select(samples_arab)
train_dl_arab = DataLoader(tokenized_dataset_arab, collate_fn=collate_fn, shuffle=True, batch_size=4)

model_arab = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

# Create the optimizer
lr=2e-5
n_epochs = 3
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters_arab = [
    {'params': [p for n, p in model_arab.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model_arab.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer_arab = AdamW(optimizer_grouped_parameters_arab, lr=lr)
scheduler_arab = get_linear_schedule_with_warmup(
    optimizer_arab,
    warmup_steps,
    n_epochs * len(train_dl_arab)
)

losses_arab = train(
    model_arab,
    train_dl_arab,
    optimizer_arab,
    scheduler_arab,
    n_epochs,
    device
)

validation_dataset_arab = arab_val.map(partial(get_validation_features, tk), batched=True, remove_columns=arab_val.column_names)

val_dl_arab = DataLoader(validation_dataset_arab, collate_fn=val_collate_fn, batch_size=32)
logits_arab = predict(model_arab, val_dl_arab)

predictions_arab = post_process_predictions(arab_val, validation_dataset_arab, logits_arab)
formatted_predictions_arab = [{'id': k, 'prediction_text': v} for k,v in predictions_arab.items()]
gold_arab = [{'id': example['id'], 'answers': example['annotations']["answer_text"][0]} for example in arab_val]

compute_squad(references=gold_arab, predictions=formatted_predictions_arab)

Map:   0%|          | 0/29598 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [04:39<00:00,  3.58it/s]
100%|██████████| 1000/1000 [04:41<00:00,  3.55it/s]
100%|██████████| 1000/1000 [04:41<00:00,  3.55it/s]


Map:   0%|          | 0/1902 [00:00<?, ? examples/s]

Evaluation: 100%|██████████| 62/62 [01:00<00:00,  1.03it/s]
100%|██████████| 1902/1902 [00:04<00:00, 387.98it/s]


{'paragraphs': [{'qas': [{'answers': [{'text': 'بطولتين'}], 'id': 0}, {'answers': [{'text': 'الفرنسي (بيير كوري) وأخوه (جاك)'}], 'id': 1}, {'answers': [{'text': 'المَشِيمَة'}], 'id': 2}, {'answers': [{'text': 'مدينة جبيل اللبنانية'}], 'id': 3}, {'answers': [{'text': 'تبليسي'}], 'id': 4}, {'answers': [{'text': '110'}], 'id': 5}, {'answers': [{'text': 'توفى أثناء نومه'}], 'id': 6}, {'answers': [{'text': 'جيمس واط'}], 'id': 7}, {'answers': [{'text': '14 أيار / مايو 1948'}], 'id': 8}, {'answers': [{'text': 'البراق'}], 'id': 9}, {'answers': [{'text': 'أدولف هتلر'}], 'id': 10}, {'answers': [{'text': 'مصطلح عام يشمل جميع الترددات الممكنة من الإشعاعات الكهرومغناطيسية'}], 'id': 11}, {'answers': [{'text': 'جنوب إيطاليا'}], 'id': 12}, {'answers': [{'text': 'تسجيل وتبويب المدخلات والعمليات التي تمثل الأحداث الاقتصادية وفق نظام معين، المعلومات المالية المستخدمة بشكل أساسي من المدراء والمستثمرين والجهات الضريبية ومتخذي القرارات الآخرين'}], 'id': 13}, {'answers': [{'text': 'مدينة تكريت'}], 'id': 14},

{'exact_match': 3.101997896950578, 'f1': 4.00034342770803}

In [None]:
ben_evaluation = process_language(ben_train, ben_val, MODEL_NAME)
print(ben_evaluation)

Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

KeyboardInterrupt: ignored

In [None]:
tk_ben = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenized_dataset_ben = ben_train.map(partial(get_train_features, tk_ben), batched=True, remove_columns=ben_train.column_names)

samples_ben = random.sample(list(range(len(tokenized_dataset_ben))), 4000) # TODO: magic number?
tokenized_dataset_ben = tokenized_dataset_ben.select(samples_ben)
train_dl_ben = DataLoader(tokenized_dataset_ben, collate_fn=collate_fn, shuffle=True, batch_size=4)

model_ben = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

# Create the optimizer
lr=2e-5
n_epochs = 3
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters_ben = [
    {'params': [p for n, p in model_ben.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model_ben.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer_ben = AdamW(optimizer_grouped_parameters_ben, lr=lr)
scheduler_ben = get_linear_schedule_with_warmup(
    optimizer_ben,
    warmup_steps,
    n_epochs * len(train_dl_ben)
)

losses_ben = train(
    model_ben,
    train_dl_ben,
    optimizer_ben,
    scheduler_ben,
    n_epochs,
    device
)

validation_dataset_ben = ben_val.map(partial(get_validation_features, tk), batched=True, remove_columns=ben_val.column_names)

val_dl_ben = DataLoader(validation_dataset_ben, collate_fn=val_collate_fn, batch_size=32)
logits_ben = predict(model_ben, val_dl_ben)

predictions_ben = post_process_predictions(ben_val, validation_dataset_ben, logits_ben)
formatted_predictions_ben = [{'id': k, 'prediction_text': v} for k,v in predictions_ben.items()]
gold_ben = [{'id': example['id'], 'answers': example['annotations']["answer_text"][0]} for example in ben_val]

compute_squad(references=gold_ben, predictions=formatted_predictions_ben)

Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/1000 [00:00<08:18,  2.00it/s]


OutOfMemoryError: ignored

In [None]:
indo_evaluation = process_language(indo_train, indo_val, MODEL_NAME)
print(indo_evaluation)

Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/1000 [00:00<?, ?it/s]


NameError: ignored

In [32]:
tk_indo = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenized_dataset_indo = indo_train.map(partial(get_train_features, tk_indo), batched=True, remove_columns=indo_train.column_names)

samples_indo = random.sample(list(range(len(tokenized_dataset_indo))), 1000) # TODO: magic number?
tokenized_dataset_indo = tokenized_dataset_indo.select(samples_indo)
train_dl_indo = DataLoader(tokenized_dataset_indo, collate_fn=collate_fn, shuffle=True, batch_size=4)

model_indo = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

# Create the optimizer
lr=2e-5
n_epochs = 3
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters_indo = [
    {'params': [p for n, p in model_indo.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model_indo.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer_indo = AdamW(optimizer_grouped_parameters_indo, lr=lr)
scheduler_indo = get_linear_schedule_with_warmup(
    optimizer_indo,
    warmup_steps,
    n_epochs * len(train_dl_indo)
)

losses_indo = train(
    model_indo,
    train_dl_indo,
    optimizer_indo,
    scheduler_indo,
    n_epochs,
    device
)

validation_dataset_indo = indo_val.map(partial(get_validation_features, tk_indo), batched=True, remove_columns=indo_val.column_names)

val_dl_indo = DataLoader(validation_dataset_indo, collate_fn=val_collate_fn, batch_size=32)
logits_indo = predict(model_indo, val_dl_indo)

predictions_indo = post_process_predictions(indo_val, validation_dataset_indo, logits_indo, tk_indo)
formatted_predictions_indo = [{'id': k, 'prediction_text': v} for k,v in predictions_indo.items()]
gold_indo = [{'id': example['id'], 'answers': example['annotations']["answer_text"][0]} for example in indo_val]

compute_squad(references=gold_indo, predictions=formatted_predictions_indo)

Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 250/250 [00:53<00:00,  4.64it/s]
100%|██████████| 250/250 [00:52<00:00,  4.77it/s]
100%|██████████| 250/250 [00:53<00:00,  4.67it/s]


Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

Evaluation: 100%|██████████| 38/38 [00:30<00:00,  1.24it/s]
100%|██████████| 1191/1191 [00:05<00:00, 216.44it/s]


{'paragraphs': [{'qas': [{'answers': [{'text': 'orang-orang Viking'}], 'id': 0}, {'answers': [{'text': 'provinsi Sulawesi Tengah, Indonesia'}], 'id': 1}, {'answers': [{'text': 'Frederick Winslow Taylor'}], 'id': 2}, {'answers': [{'text': '15 Desember 1861'}], 'id': 3}, {'answers': [{'text': '179,7 juta km²'}], 'id': 4}, {'answers': [{'text': 'seniman-penghibur (entertainer) tradisional Jepang'}], 'id': 5}, {'answers': [{'text': '2000-an'}], 'id': 6}, {'answers': [{'text': 'Detroit, Michigan, Amerika Serikat'}], 'id': 7}, {'answers': [{'text': '32,58 km²'}], 'id': 8}, {'answers': [{'text': 'Hiroshi Takahashi'}], 'id': 9}, {'answers': [{'text': 'Sena'}], 'id': 10}, {'answers': [{'text': 'Tadashi Maeda, Tomegoro Yoshizumi, S. Nishijima, S. Miyoshi, Mohammad Hatta, Soekarno, dan Achmad Soebardjo'}], 'id': 11}, {'answers': [{'text': 'monarki konstitusional yang di dalamnya terdapat kuasa dari seorang Kaisar yang masih dibatasi dan hanya diturunkan terutama ketika melakukan tugas resmi'}], '

{'exact_match': 2.0151133501259446, 'f1': 7.338716682872452}

In [None]:
indo_evaluation

# Gammelt


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

In [None]:
max_length = 512
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
train_dataset = train_set.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train_set.column_names,
)
len(train_set), len(train_dataset)

Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

(11394, 11594)

In [None]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question_text"]]
    inputs = tokenizer(
        questions,
        examples["document_plaintext"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["document_url"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids

    offset_mapping = inputs["offset_mapping"]
    answers = examples["annotations"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
validation_dataset = validation_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)
len(validation_set), len(validation_dataset)

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

(1191, 1210)

# training


In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("bert-base-multilingual-uncased")

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1, # TODO: skift til mere end 1 epochs
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

ImportError: ignored

# Evaluation

In [None]:
from datasets import load_metric
compute_squad = load_metric("squad_v2")

In [None]:
import evaluate

metric = evaluate.load("squad_v2")

In [None]:
small_eval_set = validation_set.select(range(100))

eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)

eval_tokens = small_eval_set.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)
eval_tokens

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping", 'start_positions', 'end_positions'])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}


with torch.no_grad():
    outputs = model(**batch)

In [None]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [None]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    print(examples[0])
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["document_plaintext"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, 'answers': [{'text': best_answer["text"]}]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})
#[{'text': ['reference_answer1']}]
    theoretical_answers = [{"id": ex["id"], 'answers': [{'text': ex["annotations"]["answer_text"]}]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

In [None]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["id"]].append(idx)

In [None]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["document_plaintext"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "indexes": (offsets[start_index][0], offsets[end_index][1]),
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"], "indexes": best_answer["indexes"]})

In [None]:
theoretical_answers = [
    {"id": ex["document_url"], "answers": ex["annotations"]["answer_text"][0], "indexes": ex["annotations"]} for ex in small_eval_set
]

In [None]:
print(predicted_answers[1])
print(theoretical_answers[1])

In [None]:
from sklearn.metrics import f1_score

def calculate_f1_score(predictions, references):
    # Extract the 'answer' values from the dictionaries
    predicted_answers = [item['prediction_text'] for item in predictions]
    reference_answers = [item['answers'] for item in references]
    # print(predicted_answers)
    # print(reference_answers)
    # Calculate the F1 score
    f1 = f1_score(reference_answers, predicted_answers, average='micro')

    return f1

In [None]:
f1_score = calculate_f1_score(predicted_answers, theoretical_answers)
print(f"F1 Score: {f1_score}")

In [None]:
def calculate_exact_match(predictions, references):
    exact_match_count = 0

    for pred, ref in zip(predictions, references):
      #print("")
      #print(pred['prediction_text'])
      #print(ref['answers'])
      if pred['prediction_text'] == ref['answers']:
            exact_match_count += 1

    exact_match_score = exact_match_count / len(predictions)

    return exact_match_score

In [None]:
exact_match_score = calculate_exact_match(predicted_answers, theoretical_answers)
print(f"Exact Match Score: {exact_match_score}")