In [None]:
!pip install transformers datasets
!pip install -U accelerate
!pip install datasets

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# SQuAD dataset, fetched by name through datasets.load_dataset
raw_datasets = load_dataset("squad")

# checkpoint name used for the tokenizer here and for the model further down
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenisation

In [None]:
def find_answer_token_idx(
    ctx_start,
    ctx_end,
    ans_start_char,
    ans_end_char,
    offset):
  """
  Locates the answer span inside one context window.

  Returns the (start, end) token indices of the answer if the answer is
  fully included in the context window, otherwise (0, 0).

  The span runs from the last context token that starts at or before
  ans_start_char to the first context token that ends at or after
  ans_end_char. Answers whose boundaries fall inside a token are therefore
  still labelled, and only tokens in [ctx_start, ctx_end] are inspected,
  never the special or padding tokens that follow the context.

  ctx_start: Start-token index of the context window.
  ctx_end: End-token index of the context window (inclusive).
  ans_start_char: Position of the first char of the answer in the context.
  ans_end_char: End position of the answer in the context, on the same
                convention as the end value of the offset tuples.
  offset: List of tuples, containing the start and end char of every
          token of the input.
  """
  # answer starts before or ends after this window -> not fully included
  if offset[ctx_start][0] > ans_start_char or offset[ctx_end][1] < ans_end_char:
    return 0, 0

  # advance to the last context token starting at or before the answer start
  start_idx = ctx_start
  while start_idx < ctx_end and offset[start_idx + 1][0] <= ans_start_char:
    start_idx += 1

  # walk back to the first context token ending at or after the answer end
  end_idx = ctx_end
  while end_idx > ctx_start and offset[end_idx - 1][1] >= ans_end_char:
    end_idx -= 1

  return start_idx, end_idx

In [None]:
# Google used these values in their experiments;
# both are passed to the tokenizer when the context is split into windows
max_length = 384
stride = 128

def tokenize_fn_train(batch):
  """
  Turns every question+context pair of the batch into one or more
  question+context_window samples of max_length tokens and attaches,
  for each window, the start and end token index of the answer as
  returned by find_answer_token_idx.

  batch: Batch of input samples.
  """
  # questions may carry leading and/or trailing whitespace
  stripped_questions = [question.strip() for question in batch["question"]]

  inputs = tokenizer(
    stripped_questions,
    batch["context"],
    max_length=max_length,
    # only the context (second sequence) is truncated and split
    truncation="only_second",
    stride=stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    # pad every window up to max_length tokens
    padding="max_length",
  )

  offset_mapping = inputs.pop("offset_mapping")
  orig_sample_idxs = inputs.pop("overflow_to_sample_mapping")
  answers = batch['answers']

  def locate_answer(window_idx, offset):
    # answer of the original sample this window was cut from
    answer = answers[orig_sample_idxs[window_idx]]
    first_char = answer['answer_start'][0]
    end_char = first_char + len(answer['text'][0])

    # context tokens are the ones with sequence id 1: take first and last
    seq_ids = inputs.sequence_ids(window_idx)
    ctx_start = seq_ids.index(1)
    ctx_end = len(seq_ids) - 1 - seq_ids[::-1].index(1)

    return find_answer_token_idx(
      ctx_start, ctx_end, first_char, end_char, offset)

  spans = [
    locate_answer(window_idx, offset)
    for window_idx, offset in enumerate(offset_mapping)
  ]

  inputs["start_positions"] = [start for start, _ in spans]
  inputs["end_positions"] = [end for _, end in spans]
  return inputs

In [None]:
# Tokenize the training split in batches; the original columns are removed,
# so only what tokenize_fn_train returns is kept.
train_dataset = raw_datasets["train"].map(
  tokenize_fn_train,
  batched=True,
  remove_columns=raw_datasets["train"].column_names,
)
# number of original samples vs. number of rows after mapping
len(raw_datasets["train"]), len(train_dataset)

In [None]:
# tokenize the validation set differently,
# since the metric is computed from the string answers
# also: overwrite offset_mapping with None for tokens outside the context (e.g. the question)
def tokenize_fn_validation(batch):
  """
  Splits every question+context pair of the batch into
  question+context_window samples of max_length tokens and keeps the
  information needed to map predictions back to answer strings:

  - "sample_id": id of the original sample each window was cut from
  - "offset_mapping": (start, end) chars of every token, replaced by None
    for every token whose sequence id is not 1, i.e. outside the context

  batch: Batch of input samples.
  """
  stripped_questions = [question.strip() for question in batch["question"]]

  inputs = tokenizer(
    stripped_questions,
    batch["context"],
    max_length=max_length,
    truncation="only_second",
    stride=stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
  )

  window_to_sample = inputs.pop("overflow_to_sample_mapping")
  n_windows = len(inputs["input_ids"])

  sample_ids = []
  for window_idx in range(n_windows):
    # remember which original sample this window belongs to
    sample_ids.append(batch['id'][window_to_sample[window_idx]])

    # keep offsets only where the token belongs to the context;
    # this makes it easy to reject non-context tokens when scoring
    seq_ids = inputs.sequence_ids(window_idx)
    window_offsets = inputs["offset_mapping"][window_idx]
    inputs["offset_mapping"][window_idx] = [
      char_span if seq_ids[token_idx] == 1 else None
      for token_idx, char_span in enumerate(window_offsets)
    ]

  inputs['sample_id'] = sample_ids
  return inputs

In [None]:
# Tokenize the validation split in batches; the original columns are removed,
# so only what tokenize_fn_validation returns is kept.
validation_dataset = raw_datasets["validation"].map(
  tokenize_fn_validation,
  batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
# number of original samples vs. number of rows after mapping
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

(10570, 10822)

# Metrics

In [None]:
# NOTE(review): `load_metric` has been deprecated in `datasets` in favour of the
# separate `evaluate` package — confirm the installed version still provides it.
from datasets import load_metric
# SQuAD metric object; scores are obtained through metric.compute(...)
metric = load_metric("squad")

In [None]:
# Toy example of the input format metric.compute is called with:
# one dict per sample for the predictions ...
predicted_answers = [
  {'id': '1', 'prediction_text': 'Albert Einstein'},
  {'id': '2', 'prediction_text': 'physicist'},
  {'id': '3', 'prediction_text': 'general relativity'},
]
# ... and one dict per sample for the reference answers
true_answers = [
  {'id': '1', 'answers': {'text': ['Albert Einstein'], 'answer_start': [100]}},
  {'id': '2', 'answers': {'text': ['physicist'], 'answer_start': [100]}},
  {'id': '3', 'answers': {'text': ['special relativity'], 'answer_start': [100]}},
]

# exact match only counts the samples where the answer is fully correct.
# f1 also takes partially correct answers into account.
metric.compute(predictions=predicted_answers, references=true_answers)

In [None]:
from tqdm.autonotebook import tqdm

# only combine the n_largest highest-scoring start tokens and end tokens
n_largest = 20
# max length of an answer, counted in tokens
max_answer_length = 30


def compute_metrics(start_logits, end_logits, processed_dataset, orig_dataset):
  """
  Determines the answer text of every original sample from the model
  output and scores the answers with the SQuAD metric.

  For every context window of a sample, the n_largest highest-scoring
  start tokens are combined with the n_largest highest-scoring end tokens.
  A (start, end) pair is a valid answer candidate if:

  1. start-token timestep <= end-token timestep

  2. start- and end-token are both included in the context_window part
     of the input

  3. the answer is at most max_answer_length tokens long

  The candidate with the highest start logit + end logit over all windows
  of the sample is used. If a sample has no valid candidate, the empty
  string is predicted.

  start_logits: (batch_size, sequence_length), logits for the start-token,
                indexed like the rows of processed_dataset
  end_logits: (batch_size, sequence_length), logits for the end-token,
              indexed like the rows of processed_dataset
  processed_dataset: tokenized and context-windowed dataset
  orig_dataset: unprocessed dataset

  Returns the result of metric.compute for the predicted answers.
  """
  # maps the id of every original sample to the row indices of its
  # question+context_window pairs
  sample_id2idxs = {}
  for i, id_ in enumerate(processed_dataset['sample_id']):
    sample_id2idxs.setdefault(id_, []).append(i)

  predicted_answers = []
  for sample in tqdm(orig_dataset):

    sample_id = sample['id']
    context = sample['context']

    # initialisation; the empty string (not None) keeps the prediction a
    # string for the metric even when no candidate turns out to be valid
    best_score = float('-inf')
    best_answer = ''

    # loop over every question+context_window pair of this sample to find
    # the best-scoring span over all of its windows
    for idx in sample_id2idxs[sample_id]:
      start_logit = start_logits[idx] # (seq_len,) vector
      end_logit = end_logits[idx] # (seq_len,) vector

      # do NOT do the reverse indexing: ['offset_mapping'][idx], much slower
      offsets = processed_dataset[idx]['offset_mapping']

      # token indices sorted by descending logit
      start_indices = (-start_logit).argsort()
      end_indices = (-end_logit).argsort()
      for start_idx in start_indices[:n_largest]:
        for end_idx in end_indices[:n_largest]:

          # skip answers not contained in context window
          if offsets[start_idx] is None or offsets[end_idx] is None:
            continue

          # skip answers where end < start
          if end_idx < start_idx:
            continue

          # skip answers that are too long
          if end_idx - start_idx + 1 > max_answer_length:
            continue

          # summing logits ranks spans like multiplying probabilities,
          # see the section "From logits to probability scores"
          score = start_logit[start_idx] + end_logit[end_idx]
          if score > best_score:
            best_score = score

            # find positions of start and end characters
            first_ch = offsets[start_idx][0]
            last_ch = offsets[end_idx][1]

            best_answer = context[first_ch:last_ch]

    # save best answer in needed format
    predicted_answers.append({'id': sample_id, 'prediction_text': best_answer})

  # save labels in needed format
  true_answers = [
    {'id': x['id'], 'answers': x['answers']} for x in orig_dataset
  ]
  return metric.compute(predictions=predicted_answers, references=true_answers)

## From logits to probability scores

$
P(s_i)...probability ~that ~start ~token ~of ~answer ~is ~on ~timestep ~i  \\  
P(e_j)...probability ~that ~end ~token ~of ~answer ~is ~on ~timestep ~j
$

Goal, with the independence assumption on the distributions:

$
P(s_i, e_j)=P(s_i)P(e_j) ~~ -> ~~ max
$

numerically stable computation via logits:

$
l(s_i) = logit(s_i) ~~~~~ s_i...~start ~token ~on ~timestep ~i  \\  
l(e_j) = logit(e_j) ~~~~~ e_j...~end ~token ~on ~timestep ~j
$

$P(s_i)=\frac{e^{l(s_i)}}{\sum_{n=1}^{T}e^{l(s_n)}}=\frac{e^{l(s_i)}}{Z_s}$

$P(e_j)=\frac{e^{l(e_j)}}{\sum_{n=1}^{T}e^{l(e_n)}}=\frac{e^{l(e_j)}}{Z_e}$

$
\underset{i,j}{argmax} ~ P(s_i)P(e_j)  \\
= \underset{i,j}{argmax} ~ \frac{e^{l(s_i)}}{Z_s}\frac{e^{l(e_j)}}{Z_e}
$

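$Z_s$ and $Z_e$ are positive constants that do not depend on $i$ or $j$, and $\log$ is strictly increasing, so neither dropping them nor taking the logarithm changes the argmax: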

$
\underset{i,j}{argmax} ~ \frac{e^{l(s_i)}}{Z_s}\frac{e^{l(e_j)}}{Z_e} ~=~ \underset{i,j}{argmax} ~ e^{l(s_i)}e^{l(e_j)} ~=~
\underset{i,j}{argmax} ~ log(e^{l(s_i)}e^{l(e_j)}) \\
\\
=\underset{i,j}{argmax} ~ l(s_i)+l(e_j)
$

# Train and evaluate

In [None]:
from transformers import Trainer
from transformers import TrainingArguments
from google.colab import files

# question-answering head on top of the same checkpoint the tokenizer was loaded from
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# No evaluation strategy is passed: the library default is "no" (no evaluation
# during training), and the keyword for it was renamed between transformers
# releases, so leaving it out keeps this cell working on old and new versions.
# NOTE(review): fp16=True presumably needs a GPU runtime — confirm.
args = TrainingArguments(
    "finetuned-squad",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
)

In [None]:
# Build the trainer from the model, the training arguments and the tokenized datasets.
# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    # alternative for a quick run on a 1,000-sample subset:
    # train_dataset=train_dataset.shuffle(seed=42).select(range(1_000)),
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
# fine-tune the model
trainer.train()

In [None]:
# run the trained model over every row of the tokenized validation set
trainer_output = trainer.predict(validation_dataset)

# only the first element of the output is needed here
predictions, _, _ = trainer_output

# the predictions consist of the start logits and the end logits
start_logits, end_logits = predictions

# turn the logits into answer strings and score them against the original answers
compute_metrics(
    start_logits,
    end_logits,
    validation_dataset, # processed
    raw_datasets["validation"], # orig
)

# Saving and uploading to S3

In [None]:
# save the trained model under the relative path 'trained_model'
trainer.save_model('trained_model')

In [None]:
# for direct download: zip the saved model directory and
# pass the archive to the Colab `files` helper
!zip -r trained_model.zip trained_model
files.download('trained_model.zip')

In [None]:
import boto3

# S3 client; credentials and region come from the environment boto3 is configured with
s3 = boto3.client('s3')

bucket_name = 'extractive-qa-models-v1'

def create_bucket(bucket_name):
    """
    Creates the S3 bucket `bucket_name`, unless this account already owns it.

    Outside us-east-1, S3 requires the bucket region to be passed as a
    location constraint, so the region of the client is forwarded.

    bucket_name: Name of the bucket to create.
    """
    params = {'Bucket': bucket_name}
    region = s3.meta.region_name
    if region and region != 'us-east-1':
        params['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        s3.create_bucket(**params)
    except s3.exceptions.BucketAlreadyOwnedByYou:
        # re-running the cell must not fail on a bucket created by an earlier run
        print("Bucket already exists")
        return
    print("Bucket is created")

create_bucket(bucket_name)

In [None]:
import os
def upload_directory(directory_path, s3_prefix, bucket=None):
    """
    Uploads every file below `directory_path` to S3, keeping the relative
    directory layout underneath `s3_prefix`.

    directory_path: Local directory to upload (searched recursively).
    s3_prefix: Key prefix the relative file paths are appended to.
    bucket: Target bucket; defaults to the notebook-level `bucket_name`.

    Raises NotADirectoryError if `directory_path` is not an existing
    directory, instead of silently uploading nothing.
    """
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(f"Not a directory: {directory_path}")

    target_bucket = bucket_name if bucket is None else bucket

    for root, _dirs, filenames in os.walk(directory_path):
        for filename in filenames:
            # the local path is used as-is, so the file can be opened on any OS
            file_path = os.path.join(root, filename)
            relpath = os.path.relpath(file_path, directory_path)
            # S3 keys use "/" regardless of the local path separator
            s3_key = os.path.join(s3_prefix, relpath).replace(os.sep, "/")

            s3.upload_file(file_path, target_bucket, s3_key)

# NOTE(review): the model was saved to the relative path 'trained_model';
# this absolute path presumably matches only when the working directory is /content.
upload_directory('/content/trained_model', 'models')

In [None]:
def list_objects():
    """
    Prints the key of every object in the bucket `bucket_name`.

    Prints nothing for an empty bucket.
    """
    # list_objects_v2 returns the keys in pages; walk through all of them
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        # 'Contents' is missing from a response that holds no objects
        for obj in page.get('Contents', []):
            print(obj['Key'])

list_objects()