<a href="https://colab.research.google.com/github/MLFlexer/nlp-course/blob/malthe/assignments/w4_bert_ben.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 39 BERT Bengali
This notebook uses code from lab 6: https://github.com/MLFlexer/nlp-course/blob/cbbb4bc13d7d5f639ada243104b9d85efe1dc166/labs/notebooks_2023/lab_6.ipynb

In [None]:
!pip install update transformers
!pip install datasets
!pip install evaluate

In [None]:
from datasets import load_dataset
from datasets import load_metric
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import AutoConfig
from functools import partial
import torch
import random
import numpy as np
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import LambdaLR
from torch import nn
from collections import defaultdict, OrderedDict
# Checkpoint name passed to both the tokenizer and the QA model loaders below.
# Alternative checkpoint (currently disabled):
# MODEL_NAME = 'xlm-roberta-base'
MODEL_NAME = 'bert-base-multilingual-uncased'
# Number of tokenized training features randomly sampled for fine-tuning.
NUM_SUBSAMPLES = 4779
# Values noted by the author for each language:
#bengali: 4779
#Arabic: 29598
#Indonesian: 11394
# Language used to filter the dataset rows.
LANGUAGE = "bengali" # "arabic" "indonesian"

In [None]:
def enforce_reproducibility(seed=42):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices) with ``seed``, and configures cuDNN to use
    deterministic algorithms with benchmarking turned off.

    Note: for atomic operations there is currently no simple way to enforce
    determinism, as the order of parallel operations is not known.

    :param seed: The seed value shared by all generators
    """
    # System-level generators
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators: CPU, then every CUDA device
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN settings
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed all generators with the default seed.
enforce_reproducibility()

In [None]:
from datasets import load_dataset

# Load the dataset and keep only the rows whose language matches LANGUAGE.
dataset = load_dataset("copenlu/answerable_tydiqa")

filtered_dataset = dataset.filter(lambda row: row["language"] == LANGUAGE)

train_set, validation_set = filtered_dataset["train"], filtered_dataset["validation"]

In [None]:
from datasets import Dataset, DatasetDict
# Give every row a sequential integer `id` so predictions can be mapped back to examples.
# NOTE: `Dataset` here is `datasets.Dataset` (imported just above), which shadows the
# `torch.utils.data.Dataset` imported at the top of the notebook.
train_set_df = train_set.to_pandas()
validation_set_df = validation_set.to_pandas()
for split_df in (train_set_df, validation_set_df):
    split_df['id'] = range(len(split_df))

train_set = Dataset.from_pandas(train_set_df)
validation_set = Dataset.from_pandas(validation_set_df)

In [None]:
# Sanity check: print the size of the validation split, then display one training row.
print(len(validation_set))
train_set[2]

In [None]:
# Load the tokenizer for the MODEL_NAME checkpoint.
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def get_train_features(tk, samples):
  '''
  Tokenizes all of the text in the given samples, splitting inputs that are too long for our model
  across multiple features. Finds the token offsets of the answers, which serve as the labels for
  our inputs.

  :param tk: The tokenizer; must provide `batch_encode_plus` with overflow and offset mapping support
  :param samples: A batch of samples with `question_text`, `document_plaintext` and `annotations`
    columns, where each annotation holds `answer_start` and `answer_text` lists
  :return: The encoded batch with `start_tokens` and `end_tokens` label lists added (one entry per
    feature). Features that do not fully contain the answer get index 0 for both labels.
  '''
  batch = tk.batch_encode_plus(
        [[q,c] for q,c in zip(samples['question_text'], samples['document_plaintext'])],
        padding='max_length',
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

  # Get a list which maps the input features index to their original index in the
  # samples list (for split inputs). E.g. if our batch size is 4 and the second sample
  # is split into 3 inputs because it is very large, sample_mapping would look like
  # [0, 1, 1, 1, 2, 3]
  sample_mapping = batch.pop('overflow_to_sample_mapping')
  # Get all of the character offsets for each token
  offset_mapping = batch.pop('offset_mapping')

  # Store the start and end tokens
  batch['start_tokens'] = []
  batch['end_tokens'] = []

  # Iterate through all of the offsets
  for i, offsets in enumerate(offset_mapping):
    # Get the right sample by mapping it to its original index
    sample_idx = sample_mapping[i]
    # Get the sequence IDs to know where context starts so we can ignore question tokens
    sequence_ids = batch.sequence_ids(i)

    # Get the start and end character positions of the answer
    ans = samples['annotations'][sample_idx]
    start_char = ans['answer_start'][0]
    end_char = start_char + len(ans['answer_text'][0])

    # Locate the first and last context tokens, i.e. the span where sequence_ids is 1
    context_start = 0
    while sequence_ids[context_start] != 1:
      context_start += 1

    context_end = len(offsets) - 1
    while sequence_ids[context_end] != 1:
      context_end -= 1

    # By default set it to the CLS token if the answer isn't in this input
    if start_char < offsets[context_start][0] or end_char > offsets[context_end][1]:
      start_token = 0
      end_token = 0
    # Otherwise find the correct token indices
    else:
      # Advance the start token index until we have passed the start character index.
      # The scan is bounded by the context span: the special/padding tokens after the
      # context can carry offsets that compare <= start_char, so scanning past the context
      # would label one of them when the answer starts in the last context token.
      start_token = context_start
      while start_token <= context_end and offsets[start_token][0] <= start_char:
        start_token += 1
      start_token -= 1

      # Decrease the end token index until we have passed the end character index,
      # again without leaving the context span
      end_token = context_end
      while end_token >= context_start and offsets[end_token][1] >= end_char:
        end_token -= 1
      end_token += 1

    batch['start_tokens'].append(start_token)
    batch['end_tokens'].append(end_token)

  return batch

def collate_fn(inputs):
  '''
  Combines a list of training features into one batch of tensors. The token columns are
  cut down to the longest attended sequence in the batch (per the attention mask).

  :param inputs: List of features with `input_ids`, `attention_mask`, `start_tokens`, `end_tokens`
  :return: Dict of tensors under those same four keys
  '''
  def as_tensor(field):
    # Gather one field from every feature into a single tensor
    return torch.tensor([feature[field] for feature in inputs])

  token_ids = as_tensor('input_ids')
  mask = as_tensor('attention_mask')
  starts = as_tensor('start_tokens')
  ends = as_tensor('end_tokens')

  # Number of attended positions in the longest row of this batch
  longest = max(mask.sum(-1))

  return {
      'input_ids': token_ids[:, :longest],
      'attention_mask': mask[:, :longest],
      'start_tokens': starts,
      'end_tokens': ends,
  }

In [None]:
# Tokenize the training split in batches; the original columns are removed so only the
# fields returned by get_train_features remain.
tokenized_dataset = train_set.map(partial(get_train_features, tk), batched=True, remove_columns=train_set.column_names)

In [None]:
# Display a summary of the tokenized training features.
tokenized_dataset

In [None]:
# Randomly keep at most NUM_SUBSAMPLES tokenized features. The size is clamped because
# random.sample raises ValueError when asked for more items than the population holds
# (e.g. after changing LANGUAGE without updating NUM_SUBSAMPLES).
n_subsamples = min(NUM_SUBSAMPLES, len(tokenized_dataset))
samples = random.sample(range(len(tokenized_dataset)), n_subsamples)
tokenized_dataset = tokenized_dataset.select(samples)
train_dl = DataLoader(tokenized_dataset, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [None]:
import gc

In [None]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    schedule: LambdaLR,
    n_epochs: int,
    device: torch.device
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param optimizer: The optimizer used to update the model parameters
  :param schedule: The learning-rate schedule, advanced once per batch; pass None to skip it
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  :return: A list with the loss value of every batch, across all epochs
  """

  # Keep track of the loss of every batch
  losses = []

  # Iterate through epochs
  for _ in range(n_epochs):
    # VERY IMPORTANT: Make sure the model is in training mode, which turns on
    # things like dropout. Nothing inside the batch loop leaves training mode,
    # so setting it once per epoch is enough.
    model.train()

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the target device
      batch = {b: batch[b].to(device) for b in batch}

      # Pass the inputs through the model, get the current loss and logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask'],
          start_positions=batch['start_tokens'],
          end_positions=batch['end_tokens']
      )
      loss = outputs['loss']
      losses.append(loss.item())

      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model and advance the LR schedule.
      # Use the `schedule` argument rather than a global so the function works
      # with whatever scheduler the caller passes in.
      optimizer.step()
      if schedule is not None:
        schedule.step()
      gc.collect()
  return losses

In [None]:
# Load the MODEL_NAME checkpoint through AutoModelForQuestionAnswering and move it to `device`.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

In [None]:
# Create the optimizer
# Optimisation hyperparameters
lr = 2e-5
n_epochs = 5
weight_decay = 0.01
warmup_steps = 200

# Parameters whose name contains one of these substrings get no weight decay
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour of it.
# eps=1e-6 keeps the default epsilon of the transformers implementation.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-6)
# Linear warmup for `warmup_steps` updates, then linear decay over the remaining updates
# (one scheduler step per training batch)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=n_epochs * len(train_dl)
)

In [None]:
# Fine-tune the model; `losses` receives the value returned by `train`.
losses = train(
    model=model,
    train_dl=train_dl,
    optimizer=optimizer,
    schedule=scheduler,
    n_epochs=n_epochs,
    device=device,
)

In [None]:
def get_validation_features(tk, samples):
  '''
  Tokenizes question/context pairs for evaluation, splitting long contexts across
  several features, and records which example each feature came from.

  :param tk: The tokenizer; must provide `batch_encode_plus` with overflow and offset mapping support
  :param samples: A batch of samples with `question_text`, `document_plaintext` and `id` columns
  :return: The encoded batch with an `example_id` list added and with `offset_mapping`
    entries of non-context tokens replaced by None
  '''
  question_context_pairs = [
      [q, c] for q, c in zip(samples['question_text'], samples['document_plaintext'])
  ]
  # Offsets and overflowing sequences are requested so long inputs are broken up and
  # the original answer text can be recovered later
  batch = tk.batch_encode_plus(
        question_context_pairs,
        padding='max_length',
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

  # For every feature, the index of the sample in `samples` that produced it
  sample_map = batch.pop('overflow_to_sample_mapping')

  example_ids = []
  for i in range(len(batch['input_ids'])):
    seq_ids = batch.sequence_ids(i)

    # Remember which sample this feature belongs to (used to calculate the squad score)
    example_ids.append(samples['id'][sample_map[i]])

    # Keep offsets only for context tokens; everything else becomes None
    batch['offset_mapping'][i] = [
        offset if seq_id == 1 else None
        for seq_id, offset in zip(seq_ids, batch['offset_mapping'][i])
    ]

  batch['example_id'] = example_ids
  return batch

def val_collate_fn(inputs):
  '''
  Combines a list of validation features into one batch of tensors, cut down to the
  longest attended sequence in the batch (per the attention mask).

  :param inputs: List of features with `input_ids` and `attention_mask`
  :return: Dict of tensors under those same two keys
  '''
  token_ids = torch.tensor([feature['input_ids'] for feature in inputs])
  mask = torch.tensor([feature['attention_mask'] for feature in inputs])

  # Number of attended positions in the longest row of this batch
  longest = max(mask.sum(-1))

  return {
      'input_ids': token_ids[:, :longest],
      'attention_mask': mask[:, :longest],
  }

In [None]:
# Tokenize the validation split in batches; the original columns are removed so only the
# fields returned by get_validation_features remain.
validation_dataset = validation_set.map(partial(get_validation_features, tk), batched=True, remove_columns=validation_set.column_names)

In [None]:
def predict(model: nn.Module, valid_dl: DataLoader):
  """
  Runs the model over the given dataset and collects its start/end logits
  :param model: The model under evaluation
  :param valid_dl: A `DataLoader` reading validation data
  :return: A tuple `(start_logits_all, end_logits_all)`; each is a list holding one
    numpy array of per-token logits for every input feature, in dataloader order
  """
  # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like dropout
  model.eval()
  start_logits_all = []
  end_logits_all = []

  # Send inputs to the device the model's weights live on instead of relying on a
  # global `device` variable; fall back to the CPU for a parameter-free model.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  # ALSO IMPORTANT: Don't accumulate gradients during this process
  with torch.no_grad():
    for batch in tqdm(valid_dl, desc='Evaluation'):
      batch = {b: batch[b].to(target_device) for b in batch}

      # Pass the inputs through the model, get the logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask']
      )
      # Store the "start" class logits and "end" class logits for every token in the input
      start_logits_all.extend(list(outputs['start_logits'].cpu().numpy()))
      end_logits_all.extend(list(outputs['end_logits'].cpu().numpy()))

  return start_logits_all, end_logits_all

def post_process_predictions(examples, dataset, logits, tokenizer, num_possible_answers = 20, max_answer_length = 30):
  """
  Converts per-feature start/end logits into one answer string per example
  :param examples: The original samples; supports `examples["id"]` and iteration over rows
    that have `id` and `document_plaintext`
  :param dataset: The tokenized features, each with `example_id`, `offset_mapping` and `input_ids`
  :param logits: A tuple `(all_start_logits, all_end_logits)` aligned with `dataset`
  :param tokenizer: Tokenizer providing `cls_token_id`
  :param num_possible_answers: How many top start and top end indices to combine per feature
  :param max_answer_length: Maximum answer length, in tokens
  :return: An `OrderedDict` mapping example id to the predicted answer text; the empty string
    means no answer was predicted
  """
  all_start_logits, all_end_logits = logits
  # Build a map from example to its corresponding features. This will allow us to index from
  # sample ID to all of the features for that sample (in case they were split up due to long input)
  example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
  features_per_example = defaultdict(list)
  for i, feature in enumerate(dataset):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

  # Create somewhere to store our predictions
  predictions = OrderedDict()

  # Iterate through each sample in the dataset
  for j, sample in enumerate(tqdm(examples)):

    # Get the feature indices (all of the features split across the batch)
    feature_indices = features_per_example[j]
    # Get the original context which presumably has the answer text
    context = sample['document_plaintext']

    preds = []

    # Largest null (CLS) score seen across this example's features
    min_score_threshold = None

    # Iterate through all of the features
    for ft_idx in feature_indices:

      # Get the start and end answer logits for this input
      start_logits = all_start_logits[ft_idx]
      end_logits = all_end_logits[ft_idx]

      # Fetch the feature row once and reuse it for both fields
      feature = dataset[ft_idx]
      # Get the offsets to map token indices to character indices
      offset_mapping = feature['offset_mapping']

      # Update the null prediction threshold.
      cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
      feature_min_score_threshold = start_logits[cls_index] + end_logits[cls_index]
      if min_score_threshold is None or min_score_threshold < feature_min_score_threshold:
          min_score_threshold = feature_min_score_threshold

      # Sort the logits and take the top N
      start_indices = np.argsort(start_logits)[::-1][:num_possible_answers]
      end_indices = np.argsort(end_logits)[::-1][:num_possible_answers]

      # Iterate through start and end indices
      for start_index in start_indices:
        # A start index outside the context can never yield a valid span
        if start_index >= len(offset_mapping) or offset_mapping[start_index] is None:
          continue

        for end_index in end_indices:

          # Ignore this combination if the end index is not in the context
          if end_index >= len(offset_mapping) or offset_mapping[end_index] is None:
            continue

          # Also ignore if the start index is greater than the end index or the number of tokens
          # is greater than some specified threshold
          if start_index > end_index or end_index - start_index + 1 > max_answer_length:
            continue

          ans_text = context[offset_mapping[start_index][0]:offset_mapping[end_index][1]]
          preds.append({
              'score': start_logits[start_index] + end_logits[end_index],
              'text': ans_text
          })

    if len(preds) > 0:
      # Take the top-scoring answer (first one wins on ties)
      best_answer = max(preds, key=lambda x: x['score'])
    else:
      best_answer = {'score': 0.0, 'text': ""}

    # if the best answer does not beat the null score, give it the empty string.
    # An example without any features has no null score to compare against and no
    # candidates, so it also gets the empty string instead of raising on `> None`.
    if min_score_threshold is None:
      answer = ""
    else:
      answer = best_answer["text"] if best_answer["score"] > min_score_threshold else ""
    predictions[sample['id']] = answer
  return predictions

In [None]:
# Batch the validation features (no shuffling) and collect the model's start/end logits.
val_dl = DataLoader(validation_dataset, collate_fn=val_collate_fn, batch_size=32)
logits = predict(model, val_dl)

In [None]:
# Display a summary of the tokenized validation features.
validation_dataset

In [None]:
# Turn the logits into one answer string per validation example
predictions = post_process_predictions(validation_set, validation_dataset, logits, tk)
# Shape predictions and gold answers as lists of dicts carrying the example id
formatted_predictions = [
    {'id': example_id, 'prediction_text': text}
    for example_id, text in predictions.items()
]
gold = [
    {'id': row['id'], 'answers': row['annotations']["answer_text"][0]}
    for row in validation_set
]

In [None]:
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # 1. lowercase
    text = s.lower()
    # 2. drop every character listed in string.punctuation
    punctuation = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in punctuation)
    # 3. blank out the English articles
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    # 4. collapse runs of whitespace into single spaces
    return ' '.join(text.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a ground truth, both normalized first.

    Returns 1 when both are empty after normalization and 0 when they share no tokens.
    """
    pred_toks = normalize_answer(prediction).split()
    gold_toks = normalize_answer(ground_truth).split()
    if not pred_toks and not gold_toks:
      return 1
    # Multiset intersection counts each shared token at most as often as it occurs in both
    overlap = sum((Counter(pred_toks) & Counter(gold_toks)).values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(pred_toks)
    recall = 1.0 * overlap / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when prediction and ground truth are equal after normalization."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score the prediction against every ground truth and return the best score."""
    return max([metric_fn(prediction, truth) for truth in ground_truths])


def evaluate_squad(dataset, predictions):
    """Compute SQuAD-style exact-match and F1 percentages.

    :param dataset: List of articles, each with `paragraphs` -> `qas` -> (`id`, `answers`),
        where every answer is a dict with a `text` field
    :param predictions: Dict mapping question id to the predicted answer text
    :return: Dict with `exact_match` and `f1`, both on a 0-100 scale. Questions without a
        prediction count towards the total and score 0. An empty dataset scores 0.0 for both.
    """
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    # Format the id instead of concatenating: ids are not guaranteed to be
                    # strings (this notebook assigns integer ids), and `str + int` raises.
                    message = 'Unanswered question {} will receive score 0.'.format(qa['id'])
                    print(message, file=sys.stderr)
                    continue
                ground_truths = [answer['text'] for answer in qa['answers']]
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    # Nothing to score: avoid dividing by zero
    if total == 0:
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

def compute_squad(predictions, references):
  """Adapt flat prediction/reference lists to the nested layout `evaluate_squad` expects.

  :param predictions: List of dicts with `id` and `prediction_text`
  :param references: List of dicts with `id` and `answers` (a single answer text)
  :return: Whatever `evaluate_squad` returns for the converted inputs
  """
  pred_dict = {}
  for prediction in predictions:
    pred_dict[prediction["id"]] = prediction["prediction_text"]

  # One article holding one paragraph that carries every question
  qas = [
      {"answers": [{"text": ref["answers"]}], "id": ref["id"]}
      for ref in references
  ]
  dataset = [{"paragraphs": [{"qas": qas}]}]

  return evaluate_squad(dataset=dataset, predictions=pred_dict)

In [None]:
# Score the predictions against the gold answers; the cell displays the returned metrics dict.
compute_squad(references=gold, predictions=formatted_predictions)