# Zero shot sequence labeling
This notebook assumes that a trained question-answering model already exists; it only loads that model and evaluates it.

This notebook is made with modified code from lab 6: https://github.com/coastalcph/nlp-course/blob/44633220993b07e10f81de4edaf007868b46392d/labs/notebooks_2023/lab_6.ipynb

In [36]:
!pip install update transformers
!pip install datasets
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [37]:
from datasets import load_dataset
from datasets import load_metric
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import AutoConfig
from functools import partial
import torch
import random
import numpy as np
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import LambdaLR
from torch import nn
from collections import defaultdict, OrderedDict
# Checkpoint name used to load the tokenizer (alternative kept for reference):
# MODEL_NAME = 'xlm-roberta-base'
MODEL_NAME = 'bert-base-multilingual-uncased'
# Per-language counts kept for reference -- presumably training-set sizes; TODO confirm.
# NUM_SUBSAMPLES = 11394
#bengali: 4779
#Arabic: 29598
#Indonesian: 11394
# Language whose validation split is evaluated in the first part of the notebook.
LANGUAGE = "indonesian" # other options: "bengali", "arabic"

In [38]:
def enforce_reproducibility(seed=42):
    """Seed every random number generator used here and pin cuDNN to deterministic mode.

    :param seed: Integer seed applied to Python's `random`, NumPy and PyTorch (CPU and CUDA)
    """
    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then every CUDA device
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: turn off the auto-tuner and request deterministic kernels.
    # Atomic operations can still be non-deterministic, because the order
    # in which parallel operations run is not known.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Use the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed all generators once, before any data or model work.
enforce_reproducibility()

In [39]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict

# Load the full multilingual dataset, then keep only the rows for LANGUAGE.
dataset = load_dataset("copenlu/answerable_tydiqa")
filtered_dataset = dataset.filter(lambda entry: entry["language"] == LANGUAGE)

# Round-trip each split through pandas to attach a sequential integer `id`
# column; the id later maps tokenized features back to their source example.
train_set_df = filtered_dataset["train"].to_pandas()
train_set_df['id'] = range(len(train_set_df))
train_set = Dataset.from_pandas(train_set_df)

validation_set_df = filtered_dataset["validation"].to_pandas()
validation_set_df['id'] = range(len(validation_set_df))
validation_set = Dataset.from_pandas(validation_set_df)

  0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
# Tokenizer for MODEL_NAME; the same tokenizer is reused for every evaluation in this notebook.
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

In [41]:
# Load a saved question-answering model from the Kaggle input directory and move it to `device`.
# NOTE(review): the path suggests an Arabic-trained model, which would make the
# evaluation on LANGUAGE below zero-shot -- confirm.
model = AutoModelForQuestionAnswering.from_pretrained("/kaggle/input/arabic-model").to(device)

In [42]:
def get_validation_features(tk, samples):
  """
  Tokenizes question/context pairs into validation features
  :param tk: Tokenizer providing `batch_encode_plus`; its result must support `pop`,
             item access and `sequence_ids(i)`
  :param samples: Batch of examples with `question_text`, `document_plaintext` and `id` columns
  :return: The encoded batch with an added `example_id` list and with `offset_mapping`
           entries set to None for every token outside the context
  """
  question_context_pairs = [
      [question, context]
      for question, context in zip(samples['question_text'], samples['document_plaintext'])
  ]

  # Long contexts overflow into several features (overlapping by `stride` tokens);
  # the offsets are kept so predicted token spans can be mapped back to characters.
  batch = tk.batch_encode_plus(
      question_context_pairs,
      padding='max_length',
      truncation='only_second',
      stride=128,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
  )

  # One example ID per feature, used later to compute the squad score
  batch['example_id'] = []
  # For each feature, the index of the sample in `samples` it was cut from
  sample_map = batch.pop('overflow_to_sample_mapping')

  for feature_idx in range(len(batch['input_ids'])):
    # Record which sample this feature belongs to
    source_sample = sample_map[feature_idx]
    batch['example_id'].append(samples['id'][source_sample])

    # Keep offsets only where the sequence id is 1 (the context); everything else becomes None
    seq_ids = batch.sequence_ids(feature_idx)
    batch['offset_mapping'][feature_idx] = [
        offset if seq_id == 1 else None
        for seq_id, offset in zip(seq_ids, batch['offset_mapping'][feature_idx])
    ]

  return batch

def val_collate_fn(inputs):
  """
  Collates validation features into a batch of tensors
  :param inputs: List of features, each with equal-length `input_ids` and `attention_mask` lists
  :return: Dict with `input_ids` and `attention_mask` tensors, cut to the largest
           attention-mask sum found in the batch
  """
  ids = torch.tensor([feature['input_ids'] for feature in inputs])
  mask = torch.tensor([feature['attention_mask'] for feature in inputs])

  # Largest number of attended tokens in the batch; columns past it are dropped
  # (this assumes the padding sits at the end of each row)
  longest = mask.sum(-1).max()

  return {'input_ids': ids[:, :longest], 'attention_mask': mask[:, :longest]}

In [43]:
# Tokenize the validation split in batches. The original columns are removed, so only
# the tokenizer outputs and `example_id` remain in the resulting feature dataset.
validation_dataset = validation_set.map(partial(get_validation_features, tk), batched=True, remove_columns=validation_set.column_names)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [44]:
def predict(model: nn.Module, valid_dl: DataLoader):
  """
  Runs the model over the validation data and collects its span logits
  :param model: The question-answering model under evaluation; its output must
                expose `start_logits` and `end_logits` by key
  :param valid_dl: A `DataLoader` yielding dicts with `input_ids` and `attention_mask` tensors
  :return: A tuple `(start_logits_all, end_logits_all)`: two lists holding one numpy
           array of per-token logits for each input feature, in loader order
  """
  # Put the model in "eval" mode so that dropout is switched off during inference
  model.eval()
  start_logits_all = []
  end_logits_all = []

  # No gradients are needed for inference
  with torch.no_grad():
    for batch in tqdm(valid_dl, desc='Evaluation'):
      # Move every tensor of the batch to the notebook-level `device`
      batch = {name: tensor.to(device) for name, tensor in batch.items()}

      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask']
      )
      # Extending with a 2-D array appends one row (one feature's logits) at a time
      start_logits_all.extend(outputs['start_logits'].detach().cpu().numpy())
      end_logits_all.extend(outputs['end_logits'].detach().cpu().numpy())

  return start_logits_all, end_logits_all

def post_process_predictions(examples, dataset, logits, tokenizer, num_possible_answers = 20, max_answer_length = 30):
  """
  Turns per-feature start/end logits into one answer string per example
  :param examples: The original examples; must support `examples["id"]` and iteration
                   over rows that have `id` and `document_plaintext`
  :param dataset: The tokenized features (rows with `example_id`, `input_ids` and
                  `offset_mapping`), indexable by feature position
  :param logits: Pair `(all_start_logits, all_end_logits)`, each indexable by feature position
  :param tokenizer: Tokenizer whose `cls_token_id` locates the [CLS] token of a feature
  :param num_possible_answers: How many top start and top end positions to combine per feature
  :param max_answer_length: Longest allowed answer span, in tokens
  :return: An `OrderedDict` mapping example id to the predicted answer text; the empty
           string means "no answer"
  """
  all_start_logits, all_end_logits = logits

  # Map each example position to the indices of all features cut from it
  # (a long context is split across several features)
  example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
  features_per_example = defaultdict(list)
  for i, feature in enumerate(dataset):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

  predictions = OrderedDict()

  for j, sample in enumerate(tqdm(examples)):
    # The original context, which presumably contains the answer text
    context = sample['document_plaintext']
    preds = []
    min_score_threshold = None

    for ft_idx in features_per_example.get(j, []):
      start_logits = all_start_logits[ft_idx]
      end_logits = all_end_logits[ft_idx]

      # Fetch the feature row once; it supplies both the offsets and the input ids
      feature = dataset[ft_idx]
      # Offsets map token indices to character indices (None outside the context)
      offset_mapping = feature['offset_mapping']
      num_offsets = len(offset_mapping)

      # Null ("no answer") score: the [CLS] start logit plus the [CLS] end logit.
      # When an example has several features, the comparison below keeps the
      # LARGEST null score seen.
      # NOTE(review): the variable name suggests a minimum -- confirm which is intended.
      cls_index = feature["input_ids"].index(tokenizer.cls_token_id)
      feature_min_score_threshold = start_logits[cls_index] + end_logits[cls_index]
      if min_score_threshold is None or min_score_threshold < feature_min_score_threshold:
          min_score_threshold = feature_min_score_threshold

      # Top-N positions by logit, best first, restricted to tokens inside the context
      start_indices = [
          idx for idx in np.argsort(start_logits)[::-1][:num_possible_answers]
          if idx < num_offsets and offset_mapping[idx] is not None
      ]
      end_indices = [
          idx for idx in np.argsort(end_logits)[::-1][:num_possible_answers]
          if idx < num_offsets and offset_mapping[idx] is not None
      ]

      for start_index in start_indices:
        for end_index in end_indices:
          # Skip spans that end before they start or that exceed the token limit
          if start_index > end_index or end_index - start_index + 1 > max_answer_length:
            continue
          try:
              ans_text = context[offset_mapping[start_index][0]:offset_mapping[end_index][1]]
              preds.append({
                  'score': start_logits[start_index] + end_logits[end_index],
                  'text': ans_text
              })
          except Exception:
              # Best effort: a candidate whose offsets cannot be applied is dropped
              continue

    if preds:
      # max() returns the first highest-scoring candidate, like a stable descending sort
      best_answer = max(preds, key=lambda x: x['score'])
    else:
      best_answer = {'score': 0.0, 'text': ""}

    if min_score_threshold is None:
      # The example has no features at all, so there is nothing to extract an answer from
      answer = ""
    else:
      # Predict "no answer" unless the best span beats the null score
      answer = best_answer["text"] if best_answer["score"] > min_score_threshold else ""
    predictions[sample['id']] = answer
  return predictions

In [45]:
# Batch the validation features and run the model once to collect the start/end logits.
val_dl = DataLoader(validation_dataset, collate_fn=val_collate_fn, batch_size=32)
logits = predict(model, val_dl)

Evaluation: 100%|██████████| 38/38 [00:18<00:00,  2.06it/s]


In [46]:
# Inspect the feature dataset; long contexts are split, so it can hold more rows than there are examples.
validation_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 1210
})

In [47]:
# Convert the logits into one answer string per validation example.
predictions = post_process_predictions(validation_set, validation_dataset, logits, tk)
# Reshape the predictions into the list-of-dicts form that compute_squad consumes.
formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
# Reference answers: only the first annotated answer text of each example is used.
gold = [{'id': example['id'], 'answers': example['annotations']["answer_text"][0]} for example in validation_set]

100%|██████████| 1191/1191 [00:06<00:00, 188.80it/s]


In [48]:
""" **MODIFIED code from lab 6** Official evaluation script for v1.1 of the SQuAD dataset. """
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Lower-case the text, then strip ASCII punctuation, English articles and extra whitespace."""
    lowered = s.lower()
    # Delete every character listed in string.punctuation
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    # Replace stand-alone articles with a space
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    # Collapse all whitespace runs into single spaces and trim the ends
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between the normalized prediction and the normalized ground truth.

    Returns 1 when both are empty after normalization and 0 when they share no token.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()

    # Two empty answers count as a perfect match
    if not predicted and not expected:
      return 1

    # Multiset intersection: each shared token counts as often as it appears in both
    overlap = sum((Counter(predicted) & Counter(expected)).values())
    if overlap == 0:
        return 0

    precision = 1.0 * overlap / len(predicted)
    recall = 1.0 * overlap / len(expected)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when prediction and ground truth are identical after normalization."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score the prediction against every ground truth and return the highest score."""
    return max([metric_fn(prediction, truth) for truth in ground_truths])


def evaluate_squad(dataset, predictions):
    """Compute SQuAD-style exact-match and F1 percentages.

    :param dataset: List of articles in SQuAD layout; each has 'paragraphs', each
                    paragraph has 'qas', each qa has an 'id' and 'answers' (dicts with 'text')
    :param predictions: Mapping from question id to the predicted answer text
    :return: Dict with 'exact_match' and 'f1' on a 0-100 scale. A question without a
             prediction contributes 0 to both; an empty dataset yields 0.0 for both
    """
    f1 = exact_match = total = 0
    for article in dataset:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                total += 1
                if qa['id'] not in predictions:
                    # str() keeps this working for non-string ids; concatenating an
                    # integer id directly would raise TypeError
                    message = 'Unanswered question ' + str(qa['id']) + \
                              ' will receive score 0.'
                    print(message, file=sys.stderr)
                    continue
                ground_truths = [answer['text'] for answer in qa['answers']]
                prediction = predictions[qa['id']]
                exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
                f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    # Nothing to score: report zeros instead of dividing by zero
    if total == 0:
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

def compute_squad(predictions, references):
  """
  Adapts flat prediction/reference lists to the layout `evaluate_squad` expects
  :param predictions: List of dicts with `id` and `prediction_text`
  :param references: List of dicts with `id` and `answers` (a single answer text)
  :return: Whatever `evaluate_squad` returns for the adapted inputs
  """
  # Question id -> predicted text
  pred_dict = {}
  for prediction in predictions:
    pred_dict[prediction["id"]] = prediction["prediction_text"]

  # Every reference becomes one question with exactly one gold answer ...
  qas = []
  for ref in references:
    qas.append({"answers": [{"text": ref["answers"]}], "id": ref["id"]})

  # ... and all questions live in a single paragraph of a single article
  dataset = [{"paragraphs": [{"qas": qas}]}]

  return evaluate_squad(dataset=dataset, predictions=pred_dict)

In [49]:
# Score the predictions against the gold answers: exact match and token-level F1, both as percentages.
compute_squad(references=gold, predictions=formatted_predictions)

{'exact_match': 62.04869857262804, 'f1': 69.10885954901626}

## All in one function:

In [50]:
def zero_shot_eval(model_path, language):
    """Evaluate a saved question-answering model on one language's validation split.

    Relies on the notebook-level `dataset`, `tk` and `device` objects.

    :param model_path: Location passed to `AutoModelForQuestionAnswering.from_pretrained`
    :param language: Value of the dataset's `language` column to keep, e.g. "indonesian"
    :return: The dict returned by `compute_squad` (`exact_match` and `f1` percentages)
    """
    # Only the validation split is evaluated, so the training split is neither
    # filtered nor converted.
    validation_set = dataset["validation"].filter(lambda entry: entry["language"] == language)

    # Attach a sequential integer id that maps tokenized features back to their example
    validation_set_df = validation_set.to_pandas()
    validation_set_df['id'] = range(len(validation_set_df))
    validation_set = Dataset.from_pandas(validation_set_df)

    model = AutoModelForQuestionAnswering.from_pretrained(model_path).to(device)

    # Tokenize into features, dropping the original columns
    validation_dataset = validation_set.map(partial(get_validation_features, tk), batched=True, remove_columns=validation_set.column_names)

    val_dl = DataLoader(validation_dataset, collate_fn=val_collate_fn, batch_size=32)
    logits = predict(model, val_dl)

    predictions = post_process_predictions(validation_set, validation_dataset, logits, tk)
    formatted_predictions = [{'id': k, 'prediction_text': v} for k,v in predictions.items()]
    # Only the first annotated answer text of each example serves as the reference
    gold = [{'id': example['id'], 'answers': example['annotations']["answer_text"][0]} for example in validation_set]

    return compute_squad(references=gold, predictions=formatted_predictions)
    

## Bengali model

In [51]:
# Model from the `bengali-bert-2` input, evaluated on the Indonesian validation split.
bengali_indonesian_zero_shot = zero_shot_eval("/kaggle/input/bengali-bert-2", "indonesian")
bengali_indonesian_zero_shot

  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 38/38 [00:18<00:00,  2.05it/s]
100%|██████████| 1191/1191 [00:06<00:00, 192.74it/s]


{'exact_match': 57.59865659109992, 'f1': 60.15850996218458}

In [52]:
# Model from the `bengali-bert-2` input, evaluated on the Arabic validation split.
bengali_arabic_zero_shot = zero_shot_eval("/kaggle/input/bengali-bert-2", "arabic")
bengali_arabic_zero_shot

  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 8/8 [00:05<00:00,  1.59it/s]
100%|██████████| 224/224 [00:01<00:00, 169.05it/s]


{'exact_match': 65.17857142857143, 'f1': 71.27000231910947}

In [53]:
# Model from the `bengali-bert-2` input, evaluated on the Bengali validation split.
# NOTE(review): same language as the model's name, so presumably an in-language
# baseline rather than a zero-shot result -- confirm.
bengali_bengali_zero_shot = zero_shot_eval("/kaggle/input/bengali-bert-2", "bengali")
bengali_bengali_zero_shot

  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 62/62 [00:36<00:00,  1.68it/s]
100%|██████████| 1902/1902 [00:11<00:00, 172.68it/s]


{'exact_match': 56.098843322818084, 'f1': 58.272597262247125}

## Indonesian model

In [54]:
# Model from the `indonesian-bert-3` input, evaluated on the Arabic validation split.
indonesian_arabic_zeo_shot = zero_shot_eval("/kaggle/input/indonesian-bert-3", "arabic")
indonesian_arabic_zeo_shot

  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 8/8 [00:04<00:00,  1.67it/s]
100%|██████████| 224/224 [00:01<00:00, 167.95it/s]


{'exact_match': 56.25, 'f1': 61.72435203685203}

In [55]:
# Model from the `indonesian-bert-3` input, evaluated on the Bengali validation split.
# NOTE(review): the output recorded for this cell reports 1191 examples, the same count
# as the Indonesian validation split evaluated earlier, so it may be stale -- re-run
# before trusting the numbers.
indonesian_bengali_zeo_shot = zero_shot_eval("/kaggle/input/indonesian-bert-3", "bengali")
indonesian_bengali_zeo_shot

  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 38/38 [00:18<00:00,  2.04it/s]
100%|██████████| 1191/1191 [00:06<00:00, 190.93it/s]


{'exact_match': 75.56675062972292, 'f1': 80.50033006143278}

In [56]:
# Model from the `indonesian-bert-3` input, evaluated on the Indonesian validation split.
# NOTE(review): the output recorded for this cell reports 1902 examples, whereas the
# Indonesian validation split evaluated earlier has 1191, so it may be stale -- re-run
# before trusting the numbers.
indonesian_indonesian_zeo_shot = zero_shot_eval("/kaggle/input/indonesian-bert-3", "indonesian")
indonesian_indonesian_zeo_shot

  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 62/62 [00:36<00:00,  1.68it/s]
100%|██████████| 1902/1902 [00:11<00:00, 171.46it/s]


{'exact_match': 66.19348054679286, 'f1': 73.36365789378847}

## Arabic model

In [57]:
# Model from the `arabic-bert-2` input, evaluated on the Bengali validation split.
# NOTE(review): the output recorded for this cell reports 1191 examples (the Indonesian
# validation count) and scores identical to the first evaluation in this notebook, so
# it may be stale -- re-run before trusting the numbers.
arabic_ben_zero_shot = zero_shot_eval("/kaggle/input/arabic-bert-2", "bengali")
arabic_ben_zero_shot

  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 38/38 [00:18<00:00,  2.07it/s]
100%|██████████| 1191/1191 [00:06<00:00, 190.53it/s]


{'exact_match': 62.04869857262804, 'f1': 69.10885954901626}

In [58]:
# Model from the `arabic-bert-2` input, evaluated on the Indonesian validation split.
# NOTE(review): the output recorded for this cell reports 224 examples, whereas the
# Indonesian validation split evaluated earlier has 1191, so it may be stale -- re-run
# before trusting the numbers.
arabic_indo_zero_shot = zero_shot_eval("/kaggle/input/arabic-bert-2", "indonesian")
arabic_indo_zero_shot

  0%|          | 0/1 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 8/8 [00:04<00:00,  1.66it/s]
100%|██████████| 224/224 [00:01<00:00, 168.41it/s]


{'exact_match': 51.339285714285715, 'f1': 54.21977328227329}

In [59]:
# Model from the `arabic-bert-2` input, evaluated on the Arabic validation split.
# NOTE(review): the output recorded for this cell reports 1191 examples (the Indonesian
# validation count) and scores identical to the first evaluation in this notebook, so
# it may be stale -- re-run before trusting the numbers.
arabic_arabic_zero_shot = zero_shot_eval("/kaggle/input/arabic-bert-2", "arabic")
arabic_arabic_zero_shot

  0%|          | 0/2 [00:00<?, ?ba/s]

Evaluation: 100%|██████████| 38/38 [00:18<00:00,  2.03it/s]
100%|██████████| 1191/1191 [00:06<00:00, 188.80it/s]


{'exact_match': 62.04869857262804, 'f1': 69.10885954901626}