# Transfomer based models

If using google colab run below cell and restart runtime. (Remember to use GPU)

In [1]:
%%capture
!pip install transformers[torch]

In [2]:
%%capture
!pip install update transformers
!pip install datasets

In [1]:
from datasets import load_dataset, DatasetDict
from datasets import load_metric
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import AutoConfig
from functools import partial
import torch
import random
import numpy as np
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.optim.lr_scheduler import LambdaLR
from torch import nn
from collections import defaultdict, OrderedDict
from datasets import DatasetDict
#MODEL_NAME = 'xlm-roberta-base'
#MODEL_NAME = 'bert-base-uncased'

In [2]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

## Loading and preparing data

In [4]:
dataset = load_dataset("copenlu/answerable_tydiqa")
train_set = dataset["train"]
validation_set = dataset["validation"]

languages = ['indonesian', 'bengali', 'arabic']

# Creating a dictionary for the training and validation set, that holds all three languages.
train_set_dict = {}
val_set_dict = {}

for language in languages:
    print("Adding ", language, "to dict")
    train_set_dict[language] = train_set.filter(lambda example: example["language"] == language)
    val_set_dict[language] = validation_set.filter(lambda example: example["language"] == language)

Adding  indonesian to dict
Adding  bengali to dict
Adding  arabic to dict


In [4]:
# Making sure that our dataset has the same shape as SquAD_V2

def reformat_example(example, idx):
    example["id"] = str(idx)

    # Rename columns
    example["title"] = example.pop("document_title")
    example["context"] = example.pop("document_plaintext")
    example["question"] = example.pop("question_text")

    # Reformat the answers structure
    annotations = example.pop("annotations")
    example["answers"] = {
        "text": annotations["answer_text"],
        "answer_start": annotations["answer_start"]
    }



    return example

TEST_SIZE = 0.15

def training_split(train_dict, val_dict):
    dict_list = DatasetDict()

    for key in train_dict.keys():
        hugging_dict = DatasetDict()

        # Transform the train dataset
        hugging_dict['train'] = train_dict[key].select_columns(['question_text', 'document_title', 'document_plaintext', 'annotations'])
        hugging_dict['train'] = hugging_dict['train'].map(reformat_example, with_indices=True)

        # Transform the validation dataset
        hugging_dict['validation'] = val_dict[key].select_columns(['question_text', 'document_title', 'document_plaintext', 'annotations'])
        hugging_dict['validation'] = hugging_dict['validation'].map(reformat_example, with_indices=True)

        dict_list[key] = hugging_dict

    return dict_list


In [5]:
tydiqa = training_split(train_set_dict, val_set_dict)

In [6]:
compute_squad = load_metric("squad_v2")

  compute_squad = load_metric("squad_v2")


## Multilingual Bert model.

### Shared function

In [7]:
MODEL_NAME = "bert-base-multilingual-cased"
#MODEL_NAME = "indolem/indobert-base-uncased"
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

In [8]:
def get_train_features(tk, samples):
  '''
  Tokenizes all of the text in the given samples, splittling inputs that are too long for our model
  across multiple features. Finds the token offsets of the answers, which serve as the labels for
  our inputs.
  '''
  batch = tk.batch_encode_plus(
        [[q,c] for q,c in zip(samples['question'], samples['context'])],
        padding='max_length',
         max_length=512,
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

  # Get a list which maps the input features index to their original index in the
  # samples list (for split inputs). E.g. if our batch size is 4 and the second sample
  # is split into 3 inputs because it is very large, sample_mapping would look like
  # [0, 1, 1, 1, 2, 3]
  sample_mapping = batch.pop('overflow_to_sample_mapping')
  # Get all of the character offsets for each token
  offset_mapping = batch.pop('offset_mapping')

  # Store the start and end tokens
  batch['start_tokens'] = []
  batch['end_tokens'] = []

  # Iterate through all of the offsets
  for i, offsets in enumerate(offset_mapping):
    # Get the right sample by mapping it to its original index
    sample_idx = sample_mapping[i]
    # Get the sequence IDs to know where context starts so we can ignore question tokens
    sequence_ids = batch.sequence_ids(i)

    # Get the start and end character positions of the answer
    ans = samples['answers'][sample_idx]
    start_char = ans['answer_start'][0]
    end_char = start_char + len(ans['text'][0])
    # while end_char > 0 and (end_char >= len(samples['context'][sample_idx]) or samples['context'][sample_idx][end_char] == ' '):
    #   end_char -= 1

    # Start from the first token in the context, which can be found by going to the
    # first token where sequence_ids is 1
    start_token = 0
    while sequence_ids[start_token] != 1:
      start_token += 1

    end_token = len(offsets) - 1
    while sequence_ids[end_token] != 1:
      end_token -= 1

    # By default set it to the CLS token if the answer isn't in this input
    if start_char < offsets[start_token][0] or end_char > offsets[end_token][1]:
      start_token = 0
      end_token = 0
    # Otherwise find the correct token indices
    else:
      # Advance the start token index until we have passed the start character index
      while start_token < len(offsets) and offsets[start_token][0] <= start_char:
        start_token += 1
      start_token -= 1

      # Decrease the end token index until we have passed the end character index
      while end_token >= 0 and offsets[end_token][1] >= end_char:
        end_token -= 1
      end_token += 1

    batch['start_tokens'].append(start_token)
    batch['end_tokens'].append(end_token)

  #batch['start_tokens'] = np.array(batch['start_tokens'])
  #batch['end_tokens'] = np.array(batch['end_tokens'])

  return batch

def collate_fn(inputs):
  '''
  Defines how to combine different samples in a batch
  '''
  input_ids = torch.tensor([i['input_ids'] for i in inputs])
  attention_mask = torch.tensor([i['attention_mask'] for i in inputs])
  start_tokens = torch.tensor([i['start_tokens'] for i in inputs])
  end_tokens = torch.tensor([i['end_tokens'] for i in inputs])

  # Truncate to max length
  max_len = max(attention_mask.sum(-1))
  input_ids = input_ids[:,:max_len]
  attention_mask = attention_mask[:,:max_len]

  return {'input_ids': input_ids, 'attention_mask': attention_mask, 'start_tokens': start_tokens, 'end_tokens': end_tokens}

In [9]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    schedule: LambdaLR,
    n_epochs: int,
    device: torch.device
):
  """
  The main training loop which will optimize a given model on a given dataset
  :param model: The model being optimized
  :param train_dl: The training dataset
  :param optimizer: The optimizer used to update the model parameters
  :param n_epochs: Number of epochs to train for
  :param device: The device to train on
  """

  # Keep track of the loss and best accuracy
  losses = []
  best_acc = 0.0
  pcounter = 0

  # Iterate through epochs
  for ep in range(n_epochs):

    loss_epoch = []

    #Iterate through each batch in the dataloader
    for batch in tqdm(train_dl):
      # VERY IMPORTANT: Make sure the model is in training mode, which turns on
      # things like dropout and layer normalization
      model.train()

      # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
      # keeps track of these dynamically in its computation graph so you need to explicitly
      # zero them out
      optimizer.zero_grad()

      # Place each tensor on the GPU
      batch = {b: batch[b].to(device) for b in batch}

      # Pass the inputs through the model, get the current loss and logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask'],
          start_positions=batch['start_tokens'],
          end_positions=batch['end_tokens']
      )
      loss = outputs['loss']
      losses.append(loss.item())
      loss_epoch.append(loss.item())

      # Calculate all of the gradients and weight updates for the model
      loss.backward()

      # Optional: clip gradients
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Finally, update the weights of the model and advance the LR schedule
      optimizer.step()
      scheduler.step()
      #gc.collect()
  return losses

In [10]:
def get_validation_features(tk, samples):
  # First, tokenize the text. We get the offsets and return overflowing sequences in
  # order to break up long sequences into multiple inputs. The offsets will help us
  # determine the original answer text
  batch = tk.batch_encode_plus(
        [[q,c] for q,c in zip(samples['question'], samples['context'])],
        padding='max_length',
        max_length = 512,
        truncation='only_second',
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

  # We'll store the ID of the samples to calculate squad score
  batch['example_id'] = []
  # The overflow sample map tells us which input each sample corresponds to
  sample_map = batch.pop('overflow_to_sample_mapping')

  for i in range(len(batch['input_ids'])):
    # The sample index tells us which of the values in "samples" these features belong to
    sample_idx = sample_map[i]
    sequence_ids = batch.sequence_ids(i)

    # Add the ID to map these features back to the correct sample
    batch['example_id'].append(samples['id'][sample_idx])

    #Set offsets for non-context words to be None for ease of processing
    batch['offset_mapping'][i] = [o if sequence_ids[k] == 1 else None for k,o in enumerate(batch['offset_mapping'][i])]

  return batch

def val_collate_fn(inputs):
  input_ids = torch.tensor([i['input_ids'] for i in inputs])
  attention_mask = torch.tensor([i['attention_mask'] for i in inputs])

  # Truncate to max length
  max_len = max(attention_mask.sum(-1))
  input_ids = input_ids[:,:max_len]
  attention_mask = attention_mask[:,:max_len]

  return {'input_ids': input_ids, 'attention_mask': attention_mask}

In [11]:
def predict(model: nn.Module, valid_dl: DataLoader):
  """
  Evaluates the model on the given dataset
  :param model: The model under evaluation
  :param valid_dl: A `DataLoader` reading validation data
  :return: The accuracy of the model on the dataset
  """
  # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like
  # layer normalization and dropout
  model.eval()
  start_logits_all = []
  end_logits_all = []

  # ALSO IMPORTANT: Don't accumulate gradients during this process
  with torch.no_grad():
    for batch in tqdm(valid_dl, desc='Evaluation'):
      batch = {b: batch[b].to(device) for b in batch}

      # Pass the inputs through the model, get the current loss and logits
      outputs = model(
          input_ids=batch['input_ids'],
          attention_mask=batch['attention_mask']
      )
      # Store the "start" class logits and "end" class logits for every token in the input
      start_logits_all.extend(list(outputs['start_logits'].detach().cpu().numpy()))
      end_logits_all.extend(list(outputs['end_logits'].detach().cpu().numpy()))


    return start_logits_all,end_logits_all

def post_process_predictions(examples, dataset, logits, num_possible_answers = 20, max_answer_length = 30):
  all_start_logits, all_end_logits = logits
  # Build a map from example to its corresponding features. This will allow us to index from
  # sample ID to all of the features for that sample (in case they were split up due to long input)
  example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
  features_per_example = defaultdict(list)
  for i, feature in enumerate(dataset):
      features_per_example[example_id_to_index[feature["example_id"]]].append(i)

  # Create somewhere to store our predictions
  predictions = OrderedDict()

  # Iterate through each sample in the dataset
  for j, sample in enumerate(tqdm(examples)):

    # Get the feature indices (all of the features split across the batch)
    feature_indices = features_per_example[j]
    # Get the original context which predumably has the answer text
    context = sample['context']

    preds = []
    # Iterate through all of the features
    for ft_idx in feature_indices:

      # Get the start and end answer logits for this input
      start_logits = all_start_logits[ft_idx]
      end_logits = all_end_logits[ft_idx]

      # Get the offsets to map token indices to character indices
      offset_mapping = dataset[ft_idx]['offset_mapping']

      # Sort the logits and take the top N
      start_indices = np.argsort(start_logits)[::-1][:num_possible_answers]
      end_indices = np.argsort(end_logits)[::-1][:num_possible_answers]

      # Iterate through start and end indices
      for start_index in start_indices:
        for end_index in end_indices:

          # Ignore this combination if either the indices are not in the context
          if start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or offset_mapping[start_index] is None or offset_mapping[end_index] is None:
            continue

          # Also ignore if the start index is greater than the end index of the number of tokens
          # is greater than some specified threshold
          if start_index > end_index or end_index - start_index + 1 > max_answer_length:
            continue

          ans_text = context[offset_mapping[start_index][0]:offset_mapping[end_index][1]]
          preds.append({
              'score': start_logits[start_index] + end_logits[end_index],
              'text': ans_text
          })

    if len(preds) > 0:
      # Sort by score to get the top answer
      answer = sorted(preds, key=lambda x: x['score'], reverse=True)[0]
    else:
      answer = {'score': 0.0, 'text': ""}

    predictions[sample['id']] = answer['text']
  return predictions

### Indonesian

In [12]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [13]:
tokenized_dataset_indonesian = tydiqa['indonesian']['train'].map(partial(get_train_features, tk), batched=True, remove_columns=tydiqa['indonesian']['train'].column_names)


In [14]:
tokenized_dataset_indonesian = tokenized_dataset_indonesian.train_test_split(test_size = 0.2)['train']

In [15]:
#samples_indonesian = random.sample(list(range(len(tokenized_dataset_indonesian))), 4000)
#tokenized_dataset_indonesian = tokenized_dataset_indonesian.select(samples_indonesian)
train_dl_indonesian = DataLoader(tokenized_dataset_indonesian, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [16]:
model_indonesian = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Create the optimizer
lr=2e-5
n_epochs = 5
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model_indonesian.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model_indonesian.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl_indonesian)
)

In [18]:
train_dl_indonesian

<torch.utils.data.dataloader.DataLoader at 0x7872800f9ae0>

In [19]:
losses = train(
    model_indonesian,
    train_dl_indonesian,
    optimizer,
    scheduler,
    n_epochs,
    device
)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2323/2323 [03:08<00:00, 12.30it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2323/2323 [03:03<00:00, 12.69it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2323/2323 [03:02<00:00, 12.71it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2323/2323 [03:03<00:00, 12.66it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2323/2323 [03:04<00:00, 12.59it/s]


In [20]:
validation_dataset_indonesian = tydiqa['indonesian']['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=tydiqa['indonesian']['train'].column_names)

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

In [21]:
val_dl_indonesian = DataLoader(validation_dataset_indonesian, collate_fn=val_collate_fn, batch_size=32)
logits_indonesian = predict(model_indonesian, val_dl_indonesian)

Evaluation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 38/38 [00:10<00:00,  3.54it/s]


In [22]:
predictions_indonesian = post_process_predictions(tydiqa['indonesian']['validation'], validation_dataset_indonesian, logits_indonesian)
formatted_predictions_indonesian = [{'id': k, 'prediction_text': v, 'no_answer_probability': 0.5} for k,v in predictions_indonesian.items()]
references_indonesian = [{'answers': example['answers'], 'id' : example['id']} for example in tydiqa['indonesian']['validation']]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1191/1191 [00:04<00:00, 266.21it/s]


In [23]:
compute_squad.compute(predictions=formatted_predictions_indonesian, references = references_indonesian)

{'exact': 35.012594458438286,
 'f1': 41.56794422215618,
 'total': 1191,
 'HasAns_exact': 35.012594458438286,
 'HasAns_f1': 41.56794422215618,
 'HasAns_total': 1191,
 'best_exact': 35.012594458438286,
 'best_exact_thresh': 0.5,
 'best_f1': 41.56794422215618,
 'best_f1_thresh': 0.5}

### Bengali

In [None]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [None]:
tokenized_dataset_bengali = tydiqa['bengali']['train'].map(partial(get_train_features, tk), batched=True, remove_columns=tydiqa['bengali']['train'].column_names)


In [None]:
bengali_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#samples_bengali = random.sample(list(range(len(tokenized_dataset_bengali))), 4000)
#tokenized_dataset_bengali = tokenized_dataset_bengali.select(samples_bengali)
train_dl_bengali = DataLoader(tokenized_dataset_bengali, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [None]:
# Create the optimizer
lr=2e-5
n_epochs = 4
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in bengali_model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in bengali_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl_bengali)
)

losses = train(
    bengali_model,
    train_dl_bengali,
    optimizer,
    scheduler,
    n_epochs,
    device
)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1300/1300 [02:23<00:00,  9.04it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1300/1300 [02:21<00:00,  9.21it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1300/1300 [02:21<00:00,  9.20it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1300/1300 [02:21<00:00,  9.16it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1300/1300 [02:20<00:00,  9.27it/s]


In [None]:
validation_dataset_beng = tydiqa['bengali']['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=tydiqa['bengali']['train'].column_names)

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

In [None]:
val_beng_dl = DataLoader(validation_dataset_beng, collate_fn=val_collate_fn, batch_size=32)

In [None]:
logits_beng = predict(bengali_model, val_beng_dl)

Evaluation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:02<00:00,  3.07it/s]


In [None]:
predictions_beng = post_process_predictions(tydiqa['bengali']['validation'], validation_dataset_beng, logits_beng)
formatted_predictions_beng = [{'id': k, 'prediction_text': v, 'no_answer_probability': 0.5} for k,v in predictions_beng.items()]
references_beng = [{'answers': example['answers'], 'id' : example['id']} for example in tydiqa['bengali']['validation']]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 224/224 [00:00<00:00, 228.09it/s]


In [None]:
compute_squad.compute(predictions=formatted_predictions_beng, references = references_beng)

{'exact': 25.446428571428573,
 'f1': 32.11850649350648,
 'total': 224,
 'HasAns_exact': 25.446428571428573,
 'HasAns_f1': 32.11850649350648,
 'HasAns_total': 224,
 'best_exact': 25.446428571428573,
 'best_exact_thresh': 0.5,
 'best_f1': 32.11850649350648,
 'best_f1_thresh': 0.5}

### Arabic

In [None]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [None]:
tokenized_dataset_arabic = tydiqa['arabic']['train'].map(partial(get_train_features, tk), batched=True, remove_columns=tydiqa['arabic']['train'].column_names)


In [None]:
arabic_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#samples = random.sample(list(range(len(tokenized_dataset_arab))), 4000)
#tokenized_dataset_arabic = tokenized_dataset_arab.select(samples)
train_dl_arabic = DataLoader(tokenized_dataset_arabic, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [None]:
# Create the optimizer
lr=2e-5
n_epochs = 4
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in arabic_model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in arabic_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl_arabic)
)

losses = train(
    arabic_model,
    train_dl_arabic,
    optimizer,
    scheduler,
    n_epochs,
    device
)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7797/7797 [12:16<00:00, 10.58it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7797/7797 [12:15<00:00, 10.61it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7797/7797 [12:15<00:00, 10.60it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7797/7797 [12:17<00:00, 10.58it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7797/7797 [12:12<00:00, 10.64it/s]


In [None]:
validation_dataset_arabic = tydiqa['arabic']['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=tydiqa['arabic']['train'].column_names)

Map:   0%|          | 0/1902 [00:00<?, ? examples/s]

In [None]:
val_arab_dl = DataLoader(validation_dataset_arabic, collate_fn=val_collate_fn, batch_size=32)

In [None]:
logits_arabic = predict(arabic_model,val_arab_dl)

Evaluation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 62/62 [00:20<00:00,  3.01it/s]


In [None]:
predictions_arabic = post_process_predictions(tydiqa['arabic']['validation'], validation_dataset_arabic, logits_arabic)
formatted_predictions_arabic = [{'id': k, 'prediction_text': v, 'no_answer_probability': 0.5} for k,v in predictions_arabic.items()]
references_arabic = [{'answers': example['answers'], 'id' : example['id']} for example in tydiqa['arabic']['validation']]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1902/1902 [00:03<00:00, 479.25it/s]


In [None]:
compute_squad.compute(predictions=formatted_predictions_arabic, references = references_arabic)

{'exact': 31.70347003154574,
 'f1': 40.40158676404807,
 'total': 1902,
 'HasAns_exact': 31.70347003154574,
 'HasAns_f1': 40.40158676404807,
 'HasAns_total': 1902,
 'best_exact': 31.70347003154574,
 'best_exact_thresh': 0.5,
 'best_f1': 40.40158676404807,
 'best_f1_thresh': 0.5}

## Language specific transformers

### Indonesian: "cahya/bert-base-indonesian-522M"

In [None]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [66]:
#MODEL_NAME = "cahya/bert-base-indonesian-522M"
#MODEL_NAME = "indobenchmark/indobert-base-p2"
MODEL_NAME =  "indolem/indobert-base-uncased"
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

In [67]:
tokenized_dataset_indonesian = tydiqa['indonesian']['train'].map(partial(get_train_features, tk), batched=True, remove_columns=tydiqa['indonesian']['train'].column_names)


Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

In [68]:
#samples_indonesian = random.sample(list(range(len(tokenized_dataset_indonesian))), 4000)
#tokenized_dataset_indonesian = tokenized_dataset_indonesian.select(samples_indonesian)
train_dl_indonesian = DataLoader(tokenized_dataset_indonesian, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [69]:
model_indonesian = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
# Create the optimizer
lr=2e-5
n_epochs = 5
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model_indonesian.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in model_indonesian.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl_indonesian)
)

In [71]:
losses = train(
    model_indonesian,
    train_dl_indonesian,
    optimizer,
    scheduler,
    n_epochs,
    device
)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2887/2887 [03:15<00:00, 14.76it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2887/2887 [03:09<00:00, 15.23it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2887/2887 [03:08<00:00, 15.29it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2887/2887 [03:09<00:00, 15.20it/s]


In [72]:
validation_dataset_indonesian = tydiqa['indonesian']['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=tydiqa['indonesian']['train'].column_names)

val_dl_indonesian = DataLoader(validation_dataset_indonesian, collate_fn=val_collate_fn, batch_size=32)
logits_indonesian = predict(model_indonesian, val_dl_indonesian)

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

Evaluation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 38/38 [00:09<00:00,  4.04it/s]


In [73]:
predictions_indonesian = post_process_predictions(tydiqa['indonesian']['validation'], validation_dataset_indonesian, logits_indonesian)
formatted_predictions_indonesian = [{'id': k, 'prediction_text': v, 'no_answer_probability': 0.5} for k,v in predictions_indonesian.items()]
references_indonesian = [{'answers': example['answers'], 'id' : example['id']} for example in tydiqa['indonesian']['validation']]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1191/1191 [00:02<00:00, 413.60it/s]


In [74]:
compute_squad.compute(predictions=formatted_predictions_indonesian, references = references_indonesian)

{'exact': 44.24853064651553,
 'f1': 49.60680053655968,
 'total': 1191,
 'HasAns_exact': 44.24853064651553,
 'HasAns_f1': 49.60680053655968,
 'HasAns_total': 1191,
 'best_exact': 44.24853064651553,
 'best_exact_thresh': 0.5,
 'best_f1': 49.60680053655968,
 'best_f1_thresh': 0.5}

### Bengali: "csebuetnlp/banglishbert"

In [None]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [75]:
MODEL_NAME = "csebuetnlp/banglishbert"
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

In [76]:
tokenized_dataset_bengali = tydiqa['bengali']['train'].map(partial(get_train_features, tk), batched=True, remove_columns=tydiqa['bengali']['train'].column_names)


In [77]:
bengali_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at csebuetnlp/banglishbert and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
#samples_bengali = random.sample(list(range(len(tokenized_dataset_bengali))), 4000)
#tokenized_dataset_bengali = tokenized_dataset_bengali.select(samples_bengali)
train_dl_bengali = DataLoader(tokenized_dataset_bengali, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [79]:
# Create the optimizer
lr=2e-5
n_epochs = 5
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in bengali_model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in bengali_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl_bengali)
)

losses = train(
    bengali_model,
    train_dl_bengali,
    optimizer,
    scheduler,
    n_epochs,
    device
)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1220/1220 [01:36<00:00, 12.70it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1220/1220 [01:39<00:00, 12.25it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1220/1220 [01:31<00:00, 13.36it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1220/1220 [01:33<00:00, 13.09it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1220/1220 [01:31<00:00, 13.27it/s]


In [80]:
validation_dataset_beng = tydiqa['bengali']['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=tydiqa['bengali']['train'].column_names)

In [81]:
val_beng_dl = DataLoader(validation_dataset_beng, collate_fn=val_collate_fn, batch_size=32)

In [29]:
logits_beng = predict(bengali_model, val_beng_dl)

Evaluation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8/8 [00:01<00:00,  4.09it/s]


In [30]:
predictions_beng = post_process_predictions(tydiqa['bengali']['validation'], validation_dataset_beng, logits_beng)
formatted_predictions_beng = [{'id': k, 'prediction_text': v, 'no_answer_probability': 0.5} for k,v in predictions_beng.items()]
references_beng = [{'answers': example['answers'], 'id' : example['id']} for example in tydiqa['bengali']['validation']]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 224/224 [00:00<00:00, 430.02it/s]


In [82]:
compute_squad.compute(predictions=formatted_predictions_beng, references = references_beng)

{'exact': 33.482142857142854,
 'f1': 38.35197540554684,
 'total': 224,
 'HasAns_exact': 33.482142857142854,
 'HasAns_f1': 38.35197540554684,
 'HasAns_total': 224,
 'best_exact': 33.482142857142854,
 'best_exact_thresh': 0.5,
 'best_f1': 38.35197540554684,
 'best_f1_thresh': 0.5}

### Arabic: "aubmindlab/bert-base-arabertv02"

In [24]:
device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [25]:
MODEL_NAME = "aubmindlab/bert-base-arabertv02"
tk = AutoTokenizer.from_pretrained(MODEL_NAME)

In [26]:
tokenized_dataset_arabic = tydiqa['arabic']['train'].map(partial(get_train_features, tk), batched=True, remove_columns=tydiqa['arabic']['train'].column_names)


In [27]:
arabic_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
#samples = random.sample(list(range(len(tokenized_dataset_arabic))), 4000)
#tokenized_dataset_arabic = tokenized_dataset_arabic.select(samples)
train_dl_arabic = DataLoader(tokenized_dataset_arabic, collate_fn=collate_fn, shuffle=True, batch_size=4)

In [29]:
# Create the optimizer
lr=2e-5
n_epochs = 5
weight_decay = 0.01
warmup_steps = 200

no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in arabic_model.named_parameters() if not any(nd in n for nd in no_decay)],
      'weight_decay': weight_decay},
    {'params': [p for n, p in arabic_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# optimizer = Adam(optimizer_grouped_parameters, lr=1e-3)
# scheduler = None
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    warmup_steps,
    n_epochs * len(train_dl_arabic)
)

losses = train(
    arabic_model,
    train_dl_arabic,
    optimizer,
    scheduler,
    n_epochs,
    device
)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7529/7529 [09:09<00:00, 13.71it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7529/7529 [09:03<00:00, 13.84it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7529/7529 [09:04<00:00, 13.83it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7529/7529 [09:01<00:00, 13.91it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7529/7529 [09:03<00:00, 13.86it/s]


In [30]:
validation_dataset_arabic = tydiqa['arabic']['validation'].map(partial(get_validation_features, tk), batched=True, remove_columns=tydiqa['arabic']['train'].column_names)

In [31]:
val_arab_dl = DataLoader(validation_dataset_arabic, collate_fn=val_collate_fn, batch_size=32)
logits_arabic = predict(arabic_model,val_arab_dl)

Evaluation: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 61/61 [00:15<00:00,  3.82it/s]


In [32]:
predictions_arabic = post_process_predictions(tydiqa['arabic']['validation'], validation_dataset_arabic, logits_arabic)
formatted_predictions_arabic = [{'id': k, 'prediction_text': v, 'no_answer_probability': 0.5} for k,v in predictions_arabic.items()]
references_arabic = [{'answers': example['answers'], 'id' : example['id']} for example in tydiqa['arabic']['validation']]

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1902/1902 [00:05<00:00, 324.78it/s]


In [33]:
compute_squad.compute(predictions=formatted_predictions_arabic, references = references_arabic)

{'exact': 36.487907465825444,
 'f1': 45.14689457093472,
 'total': 1902,
 'HasAns_exact': 36.487907465825444,
 'HasAns_f1': 45.14689457093472,
 'HasAns_total': 1902,
 'best_exact': 36.487907465825444,
 'best_exact_thresh': 0.5,
 'best_f1': 45.14689457093472,
 'best_f1_thresh': 0.5}

# IOB model

Loading data

In [3]:
from datasets import load_dataset

df = load_dataset("copenlu/answerable_tydiqa")

language = "bengali"
df_train = df["train"].filter(lambda x: x["language"] == language)
#df_train = df_train.train_test_split(test_size = 0.02)['test']
df_val = df["validation"].filter(lambda x: x["language"] == language)
#df_val = df_val.train_test_split(test_size = 0.1)['test']

Defining new columns to tokenize only the text we need

In [4]:
def format(dataset):
    result = {}
    #Marking start of the sentence and where the question and plaintext separates
    result['sentence'] = '[CLS] ' + dataset['question_text'] + ' [SEP] ' + dataset['document_plaintext']
    #Where the answer starts in the document_plaintext
    answer_start = dataset['annotations']['answer_start'][0] + len(dataset['question_text']) + len("[CLS] ") + len(" [SEP] ")
    result['answer_start'] = answer_start
    #WHere the answer ends in the document_plaintext
    result['answer_end'] = answer_start + len(dataset['annotations']['answer_text'][0])
    #Where the answer starts in the document_plaintext
    #WHere the answer ends in the document_plaintext
    return result


def iob(ans_ids, tokens):
    result = []
    for answerID, token_word in enumerate(tokens):
        if token_word in ['[CLS]', '[SEP]']:
            result.append(-100)
        elif len(ans_ids) > 0 and answerID == ans_ids[0]:
            result.append(1)
        elif answerID in ans_ids:
            result.append(2)
        else:
            result.append(0)
    return result


def token_iob_labels(examples):
    sentence_tokens = tokenizer(examples["sentence"], truncation=True)
    answer_start = examples['answer_start']
    answer_end = examples['answer_end']
    sentence_token_id = []
    for id in range(len(sentence_tokens.tokens())):
        sentencespan = sentence_tokens.token_to_chars(batch_or_token_index=id)
        if sentencespan is not None:
            (elem1, _) = sentencespan
            if elem1 > answer_end:
                break
            elif elem1 >= answer_start:
                sentence_token_id.append(id)
    sentence_tokens["labels"] = iob(sentence_token_id, sentence_tokens.tokens())
    sentence_tokens['text_tokens'] = sentence_tokens.tokens()
    return sentence_tokens

Using the multilingual bert model

In [5]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, AdamW, get_scheduler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model)
model_auto = AutoModelForTokenClassification.from_pretrained(model, num_labels=3)
model_auto.to(device)
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
optimizer = AdamW(model_auto.parameters(), lr=2e-5)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
tokenized_datasets_train = df_train.map(format).map(
    token_iob_labels
)
tokenized_datasets_val = df_val.map(format).map(
    token_iob_labels
)

Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Extracting the columns we need (input ids, attention mask, iob labels and token type id)

In [7]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets_train.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), shuffle=True, batch_size=4, collate_fn=collator
)
eval_dataloader = DataLoader(
    tokenized_datasets_val.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), batch_size=4, collate_fn=collator
)

In [8]:
epochs = 1
num_class = 3

training

In [9]:
from tqdm.auto import tqdm

for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_auto(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        logits = outputs.get("logits").to(device)

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


predictions


In [10]:
from tqdm import tqdm
y_true_flat = []
y_pred_flat = []
y_true = []
y_pred = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true.append(val_data['labels'])
  y_true_flat.extend(val_data['labels'])
  y_pred.append(predictions[0].detach().cpu().numpy())
  y_pred_flat.extend(predictions[0].detach().cpu().numpy())

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 224/224 [00:06<00:00, 35.37it/s]


In [11]:
def compute_exact_match(predicted, true, replacement_value=0):
    assert len(predicted) == len(true), "Length of predicted and true labels must match"
    em = 0
    total = len(predicted)

    predicted = [[replacement_value if x == -100 else x for x in seq] for seq in predicted]
    true = [[replacement_value if x == -100 else x for x in seq] for seq in true]

    for pred_seq, true_seq in zip(predicted, true):
        if np.array_equal(pred_seq, true_seq):
            em += 1

    em_score = em / total
    return em_score

em_score_bengali = compute_exact_match(y_pred, y_true)

In [12]:
from sklearn.metrics import precision_recall_fscore_support
precision_bengali, recall_bengali, f1_bengali, _ = precision_recall_fscore_support(y_true_flat, y_pred_flat, average='macro', zero_division=1)

print("BENGALI")
print("*********")
print("F1: ", f1_bengali)

BENGALI
*********
F1:  0.49739710538218


ARABIC

In [13]:
language = "arabic"
df_train = df["train"].filter(lambda x: x["language"] == language)
#df_train = df_train.train_test_split(test_size = 0.1)['test']
df_val = df["validation"].filter(lambda x: x["language"] == language)
#df_val = df_val.train_test_split(test_size = 0.1)['test']


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model)
model_auto = AutoModelForTokenClassification.from_pretrained(model, num_labels=3)
model_auto.to(device)
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
optimizer = AdamW(model_auto.parameters(), lr=2e-5)


tokenized_datasets = df_train.map(format).map(
    token_iob_labels
)
tokenized_datasets_val = df_val.map(format).map(
    token_iob_labels
)

train_dataloader = DataLoader(
    tokenized_datasets.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), shuffle=True, batch_size=8, collate_fn=collator
)
eval_dataloader = DataLoader(
    tokenized_datasets_val.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), batch_size=8, collate_fn=collator
)

epochs = 1
num_class = 3

for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_auto(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        logits = outputs.get("logits").to(device)

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

y_true_flat_arab = []
y_pred_flat_arab = []
y_pred_arab = []
y_true_arab = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_arab.append(val_data['labels'])
  y_true_flat_arab.extend(val_data['labels'])
  y_pred_arab.append(predictions[0].detach().cpu().numpy())
  y_pred_flat_arab.extend(predictions[0].detach().cpu().numpy())


em_score_arab = compute_exact_match(y_pred_arab, y_true_arab)
precision_arab, recall_arab, f1_arab, _ = precision_recall_fscore_support(y_true_flat_arab, y_pred_flat_arab, average='macro', zero_division=1)
print("ARABIC")
print("*********")
print("F1: ", f1_arab)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/29598 [00:00<?, ? examples/s]

Map:   0%|          | 0/29598 [00:00<?, ? examples/s]

Map:   0%|          | 0/1902 [00:00<?, ? examples/s]

Map:   0%|          | 0/1902 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1902/1902 [00:52<00:00, 36.24it/s]


ARABIC
*********
F1:  0.5839945969542365


INDONESIAN

In [14]:
language = "indonesian"
df_train = df["train"].filter(lambda x: x["language"] == language)
#df_train = df_train.train_test_split(test_size = 0.1)['test']
df_val = df["validation"].filter(lambda x: x["language"] == language)
#df_val = df_val.train_test_split(test_size = 0.1)['test']


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model)
model_auto = AutoModelForTokenClassification.from_pretrained(model, num_labels=3)
model_auto.to(device)
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
optimizer = AdamW(model_auto.parameters(), lr=2e-5)


tokenized_datasets = df_train.map(format).map(
    token_iob_labels
)
tokenized_datasets_val = df_val.map(format).map(
    token_iob_labels
)

train_dataloader = DataLoader(
    tokenized_datasets.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), shuffle=True, batch_size=8, collate_fn=collator
)
eval_dataloader = DataLoader(
    tokenized_datasets_val.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), batch_size=8, collate_fn=collator
)

epochs = 1
num_class = 3

for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_auto(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        logits = outputs.get("logits").to(device)

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

y_true_flat_indo = []
y_pred_flat_indo = []
y_true_indo = []
y_pred_indo = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_indo.append(val_data['labels'])
  y_true_flat_indo.extend(val_data['labels'])
  y_pred_indo.append(predictions[0].detach().cpu().numpy())
  y_pred_flat_indo.extend(predictions[0].detach().cpu().numpy())


em_score_indo = compute_exact_match(y_pred_indo, y_true_indo)
precision_indo, recall_indo, f1_indo, _ = precision_recall_fscore_support(y_true_flat_indo, y_pred_flat_indo, average='macro', zero_division=1)

print("INDONESIAN")
print("*********")
print("F1: ", f1_indo)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1191/1191 [00:35<00:00, 33.80it/s]

INDONESIAN
*********
F1:  0.5463268754111743





RESULT TOGETHER

In [15]:
print()
print("BENGALI")
print("*********")
print(f"EM Score: {em_score_bengali}")
print(f"f1 Score: {f1_bengali}")
print(f"precision Score: {precision_bengali}")
print()
print()
print("ARABIC")
print("*********")
print(f"EM Score: {em_score_arab}")
print(f"f1 Score: {f1_arab}")
print(f"precision Score: {precision_arab}")
print()
print()
print("INDONESIAN")
print("*********")
print(f"EM Score: {em_score_indo}")
print(f"f1 Score: {f1_indo}")
print(f"precision Score: {precision_indo}")



BENGALI
*********
EM Score: 0.5223214285714286
f1 Score: 0.49739710538218
precision Score: 0.7967041697707318


ARABIC
*********
EM Score: 0.6393270241850684
f1 Score: 0.5839945969542365
precision Score: 0.8724910411972784


INDONESIAN
*********
EM Score: 0.5852225020990764
f1 Score: 0.5463268754111743
precision Score: 0.852818885666527
