### Installs

In [1]:
!pip install datasets
!pip install bpemb

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1


### Imports

In [2]:
import io
import torch
import numpy as np
import pandas as pd
import random
from bpemb import BPEmb
from datasets import load_dataset


from math import log
from numpy import array
from numpy import argmax
from math import log
from numpy import array
from numpy import argmax
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from typing import List, Tuple, AnyStr
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from copy import deepcopy
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
import heapq

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
def enforce_reproducibility(seed=42):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    and switches cuDNN to deterministic, non-benchmarking mode.

    :param seed: The integer seed shared by all generators
    """
    # Host-side generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU + every CUDA device)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN: disable autotuning and ask for deterministic kernels.
    # Atomic operations may still run in an unknown order, so this does
    # not guarantee full determinism.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


enforce_reproducibility()

### Data

In [4]:
embedding_dim = 50
vocab_size = 10000


def _load_bpemb(lang):
    """
    Load the BPEmb model for `lang` and derive its embedding matrix and vocab.

    :param lang: BPEmb language code (e.g. "bn", "ar", "id")
    :return: (bpemb model, embedding matrix with one extra all-zero row,
              vocabulary list with one extra '<pad>' entry)
    """
    model = BPEmb(lang=lang, dim=embedding_dim, vs=vocab_size, add_pad_emb=True)
    # Append an all-zero row for the extra [PAD] token.
    # NOTE(review): add_pad_emb=True presumably already appends a pad row; if so
    # the matrix ends up with two pad rows, and the id used for padding in
    # collate_batch_bilstm (vocab_size) is not the last row that BiLSTM passes
    # as padding_idx -- confirm.
    pad_row = np.zeros(shape=(1, embedding_dim))
    vectors = np.concatenate([model.emb.vectors, pad_row], axis=0)
    # Matching vocabulary with the extra [PAD] token at the end
    vocab = model.emb.index_to_key + ['<pad>']
    return model, vectors, vocab


bpemb_bn, embeddings_bn, vocabulary_bn = _load_bpemb("bn")  # Bengali
bpemb_ar, embeddings_ar, vocabulary_ar = _load_bpemb("ar")  # Arabic
bpemb_id, embeddings_id, vocabulary_id = _load_bpemb("id")  # Indonesian

downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs10000.model


100%|██████████| 471203/471203 [00:00<00:00, 838303.95B/s]


downloading https://nlp.h-its.org/bpemb/bn/bn.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1933584/1933584 [00:00<00:00, 2435735.37B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs10000.model


100%|██████████| 428120/428120 [00:00<00:00, 765758.51B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1928527/1928527 [00:00<00:00, 2400284.31B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs10000.model


100%|██████████| 396303/396303 [00:00<00:00, 702297.19B/s]


downloading https://nlp.h-its.org/bpemb/id/id.wiki.bpe.vs10000.d50.w2v.bin.tar.gz


100%|██████████| 1920574/1920574 [00:00<00:00, 2391860.02B/s]


In [5]:
# vocabulary_id and embeddings_id are already built in the data cell above
# (same expressions), so they are not recomputed here.

def labeller(row):
  """
  Turn one dataset row into model inputs and per-token answer labels.

  The sequence is the question followed by ' [sep] ' and then the document.
  The document is encoded in three separate pieces (text before the answer,
  the answer itself, text after the answer) so that the tokens belonging to
  the answer span can be marked exactly.

  row: a pandas.core.series.Series (one row of dataframe) providing
       `language`, `question_text`, `document_plaintext` and `annotations`
  returns: (token_ids, labels) where token_ids is a tensor of token ids and
           labels is a float tensor of the same length holding 1 on answer
           tokens and 0 elsewhere
  raises: ValueError if the row's language is not indonesian/bengali/arabic
  """
  # Pick the language-specific BPE encoder
  lang = row.language
  if lang == 'indonesian':
    encoder = bpemb_id
  elif lang == 'bengali':
    encoder = bpemb_bn
  elif lang == 'arabic':
    encoder = bpemb_ar
  else:
    raise ValueError(f'Language not supported: {row.language}')

  # Character span of the first annotated answer inside the document
  annotations = row.annotations
  ans_begin = annotations.get('answer_start')[0]
  ans_end = ans_begin + len(annotations.get('answer_text')[0])

  # Text pieces in the order they appear in the final sequence
  document = row.document_plaintext
  pieces = [
    row.question_text + ' [sep] ',  # question + separator
    document[:ans_begin],           # document text before the answer
    document[ans_begin:ans_end],    # answer in document text
    document[ans_end:],             # document text after the answer
  ]
  question_ids, before_ids, answer_ids, after_ids = [encoder.encode_ids(piece) for piece in pieces]

  all_ids = question_ids + before_ids + answer_ids + after_ids
  token_ids = torch.tensor(all_ids)

  # Mark the answer tokens: they start right after question + pre-answer tokens
  span_start = len(question_ids) + len(before_ids)
  span_stop = span_start + len(answer_ids)
  labels = torch.zeros(len(all_ids))
  labels[span_start:span_stop] = 1

  return token_ids, labels


In [6]:
dataset = load_dataset("copenlu/answerable_tydiqa")

# Keep only the three languages used in this notebook.
# .copy() makes each filtered frame an independent object, so the column
# assignments below write to it directly instead of to a slice of the parent
# frame (which triggers pandas' SettingWithCopyWarning).
train_df = dataset['train'].to_pandas()
train_df = train_df[train_df['language'].isin(['indonesian', 'arabic', 'bengali'])].copy()

val_df = dataset['validation'].to_pandas()
val_df = val_df[val_df['language'].isin(['indonesian', 'arabic', 'bengali'])].copy()

# create is_answerable column: the first answer_start is -1 for unanswerable questions
train_df['is_answerable'] = train_df['annotations'].apply(lambda x: x.get('answer_start', [-1])[0] != -1)
val_df['is_answerable'] = val_df['annotations'].apply(lambda x: x.get('answer_start', [-1])[0] != -1)

# only answerable questions (copied again because new columns are added next)
train_df = train_df[train_df['is_answerable']].copy()
val_df = val_df[val_df['is_answerable']].copy()

# tokenize every row and build the per-token answer labels
train_df[['inputs','labels']] = train_df.apply(labeller, result_type='expand', axis=1)
val_df[['inputs','labels']] = val_df.apply(labeller, result_type='expand', axis=1)

# divide train set
train_arab = train_df[train_df['language'] == 'arabic'][['inputs','labels']]
train_indo = train_df[train_df['language'] == 'indonesian'][['inputs','labels']]
train_beng = train_df[train_df['language'] == 'bengali'][['inputs','labels']]
# divide val set
val_arab = val_df[val_df['language'] == 'arabic'][['inputs','labels']]
val_indo = val_df[val_df['language'] == 'indonesian'][['inputs','labels']]
val_beng = val_df[val_df['language'] == 'bengali'][['inputs','labels']]

Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116067 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [7]:
def collate_batch_bilstm(
    input_data: List[Tuple[torch.Tensor, int, torch.Tensor]],
    pad_id=None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Collate a list of (input_ids, seq_len, labels) samples into padded batch tensors.

    Every sequence is right-padded to the length of the longest sequence in the
    batch: token ids with `pad_id`, labels with 0.

    :param input_data: The samples of one batch, each (input_ids, seq_len, labels)
    :param pad_id: Token id used for padding; when None (the default, which is
        what DataLoader uses) the module-level `vocab_size` is used
    :return: (input_ids, seq_lens, labels) -- input_ids and labels are
        (b x max_len), seq_lens is (b); labels are converted to int64
    """
    if pad_id is None:
        pad_id = vocab_size

    input_ids = [sample[0].tolist() for sample in input_data]
    seq_lens = [sample[1] for sample in input_data]
    labels = [sample[2].tolist() for sample in input_data]

    max_length = max(len(ids) for ids in input_ids)

    input_ids = [ids + [pad_id] * (max_length - len(ids)) for ids in input_ids]
    labels = [lab + [0] * (max_length - len(lab)) for lab in labels]

    # Sanity check: catches samples whose labels are longer than their inputs
    assert all(len(ids) == max_length for ids in input_ids)
    assert all(len(lab) == max_length for lab in labels)
    return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels).long()


class DatasetReader(Dataset):
  """
  Map-style dataset over a two-column dataframe.

  Column 0 is read as the token ids of a sample and column 1 as its labels.
  """

  def __init__(self, df):
    """
    :param df: Dataframe whose first column holds the inputs and whose
               second column holds the labels
    """
    self.df = df

  def __len__(self):
    """Number of samples (rows of the dataframe)."""
    return len(self.df)

  def __getitem__(self, idx):
    """
    :param idx: Positional index of the sample
    :return: (input_ids, seq_len, label) with seq_len = len(input_ids)
    """
    record = self.df.values[idx]
    tokens, target = record[0], record[1]
    return tokens, len(tokens), target

### Model

In [8]:
class BiLSTM(nn.Module):
    """
    Token-level classifier: pretrained embeddings -> (stacked) bidirectional
    LSTM -> linear layer producing `n_classes` logits for every position.
    """

    def __init__(
            self,
            pretrained_embeddings: torch.Tensor,
            lstm_dim: int,
            lstm_layers: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2
    ):
        """
        :param pretrained_embeddings: (vocab x emb_dim) embedding matrix; its
            last row is treated as the padding embedding
        :param lstm_dim: Hidden size of each LSTM direction
        :param lstm_layers: Number of stacked LSTM layers
        :param dropout_prob: Dropout passed to nn.LSTM
        :param n_classes: Number of output classes per token
        """
        super().__init__()

        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1),
            'bilstm': nn.LSTM(
                pretrained_embeddings.shape[1],  # input size
                lstm_dim,  # hidden dim
                lstm_layers,  # number of layers
                batch_first=True,
                dropout=dropout_prob,
                bidirectional=True),
            # forward and backward states are concatenated, hence 2 * lstm_dim
            'ff': nn.Linear(2*lstm_dim, n_classes),
        })
        self.n_classes = n_classes
        self.loss = nn.CrossEntropyLoss()
        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):
        """Xavier-normal init for LSTM/linear weights, zeros for their biases."""
        all_params = list(self.model['bilstm'].named_parameters()) + \
                     list(self.model['ff'].named_parameters())
        for n, p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, inputs, input_lens, hidden_states = None, labels = None):
        """
        Defines how tensors flow through the model
        :param inputs: (b x seq_len) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :param hidden_states: Optional initial (h_0, c_0) for the LSTM
        :param labels: (b x seq_len) The label of each token
        :return: (logits, lengths, loss) if `labels` is not None, otherwise
            (logits, lengths); logits are (b x seq_len x n_classes)
        """

        embeds = self.model['embeddings'](inputs)

        # Pack so the LSTM skips the padded positions
        lstm_in = nn.utils.rnn.pack_padded_sequence(
            embeds,
            input_lens.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        # nn.LSTM accepts None as "use zero initial states", so no truthiness
        # test on the (tensor-valued) hidden states is needed.
        lstm_out, _ = self.model['bilstm'](lstm_in, hidden_states)

        # total_length keeps the output as wide as `inputs`, so logits and
        # labels line up even when the batch is padded beyond its longest
        # sequence.
        lstm_out, lengths = nn.utils.rnn.pad_packed_sequence(
            lstm_out,
            batch_first=True,
            total_length=inputs.shape[1]
        )

        # Get logits (b x seq_len x n_classes)
        logits = self.model['ff'](lstm_out)
        outputs = (logits, lengths)
        if labels is not None:
            # NOTE: padded positions are not masked out of the loss here
            loss = self.loss(logits.reshape(-1, self.n_classes), labels.reshape(-1))
            outputs = outputs + (loss,)

        return outputs

### Define training and evaluation

In [65]:
from os import access
def train(
    model: nn.Module,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device: torch.device,
    scheduler=None,
):
    """
    The main training loop which will optimize a given model on a given dataset.

    The model is evaluated on `valid_dl` once, after the last epoch; the
    evaluation metrics are printed by `evaluate`.

    :param model: The model being optimized
    :param train_dl: The training dataset
    :param valid_dl: A validation dataset
    :param optimizer: The optimizer used to update the model parameters
    :param n_epochs: Number of epochs to train for
    :param device: The device to train on
    :param scheduler: Optional learning-rate scheduler, stepped after every batch
    :return: (losses, learning_rates) The training loss per iteration and, when
        a scheduler is given, the learning rate after every iteration
    """

    losses = []
    learning_rates = []

    for _ in range(n_epochs):
        model.train()

        for batch in tqdm(train_dl):
            optimizer.zero_grad()

            batch = tuple(t.to(device) for t in batch)
            input_ids, seq_lens, labels = batch[0], batch[1], batch[2]

            # The model returns (logits, lengths, loss) when labels are given
            _, _, loss = model(input_ids, seq_lens, labels=labels)
            losses.append(loss.item())

            loss.backward()

            # Optional: clip gradients with
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            if scheduler is not None:
                scheduler.step()
                learning_rates.append(scheduler.get_last_lr()[0])

    # Single evaluation pass after training has finished
    evaluate(model, valid_dl)

    return losses, learning_rates

def part_match(pred, target, batch_size):
  """
  Token-level metrics over the first `batch_size` rows of a batch.

  A position is a positive when target == 1. Every position of `pred` is
  counted, including padded ones.

  pred: (b x seq_len) predicted class per token (array or tensor)
  target: (b x seq_len) gold class per token (array or tensor)
  batch_size: number of rows to score
  returns: (acc, f1, precision, recall); a ratio whose denominator is zero
           is reported as 0 instead of raising
  """
  # Work on NumPy arrays so the counts can be vectorized; tensors (possibly
  # on the GPU) are moved to the host first.
  if hasattr(pred, 'detach'):
    pred = pred.detach().cpu().numpy()
  if hasattr(target, 'detach'):
    target = target.detach().cpu().numpy()
  pred = np.asarray(pred)[:batch_size]
  target = np.asarray(target)[:batch_size, :pred.shape[1]]

  is_positive = target == 1
  tp = int(np.sum(is_positive & (pred == 1)))   # true positives
  fn = int(np.sum(is_positive & (pred != 1)))   # false negatives
  tn = int(np.sum(~is_positive & (pred == 0)))  # true negatives
  fp = int(np.sum(~is_positive & (pred != 0)))  # false positives

  total = tp + tn + fp + fn
  acc = (tp + tn) / total if total else 0.0
  # No predicted positives / no gold positives -> ratio undefined, report 0
  precision = tp / (tp + fp) if (tp + fp) else 0.0
  recall = tp / (tp + fn) if (tp + fn) else 0.0
  if precision + recall:
    f1 = 2 * (precision * recall) / (precision + recall)
  else:
    print("Division by 0 error when calculating f1 score")
    f1 = 0
  return acc, f1, precision, recall

def exact_match(pred, target, batch_size):
  """
  Returns the number of correct predictions in the batch.

  A prediction is correct when the whole predicted row equals the target row.

  pred: (b x seq_len) predicted class per token (array or tensor)
  target: (b x seq_len) gold class per token (array or tensor)
  batch_size: number of rows to compare
  """
  # Compare NumPy with NumPy: `pred` usually arrives as an array while
  # `target` may still be a torch tensor, and `==` between the two is not a
  # dependable element-wise comparison.
  if hasattr(pred, 'detach'):
    pred = pred.detach().cpu().numpy()
  if hasattr(target, 'detach'):
    target = target.detach().cpu().numpy()
  pred_rows = np.asarray(pred)[:batch_size]
  target_rows = np.asarray(target)[:batch_size]
  return sum(1 for p, t in zip(pred_rows, target_rows) if np.array_equal(p, t))

def evaluate(model: nn.Module, valid_dl: DataLoader):
    """
    Run the model over a validation loader and print exact/partial match scores.

    Exact matches are accumulated over all batches. The partial-match metrics
    (accuracy, f1, precision, recall) are those of the last batch only, so
    they describe the full split only when the loader yields a single batch.

    :param model: The model to evaluate; called as model(input_ids, seq_lens)
        and expected to return the logits first
    :param valid_dl: Loader yielding (input_ids, seq_lens, labels) batches
    :return: The token-level f1 of the last batch (0.0 for an empty loader)
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a
    # module-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    n = 0
    c = 0
    acc = f1 = precision = recall = 0.0
    with torch.no_grad():
        for batch in tqdm(valid_dl, desc='Evaluation'):
            batch = tuple(t.to(model_device) for t in batch)
            input_ids, seq_lens, labels = batch[0], batch[1], batch[2]

            batch_size = labels.shape[0]
            n += batch_size

            # No labels are passed: only the logits are needed, not the loss
            logits = model(input_ids, seq_lens)[0]
            pred = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            # Hand the metrics host-side NumPy arrays for both arguments
            target = labels.detach().cpu().numpy()

            c += exact_match(pred, target, batch_size)
            acc, f1, precision, recall = part_match(pred, target, batch_size)

    if n == 0:
        # Nothing was evaluated; avoid dividing by zero below
        print('Evaluation skipped: validation loader is empty')
        return 0.0

    print(f'Excact matches: {c}/{n}, {(c/n)*100}%')
    print(f"""Partly matches (last batch)
        Acc: {acc},
        f1: {f1},
        precision: {precision},
        recall: {recall} """)

    return f1

### Hyperparams

In [48]:
# Model
lstm_dim = 256  # hidden size of each LSTM direction (the linear layer sees 2 * lstm_dim)
lstm_layers = 2  # number of stacked LSTM layers
n_classes = 2  # per-token classes: 1 = answer token, 0 = everything else
dropout_prob = 0.1  # dropout handed to nn.LSTM

batch_size = 8  # training batch size; the validation loaders use the whole split as one batch
lr = 1e-2  # Adam learning rate, also used as max_lr of the CyclicLR schedule
n_epochs = 10  # full passes over the training set
n_workers = 2  # DataLoader worker processes

### Train and evaluate

#### Arabic

In [66]:
# Data loaders (Arabic): shuffled mini-batches for training,
# the whole validation split as one single batch.
train_dl = DataLoader(
    DatasetReader(train_arab),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch_bilstm,
    num_workers=n_workers,
)
val_dl = DataLoader(
    DatasetReader(val_arab),
    batch_size=len(val_arab),
    collate_fn=collate_batch_bilstm,
    num_workers=n_workers,
)

# Model on top of the Arabic BPEmb embeddings
model = BiLSTM(
    torch.FloatTensor(embeddings_ar),
    lstm_dim,
    lstm_layers,
    dropout_prob,
    n_classes,
).to(device)

# Adam with a learning rate that jumps from 0 to `lr` in one step and then
# decays linearly over all training iterations.
optimizer = Adam(model.parameters(), lr=lr)
scheduler = CyclicLR(
    optimizer,
    base_lr=0.,
    max_lr=lr,
    step_size_up=1,
    step_size_down=len(train_dl)*n_epochs,
    cycle_momentum=False,
)

# Train
losses, learning_rates = train(
    model=model,
    train_dl=train_dl,
    valid_dl=val_dl,
    optimizer=optimizer,
    n_epochs=n_epochs,
    device=device,
    scheduler=scheduler,
)
#model.load_state_dict(torch.load('best_model'))



  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

  0%|          | 0/1851 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Excact matches: 0/951, 0.0%
Partly matches (last batch)
        Acc: 0.9974369832844655, 
        f1: 0.7026576920408021, 
        precision: 0.8204504942472857, 
        recall: 0.6144417475728156 


#### Bengali

In [67]:
# Data loaders (Bengali): shuffled mini-batches for training,
# the whole validation split as one single batch.
train_dl = DataLoader(
    DatasetReader(train_beng),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch_bilstm,
    num_workers=n_workers,
)
val_dl = DataLoader(
    DatasetReader(val_beng),
    batch_size=len(val_beng),
    collate_fn=collate_batch_bilstm,
    num_workers=n_workers,
)

# Model on top of the Bengali BPEmb embeddings
model = BiLSTM(
    torch.FloatTensor(embeddings_bn),
    lstm_dim,
    lstm_layers,
    dropout_prob,
    n_classes,
).to(device)

# Adam with a learning rate that jumps from 0 to `lr` in one step and then
# decays linearly over all training iterations.
optimizer = Adam(model.parameters(), lr=lr)
scheduler = CyclicLR(
    optimizer,
    base_lr=0.,
    max_lr=lr,
    step_size_up=1,
    step_size_down=len(train_dl)*n_epochs,
    cycle_momentum=False,
)

# Train
losses, learning_rates = train(
    model=model,
    train_dl=train_dl,
    valid_dl=val_dl,
    optimizer=optimizer,
    n_epochs=n_epochs,
    device=device,
    scheduler=scheduler,
)

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

  0%|          | 0/299 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Excact matches: 0/112, 0.0%
Partly matches (last batch)
        Acc: 0.9954527998882837, 
        f1: 0.2651622002820874, 
        precision: 0.38524590163934425, 
        recall: 0.2021505376344086 


#### Indonesian

In [68]:
# Data loaders (Indonesian): shuffled mini-batches for training,
# the whole validation split as one single batch.
train_dl = DataLoader(
    DatasetReader(train_indo),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch_bilstm,
    num_workers=n_workers,
)
val_dl = DataLoader(
    DatasetReader(val_indo),
    batch_size=len(val_indo),
    collate_fn=collate_batch_bilstm,
    num_workers=n_workers,
)

# Model on top of the Indonesian BPEmb embeddings
model = BiLSTM(
    torch.FloatTensor(embeddings_id),
    lstm_dim,
    lstm_layers,
    dropout_prob,
    n_classes,
).to(device)

# Adam with a learning rate that jumps from 0 to `lr` in one step and then
# decays linearly over all training iterations.
optimizer = Adam(model.parameters(), lr=lr)
scheduler = CyclicLR(
    optimizer,
    base_lr=0.,
    max_lr=lr,
    step_size_up=1,
    step_size_down=len(train_dl)*n_epochs,
    cycle_momentum=False,
)

# Train
losses, learning_rates = train(
    model=model,
    train_dl=train_dl,
    valid_dl=val_dl,
    optimizer=optimizer,
    n_epochs=n_epochs,
    device=device,
    scheduler=scheduler,
)

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Excact matches: 0/597, 0.0%
Partly matches (last batch)
        Acc: 0.9959722387958556, 
        f1: 0.6337084673097534, 
        precision: 0.7711118356700358, 
        recall: 0.5378667273140778 
