<a href="https://colab.research.google.com/github/MLFlexer/nlp-course/blob/Emma/w4_jens_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Preamble
import sys
sys.path.append('..')

In [2]:
!pip install datasets



In [3]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

print(len(df_train))
print(len(df_val))

df_train.head()


116067
13325


Unnamed: 0,question_text,document_title,language,annotations,document_plaintext,document_url
0,Milloin Charles Fort syntyi?,Charles Fort,finnish,"{'answer_start': [18], 'answer_text': ['6. elo...",Charles Hoy Fort (6. elokuuta (joidenkin lähte...,https://fi.wikipedia.org/wiki/Charles%20Fort
1,“ダン” ダニエル・ジャドソン・キャラハンの出身はどこ,ダニエル・J・キャラハン,japanese,"{'answer_start': [35], 'answer_text': ['カリフォルニ...",“ダン”こと、ダニエル・ジャドソン・キャラハンは1890年7月26日、カリフォルニア州サンフ...,https://ja.wikipedia.org/wiki/%E3%83%80%E3%83%...
2,వేప చెట్టు యొక్క శాస్త్రీయ నామం ఏమిటి?,వేప,telugu,"{'answer_start': [12], 'answer_text': ['Azadir...","వేప (లాటిన్ Azadirachta indica, syn. Melia aza...",https://te.wikipedia.org/wiki/%E0%B0%B5%E0%B1%...
3,চেঙ্গিস খান কোন বংশের রাজা ছিলেন ?,চেঙ্গিজ খান,bengali,"{'answer_start': [414], 'answer_text': ['বোরজি...",চেঙ্গিজ খান (মঙ্গোলীয়: Чингис Хаан আ-ধ্ব-ব: ...,https://bn.wikipedia.org/wiki/%E0%A6%9A%E0%A7%...
4,రెయ్యలగడ్ద గ్రామ విస్తీర్ణత ఎంత?,రెయ్యలగడ్ద,telugu,"{'answer_start': [259], 'answer_text': ['27 హె...","రెయ్యలగడ్ద, విశాఖపట్నం జిల్లా, గంగరాజు మాడుగుల...",https://te.wikipedia.org/wiki/%E0%B0%B0%E0%B1%...


In [4]:
# Get train and validation data for each language
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arabic = df_train[df_train['language'] == 'arabic']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']

df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arabic = df_val[df_val['language'] == 'arabic']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']


# For testing
df_val_english = df_val[df_val['language'] == 'english']
df_train_english = df_train[df_train['language'] == 'english']

print(len(df_train_english),len(df_val_english))




7389 990


In [5]:
df_train_english.head()
print(df_val_english.iloc[0]['annotations'])
print()
print(df_val_english.iloc[0]['document_plaintext'])

{'answer_start': array([51]), 'answer_text': array(['cleaning and protection from reinjury or infection'], dtype=object)}

Wound care encourages and speeds wound healing via cleaning and protection from reinjury or infection. Depending on each patient's needs, it can range from the simplest first aid to entire nursing specialties such as wound, ostomy, and continence nursing and burn center care.


In [6]:
def split_text(row):
    start = row['annotations']['answer_start'][0]
    answer = row['annotations']['answer_text'][0]
    text = row['document_plaintext']
    return pd.Series([text[:start], answer, text[start+len(answer):]])

def apply_split_and_create_df(df):
    df[['text_before_answer', 'text_answer', 'text_after_answer']] = df.apply(split_text, axis=1)
    df_merged = pd.DataFrame({
        'text_before_answer': df['text_before_answer'].apply(lambda x: x.split()),
        'text_answer': df['text_answer'].apply(lambda x: x.split()),
        'text_after_answer': df['text_after_answer'].apply(lambda x: x.split()),
        'question_text': df['question_text'].apply(lambda x: x.split()),
        'answerable': df['annotations'].apply(lambda x: 0 if x['answer_start'] == [-1] else 1),
        'annotations': df['annotations'],
    })
    return df_merged

def decorate_with_bio_tags(df):
    df['text_before_answer_BIO'] = df['text_before_answer'].apply(lambda x: ['O']*len(x))
    df['text_answer_BIO'] = df['text_answer'].apply(lambda x: ['B']+['I']*(len(x)-1) if len(x) > 0 else [])
    df['text_after_answer_BIO'] = df['text_after_answer'].apply(lambda x: ['O']*len(x))
    df['question_text_BIO'] = df['question_text'].apply(lambda x: ['O']*len(x))
    return df

def transform_bio_tags_to_numbers(df):
    df['BIO_tags'] = df['BIO_tags'].apply(lambda tags: [0 if tag == 'O' else 2 if tag == 'B' else 1 for tag in tags])
    return df

def merge_columns(df, include_question_text=True):

    df['tokens'] = df['text_before_answer'] + df['text_answer'] + df['text_after_answer']
    df['BIO_tags'] = df['text_before_answer_BIO'] + df['text_answer_BIO'] + df['text_after_answer_BIO']

    if include_question_text:
        df['tokens'] += df['question_text']
        df['BIO_tags'] += df['question_text_BIO']

    return df

def pad_sequences(df):
    max_len = df['tokens'].apply(len).max()
    df['tokens'] = df['tokens'].apply(lambda x: x + ['PAD']*(max_len - len(x)))
    df['BIO_tags'] = df['BIO_tags'].apply(lambda x: x + ['O']*(max_len - len(x)))
    return df

def drop_unnecessary_columns(df):
    df = df[['tokens', 'BIO_tags']]
    return df






def preprocess(df):
    df = apply_split_and_create_df(df)
    df = decorate_with_bio_tags(df)
    df = merge_columns(df)
    df = pad_sequences(df)
    df = transform_bio_tags_to_numbers(df)
    df = drop_unnecessary_columns(df)

    assert all(df['tokens'].apply(len) == df['BIO_tags'].apply(len)), "Mismatch in lengths of 'text' and 'BIO_tags'"

    return df



# train data
df_train_bengali = preprocess(df_train_bengali)
df_train_arabic = preprocess(df_train_arabic)
df_train_indonesian = preprocess(df_train_indonesian)
df_train_english = preprocess(df_train_english)

# validation data
df_val_bengali = preprocess(df_val_bengali)
df_val_arabic = preprocess(df_val_arabic)
df_val_indonesian = preprocess(df_val_indonesian)
df_val_english = preprocess(df_val_english)


# For testing TODO REMOVE
df_train_bengali = df_train_bengali.head(500)
df_train_arabic = df_train_arabic.head(500)
df_train_indonesian = df_train_indonesian.head(500)
df_train_english = df_train_english.head(500)

df_val_bengali = df_val_bengali.head(500)
df_val_arabic = df_val_arabic.head(500)
df_val_indonesian = df_val_indonesian.head(500)
df_val_english = df_val_english.head(500)



df_train_english.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['text_before_answer', 'text_answer', 'text_after_answer']] = df.apply(split_text, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['text_before_answer', 'text_answer', 'text_after_answer']] = df.apply(split_text, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['text_before_

Unnamed: 0,tokens,BIO_tags
26,"[Quantum, field, theory, naturally, began, wit...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
43,"[The, Nobel, Prize, in, Literature, (Swedish:,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
112,"[Dialectic, or, dialectics, (Greek:, διαλεκτικ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
123,"[Hangul, was, personally, created, and, promul...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, ..."
125,"[Grasshoppers, are, plant-eaters,, with, a, fe...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# From Lab

In [7]:
import io
from math import log
from numpy import array
from numpy import argmax
import torch
import random
from math import log
from numpy import array
from numpy import argmax
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR, CyclicLR
from typing import List, Tuple, AnyStr
from tqdm.notebook import tqdm
from sklearn.metrics import precision_recall_fscore_support
import matplotlib.pyplot as plt
from copy import deepcopy
from datasets import load_dataset, load_metric
from sklearn.metrics import confusion_matrix
import torch.nn.functional as F
import heapq

In [8]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

enforce_reproducibility()

## Download vector embeddings

### English

In [9]:
import requests
import gzip
import shutil
import os

compressed_file_name = 'cc.en.300.vec.gz'
uncompressed_file_name = 'cc.en.300.vec'
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"

if not os.path.exists(uncompressed_file_name):
    response = requests.get(url, stream=True)
    with open(compressed_file_name, 'wb') as f:
        f.write(response.content)
    with gzip.open(compressed_file_name, 'rb') as f_in:
        with open(uncompressed_file_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(compressed_file_name)


### Bengali

In [10]:
import requests
import gzip
import shutil
import os

compressed_file_name = 'cc.bn.300.vec.gz'
uncompressed_file_name = 'cc.bn.300.vec'
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz"

if not os.path.exists(uncompressed_file_name):
    response = requests.get(url, stream=True)
    with open(compressed_file_name, 'wb') as f:
        f.write(response.content)
    with gzip.open(compressed_file_name, 'rb') as f_in:
        with open(uncompressed_file_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(compressed_file_name)

### Indonesian

In [11]:
import requests
import gzip
import shutil
import os

compressed_file_name = 'cc.id.300.vec.gz'
uncompressed_file_name = 'cc.id.300.vec'
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz"

if not os.path.exists(uncompressed_file_name):
    response = requests.get(url, stream=True)
    with open(compressed_file_name, 'wb') as f:
        f.write(response.content)
    with gzip.open(compressed_file_name, 'rb') as f_in:
        with open(uncompressed_file_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(compressed_file_name)

### Arabic

In [12]:
import requests
import gzip
import shutil
import os

compressed_file_name = 'cc.ar.300.vec.gz'
uncompressed_file_name = 'cc.ar.300.vec'
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz"

if not os.path.exists(uncompressed_file_name):
    response = requests.get(url, stream=True)
    with open(compressed_file_name, 'wb') as f:
        f.write(response.content)
    with gzip.open(compressed_file_name, 'rb') as f_in:
        with open(uncompressed_file_name, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(compressed_file_name)

In [13]:
# Reduce down to our vocabulary and word embeddings
def load_vectors(fname, vocabulary):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    # tag_names = datasets["train"].features[f"ner_tags"].feature.names
    # final_vocab = ['[PAD]', '[UNK]', '[BOS]', '[EOS]']
    final_vocab = ['[PAD]', '[UNK]', 'B', 'I', 'O']
    final_vectors = [np.random.normal(size=(300,)) for _ in range(len(final_vocab))]
    for j,line in enumerate(fin):
        tokens = line.rstrip().split(' ')
        if tokens[0] in vocabulary or len(final_vocab) < 30000:
            final_vocab.append(tokens[0])
            final_vectors.append(np.array(list(map(float, tokens[1:]))))
    return final_vocab, np.vstack(final_vectors)

class FasttextTokenizer:
    def __init__(self, vocabulary):
        self.vocab = {}
        for j,l in enumerate(vocabulary):
            self.vocab[l.strip()] = j

    def encode(self, text):
        # Text is assumed to be tokenized
        return [self.vocab[t] if t in self.vocab else self.vocab['[UNK]'] for t in text]

In [14]:
def prepare_vocabulary_and_embeddings(df_train, df_val, vector_file):
    vocabulary = set([t for s in df_train['tokens'] for t in s] + [t for s in df_val['tokens'] for t in s])
    vocabulary, pretrained_embeddings = load_vectors(vector_file, vocabulary)
    print('size of vocabulary: ', len(vocabulary))
    print(len(pretrained_embeddings[0]))
    tokenizer = FasttextTokenizer(vocabulary)
    return pretrained_embeddings, tokenizer


In [15]:
# Define the model
class BiLSTM(nn.Module):
    """
    Basic BiLSTM-CRF network
    """
    def __init__(
            self,
            pretrained_embeddings: torch.tensor,
            lstm_dim: int,
            dropout_prob: float = 0.1,
            n_classes: int = 2
    ):
        """
        Initializer for basic BiLSTM network
        :param pretrained_embeddings: A tensor containing the pretrained BPE embeddings
        :param lstm_dim: The dimensionality of the BiLSTM network
        :param dropout_prob: Dropout probability
        :param n_classes: The number of output classes
        """

        # First thing is to call the superclass initializer
        super(BiLSTM, self).__init__()

        # We'll define the network in a ModuleDict, which makes organizing the model a bit nicer
        # The components are an embedding layer, a 2 layer BiLSTM, and a feed-forward output layer
        self.model = nn.ModuleDict({
            'embeddings': nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1),
            'bilstm': nn.LSTM(
                pretrained_embeddings.shape[1],  # input size
                lstm_dim,  # hidden size
                2,  # number of layers
                batch_first=True,
                dropout=dropout_prob,
                bidirectional=True),
            'ff': nn.Linear(2*lstm_dim, n_classes),
        })
        self.n_classes = n_classes
        self.loss = nn.CrossEntropyLoss()
        # Initialize the weights of the model
        self._init_weights()

    def _init_weights(self):
        all_params = list(self.model['bilstm'].named_parameters()) + \
                     list(self.model['ff'].named_parameters())
        for n,p in all_params:
            if 'weight' in n:
                nn.init.xavier_normal_(p)
            elif 'bias' in n:
                nn.init.zeros_(p)

    def forward(self, inputs, input_lens, hidden_states = None, labels = None):
        """
        Defines how tensors flow through the model
        :param inputs: (b x sl) The IDs into the vocabulary of the input samples
        :param input_lens: (b) The length of each input sequence
        :param labels: (b) The label of each sample
        :return: (loss, logits) if `labels` is not None, otherwise just (logits,)
        """

        # Get embeddings (b x sl x edim)
        embeds = self.model['embeddings'](inputs)

        # Pack padded: This is necessary for padded batches input to an RNN - https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch
        lstm_in = nn.utils.rnn.pack_padded_sequence(
            embeds,
            input_lens.cpu(),
            batch_first=True,
            enforce_sorted=False
        )

        # Pass the packed sequence through the BiLSTM
        if hidden_states:
            lstm_out, hidden = self.model['bilstm'](lstm_in, hidden_states)
        else:
            lstm_out, hidden = self.model['bilstm'](lstm_in)

        # Unpack the packed sequence --> (b x sl x 2*lstm_dim)
        lstm_out, lengths = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)

        # Get logits (b x seq_len x n_classes)
        logits = self.model['ff'](lstm_out)
        outputs = (logits, lengths)
        if labels is not None:
            loss = self.loss(logits.reshape(-1, self.n_classes), labels.reshape(-1))
            outputs =  outputs + (loss,)

        return outputs

In [16]:
def train(
    model: nn.Module,
    train_dl: DataLoader,
    valid_dl: DataLoader,
    optimizer: torch.optim.Optimizer,
    n_epochs: int,
    device: torch.device,
    scheduler=None,
):
    """
    The main training loop which will optimize a given model on a given dataset
    :param model: The model being optimized
    :param train_dl: The training dataset
    :param valid_dl: A validation dataset
    :param optimizer: The optimizer used to update the model parameters
    :param n_epochs: Number of epochs to train for
    :param device: The device to train on
    :return: (model, losses) The best model and the losses per iteration
    """

  # Keep track of the loss and best accuracy
    losses = []
    learning_rates = []
    best_f1 = 0.0

    # Iterate through epochs
    for ep in range(n_epochs):

        loss_epoch = []

        #Iterate through each batch in the dataloader
        for batch in tqdm(train_dl):
            # VERY IMPORTANT: Make sure the model is in training mode, which turns on
            # things like dropout and layer normalization
            model.train()

            # VERY IMPORTANT: zero out all of the gradients on each iteration -- PyTorch
            # keeps track of these dynamically in its computation graph so you need to explicitly
            # zero them out
            optimizer.zero_grad()

            # Place each tensor on the GPU
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            seq_lens = batch[1]
            labels = batch[2]

            # Pass the inputs through the model, get the current loss and logits
            logits, lengths, loss = model(input_ids, seq_lens, labels=labels)
            losses.append(loss.item())
            loss_epoch.append(loss.item())

            # Calculate all of the gradients and weight updates for the model
            loss.backward()

            # Optional: clip gradients
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Finally, update the weights of the model
            optimizer.step()
            if scheduler != None:
                scheduler.step()
                learning_rates.append(scheduler.get_last_lr()[0])

        # Perform inline evaluation at the end of the epoch
        f1 = evaluate(model, valid_dl)
        print(f'Validation F1: {f1}, train loss: {sum(loss_epoch) / len(loss_epoch)}')

        # Keep track of the best model based on the accuracy
        if f1 > best_f1:
            torch.save(model.state_dict(), 'best_model')
            best_f1 = f1

    return losses, learning_rates

In [17]:
def evaluate(model: nn.Module, valid_dl: DataLoader):
    """
    Evaluates the model on the given dataset
    :param model: The model under evaluation
    :param valid_dl: A `DataLoader` reading validation data
    :return: The accuracy of the model on the dataset
    """
    # VERY IMPORTANT: Put your model in "eval" mode -- this disables things like
    # layer normalization and dropout
    model.eval()
    labels_all = []
    preds_all = []

    # ALSO IMPORTANT: Don't accumulate gradients during this process
    with torch.no_grad():
        for batch in tqdm(valid_dl, desc='Evaluation'):
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            seq_lens = batch[1]
            labels = batch[2]
            hidden_states = None

            logits, _, _ = model(input_ids, seq_lens, hidden_states=hidden_states, labels=labels)
            preds_all.extend(torch.argmax(logits, dim=-1).reshape(-1).detach().cpu().numpy())
            labels_all.extend(labels.reshape(-1).detach().cpu().numpy())

    P, R, F1, _ = precision_recall_fscore_support(labels_all, preds_all, average='macro')
    print(confusion_matrix(labels_all, preds_all))
    return F1

In [18]:
lstm_dim = 128
dropout_prob = 0.1
batch_size = 8
lr = 1e-2
n_epochs = 10 # TODO CHANGE BACK TO 10
n_workers = 0  # set to a larger number if you run your code in colab

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

In [19]:
def convert_df_to_dict(df):
    return df.to_dict('records')

# English

In [20]:
# eng_pretrained_embeddings, eng_tokenizer = prepare_vocabulary_and_embeddings(df_train_english, df_val_english, 'cc.en.300.vec')

size of vocabulary:  35979
300


In [21]:
# def collate_batch_bilstm(input_data: Tuple) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
#     input_ids = [eng_tokenizer.encode(i['tokens']) for i in input_data]
#     seq_lens = [len(i) for i in input_ids]
#     labels = [i['BIO_tags'] for i in input_data]

#     max_length = max([len(i) for i in input_ids])

#     input_ids = [(i + [0] * (max_length - len(i))) for i in input_ids]
#     labels = [(i + [0] * (max_length - len(i))) for i in labels] # 0 is the id of the O tag

#     assert (all(len(i) == max_length for i in input_ids))
#     assert (all(len(i) == max_length for i in labels))
#     return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels)

In [22]:
# # Create the model
# model = BiLSTM(
#     pretrained_embeddings=torch.FloatTensor(eng_pretrained_embeddings),
#     lstm_dim=lstm_dim,
#     dropout_prob=dropout_prob,
#     # n_classes=len(datasets["train"].features[f"ner_tags"].feature.names)
#     n_classes=3
#   ).to(device)


# dict_train_english = convert_df_to_dict(df_train_english)
# dict_val_english = convert_df_to_dict(df_val_english)

# print(dict_train_english[0])


# train_dl = DataLoader(dict_train_english, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm, num_workers=n_workers)
# valid_dl = DataLoader(dict_val_english, batch_size=batch_size, collate_fn=collate_batch_bilstm, num_workers=n_workers)

# # Create the optimizer
# optimizer = Adam(model.parameters(), lr=lr)
# scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_dl)*n_epochs, cycle_momentum=False)

# # Train
# losses, learning_rates = train(model, train_dl, valid_dl, optimizer, n_epochs, device, scheduler)
# model.load_state_dict(torch.load('best_model'))

{'tokens': ['Quantum', 'field', 'theory', 'naturally', 'began', 'with', 'the', 'study', 'of', 'electromagnetic', 'interactions,', 'as', 'the', 'electromagnetic', 'field', 'was', 'the', 'only', 'known', 'classical', 'field', 'as', 'of', 'the', '1920s', '.[8]:1', 'When', 'was', 'quantum', 'field', 'theory', 'developed?', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 

  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[471635      0      0]
 [  1870      0      0]
 [   495      0      0]]
Validation F1: 0.3324996783466489, train loss: 0.044049880064521284


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

[[471632      2      1]
 [  1870      0      0]
 [   495      0      0]]
Validation F1: 0.3324986182080697, train loss: 0.006444508801140483


<All keys matched successfully>

# Bengali

In [20]:
beng_pretrained_embeddings, beng_tokenizer = prepare_vocabulary_and_embeddings(df_train_bengali, df_val_bengali, 'cc.bn.300.vec')

size of vocabulary:  35461
300


In [23]:
def collate_batch_bilstm_beng(input_data: Tuple) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_ids = [beng_tokenizer.encode(i['tokens']) for i in input_data]
    seq_lens = [len(i) for i in input_ids]
    labels = [i['BIO_tags'] for i in input_data]

    max_length = max([len(i) for i in input_ids])

    input_ids = [(i + [0] * (max_length - len(i))) for i in input_ids]
    labels = [(i + [0] * (max_length - len(i))) for i in labels] # 0 is the id of the O tag

    assert (all(len(i) == max_length for i in input_ids))
    assert (all(len(i) == max_length for i in labels))
    return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels)

In [25]:
# Create the model
model = BiLSTM(
    pretrained_embeddings=torch.FloatTensor(beng_pretrained_embeddings),
    lstm_dim=lstm_dim,
    dropout_prob=dropout_prob,
    # n_classes=len(datasets["train"].features[f"ner_tags"].feature.names)
    n_classes=3
  ).to(device)

dict_train_bengali = convert_df_to_dict(df_train_bengali)
dict_val_bengali = convert_df_to_dict(df_val_bengali)


train_dl = DataLoader(dict_train_bengali, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm_beng, num_workers=n_workers)
valid_dl = DataLoader(dict_val_bengali, batch_size=batch_size, collate_fn=collate_batch_bilstm_beng, num_workers=n_workers)

# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)
scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_dl)*n_epochs, cycle_momentum=False)

# Train
losses, learning_rates = train(model, train_dl, valid_dl, optimizer, n_epochs, device, scheduler)
model.load_state_dict(torch.load('best_model'))


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[128977      0      0]
 [   159      0      0]
 [   112      0      0]]
Validation F1: 0.3329835092135412, train loss: 0.05848399585201627


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128977      0      0]
 [   159      0      0]
 [   112      0      0]]
Validation F1: 0.3329835092135412, train loss: 0.013120330286227048


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128977      0      0]
 [   159      0      0]
 [   112      0      0]]
Validation F1: 0.3329835092135412, train loss: 0.012206274040398143


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128977      0      0]
 [   159      0      0]
 [   112      0      0]]
Validation F1: 0.3329835092135412, train loss: 0.011215566679657925


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128977      0      0]
 [   159      0      0]
 [   112      0      0]]
Validation F1: 0.3329835092135412, train loss: 0.010579755677590294


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128977      0      0]
 [   159      0      0]
 [   112      0      0]]
Validation F1: 0.3329835092135412, train loss: 0.009781659447721072


  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[128971      0      6]
 [   159      0      0]
 [   111      0      1]]
Validation F1: 0.33857928613203797, train loss: 0.009001981740492203


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128941     18     18]
 [   157      2      0]
 [   108      2      2]]
Validation F1: 0.3504122126943367, train loss: 0.008293591550595704


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128951     14     12]
 [   155      3      1]
 [   108      3      1]]
Validation F1: 0.34942441418516285, train loss: 0.0075933341350820326


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/28 [00:00<?, ?it/s]

[[128934     16     27]
 [   154      3      2]
 [   106      3      3]]
Validation F1: 0.3578807324036299, train loss: 0.006982528456738071


<All keys matched successfully>

# Arabic

In [26]:
arab_pretrained_embeddings, arb_tokenizer = prepare_vocabulary_and_embeddings(df_train_arabic, df_val_arabic, 'cc.ar.300.vec')

size of vocabulary:  40025
300


In [27]:
def collate_batch_bilstm_arabic(input_data: Tuple) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_ids = [arb_tokenizer.encode(i['tokens']) for i in input_data]
    seq_lens = [len(i) for i in input_ids]
    labels = [i['BIO_tags'] for i in input_data]

    max_length = max([len(i) for i in input_ids])

    input_ids = [(i + [0] * (max_length - len(i))) for i in input_ids]
    labels = [(i + [0] * (max_length - len(i))) for i in labels] # 0 is the id of the O tag

    assert (all(len(i) == max_length for i in input_ids))
    assert (all(len(i) == max_length for i in labels))
    return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels)

In [28]:
# Create the model
model = BiLSTM(
    pretrained_embeddings=torch.FloatTensor(arab_pretrained_embeddings),
    lstm_dim=lstm_dim,
    dropout_prob=dropout_prob,
    # n_classes=len(datasets["train"].features[f"ner_tags"].feature.names)
    n_classes=3
  ).to(device)

dict_train_arabic = convert_df_to_dict(df_train_arabic)
dict_val_arabic = convert_df_to_dict(df_val_arabic)


train_dl = DataLoader(dict_train_arabic, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm_arabic, num_workers=n_workers)
valid_dl = DataLoader(dict_val_arabic, batch_size=batch_size, collate_fn=collate_batch_bilstm_arabic, num_workers=n_workers)

# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)
scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_dl)*n_epochs, cycle_momentum=False)

# Train
losses, learning_rates = train(model, train_dl, valid_dl, optimizer, n_epochs, device, scheduler)
model.load_state_dict(torch.load('best_model'))

  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839783      0      0]
 [  2217      0      0]
 [   500      0      0]]
Validation F1: 0.33279497761870824, train loss: 0.04695203516721016


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839783      0      0]
 [  2217      0      0]
 [   500      0      0]]
Validation F1: 0.33279497761870824, train loss: 0.005480377055290673


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839783      0      0]
 [  2217      0      0]
 [   500      0      0]]
Validation F1: 0.33279497761870824, train loss: 0.005421925683520616


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839783      0      0]
 [  2217      0      0]
 [   500      0      0]]
Validation F1: 0.33279497761870824, train loss: 0.005082245376552381


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839783      0      0]
 [  2217      0      0]
 [   500      0      0]]
Validation F1: 0.33279497761870824, train loss: 0.005092094341913859


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839776      7      0]
 [  2201     16      0]
 [   500      0      0]]
Validation F1: 0.33755865833547055, train loss: 0.004859340754115865


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839771     12      0]
 [  2174     43      0]
 [   499      1      0]]
Validation F1: 0.3454131204132314, train loss: 0.004488336136712442


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839412    371      0]
 [  1802    415      0]
 [   484     16      0]]
Validation F1: 0.42444843912437014, train loss: 0.004207543681360899


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839402    381      0]
 [  1682    535      0]
 [   476     24      0]]
Validation F1: 0.44580642032626666, train loss: 0.003999447328082863


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[839218    565      0]
 [  1529    688      0]
 [   465     35      0]]
Validation F1: 0.46368657099989846, train loss: 0.003691212133696628


<All keys matched successfully>

# Indonestian

In [29]:
indo_pretrained_embeddings, indo_tokenizer = prepare_vocabulary_and_embeddings(df_train_indonesian, df_val_indonesian, 'cc.id.300.vec')

size of vocabulary:  35371
300


In [30]:
def collate_batch_bilstm_indo(input_data: Tuple) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_ids = [indo_tokenizer.encode(i['tokens']) for i in input_data]
    seq_lens = [len(i) for i in input_ids]
    labels = [i['BIO_tags'] for i in input_data]

    max_length = max([len(i) for i in input_ids])

    input_ids = [(i + [0] * (max_length - len(i))) for i in input_ids]
    labels = [(i + [0] * (max_length - len(i))) for i in labels] # 0 is the id of the O tag

    assert (all(len(i) == max_length for i in input_ids))
    assert (all(len(i) == max_length for i in labels))
    return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels)

In [31]:
# Create the model
model = BiLSTM(
    pretrained_embeddings=torch.FloatTensor(indo_pretrained_embeddings),
    lstm_dim=lstm_dim,
    dropout_prob=dropout_prob,
    # n_classes=len(datasets["train"].features[f"ner_tags"].feature.names)
    n_classes=3
  ).to(device)

dict_train_indonesian = convert_df_to_dict(df_train_indonesian)
dict_val_indonesian = convert_df_to_dict(df_val_indonesian)


train_dl = DataLoader(dict_train_indonesian, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm_indo, num_workers=n_workers)
valid_dl = DataLoader(dict_val_indonesian, batch_size=batch_size, collate_fn=collate_batch_bilstm_indo, num_workers=n_workers)

# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)
scheduler = CyclicLR(optimizer, base_lr=0., max_lr=lr, step_size_up=1, step_size_down=len(train_dl)*n_epochs, cycle_momentum=False)

# Train
losses, learning_rates = train(model, train_dl, valid_dl, optimizer, n_epochs, device, scheduler)
model.load_state_dict(torch.load('best_model'))

  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[686111      0      0]
 [  1889      0      0]
 [   500      0      0]]
Validation F1: 0.33275401792458614, train loss: 0.04248678953283363


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[686111      0      0]
 [  1889      0      0]
 [   500      0      0]]
Validation F1: 0.33275401792458614, train loss: 0.007496630724903847


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[686103      8      0]
 [  1838     51      0]
 [   493      7      0]]
Validation F1: 0.35015741968053615, train loss: 0.006371071414342002


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[[685580    531      0]
 [  1087    802      0]
 [   375    125      0]]
Validation F1: 0.4925945772551778, train loss: 0.00528789450237084


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

[[685689    421      1]
 [  1033    856      0]
 [   365    129      6]]
Validation F1: 0.5139727931788106, train loss: 0.0045796042744306814


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

[[685357    751      3]
 [   780   1104      5]
 [   324    164     12]]
Validation F1: 0.5365983551321601, train loss: 0.004122997811507611


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

[[685734    359     18]
 [   979    899     11]
 [   361    114     25]]
Validation F1: 0.546788980630675, train loss: 0.003648994483112816


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

[[685765    322     24]
 [   904    973     12]
 [   342    129     29]]
Validation F1: 0.5629595006885558, train loss: 0.003457345240305932


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

[[685494    582     35]
 [   791   1078     20]
 [   330    138     32]]
Validation F1: 0.5641733740498902, train loss: 0.0029319603668732774


  0%|          | 0/63 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

[[685673    407     31]
 [   845   1024     20]
 [   345    119     36]]
Validation F1: 0.572331262610608, train loss: 0.002655098107569511


<All keys matched successfully>