In [1]:
!pip install datasets
!pip install update transformers

Collecting update
  Downloading update-0.0.1-py2.py3-none-any.whl (2.9 kB)
Collecting style==1.1.0 (from update)
  Downloading style-1.1.0-py2.py3-none-any.whl (6.4 kB)
Installing collected packages: style, update
Successfully installed style-1.1.0 update-0.0.1


In [3]:
import torch
torch.cuda.empty_cache()

In [4]:
import os
import numpy as np
from collections import Counter
import torch
import datasets
datasets.logging.set_verbosity_error()
from datasets import load_metric
# from google.colab import drive
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
import pandas as pd
import random

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertModel


# # uncomment if CAN'T CONNECT TO GPU
# import psutil
# import platform

In [5]:
# add-in as occasionally receive an error which requires this to be added
# uncomment if the issue arises
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.22.0
    Uninstalling accelerate-0.22.0:
      Successfully uninstalled accelerate-0.22.0
Successfully installed accelerate-0.24.0


In [6]:
def enforce_reproducibility(seed=42):
    # Sets seed manually for both CPU and CUDA
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For atomic operations there is currently
    # no simple way to enforce determinism, as
    # the order of parallel operations is not known.
    # CUDNN
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # System based
    random.seed(seed)
    np.random.seed(seed)

device = torch.device("cpu")
if torch.cuda.is_available():
  device = torch.device("cuda")

enforce_reproducibility()

In [7]:
# Preamble
import sys

sys.path.append('..')

# Preprocessing

In [8]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

df_train = train_set.to_pandas()
df_val = validation_set.to_pandas()

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading and preparing dataset None/None (download: 75.43 MiB, generated: 131.78 MiB, post-processed: Unknown size, total: 207.21 MiB) to /root/.cache/huggingface/datasets/parquet/copenlu--nlp_course_tydiqa-42333912ea665dd0/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/copenlu--nlp_course_tydiqa-42333912ea665dd0/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Get train and validation data for each language
df_train_bengali = df_train[df_train['language'] == 'bengali']
df_train_arabic = df_train[df_train['language'] == 'arabic']
df_train_indonesian = df_train[df_train['language'] == 'indonesian']

df_val_bengali = df_val[df_val['language'] == 'bengali']
df_val_arabic = df_val[df_val['language'] == 'arabic']
df_val_indonesian = df_val[df_val['language'] == 'indonesian']


# For testing
df_val_english = df_val[df_val['language'] == 'english']
df_train_english = df_train[df_train['language'] == 'english']


In [10]:
# Create a new dataframe with the combined documents and questions and add if they are answerable
df_train_bengali_merged = pd.DataFrame({
    'text':df_train_bengali["document_plaintext"],
    'question': df_train_bengali["question_text"],
    'answerable':(df_train_bengali["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
    })
df_train_arabic_merged = pd.DataFrame({
    'text': df_train_arabic["document_plaintext"],
    'question': df_train_arabic["question_text"],
    'answerable': (df_train_arabic["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
                                    })
df_train_indonesian_merged = pd.DataFrame({
    'text':df_train_indonesian["document_plaintext"],
    'question': df_train_indonesian["question_text"],
    'answerable':(df_train_indonesian["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
    })
df_train_english_merged = pd.DataFrame({
    'text':df_train_english["document_plaintext"],
    'question': df_train_english["question_text"],
    'answerable':(df_train_english["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
    })


## Same for validation data
df_val_bengali_merged = pd.DataFrame({
    'text':df_val_bengali["document_plaintext"],
    'question': df_val_bengali["question_text"],
    'answerable':(df_val_bengali["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
    })
df_val_arabic_merged = pd.DataFrame({
    'text': df_val_arabic["document_plaintext"],
    'question': df_val_arabic["question_text"],
    'answerable': (df_val_arabic["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
                                    })
df_val_indonesian_merged = pd.DataFrame({
    'text':df_val_indonesian["document_plaintext"],
    'question': df_val_indonesian["question_text"],
    'answerable':(df_val_indonesian["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
    })
df_val_english_merged = pd.DataFrame({
    'text':df_val_english["document_plaintext"],
    'question':  df_val_english["question_text"],
    'answerable':(df_val_english["annotations"].apply(lambda x: 0 if x['answer_start'] == [-1] else 1))
    })

# Tokenization of text

In [11]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")

def tokenize_text(df, max_length=128):
    input_ids = []
    attention_masks = []


    encoded_text = tokenizer(
        df["question"].tolist(),
        df["text"].tolist(),
        max_length=max_length,
        padding="max_length",
        truncation='only_second',
        return_attention_mask=True,
        return_tensors="pt"
    )

    return encoded_text


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

### Indonesian

In [20]:
# For Indonesian

train_tokenized_text_indonesian = tokenize_text(df_train_indonesian_merged)
val_tokenized_text_indonesian = tokenize_text(df_val_indonesian_merged)


train_input_ids_indonesian = torch.cat([train_tokenized_text_indonesian["input_ids"]], dim=0)
train_attention_masks_indonesian = torch.cat([train_tokenized_text_indonesian["attention_mask"]], dim=0)
val_input_ids_indonesian = torch.cat([val_tokenized_text_indonesian["input_ids"]], dim=0)
val_attention_masks_indonesian = torch.cat([val_tokenized_text_indonesian["attention_mask"]], dim=0)

train_labels_indonesian = torch.tensor(df_train_indonesian_merged["answerable"].tolist())
val_labels_indonesian = torch.tensor(df_val_indonesian_merged["answerable"].tolist())


batch_size = 16

train_data_indonesian = TensorDataset(train_input_ids_indonesian.to('cuda'), train_attention_masks_indonesian.to('cuda'), train_labels_indonesian.to('cuda'))
train_sampler_indonesian = RandomSampler(train_data_indonesian)
train_dataloader_indonesian = DataLoader(train_data_indonesian, sampler=train_sampler_indonesian, batch_size=batch_size)

val_data_indonesian = TensorDataset(val_input_ids_indonesian.to('cuda'), val_attention_masks_indonesian.to('cuda'), val_labels_indonesian.to('cuda'))
val_sampler_indonesian = SequentialSampler(val_data_indonesian)
val_dataloader_indonesian = DataLoader(val_data_indonesian, sampler=val_sampler_indonesian, batch_size=batch_size)



### Bengali

In [19]:
# For Bengali

train_tokenized_text_bengali = tokenize_text(df_train_bengali_merged)
val_tokenized_text_bengali = tokenize_text(df_val_bengali_merged)


train_input_ids_bengali = torch.cat([train_tokenized_text_bengali["input_ids"]], dim=0)
train_attention_masks_bengali = torch.cat([train_tokenized_text_bengali["attention_mask"]], dim=0)
val_input_ids_bengali = torch.cat([val_tokenized_text_bengali["input_ids"]], dim=0)
val_attention_masks_bengali = torch.cat([val_tokenized_text_bengali["attention_mask"]], dim=0)

train_labels_bengali = torch.tensor(df_train_bengali_merged["answerable"].tolist())
val_labels_bengali = torch.tensor(df_val_bengali_merged["answerable"].tolist())

batch_size = 16

train_data_bengali = TensorDataset(train_input_ids_bengali.to('cuda'), train_attention_masks_bengali.to('cuda'), train_labels_bengali.to('cuda'))
train_sampler_bengali = RandomSampler(train_data_bengali)
train_dataloader_bengali = DataLoader(train_data_bengali, sampler=train_sampler_bengali, batch_size=batch_size)

val_data_bengali = TensorDataset(val_input_ids_bengali.to('cuda'), val_attention_masks_bengali.to('cuda'), val_labels_bengali.to('cuda'))
val_sampler_bengali = SequentialSampler(val_data_bengali)
val_dataloader_bengali = DataLoader(val_data_bengali, sampler=val_sampler_bengali, batch_size=batch_size)


### Arabic

In [13]:
# For Arabic

train_tokenized_text_arabic = tokenize_text(df_train_arabic_merged)
val_tokenized_text_arabic = tokenize_text(df_val_arabic_merged)


train_input_ids_arabic = torch.cat([train_tokenized_text_arabic["input_ids"]], dim=0)
train_attention_masks_arabic = torch.cat([train_tokenized_text_arabic["attention_mask"]], dim=0)
val_input_ids_arabic = torch.cat([val_tokenized_text_arabic["input_ids"]], dim=0)
val_attention_masks_arabic = torch.cat([val_tokenized_text_arabic["attention_mask"]], dim=0)

train_labels_arabic = torch.tensor(df_train_arabic_merged["answerable"].tolist())
val_labels_arabic = torch.tensor(df_val_arabic_merged["answerable"].tolist())

batch_size = 16

train_data_arabic = TensorDataset(train_input_ids_arabic.to('cuda'), train_attention_masks_arabic.to('cuda'), train_labels_arabic.to('cuda'))
train_sampler_arabic = RandomSampler(train_data_arabic)
train_dataloader_arabic = DataLoader(train_data_arabic, sampler=train_sampler_arabic, batch_size=batch_size)

val_data_arabic = TensorDataset(val_input_ids_arabic.to('cuda'), val_attention_masks_arabic.to('cuda'), val_labels_arabic.to('cuda'))
val_sampler_arabic = SequentialSampler(val_data_arabic)
val_dataloader_arabic = DataLoader(val_data_arabic, sampler=val_sampler_arabic, batch_size=batch_size)


### Week 41 - Zero-shot cross-lingual evaluation on classifier

First, using Arabic as the training set and evaluating on Indonesian and Bengali.

In [21]:
arabic_model = AutoModelForSequenceClassification.from_pretrained("./arabic_classification")
arabic_model.to('cuda')

arabic_model.eval()
predictions_bengali = []
true_labels_bengali = []
for batch in tqdm(val_dataloader_bengali, desc=f"Evaluating"):
    inputs = batch[:2]
    labels = batch[2]
    with torch.no_grad():
        outputs = arabic_model(*inputs)
    logits = outputs.logits
    predictions_bengali.extend(logits.argmax(dim=1).tolist())
    true_labels_bengali.extend(labels.tolist())

accuracy_bengali = accuracy_score(true_labels_bengali, predictions_bengali)
report_bengali = classification_report(true_labels_bengali, predictions_bengali, target_names=["Not Answerable", "Answerable"], digits=5)
print(f"Evaluating on Bengali: Accuracy: {accuracy_bengali:.4f} - Average Loss: {average_loss:.4f}")
print(report_bengali)

arabic_model.eval()
predictions_indonesian = []
true_labels_indonesian = []
for batch in tqdm(val_dataloader_indonesian, desc=f"Evaluating"):
    inputs = batch[:2]
    labels = batch[2]
    with torch.no_grad():
        outputs = arabic_model(*inputs)
    logits = outputs.logits
    predictions_indonesian.extend(logits.argmax(dim=1).tolist())
    true_labels_indonesian.extend(labels.tolist())

accuracy_indonesian = accuracy_score(true_labels_indonesian, predictions_indonesian)
report_indonesian = classification_report(true_labels_indonesian, predictions_indonesian, target_names=["Not Answerable", "Answerable"], digits=5)
print(f"Evaluating on Indonesian: Accuracy: {accuracy_indonesian:.4f} - Average Loss: {average_loss:.4f}")
print(report_indonesian)



Evaluating: 100%|██████████| 14/14 [00:00<00:00, 17.19it/s]


Evaluating on Bengali: Accuracy: 0.7545 - Average Loss: 0.0939
                precision    recall  f1-score   support

Not Answerable    0.72441   0.82143   0.76987       112
    Answerable    0.79381   0.68750   0.73684       112

      accuracy                        0.75446       224
     macro avg    0.75911   0.75446   0.75336       224
  weighted avg    0.75911   0.75446   0.75336       224



Evaluating: 100%|██████████| 75/75 [00:04<00:00, 17.75it/s]

Evaluating on Indonesian: Accuracy: 0.8816 - Average Loss: 0.0939
                precision    recall  f1-score   support

Not Answerable    0.88325   0.87879   0.88101       594
    Answerable    0.88000   0.88442   0.88221       597

      accuracy                        0.88161      1191
     macro avg    0.88162   0.88160   0.88161      1191
  weighted avg    0.88162   0.88161   0.88161      1191






Using Bengali as the training language, and evaluating on Arabic and Indonesian.

In [22]:
bengali_model = AutoModelForSequenceClassification.from_pretrained("./bengali_classification")

bengali_model.to('cuda')

bengali_model.eval()
predictions_arabic = []
true_labels_arabic = []
for batch in tqdm(val_dataloader_arabic, desc=f"Evaluating"):
    inputs = batch[:2]
    labels = batch[2]
    with torch.no_grad():
        outputs = bengali_model(*inputs)
    logits = outputs.logits
    predictions_arabic.extend(logits.argmax(dim=1).tolist())
    true_labels_arabic.extend(labels.tolist())

accuracy_arabic = accuracy_score(true_labels_arabic, predictions_arabic)
report_arabic = classification_report(true_labels_arabic, predictions_arabic, target_names=["Not Answerable", "Answerable"], digits=5)
print(f"Evaluating on Arabic: Accuracy: {accuracy_arabic:.4f} - Average Loss: {average_loss:.4f}")
print(report_arabic)

bengali_model.eval()
predictions_indonesian = []
true_labels_indonesian = []
for batch in tqdm(val_dataloader_indonesian, desc=f"Evaluating"):
    inputs = batch[:2]
    labels = batch[2]
    with torch.no_grad():
        outputs = bengali_model(*inputs)
    logits = outputs.logits
    predictions_indonesian.extend(logits.argmax(dim=1).tolist())
    true_labels_indonesian.extend(labels.tolist())

accuracy_indonesian = accuracy_score(true_labels_indonesian, predictions_indonesian)
report_indonesian = classification_report(true_labels_indonesian, predictions_indonesian, target_names=["Not Answerable", "Answerable"], digits=5)
print(f"Evaluating on Indonesian: Accuracy: {accuracy_indonesian:.4f} - Average Loss: {average_loss:.4f}")
print(report_indonesian)

Evaluating: 100%|██████████| 119/119 [00:06<00:00, 17.58it/s]


Evaluating on Arabic: Accuracy: 0.8728 - Average Loss: 0.0939
                precision    recall  f1-score   support

Not Answerable    0.89171   0.84858   0.86961       951
    Answerable    0.85557   0.89695   0.87577       951

      accuracy                        0.87277      1902
     macro avg    0.87364   0.87277   0.87269      1902
  weighted avg    0.87364   0.87277   0.87269      1902



Evaluating: 100%|██████████| 75/75 [00:04<00:00, 17.75it/s]

Evaluating on Indonesian: Accuracy: 0.8212 - Average Loss: 0.0939
                precision    recall  f1-score   support

Not Answerable    0.80976   0.83838   0.82382       594
    Answerable    0.83333   0.80402   0.81841       597

      accuracy                        0.82116      1191
     macro avg    0.82154   0.82120   0.82112      1191
  weighted avg    0.82157   0.82116   0.82111      1191






Using Indonesian as training dataset and Bengali and Arabic for validation.

In [23]:
indonesian_model = AutoModelForSequenceClassification.from_pretrained("./indonesian_classification")
indonesian_model.to('cuda')

indonesian_model.eval()
predictions_arabic = []
true_labels_arabic = []
for batch in tqdm(val_dataloader_arabic, desc=f"Evaluating"):
    inputs = batch[:2]
    labels = batch[2]
    with torch.no_grad():
        outputs = indonesian_model(*inputs)
    logits = outputs.logits
    predictions_arabic.extend(logits.argmax(dim=1).tolist())
    true_labels_arabic.extend(labels.tolist())

accuracy_arabic = accuracy_score(true_labels_arabic, predictions_arabic)
report_arabic = classification_report(true_labels_arabic, predictions_arabic, target_names=["Not Answerable", "Answerable"], digits=5)
print(f"Evaluating on Arabic: Accuracy: {accuracy_arabic:.4f} - Average Loss: {average_loss:.4f}")
print(report_arabic)

indonesian_model.eval()
predictions_bengali = []
true_labels_bengali = []
for batch in tqdm(val_dataloader_bengali, desc=f"Evaluating Epoch {epoch + 1}"):
    inputs = batch[:2]
    labels = batch[2]
    with torch.no_grad():
        outputs = indonesian_model(*inputs)
    logits = outputs.logits
    predictions_bengali.extend(logits.argmax(dim=1).tolist())
    true_labels_bengali.extend(labels.tolist())

accuracy_bengali = accuracy_score(true_labels_bengali, predictions_bengali)
report_bengali = classification_report(true_labels_bengali, predictions_bengali, target_names=["Not Answerable", "Answerable"], digits=5)
print(f"Evaluating on Bengali: Accuracy: {accuracy_bengali:.4f} - Average Loss: {average_loss:.4f}")
print(report_bengali)

Evaluating: 100%|██████████| 119/119 [00:06<00:00, 17.60it/s]


Evaluating on Arabic: Accuracy: 0.9085 - Average Loss: 0.0939
                precision    recall  f1-score   support

Not Answerable    0.93701   0.87592   0.90543       951
    Answerable    0.88351   0.94111   0.91141       951

      accuracy                        0.90852      1902
     macro avg    0.91026   0.90852   0.90842      1902
  weighted avg    0.91026   0.90852   0.90842      1902



Evaluating Epoch 4: 100%|██████████| 14/14 [00:00<00:00, 17.54it/s]

Evaluating on Bengali: Accuracy: 0.8125 - Average Loss: 0.0939
                precision    recall  f1-score   support

Not Answerable    0.83654   0.77679   0.80556       112
    Answerable    0.79167   0.84821   0.81897       112

      accuracy                        0.81250       224
     macro avg    0.81410   0.81250   0.81226       224
  weighted avg    0.81410   0.81250   0.81226       224




