# Week 41
Inspired by: https://huggingface.co/docs/transformers/tasks/sequence_classification

The purpose of this section is to evaluate the models from previous sections by training them on one language and testing their performance on a different language. Specifically, this involves the binary classification model from Week 38 and the BERT sequence classifier from Week 39.

In [1]:
%pip install datasets
%pip install googletrans==3.1.0a0
%pip install spicy
%pip install accelerate -U
%pip install transformers[torch]

from googletrans import Translator


import pandas as pd
import copy
import numpy as np
import torch
import nltk
import string
nltk.download('punkt')
from datasets import load_dataset
from tabulate import tabulate
from googletrans import Translator

# stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import gensim.downloader
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import random
from scipy.sparse import hstack as sparse_hstack

Note: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaspe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
languages = ['indonesian', 'bengali', 'arabic']

In [3]:
import pickle
file_path = 'train_set_dict.pkl'
with open(file_path, 'rb') as file:
    loaded_data = pickle.load(file)
train_set_dict = loaded_data['any']
print(train_set_dict)

file_path = 'val_set_dict.pkl'
with open(file_path, 'rb') as file:
    loaded_data = pickle.load(file)
val_set_dict = loaded_data['any']
print(val_set_dict)


{'indonesian': Dataset({
    features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'question_words', 'answer_words', 'doc_words', 'doc_text_words', 'question_text_eng', 'doc_plaintext_eng', 'question_words_eng', 'doc_text_words_eng'],
    num_rows: 11394
}), 'bengali': Dataset({
    features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'question_words', 'answer_words', 'doc_words', 'doc_text_words', 'question_text_eng', 'doc_plaintext_eng', 'question_words_eng', 'doc_text_words_eng'],
    num_rows: 4779
}), 'arabic': Dataset({
    features: ['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'question_words', 'answer_words', 'doc_words', 'doc_text_words', 'question_text_eng', 'doc_plaintext_eng', 'question_words_eng', 'doc_text_words_eng'],
    num_rows: 29598
})}
{'indonesian': Dataset({
    features: ['question_text', 'docu

# Logistic Regression with BPemb

In [4]:
!pip install bpemb
from bpemb import BPEmb



In [13]:
def run_classifier(X_train, y_train, X_test, y_test):
  # Define hyperparams for search
  # C = np.logspace(-6, 2, 50)
  # warm_start = [False, True]
  # class_weight = ['balanced', None]

  # hp = {"C": C, "warm_start": warm_start, 'class_weight': class_weight}

  classifier = LogisticRegression(penalty='l2', max_iter=1000)
  # classifier_random = RandomizedSearchCV(
  #     estimator=classifier,
  #     param_distributions=hp,
  #     n_iter=100,
  #     cv=5,
  #     verbose=2,
  #     random_state=1000,
  #     n_jobs=-1,
  #     scoring='f1'
  # )

  # classifier_random.fit(X_train, y_train)
  # print(classifier_random.best_params_)
  # print(classifier_random.best_score_)
  # model = classifier_random.best_estimator_
  classifier.fit(X_train,y_train)
  preds = classifier.predict(X_test)
  print(classification_report(y_test, preds))


In [28]:
bpemb_id = BPEmb(lang='id', dim=100, vs=25000)
bpemb_bn = BPEmb(lang='bn', dim=100, vs=25000)
bpemb_ar = BPEmb(lang='ar', dim=100, vs=25000)

bpemb_mapping = {
    "indonesian": bpemb_id,
    "bengali": bpemb_bn,
    "arabic": bpemb_ar
  }


def get_bpemb_features(dataset, bpemb):
  # With bpemb we can tokenize and embed an entire document using .embed(x)
  X = [bpemb.embed(x).mean(0) for x in tqdm(dataset[:,0])]
  y = list(dataset[:,1])
  return X,y


In [29]:
training_data = {
    'indonesian': (None, None), #X_train, y_train
    'bengali': (None, None), #X_train, y_train
    'arabic': (None, None) #X_train, y_train
}
testing_data = {
    'indonesian': (None, None), #X_test, y_test
    'bengali': (None, None), #X_test, y_test
    'arabic': (None, None) #X_test, y_test
}

In [30]:
training_data = {}
testing_data = {}

for lang in languages:
    X_train = [item['question_text'] + ' [SEP] ' + item['document_plaintext'] for item in train_data]
    X_test = [item['question_text'] + ' [SEP] ' + item['document_plaintext'] for item in test_data]
    
    # Get the appropriate bpemb model for the current language
    bpemb_model = bpemb_mapping[lang]
    
    X_train, y_train = get_bpemb_features(np.transpose((X_train, y_train)), bpemb_model)
    X_test, y_test = get_bpemb_features(np.transpose((X_test, y_test)), bpemb_model)
    
    # Populate training_data and testing_data for the current language
    training_data[lang] = (X_train, y_train)
    testing_data[lang] = (X_test, y_test)


100%|██████████| 29598/29598 [00:10<00:00, 2717.11it/s]
100%|██████████| 224/224 [00:00<00:00, 3225.79it/s]
100%|██████████| 29598/29598 [00:10<00:00, 2785.22it/s]
100%|██████████| 224/224 [00:00<00:00, 2640.99it/s]
100%|██████████| 29598/29598 [00:10<00:00, 2809.21it/s]
100%|██████████| 224/224 [00:00<00:00, 3421.35it/s]


In [19]:
for lang in languages:
    X_train = [item['question_text'] + ' [SEP] ' + item['document_plaintext'] for item in train_data]
    X_test = [item['question_text'] + ' [SEP] ' + item['document_plaintext'] for item in test_data]
    X_train, y_train = get_bpemb_features(np.transpose((X_train, y_train)), bpemb_model)
    X_test, y_test = get_bpemb_features(np.transpose((X_test, y_test)), bpemb_model)
    
    # Populate training_data and testing_data for the current language
    training_data[lang] = (X_train, y_train)
    testing_data[lang] = (X_test, y_test)

for lang in languages:
    if lang in bpemp_mapping:
        bpemb_model = globals()[f'bpemb_{bpemp_mapping[lang]}']  # Get the corresponding BPEmb model
    else:
        print(f"No BPEmb model found for language: {lang}")
        continue

    # Split data into train and test sets
    train_data = train_set_dict[lang]
    
    # Create a list of test languages other than the current language
    test_languages = [l for l in languages if l != lang]
    
    for test_lang in test_languages:
        print('Training on ', lang, ' data.', 'Testing on ', test_lang, ' data.')
        test_data = val_set_dict[test_lang]
        
        y_train = [0 if not answer else 1 for answer in train_data['answer_words']]
        y_test = [0 if not answer else 1 for answer in test_data['answer_words']]

        X_train = [item['question_text'] + ' [SEP] ' + item['document_plaintext'] for item in train_data]
        X_test = [item['question_text'] + ' [SEP] ' + item['document_plaintext'] for item in test_data]

        X_train, y_train = get_bpemb_features(np.transpose((X_train, y_train)), bpemb_model)
        X_test, y_test = get_bpemb_features(np.transpose((X_test, y_test)), bpemb_model)

        # Train and test your classifier
        run_classifier(X_train, y_train, X_test, y_test)



Training on  indonesian  data. Testing on  bengali  data.


100%|██████████| 11394/11394 [00:03<00:00, 3313.17it/s]
100%|██████████| 224/224 [00:00<00:00, 3793.08it/s]


              precision    recall  f1-score   support

           0       0.88      0.20      0.32       112
           1       0.55      0.97      0.70       112

    accuracy                           0.58       224
   macro avg       0.71      0.58      0.51       224
weighted avg       0.71      0.58      0.51       224

Training on  indonesian  data. Testing on  arabic  data.


100%|██████████| 11394/11394 [00:04<00:00, 2635.28it/s]
100%|██████████| 1902/1902 [00:00<00:00, 3135.46it/s]


              precision    recall  f1-score   support

           0       1.00      0.01      0.02       956
           1       0.50      1.00      0.67       946

    accuracy                           0.50      1902
   macro avg       0.75      0.50      0.34      1902
weighted avg       0.75      0.50      0.34      1902

Training on  bengali  data. Testing on  indonesian  data.


100%|██████████| 4779/4779 [00:01<00:00, 2487.11it/s]
100%|██████████| 1191/1191 [00:00<00:00, 2964.45it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67       597
           1       0.00      0.00      0.00       594

    accuracy                           0.50      1191
   macro avg       0.25      0.50      0.33      1191
weighted avg       0.25      0.50      0.33      1191

Training on  bengali  data. Testing on  arabic  data.


100%|██████████| 4779/4779 [00:02<00:00, 2366.58it/s]
100%|██████████| 1902/1902 [00:00<00:00, 3177.02it/s]


              precision    recall  f1-score   support

           0       1.00      0.02      0.03       956
           1       0.50      1.00      0.67       946

    accuracy                           0.51      1902
   macro avg       0.75      0.51      0.35      1902
weighted avg       0.75      0.51      0.35      1902

Training on  arabic  data. Testing on  indonesian  data.


100%|██████████| 29598/29598 [00:10<00:00, 2708.73it/s]
100%|██████████| 1191/1191 [00:00<00:00, 2460.25it/s]


              precision    recall  f1-score   support

           0       0.50      1.00      0.67       597
           1       0.00      0.00      0.00       594

    accuracy                           0.50      1191
   macro avg       0.25      0.50      0.33      1191
weighted avg       0.25      0.50      0.33      1191

Training on  arabic  data. Testing on  bengali  data.


100%|██████████| 29598/29598 [00:10<00:00, 2717.36it/s]
100%|██████████| 224/224 [00:00<00:00, 2894.91it/s]


              precision    recall  f1-score   support

           0       0.60      0.69      0.64       112
           1       0.63      0.54      0.58       112

    accuracy                           0.61       224
   macro avg       0.61      0.61      0.61       224
weighted avg       0.61      0.61      0.61       224



# Sequence Labeller

In [3]:
from datasets import load_dataset

df = load_dataset("copenlu/answerable_tydiqa")

In [4]:
df_train_bengali = df["train"].filter(lambda x: x["language"] == "bengali")
df_train_arab = df["train"].filter(lambda x: x["language"] == "arabic")
df_train_indo = df["train"].filter(lambda x: x["language"] == "indonesian")

#df_train_bengali = df_train_bengali.train_test_split(test_size = 0.1)['test']
#df_train_arab = df_train_arab.train_test_split(test_size = 0.1)['test']
#df_train_indo = df_train_indo.train_test_split(test_size = 0.1)['test']

df_val_bengali = df["validation"].filter(lambda x: x["language"] == "bengali")
df_val_arab = df["validation"].filter(lambda x: x["language"] == "arabic")
df_val_indo = df["validation"].filter(lambda x: x["language"] == "indonesian")

#df_val_bengali = df_val_bengali.train_test_split(test_size = 0.1)['test']
#df_val_arab = df_val_arab.train_test_split(test_size = 0.1)['test']
#df_val_indo = df_val_indo.train_test_split(test_size = 0.1)['test']


common functions

In [5]:
def format(dataset):
    result = {}
    #Marking start of the sentence and where the question and plaintext separates
    result['sentence'] = '[CLS] ' + dataset['question_text'] + ' [SEP] ' + dataset['document_plaintext']
    #Where the answer starts in the document_plaintext
    answer_start = dataset['annotations']['answer_start'][0] + len(dataset['question_text']) + len("[CLS] ") + len(" [SEP] ")
    result['answer_start'] = answer_start
    #WHere the answer ends in the document_plaintext
    result['answer_end'] = answer_start + len(dataset['annotations']['answer_text'][0])
    #Where the answer starts in the document_plaintext
    #WHere the answer ends in the document_plaintext
    return result


def iob(ans_ids, tokens):
    result = []
    for answerID, token_word in enumerate(tokens):
        if token_word in ['[CLS]', '[SEP]']:
            result.append(-100)
        elif len(ans_ids) > 0 and answerID == ans_ids[0]:
            result.append(1)
        elif answerID in ans_ids:
            result.append(2)
        else:
            result.append(0)
    return result


def token_iob_labels(examples):
    sentence_tokens = tokenizer(examples["sentence"], truncation=True)
    answer_start = examples['answer_start']
    answer_end = examples['answer_end']
    sentence_token_id = []
    for id in range(len(sentence_tokens.tokens())):
        sentencespan = sentence_tokens.token_to_chars(batch_or_token_index=id)
        if sentencespan is not None:
            (elem1, _) = sentencespan
            if elem1 > answer_end:
                break
            elif elem1 >= answer_start:
                sentence_token_id.append(id)
    sentence_tokens["labels"] = iob(sentence_token_id, sentence_tokens.tokens())
    sentence_tokens['text_tokens'] = sentence_tokens.tokens()
    return sentence_tokens


def compute_exact_match(predicted, true, replacement_value=0):
    assert len(predicted) == len(true), "Length of predicted and true labels must match"
    em = 0
    total = len(predicted)

    predicted = [[replacement_value if x == -100 else x for x in seq] for seq in predicted]
    true = [[replacement_value if x == -100 else x for x in seq] for seq in true]

    for pred_seq, true_seq in zip(predicted, true):
        if np.array_equal(pred_seq, true_seq):
            em += 1

    em_score = em / total
    return em_score

making torch data


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, AdamW, get_scheduler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model)
model_auto = AutoModelForTokenClassification.from_pretrained(model, num_labels=3)
model_auto.to(device)
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
optimizer = AdamW(model_auto.parameters(), lr=2e-5)


tokenized_train_bengali = df_train_bengali.map(format).map(
    token_iob_labels
)
tokenized_train_arab = df_train_arab.map(format).map(
    token_iob_labels
)
tokenized_train_indo = df_train_indo.map(format).map(
    token_iob_labels
)


tokenized_val_bengali = df_val_bengali.map(format).map(
    token_iob_labels
)
tokenized_val_arab = df_val_arab.map(format).map(
    token_iob_labels
)
tokenized_val_indo = df_val_indo.map(format).map(
    token_iob_labels
)

train_dataloader_bengali = DataLoader(
    tokenized_train_bengali.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), shuffle=True, batch_size=8, collate_fn=collator
)
train_dataloader_arab = DataLoader(
    tokenized_train_arab.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), shuffle=True, batch_size=8, collate_fn=collator
)
train_dataloader_indo = DataLoader(
    tokenized_train_indo.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), shuffle=True, batch_size=8, collate_fn=collator
)

eval_dataloader_bengali = DataLoader(
    tokenized_val_bengali.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), batch_size=8, collate_fn=collator
)
eval_dataloader_arab = DataLoader(
    tokenized_val_arab.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), batch_size=8, collate_fn=collator
)
eval_dataloader_indo = DataLoader(
    tokenized_val_indo.remove_columns(['question_text', 'document_title', 'language', 'annotations', 'document_plaintext', 'document_url', 'sentence', 'answer_start', 'answer_end', 'text_tokens']), batch_size=8, collate_fn=collator
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

Map:   0%|          | 0/4779 [00:00<?, ? examples/s]

Map:   0%|          | 0/29598 [00:00<?, ? examples/s]

Map:   0%|          | 0/29598 [00:00<?, ? examples/s]

Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

Map:   0%|          | 0/11394 [00:00<?, ? examples/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

Map:   0%|          | 0/1902 [00:00<?, ? examples/s]

Map:   0%|          | 0/1902 [00:00<?, ? examples/s]

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

Map:   0%|          | 0/1191 [00:00<?, ? examples/s]

TRAIN = BENGALI
TEST = {ARABIC, INDONESIAN}

In [7]:
train_dataloader = train_dataloader_bengali

epochs = 1
num_class = 3

for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_auto(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        logits = outputs.get("logits").to(device)
        
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


test for arab

In [8]:
tokenized_datasets_val = tokenized_val_arab

y_true_bengali_arab_flat = []
y_pred_bengali_arab_flat = []
y_true_bengali_arab = []
y_pred_bengali_arab = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_bengali_arab.append(val_data['labels'])
  y_true_bengali_arab_flat.extend(val_data['labels'])
  y_pred_bengali_arab.append(predictions[0].detach().cpu().numpy())
  y_pred_bengali_arab_flat.extend(predictions[0].detach().cpu().numpy())

from sklearn.metrics import precision_recall_fscore_support
em_score_bengali_arab = compute_exact_match(y_pred_bengali_arab, y_true_bengali_arab)
precision_bengali_arab, recall_bengali_arab, f1_bengali_arab, _ = precision_recall_fscore_support(y_true_bengali_arab_flat, y_pred_bengali_arab_flat, average='macro', zero_division=1)
print("TRAIN: Bengali")
print("TEST: Arabic")
print("*********")
print(f"f1 Score: {f1_bengali_arab}")


100%|██████████| 1902/1902 [00:42<00:00, 45.19it/s]


TRAIN: Bengali
TEST: Arabic
*********
f1 Score: 0.2927828364863799


test for indonesian

In [9]:
tokenized_datasets_val = tokenized_val_indo

y_true_bengali_indo_flat = []
y_pred_bengali_indo_flat = []
y_true_bengali_indo = []
y_pred_bengali_indo = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_bengali_indo.append(val_data['labels'])
  y_true_bengali_indo_flat.extend(val_data['labels'])
  y_pred_bengali_indo.append(predictions[0].detach().cpu().numpy())
  y_pred_bengali_indo_flat.extend(predictions[0].detach().cpu().numpy())


em_score_bengali_indo = compute_exact_match(y_pred_bengali_indo, y_true_bengali_indo)
precision_bengali_indo, recall_bengali_indo, f1_bengali_indo, _ = precision_recall_fscore_support(y_true_bengali_indo_flat, y_pred_bengali_indo_flat, average='macro', zero_division=1)
print("TRAIN: Bengali")
print("TEST: Indonesian")
print("*********")
print(f"f1 Score: {f1_bengali_indo}")

100%|██████████| 1191/1191 [00:22<00:00, 53.58it/s]


TRAIN: Bengali
TEST: Indonesian
*********
f1 Score: 0.29961675936233967


TRAIN = ARABIC
TEST = {BENGALI, INDONESIAN}

In [10]:
train_dataloader = train_dataloader_arab

epochs = 1
num_class = 3

for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_auto(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        logits = outputs.get("logits").to(device)

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

test for bengali

In [11]:
tokenized_datasets_val = tokenized_val_bengali

y_true_arab_bengali_flat = []
y_pred_arab_bengali_flat = []
y_true_arab_bengali = []
y_pred_arab_bengali = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_arab_bengali.append(val_data['labels'])
  y_true_arab_bengali_flat.extend(val_data['labels'])
  y_pred_arab_bengali.append(predictions[0].detach().cpu().numpy())
  y_pred_arab_bengali_flat.extend(predictions[0].detach().cpu().numpy())

em_score_arab_bengali = compute_exact_match(y_pred_arab_bengali, y_true_arab_bengali)
precision_arab_bengali, recall_arab_bengali, f1_arab_bengali, _ = precision_recall_fscore_support(y_true_arab_bengali_flat, y_pred_arab_bengali_flat, average='macro', zero_division=1)
print("TRAIN: Arabic")
print("TEST: Bengali")
print("*********")
print(f"f1 Score: {f1_arab_bengali}")

100%|██████████| 224/224 [00:04<00:00, 50.98it/s]

TRAIN: Arabic
TEST: Bengali
*********
f1 Score: 0.4435026068861112





test for indonesian

In [12]:
tokenized_datasets_val = tokenized_val_indo

y_true_arab_indo_flat = []
y_pred_arab_indo_flat = []
y_true_arab_indo = []
y_pred_arab_indo = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_arab_indo.append(val_data['labels'])
  y_true_arab_indo_flat.extend(val_data['labels'])
  y_pred_arab_indo.append(predictions[0].detach().cpu().numpy())
  y_pred_arab_indo_flat.extend(predictions[0].detach().cpu().numpy())

em_score_arab_indo = compute_exact_match(y_pred_arab_indo, y_true_arab_indo)
precision_arab_indo, recall_arab_indo, f1_arab_indo, _ = precision_recall_fscore_support(y_true_arab_indo_flat, y_pred_arab_indo_flat, average='macro', zero_division=1)
print("TRAIN: Arabic")
print("TEST: Indonesian")
print("*********")
print(f"f1 Score: {f1_arab_indo}")

100%|██████████| 1191/1191 [00:16<00:00, 72.98it/s]


TRAIN: Arabic
TEST: Indonesian
*********
f1 Score: 0.4739948627851678


TRAIN = INDONESIAN
TEST = {BENGALI, ARABIC}

In [13]:
train_dataloader = train_dataloader_indo

epochs = 1
num_class = 3

for epoch in range(epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_auto(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

        logits = outputs.get("logits").to(device)

        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

test for bengali

In [14]:
tokenized_datasets_val = tokenized_val_bengali

y_true_indo_bengali_flat = []
y_pred_indo_bengali_flat = []
y_true_indo_bengali = []
y_pred_indo_bengali = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_indo_bengali.append(val_data['labels'])
  y_true_indo_bengali_flat.extend(val_data['labels'])
  y_pred_indo_bengali.append(predictions[0].detach().cpu().numpy())
  y_pred_indo_bengali_flat.extend(predictions[0].detach().cpu().numpy())

em_score_indo_bengali = compute_exact_match(y_pred_indo_bengali, y_true_indo_bengali)
precision_indo_bengali, recall_indo_bengali, f1_indo_bengali, _ = precision_recall_fscore_support(y_true_indo_bengali_flat, y_pred_indo_bengali_flat, average='macro', zero_division=1)
print("TRAIN: Indonesian")
print("TEST: Bengali")
print("*********")
print(f"f1 Score: {f1_indo_bengali}")

100%|██████████| 224/224 [00:04<00:00, 52.42it/s]

TRAIN: Indonesian
TEST: Bengali
*********
f1 Score: 0.3637747219709783





test for arabic

In [15]:
tokenized_datasets_val = tokenized_val_arab

y_true_indo_arab_flat = []
y_pred_indo_arab_flat = []
y_true_indo_arab = []
y_pred_indo_arab = []
for val_data in tqdm(tokenized_datasets_val):
  input_ids = torch.LongTensor([val_data['input_ids']]).to(device)
  attention_mask = torch.LongTensor([val_data['attention_mask']]).to(device)
  output = model_auto(input_ids=input_ids, attention_mask=attention_mask)
  _, predictions = torch.max(output.logits, 2)
  y_true_indo_arab.append(val_data['labels'])
  y_true_indo_arab_flat.extend(val_data['labels'])
  y_pred_indo_arab.append(predictions[0].detach().cpu().numpy())
  y_pred_indo_arab_flat.extend(predictions[0].detach().cpu().numpy())

em_score_indo_arab = compute_exact_match(y_pred_indo_arab, y_true_indo_arab)
precision_indo_arab, recall_indo_arab, f1_indo_arab, _ = precision_recall_fscore_support(y_true_indo_arab_flat, y_pred_indo_arab_flat, average='macro', zero_division=1)
print("TRAIN: Indonesian")
print("TEST: Arabic")
print("*********")
print(f"f1 Score: {f1_indo_arab}")

100%|██████████| 1902/1902 [00:29<00:00, 64.94it/s]


TRAIN: Indonesian
TEST: Arabic
*********
f1 Score: 0.5857687636097137


RESULTS 

In [16]:
print("TRAIN: Bengali")
print("_________________")
print("TEST: Arabic")
print(f"EM Score: {em_score_bengali_arab}")
print(f"f1 Score: {f1_bengali_arab}")
print(f"precision Score: {precision_bengali_arab}")
print("TEST: Indonesian")
print(f"EM Score: {em_score_bengali_indo}")
print(f"f1 Score: {f1_bengali_indo}")
print(f"precision Score: {precision_bengali_indo}")
print()
print()
print("TRAIN: Arabic")
print("_________________")
print("TEST: Bengali")
print(f"EM Score: {em_score_arab_bengali}")
print(f"f1 Score: {f1_arab_bengali}")
print(f"precision Score: {precision_arab_bengali}")
print("TEST: Indonesian")
print(f"EM Score: {em_score_arab_indo}")
print(f"f1 Score: {f1_arab_indo}")
print(f"precision Score: {precision_arab_indo}")
print()
print()
print("TRAIN: Indonesian")
print("_________________")
print("TEST: Bengali")
print(f"EM Score: {em_score_indo_bengali}")
print(f"f1 Score: {f1_indo_bengali}")
print(f"precision Score: {precision_indo_bengali}")
print("TEST: Arabic")
print(f"EM Score: {em_score_indo_arab}")
print(f"f1 Score: {f1_indo_arab}")
print(f"precision Score: {precision_indo_arab}")

TRAIN: Bengali
_________________
TEST: Arabic
EM Score: 0.5073606729758149
f1 Score: 0.2927828364863799
precision Score: 0.7975031077249233
TEST: Indonesian
EM Score: 0.5113350125944585
f1 Score: 0.29961675936233967
precision Score: 0.8826712909442742


TRAIN: Arabic
_________________
TEST: Bengali
EM Score: 0.5
f1 Score: 0.4435026068861112
precision Score: 0.7776705417321144
TEST: Indonesian
EM Score: 0.5743073047858942
f1 Score: 0.4739948627851678
precision Score: 0.7930672177184914


TRAIN: Indonesian
_________________
TEST: Bengali
EM Score: 0.5178571428571429
f1 Score: 0.3637747219709783
precision Score: 0.8443946313438012
TEST: Arabic
EM Score: 0.6445846477392219
f1 Score: 0.5857687636097137
precision Score: 0.8737510100703321
