# **Machine learning for low-resource NLP**: Advancing AI for Linguistic Inclusion
Cross-lingual transfer learning and pseudo-labeling for multilingual named entity recognition

## Table of Contents  
1. [Imports](#imports)  
2. [Project Setup](#project-setup)  
   - 2.1 [Configuration](#configuration)  
   - 2.2 [Reproducibility](#reproducibility)  
3. [Data Processing](#data-processing)  
   - 3.1 [Download Data](#download-data)  
   - 3.2 [NER Dataset](#ner-dataset)  
   - 3.3 [Dataloaders](#dataloaders)  
4. [Model Architecture](#model-architecture)  
5. [Training Utilities](#training-utilities)  
   - 5.1 [Optimizer Setup](#optimizer-setup)  
   - 5.2 [Evaluation Metric](#evaluation-metric)  
6. [Training Pipeline](#training-pipeline)  
   - 6.1 [Training Loop](#training-loop)  
   - 6.2 [Validation](#validation)  
7. [Experiments](#experiments)  
   - 7.1 [Baseline Models](#baseline-models)  
   - 7.2 [Cross-Lingual Transfer](#cross-lingual-transfer-learning)  
   - 7.3 [Iterative Pseudo-Labeling](#iterative-pseudo-labeling)  

### 1. Imports <a id='imports'></a>

In [None]:
import gc
import copy
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import display, Markdown


from datasets import load_dataset
from transformers import BertModel
from sklearn.metrics import f1_score
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split


import torch
import torch.nn as nn
from torchcrf import CRF
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import ConcatDataset, DataLoader

### 2. Project Setup <a id='project-setup'></a>

#### 2.1 Configuration <a id='configuration'></a>

In [None]:
class BaseConfig:
    RANDOM_STATE          = 42
    DEVICE                = torch.device("cuda")

    # Data
    low_resource_langs    = ["mg", "fo", "co", "hsb", "bh", "cv"]
    high_resource_langs   = ["id", "da", "it", "pl", "hi", "tr"]

    NUM_TAGS              = 7
    BATCH_SIZE            = 32
    MAX_SEQ_LEN           = 80

class TrainConfig(BaseConfig):
    EPOCHS                = 20
    PATIENCE              = 5
    BERT_LEARNING_RATE    = 0.00003
    LSTM_LEARNING_RATE    = 0.005
    CRF_LEARNING_RATE     = 0.00005
    WEIGHT_DECAY          = 0.02

class FineTuneConfig(BaseConfig):
    EPOCHS                = 15
    PATIENCE              = 3
    BERT_LEARNING_RATE    = 0.00002
    LSTM_LEARNING_RATE    = 0.003
    CRF_LEARNING_RATE     = 0.00003

class PseudoLabelingConfig(BaseConfig):
    EPOCHS                = 25
    PATIENCE              = 5
    BERT_LEARNING_RATE    = 0.00002
    LSTM_LEARNING_RATE    = 0.003
    CRF_LEARNING_RATE     = 0.00003

    CONFIDENCE_QUANTILE   = 0.965
    PSEUDO_DELAY          = 8
    ENTROPY_THRESHOLD     = 0.2

#### 2.2 Reproducibility <a id='reproducibility'></a>

In [4]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    np.random.seed(seed)
    random.seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(BaseConfig.RANDOM_STATE)

### 3. Data Processing <a id='data-processing'></a>

#### 3.1 Download Data <a id='download-data'></a>

In [7]:
def load_wikiann_datasets(language_codes, cutoff=None):

    language_data = {}
    for lang in language_codes:
        
        # Load raw data from hugging face
        lang_dataset = load_dataset("unimelb-nlp/wikiann", name=lang)

        # Get data from different splits and combine
        train_df = pd.DataFrame(lang_dataset["train"])
        val_df = pd.DataFrame(lang_dataset["validation"])
        test_df = pd.DataFrame(lang_dataset["test"])

        complete_df = pd.concat([train_df, val_df, test_df]).reset_index(drop=True)
        complete_df = complete_df.head(cutoff) if cutoff else complete_df

        # Split data into new train/val/test splits
        train, temp = train_test_split(complete_df, test_size=0.2, random_state=BaseConfig.RANDOM_STATE)
        val, test = train_test_split(temp, test_size=0.5, random_state=BaseConfig.RANDOM_STATE)

        language_data[lang] = {"train": train, "val": val, "test": test}

    return language_data

# Download and store data
low_resource_datasets = load_wikiann_datasets(BaseConfig.low_resource_langs)

#### 3.2 NER Dataset <a id='ner-dataset'></a>

In [None]:
class NERDataset:
    def __init__(self, texts, tags, include_sentence = False):
        self.texts = texts
        self.tags = tags

        self.tokenizer = BertTokenizerFast.from_pretrained(
            "google-bert/bert-base-multilingual-cased", do_lower_case = True
        )

        self.CLS_TOKEN = [101]
        self.SEP_TOKEN = [102]
        self.PAD_TOKEN = [0]
        self.MAX_LEN = BaseConfig.MAX_SEQ_LEN

        # Determines if the original sentence is returned for each batch
        self.include_sentence = include_sentence

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index]
        tags = self.tags[index]

        token_ids = []
        target_tags = []
        for i, word in enumerate(text):
            word_ids = self.tokenizer.encode(word, add_special_tokens = False)
            token_ids.extend(word_ids)
            target_tags.extend(len(word_ids) * [tags[i]])

        # Resize for special tokens
        token_ids = token_ids[:self.MAX_LEN - 2]
        target_tags = target_tags[:self.MAX_LEN - 2]

        # Add special tokens
        token_ids = self.CLS_TOKEN + token_ids + self.SEP_TOKEN
        target_tags = self.PAD_TOKEN + target_tags + self.PAD_TOKEN

        attention_mask = [1] * len(token_ids)
        token_type_ids = [0] * len(token_ids)

        # Add padding to make sure all inputs are the same size
        padding_len = self.MAX_LEN - len(token_ids)
        token_ids += [0] * padding_len
        target_tags += [0] * padding_len
        attention_mask += [0] * padding_len
        token_type_ids += [0] * padding_len

        if self.include_sentence:
            return {
                "input_ids": torch.tensor(token_ids, dtype = torch.long),
                "target_tags": torch.tensor(target_tags, dtype = torch.long),
                "attention_mask": torch.tensor(attention_mask, dtype = torch.long),
                "token_type_ids": torch.tensor(token_type_ids, dtype = torch.long),
                "orginal_text": " ".join(text)
            }

        return {
            "input_ids": torch.tensor(token_ids, dtype = torch.long),
            "target_tags": torch.tensor(target_tags, dtype = torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype = torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype = torch.long)
        }

#### 3.3 Dataloaders <a id='dataloaders'></a>

In [9]:
def create_dataloader(lang_split_data, include_sentence=False):
    dataset = NERDataset(
        lang_split_data["tokens"].to_list(),
        lang_split_data["ner_tags"].to_list(),
        include_sentence = include_sentence
    )
    return DataLoader(dataset, BaseConfig.BATCH_SIZE)

def create_dataloaders(lang_data):

    train_loader = create_dataloader(lang_data["train"])
    val_loader = create_dataloader(lang_data["val"])
    test_loader = create_dataloader(lang_data["test"])

    return train_loader, val_loader, test_loader

### 4. Model Architecture <a id='model-architecture'></a>

In [12]:
class BertBilstmCrf(nn.Module):
    def __init__(self, num_tags):
        super(BertBilstmCrf, self).__init__()

        # Define model layers
        self.bert = BertModel.from_pretrained("bert-base-multilingual-cased")
        self.lstm = nn.LSTM(
            input_size = self.bert.config.hidden_size,
            hidden_size = 128,
            num_layers = 2,
            bidirectional = True,
            batch_first = True,
            dropout = 0.3
        )
        self.fc = nn.Linear(in_features = 256, out_features = num_tags)
        self.crf = CRF(num_tags, batch_first = True)

    @torch.autocast(device_type="cuda")
    def forward(self, input_ids, target_tags, attention_mask, token_type_ids):
        # Pass inputs through layers
        bert_output = self.bert(input_ids, attention_mask, token_type_ids)
        sequence_output = bert_output.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        emissions = self.fc(lstm_output)

        loss = -self.crf(emissions, target_tags, mask = attention_mask.bool(), reduction = "mean")
        return emissions, loss

    def decode(self, emissions, attention_mask):
        return self.crf.decode(emissions, mask = attention_mask.bool())

### 5. Training Utilities <a id='training-utilities'></a>

#### 5.1 Optimizer Setup <a id='optimizer-setup'></a>

In [None]:
def setup_optimizer(model, CONFIG):
    param_groups = []
    # Check model layers and add appropiate learning rates
    if hasattr(model, "bert"):
        param_groups.append({"params" : model.bert.parameters(), "lr" : CONFIG.BERT_LEARNING_RATE})
    if hasattr(model, "lstm"):
        param_groups.append({"params" : model.lstm.parameters(), "lr" : CONFIG.LSTM_LEARNING_RATE})
    if hasattr(model, "crf"):
        param_groups.append({"params" : model.crf.parameters(), "lr" : CONFIG.CRF_LEARNING_RATE})
    optimizer = optim.Adam(param_groups, weight_decay = CONFIG.WEIGHT_DECAY)

    return optimizer

#### 5.2 Evaluation Metric <a id='evaluation-metric'></a>

In [None]:
def calculate_f1(target_tags, pred_tags, attention_mask):

    if isinstance(pred_tags, list):
        pred_tags = [sequence + [0] * (BaseConfig.MAX_SEQ_LEN - len(sequence)) for sequence in pred_tags]
        pred_tags = torch.tensor(pred_tags).to(BaseConfig.DEVICE)

    # Flatten batch results
    target_tags = target_tags.view(-1)
    pred_tags = pred_tags.view(-1)
    attention_mask = attention_mask.view(-1)

    # Filter out padding and special tokens
    target_tags = target_tags[attention_mask == 1]
    pred_tags = pred_tags[attention_mask == 1]

    f1_micro = f1_score(target_tags.cpu(), pred_tags.cpu(), average="micro")
    return f1_micro

### 6. Training Pipeline <a id='training-pipeline'></a>

#### 6.1 Training Functions <a id='training-functions'></a>

In [10]:
def train_model(model, optimizer, train_loader, val_loader, CONFIG):
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.1, patience = 5)

    best_val_f1 = -float("inf")
    best_train_f1 = 0
    patience_counter = CONFIG.PATIENCE

    for _ in range(CONFIG.EPOCHS):
        _, train_f1 = train_epoch(model, train_loader, optimizer)
        val_loss, val_f1 = evaluate_epoch(model, val_loader)

        scheduler.step(val_loss)

        # Save state of best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_train_f1 = train_f1
            patience_counter = CONFIG.PATIENCE
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter -= 1

        if patience_counter == 0:
            break  # Stop training if model doesn't improve

    # Delete to clear up memory
    model.to("cpu")
    del optimizer, scheduler, model

    # Clear cache
    gc.collect()
    torch.cuda.empty_cache()

    return best_model_state, best_train_f1, best_val_f1


def train_epoch(model, dataloader, optimizer):
    model.train()
    total_loss, total_f1 = 0, 0

    for batch in dataloader:
        batch = {key : value.to(BaseConfig.DEVICE) for key, value in batch.items()}

        optimizer.zero_grad()
        emissions, loss = model(**batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        pred_tags = model.decode(emissions, batch["attention_mask"])
        f1_score = calculate_f1(batch["target_tags"], pred_tags, batch["attention_mask"])
        total_f1 += f1_score

    return total_loss / len(dataloader), total_f1 / len(dataloader)

def evaluate_epoch(model, dataloader):
    model.eval()
    total_loss, total_f1 = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {key : value.to(BaseConfig.DEVICE) for key, value in batch.items()}

            emissions, loss = model(**batch)
            total_loss += loss.item()

            pred_tags = model.decode(emissions, batch["attention_mask"])
            f1_score = calculate_f1(batch["target_tags"], pred_tags, batch["attention_mask"])
            total_f1 += f1_score

    return total_loss / len(dataloader), total_f1 / len(dataloader)

#### 6.2 Pseudo-labeling Pipeline

In [None]:
def train_pseudo_labeling(model, optimizer, train_loader, val_loader, unlabeled_data, CONFIG):
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode = "min", factor = 0.1, patience = 5)

    best_val_f1 = -float("inf")
    best_train_f1 = 0
    patience_counter = CONFIG.PATIENCE

    for epoch in range(CONFIG.EPOCHS):

        _, train_f1 = train_epoch(model, train_loader, optimizer)
        val_loss, val_f1 = evaluate_epoch(model, val_loader)

        scheduler.step(val_loss)

        # Save state of best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_train_f1 = train_f1
            patience_counter = CONFIG.PATIENCE
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter -= 1

        if patience_counter == 0:
            break  # Stop training if model doesn't improve

        # Generate pseudo-labels with trained model on unlabeled data
        pseudo_labels = generate_pseudo_labels(model, unlabeled_data)
        confidence_threshold = pseudo_labels["confidence_score"].quantile(CONFIG.CONFIDENCE_QUANTILE)

        def filter_tags(row):
            high_confidence = row["confidence_score"] > confidence_threshold
            # low_entropy = row["entropy"] < CONFIG.ENTROPY_THRESHOLD
            low_entropy = True
            representative = set(row["ner_tags"]) != {0}
            same_length = len(row["tokens"]) == len(row["ner_tags"])
            return high_confidence and low_entropy and representative and same_length

        labels_to_keep = pseudo_labels.apply(filter_tags, axis=1)
        good_pseudo_labels = pseudo_labels[labels_to_keep]
        pseudo_labels = pseudo_labels[~labels_to_keep]

        pseudo_dataset = NERDataset(good_pseudo_labels["tokens"].tolist(), good_pseudo_labels["ner_tags"].tolist())

        if epoch > CONFIG.PSEUDO_DELAY:
            existing_data = train_loader.dataset
            combined_dataset = ConcatDataset([existing_data, pseudo_dataset])
            train_loader = DataLoader(combined_dataset, CONFIG.BATCH_SIZE)
            print(f"Added {len(good_pseudo_labels)} rows of data")
        else:
            print("Early epoch")

    # Delete to clear up memory
    model.to("cpu")
    del optimizer, scheduler, model

    # Clear cache
    gc.collect()
    torch.cuda.empty_cache()

    return best_model_state, best_train_f1, best_val_f1


def generate_pseudo_labels(model, unlabeled_data):

    unlabeled_dataloader = create_dataloader(unlabeled_data, include_sentence=True)

    # Initialize lists to store pseudo-labels and confidence scores
    pseudo_sentences, pseudo_tags, pseudo_confidence_scores, entropy_scores = [], [], [], []

    for batch in unlabeled_dataloader:
        texts = batch["orginal_text"]
        texts = [text.split() for text in texts]
        del batch["orginal_text"]

        batch = {key : value.to(BaseConfig.DEVICE) for key, value in batch.items()}

        with torch.no_grad():
            emissions, _ = model(**batch)
            predicted_tags = model.decode(emissions, batch["attention_mask"])

            # Compute sequence probabilities and entropy
            probs = F.softmax(emissions, dim=-1)
            sequence_confidence_scores, sequence_entropies = [], []
            for i, tags in enumerate(predicted_tags):
                token_confidence = [probs[i, j, tag].item() for j, tag in enumerate(tags)]
                token_entropy = -torch.sum(probs[i] * torch.log(probs[i] + 1e-9), dim=-1).cpu().numpy()

                seq_confidence = sum(token_confidence) / len(token_confidence)
                seq_entropy = sum(token_entropy) / len(token_entropy)
                sequence_confidence_scores.append(seq_confidence)
                sequence_entropies.append(seq_entropy)

            predicted_tags = [
                sequence[:BaseConfig.MAX_SEQ_LEN] + [0] * max(0, BaseConfig.MAX_SEQ_LEN - len(sequence))
                for sequence in predicted_tags
            ]

            # Trim predicted tags
            trimmed_predicted_tags = []
            word_counts = [len(text) for text in texts]
            for tag_seq, word_count in zip(predicted_tags, word_counts):
                tag_seq = tag_seq[1:-1]
                trimmed_predicted_tags.append(tag_seq[:word_count])

            pseudo_sentences.extend(texts)
            pseudo_tags.extend(trimmed_predicted_tags)
            pseudo_confidence_scores.extend(sequence_confidence_scores)
            entropy_scores.extend(sequence_entropies)

    pseudo_df = pd.DataFrame({
        "tokens": pseudo_sentences,
        "ner_tags": pseudo_tags,
        "confidence_score": pseudo_confidence_scores,
        "entropy": entropy_scores
    })

    return pseudo_df

### 7. Experiments <a id='experiments'></a>

#### 7.1 Baseline Models <a id='baseline-models'></a>

In [None]:
baseline_results = []

# Iterate through low-resource languages
for lang, lang_data in tqdm(low_resource_datasets.items(), ncols=80):

    train_loader, val_loader, test_loader = create_dataloaders(lang_data)

    # ------------------------------------------ TRAINING ------------------------------------------ #

    model = BertBilstmCrf(BaseConfig.NUM_TAGS).to(BaseConfig.DEVICE)
    optimizer = setup_optimizer(model, TrainConfig)
    best_model_state, train_f1, val_f1 = train_model(model, optimizer, train_loader, val_loader, TrainConfig)

    # ------------------------------------------ EVALUATION ------------------------------------------ #
    eval_model = BertBilstmCrf(BaseConfig.NUM_TAGS).to(BaseConfig.DEVICE)
    eval_model.load_state_dict(best_model_state, TrainConfig)
    test_loss, test_f1 = evaluate_epoch(eval_model, test_loader, TrainConfig)

    # ------------------------------------------ RESULTS ------------------------------------------ #
    torch.save(best_model_state, f"models/{lang}_baseline.pth")

    baseline_results.append({
        "language" : lang,
        "train_f1" : train_f1,
        "val_f1"   : val_f1,
        "test_f1"  : test_f1
    })
    
# Save and display results
baseline = pd.DataFrame(baseline_results)
baseline.to_csv("results/baseline.csv", index=False)

markdown_table = baseline.to_markdown(index=False)
display(Markdown(markdown_table))

100%|█████████████████████████████████████████████| 6/6 [04:31<00:00, 45.20s/it]


| language   |   train_f1 |   val_f1 |   test_f1 |
|:-----------|-----------:|---------:|----------:|
| mg         |   0.993728 | 0.933673 |  0.960352 |
| fo         |   0.974224 | 0.897482 |  0.901099 |
| co         |   0.956204 | 0.852308 |  0.81323  |
| hsb        |   0.951443 | 0.923387 |  0.854578 |
| bh         |   0.981549 | 0.888689 |  0.80212  |
| cv         |   0.977741 | 0.892617 |  0.830443 |

In [6]:
baseline = pd.read_csv("results/baseline.csv")

#### 7.2 Cross-Lingual Transfer Learning <a id='cross-lingual-transfer-learning'></a>

In [None]:
transfer_results = []

for augmentation_factor in tqdm(range(1, 24), ncols=80):

    high_resource_datasets = load_wikiann_datasets(BaseConfig.high_resource_langs, augmentation_factor * 240)

    # Iterate through low-resource and adjacent high-resource languages
    for (low_resource_lang, low_resource_data), (high_resource_lang, high_resource_data) in tqdm(zip(
            low_resource_datasets.items(), high_resource_datasets.items()
        ), ncols=80, leave=False):

        high_train_loader, high_val_loader, _ = create_dataloaders(high_resource_data)
        low_train_loader, low_val_loader, low_test_loader = create_dataloaders(low_resource_data)

        # ------------------------------------------ PRE-TRAINING ------------------------------------------ #

        high_resource_model = BertBilstmCrf(BaseConfig.NUM_TAGS).to(BaseConfig.DEVICE)
        optimizer = setup_optimizer(high_resource_model, TrainConfig)

        high_resource_model_state, train_f1, val_f1 = train_model(high_resource_model, optimizer, high_train_loader, high_val_loader, TrainConfig)

        # ------------------------------------------ FINE-TUNING ------------------------------------------ #

        model = BertBilstmCrf(BaseConfig.NUM_TAGS).to(BaseConfig.DEVICE)
        model.load_state_dict(high_resource_model_state)
        optimizer = setup_optimizer(model, FineTuneConfig)

        best_model_state, train_f1, val_f1 = train_model(model, optimizer, low_train_loader, low_val_loader, FineTuneConfig)

        # ------------------------------------------ EVALUATION ------------------------------------------ #

        eval_model = BertBilstmCrf(BaseConfig.NUM_TAGS).to(BaseConfig.DEVICE)
        eval_model.load_state_dict(best_model_state)
        test_loss, test_f1 = evaluate_epoch(eval_model, low_test_loader)

        # ------------------------------------------ RESULTS ------------------------------------------ #
        torch.save(best_model_state, f"models/{low_resource_lang}_{high_resource_lang}_transfer.pth")

        baseline_performance = baseline.loc[baseline["language"] == low_resource_lang, "test_f1"].item()
        improvement = (test_f1 - baseline_performance) / baseline_performance * 100

        transfer_results.append({
            "high_resource_language" : high_resource_lang,
            "low_resource_lang"      : low_resource_lang,
            "augmentation_factor"    : augmentation_factor,
            "train_f1"               : train_f1,
            "val_f1"                 : val_f1,
            "test_f1"                : test_f1,
            "improvement"            : improvement
        })

        print(f"Aug: {augmentation_factor}  {low_resource_lang} Improvement over baseline: {improvement:.5f}")

transfer_data = pd.DataFrame(transfer_results)
transfer_data.to_csv("results/transfer_learning.csv", index=False)

#### 7.3 Iterative Pseudo-labeling <a id='iterative-pseudo-labeling'></a>

In [None]:
iterative_pseudo_labeling_results = []

# Iterate through low-resource languages
high_resource_datasets = load_wikiann_datasets(BaseConfig.high_resource_langs, 10000)

# Iterate through low-resource and adjacent high-resource languages
for (lang, low_resource_data), (_, high_resource_data) in tqdm(zip(
        low_resource_datasets.items(), high_resource_datasets.items()
    ), ncols=80, leave=False):

    train_loader, val_loader, test_loader = create_dataloaders(low_resource_data)
    unlabeled_data = high_resource_data["train"]

    # ------------------------------------------ TRAINING ------------------------------------------ #

    model = BertBilstmCrf(BaseConfig.NUM_TAGS).to(BaseConfig.DEVICE)
    model.load_state_dict(torch.load(f"models/{lang}_baseline.pth"))
    optimizer = setup_optimizer(model, PseudoLabelingConfig)
    best_model_state, train_f1, val_f1 = train_pseudo_labeling(model, optimizer, train_loader, val_loader, unlabeled_data, PseudoLabelingConfig)

    # ------------------------------------------ EVALUATION ------------------------------------------ #
    
    eval_model = BertBilstmCrf(BaseConfig.NUM_TAGS).to(BaseConfig.DEVICE)
    eval_model.load_state_dict(best_model_state)
    test_loss, test_f1 = evaluate_epoch(eval_model, test_loader)

    # ------------------------------------------ RESULTS ------------------------------------------ #
    torch.save(best_model_state, f"models/{lang}_iterative_pseudo_labeling.pth")

    baseline_performance = baseline.loc[baseline["language"] == lang, "test_f1"].item()
    improvement = (test_f1 - baseline_performance) / baseline_performance * 100

    iterative_pseudo_labeling_results.append({
        "language"    : lang,
        "train_f1"    : train_f1,
        "val_f1"      : val_f1,
        "test_f1"     : test_f1,
        "improvement" : improvement
    })

    print(f"Language: {lang}    Improvement over baseline: {improvement:.5f}")


# Save results
iterative_pseudo_labeling = pd.DataFrame(iterative_pseudo_labeling_results)
iterative_pseudo_labeling.to_csv("results/iterative_pseudo_labeling.csv", index=False)