In [1]:
# %%capture
# !pip install datasets
# !pip install accelerate -U

## Libraries and Dependencies

In [2]:
import torch
import nltk
from datasets import load_dataset, Dataset
from tqdm.notebook import tqdm
import random

# The models the authors used:
from transformers import BertForMaskedLM, BertTokenizer, logging
from transformers import TrainingArguments, Trainer

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

nltk.download('punkt')
logging.set_verbosity_error()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

## Functions

In [16]:
class CustomDataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, examples):
        input_ids = torch.tensor([example['input_ids'] for example in examples])
        labels = torch.tensor([example['labels'] for example in examples])

        input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=self.tokenizer.pad_token_id)  # -100 is the default index to ignore in the loss function

        return {'input_ids': input_ids, 'labels': labels}


In [5]:
def tune_model(tune_set, model, tokenizer, n_epochs):
    # metric = evaluate.load("accuracy")

    training_args = TrainingArguments(
        f"finetuned-model",
        evaluation_strategy = "no",
        learning_rate = 1e-4,
        weight_decay = 0.01,
        num_train_epochs = n_epochs,
        logging_strategy = "no"
        )

    # data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    data_collator = CustomDataCollator(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tune_set,
        # eval_dataset= ?,
        data_collator=data_collator,
        )

    trainer.train()

    # Evaluate the model
    # eval_results = trainer.evaluate()
    # print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

    return model

In [6]:
def mask_sentence(sentence, mask_token, i, M, L_min):
    return [mask_token
            if (j - i) % M == 0
            and (len(sentence[j]) >= L_min
                 or sentence[j].startswith('##')
                 or sentence[min(j+1, len(sentence)-1)].startswith('##'))
            else sentence[j]
            for j in range(len(sentence))]

In [7]:
def BLANC_tune_inference(sentence, model, model_tuned, tokenizer, p_mask=0.15, L_min=4, device='cpu'):
    """
    Compares the performance of a model fine-tuned on the 'translation' vs. a model that has never seen the translation.

    Parameters:
    - sentence (List[str]): A tokenized sentence.
    - model: BERT-type model
    - model_tuned: The fine-tuned model.
    - tokenizer: The tokenizer associated with the model used.
    - p_mask (float): Probability of masking (default is 0.15).
    - L_min (int): Minimum length requirement for masked words (default is 4).

    Returns:
    - float: BLANC_tune score showing the quality of the translation.
    """

    S = [[0, 0], [0, 0]]
    M = int(1/p_mask)

    for i in range(M):

        masked_sentence = mask_sentence(sentence, tokenizer.mask_token, i, M, L_min)
        masked_sentence_ids = torch.tensor(tokenizer.convert_tokens_to_ids(masked_sentence)).to(device) # Shape: [sequence_length]

        out_base = model(input_ids=masked_sentence_ids.unsqueeze(0)).logits  # Shape: [1, sequence_length, Bert_vocab_size]
        out_tune = model_tuned(input_ids=masked_sentence_ids.unsqueeze(0)).logits  # Shape: [1, sequence_length, Bert_vocab_size]

        out_base = torch.argmax(out_base.squeeze(0), dim=-1)  # Shape: [sequence_length]
        out_tune = torch.argmax(out_tune.squeeze(0), dim=-1)  # Shape: [sequence_length]

        masked_tokens = [idx for idx, word in enumerate(masked_sentence) if word == tokenizer.mask_token]

        for j in masked_tokens:
            predicted_word_base = tokenizer.convert_ids_to_tokens(out_base[j].item())
            predicted_word_tune = tokenizer.convert_ids_to_tokens(out_tune[j].item())

            # print(f'predicted_word_base[{j}]: {predicted_word_base}')
            # print(f'predicted_word_help[{j}]: {predicted_word_tune}')
            # print(f'sentence[{j}]: {sentence[j]}')

            k = int(predicted_word_base == sentence[j])
            m = int(predicted_word_tune == sentence[j])
            S[k][m] += 1


    B = (S[0][1] - S[1][0]) / (S[0][0] + S[1][1] + S[0][1] + S[1][0])

    return B



In [8]:
def BLANC_tune(sentence, translation, model_checkpoint, model, tokenizer, p_mask=0.15, L_min=4, N=10, n_epochs=3, device='cpu'):

    # Model tuning
    N_words = len(translation)
    N_mask = int(N_words * p_mask)
    set_tune = Dataset.from_dict({})

    translation_ids = tokenizer.convert_tokens_to_ids(translation)

    for _ in range(N):
        pos = [i for i, token in enumerate(translation)
               if (len(token) >= L_min
                   or token.startswith('##')
                   or translation[min(i+1, len(translation)-1)].startswith('##'))] # positions of words longer than Lmin
        random.shuffle(pos)
        while len(pos) != 0:
            # Mask words in next N_mask positions
            masked_translation = translation_ids.copy()
            for pos_to_mask in pos[:N_mask]:
                masked_translation[pos_to_mask] = tokenizer.mask_token_id
            # Add translation with masked words to set_tune
            set_tune = set_tune.add_item({"input_ids": masked_translation, 'labels': translation_ids})
            pos = pos[N_mask:]

    # Creating a fresh pre-trained model
    new_model = BertForMaskedLM.from_pretrained(model_checkpoint).to(device)
    model_tuned = tune_model(set_tune, new_model, tokenizer, n_epochs)

    # Comparing inference with model vs. model_tuned
    score = BLANC_tune_inference(sentence, model, model_tuned, tokenizer, p_mask, L_min, device)

    del new_model
    del model_tuned
    torch.cuda.empty_cache()

    return score

In [9]:
import json

def add_results_to_json(new_data, file_path = "./results.json"):
    try:
        with open(file_path, 'r') as json_file:
            existing_data = json.load(json_file)
    except FileNotFoundError:
        existing_data = {}

    for key, value in new_data.items():
        existing_data[key] = value

    with open(file_path, 'w') as json_file:
        json.dump(existing_data, json_file, indent=2)

    print(f"Data has been added to {file_path}")

## Datasets

In [10]:
# English - French
en_fr_ds = load_dataset('news_commentary', 'en-fr', split='train')

# English - Persian (Farsi)
en_fa_ds = load_dataset('persiannlp/parsinlu_translation_en_fa', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Model and Tokenizer

In [11]:
%%capture
model_checkpoint = 'bert-base-multilingual-uncased'
mbert_model = BertForMaskedLM.from_pretrained(model_checkpoint).to(DEVICE)
mbert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint, do_lower_case=True, use_fast=True)

## Preprocessing

English - French

In [12]:
# English - French

en_fr_ds = en_fr_ds.map(lambda example: example['translation'])\
                   .remove_columns(['id', 'translation'])\
                   .rename_column('en', 'sentence')\
                   .rename_column('fr', 'translation')\
                   .select(range(300))

# Tokenization
en_fr_sentences = [mbert_tokenizer.tokenize(sentence)
                   for sentence in en_fr_ds['sentence']]  # (List[List[str]])

en_fr_translations = [mbert_tokenizer.tokenize(translation)
                      for translation in en_fr_ds['translation']] # (List[List[str]])

English - Persian

In [12]:
# English - Persian (Farsi)

# Removing the 'category' column
en_fa_ds = en_fa_ds.remove_columns(['category'])

# Removing list encapsulation
en_fa_ds = en_fa_ds.map(lambda example: {'targets': example['targets'][0]}, num_proc=4)

# Filtering out rows with the '\u200c' symbol and those where the length of either source or targets is less than a threshold
length_threshold = 30
filtered_en_fa_ds = en_fa_ds.filter(
    lambda example: '\u200c' not in example['targets']
                    and len(example['source']) >= length_threshold
                    and len(example['targets']) >= length_threshold
                    and 'Global Voices' not in example['source'], # Headlines. Very short and the 'Global Voices' part is never translated
    num_proc=4)

en_fa_ds = filtered_en_fa_ds.rename_column('source', 'sentence')\
                            .rename_column('targets', 'translation')\
                            .select(range(300))

# Tokenization
en_fa_sentences = [mbert_tokenizer.tokenize(sentence)
                   for sentence in en_fa_ds['sentence']]  # (List[List[str]])

en_fa_translations = [mbert_tokenizer.tokenize(translation)
                      for translation in en_fa_ds['translation']] # (List[List[str]])

Map (num_proc=4):   0%|          | 0/1621665 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/1621665 [00:00<?, ? examples/s]

## Running the Program

English - French

In [18]:
BLANC_tune(en_fr_sentences[45], en_fr_translations[45], model_checkpoint, mbert_model, mbert_tokenizer, device=DEVICE)

{'train_runtime': 4.0061, 'train_samples_per_second': 37.443, 'train_steps_per_second': 5.242, 'train_loss': 0.3163124947320847, 'epoch': 3.0}


-0.17647058823529413

In [15]:
print(en_fr_sentences[45])
print(en_fr_translations[45])

['what', 'was', 'true', 'for', 'the', 'al', '##chem', '##ists', 'of', 'yo', '##re', 'remains', 'true', 'today', ':', 'gold', 'and', 'reason', 'are', 'often', 'difficult', 'to', 'rec', '##oncil', '##e', '.']
['ce', 'qui', 'etait', 'vrai', 'pour', 'les', 'al', '##chi', '##mist', '##es', 'd', '[UNK]', 'ant', '##an', 'reste', 'vrai', 'aujourd', '[UNK]', 'hui', ':', 'l', '[UNK]', 'or', 'et', 'la', 'raison', 'sont', 'parfois', 'difficile', '##s', 'a', 'con', '##cili', '##er', '.']


In [19]:
en_fr_scores = [BLANC_tune(sentences, translations, model_checkpoint, mbert_model, mbert_tokenizer, device=DEVICE)
                for sentences, translations in tqdm(zip(en_fr_sentences, en_fr_translations), total=len(en_fr_sentences))]

  0%|          | 0/300 [00:00<?, ?it/s]

{'train_runtime': 1.6655, 'train_samples_per_second': 72.052, 'train_steps_per_second': 9.006, 'train_loss': 0.3297830581665039, 'epoch': 3.0}
{'train_runtime': 2.8875, 'train_samples_per_second': 51.948, 'train_steps_per_second': 7.273, 'train_loss': 0.23243393216814315, 'epoch': 3.0}
{'train_runtime': 2.7219, 'train_samples_per_second': 44.086, 'train_steps_per_second': 5.511, 'train_loss': 0.38886874516805015, 'epoch': 3.0}
{'train_runtime': 4.9732, 'train_samples_per_second': 36.194, 'train_steps_per_second': 4.826, 'train_loss': 0.13696845372517905, 'epoch': 3.0}
{'train_runtime': 2.4369, 'train_samples_per_second': 61.554, 'train_steps_per_second': 8.618, 'train_loss': 0.37855416252499535, 'epoch': 3.0}
{'train_runtime': 3.2998, 'train_samples_per_second': 45.457, 'train_steps_per_second': 6.364, 'train_loss': 0.2629982857477097, 'epoch': 3.0}
{'train_runtime': 3.4038, 'train_samples_per_second': 52.882, 'train_steps_per_second': 7.051, 'train_loss': 0.13006973266601562, 'epoch':

Exception ignored in: <function _xla_gc_callback at 0x7b8af3c20670>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 97, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
en_fr_scores

In [None]:
en_fr_data = {}
en_fr_data['BLANC_tune_en_fr_translation'] = en_fr_scores
add_results_to_json(en_fr_data)

English - Persian (Farsi)

In [33]:
BLANC_tune(en_fa_sentences[0], en_fa_translations[0], model_checkpoint, mbert_model, mbert_tokenizer, device=DEVICE)

{'train_runtime': 3.6589, 'train_samples_per_second': 40.996, 'train_steps_per_second': 5.739, 'train_loss': 0.13686458269755045, 'epoch': 3.0}


-0.07692307692307693

In [38]:
en_fa_scores = [BLANC_tune(sentences, translations, model_checkpoint, mbert_model, mbert_tokenizer, device=DEVICE)
                for sentences, translations in tqdm(zip(en_fa_sentences, en_fa_translations), total=len(en_fa_sentences))]

  0%|          | 0/300 [00:00<?, ?it/s]

{'train_runtime': 3.902, 'train_samples_per_second': 38.442, 'train_steps_per_second': 5.382, 'train_loss': 0.09289323148273286, 'epoch': 3.0}
{'train_runtime': 3.8127, 'train_samples_per_second': 55.079, 'train_steps_per_second': 7.082, 'train_loss': 0.10461229748196071, 'epoch': 3.0}
{'train_runtime': 2.4844, 'train_samples_per_second': 60.377, 'train_steps_per_second': 8.453, 'train_loss': 0.16667411440894717, 'epoch': 3.0}
{'train_runtime': 3.6707, 'train_samples_per_second': 49.038, 'train_steps_per_second': 6.538, 'train_loss': 0.0751963456471761, 'epoch': 3.0}
{'train_runtime': 3.508, 'train_samples_per_second': 51.312, 'train_steps_per_second': 6.842, 'train_loss': 0.08011201024055481, 'epoch': 3.0}
{'train_runtime': 3.4084, 'train_samples_per_second': 52.811, 'train_steps_per_second': 7.041, 'train_loss': 0.11016147335370381, 'epoch': 3.0}
{'train_runtime': 4.9219, 'train_samples_per_second': 36.571, 'train_steps_per_second': 4.876, 'train_loss': 0.1320984959602356, 'epoch': 3

In [39]:
en_fa_scores

[-0.11538461538461539,
 -0.16666666666666666,
 -0.3333333333333333,
 -0.125,
 -0.14285714285714285,
 -0.047619047619047616,
 -0.05,
 -0.2727272727272727,
 -0.07142857142857142,
 -0.1,
 -0.3076923076923077,
 0.0,
 -0.42857142857142855,
 -0.2222222222222222,
 -0.42857142857142855,
 -0.08695652173913043,
 -0.1111111111111111,
 -0.2,
 0.07692307692307693,
 -0.125,
 -0.13333333333333333,
 0.07692307692307693,
 -0.1111111111111111,
 -0.2,
 -0.11764705882352941,
 -0.5,
 -0.6,
 -0.2,
 -0.17647058823529413,
 -0.26666666666666666,
 -0.7222222222222222,
 0.0,
 -0.16666666666666666,
 -0.5333333333333333,
 0.0,
 -0.2,
 -0.3333333333333333,
 -0.25,
 0.25,
 -0.4444444444444444,
 -0.125,
 -0.14285714285714285,
 -0.1,
 -0.14285714285714285,
 0.0,
 -0.1,
 -0.5714285714285714,
 -0.16666666666666666,
 -0.375,
 -0.25,
 -0.09090909090909091,
 0.0,
 -0.25925925925925924,
 0.125,
 -0.05555555555555555,
 -0.21739130434782608,
 0.12,
 -0.06666666666666667,
 -0.07692307692307693,
 -0.125,
 0.0,
 -0.0384615384615

In [43]:
# en_fa_data = {}
# en_fa_data['BLANC_tune_en_fa_translation'] = en_fa_scores
# add_results_to_json(en_fa_data)