In [1]:
# %%capture
# !pip install datasets

## Libraries and Dependencies

In [2]:
import pandas as pd
import torch
import nltk
from datasets import load_dataset
from tqdm.notebook import tqdm
import random

# The models the authors used:
from transformers import BertForMaskedLM, BertTokenizer, logging

SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

nltk.download('punkt')
logging.set_verbosity_error()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cpu')

## Algorithm Implementation

In [4]:
def mask_sentence(sentence, mask_token, i, M, L_min):
    return [mask_token
            if (j - i) % M == 0
            and (len(sentence[j]) >= L_min
                 or sentence[j].startswith('##')
                 or sentence[min(j+1, len(sentence)-1)].startswith('##'))
            else sentence[j]
            for j in range(len(sentence))]

In [5]:
def BLANC_help_modified(sentence, model, model_tuned, tokenizer, M=6, L_min=4, device = DEVICE):
    """
    Calculates BLANC score between a given sentence and its translation using a specified model.

    Parameters:
    - sentence (List[str]): A tokenized sentence.
    - model: BERT-type model
    - model_tuned: The fine-tuned model.
    - tokenizer: The tokenizer associated with the model used.
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator between the inference help (filler/summary) and a sentence from the text (default is '[SEP]').

    Returns:
    - float: BLANC score for the given sentence and its translation.
    """

    S = [[0, 0], [0, 0]]

    for i in range(M):
        masked_sentence = mask_sentence(sentence, tokenizer.mask_token, i, M, L_min)

        masked_sentence_ids = torch.tensor(tokenizer.convert_tokens_to_ids(masked_sentence)).to(device) # Shape: [sequence_length]

        out_base = model(input_ids = masked_sentence_ids.unsqueeze(0)).logits  # Shape: [1, sequence_length, Bert_vocab_size]
        out_tune = model_tuned(input_ids = masked_sentence_ids.unsqueeze(0)).logits  # Shape: [1, sequence_length, Bert_vocab_size]

        out_base = torch.argmax(out_base.squeeze(0), dim=-1)  # Shape: [sequence_length]
        out_tune = torch.argmax(out_tune.squeeze(0), dim=-1)  # Shape: [sequence_length]

        masked_tokens = [idx for idx, word in enumerate(masked_sentence) if word == tokenizer.mask_token]

        for j in masked_tokens:
            predicted_word_base = tokenizer.convert_ids_to_tokens(out_base[j].item())
            predicted_word_tune = tokenizer.convert_ids_to_tokens(out_tune[j].item())

            # print(f'predicted_word_base: {predicted_word_base}')
            # print(f'predicted_word_help: {predicted_word_tune}')
            # print(f'sentence[{j}]: {sentence[j]}')

            k = int(predicted_word_base == sentence[j])
            m = int(predicted_word_tune == sentence[j])
            S[k][m] += 1


    B = (S[0][1] - S[1][0]) / (S[0][0] + S[1][1] + S[0][1] + S[1][0])

    return B



In [6]:
def get_word_lengths(df, tokenizer, l_min = 4):
    word_lengths = {}
    all_tokens = []

    for _, row in df.iterrows():
        translation = row['translation']
        preprocessed_result = tokenizer(translation,
                                        add_special_tokens = False,
                                        truncation = True,
                                        max_length = 512,
                                        padding = False,
                                        return_attention_mask = False)
        tokens = preprocessed_result["input_ids"]
        decoded_tokens = tokenizer.convert_ids_to_tokens(tokens)
        for token in tokens:
            if token not in all_tokens:
                all_tokens.append(token)

        i = 0
        while i < len(tokens):
            eligible = False
            if decoded_tokens[i].startswith('##'):
                eligible = True
                word_lengths[tokens[i - 1]] = eligible
                word_lengths[tokens[i]] = eligible
            else:
                if len(decoded_tokens[i]) >= l_min:
                    eligible = True
                word_lengths[tokens[i]] = eligible
            i += 1

    assert len(all_tokens) == len(word_lengths), "Association of tokens with word length : FAILED."

    return word_lengths

In [7]:
def training(set_tune, epochs = 10, device = DEVICE):
    model_tuned = BertForMaskedLM.from_pretrained('bert-base-multilingual-uncased').to(device)
    model_tuned.train()
    optimizer = torch.optim.AdamW(model_tuned.parameters(), lr=1e-4)

    inputs = torch.tensor(set_tune['masked_translation'].tolist(), dtype = torch.long).to(device)
    label = torch.tensor(set_tune['original_translation'].tolist(), dtype = torch.long).to(device)
    # print(inputs.size())

    for epochs in range(epochs):
        outputs = model_tuned(input_ids = inputs, labels = label)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return model_tuned

In [8]:
def BLANC_tune(translation, text, model, tokenizer, word_lengths, p_mask=0.15, N=10, epochs=10, device='cpu'):
    N_translation = len(translation)
    N_mask = int(N_translation * p_mask)
    set_tune = pd.DataFrame(columns = ['masked_translation', 'original_translation'])

    tokenized_translation = torch.tensor(tokenizer.convert_tokens_to_ids(translation), dtype = torch.long).to(device)

    for _ in range(0, N):
        pos = [i for i, token in enumerate(tokenized_translation.tolist()) if token in word_lengths and word_lengths[token]]
        random.shuffle(pos)
        while len(pos) != 0:
            masked_translation = tokenized_translation.tolist().copy()
            for pos_to_mask in pos[:N_mask]:
                masked_translation[pos_to_mask] = tokenizer.mask_token_id
            set_tune.loc[set_tune.shape[0]] = [masked_translation, tokenized_translation.tolist()]
            pos = pos[N_mask:]

    model_tuned = training(set_tune, epochs)
    score = BLANC_help_modified(text, model, model_tuned, tokenizer)

    return score

In [9]:
def save_results(results, filename):
    with open(filename, 'w') as file:
        for result in results:
            file.write(str(result) + '\n')

## Datasets

In [10]:
# English - French
en_fr_ds = load_dataset('news_commentary', 'en-fr', split='train')

en_fr_df = pd.DataFrame(en_fr_ds['translation'][:300]).rename(
    columns={'en': 'sentence', 'fr': 'translation'})

en_fr_df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/209479 [00:00<?, ? examples/s]

Unnamed: 0,sentence,translation
0,"$10,000 Gold?",L’or à 10.000 dollars l’once ?
1,SAN FRANCISCO – It has never been easy to have...,SAN FRANCISCO – Il n’a jamais été facile d’avo...
2,"Lately, with gold prices up more than 300% ove...","Et aujourd’hui, alors que le cours de l’or a a..."
3,"Just last December, fellow economists Martin F...","En décembre dernier, mes collègues économistes..."
4,Wouldn’t you know it?,Mais devinez ce qui s’est passé ?
...,...,...
295,Although Abdullah is usually referred to in th...,Bien qu'Abdallah soit généralement considéré à...
296,"The Sudairis, it seems, have apparently left t...",Ils semblent avoir laissé leur demi-frère se c...
297,For although Crown Prince Abdullah has his own...,Même si le prince héritier Abdallah bénéficie ...
298,The idea of normalizing relations with Israel ...,L'idée d'une normalisation des relations avec ...


In [11]:
# English - Persian (Farsi)

en_fa_ds = load_dataset('persiannlp/parsinlu_translation_en_fa', split='train')

# Removing the 'category' column
en_fa_ds = en_fa_ds.remove_columns(['category'])

# Removing list encapsulation
en_fa_ds = en_fa_ds.map(
    lambda example: {'targets': example['targets'][0]}, num_proc=4)

# Filtering out rows with the '\u200c' symbol and those where the length of either source or targets is less than a threshold
length_threshold = 10
filtered_en_fa_ds = en_fa_ds.filter(
    lambda example: '\u200c' not in example['targets']
    and len(example['source']) >= length_threshold
    and len(example['targets']) >= length_threshold,
    num_proc=4)

en_fa_df = pd.DataFrame(filtered_en_fa_ds[:300]).rename(
    columns={'source': 'sentence', 'targets': 'translation'})

en_fa_df

Downloading data:   0%|          | 0.00/135M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/242k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map (num_proc=4):   0%|          | 0/1621665 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/1621665 [00:00<?, ? examples/s]

Unnamed: 0,sentence,translation
0,Due Thank You note by Egyptian blogger Abdel M...,بلاگر مصری عبدل منعم محمود (عربی) پس از آزاد ش...
1,He was extremely surprised and happy to receiv...,وی همچنین از دریافت تعداد بسیار زیادی پیام تبر...
2,Monem blogs under the name of “Ana Ikwan”.,منعم به دلیل اتهامات سیاسی ۴۵ روز در زندان بود.
3,Ikhwan in Egyptian Arabic means Muslim Brother...,آزادی دینی در مصر
4,"On December 16, 2006, the Supreme Administrati...",در ۱۶ دسامبر ۲۰۰۶ شورای عالی اداری مصر که دولت...
...,...,...
295,Photos are included in the description of this...,وی عکس هایی را از این آهنگرانران به چاپ رسانده...
296,Turkey: Hrant Dink Named World Press Freedom H...,ارمنستان: قهرمان آزادی بیان
297,Jordan: New Traffic Law · Global Voices,اردن: ترافیک و دولت
298,Iraq: Yahoo Account Hacked · Global Voices,عراق: ای میل هک شده


## Model and Tokenizer

In [12]:
%%capture
model = 'bert-base-multilingual-uncased'
mbert_model = BertForMaskedLM.from_pretrained(model).to(DEVICE)
mbert_tokenizer = BertTokenizer.from_pretrained(model, do_lower_case = True)

## Preprocessing

In [13]:
# English - French

en_fr_word_lengths = get_word_lengths(en_fr_df, mbert_tokenizer)
print(len(en_fr_word_lengths))

en_fr_sentences = [mbert_tokenizer.tokenize(sentence)
                   for sentence in en_fr_df['sentence']]  # (List[List[str]])

en_fr_translations = [mbert_tokenizer.tokenize(translation)
                      for translation in en_fr_df['translation']] # (List[List[str]])

2707


In [14]:
# English - Persian (Farsi)

en_fa_word_lengths = get_word_lengths(en_fa_df, mbert_tokenizer)
print(len(en_fa_word_lengths))

en_fa_sentences = [mbert_tokenizer.tokenize(sentence)
                   for sentence in en_fa_df['sentence']]  # (List[List[str]])

en_fa_translations = [mbert_tokenizer.tokenize(translation)
                      for translation in en_fa_df['translation']] # (List[List[str]])

1022


## Running the Program

In [15]:
BLANC_tune(en_fr_translations[1],
           en_fr_sentences[1],
           mbert_model,
           mbert_tokenizer,
           en_fr_word_lengths,
           device=DEVICE)

-0.09090909090909091

In [None]:
en_fr_scores = [
    BLANC_tune(translation,
               sentence,
               mbert_model,
               mbert_tokenizer,
               en_fr_word_lengths,
               device=DEVICE)
    for translation, sentence in tqdm(
        zip(en_fr_translations, en_fr_sentences), total=len(en_fr_sentences))
    ]

en_fr_scores

  0%|          | 0/300 [00:00<?, ?it/s]

In [None]:
filename = 'unbatched_BLANCtune_translation_(en_fr).txt'
save_results(en_fr_scores, filename)

English - Persian (Farsi)

In [None]:
BLANC_tune(en_fa_translations[1],
           en_fa_sentences[1],
           mbert_model,
           mbert_tokenizer,
           en_fa_word_lengths,
           device=DEVICE)

In [None]:
en_fa_scores = [
    BLANC_tune(translation,
               sentence,
               mbert_model,
               mbert_tokenizer,
               en_fa_word_lengths,
               device=DEVICE)
    for translation, sentence in tqdm(
        zip(en_fa_translations, en_fa_sentences), total=len(en_fa_sentences))
    ]

en_fa_scores

In [None]:
filename = 'unbatched_BLANCtune_translation_(en_fa).txt'
save_results(en_fa_scores, filename)