In [91]:
%%capture
!pip install datasets

## Libraries and Dependencies

In [92]:
import pandas as pd
import torch
import nltk
from datasets import load_dataset
from tqdm.notebook import tqdm

# The models the authors used:
from transformers import BertForMaskedLM, BertTokenizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [93]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
DEVICE

device(type='cuda')

## Algorithm Implementation

In [94]:
def mask_sentence(sentence, mask_token, i, M, L_min):
    """Return a copy of `sentence` in which selected tokens are replaced by `mask_token`.

    Position j is a candidate when (j - i) % M == 0. A candidate is masked if its
    token has at least `L_min` characters, or starts with '##', or the token after
    it starts with '##'. The look-ahead index is clamped to the last position, so
    the final token is compared with itself. Every other token is kept unchanged.
    """
    last = len(sentence) - 1
    masked = []
    for pos, token in enumerate(sentence):
        if (pos - i) % M == 0:
            following = sentence[min(pos + 1, last)]
            eligible = (len(token) >= L_min
                        or token.startswith('##')
                        or following.startswith('##'))
        else:
            eligible = False
        masked.append(mask_token if eligible else token)
    return masked

In [95]:
def BLANC_help(sentence, translation, model, tokenizer, M=6, L_min=4, sep='[SEP]', device='cpu'):
    """
    Calculates BLANC score between a given sentence and its translation using a specified model.

    For each offset i in range(M), `mask_sentence` masks a subset of `sentence` and the
    model predicts the masked tokens twice: once with a filler of '.' tokens in front
    ("base") and once with `translation` in front ("help"). Every masked token is counted
    in S[k][m], where k / m is 1 when the base / help prediction equals the original token.
    The score is (S[0][1] - S[1][0]) divided by the total number of masked tokens.

    Parameters:
    - sentence (List[str]): A tokenized sentence (the text that gets masked).
    - translation (List[str]): The tokenized translation (the text used as help).
    - model: BERT-type model
    - tokenizer: The tokenizer associated with the model used.
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator between the inference help (filler/summary) and a sentence from the text (default is '[SEP]').
    - device: Device the input id tensors are moved to (default is 'cpu'). The model itself
      is not moved here, so it is expected to already be on that device.

    Returns:
    - float: BLANC score for the given sentence and its translation, in [-1, 1].
      0.0 is returned when no token of `sentence` qualifies for masking.
    """

    # NOTE(review): the inputs are built without a leading [CLS] or trailing [SEP] token;
    # confirm that this is intended for the model in use.
    filler = ['.'] * len(translation)
    # Number of positions in front of the masked sentence: the help tokens plus `sep`.
    prefix_len = len(translation) + 1
    S = [[0, 0], [0, 0]]

    for i in range(M):
        masked_sentence = mask_sentence(sentence, tokenizer.mask_token, i, M, L_min)
        masked_positions = [pos for pos, word in enumerate(masked_sentence)
                            if word == tokenizer.mask_token]
        if not masked_positions:
            # Nothing to predict for this offset, so both forward passes are skipped.
            continue

        input_base = filler + [sep] + masked_sentence
        input_help = translation + [sep] + masked_sentence

        ids_base = torch.tensor(tokenizer.convert_tokens_to_ids(input_base)).to(device)  # Shape: [sequence_length]
        ids_help = torch.tensor(tokenizer.convert_tokens_to_ids(input_help)).to(device)  # Shape: [sequence_length]

        # Inference only: no autograd graph is needed for the predictions.
        with torch.no_grad():
            logits_base = model(input_ids=ids_base.unsqueeze(0)).logits  # Shape: [1, sequence_length, model_vocab_size]
            logits_help = model(input_ids=ids_help.unsqueeze(0)).logits  # Shape: [1, sequence_length, model_vocab_size]

        pred_base = torch.argmax(logits_base.squeeze(0), dim=-1)  # Shape: [sequence_length]
        pred_help = torch.argmax(logits_help.squeeze(0), dim=-1)  # Shape: [sequence_length]

        for j in masked_positions:
            idx = prefix_len + j
            predicted_word_base = tokenizer.convert_ids_to_tokens(pred_base[idx].item())
            predicted_word_help = tokenizer.convert_ids_to_tokens(pred_help[idx].item())

            k = int(predicted_word_base == sentence[j])
            m = int(predicted_word_help == sentence[j])
            S[k][m] += 1

    total = S[0][0] + S[0][1] + S[1][0] + S[1][1]
    if total == 0:
        # No token was masked at any offset; the score would otherwise be 0 / 0.
        return 0.0

    return (S[0][1] - S[1][0]) / total

In [96]:
def study_results(dataset, scores, score_lower_bound=-1, score_upper_bound=1, verbose=False):
    """
    Prints every example whose score lies in [score_lower_bound, score_upper_bound).

    Parameters:
    - dataset: A pandas DataFrame, or an object exposing `shape`, `column_names` and
      row access via `dataset[idx]` (such as a Hugging Face Dataset). Column 0 is
      printed as the sentence and column 1 as the translation; when there are exactly
      3 columns, column 2 is printed as the annotator score.
    - scores: Sequence of scores; only the first `dataset.shape[0]` entries are examined.
    - score_lower_bound: Inclusive lower bound (default is -1).
    - score_upper_bound: Exclusive upper bound (default is 1), so a score equal to it
      is not reported.
    - verbose (bool): If True, also prints the sentence and translation of each match.

    Returns:
    - None. Results and a final summary line are written to standard output.
    """

    def cell(row, col):
        # DataFrames offer positional access through .iloc; datasets without it are
        # read as a row mapping keyed by column name.
        if hasattr(dataset, 'iloc'):
            return dataset.iloc[row, col]
        return dataset[row][dataset.column_names[col]]

    num_rows = dataset.shape[0]
    num_examples = 0
    for idx, score in enumerate(scores[:num_rows]):
        if not (score_lower_bound <= score < score_upper_bound):
            continue
        num_examples += 1
        # A third column, when present, holds the annotator score.
        annotator_score = cell(idx, 2) if dataset.shape[1] == 3 else '-'
        print(f'Example {idx}   score: {score}   annotator score: {annotator_score}')
        if verbose:
            print(f'Sentence: {cell(idx, 0)}')
            print(f'Translation: {cell(idx, 1)}')
        print('-' * 100)
    print(f'{num_examples}/{num_rows} scores were between {score_lower_bound} and {score_upper_bound}.')

## Datasets

In [97]:
# English - French: train split of the 'en-fr' configuration of 'news_commentary'
en_fr_ds = load_dataset('news_commentary', 'en-fr', split='train')

# English - Persian (Farsi): train split of 'persiannlp/parsinlu_translation_en_fa'
en_fa_ds = load_dataset('persiannlp/parsinlu_translation_en_fa', split='train')

# English - Persian (with annotator scores)
# NOTE(review): hard-coded absolute path (looks like a Colab upload location); the CSV
# must exist there before this cell is run. The first CSV column is used as the index.
en_fa_with_scores = pd.read_csv('/content/en-fa(0-55)_with_grades.csv', index_col=0)
en_fa_with_scores.head()

Unnamed: 0,source,targets,Score
0,Due Thank You note by Egyptian blogger Abdel M...,بلاگر مصری عبدل منعم محمود (عربی) پس از آزاد ش...,3.0
1,He was extremely surprised and happy to receiv...,وی همچنین از دریافت تعداد بسیار زیادی پیام تبر...,3.0
2,Monem blogs under the name of “Ana Ikwan”.,منعم به دلیل اتهامات سیاسی ۴۵ روز در زندان بود.,0.0
3,"On December 16, 2006, the Supreme Administrati...",در ۱۶ دسامبر ۲۰۰۶ شورای عالی اداری مصر که دولت...,2.0
4,Finally a call to Renew old Friendships: Tarek...,رانندگان مترو برای اعتراض به زخمی شدن یکی از ه...,0.0


## Model and Tokenizer

In [98]:
%%capture
# Masked-language-model head on top of multilingual uncased BERT, moved to DEVICE.
mbert_model = BertForMaskedLM.from_pretrained('bert-base-multilingual-uncased').to(DEVICE)
# Tokenizer loaded from the same checkpoint name as the model.
mbert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case = True)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Preprocessing

In [99]:
# English - French

# Flatten the 'translation' column into top-level columns, drop the originals, rename
# 'en' -> 'sentence' and 'fr' -> 'translation', and keep the first 300 rows.
# (presumably each 'translation' value is a dict with 'en' and 'fr' keys — confirm)
# NOTE(review): this overwrites en_fr_ds, so re-running this cell without re-running
# the loading cell first is expected to fail.
en_fr_ds = en_fr_ds.map(lambda example: example['translation'])\
                   .remove_columns(['id', 'translation'])\
                   .rename_column('en', 'sentence')\
                   .rename_column('fr', 'translation')\
                   .select(range(300))

# Tokenization
en_fr_sentences = [mbert_tokenizer.tokenize(sentence)
                   for sentence in en_fr_ds['sentence']]  # (List[List[str]])

en_fr_translations = [mbert_tokenizer.tokenize(translation)
                      for translation in en_fr_ds['translation']] # (List[List[str]])

In [100]:
# English - Persian (Farsi)

# Removing the 'category' column
# NOTE(review): en_fa_ds is overwritten in this cell, so re-running it without
# re-running the loading cell first is expected to fail (the column is already gone).
en_fa_ds = en_fa_ds.remove_columns(['category'])

# Removing list encapsulation (only the first element of 'targets' is kept)
en_fa_ds = en_fa_ds.map(lambda example: {'targets': example['targets'][0]}, num_proc=4)

# Filtering out rows with the '\u200c' symbol and those where the length of either source or targets is less than a threshold
# ('\u200c' is the zero-width non-joiner; lengths are measured in characters)
length_threshold = 30
filtered_en_fa_ds = en_fa_ds.filter(
    lambda example: '\u200c' not in example['targets']
                    and len(example['source']) >= length_threshold
                    and len(example['targets']) >= length_threshold
                    and 'Global Voices' not in example['source'], # Headlines. Very short and the 'Global Voices' part is never translated
    num_proc=4)

# Rename to the column names used for the English - French data and keep the first
# 300 rows that survived the filter.
en_fa_ds = filtered_en_fa_ds.rename_column('source', 'sentence')\
                            .rename_column('targets', 'translation')\
                            .select(range(300))

# Tokenization
en_fa_sentences = [mbert_tokenizer.tokenize(sentence)
                   for sentence in en_fa_ds['sentence']]  # (List[List[str]])

en_fa_translations = [mbert_tokenizer.tokenize(translation)
                      for translation in en_fa_ds['translation']] # (List[List[str]])

## Running the Program

English - French

In [101]:
# Single-pair check. Note the argument order: the tokenized French translation is passed
# as `sentence` (the text that gets masked) and the English source as `translation`
# (the help text).
BLANC_help(en_fr_translations[1], en_fr_sentences[1], mbert_model, mbert_tokenizer, device=DEVICE)

0.35714285714285715

In [102]:
%%time
# One score per pair. The translation is the masked text (first argument) and the
# source sentence is the help text (second argument).
en_fr_scores = [BLANC_help(translation, sentence, mbert_model, mbert_tokenizer, device=DEVICE)
                for translation, sentence in tqdm(zip(en_fr_translations, en_fr_sentences), total=len(en_fr_sentences))]

  0%|          | 0/300 [00:00<?, ?it/s]

CPU times: user 1min 3s, sys: 203 ms, total: 1min 3s
Wall time: 1min 10s


In [103]:
en_fr_scores

[0.0,
 0.35714285714285715,
 0.125,
 0.21621621621621623,
 -0.2,
 0.09523809523809523,
 0.0,
 0.05555555555555555,
 0.0,
 0.0,
 0.0,
 0.0,
 0.07692307692307693,
 0.23076923076923078,
 0.10526315789473684,
 0.0,
 0.2857142857142857,
 0.10256410256410256,
 0.0,
 0.0,
 0.13043478260869565,
 0.1836734693877551,
 0.17647058823529413,
 0.0,
 0.02857142857142857,
 0.13043478260869565,
 0.25,
 0.08,
 0.09523809523809523,
 0.04,
 0.0,
 0.07407407407407407,
 0.05263157894736842,
 0.0,
 0.0,
 0.0,
 0.08333333333333333,
 0.0,
 0.0,
 0.0,
 0.08333333333333333,
 0.0,
 0.0,
 0.0,
 0.16666666666666666,
 0.13043478260869565,
 0.0,
 0.16666666666666666,
 0.10344827586206896,
 0.07692307692307693,
 0.15384615384615385,
 0.0,
 0.26666666666666666,
 0.18181818181818182,
 0.029411764705882353,
 0.10526315789473684,
 0.13333333333333333,
 0.16129032258064516,
 0.21428571428571427,
 0.15625,
 0.14285714285714285,
 0.05405405405405406,
 0.1388888888888889,
 0.16666666666666666,
 0.14285714285714285,
 0.0,
 0.0

English - Persian (Farsi)

In [104]:
# Single-pair check. Note the argument order: the tokenized Persian translation is passed
# as `sentence` (the text that gets masked) and the English source as `translation`
# (the help text).
BLANC_help(en_fa_translations[1], en_fa_sentences[1], mbert_model, mbert_tokenizer, device=DEVICE)

0.25

In [105]:
%%time
# One score per pair. The translation is the masked text (first argument) and the
# source sentence is the help text (second argument).
en_fa_scores = [BLANC_help(translation, sentence, mbert_model, mbert_tokenizer, device=DEVICE)
                for translation, sentence in tqdm(zip(en_fa_translations, en_fa_sentences), total=len(en_fa_sentences))]

  0%|          | 0/300 [00:00<?, ?it/s]

CPU times: user 56.4 s, sys: 231 ms, total: 56.6 s
Wall time: 1min


In [106]:
en_fa_scores

[0.07407407407407407,
 0.25,
 0.1,
 0.047619047619047616,
 0.045454545454545456,
 0.0,
 0.0,
 0.0625,
 0.043478260869565216,
 0.1875,
 0.07142857142857142,
 0.034482758620689655,
 0.0,
 0.0,
 0.0,
 0.07692307692307693,
 0.2,
 0.09090909090909091,
 0.4,
 0.19047619047619047,
 0.05555555555555555,
 0.0,
 0.0,
 0.0,
 -0.046511627906976744,
 0.0,
 0.06666666666666667,
 -0.1111111111111111,
 0.11538461538461539,
 0.041666666666666664,
 -0.13333333333333333,
 0.0,
 0.02564102564102564,
 0.0,
 -0.09090909090909091,
 -0.05,
 0.0,
 0.09523809523809523,
 0.0625,
 0.14285714285714285,
 0.0,
 0.1,
 0.11764705882352941,
 -0.23076923076923078,
 0.21052631578947367,
 0.03571428571428571,
 -0.15384615384615385,
 0.07692307692307693,
 0.0,
 0.0,
 0.09090909090909091,
 0.03571428571428571,
 0.0,
 0.06896551724137931,
 0.05405405405405406,
 0.0,
 0.14285714285714285,
 0.0,
 0.05,
 -0.034482758620689655,
 0.0,
 0.0,
 0.0,
 0.0,
 0.06451612903225806,
 0.24,
 0.08333333333333333,
 0.0,
 -0.08333333333333333

## Analysis of the results

In [107]:
# Negative scores (in [-1, 0)) in the subdataset with annotator scores
# NOTE(review): row i of en_fa_with_scores is paired with en_fa_scores[i], which was
# computed on the filtered en_fa_ds — this assumes the CSV rows follow that same order; confirm.
study_results(en_fa_with_scores, en_fa_scores, score_upper_bound=0)

Example 24   score: -0.046511627906976744   annotator score: 2.0
----------------------------------------------------------------------------------------------------
Example 27   score: -0.1111111111111111   annotator score: 3.0
----------------------------------------------------------------------------------------------------
Example 30   score: -0.13333333333333333   annotator score: 2.0
----------------------------------------------------------------------------------------------------
Example 34   score: -0.09090909090909091   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 35   score: -0.05   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 43   score: -0.23076923076923078   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 46   score: -0.15

In [108]:
# 0.0 scores (more precisely, scores in [0, 0.01)) in the subdataset with annotator scores
study_results(en_fa_with_scores, en_fa_scores, score_lower_bound=0, score_upper_bound=0.01)

Example 5   score: 0.0   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 6   score: 0.0   annotator score: 1.0
----------------------------------------------------------------------------------------------------
Example 12   score: 0.0   annotator score: 1.0
----------------------------------------------------------------------------------------------------
Example 13   score: 0.0   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 14   score: 0.0   annotator score: 4.0
----------------------------------------------------------------------------------------------------
Example 21   score: 0.0   annotator score: 3.0
----------------------------------------------------------------------------------------------------
Example 22   score: 0.0   annotator score: 1.0
-------------------------------------------------------------------

In [109]:
# Positive scores (in [0.01, 1); the upper bound is exclusive) in the subdataset with annotator scores
study_results(en_fa_with_scores, en_fa_scores, score_lower_bound=0.01)

Example 0   score: 0.07407407407407407   annotator score: 3.0
----------------------------------------------------------------------------------------------------
Example 1   score: 0.25   annotator score: 3.0
----------------------------------------------------------------------------------------------------
Example 2   score: 0.1   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 3   score: 0.047619047619047616   annotator score: 2.0
----------------------------------------------------------------------------------------------------
Example 4   score: 0.045454545454545456   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 7   score: 0.0625   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 8   score: 0.043478260869565216   annotator score: 1.0
-

In [110]:
# Scores of at least 0.2 (and below 1) in the subdataset with annotator scores
study_results(en_fa_with_scores, en_fa_scores, score_lower_bound=0.2)

Example 1   score: 0.25   annotator score: 3.0
----------------------------------------------------------------------------------------------------
Example 16   score: 0.2   annotator score: 0.0
----------------------------------------------------------------------------------------------------
Example 18   score: 0.4   annotator score: 1.0
----------------------------------------------------------------------------------------------------
Example 44   score: 0.21052631578947367   annotator score: 0.0
----------------------------------------------------------------------------------------------------
4/56 scores were between 0.2 and 1.


In [111]:
# Negative scores (in [-1, 0)) in the complete dataset (no annotator scores available)
study_results(en_fa_ds, en_fa_scores, score_upper_bound=0)

Example 24   score: -0.046511627906976744   annotator score: -
----------------------------------------------------------------------------------------------------
Example 27   score: -0.1111111111111111   annotator score: -
----------------------------------------------------------------------------------------------------
Example 30   score: -0.13333333333333333   annotator score: -
----------------------------------------------------------------------------------------------------
Example 34   score: -0.09090909090909091   annotator score: -
----------------------------------------------------------------------------------------------------
Example 35   score: -0.05   annotator score: -
----------------------------------------------------------------------------------------------------
Example 43   score: -0.23076923076923078   annotator score: -
----------------------------------------------------------------------------------------------------
Example 46   score: -0.15384615384615

In [112]:
# 0.0 scores (more precisely, scores in [0, 0.01)) in the complete dataset
study_results(en_fa_ds, en_fa_scores, score_lower_bound=0, score_upper_bound=0.01)

Example 5   score: 0.0   annotator score: -
----------------------------------------------------------------------------------------------------
Example 6   score: 0.0   annotator score: -
----------------------------------------------------------------------------------------------------
Example 12   score: 0.0   annotator score: -
----------------------------------------------------------------------------------------------------
Example 13   score: 0.0   annotator score: -
----------------------------------------------------------------------------------------------------
Example 14   score: 0.0   annotator score: -
----------------------------------------------------------------------------------------------------
Example 21   score: 0.0   annotator score: -
----------------------------------------------------------------------------------------------------
Example 22   score: 0.0   annotator score: -
---------------------------------------------------------------------------------

In [113]:
# Positive scores (in [0.01, 1); the upper bound is exclusive) in the complete dataset
study_results(en_fa_ds, en_fa_scores, score_lower_bound=0.01)

Example 0   score: 0.07407407407407407   annotator score: -
----------------------------------------------------------------------------------------------------
Example 1   score: 0.25   annotator score: -
----------------------------------------------------------------------------------------------------
Example 2   score: 0.1   annotator score: -
----------------------------------------------------------------------------------------------------
Example 3   score: 0.047619047619047616   annotator score: -
----------------------------------------------------------------------------------------------------
Example 4   score: 0.045454545454545456   annotator score: -
----------------------------------------------------------------------------------------------------
Example 7   score: 0.0625   annotator score: -
----------------------------------------------------------------------------------------------------
Example 8   score: 0.043478260869565216   annotator score: -
---------------

In [114]:
# Scores of at least 0.2 (and below 1) in the complete dataset
study_results(en_fa_ds, en_fa_scores, score_lower_bound=0.2)

Example 1   score: 0.25   annotator score: -
----------------------------------------------------------------------------------------------------
Example 16   score: 0.2   annotator score: -
----------------------------------------------------------------------------------------------------
Example 18   score: 0.4   annotator score: -
----------------------------------------------------------------------------------------------------
Example 44   score: 0.21052631578947367   annotator score: -
----------------------------------------------------------------------------------------------------
Example 65   score: 0.24   annotator score: -
----------------------------------------------------------------------------------------------------
Example 101   score: 0.21428571428571427   annotator score: -
----------------------------------------------------------------------------------------------------
Example 139   score: 0.35714285714285715   annotator score: -
----------------------------