In [1]:
# %%capture
# !pip install datasets

## Libraries and Dependencies

In [2]:
import pandas as pd
import torch
import nltk
from nltk.tokenize import sent_tokenize
from datasets import load_dataset
from tqdm.notebook import tqdm
import random

# The models the authors used:
from transformers import BertForMaskedLM, BertTokenizer, logging
from transformers import AlbertForMaskedLM, AlbertTokenizer

# Seed Python's and PyTorch's (CPU + all CUDA devices) random number generators
# so that the random masking order and the fine-tuning runs are repeatable.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Fetch the NLTK 'punkt' tokenizer data.
nltk.download('punkt')
# `logging` here is the transformers logging module: only report errors.
logging.set_verbosity_error()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# DEVICE is the default `device` argument of the functions defined below.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

## Algorithm Implementation

In [4]:
def mask_sentence(sentence, mask_token, i, M, L_min):
    """
    Return a copy of a tokenized sentence with every M-th token (starting at offset i) masked.

    A token on the masking stride is only replaced when it is "long enough":
    it has at least L_min characters, or it is a word-piece continuation
    (starts with '##'), or the token right after it is a continuation.
    For the last token, the "next" token is the last token itself.

    Parameters:
    - sentence (List[str]): A tokenized sentence.
    - mask_token (str): The token written in place of a masked token.
    - i (int): Offset of the masking stride.
    - M (int): Stride; positions j with (j - i) % M == 0 are candidates.
    - L_min (int): Minimum length requirement for masked words.

    Returns:
    - List[str]: The sentence with the selected tokens replaced by mask_token.
    """
    last_index = len(sentence) - 1
    masked = []

    for position, token in enumerate(sentence):
        # Tokens off the stride are always kept.
        if (position - i) % M != 0:
            masked.append(token)
            continue

        if (len(token) >= L_min
                or token.startswith('##')
                or sentence[min(position + 1, last_index)].startswith('##')):
            masked.append(mask_token)
        else:
            masked.append(token)

    return masked

In [5]:
def BLANC_help_modified(sentence, model, model_tuned, tokenizer, M=6, L_min=4, device = DEVICE):
    """
    Calculates the BLANC score of a tokenized sentence by comparing how well a base
    model and a fine-tuned model reconstruct its masked tokens.

    For every offset i in range(M) the sentence is masked with `mask_sentence`,
    both models predict the masked positions (argmax over the vocabulary), and
    each masked token is counted in S[k][m], where k is 1 if the base model
    predicted it exactly and m is 1 if the tuned model did.

    Both models are run in eval mode and without gradient tracking; their
    previous train/eval mode is restored before returning.

    Parameters:
    - sentence (List[str]): A tokenized sentence.
    - model: BERT-type model (the base model).
    - model_tuned: The fine-tuned model.
    - tokenizer: The tokenizer associated with the model used.
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - device: Device the input ids are placed on (default is DEVICE).

    Returns:
    - float: (S[0][1] - S[1][0]) / (number of masked tokens), or 0.0 when no
      token of the sentence could be masked.
    """

    # S[k][m]: k = base model correct (0/1), m = tuned model correct (0/1).
    S = [[0, 0], [0, 0]]
    mask_token = tokenizer.mask_token

    # Record the current modes first, then switch to eval so that dropout does
    # not randomise the predictions.
    previous_modes = [(net, net.training) for net in (model, model_tuned)]
    for net, _ in previous_modes:
        net.eval()

    try:
        with torch.no_grad():  # inference only: no autograd graph needed
            for i in range(M):
                masked_sentence = mask_sentence(sentence, mask_token, i, M, L_min)
                masked_positions = [idx for idx, word in enumerate(masked_sentence) if word == mask_token]

                # Nothing to predict for this offset: skip both forward passes.
                if not masked_positions:
                    continue

                # NOTE(review): the ids are fed as-is, no special tokens are added here.
                input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(masked_sentence),
                                         dtype = torch.long,
                                         device = device).unsqueeze(0)  # Shape: [1, sequence_length]

                out_base = model(input_ids = input_ids).logits  # Shape: [1, sequence_length, vocab_size]
                out_tune = model_tuned(input_ids = input_ids).logits  # Shape: [1, sequence_length, vocab_size]

                out_base = torch.argmax(out_base.squeeze(0), dim=-1)  # Shape: [sequence_length]
                out_tune = torch.argmax(out_tune.squeeze(0), dim=-1)  # Shape: [sequence_length]

                for j in masked_positions:
                    predicted_word_base = tokenizer.convert_ids_to_tokens(out_base[j].item())
                    predicted_word_tune = tokenizer.convert_ids_to_tokens(out_tune[j].item())

                    k = int(predicted_word_base == sentence[j])
                    m = int(predicted_word_tune == sentence[j])
                    S[k][m] += 1
    finally:
        for net, was_training in previous_modes:
            net.train(was_training)

    total = S[0][0] + S[1][1] + S[0][1] + S[1][0]
    if total == 0:
        # No token was masked (e.g. empty sentence or only short tokens):
        # neither model can be said to help, so report a neutral score.
        return 0.0

    return (S[0][1] - S[1][0]) / total



In [6]:
def get_word_lengths(df, tokenizer, l_min = 4, column = 'fr'):
    """
    Builds a lookup telling, for every token id seen in a text column, whether the token may be masked.

    A token id is flagged True when the token is a word-piece continuation
    (starts with '##'), when it is immediately followed by a continuation, or
    when it has at least l_min characters; otherwise it is flagged False.

    NOTE(review): the flag is stored per token id, and a later occurrence of the
    same id overwrites an earlier one, so the result depends on the order in
    which tokens are encountered. Confirm this is intended.

    Parameters:
    - df (pd.DataFrame): Frame holding the texts.
    - tokenizer: Tokenizer; called on each text and used to map ids back to tokens.
    - l_min (int): Minimum length requirement for masked words (default is 4).
    - column (str): Name of the column of `df` to scan (default is 'fr').

    Returns:
    - Dict[int, bool]: token id -> eligibility flag.
    """
    word_lengths = {}
    seen_ids = set()  # set membership keeps the scan linear in the number of tokens

    for text in df[column]:
        preprocessed_result = tokenizer(text,
                                        add_special_tokens = False,
                                        truncation = True,
                                        max_length = 512,
                                        padding = False,
                                        return_attention_mask = False)
        tokens = preprocessed_result["input_ids"]
        decoded_tokens = tokenizer.convert_ids_to_tokens(tokens)
        seen_ids.update(tokens)

        for i, (token_id, decoded) in enumerate(zip(tokens, decoded_tokens)):
            if decoded.startswith('##'):
                # A continuation makes both itself and the piece before it eligible.
                # Guard i == 0 so the index does not wrap around to the last token.
                if i > 0:
                    word_lengths[tokens[i - 1]] = True
                word_lengths[token_id] = True
            else:
                word_lengths[token_id] = len(decoded) >= l_min

    assert len(seen_ids) == len(word_lengths), "Association of tokens with word length : FAILED."

    return word_lengths

In [7]:
def training(set_tune, epochs = 10, device = DEVICE, model_name = 'bert-base-uncased', lr = 1e-4):
    """
    Fine-tunes a fresh masked-language model on a set of masked sequences.

    The whole tuning set is fed as a single batch, once per epoch, and the
    model is returned in eval mode so it can be used directly for scoring.

    NOTE(review): `labels` holds the full original sequence, so the loss is
    computed over every position and not only over the masked ones. Confirm
    this is the intended objective.

    Parameters:
    - set_tune (pd.DataFrame): One row per training example, with columns
      'masked_translation' (token ids with some replaced by the mask id) and
      'translation' (the original token ids). All rows must have the same length.
    - epochs (int): Number of full-batch optimisation steps (default is 10).
    - device: Device to train on (default is DEVICE).
    - model_name (str): Pretrained checkpoint to start from (default is 'bert-base-uncased').
    - lr (float): AdamW learning rate (default is 1e-4).

    Returns:
    - The fine-tuned model, in eval mode. If `set_tune` has no rows the
      pretrained model is returned unchanged.
    """
    model_tuned = BertForMaskedLM.from_pretrained(model_name).to(device)

    # Nothing to learn from: an empty batch cannot be passed through the model.
    if len(set_tune) == 0:
        return model_tuned.eval()

    inputs = torch.tensor(set_tune['masked_translation'].tolist(), dtype = torch.long).to(device)
    label = torch.tensor(set_tune['translation'].tolist(), dtype = torch.long).to(device)

    model_tuned.train()
    optimizer = torch.optim.AdamW(model_tuned.parameters(), lr=lr)

    for _ in range(epochs):
        optimizer.zero_grad()
        outputs = model_tuned(input_ids = inputs, labels = label)
        outputs.loss.backward()
        optimizer.step()

    # Leave train mode so dropout is disabled when the model is used for prediction.
    return model_tuned.eval()

In [8]:
def BLANC_tune(translation, text, model, tokenizer, p_mask = 0.15, N = 10, epochs = 10, device = DEVICE):
    """
    Calculates the BLANC-tune score of a translation with respect to a sentence.

    A fresh model is fine-tuned (see `training`) on randomly masked copies of
    the translation, and `BLANC_help_modified` then measures how much better
    that tuned model reconstructs the masked sentence than the base model.

    Only positions whose token id is flagged True in the module-level
    `word_lengths` dictionary are masked; ids missing from it are never masked.

    Parameters:
    - translation (List[str]): The tokenized translation used for fine-tuning.
    - text (List[str]): The tokenized sentence that is scored.
    - model: The base BERT-type model.
    - tokenizer: The tokenizer associated with the model used.
    - p_mask (float): Fraction of the translation's tokens masked per training example (default is 0.15).
    - N (int): Number of random passes over the eligible positions (default is 10).
    - epochs (int): Number of fine-tuning epochs (default is 10).
    - device: Device used for fine-tuning and scoring (default is DEVICE).

    Returns:
    - float: The BLANC score, or 0.0 when no training example could be built
      (no eligible position, or N <= 0), since without tuning there is no help to measure.
    """
    translation_ids = tokenizer.convert_tokens_to_ids(translation)

    # Mask at least one token per example. With N_mask == 0 the chunks below
    # would be empty and the eligible positions would never be used up.
    N_mask = max(1, int(len(translation_ids) * p_mask))

    # Positions that may be masked; this does not change between passes.
    eligible = [i for i, token_id in enumerate(translation_ids) if word_lengths.get(token_id, False)]

    records = []
    for _ in range(N):
        pos = list(eligible)
        random.shuffle(pos)
        # Split the shuffled positions into chunks of N_mask; each chunk yields one example.
        for start in range(0, len(pos), N_mask):
            masked_translation = list(translation_ids)
            for pos_to_mask in pos[start:start + N_mask]:
                masked_translation[pos_to_mask] = tokenizer.mask_token_id
            records.append({'masked_translation': masked_translation,
                            'translation': list(translation_ids)})

    if not records:
        return 0.0

    # Build the frame once instead of growing it row by row.
    set_tune = pd.DataFrame(records, columns = ['masked_translation', 'translation'])

    model_tuned = training(set_tune, epochs, device)
    score = BLANC_help_modified(text, model, model_tuned, tokenizer, device = device)

    return score

In [9]:
def save_results(results, filename):
    """
    Writes the results to a text file, one per line.

    Parameters:
    - results (Iterable): Values to store; each one is converted with str().
    - filename (str): Path of the file to write (overwritten if it exists).
    """
    with open(filename, 'w') as output:
        output.writelines(str(entry) + '\n' for entry in results)

## Datasets

In [10]:
# Load the train split of the English-French ('en-fr') configuration of the
# 'news_commentary' dataset; each row has an 'id' and a 'translation' field.
news_commentary_ds = load_dataset('news_commentary', 'en-fr', split='train')
news_commentary_ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/209479 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'translation'],
    num_rows: 209479
})

In [11]:
# Number of sentence pairs used in this experiment.
N_PAIRS = 300

# Slice the rows first so that only N_PAIRS examples are read, instead of
# materialising the whole 'translation' column and then discarding most of it.
parallel_df = pd.DataFrame(news_commentary_ds[:N_PAIRS]['translation'])
parallel_df

Unnamed: 0,en,fr
0,"$10,000 Gold?",L’or à 10.000 dollars l’once ?
1,SAN FRANCISCO – It has never been easy to have...,SAN FRANCISCO – Il n’a jamais été facile d’avo...
2,"Lately, with gold prices up more than 300% ove...","Et aujourd’hui, alors que le cours de l’or a a..."
3,"Just last December, fellow economists Martin F...","En décembre dernier, mes collègues économistes..."
4,Wouldn’t you know it?,Mais devinez ce qui s’est passé ?
...,...,...
295,Although Abdullah is usually referred to in th...,Bien qu'Abdallah soit généralement considéré à...
296,"The Sudairis, it seems, have apparently left t...",Ils semblent avoir laissé leur demi-frère se c...
297,For although Crown Prince Abdullah has his own...,Même si le prince héritier Abdallah bénéficie ...
298,The idea of normalizing relations with Israel ...,L'idée d'une normalisation des relations avec ...


## Model and Tokenizer

In [12]:
%%capture
# Base tokenizer and masked-language model shared by every scoring call below.
# `training` loads its own copy of the same checkpoint for fine-tuning.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
model = BertForMaskedLM.from_pretrained('bert-base-uncased').to(DEVICE)

## Preprocessing

In [13]:
# Token id -> "may be masked" lookup built from the French side of the pairs.
# BLANC_tune reads this module-level name, so this cell must run before it is called.
word_lengths = get_word_lengths(parallel_df, tokenizer)
len(word_lengths)

2098

In [14]:
# Word-piece tokenize both sides of every pair; the two lists stay aligned by row.
sentences = list(map(tokenizer.tokenize, parallel_df['en']))  # (List[List[str]])

translations = list(map(tokenizer.tokenize, parallel_df['fr']))  # (List[List[str]])

## Running the Program

In [19]:
# Debug aid: show pair 45, on which the scoring loop was observed to get stuck
# and not proceed (cause not yet identified).
print(sentences[45])
print(translations[45])

['what', 'was', 'true', 'for', 'the', 'al', '##chemist', '##s', 'of', 'yo', '##re', 'remains', 'true', 'today', ':', 'gold', 'and', 'reason', 'are', 'often', 'difficult', 'to', 'reconcile', '.']
['ce', 'qui', 'eta', '##it', 'vr', '##ai', 'pour', 'les', 'al', '##chi', '##mist', '##es', 'd', '’', 'ant', '##an', 'rest', '##e', 'vr', '##ai', 'au', '##jou', '##rd', '’', 'hui', ':', 'l', '’', 'or', 'et', 'la', 'rai', '##son', 'son', '##t', 'par', '##fo', '##is', 'di', '##ffi', '##ci', '##les', 'a', 'con', '##ci', '##lier', '.']


In [16]:
# Quick check on a single pair before launching the full run.
BLANC_tune(translations[1], sentences[1], model, tokenizer)

0.0

In [17]:
# Score every pair. The scores are appended one by one so that, if this long
# run is interrupted, the results computed so far are still available in `scores`.
scores = []
for translation, sentence in tqdm(zip(translations, sentences), total=len(sentences)):
    scores.append(BLANC_tune(translation, sentence, model, tokenizer))
scores

  0%|          | 0/300 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Persist the scores, one per line, in the same order as the sentence pairs.
filename = 'unbatched_BLANCtune_translation.txt'
save_results(scores, filename)