## Libraries and Dependencies

In [1]:
import pandas as pd
import torch
import nltk
from datasets import load_dataset

from transformers import BertForMaskedLM, BertTokenizer
from transformers import AlbertForMaskedLM, AlbertTokenizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Liora\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# DEVICE = 'cpu'
DEVICE

device(type='cuda')

## Algorithm Implementation

In [3]:
def mask_sentence(sentence, mask_token, i, M, L_min):
    return [mask_token
            if (j - i) % M == 0
            and (len(sentence[j]) >= L_min
                 or sentence[j].startswith('##')
                 or sentence[min(j+1, len(sentence)-1)].startswith('##'))
            else sentence[j]
            for j in range(len(sentence))]

In [4]:
def BLANC_help(sentence, translation, model, tokenizer, M=6, L_min=4, sep='[SEP]', device = DEVICE):
    """
    Calculates BLANC score between a given sentence and its translation using a specified model.

    Parameters:
    - sentence (List[str]): A tokenized sentence.
    - translation (List[str]): The tokenized translation.
    - model: BERT-type model
    - tokenizer: The tokenizer associated with the model used.
    - M (int): Parameter M for the algorithm (default is 6).
    - L_min (int): Minimum length requirement for masked words (default is 4).
    - sep (str): Separator between the inference help (filler/summary) and a sentence from the text (default is '[SEP]').

    Returns:
    - float: BLANC score for the given sentence and its translation.
    """

    filler = ['.'] * len(translation)
    S = [[0, 0], [0, 0]]

    for i in range(M):
        masked_sentence = mask_sentence(sentence, tokenizer.mask_token, i, M, L_min)
        print(masked_sentence)

        input_base = filler + [sep] + masked_sentence
        input_help = translation + [sep] + masked_sentence

        tokenized_input_base = torch.tensor(tokenizer.convert_tokens_to_ids(input_base)).to(device) # Shape: [sequence_length]
        tokenized_input_help = torch.tensor(tokenizer.convert_tokens_to_ids(input_help)).to(device) # Shape: [sequence_length]

        out_base = model(input_ids=tokenized_input_base.unsqueeze(0)).logits  # Shape: [1, sequence_length, model_vocab_size]
        out_help = model(input_ids=tokenized_input_help.unsqueeze(0)).logits  # Shape: [1, sequence_length, model_vocab_size]

        out_base = torch.argmax(out_base.squeeze(0), dim=-1)  # Shape: [sequence_length]
        out_help = torch.argmax(out_help.squeeze(0), dim=-1)  # Shape: [sequence_length]

        masked_tokens = [idx for idx, word in enumerate(masked_sentence) if word == tokenizer.mask_token]

        for j in masked_tokens:
            idx = len(translation + [sep]) + j
            predicted_word_base = tokenizer.convert_ids_to_tokens(out_base[idx].item())
            predicted_word_help = tokenizer.convert_ids_to_tokens(out_help[idx].item())

            print(f'predicted_word_base[{idx - len(translation + [sep])}]: {predicted_word_base}')
            print(f'predicted_word_help[{idx - len(translation + [sep])}]: {predicted_word_help}')
            print(f'sentence[{j}]: {sentence[j]}')

            k = int(predicted_word_base == sentence[j])
            m = int(predicted_word_help == sentence[j])
            S[k][m] += 1


    B = (S[0][1] - S[1][0]) / (S[0][0] + S[1][1] + S[0][1] + S[1][0])

    return B

## Datasets

In [5]:
en_fr_ds = load_dataset('news_commentary', 'en-fr', split='train')
en_fr_ds

Dataset({
    features: ['id', 'translation'],
    num_rows: 209479
})

In [6]:
en_fr_df = pd.DataFrame(en_fr_ds['translation'][:300])
en_fr_df

Unnamed: 0,en,fr
0,"$10,000 Gold?",L’or à 10.000 dollars l’once ?
1,SAN FRANCISCO – It has never been easy to have...,SAN FRANCISCO – Il n’a jamais été facile d’avo...
2,"Lately, with gold prices up more than 300% ove...","Et aujourd’hui, alors que le cours de l’or a a..."
3,"Just last December, fellow economists Martin F...","En décembre dernier, mes collègues économistes..."
4,Wouldn’t you know it?,Mais devinez ce qui s’est passé ?
...,...,...
295,Although Abdullah is usually referred to in th...,Bien qu'Abdallah soit généralement considéré à...
296,"The Sudairis, it seems, have apparently left t...",Ils semblent avoir laissé leur demi-frère se c...
297,For although Crown Prince Abdullah has his own...,Même si le prince héritier Abdallah bénéficie ...
298,The idea of normalizing relations with Israel ...,L'idée d'une normalisation des relations avec ...


## Model and Tokenizer

In [7]:
mbert_model = BertForMaskedLM.from_pretrained('bert-base-multilingual-uncased').to(DEVICE)
mbert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Preprocessing

In [17]:
en_translations = [mbert_tokenizer.tokenize(sentence)
             for sentence in en_fr_df['en']]  # (List[List[str]])

fr_sentences = [mbert_tokenizer.tokenize(translation)
                for translation in en_fr_df['fr']] # (List[List[str]])

In [10]:
# import random
# random.seed(42)

# def random_words_translation(translation):
#     random_translation = ""
#     translation = translation.split()
#     for _ in range(len(translation)):
#         random_translation += random.choice(translation) + ' '
#     return random_translation

# random_translations = [random_words_translation(translation) for translation in en_fr_df['fr'].unique()]
# tokenized_random_translations = [tokenizer.tokenize(translation) for translation in random_translations]


## Running the Program

In [18]:
BLANC_help(fr_sentences[90], en_translations[90], mbert_model, mbert_tokenizer)

['il', 's', '[UNK]', 'agir', '##a', 'd', '[MASK]', 'une', 'caracteristique', 'du', 'consensus', 'de', '[MASK]', '.']
predicted_word_base[6]: '
predicted_word_help[6]: '
sentence[6]: [UNK]
predicted_word_base[12]: .
predicted_word_help[12]: paris
sentence[12]: 1945
['il', 's', '[UNK]', 'agir', '##a', 'd', '[UNK]', 'une', 'caracteristique', 'du', 'consensus', 'de', '1945', '.']
['il', 's', '[MASK]', 'agir', '##a', 'd', '[UNK]', 'une', '[MASK]', 'du', 'consensus', 'de', '1945', '.']
predicted_word_base[2]: '
predicted_word_help[2]: '
sentence[2]: [UNK]
predicted_word_base[8]: modification
predicted_word_help[8]: partie
sentence[8]: caracteristique
['il', 's', '[UNK]', '[MASK]', '##a', 'd', '[UNK]', 'une', 'caracteristique', 'du', 'consensus', 'de', '1945', '.']
predicted_word_base[3]: agir
predicted_word_help[3]: servir
sentence[3]: agir
['il', 's', '[UNK]', 'agir', '[MASK]', 'd', '[UNK]', 'une', 'caracteristique', 'du', '[MASK]', 'de', '1945', '.']
predicted_word_base[4]: ##ait
predicted

-0.14285714285714285

In [19]:
def study_results(index = 253, model = mbert_model, tokenizer = mbert_tokenizer, translations = en_translations, sentences = fr_sentences):
    print("sentence : ", en_fr_df['fr'].iloc[index])
    print("translation : ", en_fr_df['en'].iloc[index])
    print("tokenization of sentence : ",sentences[index])
    print("tokenization of trans. : ", translations[index])
    print('\n')
    print(BLANC_help(sentences[index], translations[index], model, tokenizer))

In [11]:
%%time
en_fr_scores = [BLANC_help(sentence, translation, mbert_model, mbert_tokenizer)
          for sentence, translation in zip(fr_sentences, en_translations)]

['l', '[UNK]', 'or', 'a', '10', '.', '000', 'dollars', 'l', '[UNK]', 'once', '?']
['l', '[MASK]', 'or', 'a', '10', '.', '000', '[MASK]', 'l', '[UNK]', 'once', '?']
predicted_word_base[1]: [UNK]
predicted_word_help[1]: [UNK]
sentence[1]: [UNK]
predicted_word_base[7]: -
predicted_word_help[7]: .
sentence[7]: dollars
['l', '[UNK]', 'or', 'a', '10', '.', '000', 'dollars', 'l', '[UNK]', 'once', '?']
['l', '[UNK]', 'or', 'a', '10', '.', '000', 'dollars', 'l', '[MASK]', 'once', '?']
predicted_word_base[9]: [UNK]
predicted_word_help[9]: [UNK]
sentence[9]: [UNK]
['l', '[UNK]', 'or', 'a', '10', '.', '000', 'dollars', 'l', '[UNK]', '[MASK]', '?']
predicted_word_base[10]: .
predicted_word_help[10]: gold
sentence[10]: once
['l', '[UNK]', 'or', 'a', '10', '.', '000', 'dollars', 'l', '[UNK]', 'once', '?']
['san', 'francisco', '[UNK]', 'il', 'n', '[UNK]', 'a', 'jamais', 'ete', 'facile', 'd', '[UNK]', '[MASK]', 'une', 'discussion', 'ratio', '##nne', '##lle', 'sur', 'la', 'valeur', 'du', 'metal', 'jaune

In [20]:
for i in range(len(en_fr_scores)):
    print(f"{i} : {en_fr_scores[i]}")

0 : 0.0
1 : 0.35714285714285715
2 : 0.125
3 : 0.21621621621621623
4 : -0.2
5 : 0.09523809523809523
6 : 0.0
7 : 0.05555555555555555
8 : 0.0
9 : 0.0
10 : 0.0
11 : 0.0
12 : 0.07692307692307693
13 : 0.23076923076923078
14 : 0.10526315789473684
15 : 0.0
16 : 0.2857142857142857
17 : 0.10256410256410256
18 : 0.0
19 : 0.0
20 : 0.13043478260869565
21 : 0.1836734693877551
22 : 0.17647058823529413
23 : 0.0
24 : 0.02857142857142857
25 : 0.13043478260869565
26 : 0.25
27 : 0.08
28 : 0.09523809523809523
29 : 0.04
30 : 0.0
31 : 0.07407407407407407
32 : 0.05263157894736842
33 : 0.0
34 : 0.0
35 : 0.0
36 : 0.08333333333333333
37 : 0.0
38 : 0.0
39 : 0.0
40 : 0.08333333333333333
41 : 0.0
42 : 0.0
43 : 0.0
44 : 0.16666666666666666
45 : 0.13043478260869565
46 : 0.0
47 : 0.16666666666666666
48 : 0.10344827586206896
49 : 0.07692307692307693
50 : 0.15384615384615385
51 : 0.0
52 : 0.26666666666666666
53 : 0.18181818181818182
54 : 0.029411764705882353
55 : 0.10526315789473684
56 : 0.13333333333333333
57 : 0.16129

In [21]:
# good results 1
study_results(1, mbert_model, mbert_tokenizer)

# translation that exact translation "ex: "could" and "pouvait") can predict better but this is cheating
# idea of validation : switch words inside sentence + see if a more literal but less idiomatic translation has better results or not.

sentence :  SAN FRANCISCO – Il n’a jamais été facile d’avoir une discussion rationnelle sur la valeur du métal jaune.
translation :  SAN FRANCISCO – It has never been easy to have a rational conversation about the value of gold.
tokenization of sentence :  ['san', 'francisco', '[UNK]', 'il', 'n', '[UNK]', 'a', 'jamais', 'ete', 'facile', 'd', '[UNK]', 'avoir', 'une', 'discussion', 'ratio', '##nne', '##lle', 'sur', 'la', 'valeur', 'du', 'metal', 'jaune', '.']
tokenization of trans. :  ['san', 'francisco', '[UNK]', 'it', 'has', 'never', 'been', 'easy', 'to', 'have', 'a', 'rational', 'conversation', 'about', 'the', 'value', 'of', 'gold', '.']


['san', 'francisco', '[UNK]', 'il', 'n', '[UNK]', 'a', 'jamais', 'ete', 'facile', 'd', '[UNK]', '[MASK]', 'une', 'discussion', 'ratio', '##nne', '##lle', 'sur', 'la', 'valeur', 'du', 'metal', 'jaune', '.']
predicted_word_base[12]: avoir
predicted_word_help[12]: avoir
sentence[12]: avoir
['san', '[MASK]', '[UNK]', 'il', 'n', '[UNK]', 'a', '[MASK]', '

In [22]:
# good results 2
study_results(257, mbert_model, mbert_tokenizer)

# same thing

sentence :  En tant qu'ambassadeur auprès des Etats-Unis et de l'ONU, puis comme Ministre des affaires étrangères, il représenta une Israël à laquelle l'imagination libérale du monde entier pouvait s'identifier.
translation :  As Ambassador to the United States and the UN, and later as Foreign Minister, he represented an Israel with which the world's liberal imagination could identify.
tokenization of sentence :  ['en', 'tant', 'qu', "'", 'ambassadeur', 'aupres', 'des', 'etats', '-', 'unis', 'et', 'de', 'l', "'", 'onu', ',', 'puis', 'comme', 'ministre', 'des', 'affaires', 'etrangeres', ',', 'il', 'representa', 'une', 'israel', 'a', 'laquelle', 'l', "'", 'imagination', 'liberale', 'du', 'monde', 'entier', 'pouvait', 's', "'", 'identifier', '.']
tokenization of trans. :  ['as', 'ambassador', 'to', 'the', 'united', 'states', 'and', 'the', 'un', ',', 'and', 'later', 'as', 'foreign', 'minister', ',', 'he', 'represented', 'an', 'israel', 'with', 'which', 'the', 'world', "'", 's', 'liberal', 

In [23]:
# good results 3
study_results(286, mbert_model, mbert_tokenizer)

# same thing. sometimes it does rely on the french context ("terme") but it comes from bert itself, not the method ?

sentence :  Il était, dans le vrai sens du terme, un " self-made man ", dont le courage, l'ambition, le dynamisme et une foi profonde dans son destin ont mené au sommet.
translation :  He was, in the true sense of the word, a self-made man, whose pluck, ambition, drive and inner belief in his destiny carried him to the pinnacle of his achievements.
tokenization of sentence :  ['il', 'etait', ',', 'dans', 'le', 'vrai', 'sens', 'du', 'terme', ',', 'un', '"', 'self', '-', 'made', 'man', '"', ',', 'dont', 'le', 'courage', ',', 'l', "'", 'ambition', ',', 'le', 'dy', '##nami', '##sme', 'et', 'une', 'foi', 'profonde', 'dans', 'son', 'destin', 'ont', 'mene', 'au', 'sommet', '.']
tokenization of trans. :  ['he', 'was', ',', 'in', 'the', 'true', 'sense', 'of', 'the', 'word', ',', 'a', 'self', '-', 'made', 'man', ',', 'whose', 'plu', '##ck', ',', 'ambition', ',', 'drive', 'and', 'inner', 'belief', 'in', 'his', 'destiny', 'carried', 'him', 'to', 'the', 'pin', '##nac', '##le', 'of', 'his', 'achieve

In [24]:
# perfect score
study_results(275, mbert_model, mbert_tokenizer)

# too small

sentence :  Il ne l'a jamais fait.
translation :  He never did.
tokenization of sentence :  ['il', 'ne', 'l', "'", 'a', 'jamais', 'fait', '.']
tokenization of trans. :  ['he', 'never', 'did', '.']


['il', 'ne', 'l', "'", 'a', 'jamais', '[MASK]', '.']
predicted_word_base[6]: vu
predicted_word_help[6]: fait
sentence[6]: fait
['il', 'ne', 'l', "'", 'a', 'jamais', 'fait', '.']
['il', 'ne', 'l', "'", 'a', 'jamais', 'fait', '.']
['il', 'ne', 'l', "'", 'a', 'jamais', 'fait', '.']
['il', 'ne', 'l', "'", 'a', 'jamais', 'fait', '.']
['il', 'ne', 'l', "'", 'a', '[MASK]', 'fait', '.']
predicted_word_base[5]: pas
predicted_word_help[5]: jamais
sentence[5]: jamais
1.0


In [25]:
# significantly worst score 1
study_results(4, mbert_model, mbert_tokenizer)

# ?

sentence :  Mais devinez ce qui s’est passé ?
translation :  Wouldn’t you know it?
tokenization of sentence :  ['mais', 'devine', '##z', 'ce', 'qui', 's', '[UNK]', 'est', 'passe', '?']
tokenization of trans. :  ['would', '##n', '[UNK]', 't', 'you', 'know', 'it', '?']


['[MASK]', 'devine', '##z', 'ce', 'qui', 's', '[MASK]', 'est', 'passe', '?']
predicted_word_base[0]: vous
predicted_word_help[0]: vous
sentence[0]: mais
predicted_word_base[6]: '
predicted_word_help[6]: '
sentence[6]: [UNK]
['mais', '[MASK]', '##z', 'ce', 'qui', 's', '[UNK]', 'est', 'passe', '?']
predicted_word_base[1]: pense
predicted_word_help[1]: voye
sentence[1]: devine
['mais', 'devine', '[MASK]', 'ce', 'qui', 's', '[UNK]', 'est', '[MASK]', '?']
predicted_word_base[2]: ##z
predicted_word_help[2]: ##z
sentence[2]: ##z
predicted_word_base[8]: passe
predicted_word_help[8]: dit
sentence[8]: passe
['mais', 'devine', '##z', 'ce', 'qui', 's', '[UNK]', 'est', 'passe', '?']
['mais', 'devine', '##z', 'ce', 'qui', 's', '[UNK]'

In [26]:
# significantly worst score 2
study_results(90, mbert_model, mbert_tokenizer)

# ?

sentence :  Il s’agira d’une caractéristique du consensus de 1945.
translation :  It was part of the 1945 consensus.
tokenization of sentence :  ['il', 's', '[UNK]', 'agir', '##a', 'd', '[UNK]', 'une', 'caracteristique', 'du', 'consensus', 'de', '1945', '.']
tokenization of trans. :  ['it', 'was', 'part', 'of', 'the', '1945', 'consensus', '.']


['il', 's', '[UNK]', 'agir', '##a', 'd', '[MASK]', 'une', 'caracteristique', 'du', 'consensus', 'de', '[MASK]', '.']
predicted_word_base[6]: '
predicted_word_help[6]: '
sentence[6]: [UNK]
predicted_word_base[12]: .
predicted_word_help[12]: paris
sentence[12]: 1945
['il', 's', '[UNK]', 'agir', '##a', 'd', '[UNK]', 'une', 'caracteristique', 'du', 'consensus', 'de', '1945', '.']
['il', 's', '[MASK]', 'agir', '##a', 'd', '[UNK]', 'une', '[MASK]', 'du', 'consensus', 'de', '1945', '.']
predicted_word_base[2]: '
predicted_word_help[2]: '
sentence[2]: [UNK]
predicted_word_base[8]: modification
predicted_word_help[8]: partie
sentence[8]: caracteristique

In [27]:
# significantly worst score 3
study_results(166, mbert_model, mbert_tokenizer)

# ?

sentence :  Elle entend détruire, sans rien créer.
translation :  They can destroy but not create.
tokenization of sentence :  ['elle', 'enten', '##d', 'det', '##ruire', ',', 'sans', 'rien', 'creer', '.']
tokenization of trans. :  ['they', 'can', 'destroy', 'but', 'not', 'create', '.']


['[MASK]', 'enten', '##d', 'det', '##ruire', ',', '[MASK]', 'rien', 'creer', '.']
predicted_word_base[0]: je
predicted_word_help[0]: je
sentence[0]: elle
predicted_word_base[6]: sans
predicted_word_help[6]: et
sentence[6]: sans
['elle', '[MASK]', '##d', 'det', '##ruire', ',', 'sans', '[MASK]', 'creer', '.']
predicted_word_base[1]: enten
predicted_word_help[1]: enten
sentence[1]: enten
predicted_word_base[7]: le
predicted_word_help[7]: jamais
sentence[7]: rien
['elle', 'enten', '[MASK]', 'det', '##ruire', ',', 'sans', 'rien', '[MASK]', '.']
predicted_word_base[2]: ##d
predicted_word_help[2]: ##d
sentence[2]: ##d
predicted_word_base[8]: .
predicted_word_help[8]: faire
sentence[8]: creer
['elle', 'enten',

In [28]:
# test 
trans = mbert_tokenizer.tokenize("Elle entend détruire, sans rien créer.")
sent = mbert_tokenizer.tokenize("They can destroy but not create.")

print(BLANC_help(trans, sent, mbert_model, mbert_tokenizer))
print('\n')

trans = mbert_tokenizer.tokenize("Elle entend détruire, sans rien créer.")
sent = mbert_tokenizer.tokenize("She wants destruction, without creation.")

print(BLANC_help(trans, sent, mbert_model, mbert_tokenizer))
print('\n')

trans = mbert_tokenizer.tokenize("Elle entend détruire, sans rien créer.")
sent = mbert_tokenizer.tokenize("She wants destruction, without creating anything.")

print(BLANC_help(trans, sent, mbert_model, mbert_tokenizer))
print('\n')

trans = mbert_tokenizer.tokenize("Elle entend détruire, sans rien créer.")
sent = mbert_tokenizer.tokenize("She intends to destroy, without creation.")

print(BLANC_help(trans, sent, mbert_model, mbert_tokenizer))
print('\n')

trans = mbert_tokenizer.tokenize("Elle entend détruire, sans rien créer.")
sent = mbert_tokenizer.tokenize("She intends to destroy, without creating anything.")

print(BLANC_help(trans, sent, mbert_model, mbert_tokenizer))
print('\n')

['[MASK]', 'enten', '##d', 'det', '##ruire', ',', '[MASK]', 'rien', 'creer', '.']
predicted_word_base[0]: je
predicted_word_help[0]: je
sentence[0]: elle
predicted_word_base[6]: sans
predicted_word_help[6]: et
sentence[6]: sans
['elle', '[MASK]', '##d', 'det', '##ruire', ',', 'sans', '[MASK]', 'creer', '.']
predicted_word_base[1]: enten
predicted_word_help[1]: enten
sentence[1]: enten
predicted_word_base[7]: le
predicted_word_help[7]: jamais
sentence[7]: rien
['elle', 'enten', '[MASK]', 'det', '##ruire', ',', 'sans', 'rien', '[MASK]', '.']
predicted_word_base[2]: ##d
predicted_word_help[2]: ##d
sentence[2]: ##d
predicted_word_base[8]: .
predicted_word_help[8]: faire
sentence[8]: creer
['elle', 'enten', '##d', '[MASK]', '##ruire', ',', 'sans', 'rien', 'creer', '.']
predicted_word_base[3]: det
predicted_word_help[3]: det
sentence[3]: det
['elle', 'enten', '##d', 'det', '[MASK]', ',', 'sans', 'rien', 'creer', '.']
predicted_word_base[4]: ##ruire
predicted_word_help[4]: ##ruire
sentence[4]

In [31]:
# English - Persian (Farsi)

en_fa_ds = load_dataset('persiannlp/parsinlu_translation_en_fa', split='train')

# Removing the 'category' column
en_fa_ds = en_fa_ds.remove_columns(['category'])

# Removing list encapsulation
en_fa_ds = en_fa_ds.map(lambda example: {'targets': example['targets'][0]}, num_proc=4)

# Filtering out rows with the '\u200c' symbol and those where the length of either source or targets is less than a threshold
length_threshold = 10
filtered_en_fa_ds = en_fa_ds.filter(
    lambda example: '\u200c' not in example['targets']
    and len(example['source']) >= length_threshold
    and len(example['targets']) >= length_threshold,
    num_proc=4)

en_fa_df = pd.DataFrame(filtered_en_fa_ds[:300])
en_fa_df

Filter (num_proc=4):   0%|          | 0/1621665 [00:00<?, ? examples/s]

NameError: name 'length_threshold' is not defined

In [None]:
# English - Persian (Farsi)
# change names here (to be more clear on what is translations what is sentences)
en_fa_sentences = [mbert_tokenizer.tokenize(sentence)
                   for sentence in en_fa_df['source']]  # (List[List[str]])

en_fa_translations = [mbert_tokenizer.tokenize(translation)
                      for translation in en_fa_df['targets']] # (List[List[str]])

In [None]:
BLANC_help(en_fa_translations[1], en_fa_sentences[1], mbert_model, mbert_tokenizer, device=DEVICE)

In [None]:
%%time
en_fa_scores = [BLANC_help(translation, sentence, mbert_model, mbert_tokenizer, device=DEVICE)
                for translation, sentence in tqdm(zip(en_fa_translations, en_fa_sentences), total=len(en_fa_sentences))]

In [None]:
for i in range(len(en_fa_scores)):
    print(f"{i} : {en_fa_scores[i]}")

In [14]:
data = {}
data['BLANC_help_300_translation'] = en_fr_scores

import json

def add_results_to_json(new_data, file_path = "./results.json"):
    try:
        with open(file_path, 'r') as json_file:
            existing_data = json.load(json_file)
    except FileNotFoundError:
        existing_data = {}

    for key, value in new_data.items():
        existing_data[key] = value

    with open(file_path, 'w') as json_file:
        json.dump(existing_data, json_file, indent=2)

    print(f"Data has been added to {file_path}")

add_results_to_json(data)

Data has been added to ./results.json


In [None]:
import json

def change_key_name(json_file_path, old_key, new_key):
    try:
        # Load existing data from the JSON file
        with open(json_file_path, 'r') as json_file:
            existing_data = json.load(json_file)
    except FileNotFoundError:
        print(f"The file {json_file_path} does not exist.")
        return

    # Check if the old key exists in the data
    if old_key in existing_data:
        # Create a new key with the desired name
        existing_data[new_key] = existing_data.pop(old_key)

        # Write the updated data back to the JSON file
        with open(json_file_path, 'w') as json_file:
            json.dump(existing_data, json_file, indent=2)

        print(f"The key '{old_key}' has been changed to '{new_key}' in {json_file_path}")
    else:
        print(f"The key '{old_key}' does not exist in the data.")

# Example usage:
json_file_path = "./results.json"
old_key_name = "BLANC_help_unbatched_300"
new_key_name = "BLANC_help_300"

change_key_name(json_file_path, old_key_name, new_key_name)


In [51]:
import json

def remove_key_from_json(file_path, key_to_remove):
    try:
        # Load existing data from the JSON file
        with open(file_path, 'r') as json_file:
            existing_data = json.load(json_file)
    except FileNotFoundError:
        print(f"The file {file_path} does not exist.")
        return

    # Check if the key exists in the data
    if key_to_remove in existing_data:
        # Remove the key
        del existing_data[key_to_remove]

        # Write the updated data back to the JSON file
        with open(file_path, 'w') as json_file:
            json.dump(existing_data, json_file, indent=2)

        print(f"The key '{key_to_remove}' has been removed from {file_path}")
    else:
        print(f"The key '{key_to_remove}' does not exist in the data.")

# Example usage:
json_file_path = "./results.json"
key_to_remove = "BLANC_help_300_translation"

remove_key_from_json(json_file_path, key_to_remove)


The key 'BLANC_help_300_translation' has been removed from ./results.json
