In [1]:
import torch
from nltk.tokenize import sent_tokenize
from datasets import load_dataset
from tqdm.notebook import tqdm
import pandas as pd

# The models the authors used:
from transformers import BertForMaskedLM, BertTokenizer

from blanc import BLANC_tune_summary, BLANC_tune_translation, add_results_to_json

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Liora\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# English uncased BERT: passed to BLANC_tune_summary in the summary cells.
bert_checkpoint = "bert-base-uncased"
bert_model = BertForMaskedLM.from_pretrained(bert_checkpoint)
bert_model = bert_model.to(DEVICE)
bert_tokenizer = BertTokenizer.from_pretrained(
    bert_checkpoint,
    do_lower_case=True,
)

# Multilingual uncased BERT: passed to BLANC_tune_translation in the
# translation cells.
mbert_checkpoint = "bert-base-multilingual-uncased"
mbert_model = BertForMaskedLM.from_pretrained(mbert_checkpoint)
mbert_model = mbert_model.to(DEVICE)
mbert_tokenizer = BertTokenizer.from_pretrained(
    mbert_checkpoint,
    do_lower_case=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint

# BLANC tune for **summaries**

In [4]:
""" Datasets """

# Read the local DailyNews JSON file; requesting the "train" split returns a
# single Dataset (columns: text, annotators_ids, summary, scores).
DailyNews_ds = load_dataset(
    "json",
    data_files="../datasets/DailyNews_300.json",
    split="train",
)
DailyNews_ds  # bare last expression so the notebook renders the dataset summary

Dataset({
    features: ['text', 'annotators_ids', 'summary', 'scores'],
    num_rows: 300
})

In [5]:
""" Preprocessing """

# One summary string per dataset row.
summaries = DailyNews_ds["summary"]  # List[str]

# Every raw text is a paragraph made of a few sentences: trim the surrounding
# whitespace, then split it into its sentences.
texts = [
    sent_tokenize(paragraph.strip()) for paragraph in DailyNews_ds["text"]
]  # List[List[str]]

# Sanity check: both columns hold exactly the 300 expected rows.
assert len(texts) == 300 and len(summaries) == 300

# Tokenize sentence by sentence, keeping the per-document nesting.
tokenized_texts = [
    list(map(bert_tokenizer.tokenize, sentences)) for sentences in texts
]  # List[List[List[str]]]

# A summary is tokenized as one flat token list.
tokenized_summaries = list(map(bert_tokenizer.tokenize, summaries))  # List[List[str]]

In [7]:
# Spot check on a single (text, summary) pair -- row 198 -- ahead of the full run.
BLANC_tune_summary(
    tokenized_texts[198],
    tokenized_summaries[198],
    bert_checkpoint,
    bert_model,
    bert_tokenizer,
    device=DEVICE,
)

hola
[]
coucou


0.0

In [None]:
""" Running the Program """

# Score every (text, summary) pair. zip() has no length, so the progress bar
# is given the number of pairs explicitly.
tune_summary_scores = [
    BLANC_tune_summary(
        text_tokens,
        summary_tokens,
        bert_checkpoint,
        bert_model,
        bert_tokenizer,
        device=DEVICE,
    )
    for text_tokens, summary_tokens in tqdm(
        zip(tokenized_texts, tokenized_summaries),
        total=len(tokenized_texts),
    )
]

# Saving the results under the metric's name
summary_data = {"BLANC_tune_summary": tune_summary_scores}
add_results_to_json(summary_data)

# BLANC tune for **translations**

In [3]:
""" Datasets """

# English - French: local CSV, with its first column used as the row index
en_fr_df = pd.read_csv(
    "../datasets/en_to_fr_translations.csv",
    index_col=0,
)

# English - Persian (Farsi): "train" split of the ParsiNLU translation dataset
en_fa_ds = load_dataset(
    "persiannlp/parsinlu_translation_en_fa",
    split="train",
)

In [4]:
""" Preprocessing (English - French)"""

# Tokenization
en_fr_sentences = [
    mbert_tokenizer.tokenize(sentence) for sentence in en_fr_df["en"]
]  # (List[List[str]])

en_fr_translations = [
    mbert_tokenizer.tokenize(translation) for translation in en_fr_df["fr"]
]  # (List[List[str]])


""" Preprocessing (English - Persian (Farsi)) """

# Removing the 'category' column
en_fa_ds = en_fa_ds.remove_columns(["category"])

# Removing list encapsulation
en_fa_ds = en_fa_ds.map(lambda example: {"targets": example["targets"][0]}, num_proc=4)

# Filtering out:
# - rows with the '\u200c' symbol,
# - those where the length of either source or targets is less than a threshold
# - Headlines (ending in 'Global Voices') --> because they are very short and the 'Global Voices' part is never translated
length_threshold = 30
filtered_en_fa_ds = en_fa_ds.filter(
    lambda example: "\u200c" not in example["targets"]
    and len(example["source"]) >= 30
    and len(example["targets"]) >= 30
    and "Global Voices" not in example["source"],
    num_proc=4,
)

en_fa_ds = (
    filtered_en_fa_ds.rename_column("source", "sentence")
    .rename_column("targets", "translation")
    .select(range(300))
)

# Tokenization
en_fa_sentences = [
    mbert_tokenizer.tokenize(sentence) for sentence in en_fa_ds["sentence"]
]  # (List[List[str]])

en_fa_translations = [
    mbert_tokenizer.tokenize(translation) for translation in en_fa_ds["translation"]
]  # (List[List[str]])

In [5]:
# Score every tokenized English sentence against its French translation.
# NOTE(review): the same computation is repeated in the later
# "Running the Program (English - French)" cell, which overwrites
# `tune_en_fr_scores`; one of the two cells is probably redundant.
tune_en_fr_scores = [
    BLANC_tune_translation(
        source_tokens, target_tokens, mbert_checkpoint, mbert_model, mbert_tokenizer, device=DEVICE
    )
    for source_tokens, target_tokens in tqdm(
        zip(en_fr_sentences, en_fr_translations),
        total=len(en_fr_sentences),
    )
]

  0%|          | 0/100 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 28.5355, 'train_samples_per_second': 4.205, 'train_steps_per_second': 0.526, 'train_loss': 0.2070212682088216, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 33.6566, 'train_samples_per_second': 3.565, 'train_steps_per_second': 0.446, 'train_loss': 0.07531503041585287, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 29.7658, 'train_samples_per_second': 4.031, 'train_steps_per_second': 0.504, 'train_loss': 0.18761533101399738, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 28.5324, 'train_samples_per_second': 4.206, 'train_steps_per_second': 0.526, 'train_loss': 0.1906865437825521, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

In [9]:
""" Running the Program (English - French)"""

# Score every tokenized English sentence against its French translation.
# zip() has no length, so the progress bar is given the pair count explicitly.
tune_en_fr_scores = [
    BLANC_tune_translation(
        sentences,
        translations,
        mbert_checkpoint,
        mbert_model,
        mbert_tokenizer,
        device=DEVICE,
    )
    for sentences, translations in tqdm(
        zip(en_fr_sentences, en_fr_translations), total=len(en_fr_sentences)
    )
]

# Saving the results.
# These are the English - French scores, so they are stored under the en_fr
# key; an en_fa key would mislabel them as English - Persian results.
en_fr_data = {"BLANC_tune_en_fr_translation": tune_en_fr_scores}
add_results_to_json(en_fr_data)

  0%|          | 0/300 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 6.6442, 'train_samples_per_second': 18.061, 'train_steps_per_second': 2.258, 'train_loss': 0.3151547114054362, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 11.1789, 'train_samples_per_second': 13.418, 'train_steps_per_second': 1.879, 'train_loss': 0.2723471550714402, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 8.312, 'train_samples_per_second': 14.437, 'train_steps_per_second': 1.805, 'train_loss': 0.17388927141825358, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 22.5168, 'train_samples_per_second': 7.994, 'train_steps_per_second': 1.066, 'train_loss': 0.1409877041975657, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 8.8716, 'train_samples_per_second': 16.908, 'train_steps_per_second': 2.367, 'train_loss': 0.37844603402273996, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 12.343, 'train_samples_per_second': 12.153, 'train_steps_per_second': 1.701, 'train_loss': 0.15031486465817406, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 12.2658, 'train_samples_per_second': 14.675, 'train_steps_per_second': 1.957, 'train_loss': 0.1444383164246877, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 12.4953, 'train_samples_per_second': 14.405, 'train_steps_per_second': 1.921, 'train_loss': 0.15856751799583435, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 12.8282, 'train_samples_per_second': 11.693, 'train_steps_per_second': 1.637, 'train_loss': 0.11682318505786714, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/12 [00:00<?, ?it/s]

{'train_runtime': 5.7778, 'train_samples_per_second': 15.577, 'train_steps_per_second': 2.077, 'train_loss': 0.2608707348505656, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 8.639, 'train_samples_per_second': 13.891, 'train_steps_per_second': 1.736, 'train_loss': 0.36090590159098307, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 11.6262, 'train_samples_per_second': 12.902, 'train_steps_per_second': 1.806, 'train_loss': 0.15189980325244723, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 11.9255, 'train_samples_per_second': 12.578, 'train_steps_per_second': 1.761, 'train_loss': 0.11846974917820521, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/27 [00:00<?, ?it/s]

{'train_runtime': 13.9926, 'train_samples_per_second': 15.008, 'train_steps_per_second': 1.93, 'train_loss': 0.17241809986255788, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 11.4645, 'train_samples_per_second': 13.084, 'train_steps_per_second': 1.832, 'train_loss': 0.1728483041127523, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 7.9877, 'train_samples_per_second': 15.023, 'train_steps_per_second': 1.878, 'train_loss': 0.29063119888305666, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/27 [00:00<?, ?it/s]

{'train_runtime': 12.6654, 'train_samples_per_second': 16.581, 'train_steps_per_second': 2.132, 'train_loss': 0.1304017702738444, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 21.9864, 'train_samples_per_second': 6.822, 'train_steps_per_second': 0.955, 'train_loss': 0.11470051038832892, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 7.9853, 'train_samples_per_second': 15.028, 'train_steps_per_second': 1.878, 'train_loss': 0.41101274490356443, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 11.0938, 'train_samples_per_second': 13.521, 'train_steps_per_second': 1.893, 'train_loss': 0.4417478016444615, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 13.6863, 'train_samples_per_second': 13.152, 'train_steps_per_second': 1.754, 'train_loss': 0.08373699585596721, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 32.5836, 'train_samples_per_second': 4.604, 'train_steps_per_second': 0.644, 'train_loss': 0.08667571204049247, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 11.0608, 'train_samples_per_second': 13.561, 'train_steps_per_second': 1.899, 'train_loss': 0.10735625312441871, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 10.9789, 'train_samples_per_second': 16.395, 'train_steps_per_second': 2.186, 'train_loss': 0.3532784779866536, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 17.4702, 'train_samples_per_second': 6.869, 'train_steps_per_second': 0.859, 'train_loss': 0.19943900108337403, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 9.7515, 'train_samples_per_second': 12.306, 'train_steps_per_second': 1.538, 'train_loss': 0.19474870363871258, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 7.9571, 'train_samples_per_second': 15.081, 'train_steps_per_second': 1.885, 'train_loss': 0.25966792106628417, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 13.0069, 'train_samples_per_second': 11.532, 'train_steps_per_second': 1.615, 'train_loss': 0.12861536798023043, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 13.7831, 'train_samples_per_second': 13.059, 'train_steps_per_second': 1.741, 'train_loss': 0.10308403770128886, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 12.8895, 'train_samples_per_second': 11.637, 'train_steps_per_second': 1.629, 'train_loss': 0.4119876679920015, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 15.0201, 'train_samples_per_second': 11.984, 'train_steps_per_second': 1.598, 'train_loss': 0.4398082097371419, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 12.9791, 'train_samples_per_second': 11.557, 'train_steps_per_second': 1.618, 'train_loss': 0.19776496433076404, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 12.1681, 'train_samples_per_second': 12.327, 'train_steps_per_second': 1.726, 'train_loss': 0.12074384235200428, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 20.1568, 'train_samples_per_second': 7.442, 'train_steps_per_second': 1.042, 'train_loss': 0.16934952281770252, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 26.3768, 'train_samples_per_second': 5.687, 'train_steps_per_second': 0.796, 'train_loss': 0.10664175805591401, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 12.792, 'train_samples_per_second': 11.726, 'train_steps_per_second': 1.642, 'train_loss': 0.14133747418721518, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 8.8992, 'train_samples_per_second': 13.484, 'train_steps_per_second': 1.686, 'train_loss': 0.29776188532511394, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 10.3497, 'train_samples_per_second': 11.595, 'train_steps_per_second': 1.449, 'train_loss': 0.43493445714314777, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 13.955, 'train_samples_per_second': 10.749, 'train_steps_per_second': 1.505, 'train_loss': 0.1629888103121803, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 10.3966, 'train_samples_per_second': 11.542, 'train_steps_per_second': 1.443, 'train_loss': 0.1807225227355957, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 13.8745, 'train_samples_per_second': 12.973, 'train_steps_per_second': 1.73, 'train_loss': 0.180945614973704, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 14.0162, 'train_samples_per_second': 10.702, 'train_steps_per_second': 1.498, 'train_loss': 0.23485671906244188, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 14.1811, 'train_samples_per_second': 10.577, 'train_steps_per_second': 1.481, 'train_loss': 0.08405454385848272, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/24 [00:00<?, ?it/s]

{'train_runtime': 15.4137, 'train_samples_per_second': 11.678, 'train_steps_per_second': 1.557, 'train_loss': 0.14696069558461508, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/15 [00:00<?, ?it/s]

{'train_runtime': 9.1042, 'train_samples_per_second': 13.181, 'train_steps_per_second': 1.648, 'train_loss': 0.32178007761637367, 'epoch': 3.0}


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 14.3642, 'train_samples_per_second': 10.443, 'train_steps_per_second': 1.462, 'train_loss': 0.1941494033450172, 'epoch': 3.0}


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000019B376846A0>>
Traceback (most recent call last):
  File "c:\Users\Liora\anaconda3\envs\blanc\lib\site-packages\ipykernel\ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [None]:
""" Running the Program (English - Persian)"""

# Score each English -> Persian example: BLANC_tune_translation is called once
# per (sentences, translations) pair with the multilingual BERT checkpoint,
# model and tokenizer, and the returned values are collected in input order.
# zip() yields no length, so tqdm is told the total explicitly.
tune_en_fa_scores = [
    BLANC_tune_translation(
        source_sentences,
        target_translations,
        mbert_checkpoint,
        mbert_model,
        mbert_tokenizer,
        device=DEVICE,
    )
    for source_sentences, target_translations in tqdm(
        zip(en_fa_sentences, en_fa_translations),
        total=len(en_fa_sentences),
    )
]

# Hand the scores to add_results_to_json under their result key.
# NOTE(review): presumably this merges into an existing results JSON file;
# confirm against the helper in the `blanc` module.
en_fa_data = {"BLANC_tune_en_fa_translation": tune_en_fa_scores}
add_results_to_json(en_fa_data)