In [17]:
# %pip install datasets -q
# %pip install transformers -q
# %pip install torch -q
# %pip install seqeval -q
# %pip install evaluate -q
# %pip install accelerate -q

In [18]:
from datasetutils import decode
from iob2converter import iob2_to_dataset
from transformers import AutoModelForTokenClassification

In [19]:
# Load one German IOB2 file with the project helper and read the label
# inventory from the dataset's `ner_tags` feature.
file_path = '../data/TaggedSeparated/German/synopses_02.iob2'

de_ds = iob2_to_dataset(file_path)

# NOTE(review): the `_fr` suffix is misleading -- this feature comes from the
# German dataset loaded just above.
ner_feature_fr = de_ds.features['ner_tags']
# `label_names` is read by later cells (model heads, compute_metrics), so it
# must be defined before they run.
label_names = ner_feature_fr.feature.names
print(label_names)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC', 'B-POK', 'I-POK']


In [20]:
# Sanity check: show the first example's tokens next to their tags.
words = de_ds[0]['tokens']
labels = de_ds[0]['ner_tags']
# `decode` (from datasetutils) returns an iterable of strings; in the recorded
# output these are a token row and a column-aligned tag row.
print('\n'.join(decode(words, labels, label_names)))

Ash   Pikachu und der Rest der Gang sehen ihre größte Herausforderung entgegen als zwei hinterlistige Diebinnen den geheimnisvollsten und gefährlichsten aller Kristalle Herztropfen rauben wollen Fällt er in ihre Hände ist die Zerstörung der Wasserstadt Altomare unvermeidbar Es beginnt ein atemberaubendes Rennen gegen die Zeit bei dem die letzte Hoffnung auf Latios und Latias ruht die als Hüter des Kristalls mit magischen Kräften ausgestattet sind 
B-PER B-POK   O   O   O    O   O    O     O    O      O               O        O   O    O             O         O   O                 O   O              O     O         O           O      O      O     O  O  O    O     O   O   O          O   B-LOC       I-LOC    O            O  O       O   O               O      O     O   O    O   O   O   O      O        O   B-POK  O   B-POK  O    O   O   O     O   O         O   O         O       O            O    


In [21]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer and every model instantiated below.
model_id = 'google-bert/bert-base-multilingual-cased'
# Module-level tokenizer; tokenize_and_align_labels reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [22]:
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and attach sub-token level labels.

    `examples` is a batch mapping with a "tokens" entry (one word list per
    sentence) and a "ner_tags" entry (one label-id list per sentence). Every
    sentence is truncated/padded to exactly 128 positions by the module-level
    `tokenizer`.

    Returns the tokenizer output with an extra "labels" entry holding, per
    sentence, the labels produced by `align_labels_with_tokens`.
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    # word_ids(batch_index=row) maps each position of sentence `row` back to
    # the index of the word it came from (None for positions with no word).
    encoding["labels"] = [
        align_labels_with_tokens(sentence_labels, encoding.word_ids(batch_index=row))
        for row, sentence_labels in enumerate(examples["ner_tags"])
    ]

    return encoding

def align_labels_with_tokens(labels, word_ids):
    """
    Expand word-level label ids to one label per tokenizer position.

    - Positions whose word id is None (special tokens, padding) get `-100`
      so they are ignored during training.
    - The first sub-token of a word keeps the word's label.
    - Later sub-tokens of the same word keep the label too, except that an
      odd label id (B-XXX in this notebook's label scheme) is bumped by one
      to the matching I-XXX id.

    Args:
        labels: label id per word of the sentence.
        word_ids: word index (or None) per tokenizer position.

    Returns:
        List of label ids with the same length as `word_ids`.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # No source word: mask the position and reset the word tracker.
            aligned.append(-100)
            previous_word = None
            continue

        word_label = labels[word_id]
        if word_id == previous_word:
            # Continuation sub-token: B-XXX (odd id) becomes I-XXX (id + 1).
            aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)
        else:
            # First sub-token of a new word.
            previous_word = word_id
            aligned.append(word_label)

    return aligned

In [23]:
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value

# IOB2 label inventory: a tag's integer id is its position in this tuple
# ('O' first, then B-/I- pairs per entity type).
tag_to_id = {
    tag: position
    for position, tag in enumerate((
        'O',
        'B-PER', 'I-PER',
        'B-ORG', 'I-ORG',
        'B-LOC', 'I-LOC',
        'B-MISC', 'I-MISC',
        'B-POK', 'I-POK',
    ))
}
# Reverse lookup: integer id -> tag string.
id_to_tag = {position: tag for tag, position in tag_to_id.items()}

def iob2_to_datasets(file_path, reference_path):
    """
    Build a DatasetDict with one training split and three validation splits.

    The training split is parsed here from `file_path`, an IOB2 file with one
    whitespace-separated `token tag` pair per line and a blank line between
    sentences. The validation splits are loaded with the project helper
    `iob2_to_dataset`, after dropping its "ner_tags_id", "index" and "id"
    columns.

    Args:
        file_path: path of the IOB2 training file.
        reference_path: indexable with (at least) three IOB2 paths, in the
            order German, French, English.

    Returns:
        DatasetDict with the splits "train", "val_de", "val_fr" and "val_en",
        each cast to `tokens` (sequence of strings) and `ner_tags` (sequence
        of ClassLabel over the keys of `tag_to_id`).

    Raises:
        ValueError: a non-blank line does not have exactly two columns.
        KeyError: a tag in the file is not a key of `tag_to_id`.
    """
    sentences, sentence_tags = [], []
    tokens, ner_tags = [], []

    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            line = line.strip()
            if not line:
                # A blank line closes the current sentence (if there is one).
                if tokens:
                    sentences.append(tokens)
                    sentence_tags.append(ner_tags)
                tokens, ner_tags = [], []
                continue

            # Explicit column check instead of a bare `except:` so unrelated
            # errors (e.g. KeyboardInterrupt) are not rewritten as ValueError.
            columns = line.split()
            if len(columns) != 2:
                raise ValueError(f"Each line must have two columns: ({i}) {line}")
            word, tag = columns
            tokens.append(word)
            ner_tags.append(tag)

    # The file may not end with a blank line: keep the last sentence too.
    if tokens:
        sentences.append(tokens)
        sentence_tags.append(ner_tags)

    # Label ids are the positions of the tags in `tag_to_id`'s key order.
    label_list = list(tag_to_id.keys())
    label_mapping = {label: i for i, label in enumerate(label_list)}

    indexed_tags = [[label_mapping[tag] for tag in tags] for tags in sentence_tags]
    dataset = Dataset.from_dict({"tokens": sentences, "ner_tags": indexed_tags})

    features = Features({
        "tokens": Sequence(Value("string")),
        "ner_tags": Sequence(ClassLabel(names=label_list))
    })

    splits = {"train": dataset.cast(features)}
    # reference_path[0..2] are the German, French and English reference files.
    for position, split_name in enumerate(("val_de", "val_fr", "val_en")):
        reference = iob2_to_dataset(reference_path[position]).remove_columns(
            ["ner_tags_id", "index", "id"]
        )
        splits[split_name] = reference.cast(features)

    return DatasetDict(splits)

# Training file plus the three reference files, in the order
# iob2_to_datasets expects: German, French, English.
file_path = '../data/TaggedSeparated/German/synopses_02.iob2'
reference_grp_path = ['../ReferenceText/ReferenceTextGerman.iob2', '../ReferenceText/ReferenceTextFrench.iob2', '../ReferenceText/ReferenceTextEnglish.iob2']

# NOTE: this rebinds `de_ds` -- until here it held the result of
# iob2_to_dataset; from here on it is the DatasetDict with the
# train / val_de / val_fr / val_en splits.
de_ds = iob2_to_datasets(file_path, reference_grp_path)

Casting the dataset:   0%|          | 0/14 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

In [24]:
# Token-classification model with one output per label name.
# NOTE(review): `model` is rebound by a later cell that also passes
# id2label/label2id, so this instance is not the one left in the namespace.
model = AutoModelForTokenClassification.from_pretrained(
    model_id, num_labels=len(label_names)
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Tokenize every split in batches; this adds the tokenizer outputs and the
# aligned "labels" column next to the original tokens/ner_tags columns.
tokenized_ds = de_ds.map(tokenize_and_align_labels, batched=True)

print(tokenized_ds)

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
    val_de: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
    val_fr: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
    val_en: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})


In [26]:
import evaluate

# seqeval metric object; compute_metrics reads it as a module-level global.
metric = evaluate.load("seqeval")

In [27]:
# Rob span-f1 

def toSpans(tags):
    """
    Convert a sequence of IOB2 tag strings into a set of entity spans.

    A span starts at every tag beginning with 'B' and extends over the
    directly following tags beginning with 'I' (the entity type of the I-tags
    is not checked). Each span is encoded as "<beg>-<end>:<type>", where
    `beg` is the index of the B-tag, `end` is the index one past the last tag
    of the span, and `type` is the B-tag with its first two characters
    removed.

    The end index is exclusive in every case. Previously it was exclusive
    when the span was followed by another tag but inclusive when the span ran
    to the end of the sequence, so `['B-X', 'I-X']` and `['B-X', 'O']`
    produced the same span string.

    Args:
        tags: sequence of non-empty tag strings such as 'O', 'B-PER', 'I-PER'.

    Returns:
        Set of span strings; empty when there is no B-tag.
    """
    spans = set()
    total = len(tags)
    for beg in range(total):
        if tags[beg][0] == 'B':
            # Advance `end` past every directly following I-tag.
            end = beg + 1
            while end < total and tags[end][0] == 'I':
                end += 1
            spans.add(str(beg) + '-' + str(end) + ':' + tags[beg][2:])
    return spans


def getInstanceScores(predSpans, goldSpans):
    """
    Exact-match span F1 between a predicted and a gold set of spans.

    A predicted span counts as correct only if an identical element is in
    `goldSpans`. Precision and recall are 0.0 when their denominator is zero,
    and the F1 is 0.0 when both are zero.

    Args:
        predSpans: collection of predicted span identifiers.
        goldSpans: set of gold span identifiers.

    Returns:
        The F1 score as a float in [0.0, 1.0].
    """
    matched = len(goldSpans.intersection(predSpans))
    n_pred = len(predSpans)   # true positives + false positives
    n_gold = len(goldSpans)   # true positives + false negatives

    precision = matched / n_pred if n_pred != 0 else 0.0
    recall = matched / n_gold if n_gold != 0 else 0.0

    if precision + recall == 0.0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


In [28]:
import numpy as np


def compute_metrics(eval_preds):
    """
    Metric callback for the Trainer: seqeval scores plus exact-match span F1.

    Args:
        eval_preds: pair `(logits, labels)`. The predicted label id is the
            argmax over the last axis of `logits`; positions whose label is
            -100 are dropped from both predictions and references.

    Returns:
        Dict with the seqeval overall "precision", "recall", "f1" and
        "accuracy", plus "span_f1" computed with toSpans/getInstanceScores
        over all evaluated sequences. Previously only the first sequence
        contributed to "span_f1"; for a one-sequence evaluation set the value
        is unchanged.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Map ids back to tag strings, skipping the masked (-100) positions.
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    # Pool the spans of every sequence. The sequence index is prefixed so that
    # identical positions in different sequences stay distinct spans.
    pred_spans = {
        f"{seq_idx}|{span}"
        for seq_idx, seq_tags in enumerate(true_predictions)
        for span in toSpans(seq_tags)
    }
    true_spans = {
        f"{seq_idx}|{span}"
        for seq_idx, seq_tags in enumerate(true_labels)
        for span in toSpans(seq_tags)
    }
    score = getInstanceScores(pred_spans, true_spans)

    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
        "span_f1": score
    }

In [29]:
from transformers import AutoModelForTokenClassification
# Rebuild the model, this time storing the id <-> label-name maps in its
# config so predictions can be reported with tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_id,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: index for index, name in enumerate(label_names)},
)
# Display the size of the classification head as the cell output.
model.config.num_labels

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11

In [30]:
from transformers import TrainingArguments
from transformers import Trainer


# Training configuration shared by every Trainer created in this notebook.
# NOTE(review): because the object is shared, all fine-tuning runs use the
# same "mbert-finetuned-ner" location -- confirm that is intended.
args = TrainingArguments(
    "mbert-finetuned-ner",
    evaluation_strategy="epoch",  # evaluate after every epoch
    save_strategy="epoch",        # checkpoint after every epoch
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [31]:
# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=tokenized_ds["train"],
#     eval_dataset=tokenized_ds["validation"],
#     compute_metrics=compute_metrics,
# )

# trainer.train()


In [32]:
import os
import pandas as pd

# Language code -> name of the per-language training folder.
llang = {
    "fr": "French",
    "en": "English",
    "de": "German"
}

# Column order of the results table / results.tsv.
RESULT_COLUMNS = ["train_lang", "train_file", "test_lang", "precision", "recall", "f1", "accuracy", "span_f1"]


def fine_tune_models_in_folder(folder_path, output_dir, train_lang=None):
    """
    Fine-tune one model per training file in `folder_path` and evaluate each
    model on the German, French and English reference sets.

    Args:
        folder_path: directory whose regular files are IOB2 training files.
        output_dir: currently unused; kept so existing calls keep working.
            Checkpoints go to the location configured in the shared `args`.
        train_lang: language code ("fr", "en" or "de") of the training data;
            selects the validation split used during training. When omitted,
            the module-level `lang` is used (the original behaviour).

    Returns:
        List of result rows (dicts keyed by RESULT_COLUMNS): three rows per
        training file, in the test-language order German, French, English.
    """
    code = lang if train_lang is None else train_lang
    metric_names = ("precision", "recall", "f1", "accuracy", "span_f1")
    rows = []

    # os.listdir returns entries in arbitrary order and includes
    # sub-directories; sort for a reproducible row order and skip non-files.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            continue

        data = iob2_to_datasets(file_path, reference_grp_path)
        tokenized_ds = data.map(tokenize_and_align_labels, batched=True)

        # Fresh model per training file, with the same label maps as the
        # stand-alone model cell above.
        model = AutoModelForTokenClassification.from_pretrained(
            model_id,
            num_labels=len(label_names),
            id2label=dict(enumerate(label_names)),
            label2id={label: idx for idx, label in enumerate(label_names)},
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=tokenized_ds["train"],
            eval_dataset=tokenized_ds["val_" + code],
            compute_metrics=compute_metrics,
        )
        trainer.train()

        metrics_by_lang = {
            test_code: trainer.predict(tokenized_ds["val_" + test_code]).metrics
            for test_code in ("de", "fr", "en")
        }
        print(metrics_by_lang["de"])

        for test_code, metrics in metrics_by_lang.items():
            row = {
                "train_lang": llang[code],
                "train_file": file,
                # Test languages are recorded in lower case ("german", ...).
                "test_lang": llang[test_code].lower(),
            }
            # trainer.predict prefixes every metric key with "test_".
            row.update({name: metrics["test_" + name] for name in metric_names})
            rows.append(row)

    return rows


# Collect one row per (training file, test language) and build the dataframe
# from the full list instead of growing it row by row.
result_rows = []
df = pd.DataFrame(columns=RESULT_COLUMNS)
for lang in ["fr", "en", "de"]:
    folder_path = '../data/TaggedSeparated/' + llang[lang]
    output_dir = '../Models/m_' + lang

    result_rows.extend(fine_tune_models_in_folder(folder_path, output_dir, train_lang=lang))

    # Save dataframe as tsv after every language so finished runs are kept
    # even if a later run fails.
    df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
    df.to_csv('results.tsv', sep='\t', index=False)

Casting the dataset:   0%|          | 0/7 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.5537854433059692, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.1166, 'eval_samples_per_second': 8.58, 'eval_steps_per_second': 8.58, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.133070707321167, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 13.698, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.9576916694641113, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.085, 'eval_steps_per_second': 14.085, 'epoch': 3.0}
{'train_runtime': 7.9844, 'train_samples_per_second': 2.63, 'train_steps_per_second': 0.376, 'train_loss': 1.805876096089681, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 1.0778098106384277, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.032, 'test_samples_per_second': 31.249, 'test_steps_per_second': 31.249}


Casting the dataset:   0%|          | 0/14 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.8561270236968994, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7380952380952381, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.3602126836776733, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.077, 'eval_samples_per_second': 12.987, 'eval_steps_per_second': 12.987, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.1449054479599, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 3.0}
{'train_runtime': 11.6967, 'train_samples_per_second': 3.591, 'train_steps_per_second': 0.513, 'train_loss': 1.8085710207621257, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 1.3905038833618164, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8809523809523809, 'test_span_f1': 0.0, 'test_runtime': 0.045, 'test_samples_per_second': 22.223, 'test_steps_per_second': 22.223}


Casting the dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.4979121685028076, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9047619047619048, 'eval_span_f1': 0.0, 'eval_runtime': 0.0668, 'eval_samples_per_second': 14.969, 'eval_steps_per_second': 14.969, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.8177772760391235, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.07, 'eval_samples_per_second': 14.286, 'eval_steps_per_second': 14.286, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5923576951026917, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.085, 'eval_steps_per_second': 14.085, 'epoch': 3.0}
{'train_runtime': 12.9171, 'train_samples_per_second': 4.877, 'train_steps_per_second': 0.697, 'train_loss': 1.4719780815972223, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.7840378284454346, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.092, 'test_samples_per_second': 10.87, 'test_steps_per_second': 10.87}


Casting the dataset:   0%|          | 0/28 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6333045363426208, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 13.889, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4075520634651184, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.07, 'eval_samples_per_second': 14.286, 'eval_steps_per_second': 14.286, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.388173371553421, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.0708, 'eval_samples_per_second': 14.123, 'eval_steps_per_second': 14.123, 'epoch': 3.0}
{'train_runtime': 12.3537, 'train_samples_per_second': 6.8, 'train_steps_per_second': 0.971, 'train_loss': 1.2063296635945637, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.5813032388687134, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.036, 'test_samples_per_second': 27.776, 'test_steps_per_second': 27.776}


Casting the dataset:   0%|          | 0/35 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4819100499153137, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.37893542647361755, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 13.698, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.36163121461868286, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.07, 'eval_samples_per_second': 14.293, 'eval_steps_per_second': 14.293, 'epoch': 3.0}
{'train_runtime': 13.1176, 'train_samples_per_second': 8.005, 'train_steps_per_second': 1.144, 'train_loss': 1.103938674926758, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.5406928062438965, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.0376, 'test_samples_per_second': 26.621, 'test_steps_per_second': 26.621}


Casting the dataset:   0%|          | 0/42 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.42757293581962585, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.088, 'eval_samples_per_second': 11.364, 'eval_steps_per_second': 11.364, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3690783977508545, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.078, 'eval_samples_per_second': 12.821, 'eval_steps_per_second': 12.821, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.35282018780708313, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 3.0}
{'train_runtime': 14.1831, 'train_samples_per_second': 8.884, 'train_steps_per_second': 1.269, 'train_loss': 1.070135646396213, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.550275981426239, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.069, 'test_samples_per_second': 14.493, 'test_steps_per_second': 14.493}


Casting the dataset:   0%|          | 0/49 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.40515413880348206, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.097, 'eval_samples_per_second': 10.309, 'eval_steps_per_second': 10.309, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3561897277832031, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.098, 'eval_samples_per_second': 10.204, 'eval_steps_per_second': 10.204, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.33290985226631165, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 13.889, 'epoch': 3.0}
{'train_runtime': 13.2707, 'train_samples_per_second': 11.077, 'train_steps_per_second': 1.582, 'train_loss': 0.9772645859491258, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.5062464475631714, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.04, 'test_samples_per_second': 25.0, 'test_steps_per_second': 25.0}


Casting the dataset:   0%|          | 0/56 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4076155126094818, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.0716, 'eval_samples_per_second': 13.976, 'eval_steps_per_second': 13.976, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3464987277984619, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.077, 'eval_samples_per_second': 12.992, 'eval_steps_per_second': 12.992, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3227614462375641, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.0735, 'eval_samples_per_second': 13.601, 'eval_steps_per_second': 13.601, 'epoch': 3.0}
{'train_runtime': 14.213, 'train_samples_per_second': 11.82, 'train_steps_per_second': 1.478, 'train_loss': 0.9758189973377046, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.5082519054412842, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.06, 'test_samples_per_second': 16.668, 'test_steps_per_second': 16.668}


Casting the dataset:   0%|          | 0/63 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3927249610424042, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.085, 'eval_steps_per_second': 14.085, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3291000723838806, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.493, 'eval_steps_per_second': 14.493, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.30962076783180237, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.895, 'eval_steps_per_second': 13.895, 'epoch': 3.0}
{'train_runtime': 14.4913, 'train_samples_per_second': 13.042, 'train_steps_per_second': 1.656, 'train_loss': 0.9187169869740804, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.4831274747848511, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.074, 'test_samples_per_second': 13.514, 'test_steps_per_second': 13.514}


Casting the dataset:   0%|          | 0/70 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.39097151160240173, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.0706, 'eval_samples_per_second': 14.158, 'eval_steps_per_second': 14.158, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3165265917778015, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 13.698, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.29638439416885376, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.493, 'eval_steps_per_second': 14.493, 'epoch': 3.0}
{'train_runtime': 15.8312, 'train_samples_per_second': 13.265, 'train_steps_per_second': 1.705, 'train_loss': 0.8893651609067563, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.47765544056892395, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.058, 'test_samples_per_second': 17.241, 'test_steps_per_second': 17.241}


Casting the dataset:   0%|          | 0/77 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3699527978897095, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.888, 'eval_steps_per_second': 13.888, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.30411574244499207, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.0754, 'eval_samples_per_second': 13.268, 'eval_steps_per_second': 13.268, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2810935974121094, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.492, 'eval_steps_per_second': 14.492, 'epoch': 3.0}
{'train_runtime': 14.2069, 'train_samples_per_second': 16.26, 'train_steps_per_second': 2.112, 'train_loss': 0.8500855127970378, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.47355377674102783, 'test_precision': 0.5, 'test_recall': 0.14285714285714285, 'test_f1': 0.22222222222222224, 'test_accuracy': 0.9047619047619048, 'test_span_f1': 0.0, 'test_runtime': 0.035, 'test_samples_per_second': 28.569, 'test_steps_per_second': 28.569}


Casting the dataset:   0%|          | 0/84 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.36231139302253723, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2851531505584717, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2732143700122833, 'eval_precision': 1.0, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.8, 'eval_accuracy': 0.9523809523809523, 'eval_span_f1': 0.6666666666666666, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.493, 'eval_steps_per_second': 14.493, 'epoch': 3.0}
{'train_runtime': 14.8967, 'train_samples_per_second': 16.916, 'train_steps_per_second': 2.215, 'train_loss': 0.809435815522165, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.46700209379196167, 'test_precision': 0.8, 'test_recall': 0.5714285714285714, 'test_f1': 0.6666666666666666, 'test_accuracy': 0.9285714285714286, 'test_span_f1': 0.6, 'test_runtime': 0.034, 'test_samples_per_second': 29.414, 'test_steps_per_second': 29.414}


Casting the dataset:   0%|          | 0/91 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3879513740539551, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.30562177300453186, 'eval_precision': 1.0, 'eval_recall': 0.16666666666666666, 'eval_f1': 0.2857142857142857, 'eval_accuracy': 0.9285714285714286, 'eval_span_f1': 0.2857142857142857, 'eval_runtime': 0.0735, 'eval_samples_per_second': 13.611, 'eval_steps_per_second': 13.611, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2937318682670593, 'eval_precision': 0.75, 'eval_recall': 0.5, 'eval_f1': 0.6, 'eval_accuracy': 0.9444444444444444, 'eval_span_f1': 0.6666666666666666, 'eval_runtime': 0.101, 'eval_samples_per_second': 9.904, 'eval_steps_per_second': 9.904, 'epoch': 3.0}
{'train_runtime': 14.96, 'train_samples_per_second': 18.249, 'train_steps_per_second': 2.406, 'train_loss': 0.8494755956861708, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.4164997339248657, 'test_precision': 0.6, 'test_recall': 0.42857142857142855, 'test_f1': 0.5, 'test_accuracy': 0.9285714285714286, 'test_span_f1': 0.6, 'test_runtime': 0.071, 'test_samples_per_second': 14.084, 'test_steps_per_second': 14.084}


Casting the dataset:   0%|          | 0/98 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3485610783100128, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 13.889, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2772493064403534, 'eval_precision': 0.75, 'eval_recall': 0.5, 'eval_f1': 0.6, 'eval_accuracy': 0.9444444444444444, 'eval_span_f1': 0.6, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.888, 'eval_steps_per_second': 13.888, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2739209532737732, 'eval_precision': 0.6, 'eval_recall': 0.5, 'eval_f1': 0.5454545454545454, 'eval_accuracy': 0.9523809523809523, 'eval_span_f1': 0.8, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.09, 'eval_steps_per_second': 14.09, 'epoch': 3.0}
{'train_runtime': 15.5154, 'train_samples_per_second': 18.949, 'train_steps_per_second': 2.514, 'train_loss': 0.802463873838767, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.3628319501876831, 'test_precision': 0.375, 'test_recall': 0.42857142857142855, 'test_f1': 0.39999999999999997, 'test_accuracy': 0.9523809523809523, 'test_span_f1': 0.7272727272727273, 'test_runtime': 0.0741, 'test_samples_per_second': 13.49, 'test_steps_per_second': 13.49}


Casting the dataset:   0%|          | 0/7 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.8190943002700806, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.6746031746031746, 'eval_span_f1': 0.0, 'eval_runtime': 0.105, 'eval_samples_per_second': 9.524, 'eval_steps_per_second': 9.524, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.3735144138336182, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.873015873015873, 'eval_span_f1': 0.0, 'eval_runtime': 0.081, 'eval_samples_per_second': 12.346, 'eval_steps_per_second': 12.346, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.178802490234375, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8809523809523809, 'eval_span_f1': 0.0, 'eval_runtime': 0.1054, 'eval_samples_per_second': 9.492, 'eval_steps_per_second': 9.492, 'epoch': 3.0}
{'train_runtime': 11.2996, 'train_samples_per_second': 1.858, 'train_steps_per_second': 0.265, 'train_loss': 2.0253167152404785, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 1.488442063331604, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8888888888888888, 'test_span_f1': 0.0, 'test_runtime': 0.102, 'test_samples_per_second': 9.804, 'test_steps_per_second': 9.804}


Casting the dataset:   0%|          | 0/14 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.9233531951904297, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.626984126984127, 'eval_span_f1': 0.0, 'eval_runtime': 0.0717, 'eval_samples_per_second': 13.94, 'eval_steps_per_second': 13.94, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.4326838254928589, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.092, 'eval_samples_per_second': 10.869, 'eval_steps_per_second': 10.869, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.2283732891082764, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.09, 'eval_samples_per_second': 11.111, 'eval_steps_per_second': 11.111, 'epoch': 3.0}
{'train_runtime': 12.4935, 'train_samples_per_second': 3.362, 'train_steps_per_second': 0.48, 'train_loss': 1.9376840591430664, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 1.5168464183807373, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.873015873015873, 'test_span_f1': 0.0, 'test_runtime': 0.041, 'test_samples_per_second': 24.39, 'test_steps_per_second': 24.39}


  _warn_prf(average, modifier, msg_start, len(result))


Casting the dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.0768003463745117, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.074, 'eval_samples_per_second': 13.514, 'eval_steps_per_second': 13.514, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6102369427680969, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.07, 'eval_samples_per_second': 14.286, 'eval_steps_per_second': 14.286, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5538895726203918, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 3.0}
{'train_runtime': 12.0218, 'train_samples_per_second': 5.24, 'train_steps_per_second': 0.749, 'train_loss': 1.44158140818278, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.7239379286766052, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.0371, 'test_samples_per_second': 26.94, 'test_steps_per_second': 26.94}


Casting the dataset:   0%|          | 0/28 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7028910517692566, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.07, 'eval_samples_per_second': 14.285, 'eval_steps_per_second': 14.285, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5115488171577454, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.112, 'eval_samples_per_second': 8.929, 'eval_steps_per_second': 8.929, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5036472678184509, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.085, 'eval_steps_per_second': 14.085, 'epoch': 3.0}
{'train_runtime': 12.6797, 'train_samples_per_second': 6.625, 'train_steps_per_second': 0.946, 'train_loss': 1.2968513170878093, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.7006005644798279, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.041, 'test_samples_per_second': 24.39, 'test_steps_per_second': 24.39}


Casting the dataset:   0%|          | 0/35 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6390478014945984, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 13.698, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5117403864860535, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 13.698, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5045363903045654, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.0716, 'eval_samples_per_second': 13.973, 'eval_steps_per_second': 13.973, 'epoch': 3.0}
{'train_runtime': 13.9163, 'train_samples_per_second': 7.545, 'train_steps_per_second': 1.078, 'train_loss': 1.217734146118164, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.727611243724823, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.036, 'test_samples_per_second': 27.776, 'test_steps_per_second': 27.776}


Casting the dataset:   0%|          | 0/42 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.526608943939209, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.704, 'eval_steps_per_second': 13.704, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5185949206352234, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.707, 'eval_steps_per_second': 14.707, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.47805213928222656, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.893, 'eval_steps_per_second': 13.893, 'epoch': 3.0}
{'train_runtime': 13.3984, 'train_samples_per_second': 9.404, 'train_steps_per_second': 1.343, 'train_loss': 1.1143865585327148, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.7312441468238831, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.04, 'test_samples_per_second': 25.002, 'test_steps_per_second': 25.002}


Casting the dataset:   0%|          | 0/49 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5098874568939209, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.699, 'eval_steps_per_second': 13.699, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4451800584793091, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.0725, 'eval_samples_per_second': 13.8, 'eval_steps_per_second': 13.8, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.36809197068214417, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 13.889, 'epoch': 3.0}
{'train_runtime': 13.9458, 'train_samples_per_second': 10.541, 'train_steps_per_second': 1.506, 'train_loss': 1.013701393490746, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.6036877632141113, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.9126984126984127, 'test_span_f1': 0.0, 'test_runtime': 0.058, 'test_samples_per_second': 17.24, 'test_steps_per_second': 17.24}


Casting the dataset:   0%|          | 0/56 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5052734017372131, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 13.698, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4722527265548706, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.074, 'eval_samples_per_second': 13.513, 'eval_steps_per_second': 13.513, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.41636979579925537, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 3.0}
{'train_runtime': 14.8309, 'train_samples_per_second': 11.328, 'train_steps_per_second': 1.416, 'train_loss': 1.038907187325614, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.6610168814659119, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.039, 'test_samples_per_second': 25.642, 'test_steps_per_second': 25.642}


Casting the dataset:   0%|          | 0/63 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.47486406564712524, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.707, 'eval_steps_per_second': 14.707, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.357087105512619, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.0639, 'eval_samples_per_second': 15.649, 'eval_steps_per_second': 15.649, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.29864567518234253, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9126984126984127, 'eval_span_f1': 0.0, 'eval_runtime': 0.08, 'eval_samples_per_second': 12.499, 'eval_steps_per_second': 12.499, 'epoch': 3.0}
{'train_runtime': 14.385, 'train_samples_per_second': 13.139, 'train_steps_per_second': 1.668, 'train_loss': 0.913559357325236, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.49993255734443665, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.9285714285714286, 'test_span_f1': 0.0, 'test_runtime': 0.053, 'test_samples_per_second': 18.868, 'test_steps_per_second': 18.868}


Casting the dataset:   0%|          | 0/70 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4969440698623657, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.09, 'eval_samples_per_second': 11.111, 'eval_steps_per_second': 11.111, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.388505220413208, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.32214102149009705, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.066, 'eval_samples_per_second': 15.151, 'eval_steps_per_second': 15.151, 'epoch': 3.0}
{'train_runtime': 14.3481, 'train_samples_per_second': 14.636, 'train_steps_per_second': 1.882, 'train_loss': 0.939178325511791, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.4961089789867401, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.9126984126984127, 'test_span_f1': 0.0, 'test_runtime': 0.065, 'test_samples_per_second': 15.384, 'test_steps_per_second': 15.384}


Casting the dataset:   0%|          | 0/77 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5016160011291504, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3353899419307709, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.27049776911735535, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9126984126984127, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 13.889, 'epoch': 3.0}
{'train_runtime': 14.6023, 'train_samples_per_second': 15.819, 'train_steps_per_second': 2.054, 'train_loss': 0.8580835978190104, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.4045904278755188, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.9285714285714286, 'test_span_f1': 0.0, 'test_runtime': 0.039, 'test_samples_per_second': 25.643, 'test_steps_per_second': 25.643}


Casting the dataset:   0%|          | 0/84 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4532005190849304, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.075, 'eval_samples_per_second': 13.333, 'eval_steps_per_second': 13.333, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.25224319100379944, 'eval_precision': 0.3333333333333333, 'eval_recall': 0.1111111111111111, 'eval_f1': 0.16666666666666666, 'eval_accuracy': 0.9285714285714286, 'eval_span_f1': 0.19999999999999998, 'eval_runtime': 0.077, 'eval_samples_per_second': 12.987, 'eval_steps_per_second': 12.987, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2253778725862503, 'eval_precision': 0.625, 'eval_recall': 0.5555555555555556, 'eval_f1': 0.5882352941176471, 'eval_accuracy': 0.9682539682539683, 'eval_span_f1': 0.6666666666666667, 'eval_runtime': 0.0996, 'eval_samples_per_second': 10.042, 'eval_steps_per_second': 10.042, 'epoch': 3.0}
{'train_runtime': 14.5788, 'train_samples_per_second': 17.285, 'train_steps_per_second': 2.264, 'train_loss': 0.7738457304058652, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.3326525390148163, 'test_precision': 0.8, 'test_recall': 0.5714285714285714, 'test_f1': 0.6666666666666666, 'test_accuracy': 0.9603174603174603, 'test_span_f1': 0.6666666666666666, 'test_runtime': 0.0486, 'test_samples_per_second': 20.586, 'test_steps_per_second': 20.586}


Casting the dataset:   0%|          | 0/91 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4149339497089386, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.085, 'eval_steps_per_second': 14.085, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.24060414731502533, 'eval_precision': 0.7142857142857143, 'eval_recall': 0.5555555555555556, 'eval_f1': 0.6250000000000001, 'eval_accuracy': 0.9444444444444444, 'eval_span_f1': 0.7142857142857143, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 13.698, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.21705970168113708, 'eval_precision': 0.7142857142857143, 'eval_recall': 0.5555555555555556, 'eval_f1': 0.6250000000000001, 'eval_accuracy': 0.9523809523809523, 'eval_span_f1': 0.6666666666666667, 'eval_runtime': 0.088, 'eval_samples_per_second': 11.363, 'eval_steps_per_second': 11.363, 'epoch': 3.0}
{'train_runtime': 15.8136, 'train_samples_per_second': 17.264, 'train_steps_per_second': 2.277, 'train_loss': 0.7938958273993598, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.29594314098358154, 'test_precision': 0.6, 'test_recall': 0.42857142857142855, 'test_f1': 0.5, 'test_accuracy': 0.9444444444444444, 'test_span_f1': 0.5454545454545454, 'test_runtime': 0.0628, 'test_samples_per_second': 15.925, 'test_steps_per_second': 15.925}


Casting the dataset:   0%|          | 0/98 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3696480989456177, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.07, 'eval_samples_per_second': 14.286, 'eval_steps_per_second': 14.286, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2303251326084137, 'eval_precision': 0.5714285714285714, 'eval_recall': 0.4444444444444444, 'eval_f1': 0.5, 'eval_accuracy': 0.9523809523809523, 'eval_span_f1': 0.5714285714285714, 'eval_runtime': 0.105, 'eval_samples_per_second': 9.524, 'eval_steps_per_second': 9.524, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.21039900183677673, 'eval_precision': 0.8571428571428571, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.75, 'eval_accuracy': 0.9603174603174603, 'eval_span_f1': 0.7142857142857143, 'eval_runtime': 0.0716, 'eval_samples_per_second': 13.964, 'eval_steps_per_second': 13.964, 'epoch': 3.0}
{'train_runtime': 15.4121, 'train_samples_per_second': 19.076, 'train_steps_per_second': 2.53, 'train_loss': 0.7125750321608323, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.29595550894737244, 'test_precision': 0.8, 'test_recall': 0.5714285714285714, 'test_f1': 0.6666666666666666, 'test_accuracy': 0.9603174603174603, 'test_span_f1': 0.6666666666666666, 'test_runtime': 0.062, 'test_samples_per_second': 16.13, 'test_steps_per_second': 16.13}


Casting the dataset:   0%|          | 0/99 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.396351158618927, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8888888888888888, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2179713398218155, 'eval_precision': 0.7142857142857143, 'eval_recall': 0.5555555555555556, 'eval_f1': 0.6250000000000001, 'eval_accuracy': 0.9603174603174603, 'eval_span_f1': 0.6666666666666667, 'eval_runtime': 0.075, 'eval_samples_per_second': 13.333, 'eval_steps_per_second': 13.333, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.18920187652111053, 'eval_precision': 0.8571428571428571, 'eval_recall': 0.6666666666666666, 'eval_f1': 0.75, 'eval_accuracy': 0.9682539682539683, 'eval_span_f1': 0.75, 'eval_runtime': 0.076, 'eval_samples_per_second': 13.157, 'eval_steps_per_second': 13.157, 'epoch': 3.0}
{'train_runtime': 15.3266, 'train_samples_per_second': 19.378, 'train_steps_per_second': 2.545, 'train_loss': 0.7760507143460788, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.25165021419525146, 'test_precision': 0.8, 'test_recall': 0.5714285714285714, 'test_f1': 0.6666666666666666, 'test_accuracy': 0.9603174603174603, 'test_span_f1': 0.6666666666666666, 'test_runtime': 0.034, 'test_samples_per_second': 29.412, 'test_steps_per_second': 29.412}


Casting the dataset:   0%|          | 0/7 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 2.2177882194519043, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.24603174603174602, 'eval_span_f1': 0.0, 'eval_runtime': 0.093, 'eval_samples_per_second': 10.753, 'eval_steps_per_second': 10.753, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.8995682001113892, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7301587301587301, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 13.889, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.7731462717056274, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8015873015873016, 'eval_span_f1': 0.0, 'eval_runtime': 0.076, 'eval_samples_per_second': 13.158, 'eval_steps_per_second': 13.158, 'epoch': 3.0}
{'train_runtime': 12.8724, 'train_samples_per_second': 1.631, 'train_steps_per_second': 0.233, 'train_loss': 2.0699939727783203, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 1.7731462717056274, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8015873015873016, 'test_span_f1': 0.0, 'test_runtime': 0.056, 'test_samples_per_second': 17.857, 'test_steps_per_second': 17.857}


Casting the dataset:   0%|          | 0/14 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.8000669479370117, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.7857142857142857, 'eval_span_f1': 0.0, 'eval_runtime': 0.089, 'eval_samples_per_second': 11.236, 'eval_steps_per_second': 11.236, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.2179750204086304, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.0703, 'eval_samples_per_second': 14.217, 'eval_steps_per_second': 14.217, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.9897025227546692, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.492, 'eval_steps_per_second': 14.492, 'epoch': 3.0}
{'train_runtime': 12.3567, 'train_samples_per_second': 3.399, 'train_steps_per_second': 0.486, 'train_loss': 1.718239466349284, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.9897025227546692, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.042, 'test_samples_per_second': 23.809, 'test_steps_per_second': 23.809}


Casting the dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.3893665075302124, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 14.084, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.7398988008499146, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.075, 'eval_samples_per_second': 13.334, 'eval_steps_per_second': 13.334, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5931596159934998, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.065, 'eval_samples_per_second': 15.385, 'eval_steps_per_second': 15.385, 'epoch': 3.0}
{'train_runtime': 12.4082, 'train_samples_per_second': 5.077, 'train_steps_per_second': 0.725, 'train_loss': 1.4325509601169162, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.5931596159934998, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.044, 'test_samples_per_second': 22.745, 'test_steps_per_second': 22.745}


Casting the dataset:   0%|          | 0/28 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 1.0590527057647705, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.07, 'eval_samples_per_second': 14.285, 'eval_steps_per_second': 14.285, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5104113221168518, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.073, 'eval_samples_per_second': 13.704, 'eval_steps_per_second': 13.704, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4673275053501129, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.493, 'eval_steps_per_second': 14.493, 'epoch': 3.0}
{'train_runtime': 12.5242, 'train_samples_per_second': 6.707, 'train_steps_per_second': 0.958, 'train_loss': 1.2921557426452637, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.4673275053501129, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.034, 'test_samples_per_second': 29.411, 'test_steps_per_second': 29.411}


Casting the dataset:   0%|          | 0/35 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5907730460166931, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.11, 'eval_samples_per_second': 9.091, 'eval_steps_per_second': 9.091, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.45542481541633606, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.062, 'eval_samples_per_second': 16.129, 'eval_steps_per_second': 16.129, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.446946382522583, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.065, 'eval_samples_per_second': 15.385, 'eval_steps_per_second': 15.385, 'epoch': 3.0}
{'train_runtime': 13.0369, 'train_samples_per_second': 8.054, 'train_steps_per_second': 1.151, 'train_loss': 1.127728017171224, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.446946382522583, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.041, 'test_samples_per_second': 24.39, 'test_steps_per_second': 24.39}


Casting the dataset:   0%|          | 0/42 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.48937103152275085, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.066, 'eval_samples_per_second': 15.152, 'eval_steps_per_second': 15.152, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4382877051830292, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.499, 'eval_steps_per_second': 14.499, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4047227203845978, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.067, 'eval_samples_per_second': 14.925, 'eval_steps_per_second': 14.925, 'epoch': 3.0}
{'train_runtime': 12.9621, 'train_samples_per_second': 9.721, 'train_steps_per_second': 1.389, 'train_loss': 1.0057540469699435, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.4047227203845978, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.037, 'test_samples_per_second': 27.029, 'test_steps_per_second': 27.029}


Casting the dataset:   0%|          | 0/49 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4816591739654541, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.065, 'eval_samples_per_second': 15.385, 'eval_steps_per_second': 15.385, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.45484811067581177, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.069, 'eval_samples_per_second': 14.492, 'eval_steps_per_second': 14.492, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4185329079627991, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.067, 'eval_samples_per_second': 14.925, 'eval_steps_per_second': 14.925, 'epoch': 3.0}
{'train_runtime': 14.0458, 'train_samples_per_second': 10.466, 'train_steps_per_second': 1.495, 'train_loss': 0.9883801596505302, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.4185329079627991, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.044, 'test_samples_per_second': 22.727, 'test_steps_per_second': 22.727}


Casting the dataset:   0%|          | 0/56 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.48018476366996765, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.066, 'eval_samples_per_second': 15.152, 'eval_steps_per_second': 15.152, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.40870776772499084, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3782975375652313, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 3.0}
{'train_runtime': 13.9549, 'train_samples_per_second': 12.039, 'train_steps_per_second': 1.505, 'train_loss': 0.9928538004557291, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.3782975375652313, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.8968253968253969, 'test_span_f1': 0.0, 'test_runtime': 0.073, 'test_samples_per_second': 13.704, 'test_steps_per_second': 13.704}


Casting the dataset:   0%|          | 0/63 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/63 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.44500112533569336, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.064, 'eval_samples_per_second': 15.625, 'eval_steps_per_second': 15.625, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3589496612548828, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.0725, 'eval_samples_per_second': 13.789, 'eval_steps_per_second': 13.789, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.30830785632133484, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9126984126984127, 'eval_span_f1': 0.0, 'eval_runtime': 0.072, 'eval_samples_per_second': 13.889, 'eval_steps_per_second': 13.889, 'epoch': 3.0}
{'train_runtime': 14.9872, 'train_samples_per_second': 12.611, 'train_steps_per_second': 1.601, 'train_loss': 0.844687302907308, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.30830785632133484, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.9126984126984127, 'test_span_f1': 0.0, 'test_runtime': 0.061, 'test_samples_per_second': 16.393, 'test_steps_per_second': 16.393}


Casting the dataset:   0%|          | 0/70 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4521717131137848, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.0701, 'eval_samples_per_second': 14.267, 'eval_steps_per_second': 14.267, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3627975881099701, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.088, 'eval_samples_per_second': 11.363, 'eval_steps_per_second': 11.363, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.3085954189300537, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9206349206349206, 'eval_span_f1': 0.0, 'eval_runtime': 0.075, 'eval_samples_per_second': 13.339, 'eval_steps_per_second': 13.339, 'epoch': 3.0}
{'train_runtime': 14.5335, 'train_samples_per_second': 14.449, 'train_steps_per_second': 1.858, 'train_loss': 0.8643554404929832, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.3085954189300537, 'test_precision': 0.0, 'test_recall': 0.0, 'test_f1': 0.0, 'test_accuracy': 0.9206349206349206, 'test_span_f1': 0.0, 'test_runtime': 0.052, 'test_samples_per_second': 19.23, 'test_steps_per_second': 19.23}


Casting the dataset:   0%|          | 0/77 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.42852988839149475, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.062, 'eval_samples_per_second': 16.129, 'eval_steps_per_second': 16.129, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.318545937538147, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.0675, 'eval_samples_per_second': 14.807, 'eval_steps_per_second': 14.807, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.27634066343307495, 'eval_precision': 0.6, 'eval_recall': 0.42857142857142855, 'eval_f1': 0.5, 'eval_accuracy': 0.9444444444444444, 'eval_span_f1': 0.6, 'eval_runtime': 0.0759, 'eval_samples_per_second': 13.179, 'eval_steps_per_second': 13.179, 'epoch': 3.0}
{'train_runtime': 14.1828, 'train_samples_per_second': 16.287, 'train_steps_per_second': 2.115, 'train_loss': 0.8338624954223632, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.27634066343307495, 'test_precision': 0.6, 'test_recall': 0.42857142857142855, 'test_f1': 0.5, 'test_accuracy': 0.9444444444444444, 'test_span_f1': 0.6, 'test_runtime': 0.035, 'test_samples_per_second': 28.572, 'test_steps_per_second': 28.572}


Casting the dataset:   0%|          | 0/84 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/33 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.39764460921287537, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.0661, 'eval_samples_per_second': 15.137, 'eval_steps_per_second': 15.137, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2651619613170624, 'eval_precision': 0.6, 'eval_recall': 0.42857142857142855, 'eval_f1': 0.5, 'eval_accuracy': 0.9444444444444444, 'eval_span_f1': 0.5454545454545454, 'eval_runtime': 0.113, 'eval_samples_per_second': 8.85, 'eval_steps_per_second': 8.85, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.25078898668289185, 'eval_precision': 0.6, 'eval_recall': 0.42857142857142855, 'eval_f1': 0.5, 'eval_accuracy': 0.9444444444444444, 'eval_span_f1': 0.5454545454545454, 'eval_runtime': 0.071, 'eval_samples_per_second': 14.09, 'eval_steps_per_second': 14.09, 'epoch': 3.0}
{'train_runtime': 15.8731, 'train_samples_per_second': 15.876, 'train_steps_per_second': 2.079, 'train_loss': 0.7420780875466086, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.25078898668289185, 'test_precision': 0.6, 'test_recall': 0.42857142857142855, 'test_f1': 0.5, 'test_accuracy': 0.9444444444444444, 'test_span_f1': 0.5454545454545454, 'test_runtime': 0.076, 'test_samples_per_second': 13.158, 'test_steps_per_second': 13.158}


Casting the dataset:   0%|          | 0/91 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/91 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.40159866213798523, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.068, 'eval_samples_per_second': 14.706, 'eval_steps_per_second': 14.706, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2777915596961975, 'eval_precision': 0.6, 'eval_recall': 0.42857142857142855, 'eval_f1': 0.5, 'eval_accuracy': 0.9365079365079365, 'eval_span_f1': 0.6, 'eval_runtime': 0.08, 'eval_samples_per_second': 12.504, 'eval_steps_per_second': 12.504, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.25894349813461304, 'eval_precision': 0.5, 'eval_recall': 0.42857142857142855, 'eval_f1': 0.4615384615384615, 'eval_accuracy': 0.9523809523809523, 'eval_span_f1': 0.5, 'eval_runtime': 0.1092, 'eval_samples_per_second': 9.159, 'eval_steps_per_second': 9.159, 'epoch': 3.0}
{'train_runtime': 14.9117, 'train_samples_per_second': 18.308, 'train_steps_per_second': 2.414, 'train_loss': 0.7870088683234321, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.25894349813461304, 'test_precision': 0.5, 'test_recall': 0.42857142857142855, 'test_f1': 0.4615384615384615, 'test_accuracy': 0.9523809523809523, 'test_span_f1': 0.5, 'test_runtime': 0.053, 'test_samples_per_second': 18.867, 'test_steps_per_second': 18.867}


Casting the dataset:   0%|          | 0/98 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.36510539054870605, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.8968253968253969, 'eval_span_f1': 0.0, 'eval_runtime': 0.067, 'eval_samples_per_second': 14.927, 'eval_steps_per_second': 14.927, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.24913336336612701, 'eval_precision': 0.8, 'eval_recall': 0.5714285714285714, 'eval_f1': 0.6666666666666666, 'eval_accuracy': 0.9523809523809523, 'eval_span_f1': 0.5454545454545454, 'eval_runtime': 0.1, 'eval_samples_per_second': 10.0, 'eval_steps_per_second': 10.0, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.2420009970664978, 'eval_precision': 0.6, 'eval_recall': 0.42857142857142855, 'eval_f1': 0.5, 'eval_accuracy': 0.9444444444444444, 'eval_span_f1': 0.5454545454545454, 'eval_runtime': 0.119, 'eval_samples_per_second': 8.403, 'eval_steps_per_second': 8.403, 'epoch': 3.0}
{'train_runtime': 16.0773, 'train_samples_per_second': 18.287, 'train_steps_per_second': 2.426, 'train_loss': 0.6905698531713241, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'test_loss': 0.2420009970664978, 'test_precision': 0.6, 'test_recall': 0.42857142857142855, 'test_f1': 0.5, 'test_accuracy': 0.9444444444444444, 'test_span_f1': 0.5454545454545454, 'test_runtime': 0.035, 'test_samples_per_second': 28.566, 'test_steps_per_second': 28.566}


In [18]:
import os
import pandas as pd

# Baseline: score the pretrained encoder with a freshly initialised
# token-classification head on each validation split, without fine-tuning,
# and store the metrics in `results_baseline.tsv`.

result_columns = ["train_lang", "train_file", "test_lang", "precision", "recall", "f1", "accuracy", "span_f1"]
# Metric columns are everything after the three descriptive columns.
metric_names = result_columns[3:]
# Validation split suffix -> language name written to the `test_lang` column.
# Insertion order fixes both the prediction order and the row order in the TSV.
baseline_langs = {"de": "german", "fr": "french", "en": "english"}

data = iob2_to_datasets(file_path, reference_grp_path)

tokenized_ds = data.map(tokenize_and_align_labels, batched=True)

model = AutoModelForTokenClassification.from_pretrained(
    model_id, num_labels=len(label_names)
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    # Fixed split instead of the loop variable `lang` leaked from an earlier
    # cell; `predict` below is always given its dataset explicitly, so this
    # only serves as the Trainer's default evaluation set.
    eval_dataset=tokenized_ds["val_de"],
    compute_metrics=compute_metrics,
)

# Deliberately no `trainer.train()` here: the rows are labelled "baseline",
# i.e. the model is evaluated as loaded.

# `Trainer.predict` reports its metrics under the "test_" key prefix.
baseline_metrics = {
    code: trainer.predict(tokenized_ds["val_" + code]).metrics
    for code in baseline_langs
}

# Keep the per-language names available for any later cell that reads them.
res_de = baseline_metrics["de"]
res_fr = baseline_metrics["fr"]
res_en = baseline_metrics["en"]

# Build all rows first and create the frame once, rather than growing it
# row by row with index shifting.
df = pd.DataFrame(
    [
        ["baseline", "None", language]
        + [baseline_metrics[code]["test_" + metric] for metric in metric_names]
        for code, language in baseline_langs.items()
    ],
    columns=result_columns,
)

# Save dataframe as tsv
df.to_csv('results_baseline.tsv', sep='\t', index=False)

Casting the dataset:   0%|          | 0/14 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[['I-PER', 'I-PER', 'I-POK', 'I-PER', 'I-PER', 'I-POK', 'I-PER', 'I-ORG', 'I-POK', 'I-PER', 'I-ORG', 'I-ORG', 'I-POK', 'I-ORG', 'I-POK', 'I-ORG', 'I-ORG', 'I-MISC', 'I-MISC', 'B-LOC', 'I-ORG', 'I-MISC', 'I-ORG', 'B-LOC', 'I-POK', 'I-ORG', 'I-PER', 'I-ORG', 'B-LOC', 'I-ORG', 'I-MISC', 'B-ORG', 'I-MISC', 'I-ORG', 'I-ORG', 'I-MISC', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-POK', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'B-LOC', 'B-LOC', 'I-ORG', 'I-MISC', 'I-POK', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-PER', 'I-PER', 'I-PER', 'B-LOC', 'I-PER', 'I-ORG', 'I-POK', 'B-ORG', 'I-POK', 'B-LOC', 'I-ORG', 'I-ORG', 'I-ORG', 'B-LOC', 'B-LOC', 'I-ORG', 'I-PER', 'B-LOC', 'I-MISC', 'I-ORG', 'B-LOC', 'I-PER', 'I-PER', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-MISC', 'I-MISC', 'I-ORG', 'I-MISC', 'I-MISC', 'I-PER', 'I-ORG', 'I-ORG', 'I-ORG', 'I-POK', 'B-LOC', 'I-ORG', 'I-ORG', 'I-MISC', 'I-P

  _warn_prf(average, modifier, msg_start, len(result))


[['I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-ORG', 'I-PER', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-PER', 'I-PER', 'B-LOC', 'I-POK', 'I-PER', 'B-LOC', 'B-POK', 'I-POK', 'I-POK', 'I-POK', 'I-PER', 'I-MISC', 'I-PER', 'I-MISC', 'B-LOC', 'I-ORG', 'I-ORG', 'I-POK', 'I-ORG', 'I-ORG', 'I-POK', 'I-ORG', 'I-PER', 'I-ORG', 'I-ORG', 'B-ORG', 'I-MISC', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'B-ORG', 'I-MISC', 'I-ORG', 'I-POK', 'I-MISC', 'I-ORG', 'I-ORG', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-MISC', 'I-ORG', 'I-ORG', 'I-ORG', 'I-PER', 'I-PER', 'I-POK', 'I-POK', 'I-PER', 'I-ORG', 'I-ORG', 'I-POK', 'I-POK', 'B-POK', 'I-POK', 'I-POK', 'I-POK', 'I-PER', 'I-PER', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-POK', 'I-PER', 'I-PER', 'I-PER', 'I-POK', 'I-ORG', 'I-POK', 'I-PER', 'I-POK', 'I-MISC', 'B-POK', 'I-POK', 'I-POK', 'I-PER', 'B-POK', 'I-PER', 

[['I-MISC', 'I-POK', 'I-PER', 'B-LOC', 'I-MISC', 'I-MISC', 'I-PER', 'I-ORG', 'I-MISC', 'I-MISC', 'I-MISC', 'I-POK', 'I-MISC', 'B-ORG', 'O', 'I-MISC', 'O', 'I-MISC', 'I-MISC', 'I-MISC', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'I-MISC', 'B-ORG', 'I-POK', 'I-PER', 'I-MISC', 'I-ORG', 'I-PER', 'I-ORG', 'B-MISC', 'I-ORG', 'I-ORG', 'I-MISC', 'I-ORG', 'I-ORG', 'I-ORG', 'I-MISC', 'I-MISC', 'I-ORG', 'B-LOC', 'I-POK', 'I-MISC', 'I-ORG', 'I-MISC', 'I-MISC', 'I-MISC', 'I-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-MISC', 'I-MISC', 'I-MISC', 'I-ORG', 'I-POK', 'I-PER', 'I-PER', 'I-LOC', 'I-POK', 'I-MISC', 'I-MISC', 'I-MISC', 'I-POK', 'I-PER', 'I-PER', 'I-POK', 'I-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'I-MISC', 'I-POK', 'I-POK', 'I-MISC', 'I-PER', 'I-POK', 'I-LOC', 'I-MISC', 'I-LOC', 'B-PER', 'I-POK', 'I-POK', 'I-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-LOC', 'I-POK', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I