### 3. RuT5

In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict

# CSV files holding the dataset splits; read_splits() loads each of them with pandas.
# The paths are relative, so they are resolved against the current working directory.
TRAIN_FILE = "in_domain_train.csv"
IN_DOMAIN_DEV_FILE = "in_domain_dev.csv"
OUT_OF_DOMAIN_DEV_FILE = "out_of_domain_dev.csv"
TEST_FILE = "test.csv"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_splits(*, as_datasets):
    """Load the train, dev and test splits from their CSV files.

    The in-domain and out-of-domain dev files are concatenated into a single
    dev split so that aggregate metrics are computed over both.

    Args:
        as_datasets: if true, return a ``DatasetDict`` with the keys
            ``train``, ``dev`` and ``test``; otherwise return the three
            pandas DataFrames as a ``(train, dev, test)`` tuple.

    Returns:
        A ``DatasetDict`` or a tuple of DataFrames, depending on ``as_datasets``.
    """
    train_df, in_domain_dev_df, out_of_domain_dev_df, test_df = map(
        pd.read_csv, (TRAIN_FILE, IN_DOMAIN_DEV_FILE, OUT_OF_DOMAIN_DEV_FILE, TEST_FILE)
    )

    # concatenate datasets to get aggregate metrics
    dev_df = pd.concat((in_domain_dev_df, out_of_domain_dev_df))

    if not as_datasets:
        return train_df, dev_df, test_df

    # The concatenated dev frame no longer has a default index, so from_pandas
    # would store it as an extra "__index_level_0__" column in the dev split only.
    # Dropping the index keeps the three splits' columns limited to the CSV columns.
    train, dev, test = (
        Dataset.from_pandas(frame, preserve_index=False) for frame in (train_df, dev_df, test_df)
    )
    return DatasetDict(train=train, dev=dev, test=test)

In [3]:
import os
from argparse import ArgumentParser
from functools import partial
from shutil import rmtree

import numpy as np
from datasets import load_metric
from razdel import tokenize
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    T5Tokenizer,
    T5ForConditionalGeneration,
)

In [4]:
# Metric objects shared by compute_metrics(); they are loaded once here rather than on every evaluation.
# NOTE(review): `load_metric` appears to be deprecated in newer `datasets` releases in favour of the
# separate `evaluate` package — confirm against the pinned `datasets` version before upgrading.
ACCURACY = load_metric("accuracy", keep_in_memory=True, trust_remote_code=True)
MCC = load_metric("matthews_correlation", keep_in_memory=True, trust_remote_code=True)

  ACCURACY = load_metric("accuracy", keep_in_memory=True, trust_remote_code=True)


In [5]:
# Hyperparameter grid explored by main(): one training run per
# (learning rate, weight decay, batch size, seed) combination.
N_SEEDS = 10  # seeds 0 .. N_SEEDS - 1 are used for every grid point
N_EPOCHS = 20  # passed to the trainer as num_train_epochs
LR_VALUES = (1e-4, 1e-3)
DECAY_VALUES = (0, 1e-4)
BATCH_SIZES = (128,)  # used as both the per-device train and eval batch size

# Target strings for the two classes: preprocess_examples() emits them for labels 1 and 0,
# and compute_metrics() maps decoded text back to 1 / 0 by comparing against them.
POS_LABEL = "yes"
NEG_LABEL = "no"

In [6]:
def compute_metrics(p, tokenizer):
    """Compute accuracy and MCC from generated token ids.

    Predictions and labels are decoded to text and mapped to binary classes:
    a prediction counts as positive only if it decodes exactly to ``POS_LABEL``;
    every other decoded string is treated as negative.

    Args:
        p: evaluation output exposing ``predictions`` (generated token ids) and
            ``label_ids`` (target token ids, with -100 marking ignored positions).
        tokenizer: tokenizer used to decode ids back to text.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"mcc"``.

    Raises:
        ValueError: if a decoded label is neither ``POS_LABEL``, ``NEG_LABEL``
            nor the empty string.
    """
    # Depending on the transformers version, generated ids gathered across batches may be
    # padded with -100, which the tokenizer cannot decode; treat them like the labels below.
    predictions = np.where(p.predictions != -100, p.predictions, tokenizer.pad_token_id)
    string_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    int_preds = [1 if prediction == POS_LABEL else 0 for prediction in string_preds]

    # -100 marks ignored label positions; swap it for the pad id so decoding works.
    labels = np.where(p.label_ids != -100, p.label_ids, tokenizer.pad_token_id)
    string_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    int_labels = []

    for string_label in string_labels:
        if string_label == POS_LABEL:
            int_labels.append(1)
        elif string_label == NEG_LABEL or string_label == "":  # second case accounts for test data
            int_labels.append(0)
        else:
            raise ValueError(f"Unexpected decoded label: {string_label!r}")

    acc_result = ACCURACY.compute(predictions=int_preds, references=int_labels)
    mcc_result = MCC.compute(predictions=int_preds, references=int_labels)

    return {"accuracy": acc_result["accuracy"], "mcc": mcc_result["matthews_correlation"]}

In [7]:
def preprocess_examples(examples, tokenizer):
    """Tokenize a batch of sentences and attach target sequences and lengths.

    Args:
        examples: a batch with a ``"sentence"`` column and, for labelled data,
            an ``"acceptable"`` column holding 1 / 0 class labels.
        tokenizer: tokenizer applied to both the sentences and the target strings.

    Returns:
        The tokenized sentences, extended with ``"labels"`` (token ids of the
        target string) and ``"length"`` (number of razdel tokens per sentence).

    Raises:
        ValueError: if a class label is neither 1 nor 0.
    """

    def _target_for(label):
        # Translate a class label into the text the model should generate.
        if label == 1:
            return POS_LABEL
        if label == 0:
            return NEG_LABEL
        raise ValueError("Unknown class label")

    sentences = examples["sentence"]
    encoded = tokenizer(sentences, padding=False)

    if "acceptable" not in examples:
        # a hack to avoid the "You have to specify either decoder_input_ids or decoder_inputs_embeds" error
        # for test data
        targets = ["" for _ in sentences]
    else:
        targets = [_target_for(label) for label in examples["acceptable"]]

    encoded["labels"] = tokenizer(targets, padding=False)["input_ids"]
    # Word-level token count per sentence, stored in the "length" column.
    encoded["length"] = [sum(1 for _ in tokenize(sentence)) for sentence in sentences]
    return encoded

In [8]:
def main():
    """Fine-tune ruT5-base over the hyperparameter grid and store the results.

    For every combination of learning rate, weight decay, batch size and seed a
    fresh pretrained model is fine-tuned, with the best checkpoint (by dev MCC)
    reloaded at the end of training. For each run the function:

    * records dev accuracy and MCC,
    * saves binary test predictions to ``results/<run>_<seed>/preds.npy``,
    * deletes the run's checkpoint directory.

    After all runs, the dev metrics array is saved to
    ``results_agg/sberbank-ai_ruT5-base_dev.npy``.
    """
    model_name = "sberbank-ai/ruT5-base"
    # Filesystem-friendly form of the model name, used as prefix of all output paths.
    run_prefix = model_name.replace("/", "_")

    tokenizer = T5Tokenizer.from_pretrained(model_name)

    splits = read_splits(as_datasets=True)

    tokenized_splits = splits.map(
        partial(preprocess_examples, tokenizer=tokenizer),
        batched=True,
        remove_columns=["sentence"],
    )

    data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8)

    # seed, lr, wd, bs; the last axis holds (accuracy, mcc)
    dev_metrics_per_run = np.empty((N_SEEDS, len(LR_VALUES), len(DECAY_VALUES), len(BATCH_SIZES), 2))

    for i, learning_rate in enumerate(LR_VALUES):
        for j, weight_decay in enumerate(DECAY_VALUES):
            for k, batch_size in enumerate(BATCH_SIZES):
                for seed in range(N_SEEDS):
                    # Start every run from the pretrained weights.
                    model = T5ForConditionalGeneration.from_pretrained(model_name)

                    run_base_dir = f"{run_prefix}_{learning_rate}_{weight_decay}_{batch_size}"
                    checkpoint_dir = f"checkpoints/{run_base_dir}"
                    results_dir = f"results/{run_base_dir}_{seed}"

                    training_args = Seq2SeqTrainingArguments(
                        output_dir=checkpoint_dir,
                        overwrite_output_dir=True,
                        evaluation_strategy="epoch",
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        learning_rate=learning_rate,
                        weight_decay=weight_decay,
                        num_train_epochs=N_EPOCHS,
                        lr_scheduler_type="constant",
                        save_strategy="epoch",
                        save_total_limit=1,
                        seed=seed,
                        fp16=True,
                        dataloader_num_workers=4,
                        group_by_length=True,
                        report_to="none",
                        load_best_model_at_end=True,
                        metric_for_best_model="eval_mcc",
                        optim="adafactor",
                        predict_with_generate=True,
                    )

                    trainer = Seq2SeqTrainer(
                        model=model,
                        args=training_args,
                        train_dataset=tokenized_splits["train"],
                        eval_dataset=tokenized_splits["dev"],
                        compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
                        tokenizer=tokenizer,
                        data_collator=data_collator,
                    )

                    train_result = trainer.train()
                    print(f"{run_base_dir}_{seed}")
                    print("train", train_result.metrics)

                    os.makedirs(results_dir, exist_ok=True)

                    # The "test" prefix only names the metric keys; the data is still the dev split.
                    dev_predictions = trainer.predict(
                        test_dataset=tokenized_splits["dev"], metric_key_prefix="test", max_length=10
                    )
                    print("dev", dev_predictions.metrics)
                    dev_metrics_per_run[seed, i, j, k] = (
                        dev_predictions.metrics["test_accuracy"],
                        dev_predictions.metrics["test_mcc"],
                    )

                    predictions = trainer.predict(test_dataset=tokenized_splits["test"], max_length=10)

                    # Depending on the transformers version, generated ids may be padded with -100,
                    # which cannot be decoded; replace it with the pad id first.
                    test_token_ids = np.where(
                        predictions.predictions != -100, predictions.predictions, tokenizer.pad_token_id
                    )
                    string_preds = tokenizer.batch_decode(test_token_ids, skip_special_tokens=True)

                    int_preds = np.asarray([1 if prediction == POS_LABEL else 0 for prediction in string_preds])

                    np.save(f"{results_dir}/preds.npy", int_preds)

                    # Checkpoints are only needed to reload the best model; drop them to save disk space.
                    rmtree(checkpoint_dir)

    os.makedirs("results_agg", exist_ok=True)
    np.save(f"results_agg/{run_prefix}_dev.npy", dev_metrics_per_run)

In [9]:
# Run the full grid search: N_SEEDS training runs for every hyperparameter combination.
main()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 7869/7869 [00:01<00:00, 4285.94 examples/s]
Map: 100%|██████████| 2787/2787 [00:00<00:00, 4645.01 examples/s]
Map: 100%|██████████| 2789/2789 [00:00<00:00, 4775.61 examples/s]
                                                   
  5%|▌         | 62/1240 [18:45<4:35:56, 14.05s/it]

{'eval_loss': 0.33953073620796204, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 283.9748, 'eval_samples_per_second': 9.814, 'eval_steps_per_second': 0.077, 'epoch': 1.0}


                                                    
 10%|█         | 124/1240 [36:42<3:44:50, 12.09s/it]

{'eval_loss': 0.27848392724990845, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 284.2194, 'eval_samples_per_second': 9.806, 'eval_steps_per_second': 0.077, 'epoch': 2.0}


                                                    
 15%|█▌        | 186/1240 [51:03<2:37:42,  8.98s/it]

{'eval_loss': 0.23775430023670197, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 163.0877, 'eval_samples_per_second': 17.089, 'eval_steps_per_second': 0.135, 'epoch': 3.0}


                                                      
 20%|██        | 248/1240 [1:08:21<3:28:09, 12.59s/it]

{'eval_loss': 0.2592877149581909, 'eval_accuracy': 0.6781485468245425, 'eval_mcc': 0.077372430586719, 'eval_runtime': 261.3128, 'eval_samples_per_second': 10.665, 'eval_steps_per_second': 0.084, 'epoch': 4.0}


                                                      
 25%|██▌       | 310/1240 [1:22:47<2:40:33, 10.36s/it]

{'eval_loss': 0.23017176985740662, 'eval_accuracy': 0.6935773232866882, 'eval_mcc': 0.1823045511116742, 'eval_runtime': 160.3413, 'eval_samples_per_second': 17.382, 'eval_steps_per_second': 0.137, 'epoch': 5.0}


                                                      
 30%|███       | 372/1240 [1:38:01<2:17:26,  9.50s/it]

{'eval_loss': 0.24283894896507263, 'eval_accuracy': 0.7007534983853606, 'eval_mcc': 0.21092701012249818, 'eval_runtime': 278.4972, 'eval_samples_per_second': 10.007, 'eval_steps_per_second': 0.079, 'epoch': 6.0}


                                                      
 35%|███▌      | 434/1240 [1:55:39<2:26:01, 10.87s/it]

{'eval_loss': 0.2266327291727066, 'eval_accuracy': 0.7143882310728382, 'eval_mcc': 0.2653978792358558, 'eval_runtime': 306.0001, 'eval_samples_per_second': 9.108, 'eval_steps_per_second': 0.072, 'epoch': 7.0}


                                                      
 40%|████      | 496/1240 [2:13:22<2:21:50, 11.44s/it]

{'eval_loss': 0.21729573607444763, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.2905940130171505, 'eval_runtime': 287.2042, 'eval_samples_per_second': 9.704, 'eval_steps_per_second': 0.077, 'epoch': 8.0}


 40%|████      | 500/1240 [2:15:51<11:18:42, 55.03s/it] 

{'loss': 0.4372, 'grad_norm': 0.8744450807571411, 'learning_rate': 0.0001, 'epoch': 8.06}


                                                      
 45%|████▌     | 558/1240 [2:30:45<1:57:28, 10.34s/it]

{'eval_loss': 0.2869114577770233, 'eval_accuracy': 0.7154646573376391, 'eval_mcc': 0.2682284236268117, 'eval_runtime': 220.3848, 'eval_samples_per_second': 12.646, 'eval_steps_per_second': 0.1, 'epoch': 9.0}


                                                      
 50%|█████     | 620/1240 [2:47:00<1:50:41, 10.71s/it]

{'eval_loss': 0.39593878388404846, 'eval_accuracy': 0.710800143523502, 'eval_mcc': 0.250765867333914, 'eval_runtime': 263.7413, 'eval_samples_per_second': 10.567, 'eval_steps_per_second': 0.083, 'epoch': 10.0}


                                                      
 55%|█████▌    | 682/1240 [3:05:53<1:43:21, 11.11s/it]

{'eval_loss': 0.4343354403972626, 'eval_accuracy': 0.7014711158952278, 'eval_mcc': 0.21247612529912768, 'eval_runtime': 324.9305, 'eval_samples_per_second': 8.577, 'eval_steps_per_second': 0.068, 'epoch': 11.0}


                                                      
 60%|██████    | 744/1240 [3:23:54<1:47:50, 13.05s/it]

{'eval_loss': 0.3916717767715454, 'eval_accuracy': 0.7136706135629709, 'eval_mcc': 0.26274296022016136, 'eval_runtime': 273.0719, 'eval_samples_per_second': 10.206, 'eval_steps_per_second': 0.081, 'epoch': 12.0}


                                                      
 65%|██████▌   | 806/1240 [3:40:25<1:26:20, 11.94s/it]

{'eval_loss': 0.5404649972915649, 'eval_accuracy': 0.7039827771797632, 'eval_mcc': 0.22282074863805904, 'eval_runtime': 239.1258, 'eval_samples_per_second': 11.655, 'eval_steps_per_second': 0.092, 'epoch': 13.0}


                                                      
 70%|███████   | 868/1240 [3:59:31<1:27:27, 14.11s/it]

{'eval_loss': 0.39426371455192566, 'eval_accuracy': 0.722999641191245, 'eval_mcc': 0.29634776738265517, 'eval_runtime': 319.167, 'eval_samples_per_second': 8.732, 'eval_steps_per_second': 0.069, 'epoch': 14.0}


                                                    
 75%|███████▌  | 930/1240 [4:15:00<52:23, 10.14s/it]

{'eval_loss': 0.46727731823921204, 'eval_accuracy': 0.71259418729817, 'eval_mcc': 0.2602104960962921, 'eval_runtime': 229.9913, 'eval_samples_per_second': 12.118, 'eval_steps_per_second': 0.096, 'epoch': 15.0}


                                                    
 80%|████████  | 992/1240 [4:31:09<50:17, 12.17s/it]

{'eval_loss': 0.5182844400405884, 'eval_accuracy': 0.7172587011123072, 'eval_mcc': 0.2783275460411395, 'eval_runtime': 184.9398, 'eval_samples_per_second': 15.07, 'eval_steps_per_second': 0.119, 'epoch': 16.0}


 81%|████████  | 1000/1240 [4:33:50<1:09:18, 17.33s/it]

{'loss': 0.0977, 'grad_norm': 0.6372029781341553, 'learning_rate': 0.0001, 'epoch': 16.13}


                                                     
 85%|████████▌ | 1054/1240 [4:47:23<33:02, 10.66s/it]

{'eval_loss': 0.5976054668426514, 'eval_accuracy': 0.7072120559741658, 'eval_mcc': 0.23818440962508436, 'eval_runtime': 259.9186, 'eval_samples_per_second': 10.723, 'eval_steps_per_second': 0.085, 'epoch': 17.0}


                                                     
 90%|█████████ | 1116/1240 [5:00:15<16:39,  8.06s/it]

{'eval_loss': 0.7220181822776794, 'eval_accuracy': 0.7172587011123072, 'eval_mcc': 0.27454499432418566, 'eval_runtime': 175.6521, 'eval_samples_per_second': 15.867, 'eval_steps_per_second': 0.125, 'epoch': 18.0}


                                                     
 95%|█████████▌| 1178/1240 [5:18:14<12:19, 11.92s/it]

{'eval_loss': 0.6182755827903748, 'eval_accuracy': 0.7240760674560459, 'eval_mcc': 0.3044546862894332, 'eval_runtime': 247.8681, 'eval_samples_per_second': 11.244, 'eval_steps_per_second': 0.089, 'epoch': 19.0}


                                                     
100%|██████████| 1240/1240 [5:31:17<00:00, 10.03s/it]

{'eval_loss': 0.6568722128868103, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.27462666655117646, 'eval_runtime': 179.2264, 'eval_samples_per_second': 15.55, 'eval_steps_per_second': 0.123, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [5:31:21<00:00, 10.03s/it]

{'train_runtime': 19881.1382, 'train_samples_per_second': 7.916, 'train_steps_per_second': 0.062, 'train_loss': 0.22288832126125213, 'epoch': 20.0}


100%|██████████| 1240/1240 [5:31:21<00:00, 16.03s/it]


sberbank-ai_ruT5-base_0.0001_0_128_0
train {'train_runtime': 19881.1382, 'train_samples_per_second': 7.916, 'train_steps_per_second': 0.062, 'total_flos': 4769434318602240.0, 'train_loss': 0.22288832126125213, 'epoch': 20.0}


100%|██████████| 22/22 [02:36<00:00,  7.11s/it]


dev {'test_loss': 0.6182755827903748, 'test_accuracy': 0.7240760674560459, 'test_mcc': 0.3044546862894332, 'test_runtime': 186.6856, 'test_samples_per_second': 14.929, 'test_steps_per_second': 0.118}


100%|██████████| 22/22 [01:38<00:00,  4.49s/it]

  5%|▌         | 62/1240 [12:34<2:28:27,  7.56s/it]

{'eval_loss': 0.32506468892097473, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 156.0441, 'eval_samples_per_second': 17.86, 'eval_steps_per_second': 0.141, 'epoch': 1.0}



 10%|█         | 124/1240 [26:30<2:29:29,  8.04s/it]

{'eval_loss': 0.28575122356414795, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 223.8509, 'eval_samples_per_second': 12.45, 'eval_steps_per_second': 0.098, 'epoch': 2.0}



 15%|█▌        | 186/1240 [38:25<2:50:55,  9.73s/it]

{'eval_loss': 0.26971006393432617, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 113.7747, 'eval_samples_per_second': 24.496, 'eval_steps_per_second': 0.193, 'epoch': 3.0}



 20%|██        | 248/1240 [50:38<2:21:58,  8.59s/it]

{'eval_loss': 0.256347119808197, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 139.9329, 'eval_samples_per_second': 19.917, 'eval_steps_per_second': 0.157, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:03:31<2:15:55,  8.77s/it]

{'eval_loss': 0.22097192704677582, 'eval_accuracy': 0.6982418371008252, 'eval_mcc': 0.2004732982526735, 'eval_runtime': 219.528, 'eval_samples_per_second': 12.695, 'eval_steps_per_second': 0.1, 'epoch': 5.0}



 30%|███       | 372/1240 [1:15:36<2:11:12,  9.07s/it]

{'eval_loss': 0.22034406661987305, 'eval_accuracy': 0.7014711158952278, 'eval_mcc': 0.221948563007954, 'eval_runtime': 163.4921, 'eval_samples_per_second': 17.047, 'eval_steps_per_second': 0.135, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:27:30<2:02:55,  9.15s/it]

{'eval_loss': 0.2203121781349182, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.2914213068716145, 'eval_runtime': 158.4832, 'eval_samples_per_second': 17.585, 'eval_steps_per_second': 0.139, 'epoch': 7.0}



 40%|████      | 496/1240 [1:39:01<1:46:04,  8.55s/it]

{'eval_loss': 0.26229941844940186, 'eval_accuracy': 0.7244348762109796, 'eval_mcc': 0.2999222684662865, 'eval_runtime': 132.6941, 'eval_samples_per_second': 21.003, 'eval_steps_per_second': 0.166, 'epoch': 8.0}


 40%|████      | 500/1240 [1:40:26<5:51:15, 28.48s/it] 

{'loss': 0.4102, 'grad_norm': 0.8062583804130554, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:50:15<1:29:35,  7.88s/it]

{'eval_loss': 0.3199373185634613, 'eval_accuracy': 0.7147470398277718, 'eval_mcc': 0.2777564769801062, 'eval_runtime': 147.97, 'eval_samples_per_second': 18.835, 'eval_steps_per_second': 0.149, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:01:33<1:29:32,  8.67s/it]

{'eval_loss': 0.28979915380477905, 'eval_accuracy': 0.7244348762109796, 'eval_mcc': 0.29978166275153867, 'eval_runtime': 139.276, 'eval_samples_per_second': 20.011, 'eval_steps_per_second': 0.158, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:12:08<1:01:24,  6.60s/it]

{'eval_loss': 0.32763829827308655, 'eval_accuracy': 0.7294581987800502, 'eval_mcc': 0.3175174161551942, 'eval_runtime': 102.549, 'eval_samples_per_second': 27.177, 'eval_steps_per_second': 0.215, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:24:04<1:05:21,  7.91s/it]

{'eval_loss': 0.357371985912323, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.26964094820571355, 'eval_runtime': 172.8481, 'eval_samples_per_second': 16.124, 'eval_steps_per_second': 0.127, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:35:52<1:06:49,  9.24s/it]

{'eval_loss': 0.4910358190536499, 'eval_accuracy': 0.7115177610333692, 'eval_mcc': 0.253176337785139, 'eval_runtime': 127.1181, 'eval_samples_per_second': 21.924, 'eval_steps_per_second': 0.173, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:47:32<42:49,  6.91s/it]

{'eval_loss': 0.5719196200370789, 'eval_accuracy': 0.7140294223179046, 'eval_mcc': 0.26229991838499406, 'eval_runtime': 152.0612, 'eval_samples_per_second': 18.328, 'eval_steps_per_second': 0.145, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [2:59:30<43:32,  8.43s/it]

{'eval_loss': 0.562828004360199, 'eval_accuracy': 0.7244348762109796, 'eval_mcc': 0.299463586457738, 'eval_runtime': 165.2646, 'eval_samples_per_second': 16.864, 'eval_steps_per_second': 0.133, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:10:43<28:33,  6.91s/it]

{'eval_loss': 0.617846667766571, 'eval_accuracy': 0.7093649085037675, 'eval_mcc': 0.24459751501769386, 'eval_runtime': 122.0621, 'eval_samples_per_second': 22.833, 'eval_steps_per_second': 0.18, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:12:24<47:42, 11.93s/it] 

{'loss': 0.0967, 'grad_norm': 0.37820345163345337, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:22:17<26:35,  8.58s/it]

{'eval_loss': 0.5855653285980225, 'eval_accuracy': 0.7194115536419089, 'eval_mcc': 0.2840003819425707, 'eval_runtime': 158.4194, 'eval_samples_per_second': 17.593, 'eval_steps_per_second': 0.139, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:33:25<14:38,  7.09s/it]

{'eval_loss': 0.6197635531425476, 'eval_accuracy': 0.7176175098672407, 'eval_mcc': 0.27627899039793424, 'eval_runtime': 144.0951, 'eval_samples_per_second': 19.341, 'eval_steps_per_second': 0.153, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:45:15<09:30,  9.21s/it]

{'eval_loss': 0.6740710735321045, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.2707701345709899, 'eval_runtime': 142.7191, 'eval_samples_per_second': 19.528, 'eval_steps_per_second': 0.154, 'epoch': 19.0}



100%|██████████| 1240/1240 [3:56:45<00:00,  7.02s/it]

{'eval_loss': 0.6774826645851135, 'eval_accuracy': 0.7190527448869752, 'eval_mcc': 0.2990029495005887, 'eval_runtime': 150.8219, 'eval_samples_per_second': 18.479, 'eval_steps_per_second': 0.146, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [3:56:48<00:00, 11.46s/it]


{'train_runtime': 14208.2327, 'train_samples_per_second': 11.077, 'train_steps_per_second': 0.087, 'train_loss': 0.2117692808951101, 'epoch': 20.0}
sberbank-ai_ruT5-base_0.0001_0_128_1
train {'train_runtime': 14208.2327, 'train_samples_per_second': 11.077, 'train_steps_per_second': 0.087, 'total_flos': 4771232647372800.0, 'train_loss': 0.2117692808951101, 'epoch': 20.0}


100%|██████████| 22/22 [01:33<00:00,  4.23s/it]


dev {'test_loss': 0.32763829827308655, 'test_accuracy': 0.7294581987800502, 'test_mcc': 0.3175174161551942, 'test_runtime': 122.1272, 'test_samples_per_second': 22.82, 'test_steps_per_second': 0.18}


100%|██████████| 22/22 [01:35<00:00,  4.36s/it]

  5%|▌         | 62/1240 [12:52<2:19:38,  7.11s/it]

{'eval_loss': 0.343330055475235, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 181.3687, 'eval_samples_per_second': 15.366, 'eval_steps_per_second': 0.121, 'epoch': 1.0}



 10%|█         | 124/1240 [25:56<3:00:42,  9.72s/it]

{'eval_loss': 0.3553447723388672, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 186.4163, 'eval_samples_per_second': 14.95, 'eval_steps_per_second': 0.118, 'epoch': 2.0}



 15%|█▌        | 186/1240 [38:03<2:51:53,  9.78s/it]

{'eval_loss': 0.30913954973220825, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 122.6061, 'eval_samples_per_second': 22.731, 'eval_steps_per_second': 0.179, 'epoch': 3.0}



 20%|██        | 248/1240 [49:28<2:22:00,  8.59s/it]

{'eval_loss': 0.23191556334495544, 'eval_accuracy': 0.684248295658414, 'eval_mcc': 0.12256541621633768, 'eval_runtime': 120.0115, 'eval_samples_per_second': 23.223, 'eval_steps_per_second': 0.183, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:02:21<2:28:04,  9.55s/it]

{'eval_loss': 0.22061815857887268, 'eval_accuracy': 0.6942949407965554, 'eval_mcc': 0.17993597112763243, 'eval_runtime': 184.1844, 'eval_samples_per_second': 15.132, 'eval_steps_per_second': 0.119, 'epoch': 5.0}



 30%|███       | 372/1240 [1:13:36<2:03:09,  8.51s/it]

{'eval_loss': 0.2561188042163849, 'eval_accuracy': 0.6935773232866882, 'eval_mcc': 0.18373080989896387, 'eval_runtime': 128.1306, 'eval_samples_per_second': 21.751, 'eval_steps_per_second': 0.172, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:26:15<2:04:28,  9.27s/it]

{'eval_loss': 0.24257692694664001, 'eval_accuracy': 0.7143882310728382, 'eval_mcc': 0.26413560702370326, 'eval_runtime': 176.6024, 'eval_samples_per_second': 15.781, 'eval_steps_per_second': 0.125, 'epoch': 7.0}



 40%|████      | 496/1240 [1:39:55<2:08:52, 10.39s/it]

{'eval_loss': 0.2455579787492752, 'eval_accuracy': 0.7140294223179046, 'eval_mcc': 0.27024786625698877, 'eval_runtime': 156.2861, 'eval_samples_per_second': 17.833, 'eval_steps_per_second': 0.141, 'epoch': 8.0}


 40%|████      | 500/1240 [1:41:30<6:49:06, 33.17s/it] 

{'loss': 0.4007, 'grad_norm': 1.1198604106903076, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:51:45<1:25:19,  7.51s/it]

{'eval_loss': 0.28977158665657043, 'eval_accuracy': 0.7290993900251166, 'eval_mcc': 0.3184056981991716, 'eval_runtime': 159.3721, 'eval_samples_per_second': 17.487, 'eval_steps_per_second': 0.138, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:03:53<1:19:00,  7.65s/it]

{'eval_loss': 0.27423983812332153, 'eval_accuracy': 0.7208467886616433, 'eval_mcc': 0.2878234803222649, 'eval_runtime': 146.1821, 'eval_samples_per_second': 19.065, 'eval_steps_per_second': 0.15, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:16:01<1:09:02,  7.42s/it]

{'eval_loss': 0.31429314613342285, 'eval_accuracy': 0.722999641191245, 'eval_mcc': 0.2947105569307808, 'eval_runtime': 158.4261, 'eval_samples_per_second': 17.592, 'eval_steps_per_second': 0.139, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:27:55<1:00:27,  7.31s/it]

{'eval_loss': 0.43837088346481323, 'eval_accuracy': 0.7233584499461787, 'eval_mcc': 0.2970308221817182, 'eval_runtime': 164.1911, 'eval_samples_per_second': 16.974, 'eval_steps_per_second': 0.134, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:39:54<1:05:57,  9.12s/it]

{'eval_loss': 0.4460804760456085, 'eval_accuracy': 0.7226408324363115, 'eval_mcc': 0.2954996017316337, 'eval_runtime': 153.2571, 'eval_samples_per_second': 18.185, 'eval_steps_per_second': 0.144, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:50:49<55:22,  8.93s/it]

{'eval_loss': 0.5550039410591125, 'eval_accuracy': 0.7168998923573735, 'eval_mcc': 0.2762690553732589, 'eval_runtime': 173.4575, 'eval_samples_per_second': 16.067, 'eval_steps_per_second': 0.127, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:02:54<41:58,  8.12s/it]

{'eval_loss': 0.5405493378639221, 'eval_accuracy': 0.7201291711517761, 'eval_mcc': 0.28567298996968765, 'eval_runtime': 131.5802, 'eval_samples_per_second': 21.181, 'eval_steps_per_second': 0.167, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:14:44<25:14,  6.11s/it]

{'eval_loss': 0.4862962067127228, 'eval_accuracy': 0.7294581987800502, 'eval_mcc': 0.31628282922111184, 'eval_runtime': 160.7187, 'eval_samples_per_second': 17.341, 'eval_steps_per_second': 0.137, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:16:50<54:44, 13.69s/it] 

{'loss': 0.0975, 'grad_norm': 0.36904171109199524, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:26:29<26:09,  8.44s/it]

{'eval_loss': 0.6576849222183228, 'eval_accuracy': 0.721205597416577, 'eval_mcc': 0.28901524367517323, 'eval_runtime': 162.8194, 'eval_samples_per_second': 17.117, 'eval_steps_per_second': 0.135, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:39:11<18:35,  8.99s/it]

{'eval_loss': 0.7119777798652649, 'eval_accuracy': 0.7233584499461787, 'eval_mcc': 0.29572040929333215, 'eval_runtime': 159.4542, 'eval_samples_per_second': 17.478, 'eval_steps_per_second': 0.138, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:51:09<09:44,  9.43s/it]

{'eval_loss': 0.8690889477729797, 'eval_accuracy': 0.7151058485827054, 'eval_mcc': 0.26703005596123897, 'eval_runtime': 113.3651, 'eval_samples_per_second': 24.584, 'eval_steps_per_second': 0.194, 'epoch': 19.0}



100%|██████████| 1240/1240 [4:03:32<00:00,  7.07s/it]

{'eval_loss': 0.6811937093734741, 'eval_accuracy': 0.7273053462504485, 'eval_mcc': 0.3093599073835068, 'eval_runtime': 193.5003, 'eval_samples_per_second': 14.403, 'eval_steps_per_second': 0.114, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [4:03:35<00:00,  7.07s/it]

{'train_runtime': 14615.3773, 'train_samples_per_second': 10.768, 'train_steps_per_second': 0.085, 'train_loss': 0.20863828812876056, 'epoch': 20.0}


100%|██████████| 1240/1240 [4:03:35<00:00, 11.79s/it]


sberbank-ai_ruT5-base_0.0001_0_128_2
train {'train_runtime': 14615.3773, 'train_samples_per_second': 10.768, 'train_steps_per_second': 0.085, 'total_flos': 4777265136476160.0, 'train_loss': 0.20863828812876056, 'epoch': 20.0}


100%|██████████| 22/22 [02:42<00:00,  7.40s/it]


dev {'test_loss': 0.28977158665657043, 'test_accuracy': 0.7290993900251166, 'test_mcc': 0.3184056981991716, 'test_runtime': 193.1443, 'test_samples_per_second': 14.43, 'test_steps_per_second': 0.114}


100%|██████████| 22/22 [02:16<00:00,  6.21s/it]

  5%|▌         | 62/1240 [14:08<2:49:41,  8.64s/it]

{'eval_loss': 0.3242959678173065, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 231.8344, 'eval_samples_per_second': 12.022, 'eval_steps_per_second': 0.095, 'epoch': 1.0}



 10%|█         | 124/1240 [27:40<3:13:26, 10.40s/it]

{'eval_loss': 0.2421770542860031, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 189.1545, 'eval_samples_per_second': 14.734, 'eval_steps_per_second': 0.116, 'epoch': 2.0}



 15%|█▌        | 186/1240 [40:24<2:15:59,  7.74s/it]

{'eval_loss': 0.23346801102161407, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 230.4304, 'eval_samples_per_second': 12.095, 'eval_steps_per_second': 0.095, 'epoch': 3.0}



 20%|██        | 248/1240 [52:45<2:35:41,  9.42s/it]

{'eval_loss': 0.24769705533981323, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 137.2654, 'eval_samples_per_second': 20.304, 'eval_steps_per_second': 0.16, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:05:53<1:48:15,  6.98s/it]

{'eval_loss': 0.27547958493232727, 'eval_accuracy': 0.6849659131682813, 'eval_mcc': 0.13510444123666368, 'eval_runtime': 221.2489, 'eval_samples_per_second': 12.597, 'eval_steps_per_second': 0.099, 'epoch': 5.0}



 30%|███       | 372/1240 [1:19:00<2:17:16,  9.49s/it]

{'eval_loss': 0.25691038370132446, 'eval_accuracy': 0.6960889845712236, 'eval_mcc': 0.19823982648405056, 'eval_runtime': 174.022, 'eval_samples_per_second': 16.015, 'eval_steps_per_second': 0.126, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:30:42<2:01:52,  9.07s/it]

{'eval_loss': 0.2272300273180008, 'eval_accuracy': 0.7118765697883028, 'eval_mcc': 0.2557805786516813, 'eval_runtime': 145.2876, 'eval_samples_per_second': 19.183, 'eval_steps_per_second': 0.151, 'epoch': 7.0}



 40%|████      | 496/1240 [1:42:01<1:48:14,  8.73s/it]

{'eval_loss': 0.2641617953777313, 'eval_accuracy': 0.7133118048080374, 'eval_mcc': 0.2630614455700378, 'eval_runtime': 136.8375, 'eval_samples_per_second': 20.367, 'eval_steps_per_second': 0.161, 'epoch': 8.0}


 40%|████      | 500/1240 [1:43:28<6:01:50, 29.34s/it] 

{'loss': 0.4151, 'grad_norm': 0.6403657793998718, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:52:55<1:09:17,  6.10s/it]

{'eval_loss': 0.29846447706222534, 'eval_accuracy': 0.7140294223179046, 'eval_mcc': 0.2694180781176133, 'eval_runtime': 151.4944, 'eval_samples_per_second': 18.397, 'eval_steps_per_second': 0.145, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:04:01<1:18:03,  7.55s/it]

{'eval_loss': 0.3658027648925781, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.2706263578518465, 'eval_runtime': 139.9417, 'eval_samples_per_second': 19.915, 'eval_steps_per_second': 0.157, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:16:15<1:11:17,  7.67s/it]

{'eval_loss': 0.34464550018310547, 'eval_accuracy': 0.7290993900251166, 'eval_mcc': 0.31717196889845073, 'eval_runtime': 149.8212, 'eval_samples_per_second': 18.602, 'eval_steps_per_second': 0.147, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:27:59<1:07:00,  8.11s/it]

{'eval_loss': 0.32509106397628784, 'eval_accuracy': 0.7265877287405813, 'eval_mcc': 0.32896836276269575, 'eval_runtime': 156.5059, 'eval_samples_per_second': 17.808, 'eval_steps_per_second': 0.141, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:39:49<1:03:41,  8.81s/it]

{'eval_loss': 0.4487420916557312, 'eval_accuracy': 0.7301758162899175, 'eval_mcc': 0.3187271658422486, 'eval_runtime': 172.4418, 'eval_samples_per_second': 16.162, 'eval_steps_per_second': 0.128, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:51:07<42:36,  6.87s/it]

{'eval_loss': 0.5947644710540771, 'eval_accuracy': 0.7104413347685683, 'eval_mcc': 0.24925391441216616, 'eval_runtime': 142.7165, 'eval_samples_per_second': 19.528, 'eval_steps_per_second': 0.154, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:01:37<40:29,  7.84s/it]

{'eval_loss': 0.4367785155773163, 'eval_accuracy': 0.725870111230714, 'eval_mcc': 0.3057011079462158, 'eval_runtime': 132.0063, 'eval_samples_per_second': 21.113, 'eval_steps_per_second': 0.167, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:13:29<36:07,  8.74s/it]

{'eval_loss': 0.7390432953834534, 'eval_accuracy': 0.7133118048080374, 'eval_mcc': 0.2607071568096838, 'eval_runtime': 148.0447, 'eval_samples_per_second': 18.825, 'eval_steps_per_second': 0.149, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:15:29<51:45, 12.94s/it] 

{'loss': 0.0911, 'grad_norm': 0.7487825155258179, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:24:13<19:01,  6.14s/it]

{'eval_loss': 0.6847727298736572, 'eval_accuracy': 0.7147470398277718, 'eval_mcc': 0.2650518527493611, 'eval_runtime': 105.5979, 'eval_samples_per_second': 26.393, 'eval_steps_per_second': 0.208, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:35:50<16:13,  7.85s/it]

{'eval_loss': 0.7630577087402344, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.2857600101720699, 'eval_runtime': 126.2168, 'eval_samples_per_second': 22.081, 'eval_steps_per_second': 0.174, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:47:21<08:07,  7.87s/it]

{'eval_loss': 0.5652984976768494, 'eval_accuracy': 0.7154646573376391, 'eval_mcc': 0.26772399476755987, 'eval_runtime': 203.4569, 'eval_samples_per_second': 13.698, 'eval_steps_per_second': 0.108, 'epoch': 19.0}



100%|██████████| 1240/1240 [3:58:56<00:00,  7.31s/it]

{'eval_loss': 0.635643720626831, 'eval_accuracy': 0.7233584499461787, 'eval_mcc': 0.2978623440738384, 'eval_runtime': 176.4806, 'eval_samples_per_second': 15.792, 'eval_steps_per_second': 0.125, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [3:58:59<00:00,  7.31s/it]

{'train_runtime': 14339.2942, 'train_samples_per_second': 10.975, 'train_steps_per_second': 0.086, 'train_loss': 0.21124693578289402, 'epoch': 20.0}


100%|██████████| 1240/1240 [3:58:59<00:00, 11.56s/it]


sberbank-ai_ruT5-base_0.0001_0_128_3
train {'train_runtime': 14339.2942, 'train_samples_per_second': 10.975, 'train_steps_per_second': 0.086, 'total_flos': 4771870150164480.0, 'train_loss': 0.21124693578289402, 'epoch': 20.0}


100%|██████████| 22/22 [02:43<00:00,  7.44s/it]


dev {'test_loss': 0.32509106397628784, 'test_accuracy': 0.7265877287405813, 'test_mcc': 0.32896836276269575, 'test_runtime': 194.1554, 'test_samples_per_second': 14.354, 'test_steps_per_second': 0.113}


100%|██████████| 22/22 [01:58<00:00,  5.37s/it]

  5%|▌         | 62/1240 [11:42<2:23:57,  7.33s/it]

{'eval_loss': 0.23754224181175232, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 141.9278, 'eval_samples_per_second': 19.637, 'eval_steps_per_second': 0.155, 'epoch': 1.0}



 10%|█         | 124/1240 [24:02<2:41:21,  8.68s/it]

{'eval_loss': 0.2795906364917755, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 163.6345, 'eval_samples_per_second': 17.032, 'eval_steps_per_second': 0.134, 'epoch': 2.0}



 15%|█▌        | 186/1240 [36:03<2:37:09,  8.95s/it]

{'eval_loss': 0.28255823254585266, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 137.436, 'eval_samples_per_second': 20.279, 'eval_steps_per_second': 0.16, 'epoch': 3.0}



 20%|██        | 248/1240 [48:38<2:30:01,  9.07s/it]

{'eval_loss': 0.271607905626297, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 180.0809, 'eval_samples_per_second': 15.476, 'eval_steps_per_second': 0.122, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:02:18<2:35:08, 10.01s/it]

{'eval_loss': 0.26222047209739685, 'eval_accuracy': 0.6820954431288123, 'eval_mcc': 0.1146073652974931, 'eval_runtime': 195.9177, 'eval_samples_per_second': 14.225, 'eval_steps_per_second': 0.112, 'epoch': 5.0}



 30%|███       | 372/1240 [1:15:50<2:11:43,  9.11s/it]

{'eval_loss': 0.26355016231536865, 'eval_accuracy': 0.6885540007176175, 'eval_mcc': 0.16347973892013445, 'eval_runtime': 138.4834, 'eval_samples_per_second': 20.125, 'eval_steps_per_second': 0.159, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:29:35<2:09:15,  9.62s/it]

{'eval_loss': 0.2628893554210663, 'eval_accuracy': 0.6917832795120201, 'eval_mcc': 0.17270619135700643, 'eval_runtime': 160.0325, 'eval_samples_per_second': 17.415, 'eval_steps_per_second': 0.137, 'epoch': 7.0}



 40%|████      | 496/1240 [1:43:09<1:47:35,  8.68s/it]

{'eval_loss': 0.24528126418590546, 'eval_accuracy': 0.7043415859346968, 'eval_mcc': 0.22770608383877772, 'eval_runtime': 141.2682, 'eval_samples_per_second': 19.728, 'eval_steps_per_second': 0.156, 'epoch': 8.0}


 40%|████      | 500/1240 [1:44:36<6:04:57, 29.59s/it] 

{'loss': 0.4158, 'grad_norm': 0.8275555372238159, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:57:59<2:05:04, 11.00s/it]

{'eval_loss': 0.31903693079948425, 'eval_accuracy': 0.6982418371008252, 'eval_mcc': 0.21026919383852052, 'eval_runtime': 252.6437, 'eval_samples_per_second': 11.031, 'eval_steps_per_second': 0.087, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:13:12<1:47:36, 10.41s/it]

{'eval_loss': 0.28012990951538086, 'eval_accuracy': 0.7118765697883028, 'eval_mcc': 0.2550023750589084, 'eval_runtime': 236.7241, 'eval_samples_per_second': 11.773, 'eval_steps_per_second': 0.093, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:28:32<1:26:18,  9.28s/it]

{'eval_loss': 0.3365243077278137, 'eval_accuracy': 0.7068532472192322, 'eval_mcc': 0.23480378203773392, 'eval_runtime': 250.2299, 'eval_samples_per_second': 11.138, 'eval_steps_per_second': 0.088, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:43:15<1:19:18,  9.59s/it]

{'eval_loss': 0.3063610792160034, 'eval_accuracy': 0.7086472909939002, 'eval_mcc': 0.2468324986856367, 'eval_runtime': 205.4556, 'eval_samples_per_second': 13.565, 'eval_steps_per_second': 0.107, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:58:36<1:14:02, 10.24s/it]

{'eval_loss': 0.36839058995246887, 'eval_accuracy': 0.7151058485827054, 'eval_mcc': 0.2693953932356712, 'eval_runtime': 213.3331, 'eval_samples_per_second': 13.064, 'eval_steps_per_second': 0.103, 'epoch': 13.0}



 70%|███████   | 868/1240 [3:13:48<59:44,  9.64s/it]

{'eval_loss': 0.45988407731056213, 'eval_accuracy': 0.7064944384642985, 'eval_mcc': 0.23354066959369477, 'eval_runtime': 250.5692, 'eval_samples_per_second': 11.123, 'eval_steps_per_second': 0.088, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:28:44<45:41,  8.84s/it]

{'eval_loss': 0.41088706254959106, 'eval_accuracy': 0.7111589522784356, 'eval_mcc': 0.2559201720678133, 'eval_runtime': 244.2409, 'eval_samples_per_second': 11.411, 'eval_steps_per_second': 0.09, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:42:41<31:02,  7.51s/it]

{'eval_loss': 0.5359167456626892, 'eval_accuracy': 0.7064944384642985, 'eval_mcc': 0.23696697429032273, 'eval_runtime': 167.0141, 'eval_samples_per_second': 16.687, 'eval_steps_per_second': 0.132, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:44:52<59:39, 14.91s/it] 

{'loss': 0.1086, 'grad_norm': 1.8383554220199585, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:57:48<29:42,  9.58s/it]

{'eval_loss': 0.43077927827835083, 'eval_accuracy': 0.7143882310728382, 'eval_mcc': 0.27855989085581545, 'eval_runtime': 263.0304, 'eval_samples_per_second': 10.596, 'eval_steps_per_second': 0.084, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [4:09:46<18:39,  9.03s/it]

{'eval_loss': 0.5534948110580444, 'eval_accuracy': 0.71259418729817, 'eval_mcc': 0.25867374073926663, 'eval_runtime': 132.9753, 'eval_samples_per_second': 20.959, 'eval_steps_per_second': 0.165, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [4:23:31<10:28, 10.14s/it]

{'eval_loss': 0.6247324347496033, 'eval_accuracy': 0.7050592034445641, 'eval_mcc': 0.23980534656087946, 'eval_runtime': 197.8874, 'eval_samples_per_second': 14.084, 'eval_steps_per_second': 0.111, 'epoch': 19.0}



100%|██████████| 1240/1240 [4:38:05<00:00, 10.25s/it]

{'eval_loss': 0.5992313027381897, 'eval_accuracy': 0.710800143523502, 'eval_mcc': 0.25567985616580136, 'eval_runtime': 235.0205, 'eval_samples_per_second': 11.859, 'eval_steps_per_second': 0.094, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [4:38:08<00:00, 10.25s/it]

{'train_runtime': 16688.9257, 'train_samples_per_second': 9.43, 'train_steps_per_second': 0.074, 'train_loss': 0.22038038469129992, 'epoch': 20.0}


100%|██████████| 1240/1240 [4:38:09<00:00, 13.46s/it]


sberbank-ai_ruT5-base_0.0001_0_128_4
train {'train_runtime': 16688.9257, 'train_samples_per_second': 9.43, 'train_steps_per_second': 0.074, 'total_flos': 4758473076572160.0, 'train_loss': 0.22038038469129992, 'epoch': 20.0}


100%|██████████| 22/22 [03:22<00:00,  9.20s/it]


dev {'test_loss': 0.43077927827835083, 'test_accuracy': 0.7143882310728382, 'test_mcc': 0.27855989085581545, 'test_runtime': 240.8845, 'test_samples_per_second': 11.57, 'test_steps_per_second': 0.091}


100%|██████████| 22/22 [02:10<00:00,  5.94s/it]

  5%|▌         | 62/1240 [19:34<4:35:17, 14.02s/it]

{'eval_loss': 0.2625352144241333, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 255.2046, 'eval_samples_per_second': 10.921, 'eval_steps_per_second': 0.086, 'epoch': 1.0}



 10%|█         | 124/1240 [35:06<4:42:13, 15.17s/it]

{'eval_loss': 0.23903633654117584, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 199.2474, 'eval_samples_per_second': 13.988, 'eval_steps_per_second': 0.11, 'epoch': 2.0}



 15%|█▌        | 186/1240 [52:25<3:22:44, 11.54s/it]

{'eval_loss': 0.30655959248542786, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 220.6819, 'eval_samples_per_second': 12.629, 'eval_steps_per_second': 0.1, 'epoch': 3.0}



 20%|██        | 248/1240 [1:06:43<2:17:52,  8.34s/it]

{'eval_loss': 0.2903214395046234, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 201.2782, 'eval_samples_per_second': 13.847, 'eval_steps_per_second': 0.109, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:22:17<2:50:44, 11.02s/it]

{'eval_loss': 0.22141499817371368, 'eval_accuracy': 0.6946537495514891, 'eval_mcc': 0.18540926533211063, 'eval_runtime': 246.4844, 'eval_samples_per_second': 11.307, 'eval_steps_per_second': 0.089, 'epoch': 5.0}



 30%|███       | 372/1240 [1:36:59<2:36:51, 10.84s/it]

{'eval_loss': 0.24811281263828278, 'eval_accuracy': 0.6878363832077503, 'eval_mcc': 0.15874813519743353, 'eval_runtime': 175.1892, 'eval_samples_per_second': 15.909, 'eval_steps_per_second': 0.126, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:51:46<2:29:05, 11.10s/it]

{'eval_loss': 0.24482044577598572, 'eval_accuracy': 0.710800143523502, 'eval_mcc': 0.2563355197527784, 'eval_runtime': 198.8671, 'eval_samples_per_second': 14.014, 'eval_steps_per_second': 0.111, 'epoch': 7.0}



 40%|████      | 496/1240 [2:07:42<2:26:00, 11.77s/it]

{'eval_loss': 0.2590676248073578, 'eval_accuracy': 0.7133118048080374, 'eval_mcc': 0.2619628465095906, 'eval_runtime': 251.5045, 'eval_samples_per_second': 11.081, 'eval_steps_per_second': 0.087, 'epoch': 8.0}


 40%|████      | 500/1240 [2:09:36<9:30:45, 46.28s/it]  

{'loss': 0.4248, 'grad_norm': 1.5460054874420166, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [2:22:25<1:47:19,  9.44s/it]

{'eval_loss': 0.3030848801136017, 'eval_accuracy': 0.7186939361320416, 'eval_mcc': 0.28756234541207065, 'eval_runtime': 165.8552, 'eval_samples_per_second': 16.804, 'eval_steps_per_second': 0.133, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:37:44<1:48:02, 10.46s/it]

{'eval_loss': 0.30298566818237305, 'eval_accuracy': 0.7104413347685683, 'eval_mcc': 0.2511088021537992, 'eval_runtime': 184.5712, 'eval_samples_per_second': 15.1, 'eval_steps_per_second': 0.119, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:54:06<1:52:20, 12.08s/it]

{'eval_loss': 0.3845653533935547, 'eval_accuracy': 0.7129529960531037, 'eval_mcc': 0.2598622130766447, 'eval_runtime': 239.0256, 'eval_samples_per_second': 11.66, 'eval_steps_per_second': 0.092, 'epoch': 11.0}



 60%|██████    | 744/1240 [3:11:25<1:32:30, 11.19s/it]

{'eval_loss': 0.4094178378582001, 'eval_accuracy': 0.721205597416577, 'eval_mcc': 0.28970900239729214, 'eval_runtime': 209.7389, 'eval_samples_per_second': 13.288, 'eval_steps_per_second': 0.105, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [3:27:30<1:32:14, 12.75s/it]

{'eval_loss': 0.4061751067638397, 'eval_accuracy': 0.7341227125941873, 'eval_mcc': 0.3318244657382178, 'eval_runtime': 224.6536, 'eval_samples_per_second': 12.406, 'eval_steps_per_second': 0.098, 'epoch': 13.0}



 70%|███████   | 868/1240 [3:40:02<52:10,  8.41s/it]

{'eval_loss': 0.4318067729473114, 'eval_accuracy': 0.722999641191245, 'eval_mcc': 0.2944556619565819, 'eval_runtime': 186.8285, 'eval_samples_per_second': 14.917, 'eval_steps_per_second': 0.118, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:52:57<49:23,  9.56s/it]

{'eval_loss': 0.47342804074287415, 'eval_accuracy': 0.7219232149264442, 'eval_mcc': 0.29108953208928395, 'eval_runtime': 184.5613, 'eval_samples_per_second': 15.101, 'eval_steps_per_second': 0.119, 'epoch': 15.0}



 80%|████████  | 992/1240 [4:06:27<33:14,  8.04s/it]

{'eval_loss': 0.6159875392913818, 'eval_accuracy': 0.7147470398277718, 'eval_mcc': 0.2649685081555382, 'eval_runtime': 188.3187, 'eval_samples_per_second': 14.799, 'eval_steps_per_second': 0.117, 'epoch': 16.0}


 81%|████████  | 1000/1240 [4:08:51<1:02:56, 15.74s/it]

{'loss': 0.0919, 'grad_norm': 0.5933664441108704, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [4:18:27<26:51,  8.67s/it]

{'eval_loss': 0.4945184588432312, 'eval_accuracy': 0.7240760674560459, 'eval_mcc': 0.3003348286462944, 'eval_runtime': 149.4963, 'eval_samples_per_second': 18.643, 'eval_steps_per_second': 0.147, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [4:33:23<24:20, 11.78s/it]

{'eval_loss': 0.5423580408096313, 'eval_accuracy': 0.7265877287405813, 'eval_mcc': 0.30671217569898246, 'eval_runtime': 309.7071, 'eval_samples_per_second': 8.999, 'eval_steps_per_second': 0.071, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [4:48:12<10:54, 10.55s/it]

{'eval_loss': 0.6182430386543274, 'eval_accuracy': 0.7276641550053822, 'eval_mcc': 0.3113807064004509, 'eval_runtime': 231.6778, 'eval_samples_per_second': 12.03, 'eval_steps_per_second': 0.095, 'epoch': 19.0}



100%|██████████| 1240/1240 [5:03:20<00:00,  9.86s/it]

{'eval_loss': 0.7349226474761963, 'eval_accuracy': 0.7237172587011123, 'eval_mcc': 0.2969065099880764, 'eval_runtime': 225.5611, 'eval_samples_per_second': 12.356, 'eval_steps_per_second': 0.098, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [5:03:26<00:00, 14.68s/it]


{'train_runtime': 18205.9393, 'train_samples_per_second': 8.644, 'train_steps_per_second': 0.068, 'train_loss': 0.21537752843672228, 'epoch': 20.0}
sberbank-ai_ruT5-base_0.0001_0_128_5
train {'train_runtime': 18205.9393, 'train_samples_per_second': 8.644, 'train_steps_per_second': 0.068, 'total_flos': 4771870150164480.0, 'train_loss': 0.21537752843672228, 'epoch': 20.0}


100%|██████████| 22/22 [02:56<00:00,  8.02s/it]


dev {'test_loss': 0.4061751067638397, 'test_accuracy': 0.7341227125941873, 'test_mcc': 0.3318244657382178, 'test_runtime': 215.0376, 'test_samples_per_second': 12.961, 'test_steps_per_second': 0.102}


100%|██████████| 22/22 [02:21<00:00,  6.41s/it]

  5%|▌         | 62/1240 [19:59<5:23:15, 16.46s/it]

{'eval_loss': 0.29400548338890076, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 239.1032, 'eval_samples_per_second': 11.656, 'eval_steps_per_second': 0.092, 'epoch': 1.0}



 10%|█         | 124/1240 [34:15<3:26:13, 11.09s/it]

{'eval_loss': 0.2926821708679199, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 200.293, 'eval_samples_per_second': 13.915, 'eval_steps_per_second': 0.11, 'epoch': 2.0}



 15%|█▌        | 186/1240 [48:10<2:37:04,  8.94s/it]

{'eval_loss': 0.2299289107322693, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 145.3697, 'eval_samples_per_second': 19.172, 'eval_steps_per_second': 0.151, 'epoch': 3.0}



 20%|██        | 248/1240 [1:01:35<2:02:02,  7.38s/it]

{'eval_loss': 0.2913198471069336, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 200.8629, 'eval_samples_per_second': 13.875, 'eval_steps_per_second': 0.11, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:15:37<2:26:18,  9.44s/it]

{'eval_loss': 0.31810155510902405, 'eval_accuracy': 0.6785073555794762, 'eval_mcc': 0.08208062486045599, 'eval_runtime': 233.451, 'eval_samples_per_second': 11.938, 'eval_steps_per_second': 0.094, 'epoch': 5.0}



 30%|███       | 372/1240 [1:28:13<2:12:49,  9.18s/it]

{'eval_loss': 0.23311179876327515, 'eval_accuracy': 0.6939361320416219, 'eval_mcc': 0.1773938884524333, 'eval_runtime': 183.9138, 'eval_samples_per_second': 15.154, 'eval_steps_per_second': 0.12, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:41:07<1:51:46,  8.32s/it]

{'eval_loss': 0.22291742265224457, 'eval_accuracy': 0.7208467886616433, 'eval_mcc': 0.2882399874986916, 'eval_runtime': 176.6844, 'eval_samples_per_second': 15.774, 'eval_steps_per_second': 0.125, 'epoch': 7.0}



 40%|████      | 496/1240 [1:53:13<1:55:31,  9.32s/it]

{'eval_loss': 0.29362303018569946, 'eval_accuracy': 0.7025475421600287, 'eval_mcc': 0.22953794771638816, 'eval_runtime': 155.8357, 'eval_samples_per_second': 17.884, 'eval_steps_per_second': 0.141, 'epoch': 8.0}


 40%|████      | 500/1240 [1:54:40<6:23:23, 31.09s/it] 

{'loss': 0.389, 'grad_norm': 0.9233601093292236, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [2:05:17<1:40:39,  8.85s/it]

{'eval_loss': 0.26116350293159485, 'eval_accuracy': 0.7273053462504485, 'eval_mcc': 0.31303854161609873, 'eval_runtime': 166.5124, 'eval_samples_per_second': 16.737, 'eval_steps_per_second': 0.132, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:17:30<1:31:03,  8.81s/it]

{'eval_loss': 0.28339987993240356, 'eval_accuracy': 0.7262289199856476, 'eval_mcc': 0.3064156860345076, 'eval_runtime': 172.2279, 'eval_samples_per_second': 16.182, 'eval_steps_per_second': 0.128, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:30:37<1:10:01,  7.53s/it]

{'eval_loss': 0.3959903419017792, 'eval_accuracy': 0.7133118048080374, 'eval_mcc': 0.2597568487856101, 'eval_runtime': 228.3818, 'eval_samples_per_second': 12.203, 'eval_steps_per_second': 0.096, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:42:47<1:12:59,  8.83s/it]

{'eval_loss': 0.435736745595932, 'eval_accuracy': 0.7093649085037675, 'eval_mcc': 0.2467173045358495, 'eval_runtime': 152.8639, 'eval_samples_per_second': 18.232, 'eval_steps_per_second': 0.144, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:53:49<1:03:19,  8.75s/it]

{'eval_loss': 0.5189849734306335, 'eval_accuracy': 0.7050592034445641, 'eval_mcc': 0.22735238894769352, 'eval_runtime': 140.3955, 'eval_samples_per_second': 19.851, 'eval_steps_per_second': 0.157, 'epoch': 13.0}



 70%|███████   | 868/1240 [3:05:01<47:55,  7.73s/it]

{'eval_loss': 0.6053552031517029, 'eval_accuracy': 0.7047003946896304, 'eval_mcc': 0.2258007344153376, 'eval_runtime': 186.969, 'eval_samples_per_second': 14.906, 'eval_steps_per_second': 0.118, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:16:59<45:09,  8.74s/it]

{'eval_loss': 0.561018168926239, 'eval_accuracy': 0.7075708647290994, 'eval_mcc': 0.23770185155268211, 'eval_runtime': 172.1932, 'eval_samples_per_second': 16.185, 'eval_steps_per_second': 0.128, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:28:02<34:28,  8.34s/it]

{'eval_loss': 0.5749154090881348, 'eval_accuracy': 0.7179763186221744, 'eval_mcc': 0.27731764115137925, 'eval_runtime': 141.2199, 'eval_samples_per_second': 19.735, 'eval_steps_per_second': 0.156, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:29:58<52:24, 13.10s/it] 

{'loss': 0.0932, 'grad_norm': 0.6024113893508911, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:39:38<26:49,  8.66s/it]

{'eval_loss': 0.5728225111961365, 'eval_accuracy': 0.7215644061715106, 'eval_mcc': 0.2947399678222593, 'eval_runtime': 145.5043, 'eval_samples_per_second': 19.154, 'eval_steps_per_second': 0.151, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:51:50<14:46,  7.15s/it]

{'eval_loss': 0.5942762494087219, 'eval_accuracy': 0.7122353785432365, 'eval_mcc': 0.25568922543467526, 'eval_runtime': 163.0178, 'eval_samples_per_second': 17.096, 'eval_steps_per_second': 0.135, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [4:04:01<08:56,  8.65s/it]

{'eval_loss': 0.7254924178123474, 'eval_accuracy': 0.7072120559741658, 'eval_mcc': 0.2360642511173604, 'eval_runtime': 208.8614, 'eval_samples_per_second': 13.344, 'eval_steps_per_second': 0.105, 'epoch': 19.0}



100%|██████████| 1240/1240 [4:15:41<00:00,  8.55s/it]

{'eval_loss': 0.6542835831642151, 'eval_accuracy': 0.7136706135629709, 'eval_mcc': 0.26532068214525323, 'eval_runtime': 171.5001, 'eval_samples_per_second': 16.251, 'eval_steps_per_second': 0.128, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [4:15:44<00:00,  8.55s/it]

{'train_runtime': 15344.3923, 'train_samples_per_second': 10.257, 'train_steps_per_second': 0.081, 'train_loss': 0.20170835602668025, 'epoch': 20.0}


100%|██████████| 1240/1240 [4:15:44<00:00, 12.37s/it]


sberbank-ai_ruT5-base_0.0001_0_128_6
train {'train_runtime': 15344.3923, 'train_samples_per_second': 10.257, 'train_steps_per_second': 0.081, 'total_flos': 4770595144581120.0, 'train_loss': 0.20170835602668025, 'epoch': 20.0}


100%|██████████| 22/22 [02:29<00:00,  6.77s/it]


dev {'test_loss': 0.26116350293159485, 'test_accuracy': 0.7273053462504485, 'test_mcc': 0.31303854161609873, 'test_runtime': 180.5106, 'test_samples_per_second': 15.44, 'test_steps_per_second': 0.122}


100%|██████████| 22/22 [01:30<00:00,  4.10s/it]

  5%|▌         | 62/1240 [11:36<2:46:25,  8.48s/it]

{'eval_loss': 0.24171601235866547, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 136.4423, 'eval_samples_per_second': 20.426, 'eval_steps_per_second': 0.161, 'epoch': 1.0}



 10%|█         | 124/1240 [23:43<2:56:23,  9.48s/it]

{'eval_loss': 0.29865044355392456, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 166.2791, 'eval_samples_per_second': 16.761, 'eval_steps_per_second': 0.132, 'epoch': 2.0}



 15%|█▌        | 186/1240 [36:15<2:09:39,  7.38s/it]

{'eval_loss': 0.2999988794326782, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 137.3749, 'eval_samples_per_second': 20.288, 'eval_steps_per_second': 0.16, 'epoch': 3.0}



 20%|██        | 248/1240 [48:25<2:35:22,  9.40s/it]

{'eval_loss': 0.2357209324836731, 'eval_accuracy': 0.6849659131682813, 'eval_mcc': 0.1383299058281172, 'eval_runtime': 173.787, 'eval_samples_per_second': 16.037, 'eval_steps_per_second': 0.127, 'epoch': 4.0}



 25%|██▌       | 310/1240 [59:26<1:39:01,  6.39s/it]

{'eval_loss': 0.22135214507579803, 'eval_accuracy': 0.7025475421600287, 'eval_mcc': 0.21869301591616652, 'eval_runtime': 123.6709, 'eval_samples_per_second': 22.536, 'eval_steps_per_second': 0.178, 'epoch': 5.0}



 30%|███       | 372/1240 [1:11:26<1:45:38,  7.30s/it]

{'eval_loss': 0.2303372025489807, 'eval_accuracy': 0.7082884822389667, 'eval_mcc': 0.24608783447929025, 'eval_runtime': 161.7083, 'eval_samples_per_second': 17.235, 'eval_steps_per_second': 0.136, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:22:05<1:40:40,  7.50s/it]

{'eval_loss': 0.27215689420700073, 'eval_accuracy': 0.7072120559741658, 'eval_mcc': 0.2470601146156424, 'eval_runtime': 128.2621, 'eval_samples_per_second': 21.729, 'eval_steps_per_second': 0.172, 'epoch': 7.0}



 40%|████      | 496/1240 [1:34:24<1:28:25,  7.13s/it]

{'eval_loss': 0.2546052634716034, 'eval_accuracy': 0.7086472909939002, 'eval_mcc': 0.2477282404697853, 'eval_runtime': 169.451, 'eval_samples_per_second': 16.447, 'eval_steps_per_second': 0.13, 'epoch': 8.0}


 40%|████      | 500/1240 [1:35:45<6:21:19, 30.92s/it] 

{'loss': 0.3879, 'grad_norm': 0.8823664784431458, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:47:14<1:42:42,  9.04s/it]

{'eval_loss': 0.26838231086730957, 'eval_accuracy': 0.7093649085037675, 'eval_mcc': 0.24514202036781874, 'eval_runtime': 207.2117, 'eval_samples_per_second': 13.45, 'eval_steps_per_second': 0.106, 'epoch': 9.0}



 50%|█████     | 620/1240 [1:58:40<1:23:03,  8.04s/it]

{'eval_loss': 0.33150455355644226, 'eval_accuracy': 0.7172587011123072, 'eval_mcc': 0.2741386319030288, 'eval_runtime': 151.6199, 'eval_samples_per_second': 18.381, 'eval_steps_per_second': 0.145, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:10:20<1:18:11,  8.41s/it]

{'eval_loss': 0.4805898666381836, 'eval_accuracy': 0.7075708647290994, 'eval_mcc': 0.24314078306994447, 'eval_runtime': 134.3199, 'eval_samples_per_second': 20.749, 'eval_steps_per_second': 0.164, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:22:25<1:08:15,  8.26s/it]

{'eval_loss': 0.35667315125465393, 'eval_accuracy': 0.7179763186221744, 'eval_mcc': 0.2840742243275961, 'eval_runtime': 134.2894, 'eval_samples_per_second': 20.754, 'eval_steps_per_second': 0.164, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:33:55<1:00:25,  8.35s/it]

{'eval_loss': 0.3445371091365814, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.2944607477129878, 'eval_runtime': 141.2262, 'eval_samples_per_second': 19.734, 'eval_steps_per_second': 0.156, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:45:39<43:03,  6.95s/it]

{'eval_loss': 0.5833361148834229, 'eval_accuracy': 0.7122353785432365, 'eval_mcc': 0.25630192059645435, 'eval_runtime': 164.3507, 'eval_samples_per_second': 16.958, 'eval_steps_per_second': 0.134, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [2:57:40<44:01,  8.52s/it]

{'eval_loss': 0.5043953061103821, 'eval_accuracy': 0.718335127377108, 'eval_mcc': 0.2816552618319439, 'eval_runtime': 147.4753, 'eval_samples_per_second': 18.898, 'eval_steps_per_second': 0.149, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:09:51<36:44,  8.89s/it]

{'eval_loss': 0.6884192824363708, 'eval_accuracy': 0.7172587011123072, 'eval_mcc': 0.2741386319030288, 'eval_runtime': 182.2572, 'eval_samples_per_second': 15.292, 'eval_steps_per_second': 0.121, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:11:51<59:04, 14.77s/it] 

{'loss': 0.0944, 'grad_norm': 0.5669350028038025, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:21:40<28:27,  9.18s/it]

{'eval_loss': 0.8283550143241882, 'eval_accuracy': 0.7151058485827054, 'eval_mcc': 0.26760435202164456, 'eval_runtime': 114.7841, 'eval_samples_per_second': 24.28, 'eval_steps_per_second': 0.192, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:33:34<17:57,  8.69s/it]

{'eval_loss': 0.6817898154258728, 'eval_accuracy': 0.7176175098672407, 'eval_mcc': 0.27661585162023206, 'eval_runtime': 168.9541, 'eval_samples_per_second': 16.496, 'eval_steps_per_second': 0.13, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:44:59<06:44,  6.52s/it]

{'eval_loss': 0.6433241963386536, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.2705541612190762, 'eval_runtime': 141.9171, 'eval_samples_per_second': 19.638, 'eval_steps_per_second': 0.155, 'epoch': 19.0}



100%|██████████| 1240/1240 [3:56:18<00:00,  8.06s/it]

{'eval_loss': 0.7012242078781128, 'eval_accuracy': 0.7090060997488339, 'eval_mcc': 0.24577872193463102, 'eval_runtime': 154.5301, 'eval_samples_per_second': 18.035, 'eval_steps_per_second': 0.142, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [3:56:21<00:00,  8.06s/it]

{'train_runtime': 14181.599, 'train_samples_per_second': 11.097, 'train_steps_per_second': 0.087, 'train_loss': 0.2022424659421367, 'epoch': 20.0}


100%|██████████| 1240/1240 [3:56:21<00:00, 11.44s/it]


sberbank-ai_ruT5-base_0.0001_0_128_7
train {'train_runtime': 14181.599, 'train_samples_per_second': 11.097, 'train_steps_per_second': 0.087, 'total_flos': 4750528079093760.0, 'train_loss': 0.2022424659421367, 'epoch': 20.0}


100%|██████████| 22/22 [02:03<00:00,  5.61s/it]


dev {'test_loss': 0.3445371091365814, 'test_accuracy': 0.7204879799067098, 'test_mcc': 0.2944607477129878, 'test_runtime': 153.1021, 'test_samples_per_second': 18.204, 'test_steps_per_second': 0.144}


100%|██████████| 22/22 [02:10<00:00,  5.92s/it]

  5%|▌         | 62/1240 [11:29<2:43:01,  8.30s/it]

{'eval_loss': 0.3374492824077606, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 141.9532, 'eval_samples_per_second': 19.633, 'eval_steps_per_second': 0.155, 'epoch': 1.0}



 10%|█         | 124/1240 [23:43<2:52:03,  9.25s/it]

{'eval_loss': 0.29561251401901245, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 176.0297, 'eval_samples_per_second': 15.833, 'eval_steps_per_second': 0.125, 'epoch': 2.0}



 15%|█▌        | 186/1240 [35:22<2:31:08,  8.60s/it]

{'eval_loss': 0.31289753317832947, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 137.9931, 'eval_samples_per_second': 20.197, 'eval_steps_per_second': 0.159, 'epoch': 3.0}



 20%|██        | 248/1240 [46:46<1:56:27,  7.04s/it]

{'eval_loss': 0.23408369719982147, 'eval_accuracy': 0.6788661643344097, 'eval_mcc': 0.07915290799691639, 'eval_runtime': 156.1026, 'eval_samples_per_second': 17.854, 'eval_steps_per_second': 0.141, 'epoch': 4.0}



 25%|██▌       | 310/1240 [58:35<1:45:51,  6.83s/it]

{'eval_loss': 0.2551083564758301, 'eval_accuracy': 0.6813778256189451, 'eval_mcc': 0.10468939788144112, 'eval_runtime': 141.1493, 'eval_samples_per_second': 19.745, 'eval_steps_per_second': 0.156, 'epoch': 5.0}



 30%|███       | 372/1240 [1:09:58<2:00:20,  8.32s/it]

{'eval_loss': 0.2197677493095398, 'eval_accuracy': 0.7011123071402943, 'eval_mcc': 0.2269401411885519, 'eval_runtime': 134.7812, 'eval_samples_per_second': 20.678, 'eval_steps_per_second': 0.163, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:21:03<1:53:09,  8.42s/it]

{'eval_loss': 0.22698666155338287, 'eval_accuracy': 0.7104413347685683, 'eval_mcc': 0.24929524839853115, 'eval_runtime': 126.0299, 'eval_samples_per_second': 22.114, 'eval_steps_per_second': 0.175, 'epoch': 7.0}



 40%|████      | 496/1240 [1:32:28<1:31:22,  7.37s/it]

{'eval_loss': 0.22214315831661224, 'eval_accuracy': 0.71654108360244, 'eval_mcc': 0.2719902141464821, 'eval_runtime': 143.7208, 'eval_samples_per_second': 19.392, 'eval_steps_per_second': 0.153, 'epoch': 8.0}


 40%|████      | 500/1240 [1:33:49<5:49:16, 28.32s/it] 

{'loss': 0.4038, 'grad_norm': 0.7213183045387268, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:43:59<1:37:26,  8.57s/it]

{'eval_loss': 0.27149027585983276, 'eval_accuracy': 0.7154646573376391, 'eval_mcc': 0.2723186132141847, 'eval_runtime': 142.962, 'eval_samples_per_second': 19.495, 'eval_steps_per_second': 0.154, 'epoch': 9.0}



 50%|█████     | 620/1240 [1:55:43<1:29:35,  8.67s/it]

{'eval_loss': 0.27516576647758484, 'eval_accuracy': 0.7064944384642985, 'eval_mcc': 0.2336987668019746, 'eval_runtime': 142.3065, 'eval_samples_per_second': 19.584, 'eval_steps_per_second': 0.155, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:07:25<1:19:53,  8.59s/it]

{'eval_loss': 0.3413993716239929, 'eval_accuracy': 0.7176175098672407, 'eval_mcc': 0.2772398310017162, 'eval_runtime': 197.8054, 'eval_samples_per_second': 14.09, 'eval_steps_per_second': 0.111, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:18:47<1:08:59,  8.35s/it]

{'eval_loss': 0.4355643093585968, 'eval_accuracy': 0.707929673484033, 'eval_mcc': 0.24548411795745415, 'eval_runtime': 155.2822, 'eval_samples_per_second': 17.948, 'eval_steps_per_second': 0.142, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:30:33<59:10,  8.18s/it]

{'eval_loss': 0.3941795527935028, 'eval_accuracy': 0.7190527448869752, 'eval_mcc': 0.2807726444097136, 'eval_runtime': 152.5133, 'eval_samples_per_second': 18.274, 'eval_steps_per_second': 0.144, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:43:09<45:07,  7.28s/it]

{'eval_loss': 0.3336189389228821, 'eval_accuracy': 0.721205597416577, 'eval_mcc': 0.3083156091757199, 'eval_runtime': 204.7438, 'eval_samples_per_second': 13.612, 'eval_steps_per_second': 0.107, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [2:55:17<43:03,  8.33s/it]

{'eval_loss': 0.46979835629463196, 'eval_accuracy': 0.7215644061715106, 'eval_mcc': 0.2922926060849246, 'eval_runtime': 159.048, 'eval_samples_per_second': 17.523, 'eval_steps_per_second': 0.138, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:07:17<35:17,  8.54s/it]

{'eval_loss': 0.4944417178630829, 'eval_accuracy': 0.700394689630427, 'eval_mcc': 0.20867530199473366, 'eval_runtime': 170.4269, 'eval_samples_per_second': 16.353, 'eval_steps_per_second': 0.129, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:09:08<52:50, 13.21s/it] 

{'loss': 0.1119, 'grad_norm': 0.9110342860221863, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:19:07<27:42,  8.94s/it]

{'eval_loss': 0.49552062153816223, 'eval_accuracy': 0.7118765697883028, 'eval_mcc': 0.25565080278133023, 'eval_runtime': 155.5651, 'eval_samples_per_second': 17.915, 'eval_steps_per_second': 0.141, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:30:11<17:39,  8.54s/it]

{'eval_loss': 0.6220601797103882, 'eval_accuracy': 0.718335127377108, 'eval_mcc': 0.2875488356312943, 'eval_runtime': 155.8524, 'eval_samples_per_second': 17.882, 'eval_steps_per_second': 0.141, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:40:39<06:46,  6.56s/it]

{'eval_loss': 0.6668115258216858, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.27123443071729114, 'eval_runtime': 128.1612, 'eval_samples_per_second': 21.746, 'eval_steps_per_second': 0.172, 'epoch': 19.0}



100%|██████████| 1240/1240 [3:52:53<00:00,  8.74s/it]

{'eval_loss': 0.6996816992759705, 'eval_accuracy': 0.7161822748475063, 'eval_mcc': 0.27272214623112645, 'eval_runtime': 171.7401, 'eval_samples_per_second': 16.228, 'eval_steps_per_second': 0.128, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [3:52:56<00:00,  8.74s/it]

{'train_runtime': 13976.3865, 'train_samples_per_second': 11.26, 'train_steps_per_second': 0.089, 'train_loss': 0.21669967635985343, 'epoch': 20.0}


100%|██████████| 1240/1240 [3:52:56<00:00, 11.27s/it]


sberbank-ai_ruT5-base_0.0001_0_128_8
train {'train_runtime': 13976.3865, 'train_samples_per_second': 11.26, 'train_steps_per_second': 0.089, 'total_flos': 4755399742218240.0, 'train_loss': 0.21669967635985343, 'epoch': 20.0}


100%|██████████| 22/22 [02:18<00:00,  6.30s/it]


dev {'test_loss': 0.3336189389228821, 'test_accuracy': 0.721205597416577, 'test_mcc': 0.3083156091757199, 'test_runtime': 169.4441, 'test_samples_per_second': 16.448, 'test_steps_per_second': 0.13}


100%|██████████| 22/22 [02:06<00:00,  5.77s/it]

  5%|▌         | 62/1240 [12:15<2:49:20,  8.63s/it]

{'eval_loss': 0.2828591763973236, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 140.3426, 'eval_samples_per_second': 19.859, 'eval_steps_per_second': 0.157, 'epoch': 1.0}



 10%|█         | 124/1240 [26:04<2:43:51,  8.81s/it]

{'eval_loss': 0.2601367235183716, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 157.2125, 'eval_samples_per_second': 17.728, 'eval_steps_per_second': 0.14, 'epoch': 2.0}



 15%|█▌        | 186/1240 [38:23<2:50:18,  9.69s/it]

{'eval_loss': 0.2329195886850357, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 148.4313, 'eval_samples_per_second': 18.776, 'eval_steps_per_second': 0.148, 'epoch': 3.0}



 20%|██        | 248/1240 [50:49<2:32:01,  9.20s/it]

{'eval_loss': 0.2249794900417328, 'eval_accuracy': 0.6860423394330821, 'eval_mcc': 0.13359869734537042, 'eval_runtime': 160.2934, 'eval_samples_per_second': 17.387, 'eval_steps_per_second': 0.137, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:02:18<1:45:58,  6.84s/it]

{'eval_loss': 0.2634780704975128, 'eval_accuracy': 0.6903480444922856, 'eval_mcc': 0.1629126665887152, 'eval_runtime': 130.5353, 'eval_samples_per_second': 21.351, 'eval_steps_per_second': 0.169, 'epoch': 5.0}



 30%|███       | 372/1240 [1:14:15<1:45:30,  7.29s/it]

{'eval_loss': 0.2239702343940735, 'eval_accuracy': 0.7021887334050951, 'eval_mcc': 0.22373890602025598, 'eval_runtime': 165.5802, 'eval_samples_per_second': 16.832, 'eval_steps_per_second': 0.133, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:25:52<1:53:23,  8.44s/it]

{'eval_loss': 0.3117675483226776, 'eval_accuracy': 0.6971654108360243, 'eval_mcc': 0.19720358802887808, 'eval_runtime': 132.2677, 'eval_samples_per_second': 21.071, 'eval_steps_per_second': 0.166, 'epoch': 7.0}



 40%|████      | 496/1240 [1:37:21<1:50:22,  8.90s/it]

{'eval_loss': 0.23740211129188538, 'eval_accuracy': 0.7122353785432365, 'eval_mcc': 0.25719865457511365, 'eval_runtime': 135.5385, 'eval_samples_per_second': 20.562, 'eval_steps_per_second': 0.162, 'epoch': 8.0}


 40%|████      | 500/1240 [1:38:55<6:11:01, 30.08s/it] 

{'loss': 0.3956, 'grad_norm': 0.790911078453064, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:49:10<1:36:23,  8.48s/it]

{'eval_loss': 0.3484959304332733, 'eval_accuracy': 0.7147470398277718, 'eval_mcc': 0.26821244485394236, 'eval_runtime': 142.2474, 'eval_samples_per_second': 19.593, 'eval_steps_per_second': 0.155, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:00:41<1:12:09,  6.98s/it]

{'eval_loss': 0.38907286524772644, 'eval_accuracy': 0.7136706135629709, 'eval_mcc': 0.2626611681935355, 'eval_runtime': 130.7191, 'eval_samples_per_second': 21.321, 'eval_steps_per_second': 0.168, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:12:35<1:25:28,  9.19s/it]

{'eval_loss': 0.4100783169269562, 'eval_accuracy': 0.7136706135629709, 'eval_mcc': 0.2625069409589123, 'eval_runtime': 152.563, 'eval_samples_per_second': 18.268, 'eval_steps_per_second': 0.144, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:24:01<1:10:56,  8.58s/it]

{'eval_loss': 0.3930780291557312, 'eval_accuracy': 0.7215644061715106, 'eval_mcc': 0.29084980563802826, 'eval_runtime': 135.6517, 'eval_samples_per_second': 20.545, 'eval_steps_per_second': 0.162, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:35:39<1:01:50,  8.55s/it]

{'eval_loss': 0.43499889969825745, 'eval_accuracy': 0.7262289199856476, 'eval_mcc': 0.3060621549513278, 'eval_runtime': 138.4435, 'eval_samples_per_second': 20.131, 'eval_steps_per_second': 0.159, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:46:54<52:09,  8.41s/it]

{'eval_loss': 0.45473194122314453, 'eval_accuracy': 0.7208467886616433, 'eval_mcc': 0.2911613728328669, 'eval_runtime': 117.1023, 'eval_samples_per_second': 23.8, 'eval_steps_per_second': 0.188, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [2:58:52<39:01,  7.55s/it]

{'eval_loss': 0.6545436978340149, 'eval_accuracy': 0.71259418729817, 'eval_mcc': 0.2577116791101455, 'eval_runtime': 148.7133, 'eval_samples_per_second': 18.741, 'eval_steps_per_second': 0.148, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:11:13<36:41,  8.88s/it]

{'eval_loss': 0.4879131317138672, 'eval_accuracy': 0.728740581270183, 'eval_mcc': 0.3144549979280177, 'eval_runtime': 173.1343, 'eval_samples_per_second': 16.097, 'eval_steps_per_second': 0.127, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:13:17<56:07, 14.03s/it] 

{'loss': 0.0881, 'grad_norm': 1.4694468975067139, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:24:36<28:31,  9.20s/it]

{'eval_loss': 0.48635169863700867, 'eval_accuracy': 0.7237172587011123, 'eval_mcc': 0.2983263275789229, 'eval_runtime': 204.8313, 'eval_samples_per_second': 13.606, 'eval_steps_per_second': 0.107, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:36:43<18:26,  8.92s/it]

{'eval_loss': 0.714713990688324, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.28568930272073, 'eval_runtime': 162.3012, 'eval_samples_per_second': 17.172, 'eval_steps_per_second': 0.136, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:48:18<08:50,  8.55s/it]

{'eval_loss': 0.5414236187934875, 'eval_accuracy': 0.728740581270183, 'eval_mcc': 0.320954703580785, 'eval_runtime': 157.2482, 'eval_samples_per_second': 17.724, 'eval_steps_per_second': 0.14, 'epoch': 19.0}



100%|██████████| 1240/1240 [3:59:47<00:00,  7.07s/it]

{'eval_loss': 0.5651960968971252, 'eval_accuracy': 0.7237172587011123, 'eval_mcc': 0.30110324401518074, 'eval_runtime': 133.3922, 'eval_samples_per_second': 20.893, 'eval_steps_per_second': 0.165, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [3:59:51<00:00, 11.61s/it]


{'train_runtime': 14390.9457, 'train_samples_per_second': 10.936, 'train_steps_per_second': 0.086, 'train_loss': 0.20215569080845003, 'epoch': 20.0}
sberbank-ai_ruT5-base_0.0001_0_128_9
train {'train_runtime': 14390.9457, 'train_samples_per_second': 10.936, 'train_steps_per_second': 0.086, 'total_flos': 4780338470830080.0, 'train_loss': 0.20215569080845003, 'epoch': 20.0}


100%|██████████| 22/22 [01:38<00:00,  4.49s/it]


dev {'test_loss': 0.5414236187934875, 'test_accuracy': 0.728740581270183, 'test_mcc': 0.320954703580785, 'test_runtime': 127.5274, 'test_samples_per_second': 21.854, 'test_steps_per_second': 0.173}


100%|██████████| 22/22 [01:35<00:00,  4.35s/it]

  5%|▌         | 62/1240 [13:04<2:51:46,  8.75s/it]

{'eval_loss': 0.33953073620796204, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 188.5771, 'eval_samples_per_second': 14.779, 'eval_steps_per_second': 0.117, 'epoch': 1.0}



 10%|█         | 124/1240 [25:51<3:04:14,  9.91s/it]

{'eval_loss': 0.27848392724990845, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 160.2884, 'eval_samples_per_second': 17.387, 'eval_steps_per_second': 0.137, 'epoch': 2.0}



 15%|█▌        | 186/1240 [38:33<2:15:43,  7.73s/it]

{'eval_loss': 0.23775430023670197, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 164.9326, 'eval_samples_per_second': 16.898, 'eval_steps_per_second': 0.133, 'epoch': 3.0}



 20%|██        | 248/1240 [51:21<2:24:15,  8.73s/it]

{'eval_loss': 0.2592877149581909, 'eval_accuracy': 0.6781485468245425, 'eval_mcc': 0.077372430586719, 'eval_runtime': 167.6692, 'eval_samples_per_second': 16.622, 'eval_steps_per_second': 0.131, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:03:07<1:49:01,  7.03s/it]

{'eval_loss': 0.23017176985740662, 'eval_accuracy': 0.6935773232866882, 'eval_mcc': 0.1823045511116742, 'eval_runtime': 131.9117, 'eval_samples_per_second': 21.128, 'eval_steps_per_second': 0.167, 'epoch': 5.0}



 30%|███       | 372/1240 [1:14:59<2:04:43,  8.62s/it]

{'eval_loss': 0.24283894896507263, 'eval_accuracy': 0.7007534983853606, 'eval_mcc': 0.21092701012249818, 'eval_runtime': 124.1675, 'eval_samples_per_second': 22.445, 'eval_steps_per_second': 0.177, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:26:50<1:44:09,  7.75s/it]

{'eval_loss': 0.2266327291727066, 'eval_accuracy': 0.7143882310728382, 'eval_mcc': 0.2653978792358558, 'eval_runtime': 161.1095, 'eval_samples_per_second': 17.299, 'eval_steps_per_second': 0.137, 'epoch': 7.0}



 40%|████      | 496/1240 [1:39:36<1:52:57,  9.11s/it]

{'eval_loss': 0.21729573607444763, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.2905940130171505, 'eval_runtime': 181.1134, 'eval_samples_per_second': 15.388, 'eval_steps_per_second': 0.121, 'epoch': 8.0}


 40%|████      | 500/1240 [1:41:13<7:19:45, 35.66s/it] 

{'loss': 0.4372, 'grad_norm': 0.8744450807571411, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:51:44<1:13:50,  6.50s/it]

{'eval_loss': 0.2869114577770233, 'eval_accuracy': 0.7154646573376391, 'eval_mcc': 0.2682284236268117, 'eval_runtime': 177.0494, 'eval_samples_per_second': 15.741, 'eval_steps_per_second': 0.124, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:03:23<1:29:18,  8.64s/it]

{'eval_loss': 0.39593878388404846, 'eval_accuracy': 0.710800143523502, 'eval_mcc': 0.250765867333914, 'eval_runtime': 133.8592, 'eval_samples_per_second': 20.82, 'eval_steps_per_second': 0.164, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:14:56<1:00:31,  6.51s/it]

{'eval_loss': 0.4343354403972626, 'eval_accuracy': 0.7014711158952278, 'eval_mcc': 0.21247612529912768, 'eval_runtime': 168.5553, 'eval_samples_per_second': 16.535, 'eval_steps_per_second': 0.131, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:27:41<1:11:02,  8.59s/it]

{'eval_loss': 0.3916717767715454, 'eval_accuracy': 0.7136706135629709, 'eval_mcc': 0.26274296022016136, 'eval_runtime': 191.6767, 'eval_samples_per_second': 14.54, 'eval_steps_per_second': 0.115, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:39:02<1:01:09,  8.45s/it]

{'eval_loss': 0.5404649972915649, 'eval_accuracy': 0.7039827771797632, 'eval_mcc': 0.22282074863805904, 'eval_runtime': 157.7521, 'eval_samples_per_second': 17.667, 'eval_steps_per_second': 0.139, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:50:46<54:17,  8.76s/it]

{'eval_loss': 0.39426371455192566, 'eval_accuracy': 0.722999641191245, 'eval_mcc': 0.29634776738265517, 'eval_runtime': 146.4134, 'eval_samples_per_second': 19.035, 'eval_steps_per_second': 0.15, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:02:14<33:55,  6.57s/it]

{'eval_loss': 0.46727731823921204, 'eval_accuracy': 0.71259418729817, 'eval_mcc': 0.2602104960962921, 'eval_runtime': 136.648, 'eval_samples_per_second': 20.395, 'eval_steps_per_second': 0.161, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:15:03<36:37,  8.86s/it]

{'eval_loss': 0.5182844400405884, 'eval_accuracy': 0.7172587011123072, 'eval_mcc': 0.2783275460411395, 'eval_runtime': 182.7409, 'eval_samples_per_second': 15.251, 'eval_steps_per_second': 0.12, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:17:00<54:40, 13.67s/it] 

{'loss': 0.0977, 'grad_norm': 0.6372029781341553, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:26:56<21:43,  7.01s/it]

{'eval_loss': 0.5976054668426514, 'eval_accuracy': 0.7072120559741658, 'eval_mcc': 0.23818440962508436, 'eval_runtime': 179.9757, 'eval_samples_per_second': 15.485, 'eval_steps_per_second': 0.122, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:38:42<13:04,  6.33s/it]

{'eval_loss': 0.7220181822776794, 'eval_accuracy': 0.7172587011123072, 'eval_mcc': 0.27454499432418566, 'eval_runtime': 154.8495, 'eval_samples_per_second': 17.998, 'eval_steps_per_second': 0.142, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:51:24<08:57,  8.67s/it]

{'eval_loss': 0.6182755827903748, 'eval_accuracy': 0.7240760674560459, 'eval_mcc': 0.3044546862894332, 'eval_runtime': 197.0115, 'eval_samples_per_second': 14.146, 'eval_steps_per_second': 0.112, 'epoch': 19.0}



100%|██████████| 1240/1240 [4:02:51<00:00,  8.78s/it]

{'eval_loss': 0.6568722128868103, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.27462666655117646, 'eval_runtime': 130.1553, 'eval_samples_per_second': 21.413, 'eval_steps_per_second': 0.169, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [4:02:54<00:00,  8.78s/it]

{'train_runtime': 14574.9026, 'train_samples_per_second': 10.798, 'train_steps_per_second': 0.085, 'train_loss': 0.22288832126125213, 'epoch': 20.0}


100%|██████████| 1240/1240 [4:02:55<00:00, 11.75s/it]


sberbank-ai_ruT5-base_0.0001_0.0001_128_0
train {'train_runtime': 14574.9026, 'train_samples_per_second': 10.798, 'train_steps_per_second': 0.085, 'total_flos': 4769434318602240.0, 'train_loss': 0.22288832126125213, 'epoch': 20.0}


100%|██████████| 22/22 [01:33<00:00,  4.25s/it]


dev {'test_loss': 0.6182755827903748, 'test_accuracy': 0.7240760674560459, 'test_mcc': 0.3044546862894332, 'test_runtime': 123.6552, 'test_samples_per_second': 22.538, 'test_steps_per_second': 0.178}


100%|██████████| 22/22 [01:37<00:00,  4.44s/it]

  5%|▌         | 62/1240 [11:07<2:32:28,  7.77s/it]

{'eval_loss': 0.32506468892097473, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 120.757, 'eval_samples_per_second': 23.079, 'eval_steps_per_second': 0.182, 'epoch': 1.0}



 10%|█         | 124/1240 [23:23<2:17:08,  7.37s/it]

{'eval_loss': 0.28575122356414795, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 161.9503, 'eval_samples_per_second': 17.209, 'eval_steps_per_second': 0.136, 'epoch': 2.0}



 15%|█▌        | 186/1240 [37:02<2:47:57,  9.56s/it]

{'eval_loss': 0.26971006393432617, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 190.1274, 'eval_samples_per_second': 14.659, 'eval_steps_per_second': 0.116, 'epoch': 3.0}



 20%|██        | 248/1240 [49:09<2:05:30,  7.59s/it]

{'eval_loss': 0.256347119808197, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 160.1849, 'eval_samples_per_second': 17.399, 'eval_steps_per_second': 0.137, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:02:47<2:18:19,  8.92s/it]

{'eval_loss': 0.22097192704677582, 'eval_accuracy': 0.6982418371008252, 'eval_mcc': 0.2004732982526735, 'eval_runtime': 248.5666, 'eval_samples_per_second': 11.212, 'eval_steps_per_second': 0.089, 'epoch': 5.0}



 30%|███       | 372/1240 [1:15:23<2:12:36,  9.17s/it]

{'eval_loss': 0.22034406661987305, 'eval_accuracy': 0.7014711158952278, 'eval_mcc': 0.221948563007954, 'eval_runtime': 155.6405, 'eval_samples_per_second': 17.907, 'eval_steps_per_second': 0.141, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:27:20<2:03:23,  9.19s/it]

{'eval_loss': 0.2203121781349182, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.2914213068716145, 'eval_runtime': 157.7141, 'eval_samples_per_second': 17.671, 'eval_steps_per_second': 0.139, 'epoch': 7.0}



 40%|████      | 496/1240 [1:39:54<1:49:17,  8.81s/it]

{'eval_loss': 0.26229941844940186, 'eval_accuracy': 0.7244348762109796, 'eval_mcc': 0.2999222684662865, 'eval_runtime': 136.2891, 'eval_samples_per_second': 20.449, 'eval_steps_per_second': 0.161, 'epoch': 8.0}


 40%|████      | 500/1240 [1:41:27<6:08:00, 29.84s/it] 

{'loss': 0.4102, 'grad_norm': 0.8062583804130554, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:51:26<1:37:22,  8.57s/it]

{'eval_loss': 0.3199373185634613, 'eval_accuracy': 0.7147470398277718, 'eval_mcc': 0.2777564769801062, 'eval_runtime': 120.8999, 'eval_samples_per_second': 23.052, 'eval_steps_per_second': 0.182, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:03:41<1:29:44,  8.68s/it]

{'eval_loss': 0.28979915380477905, 'eval_accuracy': 0.7244348762109796, 'eval_mcc': 0.29978166275153867, 'eval_runtime': 178.133, 'eval_samples_per_second': 15.646, 'eval_steps_per_second': 0.124, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:16:15<1:02:28,  6.72s/it]

{'eval_loss': 0.32763829827308655, 'eval_accuracy': 0.7294581987800502, 'eval_mcc': 0.3175174161551942, 'eval_runtime': 151.4324, 'eval_samples_per_second': 18.404, 'eval_steps_per_second': 0.145, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:27:30<58:46,  7.11s/it]

{'eval_loss': 0.357371985912323, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.26964094820571355, 'eval_runtime': 132.5511, 'eval_samples_per_second': 21.026, 'eval_steps_per_second': 0.166, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:39:06<1:06:28,  9.19s/it]

{'eval_loss': 0.4910358190536499, 'eval_accuracy': 0.7115177610333692, 'eval_mcc': 0.253176337785139, 'eval_runtime': 145.4765, 'eval_samples_per_second': 19.158, 'eval_steps_per_second': 0.151, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:54:58<1:01:09,  9.86s/it]

{'eval_loss': 0.5719196200370789, 'eval_accuracy': 0.7140294223179046, 'eval_mcc': 0.26229991838499406, 'eval_runtime': 159.5963, 'eval_samples_per_second': 17.463, 'eval_steps_per_second': 0.138, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:06:28<44:58,  8.70s/it]

{'eval_loss': 0.562828004360199, 'eval_accuracy': 0.7244348762109796, 'eval_mcc': 0.299463586457738, 'eval_runtime': 152.0777, 'eval_samples_per_second': 18.326, 'eval_steps_per_second': 0.145, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:17:27<26:45,  6.47s/it]

{'eval_loss': 0.617846667766571, 'eval_accuracy': 0.7093649085037675, 'eval_mcc': 0.24459751501769386, 'eval_runtime': 116.1321, 'eval_samples_per_second': 23.999, 'eval_steps_per_second': 0.189, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:19:24<52:07, 13.03s/it] 

{'loss': 0.0967, 'grad_norm': 0.37820345163345337, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:30:17<28:24,  9.17s/it]

{'eval_loss': 0.5855653285980225, 'eval_accuracy': 0.7194115536419089, 'eval_mcc': 0.2840003819425707, 'eval_runtime': 190.4106, 'eval_samples_per_second': 14.637, 'eval_steps_per_second': 0.116, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:41:46<14:02,  6.79s/it]

{'eval_loss': 0.6197635531425476, 'eval_accuracy': 0.7176175098672407, 'eval_mcc': 0.27627899039793424, 'eval_runtime': 125.2923, 'eval_samples_per_second': 22.244, 'eval_steps_per_second': 0.176, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:53:57<08:29,  8.22s/it]

{'eval_loss': 0.6740710735321045, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.2707701345709899, 'eval_runtime': 161.0871, 'eval_samples_per_second': 17.301, 'eval_steps_per_second': 0.137, 'epoch': 19.0}



100%|██████████| 1240/1240 [4:09:13<00:00,  9.64s/it]

{'eval_loss': 0.6774826645851135, 'eval_accuracy': 0.7190527448869752, 'eval_mcc': 0.2990029495005887, 'eval_runtime': 169.714, 'eval_samples_per_second': 16.422, 'eval_steps_per_second': 0.13, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [4:09:16<00:00, 12.06s/it]


{'train_runtime': 14956.603, 'train_samples_per_second': 10.522, 'train_steps_per_second': 0.083, 'train_loss': 0.2117692808951101, 'epoch': 20.0}
sberbank-ai_ruT5-base_0.0001_0.0001_128_1
train {'train_runtime': 14956.603, 'train_samples_per_second': 10.522, 'train_steps_per_second': 0.083, 'total_flos': 4771232647372800.0, 'train_loss': 0.2117692808951101, 'epoch': 20.0}


100%|██████████| 22/22 [02:20<00:00,  6.37s/it]


dev {'test_loss': 0.32763829827308655, 'test_accuracy': 0.7294581987800502, 'test_mcc': 0.3175174161551942, 'test_runtime': 173.2697, 'test_samples_per_second': 16.085, 'test_steps_per_second': 0.127}


100%|██████████| 22/22 [02:14<00:00,  6.10s/it]

  5%|▌         | 62/1240 [12:23<2:23:27,  7.31s/it]

{'eval_loss': 0.343330055475235, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 233.8407, 'eval_samples_per_second': 11.918, 'eval_steps_per_second': 0.094, 'epoch': 1.0}



 10%|█         | 124/1240 [24:07<2:53:00,  9.30s/it]

{'eval_loss': 0.3553447723388672, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 110.7782, 'eval_samples_per_second': 25.158, 'eval_steps_per_second': 0.199, 'epoch': 2.0}



 15%|█▌        | 186/1240 [37:03<2:42:02,  9.22s/it]

{'eval_loss': 0.30913954973220825, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 164.9809, 'eval_samples_per_second': 16.893, 'eval_steps_per_second': 0.133, 'epoch': 3.0}



 20%|██        | 248/1240 [49:22<2:32:58,  9.25s/it]

{'eval_loss': 0.23191556334495544, 'eval_accuracy': 0.684248295658414, 'eval_mcc': 0.12256541621633768, 'eval_runtime': 147.8714, 'eval_samples_per_second': 18.847, 'eval_steps_per_second': 0.149, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:01:17<2:24:05,  9.30s/it]

{'eval_loss': 0.22061815857887268, 'eval_accuracy': 0.6942949407965554, 'eval_mcc': 0.17993597112763243, 'eval_runtime': 156.5656, 'eval_samples_per_second': 17.801, 'eval_steps_per_second': 0.141, 'epoch': 5.0}



 30%|███       | 372/1240 [1:14:40<2:08:37,  8.89s/it]

{'eval_loss': 0.2561188042163849, 'eval_accuracy': 0.6935773232866882, 'eval_mcc': 0.18373080989896387, 'eval_runtime': 192.1679, 'eval_samples_per_second': 14.503, 'eval_steps_per_second': 0.114, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:28:03<2:06:11,  9.39s/it]

{'eval_loss': 0.24257692694664001, 'eval_accuracy': 0.7143882310728382, 'eval_mcc': 0.26413560702370326, 'eval_runtime': 138.1143, 'eval_samples_per_second': 20.179, 'eval_steps_per_second': 0.159, 'epoch': 7.0}



 40%|████      | 496/1240 [1:40:11<1:52:40,  9.09s/it]

{'eval_loss': 0.2455579787492752, 'eval_accuracy': 0.7140294223179046, 'eval_mcc': 0.27024786625698877, 'eval_runtime': 155.311, 'eval_samples_per_second': 17.945, 'eval_steps_per_second': 0.142, 'epoch': 8.0}


 40%|████      | 500/1240 [1:41:47<6:43:35, 32.72s/it] 

{'loss': 0.4007, 'grad_norm': 1.1198604106903076, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:51:38<1:24:27,  7.43s/it]

{'eval_loss': 0.28977158665657043, 'eval_accuracy': 0.7290993900251166, 'eval_mcc': 0.3184056981991716, 'eval_runtime': 115.4373, 'eval_samples_per_second': 24.143, 'eval_steps_per_second': 0.191, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:03:53<1:20:21,  7.78s/it]

{'eval_loss': 0.27423983812332153, 'eval_accuracy': 0.7208467886616433, 'eval_mcc': 0.2878234803222649, 'eval_runtime': 160.548, 'eval_samples_per_second': 17.359, 'eval_steps_per_second': 0.137, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:15:55<1:22:06,  8.83s/it]

{'eval_loss': 0.31429314613342285, 'eval_accuracy': 0.722999641191245, 'eval_mcc': 0.2947105569307808, 'eval_runtime': 141.391, 'eval_samples_per_second': 19.711, 'eval_steps_per_second': 0.156, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:28:16<1:03:25,  7.67s/it]

{'eval_loss': 0.43837088346481323, 'eval_accuracy': 0.7233584499461787, 'eval_mcc': 0.2970308221817182, 'eval_runtime': 166.6631, 'eval_samples_per_second': 16.722, 'eval_steps_per_second': 0.132, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:40:44<1:03:09,  8.73s/it]

{'eval_loss': 0.4460804760456085, 'eval_accuracy': 0.7226408324363115, 'eval_mcc': 0.2954996017316337, 'eval_runtime': 155.3261, 'eval_samples_per_second': 17.943, 'eval_steps_per_second': 0.142, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:53:16<59:08,  9.54s/it]

{'eval_loss': 0.5550039410591125, 'eval_accuracy': 0.7168998923573735, 'eval_mcc': 0.2762690553732589, 'eval_runtime': 193.6091, 'eval_samples_per_second': 14.395, 'eval_steps_per_second': 0.114, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [3:06:02<42:08,  8.16s/it]

{'eval_loss': 0.5405493378639221, 'eval_accuracy': 0.7201291711517761, 'eval_mcc': 0.28567298996968765, 'eval_runtime': 181.8624, 'eval_samples_per_second': 15.325, 'eval_steps_per_second': 0.121, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:18:23<27:41,  6.70s/it]

{'eval_loss': 0.4862962067127228, 'eval_accuracy': 0.7294581987800502, 'eval_mcc': 0.31628282922111184, 'eval_runtime': 152.1602, 'eval_samples_per_second': 18.316, 'eval_steps_per_second': 0.145, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:20:29<53:56, 13.49s/it] 

{'loss': 0.0975, 'grad_norm': 0.36904171109199524, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:30:36<28:48,  9.29s/it]

{'eval_loss': 0.6576849222183228, 'eval_accuracy': 0.721205597416577, 'eval_mcc': 0.28901524367517323, 'eval_runtime': 146.753, 'eval_samples_per_second': 18.991, 'eval_steps_per_second': 0.15, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:43:50<19:53,  9.62s/it]

{'eval_loss': 0.7119777798652649, 'eval_accuracy': 0.7233584499461787, 'eval_mcc': 0.29572040929333215, 'eval_runtime': 148.82, 'eval_samples_per_second': 18.727, 'eval_steps_per_second': 0.148, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:58:14<11:50, 11.47s/it]

{'eval_loss': 0.8690889477729797, 'eval_accuracy': 0.7151058485827054, 'eval_mcc': 0.26703005596123897, 'eval_runtime': 135.7179, 'eval_samples_per_second': 20.535, 'eval_steps_per_second': 0.162, 'epoch': 19.0}



100%|██████████| 1240/1240 [4:10:29<00:00,  7.49s/it]

{'eval_loss': 0.6811937093734741, 'eval_accuracy': 0.7273053462504485, 'eval_mcc': 0.3093599073835068, 'eval_runtime': 163.9506, 'eval_samples_per_second': 16.999, 'eval_steps_per_second': 0.134, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [4:10:38<00:00, 12.13s/it]


{'train_runtime': 15038.1987, 'train_samples_per_second': 10.465, 'train_steps_per_second': 0.082, 'train_loss': 0.20863828812876056, 'epoch': 20.0}
sberbank-ai_ruT5-base_0.0001_0.0001_128_2
train {'train_runtime': 15038.1987, 'train_samples_per_second': 10.465, 'train_steps_per_second': 0.082, 'total_flos': 4777265136476160.0, 'train_loss': 0.20863828812876056, 'epoch': 20.0}


100%|██████████| 22/22 [02:13<00:00,  6.08s/it]


dev {'test_loss': 0.28977158665657043, 'test_accuracy': 0.7290993900251166, 'test_mcc': 0.3184056981991716, 'test_runtime': 167.4547, 'test_samples_per_second': 16.643, 'test_steps_per_second': 0.131}


100%|██████████| 22/22 [02:13<00:00,  6.09s/it]

  5%|▌         | 62/1240 [11:04<2:54:29,  8.89s/it]

{'eval_loss': 0.3242959678173065, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 122.9771, 'eval_samples_per_second': 22.663, 'eval_steps_per_second': 0.179, 'epoch': 1.0}



 10%|█         | 124/1240 [22:59<2:39:08,  8.56s/it]

{'eval_loss': 0.2421770542860031, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 149.0021, 'eval_samples_per_second': 18.704, 'eval_steps_per_second': 0.148, 'epoch': 2.0}



 15%|█▌        | 186/1240 [35:14<2:24:29,  8.23s/it]

{'eval_loss': 0.23346801102161407, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 126.8651, 'eval_samples_per_second': 21.968, 'eval_steps_per_second': 0.173, 'epoch': 3.0}



 20%|██        | 248/1240 [48:11<2:26:42,  8.87s/it]

{'eval_loss': 0.24769705533981323, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 180.4461, 'eval_samples_per_second': 15.445, 'eval_steps_per_second': 0.122, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:00:06<1:55:38,  7.46s/it]

{'eval_loss': 0.27547958493232727, 'eval_accuracy': 0.6849659131682813, 'eval_mcc': 0.13510444123666368, 'eval_runtime': 139.6941, 'eval_samples_per_second': 19.951, 'eval_steps_per_second': 0.157, 'epoch': 5.0}



 30%|███       | 372/1240 [1:11:57<2:04:59,  8.64s/it]

{'eval_loss': 0.25691038370132446, 'eval_accuracy': 0.6960889845712236, 'eval_mcc': 0.19823982648405056, 'eval_runtime': 148.97, 'eval_samples_per_second': 18.708, 'eval_steps_per_second': 0.148, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:24:00<1:51:01,  8.26s/it]

{'eval_loss': 0.2272300273180008, 'eval_accuracy': 0.7118765697883028, 'eval_mcc': 0.2557805786516813, 'eval_runtime': 154.8347, 'eval_samples_per_second': 18.0, 'eval_steps_per_second': 0.142, 'epoch': 7.0}



 40%|████      | 496/1240 [1:35:37<1:45:05,  8.48s/it]

{'eval_loss': 0.2641617953777313, 'eval_accuracy': 0.7133118048080374, 'eval_mcc': 0.2630614455700378, 'eval_runtime': 113.9762, 'eval_samples_per_second': 24.452, 'eval_steps_per_second': 0.193, 'epoch': 8.0}


 40%|████      | 500/1240 [1:37:05<5:35:14, 27.18s/it] 

{'loss': 0.4151, 'grad_norm': 0.6403657793998718, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:46:41<1:18:16,  6.89s/it]

{'eval_loss': 0.29846447706222534, 'eval_accuracy': 0.7140294223179046, 'eval_mcc': 0.2694180781176133, 'eval_runtime': 133.7812, 'eval_samples_per_second': 20.833, 'eval_steps_per_second': 0.164, 'epoch': 9.0}



 50%|█████     | 620/1240 [1:58:25<1:34:14,  9.12s/it]

{'eval_loss': 0.3658027648925781, 'eval_accuracy': 0.7158234660925726, 'eval_mcc': 0.2706263578518465, 'eval_runtime': 108.5375, 'eval_samples_per_second': 25.678, 'eval_steps_per_second': 0.203, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:09:48<1:11:09,  7.65s/it]

{'eval_loss': 0.34464550018310547, 'eval_accuracy': 0.7290993900251166, 'eval_mcc': 0.31717196889845073, 'eval_runtime': 145.933, 'eval_samples_per_second': 19.098, 'eval_steps_per_second': 0.151, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:22:02<1:10:19,  8.51s/it]

{'eval_loss': 0.32509106397628784, 'eval_accuracy': 0.7265877287405813, 'eval_mcc': 0.32896836276269575, 'eval_runtime': 183.6962, 'eval_samples_per_second': 15.172, 'eval_steps_per_second': 0.12, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:34:56<1:06:47,  9.23s/it]

{'eval_loss': 0.4487420916557312, 'eval_accuracy': 0.7301758162899175, 'eval_mcc': 0.3187271658422486, 'eval_runtime': 178.8349, 'eval_samples_per_second': 15.584, 'eval_steps_per_second': 0.123, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:47:27<55:24,  8.94s/it]

{'eval_loss': 0.5947644710540771, 'eval_accuracy': 0.7104413347685683, 'eval_mcc': 0.24925391441216616, 'eval_runtime': 174.2144, 'eval_samples_per_second': 15.998, 'eval_steps_per_second': 0.126, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [2:59:08<44:58,  8.70s/it]

{'eval_loss': 0.4367785155773163, 'eval_accuracy': 0.725870111230714, 'eval_mcc': 0.3057011079462158, 'eval_runtime': 157.2962, 'eval_samples_per_second': 17.718, 'eval_steps_per_second': 0.14, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:11:29<37:51,  9.16s/it]

{'eval_loss': 0.7390432953834534, 'eval_accuracy': 0.7133118048080374, 'eval_mcc': 0.2607071568096838, 'eval_runtime': 147.0321, 'eval_samples_per_second': 18.955, 'eval_steps_per_second': 0.15, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:13:37<53:12, 13.30s/it] 

{'loss': 0.0911, 'grad_norm': 0.7487825155258179, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:23:29<23:45,  7.66s/it]

{'eval_loss': 0.6847727298736572, 'eval_accuracy': 0.7147470398277718, 'eval_mcc': 0.2650518527493611, 'eval_runtime': 149.2161, 'eval_samples_per_second': 18.678, 'eval_steps_per_second': 0.147, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:35:25<19:32,  9.46s/it]

{'eval_loss': 0.7630577087402344, 'eval_accuracy': 0.7204879799067098, 'eval_mcc': 0.2857600101720699, 'eval_runtime': 127.5439, 'eval_samples_per_second': 21.851, 'eval_steps_per_second': 0.172, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:48:18<09:27,  9.15s/it]

{'eval_loss': 0.5652984976768494, 'eval_accuracy': 0.7154646573376391, 'eval_mcc': 0.26772399476755987, 'eval_runtime': 198.8931, 'eval_samples_per_second': 14.013, 'eval_steps_per_second': 0.111, 'epoch': 19.0}



100%|██████████| 1240/1240 [4:00:00<00:00,  8.67s/it]

{'eval_loss': 0.635643720626831, 'eval_accuracy': 0.7233584499461787, 'eval_mcc': 0.2978623440738384, 'eval_runtime': 131.0406, 'eval_samples_per_second': 21.268, 'eval_steps_per_second': 0.168, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [4:00:03<00:00, 11.62s/it]


{'train_runtime': 14403.42, 'train_samples_per_second': 10.927, 'train_steps_per_second': 0.086, 'train_loss': 0.21124693578289402, 'epoch': 20.0}
sberbank-ai_ruT5-base_0.0001_0.0001_128_3
train {'train_runtime': 14403.42, 'train_samples_per_second': 10.927, 'train_steps_per_second': 0.086, 'total_flos': 4771870150164480.0, 'train_loss': 0.21124693578289402, 'epoch': 20.0}


100%|██████████| 22/22 [01:34<00:00,  4.30s/it]


dev {'test_loss': 0.32509106397628784, 'test_accuracy': 0.7265877287405813, 'test_mcc': 0.32896836276269575, 'test_runtime': 127.3675, 'test_samples_per_second': 21.882, 'test_steps_per_second': 0.173}


100%|██████████| 22/22 [01:29<00:00,  4.08s/it]

  5%|▌         | 62/1240 [12:10<2:22:17,  7.25s/it]

{'eval_loss': 0.23754224181175232, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 156.3143, 'eval_samples_per_second': 17.829, 'eval_steps_per_second': 0.141, 'epoch': 1.0}



 10%|█         | 124/1240 [24:25<2:57:01,  9.52s/it]

{'eval_loss': 0.2795906364917755, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 171.4872, 'eval_samples_per_second': 16.252, 'eval_steps_per_second': 0.128, 'epoch': 2.0}



 15%|█▌        | 186/1240 [36:30<2:39:30,  9.08s/it]

{'eval_loss': 0.28255823254585266, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 141.2637, 'eval_samples_per_second': 19.729, 'eval_steps_per_second': 0.156, 'epoch': 3.0}



 20%|██        | 248/1240 [48:33<2:25:21,  8.79s/it]

{'eval_loss': 0.271607905626297, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 172.7655, 'eval_samples_per_second': 16.132, 'eval_steps_per_second': 0.127, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:00:30<2:13:21,  8.60s/it]

{'eval_loss': 0.26222047209739685, 'eval_accuracy': 0.6820954431288123, 'eval_mcc': 0.1146073652974931, 'eval_runtime': 163.3102, 'eval_samples_per_second': 17.066, 'eval_steps_per_second': 0.135, 'epoch': 5.0}



 30%|███       | 372/1240 [1:12:40<2:05:50,  8.70s/it]

{'eval_loss': 0.26355016231536865, 'eval_accuracy': 0.6885540007176175, 'eval_mcc': 0.16347973892013445, 'eval_runtime': 191.8162, 'eval_samples_per_second': 14.53, 'eval_steps_per_second': 0.115, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:25:15<2:00:34,  8.98s/it]

{'eval_loss': 0.2628893554210663, 'eval_accuracy': 0.6917832795120201, 'eval_mcc': 0.17270619135700643, 'eval_runtime': 193.2451, 'eval_samples_per_second': 14.422, 'eval_steps_per_second': 0.114, 'epoch': 7.0}



 40%|████      | 496/1240 [1:36:51<1:27:57,  7.09s/it]

{'eval_loss': 0.24528126418590546, 'eval_accuracy': 0.7043415859346968, 'eval_mcc': 0.22770608383877772, 'eval_runtime': 153.2511, 'eval_samples_per_second': 18.186, 'eval_steps_per_second': 0.144, 'epoch': 8.0}


 40%|████      | 500/1240 [1:38:16<6:06:36, 29.73s/it] 

{'loss': 0.4158, 'grad_norm': 0.8275555372238159, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:48:30<1:35:34,  8.41s/it]

{'eval_loss': 0.31903693079948425, 'eval_accuracy': 0.6982418371008252, 'eval_mcc': 0.21026919383852052, 'eval_runtime': 136.5554, 'eval_samples_per_second': 20.409, 'eval_steps_per_second': 0.161, 'epoch': 9.0}



 50%|█████     | 620/1240 [2:00:32<1:30:34,  8.77s/it]

{'eval_loss': 0.28012990951538086, 'eval_accuracy': 0.7118765697883028, 'eval_mcc': 0.2550023750589084, 'eval_runtime': 143.5488, 'eval_samples_per_second': 19.415, 'eval_steps_per_second': 0.153, 'epoch': 10.0}



 55%|█████▌    | 682/1240 [2:11:58<1:05:08,  7.00s/it]

{'eval_loss': 0.3365243077278137, 'eval_accuracy': 0.7068532472192322, 'eval_mcc': 0.23480378203773392, 'eval_runtime': 138.0082, 'eval_samples_per_second': 20.194, 'eval_steps_per_second': 0.159, 'epoch': 11.0}



 60%|██████    | 744/1240 [2:24:03<1:08:57,  8.34s/it]

{'eval_loss': 0.3063610792160034, 'eval_accuracy': 0.7086472909939002, 'eval_mcc': 0.2468324986856367, 'eval_runtime': 158.413, 'eval_samples_per_second': 17.593, 'eval_steps_per_second': 0.139, 'epoch': 12.0}



 65%|██████▌   | 806/1240 [2:35:34<1:00:38,  8.38s/it]

{'eval_loss': 0.36839058995246887, 'eval_accuracy': 0.7151058485827054, 'eval_mcc': 0.2693953932356712, 'eval_runtime': 137.2816, 'eval_samples_per_second': 20.301, 'eval_steps_per_second': 0.16, 'epoch': 13.0}



 70%|███████   | 868/1240 [2:47:42<57:47,  9.32s/it]

{'eval_loss': 0.45988407731056213, 'eval_accuracy': 0.7064944384642985, 'eval_mcc': 0.23354066959369477, 'eval_runtime': 144.7195, 'eval_samples_per_second': 19.258, 'eval_steps_per_second': 0.152, 'epoch': 14.0}



 75%|███████▌  | 930/1240 [2:58:52<36:04,  6.98s/it]

{'eval_loss': 0.41088706254959106, 'eval_accuracy': 0.7111589522784356, 'eval_mcc': 0.2559201720678133, 'eval_runtime': 133.3281, 'eval_samples_per_second': 20.903, 'eval_steps_per_second': 0.165, 'epoch': 15.0}



 80%|████████  | 992/1240 [3:10:18<27:49,  6.73s/it]

{'eval_loss': 0.5359167456626892, 'eval_accuracy': 0.7064944384642985, 'eval_mcc': 0.23696697429032273, 'eval_runtime': 130.4088, 'eval_samples_per_second': 21.371, 'eval_steps_per_second': 0.169, 'epoch': 16.0}


 81%|████████  | 1000/1240 [3:12:11<48:49, 12.21s/it] 

{'loss': 0.1086, 'grad_norm': 1.8383554220199585, 'learning_rate': 0.0001, 'epoch': 16.13}



 85%|████████▌ | 1054/1240 [3:22:14<26:16,  8.47s/it]

{'eval_loss': 0.43077927827835083, 'eval_accuracy': 0.7143882310728382, 'eval_mcc': 0.27855989085581545, 'eval_runtime': 179.6138, 'eval_samples_per_second': 15.517, 'eval_steps_per_second': 0.122, 'epoch': 17.0}



 90%|█████████ | 1116/1240 [3:34:28<18:18,  8.86s/it]

{'eval_loss': 0.5534948110580444, 'eval_accuracy': 0.71259418729817, 'eval_mcc': 0.25867374073926663, 'eval_runtime': 168.4426, 'eval_samples_per_second': 16.546, 'eval_steps_per_second': 0.131, 'epoch': 18.0}



 95%|█████████▌| 1178/1240 [3:45:30<08:48,  8.53s/it]

{'eval_loss': 0.6247324347496033, 'eval_accuracy': 0.7050592034445641, 'eval_mcc': 0.23980534656087946, 'eval_runtime': 148.7044, 'eval_samples_per_second': 18.742, 'eval_steps_per_second': 0.148, 'epoch': 19.0}



100%|██████████| 1240/1240 [3:56:59<00:00,  8.53s/it]

{'eval_loss': 0.5992313027381897, 'eval_accuracy': 0.710800143523502, 'eval_mcc': 0.25567985616580136, 'eval_runtime': 151.3243, 'eval_samples_per_second': 18.417, 'eval_steps_per_second': 0.145, 'epoch': 20.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 1240/1240 [3:57:08<00:00, 11.47s/it]


{'train_runtime': 14228.6839, 'train_samples_per_second': 11.061, 'train_steps_per_second': 0.087, 'train_loss': 0.22038038469129992, 'epoch': 20.0}
sberbank-ai_ruT5-base_0.0001_0.0001_128_4
train {'train_runtime': 14228.6839, 'train_samples_per_second': 11.061, 'train_steps_per_second': 0.087, 'total_flos': 4758473076572160.0, 'train_loss': 0.22038038469129992, 'epoch': 20.0}


100%|██████████| 22/22 [01:55<00:00,  5.26s/it]


dev {'test_loss': 0.43077927827835083, 'test_accuracy': 0.7143882310728382, 'test_mcc': 0.27855989085581545, 'test_runtime': 150.5935, 'test_samples_per_second': 18.507, 'test_steps_per_second': 0.146}


100%|██████████| 22/22 [01:43<00:00,  4.72s/it]

  5%|▌         | 62/1240 [11:21<2:44:14,  8.37s/it]

{'eval_loss': 0.2625352144241333, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 128.3645, 'eval_samples_per_second': 21.712, 'eval_steps_per_second': 0.171, 'epoch': 1.0}



 10%|█         | 124/1240 [23:12<2:50:43,  9.18s/it]

{'eval_loss': 0.23903633654117584, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 174.0634, 'eval_samples_per_second': 16.011, 'eval_steps_per_second': 0.126, 'epoch': 2.0}



 15%|█▌        | 186/1240 [35:48<2:42:35,  9.26s/it]

{'eval_loss': 0.30655959248542786, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 159.6299, 'eval_samples_per_second': 17.459, 'eval_steps_per_second': 0.138, 'epoch': 3.0}



 20%|██        | 248/1240 [47:59<2:04:47,  7.55s/it]

{'eval_loss': 0.2903214395046234, 'eval_accuracy': 0.6752780767850736, 'eval_mcc': 0.0, 'eval_runtime': 140.8446, 'eval_samples_per_second': 19.788, 'eval_steps_per_second': 0.156, 'epoch': 4.0}



 25%|██▌       | 310/1240 [1:00:29<2:11:54,  8.51s/it]

{'eval_loss': 0.22141499817371368, 'eval_accuracy': 0.6946537495514891, 'eval_mcc': 0.18540926533211063, 'eval_runtime': 177.2005, 'eval_samples_per_second': 15.728, 'eval_steps_per_second': 0.124, 'epoch': 5.0}



 30%|███       | 372/1240 [1:13:28<2:17:53,  9.53s/it]

{'eval_loss': 0.24811281263828278, 'eval_accuracy': 0.6878363832077503, 'eval_mcc': 0.15874813519743353, 'eval_runtime': 164.4099, 'eval_samples_per_second': 16.952, 'eval_steps_per_second': 0.134, 'epoch': 6.0}



 35%|███▌      | 434/1240 [1:25:22<2:00:31,  8.97s/it]

{'eval_loss': 0.24482044577598572, 'eval_accuracy': 0.710800143523502, 'eval_mcc': 0.2563355197527784, 'eval_runtime': 133.5693, 'eval_samples_per_second': 20.866, 'eval_steps_per_second': 0.165, 'epoch': 7.0}



 40%|████      | 496/1240 [1:38:47<1:39:31,  8.03s/it]

{'eval_loss': 0.2590676248073578, 'eval_accuracy': 0.7133118048080374, 'eval_mcc': 0.2619628465095906, 'eval_runtime': 142.7558, 'eval_samples_per_second': 19.523, 'eval_steps_per_second': 0.154, 'epoch': 8.0}


 40%|████      | 500/1240 [1:40:23<6:27:24, 31.41s/it] 

{'loss': 0.4248, 'grad_norm': 1.5460054874420166, 'learning_rate': 0.0001, 'epoch': 8.06}



 45%|████▌     | 558/1240 [1:52:05<1:56:10, 10.22s/it]

{'eval_loss': 0.3030848801136017, 'eval_accuracy': 0.7186939361320416, 'eval_mcc': 0.28756234541207065, 'eval_runtime': 203.076, 'eval_samples_per_second': 13.724, 'eval_steps_per_second': 0.108, 'epoch': 9.0}


 48%|████▊     | 590/1240 [1:59:04<2:31:54, 14.02s/it] 

Вывод. Действительно, Transformer — мощная архитектура: она позволяет использовать уже предобученные модели и с помощью дообучения создавать на их основе собственные модели.