In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import gc
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, accuracy_score, classification_report, matthews_corrcoef
import pandas as pd
import wandb
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
os.environ["WANDB_PROJECT"] = "DEBATE-ModernBERT-Large"

modname = "MoritzLaurer/ModernBERT-large-zeroshot-v2.0"
training_directory ='training_ModernLarge'
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

In [4]:
ds = load_dataset("mlburnham/Pol_NLI")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(modname)
id2label = {0: "entailment", 1: "not_entailment"}
label2id = {"entailment":0, "not_entailment":1}

def tokenize_function(docs):
    return tokenizer(docs['premise'], docs['augmented_hypothesis'], padding = False, truncation = False)
def model_init():
  return AutoModelForSequenceClassification.from_pretrained(modname, 
                                                           num_labels=2,
                                                           ignore_mismatched_sizes=True,
                                                           label2id = label2id, 
                                                           id2label = id2label)

dstok = ds.map(tokenize_function, batched = True)
dstok = dstok.rename_columns({'entailment':'label'})

training_args = TrainingArguments(output_dir=training_directory,
    logging_dir=f'{training_directory}/logs',
    lr_scheduler_type= "linear",
    group_by_length=True,
    learning_rate=9e-6 if "large" in modname else 2e-5,
    per_device_train_batch_size=4 if "large" in modname else 16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4 if "large" in modname else 1,
    num_train_epochs=10,
    warmup_ratio=0.06,
    weight_decay=0.01,
    fp16=True,
    fp16_full_eval=True,
    eval_strategy="epoch",
    seed=1,
    save_strategy="epoch",
    dataloader_num_workers = 1
)

def compute_metrics_standard(eval_pred, label_text_alphabetical=list(id2label.values())):
    labels = eval_pred.label_ids
    pred_logits = eval_pred.predictions
    preds_max = np.argmax(pred_logits, axis=1)

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(labels, preds_max, average='macro')
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(labels, preds_max, average='micro')
    acc_balanced = balanced_accuracy_score(labels, preds_max)
    acc_not_balanced = accuracy_score(labels, preds_max)
    mcc = matthews_corrcoef(labels, preds_max)

    metrics = {'MCC': mcc,
            'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy_balanced': acc_balanced,
            'accuracy': acc_not_balanced,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            }
    print("Aggregate metrics: ", {key: metrics[key] for key in metrics if key not in ["label_gold_raw", "label_predicted_raw"]} ) 
    print("Detailed metrics: ", classification_report(
        labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
        target_names=label_text_alphabetical, sample_weight=None,
        digits=2, output_dict=True, zero_division='warn'),
    "\n")

    return metrics

trainer = Trainer(
    model_init=model_init,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=dstok['train'],
    eval_dataset=dstok['validation'],
    compute_metrics=lambda x: compute_metrics_standard(x, label_text_alphabetical=list(id2label.values()))
)

In [10]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmlburnham[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Mcc,F1 Macro,F1 Micro,Accuracy Balanced,Accuracy,Precision Macro,Recall Macro,Precision Micro,Recall Micro
0,0.732,0.182307,0.893737,0.946868,0.948191,0.946724,0.948191,0.947013,0.946724,0.948191,0.948191
1,0.4766,0.204826,0.916039,0.957974,0.959098,0.956987,0.959098,0.959055,0.956987,0.959098,0.959098
2,0.2552,0.291375,0.905836,0.952747,0.95411,0.950857,0.95411,0.954989,0.950857,0.95411,0.95411
3,0.0991,0.3077,0.906207,0.952824,0.954243,0.950416,0.954243,0.955807,0.950416,0.954243,0.954243
4,0.0726,0.312559,0.911532,0.95573,0.956903,0.954854,0.956903,0.95668,0.954854,0.956903,0.956903
5,0.0454,0.374698,0.911276,0.955364,0.956704,0.952971,0.956704,0.95832,0.952971,0.956704,0.956704
6,0.0099,0.356787,0.919876,0.959918,0.96096,0.95926,0.96096,0.960617,0.95926,0.96096,0.96096
7,0.0151,0.478835,0.914809,0.957358,0.9585,0.956363,0.9585,0.958449,0.956363,0.9585,0.9585
8,0.0,0.505373,0.921822,0.960786,0.961891,0.959145,0.961891,0.962684,0.959145,0.961891,0.961891
9,0.0036,0.565398,0.920444,0.960109,0.961226,0.958549,0.961226,0.961902,0.958549,0.961226,0.961226


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.8937372016784191), 'f1_macro': 0.9468676111277339, 'f1_micro': 0.9481910082468742, 'accuracy_balanced': np.float64(0.9467243166518844), 'accuracy': 0.9481910082468742, 'precision_macro': 0.9470129316278186, 'recall_macro': 0.9467243166518844, 'precision_micro': 0.9481910082468742, 'recall_micro': 0.9481910082468742}
Detailed metrics:  {'entailment': {'precision': 0.939595192915876, 'recall': 0.9373718252090235, 'f1-score': 0.9384821922135355, 'support': 6339.0}, 'not_entailment': {'precision': 0.9544306703397613, 'recall': 0.9560768080947453, 'f1-score': 0.9552530300419323, 'support': 8697.0}, 'accuracy': 0.9481910082468742, 'macro avg': {'precision': 0.9470129316278186, 'recall': 0.9467243166518844, 'f1-score': 0.9468676111277339, 'support': 15036.0}, 'weighted avg': {'precision': 0.9481762082893483, 'recall': 0.9481910082468742, 'f1-score': 0.9481826429047809, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.9160394791926961), 'f1_macro': 0.9579741578387564, 'f1_micro': 0.959098164405427, 'accuracy_balanced': np.float64(0.9569868977454732), 'accuracy': 0.959098164405427, 'precision_macro': 0.9590549157861157, 'recall_macro': 0.9569868977454732, 'precision_micro': 0.959098164405427, 'recall_micro': 0.959098164405427}
Detailed metrics:  {'entailment': {'precision': 0.9588008977236294, 'recall': 0.9435242151758952, 'f1-score': 0.9511012165063211, 'support': 6339.0}, 'not_entailment': {'precision': 0.9593089338486019, 'recall': 0.9704495803150511, 'f1-score': 0.9648470991711918, 'support': 8697.0}, 'accuracy': 0.959098164405427, 'macro avg': {'precision': 0.9590549157861157, 'recall': 0.9569868977454732, 'f1-score': 0.9579741578387564, 'support': 15036.0}, 'weighted avg': {'precision': 0.9590947518190595, 'recall': 0.959098164405427, 'f1-score': 0.9590519974012652, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.9058364461756198), 'f1_macro': 0.952746781972066, 'f1_micro': 0.9541101356743815, 'accuracy_balanced': np.float64(0.9508572811062841), 'accuracy': 0.9541101356743815, 'precision_macro': 0.9549885859704261, 'recall_macro': 0.9508572811062841, 'precision_micro': 0.9541101356743815, 'recall_micro': 0.9541101356743815}
Detailed metrics:  {'entailment': {'precision': 0.9597916327527267, 'recall': 0.9301151601198927, 'f1-score': 0.944720397372216, 'support': 6339.0}, 'not_entailment': {'precision': 0.9501855391881255, 'recall': 0.9715994020926756, 'f1-score': 0.9607731665719159, 'support': 8697.0}, 'accuracy': 0.9541101356743815, 'macro avg': {'precision': 0.9549885859704261, 'recall': 0.9508572811062841, 'f1-score': 0.952746781972066, 'support': 15036.0}, 'weighted avg': {'precision': 0.9542353547711268, 'recall': 0.9541101356743815, 'f1-score': 0.9540055086870465, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.9062070506839708), 'f1_macro': 0.9528238536992801, 'f1_micro': 0.954243149773876, 'accuracy_balanced': np.float64(0.9504162349393346), 'accuracy': 0.954243149773876, 'precision_macro': 0.955806848758431, 'recall_macro': 0.9504162349393346, 'precision_micro': 0.954243149773876, 'recall_micro': 0.954243149773876}
Detailed metrics:  {'entailment': {'precision': 0.9640335030382657, 'recall': 0.9260135668086449, 'f1-score': 0.9446411329256518, 'support': 6339.0}, 'not_entailment': {'precision': 0.9475801944785962, 'recall': 0.9748189030700242, 'f1-score': 0.9610065744729086, 'support': 8697.0}, 'accuracy': 0.954243149773876, 'macro avg': {'precision': 0.955806848758431, 'recall': 0.9504162349393346, 'f1-score': 0.9528238536992801, 'support': 15036.0}, 'weighted avg': {'precision': 0.9545167150265974, 'recall': 0.954243149773876, 'f1-score': 0.9541070976194861, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.9115321229608633), 'f1_macro': 0.9557299725921278, 'f1_micro': 0.9569034317637669, 'accuracy_balanced': np.float64(0.9548544490511685), 'accuracy': 0.9569034317637669, 'precision_macro': 0.9566795009495819, 'recall_macro': 0.9548544490511685, 'precision_micro': 0.9569034317637669, 'recall_micro': 0.9569034317637669}
Detailed metrics:  {'entailment': {'precision': 0.9553528564570332, 'recall': 0.9417889256980596, 'f1-score': 0.9485224022878932, 'support': 6339.0}, 'not_entailment': {'precision': 0.9580061454421305, 'recall': 0.9679199724042773, 'f1-score': 0.9629375428963624, 'support': 8697.0}, 'accuracy': 0.9569034317637669, 'macro avg': {'precision': 0.9566795009495819, 'recall': 0.9548544490511685, 'f1-score': 0.9557299725921278, 'support': 15036.0}, 'weighted avg': {'precision': 0.9568875501457396, 'recall': 0.9569034317637669, 'f1-score': 0.9568602898824566, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.911275673647377), 'f1_macro': 0.9553641263882977, 'f1_micro': 0.9567039106145251, 'accuracy_balanced': np.float64(0.9529711193392568), 'accuracy': 0.9567039106145251, 'precision_macro': 0.9583202537243483, 'recall_macro': 0.9529711193392568, 'precision_micro': 0.9567039106145251, 'recall_micro': 0.9567039106145251}
Detailed metrics:  {'entailment': {'precision': 0.9668417596848325, 'recall': 0.9291686385865279, 'f1-score': 0.9476309226932669, 'support': 6339.0}, 'not_entailment': {'precision': 0.9497987477638641, 'recall': 0.9767736000919858, 'f1-score': 0.9630973300833287, 'support': 8697.0}, 'accuracy': 0.9567039106145251, 'macro avg': {'precision': 0.9583202537243483, 'recall': 0.9529711193392568, 'f1-score': 0.9553641263882977, 'support': 15036.0}, 'weighted avg': {'precision': 0.956983880283618, 'recall': 0.9567039106145251, 'f1-score': 0.956576875411501, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.9198756977445397), 'f1_macro': 0.9599177788154878, 'f1_micro': 0.9609603617983506, 'accuracy_balanced': np.float64(0.9592596051066888), 'accuracy': 0.9609603617983506, 'precision_macro': 0.9606170942818022, 'recall_macro': 0.9592596051066888, 'precision_micro': 0.9609603617983506, 'recall_micro': 0.9609603617983506}
Detailed metrics:  {'entailment': {'precision': 0.9585459183673469, 'recall': 0.9484145764316139, 'f1-score': 0.9534533343906114, 'support': 6339.0}, 'not_entailment': {'precision': 0.9626882701962575, 'recall': 0.9701046337817638, 'f1-score': 0.9663822232403643, 'support': 8697.0}, 'accuracy': 0.9609603617983506, 'macro avg': {'precision': 0.9606170942818022, 'recall': 0.9592596051066888, 'f1-score': 0.9599177788154878, 'support': 15036.0}, 'weighted avg': {'precision': 0.9609419035932072, 'recall': 0.9609603617983506, 'f1-score': 0.9609315564128448, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.9148093048376341), 'f1_macro': 0.9573581922323794, 'f1_micro': 0.9584996009577015, 'accuracy_balanced': np.float64(0.956362549417713), 'accuracy': 0.9584996009577015, 'precision_macro': 0.9584491350601136, 'recall_macro': 0.956362549417713, 'precision_micro': 0.9584996009577015, 'recall_micro': 0.9584996009577015}
Detailed metrics:  {'entailment': {'precision': 0.9581529581529582, 'recall': 0.9427354472314246, 'f1-score': 0.950381679389313, 'support': 6339.0}, 'not_entailment': {'precision': 0.958745311967269, 'recall': 0.9699896516040014, 'f1-score': 0.9643347050754458, 'support': 8697.0}, 'accuracy': 0.9584996009577015, 'macro avg': {'precision': 0.9584491350601136, 'recall': 0.956362549417713, 'f1-score': 0.9573581922323794, 'support': 15036.0}, 'weighted avg': {'precision': 0.9584955825958327, 'recall': 0.9584996009577015, 'f1-score': 0.9584522742544564, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.921821926774226), 'f1_macro': 0.9607855911921364, 'f1_micro': 0.9618914604948124, 'accuracy_balanced': np.float64(0.9591448950116943), 'accuracy': 0.9618914604948124, 'precision_macro': 0.9626838248196701, 'recall_macro': 0.9591448950116943, 'precision_micro': 0.9618914604948124, 'recall_micro': 0.9618914604948124}
Detailed metrics:  {'entailment': {'precision': 0.9671095268956578, 'recall': 0.9416311721091655, 'f1-score': 0.9542003037327152, 'support': 6339.0}, 'not_entailment': {'precision': 0.9582581227436823, 'recall': 0.9766586179142233, 'f1-score': 0.9673708786515575, 'support': 8697.0}, 'accuracy': 0.9618914604948124, 'macro avg': {'precision': 0.9626838248196701, 'recall': 0.9591448950116943, 'f1-score': 0.9607855911921364, 'support': 15036.0}, 'weighted avg': {'precision': 0.9619897701844492, 'recall': 0.9618914604948124, 'f1-score': 0.9618183198320216, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Aggregate metrics:  {'MCC': np.float64(0.9204441364123934), 'f1_macro': 0.9601085447709636, 'f1_micro': 0.9612263899973397, 'accuracy_balanced': np.float64(0.9585485984173163), 'accuracy': 0.9612263899973397, 'precision_macro': 0.9619016453109515, 'recall_macro': 0.9585485984173163, 'precision_micro': 0.9612263899973397, 'recall_micro': 0.9612263899973397}
Detailed metrics:  {'entailment': {'precision': 0.9656957928802589, 'recall': 0.9414734185202713, 'f1-score': 0.9534307852064862, 'support': 6339.0}, 'not_entailment': {'precision': 0.958107497741644, 'recall': 0.9756237783143613, 'f1-score': 0.9667863043354412, 'support': 8697.0}, 'accuracy': 0.9612263899973397, 'macro avg': {'precision': 0.9619016453109515, 'recall': 0.9585485984173163, 'f1-score': 0.9601085447709636, 'support': 15036.0}, 'weighted avg': {'precision': 0.9613066333417823, 'recall': 0.9612263899973397, 'f1-score': 0.9611557752214185, 'support': 15036.0}} 



  labels, preds_max, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]),


TrainOutput(global_step=107050, training_loss=0.19017183457118003, metrics={'train_runtime': 23024.6099, 'train_samples_per_second': 74.394, 'train_steps_per_second': 4.649, 'total_flos': 1.1064603257603529e+18, 'train_loss': 0.19017183457118003, 'epoch': 9.999929944188871})