In [24]:
# Import necessary libraries
from datasets import Dataset
import pandas as pd

# Load the dataset from the local files using raw string literals
train_dataset = Dataset.from_text(r"C:\Users\bless\OneDrive\Desktop\Google Colab\train.txt")
val_dataset = Dataset.from_text(r"C:\Users\bless\OneDrive\Desktop\Google Colab\validation.txt")
test_dataset = Dataset.from_text(r"C:\Users\bless\OneDrive\Desktop\Google Colab\test.txt")

# Combine the datasets into a single dictionary
dataset = {
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
}

# Inspect the dataset structure
print(dataset)

# Optionally, display the first few rows of each dataset to confirm successful upload
print("\nTrain Dataset:")
print(train_dataset.to_pandas().head())

print("\nValidation Dataset:")
print(val_dataset.to_pandas().head())

print("\nTest Dataset:")
print(test_dataset.to_pandas().head())


{'train': Dataset({
    features: ['text'],
    num_rows: 144328
}), 'validation': Dataset({
    features: ['text'],
    num_rows: 20618
}), 'test': Dataset({
    features: ['text'],
    num_rows: 41238
})}

Train Dataset:
                                            text
0                                               
1  iimphathiswa ezibuya emisebenzini yezefundo -
2                                          IBIZO
3                                               
4                                        wabantu

Validation Dataset:
                                                text
0  Amaprojekthi ahlathulula bona ilimi emthethwen...
1                                                   
2  â€¢ Abadosa umtato baseSewula Afrika : 0861 84...
3  Umma ogade alipholisa gade kufanele abuyiselwe...
4                           ithungelelwanohlanganiso

Test Dataset:
                                                text
0                                                   
1                      

In [25]:
pip install transformers datasets torch


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
import re
import textwrap

def clean_text(text):
    # Remove lines containing only underscores
    text = re.sub(r'^_*\s*$', '', text, flags=re.MULTILINE)

    # Remove HTML links
    text = re.sub(r'<[^>]*>', '', text)

    # Remove lines containing only numbers
    text = re.sub(r'\b\d+\b', '', text)

    # Remove ellipses
    text = re.sub(r'\.{3,}', '', text)

    # Remove unusual symbols
    unusual_symbols = ['[', ']', '%', '#', '@', '|']
    for symbol in unusual_symbols:
        text = text.replace(symbol, '')

    # Remove digits, '/', and '-' at the beginning and end of sentences
    text = re.sub(r'^[0-9\/-]+|[0-9\/-]+$', '', text)

    # Remove continuous characters
    text = re.sub(r'\.{2,}|-{2,}|\?{2,}|,{2,}', '', text)

    # Remove empty lines
    text = re.sub(r'\n\s*\n', '\n', text)

    # Remove non-closed brackets
    text = re.sub(r'\[[^\]]*$', '', text)

    # Normalize letters into small letters
    text = text.lower()

    return text

def process_file(file_path, num_paragraphs=3, words_per_paragraph=5000):
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read()

    cleaned_data = clean_text(data)
    sentences = cleaned_data.split('\n')

    paragraphs = []
    paragraph = ""
    word_count = 0

    for sentence in sentences:
        words = sentence.split()
        for word in words:
            if word_count < words_per_paragraph:
                paragraph += word + " "
                word_count += 1
            else:
                break
        if word_count >= words_per_paragraph:
            paragraphs.append(paragraph.strip())
            paragraph = ""
            word_count = 0
        paragraph += "\n"

    if paragraph:  # Add the last paragraph if not empty
        paragraphs.append(paragraph.strip())

    return paragraphs[:num_paragraphs]

def display_paragraphs(paragraphs):
    for i, paragraph in enumerate(paragraphs):
        wrapped_paragraph = textwrap.fill(paragraph, width=80)
        print(f"\nParagraph {i+1}:\n{wrapped_paragraph}\n")

# Load and clean the train dataset
train_file_path = r"C:\Users\bless\OneDrive\Desktop\Google Colab\train.txt"
train_paragraphs = process_file(train_file_path)

# Load and clean the validation dataset
validation_file_path = r"C:\Users\bless\OneDrive\Desktop\Google Colab\validation.txt"
val_paragraphs = process_file(validation_file_path)

# Load and clean the test dataset
test_file_path = r"C:\Users\bless\OneDrive\Desktop\Google Colab\test.txt"
test_paragraphs = process_file(test_file_path)

# Display cleaned paragraphs from each dataset
print("Train Dataset (first few paragraphs):")
display_paragraphs(train_paragraphs)

print("\nValidation Dataset (first few paragraphs):")
display_paragraphs(val_paragraphs)

print("\nTest Dataset (first few paragraphs):")
display_paragraphs(test_paragraphs)


Train Dataset (first few paragraphs):

Paragraph 1:
iimphathiswa ezibuya emisebenzini yezefundo -  ibizo  wabantu  service () .  .
imitjhado ekabili etranskei  * kungafuneka ukuthi uzenzele isikhathi
ozasisebenzisela ihlelo lokulingelela, njengokuthi nje wenze isikhathi sokunande
uvakatjhela indawo la kulingelelwa khona;  kweminye neminye yemibuzo elandelako,
khetha ipendulo ekuhlathulula ngcono.  amagadango okufanele alandelwe  northern
cape  lomzimba  -jame  ilaza  sikhuluma nje, ababanduli abanelwazi banikela
ngemisebenzi eyifihlo begodu banikela nangamakhondomu emangenelweni wemikhawulo
le elandelako:  iinqhema ezikhethekileko  zokulima  imibiko yeendaba yebhodo
(h) umbuso kufanele wethule amabanga atlolwe phasi phambi kwekhotho ukukhombisa
ukuqakatheka ngokungezelelwa kokubotjhwa komuntu loyo, begodu umbuso kufanele
unikele isibotjhwa leso ikhophi yamabanga wokubotjhwa kwaso kusese namalanga
amabili ngaphambi kobana ikhotho ihlolisise ukubotjhwa kwaso.  ifeksi: -
ukutransferwa nge

In [27]:
import textwrap
from datasets import Dataset, DatasetDict
from transformers import MT5Tokenizer, DataCollatorForSeq2Seq

# Constants
CHECKPOINT = "google/mt5-small"

# Process the files
def process_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read().strip().split('\n')
    return data

train_data = process_file('C:\\Users\\bless\\OneDrive\\Desktop\\Google Colab\\train.txt')
val_data = process_file('C:\\Users\\bless\\OneDrive\\Desktop\\Google Colab\\validation.txt')
test_data = process_file('C:\\Users\\bless\\OneDrive\\Desktop\\Google Colab\\test.txt')

# Assuming each line is an example with source and target texts separated by a tab
def format_data(data):
    formatted_data = {'source_text': [], 'target_text': []}
    for line in data:
        parts = line.split('\t')
        if len(parts) == 2:
            formatted_data['source_text'].append(parts[0])
            formatted_data['target_text'].append(parts[1])
    return formatted_data

# Format the datasets
train_formatted = format_data(train_data)
val_formatted = format_data(val_data)
test_formatted = format_data(test_data)

# Create Hugging Face datasets
train_dataset = Dataset.from_dict(train_formatted)
val_dataset = Dataset.from_dict(val_formatted)
test_dataset = Dataset.from_dict(test_formatted)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# Initialize the tokenizer
tokenizer = MT5Tokenizer.from_pretrained(CHECKPOINT)

# Define the preprocessing function
def preprocess_function(examples):
    inputs = [ex for ex in examples['source_text']]
    targets = [ex for ex in examples['target_text']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=30, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Display tokenized results from each split
for split in ['train', 'validation', 'test']:
    print(f"Sample tokenized data from the {split} split:")
    sample_data = tokenized_datasets[split].select(range(10))  # Selecting first 10 samples for display
    for i, sample in enumerate(sample_data):
        print(f"\nSample {i+1}:")
        print(f"Source Text (tokenized): {sample['input_ids']}")
        print(f"Target Text (tokenized): {sample['labels']}")

print(tokenized_datasets)


Map:   0%|          | 0/4127 [00:00<?, ? examples/s]



Map:   0%|          | 0/611 [00:00<?, ? examples/s]

Map:   0%|          | 0/1138 [00:00<?, ? examples/s]

Sample tokenized data from the train split:

Sample 1:
Source Text (tokenized): [12467, 1]
Target Text (tokenized): [336, 4973, 8874, 870, 415, 127579, 415, 26747, 62780, 1]

Sample 2:
Source Text (tokenized): [274, 334, 271, 1]
Target Text (tokenized): [3048, 163122, 601, 115948, 36061, 79199, 4467, 158413, 344, 40033, 1675, 690, 1783, 259, 143715, 16939, 11262, 32947, 259, 273, 86066, 181581, 6677, 6911, 4270, 170916, 20231, 370, 9453, 1]

Sample 3:
Source Text (tokenized): [336, 142357, 68924, 594, 9582, 264, 2156, 108770, 494, 7174, 1]
Target Text (tokenized): [27612, 163381, 1]

Sample 4:
Source Text (tokenized): [904, 1]
Target Text (tokenized): [23057, 57522, 18170, 260, 1]

Sample 5:
Source Text (tokenized): [259, 291, 1]
Target Text (tokenized): [11615, 213456, 273, 1]

Sample 6:
Source Text (tokenized): [274, 316, 271, 1]
Target Text (tokenized): [205424, 213699, 2658, 16939, 220988, 270, 8874, 273, 860, 266, 201250, 129771, 259, 119587, 16939, 54410, 18081, 186963, 704, 261,

In [30]:
from transformers import DataCollatorForSeq2Seq

# Create data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)


In [31]:
from torch.utils.data import DataLoader

batch_size = 4

train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch_size, collate_fn=data_collator)


In [7]:
from transformers import MT5ForConditionalGeneration

model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')


In [15]:
from transformers import Seq2SeqTrainingArguments

logging_steps = len(tokenized_datasets["train"]) // batch_size
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=60,
    predict_with_generate=True,
    fp16= True,
    logging_steps=logging_steps,
    report_to="none"

)




In [16]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer
)


In [23]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('google/mt5-small')

In [25]:

from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MT5ForConditionalGeneration

# Define constants and model
BATCH_SIZE = 4

# Assuming tokenized_dataset and tokenizer are already defined
# Initialize the model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=MT5ForConditionalGeneration.from_pretrained(CHECKPOINT))

# Training arguments
logging_steps = len(tokenized_datasets["train"]) // (BATCH_SIZE * 2)  # Increased logging steps
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=60,  # Training for 30 epochs
    predict_with_generate=True,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
    report_to="none"
)

# Initialize the model
model = MT5ForConditionalGeneration.from_pretrained(CHECKPOINT)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()



  0%|          | 0/6180 [00:00<?, ?it/s]

{'loss': 28.4274, 'grad_norm': 1431.539306640625, 'learning_rate': 1.983495145631068e-05, 'epoch': 0.5}
{'loss': 25.973, 'grad_norm': 2046.091064453125, 'learning_rate': 1.9669902912621363e-05, 'epoch': 0.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 19.482833862304688, 'eval_runtime': 85.5277, 'eval_samples_per_second': 7.144, 'eval_steps_per_second': 1.789, 'epoch': 1.0}
{'loss': 25.1725, 'grad_norm': 2829.773193359375, 'learning_rate': 1.9504854368932042e-05, 'epoch': 1.49}
{'loss': 22.3126, 'grad_norm': 2111.1748046875, 'learning_rate': 1.9339805825242717e-05, 'epoch': 1.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 15.532411575317383, 'eval_runtime': 80.1034, 'eval_samples_per_second': 7.628, 'eval_steps_per_second': 1.91, 'epoch': 2.0}
{'loss': 20.5776, 'grad_norm': 5762.08056640625, 'learning_rate': 1.91747572815534e-05, 'epoch': 2.48}
{'loss': 19.7206, 'grad_norm': 3373.3115234375, 'learning_rate': 1.900970873786408e-05, 'epoch': 2.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 13.24991226196289, 'eval_runtime': 81.8387, 'eval_samples_per_second': 7.466, 'eval_steps_per_second': 1.87, 'epoch': 3.0}
{'loss': 18.1973, 'grad_norm': 2393.6875, 'learning_rate': 1.884466019417476e-05, 'epoch': 3.47}
{'loss': 17.4646, 'grad_norm': 618.2042846679688, 'learning_rate': 1.867961165048544e-05, 'epoch': 3.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 11.847771644592285, 'eval_runtime': 80.0697, 'eval_samples_per_second': 7.631, 'eval_steps_per_second': 1.911, 'epoch': 4.0}
{'loss': 16.3605, 'grad_norm': 648.6400756835938, 'learning_rate': 1.851456310679612e-05, 'epoch': 4.46}
{'loss': 15.6595, 'grad_norm': 1178.1014404296875, 'learning_rate': 1.8349514563106798e-05, 'epoch': 4.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 10.74303150177002, 'eval_runtime': 81.601, 'eval_samples_per_second': 7.488, 'eval_steps_per_second': 1.875, 'epoch': 5.0}
{'loss': 14.6424, 'grad_norm': 980.7116088867188, 'learning_rate': 1.8184466019417477e-05, 'epoch': 5.45}
{'loss': 13.9763, 'grad_norm': 1720.4417724609375, 'learning_rate': 1.8019417475728156e-05, 'epoch': 5.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 10.017868995666504, 'eval_runtime': 83.5848, 'eval_samples_per_second': 7.31, 'eval_steps_per_second': 1.83, 'epoch': 6.0}
{'loss': 13.3695, 'grad_norm': 824.5519409179688, 'learning_rate': 1.7854368932038838e-05, 'epoch': 6.44}
{'loss': 12.7711, 'grad_norm': 539.0938110351562, 'learning_rate': 1.7689320388349517e-05, 'epoch': 6.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 9.418469429016113, 'eval_runtime': 80.9916, 'eval_samples_per_second': 7.544, 'eval_steps_per_second': 1.889, 'epoch': 7.0}
{'loss': 12.0483, 'grad_norm': 1339.1234130859375, 'learning_rate': 1.7524271844660196e-05, 'epoch': 7.43}
{'loss': 12.0329, 'grad_norm': 309.1724853515625, 'learning_rate': 1.7359223300970875e-05, 'epoch': 7.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.994001388549805, 'eval_runtime': 82.0367, 'eval_samples_per_second': 7.448, 'eval_steps_per_second': 1.865, 'epoch': 8.0}
{'loss': 11.4229, 'grad_norm': 242.5736846923828, 'learning_rate': 1.7194174757281554e-05, 'epoch': 8.42}
{'loss': 10.9626, 'grad_norm': 281.29827880859375, 'learning_rate': 1.7029126213592236e-05, 'epoch': 8.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.594511032104492, 'eval_runtime': 82.307, 'eval_samples_per_second': 7.423, 'eval_steps_per_second': 1.859, 'epoch': 9.0}
{'loss': 10.906, 'grad_norm': 227.44552612304688, 'learning_rate': 1.6864077669902915e-05, 'epoch': 9.41}
{'loss': 10.3617, 'grad_norm': 326.4125061035156, 'learning_rate': 1.6699029126213594e-05, 'epoch': 9.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.311810493469238, 'eval_runtime': 81.031, 'eval_samples_per_second': 7.54, 'eval_steps_per_second': 1.888, 'epoch': 10.0}
{'loss': 9.9972, 'grad_norm': 213.32530212402344, 'learning_rate': 1.6533980582524273e-05, 'epoch': 10.4}
{'loss': 9.9915, 'grad_norm': 215.3939666748047, 'learning_rate': 1.6368932038834952e-05, 'epoch': 10.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.019424438476562, 'eval_runtime': 82.4996, 'eval_samples_per_second': 7.406, 'eval_steps_per_second': 1.855, 'epoch': 11.0}
{'loss': 9.4466, 'grad_norm': 66.42263793945312, 'learning_rate': 1.620388349514563e-05, 'epoch': 11.39}
{'loss': 9.088, 'grad_norm': 71.0724105834961, 'learning_rate': 1.6038834951456313e-05, 'epoch': 11.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.733203887939453, 'eval_runtime': 82.5248, 'eval_samples_per_second': 7.404, 'eval_steps_per_second': 1.854, 'epoch': 12.0}
{'loss': 9.0453, 'grad_norm': 72.8863296508789, 'learning_rate': 1.5873786407766992e-05, 'epoch': 12.38}
{'loss': 8.8119, 'grad_norm': 36.717674255371094, 'learning_rate': 1.570873786407767e-05, 'epoch': 12.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.521734237670898, 'eval_runtime': 82.2465, 'eval_samples_per_second': 7.429, 'eval_steps_per_second': 1.86, 'epoch': 13.0}
{'loss': 8.7221, 'grad_norm': 10.724045753479004, 'learning_rate': 1.554368932038835e-05, 'epoch': 13.37}
{'loss': 8.5186, 'grad_norm': 71.12259674072266, 'learning_rate': 1.537864077669903e-05, 'epoch': 13.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.3050642013549805, 'eval_runtime': 79.598, 'eval_samples_per_second': 7.676, 'eval_steps_per_second': 1.922, 'epoch': 14.0}
{'loss': 8.382, 'grad_norm': 15.149255752563477, 'learning_rate': 1.521359223300971e-05, 'epoch': 14.36}
{'loss': 8.1242, 'grad_norm': 8.12736701965332, 'learning_rate': 1.5048543689320389e-05, 'epoch': 14.85}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.087971210479736, 'eval_runtime': 80.7488, 'eval_samples_per_second': 7.567, 'eval_steps_per_second': 1.895, 'epoch': 15.0}
{'loss': 8.0923, 'grad_norm': 19.44567108154297, 'learning_rate': 1.4883495145631068e-05, 'epoch': 15.35}
{'loss': 8.0737, 'grad_norm': 22.15884017944336, 'learning_rate': 1.4718446601941749e-05, 'epoch': 15.84}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.920391082763672, 'eval_runtime': 80.6535, 'eval_samples_per_second': 7.576, 'eval_steps_per_second': 1.897, 'epoch': 16.0}
{'loss': 7.7707, 'grad_norm': 278.51666259765625, 'learning_rate': 1.4553398058252427e-05, 'epoch': 16.34}
{'loss': 7.7777, 'grad_norm': 28.40470314025879, 'learning_rate': 1.4388349514563106e-05, 'epoch': 16.83}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.755517959594727, 'eval_runtime': 80.2719, 'eval_samples_per_second': 7.612, 'eval_steps_per_second': 1.906, 'epoch': 17.0}
{'loss': 7.7937, 'grad_norm': 25.30533218383789, 'learning_rate': 1.4223300970873787e-05, 'epoch': 17.33}
{'loss': 7.644, 'grad_norm': 18.780364990234375, 'learning_rate': 1.4058252427184466e-05, 'epoch': 17.83}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.619002342224121, 'eval_runtime': 79.5981, 'eval_samples_per_second': 7.676, 'eval_steps_per_second': 1.922, 'epoch': 18.0}
{'loss': 7.5731, 'grad_norm': 5.504852771759033, 'learning_rate': 1.3893203883495148e-05, 'epoch': 18.32}
{'loss': 7.4569, 'grad_norm': 7.346883296966553, 'learning_rate': 1.3728155339805826e-05, 'epoch': 18.82}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.486224174499512, 'eval_runtime': 80.2718, 'eval_samples_per_second': 7.612, 'eval_steps_per_second': 1.906, 'epoch': 19.0}
{'loss': 7.4029, 'grad_norm': 6.257999897003174, 'learning_rate': 1.3563106796116505e-05, 'epoch': 19.31}
{'loss': 7.4104, 'grad_norm': 27.4409236907959, 'learning_rate': 1.3398058252427187e-05, 'epoch': 19.81}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.376990795135498, 'eval_runtime': 80.7991, 'eval_samples_per_second': 7.562, 'eval_steps_per_second': 1.894, 'epoch': 20.0}
{'loss': 7.1678, 'grad_norm': 10.862713813781738, 'learning_rate': 1.3233009708737866e-05, 'epoch': 20.3}
{'loss': 7.2395, 'grad_norm': 9.81322193145752, 'learning_rate': 1.3067961165048543e-05, 'epoch': 20.8}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.277403831481934, 'eval_runtime': 79.9384, 'eval_samples_per_second': 7.643, 'eval_steps_per_second': 1.914, 'epoch': 21.0}
{'loss': 7.1663, 'grad_norm': 4.902098178863525, 'learning_rate': 1.2902912621359226e-05, 'epoch': 21.29}
{'loss': 7.0971, 'grad_norm': 15.619028091430664, 'learning_rate': 1.2737864077669904e-05, 'epoch': 21.79}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.1853928565979, 'eval_runtime': 81.5571, 'eval_samples_per_second': 7.492, 'eval_steps_per_second': 1.876, 'epoch': 22.0}
{'loss': 7.0354, 'grad_norm': 24.27941131591797, 'learning_rate': 1.2572815533980585e-05, 'epoch': 22.28}
{'loss': 6.9787, 'grad_norm': 6.409817695617676, 'learning_rate': 1.2407766990291264e-05, 'epoch': 22.78}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.095200061798096, 'eval_runtime': 80.8093, 'eval_samples_per_second': 7.561, 'eval_steps_per_second': 1.893, 'epoch': 23.0}
{'loss': 6.8506, 'grad_norm': 4.5065507888793945, 'learning_rate': 1.2242718446601943e-05, 'epoch': 23.27}
{'loss': 6.8355, 'grad_norm': 5.982316970825195, 'learning_rate': 1.2077669902912624e-05, 'epoch': 23.77}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.014676570892334, 'eval_runtime': 81.1382, 'eval_samples_per_second': 7.53, 'eval_steps_per_second': 1.886, 'epoch': 24.0}
{'loss': 6.8152, 'grad_norm': 19.871166229248047, 'learning_rate': 1.1912621359223303e-05, 'epoch': 24.26}
{'loss': 6.6991, 'grad_norm': 7.570495128631592, 'learning_rate': 1.1747572815533982e-05, 'epoch': 24.76}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.944276809692383, 'eval_runtime': 82.8231, 'eval_samples_per_second': 7.377, 'eval_steps_per_second': 1.847, 'epoch': 25.0}
{'loss': 6.7853, 'grad_norm': 5.743812084197998, 'learning_rate': 1.1582524271844662e-05, 'epoch': 25.25}
{'loss': 6.6643, 'grad_norm': 17.578317642211914, 'learning_rate': 1.1417475728155341e-05, 'epoch': 25.75}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.9044365882873535, 'eval_runtime': 81.1564, 'eval_samples_per_second': 7.529, 'eval_steps_per_second': 1.885, 'epoch': 26.0}
{'loss': 6.661, 'grad_norm': 14.096580505371094, 'learning_rate': 1.125242718446602e-05, 'epoch': 26.24}
{'loss': 6.5674, 'grad_norm': 18.521818161010742, 'learning_rate': 1.10873786407767e-05, 'epoch': 26.74}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.8542914390563965, 'eval_runtime': 82.0094, 'eval_samples_per_second': 7.45, 'eval_steps_per_second': 1.866, 'epoch': 27.0}
{'loss': 6.5548, 'grad_norm': 5.062905311584473, 'learning_rate': 1.092233009708738e-05, 'epoch': 27.23}
{'loss': 6.5313, 'grad_norm': 4.375811576843262, 'learning_rate': 1.075728155339806e-05, 'epoch': 27.73}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.806692600250244, 'eval_runtime': 82.1901, 'eval_samples_per_second': 7.434, 'eval_steps_per_second': 1.862, 'epoch': 28.0}
{'loss': 6.4801, 'grad_norm': 4.814663887023926, 'learning_rate': 1.059223300970874e-05, 'epoch': 28.22}
{'loss': 6.3778, 'grad_norm': 5.264479160308838, 'learning_rate': 1.0427184466019418e-05, 'epoch': 28.72}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.749115467071533, 'eval_runtime': 82.9031, 'eval_samples_per_second': 7.37, 'eval_steps_per_second': 1.846, 'epoch': 29.0}
{'loss': 6.3184, 'grad_norm': 22.8049259185791, 'learning_rate': 1.0262135922330099e-05, 'epoch': 29.21}
{'loss': 6.3744, 'grad_norm': 6.451199054718018, 'learning_rate': 1.0097087378640778e-05, 'epoch': 29.71}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.712360382080078, 'eval_runtime': 82.4216, 'eval_samples_per_second': 7.413, 'eval_steps_per_second': 1.856, 'epoch': 30.0}
{'loss': 6.2683, 'grad_norm': 20.01512336730957, 'learning_rate': 9.932038834951457e-06, 'epoch': 30.2}
{'loss': 6.2766, 'grad_norm': 6.220498561859131, 'learning_rate': 9.766990291262138e-06, 'epoch': 30.7}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.682943344116211, 'eval_runtime': 82.6594, 'eval_samples_per_second': 7.392, 'eval_steps_per_second': 1.851, 'epoch': 31.0}
{'loss': 6.3148, 'grad_norm': 6.916089057922363, 'learning_rate': 9.601941747572816e-06, 'epoch': 31.19}
{'loss': 6.2533, 'grad_norm': 8.867959022521973, 'learning_rate': 9.436893203883495e-06, 'epoch': 31.69}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.646299362182617, 'eval_runtime': 83.2432, 'eval_samples_per_second': 7.34, 'eval_steps_per_second': 1.838, 'epoch': 32.0}
{'loss': 6.242, 'grad_norm': 4.8075947761535645, 'learning_rate': 9.271844660194176e-06, 'epoch': 32.18}
{'loss': 6.2052, 'grad_norm': 4.8935699462890625, 'learning_rate': 9.106796116504855e-06, 'epoch': 32.68}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.612290382385254, 'eval_runtime': 82.969, 'eval_samples_per_second': 7.364, 'eval_steps_per_second': 1.844, 'epoch': 33.0}
{'loss': 6.1145, 'grad_norm': 4.045183181762695, 'learning_rate': 8.941747572815534e-06, 'epoch': 33.17}
{'loss': 6.1892, 'grad_norm': 7.050703525543213, 'learning_rate': 8.776699029126215e-06, 'epoch': 33.67}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.5911173820495605, 'eval_runtime': 83.9332, 'eval_samples_per_second': 7.28, 'eval_steps_per_second': 1.823, 'epoch': 34.0}
{'loss': 6.171, 'grad_norm': 4.436441421508789, 'learning_rate': 8.611650485436894e-06, 'epoch': 34.17}
{'loss': 6.0947, 'grad_norm': 4.040370464324951, 'learning_rate': 8.446601941747573e-06, 'epoch': 34.66}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.564172267913818, 'eval_runtime': 82.943, 'eval_samples_per_second': 7.367, 'eval_steps_per_second': 1.845, 'epoch': 35.0}
{'loss': 6.1447, 'grad_norm': 8.956969261169434, 'learning_rate': 8.281553398058253e-06, 'epoch': 35.16}
{'loss': 6.1312, 'grad_norm': 3.7405765056610107, 'learning_rate': 8.116504854368932e-06, 'epoch': 35.65}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.537458896636963, 'eval_runtime': 83.0979, 'eval_samples_per_second': 7.353, 'eval_steps_per_second': 1.841, 'epoch': 36.0}
{'loss': 6.1184, 'grad_norm': 5.037674903869629, 'learning_rate': 7.951456310679613e-06, 'epoch': 36.15}
{'loss': 6.0685, 'grad_norm': 6.831835746765137, 'learning_rate': 7.786407766990292e-06, 'epoch': 36.64}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.5177154541015625, 'eval_runtime': 81.7054, 'eval_samples_per_second': 7.478, 'eval_steps_per_second': 1.873, 'epoch': 37.0}
{'loss': 6.0651, 'grad_norm': 4.537544250488281, 'learning_rate': 7.6213592233009715e-06, 'epoch': 37.14}
{'loss': 5.9837, 'grad_norm': 4.199422836303711, 'learning_rate': 7.456310679611651e-06, 'epoch': 37.63}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.500777244567871, 'eval_runtime': 82.4675, 'eval_samples_per_second': 7.409, 'eval_steps_per_second': 1.855, 'epoch': 38.0}
{'loss': 6.0457, 'grad_norm': 4.296531677246094, 'learning_rate': 7.291262135922331e-06, 'epoch': 38.13}
{'loss': 6.0312, 'grad_norm': 4.471918106079102, 'learning_rate': 7.12621359223301e-06, 'epoch': 38.62}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.486743927001953, 'eval_runtime': 84.5283, 'eval_samples_per_second': 7.228, 'eval_steps_per_second': 1.81, 'epoch': 39.0}
{'loss': 6.0441, 'grad_norm': 5.359346389770508, 'learning_rate': 6.96116504854369e-06, 'epoch': 39.12}
{'loss': 5.954, 'grad_norm': 5.284548282623291, 'learning_rate': 6.79611650485437e-06, 'epoch': 39.61}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.468367576599121, 'eval_runtime': 83.4092, 'eval_samples_per_second': 7.325, 'eval_steps_per_second': 1.834, 'epoch': 40.0}
{'loss': 5.8938, 'grad_norm': 4.26351261138916, 'learning_rate': 6.631067961165049e-06, 'epoch': 40.11}
{'loss': 5.8811, 'grad_norm': 7.4968180656433105, 'learning_rate': 6.4660194174757285e-06, 'epoch': 40.6}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.453425407409668, 'eval_runtime': 84.3355, 'eval_samples_per_second': 7.245, 'eval_steps_per_second': 1.814, 'epoch': 41.0}
{'loss': 5.9898, 'grad_norm': 3.8698391914367676, 'learning_rate': 6.300970873786408e-06, 'epoch': 41.1}
{'loss': 5.9869, 'grad_norm': 4.425201416015625, 'learning_rate': 6.135922330097088e-06, 'epoch': 41.59}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.4404497146606445, 'eval_runtime': 83.9298, 'eval_samples_per_second': 7.28, 'eval_steps_per_second': 1.823, 'epoch': 42.0}
{'loss': 5.8695, 'grad_norm': 4.507405757904053, 'learning_rate': 5.970873786407767e-06, 'epoch': 42.09}
{'loss': 5.9714, 'grad_norm': 4.908419609069824, 'learning_rate': 5.805825242718447e-06, 'epoch': 42.58}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.429843902587891, 'eval_runtime': 84.8309, 'eval_samples_per_second': 7.203, 'eval_steps_per_second': 1.804, 'epoch': 43.0}
{'loss': 5.8858, 'grad_norm': 5.08618688583374, 'learning_rate': 5.640776699029127e-06, 'epoch': 43.08}
{'loss': 5.8368, 'grad_norm': 4.4491143226623535, 'learning_rate': 5.4757281553398064e-06, 'epoch': 43.57}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.417590618133545, 'eval_runtime': 83.9968, 'eval_samples_per_second': 7.274, 'eval_steps_per_second': 1.821, 'epoch': 44.0}
{'loss': 5.8145, 'grad_norm': 4.000674724578857, 'learning_rate': 5.310679611650485e-06, 'epoch': 44.07}
{'loss': 5.8971, 'grad_norm': 9.60603141784668, 'learning_rate': 5.145631067961165e-06, 'epoch': 44.56}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.406925201416016, 'eval_runtime': 84.5225, 'eval_samples_per_second': 7.229, 'eval_steps_per_second': 1.81, 'epoch': 45.0}
{'loss': 5.8633, 'grad_norm': 4.78414249420166, 'learning_rate': 4.980582524271845e-06, 'epoch': 45.06}
{'loss': 5.8323, 'grad_norm': 4.638838768005371, 'learning_rate': 4.815533980582525e-06, 'epoch': 45.55}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.401245594024658, 'eval_runtime': 84.5431, 'eval_samples_per_second': 7.227, 'eval_steps_per_second': 1.81, 'epoch': 46.0}
{'loss': 5.8521, 'grad_norm': 6.04580020904541, 'learning_rate': 4.650485436893205e-06, 'epoch': 46.05}
{'loss': 5.7553, 'grad_norm': 4.621269226074219, 'learning_rate': 4.4854368932038836e-06, 'epoch': 46.54}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.391746997833252, 'eval_runtime': 85.256, 'eval_samples_per_second': 7.167, 'eval_steps_per_second': 1.795, 'epoch': 47.0}
{'loss': 5.8583, 'grad_norm': 4.70026969909668, 'learning_rate': 4.320388349514563e-06, 'epoch': 47.04}
{'loss': 5.8365, 'grad_norm': 14.754907608032227, 'learning_rate': 4.155339805825243e-06, 'epoch': 47.53}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.384477615356445, 'eval_runtime': 85.2873, 'eval_samples_per_second': 7.164, 'eval_steps_per_second': 1.794, 'epoch': 48.0}
{'loss': 5.7489, 'grad_norm': 5.014030933380127, 'learning_rate': 3.990291262135922e-06, 'epoch': 48.03}
{'loss': 5.805, 'grad_norm': 4.404571056365967, 'learning_rate': 3.825242718446602e-06, 'epoch': 48.52}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.378818511962891, 'eval_runtime': 84.9271, 'eval_samples_per_second': 7.194, 'eval_steps_per_second': 1.802, 'epoch': 49.0}
{'loss': 5.8459, 'grad_norm': 1222.2996826171875, 'learning_rate': 3.6601941747572817e-06, 'epoch': 49.02}
{'loss': 5.7655, 'grad_norm': 4.9083428382873535, 'learning_rate': 3.4951456310679615e-06, 'epoch': 49.51}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.371776580810547, 'eval_runtime': 85.2402, 'eval_samples_per_second': 7.168, 'eval_steps_per_second': 1.795, 'epoch': 50.0}
{'loss': 5.7912, 'grad_norm': 7.386207103729248, 'learning_rate': 3.330097087378641e-06, 'epoch': 50.01}
{'loss': 5.7477, 'grad_norm': 10.932997703552246, 'learning_rate': 3.1650485436893207e-06, 'epoch': 50.5}
{'loss': 5.7732, 'grad_norm': 5.711125373840332, 'learning_rate': 3e-06, 'epoch': 51.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.364414215087891, 'eval_runtime': 85.2548, 'eval_samples_per_second': 7.167, 'eval_steps_per_second': 1.795, 'epoch': 51.0}
{'loss': 5.781, 'grad_norm': 5.912744045257568, 'learning_rate': 2.83495145631068e-06, 'epoch': 51.5}
{'loss': 5.7659, 'grad_norm': 4.292966365814209, 'learning_rate': 2.6699029126213593e-06, 'epoch': 51.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.361138343811035, 'eval_runtime': 87.6664, 'eval_samples_per_second': 6.97, 'eval_steps_per_second': 1.745, 'epoch': 52.0}
{'loss': 5.7179, 'grad_norm': 10.073630332946777, 'learning_rate': 2.504854368932039e-06, 'epoch': 52.49}
{'loss': 5.7934, 'grad_norm': 6.553524971008301, 'learning_rate': 2.3398058252427184e-06, 'epoch': 52.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.356017589569092, 'eval_runtime': 85.3976, 'eval_samples_per_second': 7.155, 'eval_steps_per_second': 1.792, 'epoch': 53.0}
{'loss': 5.7959, 'grad_norm': 5.59245491027832, 'learning_rate': 2.1747572815533982e-06, 'epoch': 53.48}
{'loss': 5.7121, 'grad_norm': 5.390320777893066, 'learning_rate': 2.009708737864078e-06, 'epoch': 53.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.353317737579346, 'eval_runtime': 86.2708, 'eval_samples_per_second': 7.082, 'eval_steps_per_second': 1.773, 'epoch': 54.0}
{'loss': 5.7004, 'grad_norm': 4.14031457901001, 'learning_rate': 1.8446601941747574e-06, 'epoch': 54.47}
{'loss': 5.6842, 'grad_norm': 4.221437454223633, 'learning_rate': 1.679611650485437e-06, 'epoch': 54.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.348621368408203, 'eval_runtime': 85.7623, 'eval_samples_per_second': 7.124, 'eval_steps_per_second': 1.784, 'epoch': 55.0}
{'loss': 5.7988, 'grad_norm': 13.314461708068848, 'learning_rate': 1.5145631067961166e-06, 'epoch': 55.46}
{'loss': 5.6636, 'grad_norm': 4.044473648071289, 'learning_rate': 1.3495145631067962e-06, 'epoch': 55.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.345564365386963, 'eval_runtime': 85.6415, 'eval_samples_per_second': 7.134, 'eval_steps_per_second': 1.787, 'epoch': 56.0}
{'loss': 5.8221, 'grad_norm': 5.5638556480407715, 'learning_rate': 1.1844660194174758e-06, 'epoch': 56.45}
{'loss': 5.7007, 'grad_norm': 5.150637149810791, 'learning_rate': 1.0194174757281554e-06, 'epoch': 56.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.342939376831055, 'eval_runtime': 86.6012, 'eval_samples_per_second': 7.055, 'eval_steps_per_second': 1.767, 'epoch': 57.0}
{'loss': 5.7692, 'grad_norm': 4.163078784942627, 'learning_rate': 8.54368932038835e-07, 'epoch': 57.44}
{'loss': 5.7566, 'grad_norm': 5.424041748046875, 'learning_rate': 6.893203883495147e-07, 'epoch': 57.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.341921329498291, 'eval_runtime': 86.8314, 'eval_samples_per_second': 7.037, 'eval_steps_per_second': 1.762, 'epoch': 58.0}
{'loss': 5.746, 'grad_norm': 5.200376033782959, 'learning_rate': 5.242718446601942e-07, 'epoch': 58.43}
{'loss': 5.6589, 'grad_norm': 4.692885875701904, 'learning_rate': 3.592233009708738e-07, 'epoch': 58.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.341062545776367, 'eval_runtime': 80.9524, 'eval_samples_per_second': 7.548, 'eval_steps_per_second': 1.89, 'epoch': 59.0}
{'loss': 5.7121, 'grad_norm': 13.03831958770752, 'learning_rate': 1.941747572815534e-07, 'epoch': 59.42}
{'loss': 5.7036, 'grad_norm': 4.558351993560791, 'learning_rate': 2.9126213592233013e-08, 'epoch': 59.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.34064245223999, 'eval_runtime': 76.9684, 'eval_samples_per_second': 7.938, 'eval_steps_per_second': 1.988, 'epoch': 60.0}
{'train_runtime': 29931.4294, 'train_samples_per_second': 0.826, 'train_steps_per_second': 0.206, 'train_loss': 8.220918254790568, 'epoch': 60.0}


TrainOutput(global_step=6180, training_loss=8.220918254790568, metrics={'train_runtime': 29931.4294, 'train_samples_per_second': 0.826, 'train_steps_per_second': 0.206, 'total_flos': 140668072058880.0, 'train_loss': 8.220918254790568, 'epoch': 60.0})

In [28]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MT5ForConditionalGeneration

# Define constants and model
BATCH_SIZE = 4

# Assuming tokenized_dataset and tokenizer are already defined
# Initialize the model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

logging_steps = len(tokenized_datasets["train"]) // batch_size
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=100,
    predict_with_generate=True,
    fp16= True,
    logging_steps=logging_steps,
    report_to="none"

)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x000001CDFB152700>
Traceback (most recent call last):
  File "C:\Users\bless\AppData\Roaming\Python\Python312\site-packages\torch\utils\data\dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "C:\Users\bless\AppData\Roaming\Python\Python312\site-packages\torch\utils\data\dataloader.py", line 1437, in _shutdown_workers
    if self._persistent_workers or self._workers_status[worker_id]:
                                   ^^^^^^^^^^^^^^^^^^^^
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status'


  0%|          | 0/10300 [00:00<?, ?it/s]

{'loss': 27.4445, 'grad_norm': 5916.13916015625, 'learning_rate': 1.98e-05, 'epoch': 1.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 19.90825653076172, 'eval_runtime': 60.5357, 'eval_samples_per_second': 10.093, 'eval_steps_per_second': 2.527, 'epoch': 1.0}
{'loss': 23.9631, 'grad_norm': 4789.9345703125, 'learning_rate': 1.9600000000000002e-05, 'epoch': 2.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 15.540319442749023, 'eval_runtime': 65.2096, 'eval_samples_per_second': 9.37, 'eval_steps_per_second': 2.346, 'epoch': 2.0}
{'loss': 19.9966, 'grad_norm': 1940.254638671875, 'learning_rate': 1.94e-05, 'epoch': 3.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 13.370594024658203, 'eval_runtime': 71.3421, 'eval_samples_per_second': 8.564, 'eval_steps_per_second': 2.145, 'epoch': 3.0}
{'loss': 18.1634, 'grad_norm': 2616.62109375, 'learning_rate': 1.9200000000000003e-05, 'epoch': 4.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 12.393510818481445, 'eval_runtime': 75.9013, 'eval_samples_per_second': 8.05, 'eval_steps_per_second': 2.016, 'epoch': 4.0}
{'loss': 16.388, 'grad_norm': 4102.17724609375, 'learning_rate': 1.9e-05, 'epoch': 5.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 11.213177680969238, 'eval_runtime': 77.9623, 'eval_samples_per_second': 7.837, 'eval_steps_per_second': 1.962, 'epoch': 5.0}
{'loss': 14.5382, 'grad_norm': 685.4930419921875, 'learning_rate': 1.88e-05, 'epoch': 6.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 10.065652847290039, 'eval_runtime': 75.9784, 'eval_samples_per_second': 8.042, 'eval_steps_per_second': 2.014, 'epoch': 6.0}
{'loss': 13.1231, 'grad_norm': 275.94378662109375, 'learning_rate': 1.86e-05, 'epoch': 7.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 9.51534366607666, 'eval_runtime': 75.7418, 'eval_samples_per_second': 8.067, 'eval_steps_per_second': 2.02, 'epoch': 7.0}
{'loss': 12.2797, 'grad_norm': 419.4716491699219, 'learning_rate': 1.8400000000000003e-05, 'epoch': 8.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 9.017219543457031, 'eval_runtime': 77.501, 'eval_samples_per_second': 7.884, 'eval_steps_per_second': 1.974, 'epoch': 8.0}
{'loss': 11.4069, 'grad_norm': 190.93020629882812, 'learning_rate': 1.8200000000000002e-05, 'epoch': 9.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.679966926574707, 'eval_runtime': 75.901, 'eval_samples_per_second': 8.05, 'eval_steps_per_second': 2.016, 'epoch': 9.0}
{'loss': 10.6656, 'grad_norm': 520.9244384765625, 'learning_rate': 1.8e-05, 'epoch': 10.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.307591438293457, 'eval_runtime': 76.2573, 'eval_samples_per_second': 8.012, 'eval_steps_per_second': 2.006, 'epoch': 10.0}
{'loss': 10.0563, 'grad_norm': 77.23258972167969, 'learning_rate': 1.7800000000000002e-05, 'epoch': 11.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.033371925354004, 'eval_runtime': 72.1551, 'eval_samples_per_second': 8.468, 'eval_steps_per_second': 2.12, 'epoch': 11.0}
{'loss': 9.2948, 'grad_norm': 21.231924057006836, 'learning_rate': 1.76e-05, 'epoch': 12.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.711071491241455, 'eval_runtime': 54.6972, 'eval_samples_per_second': 11.171, 'eval_steps_per_second': 2.797, 'epoch': 12.0}
{'loss': 8.8293, 'grad_norm': 12.929583549499512, 'learning_rate': 1.7400000000000003e-05, 'epoch': 13.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.478630542755127, 'eval_runtime': 54.545, 'eval_samples_per_second': 11.202, 'eval_steps_per_second': 2.805, 'epoch': 13.0}
{'loss': 8.4825, 'grad_norm': 31.99912452697754, 'learning_rate': 1.72e-05, 'epoch': 14.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.242588996887207, 'eval_runtime': 83.1722, 'eval_samples_per_second': 7.346, 'eval_steps_per_second': 1.84, 'epoch': 14.0}
{'loss': 8.1424, 'grad_norm': 9.837713241577148, 'learning_rate': 1.7e-05, 'epoch': 15.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 7.03956413269043, 'eval_runtime': 190.0915, 'eval_samples_per_second': 3.214, 'eval_steps_per_second': 0.805, 'epoch': 15.0}
{'loss': 7.9657, 'grad_norm': 33.838706970214844, 'learning_rate': 1.6800000000000002e-05, 'epoch': 16.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.837947845458984, 'eval_runtime': 186.6742, 'eval_samples_per_second': 3.273, 'eval_steps_per_second': 0.82, 'epoch': 16.0}
{'loss': 7.6292, 'grad_norm': 10.949819564819336, 'learning_rate': 1.66e-05, 'epoch': 17.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.624322891235352, 'eval_runtime': 69.9724, 'eval_samples_per_second': 8.732, 'eval_steps_per_second': 2.187, 'epoch': 17.0}
{'loss': 7.5401, 'grad_norm': 5.7476983070373535, 'learning_rate': 1.64e-05, 'epoch': 18.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.485657215118408, 'eval_runtime': 71.75, 'eval_samples_per_second': 8.516, 'eval_steps_per_second': 2.132, 'epoch': 18.0}
{'loss': 7.3482, 'grad_norm': 19.888580322265625, 'learning_rate': 1.62e-05, 'epoch': 19.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.337471008300781, 'eval_runtime': 73.5734, 'eval_samples_per_second': 8.305, 'eval_steps_per_second': 2.08, 'epoch': 19.0}
{'loss': 7.1426, 'grad_norm': 11.78820514678955, 'learning_rate': 1.6000000000000003e-05, 'epoch': 20.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.210635662078857, 'eval_runtime': 72.8987, 'eval_samples_per_second': 8.381, 'eval_steps_per_second': 2.099, 'epoch': 20.0}
{'loss': 6.9967, 'grad_norm': 28.22996711730957, 'learning_rate': 1.58e-05, 'epoch': 21.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.105959892272949, 'eval_runtime': 68.3272, 'eval_samples_per_second': 8.942, 'eval_steps_per_second': 2.239, 'epoch': 21.0}
{'loss': 6.8732, 'grad_norm': 6.215574741363525, 'learning_rate': 1.5600000000000003e-05, 'epoch': 22.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.014019966125488, 'eval_runtime': 68.2901, 'eval_samples_per_second': 8.947, 'eval_steps_per_second': 2.24, 'epoch': 22.0}
{'loss': 6.7231, 'grad_norm': 5.466944217681885, 'learning_rate': 1.54e-05, 'epoch': 23.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.9203386306762695, 'eval_runtime': 69.4493, 'eval_samples_per_second': 8.798, 'eval_steps_per_second': 2.203, 'epoch': 23.0}
{'loss': 6.5993, 'grad_norm': 6.993802547454834, 'learning_rate': 1.5200000000000002e-05, 'epoch': 24.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.835198879241943, 'eval_runtime': 66.3338, 'eval_samples_per_second': 9.211, 'eval_steps_per_second': 2.307, 'epoch': 24.0}
{'loss': 6.4873, 'grad_norm': 4.5483012199401855, 'learning_rate': 1.5000000000000002e-05, 'epoch': 25.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.767497539520264, 'eval_runtime': 67.1861, 'eval_samples_per_second': 9.094, 'eval_steps_per_second': 2.277, 'epoch': 25.0}
{'loss': 6.4512, 'grad_norm': 4.531797409057617, 'learning_rate': 1.48e-05, 'epoch': 26.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.719638824462891, 'eval_runtime': 67.4407, 'eval_samples_per_second': 9.06, 'eval_steps_per_second': 2.269, 'epoch': 26.0}
{'loss': 6.3396, 'grad_norm': 4.989264965057373, 'learning_rate': 1.46e-05, 'epoch': 27.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.66194486618042, 'eval_runtime': 68.129, 'eval_samples_per_second': 8.968, 'eval_steps_per_second': 2.246, 'epoch': 27.0}
{'loss': 6.2996, 'grad_norm': 3.6313230991363525, 'learning_rate': 1.4400000000000001e-05, 'epoch': 28.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.621875762939453, 'eval_runtime': 68.0079, 'eval_samples_per_second': 8.984, 'eval_steps_per_second': 2.25, 'epoch': 28.0}
{'loss': 6.1513, 'grad_norm': 5.571856498718262, 'learning_rate': 1.4200000000000001e-05, 'epoch': 29.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.566507339477539, 'eval_runtime': 69.2332, 'eval_samples_per_second': 8.825, 'eval_steps_per_second': 2.21, 'epoch': 29.0}
{'loss': 6.1379, 'grad_norm': 8.051456451416016, 'learning_rate': 1.4e-05, 'epoch': 30.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.53464937210083, 'eval_runtime': 67.7299, 'eval_samples_per_second': 9.021, 'eval_steps_per_second': 2.259, 'epoch': 30.0}
{'loss': 6.0704, 'grad_norm': 6.928951263427734, 'learning_rate': 1.38e-05, 'epoch': 31.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.5047807693481445, 'eval_runtime': 67.1953, 'eval_samples_per_second': 9.093, 'eval_steps_per_second': 2.277, 'epoch': 31.0}
{'loss': 6.0361, 'grad_norm': 4.307233810424805, 'learning_rate': 1.3600000000000002e-05, 'epoch': 32.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.467705249786377, 'eval_runtime': 61.3945, 'eval_samples_per_second': 9.952, 'eval_steps_per_second': 2.492, 'epoch': 32.0}
{'loss': 5.9648, 'grad_norm': 4.690403938293457, 'learning_rate': 1.3400000000000002e-05, 'epoch': 33.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.430727958679199, 'eval_runtime': 64.0068, 'eval_samples_per_second': 9.546, 'eval_steps_per_second': 2.39, 'epoch': 33.0}
{'loss': 5.9398, 'grad_norm': 4.9901018142700195, 'learning_rate': 1.3200000000000002e-05, 'epoch': 34.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.408313274383545, 'eval_runtime': 66.9863, 'eval_samples_per_second': 9.121, 'eval_steps_per_second': 2.284, 'epoch': 34.0}
{'loss': 5.8598, 'grad_norm': 5.606125354766846, 'learning_rate': 1.3000000000000001e-05, 'epoch': 35.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.384648323059082, 'eval_runtime': 61.8505, 'eval_samples_per_second': 9.879, 'eval_steps_per_second': 2.474, 'epoch': 35.0}
{'loss': 5.8961, 'grad_norm': 6.919256687164307, 'learning_rate': 1.2800000000000001e-05, 'epoch': 36.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.364475727081299, 'eval_runtime': 60.4898, 'eval_samples_per_second': 10.101, 'eval_steps_per_second': 2.529, 'epoch': 36.0}
{'loss': 5.8027, 'grad_norm': 3.456058979034424, 'learning_rate': 1.2600000000000001e-05, 'epoch': 37.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.340020179748535, 'eval_runtime': 60.5825, 'eval_samples_per_second': 10.085, 'eval_steps_per_second': 2.525, 'epoch': 37.0}
{'loss': 5.7248, 'grad_norm': 4.676919460296631, 'learning_rate': 1.2400000000000002e-05, 'epoch': 38.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.322360038757324, 'eval_runtime': 63.4926, 'eval_samples_per_second': 9.623, 'eval_steps_per_second': 2.41, 'epoch': 38.0}
{'loss': 5.7508, 'grad_norm': 4.349728584289551, 'learning_rate': 1.22e-05, 'epoch': 39.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.308380603790283, 'eval_runtime': 66.3788, 'eval_samples_per_second': 9.205, 'eval_steps_per_second': 2.305, 'epoch': 39.0}
{'loss': 5.6473, 'grad_norm': 7.086997032165527, 'learning_rate': 1.2e-05, 'epoch': 40.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.287155628204346, 'eval_runtime': 61.8026, 'eval_samples_per_second': 9.886, 'eval_steps_per_second': 2.476, 'epoch': 40.0}
{'loss': 5.6043, 'grad_norm': 4.75589656829834, 'learning_rate': 1.18e-05, 'epoch': 41.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.268996715545654, 'eval_runtime': 62.9568, 'eval_samples_per_second': 9.705, 'eval_steps_per_second': 2.43, 'epoch': 41.0}
{'loss': 5.5699, 'grad_norm': 5.975458145141602, 'learning_rate': 1.16e-05, 'epoch': 42.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.2493577003479, 'eval_runtime': 63.7944, 'eval_samples_per_second': 9.578, 'eval_steps_per_second': 2.398, 'epoch': 42.0}
{'loss': 5.571, 'grad_norm': 5.096044063568115, 'learning_rate': 1.14e-05, 'epoch': 43.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.236778736114502, 'eval_runtime': 60.9377, 'eval_samples_per_second': 10.027, 'eval_steps_per_second': 2.511, 'epoch': 43.0}
{'loss': 5.4887, 'grad_norm': 6.145748138427734, 'learning_rate': 1.1200000000000001e-05, 'epoch': 44.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.223758697509766, 'eval_runtime': 60.5672, 'eval_samples_per_second': 10.088, 'eval_steps_per_second': 2.526, 'epoch': 44.0}
{'loss': 5.4622, 'grad_norm': 12.192926406860352, 'learning_rate': 1.1000000000000001e-05, 'epoch': 45.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.207860946655273, 'eval_runtime': 62.1863, 'eval_samples_per_second': 9.825, 'eval_steps_per_second': 2.46, 'epoch': 45.0}
{'loss': 5.4546, 'grad_norm': 7.817627429962158, 'learning_rate': 1.0800000000000002e-05, 'epoch': 46.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.197959899902344, 'eval_runtime': 58.6353, 'eval_samples_per_second': 10.42, 'eval_steps_per_second': 2.609, 'epoch': 46.0}
{'loss': 5.4152, 'grad_norm': 4.553167343139648, 'learning_rate': 1.0600000000000002e-05, 'epoch': 47.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.182511806488037, 'eval_runtime': 60.4458, 'eval_samples_per_second': 10.108, 'eval_steps_per_second': 2.531, 'epoch': 47.0}
{'loss': 5.3692, 'grad_norm': 3.839874267578125, 'learning_rate': 1.04e-05, 'epoch': 48.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.171911239624023, 'eval_runtime': 61.5078, 'eval_samples_per_second': 9.934, 'eval_steps_per_second': 2.487, 'epoch': 48.0}
{'loss': 5.4053, 'grad_norm': 3.8685028553009033, 'learning_rate': 1.02e-05, 'epoch': 49.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.162209987640381, 'eval_runtime': 63.3475, 'eval_samples_per_second': 9.645, 'eval_steps_per_second': 2.415, 'epoch': 49.0}
{'loss': 5.3559, 'grad_norm': 3.91186261177063, 'learning_rate': 1e-05, 'epoch': 50.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.150639057159424, 'eval_runtime': 60.5395, 'eval_samples_per_second': 10.093, 'eval_steps_per_second': 2.527, 'epoch': 50.0}
{'loss': 5.3118, 'grad_norm': 4.796220302581787, 'learning_rate': 9.800000000000001e-06, 'epoch': 51.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.135669231414795, 'eval_runtime': 59.5177, 'eval_samples_per_second': 10.266, 'eval_steps_per_second': 2.571, 'epoch': 51.0}
{'loss': 5.3116, 'grad_norm': 6.8209686279296875, 'learning_rate': 9.600000000000001e-06, 'epoch': 52.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.130849838256836, 'eval_runtime': 60.3515, 'eval_samples_per_second': 10.124, 'eval_steps_per_second': 2.535, 'epoch': 52.0}
{'loss': 5.2836, 'grad_norm': 5.304643630981445, 'learning_rate': 9.4e-06, 'epoch': 53.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.121163368225098, 'eval_runtime': 62.4114, 'eval_samples_per_second': 9.79, 'eval_steps_per_second': 2.451, 'epoch': 53.0}
{'loss': 5.276, 'grad_norm': 4.238455295562744, 'learning_rate': 9.200000000000002e-06, 'epoch': 54.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.116784572601318, 'eval_runtime': 59.3274, 'eval_samples_per_second': 10.299, 'eval_steps_per_second': 2.579, 'epoch': 54.0}
{'loss': 5.2156, 'grad_norm': 3.62345290184021, 'learning_rate': 9e-06, 'epoch': 55.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.107390880584717, 'eval_runtime': 59.2021, 'eval_samples_per_second': 10.321, 'eval_steps_per_second': 2.584, 'epoch': 55.0}
{'loss': 5.2314, 'grad_norm': 3.9143471717834473, 'learning_rate': 8.8e-06, 'epoch': 56.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.098495006561279, 'eval_runtime': 58.4771, 'eval_samples_per_second': 10.449, 'eval_steps_per_second': 2.616, 'epoch': 56.0}
{'loss': 5.2363, 'grad_norm': 4.600462436676025, 'learning_rate': 8.6e-06, 'epoch': 57.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.090442180633545, 'eval_runtime': 63.4886, 'eval_samples_per_second': 9.624, 'eval_steps_per_second': 2.41, 'epoch': 57.0}
{'loss': 5.2168, 'grad_norm': 5.321626663208008, 'learning_rate': 8.400000000000001e-06, 'epoch': 58.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.085375785827637, 'eval_runtime': 61.1514, 'eval_samples_per_second': 9.992, 'eval_steps_per_second': 2.502, 'epoch': 58.0}
{'loss': 5.1694, 'grad_norm': 4.0965576171875, 'learning_rate': 8.2e-06, 'epoch': 59.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.079314708709717, 'eval_runtime': 60.3882, 'eval_samples_per_second': 10.118, 'eval_steps_per_second': 2.534, 'epoch': 59.0}
{'loss': 5.1513, 'grad_norm': 6.399101734161377, 'learning_rate': 8.000000000000001e-06, 'epoch': 60.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.070826053619385, 'eval_runtime': 59.5096, 'eval_samples_per_second': 10.267, 'eval_steps_per_second': 2.571, 'epoch': 60.0}
{'loss': 5.1861, 'grad_norm': 6.367498397827148, 'learning_rate': 7.800000000000002e-06, 'epoch': 61.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.067293167114258, 'eval_runtime': 59.1776, 'eval_samples_per_second': 10.325, 'eval_steps_per_second': 2.585, 'epoch': 61.0}
{'loss': 5.1451, 'grad_norm': 4.850258827209473, 'learning_rate': 7.600000000000001e-06, 'epoch': 62.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.062058448791504, 'eval_runtime': 58.2864, 'eval_samples_per_second': 10.483, 'eval_steps_per_second': 2.625, 'epoch': 62.0}
{'loss': 5.1036, 'grad_norm': 4.488667964935303, 'learning_rate': 7.4e-06, 'epoch': 63.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.055200576782227, 'eval_runtime': 59.9254, 'eval_samples_per_second': 10.196, 'eval_steps_per_second': 2.553, 'epoch': 63.0}
{'loss': 5.0754, 'grad_norm': 3.670449733734131, 'learning_rate': 7.2000000000000005e-06, 'epoch': 64.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.048724174499512, 'eval_runtime': 58.4011, 'eval_samples_per_second': 10.462, 'eval_steps_per_second': 2.62, 'epoch': 64.0}
{'loss': 5.101, 'grad_norm': 7.592871189117432, 'learning_rate': 7e-06, 'epoch': 65.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.045718193054199, 'eval_runtime': 59.3632, 'eval_samples_per_second': 10.293, 'eval_steps_per_second': 2.577, 'epoch': 65.0}
{'loss': 5.086, 'grad_norm': 5.315604209899902, 'learning_rate': 6.800000000000001e-06, 'epoch': 66.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.042441368103027, 'eval_runtime': 60.8063, 'eval_samples_per_second': 10.048, 'eval_steps_per_second': 2.516, 'epoch': 66.0}
{'loss': 5.0331, 'grad_norm': 4.584228038787842, 'learning_rate': 6.600000000000001e-06, 'epoch': 67.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.03785514831543, 'eval_runtime': 61.3082, 'eval_samples_per_second': 9.966, 'eval_steps_per_second': 2.496, 'epoch': 67.0}
{'loss': 5.0445, 'grad_norm': 3.879551649093628, 'learning_rate': 6.4000000000000006e-06, 'epoch': 68.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.034576416015625, 'eval_runtime': 64.9666, 'eval_samples_per_second': 9.405, 'eval_steps_per_second': 2.355, 'epoch': 68.0}
{'loss': 5.0598, 'grad_norm': 4.366758823394775, 'learning_rate': 6.200000000000001e-06, 'epoch': 69.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.032015800476074, 'eval_runtime': 59.7881, 'eval_samples_per_second': 10.219, 'eval_steps_per_second': 2.559, 'epoch': 69.0}
{'loss': 5.0267, 'grad_norm': 3.77443265914917, 'learning_rate': 6e-06, 'epoch': 70.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.0274457931518555, 'eval_runtime': 59.807, 'eval_samples_per_second': 10.216, 'eval_steps_per_second': 2.558, 'epoch': 70.0}
{'loss': 5.0208, 'grad_norm': 4.2311296463012695, 'learning_rate': 5.8e-06, 'epoch': 71.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.025090217590332, 'eval_runtime': 61.1037, 'eval_samples_per_second': 9.999, 'eval_steps_per_second': 2.504, 'epoch': 71.0}
{'loss': 5.0357, 'grad_norm': 5.541591167449951, 'learning_rate': 5.600000000000001e-06, 'epoch': 72.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.022941589355469, 'eval_runtime': 57.4573, 'eval_samples_per_second': 10.634, 'eval_steps_per_second': 2.663, 'epoch': 72.0}
{'loss': 5.0281, 'grad_norm': 4.43179178237915, 'learning_rate': 5.400000000000001e-06, 'epoch': 73.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.019898891448975, 'eval_runtime': 60.3155, 'eval_samples_per_second': 10.13, 'eval_steps_per_second': 2.537, 'epoch': 73.0}
{'loss': 5.0041, 'grad_norm': 5.455342769622803, 'learning_rate': 5.2e-06, 'epoch': 74.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.017132759094238, 'eval_runtime': 61.8989, 'eval_samples_per_second': 9.871, 'eval_steps_per_second': 2.472, 'epoch': 74.0}
{'loss': 4.9773, 'grad_norm': 5.162992000579834, 'learning_rate': 5e-06, 'epoch': 75.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.013693809509277, 'eval_runtime': 60.34, 'eval_samples_per_second': 10.126, 'eval_steps_per_second': 2.536, 'epoch': 75.0}
{'loss': 4.9881, 'grad_norm': 5.163087844848633, 'learning_rate': 4.800000000000001e-06, 'epoch': 76.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.012209892272949, 'eval_runtime': 61.1459, 'eval_samples_per_second': 9.992, 'eval_steps_per_second': 2.502, 'epoch': 76.0}
{'loss': 4.9554, 'grad_norm': 3.888664960861206, 'learning_rate': 4.600000000000001e-06, 'epoch': 77.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.010674476623535, 'eval_runtime': 63.4069, 'eval_samples_per_second': 9.636, 'eval_steps_per_second': 2.413, 'epoch': 77.0}
{'loss': 4.9633, 'grad_norm': 4.602688312530518, 'learning_rate': 4.4e-06, 'epoch': 78.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.00776481628418, 'eval_runtime': 60.5338, 'eval_samples_per_second': 10.094, 'eval_steps_per_second': 2.528, 'epoch': 78.0}
{'loss': 4.965, 'grad_norm': 4.897233486175537, 'learning_rate': 4.2000000000000004e-06, 'epoch': 79.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.006227016448975, 'eval_runtime': 60.3422, 'eval_samples_per_second': 10.126, 'eval_steps_per_second': 2.536, 'epoch': 79.0}
{'loss': 4.9689, 'grad_norm': 3.368206739425659, 'learning_rate': 4.000000000000001e-06, 'epoch': 80.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.004040241241455, 'eval_runtime': 58.1313, 'eval_samples_per_second': 10.511, 'eval_steps_per_second': 2.632, 'epoch': 80.0}
{'loss': 4.9267, 'grad_norm': 4.26708459854126, 'learning_rate': 3.8000000000000005e-06, 'epoch': 81.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.00234317779541, 'eval_runtime': 60.3895, 'eval_samples_per_second': 10.118, 'eval_steps_per_second': 2.534, 'epoch': 81.0}
{'loss': 4.971, 'grad_norm': 4.145437240600586, 'learning_rate': 3.6000000000000003e-06, 'epoch': 82.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.000918388366699, 'eval_runtime': 64.4249, 'eval_samples_per_second': 9.484, 'eval_steps_per_second': 2.375, 'epoch': 82.0}
{'loss': 4.9451, 'grad_norm': 3.712034225463867, 'learning_rate': 3.4000000000000005e-06, 'epoch': 83.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.998806476593018, 'eval_runtime': 63.5914, 'eval_samples_per_second': 9.608, 'eval_steps_per_second': 2.406, 'epoch': 83.0}
{'loss': 4.9516, 'grad_norm': 5.300727844238281, 'learning_rate': 3.2000000000000003e-06, 'epoch': 84.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.997469902038574, 'eval_runtime': 65.2383, 'eval_samples_per_second': 9.366, 'eval_steps_per_second': 2.345, 'epoch': 84.0}
{'loss': 4.9238, 'grad_norm': 4.333500862121582, 'learning_rate': 3e-06, 'epoch': 85.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.996390342712402, 'eval_runtime': 60.8739, 'eval_samples_per_second': 10.037, 'eval_steps_per_second': 2.513, 'epoch': 85.0}
{'loss': 4.9173, 'grad_norm': 4.054081439971924, 'learning_rate': 2.8000000000000003e-06, 'epoch': 86.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.995001792907715, 'eval_runtime': 64.0685, 'eval_samples_per_second': 9.537, 'eval_steps_per_second': 2.388, 'epoch': 86.0}
{'loss': 4.9372, 'grad_norm': 5.675014019012451, 'learning_rate': 2.6e-06, 'epoch': 87.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.9942216873168945, 'eval_runtime': 60.7302, 'eval_samples_per_second': 10.061, 'eval_steps_per_second': 2.519, 'epoch': 87.0}
{'loss': 4.9107, 'grad_norm': 4.9283623695373535, 'learning_rate': 2.4000000000000003e-06, 'epoch': 88.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.992649078369141, 'eval_runtime': 60.6384, 'eval_samples_per_second': 10.076, 'eval_steps_per_second': 2.523, 'epoch': 88.0}
{'loss': 4.9104, 'grad_norm': 4.692866802215576, 'learning_rate': 2.2e-06, 'epoch': 89.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.991854667663574, 'eval_runtime': 61.1421, 'eval_samples_per_second': 9.993, 'eval_steps_per_second': 2.502, 'epoch': 89.0}
{'loss': 4.9127, 'grad_norm': 4.676869869232178, 'learning_rate': 2.0000000000000003e-06, 'epoch': 90.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.990725517272949, 'eval_runtime': 61.4855, 'eval_samples_per_second': 9.937, 'eval_steps_per_second': 2.488, 'epoch': 90.0}
{'loss': 4.9305, 'grad_norm': 4.844012260437012, 'learning_rate': 1.8000000000000001e-06, 'epoch': 91.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.989464282989502, 'eval_runtime': 62.4067, 'eval_samples_per_second': 9.791, 'eval_steps_per_second': 2.452, 'epoch': 91.0}
{'loss': 4.9658, 'grad_norm': 5.924988746643066, 'learning_rate': 1.6000000000000001e-06, 'epoch': 92.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.988868236541748, 'eval_runtime': 62.6752, 'eval_samples_per_second': 9.749, 'eval_steps_per_second': 2.441, 'epoch': 92.0}
{'loss': 4.9163, 'grad_norm': 4.113839626312256, 'learning_rate': 1.4000000000000001e-06, 'epoch': 93.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.98797607421875, 'eval_runtime': 63.6767, 'eval_samples_per_second': 9.595, 'eval_steps_per_second': 2.403, 'epoch': 93.0}
{'loss': 4.9169, 'grad_norm': 6.080443859100342, 'learning_rate': 1.2000000000000002e-06, 'epoch': 94.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.987217903137207, 'eval_runtime': 61.352, 'eval_samples_per_second': 9.959, 'eval_steps_per_second': 2.494, 'epoch': 94.0}
{'loss': 4.9083, 'grad_norm': 4.029347896575928, 'learning_rate': 1.0000000000000002e-06, 'epoch': 95.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.9866838455200195, 'eval_runtime': 65.4129, 'eval_samples_per_second': 9.341, 'eval_steps_per_second': 2.339, 'epoch': 95.0}
{'loss': 4.9208, 'grad_norm': 4.873000144958496, 'learning_rate': 8.000000000000001e-07, 'epoch': 96.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.986694812774658, 'eval_runtime': 63.9535, 'eval_samples_per_second': 9.554, 'eval_steps_per_second': 2.392, 'epoch': 96.0}
{'loss': 4.8592, 'grad_norm': 4.144442558288574, 'learning_rate': 6.000000000000001e-07, 'epoch': 97.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.986292362213135, 'eval_runtime': 66.1745, 'eval_samples_per_second': 9.233, 'eval_steps_per_second': 2.312, 'epoch': 97.0}
{'loss': 4.8748, 'grad_norm': 5.562098026275635, 'learning_rate': 4.0000000000000003e-07, 'epoch': 98.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.986132621765137, 'eval_runtime': 62.7478, 'eval_samples_per_second': 9.737, 'eval_steps_per_second': 2.438, 'epoch': 98.0}
{'loss': 4.8815, 'grad_norm': 6.394643783569336, 'learning_rate': 2.0000000000000002e-07, 'epoch': 99.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.98594331741333, 'eval_runtime': 62.7281, 'eval_samples_per_second': 9.74, 'eval_steps_per_second': 2.439, 'epoch': 99.0}
{'loss': 4.8984, 'grad_norm': 5.658607482910156, 'learning_rate': 0.0, 'epoch': 100.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.9858717918396, 'eval_runtime': 69.7377, 'eval_samples_per_second': 8.761, 'eval_steps_per_second': 2.194, 'epoch': 100.0}
{'train_runtime': 42848.611, 'train_samples_per_second': 0.962, 'train_steps_per_second': 0.24, 'train_loss': 6.803988822270366, 'epoch': 100.0}


TrainOutput(global_step=10300, training_loss=6.803988822270366, metrics={'train_runtime': 42848.611, 'train_samples_per_second': 0.962, 'train_steps_per_second': 0.24, 'total_flos': 234388954767360.0, 'train_loss': 6.803988822270366, 'epoch': 100.0})

In [25]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MT5ForConditionalGeneration

# Define constants and model
BATCH_SIZE = 4

# Assuming tokenized_dataset and tokenizer are already defined
# Initialize the model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

logging_steps = len(tokenized_datasets["train"]) // batch_size
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=150,
    predict_with_generate=True,
    fp16= True,
    logging_steps=logging_steps,
    report_to="none"

)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

  0%|          | 0/154800 [00:00<?, ?it/s]

{'loss': 17.0516, 'grad_norm': 330.7328186035156, 'learning_rate': 1.9866795865633076e-05, 'epoch': 1.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 8.199318885803223, 'eval_runtime': 61.782, 'eval_samples_per_second': 9.89, 'eval_steps_per_second': 2.476, 'epoch': 1.0}
{'loss': 8.3564, 'grad_norm': 8.160903930664062, 'learning_rate': 1.973359173126615e-05, 'epoch': 2.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 6.100841999053955, 'eval_runtime': 79.5579, 'eval_samples_per_second': 7.68, 'eval_steps_per_second': 1.923, 'epoch': 2.0}
{'loss': 6.5714, 'grad_norm': 4.302398204803467, 'learning_rate': 1.9600387596899226e-05, 'epoch': 3.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 5.159071922302246, 'eval_runtime': 77.1438, 'eval_samples_per_second': 7.92, 'eval_steps_per_second': 1.983, 'epoch': 3.0}
{'loss': 5.8846, 'grad_norm': 3.471472978591919, 'learning_rate': 1.94671834625323e-05, 'epoch': 4.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.812417030334473, 'eval_runtime': 69.5032, 'eval_samples_per_second': 8.791, 'eval_steps_per_second': 2.201, 'epoch': 4.0}
{'loss': 5.5111, 'grad_norm': 5.0173492431640625, 'learning_rate': 1.9333979328165376e-05, 'epoch': 5.0}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.602696895599365, 'eval_runtime': 68.8137, 'eval_samples_per_second': 8.879, 'eval_steps_per_second': 2.223, 'epoch': 5.0}
{'loss': 5.2787, 'grad_norm': 6.156761646270752, 'learning_rate': 1.920077519379845e-05, 'epoch': 5.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.458512306213379, 'eval_runtime': 68.9973, 'eval_samples_per_second': 8.855, 'eval_steps_per_second': 2.217, 'epoch': 6.0}
{'loss': 5.1035, 'grad_norm': 4.731359004974365, 'learning_rate': 1.9067571059431526e-05, 'epoch': 6.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.340379238128662, 'eval_runtime': 65.2256, 'eval_samples_per_second': 9.367, 'eval_steps_per_second': 2.346, 'epoch': 7.0}
{'loss': 4.9426, 'grad_norm': 4.004457950592041, 'learning_rate': 1.89343669250646e-05, 'epoch': 7.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.248589515686035, 'eval_runtime': 69.3026, 'eval_samples_per_second': 8.816, 'eval_steps_per_second': 2.208, 'epoch': 8.0}
{'loss': 4.8423, 'grad_norm': 4.495175838470459, 'learning_rate': 1.8801162790697675e-05, 'epoch': 8.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.174522876739502, 'eval_runtime': 68.9438, 'eval_samples_per_second': 8.862, 'eval_steps_per_second': 2.219, 'epoch': 9.0}
{'loss': 4.7355, 'grad_norm': 3.9680140018463135, 'learning_rate': 1.866795865633075e-05, 'epoch': 9.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.1079020500183105, 'eval_runtime': 72.7215, 'eval_samples_per_second': 8.402, 'eval_steps_per_second': 2.104, 'epoch': 10.0}
{'loss': 4.629, 'grad_norm': 4.0854597091674805, 'learning_rate': 1.8534754521963825e-05, 'epoch': 10.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.050814151763916, 'eval_runtime': 71.7306, 'eval_samples_per_second': 8.518, 'eval_steps_per_second': 2.133, 'epoch': 11.0}
{'loss': 4.557, 'grad_norm': 4.075471878051758, 'learning_rate': 1.84015503875969e-05, 'epoch': 11.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 4.007205963134766, 'eval_runtime': 72.609, 'eval_samples_per_second': 8.415, 'eval_steps_per_second': 2.107, 'epoch': 12.0}
{'loss': 4.4759, 'grad_norm': 3.7988393306732178, 'learning_rate': 1.8268346253229975e-05, 'epoch': 12.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.958195686340332, 'eval_runtime': 78.3259, 'eval_samples_per_second': 7.801, 'eval_steps_per_second': 1.953, 'epoch': 13.0}
{'loss': 4.4132, 'grad_norm': 3.580249309539795, 'learning_rate': 1.8135142118863053e-05, 'epoch': 13.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.919400453567505, 'eval_runtime': 82.2498, 'eval_samples_per_second': 7.429, 'eval_steps_per_second': 1.86, 'epoch': 14.0}
{'loss': 4.3445, 'grad_norm': 3.9425241947174072, 'learning_rate': 1.8001937984496124e-05, 'epoch': 14.99}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.884521484375, 'eval_runtime': 85.5008, 'eval_samples_per_second': 7.146, 'eval_steps_per_second': 1.789, 'epoch': 15.0}
{'loss': 4.29, 'grad_norm': 4.795021057128906, 'learning_rate': 1.78687338501292e-05, 'epoch': 15.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.8461754322052, 'eval_runtime': 89.3875, 'eval_samples_per_second': 6.835, 'eval_steps_per_second': 1.712, 'epoch': 16.0}
{'loss': 4.2515, 'grad_norm': 3.180375814437866, 'learning_rate': 1.7735529715762277e-05, 'epoch': 16.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.8164656162261963, 'eval_runtime': 89.8337, 'eval_samples_per_second': 6.801, 'eval_steps_per_second': 1.703, 'epoch': 17.0}
{'loss': 4.1804, 'grad_norm': 4.238195419311523, 'learning_rate': 1.760232558139535e-05, 'epoch': 17.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.788952350616455, 'eval_runtime': 263.8395, 'eval_samples_per_second': 2.316, 'eval_steps_per_second': 0.58, 'epoch': 18.0}
{'loss': 4.1385, 'grad_norm': 4.006582736968994, 'learning_rate': 1.7469121447028427e-05, 'epoch': 18.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.7630791664123535, 'eval_runtime': 130.3146, 'eval_samples_per_second': 4.689, 'eval_steps_per_second': 1.174, 'epoch': 19.0}
{'loss': 4.0942, 'grad_norm': 3.901974678039551, 'learning_rate': 1.7335917312661502e-05, 'epoch': 19.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.736933708190918, 'eval_runtime': 78.7861, 'eval_samples_per_second': 7.755, 'eval_steps_per_second': 1.942, 'epoch': 20.0}
{'loss': 4.0616, 'grad_norm': 4.133163928985596, 'learning_rate': 1.7202713178294573e-05, 'epoch': 20.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.7134690284729004, 'eval_runtime': 73.022, 'eval_samples_per_second': 8.367, 'eval_steps_per_second': 2.095, 'epoch': 21.0}
{'loss': 4.0219, 'grad_norm': 4.239965438842773, 'learning_rate': 1.706950904392765e-05, 'epoch': 21.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.6903645992279053, 'eval_runtime': 72.5432, 'eval_samples_per_second': 8.423, 'eval_steps_per_second': 2.109, 'epoch': 22.0}
{'loss': 3.9768, 'grad_norm': 3.3929123878479004, 'learning_rate': 1.6936304909560726e-05, 'epoch': 22.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.672515630722046, 'eval_runtime': 71.4376, 'eval_samples_per_second': 8.553, 'eval_steps_per_second': 2.142, 'epoch': 23.0}
{'loss': 3.9475, 'grad_norm': 3.3440656661987305, 'learning_rate': 1.6803100775193798e-05, 'epoch': 23.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.65134334564209, 'eval_runtime': 67.9822, 'eval_samples_per_second': 8.988, 'eval_steps_per_second': 2.251, 'epoch': 24.0}
{'loss': 3.9042, 'grad_norm': 4.307568073272705, 'learning_rate': 1.6669896640826876e-05, 'epoch': 24.98}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.631556987762451, 'eval_runtime': 71.3537, 'eval_samples_per_second': 8.563, 'eval_steps_per_second': 2.144, 'epoch': 25.0}
{'loss': 3.8723, 'grad_norm': 5.013698101043701, 'learning_rate': 1.653669250645995e-05, 'epoch': 25.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.612738609313965, 'eval_runtime': 72.1306, 'eval_samples_per_second': 8.471, 'eval_steps_per_second': 2.121, 'epoch': 26.0}
{'loss': 3.8379, 'grad_norm': 4.526628494262695, 'learning_rate': 1.6403488372093022e-05, 'epoch': 26.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.5996856689453125, 'eval_runtime': 70.5411, 'eval_samples_per_second': 8.662, 'eval_steps_per_second': 2.169, 'epoch': 27.0}
{'loss': 3.7971, 'grad_norm': 3.6851933002471924, 'learning_rate': 1.62702842377261e-05, 'epoch': 27.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.5776236057281494, 'eval_runtime': 66.4159, 'eval_samples_per_second': 9.2, 'eval_steps_per_second': 2.304, 'epoch': 28.0}
{'loss': 3.7785, 'grad_norm': 4.4364848136901855, 'learning_rate': 1.6137080103359175e-05, 'epoch': 28.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.5673940181732178, 'eval_runtime': 67.4232, 'eval_samples_per_second': 9.062, 'eval_steps_per_second': 2.269, 'epoch': 29.0}
{'loss': 3.7496, 'grad_norm': 4.044285297393799, 'learning_rate': 1.6003875968992247e-05, 'epoch': 29.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.550037145614624, 'eval_runtime': 64.8933, 'eval_samples_per_second': 9.415, 'eval_steps_per_second': 2.358, 'epoch': 30.0}
{'loss': 3.7146, 'grad_norm': 5.140530586242676, 'learning_rate': 1.5870671834625325e-05, 'epoch': 30.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.535153865814209, 'eval_runtime': 67.9861, 'eval_samples_per_second': 8.987, 'eval_steps_per_second': 2.25, 'epoch': 31.0}
{'loss': 3.6956, 'grad_norm': 5.126178741455078, 'learning_rate': 1.57374677002584e-05, 'epoch': 31.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.5251951217651367, 'eval_runtime': 74.381, 'eval_samples_per_second': 8.214, 'eval_steps_per_second': 2.057, 'epoch': 32.0}
{'loss': 3.6856, 'grad_norm': 4.814952850341797, 'learning_rate': 1.5604263565891475e-05, 'epoch': 32.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.5142881870269775, 'eval_runtime': 74.1333, 'eval_samples_per_second': 8.242, 'eval_steps_per_second': 2.064, 'epoch': 33.0}
{'loss': 3.6518, 'grad_norm': 4.276726245880127, 'learning_rate': 1.547105943152455e-05, 'epoch': 33.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.499295473098755, 'eval_runtime': 78.071, 'eval_samples_per_second': 7.826, 'eval_steps_per_second': 1.96, 'epoch': 34.0}
{'loss': 3.6115, 'grad_norm': 5.786471366882324, 'learning_rate': 1.5337855297157624e-05, 'epoch': 34.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4841396808624268, 'eval_runtime': 78.6344, 'eval_samples_per_second': 7.77, 'eval_steps_per_second': 1.946, 'epoch': 35.0}
{'loss': 3.5975, 'grad_norm': 6.192135810852051, 'learning_rate': 1.5204651162790698e-05, 'epoch': 35.97}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4806132316589355, 'eval_runtime': 77.7169, 'eval_samples_per_second': 7.862, 'eval_steps_per_second': 1.969, 'epoch': 36.0}
{'loss': 3.5809, 'grad_norm': 3.4881465435028076, 'learning_rate': 1.5071447028423774e-05, 'epoch': 36.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4688026905059814, 'eval_runtime': 78.5442, 'eval_samples_per_second': 7.779, 'eval_steps_per_second': 1.948, 'epoch': 37.0}
{'loss': 3.551, 'grad_norm': 4.725707530975342, 'learning_rate': 1.4938242894056849e-05, 'epoch': 37.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4442880153656006, 'eval_runtime': 78.0079, 'eval_samples_per_second': 7.833, 'eval_steps_per_second': 1.961, 'epoch': 38.0}
{'loss': 3.5329, 'grad_norm': 3.9028468132019043, 'learning_rate': 1.4805038759689926e-05, 'epoch': 38.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4422459602355957, 'eval_runtime': 79.319, 'eval_samples_per_second': 7.703, 'eval_steps_per_second': 1.929, 'epoch': 39.0}
{'loss': 3.5121, 'grad_norm': 3.3910093307495117, 'learning_rate': 1.4671834625322999e-05, 'epoch': 39.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4267265796661377, 'eval_runtime': 78.4316, 'eval_samples_per_second': 7.79, 'eval_steps_per_second': 1.951, 'epoch': 40.0}
{'loss': 3.5005, 'grad_norm': 3.7406673431396484, 'learning_rate': 1.4538630490956073e-05, 'epoch': 40.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4247918128967285, 'eval_runtime': 110.3938, 'eval_samples_per_second': 5.535, 'eval_steps_per_second': 1.386, 'epoch': 41.0}
{'loss': 3.4784, 'grad_norm': 4.694282054901123, 'learning_rate': 1.440542635658915e-05, 'epoch': 41.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4156417846679688, 'eval_runtime': 100.471, 'eval_samples_per_second': 6.081, 'eval_steps_per_second': 1.523, 'epoch': 42.0}
{'loss': 3.4528, 'grad_norm': 3.822388172149658, 'learning_rate': 1.4272222222222223e-05, 'epoch': 42.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.4008452892303467, 'eval_runtime': 58.2987, 'eval_samples_per_second': 10.481, 'eval_steps_per_second': 2.624, 'epoch': 43.0}
{'loss': 3.44, 'grad_norm': 4.055434226989746, 'learning_rate': 1.4139018087855298e-05, 'epoch': 43.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.402384042739868, 'eval_runtime': 79.0455, 'eval_samples_per_second': 7.73, 'eval_steps_per_second': 1.936, 'epoch': 44.0}
{'loss': 3.4086, 'grad_norm': 10.702330589294434, 'learning_rate': 1.4005813953488375e-05, 'epoch': 44.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3898653984069824, 'eval_runtime': 81.2245, 'eval_samples_per_second': 7.522, 'eval_steps_per_second': 1.884, 'epoch': 45.0}
{'loss': 3.4013, 'grad_norm': 4.896146774291992, 'learning_rate': 1.3872609819121448e-05, 'epoch': 45.96}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3727872371673584, 'eval_runtime': 68.6482, 'eval_samples_per_second': 8.9, 'eval_steps_per_second': 2.229, 'epoch': 46.0}
{'loss': 3.3932, 'grad_norm': 5.436364650726318, 'learning_rate': 1.3739405684754523e-05, 'epoch': 46.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.371950626373291, 'eval_runtime': 79.8274, 'eval_samples_per_second': 7.654, 'eval_steps_per_second': 1.917, 'epoch': 47.0}
{'loss': 3.3618, 'grad_norm': 4.941645622253418, 'learning_rate': 1.3606201550387599e-05, 'epoch': 47.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3608014583587646, 'eval_runtime': 79.9495, 'eval_samples_per_second': 7.642, 'eval_steps_per_second': 1.914, 'epoch': 48.0}
{'loss': 3.362, 'grad_norm': 6.9591875076293945, 'learning_rate': 1.3472997416020672e-05, 'epoch': 48.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3553905487060547, 'eval_runtime': 70.9129, 'eval_samples_per_second': 8.616, 'eval_steps_per_second': 2.158, 'epoch': 49.0}
{'loss': 3.3382, 'grad_norm': 3.9322915077209473, 'learning_rate': 1.3339793281653747e-05, 'epoch': 49.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3556203842163086, 'eval_runtime': 74.0488, 'eval_samples_per_second': 8.251, 'eval_steps_per_second': 2.066, 'epoch': 50.0}
{'loss': 3.3244, 'grad_norm': 5.350111961364746, 'learning_rate': 1.3206589147286824e-05, 'epoch': 50.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3377931118011475, 'eval_runtime': 72.0304, 'eval_samples_per_second': 8.483, 'eval_steps_per_second': 2.124, 'epoch': 51.0}
{'loss': 3.3113, 'grad_norm': 4.946474075317383, 'learning_rate': 1.3073385012919897e-05, 'epoch': 51.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3359885215759277, 'eval_runtime': 76.7606, 'eval_samples_per_second': 7.96, 'eval_steps_per_second': 1.993, 'epoch': 52.0}
{'loss': 3.3027, 'grad_norm': 7.8226423263549805, 'learning_rate': 1.2940180878552973e-05, 'epoch': 52.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3291432857513428, 'eval_runtime': 79.4993, 'eval_samples_per_second': 7.686, 'eval_steps_per_second': 1.925, 'epoch': 53.0}
{'loss': 3.2754, 'grad_norm': 4.704585075378418, 'learning_rate': 1.2806976744186048e-05, 'epoch': 53.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3265178203582764, 'eval_runtime': 80.0893, 'eval_samples_per_second': 7.629, 'eval_steps_per_second': 1.91, 'epoch': 54.0}
{'loss': 3.2633, 'grad_norm': 5.182844161987305, 'learning_rate': 1.2673772609819121e-05, 'epoch': 54.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3141841888427734, 'eval_runtime': 79.4544, 'eval_samples_per_second': 7.69, 'eval_steps_per_second': 1.926, 'epoch': 55.0}
{'loss': 3.2519, 'grad_norm': 6.183361530303955, 'learning_rate': 1.2540568475452198e-05, 'epoch': 55.95}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3093366622924805, 'eval_runtime': 106.4923, 'eval_samples_per_second': 5.738, 'eval_steps_per_second': 1.437, 'epoch': 56.0}
{'loss': 3.2445, 'grad_norm': 4.2950263023376465, 'learning_rate': 1.2407364341085273e-05, 'epoch': 56.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.302577495574951, 'eval_runtime': 101.4909, 'eval_samples_per_second': 6.02, 'eval_steps_per_second': 1.508, 'epoch': 57.0}
{'loss': 3.2319, 'grad_norm': 5.00634241104126, 'learning_rate': 1.2274160206718346e-05, 'epoch': 57.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.3044304847717285, 'eval_runtime': 80.4301, 'eval_samples_per_second': 7.597, 'eval_steps_per_second': 1.902, 'epoch': 58.0}
{'loss': 3.214, 'grad_norm': 4.670313358306885, 'learning_rate': 1.2140956072351422e-05, 'epoch': 58.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2891933917999268, 'eval_runtime': 80.9248, 'eval_samples_per_second': 7.55, 'eval_steps_per_second': 1.891, 'epoch': 59.0}
{'loss': 3.2067, 'grad_norm': 4.70048713684082, 'learning_rate': 1.2007751937984497e-05, 'epoch': 59.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.290844202041626, 'eval_runtime': 115.5489, 'eval_samples_per_second': 5.288, 'eval_steps_per_second': 1.324, 'epoch': 60.0}
{'loss': 3.1865, 'grad_norm': 4.929094314575195, 'learning_rate': 1.187454780361757e-05, 'epoch': 60.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2852694988250732, 'eval_runtime': 303.4321, 'eval_samples_per_second': 2.014, 'eval_steps_per_second': 0.504, 'epoch': 61.0}
{'loss': 3.1729, 'grad_norm': 5.937806129455566, 'learning_rate': 1.1741343669250647e-05, 'epoch': 61.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2808291912078857, 'eval_runtime': 78.267, 'eval_samples_per_second': 7.807, 'eval_steps_per_second': 1.955, 'epoch': 62.0}
{'loss': 3.1702, 'grad_norm': 6.234220504760742, 'learning_rate': 1.1608139534883722e-05, 'epoch': 62.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2737762928009033, 'eval_runtime': 78.5691, 'eval_samples_per_second': 7.777, 'eval_steps_per_second': 1.947, 'epoch': 63.0}
{'loss': 3.1431, 'grad_norm': 4.490846157073975, 'learning_rate': 1.1474935400516798e-05, 'epoch': 63.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.268387794494629, 'eval_runtime': 78.5544, 'eval_samples_per_second': 7.778, 'eval_steps_per_second': 1.948, 'epoch': 64.0}
{'loss': 3.1463, 'grad_norm': 4.486941337585449, 'learning_rate': 1.1341731266149871e-05, 'epoch': 64.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.270488977432251, 'eval_runtime': 78.903, 'eval_samples_per_second': 7.744, 'eval_steps_per_second': 1.939, 'epoch': 65.0}
{'loss': 3.1227, 'grad_norm': 5.678085803985596, 'learning_rate': 1.1208527131782946e-05, 'epoch': 65.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.26741623878479, 'eval_runtime': 79.2069, 'eval_samples_per_second': 7.714, 'eval_steps_per_second': 1.932, 'epoch': 66.0}
{'loss': 3.1201, 'grad_norm': 5.050079822540283, 'learning_rate': 1.1075322997416023e-05, 'epoch': 66.94}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2563676834106445, 'eval_runtime': 78.9128, 'eval_samples_per_second': 7.743, 'eval_steps_per_second': 1.939, 'epoch': 67.0}
{'loss': 3.1135, 'grad_norm': 6.550920486450195, 'learning_rate': 1.0942118863049096e-05, 'epoch': 67.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.253418207168579, 'eval_runtime': 79.3138, 'eval_samples_per_second': 7.704, 'eval_steps_per_second': 1.929, 'epoch': 68.0}
{'loss': 3.084, 'grad_norm': 5.013942718505859, 'learning_rate': 1.0808914728682172e-05, 'epoch': 68.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.247164726257324, 'eval_runtime': 202.1801, 'eval_samples_per_second': 3.022, 'eval_steps_per_second': 0.757, 'epoch': 69.0}
{'loss': 3.0956, 'grad_norm': 4.683150291442871, 'learning_rate': 1.0675710594315247e-05, 'epoch': 69.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2542762756347656, 'eval_runtime': 534.3818, 'eval_samples_per_second': 1.143, 'eval_steps_per_second': 0.286, 'epoch': 70.0}
{'loss': 3.0757, 'grad_norm': 5.466732501983643, 'learning_rate': 1.054250645994832e-05, 'epoch': 70.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.243194818496704, 'eval_runtime': 174.72, 'eval_samples_per_second': 3.497, 'eval_steps_per_second': 0.876, 'epoch': 71.0}
{'loss': 3.0793, 'grad_norm': 7.8890380859375, 'learning_rate': 1.0409302325581397e-05, 'epoch': 71.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2456939220428467, 'eval_runtime': 286.4546, 'eval_samples_per_second': 2.133, 'eval_steps_per_second': 0.534, 'epoch': 72.0}
{'loss': 3.0558, 'grad_norm': 5.8329548835754395, 'learning_rate': 1.0276098191214472e-05, 'epoch': 72.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2380549907684326, 'eval_runtime': 79.5266, 'eval_samples_per_second': 7.683, 'eval_steps_per_second': 1.924, 'epoch': 73.0}
{'loss': 3.0406, 'grad_norm': 5.732705116271973, 'learning_rate': 1.0142894056847545e-05, 'epoch': 73.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.234647750854492, 'eval_runtime': 80.0737, 'eval_samples_per_second': 7.63, 'eval_steps_per_second': 1.911, 'epoch': 74.0}
{'loss': 3.0368, 'grad_norm': 6.197051525115967, 'learning_rate': 1.0009689922480621e-05, 'epoch': 74.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.240847110748291, 'eval_runtime': 79.2388, 'eval_samples_per_second': 7.711, 'eval_steps_per_second': 1.931, 'epoch': 75.0}
{'loss': 3.0308, 'grad_norm': 4.620599746704102, 'learning_rate': 9.876485788113696e-06, 'epoch': 75.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2365286350250244, 'eval_runtime': 80.9259, 'eval_samples_per_second': 7.55, 'eval_steps_per_second': 1.891, 'epoch': 76.0}
{'loss': 3.0137, 'grad_norm': 4.2274932861328125, 'learning_rate': 9.743281653746771e-06, 'epoch': 76.93}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.229710102081299, 'eval_runtime': 79.6343, 'eval_samples_per_second': 7.673, 'eval_steps_per_second': 1.921, 'epoch': 77.0}
{'loss': 2.9918, 'grad_norm': 5.169785976409912, 'learning_rate': 9.610077519379846e-06, 'epoch': 77.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2238969802856445, 'eval_runtime': 79.42, 'eval_samples_per_second': 7.693, 'eval_steps_per_second': 1.926, 'epoch': 78.0}
{'loss': 3.0195, 'grad_norm': 5.409465789794922, 'learning_rate': 9.47687338501292e-06, 'epoch': 78.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.223145008087158, 'eval_runtime': 79.4212, 'eval_samples_per_second': 7.693, 'eval_steps_per_second': 1.926, 'epoch': 79.0}
{'loss': 2.9934, 'grad_norm': 4.521819591522217, 'learning_rate': 9.343669250645996e-06, 'epoch': 79.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.217130661010742, 'eval_runtime': 78.2617, 'eval_samples_per_second': 7.807, 'eval_steps_per_second': 1.955, 'epoch': 80.0}
{'loss': 2.9838, 'grad_norm': 5.096582412719727, 'learning_rate': 9.21046511627907e-06, 'epoch': 80.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2184407711029053, 'eval_runtime': 78.3135, 'eval_samples_per_second': 7.802, 'eval_steps_per_second': 1.954, 'epoch': 81.0}
{'loss': 2.9815, 'grad_norm': 4.651862144470215, 'learning_rate': 9.077260981912145e-06, 'epoch': 81.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.212413787841797, 'eval_runtime': 78.3576, 'eval_samples_per_second': 7.798, 'eval_steps_per_second': 1.953, 'epoch': 82.0}
{'loss': 2.9697, 'grad_norm': 4.192619800567627, 'learning_rate': 8.94405684754522e-06, 'epoch': 82.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.216473340988159, 'eval_runtime': 79.2684, 'eval_samples_per_second': 7.708, 'eval_steps_per_second': 1.93, 'epoch': 83.0}
{'loss': 2.9675, 'grad_norm': 5.570767879486084, 'learning_rate': 8.810852713178295e-06, 'epoch': 83.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.206214666366577, 'eval_runtime': 78.1927, 'eval_samples_per_second': 7.814, 'eval_steps_per_second': 1.957, 'epoch': 84.0}
{'loss': 2.9502, 'grad_norm': 4.453334808349609, 'learning_rate': 8.67764857881137e-06, 'epoch': 84.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2004165649414062, 'eval_runtime': 78.0008, 'eval_samples_per_second': 7.833, 'eval_steps_per_second': 1.962, 'epoch': 85.0}
{'loss': 2.9498, 'grad_norm': 4.58155632019043, 'learning_rate': 8.544444444444445e-06, 'epoch': 85.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2016472816467285, 'eval_runtime': 78.5577, 'eval_samples_per_second': 7.778, 'eval_steps_per_second': 1.948, 'epoch': 86.0}
{'loss': 2.9279, 'grad_norm': 5.51931619644165, 'learning_rate': 8.41124031007752e-06, 'epoch': 86.92}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.2076504230499268, 'eval_runtime': 78.7765, 'eval_samples_per_second': 7.756, 'eval_steps_per_second': 1.942, 'epoch': 87.0}
{'loss': 2.9399, 'grad_norm': 6.121025562286377, 'learning_rate': 8.278036175710596e-06, 'epoch': 87.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1975979804992676, 'eval_runtime': 78.268, 'eval_samples_per_second': 7.807, 'eval_steps_per_second': 1.955, 'epoch': 88.0}
{'loss': 2.9292, 'grad_norm': 7.15089225769043, 'learning_rate': 8.14483204134367e-06, 'epoch': 88.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1996233463287354, 'eval_runtime': 78.6372, 'eval_samples_per_second': 7.77, 'eval_steps_per_second': 1.946, 'epoch': 89.0}
{'loss': 2.9226, 'grad_norm': 4.547684669494629, 'learning_rate': 8.011627906976744e-06, 'epoch': 89.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1978025436401367, 'eval_runtime': 80.0774, 'eval_samples_per_second': 7.63, 'eval_steps_per_second': 1.911, 'epoch': 90.0}
{'loss': 2.9069, 'grad_norm': 7.293493747711182, 'learning_rate': 7.87842377260982e-06, 'epoch': 90.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1910600662231445, 'eval_runtime': 79.165, 'eval_samples_per_second': 7.718, 'eval_steps_per_second': 1.933, 'epoch': 91.0}
{'loss': 2.9122, 'grad_norm': 5.705012321472168, 'learning_rate': 7.745219638242894e-06, 'epoch': 91.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.192939281463623, 'eval_runtime': 104.5347, 'eval_samples_per_second': 5.845, 'eval_steps_per_second': 1.464, 'epoch': 92.0}
{'loss': 2.8969, 'grad_norm': 8.02780818939209, 'learning_rate': 7.6120155038759694e-06, 'epoch': 92.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1859025955200195, 'eval_runtime': 97.2035, 'eval_samples_per_second': 6.286, 'eval_steps_per_second': 1.574, 'epoch': 93.0}
{'loss': 2.8807, 'grad_norm': 4.562335014343262, 'learning_rate': 7.478811369509045e-06, 'epoch': 93.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.187920570373535, 'eval_runtime': 105.0835, 'eval_samples_per_second': 5.814, 'eval_steps_per_second': 1.456, 'epoch': 94.0}
{'loss': 2.8949, 'grad_norm': 4.937468528747559, 'learning_rate': 7.345607235142119e-06, 'epoch': 94.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1919009685516357, 'eval_runtime': 79.467, 'eval_samples_per_second': 7.689, 'eval_steps_per_second': 1.925, 'epoch': 95.0}
{'loss': 2.8756, 'grad_norm': 4.935351848602295, 'learning_rate': 7.212403100775195e-06, 'epoch': 95.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1800365447998047, 'eval_runtime': 79.2802, 'eval_samples_per_second': 7.707, 'eval_steps_per_second': 1.93, 'epoch': 96.0}
{'loss': 2.8721, 'grad_norm': 4.5955305099487305, 'learning_rate': 7.07919896640827e-06, 'epoch': 96.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.180633544921875, 'eval_runtime': 78.933, 'eval_samples_per_second': 7.741, 'eval_steps_per_second': 1.938, 'epoch': 97.0}
{'loss': 2.8632, 'grad_norm': 5.352871894836426, 'learning_rate': 6.945994832041344e-06, 'epoch': 97.91}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1819465160369873, 'eval_runtime': 78.865, 'eval_samples_per_second': 7.747, 'eval_steps_per_second': 1.94, 'epoch': 98.0}
{'loss': 2.8371, 'grad_norm': 4.471981048583984, 'learning_rate': 6.812790697674419e-06, 'epoch': 98.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1822266578674316, 'eval_runtime': 79.0915, 'eval_samples_per_second': 7.725, 'eval_steps_per_second': 1.934, 'epoch': 99.0}
{'loss': 2.8712, 'grad_norm': 4.291103839874268, 'learning_rate': 6.679586563307494e-06, 'epoch': 99.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1816511154174805, 'eval_runtime': 79.7591, 'eval_samples_per_second': 7.661, 'eval_steps_per_second': 1.918, 'epoch': 100.0}
{'loss': 2.8373, 'grad_norm': 6.408052444458008, 'learning_rate': 6.546382428940569e-06, 'epoch': 100.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1746160984039307, 'eval_runtime': 89.005, 'eval_samples_per_second': 6.865, 'eval_steps_per_second': 1.719, 'epoch': 101.0}
{'loss': 2.8548, 'grad_norm': 4.13137674331665, 'learning_rate': 6.413178294573644e-06, 'epoch': 101.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1786201000213623, 'eval_runtime': 79.909, 'eval_samples_per_second': 7.646, 'eval_steps_per_second': 1.915, 'epoch': 102.0}
{'loss': 2.8359, 'grad_norm': 4.733768939971924, 'learning_rate': 6.279974160206719e-06, 'epoch': 102.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.174837827682495, 'eval_runtime': 79.811, 'eval_samples_per_second': 7.656, 'eval_steps_per_second': 1.917, 'epoch': 103.0}
{'loss': 2.8304, 'grad_norm': 4.7253875732421875, 'learning_rate': 6.1467700258397935e-06, 'epoch': 103.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1756513118743896, 'eval_runtime': 112.7995, 'eval_samples_per_second': 5.417, 'eval_steps_per_second': 1.356, 'epoch': 104.0}
{'loss': 2.8317, 'grad_norm': 4.843450546264648, 'learning_rate': 6.013565891472869e-06, 'epoch': 104.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1717967987060547, 'eval_runtime': 80.9785, 'eval_samples_per_second': 7.545, 'eval_steps_per_second': 1.889, 'epoch': 105.0}
{'loss': 2.8175, 'grad_norm': 5.473032474517822, 'learning_rate': 5.880361757105943e-06, 'epoch': 105.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1685500144958496, 'eval_runtime': 79.8708, 'eval_samples_per_second': 7.65, 'eval_steps_per_second': 1.916, 'epoch': 106.0}
{'loss': 2.8134, 'grad_norm': 9.296467781066895, 'learning_rate': 5.747157622739018e-06, 'epoch': 106.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1618971824645996, 'eval_runtime': 79.33, 'eval_samples_per_second': 7.702, 'eval_steps_per_second': 1.929, 'epoch': 107.0}
{'loss': 2.815, 'grad_norm': 5.149070739746094, 'learning_rate': 5.613953488372094e-06, 'epoch': 107.9}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1662955284118652, 'eval_runtime': 79.5825, 'eval_samples_per_second': 7.678, 'eval_steps_per_second': 1.923, 'epoch': 108.0}
{'loss': 2.8191, 'grad_norm': 4.797739028930664, 'learning_rate': 5.480749354005169e-06, 'epoch': 108.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1668994426727295, 'eval_runtime': 126.101, 'eval_samples_per_second': 4.845, 'eval_steps_per_second': 1.213, 'epoch': 109.0}
{'loss': 2.794, 'grad_norm': 5.788322925567627, 'learning_rate': 5.347545219638243e-06, 'epoch': 109.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.170778274536133, 'eval_runtime': 305.8626, 'eval_samples_per_second': 1.998, 'eval_steps_per_second': 0.5, 'epoch': 110.0}
{'loss': 2.798, 'grad_norm': 4.488123416900635, 'learning_rate': 5.214341085271318e-06, 'epoch': 110.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1673190593719482, 'eval_runtime': 79.4385, 'eval_samples_per_second': 7.691, 'eval_steps_per_second': 1.926, 'epoch': 111.0}
{'loss': 2.7894, 'grad_norm': 5.346701145172119, 'learning_rate': 5.081136950904393e-06, 'epoch': 111.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.163666009902954, 'eval_runtime': 78.728, 'eval_samples_per_second': 7.761, 'eval_steps_per_second': 1.943, 'epoch': 112.0}
{'loss': 2.803, 'grad_norm': 9.438949584960938, 'learning_rate': 4.947932816537468e-06, 'epoch': 112.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1637542247772217, 'eval_runtime': 78.4715, 'eval_samples_per_second': 7.786, 'eval_steps_per_second': 1.95, 'epoch': 113.0}
{'loss': 2.7797, 'grad_norm': 6.053882598876953, 'learning_rate': 4.814728682170543e-06, 'epoch': 113.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1605420112609863, 'eval_runtime': 78.958, 'eval_samples_per_second': 7.738, 'eval_steps_per_second': 1.938, 'epoch': 114.0}
{'loss': 2.7959, 'grad_norm': 5.037245273590088, 'learning_rate': 4.6815245478036185e-06, 'epoch': 114.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1628756523132324, 'eval_runtime': 79.133, 'eval_samples_per_second': 7.721, 'eval_steps_per_second': 1.933, 'epoch': 115.0}
{'loss': 2.7768, 'grad_norm': 5.270567893981934, 'learning_rate': 4.5483204134366925e-06, 'epoch': 115.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.163506269454956, 'eval_runtime': 79.427, 'eval_samples_per_second': 7.693, 'eval_steps_per_second': 1.926, 'epoch': 116.0}
{'loss': 2.7872, 'grad_norm': 5.149022579193115, 'learning_rate': 4.415116279069767e-06, 'epoch': 116.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1600704193115234, 'eval_runtime': 73.9785, 'eval_samples_per_second': 8.259, 'eval_steps_per_second': 2.068, 'epoch': 117.0}
{'loss': 2.7628, 'grad_norm': 6.5561323165893555, 'learning_rate': 4.281912144702843e-06, 'epoch': 117.89}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.160885810852051, 'eval_runtime': 74.931, 'eval_samples_per_second': 8.154, 'eval_steps_per_second': 2.042, 'epoch': 118.0}
{'loss': 2.7931, 'grad_norm': 6.556570053100586, 'learning_rate': 4.148708010335918e-06, 'epoch': 118.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.158107042312622, 'eval_runtime': 78.657, 'eval_samples_per_second': 7.768, 'eval_steps_per_second': 1.945, 'epoch': 119.0}
{'loss': 2.7486, 'grad_norm': 5.058747291564941, 'learning_rate': 4.015503875968993e-06, 'epoch': 119.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1530072689056396, 'eval_runtime': 78.8115, 'eval_samples_per_second': 7.753, 'eval_steps_per_second': 1.941, 'epoch': 120.0}
{'loss': 2.7747, 'grad_norm': 5.552943229675293, 'learning_rate': 3.8822997416020675e-06, 'epoch': 120.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1596364974975586, 'eval_runtime': 304.9295, 'eval_samples_per_second': 2.004, 'eval_steps_per_second': 0.502, 'epoch': 121.0}
{'loss': 2.7651, 'grad_norm': 7.0377397537231445, 'learning_rate': 3.7490956072351424e-06, 'epoch': 121.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.160316228866577, 'eval_runtime': 83.439, 'eval_samples_per_second': 7.323, 'eval_steps_per_second': 1.834, 'epoch': 122.0}
{'loss': 2.7568, 'grad_norm': 6.017354965209961, 'learning_rate': 3.6158914728682176e-06, 'epoch': 122.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.150890827178955, 'eval_runtime': 260.7021, 'eval_samples_per_second': 2.344, 'eval_steps_per_second': 0.587, 'epoch': 123.0}
{'loss': 2.7486, 'grad_norm': 4.852512836456299, 'learning_rate': 3.482687338501292e-06, 'epoch': 123.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1514341831207275, 'eval_runtime': 286.3078, 'eval_samples_per_second': 2.134, 'eval_steps_per_second': 0.534, 'epoch': 124.0}
{'loss': 2.7405, 'grad_norm': 7.086965560913086, 'learning_rate': 3.3494832041343673e-06, 'epoch': 124.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.156853199005127, 'eval_runtime': 78.445, 'eval_samples_per_second': 7.789, 'eval_steps_per_second': 1.95, 'epoch': 125.0}
{'loss': 2.7513, 'grad_norm': 5.139013767242432, 'learning_rate': 3.216279069767442e-06, 'epoch': 125.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1565029621124268, 'eval_runtime': 111.71, 'eval_samples_per_second': 5.47, 'eval_steps_per_second': 1.37, 'epoch': 126.0}
{'loss': 2.7461, 'grad_norm': 5.098872661590576, 'learning_rate': 3.083074935400517e-06, 'epoch': 126.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1528167724609375, 'eval_runtime': 106.9755, 'eval_samples_per_second': 5.712, 'eval_steps_per_second': 1.43, 'epoch': 127.0}
{'loss': 2.7414, 'grad_norm': 5.781871795654297, 'learning_rate': 2.949870801033592e-06, 'epoch': 127.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.154263734817505, 'eval_runtime': 107.2965, 'eval_samples_per_second': 5.695, 'eval_steps_per_second': 1.426, 'epoch': 128.0}
{'loss': 2.7459, 'grad_norm': 5.903141975402832, 'learning_rate': 2.816666666666667e-06, 'epoch': 128.88}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1527822017669678, 'eval_runtime': 107.146, 'eval_samples_per_second': 5.702, 'eval_steps_per_second': 1.428, 'epoch': 129.0}
{'loss': 2.7425, 'grad_norm': 4.172565937042236, 'learning_rate': 2.683462532299742e-06, 'epoch': 129.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1523489952087402, 'eval_runtime': 105.1405, 'eval_samples_per_second': 5.811, 'eval_steps_per_second': 1.455, 'epoch': 130.0}
{'loss': 2.7433, 'grad_norm': 5.075046539306641, 'learning_rate': 2.5502583979328168e-06, 'epoch': 130.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.150815010070801, 'eval_runtime': 144.5692, 'eval_samples_per_second': 4.226, 'eval_steps_per_second': 1.058, 'epoch': 131.0}
{'loss': 2.7376, 'grad_norm': 6.775362968444824, 'learning_rate': 2.4170542635658916e-06, 'epoch': 131.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1537468433380127, 'eval_runtime': 91.6895, 'eval_samples_per_second': 6.664, 'eval_steps_per_second': 1.669, 'epoch': 132.0}
{'loss': 2.734, 'grad_norm': 6.134314060211182, 'learning_rate': 2.2838501291989664e-06, 'epoch': 132.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.154477596282959, 'eval_runtime': 99.8755, 'eval_samples_per_second': 6.118, 'eval_steps_per_second': 1.532, 'epoch': 133.0}
{'loss': 2.7291, 'grad_norm': 6.463385581970215, 'learning_rate': 2.1506459948320417e-06, 'epoch': 133.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.150434732437134, 'eval_runtime': 96.0345, 'eval_samples_per_second': 6.362, 'eval_steps_per_second': 1.593, 'epoch': 134.0}
{'loss': 2.7132, 'grad_norm': 5.839410781860352, 'learning_rate': 2.0174418604651166e-06, 'epoch': 134.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1505961418151855, 'eval_runtime': 97.7355, 'eval_samples_per_second': 6.252, 'eval_steps_per_second': 1.565, 'epoch': 135.0}
{'loss': 2.739, 'grad_norm': 7.107097625732422, 'learning_rate': 1.8842377260981914e-06, 'epoch': 135.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1443893909454346, 'eval_runtime': 98.7161, 'eval_samples_per_second': 6.189, 'eval_steps_per_second': 1.55, 'epoch': 136.0}
{'loss': 2.7324, 'grad_norm': 5.280280590057373, 'learning_rate': 1.7510335917312662e-06, 'epoch': 136.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.15047550201416, 'eval_runtime': 147.4205, 'eval_samples_per_second': 4.145, 'eval_steps_per_second': 1.038, 'epoch': 137.0}
{'loss': 2.7192, 'grad_norm': 5.079336643218994, 'learning_rate': 1.6178294573643413e-06, 'epoch': 137.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1498687267303467, 'eval_runtime': 138.9675, 'eval_samples_per_second': 4.397, 'eval_steps_per_second': 1.101, 'epoch': 138.0}
{'loss': 2.7279, 'grad_norm': 6.488892555236816, 'learning_rate': 1.4846253229974161e-06, 'epoch': 138.87}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.146120071411133, 'eval_runtime': 125.048, 'eval_samples_per_second': 4.886, 'eval_steps_per_second': 1.224, 'epoch': 139.0}
{'loss': 2.7159, 'grad_norm': 5.558673858642578, 'learning_rate': 1.351421188630491e-06, 'epoch': 139.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.146765947341919, 'eval_runtime': 121.9443, 'eval_samples_per_second': 5.01, 'eval_steps_per_second': 1.255, 'epoch': 140.0}
{'loss': 2.7139, 'grad_norm': 3.793421983718872, 'learning_rate': 1.218217054263566e-06, 'epoch': 140.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.147934913635254, 'eval_runtime': 130.5435, 'eval_samples_per_second': 4.68, 'eval_steps_per_second': 1.172, 'epoch': 141.0}
{'loss': 2.7237, 'grad_norm': 5.7456841468811035, 'learning_rate': 1.0850129198966409e-06, 'epoch': 141.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.146505117416382, 'eval_runtime': 305.12, 'eval_samples_per_second': 2.002, 'eval_steps_per_second': 0.501, 'epoch': 142.0}
{'loss': 2.7204, 'grad_norm': 5.918363094329834, 'learning_rate': 9.518087855297159e-07, 'epoch': 142.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1471259593963623, 'eval_runtime': 314.7765, 'eval_samples_per_second': 1.941, 'eval_steps_per_second': 0.486, 'epoch': 143.0}
{'loss': 2.721, 'grad_norm': 6.180527687072754, 'learning_rate': 8.186046511627906e-07, 'epoch': 143.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.147383451461792, 'eval_runtime': 312.7467, 'eval_samples_per_second': 1.954, 'eval_steps_per_second': 0.489, 'epoch': 144.0}
{'loss': 2.7043, 'grad_norm': 6.420068264007568, 'learning_rate': 6.854005167958656e-07, 'epoch': 144.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1496193408966064, 'eval_runtime': 528.3749, 'eval_samples_per_second': 1.156, 'eval_steps_per_second': 0.29, 'epoch': 145.0}
{'loss': 2.7113, 'grad_norm': 5.825915336608887, 'learning_rate': 5.521963824289405e-07, 'epoch': 145.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1495401859283447, 'eval_runtime': 451.7657, 'eval_samples_per_second': 1.352, 'eval_steps_per_second': 0.339, 'epoch': 146.0}
{'loss': 2.719, 'grad_norm': 4.428551197052002, 'learning_rate': 4.1899224806201554e-07, 'epoch': 146.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1478867530822754, 'eval_runtime': 119.1798, 'eval_samples_per_second': 5.127, 'eval_steps_per_second': 1.284, 'epoch': 147.0}
{'loss': 2.7135, 'grad_norm': 6.700161457061768, 'learning_rate': 2.857881136950905e-07, 'epoch': 147.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.1486620903015137, 'eval_runtime': 101.0995, 'eval_samples_per_second': 6.044, 'eval_steps_per_second': 1.513, 'epoch': 148.0}
{'loss': 2.7019, 'grad_norm': 6.355393886566162, 'learning_rate': 1.5258397932816538e-07, 'epoch': 148.86}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.14864444732666, 'eval_runtime': 121.7125, 'eval_samples_per_second': 5.02, 'eval_steps_per_second': 1.257, 'epoch': 149.0}
{'loss': 2.7174, 'grad_norm': 5.058971405029297, 'learning_rate': 1.937984496124031e-08, 'epoch': 149.85}


  0%|          | 0/153 [00:00<?, ?it/s]

{'eval_loss': 3.14858341217041, 'eval_runtime': 105.7165, 'eval_samples_per_second': 5.78, 'eval_steps_per_second': 1.447, 'epoch': 150.0}
{'train_runtime': 780407.035, 'train_samples_per_second': 0.793, 'train_steps_per_second': 0.198, 'train_loss': 3.4096622116128295, 'epoch': 150.0}


TrainOutput(global_step=154800, training_loss=3.4096622116128295, metrics={'train_runtime': 780407.035, 'train_samples_per_second': 0.793, 'train_steps_per_second': 0.198, 'total_flos': 4095707498127360.0, 'train_loss': 3.4096622116128295, 'epoch': 150.0})

In [11]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MT5ForConditionalGeneration

# Define constants and model
BATCH_SIZE = 8 # Increased batch size to reduce iterations

# Assuming tokenized_dataset and tokenizer are already defined
# Initialize the model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

logging_steps = len(tokenized_datasets["train"]) // BATCH_SIZE
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=150,  # Reduced number of epochs to 50
    predict_with_generate=True,
    fp16=True,  # Enabled mixed precision training
    logging_steps=logging_steps,
    report_to="none"
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

  0%|          | 0/77400 [00:00<?, ?it/s]

{'loss': 20.1367, 'grad_norm': 614.359130859375, 'learning_rate': 1.9866925064599486e-05, 'epoch': 1.0}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 10.498639106750488, 'eval_runtime': 98.517, 'eval_samples_per_second': 6.202, 'eval_steps_per_second': 0.782, 'epoch': 1.0}
{'loss': 11.2846, 'grad_norm': 83.03507995605469, 'learning_rate': 1.973385012919897e-05, 'epoch': 2.0}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 7.829776763916016, 'eval_runtime': 61.5822, 'eval_samples_per_second': 9.922, 'eval_steps_per_second': 1.25, 'epoch': 2.0}
{'loss': 8.426, 'grad_norm': 9.435274124145508, 'learning_rate': 1.960077519379845e-05, 'epoch': 2.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 6.597095012664795, 'eval_runtime': 61.2466, 'eval_samples_per_second': 9.976, 'eval_steps_per_second': 1.257, 'epoch': 3.0}
{'loss': 7.0633, 'grad_norm': 7.49997091293335, 'learning_rate': 1.9467700258397934e-05, 'epoch': 3.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 5.544681072235107, 'eval_runtime': 61.7207, 'eval_samples_per_second': 9.899, 'eval_steps_per_second': 1.248, 'epoch': 4.0}
{'loss': 6.3257, 'grad_norm': 3.9559500217437744, 'learning_rate': 1.9334625322997418e-05, 'epoch': 4.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 5.1110382080078125, 'eval_runtime': 61.7181, 'eval_samples_per_second': 9.9, 'eval_steps_per_second': 1.248, 'epoch': 5.0}
{'loss': 5.8627, 'grad_norm': 4.172357082366943, 'learning_rate': 1.9201550387596902e-05, 'epoch': 5.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.8446784019470215, 'eval_runtime': 61.543, 'eval_samples_per_second': 9.928, 'eval_steps_per_second': 1.251, 'epoch': 6.0}
{'loss': 5.5792, 'grad_norm': 2.4350268840789795, 'learning_rate': 1.9068475452196383e-05, 'epoch': 6.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.672014236450195, 'eval_runtime': 80.6637, 'eval_samples_per_second': 7.575, 'eval_steps_per_second': 0.955, 'epoch': 7.0}
{'loss': 5.3884, 'grad_norm': 2.7827043533325195, 'learning_rate': 1.8935400516795867e-05, 'epoch': 7.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.5460124015808105, 'eval_runtime': 84.9988, 'eval_samples_per_second': 7.188, 'eval_steps_per_second': 0.906, 'epoch': 8.0}
{'loss': 5.2424, 'grad_norm': 3.1685125827789307, 'learning_rate': 1.880232558139535e-05, 'epoch': 8.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.440890789031982, 'eval_runtime': 81.7825, 'eval_samples_per_second': 7.471, 'eval_steps_per_second': 0.942, 'epoch': 9.0}
{'loss': 5.119, 'grad_norm': 2.874269962310791, 'learning_rate': 1.8669250645994835e-05, 'epoch': 9.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.359720706939697, 'eval_runtime': 86.0011, 'eval_samples_per_second': 7.105, 'eval_steps_per_second': 0.895, 'epoch': 10.0}
{'loss': 5.0018, 'grad_norm': 3.221372127532959, 'learning_rate': 1.853617571059432e-05, 'epoch': 10.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.288365840911865, 'eval_runtime': 70.994, 'eval_samples_per_second': 8.606, 'eval_steps_per_second': 1.085, 'epoch': 11.0}
{'loss': 4.9201, 'grad_norm': 2.832091808319092, 'learning_rate': 1.84031007751938e-05, 'epoch': 11.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.228158950805664, 'eval_runtime': 296.9179, 'eval_samples_per_second': 2.058, 'eval_steps_per_second': 0.259, 'epoch': 12.0}
{'loss': 4.8315, 'grad_norm': 2.5794425010681152, 'learning_rate': 1.8270025839793283e-05, 'epoch': 12.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.176252841949463, 'eval_runtime': 88.1648, 'eval_samples_per_second': 6.93, 'eval_steps_per_second': 0.873, 'epoch': 13.0}
{'loss': 4.7636, 'grad_norm': 3.1710832118988037, 'learning_rate': 1.8136950904392767e-05, 'epoch': 13.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.131303310394287, 'eval_runtime': 88.3686, 'eval_samples_per_second': 6.914, 'eval_steps_per_second': 0.871, 'epoch': 14.0}
{'loss': 4.6941, 'grad_norm': 2.8701717853546143, 'learning_rate': 1.800387596899225e-05, 'epoch': 14.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.085657596588135, 'eval_runtime': 81.3745, 'eval_samples_per_second': 7.508, 'eval_steps_per_second': 0.946, 'epoch': 15.0}
{'loss': 4.6314, 'grad_norm': 2.5530765056610107, 'learning_rate': 1.7870801033591732e-05, 'epoch': 15.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.049203872680664, 'eval_runtime': 90.9401, 'eval_samples_per_second': 6.719, 'eval_steps_per_second': 0.847, 'epoch': 16.0}
{'loss': 4.5901, 'grad_norm': 3.187238931655884, 'learning_rate': 1.7737726098191216e-05, 'epoch': 16.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.0167388916015625, 'eval_runtime': 63.0854, 'eval_samples_per_second': 9.685, 'eval_steps_per_second': 1.221, 'epoch': 17.0}
{'loss': 4.5285, 'grad_norm': 3.1851890087127686, 'learning_rate': 1.76046511627907e-05, 'epoch': 17.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.9821040630340576, 'eval_runtime': 62.1659, 'eval_samples_per_second': 9.829, 'eval_steps_per_second': 1.239, 'epoch': 18.0}
{'loss': 4.4775, 'grad_norm': 2.4279000759124756, 'learning_rate': 1.747157622739018e-05, 'epoch': 18.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.9519495964050293, 'eval_runtime': 61.9359, 'eval_samples_per_second': 9.865, 'eval_steps_per_second': 1.243, 'epoch': 19.0}
{'loss': 4.4231, 'grad_norm': 3.3391740322113037, 'learning_rate': 1.7338501291989665e-05, 'epoch': 19.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.9220025539398193, 'eval_runtime': 62.3963, 'eval_samples_per_second': 9.792, 'eval_steps_per_second': 1.234, 'epoch': 20.0}
{'loss': 4.3921, 'grad_norm': 2.948448896408081, 'learning_rate': 1.720542635658915e-05, 'epoch': 20.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.896437644958496, 'eval_runtime': 62.4477, 'eval_samples_per_second': 9.784, 'eval_steps_per_second': 1.233, 'epoch': 21.0}
{'loss': 4.35, 'grad_norm': 2.558866262435913, 'learning_rate': 1.7072351421188633e-05, 'epoch': 21.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.872109889984131, 'eval_runtime': 62.3819, 'eval_samples_per_second': 9.795, 'eval_steps_per_second': 1.234, 'epoch': 22.0}
{'loss': 4.2989, 'grad_norm': 3.8116743564605713, 'learning_rate': 1.6939276485788113e-05, 'epoch': 22.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.847226619720459, 'eval_runtime': 62.696, 'eval_samples_per_second': 9.745, 'eval_steps_per_second': 1.228, 'epoch': 23.0}
{'loss': 4.2756, 'grad_norm': 2.846177101135254, 'learning_rate': 1.6806201550387597e-05, 'epoch': 23.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.8285930156707764, 'eval_runtime': 61.071, 'eval_samples_per_second': 10.005, 'eval_steps_per_second': 1.261, 'epoch': 24.0}
{'loss': 4.2314, 'grad_norm': 3.1122190952301025, 'learning_rate': 1.667312661498708e-05, 'epoch': 24.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.8064401149749756, 'eval_runtime': 61.843, 'eval_samples_per_second': 9.88, 'eval_steps_per_second': 1.245, 'epoch': 25.0}
{'loss': 4.1979, 'grad_norm': 3.0519349575042725, 'learning_rate': 1.6540051679586565e-05, 'epoch': 25.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.7879273891448975, 'eval_runtime': 62.7856, 'eval_samples_per_second': 9.732, 'eval_steps_per_second': 1.226, 'epoch': 26.0}
{'loss': 4.1614, 'grad_norm': 3.028542995452881, 'learning_rate': 1.6406976744186046e-05, 'epoch': 26.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.772365093231201, 'eval_runtime': 61.4635, 'eval_samples_per_second': 9.941, 'eval_steps_per_second': 1.253, 'epoch': 27.0}
{'loss': 4.1424, 'grad_norm': 2.5390100479125977, 'learning_rate': 1.627390180878553e-05, 'epoch': 27.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.7515337467193604, 'eval_runtime': 63.0173, 'eval_samples_per_second': 9.696, 'eval_steps_per_second': 1.222, 'epoch': 28.0}
{'loss': 4.0981, 'grad_norm': 3.239469528198242, 'learning_rate': 1.6140826873385014e-05, 'epoch': 28.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.7354001998901367, 'eval_runtime': 61.8661, 'eval_samples_per_second': 9.876, 'eval_steps_per_second': 1.245, 'epoch': 29.0}
{'loss': 4.088, 'grad_norm': 2.6823465824127197, 'learning_rate': 1.6007751937984498e-05, 'epoch': 29.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.720628261566162, 'eval_runtime': 62.5164, 'eval_samples_per_second': 9.773, 'eval_steps_per_second': 1.232, 'epoch': 30.0}
{'loss': 4.0345, 'grad_norm': 3.0659797191619873, 'learning_rate': 1.5874677002583982e-05, 'epoch': 30.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.7023284435272217, 'eval_runtime': 94.6867, 'eval_samples_per_second': 6.453, 'eval_steps_per_second': 0.813, 'epoch': 31.0}
{'loss': 4.0217, 'grad_norm': 3.6665537357330322, 'learning_rate': 1.5741602067183462e-05, 'epoch': 31.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.6895687580108643, 'eval_runtime': 63.3285, 'eval_samples_per_second': 9.648, 'eval_steps_per_second': 1.216, 'epoch': 32.0}
{'loss': 3.9996, 'grad_norm': 2.9158363342285156, 'learning_rate': 1.5608527131782946e-05, 'epoch': 32.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.673503875732422, 'eval_runtime': 113.849, 'eval_samples_per_second': 5.367, 'eval_steps_per_second': 0.676, 'epoch': 33.0}
{'loss': 3.9787, 'grad_norm': 3.092495918273926, 'learning_rate': 1.547545219638243e-05, 'epoch': 33.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.6639811992645264, 'eval_runtime': 140.28, 'eval_samples_per_second': 4.356, 'eval_steps_per_second': 0.549, 'epoch': 34.0}
{'loss': 3.9368, 'grad_norm': 2.993306875228882, 'learning_rate': 1.5342377260981914e-05, 'epoch': 34.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.6433026790618896, 'eval_runtime': 219.4935, 'eval_samples_per_second': 2.784, 'eval_steps_per_second': 0.351, 'epoch': 35.0}
{'loss': 3.9281, 'grad_norm': 3.1516501903533936, 'learning_rate': 1.5209302325581397e-05, 'epoch': 35.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.6369407176971436, 'eval_runtime': 336.334, 'eval_samples_per_second': 1.817, 'eval_steps_per_second': 0.229, 'epoch': 36.0}
{'loss': 3.9066, 'grad_norm': 2.9132065773010254, 'learning_rate': 1.5076227390180879e-05, 'epoch': 36.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.6235358715057373, 'eval_runtime': 316.0924, 'eval_samples_per_second': 1.933, 'eval_steps_per_second': 0.244, 'epoch': 37.0}
{'loss': 3.8855, 'grad_norm': 3.3715741634368896, 'learning_rate': 1.4943152454780363e-05, 'epoch': 37.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.6080400943756104, 'eval_runtime': 323.5564, 'eval_samples_per_second': 1.888, 'eval_steps_per_second': 0.238, 'epoch': 38.0}
{'loss': 3.8598, 'grad_norm': 2.684797763824463, 'learning_rate': 1.4810077519379847e-05, 'epoch': 38.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.600231647491455, 'eval_runtime': 61.542, 'eval_samples_per_second': 9.928, 'eval_steps_per_second': 1.251, 'epoch': 39.0}
{'loss': 3.8502, 'grad_norm': 2.9660048484802246, 'learning_rate': 1.467700258397933e-05, 'epoch': 39.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.588284492492676, 'eval_runtime': 62.8125, 'eval_samples_per_second': 9.727, 'eval_steps_per_second': 1.226, 'epoch': 40.0}
{'loss': 3.8196, 'grad_norm': 2.9783101081848145, 'learning_rate': 1.4543927648578813e-05, 'epoch': 40.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.576374053955078, 'eval_runtime': 62.058, 'eval_samples_per_second': 9.846, 'eval_steps_per_second': 1.241, 'epoch': 41.0}
{'loss': 3.8084, 'grad_norm': 3.2610666751861572, 'learning_rate': 1.4410852713178296e-05, 'epoch': 41.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.573369026184082, 'eval_runtime': 62.034, 'eval_samples_per_second': 9.849, 'eval_steps_per_second': 1.241, 'epoch': 42.0}
{'loss': 3.7759, 'grad_norm': 3.2587804794311523, 'learning_rate': 1.427777777777778e-05, 'epoch': 42.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.559082508087158, 'eval_runtime': 62.9396, 'eval_samples_per_second': 9.708, 'eval_steps_per_second': 1.223, 'epoch': 43.0}
{'loss': 3.7727, 'grad_norm': 3.2281343936920166, 'learning_rate': 1.4144702842377262e-05, 'epoch': 43.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.5501019954681396, 'eval_runtime': 61.5565, 'eval_samples_per_second': 9.926, 'eval_steps_per_second': 1.251, 'epoch': 44.0}
{'loss': 3.7431, 'grad_norm': 3.002129077911377, 'learning_rate': 1.4011627906976746e-05, 'epoch': 44.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.5378406047821045, 'eval_runtime': 61.3085, 'eval_samples_per_second': 9.966, 'eval_steps_per_second': 1.256, 'epoch': 45.0}
{'loss': 3.7343, 'grad_norm': 3.9238104820251465, 'learning_rate': 1.3878552971576228e-05, 'epoch': 45.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.5298335552215576, 'eval_runtime': 62.304, 'eval_samples_per_second': 9.807, 'eval_steps_per_second': 1.236, 'epoch': 46.0}
{'loss': 3.7148, 'grad_norm': 3.6463053226470947, 'learning_rate': 1.3745478036175712e-05, 'epoch': 46.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.523484706878662, 'eval_runtime': 61.0735, 'eval_samples_per_second': 10.004, 'eval_steps_per_second': 1.261, 'epoch': 47.0}
{'loss': 3.6872, 'grad_norm': 3.197974920272827, 'learning_rate': 1.3612403100775196e-05, 'epoch': 47.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.513725757598877, 'eval_runtime': 62.1405, 'eval_samples_per_second': 9.833, 'eval_steps_per_second': 1.239, 'epoch': 48.0}
{'loss': 3.6833, 'grad_norm': 3.5236642360687256, 'learning_rate': 1.3479328165374679e-05, 'epoch': 48.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.5091395378112793, 'eval_runtime': 61.1395, 'eval_samples_per_second': 9.994, 'eval_steps_per_second': 1.259, 'epoch': 49.0}
{'loss': 3.6724, 'grad_norm': 3.4376657009124756, 'learning_rate': 1.3346253229974163e-05, 'epoch': 49.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.509408950805664, 'eval_runtime': 110.8975, 'eval_samples_per_second': 5.51, 'eval_steps_per_second': 0.694, 'epoch': 50.0}
{'loss': 3.6447, 'grad_norm': 2.9337635040283203, 'learning_rate': 1.3213178294573645e-05, 'epoch': 50.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.487698554992676, 'eval_runtime': 88.4055, 'eval_samples_per_second': 6.911, 'eval_steps_per_second': 0.871, 'epoch': 51.0}
{'loss': 3.6352, 'grad_norm': 3.404008150100708, 'learning_rate': 1.3080103359173129e-05, 'epoch': 51.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4835128784179688, 'eval_runtime': 83.7535, 'eval_samples_per_second': 7.295, 'eval_steps_per_second': 0.919, 'epoch': 52.0}
{'loss': 3.6274, 'grad_norm': 3.7519967555999756, 'learning_rate': 1.2947028423772611e-05, 'epoch': 52.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.471344470977783, 'eval_runtime': 191.6505, 'eval_samples_per_second': 3.188, 'eval_steps_per_second': 0.402, 'epoch': 53.0}
{'loss': 3.6064, 'grad_norm': 4.108335018157959, 'learning_rate': 1.2813953488372095e-05, 'epoch': 53.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.474226236343384, 'eval_runtime': 204.5926, 'eval_samples_per_second': 2.986, 'eval_steps_per_second': 0.376, 'epoch': 54.0}
{'loss': 3.5918, 'grad_norm': 2.9997732639312744, 'learning_rate': 1.2680878552971579e-05, 'epoch': 54.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4615085124969482, 'eval_runtime': 224.474, 'eval_samples_per_second': 2.722, 'eval_steps_per_second': 0.343, 'epoch': 55.0}
{'loss': 3.5894, 'grad_norm': 3.586928606033325, 'learning_rate': 1.254780361757106e-05, 'epoch': 55.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.454735279083252, 'eval_runtime': 223.3295, 'eval_samples_per_second': 2.736, 'eval_steps_per_second': 0.345, 'epoch': 56.0}
{'loss': 3.573, 'grad_norm': 3.0563840866088867, 'learning_rate': 1.2414728682170542e-05, 'epoch': 56.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.445779323577881, 'eval_runtime': 89.599, 'eval_samples_per_second': 6.819, 'eval_steps_per_second': 0.859, 'epoch': 57.0}
{'loss': 3.5504, 'grad_norm': 4.480730056762695, 'learning_rate': 1.2281653746770026e-05, 'epoch': 57.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4400174617767334, 'eval_runtime': 79.821, 'eval_samples_per_second': 7.655, 'eval_steps_per_second': 0.965, 'epoch': 58.0}
{'loss': 3.5475, 'grad_norm': 2.8499202728271484, 'learning_rate': 1.2148578811369508e-05, 'epoch': 58.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4366273880004883, 'eval_runtime': 61.7525, 'eval_samples_per_second': 9.894, 'eval_steps_per_second': 1.247, 'epoch': 59.0}
{'loss': 3.5402, 'grad_norm': 3.3940303325653076, 'learning_rate': 1.2015503875968992e-05, 'epoch': 59.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4275543689727783, 'eval_runtime': 62.3395, 'eval_samples_per_second': 9.801, 'eval_steps_per_second': 1.235, 'epoch': 60.0}
{'loss': 3.5172, 'grad_norm': 3.81459903717041, 'learning_rate': 1.1882428940568476e-05, 'epoch': 60.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.425278663635254, 'eval_runtime': 60.7585, 'eval_samples_per_second': 10.056, 'eval_steps_per_second': 1.267, 'epoch': 61.0}
{'loss': 3.5041, 'grad_norm': 2.9302849769592285, 'learning_rate': 1.1749354005167959e-05, 'epoch': 61.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.417884349822998, 'eval_runtime': 62.422, 'eval_samples_per_second': 9.788, 'eval_steps_per_second': 1.234, 'epoch': 62.0}
{'loss': 3.5105, 'grad_norm': 3.760425090789795, 'learning_rate': 1.1616279069767443e-05, 'epoch': 62.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.412822723388672, 'eval_runtime': 61.8875, 'eval_samples_per_second': 9.873, 'eval_steps_per_second': 1.244, 'epoch': 63.0}
{'loss': 3.4732, 'grad_norm': 3.0415031909942627, 'learning_rate': 1.1483204134366925e-05, 'epoch': 63.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4105584621429443, 'eval_runtime': 61.1505, 'eval_samples_per_second': 9.992, 'eval_steps_per_second': 1.259, 'epoch': 64.0}
{'loss': 3.4795, 'grad_norm': 3.1660571098327637, 'learning_rate': 1.1350129198966409e-05, 'epoch': 64.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4046566486358643, 'eval_runtime': 63.2925, 'eval_samples_per_second': 9.654, 'eval_steps_per_second': 1.217, 'epoch': 65.0}
{'loss': 3.4584, 'grad_norm': 3.287834644317627, 'learning_rate': 1.1217054263565891e-05, 'epoch': 65.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4054081439971924, 'eval_runtime': 63.0585, 'eval_samples_per_second': 9.689, 'eval_steps_per_second': 1.221, 'epoch': 66.0}
{'loss': 3.4491, 'grad_norm': 3.9148693084716797, 'learning_rate': 1.1083979328165375e-05, 'epoch': 66.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.391110420227051, 'eval_runtime': 62.911, 'eval_samples_per_second': 9.712, 'eval_steps_per_second': 1.224, 'epoch': 67.0}
{'loss': 3.4452, 'grad_norm': 3.4027154445648193, 'learning_rate': 1.095090439276486e-05, 'epoch': 67.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3919246196746826, 'eval_runtime': 64.5715, 'eval_samples_per_second': 9.462, 'eval_steps_per_second': 1.192, 'epoch': 68.0}
{'loss': 3.413, 'grad_norm': 3.288088321685791, 'learning_rate': 1.0817829457364342e-05, 'epoch': 68.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3739964962005615, 'eval_runtime': 62.5305, 'eval_samples_per_second': 9.771, 'eval_steps_per_second': 1.231, 'epoch': 69.0}
{'loss': 3.4312, 'grad_norm': 3.4442687034606934, 'learning_rate': 1.0684754521963826e-05, 'epoch': 69.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.382892608642578, 'eval_runtime': 62.5875, 'eval_samples_per_second': 9.762, 'eval_steps_per_second': 1.23, 'epoch': 70.0}
{'loss': 3.4134, 'grad_norm': 3.4753739833831787, 'learning_rate': 1.0551679586563308e-05, 'epoch': 70.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3916332721710205, 'eval_runtime': 64.611, 'eval_samples_per_second': 9.457, 'eval_steps_per_second': 1.192, 'epoch': 71.0}
{'loss': 3.4005, 'grad_norm': 3.0489256381988525, 'learning_rate': 1.0418604651162792e-05, 'epoch': 71.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3662819862365723, 'eval_runtime': 64.049, 'eval_samples_per_second': 9.54, 'eval_steps_per_second': 1.202, 'epoch': 72.0}
{'loss': 3.4001, 'grad_norm': 3.4742472171783447, 'learning_rate': 1.0285529715762274e-05, 'epoch': 72.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.365097999572754, 'eval_runtime': 62.6105, 'eval_samples_per_second': 9.759, 'eval_steps_per_second': 1.23, 'epoch': 73.0}
{'loss': 3.3868, 'grad_norm': 3.7314646244049072, 'learning_rate': 1.0152454780361758e-05, 'epoch': 73.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.360720157623291, 'eval_runtime': 64.126, 'eval_samples_per_second': 9.528, 'eval_steps_per_second': 1.201, 'epoch': 74.0}
{'loss': 3.3767, 'grad_norm': 3.0795955657958984, 'learning_rate': 1.001937984496124e-05, 'epoch': 74.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3568294048309326, 'eval_runtime': 64.026, 'eval_samples_per_second': 9.543, 'eval_steps_per_second': 1.203, 'epoch': 75.0}
{'loss': 3.3626, 'grad_norm': 4.0000319480896, 'learning_rate': 9.886304909560724e-06, 'epoch': 75.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.363809585571289, 'eval_runtime': 87.9385, 'eval_samples_per_second': 6.948, 'eval_steps_per_second': 0.876, 'epoch': 76.0}
{'loss': 3.3617, 'grad_norm': 3.491523504257202, 'learning_rate': 9.753229974160208e-06, 'epoch': 76.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3609302043914795, 'eval_runtime': 81.076, 'eval_samples_per_second': 7.536, 'eval_steps_per_second': 0.95, 'epoch': 77.0}
{'loss': 3.3418, 'grad_norm': 3.8557345867156982, 'learning_rate': 9.62015503875969e-06, 'epoch': 77.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3490195274353027, 'eval_runtime': 81.9135, 'eval_samples_per_second': 7.459, 'eval_steps_per_second': 0.94, 'epoch': 78.0}
{'loss': 3.3631, 'grad_norm': 3.2624709606170654, 'learning_rate': 9.487080103359175e-06, 'epoch': 78.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.341893196105957, 'eval_runtime': 80.427, 'eval_samples_per_second': 7.597, 'eval_steps_per_second': 0.957, 'epoch': 79.0}
{'loss': 3.3268, 'grad_norm': 3.0752758979797363, 'learning_rate': 9.354005167958657e-06, 'epoch': 79.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.346271514892578, 'eval_runtime': 82.007, 'eval_samples_per_second': 7.451, 'eval_steps_per_second': 0.939, 'epoch': 80.0}
{'loss': 3.3246, 'grad_norm': 2.8800182342529297, 'learning_rate': 9.220930232558141e-06, 'epoch': 80.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.335587978363037, 'eval_runtime': 79.3835, 'eval_samples_per_second': 7.697, 'eval_steps_per_second': 0.97, 'epoch': 81.0}
{'loss': 3.3265, 'grad_norm': 4.107977867126465, 'learning_rate': 9.087855297157623e-06, 'epoch': 81.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3350751399993896, 'eval_runtime': 315.8494, 'eval_samples_per_second': 1.934, 'eval_steps_per_second': 0.244, 'epoch': 82.0}
{'loss': 3.3085, 'grad_norm': 3.119832754135132, 'learning_rate': 8.954780361757107e-06, 'epoch': 82.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.33469820022583, 'eval_runtime': 79.4655, 'eval_samples_per_second': 7.689, 'eval_steps_per_second': 0.969, 'epoch': 83.0}
{'loss': 3.3139, 'grad_norm': 3.73928165435791, 'learning_rate': 8.82170542635659e-06, 'epoch': 83.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3251755237579346, 'eval_runtime': 77.7475, 'eval_samples_per_second': 7.859, 'eval_steps_per_second': 0.99, 'epoch': 84.0}
{'loss': 3.2881, 'grad_norm': 3.1547045707702637, 'learning_rate': 8.688630490956072e-06, 'epoch': 84.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3244476318359375, 'eval_runtime': 78.962, 'eval_samples_per_second': 7.738, 'eval_steps_per_second': 0.975, 'epoch': 85.0}
{'loss': 3.2955, 'grad_norm': 3.897055149078369, 'learning_rate': 8.555555555555556e-06, 'epoch': 85.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.313204765319824, 'eval_runtime': 60.682, 'eval_samples_per_second': 10.069, 'eval_steps_per_second': 1.269, 'epoch': 86.0}
{'loss': 3.2777, 'grad_norm': 3.5668528079986572, 'learning_rate': 8.42248062015504e-06, 'epoch': 86.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3276631832122803, 'eval_runtime': 60.304, 'eval_samples_per_second': 10.132, 'eval_steps_per_second': 1.277, 'epoch': 87.0}
{'loss': 3.2858, 'grad_norm': 3.2907373905181885, 'learning_rate': 8.289405684754522e-06, 'epoch': 87.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.31903076171875, 'eval_runtime': 60.5834, 'eval_samples_per_second': 10.085, 'eval_steps_per_second': 1.271, 'epoch': 88.0}
{'loss': 3.271, 'grad_norm': 3.046903371810913, 'learning_rate': 8.156330749354006e-06, 'epoch': 88.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.316058397293091, 'eval_runtime': 61.0765, 'eval_samples_per_second': 10.004, 'eval_steps_per_second': 1.261, 'epoch': 89.0}
{'loss': 3.2701, 'grad_norm': 3.8371806144714355, 'learning_rate': 8.023255813953488e-06, 'epoch': 89.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3028957843780518, 'eval_runtime': 61.968, 'eval_samples_per_second': 9.86, 'eval_steps_per_second': 1.243, 'epoch': 90.0}
{'loss': 3.2553, 'grad_norm': 3.6150002479553223, 'learning_rate': 7.890180878552972e-06, 'epoch': 90.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3001832962036133, 'eval_runtime': 62.3876, 'eval_samples_per_second': 9.794, 'eval_steps_per_second': 1.234, 'epoch': 91.0}
{'loss': 3.2547, 'grad_norm': 4.340101718902588, 'learning_rate': 7.757105943152455e-06, 'epoch': 91.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2995736598968506, 'eval_runtime': 63.509, 'eval_samples_per_second': 9.621, 'eval_steps_per_second': 1.212, 'epoch': 92.0}
{'loss': 3.2451, 'grad_norm': 3.6193795204162598, 'learning_rate': 7.624031007751939e-06, 'epoch': 92.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3042449951171875, 'eval_runtime': 60.609, 'eval_samples_per_second': 10.081, 'eval_steps_per_second': 1.27, 'epoch': 93.0}
{'loss': 3.2338, 'grad_norm': 3.738776683807373, 'learning_rate': 7.490956072351422e-06, 'epoch': 93.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3045849800109863, 'eval_runtime': 64.4305, 'eval_samples_per_second': 9.483, 'eval_steps_per_second': 1.195, 'epoch': 94.0}
{'loss': 3.2536, 'grad_norm': 4.279685974121094, 'learning_rate': 7.357881136950905e-06, 'epoch': 94.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.307154655456543, 'eval_runtime': 60.879, 'eval_samples_per_second': 10.036, 'eval_steps_per_second': 1.265, 'epoch': 95.0}
{'loss': 3.2299, 'grad_norm': 3.7244927883148193, 'learning_rate': 7.224806201550388e-06, 'epoch': 95.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2996485233306885, 'eval_runtime': 60.885, 'eval_samples_per_second': 10.035, 'eval_steps_per_second': 1.265, 'epoch': 96.0}
{'loss': 3.2158, 'grad_norm': 3.2696478366851807, 'learning_rate': 7.091731266149871e-06, 'epoch': 96.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.297679901123047, 'eval_runtime': 60.3765, 'eval_samples_per_second': 10.12, 'eval_steps_per_second': 1.275, 'epoch': 97.0}
{'loss': 3.2314, 'grad_norm': 3.5173869132995605, 'learning_rate': 6.9586563307493545e-06, 'epoch': 97.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2995176315307617, 'eval_runtime': 60.568, 'eval_samples_per_second': 10.088, 'eval_steps_per_second': 1.271, 'epoch': 98.0}
{'loss': 3.1914, 'grad_norm': 3.8466312885284424, 'learning_rate': 6.8255813953488385e-06, 'epoch': 98.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2928130626678467, 'eval_runtime': 61.3085, 'eval_samples_per_second': 9.966, 'eval_steps_per_second': 1.256, 'epoch': 99.0}
{'loss': 3.2055, 'grad_norm': 4.40037202835083, 'learning_rate': 6.692506459948322e-06, 'epoch': 99.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.289578676223755, 'eval_runtime': 60.6795, 'eval_samples_per_second': 10.069, 'eval_steps_per_second': 1.269, 'epoch': 100.0}
{'loss': 3.2005, 'grad_norm': 3.759474754333496, 'learning_rate': 6.559431524547805e-06, 'epoch': 100.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.290576934814453, 'eval_runtime': 60.3125, 'eval_samples_per_second': 10.131, 'eval_steps_per_second': 1.277, 'epoch': 101.0}
{'loss': 3.1986, 'grad_norm': 4.366962909698486, 'learning_rate': 6.426356589147288e-06, 'epoch': 101.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.282942771911621, 'eval_runtime': 60.385, 'eval_samples_per_second': 10.118, 'eval_steps_per_second': 1.275, 'epoch': 102.0}
{'loss': 3.1821, 'grad_norm': 3.1295454502105713, 'learning_rate': 6.29328165374677e-06, 'epoch': 102.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.291046142578125, 'eval_runtime': 60.3725, 'eval_samples_per_second': 10.121, 'eval_steps_per_second': 1.275, 'epoch': 103.0}
{'loss': 3.1817, 'grad_norm': 3.755035161972046, 'learning_rate': 6.1602067183462534e-06, 'epoch': 103.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.283850908279419, 'eval_runtime': 59.9575, 'eval_samples_per_second': 10.191, 'eval_steps_per_second': 1.284, 'epoch': 104.0}
{'loss': 3.1872, 'grad_norm': 3.699336051940918, 'learning_rate': 6.027131782945737e-06, 'epoch': 104.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2818663120269775, 'eval_runtime': 62.312, 'eval_samples_per_second': 9.805, 'eval_steps_per_second': 1.236, 'epoch': 105.0}
{'loss': 3.1802, 'grad_norm': 3.146758556365967, 'learning_rate': 5.89405684754522e-06, 'epoch': 105.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2764956951141357, 'eval_runtime': 61.306, 'eval_samples_per_second': 9.966, 'eval_steps_per_second': 1.256, 'epoch': 106.0}
{'loss': 3.1705, 'grad_norm': 3.8322837352752686, 'learning_rate': 5.760981912144703e-06, 'epoch': 106.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.270714044570923, 'eval_runtime': 61.2355, 'eval_samples_per_second': 9.978, 'eval_steps_per_second': 1.257, 'epoch': 107.0}
{'loss': 3.1755, 'grad_norm': 3.350661039352417, 'learning_rate': 5.627906976744186e-06, 'epoch': 107.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2782247066497803, 'eval_runtime': 65.038, 'eval_samples_per_second': 9.395, 'eval_steps_per_second': 1.184, 'epoch': 108.0}
{'loss': 3.1708, 'grad_norm': 3.178488254547119, 'learning_rate': 5.494832041343669e-06, 'epoch': 108.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2700090408325195, 'eval_runtime': 64.8925, 'eval_samples_per_second': 9.416, 'eval_steps_per_second': 1.187, 'epoch': 109.0}
{'loss': 3.1621, 'grad_norm': 4.086451530456543, 'learning_rate': 5.361757105943153e-06, 'epoch': 109.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2705047130584717, 'eval_runtime': 66.8005, 'eval_samples_per_second': 9.147, 'eval_steps_per_second': 1.153, 'epoch': 110.0}
{'loss': 3.1547, 'grad_norm': 3.2675321102142334, 'learning_rate': 5.228682170542636e-06, 'epoch': 110.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2686641216278076, 'eval_runtime': 65.4925, 'eval_samples_per_second': 9.329, 'eval_steps_per_second': 1.176, 'epoch': 111.0}
{'loss': 3.1467, 'grad_norm': 3.1292121410369873, 'learning_rate': 5.0956072351421195e-06, 'epoch': 111.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.263585090637207, 'eval_runtime': 66.1435, 'eval_samples_per_second': 9.237, 'eval_steps_per_second': 1.164, 'epoch': 112.0}
{'loss': 3.1458, 'grad_norm': 4.0308756828308105, 'learning_rate': 4.962532299741603e-06, 'epoch': 112.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2638518810272217, 'eval_runtime': 65.0715, 'eval_samples_per_second': 9.39, 'eval_steps_per_second': 1.183, 'epoch': 113.0}
{'loss': 3.1488, 'grad_norm': 3.8618197441101074, 'learning_rate': 4.829457364341086e-06, 'epoch': 113.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2571864128112793, 'eval_runtime': 65.1405, 'eval_samples_per_second': 9.38, 'eval_steps_per_second': 1.182, 'epoch': 114.0}
{'loss': 3.1526, 'grad_norm': 3.918487071990967, 'learning_rate': 4.696382428940569e-06, 'epoch': 114.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.263927459716797, 'eval_runtime': 65.9075, 'eval_samples_per_second': 9.271, 'eval_steps_per_second': 1.168, 'epoch': 115.0}
{'loss': 3.1267, 'grad_norm': 3.851020097732544, 'learning_rate': 4.563307493540052e-06, 'epoch': 115.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.267847776412964, 'eval_runtime': 89.5165, 'eval_samples_per_second': 6.826, 'eval_steps_per_second': 0.86, 'epoch': 116.0}
{'loss': 3.1478, 'grad_norm': 3.878858804702759, 'learning_rate': 4.430232558139535e-06, 'epoch': 116.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.263989210128784, 'eval_runtime': 85.7995, 'eval_samples_per_second': 7.121, 'eval_steps_per_second': 0.897, 'epoch': 117.0}
{'loss': 3.1301, 'grad_norm': 3.7207345962524414, 'learning_rate': 4.297157622739018e-06, 'epoch': 117.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2566657066345215, 'eval_runtime': 87.9585, 'eval_samples_per_second': 6.946, 'eval_steps_per_second': 0.875, 'epoch': 118.0}
{'loss': 3.1451, 'grad_norm': 4.2082953453063965, 'learning_rate': 4.1640826873385015e-06, 'epoch': 118.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.257394552230835, 'eval_runtime': 88.9044, 'eval_samples_per_second': 6.873, 'eval_steps_per_second': 0.866, 'epoch': 119.0}
{'loss': 3.1125, 'grad_norm': 5.024595737457275, 'learning_rate': 4.031007751937985e-06, 'epoch': 119.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2565150260925293, 'eval_runtime': 90.6485, 'eval_samples_per_second': 6.74, 'eval_steps_per_second': 0.849, 'epoch': 120.0}
{'loss': 3.12, 'grad_norm': 3.810997247695923, 'learning_rate': 3.897932816537468e-06, 'epoch': 120.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2560644149780273, 'eval_runtime': 66.5975, 'eval_samples_per_second': 9.175, 'eval_steps_per_second': 1.156, 'epoch': 121.0}
{'loss': 3.1215, 'grad_norm': 3.8186678886413574, 'learning_rate': 3.7648578811369514e-06, 'epoch': 121.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2567758560180664, 'eval_runtime': 65.4345, 'eval_samples_per_second': 9.338, 'eval_steps_per_second': 1.177, 'epoch': 122.0}
{'loss': 3.125, 'grad_norm': 3.759855270385742, 'learning_rate': 3.6317829457364346e-06, 'epoch': 122.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2560746669769287, 'eval_runtime': 64.9405, 'eval_samples_per_second': 9.409, 'eval_steps_per_second': 1.186, 'epoch': 123.0}
{'loss': 3.1124, 'grad_norm': 4.0962300300598145, 'learning_rate': 3.4987080103359177e-06, 'epoch': 123.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2558155059814453, 'eval_runtime': 64.3205, 'eval_samples_per_second': 9.499, 'eval_steps_per_second': 1.197, 'epoch': 124.0}
{'loss': 3.1145, 'grad_norm': 5.006035804748535, 'learning_rate': 3.365633074935401e-06, 'epoch': 124.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.25603985786438, 'eval_runtime': 62.7221, 'eval_samples_per_second': 9.741, 'eval_steps_per_second': 1.228, 'epoch': 125.0}
{'loss': 3.1102, 'grad_norm': 4.810570240020752, 'learning_rate': 3.2325581395348836e-06, 'epoch': 125.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2528128623962402, 'eval_runtime': 65.349, 'eval_samples_per_second': 9.35, 'eval_steps_per_second': 1.178, 'epoch': 126.0}
{'loss': 3.102, 'grad_norm': 3.922431468963623, 'learning_rate': 3.099483204134367e-06, 'epoch': 126.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.247735023498535, 'eval_runtime': 64.6525, 'eval_samples_per_second': 9.451, 'eval_steps_per_second': 1.191, 'epoch': 127.0}
{'loss': 3.1099, 'grad_norm': 4.706374168395996, 'learning_rate': 2.9664082687338503e-06, 'epoch': 127.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.25039005279541, 'eval_runtime': 64.099, 'eval_samples_per_second': 9.532, 'eval_steps_per_second': 1.201, 'epoch': 128.0}
{'loss': 3.1206, 'grad_norm': 5.751654148101807, 'learning_rate': 2.8333333333333335e-06, 'epoch': 128.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.247791290283203, 'eval_runtime': 64.742, 'eval_samples_per_second': 9.437, 'eval_steps_per_second': 1.189, 'epoch': 129.0}
{'loss': 3.1043, 'grad_norm': 4.224480628967285, 'learning_rate': 2.7002583979328166e-06, 'epoch': 129.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2470345497131348, 'eval_runtime': 66.672, 'eval_samples_per_second': 9.164, 'eval_steps_per_second': 1.155, 'epoch': 130.0}
{'loss': 3.0991, 'grad_norm': 4.370205879211426, 'learning_rate': 2.5671834625323e-06, 'epoch': 130.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.248889684677124, 'eval_runtime': 67.318, 'eval_samples_per_second': 9.076, 'eval_steps_per_second': 1.144, 'epoch': 131.0}
{'loss': 3.1055, 'grad_norm': 3.950213670730591, 'learning_rate': 2.4341085271317833e-06, 'epoch': 131.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2468068599700928, 'eval_runtime': 66.71, 'eval_samples_per_second': 9.159, 'eval_steps_per_second': 1.154, 'epoch': 132.0}
{'loss': 3.105, 'grad_norm': 4.23231315612793, 'learning_rate': 2.301033591731266e-06, 'epoch': 132.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2470591068267822, 'eval_runtime': 66.9095, 'eval_samples_per_second': 9.132, 'eval_steps_per_second': 1.151, 'epoch': 133.0}
{'loss': 3.091, 'grad_norm': 4.045052528381348, 'learning_rate': 2.1679586563307496e-06, 'epoch': 133.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2469699382781982, 'eval_runtime': 65.9905, 'eval_samples_per_second': 9.259, 'eval_steps_per_second': 1.167, 'epoch': 134.0}
{'loss': 3.0841, 'grad_norm': 3.148669958114624, 'learning_rate': 2.0348837209302328e-06, 'epoch': 134.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.244521379470825, 'eval_runtime': 67.654, 'eval_samples_per_second': 9.031, 'eval_steps_per_second': 1.138, 'epoch': 135.0}
{'loss': 3.1023, 'grad_norm': 3.4120984077453613, 'learning_rate': 1.901808785529716e-06, 'epoch': 135.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.246352195739746, 'eval_runtime': 67.044, 'eval_samples_per_second': 9.113, 'eval_steps_per_second': 1.148, 'epoch': 136.0}
{'loss': 3.0953, 'grad_norm': 3.1404738426208496, 'learning_rate': 1.7687338501291993e-06, 'epoch': 136.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2501657009124756, 'eval_runtime': 68.2755, 'eval_samples_per_second': 8.949, 'eval_steps_per_second': 1.128, 'epoch': 137.0}
{'loss': 3.0773, 'grad_norm': 3.6745986938476562, 'learning_rate': 1.6356589147286822e-06, 'epoch': 137.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2449469566345215, 'eval_runtime': 68.885, 'eval_samples_per_second': 8.87, 'eval_steps_per_second': 1.118, 'epoch': 138.0}
{'loss': 3.0837, 'grad_norm': 3.894726276397705, 'learning_rate': 1.5025839793281654e-06, 'epoch': 138.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2416939735412598, 'eval_runtime': 67.7835, 'eval_samples_per_second': 9.014, 'eval_steps_per_second': 1.136, 'epoch': 139.0}
{'loss': 3.0954, 'grad_norm': 4.014163970947266, 'learning_rate': 1.3695090439276487e-06, 'epoch': 139.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2476189136505127, 'eval_runtime': 68.5135, 'eval_samples_per_second': 8.918, 'eval_steps_per_second': 1.124, 'epoch': 140.0}
{'loss': 3.0844, 'grad_norm': 3.433940887451172, 'learning_rate': 1.2364341085271319e-06, 'epoch': 140.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.24550199508667, 'eval_runtime': 61.747, 'eval_samples_per_second': 9.895, 'eval_steps_per_second': 1.247, 'epoch': 141.0}
{'loss': 3.0956, 'grad_norm': 3.5562212467193604, 'learning_rate': 1.103359173126615e-06, 'epoch': 141.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2420926094055176, 'eval_runtime': 62.729, 'eval_samples_per_second': 9.74, 'eval_steps_per_second': 1.228, 'epoch': 142.0}
{'loss': 3.0836, 'grad_norm': 4.044327259063721, 'learning_rate': 9.702842377260984e-07, 'epoch': 142.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2444756031036377, 'eval_runtime': 61.8975, 'eval_samples_per_second': 9.871, 'eval_steps_per_second': 1.244, 'epoch': 143.0}
{'loss': 3.0829, 'grad_norm': 4.522704124450684, 'learning_rate': 8.372093023255814e-07, 'epoch': 143.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2436728477478027, 'eval_runtime': 63.7845, 'eval_samples_per_second': 9.579, 'eval_steps_per_second': 1.207, 'epoch': 144.0}
{'loss': 3.0652, 'grad_norm': 3.5477445125579834, 'learning_rate': 7.041343669250647e-07, 'epoch': 144.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.242199420928955, 'eval_runtime': 63.194, 'eval_samples_per_second': 9.669, 'eval_steps_per_second': 1.218, 'epoch': 145.0}
{'loss': 3.0894, 'grad_norm': 3.682769298553467, 'learning_rate': 5.710594315245478e-07, 'epoch': 145.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.241936683654785, 'eval_runtime': 62.8875, 'eval_samples_per_second': 9.716, 'eval_steps_per_second': 1.224, 'epoch': 146.0}
{'loss': 3.0892, 'grad_norm': 4.130073547363281, 'learning_rate': 4.3798449612403105e-07, 'epoch': 146.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.244319438934326, 'eval_runtime': 63.3535, 'eval_samples_per_second': 9.644, 'eval_steps_per_second': 1.215, 'epoch': 147.0}
{'loss': 3.0726, 'grad_norm': 3.161574363708496, 'learning_rate': 3.049095607235142e-07, 'epoch': 147.71}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2437126636505127, 'eval_runtime': 62.5865, 'eval_samples_per_second': 9.762, 'eval_steps_per_second': 1.23, 'epoch': 148.0}
{'loss': 3.0879, 'grad_norm': 4.166016101837158, 'learning_rate': 1.7183462532299745e-07, 'epoch': 148.71}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2439818382263184, 'eval_runtime': 62.1245, 'eval_samples_per_second': 9.835, 'eval_steps_per_second': 1.239, 'epoch': 149.0}
{'loss': 3.084, 'grad_norm': 3.3931524753570557, 'learning_rate': 3.875968992248062e-08, 'epoch': 149.71}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.243633985519409, 'eval_runtime': 62.2865, 'eval_samples_per_second': 9.81, 'eval_steps_per_second': 1.236, 'epoch': 150.0}
{'train_runtime': 453132.029, 'train_samples_per_second': 1.366, 'train_steps_per_second': 0.171, 'train_loss': 3.8123453764213147, 'epoch': 150.0}


TrainOutput(global_step=77400, training_loss=3.8123453764213147, metrics={'train_runtime': 453132.029, 'train_samples_per_second': 1.366, 'train_steps_per_second': 0.171, 'total_flos': 5346095180206080.0, 'train_loss': 3.8123453764213147, 'epoch': 150.0})

In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MT5ForConditionalGeneration

# Define constants and model
BATCH_SIZE = 8 # Increased batch size to reduce iterations

# Assuming tokenized_dataset and tokenizer are already defined
# Initialize the model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

logging_steps = len(tokenized_datasets["train"]) // BATCH_SIZE
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=150,  # Reduced number of epochs to 50
    predict_with_generate=True,
    fp16=True,  # Enabled mixed precision training
    logging_steps=logging_steps,
    report_to="none"
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()



  0%|          | 0/77400 [00:00<?, ?it/s]

{'loss': 15.4839, 'grad_norm': 562.9039916992188, 'learning_rate': 4.966731266149871e-05, 'epoch': 1.0}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 7.835846424102783, 'eval_runtime': 78.0871, 'eval_samples_per_second': 7.825, 'eval_steps_per_second': 0.986, 'epoch': 1.0}
{'loss': 7.9222, 'grad_norm': 10.299392700195312, 'learning_rate': 4.9334625322997416e-05, 'epoch': 2.0}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 5.7639617919921875, 'eval_runtime': 64.8473, 'eval_samples_per_second': 9.422, 'eval_steps_per_second': 1.187, 'epoch': 2.0}
{'loss': 6.1067, 'grad_norm': 4.144545078277588, 'learning_rate': 4.9001937984496126e-05, 'epoch': 2.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.787199020385742, 'eval_runtime': 65.2933, 'eval_samples_per_second': 9.358, 'eval_steps_per_second': 1.179, 'epoch': 3.0}
{'loss': 5.3305, 'grad_norm': 3.2596793174743652, 'learning_rate': 4.8669250645994836e-05, 'epoch': 3.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.4214043617248535, 'eval_runtime': 65.118, 'eval_samples_per_second': 9.383, 'eval_steps_per_second': 1.182, 'epoch': 4.0}
{'loss': 4.9791, 'grad_norm': 2.8998146057128906, 'learning_rate': 4.8336563307493546e-05, 'epoch': 4.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.237486839294434, 'eval_runtime': 67.6078, 'eval_samples_per_second': 9.037, 'eval_steps_per_second': 1.139, 'epoch': 5.0}
{'loss': 4.7482, 'grad_norm': 2.684558868408203, 'learning_rate': 4.800387596899225e-05, 'epoch': 5.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 4.1001081466674805, 'eval_runtime': 58.2299, 'eval_samples_per_second': 10.493, 'eval_steps_per_second': 1.322, 'epoch': 6.0}
{'loss': 4.5857, 'grad_norm': 2.6207809448242188, 'learning_rate': 4.767118863049096e-05, 'epoch': 6.99}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.999142646789551, 'eval_runtime': 63.3654, 'eval_samples_per_second': 9.642, 'eval_steps_per_second': 1.215, 'epoch': 7.0}
{'loss': 4.4533, 'grad_norm': 2.989001512527466, 'learning_rate': 4.733850129198967e-05, 'epoch': 7.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.9220173358917236, 'eval_runtime': 232.7314, 'eval_samples_per_second': 2.625, 'eval_steps_per_second': 0.331, 'epoch': 8.0}
{'loss': 4.3331, 'grad_norm': 2.971904993057251, 'learning_rate': 4.700581395348838e-05, 'epoch': 8.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.8505518436431885, 'eval_runtime': 238.4953, 'eval_samples_per_second': 2.562, 'eval_steps_per_second': 0.323, 'epoch': 9.0}
{'loss': 4.2435, 'grad_norm': 2.76456618309021, 'learning_rate': 4.667312661498708e-05, 'epoch': 9.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.7964508533477783, 'eval_runtime': 231.731, 'eval_samples_per_second': 2.637, 'eval_steps_per_second': 0.332, 'epoch': 10.0}
{'loss': 4.1455, 'grad_norm': 2.555974245071411, 'learning_rate': 4.634043927648579e-05, 'epoch': 10.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.7446060180664062, 'eval_runtime': 237.9412, 'eval_samples_per_second': 2.568, 'eval_steps_per_second': 0.324, 'epoch': 11.0}
{'loss': 4.0765, 'grad_norm': 2.6702260971069336, 'learning_rate': 4.60077519379845e-05, 'epoch': 11.98}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.702623128890991, 'eval_runtime': 65.7345, 'eval_samples_per_second': 9.295, 'eval_steps_per_second': 1.171, 'epoch': 12.0}
{'loss': 3.9919, 'grad_norm': 2.279003381729126, 'learning_rate': 4.5675064599483205e-05, 'epoch': 12.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.657169818878174, 'eval_runtime': 65.6237, 'eval_samples_per_second': 9.311, 'eval_steps_per_second': 1.173, 'epoch': 13.0}
{'loss': 3.9356, 'grad_norm': 2.9988787174224854, 'learning_rate': 4.5342377260981915e-05, 'epoch': 13.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.6210989952087402, 'eval_runtime': 65.5307, 'eval_samples_per_second': 9.324, 'eval_steps_per_second': 1.175, 'epoch': 14.0}
{'loss': 3.8723, 'grad_norm': 2.7964916229248047, 'learning_rate': 4.5009689922480625e-05, 'epoch': 14.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.5873324871063232, 'eval_runtime': 65.9863, 'eval_samples_per_second': 9.26, 'eval_steps_per_second': 1.167, 'epoch': 15.0}
{'loss': 3.8101, 'grad_norm': 2.825888156890869, 'learning_rate': 4.4677002583979335e-05, 'epoch': 15.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.5563125610351562, 'eval_runtime': 65.6525, 'eval_samples_per_second': 9.307, 'eval_steps_per_second': 1.173, 'epoch': 16.0}
{'loss': 3.7739, 'grad_norm': 3.124220848083496, 'learning_rate': 4.434431524547804e-05, 'epoch': 16.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.5278725624084473, 'eval_runtime': 85.7145, 'eval_samples_per_second': 7.128, 'eval_steps_per_second': 0.898, 'epoch': 17.0}
{'loss': 3.7127, 'grad_norm': 3.0442938804626465, 'learning_rate': 4.401162790697675e-05, 'epoch': 17.97}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4994914531707764, 'eval_runtime': 81.4844, 'eval_samples_per_second': 7.498, 'eval_steps_per_second': 0.945, 'epoch': 18.0}
{'loss': 3.6594, 'grad_norm': 2.5219743251800537, 'learning_rate': 4.367894056847545e-05, 'epoch': 18.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4759232997894287, 'eval_runtime': 82.0569, 'eval_samples_per_second': 7.446, 'eval_steps_per_second': 0.938, 'epoch': 19.0}
{'loss': 3.6062, 'grad_norm': 3.0997722148895264, 'learning_rate': 4.334625322997416e-05, 'epoch': 19.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4606306552886963, 'eval_runtime': 85.7543, 'eval_samples_per_second': 7.125, 'eval_steps_per_second': 0.898, 'epoch': 20.0}
{'loss': 3.575, 'grad_norm': 2.591853618621826, 'learning_rate': 4.3013565891472865e-05, 'epoch': 20.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.442619800567627, 'eval_runtime': 84.0263, 'eval_samples_per_second': 7.272, 'eval_steps_per_second': 0.916, 'epoch': 21.0}
{'loss': 3.5389, 'grad_norm': 2.450822591781616, 'learning_rate': 4.2680878552971575e-05, 'epoch': 21.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.4179513454437256, 'eval_runtime': 84.9449, 'eval_samples_per_second': 7.193, 'eval_steps_per_second': 0.906, 'epoch': 22.0}
{'loss': 3.4826, 'grad_norm': 3.88551926612854, 'learning_rate': 4.2348191214470285e-05, 'epoch': 22.96}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.398402690887451, 'eval_runtime': 88.757, 'eval_samples_per_second': 6.884, 'eval_steps_per_second': 0.868, 'epoch': 23.0}
{'loss': 3.4645, 'grad_norm': 2.9635772705078125, 'learning_rate': 4.2015503875968995e-05, 'epoch': 23.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.381269931793213, 'eval_runtime': 88.0857, 'eval_samples_per_second': 6.936, 'eval_steps_per_second': 0.874, 'epoch': 24.0}
{'loss': 3.4172, 'grad_norm': 3.5208749771118164, 'learning_rate': 4.16828165374677e-05, 'epoch': 24.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3504092693328857, 'eval_runtime': 96.2384, 'eval_samples_per_second': 6.349, 'eval_steps_per_second': 0.8, 'epoch': 25.0}
{'loss': 3.3812, 'grad_norm': 3.2860782146453857, 'learning_rate': 4.135012919896641e-05, 'epoch': 25.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.350475788116455, 'eval_runtime': 86.188, 'eval_samples_per_second': 7.089, 'eval_steps_per_second': 0.893, 'epoch': 26.0}
{'loss': 3.3458, 'grad_norm': 2.8909199237823486, 'learning_rate': 4.101744186046512e-05, 'epoch': 26.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3356173038482666, 'eval_runtime': 87.806, 'eval_samples_per_second': 6.959, 'eval_steps_per_second': 0.877, 'epoch': 27.0}
{'loss': 3.3218, 'grad_norm': 2.7290942668914795, 'learning_rate': 4.068475452196383e-05, 'epoch': 27.95}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.3139452934265137, 'eval_runtime': 87.6533, 'eval_samples_per_second': 6.971, 'eval_steps_per_second': 0.878, 'epoch': 28.0}
{'loss': 3.2836, 'grad_norm': 3.036971092224121, 'learning_rate': 4.035206718346253e-05, 'epoch': 28.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.311338186264038, 'eval_runtime': 99.8521, 'eval_samples_per_second': 6.119, 'eval_steps_per_second': 0.771, 'epoch': 29.0}
{'loss': 3.2639, 'grad_norm': 2.7879843711853027, 'learning_rate': 4.001937984496124e-05, 'epoch': 29.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.304962635040283, 'eval_runtime': 93.3253, 'eval_samples_per_second': 6.547, 'eval_steps_per_second': 0.825, 'epoch': 30.0}
{'loss': 3.2161, 'grad_norm': 3.258340835571289, 'learning_rate': 3.968669250645995e-05, 'epoch': 30.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2800087928771973, 'eval_runtime': 84.3897, 'eval_samples_per_second': 7.24, 'eval_steps_per_second': 0.912, 'epoch': 31.0}
{'loss': 3.1954, 'grad_norm': 3.993616819381714, 'learning_rate': 3.9354005167958654e-05, 'epoch': 31.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2728874683380127, 'eval_runtime': 83.6679, 'eval_samples_per_second': 7.303, 'eval_steps_per_second': 0.92, 'epoch': 32.0}
{'loss': 3.1764, 'grad_norm': 2.845658302307129, 'learning_rate': 3.9021317829457364e-05, 'epoch': 32.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.264864921569824, 'eval_runtime': 65.7291, 'eval_samples_per_second': 9.296, 'eval_steps_per_second': 1.171, 'epoch': 33.0}
{'loss': 3.1501, 'grad_norm': 3.4014699459075928, 'learning_rate': 3.8688630490956074e-05, 'epoch': 33.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2448644638061523, 'eval_runtime': 65.6272, 'eval_samples_per_second': 9.31, 'eval_steps_per_second': 1.173, 'epoch': 34.0}
{'loss': 3.1151, 'grad_norm': 3.512554168701172, 'learning_rate': 3.8355943152454784e-05, 'epoch': 34.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2267332077026367, 'eval_runtime': 63.0006, 'eval_samples_per_second': 9.698, 'eval_steps_per_second': 1.222, 'epoch': 35.0}
{'loss': 3.0965, 'grad_norm': 3.045501470565796, 'learning_rate': 3.802325581395349e-05, 'epoch': 35.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2337706089019775, 'eval_runtime': 59.9214, 'eval_samples_per_second': 10.197, 'eval_steps_per_second': 1.285, 'epoch': 36.0}
{'loss': 3.0744, 'grad_norm': 3.261730194091797, 'learning_rate': 3.76905684754522e-05, 'epoch': 36.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2317087650299072, 'eval_runtime': 61.131, 'eval_samples_per_second': 9.995, 'eval_steps_per_second': 1.26, 'epoch': 37.0}
{'loss': 3.0567, 'grad_norm': 3.37795090675354, 'learning_rate': 3.735788113695091e-05, 'epoch': 37.93}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.21884822845459, 'eval_runtime': 59.7523, 'eval_samples_per_second': 10.226, 'eval_steps_per_second': 1.289, 'epoch': 38.0}
{'loss': 3.0241, 'grad_norm': 3.1085259914398193, 'learning_rate': 3.702519379844962e-05, 'epoch': 38.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.202686309814453, 'eval_runtime': 65.1488, 'eval_samples_per_second': 9.379, 'eval_steps_per_second': 1.182, 'epoch': 39.0}
{'loss': 3.0157, 'grad_norm': 3.6604514122009277, 'learning_rate': 3.669250645994832e-05, 'epoch': 39.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.2073018550872803, 'eval_runtime': 66.1494, 'eval_samples_per_second': 9.237, 'eval_steps_per_second': 1.164, 'epoch': 40.0}
{'loss': 2.9787, 'grad_norm': 3.325516700744629, 'learning_rate': 3.635981912144703e-05, 'epoch': 40.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.193779945373535, 'eval_runtime': 66.0195, 'eval_samples_per_second': 9.255, 'eval_steps_per_second': 1.166, 'epoch': 41.0}
{'loss': 2.9676, 'grad_norm': 3.033082962036133, 'learning_rate': 3.602713178294574e-05, 'epoch': 41.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1851658821105957, 'eval_runtime': 65.9115, 'eval_samples_per_second': 9.27, 'eval_steps_per_second': 1.168, 'epoch': 42.0}
{'loss': 2.9331, 'grad_norm': 4.1674580574035645, 'learning_rate': 3.5694444444444444e-05, 'epoch': 42.92}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1766395568847656, 'eval_runtime': 65.7666, 'eval_samples_per_second': 9.29, 'eval_steps_per_second': 1.171, 'epoch': 43.0}
{'loss': 2.9231, 'grad_norm': 3.28200101852417, 'learning_rate': 3.5361757105943154e-05, 'epoch': 43.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.176231861114502, 'eval_runtime': 65.8404, 'eval_samples_per_second': 9.28, 'eval_steps_per_second': 1.169, 'epoch': 44.0}
{'loss': 2.8967, 'grad_norm': 3.6172382831573486, 'learning_rate': 3.5029069767441864e-05, 'epoch': 44.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.161860227584839, 'eval_runtime': 66.2318, 'eval_samples_per_second': 9.225, 'eval_steps_per_second': 1.163, 'epoch': 45.0}
{'loss': 2.8844, 'grad_norm': 4.251365661621094, 'learning_rate': 3.4696382428940574e-05, 'epoch': 45.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1606805324554443, 'eval_runtime': 89.3578, 'eval_samples_per_second': 6.838, 'eval_steps_per_second': 0.862, 'epoch': 46.0}
{'loss': 2.8583, 'grad_norm': 3.6268019676208496, 'learning_rate': 3.436369509043928e-05, 'epoch': 46.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.162576913833618, 'eval_runtime': 94.4107, 'eval_samples_per_second': 6.472, 'eval_steps_per_second': 0.816, 'epoch': 47.0}
{'loss': 2.833, 'grad_norm': 3.8149774074554443, 'learning_rate': 3.403100775193799e-05, 'epoch': 47.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.155965566635132, 'eval_runtime': 88.0264, 'eval_samples_per_second': 6.941, 'eval_steps_per_second': 0.875, 'epoch': 48.0}
{'loss': 2.8314, 'grad_norm': 3.9580631256103516, 'learning_rate': 3.36983204134367e-05, 'epoch': 48.91}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1492490768432617, 'eval_runtime': 86.9355, 'eval_samples_per_second': 7.028, 'eval_steps_per_second': 0.886, 'epoch': 49.0}
{'loss': 2.8115, 'grad_norm': 3.742579221725464, 'learning_rate': 3.336563307493541e-05, 'epoch': 49.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.154707431793213, 'eval_runtime': 456.3269, 'eval_samples_per_second': 1.339, 'eval_steps_per_second': 0.169, 'epoch': 50.0}
{'loss': 2.7823, 'grad_norm': 2.973374605178833, 'learning_rate': 3.303294573643411e-05, 'epoch': 50.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1350793838500977, 'eval_runtime': 232.6213, 'eval_samples_per_second': 2.627, 'eval_steps_per_second': 0.331, 'epoch': 51.0}
{'loss': 2.7675, 'grad_norm': 3.130192995071411, 'learning_rate': 3.270025839793282e-05, 'epoch': 51.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.133108139038086, 'eval_runtime': 207.8656, 'eval_samples_per_second': 2.939, 'eval_steps_per_second': 0.37, 'epoch': 52.0}
{'loss': 2.7624, 'grad_norm': 4.656269073486328, 'learning_rate': 3.236757105943153e-05, 'epoch': 52.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1375765800476074, 'eval_runtime': 62.2203, 'eval_samples_per_second': 9.82, 'eval_steps_per_second': 1.238, 'epoch': 53.0}
{'loss': 2.7387, 'grad_norm': 4.228492259979248, 'learning_rate': 3.2034883720930234e-05, 'epoch': 53.9}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.134326219558716, 'eval_runtime': 66.1354, 'eval_samples_per_second': 9.239, 'eval_steps_per_second': 1.164, 'epoch': 54.0}
{'loss': 2.7248, 'grad_norm': 3.4419779777526855, 'learning_rate': 3.1702196382428944e-05, 'epoch': 54.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.132995128631592, 'eval_runtime': 67.0886, 'eval_samples_per_second': 9.107, 'eval_steps_per_second': 1.148, 'epoch': 55.0}
{'loss': 2.7075, 'grad_norm': 3.5307276248931885, 'learning_rate': 3.136950904392765e-05, 'epoch': 55.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1172266006469727, 'eval_runtime': 65.8895, 'eval_samples_per_second': 9.273, 'eval_steps_per_second': 1.169, 'epoch': 56.0}
{'loss': 2.696, 'grad_norm': 4.434866428375244, 'learning_rate': 3.103682170542636e-05, 'epoch': 56.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1158182621002197, 'eval_runtime': 66.0061, 'eval_samples_per_second': 9.257, 'eval_steps_per_second': 1.167, 'epoch': 57.0}
{'loss': 2.6681, 'grad_norm': 3.9239211082458496, 'learning_rate': 3.070413436692507e-05, 'epoch': 57.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1205132007598877, 'eval_runtime': 66.0973, 'eval_samples_per_second': 9.244, 'eval_steps_per_second': 1.165, 'epoch': 58.0}
{'loss': 2.6651, 'grad_norm': 3.328791856765747, 'learning_rate': 3.0371447028423773e-05, 'epoch': 58.89}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1144649982452393, 'eval_runtime': 66.5442, 'eval_samples_per_second': 9.182, 'eval_steps_per_second': 1.157, 'epoch': 59.0}
{'loss': 2.6507, 'grad_norm': 3.8693065643310547, 'learning_rate': 3.003875968992248e-05, 'epoch': 59.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.108001232147217, 'eval_runtime': 70.5997, 'eval_samples_per_second': 8.654, 'eval_steps_per_second': 1.091, 'epoch': 60.0}
{'loss': 2.6289, 'grad_norm': 4.288758754730225, 'learning_rate': 2.9706072351421187e-05, 'epoch': 60.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1146762371063232, 'eval_runtime': 65.945, 'eval_samples_per_second': 9.265, 'eval_steps_per_second': 1.168, 'epoch': 61.0}
{'loss': 2.609, 'grad_norm': 3.5702764987945557, 'learning_rate': 2.9373385012919897e-05, 'epoch': 61.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0998260974884033, 'eval_runtime': 63.7913, 'eval_samples_per_second': 9.578, 'eval_steps_per_second': 1.207, 'epoch': 62.0}
{'loss': 2.6088, 'grad_norm': 3.790672779083252, 'learning_rate': 2.9040697674418603e-05, 'epoch': 62.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1168124675750732, 'eval_runtime': 65.9014, 'eval_samples_per_second': 9.271, 'eval_steps_per_second': 1.168, 'epoch': 63.0}
{'loss': 2.581, 'grad_norm': 3.5201926231384277, 'learning_rate': 2.8708010335917313e-05, 'epoch': 63.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1027190685272217, 'eval_runtime': 65.2805, 'eval_samples_per_second': 9.36, 'eval_steps_per_second': 1.18, 'epoch': 64.0}
{'loss': 2.5779, 'grad_norm': 3.4553487300872803, 'learning_rate': 2.837532299741602e-05, 'epoch': 64.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.114215612411499, 'eval_runtime': 65.3019, 'eval_samples_per_second': 9.357, 'eval_steps_per_second': 1.179, 'epoch': 65.0}
{'loss': 2.5515, 'grad_norm': 3.2166268825531006, 'learning_rate': 2.804263565891473e-05, 'epoch': 65.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.101544141769409, 'eval_runtime': 66.5604, 'eval_samples_per_second': 9.18, 'eval_steps_per_second': 1.157, 'epoch': 66.0}
{'loss': 2.5411, 'grad_norm': 3.964566707611084, 'learning_rate': 2.7709948320413436e-05, 'epoch': 66.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1019747257232666, 'eval_runtime': 66.1421, 'eval_samples_per_second': 9.238, 'eval_steps_per_second': 1.164, 'epoch': 67.0}
{'loss': 2.5351, 'grad_norm': 4.167932987213135, 'learning_rate': 2.7377260981912146e-05, 'epoch': 67.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.098146438598633, 'eval_runtime': 66.0964, 'eval_samples_per_second': 9.244, 'eval_steps_per_second': 1.165, 'epoch': 68.0}
{'loss': 2.5062, 'grad_norm': 4.017559051513672, 'learning_rate': 2.7044573643410853e-05, 'epoch': 68.87}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.085899591445923, 'eval_runtime': 66.2238, 'eval_samples_per_second': 9.226, 'eval_steps_per_second': 1.163, 'epoch': 69.0}
{'loss': 2.5159, 'grad_norm': 3.2226510047912598, 'learning_rate': 2.6711886304909563e-05, 'epoch': 69.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0983622074127197, 'eval_runtime': 67.4025, 'eval_samples_per_second': 9.065, 'eval_steps_per_second': 1.142, 'epoch': 70.0}
{'loss': 2.4973, 'grad_norm': 3.635690927505493, 'learning_rate': 2.637919896640827e-05, 'epoch': 70.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.114682197570801, 'eval_runtime': 65.8185, 'eval_samples_per_second': 9.283, 'eval_steps_per_second': 1.17, 'epoch': 71.0}
{'loss': 2.4786, 'grad_norm': 3.111114740371704, 'learning_rate': 2.6046511627906976e-05, 'epoch': 71.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.088705062866211, 'eval_runtime': 65.6004, 'eval_samples_per_second': 9.314, 'eval_steps_per_second': 1.174, 'epoch': 72.0}
{'loss': 2.4796, 'grad_norm': 4.493267059326172, 'learning_rate': 2.5713824289405686e-05, 'epoch': 72.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1059629917144775, 'eval_runtime': 176.2649, 'eval_samples_per_second': 3.466, 'eval_steps_per_second': 0.437, 'epoch': 73.0}
{'loss': 2.4632, 'grad_norm': 3.397878408432007, 'learning_rate': 2.5381136950904393e-05, 'epoch': 73.86}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0914382934570312, 'eval_runtime': 106.9916, 'eval_samples_per_second': 5.711, 'eval_steps_per_second': 0.72, 'epoch': 74.0}
{'loss': 2.4448, 'grad_norm': 3.172910690307617, 'learning_rate': 2.5048449612403103e-05, 'epoch': 74.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1000382900238037, 'eval_runtime': 87.4871, 'eval_samples_per_second': 6.984, 'eval_steps_per_second': 0.88, 'epoch': 75.0}
{'loss': 2.4343, 'grad_norm': 4.330005645751953, 'learning_rate': 2.471576227390181e-05, 'epoch': 75.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1080868244171143, 'eval_runtime': 91.7566, 'eval_samples_per_second': 6.659, 'eval_steps_per_second': 0.839, 'epoch': 76.0}
{'loss': 2.4309, 'grad_norm': 3.4265458583831787, 'learning_rate': 2.438307493540052e-05, 'epoch': 76.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.105656385421753, 'eval_runtime': 88.6418, 'eval_samples_per_second': 6.893, 'eval_steps_per_second': 0.869, 'epoch': 77.0}
{'loss': 2.4038, 'grad_norm': 3.987464189529419, 'learning_rate': 2.4050387596899226e-05, 'epoch': 77.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.084239959716797, 'eval_runtime': 86.6568, 'eval_samples_per_second': 7.051, 'eval_steps_per_second': 0.889, 'epoch': 78.0}
{'loss': 2.4215, 'grad_norm': 3.7605950832366943, 'learning_rate': 2.3717700258397936e-05, 'epoch': 78.85}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0895543098449707, 'eval_runtime': 85.4295, 'eval_samples_per_second': 7.152, 'eval_steps_per_second': 0.901, 'epoch': 79.0}
{'loss': 2.3914, 'grad_norm': 3.350142002105713, 'learning_rate': 2.3385012919896642e-05, 'epoch': 79.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.099492073059082, 'eval_runtime': 68.7395, 'eval_samples_per_second': 8.889, 'eval_steps_per_second': 1.12, 'epoch': 80.0}
{'loss': 2.3782, 'grad_norm': 3.6777029037475586, 'learning_rate': 2.3052325581395352e-05, 'epoch': 80.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.090599536895752, 'eval_runtime': 79.6732, 'eval_samples_per_second': 7.669, 'eval_steps_per_second': 0.966, 'epoch': 81.0}
{'loss': 2.3775, 'grad_norm': 3.7364680767059326, 'learning_rate': 2.271963824289406e-05, 'epoch': 81.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.084036350250244, 'eval_runtime': 224.6055, 'eval_samples_per_second': 2.72, 'eval_steps_per_second': 0.343, 'epoch': 82.0}
{'loss': 2.3588, 'grad_norm': 3.742983818054199, 'learning_rate': 2.2386950904392766e-05, 'epoch': 82.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0884621143341064, 'eval_runtime': 241.2637, 'eval_samples_per_second': 2.532, 'eval_steps_per_second': 0.319, 'epoch': 83.0}
{'loss': 2.3655, 'grad_norm': 4.090692043304443, 'learning_rate': 2.2054263565891472e-05, 'epoch': 83.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.097700357437134, 'eval_runtime': 240.4701, 'eval_samples_per_second': 2.541, 'eval_steps_per_second': 0.32, 'epoch': 84.0}
{'loss': 2.3412, 'grad_norm': 3.2891199588775635, 'learning_rate': 2.1721576227390182e-05, 'epoch': 84.84}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.089613199234009, 'eval_runtime': 242.5759, 'eval_samples_per_second': 2.519, 'eval_steps_per_second': 0.317, 'epoch': 85.0}
{'loss': 2.3384, 'grad_norm': 3.989464044570923, 'learning_rate': 2.138888888888889e-05, 'epoch': 85.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.081033229827881, 'eval_runtime': 232.7472, 'eval_samples_per_second': 2.625, 'eval_steps_per_second': 0.331, 'epoch': 86.0}
{'loss': 2.3169, 'grad_norm': 3.588170051574707, 'learning_rate': 2.1056201550387596e-05, 'epoch': 86.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.096750020980835, 'eval_runtime': 62.1673, 'eval_samples_per_second': 9.828, 'eval_steps_per_second': 1.239, 'epoch': 87.0}
{'loss': 2.3252, 'grad_norm': 4.315685272216797, 'learning_rate': 2.0723514211886305e-05, 'epoch': 87.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0970616340637207, 'eval_runtime': 69.1779, 'eval_samples_per_second': 8.832, 'eval_steps_per_second': 1.113, 'epoch': 88.0}
{'loss': 2.3048, 'grad_norm': 3.3722357749938965, 'learning_rate': 2.0390826873385012e-05, 'epoch': 88.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0965981483459473, 'eval_runtime': 72.1135, 'eval_samples_per_second': 8.473, 'eval_steps_per_second': 1.068, 'epoch': 89.0}
{'loss': 2.307, 'grad_norm': 3.996382713317871, 'learning_rate': 2.0058139534883722e-05, 'epoch': 89.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.097105026245117, 'eval_runtime': 108.236, 'eval_samples_per_second': 5.645, 'eval_steps_per_second': 0.711, 'epoch': 90.0}
{'loss': 2.2792, 'grad_norm': 4.256079196929932, 'learning_rate': 1.972545219638243e-05, 'epoch': 90.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0811591148376465, 'eval_runtime': 117.8331, 'eval_samples_per_second': 5.185, 'eval_steps_per_second': 0.653, 'epoch': 91.0}
{'loss': 2.291, 'grad_norm': 5.403411865234375, 'learning_rate': 1.939276485788114e-05, 'epoch': 91.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.094696044921875, 'eval_runtime': 66.1555, 'eval_samples_per_second': 9.236, 'eval_steps_per_second': 1.164, 'epoch': 92.0}
{'loss': 2.2759, 'grad_norm': 3.550131320953369, 'learning_rate': 1.9060077519379845e-05, 'epoch': 92.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0910568237304688, 'eval_runtime': 238.8467, 'eval_samples_per_second': 2.558, 'eval_steps_per_second': 0.322, 'epoch': 93.0}
{'loss': 2.2615, 'grad_norm': 6.666769504547119, 'learning_rate': 1.8727390180878555e-05, 'epoch': 93.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.099881410598755, 'eval_runtime': 83.7243, 'eval_samples_per_second': 7.298, 'eval_steps_per_second': 0.92, 'epoch': 94.0}
{'loss': 2.2689, 'grad_norm': 4.085936069488525, 'learning_rate': 1.8394702842377262e-05, 'epoch': 94.82}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.097994327545166, 'eval_runtime': 84.4004, 'eval_samples_per_second': 7.239, 'eval_steps_per_second': 0.912, 'epoch': 95.0}
{'loss': 2.2547, 'grad_norm': 4.038984775543213, 'learning_rate': 1.8062015503875972e-05, 'epoch': 95.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.09238338470459, 'eval_runtime': 85.6016, 'eval_samples_per_second': 7.138, 'eval_steps_per_second': 0.9, 'epoch': 96.0}
{'loss': 2.237, 'grad_norm': 3.4246511459350586, 'learning_rate': 1.772932816537468e-05, 'epoch': 96.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.098695993423462, 'eval_runtime': 101.5807, 'eval_samples_per_second': 6.015, 'eval_steps_per_second': 0.758, 'epoch': 97.0}
{'loss': 2.2488, 'grad_norm': 3.9261374473571777, 'learning_rate': 1.7396640826873385e-05, 'epoch': 97.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0996241569519043, 'eval_runtime': 95.2575, 'eval_samples_per_second': 6.414, 'eval_steps_per_second': 0.808, 'epoch': 98.0}
{'loss': 2.2049, 'grad_norm': 4.158993244171143, 'learning_rate': 1.7063953488372095e-05, 'epoch': 98.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1002907752990723, 'eval_runtime': 96.0028, 'eval_samples_per_second': 6.364, 'eval_steps_per_second': 0.802, 'epoch': 99.0}
{'loss': 2.2251, 'grad_norm': 5.634504318237305, 'learning_rate': 1.67312661498708e-05, 'epoch': 99.81}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1054599285125732, 'eval_runtime': 96.1314, 'eval_samples_per_second': 6.356, 'eval_steps_per_second': 0.801, 'epoch': 100.0}
{'loss': 2.2134, 'grad_norm': 4.284085273742676, 'learning_rate': 1.639857881136951e-05, 'epoch': 100.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0985987186431885, 'eval_runtime': 95.1223, 'eval_samples_per_second': 6.423, 'eval_steps_per_second': 0.809, 'epoch': 101.0}
{'loss': 2.2043, 'grad_norm': 5.061241626739502, 'learning_rate': 1.6065891472868218e-05, 'epoch': 101.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.0980629920959473, 'eval_runtime': 94.7259, 'eval_samples_per_second': 6.45, 'eval_steps_per_second': 0.813, 'epoch': 102.0}
{'loss': 2.1996, 'grad_norm': 3.440385103225708, 'learning_rate': 1.5733204134366925e-05, 'epoch': 102.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1070477962493896, 'eval_runtime': 93.2004, 'eval_samples_per_second': 6.556, 'eval_steps_per_second': 0.826, 'epoch': 103.0}
{'loss': 2.1856, 'grad_norm': 4.391870975494385, 'learning_rate': 1.540051679586563e-05, 'epoch': 103.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1080775260925293, 'eval_runtime': 68.9802, 'eval_samples_per_second': 8.858, 'eval_steps_per_second': 1.116, 'epoch': 104.0}
{'loss': 2.1903, 'grad_norm': 4.256255626678467, 'learning_rate': 1.5067829457364341e-05, 'epoch': 104.8}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1071460247039795, 'eval_runtime': 67.7742, 'eval_samples_per_second': 9.015, 'eval_steps_per_second': 1.136, 'epoch': 105.0}
{'loss': 2.1822, 'grad_norm': 3.715315103530884, 'learning_rate': 1.473514211886305e-05, 'epoch': 105.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1078038215637207, 'eval_runtime': 67.2441, 'eval_samples_per_second': 9.086, 'eval_steps_per_second': 1.145, 'epoch': 106.0}
{'loss': 2.1766, 'grad_norm': 4.258523464202881, 'learning_rate': 1.4402454780361756e-05, 'epoch': 106.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.106405735015869, 'eval_runtime': 68.569, 'eval_samples_per_second': 8.911, 'eval_steps_per_second': 1.123, 'epoch': 107.0}
{'loss': 2.1699, 'grad_norm': 3.5184218883514404, 'learning_rate': 1.4069767441860465e-05, 'epoch': 107.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1066157817840576, 'eval_runtime': 67.6588, 'eval_samples_per_second': 9.031, 'eval_steps_per_second': 1.138, 'epoch': 108.0}
{'loss': 2.167, 'grad_norm': 3.8404877185821533, 'learning_rate': 1.3737080103359173e-05, 'epoch': 108.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1010725498199463, 'eval_runtime': 67.1729, 'eval_samples_per_second': 9.096, 'eval_steps_per_second': 1.146, 'epoch': 109.0}
{'loss': 2.158, 'grad_norm': 4.7289958000183105, 'learning_rate': 1.3404392764857881e-05, 'epoch': 109.79}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.10669207572937, 'eval_runtime': 68.7014, 'eval_samples_per_second': 8.894, 'eval_steps_per_second': 1.121, 'epoch': 110.0}
{'loss': 2.1519, 'grad_norm': 3.832672119140625, 'learning_rate': 1.307170542635659e-05, 'epoch': 110.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1047518253326416, 'eval_runtime': 67.4036, 'eval_samples_per_second': 9.065, 'eval_steps_per_second': 1.142, 'epoch': 111.0}
{'loss': 2.145, 'grad_norm': 3.410369396209717, 'learning_rate': 1.2739018087855298e-05, 'epoch': 111.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1032304763793945, 'eval_runtime': 67.0065, 'eval_samples_per_second': 9.119, 'eval_steps_per_second': 1.149, 'epoch': 112.0}
{'loss': 2.1389, 'grad_norm': 4.424113750457764, 'learning_rate': 1.2406330749354006e-05, 'epoch': 112.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1093921661376953, 'eval_runtime': 68.4628, 'eval_samples_per_second': 8.925, 'eval_steps_per_second': 1.125, 'epoch': 113.0}
{'loss': 2.14, 'grad_norm': 4.291368007659912, 'learning_rate': 1.2073643410852714e-05, 'epoch': 113.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1003777980804443, 'eval_runtime': 67.2737, 'eval_samples_per_second': 9.082, 'eval_steps_per_second': 1.145, 'epoch': 114.0}
{'loss': 2.1393, 'grad_norm': 4.568911552429199, 'learning_rate': 1.1740956072351423e-05, 'epoch': 114.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.113292694091797, 'eval_runtime': 68.8949, 'eval_samples_per_second': 8.869, 'eval_steps_per_second': 1.118, 'epoch': 115.0}
{'loss': 2.1195, 'grad_norm': 5.011117458343506, 'learning_rate': 1.1408268733850131e-05, 'epoch': 115.78}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.104832172393799, 'eval_runtime': 67.9508, 'eval_samples_per_second': 8.992, 'eval_steps_per_second': 1.133, 'epoch': 116.0}
{'loss': 2.1308, 'grad_norm': 4.666243076324463, 'learning_rate': 1.1075581395348838e-05, 'epoch': 116.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.115800619125366, 'eval_runtime': 67.4037, 'eval_samples_per_second': 9.065, 'eval_steps_per_second': 1.142, 'epoch': 117.0}
{'loss': 2.1174, 'grad_norm': 4.006885528564453, 'learning_rate': 1.0742894056847546e-05, 'epoch': 117.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.111804485321045, 'eval_runtime': 137.2026, 'eval_samples_per_second': 4.453, 'eval_steps_per_second': 0.561, 'epoch': 118.0}
{'loss': 2.124, 'grad_norm': 4.105653285980225, 'learning_rate': 1.0410206718346254e-05, 'epoch': 118.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1146790981292725, 'eval_runtime': 69.1732, 'eval_samples_per_second': 8.833, 'eval_steps_per_second': 1.113, 'epoch': 119.0}
{'loss': 2.1003, 'grad_norm': 4.741890907287598, 'learning_rate': 1.0077519379844961e-05, 'epoch': 119.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.104140281677246, 'eval_runtime': 99.6502, 'eval_samples_per_second': 6.131, 'eval_steps_per_second': 0.773, 'epoch': 120.0}
{'loss': 2.1, 'grad_norm': 4.049677848815918, 'learning_rate': 9.74483204134367e-06, 'epoch': 120.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1162972450256348, 'eval_runtime': 93.7653, 'eval_samples_per_second': 6.516, 'eval_steps_per_second': 0.821, 'epoch': 121.0}
{'loss': 2.1076, 'grad_norm': 4.578023910522461, 'learning_rate': 9.412144702842377e-06, 'epoch': 121.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1116244792938232, 'eval_runtime': 94.4921, 'eval_samples_per_second': 6.466, 'eval_steps_per_second': 0.815, 'epoch': 122.0}
{'loss': 2.1031, 'grad_norm': 4.267454147338867, 'learning_rate': 9.079457364341086e-06, 'epoch': 122.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.120954751968384, 'eval_runtime': 94.7289, 'eval_samples_per_second': 6.45, 'eval_steps_per_second': 0.813, 'epoch': 123.0}
{'loss': 2.0912, 'grad_norm': 4.680304527282715, 'learning_rate': 8.746770025839794e-06, 'epoch': 123.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1138267517089844, 'eval_runtime': 95.006, 'eval_samples_per_second': 6.431, 'eval_steps_per_second': 0.81, 'epoch': 124.0}
{'loss': 2.0853, 'grad_norm': 3.8800277709960938, 'learning_rate': 8.414082687338502e-06, 'epoch': 124.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.11715030670166, 'eval_runtime': 105.6113, 'eval_samples_per_second': 5.785, 'eval_steps_per_second': 0.729, 'epoch': 125.0}
{'loss': 2.0826, 'grad_norm': 5.527578830718994, 'learning_rate': 8.081395348837209e-06, 'epoch': 125.76}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1166889667510986, 'eval_runtime': 94.9414, 'eval_samples_per_second': 6.436, 'eval_steps_per_second': 0.811, 'epoch': 126.0}
{'loss': 2.0873, 'grad_norm': 4.143652439117432, 'learning_rate': 7.748708010335917e-06, 'epoch': 126.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1181797981262207, 'eval_runtime': 94.4281, 'eval_samples_per_second': 6.471, 'eval_steps_per_second': 0.815, 'epoch': 127.0}
{'loss': 2.0722, 'grad_norm': 5.526092052459717, 'learning_rate': 7.4160206718346255e-06, 'epoch': 127.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1185085773468018, 'eval_runtime': 89.8414, 'eval_samples_per_second': 6.801, 'eval_steps_per_second': 0.857, 'epoch': 128.0}
{'loss': 2.0932, 'grad_norm': 4.102197647094727, 'learning_rate': 7.083333333333334e-06, 'epoch': 128.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1160831451416016, 'eval_runtime': 87.265, 'eval_samples_per_second': 7.002, 'eval_steps_per_second': 0.882, 'epoch': 129.0}
{'loss': 2.0768, 'grad_norm': 4.1581268310546875, 'learning_rate': 6.750645994832041e-06, 'epoch': 129.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.117832660675049, 'eval_runtime': 81.4061, 'eval_samples_per_second': 7.506, 'eval_steps_per_second': 0.946, 'epoch': 130.0}
{'loss': 2.0706, 'grad_norm': 4.548557758331299, 'learning_rate': 6.41795865633075e-06, 'epoch': 130.75}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.120535373687744, 'eval_runtime': 86.0687, 'eval_samples_per_second': 7.099, 'eval_steps_per_second': 0.895, 'epoch': 131.0}
{'loss': 2.0706, 'grad_norm': 4.202262878417969, 'learning_rate': 6.085271317829458e-06, 'epoch': 131.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1191201210021973, 'eval_runtime': 86.5075, 'eval_samples_per_second': 7.063, 'eval_steps_per_second': 0.89, 'epoch': 132.0}
{'loss': 2.076, 'grad_norm': 5.116209983825684, 'learning_rate': 5.752583979328165e-06, 'epoch': 132.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1294162273406982, 'eval_runtime': 84.4639, 'eval_samples_per_second': 7.234, 'eval_steps_per_second': 0.912, 'epoch': 133.0}
{'loss': 2.0606, 'grad_norm': 3.60550856590271, 'learning_rate': 5.419896640826874e-06, 'epoch': 133.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.124026298522949, 'eval_runtime': 88.6792, 'eval_samples_per_second': 6.89, 'eval_steps_per_second': 0.868, 'epoch': 134.0}
{'loss': 2.0521, 'grad_norm': 4.237996578216553, 'learning_rate': 5.087209302325582e-06, 'epoch': 134.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1214892864227295, 'eval_runtime': 83.6412, 'eval_samples_per_second': 7.305, 'eval_steps_per_second': 0.921, 'epoch': 135.0}
{'loss': 2.062, 'grad_norm': 3.952080249786377, 'learning_rate': 4.754521963824289e-06, 'epoch': 135.74}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1279361248016357, 'eval_runtime': 68.5646, 'eval_samples_per_second': 8.911, 'eval_steps_per_second': 1.123, 'epoch': 136.0}
{'loss': 2.0632, 'grad_norm': 4.327482223510742, 'learning_rate': 4.421834625322998e-06, 'epoch': 136.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1259868144989014, 'eval_runtime': 69.8786, 'eval_samples_per_second': 8.744, 'eval_steps_per_second': 1.102, 'epoch': 137.0}
{'loss': 2.0459, 'grad_norm': 4.8115129470825195, 'learning_rate': 4.089147286821705e-06, 'epoch': 137.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1275174617767334, 'eval_runtime': 68.2373, 'eval_samples_per_second': 8.954, 'eval_steps_per_second': 1.128, 'epoch': 138.0}
{'loss': 2.0443, 'grad_norm': 4.318519115447998, 'learning_rate': 3.7564599483204134e-06, 'epoch': 138.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.128499746322632, 'eval_runtime': 68.603, 'eval_samples_per_second': 8.906, 'eval_steps_per_second': 1.122, 'epoch': 139.0}
{'loss': 2.0526, 'grad_norm': 4.106555938720703, 'learning_rate': 3.4237726098191217e-06, 'epoch': 139.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1294972896575928, 'eval_runtime': 68.4318, 'eval_samples_per_second': 8.929, 'eval_steps_per_second': 1.125, 'epoch': 140.0}
{'loss': 2.0481, 'grad_norm': 4.05790901184082, 'learning_rate': 3.0910852713178296e-06, 'epoch': 140.73}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.124866008758545, 'eval_runtime': 68.7761, 'eval_samples_per_second': 8.884, 'eval_steps_per_second': 1.12, 'epoch': 141.0}
{'loss': 2.0575, 'grad_norm': 4.654408931732178, 'learning_rate': 2.7583979328165375e-06, 'epoch': 141.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.124605655670166, 'eval_runtime': 70.3372, 'eval_samples_per_second': 8.687, 'eval_steps_per_second': 1.095, 'epoch': 142.0}
{'loss': 2.0458, 'grad_norm': 5.274786472320557, 'learning_rate': 2.4257105943152458e-06, 'epoch': 142.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.126765489578247, 'eval_runtime': 68.6026, 'eval_samples_per_second': 8.906, 'eval_steps_per_second': 1.122, 'epoch': 143.0}
{'loss': 2.0355, 'grad_norm': 5.181264400482178, 'learning_rate': 2.0930232558139536e-06, 'epoch': 143.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.128877639770508, 'eval_runtime': 70.0505, 'eval_samples_per_second': 8.722, 'eval_steps_per_second': 1.099, 'epoch': 144.0}
{'loss': 2.0341, 'grad_norm': 4.071650981903076, 'learning_rate': 1.7603359173126617e-06, 'epoch': 144.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.127084255218506, 'eval_runtime': 69.5597, 'eval_samples_per_second': 8.784, 'eval_steps_per_second': 1.107, 'epoch': 145.0}
{'loss': 2.0425, 'grad_norm': 4.618537902832031, 'learning_rate': 1.4276485788113694e-06, 'epoch': 145.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.128075361251831, 'eval_runtime': 92.1154, 'eval_samples_per_second': 6.633, 'eval_steps_per_second': 0.836, 'epoch': 146.0}
{'loss': 2.0457, 'grad_norm': 4.888888359069824, 'learning_rate': 1.0949612403100777e-06, 'epoch': 146.72}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1286213397979736, 'eval_runtime': 118.0993, 'eval_samples_per_second': 5.174, 'eval_steps_per_second': 0.652, 'epoch': 147.0}
{'loss': 2.0366, 'grad_norm': 4.051687240600586, 'learning_rate': 7.622739018087856e-07, 'epoch': 147.71}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.129117965698242, 'eval_runtime': 69.4779, 'eval_samples_per_second': 8.794, 'eval_steps_per_second': 1.108, 'epoch': 148.0}
{'loss': 2.0444, 'grad_norm': 4.328103542327881, 'learning_rate': 4.295865633074936e-07, 'epoch': 148.71}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.1298272609710693, 'eval_runtime': 115.0486, 'eval_samples_per_second': 5.311, 'eval_steps_per_second': 0.669, 'epoch': 149.0}
{'loss': 2.0442, 'grad_norm': 3.713656425476074, 'learning_rate': 9.689922480620156e-08, 'epoch': 149.71}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 3.129140853881836, 'eval_runtime': 68.2309, 'eval_samples_per_second': 8.955, 'eval_steps_per_second': 1.129, 'epoch': 150.0}
{'train_runtime': 494258.0982, 'train_samples_per_second': 1.252, 'train_steps_per_second': 0.157, 'train_loss': 2.819731461872426, 'epoch': 150.0}


TrainOutput(global_step=77400, training_loss=2.819731461872426, metrics={'train_runtime': 494258.0982, 'train_samples_per_second': 1.252, 'train_steps_per_second': 0.157, 'total_flos': 5346095180206080.0, 'train_loss': 2.819731461872426, 'epoch': 150.0})

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MT5ForConditionalGeneration

# Define constants and model
BATCH_SIZE = 16 # Increased batch size to reduce iterations

# Assuming tokenized_dataset and tokenizer are already defined
# Initialize the model
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

logging_steps = len(tokenized_datasets["train"]) // BATCH_SIZE
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=150,  # Reduced number of epochs to 50
    predict_with_generate=True,
    fp16=True,  # Enabled mixed precision training
    logging_steps=logging_steps,
    report_to="none"
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()






  0%|          | 0/38700 [00:00<?, ?it/s]

{'loss': 17.449, 'grad_norm': 536.2093505859375, 'learning_rate': 4.9667958656330755e-05, 'epoch': 1.0}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 8.673234939575195, 'eval_runtime': 105.6429, 'eval_samples_per_second': 5.784, 'eval_steps_per_second': 0.369, 'epoch': 1.0}
{'loss': 8.8813, 'grad_norm': 14.584156036376953, 'learning_rate': 4.93359173126615e-05, 'epoch': 1.99}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 6.422065258026123, 'eval_runtime': 108.8476, 'eval_samples_per_second': 5.613, 'eval_steps_per_second': 0.358, 'epoch': 2.0}
{'loss': 6.6363, 'grad_norm': 3.571881055831909, 'learning_rate': 4.900387596899225e-05, 'epoch': 2.99}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 5.042505741119385, 'eval_runtime': 92.8004, 'eval_samples_per_second': 6.584, 'eval_steps_per_second': 0.42, 'epoch': 3.0}
{'loss': 5.7163, 'grad_norm': 2.566762924194336, 'learning_rate': 4.8671834625323e-05, 'epoch': 3.98}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 4.634969234466553, 'eval_runtime': 94.6228, 'eval_samples_per_second': 6.457, 'eval_steps_per_second': 0.412, 'epoch': 4.0}
{'loss': 5.2978, 'grad_norm': 2.3249003887176514, 'learning_rate': 4.833979328165375e-05, 'epoch': 4.98}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 4.410088062286377, 'eval_runtime': 94.0583, 'eval_samples_per_second': 6.496, 'eval_steps_per_second': 0.415, 'epoch': 5.0}
{'loss': 5.0268, 'grad_norm': 1.874089002609253, 'learning_rate': 4.8007751937984496e-05, 'epoch': 5.98}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 4.252509593963623, 'eval_runtime': 95.8428, 'eval_samples_per_second': 6.375, 'eval_steps_per_second': 0.407, 'epoch': 6.0}
{'loss': 4.8573, 'grad_norm': 2.217402696609497, 'learning_rate': 4.767571059431525e-05, 'epoch': 6.97}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 4.144523620605469, 'eval_runtime': 268.4335, 'eval_samples_per_second': 2.276, 'eval_steps_per_second': 0.145, 'epoch': 7.0}
{'loss': 4.7062, 'grad_norm': 1.9252628087997437, 'learning_rate': 4.7343669250645994e-05, 'epoch': 7.97}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 4.058993816375732, 'eval_runtime': 98.9356, 'eval_samples_per_second': 6.176, 'eval_steps_per_second': 0.394, 'epoch': 8.0}
{'loss': 4.5969, 'grad_norm': 2.2344954013824463, 'learning_rate': 4.701162790697675e-05, 'epoch': 8.97}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.985435724258423, 'eval_runtime': 94.219, 'eval_samples_per_second': 6.485, 'eval_steps_per_second': 0.414, 'epoch': 9.0}
{'loss': 4.4926, 'grad_norm': 2.425255298614502, 'learning_rate': 4.66795865633075e-05, 'epoch': 9.96}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.9248759746551514, 'eval_runtime': 83.5969, 'eval_samples_per_second': 7.309, 'eval_steps_per_second': 0.467, 'epoch': 10.0}
{'loss': 4.4197, 'grad_norm': 2.016418218612671, 'learning_rate': 4.6347545219638245e-05, 'epoch': 10.96}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.875159740447998, 'eval_runtime': 83.439, 'eval_samples_per_second': 7.323, 'eval_steps_per_second': 0.467, 'epoch': 11.0}
{'loss': 4.3311, 'grad_norm': 2.00227427482605, 'learning_rate': 4.6015503875969e-05, 'epoch': 11.95}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.831611156463623, 'eval_runtime': 82.3708, 'eval_samples_per_second': 7.418, 'eval_steps_per_second': 0.473, 'epoch': 12.0}
{'loss': 4.262, 'grad_norm': 2.0761890411376953, 'learning_rate': 4.568346253229974e-05, 'epoch': 12.95}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.7847301959991455, 'eval_runtime': 85.0034, 'eval_samples_per_second': 7.188, 'eval_steps_per_second': 0.459, 'epoch': 13.0}
{'loss': 4.1953, 'grad_norm': 1.876960039138794, 'learning_rate': 4.5351421188630495e-05, 'epoch': 13.95}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.751694440841675, 'eval_runtime': 66.8876, 'eval_samples_per_second': 9.135, 'eval_steps_per_second': 0.583, 'epoch': 14.0}
{'loss': 4.1346, 'grad_norm': 2.150141716003418, 'learning_rate': 4.501937984496124e-05, 'epoch': 14.94}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.712899684906006, 'eval_runtime': 67.1753, 'eval_samples_per_second': 9.096, 'eval_steps_per_second': 0.581, 'epoch': 15.0}
{'loss': 4.0798, 'grad_norm': 2.057161808013916, 'learning_rate': 4.468733850129199e-05, 'epoch': 15.94}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.6808526515960693, 'eval_runtime': 67.0234, 'eval_samples_per_second': 9.116, 'eval_steps_per_second': 0.582, 'epoch': 16.0}
{'loss': 4.0257, 'grad_norm': 2.063814878463745, 'learning_rate': 4.4355297157622745e-05, 'epoch': 16.93}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.654670238494873, 'eval_runtime': 66.7192, 'eval_samples_per_second': 9.158, 'eval_steps_per_second': 0.585, 'epoch': 17.0}
{'loss': 3.9851, 'grad_norm': 2.0494580268859863, 'learning_rate': 4.402325581395349e-05, 'epoch': 17.93}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.6319420337677, 'eval_runtime': 97.9792, 'eval_samples_per_second': 6.236, 'eval_steps_per_second': 0.398, 'epoch': 18.0}
{'loss': 3.9314, 'grad_norm': 2.1494498252868652, 'learning_rate': 4.3691214470284243e-05, 'epoch': 18.93}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.6045985221862793, 'eval_runtime': 61.5879, 'eval_samples_per_second': 9.921, 'eval_steps_per_second': 0.633, 'epoch': 19.0}
{'loss': 3.8923, 'grad_norm': 1.931779384613037, 'learning_rate': 4.335917312661499e-05, 'epoch': 19.92}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.5859241485595703, 'eval_runtime': 62.0135, 'eval_samples_per_second': 9.853, 'eval_steps_per_second': 0.629, 'epoch': 20.0}
{'loss': 3.8508, 'grad_norm': 2.3258860111236572, 'learning_rate': 4.302713178294574e-05, 'epoch': 20.92}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.5552122592926025, 'eval_runtime': 58.6492, 'eval_samples_per_second': 10.418, 'eval_steps_per_second': 0.665, 'epoch': 21.0}
{'loss': 3.8264, 'grad_norm': 2.1718695163726807, 'learning_rate': 4.269509043927649e-05, 'epoch': 21.91}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.543980598449707, 'eval_runtime': 55.7497, 'eval_samples_per_second': 10.96, 'eval_steps_per_second': 0.7, 'epoch': 22.0}
{'loss': 3.7758, 'grad_norm': 2.124774932861328, 'learning_rate': 4.236304909560724e-05, 'epoch': 22.91}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.5217065811157227, 'eval_runtime': 58.7275, 'eval_samples_per_second': 10.404, 'eval_steps_per_second': 0.664, 'epoch': 23.0}
{'loss': 3.7423, 'grad_norm': 2.2537782192230225, 'learning_rate': 4.2031007751937985e-05, 'epoch': 23.91}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.501703977584839, 'eval_runtime': 62.0801, 'eval_samples_per_second': 9.842, 'eval_steps_per_second': 0.628, 'epoch': 24.0}
{'loss': 3.7087, 'grad_norm': 2.1528708934783936, 'learning_rate': 4.169896640826874e-05, 'epoch': 24.9}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.4899024963378906, 'eval_runtime': 56.7755, 'eval_samples_per_second': 10.762, 'eval_steps_per_second': 0.687, 'epoch': 25.0}
{'loss': 3.6754, 'grad_norm': 2.1878418922424316, 'learning_rate': 4.136692506459949e-05, 'epoch': 25.9}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.471608877182007, 'eval_runtime': 59.1891, 'eval_samples_per_second': 10.323, 'eval_steps_per_second': 0.659, 'epoch': 26.0}
{'loss': 3.6367, 'grad_norm': 2.4339756965637207, 'learning_rate': 4.1034883720930235e-05, 'epoch': 26.9}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.4558918476104736, 'eval_runtime': 55.2283, 'eval_samples_per_second': 11.063, 'eval_steps_per_second': 0.706, 'epoch': 27.0}
{'loss': 3.612, 'grad_norm': 2.547501802444458, 'learning_rate': 4.070284237726099e-05, 'epoch': 27.89}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.43866229057312, 'eval_runtime': 55.1169, 'eval_samples_per_second': 11.086, 'eval_steps_per_second': 0.708, 'epoch': 28.0}
{'loss': 3.5803, 'grad_norm': 2.368957281112671, 'learning_rate': 4.0370801033591733e-05, 'epoch': 28.89}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.4330546855926514, 'eval_runtime': 55.2375, 'eval_samples_per_second': 11.061, 'eval_steps_per_second': 0.706, 'epoch': 29.0}
{'loss': 3.565, 'grad_norm': 2.2603073120117188, 'learning_rate': 4.0038759689922486e-05, 'epoch': 29.88}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.4113945960998535, 'eval_runtime': 56.7526, 'eval_samples_per_second': 10.766, 'eval_steps_per_second': 0.687, 'epoch': 30.0}
{'loss': 3.534, 'grad_norm': 2.4548585414886475, 'learning_rate': 3.970671834625323e-05, 'epoch': 30.88}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.3957676887512207, 'eval_runtime': 55.0345, 'eval_samples_per_second': 11.102, 'eval_steps_per_second': 0.709, 'epoch': 31.0}
{'loss': 3.5012, 'grad_norm': 2.341632127761841, 'learning_rate': 3.9374677002583984e-05, 'epoch': 31.88}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.39617919921875, 'eval_runtime': 54.294, 'eval_samples_per_second': 11.254, 'eval_steps_per_second': 0.718, 'epoch': 32.0}
{'loss': 3.4796, 'grad_norm': 2.1200718879699707, 'learning_rate': 3.904263565891473e-05, 'epoch': 32.87}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.374443531036377, 'eval_runtime': 59.8966, 'eval_samples_per_second': 10.201, 'eval_steps_per_second': 0.651, 'epoch': 33.0}
{'loss': 3.4499, 'grad_norm': 2.376657724380493, 'learning_rate': 3.871059431524548e-05, 'epoch': 33.87}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.375018835067749, 'eval_runtime': 55.2261, 'eval_samples_per_second': 11.064, 'eval_steps_per_second': 0.706, 'epoch': 34.0}
{'loss': 3.4348, 'grad_norm': 2.606332778930664, 'learning_rate': 3.8378552971576234e-05, 'epoch': 34.86}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.3506391048431396, 'eval_runtime': 58.9794, 'eval_samples_per_second': 10.36, 'eval_steps_per_second': 0.661, 'epoch': 35.0}
{'loss': 3.4038, 'grad_norm': 2.399453639984131, 'learning_rate': 3.804651162790698e-05, 'epoch': 35.86}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.338191032409668, 'eval_runtime': 58.3738, 'eval_samples_per_second': 10.467, 'eval_steps_per_second': 0.668, 'epoch': 36.0}
{'loss': 3.3983, 'grad_norm': 2.4580302238464355, 'learning_rate': 3.771447028423773e-05, 'epoch': 36.86}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.326648473739624, 'eval_runtime': 62.8755, 'eval_samples_per_second': 9.718, 'eval_steps_per_second': 0.62, 'epoch': 37.0}
{'loss': 3.3699, 'grad_norm': 2.0412073135375977, 'learning_rate': 3.738242894056848e-05, 'epoch': 37.85}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.326575994491577, 'eval_runtime': 63.4474, 'eval_samples_per_second': 9.63, 'eval_steps_per_second': 0.615, 'epoch': 38.0}
{'loss': 3.325, 'grad_norm': 2.5339138507843018, 'learning_rate': 3.705038759689923e-05, 'epoch': 38.85}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.316586971282959, 'eval_runtime': 76.5825, 'eval_samples_per_second': 7.978, 'eval_steps_per_second': 0.509, 'epoch': 39.0}
{'loss': 3.318, 'grad_norm': 2.3777287006378174, 'learning_rate': 3.6718346253229976e-05, 'epoch': 39.84}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.316782236099243, 'eval_runtime': 63.43, 'eval_samples_per_second': 9.633, 'eval_steps_per_second': 0.615, 'epoch': 40.0}
{'loss': 3.308, 'grad_norm': 2.9240424633026123, 'learning_rate': 3.638630490956073e-05, 'epoch': 40.84}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.3010687828063965, 'eval_runtime': 64.3411, 'eval_samples_per_second': 9.496, 'eval_steps_per_second': 0.606, 'epoch': 41.0}
{'loss': 3.2873, 'grad_norm': 2.850663185119629, 'learning_rate': 3.605426356589148e-05, 'epoch': 41.84}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.293701171875, 'eval_runtime': 63.0505, 'eval_samples_per_second': 9.691, 'eval_steps_per_second': 0.619, 'epoch': 42.0}
{'loss': 3.2519, 'grad_norm': 1.905053973197937, 'learning_rate': 3.5722222222222226e-05, 'epoch': 42.83}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.288518190383911, 'eval_runtime': 63.1406, 'eval_samples_per_second': 9.677, 'eval_steps_per_second': 0.618, 'epoch': 43.0}
{'loss': 3.2454, 'grad_norm': 2.412349224090576, 'learning_rate': 3.539018087855298e-05, 'epoch': 43.83}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2885072231292725, 'eval_runtime': 63.4458, 'eval_samples_per_second': 9.63, 'eval_steps_per_second': 0.615, 'epoch': 44.0}
{'loss': 3.2218, 'grad_norm': 2.41525936126709, 'learning_rate': 3.5058139534883724e-05, 'epoch': 44.83}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.271368980407715, 'eval_runtime': 63.0056, 'eval_samples_per_second': 9.698, 'eval_steps_per_second': 0.619, 'epoch': 45.0}
{'loss': 3.218, 'grad_norm': 2.217292070388794, 'learning_rate': 3.4726098191214477e-05, 'epoch': 45.82}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.267451524734497, 'eval_runtime': 64.3964, 'eval_samples_per_second': 9.488, 'eval_steps_per_second': 0.606, 'epoch': 46.0}
{'loss': 3.1912, 'grad_norm': 2.1351258754730225, 'learning_rate': 3.439405684754522e-05, 'epoch': 46.82}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2479498386383057, 'eval_runtime': 63.4118, 'eval_samples_per_second': 9.635, 'eval_steps_per_second': 0.615, 'epoch': 47.0}
{'loss': 3.1749, 'grad_norm': 2.1968939304351807, 'learning_rate': 3.4062015503875975e-05, 'epoch': 47.81}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.251516580581665, 'eval_runtime': 63.7096, 'eval_samples_per_second': 9.59, 'eval_steps_per_second': 0.612, 'epoch': 48.0}
{'loss': 3.1544, 'grad_norm': 2.596121072769165, 'learning_rate': 3.372997416020672e-05, 'epoch': 48.81}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2411303520202637, 'eval_runtime': 63.2536, 'eval_samples_per_second': 9.66, 'eval_steps_per_second': 0.617, 'epoch': 49.0}
{'loss': 3.1512, 'grad_norm': 2.254818916320801, 'learning_rate': 3.339793281653747e-05, 'epoch': 49.81}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2459170818328857, 'eval_runtime': 79.6972, 'eval_samples_per_second': 7.667, 'eval_steps_per_second': 0.489, 'epoch': 50.0}
{'loss': 3.1199, 'grad_norm': 2.2025461196899414, 'learning_rate': 3.306589147286822e-05, 'epoch': 50.8}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.241635322570801, 'eval_runtime': 86.6844, 'eval_samples_per_second': 7.049, 'eval_steps_per_second': 0.45, 'epoch': 51.0}
{'loss': 3.0959, 'grad_norm': 3.218963146209717, 'learning_rate': 3.273385012919897e-05, 'epoch': 51.8}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2325117588043213, 'eval_runtime': 88.3041, 'eval_samples_per_second': 6.919, 'eval_steps_per_second': 0.442, 'epoch': 52.0}
{'loss': 3.1094, 'grad_norm': 2.6621522903442383, 'learning_rate': 3.2401808785529716e-05, 'epoch': 52.79}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2128753662109375, 'eval_runtime': 73.5188, 'eval_samples_per_second': 8.311, 'eval_steps_per_second': 0.53, 'epoch': 53.0}
{'loss': 3.0731, 'grad_norm': 2.5275752544403076, 'learning_rate': 3.206976744186047e-05, 'epoch': 53.79}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.228480339050293, 'eval_runtime': 74.1744, 'eval_samples_per_second': 8.237, 'eval_steps_per_second': 0.526, 'epoch': 54.0}
{'loss': 3.064, 'grad_norm': 2.3728561401367188, 'learning_rate': 3.1737726098191214e-05, 'epoch': 54.79}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.215498208999634, 'eval_runtime': 75.3853, 'eval_samples_per_second': 8.105, 'eval_steps_per_second': 0.517, 'epoch': 55.0}
{'loss': 3.0489, 'grad_norm': 2.3376710414886475, 'learning_rate': 3.1405684754521967e-05, 'epoch': 55.78}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2151873111724854, 'eval_runtime': 74.4003, 'eval_samples_per_second': 8.212, 'eval_steps_per_second': 0.524, 'epoch': 56.0}
{'loss': 3.0407, 'grad_norm': 3.0579726696014404, 'learning_rate': 3.107364341085271e-05, 'epoch': 56.78}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.2027623653411865, 'eval_runtime': 62.096, 'eval_samples_per_second': 9.84, 'eval_steps_per_second': 0.628, 'epoch': 57.0}
{'loss': 3.0235, 'grad_norm': 2.3255515098571777, 'learning_rate': 3.0741602067183465e-05, 'epoch': 57.78}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.194362163543701, 'eval_runtime': 61.8652, 'eval_samples_per_second': 9.876, 'eval_steps_per_second': 0.63, 'epoch': 58.0}
{'loss': 3.0026, 'grad_norm': 2.930999994277954, 'learning_rate': 3.0409560723514213e-05, 'epoch': 58.77}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.205878734588623, 'eval_runtime': 62.1932, 'eval_samples_per_second': 9.824, 'eval_steps_per_second': 0.627, 'epoch': 59.0}
{'loss': 3.0024, 'grad_norm': 2.592621326446533, 'learning_rate': 3.0077519379844966e-05, 'epoch': 59.77}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1872692108154297, 'eval_runtime': 61.8446, 'eval_samples_per_second': 9.88, 'eval_steps_per_second': 0.631, 'epoch': 60.0}
{'loss': 2.9803, 'grad_norm': 2.3900139331817627, 'learning_rate': 2.974547803617571e-05, 'epoch': 60.76}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1838674545288086, 'eval_runtime': 62.3768, 'eval_samples_per_second': 9.795, 'eval_steps_per_second': 0.625, 'epoch': 61.0}
{'loss': 2.9677, 'grad_norm': 2.7632062435150146, 'learning_rate': 2.9413436692506464e-05, 'epoch': 61.76}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.178084135055542, 'eval_runtime': 63.9669, 'eval_samples_per_second': 9.552, 'eval_steps_per_second': 0.61, 'epoch': 62.0}
{'loss': 2.9708, 'grad_norm': 2.4234092235565186, 'learning_rate': 2.908139534883721e-05, 'epoch': 62.76}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1865625381469727, 'eval_runtime': 63.9207, 'eval_samples_per_second': 9.559, 'eval_steps_per_second': 0.61, 'epoch': 63.0}
{'loss': 2.943, 'grad_norm': 2.2907471656799316, 'learning_rate': 2.8749354005167962e-05, 'epoch': 63.75}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1789097785949707, 'eval_runtime': 79.6293, 'eval_samples_per_second': 7.673, 'eval_steps_per_second': 0.49, 'epoch': 64.0}
{'loss': 2.9299, 'grad_norm': 2.6168460845947266, 'learning_rate': 2.8417312661498707e-05, 'epoch': 64.75}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.174351692199707, 'eval_runtime': 78.2997, 'eval_samples_per_second': 7.803, 'eval_steps_per_second': 0.498, 'epoch': 65.0}
{'loss': 2.919, 'grad_norm': 2.6246824264526367, 'learning_rate': 2.808527131782946e-05, 'epoch': 65.74}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.182227611541748, 'eval_runtime': 81.2197, 'eval_samples_per_second': 7.523, 'eval_steps_per_second': 0.48, 'epoch': 66.0}
{'loss': 2.9027, 'grad_norm': 2.1415064334869385, 'learning_rate': 2.7753229974160205e-05, 'epoch': 66.74}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1699793338775635, 'eval_runtime': 80.1587, 'eval_samples_per_second': 7.622, 'eval_steps_per_second': 0.487, 'epoch': 67.0}
{'loss': 2.9055, 'grad_norm': 2.605119466781616, 'learning_rate': 2.7421188630490958e-05, 'epoch': 67.74}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.158971071243286, 'eval_runtime': 80.0841, 'eval_samples_per_second': 7.629, 'eval_steps_per_second': 0.487, 'epoch': 68.0}
{'loss': 2.8724, 'grad_norm': 2.5898537635803223, 'learning_rate': 2.708914728682171e-05, 'epoch': 68.73}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.164077043533325, 'eval_runtime': 79.5691, 'eval_samples_per_second': 7.679, 'eval_steps_per_second': 0.49, 'epoch': 69.0}
{'loss': 2.8869, 'grad_norm': 2.4118189811706543, 'learning_rate': 2.6757105943152456e-05, 'epoch': 69.73}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.159557819366455, 'eval_runtime': 83.2295, 'eval_samples_per_second': 7.341, 'eval_steps_per_second': 0.469, 'epoch': 70.0}
{'loss': 2.8833, 'grad_norm': 2.252811908721924, 'learning_rate': 2.6425064599483208e-05, 'epoch': 70.72}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1562700271606445, 'eval_runtime': 80.1084, 'eval_samples_per_second': 7.627, 'eval_steps_per_second': 0.487, 'epoch': 71.0}
{'loss': 2.8407, 'grad_norm': 2.3848683834075928, 'learning_rate': 2.6093023255813954e-05, 'epoch': 71.72}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1575396060943604, 'eval_runtime': 75.3265, 'eval_samples_per_second': 8.111, 'eval_steps_per_second': 0.518, 'epoch': 72.0}
{'loss': 2.8483, 'grad_norm': 2.8262722492218018, 'learning_rate': 2.5760981912144706e-05, 'epoch': 72.72}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1521852016448975, 'eval_runtime': 77.8502, 'eval_samples_per_second': 7.848, 'eval_steps_per_second': 0.501, 'epoch': 73.0}
{'loss': 2.8447, 'grad_norm': 2.6239802837371826, 'learning_rate': 2.5428940568475452e-05, 'epoch': 73.71}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1517417430877686, 'eval_runtime': 75.5104, 'eval_samples_per_second': 8.092, 'eval_steps_per_second': 0.516, 'epoch': 74.0}
{'loss': 2.8127, 'grad_norm': 2.8929874897003174, 'learning_rate': 2.5096899224806204e-05, 'epoch': 74.71}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.166567325592041, 'eval_runtime': 73.7347, 'eval_samples_per_second': 8.286, 'eval_steps_per_second': 0.529, 'epoch': 75.0}
{'loss': 2.8165, 'grad_norm': 2.737569570541382, 'learning_rate': 2.4764857881136953e-05, 'epoch': 75.71}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1726579666137695, 'eval_runtime': 74.1229, 'eval_samples_per_second': 8.243, 'eval_steps_per_second': 0.526, 'epoch': 76.0}
{'loss': 2.8197, 'grad_norm': 2.7974677085876465, 'learning_rate': 2.4432816537467702e-05, 'epoch': 76.7}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1455962657928467, 'eval_runtime': 75.9744, 'eval_samples_per_second': 8.042, 'eval_steps_per_second': 0.513, 'epoch': 77.0}
{'loss': 2.7789, 'grad_norm': 2.641301393508911, 'learning_rate': 2.410077519379845e-05, 'epoch': 77.7}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1328794956207275, 'eval_runtime': 77.6671, 'eval_samples_per_second': 7.867, 'eval_steps_per_second': 0.502, 'epoch': 78.0}
{'loss': 2.788, 'grad_norm': 3.259155511856079, 'learning_rate': 2.37687338501292e-05, 'epoch': 78.69}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.135356903076172, 'eval_runtime': 78.8878, 'eval_samples_per_second': 7.745, 'eval_steps_per_second': 0.494, 'epoch': 79.0}
{'loss': 2.7886, 'grad_norm': 2.664609432220459, 'learning_rate': 2.343669250645995e-05, 'epoch': 79.69}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1434388160705566, 'eval_runtime': 64.3728, 'eval_samples_per_second': 9.492, 'eval_steps_per_second': 0.606, 'epoch': 80.0}
{'loss': 2.7622, 'grad_norm': 2.4833788871765137, 'learning_rate': 2.3104651162790698e-05, 'epoch': 80.69}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.134648323059082, 'eval_runtime': 64.5434, 'eval_samples_per_second': 9.467, 'eval_steps_per_second': 0.604, 'epoch': 81.0}
{'loss': 2.7553, 'grad_norm': 2.4983139038085938, 'learning_rate': 2.2772609819121447e-05, 'epoch': 81.68}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1413843631744385, 'eval_runtime': 98.9433, 'eval_samples_per_second': 6.175, 'eval_steps_per_second': 0.394, 'epoch': 82.0}
{'loss': 2.7527, 'grad_norm': 2.7713139057159424, 'learning_rate': 2.2440568475452196e-05, 'epoch': 82.68}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1268348693847656, 'eval_runtime': 90.4246, 'eval_samples_per_second': 6.757, 'eval_steps_per_second': 0.431, 'epoch': 83.0}
{'loss': 2.7374, 'grad_norm': 2.588552951812744, 'learning_rate': 2.2108527131782945e-05, 'epoch': 83.67}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.132969379425049, 'eval_runtime': 77.4535, 'eval_samples_per_second': 7.889, 'eval_steps_per_second': 0.504, 'epoch': 84.0}
{'loss': 2.7324, 'grad_norm': 2.4902796745300293, 'learning_rate': 2.1776485788113694e-05, 'epoch': 84.67}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1210625171661377, 'eval_runtime': 77.5837, 'eval_samples_per_second': 7.875, 'eval_steps_per_second': 0.503, 'epoch': 85.0}
{'loss': 2.7325, 'grad_norm': 2.8676302433013916, 'learning_rate': 2.1444444444444443e-05, 'epoch': 85.67}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1171700954437256, 'eval_runtime': 80.4309, 'eval_samples_per_second': 7.597, 'eval_steps_per_second': 0.485, 'epoch': 86.0}
{'loss': 2.7086, 'grad_norm': 2.549161911010742, 'learning_rate': 2.1112403100775192e-05, 'epoch': 86.66}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1357622146606445, 'eval_runtime': 79.0966, 'eval_samples_per_second': 7.725, 'eval_steps_per_second': 0.493, 'epoch': 87.0}
{'loss': 2.7113, 'grad_norm': 2.702925205230713, 'learning_rate': 2.078036175710594e-05, 'epoch': 87.66}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1207175254821777, 'eval_runtime': 138.8138, 'eval_samples_per_second': 4.402, 'eval_steps_per_second': 0.281, 'epoch': 88.0}
{'loss': 2.7109, 'grad_norm': 2.3255395889282227, 'learning_rate': 2.0448320413436694e-05, 'epoch': 88.66}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.132380247116089, 'eval_runtime': 182.046, 'eval_samples_per_second': 3.356, 'eval_steps_per_second': 0.214, 'epoch': 89.0}
{'loss': 2.6933, 'grad_norm': 2.666975736618042, 'learning_rate': 2.0116279069767443e-05, 'epoch': 89.65}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1242053508758545, 'eval_runtime': 85.1417, 'eval_samples_per_second': 7.176, 'eval_steps_per_second': 0.458, 'epoch': 90.0}
{'loss': 2.6802, 'grad_norm': 2.374788522720337, 'learning_rate': 1.978423772609819e-05, 'epoch': 90.65}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1190707683563232, 'eval_runtime': 84.1824, 'eval_samples_per_second': 7.258, 'eval_steps_per_second': 0.463, 'epoch': 91.0}
{'loss': 2.6825, 'grad_norm': 2.396141290664673, 'learning_rate': 1.945219638242894e-05, 'epoch': 91.64}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.112140655517578, 'eval_runtime': 44.3604, 'eval_samples_per_second': 13.774, 'eval_steps_per_second': 0.879, 'epoch': 92.0}
{'loss': 2.6721, 'grad_norm': 2.69037127494812, 'learning_rate': 1.912015503875969e-05, 'epoch': 92.64}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1139183044433594, 'eval_runtime': 60.8181, 'eval_samples_per_second': 10.046, 'eval_steps_per_second': 0.641, 'epoch': 93.0}
{'loss': 2.6626, 'grad_norm': 2.370354175567627, 'learning_rate': 1.878811369509044e-05, 'epoch': 93.64}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1216418743133545, 'eval_runtime': 68.7145, 'eval_samples_per_second': 8.892, 'eval_steps_per_second': 0.568, 'epoch': 94.0}
{'loss': 2.6717, 'grad_norm': 2.6295764446258545, 'learning_rate': 1.8456072351421188e-05, 'epoch': 94.63}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1125855445861816, 'eval_runtime': 69.961, 'eval_samples_per_second': 8.733, 'eval_steps_per_second': 0.557, 'epoch': 95.0}
{'loss': 2.6474, 'grad_norm': 3.1786768436431885, 'learning_rate': 1.8124031007751937e-05, 'epoch': 95.63}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.116462230682373, 'eval_runtime': 64.9911, 'eval_samples_per_second': 9.401, 'eval_steps_per_second': 0.6, 'epoch': 96.0}
{'loss': 2.633, 'grad_norm': 2.886732578277588, 'learning_rate': 1.779198966408269e-05, 'epoch': 96.62}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.124011993408203, 'eval_runtime': 67.9272, 'eval_samples_per_second': 8.995, 'eval_steps_per_second': 0.574, 'epoch': 97.0}
{'loss': 2.6637, 'grad_norm': 2.597961664199829, 'learning_rate': 1.7459948320413438e-05, 'epoch': 97.62}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.117750406265259, 'eval_runtime': 88.2655, 'eval_samples_per_second': 6.922, 'eval_steps_per_second': 0.442, 'epoch': 98.0}
{'loss': 2.6123, 'grad_norm': 2.4321181774139404, 'learning_rate': 1.7127906976744187e-05, 'epoch': 98.62}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.124800205230713, 'eval_runtime': 101.6433, 'eval_samples_per_second': 6.011, 'eval_steps_per_second': 0.384, 'epoch': 99.0}
{'loss': 2.6294, 'grad_norm': 3.0449323654174805, 'learning_rate': 1.6795865633074936e-05, 'epoch': 99.61}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.104422092437744, 'eval_runtime': 90.8701, 'eval_samples_per_second': 6.724, 'eval_steps_per_second': 0.429, 'epoch': 100.0}
{'loss': 2.6204, 'grad_norm': 3.0787222385406494, 'learning_rate': 1.6463824289405685e-05, 'epoch': 100.61}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.110433340072632, 'eval_runtime': 80.7347, 'eval_samples_per_second': 7.568, 'eval_steps_per_second': 0.483, 'epoch': 101.0}
{'loss': 2.6201, 'grad_norm': 2.550168037414551, 'learning_rate': 1.6131782945736434e-05, 'epoch': 101.6}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1186811923980713, 'eval_runtime': 82.1589, 'eval_samples_per_second': 7.437, 'eval_steps_per_second': 0.475, 'epoch': 102.0}
{'loss': 2.6086, 'grad_norm': 2.8478691577911377, 'learning_rate': 1.5799741602067183e-05, 'epoch': 102.6}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1165997982025146, 'eval_runtime': 81.2597, 'eval_samples_per_second': 7.519, 'eval_steps_per_second': 0.48, 'epoch': 103.0}
{'loss': 2.5973, 'grad_norm': 2.82112455368042, 'learning_rate': 1.5467700258397932e-05, 'epoch': 103.6}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1074142456054688, 'eval_runtime': 81.6477, 'eval_samples_per_second': 7.483, 'eval_steps_per_second': 0.478, 'epoch': 104.0}
{'loss': 2.5972, 'grad_norm': 3.0500833988189697, 'learning_rate': 1.5135658914728684e-05, 'epoch': 104.59}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.105523109436035, 'eval_runtime': 81.2522, 'eval_samples_per_second': 7.52, 'eval_steps_per_second': 0.48, 'epoch': 105.0}
{'loss': 2.595, 'grad_norm': 2.3583061695098877, 'learning_rate': 1.4803617571059433e-05, 'epoch': 105.59}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1115148067474365, 'eval_runtime': 81.8347, 'eval_samples_per_second': 7.466, 'eval_steps_per_second': 0.477, 'epoch': 106.0}
{'loss': 2.5838, 'grad_norm': 2.4742558002471924, 'learning_rate': 1.4471576227390182e-05, 'epoch': 106.59}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.100160837173462, 'eval_runtime': 81.8995, 'eval_samples_per_second': 7.46, 'eval_steps_per_second': 0.476, 'epoch': 107.0}
{'loss': 2.5756, 'grad_norm': 2.3608224391937256, 'learning_rate': 1.4139534883720931e-05, 'epoch': 107.58}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.108240842819214, 'eval_runtime': 82.8468, 'eval_samples_per_second': 7.375, 'eval_steps_per_second': 0.471, 'epoch': 108.0}
{'loss': 2.5842, 'grad_norm': 2.945906162261963, 'learning_rate': 1.380749354005168e-05, 'epoch': 108.58}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1045658588409424, 'eval_runtime': 81.031, 'eval_samples_per_second': 7.54, 'eval_steps_per_second': 0.481, 'epoch': 109.0}
{'loss': 2.5779, 'grad_norm': 2.6371653079986572, 'learning_rate': 1.347545219638243e-05, 'epoch': 109.57}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1054718494415283, 'eval_runtime': 80.7059, 'eval_samples_per_second': 7.571, 'eval_steps_per_second': 0.483, 'epoch': 110.0}
{'loss': 2.5541, 'grad_norm': 3.428496837615967, 'learning_rate': 1.3143410852713178e-05, 'epoch': 110.57}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1032605171203613, 'eval_runtime': 83.5238, 'eval_samples_per_second': 7.315, 'eval_steps_per_second': 0.467, 'epoch': 111.0}
{'loss': 2.5632, 'grad_norm': 3.2862846851348877, 'learning_rate': 1.2811369509043927e-05, 'epoch': 111.57}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.104883909225464, 'eval_runtime': 80.6356, 'eval_samples_per_second': 7.577, 'eval_steps_per_second': 0.484, 'epoch': 112.0}
{'loss': 2.5616, 'grad_norm': 2.7198941707611084, 'learning_rate': 1.2479328165374678e-05, 'epoch': 112.56}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.108600616455078, 'eval_runtime': 83.7506, 'eval_samples_per_second': 7.295, 'eval_steps_per_second': 0.466, 'epoch': 113.0}
{'loss': 2.5567, 'grad_norm': 2.614652395248413, 'learning_rate': 1.2147286821705427e-05, 'epoch': 113.56}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1003129482269287, 'eval_runtime': 68.1093, 'eval_samples_per_second': 8.971, 'eval_steps_per_second': 0.573, 'epoch': 114.0}
{'loss': 2.5557, 'grad_norm': 2.538710355758667, 'learning_rate': 1.1815245478036176e-05, 'epoch': 114.55}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.099813222885132, 'eval_runtime': 68.0636, 'eval_samples_per_second': 8.977, 'eval_steps_per_second': 0.573, 'epoch': 115.0}
{'loss': 2.5372, 'grad_norm': 3.08599591255188, 'learning_rate': 1.1483204134366927e-05, 'epoch': 115.55}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1019225120544434, 'eval_runtime': 66.1858, 'eval_samples_per_second': 9.232, 'eval_steps_per_second': 0.589, 'epoch': 116.0}
{'loss': 2.552, 'grad_norm': 3.0186350345611572, 'learning_rate': 1.1151162790697676e-05, 'epoch': 116.55}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.097144842147827, 'eval_runtime': 69.1904, 'eval_samples_per_second': 8.831, 'eval_steps_per_second': 0.564, 'epoch': 117.0}
{'loss': 2.5478, 'grad_norm': 2.7494924068450928, 'learning_rate': 1.0819121447028425e-05, 'epoch': 117.54}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0923235416412354, 'eval_runtime': 53.2417, 'eval_samples_per_second': 11.476, 'eval_steps_per_second': 0.733, 'epoch': 118.0}
{'loss': 2.5352, 'grad_norm': 2.4974255561828613, 'learning_rate': 1.0487080103359174e-05, 'epoch': 118.54}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.094747304916382, 'eval_runtime': 278.0517, 'eval_samples_per_second': 2.197, 'eval_steps_per_second': 0.14, 'epoch': 119.0}
{'loss': 2.5313, 'grad_norm': 2.7061448097229004, 'learning_rate': 1.0155038759689924e-05, 'epoch': 119.53}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.097052574157715, 'eval_runtime': 168.8871, 'eval_samples_per_second': 3.618, 'eval_steps_per_second': 0.231, 'epoch': 120.0}
{'loss': 2.5209, 'grad_norm': 2.442615270614624, 'learning_rate': 9.822997416020673e-06, 'epoch': 120.53}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.095480442047119, 'eval_runtime': 163.2364, 'eval_samples_per_second': 3.743, 'eval_steps_per_second': 0.239, 'epoch': 121.0}
{'loss': 2.5271, 'grad_norm': 2.5839884281158447, 'learning_rate': 9.490956072351422e-06, 'epoch': 121.53}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.097426414489746, 'eval_runtime': 76.5367, 'eval_samples_per_second': 7.983, 'eval_steps_per_second': 0.51, 'epoch': 122.0}
{'loss': 2.5185, 'grad_norm': 2.720186948776245, 'learning_rate': 9.158914728682171e-06, 'epoch': 122.52}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0943329334259033, 'eval_runtime': 76.4513, 'eval_samples_per_second': 7.992, 'eval_steps_per_second': 0.51, 'epoch': 123.0}
{'loss': 2.5067, 'grad_norm': 3.2131564617156982, 'learning_rate': 8.82687338501292e-06, 'epoch': 123.52}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0957210063934326, 'eval_runtime': 76.4914, 'eval_samples_per_second': 7.988, 'eval_steps_per_second': 0.51, 'epoch': 124.0}
{'loss': 2.51, 'grad_norm': 2.898455858230591, 'learning_rate': 8.49483204134367e-06, 'epoch': 124.52}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.095294713973999, 'eval_runtime': 86.0027, 'eval_samples_per_second': 7.104, 'eval_steps_per_second': 0.453, 'epoch': 125.0}
{'loss': 2.5129, 'grad_norm': 2.758087396621704, 'learning_rate': 8.162790697674418e-06, 'epoch': 125.51}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0949528217315674, 'eval_runtime': 74.5811, 'eval_samples_per_second': 8.192, 'eval_steps_per_second': 0.523, 'epoch': 126.0}
{'loss': 2.5057, 'grad_norm': 2.730203866958618, 'learning_rate': 7.830749354005167e-06, 'epoch': 126.51}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0996880531311035, 'eval_runtime': 144.7573, 'eval_samples_per_second': 4.221, 'eval_steps_per_second': 0.269, 'epoch': 127.0}
{'loss': 2.4999, 'grad_norm': 3.6677534580230713, 'learning_rate': 7.498708010335918e-06, 'epoch': 127.5}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0987448692321777, 'eval_runtime': 124.2624, 'eval_samples_per_second': 4.917, 'eval_steps_per_second': 0.314, 'epoch': 128.0}
{'loss': 2.513, 'grad_norm': 2.6768958568573, 'learning_rate': 7.166666666666667e-06, 'epoch': 128.5}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0933735370635986, 'eval_runtime': 154.9701, 'eval_samples_per_second': 3.943, 'eval_steps_per_second': 0.252, 'epoch': 129.0}
{'loss': 2.4962, 'grad_norm': 3.0395901203155518, 'learning_rate': 6.834625322997417e-06, 'epoch': 129.5}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0896427631378174, 'eval_runtime': 282.0423, 'eval_samples_per_second': 2.166, 'eval_steps_per_second': 0.138, 'epoch': 130.0}
{'loss': 2.4837, 'grad_norm': 2.8793299198150635, 'learning_rate': 6.502583979328166e-06, 'epoch': 130.49}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0884714126586914, 'eval_runtime': 99.1827, 'eval_samples_per_second': 6.16, 'eval_steps_per_second': 0.393, 'epoch': 131.0}
{'loss': 2.5019, 'grad_norm': 3.1750247478485107, 'learning_rate': 6.170542635658915e-06, 'epoch': 131.49}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.09600567817688, 'eval_runtime': 81.963, 'eval_samples_per_second': 7.455, 'eval_steps_per_second': 0.476, 'epoch': 132.0}
{'loss': 2.4993, 'grad_norm': 2.740421772003174, 'learning_rate': 5.838501291989664e-06, 'epoch': 132.48}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0974977016448975, 'eval_runtime': 81.6206, 'eval_samples_per_second': 7.486, 'eval_steps_per_second': 0.478, 'epoch': 133.0}
{'loss': 2.4804, 'grad_norm': 2.9481518268585205, 'learning_rate': 5.506459948320414e-06, 'epoch': 133.48}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.1010375022888184, 'eval_runtime': 85.4719, 'eval_samples_per_second': 7.149, 'eval_steps_per_second': 0.456, 'epoch': 134.0}
{'loss': 2.4895, 'grad_norm': 3.011680841445923, 'learning_rate': 5.174418604651163e-06, 'epoch': 134.48}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0973100662231445, 'eval_runtime': 81.9153, 'eval_samples_per_second': 7.459, 'eval_steps_per_second': 0.476, 'epoch': 135.0}
{'loss': 2.4913, 'grad_norm': 3.510786533355713, 'learning_rate': 4.8423772609819125e-06, 'epoch': 135.47}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0982894897460938, 'eval_runtime': 81.8677, 'eval_samples_per_second': 7.463, 'eval_steps_per_second': 0.476, 'epoch': 136.0}
{'loss': 2.4974, 'grad_norm': 2.7398552894592285, 'learning_rate': 4.5103359173126615e-06, 'epoch': 136.47}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.09431791305542, 'eval_runtime': 81.7928, 'eval_samples_per_second': 7.47, 'eval_steps_per_second': 0.477, 'epoch': 137.0}
{'loss': 2.48, 'grad_norm': 2.823324680328369, 'learning_rate': 4.178294573643411e-06, 'epoch': 137.47}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0947422981262207, 'eval_runtime': 82.6742, 'eval_samples_per_second': 7.39, 'eval_steps_per_second': 0.472, 'epoch': 138.0}
{'loss': 2.4858, 'grad_norm': 3.3818037509918213, 'learning_rate': 3.84625322997416e-06, 'epoch': 138.46}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0937092304229736, 'eval_runtime': 84.8319, 'eval_samples_per_second': 7.202, 'eval_steps_per_second': 0.46, 'epoch': 139.0}
{'loss': 2.4737, 'grad_norm': 3.0319454669952393, 'learning_rate': 3.5142118863049097e-06, 'epoch': 139.46}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0954997539520264, 'eval_runtime': 80.8899, 'eval_samples_per_second': 7.553, 'eval_steps_per_second': 0.482, 'epoch': 140.0}
{'loss': 2.4805, 'grad_norm': 3.165743589401245, 'learning_rate': 3.1821705426356587e-06, 'epoch': 140.45}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0926311016082764, 'eval_runtime': 82.0511, 'eval_samples_per_second': 7.447, 'eval_steps_per_second': 0.475, 'epoch': 141.0}
{'loss': 2.4746, 'grad_norm': 2.343999147415161, 'learning_rate': 2.850129198966408e-06, 'epoch': 141.45}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0923500061035156, 'eval_runtime': 83.0931, 'eval_samples_per_second': 7.353, 'eval_steps_per_second': 0.469, 'epoch': 142.0}
{'loss': 2.4951, 'grad_norm': 3.074157238006592, 'learning_rate': 2.5180878552971575e-06, 'epoch': 142.45}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0940515995025635, 'eval_runtime': 82.6431, 'eval_samples_per_second': 7.393, 'eval_steps_per_second': 0.472, 'epoch': 143.0}
{'loss': 2.4688, 'grad_norm': 2.6052308082580566, 'learning_rate': 2.186046511627907e-06, 'epoch': 143.44}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.096784830093384, 'eval_runtime': 82.6072, 'eval_samples_per_second': 7.396, 'eval_steps_per_second': 0.472, 'epoch': 144.0}
{'loss': 2.4595, 'grad_norm': 2.9565885066986084, 'learning_rate': 1.8540051679586564e-06, 'epoch': 144.44}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0943071842193604, 'eval_runtime': 82.9937, 'eval_samples_per_second': 7.362, 'eval_steps_per_second': 0.47, 'epoch': 145.0}
{'loss': 2.4814, 'grad_norm': 3.3863236904144287, 'learning_rate': 1.5219638242894056e-06, 'epoch': 145.43}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.09494686126709, 'eval_runtime': 88.6146, 'eval_samples_per_second': 6.895, 'eval_steps_per_second': 0.44, 'epoch': 146.0}
{'loss': 2.4767, 'grad_norm': 2.93894624710083, 'learning_rate': 1.189922480620155e-06, 'epoch': 146.43}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.095057964324951, 'eval_runtime': 83.9052, 'eval_samples_per_second': 7.282, 'eval_steps_per_second': 0.465, 'epoch': 147.0}
{'loss': 2.463, 'grad_norm': 2.7098071575164795, 'learning_rate': 8.578811369509044e-07, 'epoch': 147.43}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0957300662994385, 'eval_runtime': 83.4944, 'eval_samples_per_second': 7.318, 'eval_steps_per_second': 0.467, 'epoch': 148.0}
{'loss': 2.4871, 'grad_norm': 2.733576536178589, 'learning_rate': 5.258397932816537e-07, 'epoch': 148.42}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.094539165496826, 'eval_runtime': 77.9597, 'eval_samples_per_second': 7.837, 'eval_steps_per_second': 0.5, 'epoch': 149.0}
{'loss': 2.4615, 'grad_norm': 2.7204573154449463, 'learning_rate': 1.9379844961240311e-07, 'epoch': 149.42}


  0%|          | 0/39 [00:00<?, ?it/s]

{'eval_loss': 3.0947484970092773, 'eval_runtime': 77.1823, 'eval_samples_per_second': 7.916, 'eval_steps_per_second': 0.505, 'epoch': 150.0}
{'train_runtime': 347951.0235, 'train_samples_per_second': 1.779, 'train_steps_per_second': 0.111, 'train_loss': 3.200120762866905, 'epoch': 150.0}


TrainOutput(global_step=38700, training_loss=3.200120762866905, metrics={'train_runtime': 347951.0235, 'train_samples_per_second': 1.779, 'train_steps_per_second': 0.111, 'total_flos': 7396924178995200.0, 'train_loss': 3.200120762866905, 'epoch': 150.0})

In [41]:
from transformers import MT5ForConditionalGeneration, T5Tokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch

# Load the pre-trained mT5 model and tokenizer
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch_size, collate_fn=data_collator)


# Function to generate summaries using the mT5 model
def generate_summary(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate summaries for the test inputs
generated_summaries = [generate_summary(text) for text in inputs]

# Function to tokenize summaries
def tokenize_text(text):
    return tokenizer(text, return_tensors="pt", padding=True, truncation=True).input_ids[0].tolist()

# Tokenize reference and generated summaries
tokenized_references = [tokenize_text(summary) for summary in reference_summaries]
tokenized_generated = [tokenize_text(summary) for summary in generated_summaries]

# Flatten lists for comparison
def flatten(l):
    return [item for sublist in l for item in sublist]

flat_labels = flatten(tokenized_references)
flat_preds = flatten(tokenized_generated)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(flat_labels, flat_preds)
precision = precision_score(flat_labels, flat_preds, average='weighted', zero_division=0)
recall = recall_score(flat_labels, flat_preds, average='weighted', zero_division=0)
f1 = f1_score(flat_labels, flat_preds, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: nan
Precision: nan
Recall: nan
F1 Score: nan


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


In [43]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch

# Load the model and tokenizer
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = MT5Tokenizer.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define DataLoader for evaluation
batch_size = 8  # Adjust as needed
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=batch_size, collate_fn=data_collator)

def generate_and_decode_predictions(dataloader, model, tokenizer):
    model.eval()  # Set model to evaluation mode

    all_labels = []
    all_preds = []

    for batch in dataloader:
        # Move batch to device (e.g., GPU) if available
        input_ids = batch['input_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)
        labels = batch['labels'].to(model.device)

        # Generate summaries
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )

        # Decode predictions and labels
        decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Collect predictions and labels
        all_preds.extend(decoded_preds)
        all_labels.extend(decoded_labels)

    return all_labels, all_preds

def compute_metrics(predictions, references):
    # Tokenize text to compute metrics at the token level
    tokenized_preds = [tokenizer.encode(pred, add_special_tokens=False) for pred in predictions]
    tokenized_refs = [tokenizer.encode(ref, add_special_tokens=False) for ref in references]

    # Flatten lists for comparison
    flat_preds = [token for sublist in tokenized_preds for token in sublist]
    flat_refs = [token for sublist in tokenized_refs for token in sublist]

    # Compute metrics
    accuracy = accuracy_score(flat_refs, flat_preds)
    precision = precision_score(flat_refs, flat_preds, average='weighted', zero_division=0)
    recall = recall_score(flat_refs, flat_preds, average='weighted', zero_division=0)
    f1 = f1_score(flat_refs, flat_preds, average='weighted', zero_division=0)

    return accuracy, precision, recall, f1

# Generate predictions and decode
references, predictions = generate_and_decode_predictions(eval_dataloader, model, tokenizer)

# Compute and print metrics
accuracy, precision, recall, f1 = compute_metrics(predictions, references)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`source_text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).