In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [2]:
df_huffpo = pd.read_parquet("../../data/raw/news_categories.parquet", columns=["link", "category"])

In [3]:
df_huffpo["slug"] = df_huffpo.link.apply(lambda x: x.split("entry/")[-1].split("_")[0].replace("-", " "))

In [4]:
pcts = (df_huffpo.groupby("category").count() / len(df_huffpo)) * 100
df_huffpo = df_huffpo[df_huffpo.category.isin(pcts[pcts.link >= 2].index)]

In [5]:
le = LabelEncoder()
df_huffpo['label_encoded'] = le.fit_transform(df_huffpo['category'])

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(df_huffpo['slug'], df_huffpo['label_encoded'], test_size=0.3)
val_texts, test_texts, val_labels, test_labels = train_test_split(val_texts, val_labels, test_size=0.33)

In [7]:
# Load tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(le.classes_))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

In [9]:
# Prepare datasets
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

In [10]:
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 100916/100916 [00:07<00:00, 13047.88 examples/s]
Map: 100%|██████████| 28978/28978 [00:02<00:00, 12503.46 examples/s]


In [24]:
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels})
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 14273/14273 [00:01<00:00, 12835.39 examples/s]


In [15]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [19]:
device = torch.device("mps")
model = model.to(device)

In [21]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

In [22]:
# Train the model
trainer.train()

  0%|          | 7/18924 [02:50<128:01:52, 24.36s/it]
  0%|          | 10/18924 [00:18<9:21:47,  1.78s/it]

{'loss': 2.6709, 'grad_norm': 1.8923208713531494, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 20/18924 [00:36<9:24:20,  1.79s/it]

{'loss': 2.6637, 'grad_norm': 2.2499585151672363, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 30/18924 [00:54<9:19:16,  1.78s/it]

{'loss': 2.6558, 'grad_norm': 2.4395265579223633, 'learning_rate': 3e-06, 'epoch': 0.0}


  0%|          | 40/18924 [01:11<9:17:13,  1.77s/it]

{'loss': 2.6425, 'grad_norm': 2.219957113265991, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


  0%|          | 50/18924 [01:29<9:18:03,  1.77s/it]

{'loss': 2.6306, 'grad_norm': 1.9474124908447266, 'learning_rate': 5e-06, 'epoch': 0.01}


  0%|          | 60/18924 [01:47<9:19:42,  1.78s/it]

{'loss': 2.598, 'grad_norm': 2.2843143939971924, 'learning_rate': 6e-06, 'epoch': 0.01}


  0%|          | 70/18924 [02:05<9:18:27,  1.78s/it]

{'loss': 2.5349, 'grad_norm': 2.9953811168670654, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}


  0%|          | 80/18924 [02:23<9:26:02,  1.80s/it]

{'loss': 2.4464, 'grad_norm': 3.8089656829833984, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


  0%|          | 90/18924 [02:41<9:22:43,  1.79s/it]

{'loss': 2.4226, 'grad_norm': 3.675827741622925, 'learning_rate': 9e-06, 'epoch': 0.01}


  1%|          | 100/18924 [02:59<9:30:32,  1.82s/it]

{'loss': 2.3895, 'grad_norm': 4.112186431884766, 'learning_rate': 1e-05, 'epoch': 0.02}


  1%|          | 110/18924 [03:17<9:30:41,  1.82s/it]

{'loss': 2.3024, 'grad_norm': 3.5832345485687256, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.02}


  1%|          | 120/18924 [03:35<9:14:11,  1.77s/it]

{'loss': 2.186, 'grad_norm': 3.8176777362823486, 'learning_rate': 1.2e-05, 'epoch': 0.02}


  1%|          | 130/18924 [03:52<9:14:16,  1.77s/it]

{'loss': 2.2714, 'grad_norm': 3.9990954399108887, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.02}


  1%|          | 140/18924 [04:10<9:16:07,  1.78s/it]

{'loss': 2.0757, 'grad_norm': 4.18874454498291, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.02}


  1%|          | 150/18924 [04:28<9:22:40,  1.80s/it]

{'loss': 2.078, 'grad_norm': 7.521965980529785, 'learning_rate': 1.5e-05, 'epoch': 0.02}


  1%|          | 160/18924 [04:47<10:26:05,  2.00s/it]

{'loss': 1.9643, 'grad_norm': 6.082216739654541, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.03}


  1%|          | 170/18924 [05:09<11:09:59,  2.14s/it]

{'loss': 1.9232, 'grad_norm': 3.896890878677368, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.03}


  1%|          | 180/18924 [05:30<10:51:22,  2.09s/it]

{'loss': 1.8579, 'grad_norm': 5.013913631439209, 'learning_rate': 1.8e-05, 'epoch': 0.03}


  1%|          | 190/18924 [05:50<10:30:13,  2.02s/it]

{'loss': 2.0123, 'grad_norm': 6.109916687011719, 'learning_rate': 1.9e-05, 'epoch': 0.03}


  1%|          | 200/18924 [06:12<11:15:23,  2.16s/it]

{'loss': 1.7719, 'grad_norm': 10.266648292541504, 'learning_rate': 2e-05, 'epoch': 0.03}


  1%|          | 210/18924 [06:33<10:44:08,  2.07s/it]

{'loss': 1.8675, 'grad_norm': 19.157190322875977, 'learning_rate': 2.1e-05, 'epoch': 0.03}


  1%|          | 220/18924 [06:53<10:30:17,  2.02s/it]

{'loss': 1.905, 'grad_norm': 7.266666889190674, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.03}


  1%|          | 230/18924 [07:13<10:30:02,  2.02s/it]

{'loss': 1.622, 'grad_norm': 6.935032367706299, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.04}


  1%|▏         | 240/18924 [07:33<10:18:29,  1.99s/it]

{'loss': 1.7245, 'grad_norm': 7.292938232421875, 'learning_rate': 2.4e-05, 'epoch': 0.04}


  1%|▏         | 250/18924 [07:54<10:53:03,  2.10s/it]

{'loss': 1.6663, 'grad_norm': 6.3059163093566895, 'learning_rate': 2.5e-05, 'epoch': 0.04}


  1%|▏         | 260/18924 [08:14<10:39:04,  2.05s/it]

{'loss': 1.7482, 'grad_norm': 10.305374145507812, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.04}


  1%|▏         | 270/18924 [08:35<10:55:13,  2.11s/it]

{'loss': 1.7036, 'grad_norm': 8.161866188049316, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.04}


  1%|▏         | 280/18924 [08:56<10:53:45,  2.10s/it]

{'loss': 1.7081, 'grad_norm': 7.282687664031982, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.04}


  2%|▏         | 290/18924 [09:17<10:34:03,  2.04s/it]

{'loss': 1.7759, 'grad_norm': 8.593811988830566, 'learning_rate': 2.9e-05, 'epoch': 0.05}


  2%|▏         | 300/18924 [09:37<10:46:23,  2.08s/it]

{'loss': 1.5602, 'grad_norm': 12.541396141052246, 'learning_rate': 3e-05, 'epoch': 0.05}


  2%|▏         | 310/18924 [09:59<11:12:58,  2.17s/it]

{'loss': 1.6967, 'grad_norm': 13.69345760345459, 'learning_rate': 3.1e-05, 'epoch': 0.05}


  2%|▏         | 320/18924 [10:22<11:27:57,  2.22s/it]

{'loss': 1.6537, 'grad_norm': 9.574458122253418, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.05}


  2%|▏         | 330/18924 [10:43<11:10:18,  2.16s/it]

{'loss': 1.4925, 'grad_norm': 10.093875885009766, 'learning_rate': 3.3e-05, 'epoch': 0.05}


  2%|▏         | 340/18924 [11:04<10:50:28,  2.10s/it]

{'loss': 1.4749, 'grad_norm': 5.4586029052734375, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.05}


  2%|▏         | 350/18924 [11:25<10:39:57,  2.07s/it]

{'loss': 1.4333, 'grad_norm': 6.526297092437744, 'learning_rate': 3.5e-05, 'epoch': 0.06}


  2%|▏         | 360/18924 [11:46<10:45:07,  2.09s/it]

{'loss': 1.6249, 'grad_norm': 6.481968402862549, 'learning_rate': 3.6e-05, 'epoch': 0.06}


  2%|▏         | 370/18924 [12:08<11:18:48,  2.20s/it]

{'loss': 1.5011, 'grad_norm': 9.924278259277344, 'learning_rate': 3.7e-05, 'epoch': 0.06}


  2%|▏         | 380/18924 [12:29<10:54:10,  2.12s/it]

{'loss': 1.376, 'grad_norm': 8.255733489990234, 'learning_rate': 3.8e-05, 'epoch': 0.06}


  2%|▏         | 390/18924 [12:50<10:54:12,  2.12s/it]

{'loss': 1.4207, 'grad_norm': 6.2454118728637695, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.06}


  2%|▏         | 400/18924 [13:11<10:51:42,  2.11s/it]

{'loss': 1.3978, 'grad_norm': 11.513806343078613, 'learning_rate': 4e-05, 'epoch': 0.06}


  2%|▏         | 410/18924 [13:32<10:40:44,  2.08s/it]

{'loss': 1.4581, 'grad_norm': 10.064947128295898, 'learning_rate': 4.1e-05, 'epoch': 0.06}


  2%|▏         | 420/18924 [13:53<10:44:06,  2.09s/it]

{'loss': 1.311, 'grad_norm': 10.370902061462402, 'learning_rate': 4.2e-05, 'epoch': 0.07}


  2%|▏         | 430/18924 [14:14<10:59:15,  2.14s/it]

{'loss': 1.462, 'grad_norm': 11.784512519836426, 'learning_rate': 4.3e-05, 'epoch': 0.07}


  2%|▏         | 440/18924 [14:35<10:47:11,  2.10s/it]

{'loss': 1.4057, 'grad_norm': 14.936873435974121, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.07}


  2%|▏         | 450/18924 [14:56<10:33:59,  2.06s/it]

{'loss': 1.3702, 'grad_norm': 10.70354175567627, 'learning_rate': 4.5e-05, 'epoch': 0.07}


  2%|▏         | 460/18924 [15:17<10:38:42,  2.08s/it]

{'loss': 1.3734, 'grad_norm': 7.0921430587768555, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.07}


  2%|▏         | 470/18924 [15:38<10:44:30,  2.10s/it]

{'loss': 1.2926, 'grad_norm': 8.384232521057129, 'learning_rate': 4.7e-05, 'epoch': 0.07}


  3%|▎         | 480/18924 [15:59<11:02:11,  2.15s/it]

{'loss': 1.3738, 'grad_norm': 7.936601161956787, 'learning_rate': 4.8e-05, 'epoch': 0.08}


  3%|▎         | 490/18924 [16:21<11:12:25,  2.19s/it]

{'loss': 1.5638, 'grad_norm': 6.559786796569824, 'learning_rate': 4.9e-05, 'epoch': 0.08}


  3%|▎         | 500/18924 [16:43<11:05:19,  2.17s/it]

{'loss': 1.412, 'grad_norm': 8.767770767211914, 'learning_rate': 5e-05, 'epoch': 0.08}


  3%|▎         | 510/18924 [17:05<11:07:58,  2.18s/it]

{'loss': 1.29, 'grad_norm': 12.186182975769043, 'learning_rate': 4.997286148501954e-05, 'epoch': 0.08}


  3%|▎         | 520/18924 [17:31<14:25:14,  2.82s/it]

{'loss': 1.3521, 'grad_norm': 12.016179084777832, 'learning_rate': 4.9945722970039084e-05, 'epoch': 0.08}


  3%|▎         | 530/18924 [17:58<13:15:42,  2.60s/it]

{'loss': 1.3883, 'grad_norm': 8.661683082580566, 'learning_rate': 4.991858445505862e-05, 'epoch': 0.08}


  3%|▎         | 540/18924 [18:21<11:17:43,  2.21s/it]

{'loss': 1.1602, 'grad_norm': 5.102187156677246, 'learning_rate': 4.9891445940078166e-05, 'epoch': 0.09}


  3%|▎         | 550/18924 [18:42<11:06:45,  2.18s/it]

{'loss': 1.2471, 'grad_norm': 8.402748107910156, 'learning_rate': 4.98643074250977e-05, 'epoch': 0.09}


  3%|▎         | 560/18924 [19:04<11:12:57,  2.20s/it]

{'loss': 1.291, 'grad_norm': 7.676553726196289, 'learning_rate': 4.983716891011724e-05, 'epoch': 0.09}


  3%|▎         | 570/18924 [19:26<11:19:09,  2.22s/it]

{'loss': 1.2697, 'grad_norm': 10.19295883178711, 'learning_rate': 4.981003039513678e-05, 'epoch': 0.09}


  3%|▎         | 580/18924 [19:49<12:00:36,  2.36s/it]

{'loss': 1.149, 'grad_norm': 10.003117561340332, 'learning_rate': 4.9782891880156316e-05, 'epoch': 0.09}


  3%|▎         | 590/18924 [20:12<11:34:30,  2.27s/it]

{'loss': 1.3098, 'grad_norm': 7.7847442626953125, 'learning_rate': 4.975575336517586e-05, 'epoch': 0.09}


  3%|▎         | 600/18924 [20:34<11:27:21,  2.25s/it]

{'loss': 1.3147, 'grad_norm': 11.087762832641602, 'learning_rate': 4.97286148501954e-05, 'epoch': 0.1}


  3%|▎         | 610/18924 [20:58<12:05:19,  2.38s/it]

{'loss': 1.0221, 'grad_norm': 10.125154495239258, 'learning_rate': 4.970147633521494e-05, 'epoch': 0.1}


  3%|▎         | 620/18924 [21:21<11:41:31,  2.30s/it]

{'loss': 1.1935, 'grad_norm': 12.284809112548828, 'learning_rate': 4.967433782023448e-05, 'epoch': 0.1}


  3%|▎         | 630/18924 [21:44<11:19:34,  2.23s/it]

{'loss': 1.4025, 'grad_norm': 4.677879810333252, 'learning_rate': 4.964719930525402e-05, 'epoch': 0.1}


  3%|▎         | 640/18924 [22:06<11:20:59,  2.23s/it]

{'loss': 1.0898, 'grad_norm': 7.482612609863281, 'learning_rate': 4.962006079027356e-05, 'epoch': 0.1}


  3%|▎         | 650/18924 [22:30<13:00:17,  2.56s/it]

{'loss': 1.2724, 'grad_norm': 9.732070922851562, 'learning_rate': 4.95929222752931e-05, 'epoch': 0.1}


  3%|▎         | 660/18924 [22:55<11:44:49,  2.32s/it]

{'loss': 1.3049, 'grad_norm': 9.227788925170898, 'learning_rate': 4.956578376031264e-05, 'epoch': 0.1}


  4%|▎         | 670/18924 [23:17<11:32:32,  2.28s/it]

{'loss': 1.1129, 'grad_norm': 14.817119598388672, 'learning_rate': 4.953864524533218e-05, 'epoch': 0.11}


  4%|▎         | 680/18924 [23:39<11:06:14,  2.19s/it]

{'loss': 1.404, 'grad_norm': 6.907465934753418, 'learning_rate': 4.951150673035172e-05, 'epoch': 0.11}


  4%|▎         | 690/18924 [24:04<11:47:19,  2.33s/it]

{'loss': 1.1808, 'grad_norm': 7.813426494598389, 'learning_rate': 4.9484368215371255e-05, 'epoch': 0.11}


  4%|▎         | 700/18924 [24:27<12:00:18,  2.37s/it]

{'loss': 1.2768, 'grad_norm': 7.456671237945557, 'learning_rate': 4.945722970039079e-05, 'epoch': 0.11}


  4%|▍         | 710/18924 [24:52<11:50:59,  2.34s/it]

{'loss': 1.1774, 'grad_norm': 12.846640586853027, 'learning_rate': 4.943009118541034e-05, 'epoch': 0.11}


  4%|▍         | 720/18924 [25:13<10:34:38,  2.09s/it]

{'loss': 1.0652, 'grad_norm': 5.957202911376953, 'learning_rate': 4.9402952670429874e-05, 'epoch': 0.11}


  4%|▍         | 730/18924 [25:33<10:08:09,  2.01s/it]

{'loss': 1.1533, 'grad_norm': 7.772305488586426, 'learning_rate': 4.937581415544942e-05, 'epoch': 0.12}


  4%|▍         | 740/18924 [25:53<10:13:39,  2.02s/it]

{'loss': 1.3382, 'grad_norm': 6.094491958618164, 'learning_rate': 4.9348675640468956e-05, 'epoch': 0.12}


  4%|▍         | 750/18924 [26:14<10:26:54,  2.07s/it]

{'loss': 1.1015, 'grad_norm': 11.939891815185547, 'learning_rate': 4.93215371254885e-05, 'epoch': 0.12}


  4%|▍         | 760/18924 [26:35<10:27:57,  2.07s/it]

{'loss': 1.2837, 'grad_norm': 8.276843070983887, 'learning_rate': 4.929439861050804e-05, 'epoch': 0.12}


  4%|▍         | 770/18924 [26:55<10:23:16,  2.06s/it]

{'loss': 1.0025, 'grad_norm': 9.031243324279785, 'learning_rate': 4.9267260095527575e-05, 'epoch': 0.12}


  4%|▍         | 780/18924 [27:16<10:26:48,  2.07s/it]

{'loss': 1.3117, 'grad_norm': 13.373053550720215, 'learning_rate': 4.924012158054712e-05, 'epoch': 0.12}


  4%|▍         | 790/18924 [27:38<11:20:02,  2.25s/it]

{'loss': 1.2017, 'grad_norm': 5.797341346740723, 'learning_rate': 4.921298306556666e-05, 'epoch': 0.13}


  4%|▍         | 800/18924 [28:00<10:52:21,  2.16s/it]

{'loss': 1.1602, 'grad_norm': 8.903548240661621, 'learning_rate': 4.9185844550586194e-05, 'epoch': 0.13}


  4%|▍         | 810/18924 [28:21<10:48:08,  2.15s/it]

{'loss': 1.3195, 'grad_norm': 11.140593528747559, 'learning_rate': 4.915870603560573e-05, 'epoch': 0.13}


  4%|▍         | 820/18924 [28:42<10:42:11,  2.13s/it]

{'loss': 1.1565, 'grad_norm': 8.672099113464355, 'learning_rate': 4.913156752062527e-05, 'epoch': 0.13}


  4%|▍         | 830/18924 [29:04<10:55:55,  2.18s/it]

{'loss': 1.2744, 'grad_norm': 9.288698196411133, 'learning_rate': 4.9104429005644814e-05, 'epoch': 0.13}


  4%|▍         | 840/18924 [29:28<11:01:16,  2.19s/it]

{'loss': 1.3615, 'grad_norm': 14.5015869140625, 'learning_rate': 4.907729049066435e-05, 'epoch': 0.13}


  4%|▍         | 850/18924 [29:49<10:45:15,  2.14s/it]

{'loss': 1.2133, 'grad_norm': 10.426215171813965, 'learning_rate': 4.9050151975683895e-05, 'epoch': 0.13}


  5%|▍         | 860/18924 [30:10<10:51:15,  2.16s/it]

{'loss': 1.1062, 'grad_norm': 9.92881965637207, 'learning_rate': 4.902301346070343e-05, 'epoch': 0.14}


  5%|▍         | 870/18924 [30:34<11:56:56,  2.38s/it]

{'loss': 1.1684, 'grad_norm': 10.331197738647461, 'learning_rate': 4.899587494572298e-05, 'epoch': 0.14}


  5%|▍         | 880/18924 [30:54<9:44:08,  1.94s/it] 

{'loss': 1.0354, 'grad_norm': 7.277226448059082, 'learning_rate': 4.8968736430742514e-05, 'epoch': 0.14}


  5%|▍         | 890/18924 [31:13<9:12:51,  1.84s/it]

{'loss': 1.0393, 'grad_norm': 12.967745780944824, 'learning_rate': 4.894159791576205e-05, 'epoch': 0.14}


  5%|▍         | 900/18924 [31:31<9:09:08,  1.83s/it]

{'loss': 1.3093, 'grad_norm': 9.469196319580078, 'learning_rate': 4.891445940078159e-05, 'epoch': 0.14}


  5%|▍         | 910/18924 [31:50<9:31:01,  1.90s/it]

{'loss': 1.2596, 'grad_norm': 13.948942184448242, 'learning_rate': 4.888732088580113e-05, 'epoch': 0.14}


  5%|▍         | 920/18924 [32:10<10:10:27,  2.03s/it]

{'loss': 1.0649, 'grad_norm': 7.835845470428467, 'learning_rate': 4.886018237082067e-05, 'epoch': 0.15}


  5%|▍         | 930/18924 [32:30<9:39:11,  1.93s/it] 

{'loss': 1.1045, 'grad_norm': 8.637633323669434, 'learning_rate': 4.883304385584021e-05, 'epoch': 0.15}


  5%|▍         | 940/18924 [32:49<9:32:06,  1.91s/it]

{'loss': 1.1047, 'grad_norm': 11.24156379699707, 'learning_rate': 4.8805905340859746e-05, 'epoch': 0.15}


  5%|▌         | 950/18924 [33:08<9:44:25,  1.95s/it]

{'loss': 1.1603, 'grad_norm': 13.253399848937988, 'learning_rate': 4.877876682587929e-05, 'epoch': 0.15}


  5%|▌         | 960/18924 [33:28<9:48:57,  1.97s/it]

{'loss': 1.2062, 'grad_norm': 6.958517074584961, 'learning_rate': 4.875162831089883e-05, 'epoch': 0.15}


  5%|▌         | 970/18924 [33:47<9:42:36,  1.95s/it]

{'loss': 1.0879, 'grad_norm': 8.471841812133789, 'learning_rate': 4.872448979591837e-05, 'epoch': 0.15}


  5%|▌         | 980/18924 [34:07<9:40:29,  1.94s/it]

{'loss': 1.1105, 'grad_norm': 9.573980331420898, 'learning_rate': 4.869735128093791e-05, 'epoch': 0.16}


  5%|▌         | 990/18924 [34:26<9:43:40,  1.95s/it]

{'loss': 1.0671, 'grad_norm': 8.928059577941895, 'learning_rate': 4.8670212765957454e-05, 'epoch': 0.16}


  5%|▌         | 1000/18924 [34:46<9:44:36,  1.96s/it]

{'loss': 1.3116, 'grad_norm': 10.769577026367188, 'learning_rate': 4.864307425097699e-05, 'epoch': 0.16}


  5%|▌         | 1010/18924 [35:06<9:43:15,  1.95s/it] 

{'loss': 0.9324, 'grad_norm': 6.795643329620361, 'learning_rate': 4.861593573599653e-05, 'epoch': 0.16}


  5%|▌         | 1020/18924 [35:26<9:41:26,  1.95s/it]

{'loss': 1.4823, 'grad_norm': 11.458212852478027, 'learning_rate': 4.8588797221016066e-05, 'epoch': 0.16}


  5%|▌         | 1030/18924 [35:46<9:51:02,  1.98s/it]

{'loss': 1.155, 'grad_norm': 3.970832109451294, 'learning_rate': 4.8561658706035604e-05, 'epoch': 0.16}


  5%|▌         | 1040/18924 [36:08<11:33:27,  2.33s/it]

{'loss': 1.2372, 'grad_norm': 6.9410834312438965, 'learning_rate': 4.853452019105515e-05, 'epoch': 0.16}


  6%|▌         | 1050/18924 [36:28<9:55:41,  2.00s/it] 

{'loss': 1.2032, 'grad_norm': 10.907660484313965, 'learning_rate': 4.8507381676074685e-05, 'epoch': 0.17}


  6%|▌         | 1060/18924 [36:48<9:39:42,  1.95s/it]

{'loss': 1.3482, 'grad_norm': 13.644991874694824, 'learning_rate': 4.848024316109423e-05, 'epoch': 0.17}


  6%|▌         | 1070/18924 [37:07<9:38:08,  1.94s/it]

{'loss': 1.2526, 'grad_norm': 14.19983196258545, 'learning_rate': 4.845310464611377e-05, 'epoch': 0.17}


  6%|▌         | 1080/18924 [37:28<11:16:38,  2.28s/it]

{'loss': 1.1901, 'grad_norm': 9.13348388671875, 'learning_rate': 4.8425966131133305e-05, 'epoch': 0.17}


  6%|▌         | 1090/18924 [37:49<10:13:26,  2.06s/it]

{'loss': 0.9598, 'grad_norm': 13.756491661071777, 'learning_rate': 4.839882761615285e-05, 'epoch': 0.17}


  6%|▌         | 1100/18924 [38:09<9:46:00,  1.97s/it] 

{'loss': 1.1085, 'grad_norm': 9.887214660644531, 'learning_rate': 4.8371689101172386e-05, 'epoch': 0.17}


  6%|▌         | 1110/18924 [38:29<9:48:45,  1.98s/it]

{'loss': 1.2022, 'grad_norm': 11.505974769592285, 'learning_rate': 4.834455058619193e-05, 'epoch': 0.18}


  6%|▌         | 1120/18924 [38:49<9:54:42,  2.00s/it]

{'loss': 1.0372, 'grad_norm': 8.764666557312012, 'learning_rate': 4.831741207121147e-05, 'epoch': 0.18}


  6%|▌         | 1130/18924 [39:09<9:44:52,  1.97s/it] 

{'loss': 1.3355, 'grad_norm': 7.783081531524658, 'learning_rate': 4.8290273556231006e-05, 'epoch': 0.18}


  6%|▌         | 1140/18924 [39:28<9:35:43,  1.94s/it]

{'loss': 1.0321, 'grad_norm': 11.06749439239502, 'learning_rate': 4.826313504125054e-05, 'epoch': 0.18}


  6%|▌         | 1150/18924 [39:48<9:38:44,  1.95s/it]

{'loss': 0.9785, 'grad_norm': 10.64209270477295, 'learning_rate': 4.823599652627008e-05, 'epoch': 0.18}


  6%|▌         | 1160/18924 [40:07<9:36:39,  1.95s/it]

{'loss': 1.2187, 'grad_norm': 7.629161834716797, 'learning_rate': 4.8208858011289625e-05, 'epoch': 0.18}


  6%|▌         | 1170/18924 [40:27<9:40:50,  1.96s/it]

{'loss': 0.9291, 'grad_norm': 8.651512145996094, 'learning_rate': 4.818171949630916e-05, 'epoch': 0.19}


  6%|▌         | 1180/18924 [40:47<9:33:35,  1.94s/it]

{'loss': 1.3822, 'grad_norm': 10.880064010620117, 'learning_rate': 4.8154580981328706e-05, 'epoch': 0.19}


  6%|▋         | 1190/18924 [41:06<9:30:15,  1.93s/it]

{'loss': 1.015, 'grad_norm': 8.70299243927002, 'learning_rate': 4.8127442466348244e-05, 'epoch': 0.19}


  6%|▋         | 1200/18924 [41:25<9:47:15,  1.99s/it]

{'loss': 1.2431, 'grad_norm': 12.179030418395996, 'learning_rate': 4.810030395136778e-05, 'epoch': 0.19}


  6%|▋         | 1210/18924 [41:46<9:50:18,  2.00s/it]

{'loss': 1.2753, 'grad_norm': 11.053133964538574, 'learning_rate': 4.8073165436387326e-05, 'epoch': 0.19}


  6%|▋         | 1220/18924 [42:05<9:37:53,  1.96s/it]

{'loss': 1.1665, 'grad_norm': 11.31827163696289, 'learning_rate': 4.804602692140686e-05, 'epoch': 0.19}


  6%|▋         | 1230/18924 [42:25<9:45:54,  1.99s/it]

{'loss': 1.2008, 'grad_norm': 11.088968276977539, 'learning_rate': 4.801888840642641e-05, 'epoch': 0.19}


  7%|▋         | 1240/18924 [42:45<9:44:15,  1.98s/it]

{'loss': 1.1471, 'grad_norm': 7.021801471710205, 'learning_rate': 4.7991749891445945e-05, 'epoch': 0.2}


  7%|▋         | 1250/18924 [43:05<9:33:33,  1.95s/it]

{'loss': 1.1195, 'grad_norm': 14.857929229736328, 'learning_rate': 4.796461137646548e-05, 'epoch': 0.2}


  7%|▋         | 1260/18924 [43:24<9:32:28,  1.94s/it]

{'loss': 1.1933, 'grad_norm': 9.335853576660156, 'learning_rate': 4.793747286148502e-05, 'epoch': 0.2}


  7%|▋         | 1270/18924 [43:44<10:05:35,  2.06s/it]

{'loss': 1.1267, 'grad_norm': 6.615536212921143, 'learning_rate': 4.791033434650456e-05, 'epoch': 0.2}


  7%|▋         | 1280/18924 [44:06<10:51:41,  2.22s/it]

{'loss': 1.1011, 'grad_norm': 7.868829250335693, 'learning_rate': 4.78831958315241e-05, 'epoch': 0.2}


  7%|▋         | 1290/18924 [44:28<10:19:58,  2.11s/it]

{'loss': 0.9307, 'grad_norm': 8.327589988708496, 'learning_rate': 4.785605731654364e-05, 'epoch': 0.2}


  7%|▋         | 1300/18924 [44:48<10:02:58,  2.05s/it]

{'loss': 0.92, 'grad_norm': 7.567911624908447, 'learning_rate': 4.782891880156318e-05, 'epoch': 0.21}


  7%|▋         | 1310/18924 [45:09<10:20:15,  2.11s/it]

{'loss': 0.9963, 'grad_norm': 7.05263090133667, 'learning_rate': 4.780178028658272e-05, 'epoch': 0.21}


  7%|▋         | 1320/18924 [45:32<11:05:25,  2.27s/it]

{'loss': 1.0505, 'grad_norm': 12.303730010986328, 'learning_rate': 4.777464177160226e-05, 'epoch': 0.21}


  7%|▋         | 1330/18924 [45:54<10:47:43,  2.21s/it]

{'loss': 1.0856, 'grad_norm': 12.972880363464355, 'learning_rate': 4.77475032566218e-05, 'epoch': 0.21}


  7%|▋         | 1340/18924 [46:16<10:27:13,  2.14s/it]

{'loss': 1.1568, 'grad_norm': 7.764331817626953, 'learning_rate': 4.772036474164134e-05, 'epoch': 0.21}


  7%|▋         | 1350/18924 [46:37<10:15:46,  2.10s/it]

{'loss': 1.04, 'grad_norm': 6.258935928344727, 'learning_rate': 4.7693226226660884e-05, 'epoch': 0.21}


  7%|▋         | 1360/18924 [46:58<10:15:15,  2.10s/it]

{'loss': 1.0387, 'grad_norm': 8.391901969909668, 'learning_rate': 4.766608771168042e-05, 'epoch': 0.22}


  7%|▋         | 1370/18924 [47:19<10:00:14,  2.05s/it]

{'loss': 1.289, 'grad_norm': 12.673828125, 'learning_rate': 4.763894919669996e-05, 'epoch': 0.22}


  7%|▋         | 1380/18924 [47:39<10:05:15,  2.07s/it]

{'loss': 1.2402, 'grad_norm': 5.62207555770874, 'learning_rate': 4.7611810681719497e-05, 'epoch': 0.22}


  7%|▋         | 1390/18924 [48:00<10:07:26,  2.08s/it]

{'loss': 1.0375, 'grad_norm': 10.035656929016113, 'learning_rate': 4.7584672166739034e-05, 'epoch': 0.22}


  7%|▋         | 1400/18924 [48:20<9:52:51,  2.03s/it] 

{'loss': 1.1661, 'grad_norm': 8.650940895080566, 'learning_rate': 4.755753365175858e-05, 'epoch': 0.22}


  7%|▋         | 1410/18924 [48:41<10:05:17,  2.07s/it]

{'loss': 1.0167, 'grad_norm': 6.828969955444336, 'learning_rate': 4.7530395136778116e-05, 'epoch': 0.22}


  8%|▊         | 1420/18924 [49:02<10:04:22,  2.07s/it]

{'loss': 0.9698, 'grad_norm': 10.894286155700684, 'learning_rate': 4.750325662179766e-05, 'epoch': 0.23}


  8%|▊         | 1430/18924 [49:23<9:55:41,  2.04s/it] 

{'loss': 1.0795, 'grad_norm': 15.124788284301758, 'learning_rate': 4.74761181068172e-05, 'epoch': 0.23}


  8%|▊         | 1440/18924 [49:43<9:40:15,  1.99s/it]

{'loss': 1.1639, 'grad_norm': 12.351634979248047, 'learning_rate': 4.744897959183674e-05, 'epoch': 0.23}


  8%|▊         | 1450/18924 [50:02<9:24:01,  1.94s/it]

{'loss': 0.9377, 'grad_norm': 7.922765731811523, 'learning_rate': 4.742184107685628e-05, 'epoch': 0.23}


  8%|▊         | 1460/18924 [50:21<9:17:40,  1.92s/it]

{'loss': 1.0966, 'grad_norm': 9.574175834655762, 'learning_rate': 4.739470256187582e-05, 'epoch': 0.23}


  8%|▊         | 1470/18924 [50:41<9:36:25,  1.98s/it]

{'loss': 1.0647, 'grad_norm': 18.863239288330078, 'learning_rate': 4.7367564046895354e-05, 'epoch': 0.23}


  8%|▊         | 1480/18924 [51:01<9:29:48,  1.96s/it]

{'loss': 1.2693, 'grad_norm': 15.643965721130371, 'learning_rate': 4.734042553191489e-05, 'epoch': 0.23}


  8%|▊         | 1490/18924 [51:20<9:28:03,  1.95s/it]

{'loss': 1.0253, 'grad_norm': 14.849427223205566, 'learning_rate': 4.7313287016934436e-05, 'epoch': 0.24}


  8%|▊         | 1500/18924 [51:40<9:30:03,  1.96s/it]

{'loss': 0.9055, 'grad_norm': 13.142826080322266, 'learning_rate': 4.728614850195397e-05, 'epoch': 0.24}


  8%|▊         | 1510/18924 [52:00<9:44:16,  2.01s/it] 

{'loss': 1.0539, 'grad_norm': 8.12466049194336, 'learning_rate': 4.725900998697351e-05, 'epoch': 0.24}


  8%|▊         | 1520/18924 [52:20<9:29:41,  1.96s/it]

{'loss': 1.1091, 'grad_norm': 10.636323928833008, 'learning_rate': 4.7231871471993055e-05, 'epoch': 0.24}


  8%|▊         | 1530/18924 [52:40<9:29:21,  1.96s/it]

{'loss': 1.3277, 'grad_norm': 11.974635124206543, 'learning_rate': 4.720473295701259e-05, 'epoch': 0.24}


  8%|▊         | 1540/18924 [52:59<9:28:07,  1.96s/it]

{'loss': 1.0218, 'grad_norm': 6.9374213218688965, 'learning_rate': 4.717759444203214e-05, 'epoch': 0.24}


  8%|▊         | 1550/18924 [53:19<9:30:37,  1.97s/it]

{'loss': 1.1959, 'grad_norm': 9.303131103515625, 'learning_rate': 4.7150455927051674e-05, 'epoch': 0.25}


  8%|▊         | 1560/18924 [53:39<9:37:51,  2.00s/it]

{'loss': 1.2478, 'grad_norm': 7.199949264526367, 'learning_rate': 4.712331741207122e-05, 'epoch': 0.25}


  8%|▊         | 1570/18924 [53:59<9:37:40,  2.00s/it]

{'loss': 0.9714, 'grad_norm': 10.302793502807617, 'learning_rate': 4.7096178897090756e-05, 'epoch': 0.25}


  8%|▊         | 1580/18924 [54:19<9:33:00,  1.98s/it]

{'loss': 1.0187, 'grad_norm': 9.590506553649902, 'learning_rate': 4.7069040382110293e-05, 'epoch': 0.25}


  8%|▊         | 1590/18924 [54:39<9:35:30,  1.99s/it]

{'loss': 1.1118, 'grad_norm': 11.421060562133789, 'learning_rate': 4.704190186712983e-05, 'epoch': 0.25}


  8%|▊         | 1600/18924 [54:59<9:37:23,  2.00s/it]

{'loss': 0.9144, 'grad_norm': 11.881319999694824, 'learning_rate': 4.701476335214937e-05, 'epoch': 0.25}


  9%|▊         | 1610/18924 [55:19<9:36:42,  2.00s/it]

{'loss': 0.9586, 'grad_norm': 9.472733497619629, 'learning_rate': 4.698762483716891e-05, 'epoch': 0.26}


  9%|▊         | 1620/18924 [55:39<9:24:02,  1.96s/it]

{'loss': 0.9452, 'grad_norm': 7.256148338317871, 'learning_rate': 4.696048632218845e-05, 'epoch': 0.26}


  9%|▊         | 1630/18924 [55:58<9:25:05,  1.96s/it]

{'loss': 1.1346, 'grad_norm': 12.148579597473145, 'learning_rate': 4.693334780720799e-05, 'epoch': 0.26}


  9%|▊         | 1640/18924 [56:18<9:18:29,  1.94s/it]

{'loss': 1.1639, 'grad_norm': 19.820960998535156, 'learning_rate': 4.690620929222753e-05, 'epoch': 0.26}


  9%|▊         | 1650/18924 [56:37<9:29:23,  1.98s/it]

{'loss': 1.1178, 'grad_norm': 6.714358806610107, 'learning_rate': 4.687907077724707e-05, 'epoch': 0.26}


  9%|▉         | 1660/18924 [56:57<9:37:00,  2.01s/it]

{'loss': 1.2447, 'grad_norm': 8.57238483428955, 'learning_rate': 4.6851932262266614e-05, 'epoch': 0.26}


  9%|▉         | 1670/18924 [57:17<9:19:26,  1.95s/it]

{'loss': 1.2276, 'grad_norm': 7.876016139984131, 'learning_rate': 4.682479374728615e-05, 'epoch': 0.26}


  9%|▉         | 1680/18924 [57:36<9:27:58,  1.98s/it]

{'loss': 1.0846, 'grad_norm': 10.282188415527344, 'learning_rate': 4.6797655232305695e-05, 'epoch': 0.27}


  9%|▉         | 1690/18924 [57:57<9:36:51,  2.01s/it]

{'loss': 1.0792, 'grad_norm': 10.114431381225586, 'learning_rate': 4.677051671732523e-05, 'epoch': 0.27}


  9%|▉         | 1700/18924 [58:17<9:38:24,  2.01s/it]

{'loss': 1.114, 'grad_norm': 9.043612480163574, 'learning_rate': 4.674337820234477e-05, 'epoch': 0.27}


  9%|▉         | 1710/18924 [58:37<9:36:50,  2.01s/it]

{'loss': 1.0872, 'grad_norm': 11.377715110778809, 'learning_rate': 4.671623968736431e-05, 'epoch': 0.27}


  9%|▉         | 1720/18924 [58:57<9:31:53,  1.99s/it]

{'loss': 1.1378, 'grad_norm': 4.70425271987915, 'learning_rate': 4.6689101172383845e-05, 'epoch': 0.27}


  9%|▉         | 1730/18924 [59:17<9:31:47,  2.00s/it]

{'loss': 1.1236, 'grad_norm': 9.205772399902344, 'learning_rate': 4.666196265740339e-05, 'epoch': 0.27}


  9%|▉         | 1740/18924 [59:38<9:45:34,  2.04s/it] 

{'loss': 1.0597, 'grad_norm': 8.314299583435059, 'learning_rate': 4.663482414242293e-05, 'epoch': 0.28}


  9%|▉         | 1750/18924 [59:58<9:44:54,  2.04s/it]

{'loss': 1.2157, 'grad_norm': 17.63616943359375, 'learning_rate': 4.660768562744247e-05, 'epoch': 0.28}


  9%|▉         | 1760/18924 [1:00:19<9:43:18,  2.04s/it]

{'loss': 1.043, 'grad_norm': 8.637767791748047, 'learning_rate': 4.658054711246201e-05, 'epoch': 0.28}


  9%|▉         | 1770/18924 [1:00:39<9:46:02,  2.05s/it]

{'loss': 1.0088, 'grad_norm': 10.284234046936035, 'learning_rate': 4.6553408597481546e-05, 'epoch': 0.28}


  9%|▉         | 1780/18924 [1:01:00<9:45:33,  2.05s/it]

{'loss': 1.1563, 'grad_norm': 5.582452297210693, 'learning_rate': 4.652627008250109e-05, 'epoch': 0.28}


  9%|▉         | 1790/18924 [1:01:20<9:37:51,  2.02s/it]

{'loss': 1.0798, 'grad_norm': 10.970549583435059, 'learning_rate': 4.649913156752063e-05, 'epoch': 0.28}


 10%|▉         | 1800/18924 [1:01:40<9:32:25,  2.01s/it]

{'loss': 1.0376, 'grad_norm': 11.015304565429688, 'learning_rate': 4.647199305254017e-05, 'epoch': 0.29}


 10%|▉         | 1810/18924 [1:02:01<9:40:39,  2.04s/it]

{'loss': 0.8629, 'grad_norm': 15.346243858337402, 'learning_rate': 4.644485453755971e-05, 'epoch': 0.29}


 10%|▉         | 1820/18924 [1:02:21<9:41:12,  2.04s/it]

{'loss': 1.3523, 'grad_norm': 10.223663330078125, 'learning_rate': 4.641771602257925e-05, 'epoch': 0.29}


 10%|▉         | 1830/18924 [1:02:41<9:39:17,  2.03s/it]

{'loss': 1.1275, 'grad_norm': 6.889869213104248, 'learning_rate': 4.6390577507598785e-05, 'epoch': 0.29}


 10%|▉         | 1840/18924 [1:03:02<9:38:03,  2.03s/it]

{'loss': 0.9649, 'grad_norm': 7.7385454177856445, 'learning_rate': 4.636343899261832e-05, 'epoch': 0.29}


 10%|▉         | 1850/18924 [1:03:22<9:43:32,  2.05s/it]

{'loss': 0.9155, 'grad_norm': 5.503654479980469, 'learning_rate': 4.6336300477637866e-05, 'epoch': 0.29}


 10%|▉         | 1860/18924 [1:03:43<9:35:49,  2.02s/it]

{'loss': 1.0397, 'grad_norm': 5.48089075088501, 'learning_rate': 4.6309161962657404e-05, 'epoch': 0.29}


 10%|▉         | 1870/18924 [1:04:03<9:32:25,  2.01s/it]

{'loss': 0.9992, 'grad_norm': 11.012007713317871, 'learning_rate': 4.628202344767695e-05, 'epoch': 0.3}


 10%|▉         | 1880/18924 [1:04:23<9:37:02,  2.03s/it]

{'loss': 1.1013, 'grad_norm': 4.740357398986816, 'learning_rate': 4.6254884932696485e-05, 'epoch': 0.3}


 10%|▉         | 1890/18924 [1:04:44<9:27:42,  2.00s/it]

{'loss': 1.2172, 'grad_norm': 15.25808048248291, 'learning_rate': 4.622774641771602e-05, 'epoch': 0.3}


 10%|█         | 1900/18924 [1:05:03<9:21:55,  1.98s/it]

{'loss': 1.2411, 'grad_norm': 5.209313869476318, 'learning_rate': 4.620060790273557e-05, 'epoch': 0.3}


 10%|█         | 1910/18924 [1:05:24<9:49:23,  2.08s/it]

{'loss': 1.0509, 'grad_norm': 12.929778099060059, 'learning_rate': 4.6173469387755105e-05, 'epoch': 0.3}


 10%|█         | 1920/18924 [1:05:44<9:31:17,  2.02s/it]

{'loss': 0.9699, 'grad_norm': 7.474587917327881, 'learning_rate': 4.614633087277465e-05, 'epoch': 0.3}


 10%|█         | 1930/18924 [1:06:04<9:23:38,  1.99s/it]

{'loss': 1.1558, 'grad_norm': 10.110616683959961, 'learning_rate': 4.6119192357794186e-05, 'epoch': 0.31}


 10%|█         | 1940/18924 [1:06:24<9:21:33,  1.98s/it]

{'loss': 1.0087, 'grad_norm': 5.897101879119873, 'learning_rate': 4.6092053842813724e-05, 'epoch': 0.31}


 10%|█         | 1950/18924 [1:06:44<9:20:23,  1.98s/it]

{'loss': 1.0937, 'grad_norm': 6.395606517791748, 'learning_rate': 4.606491532783326e-05, 'epoch': 0.31}


 10%|█         | 1960/18924 [1:07:04<9:45:40,  2.07s/it]

{'loss': 0.9739, 'grad_norm': 8.794754981994629, 'learning_rate': 4.60377768128528e-05, 'epoch': 0.31}


 10%|█         | 1970/18924 [1:07:25<9:38:51,  2.05s/it]

{'loss': 0.9243, 'grad_norm': 7.504417419433594, 'learning_rate': 4.601063829787234e-05, 'epoch': 0.31}


 10%|█         | 1980/18924 [1:07:46<9:35:52,  2.04s/it]

{'loss': 1.0116, 'grad_norm': 10.734916687011719, 'learning_rate': 4.598349978289188e-05, 'epoch': 0.31}


 11%|█         | 1990/18924 [1:08:06<9:35:04,  2.04s/it]

{'loss': 0.9612, 'grad_norm': 11.869301795959473, 'learning_rate': 4.5956361267911425e-05, 'epoch': 0.32}


 11%|█         | 2000/18924 [1:08:26<9:27:57,  2.01s/it]

{'loss': 1.1015, 'grad_norm': 7.934505939483643, 'learning_rate': 4.592922275293096e-05, 'epoch': 0.32}


 11%|█         | 2010/18924 [1:08:47<9:29:42,  2.02s/it] 

{'loss': 1.0563, 'grad_norm': 9.032183647155762, 'learning_rate': 4.59020842379505e-05, 'epoch': 0.32}


 11%|█         | 2020/18924 [1:09:07<9:27:30,  2.01s/it]

{'loss': 0.9101, 'grad_norm': 10.711813926696777, 'learning_rate': 4.5874945722970044e-05, 'epoch': 0.32}


 11%|█         | 2030/18924 [1:09:28<9:36:20,  2.05s/it]

{'loss': 0.9206, 'grad_norm': 13.940409660339355, 'learning_rate': 4.584780720798958e-05, 'epoch': 0.32}


 11%|█         | 2040/18924 [1:09:48<9:32:50,  2.04s/it]

{'loss': 1.1346, 'grad_norm': 8.313085556030273, 'learning_rate': 4.582066869300912e-05, 'epoch': 0.32}


 11%|█         | 2050/18924 [1:10:09<9:42:05,  2.07s/it]

{'loss': 0.9585, 'grad_norm': 6.609879016876221, 'learning_rate': 4.5793530178028656e-05, 'epoch': 0.32}


 11%|█         | 2060/18924 [1:10:29<9:40:53,  2.07s/it]

{'loss': 1.0955, 'grad_norm': 7.016671180725098, 'learning_rate': 4.57663916630482e-05, 'epoch': 0.33}


 11%|█         | 2070/18924 [1:10:50<9:36:55,  2.05s/it]

{'loss': 0.8916, 'grad_norm': 9.017314910888672, 'learning_rate': 4.573925314806774e-05, 'epoch': 0.33}


 11%|█         | 2080/18924 [1:11:10<9:30:17,  2.03s/it]

{'loss': 1.1496, 'grad_norm': 11.154440879821777, 'learning_rate': 4.5712114633087276e-05, 'epoch': 0.33}


 11%|█         | 2090/18924 [1:11:31<9:37:03,  2.06s/it]

{'loss': 1.0215, 'grad_norm': 7.710649013519287, 'learning_rate': 4.568497611810682e-05, 'epoch': 0.33}


 11%|█         | 2100/18924 [1:11:52<9:47:22,  2.09s/it]

{'loss': 0.9168, 'grad_norm': 9.229928970336914, 'learning_rate': 4.565783760312636e-05, 'epoch': 0.33}


 11%|█         | 2110/18924 [1:12:12<9:39:08,  2.07s/it]

{'loss': 1.0648, 'grad_norm': 10.716371536254883, 'learning_rate': 4.56306990881459e-05, 'epoch': 0.33}


 11%|█         | 2120/18924 [1:12:33<9:30:17,  2.04s/it]

{'loss': 1.0055, 'grad_norm': 12.504687309265137, 'learning_rate': 4.560356057316544e-05, 'epoch': 0.34}


 11%|█▏        | 2130/18924 [1:12:53<9:25:10,  2.02s/it]

{'loss': 1.0636, 'grad_norm': 8.228683471679688, 'learning_rate': 4.557642205818498e-05, 'epoch': 0.34}


 11%|█▏        | 2140/18924 [1:13:14<9:34:13,  2.05s/it]

{'loss': 0.8861, 'grad_norm': 9.737160682678223, 'learning_rate': 4.554928354320452e-05, 'epoch': 0.34}


 11%|█▏        | 2150/18924 [1:13:34<9:34:05,  2.05s/it]

{'loss': 0.9765, 'grad_norm': 10.504312515258789, 'learning_rate': 4.552214502822406e-05, 'epoch': 0.34}


 11%|█▏        | 2160/18924 [1:13:55<9:32:03,  2.05s/it]

{'loss': 1.2182, 'grad_norm': 8.202252388000488, 'learning_rate': 4.5495006513243596e-05, 'epoch': 0.34}


 11%|█▏        | 2170/18924 [1:14:15<9:28:34,  2.04s/it]

{'loss': 1.1448, 'grad_norm': 8.389633178710938, 'learning_rate': 4.546786799826313e-05, 'epoch': 0.34}


 12%|█▏        | 2180/18924 [1:14:35<9:29:04,  2.04s/it]

{'loss': 1.0365, 'grad_norm': 5.726315498352051, 'learning_rate': 4.544072948328268e-05, 'epoch': 0.35}


 12%|█▏        | 2190/18924 [1:14:56<9:33:42,  2.06s/it]

{'loss': 0.7453, 'grad_norm': 13.234403610229492, 'learning_rate': 4.5413590968302215e-05, 'epoch': 0.35}


 12%|█▏        | 2200/18924 [1:15:16<9:20:16,  2.01s/it]

{'loss': 1.1405, 'grad_norm': 9.784223556518555, 'learning_rate': 4.538645245332175e-05, 'epoch': 0.35}


 12%|█▏        | 2210/18924 [1:15:36<9:08:03,  1.97s/it]

{'loss': 1.1811, 'grad_norm': 8.199422836303711, 'learning_rate': 4.5359313938341297e-05, 'epoch': 0.35}


 12%|█▏        | 2220/18924 [1:15:56<9:18:12,  2.01s/it]

{'loss': 1.1957, 'grad_norm': 7.316580295562744, 'learning_rate': 4.5332175423360834e-05, 'epoch': 0.35}


 12%|█▏        | 2230/18924 [1:16:16<9:23:04,  2.02s/it]

{'loss': 1.1161, 'grad_norm': 9.168916702270508, 'learning_rate': 4.530503690838038e-05, 'epoch': 0.35}


 12%|█▏        | 2240/18924 [1:16:37<9:36:08,  2.07s/it]

{'loss': 1.0637, 'grad_norm': 8.077885627746582, 'learning_rate': 4.5277898393399916e-05, 'epoch': 0.36}


 12%|█▏        | 2250/18924 [1:16:57<9:10:54,  1.98s/it]

{'loss': 1.1102, 'grad_norm': 12.131393432617188, 'learning_rate': 4.525075987841946e-05, 'epoch': 0.36}


 12%|█▏        | 2260/18924 [1:17:17<9:23:18,  2.03s/it]

{'loss': 1.0493, 'grad_norm': 9.79043960571289, 'learning_rate': 4.5223621363439e-05, 'epoch': 0.36}


 12%|█▏        | 2270/18924 [1:17:38<9:47:09,  2.12s/it]

{'loss': 1.163, 'grad_norm': 11.519255638122559, 'learning_rate': 4.5196482848458535e-05, 'epoch': 0.36}


 12%|█▏        | 2280/18924 [1:17:59<9:19:34,  2.02s/it]

{'loss': 1.0894, 'grad_norm': 8.277183532714844, 'learning_rate': 4.516934433347807e-05, 'epoch': 0.36}


 12%|█▏        | 2290/18924 [1:18:19<9:14:54,  2.00s/it]

{'loss': 0.9412, 'grad_norm': 10.561595916748047, 'learning_rate': 4.514220581849761e-05, 'epoch': 0.36}


 12%|█▏        | 2300/18924 [1:18:40<9:43:57,  2.11s/it]

{'loss': 1.0225, 'grad_norm': 11.786602020263672, 'learning_rate': 4.5115067303517154e-05, 'epoch': 0.36}


 12%|█▏        | 2310/18924 [1:19:00<9:11:01,  1.99s/it]

{'loss': 0.9973, 'grad_norm': 7.0317864418029785, 'learning_rate': 4.508792878853669e-05, 'epoch': 0.37}


 12%|█▏        | 2320/18924 [1:19:20<9:12:23,  2.00s/it]

{'loss': 0.9707, 'grad_norm': 7.462252616882324, 'learning_rate': 4.506079027355623e-05, 'epoch': 0.37}


 12%|█▏        | 2330/18924 [1:19:40<9:26:44,  2.05s/it]

{'loss': 1.0452, 'grad_norm': 14.106841087341309, 'learning_rate': 4.503365175857577e-05, 'epoch': 0.37}


 12%|█▏        | 2340/18924 [1:20:01<9:32:42,  2.07s/it]

{'loss': 1.0421, 'grad_norm': 8.025382995605469, 'learning_rate': 4.500651324359531e-05, 'epoch': 0.37}


 12%|█▏        | 2350/18924 [1:20:20<9:06:11,  1.98s/it]

{'loss': 0.907, 'grad_norm': 9.521818161010742, 'learning_rate': 4.4979374728614855e-05, 'epoch': 0.37}


 12%|█▏        | 2360/18924 [1:20:41<9:22:39,  2.04s/it]

{'loss': 0.8762, 'grad_norm': 8.206785202026367, 'learning_rate': 4.495223621363439e-05, 'epoch': 0.37}


 13%|█▎        | 2370/18924 [1:21:01<9:29:34,  2.06s/it]

{'loss': 0.9053, 'grad_norm': 9.530706405639648, 'learning_rate': 4.492509769865394e-05, 'epoch': 0.38}


 13%|█▎        | 2380/18924 [1:21:22<9:43:01,  2.11s/it]

{'loss': 1.1689, 'grad_norm': 7.72846794128418, 'learning_rate': 4.4897959183673474e-05, 'epoch': 0.38}


 13%|█▎        | 2390/18924 [1:21:43<9:28:23,  2.06s/it]

{'loss': 1.0552, 'grad_norm': 7.820244789123535, 'learning_rate': 4.487082066869301e-05, 'epoch': 0.38}


 13%|█▎        | 2400/18924 [1:22:04<9:29:14,  2.07s/it]

{'loss': 1.2172, 'grad_norm': 12.744272232055664, 'learning_rate': 4.484368215371255e-05, 'epoch': 0.38}


 13%|█▎        | 2410/18924 [1:22:24<9:14:15,  2.01s/it]

{'loss': 0.9839, 'grad_norm': 7.291882038116455, 'learning_rate': 4.481654363873209e-05, 'epoch': 0.38}


 13%|█▎        | 2420/18924 [1:22:44<9:20:38,  2.04s/it]

{'loss': 0.9091, 'grad_norm': 5.780275344848633, 'learning_rate': 4.478940512375163e-05, 'epoch': 0.38}


 13%|█▎        | 2430/18924 [1:23:05<9:15:31,  2.02s/it]

{'loss': 0.9063, 'grad_norm': 12.235221862792969, 'learning_rate': 4.476226660877117e-05, 'epoch': 0.39}


 13%|█▎        | 2440/18924 [1:23:25<9:19:31,  2.04s/it]

{'loss': 1.0209, 'grad_norm': 8.763605117797852, 'learning_rate': 4.473512809379071e-05, 'epoch': 0.39}


 13%|█▎        | 2450/18924 [1:23:46<9:20:40,  2.04s/it]

{'loss': 1.1019, 'grad_norm': 7.209311485290527, 'learning_rate': 4.470798957881025e-05, 'epoch': 0.39}


 13%|█▎        | 2460/18924 [1:24:06<9:26:38,  2.06s/it]

{'loss': 1.2316, 'grad_norm': 12.093451499938965, 'learning_rate': 4.468085106382979e-05, 'epoch': 0.39}


 13%|█▎        | 2470/18924 [1:24:27<9:35:09,  2.10s/it]

{'loss': 0.8179, 'grad_norm': 10.523204803466797, 'learning_rate': 4.465371254884933e-05, 'epoch': 0.39}


 13%|█▎        | 2480/18924 [1:24:48<9:31:58,  2.09s/it]

{'loss': 0.8942, 'grad_norm': 7.302587032318115, 'learning_rate': 4.462657403386887e-05, 'epoch': 0.39}


 13%|█▎        | 2490/18924 [1:25:09<9:23:19,  2.06s/it]

{'loss': 1.0191, 'grad_norm': 12.804932594299316, 'learning_rate': 4.4599435518888414e-05, 'epoch': 0.39}


 13%|█▎        | 2500/18924 [1:25:29<9:22:18,  2.05s/it]

{'loss': 1.0328, 'grad_norm': 8.262746810913086, 'learning_rate': 4.457229700390795e-05, 'epoch': 0.4}


 13%|█▎        | 2510/18924 [1:25:51<9:19:37,  2.05s/it] 

{'loss': 0.8257, 'grad_norm': 5.764394283294678, 'learning_rate': 4.454515848892749e-05, 'epoch': 0.4}


 13%|█▎        | 2520/18924 [1:26:11<9:16:49,  2.04s/it]

{'loss': 0.9969, 'grad_norm': 9.141204833984375, 'learning_rate': 4.4518019973947026e-05, 'epoch': 0.4}


 13%|█▎        | 2530/18924 [1:26:31<9:16:39,  2.04s/it]

{'loss': 1.2171, 'grad_norm': 7.9693193435668945, 'learning_rate': 4.4490881458966564e-05, 'epoch': 0.4}


 13%|█▎        | 2540/18924 [1:26:52<9:24:39,  2.07s/it]

{'loss': 1.0845, 'grad_norm': 7.19540548324585, 'learning_rate': 4.446374294398611e-05, 'epoch': 0.4}


 13%|█▎        | 2550/18924 [1:27:13<9:28:23,  2.08s/it]

{'loss': 1.0872, 'grad_norm': 12.300622940063477, 'learning_rate': 4.4436604429005645e-05, 'epoch': 0.4}


 14%|█▎        | 2560/18924 [1:27:33<9:16:42,  2.04s/it]

{'loss': 0.8812, 'grad_norm': 8.184462547302246, 'learning_rate': 4.440946591402519e-05, 'epoch': 0.41}


 14%|█▎        | 2570/18924 [1:27:54<9:24:27,  2.07s/it]

{'loss': 0.8836, 'grad_norm': 8.82076358795166, 'learning_rate': 4.438232739904473e-05, 'epoch': 0.41}


 14%|█▎        | 2580/18924 [1:28:14<9:16:51,  2.04s/it]

{'loss': 1.0329, 'grad_norm': 8.782156944274902, 'learning_rate': 4.4355188884064264e-05, 'epoch': 0.41}


 14%|█▎        | 2590/18924 [1:28:35<9:15:22,  2.04s/it]

{'loss': 0.9841, 'grad_norm': 6.768447399139404, 'learning_rate': 4.432805036908381e-05, 'epoch': 0.41}


 14%|█▎        | 2600/18924 [1:28:55<9:10:43,  2.02s/it]

{'loss': 1.1463, 'grad_norm': 9.103682518005371, 'learning_rate': 4.4300911854103346e-05, 'epoch': 0.41}


 14%|█▍        | 2610/18924 [1:29:16<9:20:59,  2.06s/it]

{'loss': 0.9046, 'grad_norm': 9.176606178283691, 'learning_rate': 4.427377333912289e-05, 'epoch': 0.41}


 14%|█▍        | 2620/18924 [1:29:36<9:12:16,  2.03s/it]

{'loss': 0.9342, 'grad_norm': 6.068510055541992, 'learning_rate': 4.424663482414242e-05, 'epoch': 0.42}


 14%|█▍        | 2630/18924 [1:29:56<9:05:43,  2.01s/it]

{'loss': 1.2158, 'grad_norm': 8.557100296020508, 'learning_rate': 4.4219496309161965e-05, 'epoch': 0.42}


 14%|█▍        | 2640/18924 [1:30:16<8:58:04,  1.98s/it]

{'loss': 0.9783, 'grad_norm': 6.385613918304443, 'learning_rate': 4.41923577941815e-05, 'epoch': 0.42}


 14%|█▍        | 2650/18924 [1:30:36<8:58:32,  1.99s/it]

{'loss': 1.0347, 'grad_norm': 13.155571937561035, 'learning_rate': 4.416521927920104e-05, 'epoch': 0.42}


 14%|█▍        | 2660/18924 [1:30:56<8:54:04,  1.97s/it]

{'loss': 0.9537, 'grad_norm': 7.539699554443359, 'learning_rate': 4.4138080764220585e-05, 'epoch': 0.42}


 14%|█▍        | 2670/18924 [1:31:15<8:51:32,  1.96s/it]

{'loss': 0.9785, 'grad_norm': 9.864224433898926, 'learning_rate': 4.411094224924012e-05, 'epoch': 0.42}


 14%|█▍        | 2680/18924 [1:31:34<8:20:01,  1.85s/it]

{'loss': 0.9904, 'grad_norm': 9.810026168823242, 'learning_rate': 4.4083803734259666e-05, 'epoch': 0.42}


 14%|█▍        | 2690/18924 [1:31:52<8:10:48,  1.81s/it]

{'loss': 1.0245, 'grad_norm': 12.129212379455566, 'learning_rate': 4.4056665219279204e-05, 'epoch': 0.43}


 14%|█▍        | 2700/18924 [1:32:11<8:23:43,  1.86s/it]

{'loss': 1.0966, 'grad_norm': 12.627175331115723, 'learning_rate': 4.402952670429874e-05, 'epoch': 0.43}


 14%|█▍        | 2710/18924 [1:32:29<8:20:51,  1.85s/it]

{'loss': 1.0158, 'grad_norm': 6.07438850402832, 'learning_rate': 4.4002388189318285e-05, 'epoch': 0.43}


 14%|█▍        | 2720/18924 [1:32:48<8:19:27,  1.85s/it]

{'loss': 0.9888, 'grad_norm': 7.780563831329346, 'learning_rate': 4.397524967433782e-05, 'epoch': 0.43}


 14%|█▍        | 2730/18924 [1:33:06<8:22:23,  1.86s/it]

{'loss': 1.0259, 'grad_norm': 9.568648338317871, 'learning_rate': 4.394811115935736e-05, 'epoch': 0.43}


 14%|█▍        | 2740/18924 [1:33:25<8:22:09,  1.86s/it]

{'loss': 0.9831, 'grad_norm': 8.726814270019531, 'learning_rate': 4.39209726443769e-05, 'epoch': 0.43}


 15%|█▍        | 2750/18924 [1:33:44<8:22:25,  1.86s/it]

{'loss': 1.0563, 'grad_norm': 11.048033714294434, 'learning_rate': 4.389383412939644e-05, 'epoch': 0.44}


 15%|█▍        | 2760/18924 [1:34:03<8:24:49,  1.87s/it]

{'loss': 1.0428, 'grad_norm': 9.392476081848145, 'learning_rate': 4.386669561441598e-05, 'epoch': 0.44}


 15%|█▍        | 2770/18924 [1:34:21<8:26:24,  1.88s/it]

{'loss': 1.0409, 'grad_norm': 12.036663055419922, 'learning_rate': 4.383955709943552e-05, 'epoch': 0.44}


 15%|█▍        | 2780/18924 [1:34:40<8:27:14,  1.89s/it]

{'loss': 0.9199, 'grad_norm': 6.056685924530029, 'learning_rate': 4.381241858445506e-05, 'epoch': 0.44}


 15%|█▍        | 2790/18924 [1:34:59<8:30:18,  1.90s/it]

{'loss': 1.0792, 'grad_norm': 8.793553352355957, 'learning_rate': 4.37852800694746e-05, 'epoch': 0.44}


 15%|█▍        | 2800/18924 [1:35:18<8:23:58,  1.88s/it]

{'loss': 0.9448, 'grad_norm': 11.321322441101074, 'learning_rate': 4.375814155449414e-05, 'epoch': 0.44}


 15%|█▍        | 2810/18924 [1:35:37<8:29:18,  1.90s/it]

{'loss': 0.8242, 'grad_norm': 7.4511213302612305, 'learning_rate': 4.373100303951368e-05, 'epoch': 0.45}


 15%|█▍        | 2820/18924 [1:35:56<8:25:28,  1.88s/it]

{'loss': 1.2231, 'grad_norm': 10.007354736328125, 'learning_rate': 4.3703864524533225e-05, 'epoch': 0.45}


 15%|█▍        | 2830/18924 [1:36:14<8:27:02,  1.89s/it]

{'loss': 1.2619, 'grad_norm': 8.890152931213379, 'learning_rate': 4.367672600955276e-05, 'epoch': 0.45}


 15%|█▌        | 2840/18924 [1:36:33<8:25:36,  1.89s/it]

{'loss': 1.0708, 'grad_norm': 4.240480422973633, 'learning_rate': 4.36495874945723e-05, 'epoch': 0.45}


 15%|█▌        | 2850/18924 [1:36:52<8:19:08,  1.86s/it]

{'loss': 0.8412, 'grad_norm': 9.131072998046875, 'learning_rate': 4.362244897959184e-05, 'epoch': 0.45}


 15%|█▌        | 2860/18924 [1:37:11<8:24:38,  1.88s/it]

{'loss': 1.1762, 'grad_norm': 8.914348602294922, 'learning_rate': 4.3595310464611375e-05, 'epoch': 0.45}


 15%|█▌        | 2870/18924 [1:37:30<8:24:04,  1.88s/it]

{'loss': 0.9376, 'grad_norm': 7.917049884796143, 'learning_rate': 4.356817194963092e-05, 'epoch': 0.45}


 15%|█▌        | 2880/18924 [1:37:48<8:22:12,  1.88s/it]

{'loss': 0.9048, 'grad_norm': 8.484718322753906, 'learning_rate': 4.3541033434650456e-05, 'epoch': 0.46}


 15%|█▌        | 2890/18924 [1:38:07<8:26:19,  1.89s/it]

{'loss': 1.0721, 'grad_norm': 11.594158172607422, 'learning_rate': 4.3513894919669994e-05, 'epoch': 0.46}


 15%|█▌        | 2900/18924 [1:38:26<8:25:15,  1.89s/it]

{'loss': 0.8988, 'grad_norm': 8.385963439941406, 'learning_rate': 4.348675640468954e-05, 'epoch': 0.46}


 15%|█▌        | 2910/18924 [1:38:45<8:24:42,  1.89s/it]

{'loss': 0.9213, 'grad_norm': 13.003897666931152, 'learning_rate': 4.3459617889709076e-05, 'epoch': 0.46}


 15%|█▌        | 2920/18924 [1:39:04<8:26:34,  1.90s/it]

{'loss': 1.1429, 'grad_norm': 6.897055149078369, 'learning_rate': 4.343247937472862e-05, 'epoch': 0.46}


 15%|█▌        | 2930/18924 [1:39:23<8:18:01,  1.87s/it]

{'loss': 0.9099, 'grad_norm': 5.774617671966553, 'learning_rate': 4.340534085974816e-05, 'epoch': 0.46}


 16%|█▌        | 2940/18924 [1:39:41<8:18:37,  1.87s/it]

{'loss': 1.1949, 'grad_norm': 14.358694076538086, 'learning_rate': 4.33782023447677e-05, 'epoch': 0.47}


 16%|█▌        | 2950/18924 [1:40:00<8:23:07,  1.89s/it]

{'loss': 0.9147, 'grad_norm': 9.571078300476074, 'learning_rate': 4.335106382978724e-05, 'epoch': 0.47}


 16%|█▌        | 2960/18924 [1:40:19<8:29:08,  1.91s/it]

{'loss': 0.9494, 'grad_norm': 8.685648918151855, 'learning_rate': 4.3323925314806777e-05, 'epoch': 0.47}


 16%|█▌        | 2970/18924 [1:40:38<8:25:39,  1.90s/it]

{'loss': 1.0048, 'grad_norm': 10.932092666625977, 'learning_rate': 4.3296786799826314e-05, 'epoch': 0.47}


 16%|█▌        | 2980/18924 [1:40:57<8:17:18,  1.87s/it]

{'loss': 1.0436, 'grad_norm': 17.32655906677246, 'learning_rate': 4.326964828484585e-05, 'epoch': 0.47}


 16%|█▌        | 2990/18924 [1:41:16<8:23:15,  1.90s/it]

{'loss': 0.8405, 'grad_norm': 10.70070743560791, 'learning_rate': 4.3242509769865396e-05, 'epoch': 0.47}


 16%|█▌        | 3000/18924 [1:41:35<8:16:40,  1.87s/it]

{'loss': 0.7044, 'grad_norm': 7.527324199676514, 'learning_rate': 4.321537125488493e-05, 'epoch': 0.48}


 16%|█▌        | 3010/18924 [1:41:55<8:19:45,  1.88s/it]

{'loss': 0.8618, 'grad_norm': 6.507564067840576, 'learning_rate': 4.318823273990448e-05, 'epoch': 0.48}


 16%|█▌        | 3020/18924 [1:42:13<8:19:16,  1.88s/it]

{'loss': 1.0922, 'grad_norm': 8.985532760620117, 'learning_rate': 4.3161094224924015e-05, 'epoch': 0.48}


 16%|█▌        | 3030/18924 [1:42:32<8:15:53,  1.87s/it]

{'loss': 0.9924, 'grad_norm': 6.775444030761719, 'learning_rate': 4.313395570994355e-05, 'epoch': 0.48}


 16%|█▌        | 3040/18924 [1:42:51<8:17:50,  1.88s/it]

{'loss': 1.0009, 'grad_norm': 12.503226280212402, 'learning_rate': 4.31068171949631e-05, 'epoch': 0.48}


 16%|█▌        | 3050/18924 [1:43:10<8:21:57,  1.90s/it]

{'loss': 1.0203, 'grad_norm': 9.257758140563965, 'learning_rate': 4.3079678679982634e-05, 'epoch': 0.48}


 16%|█▌        | 3060/18924 [1:43:29<8:16:36,  1.88s/it]

{'loss': 1.0273, 'grad_norm': 9.91024112701416, 'learning_rate': 4.305254016500218e-05, 'epoch': 0.49}


 16%|█▌        | 3070/18924 [1:43:48<8:18:18,  1.89s/it]

{'loss': 0.8539, 'grad_norm': 6.1034255027771, 'learning_rate': 4.3025401650021716e-05, 'epoch': 0.49}


 16%|█▋        | 3080/18924 [1:44:06<8:15:05,  1.87s/it]

{'loss': 1.026, 'grad_norm': 7.660027027130127, 'learning_rate': 4.299826313504125e-05, 'epoch': 0.49}


 16%|█▋        | 3090/18924 [1:44:25<8:13:35,  1.87s/it]

{'loss': 0.8149, 'grad_norm': 8.590332984924316, 'learning_rate': 4.297112462006079e-05, 'epoch': 0.49}


 16%|█▋        | 3100/18924 [1:44:44<8:19:34,  1.89s/it]

{'loss': 1.0663, 'grad_norm': 6.17831563949585, 'learning_rate': 4.294398610508033e-05, 'epoch': 0.49}


 16%|█▋        | 3110/18924 [1:45:03<8:23:41,  1.91s/it]

{'loss': 1.0171, 'grad_norm': 11.124811172485352, 'learning_rate': 4.291684759009987e-05, 'epoch': 0.49}


 16%|█▋        | 3120/18924 [1:45:22<8:28:29,  1.93s/it]

{'loss': 1.0372, 'grad_norm': 8.452400207519531, 'learning_rate': 4.288970907511941e-05, 'epoch': 0.49}


 17%|█▋        | 3130/18924 [1:45:42<8:29:45,  1.94s/it]

{'loss': 0.8983, 'grad_norm': 7.841787815093994, 'learning_rate': 4.2862570560138954e-05, 'epoch': 0.5}


 17%|█▋        | 3140/18924 [1:46:01<8:14:02,  1.88s/it]

{'loss': 1.0097, 'grad_norm': 5.068417072296143, 'learning_rate': 4.283543204515849e-05, 'epoch': 0.5}


 17%|█▋        | 3150/18924 [1:46:19<8:10:36,  1.87s/it]

{'loss': 0.8253, 'grad_norm': 10.27879810333252, 'learning_rate': 4.280829353017803e-05, 'epoch': 0.5}


 17%|█▋        | 3160/18924 [1:46:38<8:19:00,  1.90s/it]

{'loss': 1.0785, 'grad_norm': 14.257104873657227, 'learning_rate': 4.2781155015197573e-05, 'epoch': 0.5}


 17%|█▋        | 3170/18924 [1:46:57<8:21:17,  1.91s/it]

{'loss': 1.1365, 'grad_norm': 8.48618221282959, 'learning_rate': 4.275401650021711e-05, 'epoch': 0.5}


 17%|█▋        | 3180/18924 [1:47:16<8:25:13,  1.93s/it]

{'loss': 1.0135, 'grad_norm': 10.998083114624023, 'learning_rate': 4.2726877985236655e-05, 'epoch': 0.5}


 17%|█▋        | 3190/18924 [1:47:36<8:20:29,  1.91s/it]

{'loss': 0.9745, 'grad_norm': 9.380470275878906, 'learning_rate': 4.269973947025619e-05, 'epoch': 0.51}


 17%|█▋        | 3200/18924 [1:47:55<8:19:25,  1.91s/it]

{'loss': 1.0382, 'grad_norm': 9.06667423248291, 'learning_rate': 4.267260095527572e-05, 'epoch': 0.51}


 17%|█▋        | 3210/18924 [1:48:13<8:09:42,  1.87s/it]

{'loss': 0.9035, 'grad_norm': 7.200159549713135, 'learning_rate': 4.264546244029527e-05, 'epoch': 0.51}


 17%|█▋        | 3220/18924 [1:48:32<8:09:15,  1.87s/it]

{'loss': 0.9804, 'grad_norm': 8.355880737304688, 'learning_rate': 4.2618323925314805e-05, 'epoch': 0.51}


 17%|█▋        | 3230/18924 [1:48:51<8:13:41,  1.89s/it]

{'loss': 0.8285, 'grad_norm': 8.386260032653809, 'learning_rate': 4.259118541033435e-05, 'epoch': 0.51}


 17%|█▋        | 3240/18924 [1:49:10<8:20:55,  1.92s/it]

{'loss': 1.0541, 'grad_norm': 9.04215145111084, 'learning_rate': 4.256404689535389e-05, 'epoch': 0.51}


 17%|█▋        | 3250/18924 [1:49:30<8:25:29,  1.94s/it]

{'loss': 0.9254, 'grad_norm': 8.343427658081055, 'learning_rate': 4.253690838037343e-05, 'epoch': 0.52}


 17%|█▋        | 3260/18924 [1:49:49<8:17:43,  1.91s/it]

{'loss': 0.831, 'grad_norm': 4.793644905090332, 'learning_rate': 4.250976986539297e-05, 'epoch': 0.52}


 17%|█▋        | 3270/18924 [1:50:08<8:16:38,  1.90s/it]

{'loss': 1.0428, 'grad_norm': 8.445043563842773, 'learning_rate': 4.2482631350412506e-05, 'epoch': 0.52}


 17%|█▋        | 3280/18924 [1:50:27<8:16:27,  1.90s/it]

{'loss': 1.0674, 'grad_norm': 8.326789855957031, 'learning_rate': 4.245549283543205e-05, 'epoch': 0.52}


 17%|█▋        | 3290/18924 [1:50:46<8:09:26,  1.88s/it]

{'loss': 0.9633, 'grad_norm': 8.126697540283203, 'learning_rate': 4.242835432045159e-05, 'epoch': 0.52}


 17%|█▋        | 3300/18924 [1:51:04<8:08:16,  1.88s/it]

{'loss': 0.888, 'grad_norm': 4.784097194671631, 'learning_rate': 4.2401215805471125e-05, 'epoch': 0.52}


 17%|█▋        | 3310/18924 [1:51:23<8:14:31,  1.90s/it]

{'loss': 1.1169, 'grad_norm': 8.876235008239746, 'learning_rate': 4.237407729049066e-05, 'epoch': 0.52}


 18%|█▊        | 3320/18924 [1:51:42<8:14:40,  1.90s/it]

{'loss': 1.1231, 'grad_norm': 9.179895401000977, 'learning_rate': 4.234693877551021e-05, 'epoch': 0.53}


 18%|█▊        | 3330/18924 [1:52:01<8:15:13,  1.91s/it]

{'loss': 1.0409, 'grad_norm': 9.470203399658203, 'learning_rate': 4.2319800260529744e-05, 'epoch': 0.53}


 18%|█▊        | 3340/18924 [1:52:21<8:32:31,  1.97s/it]

{'loss': 0.9685, 'grad_norm': 8.350357055664062, 'learning_rate': 4.229266174554928e-05, 'epoch': 0.53}


 18%|█▊        | 3350/18924 [1:52:42<9:03:17,  2.09s/it]

{'loss': 0.9633, 'grad_norm': 10.116786003112793, 'learning_rate': 4.2265523230568826e-05, 'epoch': 0.53}


 18%|█▊        | 3360/18924 [1:53:01<8:12:08,  1.90s/it]

{'loss': 0.9747, 'grad_norm': 7.283727169036865, 'learning_rate': 4.2238384715588364e-05, 'epoch': 0.53}


 18%|█▊        | 3370/18924 [1:53:20<8:00:22,  1.85s/it]

{'loss': 0.9616, 'grad_norm': 7.5558905601501465, 'learning_rate': 4.221124620060791e-05, 'epoch': 0.53}


 18%|█▊        | 3380/18924 [1:53:38<8:03:03,  1.86s/it]

{'loss': 0.9897, 'grad_norm': 8.654813766479492, 'learning_rate': 4.2184107685627445e-05, 'epoch': 0.54}


 18%|█▊        | 3390/18924 [1:53:57<8:08:57,  1.89s/it]

{'loss': 0.7828, 'grad_norm': 5.939502716064453, 'learning_rate': 4.215696917064698e-05, 'epoch': 0.54}


 18%|█▊        | 3400/18924 [1:54:16<8:08:09,  1.89s/it]

{'loss': 1.0449, 'grad_norm': 12.811129570007324, 'learning_rate': 4.212983065566653e-05, 'epoch': 0.54}


 18%|█▊        | 3410/18924 [1:54:35<8:07:56,  1.89s/it]

{'loss': 0.9586, 'grad_norm': 14.958794593811035, 'learning_rate': 4.2102692140686064e-05, 'epoch': 0.54}


 18%|█▊        | 3420/18924 [1:54:54<8:08:24,  1.89s/it]

{'loss': 0.959, 'grad_norm': 8.706069946289062, 'learning_rate': 4.20755536257056e-05, 'epoch': 0.54}


 18%|█▊        | 3430/18924 [1:55:13<8:03:22,  1.87s/it]

{'loss': 0.9323, 'grad_norm': 7.202505111694336, 'learning_rate': 4.204841511072514e-05, 'epoch': 0.54}


 18%|█▊        | 3440/18924 [1:55:31<8:07:35,  1.89s/it]

{'loss': 0.8699, 'grad_norm': 6.440024375915527, 'learning_rate': 4.2021276595744684e-05, 'epoch': 0.55}


 18%|█▊        | 3450/18924 [1:55:50<8:06:45,  1.89s/it]

{'loss': 1.0428, 'grad_norm': 8.73386287689209, 'learning_rate': 4.199413808076422e-05, 'epoch': 0.55}


 18%|█▊        | 3460/18924 [1:56:09<8:02:35,  1.87s/it]

{'loss': 0.9154, 'grad_norm': 8.670973777770996, 'learning_rate': 4.196699956578376e-05, 'epoch': 0.55}


 18%|█▊        | 3470/18924 [1:56:28<8:06:26,  1.89s/it]

{'loss': 0.7949, 'grad_norm': 8.628874778747559, 'learning_rate': 4.19398610508033e-05, 'epoch': 0.55}


 18%|█▊        | 3480/18924 [1:56:47<8:11:18,  1.91s/it]

{'loss': 1.1084, 'grad_norm': 8.36253833770752, 'learning_rate': 4.191272253582284e-05, 'epoch': 0.55}


 18%|█▊        | 3490/18924 [1:57:06<8:05:56,  1.89s/it]

{'loss': 1.1116, 'grad_norm': 8.332494735717773, 'learning_rate': 4.1885584020842385e-05, 'epoch': 0.55}


 18%|█▊        | 3500/18924 [1:57:25<8:04:08,  1.88s/it]

{'loss': 0.9028, 'grad_norm': 7.422390937805176, 'learning_rate': 4.185844550586192e-05, 'epoch': 0.55}


 19%|█▊        | 3510/18924 [1:57:44<8:04:23,  1.89s/it]

{'loss': 1.1243, 'grad_norm': 6.306804656982422, 'learning_rate': 4.1831306990881466e-05, 'epoch': 0.56}


 19%|█▊        | 3520/18924 [1:58:04<8:24:44,  1.97s/it]

{'loss': 0.9627, 'grad_norm': 9.007428169250488, 'learning_rate': 4.1804168475901004e-05, 'epoch': 0.56}


 19%|█▊        | 3530/18924 [1:58:25<9:06:34,  2.13s/it]

{'loss': 0.97, 'grad_norm': 7.782107353210449, 'learning_rate': 4.177702996092054e-05, 'epoch': 0.56}


 19%|█▊        | 3540/18924 [1:58:46<9:10:49,  2.15s/it]

{'loss': 1.0813, 'grad_norm': 4.738739013671875, 'learning_rate': 4.174989144594008e-05, 'epoch': 0.56}


 19%|█▉        | 3550/18924 [1:59:07<8:53:12,  2.08s/it]

{'loss': 0.9053, 'grad_norm': 5.506789684295654, 'learning_rate': 4.1722752930959616e-05, 'epoch': 0.56}


 19%|█▉        | 3560/18924 [1:59:28<8:47:15,  2.06s/it]

{'loss': 0.8735, 'grad_norm': 5.744731426239014, 'learning_rate': 4.169561441597916e-05, 'epoch': 0.56}


 19%|█▉        | 3570/18924 [1:59:48<8:42:35,  2.04s/it]

{'loss': 1.0561, 'grad_norm': 7.087968349456787, 'learning_rate': 4.16684759009987e-05, 'epoch': 0.57}


 19%|█▉        | 3580/18924 [2:00:08<8:36:33,  2.02s/it]

{'loss': 0.8822, 'grad_norm': 14.064587593078613, 'learning_rate': 4.1641337386018235e-05, 'epoch': 0.57}


 19%|█▉        | 3590/18924 [2:00:29<8:36:26,  2.02s/it]

{'loss': 0.8414, 'grad_norm': 9.51914119720459, 'learning_rate': 4.161419887103778e-05, 'epoch': 0.57}


 19%|█▉        | 3600/18924 [2:00:49<8:34:25,  2.01s/it]

{'loss': 1.1093, 'grad_norm': 8.789275169372559, 'learning_rate': 4.158706035605732e-05, 'epoch': 0.57}


 19%|█▉        | 3610/18924 [2:01:09<8:30:43,  2.00s/it]

{'loss': 0.9984, 'grad_norm': 9.06257438659668, 'learning_rate': 4.155992184107686e-05, 'epoch': 0.57}


 19%|█▉        | 3620/18924 [2:01:29<8:31:33,  2.01s/it]

{'loss': 1.0401, 'grad_norm': 8.591787338256836, 'learning_rate': 4.15327833260964e-05, 'epoch': 0.57}


 19%|█▉        | 3630/18924 [2:01:49<8:28:07,  1.99s/it]

{'loss': 0.7637, 'grad_norm': 10.72758674621582, 'learning_rate': 4.150564481111594e-05, 'epoch': 0.58}


 19%|█▉        | 3640/18924 [2:02:08<7:50:58,  1.85s/it]

{'loss': 1.0141, 'grad_norm': 10.245415687561035, 'learning_rate': 4.147850629613548e-05, 'epoch': 0.58}


 19%|█▉        | 3650/18924 [2:02:26<7:41:41,  1.81s/it]

{'loss': 0.9309, 'grad_norm': 11.024054527282715, 'learning_rate': 4.145136778115502e-05, 'epoch': 0.58}


 19%|█▉        | 3660/18924 [2:02:44<7:53:24,  1.86s/it]

{'loss': 1.0518, 'grad_norm': 8.47916030883789, 'learning_rate': 4.1424229266174556e-05, 'epoch': 0.58}


 19%|█▉        | 3670/18924 [2:03:03<7:59:36,  1.89s/it]

{'loss': 0.9432, 'grad_norm': 7.767529487609863, 'learning_rate': 4.139709075119409e-05, 'epoch': 0.58}


 19%|█▉        | 3680/18924 [2:03:22<7:58:14,  1.88s/it]

{'loss': 1.0432, 'grad_norm': 7.863335609436035, 'learning_rate': 4.136995223621364e-05, 'epoch': 0.58}


 19%|█▉        | 3690/18924 [2:03:41<7:51:31,  1.86s/it]

{'loss': 1.1235, 'grad_norm': 8.867847442626953, 'learning_rate': 4.1342813721233175e-05, 'epoch': 0.58}


 20%|█▉        | 3700/18924 [2:03:59<7:51:04,  1.86s/it]

{'loss': 1.1041, 'grad_norm': 5.803707599639893, 'learning_rate': 4.131567520625272e-05, 'epoch': 0.59}


 20%|█▉        | 3710/18924 [2:04:18<7:55:05,  1.87s/it]

{'loss': 0.7293, 'grad_norm': 10.83293342590332, 'learning_rate': 4.1288536691272256e-05, 'epoch': 0.59}


 20%|█▉        | 3720/18924 [2:04:37<7:54:41,  1.87s/it]

{'loss': 1.0614, 'grad_norm': 6.6842451095581055, 'learning_rate': 4.1261398176291794e-05, 'epoch': 0.59}


 20%|█▉        | 3730/18924 [2:04:55<7:53:31,  1.87s/it]

{'loss': 0.9925, 'grad_norm': 9.394102096557617, 'learning_rate': 4.123425966131134e-05, 'epoch': 0.59}


 20%|█▉        | 3740/18924 [2:05:14<7:55:31,  1.88s/it]

{'loss': 0.9202, 'grad_norm': 8.809697151184082, 'learning_rate': 4.1207121146330876e-05, 'epoch': 0.59}


 20%|█▉        | 3750/18924 [2:05:33<8:02:53,  1.91s/it]

{'loss': 0.7259, 'grad_norm': 6.935725688934326, 'learning_rate': 4.117998263135042e-05, 'epoch': 0.59}


 20%|█▉        | 3760/18924 [2:05:52<8:09:27,  1.94s/it]

{'loss': 1.1277, 'grad_norm': 8.16643238067627, 'learning_rate': 4.115284411636996e-05, 'epoch': 0.6}


 20%|█▉        | 3770/18924 [2:06:12<8:11:59,  1.95s/it]

{'loss': 0.8721, 'grad_norm': 20.363927841186523, 'learning_rate': 4.1125705601389495e-05, 'epoch': 0.6}


 20%|█▉        | 3780/18924 [2:06:31<8:10:40,  1.94s/it]

{'loss': 0.8998, 'grad_norm': 10.649453163146973, 'learning_rate': 4.109856708640903e-05, 'epoch': 0.6}


 20%|██        | 3790/18924 [2:06:51<8:02:56,  1.91s/it]

{'loss': 1.0281, 'grad_norm': 9.697054862976074, 'learning_rate': 4.107142857142857e-05, 'epoch': 0.6}


 20%|██        | 3800/18924 [2:07:09<7:54:07,  1.88s/it]

{'loss': 0.9918, 'grad_norm': 7.600900650024414, 'learning_rate': 4.1044290056448114e-05, 'epoch': 0.6}


 20%|██        | 3810/18924 [2:07:28<7:57:49,  1.90s/it]

{'loss': 1.136, 'grad_norm': 8.44373893737793, 'learning_rate': 4.101715154146765e-05, 'epoch': 0.6}


 20%|██        | 3820/18924 [2:07:48<8:10:06,  1.95s/it]

{'loss': 0.951, 'grad_norm': 7.265394687652588, 'learning_rate': 4.0990013026487196e-05, 'epoch': 0.61}


 20%|██        | 3830/18924 [2:08:07<8:00:34,  1.91s/it]

{'loss': 0.9954, 'grad_norm': 11.762185096740723, 'learning_rate': 4.096287451150673e-05, 'epoch': 0.61}


 20%|██        | 3840/18924 [2:08:26<7:56:31,  1.90s/it]

{'loss': 0.8916, 'grad_norm': 8.101730346679688, 'learning_rate': 4.093573599652627e-05, 'epoch': 0.61}


 20%|██        | 3850/18924 [2:08:45<8:09:10,  1.95s/it]

{'loss': 0.9315, 'grad_norm': 7.0797600746154785, 'learning_rate': 4.0908597481545815e-05, 'epoch': 0.61}


 20%|██        | 3860/18924 [2:09:05<8:12:39,  1.96s/it]

{'loss': 0.9228, 'grad_norm': 9.275520324707031, 'learning_rate': 4.088145896656535e-05, 'epoch': 0.61}


 20%|██        | 3870/18924 [2:09:24<8:09:38,  1.95s/it]

{'loss': 1.0192, 'grad_norm': 9.510906219482422, 'learning_rate': 4.085432045158489e-05, 'epoch': 0.61}


 21%|██        | 3880/18924 [2:09:44<8:05:24,  1.94s/it]

{'loss': 1.0682, 'grad_norm': 13.640631675720215, 'learning_rate': 4.082718193660443e-05, 'epoch': 0.62}


 21%|██        | 3890/18924 [2:10:03<8:04:12,  1.93s/it]

{'loss': 0.8824, 'grad_norm': 5.724077224731445, 'learning_rate': 4.0800043421623965e-05, 'epoch': 0.62}


 21%|██        | 3900/18924 [2:10:22<8:14:35,  1.98s/it]

{'loss': 0.8316, 'grad_norm': 5.490976810455322, 'learning_rate': 4.077290490664351e-05, 'epoch': 0.62}


 21%|██        | 3910/18924 [2:10:42<8:06:53,  1.95s/it]

{'loss': 0.8973, 'grad_norm': 8.93968677520752, 'learning_rate': 4.0745766391663047e-05, 'epoch': 0.62}


 21%|██        | 3920/18924 [2:11:01<7:59:19,  1.92s/it]

{'loss': 0.8545, 'grad_norm': 13.01517391204834, 'learning_rate': 4.071862787668259e-05, 'epoch': 0.62}


 21%|██        | 3930/18924 [2:11:20<8:05:36,  1.94s/it]

{'loss': 0.9516, 'grad_norm': 6.843331813812256, 'learning_rate': 4.069148936170213e-05, 'epoch': 0.62}


 21%|██        | 3940/18924 [2:11:40<8:10:05,  1.96s/it]

{'loss': 0.8069, 'grad_norm': 8.18807315826416, 'learning_rate': 4.066435084672167e-05, 'epoch': 0.62}


 21%|██        | 3950/18924 [2:12:00<8:16:33,  1.99s/it]

{'loss': 0.7346, 'grad_norm': 9.852282524108887, 'learning_rate': 4.063721233174121e-05, 'epoch': 0.63}


 21%|██        | 3960/18924 [2:12:19<8:03:07,  1.94s/it]

{'loss': 0.9436, 'grad_norm': 8.734076499938965, 'learning_rate': 4.061007381676075e-05, 'epoch': 0.63}


 21%|██        | 3970/18924 [2:12:39<8:00:21,  1.93s/it]

{'loss': 0.8385, 'grad_norm': 11.3905611038208, 'learning_rate': 4.058293530178029e-05, 'epoch': 0.63}


 21%|██        | 3980/18924 [2:12:58<8:12:08,  1.98s/it]

{'loss': 0.7458, 'grad_norm': 7.4328460693359375, 'learning_rate': 4.055579678679983e-05, 'epoch': 0.63}


 21%|██        | 3990/18924 [2:13:18<8:12:23,  1.98s/it]

{'loss': 1.0237, 'grad_norm': 10.859747886657715, 'learning_rate': 4.052865827181937e-05, 'epoch': 0.63}


 21%|██        | 4000/18924 [2:13:38<8:09:34,  1.97s/it]

{'loss': 1.0744, 'grad_norm': 9.28200912475586, 'learning_rate': 4.0501519756838904e-05, 'epoch': 0.63}


 21%|██        | 4010/18924 [2:13:58<8:12:38,  1.98s/it]

{'loss': 0.7842, 'grad_norm': 9.50847339630127, 'learning_rate': 4.047438124185845e-05, 'epoch': 0.64}


 21%|██        | 4020/18924 [2:14:18<8:07:34,  1.96s/it]

{'loss': 1.143, 'grad_norm': 9.525138854980469, 'learning_rate': 4.0447242726877986e-05, 'epoch': 0.64}


 21%|██▏       | 4030/18924 [2:14:37<8:07:59,  1.97s/it]

{'loss': 0.9417, 'grad_norm': 8.188075065612793, 'learning_rate': 4.042010421189752e-05, 'epoch': 0.64}


 21%|██▏       | 4040/18924 [2:14:57<8:13:46,  1.99s/it]

{'loss': 0.8805, 'grad_norm': 9.11673641204834, 'learning_rate': 4.039296569691707e-05, 'epoch': 0.64}


 21%|██▏       | 4050/18924 [2:15:17<8:03:40,  1.95s/it]

{'loss': 0.9762, 'grad_norm': 11.219305038452148, 'learning_rate': 4.0365827181936605e-05, 'epoch': 0.64}


 21%|██▏       | 4060/18924 [2:15:36<8:03:50,  1.95s/it]

{'loss': 0.8623, 'grad_norm': 6.527420520782471, 'learning_rate': 4.033868866695615e-05, 'epoch': 0.64}


 22%|██▏       | 4070/18924 [2:15:56<7:59:39,  1.94s/it]

{'loss': 1.0704, 'grad_norm': 7.004192352294922, 'learning_rate': 4.031155015197569e-05, 'epoch': 0.65}


 22%|██▏       | 4080/18924 [2:16:15<7:56:16,  1.93s/it]

{'loss': 0.9428, 'grad_norm': 9.705744743347168, 'learning_rate': 4.0284411636995224e-05, 'epoch': 0.65}


 22%|██▏       | 4090/18924 [2:16:35<8:06:04,  1.97s/it]

{'loss': 1.0122, 'grad_norm': 8.713143348693848, 'learning_rate': 4.025727312201477e-05, 'epoch': 0.65}


 22%|██▏       | 4100/18924 [2:16:55<8:17:49,  2.01s/it]

{'loss': 0.9391, 'grad_norm': 6.349215030670166, 'learning_rate': 4.0230134607034306e-05, 'epoch': 0.65}


 22%|██▏       | 4110/18924 [2:17:15<8:09:29,  1.98s/it]

{'loss': 0.8556, 'grad_norm': 6.9579291343688965, 'learning_rate': 4.0202996092053843e-05, 'epoch': 0.65}


 22%|██▏       | 4120/18924 [2:17:34<7:58:22,  1.94s/it]

{'loss': 0.9372, 'grad_norm': 8.486611366271973, 'learning_rate': 4.017585757707338e-05, 'epoch': 0.65}


 22%|██▏       | 4130/18924 [2:17:54<8:02:57,  1.96s/it]

{'loss': 0.7435, 'grad_norm': 5.39361572265625, 'learning_rate': 4.0148719062092925e-05, 'epoch': 0.65}


 22%|██▏       | 4140/18924 [2:18:13<8:07:31,  1.98s/it]

{'loss': 1.0742, 'grad_norm': 14.171125411987305, 'learning_rate': 4.012158054711246e-05, 'epoch': 0.66}


 22%|██▏       | 4150/18924 [2:18:33<8:00:50,  1.95s/it]

{'loss': 0.9108, 'grad_norm': 4.560145854949951, 'learning_rate': 4.0094442032132e-05, 'epoch': 0.66}


 22%|██▏       | 4160/18924 [2:18:52<7:50:56,  1.91s/it]

{'loss': 1.02, 'grad_norm': 7.917825222015381, 'learning_rate': 4.0067303517151544e-05, 'epoch': 0.66}


 22%|██▏       | 4170/18924 [2:19:12<8:01:19,  1.96s/it]

{'loss': 1.0539, 'grad_norm': 11.28288745880127, 'learning_rate': 4.004016500217108e-05, 'epoch': 0.66}


 22%|██▏       | 4180/18924 [2:19:31<8:03:37,  1.97s/it]

{'loss': 1.1737, 'grad_norm': 9.921168327331543, 'learning_rate': 4.0013026487190626e-05, 'epoch': 0.66}


 22%|██▏       | 4190/18924 [2:19:51<8:01:52,  1.96s/it]

{'loss': 1.0821, 'grad_norm': 8.883529663085938, 'learning_rate': 3.9985887972210164e-05, 'epoch': 0.66}


 22%|██▏       | 4200/18924 [2:20:10<7:59:02,  1.95s/it]

{'loss': 0.9511, 'grad_norm': 7.304935455322266, 'learning_rate': 3.995874945722971e-05, 'epoch': 0.67}


 22%|██▏       | 4210/18924 [2:20:30<8:03:27,  1.97s/it]

{'loss': 1.0629, 'grad_norm': 5.303044319152832, 'learning_rate': 3.9931610942249245e-05, 'epoch': 0.67}


 22%|██▏       | 4220/18924 [2:20:50<8:02:23,  1.97s/it]

{'loss': 0.9908, 'grad_norm': 9.001443862915039, 'learning_rate': 3.990447242726878e-05, 'epoch': 0.67}


 22%|██▏       | 4230/18924 [2:21:09<7:58:22,  1.95s/it]

{'loss': 0.8387, 'grad_norm': 4.325475215911865, 'learning_rate': 3.987733391228832e-05, 'epoch': 0.67}


 22%|██▏       | 4240/18924 [2:21:29<7:57:13,  1.95s/it]

{'loss': 1.002, 'grad_norm': 8.586522102355957, 'learning_rate': 3.985019539730786e-05, 'epoch': 0.67}


 22%|██▏       | 4250/18924 [2:21:48<7:53:52,  1.94s/it]

{'loss': 1.0004, 'grad_norm': 8.5556640625, 'learning_rate': 3.98230568823274e-05, 'epoch': 0.67}


 23%|██▎       | 4260/18924 [2:22:08<7:54:21,  1.94s/it]

{'loss': 1.096, 'grad_norm': 10.354440689086914, 'learning_rate': 3.979591836734694e-05, 'epoch': 0.68}


 23%|██▎       | 4270/18924 [2:22:27<7:55:26,  1.95s/it]

{'loss': 0.9247, 'grad_norm': 6.444290637969971, 'learning_rate': 3.976877985236648e-05, 'epoch': 0.68}


 23%|██▎       | 4280/18924 [2:22:47<8:12:14,  2.02s/it]

{'loss': 0.7274, 'grad_norm': 4.997109889984131, 'learning_rate': 3.974164133738602e-05, 'epoch': 0.68}


 23%|██▎       | 4290/18924 [2:23:07<7:55:37,  1.95s/it]

{'loss': 1.0442, 'grad_norm': 7.304619312286377, 'learning_rate': 3.971450282240556e-05, 'epoch': 0.68}


 23%|██▎       | 4300/18924 [2:23:26<7:50:48,  1.93s/it]

{'loss': 0.9573, 'grad_norm': 11.416963577270508, 'learning_rate': 3.96873643074251e-05, 'epoch': 0.68}


 23%|██▎       | 4310/18924 [2:23:45<7:45:52,  1.91s/it]

{'loss': 0.9334, 'grad_norm': 11.435356140136719, 'learning_rate': 3.966022579244464e-05, 'epoch': 0.68}


 23%|██▎       | 4320/18924 [2:24:05<7:51:38,  1.94s/it]

{'loss': 0.9617, 'grad_norm': 6.4074296951293945, 'learning_rate': 3.9633087277464185e-05, 'epoch': 0.68}


 23%|██▎       | 4330/18924 [2:24:24<7:59:53,  1.97s/it]

{'loss': 1.0658, 'grad_norm': 9.643607139587402, 'learning_rate': 3.960594876248372e-05, 'epoch': 0.69}


 23%|██▎       | 4340/18924 [2:24:45<8:39:15,  2.14s/it]

{'loss': 0.8217, 'grad_norm': 8.712701797485352, 'learning_rate': 3.957881024750326e-05, 'epoch': 0.69}


 23%|██▎       | 4350/18924 [2:25:07<8:31:45,  2.11s/it]

{'loss': 0.9345, 'grad_norm': 7.3135151863098145, 'learning_rate': 3.95516717325228e-05, 'epoch': 0.69}


 23%|██▎       | 4360/18924 [2:25:27<8:13:29,  2.03s/it]

{'loss': 0.9912, 'grad_norm': 7.886600494384766, 'learning_rate': 3.9524533217542335e-05, 'epoch': 0.69}


 23%|██▎       | 4370/18924 [2:25:47<8:25:42,  2.08s/it]

{'loss': 0.9743, 'grad_norm': 11.12131404876709, 'learning_rate': 3.949739470256188e-05, 'epoch': 0.69}


 23%|██▎       | 4380/18924 [2:26:09<8:39:41,  2.14s/it]

{'loss': 1.143, 'grad_norm': 10.4907808303833, 'learning_rate': 3.9470256187581416e-05, 'epoch': 0.69}


 23%|██▎       | 4390/18924 [2:26:30<8:27:06,  2.09s/it]

{'loss': 1.0109, 'grad_norm': 11.282918930053711, 'learning_rate': 3.944311767260096e-05, 'epoch': 0.7}


 23%|██▎       | 4400/18924 [2:26:50<7:46:11,  1.93s/it]

{'loss': 1.1172, 'grad_norm': 12.082927703857422, 'learning_rate': 3.94159791576205e-05, 'epoch': 0.7}


 23%|██▎       | 4410/18924 [2:27:09<7:38:50,  1.90s/it]

{'loss': 0.9627, 'grad_norm': 7.351726531982422, 'learning_rate': 3.9388840642640035e-05, 'epoch': 0.7}


 23%|██▎       | 4420/18924 [2:27:28<7:37:01,  1.89s/it]

{'loss': 0.8517, 'grad_norm': 9.37344741821289, 'learning_rate': 3.936170212765958e-05, 'epoch': 0.7}


 23%|██▎       | 4430/18924 [2:27:47<7:54:16,  1.96s/it]

{'loss': 0.9403, 'grad_norm': 10.752184867858887, 'learning_rate': 3.933456361267912e-05, 'epoch': 0.7}


 23%|██▎       | 4440/18924 [2:28:06<7:53:11,  1.96s/it]

{'loss': 0.9579, 'grad_norm': 11.669951438903809, 'learning_rate': 3.9307425097698655e-05, 'epoch': 0.7}


 24%|██▎       | 4450/18924 [2:28:26<7:37:56,  1.90s/it]

{'loss': 0.9741, 'grad_norm': 8.867688179016113, 'learning_rate': 3.928028658271819e-05, 'epoch': 0.71}


 24%|██▎       | 4460/18924 [2:28:45<7:35:28,  1.89s/it]

{'loss': 0.9189, 'grad_norm': 7.2566328048706055, 'learning_rate': 3.925314806773773e-05, 'epoch': 0.71}


 24%|██▎       | 4470/18924 [2:29:04<7:55:54,  1.98s/it]

{'loss': 0.9097, 'grad_norm': 6.529553413391113, 'learning_rate': 3.9226009552757274e-05, 'epoch': 0.71}


 24%|██▎       | 4480/18924 [2:29:23<7:38:45,  1.91s/it]

{'loss': 1.0337, 'grad_norm': 6.781398296356201, 'learning_rate': 3.919887103777681e-05, 'epoch': 0.71}


 24%|██▎       | 4490/18924 [2:29:42<7:27:10,  1.86s/it]

{'loss': 0.9353, 'grad_norm': 5.966160297393799, 'learning_rate': 3.9171732522796356e-05, 'epoch': 0.71}


 24%|██▍       | 4500/18924 [2:30:00<7:20:42,  1.83s/it]

{'loss': 0.7561, 'grad_norm': 13.703306198120117, 'learning_rate': 3.914459400781589e-05, 'epoch': 0.71}


 24%|██▍       | 4510/18924 [2:30:19<7:24:52,  1.85s/it]

{'loss': 0.9357, 'grad_norm': 10.24828052520752, 'learning_rate': 3.911745549283544e-05, 'epoch': 0.71}


 24%|██▍       | 4520/18924 [2:30:38<7:27:58,  1.87s/it]

{'loss': 0.9011, 'grad_norm': 9.519153594970703, 'learning_rate': 3.9090316977854975e-05, 'epoch': 0.72}


 24%|██▍       | 4530/18924 [2:30:57<7:28:49,  1.87s/it]

{'loss': 1.0567, 'grad_norm': 10.221845626831055, 'learning_rate': 3.906317846287451e-05, 'epoch': 0.72}


 24%|██▍       | 4540/18924 [2:31:16<7:53:16,  1.97s/it]

{'loss': 0.936, 'grad_norm': 8.915684700012207, 'learning_rate': 3.9036039947894056e-05, 'epoch': 0.72}


 24%|██▍       | 4550/18924 [2:31:36<7:40:53,  1.92s/it]

{'loss': 0.9065, 'grad_norm': 7.341060161590576, 'learning_rate': 3.9008901432913594e-05, 'epoch': 0.72}


 24%|██▍       | 4560/18924 [2:31:55<7:26:07,  1.86s/it]

{'loss': 1.1044, 'grad_norm': 9.653285026550293, 'learning_rate': 3.898176291793313e-05, 'epoch': 0.72}


 24%|██▍       | 4570/18924 [2:32:14<8:01:03,  2.01s/it]

{'loss': 0.8565, 'grad_norm': 11.869685173034668, 'learning_rate': 3.895462440295267e-05, 'epoch': 0.72}


 24%|██▍       | 4580/18924 [2:32:35<8:16:23,  2.08s/it]

{'loss': 0.9485, 'grad_norm': 13.403650283813477, 'learning_rate': 3.8927485887972206e-05, 'epoch': 0.73}


 24%|██▍       | 4590/18924 [2:32:55<7:51:14,  1.97s/it]

{'loss': 0.9623, 'grad_norm': 6.466182708740234, 'learning_rate': 3.890034737299175e-05, 'epoch': 0.73}


 24%|██▍       | 4600/18924 [2:33:13<7:17:11,  1.83s/it]

{'loss': 0.8893, 'grad_norm': 6.31643533706665, 'learning_rate': 3.887320885801129e-05, 'epoch': 0.73}


 24%|██▍       | 4610/18924 [2:33:32<7:11:56,  1.81s/it]

{'loss': 0.8798, 'grad_norm': 7.4684367179870605, 'learning_rate': 3.884607034303083e-05, 'epoch': 0.73}


 24%|██▍       | 4620/18924 [2:33:50<7:14:51,  1.82s/it]

{'loss': 0.7695, 'grad_norm': 7.419144630432129, 'learning_rate': 3.881893182805037e-05, 'epoch': 0.73}


 24%|██▍       | 4630/18924 [2:34:09<7:37:40,  1.92s/it]

{'loss': 0.9878, 'grad_norm': 5.919284343719482, 'learning_rate': 3.8791793313069914e-05, 'epoch': 0.73}


 25%|██▍       | 4640/18924 [2:34:28<7:27:18,  1.88s/it]

{'loss': 1.1552, 'grad_norm': 7.799288272857666, 'learning_rate': 3.876465479808945e-05, 'epoch': 0.74}


 25%|██▍       | 4650/18924 [2:34:48<8:04:43,  2.04s/it]

{'loss': 1.0735, 'grad_norm': 7.259054183959961, 'learning_rate': 3.873751628310899e-05, 'epoch': 0.74}


 25%|██▍       | 4660/18924 [2:35:07<7:42:10,  1.94s/it]

{'loss': 1.1339, 'grad_norm': 14.271435737609863, 'learning_rate': 3.871037776812853e-05, 'epoch': 0.74}


 25%|██▍       | 4670/18924 [2:35:27<7:34:43,  1.91s/it]

{'loss': 0.9643, 'grad_norm': 11.265347480773926, 'learning_rate': 3.868323925314807e-05, 'epoch': 0.74}


 25%|██▍       | 4680/18924 [2:35:45<7:21:40,  1.86s/it]

{'loss': 1.0884, 'grad_norm': 9.166102409362793, 'learning_rate': 3.865610073816761e-05, 'epoch': 0.74}


 25%|██▍       | 4690/18924 [2:36:04<7:14:36,  1.83s/it]

{'loss': 0.8958, 'grad_norm': 6.931641101837158, 'learning_rate': 3.8628962223187146e-05, 'epoch': 0.74}


 25%|██▍       | 4700/18924 [2:36:22<7:08:57,  1.81s/it]

{'loss': 0.7707, 'grad_norm': 8.371326446533203, 'learning_rate': 3.860182370820669e-05, 'epoch': 0.75}


 25%|██▍       | 4710/18924 [2:36:40<7:13:52,  1.83s/it]

{'loss': 0.7758, 'grad_norm': 8.783226013183594, 'learning_rate': 3.857468519322623e-05, 'epoch': 0.75}


 25%|██▍       | 4720/18924 [2:36:58<7:21:33,  1.87s/it]

{'loss': 1.021, 'grad_norm': 5.297598361968994, 'learning_rate': 3.8547546678245765e-05, 'epoch': 0.75}


 25%|██▍       | 4730/18924 [2:37:17<7:28:35,  1.90s/it]

{'loss': 0.8512, 'grad_norm': 13.113702774047852, 'learning_rate': 3.852040816326531e-05, 'epoch': 0.75}


 25%|██▌       | 4740/18924 [2:37:37<7:38:03,  1.94s/it]

{'loss': 1.0062, 'grad_norm': 7.233025074005127, 'learning_rate': 3.8493269648284847e-05, 'epoch': 0.75}


 25%|██▌       | 4750/18924 [2:37:56<7:40:05,  1.95s/it]

{'loss': 1.0092, 'grad_norm': 10.630636215209961, 'learning_rate': 3.846613113330439e-05, 'epoch': 0.75}


 25%|██▌       | 4760/18924 [2:38:15<7:22:28,  1.87s/it]

{'loss': 0.6844, 'grad_norm': 7.041744709014893, 'learning_rate': 3.843899261832393e-05, 'epoch': 0.75}


 25%|██▌       | 4770/18924 [2:38:34<7:16:08,  1.85s/it]

{'loss': 1.0205, 'grad_norm': 10.993345260620117, 'learning_rate': 3.8411854103343466e-05, 'epoch': 0.76}


 25%|██▌       | 4780/18924 [2:38:52<7:28:19,  1.90s/it]

{'loss': 0.8734, 'grad_norm': 14.145733833312988, 'learning_rate': 3.838471558836301e-05, 'epoch': 0.76}


 25%|██▌       | 4790/18924 [2:39:12<7:31:39,  1.92s/it]

{'loss': 0.7681, 'grad_norm': 6.209301471710205, 'learning_rate': 3.835757707338255e-05, 'epoch': 0.76}


 25%|██▌       | 4800/18924 [2:39:30<7:17:14,  1.86s/it]

{'loss': 1.0822, 'grad_norm': 8.675596237182617, 'learning_rate': 3.8330438558402085e-05, 'epoch': 0.76}


 25%|██▌       | 4810/18924 [2:39:49<7:36:12,  1.94s/it]

{'loss': 0.966, 'grad_norm': 9.488978385925293, 'learning_rate': 3.830330004342162e-05, 'epoch': 0.76}


 25%|██▌       | 4820/18924 [2:40:10<8:16:08,  2.11s/it]

{'loss': 0.994, 'grad_norm': 8.550642967224121, 'learning_rate': 3.827616152844117e-05, 'epoch': 0.76}


 26%|██▌       | 4830/18924 [2:40:31<7:52:10,  2.01s/it]

{'loss': 0.7817, 'grad_norm': 9.633970260620117, 'learning_rate': 3.8249023013460704e-05, 'epoch': 0.77}


 26%|██▌       | 4840/18924 [2:40:49<7:15:04,  1.85s/it]

{'loss': 0.8552, 'grad_norm': 8.76780891418457, 'learning_rate': 3.822188449848024e-05, 'epoch': 0.77}


 26%|██▌       | 4850/18924 [2:41:09<7:48:07,  2.00s/it]

{'loss': 0.9452, 'grad_norm': 8.046527862548828, 'learning_rate': 3.8194745983499786e-05, 'epoch': 0.77}


 26%|██▌       | 4860/18924 [2:41:29<7:55:33,  2.03s/it]

{'loss': 0.9253, 'grad_norm': 9.100655555725098, 'learning_rate': 3.816760746851932e-05, 'epoch': 0.77}


 26%|██▌       | 4870/18924 [2:41:49<7:33:15,  1.94s/it]

{'loss': 0.9843, 'grad_norm': 9.455655097961426, 'learning_rate': 3.814046895353887e-05, 'epoch': 0.77}


 26%|██▌       | 4880/18924 [2:42:07<7:06:13,  1.82s/it]

{'loss': 0.9127, 'grad_norm': 10.94446849822998, 'learning_rate': 3.8113330438558405e-05, 'epoch': 0.77}


 26%|██▌       | 4890/18924 [2:42:26<7:03:31,  1.81s/it]

{'loss': 1.0121, 'grad_norm': 6.077000617980957, 'learning_rate': 3.808619192357795e-05, 'epoch': 0.78}


 26%|██▌       | 4900/18924 [2:42:44<7:00:48,  1.80s/it]

{'loss': 0.7408, 'grad_norm': 8.166298866271973, 'learning_rate': 3.805905340859749e-05, 'epoch': 0.78}


 26%|██▌       | 4910/18924 [2:43:02<7:07:19,  1.83s/it]

{'loss': 1.1168, 'grad_norm': 6.883129596710205, 'learning_rate': 3.8031914893617024e-05, 'epoch': 0.78}


 26%|██▌       | 4920/18924 [2:43:20<7:12:27,  1.85s/it]

{'loss': 0.7573, 'grad_norm': 7.26639461517334, 'learning_rate': 3.800477637863656e-05, 'epoch': 0.78}


 26%|██▌       | 4930/18924 [2:43:39<7:16:33,  1.87s/it]

{'loss': 0.8525, 'grad_norm': 6.910661697387695, 'learning_rate': 3.79776378636561e-05, 'epoch': 0.78}


 26%|██▌       | 4940/18924 [2:43:57<7:07:50,  1.84s/it]

{'loss': 1.0342, 'grad_norm': 8.613431930541992, 'learning_rate': 3.7950499348675643e-05, 'epoch': 0.78}


 26%|██▌       | 4950/18924 [2:44:16<7:10:33,  1.85s/it]

{'loss': 0.9775, 'grad_norm': 9.199788093566895, 'learning_rate': 3.792336083369518e-05, 'epoch': 0.78}


 26%|██▌       | 4960/18924 [2:44:35<7:16:18,  1.87s/it]

{'loss': 0.7668, 'grad_norm': 7.453257083892822, 'learning_rate': 3.789622231871472e-05, 'epoch': 0.79}


 26%|██▋       | 4970/18924 [2:44:54<7:24:18,  1.91s/it]

{'loss': 0.9224, 'grad_norm': 10.219069480895996, 'learning_rate': 3.786908380373426e-05, 'epoch': 0.79}


 26%|██▋       | 4980/18924 [2:45:13<7:20:50,  1.90s/it]

{'loss': 0.9694, 'grad_norm': 8.953837394714355, 'learning_rate': 3.78419452887538e-05, 'epoch': 0.79}


 26%|██▋       | 4990/18924 [2:45:32<7:19:48,  1.89s/it]

{'loss': 0.8344, 'grad_norm': 9.887001991271973, 'learning_rate': 3.7814806773773344e-05, 'epoch': 0.79}


 26%|██▋       | 5000/18924 [2:45:50<7:17:34,  1.89s/it]

{'loss': 1.0673, 'grad_norm': 14.369807243347168, 'learning_rate': 3.778766825879288e-05, 'epoch': 0.79}


 26%|██▋       | 5010/18924 [2:46:10<7:18:06,  1.89s/it]

{'loss': 1.0378, 'grad_norm': 7.346952438354492, 'learning_rate': 3.7760529743812426e-05, 'epoch': 0.79}


 27%|██▋       | 5020/18924 [2:46:29<7:14:45,  1.88s/it]

{'loss': 1.0699, 'grad_norm': 9.301969528198242, 'learning_rate': 3.773339122883196e-05, 'epoch': 0.8}


 27%|██▋       | 5030/18924 [2:46:48<7:18:54,  1.90s/it]

{'loss': 1.1378, 'grad_norm': 11.459092140197754, 'learning_rate': 3.7706252713851494e-05, 'epoch': 0.8}


 27%|██▋       | 5040/18924 [2:47:07<7:16:54,  1.89s/it]

{'loss': 0.9733, 'grad_norm': 6.4934282302856445, 'learning_rate': 3.767911419887104e-05, 'epoch': 0.8}


 27%|██▋       | 5050/18924 [2:47:25<7:19:26,  1.90s/it]

{'loss': 0.8015, 'grad_norm': 8.14948844909668, 'learning_rate': 3.7651975683890576e-05, 'epoch': 0.8}


 27%|██▋       | 5060/18924 [2:47:45<7:25:21,  1.93s/it]

{'loss': 0.8567, 'grad_norm': 7.227443695068359, 'learning_rate': 3.762483716891012e-05, 'epoch': 0.8}


 27%|██▋       | 5070/18924 [2:48:04<7:18:44,  1.90s/it]

{'loss': 0.969, 'grad_norm': 8.629847526550293, 'learning_rate': 3.759769865392966e-05, 'epoch': 0.8}


 27%|██▋       | 5080/18924 [2:48:23<7:14:10,  1.88s/it]

{'loss': 0.8889, 'grad_norm': 6.857182025909424, 'learning_rate': 3.75705601389492e-05, 'epoch': 0.81}


 27%|██▋       | 5090/18924 [2:48:42<7:20:13,  1.91s/it]

{'loss': 0.7865, 'grad_norm': 14.519767761230469, 'learning_rate': 3.754342162396874e-05, 'epoch': 0.81}


 27%|██▋       | 5100/18924 [2:49:01<7:12:45,  1.88s/it]

{'loss': 0.8919, 'grad_norm': 9.447541236877441, 'learning_rate': 3.751628310898828e-05, 'epoch': 0.81}


 27%|██▋       | 5110/18924 [2:49:19<7:11:48,  1.88s/it]

{'loss': 1.0096, 'grad_norm': 6.213133335113525, 'learning_rate': 3.748914459400782e-05, 'epoch': 0.81}


 27%|██▋       | 5120/18924 [2:49:38<7:17:49,  1.90s/it]

{'loss': 0.923, 'grad_norm': 8.673962593078613, 'learning_rate': 3.746200607902736e-05, 'epoch': 0.81}


 27%|██▋       | 5130/18924 [2:49:58<7:35:29,  1.98s/it]

{'loss': 1.0158, 'grad_norm': 8.931020736694336, 'learning_rate': 3.7434867564046896e-05, 'epoch': 0.81}


 27%|██▋       | 5140/18924 [2:50:19<8:18:27,  2.17s/it]

{'loss': 0.8976, 'grad_norm': 7.014249324798584, 'learning_rate': 3.7407729049066434e-05, 'epoch': 0.81}


 27%|██▋       | 5150/18924 [2:50:40<8:03:05,  2.10s/it]

{'loss': 0.812, 'grad_norm': 11.208953857421875, 'learning_rate': 3.738059053408597e-05, 'epoch': 0.82}


 27%|██▋       | 5160/18924 [2:51:00<7:19:11,  1.91s/it]

{'loss': 0.9383, 'grad_norm': 6.725771427154541, 'learning_rate': 3.7353452019105515e-05, 'epoch': 0.82}


 27%|██▋       | 5170/18924 [2:51:18<7:06:34,  1.86s/it]

{'loss': 1.1513, 'grad_norm': 9.556988716125488, 'learning_rate': 3.732631350412505e-05, 'epoch': 0.82}


 27%|██▋       | 5180/18924 [2:51:37<7:18:29,  1.91s/it]

{'loss': 0.7809, 'grad_norm': 6.159531593322754, 'learning_rate': 3.72991749891446e-05, 'epoch': 0.82}


 27%|██▋       | 5190/18924 [2:51:57<7:31:18,  1.97s/it]

{'loss': 0.9385, 'grad_norm': 7.46619176864624, 'learning_rate': 3.7272036474164135e-05, 'epoch': 0.82}


 27%|██▋       | 5200/18924 [2:52:16<7:17:35,  1.91s/it]

{'loss': 0.9567, 'grad_norm': 6.884192943572998, 'learning_rate': 3.724489795918368e-05, 'epoch': 0.82}


 28%|██▊       | 5210/18924 [2:52:37<7:58:05,  2.09s/it]

{'loss': 0.9389, 'grad_norm': 8.440152168273926, 'learning_rate': 3.7217759444203216e-05, 'epoch': 0.83}


 28%|██▊       | 5220/18924 [2:52:58<8:13:32,  2.16s/it]

{'loss': 0.8832, 'grad_norm': 14.435734748840332, 'learning_rate': 3.7190620929222754e-05, 'epoch': 0.83}


 28%|██▊       | 5230/18924 [2:53:19<7:40:52,  2.02s/it]

{'loss': 0.7689, 'grad_norm': 5.623401165008545, 'learning_rate': 3.71634824142423e-05, 'epoch': 0.83}


 28%|██▊       | 5240/18924 [2:53:38<7:05:58,  1.87s/it]

{'loss': 0.8733, 'grad_norm': 5.6723222732543945, 'learning_rate': 3.7136343899261835e-05, 'epoch': 0.83}


 28%|██▊       | 5250/18924 [2:53:58<7:53:24,  2.08s/it]

{'loss': 0.7318, 'grad_norm': 7.734636306762695, 'learning_rate': 3.710920538428137e-05, 'epoch': 0.83}


 28%|██▊       | 5260/18924 [2:54:19<8:04:16,  2.13s/it]

{'loss': 0.7423, 'grad_norm': 8.7444486618042, 'learning_rate': 3.708206686930091e-05, 'epoch': 0.83}


 28%|██▊       | 5270/18924 [2:54:40<7:24:59,  1.96s/it]

{'loss': 0.9395, 'grad_norm': 9.006160736083984, 'learning_rate': 3.705492835432045e-05, 'epoch': 0.84}


 28%|██▊       | 5280/18924 [2:54:58<6:55:55,  1.83s/it]

{'loss': 0.853, 'grad_norm': 8.548075675964355, 'learning_rate': 3.702778983933999e-05, 'epoch': 0.84}


 28%|██▊       | 5290/18924 [2:55:16<6:57:08,  1.84s/it]

{'loss': 0.9614, 'grad_norm': 6.2196364402771, 'learning_rate': 3.700065132435953e-05, 'epoch': 0.84}


 28%|██▊       | 5300/18924 [2:55:35<7:03:48,  1.87s/it]

{'loss': 0.9726, 'grad_norm': 8.70605754852295, 'learning_rate': 3.6973512809379074e-05, 'epoch': 0.84}


 28%|██▊       | 5310/18924 [2:55:54<7:11:09,  1.90s/it]

{'loss': 0.9156, 'grad_norm': 9.762274742126465, 'learning_rate': 3.694637429439861e-05, 'epoch': 0.84}


 28%|██▊       | 5320/18924 [2:56:13<7:15:54,  1.92s/it]

{'loss': 0.8637, 'grad_norm': 10.222128868103027, 'learning_rate': 3.6919235779418156e-05, 'epoch': 0.84}


 28%|██▊       | 5330/18924 [2:56:32<7:14:24,  1.92s/it]

{'loss': 1.0073, 'grad_norm': 7.6187872886657715, 'learning_rate': 3.689209726443769e-05, 'epoch': 0.84}


 28%|██▊       | 5340/18924 [2:56:51<7:20:52,  1.95s/it]

{'loss': 0.7819, 'grad_norm': 5.809870719909668, 'learning_rate': 3.686495874945723e-05, 'epoch': 0.85}


 28%|██▊       | 5350/18924 [2:57:10<7:07:13,  1.89s/it]

{'loss': 0.8543, 'grad_norm': 8.80474853515625, 'learning_rate': 3.6837820234476775e-05, 'epoch': 0.85}


 28%|██▊       | 5360/18924 [2:57:29<7:08:19,  1.89s/it]

{'loss': 0.8468, 'grad_norm': 6.859419345855713, 'learning_rate': 3.681068171949631e-05, 'epoch': 0.85}


 28%|██▊       | 5370/18924 [2:57:49<7:13:40,  1.92s/it]

{'loss': 0.8247, 'grad_norm': 12.162439346313477, 'learning_rate': 3.678354320451585e-05, 'epoch': 0.85}


 28%|██▊       | 5380/18924 [2:58:08<7:36:44,  2.02s/it]

{'loss': 0.9775, 'grad_norm': 6.122043609619141, 'learning_rate': 3.675640468953539e-05, 'epoch': 0.85}


 28%|██▊       | 5390/18924 [2:58:29<7:40:18,  2.04s/it]

{'loss': 0.826, 'grad_norm': 7.639630317687988, 'learning_rate': 3.672926617455493e-05, 'epoch': 0.85}


 29%|██▊       | 5400/18924 [2:58:49<7:23:44,  1.97s/it]

{'loss': 0.9179, 'grad_norm': 7.944596290588379, 'learning_rate': 3.670212765957447e-05, 'epoch': 0.86}


 29%|██▊       | 5410/18924 [2:59:08<7:09:57,  1.91s/it]

{'loss': 0.9185, 'grad_norm': 7.300431728363037, 'learning_rate': 3.6674989144594006e-05, 'epoch': 0.86}


 29%|██▊       | 5420/18924 [2:59:27<7:22:01,  1.96s/it]

{'loss': 1.0122, 'grad_norm': 7.688991069793701, 'learning_rate': 3.664785062961355e-05, 'epoch': 0.86}


 29%|██▊       | 5430/18924 [2:59:47<7:21:03,  1.96s/it]

{'loss': 0.7212, 'grad_norm': 7.724431991577148, 'learning_rate': 3.662071211463309e-05, 'epoch': 0.86}


 29%|██▊       | 5440/18924 [3:00:06<7:03:41,  1.89s/it]

{'loss': 0.8317, 'grad_norm': 9.844017028808594, 'learning_rate': 3.659357359965263e-05, 'epoch': 0.86}


 29%|██▉       | 5450/18924 [3:00:25<7:01:51,  1.88s/it]

{'loss': 0.9738, 'grad_norm': 10.316407203674316, 'learning_rate': 3.656643508467217e-05, 'epoch': 0.86}


 29%|██▉       | 5460/18924 [3:00:44<7:08:36,  1.91s/it]

{'loss': 1.2121, 'grad_norm': 10.086338996887207, 'learning_rate': 3.653929656969171e-05, 'epoch': 0.87}


 29%|██▉       | 5470/18924 [3:01:03<7:14:41,  1.94s/it]

{'loss': 1.0364, 'grad_norm': 7.7852783203125, 'learning_rate': 3.651215805471125e-05, 'epoch': 0.87}


 29%|██▉       | 5480/18924 [3:01:23<7:14:00,  1.94s/it]

{'loss': 0.871, 'grad_norm': 6.787080764770508, 'learning_rate': 3.648501953973079e-05, 'epoch': 0.87}


 29%|██▉       | 5490/18924 [3:01:42<7:03:12,  1.89s/it]

{'loss': 0.9632, 'grad_norm': 11.608553886413574, 'learning_rate': 3.6457881024750326e-05, 'epoch': 0.87}


 29%|██▉       | 5500/18924 [3:02:01<7:10:23,  1.92s/it]

{'loss': 0.7982, 'grad_norm': 9.278486251831055, 'learning_rate': 3.6430742509769864e-05, 'epoch': 0.87}


 29%|██▉       | 5510/18924 [3:02:22<7:23:29,  1.98s/it]

{'loss': 0.9167, 'grad_norm': 11.293156623840332, 'learning_rate': 3.640360399478941e-05, 'epoch': 0.87}


 29%|██▉       | 5520/18924 [3:02:41<7:05:50,  1.91s/it]

{'loss': 0.7651, 'grad_norm': 5.8748297691345215, 'learning_rate': 3.6376465479808946e-05, 'epoch': 0.88}


 29%|██▉       | 5530/18924 [3:03:00<6:57:39,  1.87s/it]

{'loss': 1.0082, 'grad_norm': 9.782904624938965, 'learning_rate': 3.634932696482848e-05, 'epoch': 0.88}


 29%|██▉       | 5540/18924 [3:03:18<7:02:32,  1.89s/it]

{'loss': 0.8391, 'grad_norm': 9.179909706115723, 'learning_rate': 3.632218844984803e-05, 'epoch': 0.88}


 29%|██▉       | 5550/18924 [3:03:38<7:07:41,  1.92s/it]

{'loss': 0.9489, 'grad_norm': 11.061832427978516, 'learning_rate': 3.6295049934867565e-05, 'epoch': 0.88}


 29%|██▉       | 5560/18924 [3:03:57<7:02:09,  1.90s/it]

{'loss': 0.9473, 'grad_norm': 6.291375637054443, 'learning_rate': 3.626791141988711e-05, 'epoch': 0.88}


 29%|██▉       | 5570/18924 [3:04:16<7:05:32,  1.91s/it]

{'loss': 0.9699, 'grad_norm': 8.482937812805176, 'learning_rate': 3.624077290490665e-05, 'epoch': 0.88}


 29%|██▉       | 5580/18924 [3:04:35<7:07:14,  1.92s/it]

{'loss': 0.7663, 'grad_norm': 7.563244342803955, 'learning_rate': 3.621363438992619e-05, 'epoch': 0.88}


 30%|██▉       | 5590/18924 [3:04:54<7:02:58,  1.90s/it]

{'loss': 0.8596, 'grad_norm': 11.392573356628418, 'learning_rate': 3.618649587494573e-05, 'epoch': 0.89}


 30%|██▉       | 5600/18924 [3:05:13<7:08:57,  1.93s/it]

{'loss': 0.9348, 'grad_norm': 8.464075088500977, 'learning_rate': 3.615935735996526e-05, 'epoch': 0.89}


 30%|██▉       | 5610/18924 [3:05:32<6:57:05,  1.88s/it]

{'loss': 0.9164, 'grad_norm': 7.688480377197266, 'learning_rate': 3.61322188449848e-05, 'epoch': 0.89}


 30%|██▉       | 5620/18924 [3:05:51<6:53:25,  1.86s/it]

{'loss': 0.9974, 'grad_norm': 6.431030750274658, 'learning_rate': 3.610508033000434e-05, 'epoch': 0.89}


 30%|██▉       | 5630/18924 [3:06:10<7:00:33,  1.90s/it]

{'loss': 0.9599, 'grad_norm': 7.456891059875488, 'learning_rate': 3.6077941815023885e-05, 'epoch': 0.89}


 30%|██▉       | 5640/18924 [3:06:29<7:08:58,  1.94s/it]

{'loss': 0.713, 'grad_norm': 10.22070026397705, 'learning_rate': 3.605080330004342e-05, 'epoch': 0.89}


 30%|██▉       | 5650/18924 [3:06:48<7:06:30,  1.93s/it]

{'loss': 0.9327, 'grad_norm': 8.88148307800293, 'learning_rate': 3.602366478506296e-05, 'epoch': 0.9}


 30%|██▉       | 5660/18924 [3:07:09<7:43:16,  2.10s/it]

{'loss': 0.6484, 'grad_norm': 8.657793045043945, 'learning_rate': 3.5996526270082504e-05, 'epoch': 0.9}


 30%|██▉       | 5670/18924 [3:07:29<7:10:34,  1.95s/it]

{'loss': 0.931, 'grad_norm': 5.176065921783447, 'learning_rate': 3.596938775510204e-05, 'epoch': 0.9}


 30%|███       | 5680/18924 [3:07:49<7:10:42,  1.95s/it]

{'loss': 0.9462, 'grad_norm': 10.656926155090332, 'learning_rate': 3.5942249240121586e-05, 'epoch': 0.9}


 30%|███       | 5690/18924 [3:08:08<7:03:28,  1.92s/it]

{'loss': 0.9931, 'grad_norm': 6.323999404907227, 'learning_rate': 3.5915110725141123e-05, 'epoch': 0.9}


 30%|███       | 5700/18924 [3:08:27<7:09:11,  1.95s/it]

{'loss': 1.0356, 'grad_norm': 10.032366752624512, 'learning_rate': 3.588797221016066e-05, 'epoch': 0.9}


 30%|███       | 5710/18924 [3:08:47<7:14:38,  1.97s/it]

{'loss': 0.7803, 'grad_norm': 5.336211204528809, 'learning_rate': 3.58608336951802e-05, 'epoch': 0.91}


 30%|███       | 5720/18924 [3:09:07<7:17:33,  1.99s/it]

{'loss': 0.748, 'grad_norm': 7.235138893127441, 'learning_rate': 3.5833695180199736e-05, 'epoch': 0.91}


 30%|███       | 5730/18924 [3:09:27<7:17:12,  1.99s/it]

{'loss': 1.0619, 'grad_norm': 9.881617546081543, 'learning_rate': 3.580655666521928e-05, 'epoch': 0.91}


 30%|███       | 5740/18924 [3:09:46<7:12:40,  1.97s/it]

{'loss': 0.8225, 'grad_norm': 8.311243057250977, 'learning_rate': 3.577941815023882e-05, 'epoch': 0.91}


 30%|███       | 5750/18924 [3:10:06<7:19:20,  2.00s/it]

{'loss': 1.0443, 'grad_norm': 7.269958019256592, 'learning_rate': 3.575227963525836e-05, 'epoch': 0.91}


 30%|███       | 5760/18924 [3:10:26<7:19:05,  2.00s/it]

{'loss': 1.0976, 'grad_norm': 9.476371765136719, 'learning_rate': 3.57251411202779e-05, 'epoch': 0.91}


 30%|███       | 5770/18924 [3:10:46<7:11:35,  1.97s/it]

{'loss': 0.9131, 'grad_norm': 7.791684150695801, 'learning_rate': 3.5698002605297444e-05, 'epoch': 0.91}


 31%|███       | 5780/18924 [3:11:06<7:09:42,  1.96s/it]

{'loss': 0.9248, 'grad_norm': 8.619566917419434, 'learning_rate': 3.567086409031698e-05, 'epoch': 0.92}


 31%|███       | 5790/18924 [3:11:25<7:11:21,  1.97s/it]

{'loss': 1.0185, 'grad_norm': 6.755127906799316, 'learning_rate': 3.564372557533652e-05, 'epoch': 0.92}


 31%|███       | 5800/18924 [3:11:45<7:09:56,  1.97s/it]

{'loss': 0.9076, 'grad_norm': 7.305097579956055, 'learning_rate': 3.561658706035606e-05, 'epoch': 0.92}


 31%|███       | 5810/18924 [3:12:04<7:05:17,  1.95s/it]

{'loss': 0.8852, 'grad_norm': 4.753597259521484, 'learning_rate': 3.55894485453756e-05, 'epoch': 0.92}


 31%|███       | 5820/18924 [3:12:24<7:05:22,  1.95s/it]

{'loss': 1.0834, 'grad_norm': 6.938762187957764, 'learning_rate': 3.556231003039514e-05, 'epoch': 0.92}


 31%|███       | 5830/18924 [3:12:43<7:08:17,  1.96s/it]

{'loss': 0.8179, 'grad_norm': 7.327616214752197, 'learning_rate': 3.5535171515414675e-05, 'epoch': 0.92}


 31%|███       | 5840/18924 [3:13:03<7:10:04,  1.97s/it]

{'loss': 0.998, 'grad_norm': 10.420466423034668, 'learning_rate': 3.550803300043421e-05, 'epoch': 0.93}


 31%|███       | 5850/18924 [3:13:23<7:15:09,  2.00s/it]

{'loss': 0.9157, 'grad_norm': 9.501550674438477, 'learning_rate': 3.548089448545376e-05, 'epoch': 0.93}


 31%|███       | 5860/18924 [3:13:43<7:10:58,  1.98s/it]

{'loss': 0.9058, 'grad_norm': 6.497661113739014, 'learning_rate': 3.5453755970473294e-05, 'epoch': 0.93}


 31%|███       | 5870/18924 [3:14:03<7:23:41,  2.04s/it]

{'loss': 1.0197, 'grad_norm': 10.949589729309082, 'learning_rate': 3.542661745549284e-05, 'epoch': 0.93}


 31%|███       | 5880/18924 [3:14:23<7:09:19,  1.97s/it]

{'loss': 0.7921, 'grad_norm': 8.337409019470215, 'learning_rate': 3.5399478940512376e-05, 'epoch': 0.93}


 31%|███       | 5890/18924 [3:14:43<7:01:47,  1.94s/it]

{'loss': 0.9351, 'grad_norm': 8.80463981628418, 'learning_rate': 3.537234042553192e-05, 'epoch': 0.93}


 31%|███       | 5900/18924 [3:15:02<6:59:46,  1.93s/it]

{'loss': 0.9338, 'grad_norm': 5.110937118530273, 'learning_rate': 3.534520191055146e-05, 'epoch': 0.94}


 31%|███       | 5910/18924 [3:15:22<7:09:00,  1.98s/it]

{'loss': 0.8889, 'grad_norm': 6.945006370544434, 'learning_rate': 3.5318063395570995e-05, 'epoch': 0.94}


 31%|███▏      | 5920/18924 [3:15:42<7:15:29,  2.01s/it]

{'loss': 0.9091, 'grad_norm': 9.928524017333984, 'learning_rate': 3.529092488059054e-05, 'epoch': 0.94}


 31%|███▏      | 5930/18924 [3:16:02<7:14:37,  2.01s/it]

{'loss': 0.9785, 'grad_norm': 12.529580116271973, 'learning_rate': 3.526378636561008e-05, 'epoch': 0.94}


 31%|███▏      | 5940/18924 [3:16:22<7:11:22,  1.99s/it]

{'loss': 1.1678, 'grad_norm': 12.214184761047363, 'learning_rate': 3.5236647850629614e-05, 'epoch': 0.94}


 31%|███▏      | 5950/18924 [3:16:42<7:12:50,  2.00s/it]

{'loss': 0.7616, 'grad_norm': 7.109343528747559, 'learning_rate': 3.520950933564915e-05, 'epoch': 0.94}


 31%|███▏      | 5960/18924 [3:17:02<7:14:23,  2.01s/it]

{'loss': 0.7686, 'grad_norm': 7.3361077308654785, 'learning_rate': 3.518237082066869e-05, 'epoch': 0.94}


 32%|███▏      | 5970/18924 [3:17:22<7:06:26,  1.98s/it]

{'loss': 0.8192, 'grad_norm': 8.554980278015137, 'learning_rate': 3.5155232305688234e-05, 'epoch': 0.95}


 32%|███▏      | 5980/18924 [3:17:41<7:07:39,  1.98s/it]

{'loss': 0.8253, 'grad_norm': 8.861886978149414, 'learning_rate': 3.512809379070777e-05, 'epoch': 0.95}


 32%|███▏      | 5990/18924 [3:18:01<7:04:02,  1.97s/it]

{'loss': 0.8712, 'grad_norm': 6.249860763549805, 'learning_rate': 3.5100955275727315e-05, 'epoch': 0.95}


 32%|███▏      | 6000/18924 [3:18:20<6:56:30,  1.93s/it]

{'loss': 0.889, 'grad_norm': 6.014321804046631, 'learning_rate': 3.507381676074685e-05, 'epoch': 0.95}


 32%|███▏      | 6010/18924 [3:18:41<7:14:26,  2.02s/it]

{'loss': 0.8013, 'grad_norm': 7.264403343200684, 'learning_rate': 3.50466782457664e-05, 'epoch': 0.95}


 32%|███▏      | 6020/18924 [3:19:01<7:07:05,  1.99s/it]

{'loss': 0.8629, 'grad_norm': 9.44598388671875, 'learning_rate': 3.5019539730785935e-05, 'epoch': 0.95}


 32%|███▏      | 6030/18924 [3:19:20<6:43:37,  1.88s/it]

{'loss': 0.8475, 'grad_norm': 8.223814010620117, 'learning_rate': 3.499240121580547e-05, 'epoch': 0.96}


 32%|███▏      | 6040/18924 [3:19:39<6:42:01,  1.87s/it]

{'loss': 0.9259, 'grad_norm': 7.645233154296875, 'learning_rate': 3.4965262700825016e-05, 'epoch': 0.96}


 32%|███▏      | 6050/18924 [3:19:58<6:41:44,  1.87s/it]

{'loss': 0.9359, 'grad_norm': 11.042329788208008, 'learning_rate': 3.4938124185844554e-05, 'epoch': 0.96}


 32%|███▏      | 6060/18924 [3:20:17<7:02:23,  1.97s/it]

{'loss': 0.9107, 'grad_norm': 7.763554573059082, 'learning_rate': 3.491098567086409e-05, 'epoch': 0.96}


 32%|███▏      | 6070/18924 [3:20:37<6:50:08,  1.91s/it]

{'loss': 0.7488, 'grad_norm': 6.543914318084717, 'learning_rate': 3.488384715588363e-05, 'epoch': 0.96}


 32%|███▏      | 6080/18924 [3:20:55<6:43:09,  1.88s/it]

{'loss': 0.7052, 'grad_norm': 9.511557579040527, 'learning_rate': 3.485670864090317e-05, 'epoch': 0.96}


 32%|███▏      | 6090/18924 [3:21:14<6:44:56,  1.89s/it]

{'loss': 0.9457, 'grad_norm': 12.796587944030762, 'learning_rate': 3.482957012592271e-05, 'epoch': 0.97}


 32%|███▏      | 6100/18924 [3:21:33<6:48:13,  1.91s/it]

{'loss': 0.9212, 'grad_norm': 11.238905906677246, 'learning_rate': 3.480243161094225e-05, 'epoch': 0.97}


 32%|███▏      | 6110/18924 [3:21:53<6:51:08,  1.93s/it]

{'loss': 0.9292, 'grad_norm': 9.9629545211792, 'learning_rate': 3.477529309596179e-05, 'epoch': 0.97}


 32%|███▏      | 6120/18924 [3:22:12<6:47:19,  1.91s/it]

{'loss': 0.7912, 'grad_norm': 6.9922637939453125, 'learning_rate': 3.474815458098133e-05, 'epoch': 0.97}


 32%|███▏      | 6130/18924 [3:22:31<6:48:59,  1.92s/it]

{'loss': 0.8215, 'grad_norm': 10.113288879394531, 'learning_rate': 3.4721016066000874e-05, 'epoch': 0.97}


 32%|███▏      | 6140/18924 [3:22:50<6:50:09,  1.92s/it]

{'loss': 0.8867, 'grad_norm': 4.248483657836914, 'learning_rate': 3.469387755102041e-05, 'epoch': 0.97}


 32%|███▏      | 6150/18924 [3:23:10<6:56:26,  1.96s/it]

{'loss': 0.8366, 'grad_norm': 9.594996452331543, 'learning_rate': 3.466673903603995e-05, 'epoch': 0.97}


 33%|███▎      | 6160/18924 [3:23:29<6:55:14,  1.95s/it]

{'loss': 0.8565, 'grad_norm': 12.648801803588867, 'learning_rate': 3.463960052105949e-05, 'epoch': 0.98}


 33%|███▎      | 6170/18924 [3:23:49<6:48:10,  1.92s/it]

{'loss': 0.9474, 'grad_norm': 8.353160858154297, 'learning_rate': 3.461246200607903e-05, 'epoch': 0.98}


 33%|███▎      | 6180/18924 [3:24:08<6:44:20,  1.90s/it]

{'loss': 1.1058, 'grad_norm': 9.317673683166504, 'learning_rate': 3.458532349109857e-05, 'epoch': 0.98}


 33%|███▎      | 6190/18924 [3:24:27<6:53:31,  1.95s/it]

{'loss': 0.847, 'grad_norm': 10.772659301757812, 'learning_rate': 3.4558184976118105e-05, 'epoch': 0.98}


 33%|███▎      | 6200/18924 [3:24:47<6:57:06,  1.97s/it]

{'loss': 0.8851, 'grad_norm': 8.323199272155762, 'learning_rate': 3.453104646113765e-05, 'epoch': 0.98}


 33%|███▎      | 6210/18924 [3:25:07<6:57:36,  1.97s/it]

{'loss': 0.8977, 'grad_norm': 12.102048873901367, 'learning_rate': 3.450390794615719e-05, 'epoch': 0.98}


 33%|███▎      | 6220/18924 [3:25:26<6:53:36,  1.95s/it]

{'loss': 0.891, 'grad_norm': 9.81935977935791, 'learning_rate': 3.4476769431176725e-05, 'epoch': 0.99}


 33%|███▎      | 6230/18924 [3:25:46<6:54:50,  1.96s/it]

{'loss': 0.8582, 'grad_norm': 13.630965232849121, 'learning_rate': 3.444963091619627e-05, 'epoch': 0.99}


 33%|███▎      | 6240/18924 [3:26:07<7:25:52,  2.11s/it]

{'loss': 1.0139, 'grad_norm': 10.293807029724121, 'learning_rate': 3.4422492401215806e-05, 'epoch': 0.99}


 33%|███▎      | 6250/18924 [3:26:28<7:09:43,  2.03s/it]

{'loss': 0.8028, 'grad_norm': 8.406526565551758, 'learning_rate': 3.439535388623535e-05, 'epoch': 0.99}


 33%|███▎      | 6260/18924 [3:26:47<6:45:08,  1.92s/it]

{'loss': 0.9615, 'grad_norm': 8.00184440612793, 'learning_rate': 3.436821537125489e-05, 'epoch': 0.99}


 33%|███▎      | 6270/18924 [3:27:06<6:51:28,  1.95s/it]

{'loss': 0.977, 'grad_norm': 10.335908889770508, 'learning_rate': 3.4341076856274426e-05, 'epoch': 0.99}


 33%|███▎      | 6280/18924 [3:27:25<6:45:29,  1.92s/it]

{'loss': 0.7681, 'grad_norm': 5.274760723114014, 'learning_rate': 3.431393834129396e-05, 'epoch': 1.0}


 33%|███▎      | 6290/18924 [3:27:47<7:39:54,  2.18s/it]

{'loss': 0.9855, 'grad_norm': 6.628208637237549, 'learning_rate': 3.42867998263135e-05, 'epoch': 1.0}


 33%|███▎      | 6300/18924 [3:28:08<7:25:14,  2.12s/it]

{'loss': 1.0518, 'grad_norm': 5.938792705535889, 'learning_rate': 3.4259661311333045e-05, 'epoch': 1.0}


 33%|███▎      | 6310/18924 [3:28:29<7:10:57,  2.05s/it]

{'loss': 0.7137, 'grad_norm': 5.67280912399292, 'learning_rate': 3.423252279635258e-05, 'epoch': 1.0}


 33%|███▎      | 6320/18924 [3:28:50<7:10:37,  2.05s/it]

{'loss': 0.6218, 'grad_norm': 4.5414581298828125, 'learning_rate': 3.4205384281372127e-05, 'epoch': 1.0}


 33%|███▎      | 6330/18924 [3:29:10<7:01:39,  2.01s/it]

{'loss': 0.6906, 'grad_norm': 6.147617340087891, 'learning_rate': 3.4178245766391664e-05, 'epoch': 1.0}


 34%|███▎      | 6340/18924 [3:29:30<7:13:44,  2.07s/it]

{'loss': 0.5722, 'grad_norm': 7.279849529266357, 'learning_rate': 3.41511072514112e-05, 'epoch': 1.01}


 34%|███▎      | 6350/18924 [3:29:51<7:21:04,  2.10s/it]

{'loss': 0.7382, 'grad_norm': 8.598492622375488, 'learning_rate': 3.4123968736430746e-05, 'epoch': 1.01}


 34%|███▎      | 6360/18924 [3:30:13<7:22:34,  2.11s/it]

{'loss': 0.6333, 'grad_norm': 10.439894676208496, 'learning_rate': 3.409683022145028e-05, 'epoch': 1.01}


 34%|███▎      | 6370/18924 [3:30:33<7:04:05,  2.03s/it]

{'loss': 0.6092, 'grad_norm': 5.158868312835693, 'learning_rate': 3.406969170646983e-05, 'epoch': 1.01}


 34%|███▎      | 6380/18924 [3:30:54<7:13:42,  2.07s/it]

{'loss': 0.7745, 'grad_norm': 8.65957260131836, 'learning_rate': 3.4042553191489365e-05, 'epoch': 1.01}


 34%|███▍      | 6390/18924 [3:31:15<7:36:20,  2.18s/it]

{'loss': 0.6145, 'grad_norm': 6.914078712463379, 'learning_rate': 3.40154146765089e-05, 'epoch': 1.01}


 34%|███▍      | 6400/18924 [3:31:38<8:07:20,  2.33s/it]

{'loss': 0.653, 'grad_norm': 7.445847511291504, 'learning_rate': 3.398827616152844e-05, 'epoch': 1.01}


 34%|███▍      | 6410/18924 [3:32:03<8:17:28,  2.39s/it]

{'loss': 0.7492, 'grad_norm': 6.279403209686279, 'learning_rate': 3.396113764654798e-05, 'epoch': 1.02}


 34%|███▍      | 6420/18924 [3:32:24<7:05:30,  2.04s/it]

{'loss': 0.7152, 'grad_norm': 8.61915397644043, 'learning_rate': 3.393399913156752e-05, 'epoch': 1.02}


 34%|███▍      | 6430/18924 [3:32:44<7:03:50,  2.04s/it]

{'loss': 0.7485, 'grad_norm': 17.053316116333008, 'learning_rate': 3.390686061658706e-05, 'epoch': 1.02}


 34%|███▍      | 6440/18924 [3:33:05<7:16:15,  2.10s/it]

{'loss': 0.7313, 'grad_norm': 8.725330352783203, 'learning_rate': 3.38797221016066e-05, 'epoch': 1.02}


 34%|███▍      | 6450/18924 [3:33:26<7:15:50,  2.10s/it]

{'loss': 0.7699, 'grad_norm': 7.772342681884766, 'learning_rate': 3.385258358662614e-05, 'epoch': 1.02}


 34%|███▍      | 6460/18924 [3:33:47<7:15:43,  2.10s/it]

{'loss': 0.5706, 'grad_norm': 3.195610523223877, 'learning_rate': 3.3825445071645685e-05, 'epoch': 1.02}


 34%|███▍      | 6470/18924 [3:34:08<7:04:58,  2.05s/it]

{'loss': 0.5298, 'grad_norm': 10.751646995544434, 'learning_rate': 3.379830655666522e-05, 'epoch': 1.03}


 34%|███▍      | 6480/18924 [3:34:28<6:59:01,  2.02s/it]

{'loss': 0.6916, 'grad_norm': 12.26448917388916, 'learning_rate': 3.377116804168476e-05, 'epoch': 1.03}


 34%|███▍      | 6490/18924 [3:34:48<7:02:19,  2.04s/it]

{'loss': 0.7889, 'grad_norm': 9.027652740478516, 'learning_rate': 3.3744029526704304e-05, 'epoch': 1.03}


 34%|███▍      | 6500/18924 [3:35:09<7:01:41,  2.04s/it]

{'loss': 0.5303, 'grad_norm': 4.865777015686035, 'learning_rate': 3.371689101172384e-05, 'epoch': 1.03}


 34%|███▍      | 6510/18924 [3:35:30<7:01:24,  2.04s/it]

{'loss': 0.701, 'grad_norm': 7.678099155426025, 'learning_rate': 3.368975249674338e-05, 'epoch': 1.03}


 34%|███▍      | 6520/18924 [3:35:50<7:00:05,  2.03s/it]

{'loss': 0.5683, 'grad_norm': 12.084874153137207, 'learning_rate': 3.366261398176292e-05, 'epoch': 1.03}


 35%|███▍      | 6530/18924 [3:36:13<7:46:51,  2.26s/it]

{'loss': 0.5928, 'grad_norm': 6.561376094818115, 'learning_rate': 3.3635475466782454e-05, 'epoch': 1.04}


 35%|███▍      | 6540/18924 [3:36:35<7:37:01,  2.21s/it]

{'loss': 0.6425, 'grad_norm': 15.145020484924316, 'learning_rate': 3.3608336951802e-05, 'epoch': 1.04}


 35%|███▍      | 6550/18924 [3:36:55<6:59:23,  2.03s/it]

{'loss': 0.7125, 'grad_norm': 7.057868480682373, 'learning_rate': 3.3581198436821536e-05, 'epoch': 1.04}


 35%|███▍      | 6560/18924 [3:37:16<6:54:35,  2.01s/it]

{'loss': 0.492, 'grad_norm': 9.292278289794922, 'learning_rate': 3.355405992184108e-05, 'epoch': 1.04}


 35%|███▍      | 6570/18924 [3:37:36<6:53:19,  2.01s/it]

{'loss': 0.6094, 'grad_norm': 5.038539409637451, 'learning_rate': 3.352692140686062e-05, 'epoch': 1.04}


 35%|███▍      | 6580/18924 [3:37:56<6:57:55,  2.03s/it]

{'loss': 0.5984, 'grad_norm': 11.435736656188965, 'learning_rate': 3.349978289188016e-05, 'epoch': 1.04}


 35%|███▍      | 6590/18924 [3:38:17<7:04:14,  2.06s/it]

{'loss': 0.5638, 'grad_norm': 6.542542457580566, 'learning_rate': 3.34726443768997e-05, 'epoch': 1.04}


 35%|███▍      | 6600/18924 [3:38:38<7:11:17,  2.10s/it]

{'loss': 0.6518, 'grad_norm': 15.174113273620605, 'learning_rate': 3.344550586191924e-05, 'epoch': 1.05}


 35%|███▍      | 6610/18924 [3:38:57<6:40:50,  1.95s/it]

{'loss': 0.6937, 'grad_norm': 9.561620712280273, 'learning_rate': 3.341836734693878e-05, 'epoch': 1.05}


 35%|███▍      | 6620/18924 [3:39:17<6:28:46,  1.90s/it]

{'loss': 0.5934, 'grad_norm': 6.237335681915283, 'learning_rate': 3.339122883195832e-05, 'epoch': 1.05}


 35%|███▌      | 6630/18924 [3:39:36<6:36:40,  1.94s/it]

{'loss': 0.6641, 'grad_norm': 7.290168285369873, 'learning_rate': 3.3364090316977856e-05, 'epoch': 1.05}


 35%|███▌      | 6640/18924 [3:39:55<6:45:46,  1.98s/it]

{'loss': 0.7644, 'grad_norm': 14.921024322509766, 'learning_rate': 3.3336951801997393e-05, 'epoch': 1.05}


 35%|███▌      | 6650/18924 [3:40:15<6:29:55,  1.91s/it]

{'loss': 0.82, 'grad_norm': 12.212903022766113, 'learning_rate': 3.330981328701693e-05, 'epoch': 1.05}


 35%|███▌      | 6660/18924 [3:40:34<6:31:07,  1.91s/it]

{'loss': 0.5459, 'grad_norm': 5.725934982299805, 'learning_rate': 3.3282674772036475e-05, 'epoch': 1.06}


 35%|███▌      | 6670/18924 [3:40:53<6:37:41,  1.95s/it]

{'loss': 0.8118, 'grad_norm': 7.6379876136779785, 'learning_rate': 3.325553625705601e-05, 'epoch': 1.06}


 35%|███▌      | 6680/18924 [3:41:13<6:37:43,  1.95s/it]

{'loss': 0.6079, 'grad_norm': 4.4014081954956055, 'learning_rate': 3.322839774207556e-05, 'epoch': 1.06}


 35%|███▌      | 6690/18924 [3:41:32<6:27:42,  1.90s/it]

{'loss': 0.7134, 'grad_norm': 5.462494850158691, 'learning_rate': 3.3201259227095094e-05, 'epoch': 1.06}


 35%|███▌      | 6700/18924 [3:41:51<6:24:49,  1.89s/it]

{'loss': 0.5954, 'grad_norm': 11.105106353759766, 'learning_rate': 3.317412071211464e-05, 'epoch': 1.06}


 35%|███▌      | 6710/18924 [3:42:10<6:33:11,  1.93s/it]

{'loss': 0.638, 'grad_norm': 8.740187644958496, 'learning_rate': 3.3146982197134176e-05, 'epoch': 1.06}


 36%|███▌      | 6720/18924 [3:42:29<6:36:29,  1.95s/it]

{'loss': 0.7282, 'grad_norm': 8.877031326293945, 'learning_rate': 3.3119843682153714e-05, 'epoch': 1.07}


 36%|███▌      | 6730/18924 [3:42:49<6:33:04,  1.93s/it]

{'loss': 0.6448, 'grad_norm': 7.946844577789307, 'learning_rate': 3.309270516717326e-05, 'epoch': 1.07}


 36%|███▌      | 6740/18924 [3:43:08<6:26:44,  1.90s/it]

{'loss': 0.6689, 'grad_norm': 10.614609718322754, 'learning_rate': 3.3065566652192795e-05, 'epoch': 1.07}


 36%|███▌      | 6750/18924 [3:43:27<6:26:54,  1.91s/it]

{'loss': 0.6232, 'grad_norm': 7.0004754066467285, 'learning_rate': 3.303842813721233e-05, 'epoch': 1.07}


 36%|███▌      | 6760/18924 [3:43:46<6:33:26,  1.94s/it]

{'loss': 0.4615, 'grad_norm': 9.406377792358398, 'learning_rate': 3.301128962223187e-05, 'epoch': 1.07}


 36%|███▌      | 6770/18924 [3:44:06<6:31:28,  1.93s/it]

{'loss': 0.6339, 'grad_norm': 12.011012077331543, 'learning_rate': 3.2984151107251414e-05, 'epoch': 1.07}


 36%|███▌      | 6780/18924 [3:44:25<6:29:30,  1.92s/it]

{'loss': 0.5954, 'grad_norm': 7.178305149078369, 'learning_rate': 3.295701259227095e-05, 'epoch': 1.07}


 36%|███▌      | 6790/18924 [3:44:44<6:29:42,  1.93s/it]

{'loss': 0.6242, 'grad_norm': 7.032754421234131, 'learning_rate': 3.292987407729049e-05, 'epoch': 1.08}


 36%|███▌      | 6800/18924 [3:45:03<6:30:46,  1.93s/it]

{'loss': 0.8086, 'grad_norm': 9.768270492553711, 'learning_rate': 3.2902735562310034e-05, 'epoch': 1.08}


 36%|███▌      | 6810/18924 [3:45:22<6:19:43,  1.88s/it]

{'loss': 0.5949, 'grad_norm': 9.216314315795898, 'learning_rate': 3.287559704732957e-05, 'epoch': 1.08}


 36%|███▌      | 6820/18924 [3:45:41<6:16:15,  1.87s/it]

{'loss': 0.7439, 'grad_norm': 10.25090503692627, 'learning_rate': 3.2848458532349115e-05, 'epoch': 1.08}


 36%|███▌      | 6830/18924 [3:46:00<6:25:51,  1.91s/it]

{'loss': 0.509, 'grad_norm': 6.640642166137695, 'learning_rate': 3.282132001736865e-05, 'epoch': 1.08}


 36%|███▌      | 6840/18924 [3:46:19<6:26:56,  1.92s/it]

{'loss': 0.6785, 'grad_norm': 9.673201560974121, 'learning_rate': 3.279418150238819e-05, 'epoch': 1.08}


 36%|███▌      | 6850/18924 [3:46:39<6:46:05,  2.02s/it]

{'loss': 0.7426, 'grad_norm': 10.140338897705078, 'learning_rate': 3.276704298740773e-05, 'epoch': 1.09}


 36%|███▋      | 6860/18924 [3:47:00<7:06:37,  2.12s/it]

{'loss': 0.8756, 'grad_norm': 6.680406093597412, 'learning_rate': 3.2739904472427265e-05, 'epoch': 1.09}


 36%|███▋      | 6870/18924 [3:47:20<6:26:44,  1.93s/it]

{'loss': 0.7733, 'grad_norm': 11.762470245361328, 'learning_rate': 3.271276595744681e-05, 'epoch': 1.09}


 36%|███▋      | 6880/18924 [3:47:38<6:08:07,  1.83s/it]

{'loss': 0.6663, 'grad_norm': 13.586270332336426, 'learning_rate': 3.268562744246635e-05, 'epoch': 1.09}


 36%|███▋      | 6890/18924 [3:47:57<6:08:25,  1.84s/it]

{'loss': 0.8857, 'grad_norm': 9.111741065979004, 'learning_rate': 3.265848892748589e-05, 'epoch': 1.09}


 36%|███▋      | 6900/18924 [3:48:16<6:19:57,  1.90s/it]

{'loss': 0.5592, 'grad_norm': 8.052321434020996, 'learning_rate': 3.263135041250543e-05, 'epoch': 1.09}


 37%|███▋      | 6910/18924 [3:48:34<6:16:36,  1.88s/it]

{'loss': 0.7359, 'grad_norm': 12.83712100982666, 'learning_rate': 3.2604211897524966e-05, 'epoch': 1.1}


 37%|███▋      | 6920/18924 [3:48:53<6:09:54,  1.85s/it]

{'loss': 0.6952, 'grad_norm': 6.2021989822387695, 'learning_rate': 3.257707338254451e-05, 'epoch': 1.1}


 37%|███▋      | 6930/18924 [3:49:12<6:12:18,  1.86s/it]

{'loss': 0.7655, 'grad_norm': 11.663721084594727, 'learning_rate': 3.254993486756405e-05, 'epoch': 1.1}


 37%|███▋      | 6940/18924 [3:49:31<6:23:28,  1.92s/it]

{'loss': 0.7061, 'grad_norm': 5.8947038650512695, 'learning_rate': 3.252279635258359e-05, 'epoch': 1.1}


 37%|███▋      | 6950/18924 [3:49:50<6:20:53,  1.91s/it]

{'loss': 0.835, 'grad_norm': 5.951417922973633, 'learning_rate': 3.249565783760313e-05, 'epoch': 1.1}


 37%|███▋      | 6960/18924 [3:50:09<6:17:37,  1.89s/it]

{'loss': 0.6981, 'grad_norm': 10.550955772399902, 'learning_rate': 3.246851932262267e-05, 'epoch': 1.1}


 37%|███▋      | 6970/18924 [3:50:28<6:18:36,  1.90s/it]

{'loss': 0.7693, 'grad_norm': 11.999796867370605, 'learning_rate': 3.2441380807642205e-05, 'epoch': 1.1}


 37%|███▋      | 6980/18924 [3:50:47<6:14:43,  1.88s/it]

{'loss': 0.4972, 'grad_norm': 10.681428909301758, 'learning_rate': 3.241424229266174e-05, 'epoch': 1.11}


 37%|███▋      | 6990/18924 [3:51:05<6:11:45,  1.87s/it]

{'loss': 0.7376, 'grad_norm': 10.04135513305664, 'learning_rate': 3.2387103777681286e-05, 'epoch': 1.11}


 37%|███▋      | 7000/18924 [3:51:24<6:08:28,  1.85s/it]

{'loss': 0.5781, 'grad_norm': 6.067868709564209, 'learning_rate': 3.2359965262700824e-05, 'epoch': 1.11}


 37%|███▋      | 7010/18924 [3:51:43<6:15:23,  1.89s/it]

{'loss': 0.6146, 'grad_norm': 9.327213287353516, 'learning_rate': 3.233282674772037e-05, 'epoch': 1.11}


 37%|███▋      | 7020/18924 [3:52:02<6:15:22,  1.89s/it]

{'loss': 0.739, 'grad_norm': 14.593610763549805, 'learning_rate': 3.2305688232739906e-05, 'epoch': 1.11}


 37%|███▋      | 7030/18924 [3:52:21<6:20:48,  1.92s/it]

{'loss': 0.5746, 'grad_norm': 9.304511070251465, 'learning_rate': 3.227854971775944e-05, 'epoch': 1.11}


 37%|███▋      | 7040/18924 [3:52:41<6:23:22,  1.94s/it]

{'loss': 0.577, 'grad_norm': 20.62403106689453, 'learning_rate': 3.225141120277899e-05, 'epoch': 1.12}


 37%|███▋      | 7050/18924 [3:53:02<7:27:00,  2.26s/it]

{'loss': 0.503, 'grad_norm': 14.457718849182129, 'learning_rate': 3.2224272687798525e-05, 'epoch': 1.12}


 37%|███▋      | 7060/18924 [3:53:25<7:26:00,  2.26s/it]

{'loss': 0.6294, 'grad_norm': 11.955256462097168, 'learning_rate': 3.219713417281807e-05, 'epoch': 1.12}


 37%|███▋      | 7070/18924 [3:53:45<6:25:16,  1.95s/it]

{'loss': 0.6824, 'grad_norm': 12.930627822875977, 'learning_rate': 3.2169995657837606e-05, 'epoch': 1.12}


 37%|███▋      | 7080/18924 [3:54:04<6:04:39,  1.85s/it]

{'loss': 0.6751, 'grad_norm': 6.110440254211426, 'learning_rate': 3.2142857142857144e-05, 'epoch': 1.12}


 37%|███▋      | 7090/18924 [3:54:22<6:03:40,  1.84s/it]

{'loss': 0.6482, 'grad_norm': 7.396611213684082, 'learning_rate': 3.211571862787668e-05, 'epoch': 1.12}


 38%|███▊      | 7100/18924 [3:54:41<6:08:47,  1.87s/it]

{'loss': 0.5567, 'grad_norm': 7.047054767608643, 'learning_rate': 3.208858011289622e-05, 'epoch': 1.13}


 38%|███▊      | 7110/18924 [3:55:00<6:15:42,  1.91s/it]

{'loss': 0.5448, 'grad_norm': 17.59351348876953, 'learning_rate': 3.206144159791576e-05, 'epoch': 1.13}


 38%|███▊      | 7120/18924 [3:55:19<6:12:34,  1.89s/it]

{'loss': 0.5493, 'grad_norm': 7.905555725097656, 'learning_rate': 3.20343030829353e-05, 'epoch': 1.13}


 38%|███▊      | 7130/18924 [3:55:38<6:13:53,  1.90s/it]

{'loss': 0.6931, 'grad_norm': 11.922015190124512, 'learning_rate': 3.2007164567954845e-05, 'epoch': 1.13}


 38%|███▊      | 7140/18924 [3:55:57<6:12:42,  1.90s/it]

{'loss': 0.8937, 'grad_norm': 12.862954139709473, 'learning_rate': 3.198002605297438e-05, 'epoch': 1.13}


 38%|███▊      | 7150/18924 [3:56:16<6:15:19,  1.91s/it]

{'loss': 0.6296, 'grad_norm': 6.770645618438721, 'learning_rate': 3.1952887537993927e-05, 'epoch': 1.13}


 38%|███▊      | 7160/18924 [3:56:35<6:13:25,  1.90s/it]

{'loss': 0.7447, 'grad_norm': 11.35000228881836, 'learning_rate': 3.1925749023013464e-05, 'epoch': 1.14}


 38%|███▊      | 7170/18924 [3:56:54<6:14:57,  1.91s/it]

{'loss': 0.5889, 'grad_norm': 5.856510162353516, 'learning_rate': 3.1898610508033e-05, 'epoch': 1.14}


 38%|███▊      | 7180/18924 [3:57:13<6:15:41,  1.92s/it]

{'loss': 0.7116, 'grad_norm': 10.737955093383789, 'learning_rate': 3.1871471993052546e-05, 'epoch': 1.14}


 38%|███▊      | 7190/18924 [3:57:33<6:15:40,  1.92s/it]

{'loss': 0.6962, 'grad_norm': 9.56878662109375, 'learning_rate': 3.184433347807208e-05, 'epoch': 1.14}


 38%|███▊      | 7200/18924 [3:57:52<6:11:33,  1.90s/it]

{'loss': 0.564, 'grad_norm': 6.316723346710205, 'learning_rate': 3.181719496309162e-05, 'epoch': 1.14}


 38%|███▊      | 7210/18924 [3:58:11<6:08:38,  1.89s/it]

{'loss': 0.6135, 'grad_norm': 12.595233917236328, 'learning_rate': 3.179005644811116e-05, 'epoch': 1.14}


 38%|███▊      | 7220/18924 [3:58:30<6:12:05,  1.91s/it]

{'loss': 0.7035, 'grad_norm': 9.618130683898926, 'learning_rate': 3.1762917933130696e-05, 'epoch': 1.14}


 38%|███▊      | 7230/18924 [3:58:49<6:13:01,  1.91s/it]

{'loss': 0.8767, 'grad_norm': 14.102339744567871, 'learning_rate': 3.173577941815024e-05, 'epoch': 1.15}


 38%|███▊      | 7240/18924 [3:59:08<6:04:11,  1.87s/it]

{'loss': 0.7997, 'grad_norm': 9.426420211791992, 'learning_rate': 3.170864090316978e-05, 'epoch': 1.15}


 38%|███▊      | 7250/18924 [3:59:26<5:58:30,  1.84s/it]

{'loss': 0.7619, 'grad_norm': 6.669231414794922, 'learning_rate': 3.168150238818932e-05, 'epoch': 1.15}


 38%|███▊      | 7260/18924 [3:59:45<6:00:44,  1.86s/it]

{'loss': 0.7735, 'grad_norm': 7.911169528961182, 'learning_rate': 3.165436387320886e-05, 'epoch': 1.15}


 38%|███▊      | 7270/18924 [4:00:03<6:03:17,  1.87s/it]

{'loss': 0.8054, 'grad_norm': 9.740285873413086, 'learning_rate': 3.16272253582284e-05, 'epoch': 1.15}


 38%|███▊      | 7280/18924 [4:00:22<6:11:12,  1.91s/it]

{'loss': 0.5477, 'grad_norm': 13.560030937194824, 'learning_rate': 3.160008684324794e-05, 'epoch': 1.15}


 39%|███▊      | 7290/18924 [4:00:41<6:00:30,  1.86s/it]

{'loss': 0.8179, 'grad_norm': 9.646904945373535, 'learning_rate': 3.157294832826748e-05, 'epoch': 1.16}


 39%|███▊      | 7300/18924 [4:01:00<5:57:47,  1.85s/it]

{'loss': 0.7976, 'grad_norm': 7.5150980949401855, 'learning_rate': 3.154580981328702e-05, 'epoch': 1.16}


 39%|███▊      | 7310/18924 [4:01:18<6:03:25,  1.88s/it]

{'loss': 0.5438, 'grad_norm': 7.218386650085449, 'learning_rate': 3.151867129830656e-05, 'epoch': 1.16}


 39%|███▊      | 7320/18924 [4:01:37<6:01:35,  1.87s/it]

{'loss': 0.7088, 'grad_norm': 7.973738670349121, 'learning_rate': 3.14915327833261e-05, 'epoch': 1.16}


 39%|███▊      | 7330/18924 [4:01:56<6:01:43,  1.87s/it]

{'loss': 0.7594, 'grad_norm': 3.7242369651794434, 'learning_rate': 3.1464394268345635e-05, 'epoch': 1.16}


 39%|███▉      | 7340/18924 [4:02:15<6:03:01,  1.88s/it]

{'loss': 0.833, 'grad_norm': 14.426888465881348, 'learning_rate': 3.143725575336517e-05, 'epoch': 1.16}


 39%|███▉      | 7350/18924 [4:02:33<6:03:35,  1.88s/it]

{'loss': 0.5424, 'grad_norm': 6.304374694824219, 'learning_rate': 3.141011723838472e-05, 'epoch': 1.17}


 39%|███▉      | 7360/18924 [4:02:52<6:03:47,  1.89s/it]

{'loss': 0.8218, 'grad_norm': 10.819860458374023, 'learning_rate': 3.1382978723404254e-05, 'epoch': 1.17}


 39%|███▉      | 7370/18924 [4:03:11<6:03:31,  1.89s/it]

{'loss': 0.5649, 'grad_norm': 8.30933666229248, 'learning_rate': 3.13558402084238e-05, 'epoch': 1.17}


 39%|███▉      | 7380/18924 [4:03:31<6:44:05,  2.10s/it]

{'loss': 0.6035, 'grad_norm': 7.028224468231201, 'learning_rate': 3.1328701693443336e-05, 'epoch': 1.17}


 39%|███▉      | 7390/18924 [4:03:51<6:11:36,  1.93s/it]

{'loss': 0.5735, 'grad_norm': 8.407990455627441, 'learning_rate': 3.130156317846288e-05, 'epoch': 1.17}


 39%|███▉      | 7400/18924 [4:04:11<6:33:24,  2.05s/it]

{'loss': 0.7804, 'grad_norm': 5.914783000946045, 'learning_rate': 3.127442466348242e-05, 'epoch': 1.17}


 39%|███▉      | 7410/18924 [4:04:32<6:42:57,  2.10s/it]

{'loss': 0.7161, 'grad_norm': 16.22226905822754, 'learning_rate': 3.1247286148501955e-05, 'epoch': 1.17}


 39%|███▉      | 7420/18924 [4:04:53<6:29:27,  2.03s/it]

{'loss': 0.7987, 'grad_norm': 8.957353591918945, 'learning_rate': 3.12201476335215e-05, 'epoch': 1.18}


 39%|███▉      | 7430/18924 [4:05:12<5:56:01,  1.86s/it]

{'loss': 0.6384, 'grad_norm': 6.421966075897217, 'learning_rate': 3.119300911854103e-05, 'epoch': 1.18}


 39%|███▉      | 7440/18924 [4:05:31<6:19:44,  1.98s/it]

{'loss': 0.662, 'grad_norm': 10.069759368896484, 'learning_rate': 3.1165870603560574e-05, 'epoch': 1.18}


 39%|███▉      | 7450/18924 [4:05:52<6:32:41,  2.05s/it]

{'loss': 0.6674, 'grad_norm': 5.984777927398682, 'learning_rate': 3.113873208858011e-05, 'epoch': 1.18}


 39%|███▉      | 7460/18924 [4:06:12<6:13:12,  1.95s/it]

{'loss': 0.6023, 'grad_norm': 9.704704284667969, 'learning_rate': 3.1111593573599656e-05, 'epoch': 1.18}


 39%|███▉      | 7470/18924 [4:06:30<5:48:43,  1.83s/it]

{'loss': 0.8593, 'grad_norm': 7.3547868728637695, 'learning_rate': 3.1084455058619193e-05, 'epoch': 1.18}


 40%|███▉      | 7480/18924 [4:06:49<6:06:24,  1.92s/it]

{'loss': 0.6649, 'grad_norm': 6.7047858238220215, 'learning_rate': 3.105731654363873e-05, 'epoch': 1.19}


 40%|███▉      | 7490/18924 [4:07:09<6:35:43,  2.08s/it]

{'loss': 0.7799, 'grad_norm': 8.52795124053955, 'learning_rate': 3.1030178028658275e-05, 'epoch': 1.19}


 40%|███▉      | 7500/18924 [4:07:30<6:17:45,  1.98s/it]

{'loss': 0.7332, 'grad_norm': 5.458614349365234, 'learning_rate': 3.100303951367781e-05, 'epoch': 1.19}


 40%|███▉      | 7510/18924 [4:07:49<6:00:22,  1.89s/it]

{'loss': 0.4794, 'grad_norm': 7.172079086303711, 'learning_rate': 3.097590099869736e-05, 'epoch': 1.19}


 40%|███▉      | 7520/18924 [4:08:09<6:20:12,  2.00s/it]

{'loss': 0.783, 'grad_norm': 7.286491870880127, 'learning_rate': 3.0948762483716894e-05, 'epoch': 1.19}


 40%|███▉      | 7530/18924 [4:08:30<6:27:10,  2.04s/it]

{'loss': 0.5866, 'grad_norm': 16.33221435546875, 'learning_rate': 3.092162396873643e-05, 'epoch': 1.19}


 40%|███▉      | 7540/18924 [4:08:49<6:01:06,  1.90s/it]

{'loss': 0.5831, 'grad_norm': 6.338925361633301, 'learning_rate': 3.089448545375597e-05, 'epoch': 1.2}


 40%|███▉      | 7550/18924 [4:09:07<5:45:34,  1.82s/it]

{'loss': 0.6921, 'grad_norm': 16.352825164794922, 'learning_rate': 3.086734693877551e-05, 'epoch': 1.2}


 40%|███▉      | 7560/18924 [4:09:26<6:07:12,  1.94s/it]

{'loss': 0.7481, 'grad_norm': 10.315762519836426, 'learning_rate': 3.084020842379505e-05, 'epoch': 1.2}


 40%|████      | 7570/18924 [4:09:47<6:24:50,  2.03s/it]

{'loss': 0.7689, 'grad_norm': 10.063142776489258, 'learning_rate': 3.081306990881459e-05, 'epoch': 1.2}


 40%|████      | 7580/18924 [4:10:06<6:03:43,  1.92s/it]

{'loss': 0.4872, 'grad_norm': 10.554405212402344, 'learning_rate': 3.078593139383413e-05, 'epoch': 1.2}


 40%|████      | 7590/18924 [4:10:25<5:55:20,  1.88s/it]

{'loss': 0.492, 'grad_norm': 6.883671283721924, 'learning_rate': 3.075879287885367e-05, 'epoch': 1.2}


 40%|████      | 7600/18924 [4:10:45<6:20:07,  2.01s/it]

{'loss': 0.5397, 'grad_norm': 6.115610122680664, 'learning_rate': 3.073165436387321e-05, 'epoch': 1.2}


 40%|████      | 7610/18924 [4:11:05<6:22:08,  2.03s/it]

{'loss': 0.7699, 'grad_norm': 10.022851943969727, 'learning_rate': 3.070451584889275e-05, 'epoch': 1.21}


 40%|████      | 7620/18924 [4:11:25<5:57:23,  1.90s/it]

{'loss': 0.7875, 'grad_norm': 15.505417823791504, 'learning_rate': 3.067737733391229e-05, 'epoch': 1.21}


 40%|████      | 7630/18924 [4:11:43<5:42:34,  1.82s/it]

{'loss': 0.8261, 'grad_norm': 9.339781761169434, 'learning_rate': 3.0650238818931834e-05, 'epoch': 1.21}


 40%|████      | 7640/18924 [4:12:02<5:54:27,  1.88s/it]

{'loss': 0.6365, 'grad_norm': 9.902382850646973, 'learning_rate': 3.062310030395137e-05, 'epoch': 1.21}


 40%|████      | 7650/18924 [4:12:21<6:12:37,  1.98s/it]

{'loss': 0.6978, 'grad_norm': 8.670090675354004, 'learning_rate': 3.059596178897091e-05, 'epoch': 1.21}


 40%|████      | 7660/18924 [4:12:41<6:01:11,  1.92s/it]

{'loss': 0.5836, 'grad_norm': 11.120723724365234, 'learning_rate': 3.0568823273990446e-05, 'epoch': 1.21}


 41%|████      | 7670/18924 [4:12:59<5:46:47,  1.85s/it]

{'loss': 0.5272, 'grad_norm': 7.620254993438721, 'learning_rate': 3.0541684759009984e-05, 'epoch': 1.22}


 41%|████      | 7680/18924 [4:13:18<5:55:39,  1.90s/it]

{'loss': 0.6072, 'grad_norm': 6.183027267456055, 'learning_rate': 3.0514546244029528e-05, 'epoch': 1.22}


 41%|████      | 7690/18924 [4:13:37<5:53:01,  1.89s/it]

{'loss': 0.6324, 'grad_norm': 12.262999534606934, 'learning_rate': 3.0487407729049065e-05, 'epoch': 1.22}


 41%|████      | 7700/18924 [4:13:56<5:45:07,  1.84s/it]

{'loss': 0.748, 'grad_norm': 10.974340438842773, 'learning_rate': 3.046026921406861e-05, 'epoch': 1.22}


 41%|████      | 7710/18924 [4:14:14<5:44:19,  1.84s/it]

{'loss': 0.7206, 'grad_norm': 11.087532997131348, 'learning_rate': 3.0433130699088147e-05, 'epoch': 1.22}


 41%|████      | 7720/18924 [4:14:34<6:06:45,  1.96s/it]

{'loss': 0.7836, 'grad_norm': 9.587959289550781, 'learning_rate': 3.0405992184107685e-05, 'epoch': 1.22}


 41%|████      | 7730/18924 [4:14:54<6:23:49,  2.06s/it]

{'loss': 0.8506, 'grad_norm': 7.333005428314209, 'learning_rate': 3.037885366912723e-05, 'epoch': 1.23}


 41%|████      | 7740/18924 [4:15:14<6:01:33,  1.94s/it]

{'loss': 0.6445, 'grad_norm': 11.462794303894043, 'learning_rate': 3.0351715154146766e-05, 'epoch': 1.23}


 41%|████      | 7750/18924 [4:15:33<5:45:42,  1.86s/it]

{'loss': 0.7627, 'grad_norm': 7.627843379974365, 'learning_rate': 3.0324576639166307e-05, 'epoch': 1.23}


 41%|████      | 7760/18924 [4:15:51<5:46:15,  1.86s/it]

{'loss': 0.652, 'grad_norm': 10.600005149841309, 'learning_rate': 3.0297438124185845e-05, 'epoch': 1.23}


 41%|████      | 7770/18924 [4:16:10<5:53:10,  1.90s/it]

{'loss': 0.686, 'grad_norm': 11.555221557617188, 'learning_rate': 3.027029960920539e-05, 'epoch': 1.23}


 41%|████      | 7780/18924 [4:16:30<6:03:27,  1.96s/it]

{'loss': 0.7078, 'grad_norm': 11.861010551452637, 'learning_rate': 3.0243161094224926e-05, 'epoch': 1.23}


 41%|████      | 7790/18924 [4:16:49<5:53:43,  1.91s/it]

{'loss': 0.8039, 'grad_norm': 10.261106491088867, 'learning_rate': 3.0216022579244464e-05, 'epoch': 1.23}


 41%|████      | 7800/18924 [4:17:09<6:15:24,  2.02s/it]

{'loss': 0.621, 'grad_norm': 9.213623046875, 'learning_rate': 3.0188884064264005e-05, 'epoch': 1.24}


 41%|████▏     | 7810/18924 [4:17:30<6:23:58,  2.07s/it]

{'loss': 0.7424, 'grad_norm': 10.686029434204102, 'learning_rate': 3.0161745549283542e-05, 'epoch': 1.24}


 41%|████▏     | 7820/18924 [4:17:49<5:47:27,  1.88s/it]

{'loss': 0.7753, 'grad_norm': 10.383392333984375, 'learning_rate': 3.0134607034303086e-05, 'epoch': 1.24}


 41%|████▏     | 7830/18924 [4:18:07<5:35:25,  1.81s/it]

{'loss': 0.7849, 'grad_norm': 9.00933837890625, 'learning_rate': 3.0107468519322624e-05, 'epoch': 1.24}


 41%|████▏     | 7840/18924 [4:18:26<5:53:52,  1.92s/it]

{'loss': 0.5807, 'grad_norm': 5.574733257293701, 'learning_rate': 3.0080330004342168e-05, 'epoch': 1.24}


 41%|████▏     | 7850/18924 [4:18:46<6:06:14,  1.98s/it]

{'loss': 0.6636, 'grad_norm': 11.278008460998535, 'learning_rate': 3.0053191489361706e-05, 'epoch': 1.24}


 42%|████▏     | 7860/18924 [4:19:05<5:52:32,  1.91s/it]

{'loss': 0.7551, 'grad_norm': 10.383442878723145, 'learning_rate': 3.002605297438124e-05, 'epoch': 1.25}


 42%|████▏     | 7870/18924 [4:19:24<5:55:15,  1.93s/it]

{'loss': 0.701, 'grad_norm': 10.701563835144043, 'learning_rate': 2.9998914459400784e-05, 'epoch': 1.25}


 42%|████▏     | 7880/18924 [4:19:44<6:05:18,  1.98s/it]

{'loss': 0.6858, 'grad_norm': 10.262028694152832, 'learning_rate': 2.997177594442032e-05, 'epoch': 1.25}


 42%|████▏     | 7890/18924 [4:20:03<6:02:26,  1.97s/it]

{'loss': 0.6179, 'grad_norm': 7.657776355743408, 'learning_rate': 2.9944637429439866e-05, 'epoch': 1.25}


 42%|████▏     | 7900/18924 [4:20:22<5:42:59,  1.87s/it]

{'loss': 0.6375, 'grad_norm': 8.598383903503418, 'learning_rate': 2.9917498914459403e-05, 'epoch': 1.25}


 42%|████▏     | 7910/18924 [4:20:41<5:30:10,  1.80s/it]

{'loss': 0.7082, 'grad_norm': 6.651242256164551, 'learning_rate': 2.989036039947894e-05, 'epoch': 1.25}


 42%|████▏     | 7920/18924 [4:20:58<5:27:03,  1.78s/it]

{'loss': 0.5556, 'grad_norm': 6.008552074432373, 'learning_rate': 2.986322188449848e-05, 'epoch': 1.26}


 42%|████▏     | 7930/18924 [4:21:16<5:29:40,  1.80s/it]

{'loss': 0.7159, 'grad_norm': 10.563888549804688, 'learning_rate': 2.983608336951802e-05, 'epoch': 1.26}


 42%|████▏     | 7940/18924 [4:21:35<5:35:44,  1.83s/it]

{'loss': 0.7769, 'grad_norm': 9.113810539245605, 'learning_rate': 2.9808944854537563e-05, 'epoch': 1.26}


 42%|████▏     | 7950/18924 [4:21:53<5:41:35,  1.87s/it]

{'loss': 0.6397, 'grad_norm': 7.5291056632995605, 'learning_rate': 2.97818063395571e-05, 'epoch': 1.26}


 42%|████▏     | 7960/18924 [4:22:13<6:02:16,  1.98s/it]

{'loss': 0.6597, 'grad_norm': 8.498880386352539, 'learning_rate': 2.975466782457664e-05, 'epoch': 1.26}


 42%|████▏     | 7970/18924 [4:22:34<6:25:02,  2.11s/it]

{'loss': 0.6129, 'grad_norm': 7.78417444229126, 'learning_rate': 2.972752930959618e-05, 'epoch': 1.26}


 42%|████▏     | 7980/18924 [4:22:53<5:48:07,  1.91s/it]

{'loss': 0.6999, 'grad_norm': 7.798642635345459, 'learning_rate': 2.9700390794615716e-05, 'epoch': 1.27}


 42%|████▏     | 7990/18924 [4:23:11<5:26:09,  1.79s/it]

{'loss': 0.7491, 'grad_norm': 9.724323272705078, 'learning_rate': 2.967325227963526e-05, 'epoch': 1.27}


 42%|████▏     | 8000/18924 [4:23:29<5:25:28,  1.79s/it]

{'loss': 0.6046, 'grad_norm': 4.409224033355713, 'learning_rate': 2.9646113764654798e-05, 'epoch': 1.27}


 42%|████▏     | 8010/18924 [4:23:48<5:29:38,  1.81s/it]

{'loss': 0.8212, 'grad_norm': 10.070090293884277, 'learning_rate': 2.9618975249674342e-05, 'epoch': 1.27}


 42%|████▏     | 8020/18924 [4:24:07<5:36:46,  1.85s/it]

{'loss': 0.5436, 'grad_norm': 13.302068710327148, 'learning_rate': 2.959183673469388e-05, 'epoch': 1.27}


 42%|████▏     | 8030/18924 [4:24:25<5:36:17,  1.85s/it]

{'loss': 0.743, 'grad_norm': 9.283529281616211, 'learning_rate': 2.956469821971342e-05, 'epoch': 1.27}


 42%|████▏     | 8040/18924 [4:24:44<5:53:03,  1.95s/it]

{'loss': 0.6575, 'grad_norm': 7.5240797996521, 'learning_rate': 2.9537559704732958e-05, 'epoch': 1.27}


 43%|████▎     | 8050/18924 [4:25:04<6:04:08,  2.01s/it]

{'loss': 0.6985, 'grad_norm': 5.070080757141113, 'learning_rate': 2.9510421189752496e-05, 'epoch': 1.28}


 43%|████▎     | 8060/18924 [4:25:24<5:46:50,  1.92s/it]

{'loss': 0.5866, 'grad_norm': 6.767513751983643, 'learning_rate': 2.948328267477204e-05, 'epoch': 1.28}


 43%|████▎     | 8070/18924 [4:25:42<5:34:02,  1.85s/it]

{'loss': 0.8257, 'grad_norm': 12.465803146362305, 'learning_rate': 2.9456144159791577e-05, 'epoch': 1.28}


 43%|████▎     | 8080/18924 [4:26:02<5:53:38,  1.96s/it]

{'loss': 0.5381, 'grad_norm': 8.187469482421875, 'learning_rate': 2.9429005644811118e-05, 'epoch': 1.28}


 43%|████▎     | 8090/18924 [4:26:22<6:13:24,  2.07s/it]

{'loss': 0.7257, 'grad_norm': 5.939581394195557, 'learning_rate': 2.9401867129830656e-05, 'epoch': 1.28}


 43%|████▎     | 8100/18924 [4:26:42<5:40:07,  1.89s/it]

{'loss': 0.7448, 'grad_norm': 6.958425998687744, 'learning_rate': 2.9374728614850193e-05, 'epoch': 1.28}


 43%|████▎     | 8110/18924 [4:27:00<5:34:14,  1.85s/it]

{'loss': 0.8035, 'grad_norm': 6.511173725128174, 'learning_rate': 2.9347590099869737e-05, 'epoch': 1.29}


 43%|████▎     | 8120/18924 [4:27:19<5:51:35,  1.95s/it]

{'loss': 0.9019, 'grad_norm': 7.768550872802734, 'learning_rate': 2.9320451584889275e-05, 'epoch': 1.29}


 43%|████▎     | 8130/18924 [4:27:39<5:56:29,  1.98s/it]

{'loss': 0.4863, 'grad_norm': 7.83866548538208, 'learning_rate': 2.929331306990882e-05, 'epoch': 1.29}


 43%|████▎     | 8140/18924 [4:27:58<5:39:28,  1.89s/it]

{'loss': 0.6424, 'grad_norm': 8.80080795288086, 'learning_rate': 2.9266174554928357e-05, 'epoch': 1.29}


 43%|████▎     | 8150/18924 [4:28:17<5:25:08,  1.81s/it]

{'loss': 0.6045, 'grad_norm': 11.573100090026855, 'learning_rate': 2.9239036039947898e-05, 'epoch': 1.29}


 43%|████▎     | 8160/18924 [4:28:34<5:20:13,  1.79s/it]

{'loss': 0.6872, 'grad_norm': 9.81789493560791, 'learning_rate': 2.9211897524967435e-05, 'epoch': 1.29}


 43%|████▎     | 8170/18924 [4:28:52<5:21:36,  1.79s/it]

{'loss': 0.5933, 'grad_norm': 9.685409545898438, 'learning_rate': 2.9184759009986972e-05, 'epoch': 1.3}


 43%|████▎     | 8180/18924 [4:29:11<5:27:15,  1.83s/it]

{'loss': 0.6338, 'grad_norm': 5.240006446838379, 'learning_rate': 2.9157620495006517e-05, 'epoch': 1.3}


 43%|████▎     | 8190/18924 [4:29:29<5:40:37,  1.90s/it]

{'loss': 0.6545, 'grad_norm': 13.749588012695312, 'learning_rate': 2.9130481980026054e-05, 'epoch': 1.3}


 43%|████▎     | 8200/18924 [4:29:50<6:12:03,  2.08s/it]

{'loss': 0.7232, 'grad_norm': 7.836121559143066, 'learning_rate': 2.9103343465045595e-05, 'epoch': 1.3}


 43%|████▎     | 8210/18924 [4:30:11<6:07:31,  2.06s/it]

{'loss': 0.5514, 'grad_norm': 11.593165397644043, 'learning_rate': 2.9076204950065133e-05, 'epoch': 1.3}


 43%|████▎     | 8220/18924 [4:30:30<5:41:18,  1.91s/it]

{'loss': 0.7826, 'grad_norm': 10.860513687133789, 'learning_rate': 2.904906643508467e-05, 'epoch': 1.3}


 43%|████▎     | 8230/18924 [4:30:49<5:24:42,  1.82s/it]

{'loss': 0.5469, 'grad_norm': 9.058963775634766, 'learning_rate': 2.9021927920104214e-05, 'epoch': 1.3}


 44%|████▎     | 8240/18924 [4:31:07<5:21:46,  1.81s/it]

{'loss': 0.69, 'grad_norm': 8.500295639038086, 'learning_rate': 2.8994789405123752e-05, 'epoch': 1.31}


 44%|████▎     | 8250/18924 [4:31:25<5:21:31,  1.81s/it]

{'loss': 0.7381, 'grad_norm': 5.703226089477539, 'learning_rate': 2.8967650890143293e-05, 'epoch': 1.31}


 44%|████▎     | 8260/18924 [4:31:43<5:25:49,  1.83s/it]

{'loss': 0.6784, 'grad_norm': 5.986019611358643, 'learning_rate': 2.894051237516283e-05, 'epoch': 1.31}


 44%|████▎     | 8270/18924 [4:32:02<5:27:58,  1.85s/it]

{'loss': 0.7052, 'grad_norm': 8.260563850402832, 'learning_rate': 2.8913373860182374e-05, 'epoch': 1.31}


 44%|████▍     | 8280/18924 [4:32:20<5:29:49,  1.86s/it]

{'loss': 0.9249, 'grad_norm': 8.079997062683105, 'learning_rate': 2.8886235345201912e-05, 'epoch': 1.31}


 44%|████▍     | 8290/18924 [4:32:39<5:31:54,  1.87s/it]

{'loss': 0.6811, 'grad_norm': 10.077020645141602, 'learning_rate': 2.885909683022145e-05, 'epoch': 1.31}


 44%|████▍     | 8300/18924 [4:32:58<5:30:40,  1.87s/it]

{'loss': 0.6947, 'grad_norm': 11.764946937561035, 'learning_rate': 2.8831958315240993e-05, 'epoch': 1.32}


 44%|████▍     | 8310/18924 [4:33:16<5:30:49,  1.87s/it]

{'loss': 0.7156, 'grad_norm': 5.061513900756836, 'learning_rate': 2.880481980026053e-05, 'epoch': 1.32}


 44%|████▍     | 8320/18924 [4:33:35<5:30:46,  1.87s/it]

{'loss': 0.7257, 'grad_norm': 14.191325187683105, 'learning_rate': 2.8777681285280072e-05, 'epoch': 1.32}


 44%|████▍     | 8330/18924 [4:33:54<5:32:09,  1.88s/it]

{'loss': 0.6961, 'grad_norm': 10.966190338134766, 'learning_rate': 2.875054277029961e-05, 'epoch': 1.32}


 44%|████▍     | 8340/18924 [4:34:12<5:28:12,  1.86s/it]

{'loss': 0.5652, 'grad_norm': 8.156688690185547, 'learning_rate': 2.8723404255319154e-05, 'epoch': 1.32}


 44%|████▍     | 8350/18924 [4:34:31<5:27:08,  1.86s/it]

{'loss': 0.8339, 'grad_norm': 12.427794456481934, 'learning_rate': 2.869626574033869e-05, 'epoch': 1.32}


 44%|████▍     | 8360/18924 [4:34:50<5:47:06,  1.97s/it]

{'loss': 0.7204, 'grad_norm': 5.36499547958374, 'learning_rate': 2.866912722535823e-05, 'epoch': 1.33}


 44%|████▍     | 8370/18924 [4:35:11<6:11:07,  2.11s/it]

{'loss': 0.8091, 'grad_norm': 10.134705543518066, 'learning_rate': 2.864198871037777e-05, 'epoch': 1.33}


 44%|████▍     | 8380/18924 [4:35:32<5:55:36,  2.02s/it]

{'loss': 0.5593, 'grad_norm': 11.222662925720215, 'learning_rate': 2.8614850195397307e-05, 'epoch': 1.33}


 44%|████▍     | 8390/18924 [4:35:51<5:31:58,  1.89s/it]

{'loss': 0.5334, 'grad_norm': 6.295616149902344, 'learning_rate': 2.858771168041685e-05, 'epoch': 1.33}


 44%|████▍     | 8400/18924 [4:36:11<5:53:24,  2.01s/it]

{'loss': 0.6012, 'grad_norm': 13.711288452148438, 'learning_rate': 2.856057316543639e-05, 'epoch': 1.33}


 44%|████▍     | 8410/18924 [4:36:31<6:00:23,  2.06s/it]

{'loss': 0.7727, 'grad_norm': 7.115315914154053, 'learning_rate': 2.8533434650455926e-05, 'epoch': 1.33}


 44%|████▍     | 8420/18924 [4:36:51<5:39:43,  1.94s/it]

{'loss': 0.7548, 'grad_norm': 10.201436042785645, 'learning_rate': 2.850629613547547e-05, 'epoch': 1.33}


 45%|████▍     | 8430/18924 [4:37:10<5:24:13,  1.85s/it]

{'loss': 0.5512, 'grad_norm': 11.722908973693848, 'learning_rate': 2.8479157620495008e-05, 'epoch': 1.34}


 45%|████▍     | 8440/18924 [4:37:29<5:37:49,  1.93s/it]

{'loss': 0.6722, 'grad_norm': 12.680015563964844, 'learning_rate': 2.845201910551455e-05, 'epoch': 1.34}


 45%|████▍     | 8450/18924 [4:37:49<5:55:43,  2.04s/it]

{'loss': 0.8619, 'grad_norm': 13.357912063598633, 'learning_rate': 2.8424880590534086e-05, 'epoch': 1.34}


 45%|████▍     | 8460/18924 [4:38:09<5:37:02,  1.93s/it]

{'loss': 0.6763, 'grad_norm': 7.282384872436523, 'learning_rate': 2.839774207555363e-05, 'epoch': 1.34}


 45%|████▍     | 8470/18924 [4:38:28<5:31:25,  1.90s/it]

{'loss': 0.6939, 'grad_norm': 9.643247604370117, 'learning_rate': 2.8370603560573168e-05, 'epoch': 1.34}


 45%|████▍     | 8480/18924 [4:38:48<5:55:03,  2.04s/it]

{'loss': 0.6676, 'grad_norm': 3.895720958709717, 'learning_rate': 2.8343465045592705e-05, 'epoch': 1.34}


 45%|████▍     | 8490/18924 [4:39:08<5:53:32,  2.03s/it]

{'loss': 0.5936, 'grad_norm': 8.4970703125, 'learning_rate': 2.8316326530612246e-05, 'epoch': 1.35}


 45%|████▍     | 8500/18924 [4:39:27<5:26:54,  1.88s/it]

{'loss': 0.6234, 'grad_norm': 7.050299167633057, 'learning_rate': 2.8289188015631784e-05, 'epoch': 1.35}


 45%|████▍     | 8510/18924 [4:39:47<5:28:21,  1.89s/it]

{'loss': 0.6534, 'grad_norm': 8.276975631713867, 'learning_rate': 2.8262049500651328e-05, 'epoch': 1.35}


 45%|████▌     | 8520/18924 [4:40:07<5:49:18,  2.01s/it]

{'loss': 0.7311, 'grad_norm': 13.425023078918457, 'learning_rate': 2.8234910985670865e-05, 'epoch': 1.35}


 45%|████▌     | 8530/18924 [4:40:27<5:46:56,  2.00s/it]

{'loss': 0.7232, 'grad_norm': 12.865071296691895, 'learning_rate': 2.8207772470690406e-05, 'epoch': 1.35}


 45%|████▌     | 8540/18924 [4:40:46<5:24:14,  1.87s/it]

{'loss': 0.7314, 'grad_norm': 10.093493461608887, 'learning_rate': 2.8180633955709944e-05, 'epoch': 1.35}


 45%|████▌     | 8550/18924 [4:41:04<5:23:05,  1.87s/it]

{'loss': 0.6239, 'grad_norm': 6.8343658447265625, 'learning_rate': 2.815349544072948e-05, 'epoch': 1.36}


 45%|████▌     | 8560/18924 [4:41:24<5:46:01,  2.00s/it]

{'loss': 0.7798, 'grad_norm': 9.070432662963867, 'learning_rate': 2.8126356925749025e-05, 'epoch': 1.36}


 45%|████▌     | 8570/18924 [4:41:45<5:50:45,  2.03s/it]

{'loss': 0.7346, 'grad_norm': 7.018474102020264, 'learning_rate': 2.8099218410768563e-05, 'epoch': 1.36}


 45%|████▌     | 8580/18924 [4:42:04<5:21:27,  1.86s/it]

{'loss': 0.6113, 'grad_norm': 8.551788330078125, 'learning_rate': 2.8072079895788107e-05, 'epoch': 1.36}


 45%|████▌     | 8590/18924 [4:42:23<5:30:40,  1.92s/it]

{'loss': 0.804, 'grad_norm': 13.205939292907715, 'learning_rate': 2.8044941380807645e-05, 'epoch': 1.36}


 45%|████▌     | 8600/18924 [4:42:43<5:49:59,  2.03s/it]

{'loss': 0.5891, 'grad_norm': 5.155694961547852, 'learning_rate': 2.8017802865827182e-05, 'epoch': 1.36}


 45%|████▌     | 8610/18924 [4:43:03<5:49:47,  2.03s/it]

{'loss': 0.7132, 'grad_norm': 11.44472599029541, 'learning_rate': 2.7990664350846723e-05, 'epoch': 1.36}


 46%|████▌     | 8620/18924 [4:43:22<5:18:25,  1.85s/it]

{'loss': 0.6352, 'grad_norm': 9.045724868774414, 'learning_rate': 2.796352583586626e-05, 'epoch': 1.37}


 46%|████▌     | 8630/18924 [4:43:40<5:11:10,  1.81s/it]

{'loss': 0.6555, 'grad_norm': 10.756526947021484, 'learning_rate': 2.7936387320885805e-05, 'epoch': 1.37}


 46%|████▌     | 8640/18924 [4:43:59<5:16:25,  1.85s/it]

{'loss': 0.6207, 'grad_norm': 13.041650772094727, 'learning_rate': 2.7909248805905342e-05, 'epoch': 1.37}


 46%|████▌     | 8650/18924 [4:44:17<5:15:11,  1.84s/it]

{'loss': 0.6394, 'grad_norm': 8.003390312194824, 'learning_rate': 2.7882110290924883e-05, 'epoch': 1.37}


 46%|████▌     | 8660/18924 [4:44:36<5:15:45,  1.85s/it]

{'loss': 0.5621, 'grad_norm': 6.039896011352539, 'learning_rate': 2.785497177594442e-05, 'epoch': 1.37}


 46%|████▌     | 8670/18924 [4:44:55<5:25:42,  1.91s/it]

{'loss': 0.7908, 'grad_norm': 10.995311737060547, 'learning_rate': 2.7827833260963958e-05, 'epoch': 1.37}


 46%|████▌     | 8680/18924 [4:45:13<5:16:55,  1.86s/it]

{'loss': 0.5803, 'grad_norm': 8.183943748474121, 'learning_rate': 2.7800694745983502e-05, 'epoch': 1.38}


 46%|████▌     | 8690/18924 [4:45:32<5:12:09,  1.83s/it]

{'loss': 0.5626, 'grad_norm': 9.992177963256836, 'learning_rate': 2.777355623100304e-05, 'epoch': 1.38}


 46%|████▌     | 8700/18924 [4:45:50<5:14:31,  1.85s/it]

{'loss': 0.7574, 'grad_norm': 12.443404197692871, 'learning_rate': 2.7746417716022584e-05, 'epoch': 1.38}


 46%|████▌     | 8710/18924 [4:46:09<5:20:50,  1.88s/it]

{'loss': 0.6238, 'grad_norm': 10.386009216308594, 'learning_rate': 2.771927920104212e-05, 'epoch': 1.38}


 46%|████▌     | 8720/18924 [4:46:28<5:21:30,  1.89s/it]

{'loss': 0.9927, 'grad_norm': 10.523354530334473, 'learning_rate': 2.7692140686061662e-05, 'epoch': 1.38}


 46%|████▌     | 8730/18924 [4:46:47<5:17:54,  1.87s/it]

{'loss': 0.6098, 'grad_norm': 8.60322093963623, 'learning_rate': 2.76650021710812e-05, 'epoch': 1.38}


 46%|████▌     | 8740/18924 [4:47:05<5:16:39,  1.87s/it]

{'loss': 0.6171, 'grad_norm': 11.081256866455078, 'learning_rate': 2.7637863656100737e-05, 'epoch': 1.39}


 46%|████▌     | 8750/18924 [4:47:24<5:18:25,  1.88s/it]

{'loss': 0.6948, 'grad_norm': 10.824217796325684, 'learning_rate': 2.761072514112028e-05, 'epoch': 1.39}


 46%|████▋     | 8760/18924 [4:47:43<5:20:06,  1.89s/it]

{'loss': 0.584, 'grad_norm': 7.103282928466797, 'learning_rate': 2.758358662613982e-05, 'epoch': 1.39}


 46%|████▋     | 8770/18924 [4:48:02<5:32:03,  1.96s/it]

{'loss': 0.7067, 'grad_norm': 11.557402610778809, 'learning_rate': 2.755644811115936e-05, 'epoch': 1.39}


 46%|████▋     | 8780/18924 [4:48:22<5:27:23,  1.94s/it]

{'loss': 0.7222, 'grad_norm': 19.067134857177734, 'learning_rate': 2.7529309596178897e-05, 'epoch': 1.39}


 46%|████▋     | 8790/18924 [4:48:41<5:31:18,  1.96s/it]

{'loss': 0.5975, 'grad_norm': 13.97273063659668, 'learning_rate': 2.7502171081198435e-05, 'epoch': 1.39}


 47%|████▋     | 8800/18924 [4:49:02<5:51:53,  2.09s/it]

{'loss': 0.6467, 'grad_norm': 12.184931755065918, 'learning_rate': 2.747503256621798e-05, 'epoch': 1.4}


 47%|████▋     | 8810/18924 [4:49:23<5:53:51,  2.10s/it]

{'loss': 0.7025, 'grad_norm': 10.333008766174316, 'learning_rate': 2.7447894051237516e-05, 'epoch': 1.4}


 47%|████▋     | 8820/18924 [4:49:42<5:16:10,  1.88s/it]

{'loss': 0.8705, 'grad_norm': 8.848164558410645, 'learning_rate': 2.7420755536257057e-05, 'epoch': 1.4}


 47%|████▋     | 8830/18924 [4:50:01<5:10:32,  1.85s/it]

{'loss': 0.7591, 'grad_norm': 9.247847557067871, 'learning_rate': 2.7393617021276595e-05, 'epoch': 1.4}


 47%|████▋     | 8840/18924 [4:50:20<5:35:35,  2.00s/it]

{'loss': 0.755, 'grad_norm': 11.288917541503906, 'learning_rate': 2.736647850629614e-05, 'epoch': 1.4}


 47%|████▋     | 8850/18924 [4:50:41<5:49:16,  2.08s/it]

{'loss': 0.7582, 'grad_norm': 7.060076713562012, 'learning_rate': 2.7339339991315677e-05, 'epoch': 1.4}


 47%|████▋     | 8860/18924 [4:51:00<5:16:36,  1.89s/it]

{'loss': 0.7824, 'grad_norm': 7.076761245727539, 'learning_rate': 2.7312201476335214e-05, 'epoch': 1.4}


 47%|████▋     | 8870/18924 [4:51:19<5:09:21,  1.85s/it]

{'loss': 0.6023, 'grad_norm': 6.4406232833862305, 'learning_rate': 2.7285062961354758e-05, 'epoch': 1.41}


 47%|████▋     | 8880/18924 [4:51:38<5:29:52,  1.97s/it]

{'loss': 0.5849, 'grad_norm': 12.319637298583984, 'learning_rate': 2.7257924446374296e-05, 'epoch': 1.41}


 47%|████▋     | 8890/18924 [4:51:59<5:40:35,  2.04s/it]

{'loss': 0.7528, 'grad_norm': 14.0695219039917, 'learning_rate': 2.7230785931393837e-05, 'epoch': 1.41}


 47%|████▋     | 8900/18924 [4:52:18<5:13:15,  1.88s/it]

{'loss': 0.6144, 'grad_norm': 6.781679630279541, 'learning_rate': 2.7203647416413374e-05, 'epoch': 1.41}


 47%|████▋     | 8910/18924 [4:52:37<5:24:32,  1.94s/it]

{'loss': 0.6835, 'grad_norm': 9.96961784362793, 'learning_rate': 2.717650890143291e-05, 'epoch': 1.41}


 47%|████▋     | 8920/18924 [4:52:57<5:39:43,  2.04s/it]

{'loss': 0.6883, 'grad_norm': 6.964280128479004, 'learning_rate': 2.7149370386452456e-05, 'epoch': 1.41}


 47%|████▋     | 8930/18924 [4:53:17<5:32:36,  2.00s/it]

{'loss': 0.8081, 'grad_norm': 10.501177787780762, 'learning_rate': 2.7122231871471993e-05, 'epoch': 1.42}


 47%|████▋     | 8940/18924 [4:53:36<5:08:25,  1.85s/it]

{'loss': 0.6434, 'grad_norm': 8.467117309570312, 'learning_rate': 2.7095093356491534e-05, 'epoch': 1.42}


 47%|████▋     | 8950/18924 [4:53:54<5:01:11,  1.81s/it]

{'loss': 0.6037, 'grad_norm': 12.290331840515137, 'learning_rate': 2.706795484151107e-05, 'epoch': 1.42}


 47%|████▋     | 8960/18924 [4:54:12<5:00:06,  1.81s/it]

{'loss': 0.8521, 'grad_norm': 11.591900825500488, 'learning_rate': 2.7040816326530616e-05, 'epoch': 1.42}


 47%|████▋     | 8970/18924 [4:54:31<5:04:18,  1.83s/it]

{'loss': 0.7028, 'grad_norm': 11.226137161254883, 'learning_rate': 2.7013677811550153e-05, 'epoch': 1.42}


 47%|████▋     | 8980/18924 [4:54:49<5:07:04,  1.85s/it]

{'loss': 0.6652, 'grad_norm': 11.1915922164917, 'learning_rate': 2.698653929656969e-05, 'epoch': 1.42}


 48%|████▊     | 8990/18924 [4:55:08<5:06:54,  1.85s/it]

{'loss': 0.59, 'grad_norm': 7.870028018951416, 'learning_rate': 2.6959400781589235e-05, 'epoch': 1.43}


 48%|████▊     | 9000/18924 [4:55:27<5:27:33,  1.98s/it]

{'loss': 0.7995, 'grad_norm': 5.995444297790527, 'learning_rate': 2.6932262266608772e-05, 'epoch': 1.43}


 48%|████▊     | 9010/18924 [4:55:49<5:45:55,  2.09s/it]

{'loss': 0.5878, 'grad_norm': 5.895106792449951, 'learning_rate': 2.6905123751628313e-05, 'epoch': 1.43}


 48%|████▊     | 9020/18924 [4:56:08<5:09:18,  1.87s/it]

{'loss': 0.7068, 'grad_norm': 4.566312789916992, 'learning_rate': 2.687798523664785e-05, 'epoch': 1.43}


 48%|████▊     | 9030/18924 [4:56:26<5:05:16,  1.85s/it]

{'loss': 0.7886, 'grad_norm': 7.5528106689453125, 'learning_rate': 2.6850846721667395e-05, 'epoch': 1.43}


 48%|████▊     | 9040/18924 [4:56:46<5:30:42,  2.01s/it]

{'loss': 0.6602, 'grad_norm': 10.716526985168457, 'learning_rate': 2.6823708206686933e-05, 'epoch': 1.43}


 48%|████▊     | 9050/18924 [4:57:06<5:32:45,  2.02s/it]

{'loss': 0.6829, 'grad_norm': 6.302693843841553, 'learning_rate': 2.679656969170647e-05, 'epoch': 1.43}


 48%|████▊     | 9060/18924 [4:57:25<5:03:35,  1.85s/it]

{'loss': 0.7577, 'grad_norm': 7.84568977355957, 'learning_rate': 2.676943117672601e-05, 'epoch': 1.44}


 48%|████▊     | 9070/18924 [4:57:43<4:55:34,  1.80s/it]

{'loss': 0.8557, 'grad_norm': 6.119560241699219, 'learning_rate': 2.674229266174555e-05, 'epoch': 1.44}


 48%|████▊     | 9080/18924 [4:58:01<4:54:32,  1.80s/it]

{'loss': 0.6021, 'grad_norm': 7.312753677368164, 'learning_rate': 2.6715154146765093e-05, 'epoch': 1.44}


 48%|████▊     | 9090/18924 [4:58:19<5:00:45,  1.84s/it]

{'loss': 0.6217, 'grad_norm': 7.567080497741699, 'learning_rate': 2.668801563178463e-05, 'epoch': 1.44}


 48%|████▊     | 9100/18924 [4:58:38<5:03:43,  1.85s/it]

{'loss': 0.5724, 'grad_norm': 11.919299125671387, 'learning_rate': 2.6660877116804168e-05, 'epoch': 1.44}


 48%|████▊     | 9110/18924 [4:58:57<5:17:58,  1.94s/it]

{'loss': 0.6865, 'grad_norm': 9.979443550109863, 'learning_rate': 2.663373860182371e-05, 'epoch': 1.44}


 48%|████▊     | 9120/18924 [4:59:17<5:33:54,  2.04s/it]

{'loss': 0.5586, 'grad_norm': 2.6987757682800293, 'learning_rate': 2.6606600086843246e-05, 'epoch': 1.45}


 48%|████▊     | 9130/18924 [4:59:37<5:24:49,  1.99s/it]

{'loss': 0.7461, 'grad_norm': 9.873866081237793, 'learning_rate': 2.657946157186279e-05, 'epoch': 1.45}


 48%|████▊     | 9140/18924 [4:59:56<4:59:42,  1.84s/it]

{'loss': 0.6248, 'grad_norm': 11.521553993225098, 'learning_rate': 2.6552323056882328e-05, 'epoch': 1.45}


 48%|████▊     | 9150/18924 [5:00:14<4:52:20,  1.79s/it]

{'loss': 0.7118, 'grad_norm': 11.1720552444458, 'learning_rate': 2.6525184541901872e-05, 'epoch': 1.45}


 48%|████▊     | 9160/18924 [5:00:32<4:50:55,  1.79s/it]

{'loss': 0.8018, 'grad_norm': 12.768312454223633, 'learning_rate': 2.649804602692141e-05, 'epoch': 1.45}


 48%|████▊     | 9170/18924 [5:00:50<4:54:10,  1.81s/it]

{'loss': 0.624, 'grad_norm': 7.205874919891357, 'learning_rate': 2.6470907511940947e-05, 'epoch': 1.45}


 49%|████▊     | 9180/18924 [5:01:08<4:55:52,  1.82s/it]

{'loss': 0.7711, 'grad_norm': 9.592001914978027, 'learning_rate': 2.6443768996960488e-05, 'epoch': 1.46}


 49%|████▊     | 9190/18924 [5:01:27<5:01:23,  1.86s/it]

{'loss': 0.6771, 'grad_norm': 5.222733020782471, 'learning_rate': 2.6416630481980025e-05, 'epoch': 1.46}


 49%|████▊     | 9200/18924 [5:01:45<4:56:46,  1.83s/it]

{'loss': 0.6502, 'grad_norm': 15.114282608032227, 'learning_rate': 2.638949196699957e-05, 'epoch': 1.46}


 49%|████▊     | 9210/18924 [5:02:03<4:53:14,  1.81s/it]

{'loss': 0.8318, 'grad_norm': 6.752763748168945, 'learning_rate': 2.6362353452019107e-05, 'epoch': 1.46}


 49%|████▊     | 9220/18924 [5:02:22<4:52:46,  1.81s/it]

{'loss': 0.692, 'grad_norm': 12.01680850982666, 'learning_rate': 2.6335214937038648e-05, 'epoch': 1.46}


 49%|████▉     | 9230/18924 [5:02:40<4:55:01,  1.83s/it]

{'loss': 0.5688, 'grad_norm': 11.89796257019043, 'learning_rate': 2.6308076422058185e-05, 'epoch': 1.46}


 49%|████▉     | 9240/18924 [5:02:58<4:56:39,  1.84s/it]

{'loss': 0.4724, 'grad_norm': 10.486393928527832, 'learning_rate': 2.6280937907077723e-05, 'epoch': 1.46}


 49%|████▉     | 9250/18924 [5:03:17<5:09:48,  1.92s/it]

{'loss': 0.7222, 'grad_norm': 12.985721588134766, 'learning_rate': 2.6253799392097267e-05, 'epoch': 1.47}


 49%|████▉     | 9260/18924 [5:03:37<5:29:55,  2.05s/it]

{'loss': 0.7696, 'grad_norm': 8.297370910644531, 'learning_rate': 2.6226660877116804e-05, 'epoch': 1.47}


 49%|████▉     | 9270/18924 [5:03:58<5:31:41,  2.06s/it]

{'loss': 0.6628, 'grad_norm': 6.387478828430176, 'learning_rate': 2.619952236213635e-05, 'epoch': 1.47}


 49%|████▉     | 9280/18924 [5:04:18<5:10:23,  1.93s/it]

{'loss': 0.7231, 'grad_norm': 26.196264266967773, 'learning_rate': 2.6172383847155886e-05, 'epoch': 1.47}


 49%|████▉     | 9290/18924 [5:04:36<4:55:40,  1.84s/it]

{'loss': 0.6684, 'grad_norm': 6.9191436767578125, 'learning_rate': 2.6145245332175424e-05, 'epoch': 1.47}


 49%|████▉     | 9300/18924 [5:04:55<5:00:52,  1.88s/it]

{'loss': 0.7123, 'grad_norm': 14.354586601257324, 'learning_rate': 2.6118106817194964e-05, 'epoch': 1.47}


 49%|████▉     | 9310/18924 [5:05:14<5:08:15,  1.92s/it]

{'loss': 0.7609, 'grad_norm': 9.552409172058105, 'learning_rate': 2.6090968302214502e-05, 'epoch': 1.48}


 49%|████▉     | 9320/18924 [5:05:33<4:57:05,  1.86s/it]

{'loss': 0.4943, 'grad_norm': 8.421201705932617, 'learning_rate': 2.6063829787234046e-05, 'epoch': 1.48}


 49%|████▉     | 9330/18924 [5:05:52<5:19:03,  2.00s/it]

{'loss': 0.6015, 'grad_norm': 6.960545063018799, 'learning_rate': 2.6036691272253584e-05, 'epoch': 1.48}


 49%|████▉     | 9340/18924 [5:06:13<5:21:46,  2.01s/it]

{'loss': 0.6421, 'grad_norm': 10.130209922790527, 'learning_rate': 2.6009552757273125e-05, 'epoch': 1.48}


 49%|████▉     | 9350/18924 [5:06:32<5:05:42,  1.92s/it]

{'loss': 0.7147, 'grad_norm': 10.52669620513916, 'learning_rate': 2.5982414242292662e-05, 'epoch': 1.48}


 49%|████▉     | 9360/18924 [5:06:50<4:49:11,  1.81s/it]

{'loss': 0.6658, 'grad_norm': 13.153632164001465, 'learning_rate': 2.59552757273122e-05, 'epoch': 1.48}


 50%|████▉     | 9370/18924 [5:07:09<4:48:10,  1.81s/it]

{'loss': 0.6772, 'grad_norm': 6.297703266143799, 'learning_rate': 2.5928137212331744e-05, 'epoch': 1.49}


 50%|████▉     | 9380/18924 [5:07:27<4:50:15,  1.82s/it]

{'loss': 0.7112, 'grad_norm': 9.691349029541016, 'learning_rate': 2.590099869735128e-05, 'epoch': 1.49}


 50%|████▉     | 9390/18924 [5:07:45<4:52:49,  1.84s/it]

{'loss': 0.6594, 'grad_norm': 11.350140571594238, 'learning_rate': 2.5873860182370825e-05, 'epoch': 1.49}


 50%|████▉     | 9400/18924 [5:08:04<4:54:45,  1.86s/it]

{'loss': 0.6573, 'grad_norm': 10.455368041992188, 'learning_rate': 2.584672166739036e-05, 'epoch': 1.49}


 50%|████▉     | 9410/18924 [5:08:22<4:55:14,  1.86s/it]

{'loss': 0.5859, 'grad_norm': 6.3922295570373535, 'learning_rate': 2.5819583152409904e-05, 'epoch': 1.49}


 50%|████▉     | 9420/18924 [5:08:41<4:51:48,  1.84s/it]

{'loss': 0.6445, 'grad_norm': 8.658921241760254, 'learning_rate': 2.579244463742944e-05, 'epoch': 1.49}


 50%|████▉     | 9430/18924 [5:09:00<5:03:30,  1.92s/it]

{'loss': 0.6105, 'grad_norm': 3.8783340454101562, 'learning_rate': 2.576530612244898e-05, 'epoch': 1.49}


 50%|████▉     | 9440/18924 [5:09:19<5:08:11,  1.95s/it]

{'loss': 0.7343, 'grad_norm': 5.874997138977051, 'learning_rate': 2.5738167607468523e-05, 'epoch': 1.5}


 50%|████▉     | 9450/18924 [5:09:39<5:21:34,  2.04s/it]

{'loss': 0.7203, 'grad_norm': 9.334250450134277, 'learning_rate': 2.571102909248806e-05, 'epoch': 1.5}


 50%|████▉     | 9460/18924 [5:10:00<5:20:50,  2.03s/it]

{'loss': 0.7199, 'grad_norm': 13.135369300842285, 'learning_rate': 2.56838905775076e-05, 'epoch': 1.5}


 50%|█████     | 9470/18924 [5:10:20<5:15:48,  2.00s/it]

{'loss': 0.5687, 'grad_norm': 4.183157920837402, 'learning_rate': 2.565675206252714e-05, 'epoch': 1.5}


 50%|█████     | 9480/18924 [5:10:40<5:15:49,  2.01s/it]

{'loss': 0.6776, 'grad_norm': 9.193885803222656, 'learning_rate': 2.5629613547546676e-05, 'epoch': 1.5}


 50%|█████     | 9490/18924 [5:11:00<5:11:16,  1.98s/it]

{'loss': 0.8746, 'grad_norm': 8.1482572555542, 'learning_rate': 2.560247503256622e-05, 'epoch': 1.5}


 50%|█████     | 9500/18924 [5:11:19<5:10:42,  1.98s/it]

{'loss': 0.7695, 'grad_norm': 11.434576988220215, 'learning_rate': 2.5575336517585758e-05, 'epoch': 1.51}


 50%|█████     | 9510/18924 [5:11:40<5:08:46,  1.97s/it]

{'loss': 0.7378, 'grad_norm': 7.834637641906738, 'learning_rate': 2.55481980026053e-05, 'epoch': 1.51}


 50%|█████     | 9520/18924 [5:11:58<4:43:54,  1.81s/it]

{'loss': 0.7579, 'grad_norm': 8.434929847717285, 'learning_rate': 2.5521059487624836e-05, 'epoch': 1.51}


 50%|█████     | 9530/18924 [5:12:17<4:44:54,  1.82s/it]

{'loss': 0.7897, 'grad_norm': 6.929574966430664, 'learning_rate': 2.549392097264438e-05, 'epoch': 1.51}


 50%|█████     | 9540/18924 [5:12:35<4:54:34,  1.88s/it]

{'loss': 0.723, 'grad_norm': 10.170499801635742, 'learning_rate': 2.5466782457663918e-05, 'epoch': 1.51}


 50%|█████     | 9550/18924 [5:12:55<5:16:00,  2.02s/it]

{'loss': 0.5866, 'grad_norm': 9.557890892028809, 'learning_rate': 2.5439643942683456e-05, 'epoch': 1.51}


 51%|█████     | 9560/18924 [5:13:14<4:49:22,  1.85s/it]

{'loss': 0.6737, 'grad_norm': 20.90595054626465, 'learning_rate': 2.5412505427703e-05, 'epoch': 1.52}


 51%|█████     | 9570/18924 [5:13:32<4:42:40,  1.81s/it]

{'loss': 0.5828, 'grad_norm': 8.123701095581055, 'learning_rate': 2.5385366912722537e-05, 'epoch': 1.52}


 51%|█████     | 9580/18924 [5:13:51<5:01:36,  1.94s/it]

{'loss': 0.8176, 'grad_norm': 7.680830001831055, 'learning_rate': 2.5358228397742078e-05, 'epoch': 1.52}


 51%|█████     | 9590/18924 [5:14:12<5:17:43,  2.04s/it]

{'loss': 0.622, 'grad_norm': 8.722038269042969, 'learning_rate': 2.5331089882761616e-05, 'epoch': 1.52}


 51%|█████     | 9600/18924 [5:14:31<4:51:31,  1.88s/it]

{'loss': 0.6009, 'grad_norm': 9.306391716003418, 'learning_rate': 2.5303951367781153e-05, 'epoch': 1.52}


 51%|█████     | 9610/18924 [5:14:50<5:01:30,  1.94s/it]

{'loss': 0.5794, 'grad_norm': 10.162379264831543, 'learning_rate': 2.5276812852800697e-05, 'epoch': 1.52}


 51%|█████     | 9620/18924 [5:15:10<5:16:31,  2.04s/it]

{'loss': 0.709, 'grad_norm': 12.80723762512207, 'learning_rate': 2.5249674337820235e-05, 'epoch': 1.53}


 51%|█████     | 9630/18924 [5:15:30<5:05:47,  1.97s/it]

{'loss': 0.5535, 'grad_norm': 7.677685260772705, 'learning_rate': 2.5222535822839776e-05, 'epoch': 1.53}


 51%|█████     | 9640/18924 [5:15:49<4:39:54,  1.81s/it]

{'loss': 0.5637, 'grad_norm': 7.6917805671691895, 'learning_rate': 2.5195397307859313e-05, 'epoch': 1.53}


 51%|█████     | 9650/18924 [5:16:07<4:37:37,  1.80s/it]

{'loss': 0.6581, 'grad_norm': 6.6919989585876465, 'learning_rate': 2.5168258792878857e-05, 'epoch': 1.53}


 51%|█████     | 9660/18924 [5:16:25<4:44:45,  1.84s/it]

{'loss': 0.5978, 'grad_norm': 10.530600547790527, 'learning_rate': 2.5141120277898395e-05, 'epoch': 1.53}


 51%|█████     | 9670/18924 [5:16:44<4:46:06,  1.86s/it]

{'loss': 0.6418, 'grad_norm': 12.948250770568848, 'learning_rate': 2.5113981762917932e-05, 'epoch': 1.53}


 51%|█████     | 9680/18924 [5:17:02<4:49:12,  1.88s/it]

{'loss': 0.7633, 'grad_norm': 9.109952926635742, 'learning_rate': 2.5086843247937477e-05, 'epoch': 1.53}


 51%|█████     | 9690/18924 [5:17:22<5:09:24,  2.01s/it]

{'loss': 0.5427, 'grad_norm': 8.772150039672852, 'learning_rate': 2.505970473295701e-05, 'epoch': 1.54}


 51%|█████▏    | 9700/18924 [5:17:43<5:22:21,  2.10s/it]

{'loss': 0.6982, 'grad_norm': 10.41414737701416, 'learning_rate': 2.5032566217976555e-05, 'epoch': 1.54}


 51%|█████▏    | 9710/18924 [5:18:03<5:08:36,  2.01s/it]

{'loss': 0.5495, 'grad_norm': 6.475250244140625, 'learning_rate': 2.5005427702996092e-05, 'epoch': 1.54}


 51%|█████▏    | 9720/18924 [5:18:22<4:47:19,  1.87s/it]

{'loss': 0.6291, 'grad_norm': 14.58350658416748, 'learning_rate': 2.4978289188015633e-05, 'epoch': 1.54}


 51%|█████▏    | 9730/18924 [5:18:42<4:58:53,  1.95s/it]

{'loss': 0.7112, 'grad_norm': 14.25300407409668, 'learning_rate': 2.4951150673035174e-05, 'epoch': 1.54}


 51%|█████▏    | 9740/18924 [5:19:01<5:00:07,  1.96s/it]

{'loss': 0.5875, 'grad_norm': 9.940378189086914, 'learning_rate': 2.492401215805471e-05, 'epoch': 1.54}


 52%|█████▏    | 9750/18924 [5:19:21<4:58:52,  1.95s/it]

{'loss': 0.7162, 'grad_norm': 10.10612678527832, 'learning_rate': 2.4896873643074252e-05, 'epoch': 1.55}


 52%|█████▏    | 9760/18924 [5:19:40<4:49:46,  1.90s/it]

{'loss': 0.8723, 'grad_norm': 7.664809226989746, 'learning_rate': 2.486973512809379e-05, 'epoch': 1.55}


 52%|█████▏    | 9770/18924 [5:20:00<5:08:34,  2.02s/it]

{'loss': 0.6606, 'grad_norm': 8.110477447509766, 'learning_rate': 2.484259661311333e-05, 'epoch': 1.55}


 52%|█████▏    | 9780/18924 [5:20:20<5:09:40,  2.03s/it]

{'loss': 0.584, 'grad_norm': 14.165217399597168, 'learning_rate': 2.481545809813287e-05, 'epoch': 1.55}


 52%|█████▏    | 9790/18924 [5:20:40<4:54:12,  1.93s/it]

{'loss': 0.7702, 'grad_norm': 7.471515655517578, 'learning_rate': 2.4788319583152412e-05, 'epoch': 1.55}


 52%|█████▏    | 9800/18924 [5:20:59<4:45:06,  1.87s/it]

{'loss': 0.4877, 'grad_norm': 5.1365790367126465, 'learning_rate': 2.476118106817195e-05, 'epoch': 1.55}


 52%|█████▏    | 9810/18924 [5:21:19<5:05:58,  2.01s/it]

{'loss': 0.8006, 'grad_norm': 12.323173522949219, 'learning_rate': 2.473404255319149e-05, 'epoch': 1.56}


 52%|█████▏    | 9820/18924 [5:21:39<5:08:24,  2.03s/it]

{'loss': 0.5494, 'grad_norm': 7.198905944824219, 'learning_rate': 2.4706904038211028e-05, 'epoch': 1.56}


 52%|█████▏    | 9830/18924 [5:21:58<4:46:13,  1.89s/it]

{'loss': 0.6264, 'grad_norm': 9.190592765808105, 'learning_rate': 2.467976552323057e-05, 'epoch': 1.56}


 52%|█████▏    | 9840/18924 [5:22:17<4:40:14,  1.85s/it]

{'loss': 0.7343, 'grad_norm': 11.274169921875, 'learning_rate': 2.465262700825011e-05, 'epoch': 1.56}


 52%|█████▏    | 9850/18924 [5:22:36<5:00:30,  1.99s/it]

{'loss': 0.6821, 'grad_norm': 9.769283294677734, 'learning_rate': 2.462548849326965e-05, 'epoch': 1.56}


 52%|█████▏    | 9860/18924 [5:22:57<5:15:58,  2.09s/it]

{'loss': 0.5363, 'grad_norm': 12.230236053466797, 'learning_rate': 2.459834997828919e-05, 'epoch': 1.56}


 52%|█████▏    | 9870/18924 [5:23:17<4:46:09,  1.90s/it]

{'loss': 0.7027, 'grad_norm': 10.18307113647461, 'learning_rate': 2.457121146330873e-05, 'epoch': 1.56}


 52%|█████▏    | 9880/18924 [5:23:35<4:47:46,  1.91s/it]

{'loss': 0.5575, 'grad_norm': 2.7747159004211426, 'learning_rate': 2.454407294832827e-05, 'epoch': 1.57}


 52%|█████▏    | 9890/18924 [5:23:56<5:06:24,  2.04s/it]

{'loss': 0.8089, 'grad_norm': 8.625853538513184, 'learning_rate': 2.4516934433347808e-05, 'epoch': 1.57}


 52%|█████▏    | 9900/18924 [5:24:16<5:07:42,  2.05s/it]

{'loss': 0.6209, 'grad_norm': 10.3844633102417, 'learning_rate': 2.448979591836735e-05, 'epoch': 1.57}


 52%|█████▏    | 9910/18924 [5:24:35<4:41:04,  1.87s/it]

{'loss': 0.8731, 'grad_norm': 11.663232803344727, 'learning_rate': 2.446265740338689e-05, 'epoch': 1.57}


 52%|█████▏    | 9920/18924 [5:24:54<4:54:00,  1.96s/it]

{'loss': 0.8572, 'grad_norm': 12.62566089630127, 'learning_rate': 2.4435518888406427e-05, 'epoch': 1.57}


 52%|█████▏    | 9930/18924 [5:25:15<5:10:20,  2.07s/it]

{'loss': 0.6194, 'grad_norm': 8.716202735900879, 'learning_rate': 2.4408380373425968e-05, 'epoch': 1.57}


 53%|█████▎    | 9940/18924 [5:25:36<5:07:55,  2.06s/it]

{'loss': 0.6509, 'grad_norm': 7.00743293762207, 'learning_rate': 2.438124185844551e-05, 'epoch': 1.58}


 53%|█████▎    | 9950/18924 [5:25:55<4:38:09,  1.86s/it]

{'loss': 0.603, 'grad_norm': 6.584665775299072, 'learning_rate': 2.4354103343465046e-05, 'epoch': 1.58}


 53%|█████▎    | 9960/18924 [5:26:14<4:55:38,  1.98s/it]

{'loss': 0.6828, 'grad_norm': 14.169549942016602, 'learning_rate': 2.4326964828484587e-05, 'epoch': 1.58}


 53%|█████▎    | 9970/18924 [5:26:34<5:02:37,  2.03s/it]

{'loss': 0.6336, 'grad_norm': 9.063117980957031, 'learning_rate': 2.4299826313504128e-05, 'epoch': 1.58}


 53%|█████▎    | 9980/18924 [5:26:55<5:04:21,  2.04s/it]

{'loss': 0.7971, 'grad_norm': 8.758574485778809, 'learning_rate': 2.4272687798523665e-05, 'epoch': 1.58}


 53%|█████▎    | 9990/18924 [5:27:13<4:34:54,  1.85s/it]

{'loss': 0.5071, 'grad_norm': 5.170801162719727, 'learning_rate': 2.4245549283543206e-05, 'epoch': 1.58}


 53%|█████▎    | 10000/18924 [5:27:33<4:55:45,  1.99s/it]

{'loss': 0.4719, 'grad_norm': 8.506902694702148, 'learning_rate': 2.4218410768562747e-05, 'epoch': 1.59}


 53%|█████▎    | 10010/18924 [5:27:54<5:05:39,  2.06s/it]

{'loss': 0.5955, 'grad_norm': 9.313153266906738, 'learning_rate': 2.4191272253582284e-05, 'epoch': 1.59}


 53%|█████▎    | 10020/18924 [5:28:14<4:51:55,  1.97s/it]

{'loss': 0.6288, 'grad_norm': 10.249340057373047, 'learning_rate': 2.4164133738601825e-05, 'epoch': 1.59}


 53%|█████▎    | 10030/18924 [5:28:33<4:32:24,  1.84s/it]

{'loss': 0.5456, 'grad_norm': 9.31839656829834, 'learning_rate': 2.4136995223621363e-05, 'epoch': 1.59}


 53%|█████▎    | 10040/18924 [5:28:52<4:57:38,  2.01s/it]

{'loss': 0.6701, 'grad_norm': 11.300721168518066, 'learning_rate': 2.4109856708640904e-05, 'epoch': 1.59}


 53%|█████▎    | 10050/18924 [5:29:13<4:57:14,  2.01s/it]

{'loss': 0.7565, 'grad_norm': 8.101909637451172, 'learning_rate': 2.4082718193660444e-05, 'epoch': 1.59}


 53%|█████▎    | 10060/18924 [5:29:32<4:44:54,  1.93s/it]

{'loss': 0.6125, 'grad_norm': 4.427089691162109, 'learning_rate': 2.4055579678679985e-05, 'epoch': 1.59}


 53%|█████▎    | 10070/18924 [5:29:51<4:31:53,  1.84s/it]

{'loss': 0.7252, 'grad_norm': 7.8343281745910645, 'learning_rate': 2.4028441163699523e-05, 'epoch': 1.6}


 53%|█████▎    | 10080/18924 [5:30:10<4:46:51,  1.95s/it]

{'loss': 0.6542, 'grad_norm': 8.752758026123047, 'learning_rate': 2.4001302648719064e-05, 'epoch': 1.6}


 53%|█████▎    | 10090/18924 [5:30:30<4:55:28,  2.01s/it]

{'loss': 0.5816, 'grad_norm': 16.29355812072754, 'learning_rate': 2.39741641337386e-05, 'epoch': 1.6}


 53%|█████▎    | 10100/18924 [5:30:50<4:43:35,  1.93s/it]

{'loss': 0.7751, 'grad_norm': 11.082197189331055, 'learning_rate': 2.3947025618758142e-05, 'epoch': 1.6}


 53%|█████▎    | 10110/18924 [5:31:08<4:33:41,  1.86s/it]

{'loss': 0.6715, 'grad_norm': 11.007089614868164, 'learning_rate': 2.3919887103777683e-05, 'epoch': 1.6}


 53%|█████▎    | 10120/18924 [5:31:28<4:46:34,  1.95s/it]

{'loss': 0.5848, 'grad_norm': 6.311676502227783, 'learning_rate': 2.3892748588797224e-05, 'epoch': 1.6}


 54%|█████▎    | 10130/18924 [5:31:47<4:48:33,  1.97s/it]

{'loss': 0.6312, 'grad_norm': 11.195765495300293, 'learning_rate': 2.3865610073816764e-05, 'epoch': 1.61}


 54%|█████▎    | 10140/18924 [5:32:07<4:39:16,  1.91s/it]

{'loss': 0.6162, 'grad_norm': 9.779345512390137, 'learning_rate': 2.3838471558836302e-05, 'epoch': 1.61}


 54%|█████▎    | 10150/18924 [5:32:25<4:30:40,  1.85s/it]

{'loss': 0.6389, 'grad_norm': 8.981499671936035, 'learning_rate': 2.381133304385584e-05, 'epoch': 1.61}


 54%|█████▎    | 10160/18924 [5:32:45<4:48:00,  1.97s/it]

{'loss': 0.6436, 'grad_norm': 5.6376471519470215, 'learning_rate': 2.378419452887538e-05, 'epoch': 1.61}


 54%|█████▎    | 10170/18924 [5:33:05<4:53:26,  2.01s/it]

{'loss': 0.6057, 'grad_norm': 9.1775541305542, 'learning_rate': 2.375705601389492e-05, 'epoch': 1.61}


 54%|█████▍    | 10180/18924 [5:33:24<4:39:28,  1.92s/it]

{'loss': 0.6396, 'grad_norm': 4.986166000366211, 'learning_rate': 2.3729917498914462e-05, 'epoch': 1.61}


 54%|█████▍    | 10190/18924 [5:33:42<4:28:22,  1.84s/it]

{'loss': 0.7522, 'grad_norm': 4.655019283294678, 'learning_rate': 2.3702778983934003e-05, 'epoch': 1.62}


 54%|█████▍    | 10200/18924 [5:34:02<4:45:34,  1.96s/it]

{'loss': 0.6064, 'grad_norm': 10.656230926513672, 'learning_rate': 2.367564046895354e-05, 'epoch': 1.62}


 54%|█████▍    | 10210/18924 [5:34:22<4:52:51,  2.02s/it]

{'loss': 0.6763, 'grad_norm': 12.753074645996094, 'learning_rate': 2.3648501953973078e-05, 'epoch': 1.62}


 54%|█████▍    | 10220/18924 [5:34:41<4:32:36,  1.88s/it]

{'loss': 0.6066, 'grad_norm': 6.833820343017578, 'learning_rate': 2.362136343899262e-05, 'epoch': 1.62}


 54%|█████▍    | 10230/18924 [5:35:00<4:27:06,  1.84s/it]

{'loss': 0.5779, 'grad_norm': 7.799305438995361, 'learning_rate': 2.359422492401216e-05, 'epoch': 1.62}


 54%|█████▍    | 10240/18924 [5:35:19<4:47:46,  1.99s/it]

{'loss': 0.7509, 'grad_norm': 12.848852157592773, 'learning_rate': 2.35670864090317e-05, 'epoch': 1.62}


 54%|█████▍    | 10250/18924 [5:35:39<4:45:50,  1.98s/it]

{'loss': 0.7105, 'grad_norm': 8.26225471496582, 'learning_rate': 2.353994789405124e-05, 'epoch': 1.62}


 54%|█████▍    | 10260/18924 [5:35:58<4:29:16,  1.86s/it]

{'loss': 0.5164, 'grad_norm': 4.239741325378418, 'learning_rate': 2.351280937907078e-05, 'epoch': 1.63}


 54%|█████▍    | 10270/18924 [5:36:16<4:26:23,  1.85s/it]

{'loss': 0.884, 'grad_norm': 12.576591491699219, 'learning_rate': 2.3485670864090316e-05, 'epoch': 1.63}


 54%|█████▍    | 10280/18924 [5:36:36<4:40:52,  1.95s/it]

{'loss': 0.688, 'grad_norm': 9.375370025634766, 'learning_rate': 2.3458532349109857e-05, 'epoch': 1.63}


 54%|█████▍    | 10290/18924 [5:36:56<4:44:30,  1.98s/it]

{'loss': 0.6021, 'grad_norm': 4.3762526512146, 'learning_rate': 2.3431393834129398e-05, 'epoch': 1.63}


 54%|█████▍    | 10300/18924 [5:37:15<4:30:35,  1.88s/it]

{'loss': 0.5495, 'grad_norm': 10.279746055603027, 'learning_rate': 2.340425531914894e-05, 'epoch': 1.63}


 54%|█████▍    | 10310/18924 [5:37:33<4:19:06,  1.80s/it]

{'loss': 0.7145, 'grad_norm': 6.463319778442383, 'learning_rate': 2.3377116804168476e-05, 'epoch': 1.63}


 55%|█████▍    | 10320/18924 [5:37:52<4:30:18,  1.89s/it]

{'loss': 0.6933, 'grad_norm': 11.728730201721191, 'learning_rate': 2.3349978289188017e-05, 'epoch': 1.64}


 55%|█████▍    | 10330/18924 [5:38:11<4:40:56,  1.96s/it]

{'loss': 0.8055, 'grad_norm': 6.911181926727295, 'learning_rate': 2.3322839774207555e-05, 'epoch': 1.64}


 55%|█████▍    | 10340/18924 [5:38:31<4:33:29,  1.91s/it]

{'loss': 0.7273, 'grad_norm': 28.09503936767578, 'learning_rate': 2.3295701259227095e-05, 'epoch': 1.64}


 55%|█████▍    | 10350/18924 [5:38:49<4:26:51,  1.87s/it]

{'loss': 0.5458, 'grad_norm': 6.2599334716796875, 'learning_rate': 2.3268562744246636e-05, 'epoch': 1.64}


 55%|█████▍    | 10360/18924 [5:39:09<4:42:32,  1.98s/it]

{'loss': 0.6163, 'grad_norm': 6.238856792449951, 'learning_rate': 2.3241424229266177e-05, 'epoch': 1.64}


 55%|█████▍    | 10370/18924 [5:39:29<4:45:51,  2.01s/it]

{'loss': 0.8158, 'grad_norm': 11.406081199645996, 'learning_rate': 2.3214285714285715e-05, 'epoch': 1.64}


 55%|█████▍    | 10380/18924 [5:39:48<4:28:03,  1.88s/it]

{'loss': 0.9014, 'grad_norm': 8.686367988586426, 'learning_rate': 2.3187147199305256e-05, 'epoch': 1.65}


 55%|█████▍    | 10390/18924 [5:40:06<4:16:26,  1.80s/it]

{'loss': 0.6701, 'grad_norm': 6.911813735961914, 'learning_rate': 2.3160008684324793e-05, 'epoch': 1.65}


 55%|█████▍    | 10400/18924 [5:40:24<4:16:14,  1.80s/it]

{'loss': 0.5486, 'grad_norm': 6.270071029663086, 'learning_rate': 2.3132870169344334e-05, 'epoch': 1.65}


 55%|█████▌    | 10410/18924 [5:40:42<4:18:49,  1.82s/it]

{'loss': 0.6969, 'grad_norm': 17.831445693969727, 'learning_rate': 2.3105731654363875e-05, 'epoch': 1.65}


 55%|█████▌    | 10420/18924 [5:41:01<4:19:14,  1.83s/it]

{'loss': 0.7102, 'grad_norm': 11.137203216552734, 'learning_rate': 2.3078593139383416e-05, 'epoch': 1.65}


 55%|█████▌    | 10430/18924 [5:41:19<4:21:54,  1.85s/it]

{'loss': 0.6532, 'grad_norm': 10.460644721984863, 'learning_rate': 2.3051454624402953e-05, 'epoch': 1.65}


 55%|█████▌    | 10440/18924 [5:41:38<4:39:00,  1.97s/it]

{'loss': 0.5464, 'grad_norm': 9.008499145507812, 'learning_rate': 2.3024316109422494e-05, 'epoch': 1.66}


 55%|█████▌    | 10450/18924 [5:41:59<4:57:30,  2.11s/it]

{'loss': 0.6295, 'grad_norm': 13.578432083129883, 'learning_rate': 2.299717759444203e-05, 'epoch': 1.66}


 55%|█████▌    | 10460/18924 [5:42:19<4:32:23,  1.93s/it]

{'loss': 0.5288, 'grad_norm': 7.029463291168213, 'learning_rate': 2.2970039079461572e-05, 'epoch': 1.66}


 55%|█████▌    | 10470/18924 [5:42:37<4:13:56,  1.80s/it]

{'loss': 0.66, 'grad_norm': 16.66007423400879, 'learning_rate': 2.2942900564481113e-05, 'epoch': 1.66}


 55%|█████▌    | 10480/18924 [5:42:55<4:13:25,  1.80s/it]

{'loss': 0.8173, 'grad_norm': 10.155426979064941, 'learning_rate': 2.2915762049500654e-05, 'epoch': 1.66}


 55%|█████▌    | 10490/18924 [5:43:13<4:14:26,  1.81s/it]

{'loss': 0.6418, 'grad_norm': 7.4220123291015625, 'learning_rate': 2.288862353452019e-05, 'epoch': 1.66}


 55%|█████▌    | 10500/18924 [5:43:31<4:17:00,  1.83s/it]

{'loss': 0.6289, 'grad_norm': 9.485042572021484, 'learning_rate': 2.2861485019539732e-05, 'epoch': 1.66}


 56%|█████▌    | 10510/18924 [5:43:51<4:22:18,  1.87s/it]

{'loss': 0.5573, 'grad_norm': 13.853887557983398, 'learning_rate': 2.283434650455927e-05, 'epoch': 1.67}


 56%|█████▌    | 10520/18924 [5:44:10<4:21:42,  1.87s/it]

{'loss': 0.6043, 'grad_norm': 12.767070770263672, 'learning_rate': 2.280720798957881e-05, 'epoch': 1.67}


 56%|█████▌    | 10530/18924 [5:44:28<4:20:15,  1.86s/it]

{'loss': 0.5785, 'grad_norm': 7.608387470245361, 'learning_rate': 2.278006947459835e-05, 'epoch': 1.67}


 56%|█████▌    | 10540/18924 [5:44:47<4:18:53,  1.85s/it]

{'loss': 0.7225, 'grad_norm': 8.508963584899902, 'learning_rate': 2.2752930959617892e-05, 'epoch': 1.67}


 56%|█████▌    | 10550/18924 [5:45:05<4:20:52,  1.87s/it]

{'loss': 0.7051, 'grad_norm': 10.74875545501709, 'learning_rate': 2.272579244463743e-05, 'epoch': 1.67}


 56%|█████▌    | 10560/18924 [5:45:24<4:20:29,  1.87s/it]

{'loss': 0.723, 'grad_norm': 13.626593589782715, 'learning_rate': 2.269865392965697e-05, 'epoch': 1.67}


 56%|█████▌    | 10570/18924 [5:45:43<4:14:27,  1.83s/it]

{'loss': 0.631, 'grad_norm': 9.79332447052002, 'learning_rate': 2.267151541467651e-05, 'epoch': 1.68}


 56%|█████▌    | 10580/18924 [5:46:01<4:16:06,  1.84s/it]

{'loss': 0.6609, 'grad_norm': 11.645716667175293, 'learning_rate': 2.264437689969605e-05, 'epoch': 1.68}


 56%|█████▌    | 10590/18924 [5:46:19<4:19:17,  1.87s/it]

{'loss': 0.7547, 'grad_norm': 7.295321464538574, 'learning_rate': 2.261723838471559e-05, 'epoch': 1.68}


 56%|█████▌    | 10600/18924 [5:46:38<4:24:04,  1.90s/it]

{'loss': 0.6581, 'grad_norm': 6.442380905151367, 'learning_rate': 2.2590099869735127e-05, 'epoch': 1.68}


 56%|█████▌    | 10610/18924 [5:46:57<4:18:38,  1.87s/it]

{'loss': 0.6405, 'grad_norm': 8.84836196899414, 'learning_rate': 2.2562961354754668e-05, 'epoch': 1.68}


 56%|█████▌    | 10620/18924 [5:47:16<4:13:19,  1.83s/it]

{'loss': 0.7747, 'grad_norm': 11.628666877746582, 'learning_rate': 2.253582283977421e-05, 'epoch': 1.68}


 56%|█████▌    | 10630/18924 [5:47:34<4:17:42,  1.86s/it]

{'loss': 0.533, 'grad_norm': 2.880765438079834, 'learning_rate': 2.250868432479375e-05, 'epoch': 1.69}


 56%|█████▌    | 10640/18924 [5:47:53<4:18:23,  1.87s/it]

{'loss': 0.5254, 'grad_norm': 4.668539524078369, 'learning_rate': 2.2481545809813287e-05, 'epoch': 1.69}


 56%|█████▋    | 10650/18924 [5:48:11<4:16:27,  1.86s/it]

{'loss': 0.6926, 'grad_norm': 9.523162841796875, 'learning_rate': 2.2454407294832828e-05, 'epoch': 1.69}


 56%|█████▋    | 10660/18924 [5:48:30<4:12:51,  1.84s/it]

{'loss': 0.7162, 'grad_norm': 9.592811584472656, 'learning_rate': 2.2427268779852366e-05, 'epoch': 1.69}


 56%|█████▋    | 10670/18924 [5:48:49<4:16:28,  1.86s/it]

{'loss': 0.6099, 'grad_norm': 5.406688690185547, 'learning_rate': 2.2400130264871907e-05, 'epoch': 1.69}


 56%|█████▋    | 10680/18924 [5:49:07<4:17:43,  1.88s/it]

{'loss': 0.5618, 'grad_norm': 10.789324760437012, 'learning_rate': 2.2372991749891447e-05, 'epoch': 1.69}


 56%|█████▋    | 10690/18924 [5:49:26<4:26:13,  1.94s/it]

{'loss': 0.6001, 'grad_norm': 11.983152389526367, 'learning_rate': 2.234585323491099e-05, 'epoch': 1.69}


 57%|█████▋    | 10700/18924 [5:49:47<4:51:42,  2.13s/it]

{'loss': 0.6062, 'grad_norm': 8.16004753112793, 'learning_rate': 2.2318714719930526e-05, 'epoch': 1.7}


 57%|█████▋    | 10710/18924 [5:50:07<4:20:20,  1.90s/it]

{'loss': 0.512, 'grad_norm': 8.075678825378418, 'learning_rate': 2.2291576204950067e-05, 'epoch': 1.7}


 57%|█████▋    | 10720/18924 [5:50:25<4:07:53,  1.81s/it]

{'loss': 0.7042, 'grad_norm': 11.468427658081055, 'learning_rate': 2.2264437689969604e-05, 'epoch': 1.7}


 57%|█████▋    | 10730/18924 [5:50:43<4:06:31,  1.81s/it]

{'loss': 0.6558, 'grad_norm': 10.664298057556152, 'learning_rate': 2.2237299174989145e-05, 'epoch': 1.7}


 57%|█████▋    | 10740/18924 [5:51:01<4:10:14,  1.83s/it]

{'loss': 0.8366, 'grad_norm': 11.665529251098633, 'learning_rate': 2.2210160660008686e-05, 'epoch': 1.7}


 57%|█████▋    | 10750/18924 [5:51:20<4:14:46,  1.87s/it]

{'loss': 0.6874, 'grad_norm': 9.873955726623535, 'learning_rate': 2.2183022145028227e-05, 'epoch': 1.7}


 57%|█████▋    | 10760/18924 [5:51:39<4:16:36,  1.89s/it]

{'loss': 0.6804, 'grad_norm': 8.507431983947754, 'learning_rate': 2.2155883630047764e-05, 'epoch': 1.71}


 57%|█████▋    | 10770/18924 [5:51:57<4:15:30,  1.88s/it]

{'loss': 0.8108, 'grad_norm': 5.316370964050293, 'learning_rate': 2.2128745115067305e-05, 'epoch': 1.71}


 57%|█████▋    | 10780/18924 [5:52:16<4:15:35,  1.88s/it]

{'loss': 0.5538, 'grad_norm': 9.54482364654541, 'learning_rate': 2.2101606600086843e-05, 'epoch': 1.71}


 57%|█████▋    | 10790/18924 [5:52:35<4:10:35,  1.85s/it]

{'loss': 0.5177, 'grad_norm': 10.797871589660645, 'learning_rate': 2.2074468085106383e-05, 'epoch': 1.71}


 57%|█████▋    | 10800/18924 [5:52:53<4:08:19,  1.83s/it]

{'loss': 0.8828, 'grad_norm': 10.162697792053223, 'learning_rate': 2.2047329570125924e-05, 'epoch': 1.71}


 57%|█████▋    | 10810/18924 [5:53:12<4:09:17,  1.84s/it]

{'loss': 0.5731, 'grad_norm': 8.320594787597656, 'learning_rate': 2.2020191055145465e-05, 'epoch': 1.71}


 57%|█████▋    | 10820/18924 [5:53:30<4:08:23,  1.84s/it]

{'loss': 0.5228, 'grad_norm': 6.98752498626709, 'learning_rate': 2.1993052540165006e-05, 'epoch': 1.72}


 57%|█████▋    | 10830/18924 [5:53:48<4:06:02,  1.82s/it]

{'loss': 0.6716, 'grad_norm': 8.095675468444824, 'learning_rate': 2.1965914025184543e-05, 'epoch': 1.72}


 57%|█████▋    | 10840/18924 [5:54:07<4:10:05,  1.86s/it]

{'loss': 0.6763, 'grad_norm': 10.46946907043457, 'learning_rate': 2.193877551020408e-05, 'epoch': 1.72}


 57%|█████▋    | 10850/18924 [5:54:26<4:10:21,  1.86s/it]

{'loss': 0.7209, 'grad_norm': 7.612987995147705, 'learning_rate': 2.1911636995223622e-05, 'epoch': 1.72}


 57%|█████▋    | 10860/18924 [5:54:44<4:06:38,  1.84s/it]

{'loss': 0.6783, 'grad_norm': 13.867147445678711, 'learning_rate': 2.1884498480243163e-05, 'epoch': 1.72}


 57%|█████▋    | 10870/18924 [5:55:02<4:05:48,  1.83s/it]

{'loss': 0.5679, 'grad_norm': 6.5608954429626465, 'learning_rate': 2.1857359965262704e-05, 'epoch': 1.72}


 57%|█████▋    | 10880/18924 [5:55:21<4:09:14,  1.86s/it]

{'loss': 0.5958, 'grad_norm': 7.25145959854126, 'learning_rate': 2.1830221450282244e-05, 'epoch': 1.72}


 58%|█████▊    | 10890/18924 [5:55:39<4:05:32,  1.83s/it]

{'loss': 0.6555, 'grad_norm': 7.296473979949951, 'learning_rate': 2.180308293530178e-05, 'epoch': 1.73}


 58%|█████▊    | 10900/18924 [5:55:57<4:02:16,  1.81s/it]

{'loss': 0.6926, 'grad_norm': 6.043830871582031, 'learning_rate': 2.177594442032132e-05, 'epoch': 1.73}


 58%|█████▊    | 10910/18924 [5:56:16<4:03:57,  1.83s/it]

{'loss': 0.8035, 'grad_norm': 6.272878646850586, 'learning_rate': 2.174880590534086e-05, 'epoch': 1.73}


 58%|█████▊    | 10920/18924 [5:56:34<4:10:20,  1.88s/it]

{'loss': 0.6039, 'grad_norm': 14.281058311462402, 'learning_rate': 2.17216673903604e-05, 'epoch': 1.73}


 58%|█████▊    | 10930/18924 [5:56:53<4:10:07,  1.88s/it]

{'loss': 0.606, 'grad_norm': 4.8592352867126465, 'learning_rate': 2.1694528875379942e-05, 'epoch': 1.73}


 58%|█████▊    | 10940/18924 [5:57:11<4:06:27,  1.85s/it]

{'loss': 0.6498, 'grad_norm': 9.75775146484375, 'learning_rate': 2.166739036039948e-05, 'epoch': 1.73}


 58%|█████▊    | 10950/18924 [5:57:30<4:05:17,  1.85s/it]

{'loss': 0.7168, 'grad_norm': 6.85038948059082, 'learning_rate': 2.1640251845419017e-05, 'epoch': 1.74}


 58%|█████▊    | 10960/18924 [5:57:48<4:05:20,  1.85s/it]

{'loss': 0.666, 'grad_norm': 11.262578010559082, 'learning_rate': 2.1613113330438558e-05, 'epoch': 1.74}


 58%|█████▊    | 10970/18924 [5:58:07<4:01:40,  1.82s/it]

{'loss': 0.7752, 'grad_norm': 8.279878616333008, 'learning_rate': 2.15859748154581e-05, 'epoch': 1.74}


 58%|█████▊    | 10980/18924 [5:58:25<4:02:23,  1.83s/it]

{'loss': 0.6235, 'grad_norm': 19.54446792602539, 'learning_rate': 2.155883630047764e-05, 'epoch': 1.74}


 58%|█████▊    | 10990/18924 [5:58:43<4:04:40,  1.85s/it]

{'loss': 0.5616, 'grad_norm': 6.33365535736084, 'learning_rate': 2.153169778549718e-05, 'epoch': 1.74}


 58%|█████▊    | 11000/18924 [5:59:02<4:02:31,  1.84s/it]

{'loss': 0.6051, 'grad_norm': 6.282685279846191, 'learning_rate': 2.1504559270516718e-05, 'epoch': 1.74}


 58%|█████▊    | 11010/18924 [5:59:21<4:03:41,  1.85s/it]

{'loss': 0.5806, 'grad_norm': 11.828359603881836, 'learning_rate': 2.147742075553626e-05, 'epoch': 1.75}


 58%|█████▊    | 11020/18924 [5:59:41<4:24:43,  2.01s/it]

{'loss': 0.5736, 'grad_norm': 11.626011848449707, 'learning_rate': 2.1450282240555796e-05, 'epoch': 1.75}


 58%|█████▊    | 11030/18924 [6:00:02<4:35:14,  2.09s/it]

{'loss': 0.5836, 'grad_norm': 7.973701000213623, 'learning_rate': 2.1423143725575337e-05, 'epoch': 1.75}


 58%|█████▊    | 11040/18924 [6:00:20<4:03:17,  1.85s/it]

{'loss': 0.6717, 'grad_norm': 11.800165176391602, 'learning_rate': 2.1396005210594878e-05, 'epoch': 1.75}


 58%|█████▊    | 11050/18924 [6:00:38<3:56:08,  1.80s/it]

{'loss': 0.6322, 'grad_norm': 6.584733963012695, 'learning_rate': 2.136886669561442e-05, 'epoch': 1.75}


 58%|█████▊    | 11060/18924 [6:00:57<4:13:43,  1.94s/it]

{'loss': 0.5674, 'grad_norm': 10.569130897521973, 'learning_rate': 2.1341728180633956e-05, 'epoch': 1.75}


 58%|█████▊    | 11070/18924 [6:01:17<4:22:20,  2.00s/it]

{'loss': 0.6593, 'grad_norm': 26.07465934753418, 'learning_rate': 2.1314589665653497e-05, 'epoch': 1.75}


 59%|█████▊    | 11080/18924 [6:01:36<3:57:40,  1.82s/it]

{'loss': 0.8607, 'grad_norm': 7.036047458648682, 'learning_rate': 2.1287451150673035e-05, 'epoch': 1.76}


 59%|█████▊    | 11090/18924 [6:01:54<3:54:04,  1.79s/it]

{'loss': 0.7515, 'grad_norm': 2.973050117492676, 'learning_rate': 2.1260312635692575e-05, 'epoch': 1.76}


 59%|█████▊    | 11100/18924 [6:02:13<4:09:55,  1.92s/it]

{'loss': 0.8354, 'grad_norm': 10.224085807800293, 'learning_rate': 2.1233174120712116e-05, 'epoch': 1.76}


 59%|█████▊    | 11110/18924 [6:02:33<4:21:40,  2.01s/it]

{'loss': 0.6209, 'grad_norm': 10.903703689575195, 'learning_rate': 2.1206035605731657e-05, 'epoch': 1.76}


 59%|█████▉    | 11120/18924 [6:02:53<4:17:48,  1.98s/it]

{'loss': 0.5751, 'grad_norm': 6.912240028381348, 'learning_rate': 2.1178897090751195e-05, 'epoch': 1.76}


 59%|█████▉    | 11130/18924 [6:03:12<4:16:45,  1.98s/it]

{'loss': 0.8058, 'grad_norm': 12.639071464538574, 'learning_rate': 2.1151758575770735e-05, 'epoch': 1.76}


 59%|█████▉    | 11140/18924 [6:03:32<4:10:10,  1.93s/it]

{'loss': 0.6343, 'grad_norm': 8.59841251373291, 'learning_rate': 2.1124620060790273e-05, 'epoch': 1.77}


 59%|█████▉    | 11150/18924 [6:03:51<4:11:29,  1.94s/it]

{'loss': 0.6577, 'grad_norm': 7.8004961013793945, 'learning_rate': 2.1097481545809814e-05, 'epoch': 1.77}


 59%|█████▉    | 11160/18924 [6:04:10<4:09:43,  1.93s/it]

{'loss': 0.4641, 'grad_norm': 8.310844421386719, 'learning_rate': 2.1070343030829355e-05, 'epoch': 1.77}


 59%|█████▉    | 11170/18924 [6:04:30<4:09:01,  1.93s/it]

{'loss': 0.749, 'grad_norm': 11.692581176757812, 'learning_rate': 2.1043204515848896e-05, 'epoch': 1.77}


 59%|█████▉    | 11180/18924 [6:04:49<4:09:52,  1.94s/it]

{'loss': 0.7631, 'grad_norm': 9.088786125183105, 'learning_rate': 2.1016066000868433e-05, 'epoch': 1.77}


 59%|█████▉    | 11190/18924 [6:05:08<4:08:55,  1.93s/it]

{'loss': 0.884, 'grad_norm': 10.036473274230957, 'learning_rate': 2.0988927485887974e-05, 'epoch': 1.77}


 59%|█████▉    | 11200/18924 [6:05:28<4:06:57,  1.92s/it]

{'loss': 0.7471, 'grad_norm': 10.321605682373047, 'learning_rate': 2.096178897090751e-05, 'epoch': 1.78}


 59%|█████▉    | 11210/18924 [6:05:47<4:04:46,  1.90s/it]

{'loss': 0.5505, 'grad_norm': 10.117504119873047, 'learning_rate': 2.0934650455927052e-05, 'epoch': 1.78}


 59%|█████▉    | 11220/18924 [6:06:06<4:05:11,  1.91s/it]

{'loss': 0.5607, 'grad_norm': 10.462882041931152, 'learning_rate': 2.0907511940946593e-05, 'epoch': 1.78}


 59%|█████▉    | 11230/18924 [6:06:25<4:06:10,  1.92s/it]

{'loss': 0.5587, 'grad_norm': 9.259751319885254, 'learning_rate': 2.088037342596613e-05, 'epoch': 1.78}


 59%|█████▉    | 11240/18924 [6:06:44<4:01:40,  1.89s/it]

{'loss': 0.6822, 'grad_norm': 12.299565315246582, 'learning_rate': 2.085323491098567e-05, 'epoch': 1.78}


 59%|█████▉    | 11250/18924 [6:07:02<3:52:51,  1.82s/it]

{'loss': 0.683, 'grad_norm': 10.081377983093262, 'learning_rate': 2.0826096396005212e-05, 'epoch': 1.78}


 60%|█████▉    | 11260/18924 [6:07:20<3:51:08,  1.81s/it]

{'loss': 0.643, 'grad_norm': 6.084698677062988, 'learning_rate': 2.0798957881024753e-05, 'epoch': 1.79}


 60%|█████▉    | 11270/18924 [6:07:39<3:54:21,  1.84s/it]

{'loss': 0.7773, 'grad_norm': 12.532052040100098, 'learning_rate': 2.077181936604429e-05, 'epoch': 1.79}


 60%|█████▉    | 11280/18924 [6:07:58<4:05:04,  1.92s/it]

{'loss': 0.7251, 'grad_norm': 7.839629173278809, 'learning_rate': 2.074468085106383e-05, 'epoch': 1.79}


 60%|█████▉    | 11290/18924 [6:08:17<4:06:50,  1.94s/it]

{'loss': 0.5601, 'grad_norm': 13.891594886779785, 'learning_rate': 2.071754233608337e-05, 'epoch': 1.79}


 60%|█████▉    | 11300/18924 [6:08:36<3:59:16,  1.88s/it]

{'loss': 0.6268, 'grad_norm': 8.341917991638184, 'learning_rate': 2.069040382110291e-05, 'epoch': 1.79}


 60%|█████▉    | 11310/18924 [6:08:54<3:48:42,  1.80s/it]

{'loss': 0.6472, 'grad_norm': 8.122676849365234, 'learning_rate': 2.066326530612245e-05, 'epoch': 1.79}


 60%|█████▉    | 11320/18924 [6:09:13<4:00:20,  1.90s/it]

{'loss': 0.7205, 'grad_norm': 10.606128692626953, 'learning_rate': 2.063612679114199e-05, 'epoch': 1.79}


 60%|█████▉    | 11330/18924 [6:09:32<4:04:43,  1.93s/it]

{'loss': 0.6981, 'grad_norm': 7.647772312164307, 'learning_rate': 2.060898827616153e-05, 'epoch': 1.8}


 60%|█████▉    | 11340/18924 [6:09:51<4:02:15,  1.92s/it]

{'loss': 0.5712, 'grad_norm': 9.4077730178833, 'learning_rate': 2.058184976118107e-05, 'epoch': 1.8}


 60%|█████▉    | 11350/18924 [6:10:09<3:46:50,  1.80s/it]

{'loss': 0.5242, 'grad_norm': 7.9910888671875, 'learning_rate': 2.0554711246200607e-05, 'epoch': 1.8}


 60%|██████    | 11360/18924 [6:10:28<4:01:17,  1.91s/it]

{'loss': 0.8337, 'grad_norm': 10.973051071166992, 'learning_rate': 2.0527572731220148e-05, 'epoch': 1.8}


 60%|██████    | 11370/18924 [6:10:47<4:04:08,  1.94s/it]

{'loss': 0.7072, 'grad_norm': 14.314140319824219, 'learning_rate': 2.050043421623969e-05, 'epoch': 1.8}


 60%|██████    | 11380/18924 [6:11:07<4:02:48,  1.93s/it]

{'loss': 0.7525, 'grad_norm': 7.138186454772949, 'learning_rate': 2.047329570125923e-05, 'epoch': 1.8}


 60%|██████    | 11390/18924 [6:11:26<4:01:23,  1.92s/it]

{'loss': 0.6512, 'grad_norm': 13.301982879638672, 'learning_rate': 2.0446157186278767e-05, 'epoch': 1.81}


 60%|██████    | 11400/18924 [6:11:45<3:58:42,  1.90s/it]

{'loss': 0.5687, 'grad_norm': 10.697144508361816, 'learning_rate': 2.0419018671298308e-05, 'epoch': 1.81}


 60%|██████    | 11410/18924 [6:12:04<3:59:31,  1.91s/it]

{'loss': 0.6183, 'grad_norm': 12.195365905761719, 'learning_rate': 2.0391880156317846e-05, 'epoch': 1.81}


 60%|██████    | 11420/18924 [6:12:23<3:57:30,  1.90s/it]

{'loss': 0.6559, 'grad_norm': 8.536419868469238, 'learning_rate': 2.0364741641337387e-05, 'epoch': 1.81}


 60%|██████    | 11430/18924 [6:12:42<3:55:27,  1.89s/it]

{'loss': 0.6562, 'grad_norm': 12.177197456359863, 'learning_rate': 2.0337603126356927e-05, 'epoch': 1.81}


 60%|██████    | 11440/18924 [6:13:00<3:45:04,  1.80s/it]

{'loss': 0.7145, 'grad_norm': 10.511474609375, 'learning_rate': 2.0310464611376468e-05, 'epoch': 1.81}


 61%|██████    | 11450/18924 [6:13:18<3:46:37,  1.82s/it]

{'loss': 0.645, 'grad_norm': 7.061703205108643, 'learning_rate': 2.028332609639601e-05, 'epoch': 1.82}


 61%|██████    | 11460/18924 [6:13:37<3:51:12,  1.86s/it]

{'loss': 0.5456, 'grad_norm': 6.6604461669921875, 'learning_rate': 2.0256187581415547e-05, 'epoch': 1.82}


 61%|██████    | 11470/18924 [6:13:56<3:56:01,  1.90s/it]

{'loss': 0.7229, 'grad_norm': 10.163105964660645, 'learning_rate': 2.0229049066435084e-05, 'epoch': 1.82}


 61%|██████    | 11480/18924 [6:14:15<3:59:45,  1.93s/it]

{'loss': 0.4013, 'grad_norm': 10.491117477416992, 'learning_rate': 2.0201910551454625e-05, 'epoch': 1.82}


 61%|██████    | 11490/18924 [6:14:34<3:57:38,  1.92s/it]

{'loss': 0.5955, 'grad_norm': 8.167170524597168, 'learning_rate': 2.0174772036474166e-05, 'epoch': 1.82}


 61%|██████    | 11500/18924 [6:14:53<3:55:54,  1.91s/it]

{'loss': 0.7038, 'grad_norm': 10.710393905639648, 'learning_rate': 2.0147633521493707e-05, 'epoch': 1.82}


 61%|██████    | 11510/18924 [6:15:13<3:59:53,  1.94s/it]

{'loss': 0.6384, 'grad_norm': 12.42987060546875, 'learning_rate': 2.0120495006513244e-05, 'epoch': 1.82}


 61%|██████    | 11520/18924 [6:15:33<4:01:58,  1.96s/it]

{'loss': 0.6, 'grad_norm': 11.498614311218262, 'learning_rate': 2.009335649153278e-05, 'epoch': 1.83}


 61%|██████    | 11530/18924 [6:15:53<4:05:34,  1.99s/it]

{'loss': 0.7146, 'grad_norm': 11.734414100646973, 'learning_rate': 2.0066217976552322e-05, 'epoch': 1.83}


 61%|██████    | 11540/18924 [6:16:12<3:55:55,  1.92s/it]

{'loss': 0.6172, 'grad_norm': 9.449496269226074, 'learning_rate': 2.0039079461571863e-05, 'epoch': 1.83}


 61%|██████    | 11550/18924 [6:16:31<3:52:52,  1.89s/it]

{'loss': 0.6919, 'grad_norm': 8.532066345214844, 'learning_rate': 2.0011940946591404e-05, 'epoch': 1.83}


 61%|██████    | 11560/18924 [6:16:50<3:52:12,  1.89s/it]

{'loss': 0.5947, 'grad_norm': 4.505870819091797, 'learning_rate': 1.9984802431610945e-05, 'epoch': 1.83}


 61%|██████    | 11570/18924 [6:17:09<3:51:37,  1.89s/it]

{'loss': 0.8423, 'grad_norm': 13.992730140686035, 'learning_rate': 1.9957663916630483e-05, 'epoch': 1.83}


 61%|██████    | 11580/18924 [6:17:28<3:51:10,  1.89s/it]

{'loss': 0.6901, 'grad_norm': 12.218152046203613, 'learning_rate': 1.993052540165002e-05, 'epoch': 1.84}


 61%|██████    | 11590/18924 [6:17:47<3:51:22,  1.89s/it]

{'loss': 0.7435, 'grad_norm': 10.01777172088623, 'learning_rate': 1.990338688666956e-05, 'epoch': 1.84}


 61%|██████▏   | 11600/18924 [6:18:06<3:51:03,  1.89s/it]

{'loss': 0.522, 'grad_norm': 5.2285261154174805, 'learning_rate': 1.9876248371689102e-05, 'epoch': 1.84}


 61%|██████▏   | 11610/18924 [6:18:24<3:47:42,  1.87s/it]

{'loss': 0.6471, 'grad_norm': 9.976082801818848, 'learning_rate': 1.9849109856708643e-05, 'epoch': 1.84}


 61%|██████▏   | 11620/18924 [6:18:42<3:38:08,  1.79s/it]

{'loss': 0.6749, 'grad_norm': 13.416902542114258, 'learning_rate': 1.9821971341728183e-05, 'epoch': 1.84}


 61%|██████▏   | 11630/18924 [6:19:01<3:42:40,  1.83s/it]

{'loss': 0.5448, 'grad_norm': 9.489789962768555, 'learning_rate': 1.979483282674772e-05, 'epoch': 1.84}


 62%|██████▏   | 11640/18924 [6:19:20<3:52:49,  1.92s/it]

{'loss': 0.7065, 'grad_norm': 13.394368171691895, 'learning_rate': 1.976769431176726e-05, 'epoch': 1.85}


 62%|██████▏   | 11650/18924 [6:19:39<3:50:45,  1.90s/it]

{'loss': 0.6388, 'grad_norm': 11.40860366821289, 'learning_rate': 1.97405557967868e-05, 'epoch': 1.85}


 62%|██████▏   | 11660/18924 [6:19:58<3:51:30,  1.91s/it]

{'loss': 0.6069, 'grad_norm': 9.697160720825195, 'learning_rate': 1.971341728180634e-05, 'epoch': 1.85}


 62%|██████▏   | 11670/18924 [6:20:16<3:35:46,  1.78s/it]

{'loss': 0.8148, 'grad_norm': 10.659591674804688, 'learning_rate': 1.968627876682588e-05, 'epoch': 1.85}


 62%|██████▏   | 11680/18924 [6:20:34<3:35:20,  1.78s/it]

{'loss': 0.6701, 'grad_norm': 7.306699752807617, 'learning_rate': 1.9659140251845422e-05, 'epoch': 1.85}


 62%|██████▏   | 11690/18924 [6:20:52<3:46:44,  1.88s/it]

{'loss': 0.6068, 'grad_norm': 9.214208602905273, 'learning_rate': 1.963200173686496e-05, 'epoch': 1.85}


 62%|██████▏   | 11700/18924 [6:21:12<3:54:24,  1.95s/it]

{'loss': 0.651, 'grad_norm': 11.388383865356445, 'learning_rate': 1.96048632218845e-05, 'epoch': 1.85}


 62%|██████▏   | 11710/18924 [6:21:30<3:40:53,  1.84s/it]

{'loss': 0.6828, 'grad_norm': 9.162993431091309, 'learning_rate': 1.9577724706904038e-05, 'epoch': 1.86}


 62%|██████▏   | 11720/18924 [6:21:48<3:38:08,  1.82s/it]

{'loss': 0.6545, 'grad_norm': 8.333948135375977, 'learning_rate': 1.955058619192358e-05, 'epoch': 1.86}


 62%|██████▏   | 11730/18924 [6:22:07<3:45:59,  1.88s/it]

{'loss': 0.7221, 'grad_norm': 13.369390487670898, 'learning_rate': 1.952344767694312e-05, 'epoch': 1.86}


 62%|██████▏   | 11740/18924 [6:22:26<3:47:23,  1.90s/it]

{'loss': 0.5642, 'grad_norm': 6.169476509094238, 'learning_rate': 1.949630916196266e-05, 'epoch': 1.86}


 62%|██████▏   | 11750/18924 [6:22:45<3:37:51,  1.82s/it]

{'loss': 0.7815, 'grad_norm': 10.173213005065918, 'learning_rate': 1.9469170646982198e-05, 'epoch': 1.86}


 62%|██████▏   | 11760/18924 [6:23:03<3:36:58,  1.82s/it]

{'loss': 0.6074, 'grad_norm': 5.821484565734863, 'learning_rate': 1.944203213200174e-05, 'epoch': 1.86}


 62%|██████▏   | 11770/18924 [6:23:22<3:46:00,  1.90s/it]

{'loss': 0.6389, 'grad_norm': 8.162775039672852, 'learning_rate': 1.9414893617021276e-05, 'epoch': 1.87}


 62%|██████▏   | 11780/18924 [6:23:41<3:50:13,  1.93s/it]

{'loss': 0.5666, 'grad_norm': 7.975165367126465, 'learning_rate': 1.9387755102040817e-05, 'epoch': 1.87}


 62%|██████▏   | 11790/18924 [6:24:00<3:49:34,  1.93s/it]

{'loss': 0.5183, 'grad_norm': 8.869599342346191, 'learning_rate': 1.9360616587060358e-05, 'epoch': 1.87}


 62%|██████▏   | 11800/18924 [6:24:19<3:45:09,  1.90s/it]

{'loss': 0.5084, 'grad_norm': 8.60587215423584, 'learning_rate': 1.9333478072079895e-05, 'epoch': 1.87}


 62%|██████▏   | 11810/18924 [6:24:38<3:42:13,  1.87s/it]

{'loss': 0.6377, 'grad_norm': 12.229376792907715, 'learning_rate': 1.9306339557099436e-05, 'epoch': 1.87}


 62%|██████▏   | 11820/18924 [6:24:57<3:44:52,  1.90s/it]

{'loss': 0.6947, 'grad_norm': 2.9077577590942383, 'learning_rate': 1.9279201042118977e-05, 'epoch': 1.87}


 63%|██████▎   | 11830/18924 [6:25:16<3:42:02,  1.88s/it]

{'loss': 0.6577, 'grad_norm': 13.18928050994873, 'learning_rate': 1.9252062527138514e-05, 'epoch': 1.88}


 63%|██████▎   | 11840/18924 [6:25:34<3:41:31,  1.88s/it]

{'loss': 0.6365, 'grad_norm': 12.516119003295898, 'learning_rate': 1.9224924012158055e-05, 'epoch': 1.88}


 63%|██████▎   | 11850/18924 [6:25:53<3:34:56,  1.82s/it]

{'loss': 0.6874, 'grad_norm': 10.73392105102539, 'learning_rate': 1.9197785497177596e-05, 'epoch': 1.88}


 63%|██████▎   | 11860/18924 [6:26:11<3:36:13,  1.84s/it]

{'loss': 1.0131, 'grad_norm': 8.55910587310791, 'learning_rate': 1.9170646982197134e-05, 'epoch': 1.88}


 63%|██████▎   | 11870/18924 [6:26:30<3:36:51,  1.84s/it]

{'loss': 0.4905, 'grad_norm': 7.437764644622803, 'learning_rate': 1.9143508467216674e-05, 'epoch': 1.88}


 63%|██████▎   | 11880/18924 [6:26:48<3:40:38,  1.88s/it]

{'loss': 0.5925, 'grad_norm': 8.745743751525879, 'learning_rate': 1.9116369952236215e-05, 'epoch': 1.88}


 63%|██████▎   | 11890/18924 [6:27:08<3:42:46,  1.90s/it]

{'loss': 0.7071, 'grad_norm': 12.377803802490234, 'learning_rate': 1.9089231437255753e-05, 'epoch': 1.88}


 63%|██████▎   | 11900/18924 [6:27:27<3:42:24,  1.90s/it]

{'loss': 0.7069, 'grad_norm': 12.233858108520508, 'learning_rate': 1.9062092922275294e-05, 'epoch': 1.89}


 63%|██████▎   | 11910/18924 [6:27:45<3:40:33,  1.89s/it]

{'loss': 0.959, 'grad_norm': 15.440157890319824, 'learning_rate': 1.9034954407294835e-05, 'epoch': 1.89}


 63%|██████▎   | 11920/18924 [6:28:04<3:39:35,  1.88s/it]

{'loss': 0.5892, 'grad_norm': 5.925980091094971, 'learning_rate': 1.9007815892314372e-05, 'epoch': 1.89}


 63%|██████▎   | 11930/18924 [6:28:23<3:38:07,  1.87s/it]

{'loss': 0.5695, 'grad_norm': 10.380175590515137, 'learning_rate': 1.8980677377333913e-05, 'epoch': 1.89}


 63%|██████▎   | 11940/18924 [6:28:42<3:35:02,  1.85s/it]

{'loss': 0.6434, 'grad_norm': 7.07132625579834, 'learning_rate': 1.8953538862353454e-05, 'epoch': 1.89}


 63%|██████▎   | 11950/18924 [6:29:00<3:37:09,  1.87s/it]

{'loss': 0.6229, 'grad_norm': 10.743260383605957, 'learning_rate': 1.8926400347372995e-05, 'epoch': 1.89}


 63%|██████▎   | 11960/18924 [6:29:19<3:38:48,  1.89s/it]

{'loss': 0.8639, 'grad_norm': 13.923351287841797, 'learning_rate': 1.8899261832392532e-05, 'epoch': 1.9}


 63%|██████▎   | 11970/18924 [6:29:38<3:37:14,  1.87s/it]

{'loss': 0.7825, 'grad_norm': 11.879140853881836, 'learning_rate': 1.8872123317412073e-05, 'epoch': 1.9}


 63%|██████▎   | 11980/18924 [6:29:57<3:37:20,  1.88s/it]

{'loss': 0.7171, 'grad_norm': 8.999561309814453, 'learning_rate': 1.884498480243161e-05, 'epoch': 1.9}


 63%|██████▎   | 11990/18924 [6:30:16<3:39:22,  1.90s/it]

{'loss': 0.639, 'grad_norm': 15.105729103088379, 'learning_rate': 1.881784628745115e-05, 'epoch': 1.9}


 63%|██████▎   | 12000/18924 [6:30:35<3:37:47,  1.89s/it]

{'loss': 0.6886, 'grad_norm': 7.427853107452393, 'learning_rate': 1.8790707772470692e-05, 'epoch': 1.9}


 63%|██████▎   | 12010/18924 [6:30:53<3:25:34,  1.78s/it]

{'loss': 0.6168, 'grad_norm': 13.337080955505371, 'learning_rate': 1.8763569257490233e-05, 'epoch': 1.9}


 64%|██████▎   | 12020/18924 [6:31:11<3:25:10,  1.78s/it]

{'loss': 0.6589, 'grad_norm': 9.075435638427734, 'learning_rate': 1.873643074250977e-05, 'epoch': 1.91}


 64%|██████▎   | 12030/18924 [6:31:30<3:33:15,  1.86s/it]

{'loss': 0.6905, 'grad_norm': 10.366113662719727, 'learning_rate': 1.870929222752931e-05, 'epoch': 1.91}


 64%|██████▎   | 12040/18924 [6:31:49<3:38:10,  1.90s/it]

{'loss': 0.6504, 'grad_norm': 11.64573860168457, 'learning_rate': 1.868215371254885e-05, 'epoch': 1.91}


 64%|██████▎   | 12050/18924 [6:32:07<3:26:10,  1.80s/it]

{'loss': 0.6532, 'grad_norm': 6.803878307342529, 'learning_rate': 1.865501519756839e-05, 'epoch': 1.91}


 64%|██████▎   | 12060/18924 [6:32:25<3:31:22,  1.85s/it]

{'loss': 0.6853, 'grad_norm': 10.353547096252441, 'learning_rate': 1.862787668258793e-05, 'epoch': 1.91}


 64%|██████▍   | 12070/18924 [6:32:44<3:38:39,  1.91s/it]

{'loss': 0.7584, 'grad_norm': 12.239679336547852, 'learning_rate': 1.860073816760747e-05, 'epoch': 1.91}


 64%|██████▍   | 12080/18924 [6:33:03<3:37:04,  1.90s/it]

{'loss': 0.5009, 'grad_norm': 8.442611694335938, 'learning_rate': 1.857359965262701e-05, 'epoch': 1.92}


 64%|██████▍   | 12090/18924 [6:33:21<3:24:16,  1.79s/it]

{'loss': 0.7565, 'grad_norm': 10.899819374084473, 'learning_rate': 1.8546461137646546e-05, 'epoch': 1.92}


 64%|██████▍   | 12100/18924 [6:33:39<3:26:57,  1.82s/it]

{'loss': 0.6369, 'grad_norm': 11.253303527832031, 'learning_rate': 1.8519322622666087e-05, 'epoch': 1.92}


 64%|██████▍   | 12110/18924 [6:33:58<3:35:43,  1.90s/it]

{'loss': 0.6137, 'grad_norm': 10.150150299072266, 'learning_rate': 1.8492184107685628e-05, 'epoch': 1.92}


 64%|██████▍   | 12120/18924 [6:34:17<3:38:07,  1.92s/it]

{'loss': 0.6014, 'grad_norm': 10.101203918457031, 'learning_rate': 1.846504559270517e-05, 'epoch': 1.92}


 64%|██████▍   | 12130/18924 [6:34:36<3:24:28,  1.81s/it]

{'loss': 0.6771, 'grad_norm': 5.090014934539795, 'learning_rate': 1.843790707772471e-05, 'epoch': 1.92}


 64%|██████▍   | 12140/18924 [6:34:54<3:24:09,  1.81s/it]

{'loss': 0.675, 'grad_norm': 13.541062355041504, 'learning_rate': 1.8410768562744247e-05, 'epoch': 1.92}


 64%|██████▍   | 12150/18924 [6:35:13<3:32:30,  1.88s/it]

{'loss': 0.7081, 'grad_norm': 11.505541801452637, 'learning_rate': 1.8383630047763785e-05, 'epoch': 1.93}


 64%|██████▍   | 12160/18924 [6:35:32<3:33:34,  1.89s/it]

{'loss': 0.6399, 'grad_norm': 10.273524284362793, 'learning_rate': 1.8356491532783326e-05, 'epoch': 1.93}


 64%|██████▍   | 12170/18924 [6:35:50<3:32:15,  1.89s/it]

{'loss': 0.7029, 'grad_norm': 12.141226768493652, 'learning_rate': 1.8329353017802866e-05, 'epoch': 1.93}


 64%|██████▍   | 12180/18924 [6:36:09<3:30:48,  1.88s/it]

{'loss': 0.6207, 'grad_norm': 10.52840805053711, 'learning_rate': 1.8302214502822407e-05, 'epoch': 1.93}


 64%|██████▍   | 12190/18924 [6:36:28<3:29:18,  1.86s/it]

{'loss': 0.7712, 'grad_norm': 9.950876235961914, 'learning_rate': 1.8275075987841948e-05, 'epoch': 1.93}


 64%|██████▍   | 12200/18924 [6:36:46<3:29:13,  1.87s/it]

{'loss': 0.8243, 'grad_norm': 13.076607704162598, 'learning_rate': 1.8247937472861486e-05, 'epoch': 1.93}


 65%|██████▍   | 12210/18924 [6:37:05<3:30:21,  1.88s/it]

{'loss': 0.5707, 'grad_norm': 10.019876480102539, 'learning_rate': 1.8220798957881023e-05, 'epoch': 1.94}


 65%|██████▍   | 12220/18924 [6:37:24<3:29:45,  1.88s/it]

{'loss': 0.7409, 'grad_norm': 14.578267097473145, 'learning_rate': 1.8193660442900564e-05, 'epoch': 1.94}


 65%|██████▍   | 12230/18924 [6:37:43<3:28:06,  1.87s/it]

{'loss': 0.6109, 'grad_norm': 2.9460396766662598, 'learning_rate': 1.8166521927920105e-05, 'epoch': 1.94}


 65%|██████▍   | 12240/18924 [6:38:02<3:29:47,  1.88s/it]

{'loss': 0.6132, 'grad_norm': 7.382153034210205, 'learning_rate': 1.8139383412939646e-05, 'epoch': 1.94}


 65%|██████▍   | 12250/18924 [6:38:20<3:28:06,  1.87s/it]

{'loss': 0.5286, 'grad_norm': 4.156952857971191, 'learning_rate': 1.8112244897959187e-05, 'epoch': 1.94}


 65%|██████▍   | 12260/18924 [6:38:39<3:27:28,  1.87s/it]

{'loss': 0.6281, 'grad_norm': 11.515604972839355, 'learning_rate': 1.8085106382978724e-05, 'epoch': 1.94}


 65%|██████▍   | 12270/18924 [6:38:58<3:25:22,  1.85s/it]

{'loss': 0.5626, 'grad_norm': 9.878168106079102, 'learning_rate': 1.805796786799826e-05, 'epoch': 1.95}


 65%|██████▍   | 12280/18924 [6:39:16<3:26:12,  1.86s/it]

{'loss': 0.6773, 'grad_norm': 14.426140785217285, 'learning_rate': 1.8030829353017802e-05, 'epoch': 1.95}


 65%|██████▍   | 12290/18924 [6:39:35<3:27:46,  1.88s/it]

{'loss': 0.7733, 'grad_norm': 7.875096321105957, 'learning_rate': 1.8003690838037343e-05, 'epoch': 1.95}


 65%|██████▍   | 12300/18924 [6:39:54<3:27:11,  1.88s/it]

{'loss': 0.6158, 'grad_norm': 8.944414138793945, 'learning_rate': 1.7976552323056884e-05, 'epoch': 1.95}


 65%|██████▌   | 12310/18924 [6:40:13<3:28:04,  1.89s/it]

{'loss': 0.6624, 'grad_norm': 3.9482929706573486, 'learning_rate': 1.7949413808076425e-05, 'epoch': 1.95}


 65%|██████▌   | 12320/18924 [6:40:31<3:25:49,  1.87s/it]

{'loss': 0.5907, 'grad_norm': 9.377596855163574, 'learning_rate': 1.7922275293095962e-05, 'epoch': 1.95}


 65%|██████▌   | 12330/18924 [6:40:50<3:21:32,  1.83s/it]

{'loss': 0.4897, 'grad_norm': 6.271770477294922, 'learning_rate': 1.78951367781155e-05, 'epoch': 1.95}


 65%|██████▌   | 12340/18924 [6:41:08<3:17:07,  1.80s/it]

{'loss': 0.5264, 'grad_norm': 12.558279037475586, 'learning_rate': 1.786799826313504e-05, 'epoch': 1.96}


 65%|██████▌   | 12350/18924 [6:41:26<3:16:05,  1.79s/it]

{'loss': 0.5995, 'grad_norm': 13.028286933898926, 'learning_rate': 1.784085974815458e-05, 'epoch': 1.96}


 65%|██████▌   | 12360/18924 [6:41:44<3:18:44,  1.82s/it]

{'loss': 0.7617, 'grad_norm': 12.130392074584961, 'learning_rate': 1.7813721233174123e-05, 'epoch': 1.96}


 65%|██████▌   | 12370/18924 [6:42:03<3:27:12,  1.90s/it]

{'loss': 0.5464, 'grad_norm': 8.267119407653809, 'learning_rate': 1.7786582718193663e-05, 'epoch': 1.96}


 65%|██████▌   | 12380/18924 [6:42:22<3:27:53,  1.91s/it]

{'loss': 0.6447, 'grad_norm': 2.5675368309020996, 'learning_rate': 1.77594442032132e-05, 'epoch': 1.96}


 65%|██████▌   | 12390/18924 [6:42:41<3:28:18,  1.91s/it]

{'loss': 0.6068, 'grad_norm': 5.944419860839844, 'learning_rate': 1.7732305688232742e-05, 'epoch': 1.96}


 66%|██████▌   | 12400/18924 [6:43:00<3:24:42,  1.88s/it]

{'loss': 0.6786, 'grad_norm': 10.941217422485352, 'learning_rate': 1.770516717325228e-05, 'epoch': 1.97}


 66%|██████▌   | 12410/18924 [6:43:19<3:22:35,  1.87s/it]

{'loss': 0.5962, 'grad_norm': 3.0677335262298584, 'learning_rate': 1.767802865827182e-05, 'epoch': 1.97}


 66%|██████▌   | 12420/18924 [6:43:37<3:23:10,  1.87s/it]

{'loss': 0.4705, 'grad_norm': 9.099388122558594, 'learning_rate': 1.765089014329136e-05, 'epoch': 1.97}


 66%|██████▌   | 12430/18924 [6:43:56<3:22:46,  1.87s/it]

{'loss': 0.677, 'grad_norm': 14.583098411560059, 'learning_rate': 1.76237516283109e-05, 'epoch': 1.97}


 66%|██████▌   | 12440/18924 [6:44:15<3:21:50,  1.87s/it]

{'loss': 0.5027, 'grad_norm': 10.639034271240234, 'learning_rate': 1.759661311333044e-05, 'epoch': 1.97}


 66%|██████▌   | 12450/18924 [6:44:34<3:22:22,  1.88s/it]

{'loss': 0.6969, 'grad_norm': 6.788395404815674, 'learning_rate': 1.756947459834998e-05, 'epoch': 1.97}


 66%|██████▌   | 12460/18924 [6:44:52<3:16:31,  1.82s/it]

{'loss': 0.6027, 'grad_norm': 9.245429039001465, 'learning_rate': 1.7542336083369518e-05, 'epoch': 1.98}


 66%|██████▌   | 12470/18924 [6:45:10<3:11:42,  1.78s/it]

{'loss': 0.5828, 'grad_norm': 13.335057258605957, 'learning_rate': 1.751519756838906e-05, 'epoch': 1.98}


 66%|██████▌   | 12480/18924 [6:45:28<3:13:26,  1.80s/it]

{'loss': 0.4743, 'grad_norm': 2.6088497638702393, 'learning_rate': 1.74880590534086e-05, 'epoch': 1.98}


 66%|██████▌   | 12490/18924 [6:45:46<3:14:34,  1.81s/it]

{'loss': 0.7685, 'grad_norm': 11.836308479309082, 'learning_rate': 1.7460920538428137e-05, 'epoch': 1.98}


 66%|██████▌   | 12500/18924 [6:46:04<3:18:24,  1.85s/it]

{'loss': 0.5638, 'grad_norm': 11.107361793518066, 'learning_rate': 1.7433782023447678e-05, 'epoch': 1.98}


 66%|██████▌   | 12510/18924 [6:46:24<3:21:47,  1.89s/it]

{'loss': 0.8115, 'grad_norm': 10.50273323059082, 'learning_rate': 1.740664350846722e-05, 'epoch': 1.98}


 66%|██████▌   | 12520/18924 [6:46:43<3:24:35,  1.92s/it]

{'loss': 0.5948, 'grad_norm': 8.95380687713623, 'learning_rate': 1.7379504993486756e-05, 'epoch': 1.98}


 66%|██████▌   | 12530/18924 [6:47:02<3:21:24,  1.89s/it]

{'loss': 0.6957, 'grad_norm': 10.235382080078125, 'learning_rate': 1.7352366478506297e-05, 'epoch': 1.99}


 66%|██████▋   | 12540/18924 [6:47:21<3:17:53,  1.86s/it]

{'loss': 0.7411, 'grad_norm': 12.341266632080078, 'learning_rate': 1.7325227963525838e-05, 'epoch': 1.99}


 66%|██████▋   | 12550/18924 [6:47:39<3:11:40,  1.80s/it]

{'loss': 0.632, 'grad_norm': 6.510308742523193, 'learning_rate': 1.7298089448545375e-05, 'epoch': 1.99}


 66%|██████▋   | 12560/18924 [6:47:57<3:13:51,  1.83s/it]

{'loss': 0.6028, 'grad_norm': 10.331446647644043, 'learning_rate': 1.7270950933564916e-05, 'epoch': 1.99}


 66%|██████▋   | 12570/18924 [6:48:16<3:16:41,  1.86s/it]

{'loss': 0.4768, 'grad_norm': 6.749009609222412, 'learning_rate': 1.7243812418584457e-05, 'epoch': 1.99}


 66%|██████▋   | 12580/18924 [6:48:34<3:18:10,  1.87s/it]

{'loss': 0.5708, 'grad_norm': 7.80897331237793, 'learning_rate': 1.7216673903603994e-05, 'epoch': 1.99}


 67%|██████▋   | 12590/18924 [6:48:53<3:16:08,  1.86s/it]

{'loss': 0.7718, 'grad_norm': 11.29818344116211, 'learning_rate': 1.7189535388623535e-05, 'epoch': 2.0}


 67%|██████▋   | 12600/18924 [6:49:12<3:15:32,  1.86s/it]

{'loss': 0.8331, 'grad_norm': 7.509934902191162, 'learning_rate': 1.7162396873643076e-05, 'epoch': 2.0}


 67%|██████▋   | 12610/18924 [6:49:30<3:16:14,  1.86s/it]

{'loss': 0.6415, 'grad_norm': 12.796512603759766, 'learning_rate': 1.7135258358662614e-05, 'epoch': 2.0}


 67%|██████▋   | 12620/18924 [6:49:48<3:07:39,  1.79s/it]

{'loss': 0.5611, 'grad_norm': 6.743980407714844, 'learning_rate': 1.7108119843682154e-05, 'epoch': 2.0}


 67%|██████▋   | 12630/18924 [6:50:06<3:14:47,  1.86s/it]

{'loss': 0.3998, 'grad_norm': 8.094865798950195, 'learning_rate': 1.7080981328701695e-05, 'epoch': 2.0}


 67%|██████▋   | 12640/18924 [6:50:25<3:13:30,  1.85s/it]

{'loss': 0.4893, 'grad_norm': 6.6636505126953125, 'learning_rate': 1.7053842813721236e-05, 'epoch': 2.0}


 67%|██████▋   | 12650/18924 [6:50:43<3:13:52,  1.85s/it]

{'loss': 0.4023, 'grad_norm': 8.104488372802734, 'learning_rate': 1.7026704298740774e-05, 'epoch': 2.01}


 67%|██████▋   | 12660/18924 [6:51:02<3:13:35,  1.85s/it]

{'loss': 0.2931, 'grad_norm': 0.944363534450531, 'learning_rate': 1.6999565783760314e-05, 'epoch': 2.01}


 67%|██████▋   | 12670/18924 [6:51:20<3:14:53,  1.87s/it]

{'loss': 0.4154, 'grad_norm': 11.635848045349121, 'learning_rate': 1.6972427268779852e-05, 'epoch': 2.01}


 67%|██████▋   | 12680/18924 [6:51:39<3:13:12,  1.86s/it]

{'loss': 0.3903, 'grad_norm': 9.010851860046387, 'learning_rate': 1.6945288753799393e-05, 'epoch': 2.01}


 67%|██████▋   | 12690/18924 [6:51:57<3:12:38,  1.85s/it]

{'loss': 0.4571, 'grad_norm': 6.582052707672119, 'learning_rate': 1.6918150238818934e-05, 'epoch': 2.01}


 67%|██████▋   | 12700/18924 [6:52:16<3:11:15,  1.84s/it]

{'loss': 0.4991, 'grad_norm': 14.380334854125977, 'learning_rate': 1.6891011723838475e-05, 'epoch': 2.01}


 67%|██████▋   | 12710/18924 [6:52:35<3:12:22,  1.86s/it]

{'loss': 0.4534, 'grad_norm': 5.912633895874023, 'learning_rate': 1.6863873208858012e-05, 'epoch': 2.01}


 67%|██████▋   | 12720/18924 [6:52:53<3:11:35,  1.85s/it]

{'loss': 0.3789, 'grad_norm': 5.4998931884765625, 'learning_rate': 1.683673469387755e-05, 'epoch': 2.02}


 67%|██████▋   | 12730/18924 [6:53:12<3:11:05,  1.85s/it]

{'loss': 0.354, 'grad_norm': 5.13196325302124, 'learning_rate': 1.680959617889709e-05, 'epoch': 2.02}


 67%|██████▋   | 12740/18924 [6:53:30<3:10:45,  1.85s/it]

{'loss': 0.4787, 'grad_norm': 8.4332275390625, 'learning_rate': 1.678245766391663e-05, 'epoch': 2.02}


 67%|██████▋   | 12750/18924 [6:53:49<3:09:17,  1.84s/it]

{'loss': 0.4541, 'grad_norm': 5.754784107208252, 'learning_rate': 1.6755319148936172e-05, 'epoch': 2.02}


 67%|██████▋   | 12760/18924 [6:54:07<3:09:04,  1.84s/it]

{'loss': 0.4441, 'grad_norm': 9.110005378723145, 'learning_rate': 1.6728180633955713e-05, 'epoch': 2.02}


 67%|██████▋   | 12770/18924 [6:54:25<3:08:57,  1.84s/it]

{'loss': 0.6251, 'grad_norm': 11.939001083374023, 'learning_rate': 1.670104211897525e-05, 'epoch': 2.02}


 68%|██████▊   | 12780/18924 [6:54:44<3:11:32,  1.87s/it]

{'loss': 0.4685, 'grad_norm': 9.960476875305176, 'learning_rate': 1.6673903603994788e-05, 'epoch': 2.03}


 68%|██████▊   | 12790/18924 [6:55:03<3:11:33,  1.87s/it]

{'loss': 0.5413, 'grad_norm': 7.977530479431152, 'learning_rate': 1.664676508901433e-05, 'epoch': 2.03}


 68%|██████▊   | 12800/18924 [6:55:21<3:08:42,  1.85s/it]

{'loss': 0.4379, 'grad_norm': 12.186924934387207, 'learning_rate': 1.661962657403387e-05, 'epoch': 2.03}


 68%|██████▊   | 12810/18924 [6:55:40<3:07:55,  1.84s/it]

{'loss': 0.3635, 'grad_norm': 6.513113021850586, 'learning_rate': 1.659248805905341e-05, 'epoch': 2.03}


 68%|██████▊   | 12820/18924 [6:55:58<3:08:39,  1.85s/it]

{'loss': 0.4472, 'grad_norm': 4.520623683929443, 'learning_rate': 1.656534954407295e-05, 'epoch': 2.03}


 68%|██████▊   | 12830/18924 [6:56:17<3:08:17,  1.85s/it]

{'loss': 0.4274, 'grad_norm': 8.264786720275879, 'learning_rate': 1.653821102909249e-05, 'epoch': 2.03}


 68%|██████▊   | 12840/18924 [6:56:35<3:04:51,  1.82s/it]

{'loss': 0.5258, 'grad_norm': 14.467220306396484, 'learning_rate': 1.6511072514112026e-05, 'epoch': 2.04}


 68%|██████▊   | 12850/18924 [6:56:53<3:01:26,  1.79s/it]

{'loss': 0.4218, 'grad_norm': 10.672592163085938, 'learning_rate': 1.6483933999131567e-05, 'epoch': 2.04}


 68%|██████▊   | 12860/18924 [6:57:11<3:02:16,  1.80s/it]

{'loss': 0.4661, 'grad_norm': 6.519412040710449, 'learning_rate': 1.6456795484151108e-05, 'epoch': 2.04}


 68%|██████▊   | 12870/18924 [6:57:29<3:04:13,  1.83s/it]

{'loss': 0.4723, 'grad_norm': 13.125104904174805, 'learning_rate': 1.642965696917065e-05, 'epoch': 2.04}


 68%|██████▊   | 12880/18924 [6:57:48<3:07:52,  1.87s/it]

{'loss': 0.5348, 'grad_norm': 10.189839363098145, 'learning_rate': 1.640251845419019e-05, 'epoch': 2.04}


 68%|██████▊   | 12890/18924 [6:58:07<3:10:04,  1.89s/it]

{'loss': 0.4367, 'grad_norm': 3.5448975563049316, 'learning_rate': 1.6375379939209727e-05, 'epoch': 2.04}


 68%|██████▊   | 12900/18924 [6:58:25<3:07:51,  1.87s/it]

{'loss': 0.3639, 'grad_norm': 4.93501091003418, 'learning_rate': 1.6348241424229265e-05, 'epoch': 2.05}


 68%|██████▊   | 12910/18924 [6:58:44<3:01:29,  1.81s/it]

{'loss': 0.405, 'grad_norm': 6.225639343261719, 'learning_rate': 1.6321102909248806e-05, 'epoch': 2.05}


 68%|██████▊   | 12920/18924 [6:59:02<2:58:26,  1.78s/it]

{'loss': 0.3681, 'grad_norm': 7.216191291809082, 'learning_rate': 1.6293964394268346e-05, 'epoch': 2.05}


 68%|██████▊   | 12930/18924 [6:59:20<3:01:57,  1.82s/it]

{'loss': 0.4858, 'grad_norm': 9.05191421508789, 'learning_rate': 1.6266825879287887e-05, 'epoch': 2.05}


 68%|██████▊   | 12940/18924 [6:59:38<3:05:59,  1.86s/it]

{'loss': 0.4315, 'grad_norm': 4.699267864227295, 'learning_rate': 1.6239687364307428e-05, 'epoch': 2.05}


 68%|██████▊   | 12950/18924 [6:59:57<3:10:44,  1.92s/it]

{'loss': 0.473, 'grad_norm': 4.7039313316345215, 'learning_rate': 1.6212548849326966e-05, 'epoch': 2.05}


 68%|██████▊   | 12960/18924 [7:00:17<3:08:58,  1.90s/it]

{'loss': 0.4084, 'grad_norm': 8.49258804321289, 'learning_rate': 1.6185410334346503e-05, 'epoch': 2.05}


 69%|██████▊   | 12970/18924 [7:00:35<3:05:46,  1.87s/it]

{'loss': 0.4461, 'grad_norm': 6.749445915222168, 'learning_rate': 1.6158271819366044e-05, 'epoch': 2.06}


 69%|██████▊   | 12980/18924 [7:00:54<3:02:45,  1.84s/it]

{'loss': 0.4607, 'grad_norm': 8.668662071228027, 'learning_rate': 1.6131133304385585e-05, 'epoch': 2.06}


 69%|██████▊   | 12990/18924 [7:01:12<3:03:06,  1.85s/it]

{'loss': 0.3507, 'grad_norm': 6.766122341156006, 'learning_rate': 1.6103994789405126e-05, 'epoch': 2.06}


 69%|██████▊   | 13000/18924 [7:01:31<3:05:19,  1.88s/it]

{'loss': 0.4613, 'grad_norm': 7.554630756378174, 'learning_rate': 1.6076856274424663e-05, 'epoch': 2.06}


 69%|██████▊   | 13010/18924 [7:01:50<3:02:04,  1.85s/it]

{'loss': 0.4672, 'grad_norm': 9.96544075012207, 'learning_rate': 1.6049717759444204e-05, 'epoch': 2.06}


 69%|██████▉   | 13020/18924 [7:02:08<2:56:17,  1.79s/it]

{'loss': 0.4122, 'grad_norm': 21.068342208862305, 'learning_rate': 1.602257924446374e-05, 'epoch': 2.06}


 69%|██████▉   | 13030/18924 [7:02:26<2:55:38,  1.79s/it]

{'loss': 0.2898, 'grad_norm': 12.530803680419922, 'learning_rate': 1.5995440729483282e-05, 'epoch': 2.07}


 69%|██████▉   | 13040/18924 [7:02:44<2:59:15,  1.83s/it]

{'loss': 0.3777, 'grad_norm': 9.984028816223145, 'learning_rate': 1.5968302214502823e-05, 'epoch': 2.07}


 69%|██████▉   | 13050/18924 [7:03:03<3:02:17,  1.86s/it]

{'loss': 0.3726, 'grad_norm': 12.932196617126465, 'learning_rate': 1.5941163699522364e-05, 'epoch': 2.07}


 69%|██████▉   | 13060/18924 [7:03:21<3:02:10,  1.86s/it]

{'loss': 0.4316, 'grad_norm': 6.100008487701416, 'learning_rate': 1.59140251845419e-05, 'epoch': 2.07}


 69%|██████▉   | 13070/18924 [7:03:40<3:01:17,  1.86s/it]

{'loss': 0.4758, 'grad_norm': 16.628063201904297, 'learning_rate': 1.5886886669561442e-05, 'epoch': 2.07}


 69%|██████▉   | 13080/18924 [7:03:59<3:00:49,  1.86s/it]

{'loss': 0.536, 'grad_norm': 13.656646728515625, 'learning_rate': 1.5859748154580983e-05, 'epoch': 2.07}


 69%|██████▉   | 13090/18924 [7:04:17<2:59:53,  1.85s/it]

{'loss': 0.3731, 'grad_norm': 12.58640193939209, 'learning_rate': 1.583260963960052e-05, 'epoch': 2.08}


 69%|██████▉   | 13100/18924 [7:04:36<3:00:16,  1.86s/it]

{'loss': 0.4272, 'grad_norm': 6.588784217834473, 'learning_rate': 1.580547112462006e-05, 'epoch': 2.08}


 69%|██████▉   | 13110/18924 [7:04:54<2:59:38,  1.85s/it]

{'loss': 0.5557, 'grad_norm': 11.27084732055664, 'learning_rate': 1.5778332609639602e-05, 'epoch': 2.08}


 69%|██████▉   | 13120/18924 [7:05:13<2:59:06,  1.85s/it]

{'loss': 0.5626, 'grad_norm': 10.799747467041016, 'learning_rate': 1.575119409465914e-05, 'epoch': 2.08}


 69%|██████▉   | 13130/18924 [7:05:31<2:59:10,  1.86s/it]

{'loss': 0.5666, 'grad_norm': 9.834502220153809, 'learning_rate': 1.572405557967868e-05, 'epoch': 2.08}


 69%|██████▉   | 13140/18924 [7:05:50<2:57:35,  1.84s/it]

{'loss': 0.5323, 'grad_norm': 13.419011116027832, 'learning_rate': 1.569691706469822e-05, 'epoch': 2.08}


 69%|██████▉   | 13150/18924 [7:06:08<2:57:31,  1.84s/it]

{'loss': 0.5702, 'grad_norm': 9.314980506896973, 'learning_rate': 1.566977854971776e-05, 'epoch': 2.08}


 70%|██████▉   | 13160/18924 [7:06:27<2:58:16,  1.86s/it]

{'loss': 0.4854, 'grad_norm': 6.074323654174805, 'learning_rate': 1.56426400347373e-05, 'epoch': 2.09}


 70%|██████▉   | 13170/18924 [7:06:46<2:56:47,  1.84s/it]

{'loss': 0.3655, 'grad_norm': 8.168906211853027, 'learning_rate': 1.561550151975684e-05, 'epoch': 2.09}


 70%|██████▉   | 13180/18924 [7:07:04<2:57:13,  1.85s/it]

{'loss': 0.3331, 'grad_norm': 7.5595316886901855, 'learning_rate': 1.5588363004776378e-05, 'epoch': 2.09}


 70%|██████▉   | 13190/18924 [7:07:22<2:56:19,  1.85s/it]

{'loss': 0.4357, 'grad_norm': 11.259800910949707, 'learning_rate': 1.556122448979592e-05, 'epoch': 2.09}


 70%|██████▉   | 13200/18924 [7:07:41<2:58:13,  1.87s/it]

{'loss': 0.4418, 'grad_norm': 14.987833976745605, 'learning_rate': 1.553408597481546e-05, 'epoch': 2.09}


 70%|██████▉   | 13210/18924 [7:08:00<2:59:13,  1.88s/it]

{'loss': 0.4342, 'grad_norm': 9.057332038879395, 'learning_rate': 1.5506947459834997e-05, 'epoch': 2.09}


 70%|██████▉   | 13220/18924 [7:08:19<2:56:57,  1.86s/it]

{'loss': 0.4158, 'grad_norm': 7.898565292358398, 'learning_rate': 1.547980894485454e-05, 'epoch': 2.1}


 70%|██████▉   | 13230/18924 [7:08:37<2:57:22,  1.87s/it]

{'loss': 0.3408, 'grad_norm': 7.9330830574035645, 'learning_rate': 1.545267042987408e-05, 'epoch': 2.1}


 70%|██████▉   | 13240/18924 [7:08:56<2:54:49,  1.85s/it]

{'loss': 0.4259, 'grad_norm': 10.806832313537598, 'learning_rate': 1.5425531914893617e-05, 'epoch': 2.1}


 70%|███████   | 13250/18924 [7:09:14<2:54:56,  1.85s/it]

{'loss': 0.5039, 'grad_norm': 9.968364715576172, 'learning_rate': 1.5398393399913158e-05, 'epoch': 2.1}


 70%|███████   | 13260/18924 [7:09:33<2:54:43,  1.85s/it]

{'loss': 0.5211, 'grad_norm': 15.10832405090332, 'learning_rate': 1.53712548849327e-05, 'epoch': 2.1}


 70%|███████   | 13270/18924 [7:09:51<2:53:56,  1.85s/it]

{'loss': 0.389, 'grad_norm': 7.464730739593506, 'learning_rate': 1.534411636995224e-05, 'epoch': 2.1}


 70%|███████   | 13280/18924 [7:10:10<2:53:51,  1.85s/it]

{'loss': 0.4588, 'grad_norm': 1.5188207626342773, 'learning_rate': 1.5316977854971777e-05, 'epoch': 2.11}


 70%|███████   | 13290/18924 [7:10:28<2:52:49,  1.84s/it]

{'loss': 0.3517, 'grad_norm': 11.357049942016602, 'learning_rate': 1.5289839339991314e-05, 'epoch': 2.11}


 70%|███████   | 13300/18924 [7:10:46<2:50:15,  1.82s/it]

{'loss': 0.4925, 'grad_norm': 11.360885620117188, 'learning_rate': 1.5262700825010855e-05, 'epoch': 2.11}


 70%|███████   | 13310/18924 [7:11:04<2:46:40,  1.78s/it]

{'loss': 0.4939, 'grad_norm': 5.355436325073242, 'learning_rate': 1.5235562310030396e-05, 'epoch': 2.11}


 70%|███████   | 13320/18924 [7:11:22<2:46:53,  1.79s/it]

{'loss': 0.4565, 'grad_norm': 9.8660249710083, 'learning_rate': 1.5208423795049937e-05, 'epoch': 2.11}


 70%|███████   | 13330/18924 [7:11:40<2:50:19,  1.83s/it]

{'loss': 0.5055, 'grad_norm': 2.4203834533691406, 'learning_rate': 1.5181285280069476e-05, 'epoch': 2.11}


 70%|███████   | 13340/18924 [7:11:59<2:53:30,  1.86s/it]

{'loss': 0.3859, 'grad_norm': 5.988199234008789, 'learning_rate': 1.5154146765089013e-05, 'epoch': 2.11}


 71%|███████   | 13350/18924 [7:12:18<2:53:02,  1.86s/it]

{'loss': 0.3898, 'grad_norm': 9.17707347869873, 'learning_rate': 1.5127008250108554e-05, 'epoch': 2.12}


 71%|███████   | 13360/18924 [7:12:36<2:52:41,  1.86s/it]

{'loss': 0.4869, 'grad_norm': 12.886305809020996, 'learning_rate': 1.5099869735128095e-05, 'epoch': 2.12}


 71%|███████   | 13370/18924 [7:12:55<2:51:22,  1.85s/it]

{'loss': 0.4341, 'grad_norm': 8.942238807678223, 'learning_rate': 1.5072731220147634e-05, 'epoch': 2.12}


 71%|███████   | 13380/18924 [7:13:13<2:51:05,  1.85s/it]

{'loss': 0.385, 'grad_norm': 8.467313766479492, 'learning_rate': 1.5045592705167175e-05, 'epoch': 2.12}


 71%|███████   | 13390/18924 [7:13:31<2:47:22,  1.81s/it]

{'loss': 0.518, 'grad_norm': 16.709884643554688, 'learning_rate': 1.5018454190186714e-05, 'epoch': 2.12}


 71%|███████   | 13400/18924 [7:13:49<2:42:58,  1.77s/it]

{'loss': 0.3373, 'grad_norm': 6.220389366149902, 'learning_rate': 1.4991315675206252e-05, 'epoch': 2.12}


 71%|███████   | 13410/18924 [7:14:07<2:44:36,  1.79s/it]

{'loss': 0.3821, 'grad_norm': 11.128337860107422, 'learning_rate': 1.4964177160225793e-05, 'epoch': 2.13}


 71%|███████   | 13420/18924 [7:14:25<2:47:29,  1.83s/it]

{'loss': 0.2998, 'grad_norm': 9.096890449523926, 'learning_rate': 1.4937038645245332e-05, 'epoch': 2.13}


 71%|███████   | 13430/18924 [7:14:44<2:50:49,  1.87s/it]

{'loss': 0.3899, 'grad_norm': 10.62173080444336, 'learning_rate': 1.4909900130264873e-05, 'epoch': 2.13}


 71%|███████   | 13440/18924 [7:15:03<2:51:50,  1.88s/it]

{'loss': 0.5976, 'grad_norm': 3.8163483142852783, 'learning_rate': 1.4882761615284414e-05, 'epoch': 2.13}


 71%|███████   | 13450/18924 [7:15:22<2:50:07,  1.86s/it]

{'loss': 0.4125, 'grad_norm': 11.862786293029785, 'learning_rate': 1.4855623100303953e-05, 'epoch': 2.13}


 71%|███████   | 13460/18924 [7:15:40<2:49:08,  1.86s/it]

{'loss': 0.3884, 'grad_norm': 14.992671966552734, 'learning_rate': 1.482848458532349e-05, 'epoch': 2.13}


 71%|███████   | 13470/18924 [7:15:58<2:41:02,  1.77s/it]

{'loss': 0.4275, 'grad_norm': 16.541574478149414, 'learning_rate': 1.4801346070343031e-05, 'epoch': 2.14}


 71%|███████   | 13480/18924 [7:16:16<2:40:54,  1.77s/it]

{'loss': 0.534, 'grad_norm': 5.107711315155029, 'learning_rate': 1.477420755536257e-05, 'epoch': 2.14}


 71%|███████▏  | 13490/18924 [7:16:34<2:44:31,  1.82s/it]

{'loss': 0.4742, 'grad_norm': 11.69197940826416, 'learning_rate': 1.4747069040382111e-05, 'epoch': 2.14}


 71%|███████▏  | 13500/18924 [7:16:52<2:48:30,  1.86s/it]

{'loss': 0.5082, 'grad_norm': 12.929061889648438, 'learning_rate': 1.4719930525401652e-05, 'epoch': 2.14}


 71%|███████▏  | 13510/18924 [7:17:11<2:44:12,  1.82s/it]

{'loss': 0.6642, 'grad_norm': 10.696646690368652, 'learning_rate': 1.4692792010421191e-05, 'epoch': 2.14}


 71%|███████▏  | 13520/18924 [7:17:29<2:42:48,  1.81s/it]

{'loss': 0.5782, 'grad_norm': 8.603652954101562, 'learning_rate': 1.4665653495440732e-05, 'epoch': 2.14}


 71%|███████▏  | 13530/18924 [7:17:48<2:50:44,  1.90s/it]

{'loss': 0.3725, 'grad_norm': 6.248086452484131, 'learning_rate': 1.463851498046027e-05, 'epoch': 2.14}


 72%|███████▏  | 13540/18924 [7:18:07<2:48:56,  1.88s/it]

{'loss': 0.4655, 'grad_norm': 8.424567222595215, 'learning_rate': 1.4611376465479809e-05, 'epoch': 2.15}


 72%|███████▏  | 13550/18924 [7:18:26<2:41:53,  1.81s/it]

{'loss': 0.4706, 'grad_norm': 5.935105323791504, 'learning_rate': 1.458423795049935e-05, 'epoch': 2.15}


 72%|███████▏  | 13560/18924 [7:18:43<2:39:13,  1.78s/it]

{'loss': 0.3142, 'grad_norm': 12.239541053771973, 'learning_rate': 1.4557099435518889e-05, 'epoch': 2.15}


 72%|███████▏  | 13570/18924 [7:19:02<2:45:20,  1.85s/it]

{'loss': 0.4248, 'grad_norm': 11.412976264953613, 'learning_rate': 1.452996092053843e-05, 'epoch': 2.15}


 72%|███████▏  | 13580/18924 [7:19:21<2:47:30,  1.88s/it]

{'loss': 0.2933, 'grad_norm': 3.5057859420776367, 'learning_rate': 1.450282240555797e-05, 'epoch': 2.15}


 72%|███████▏  | 13590/18924 [7:19:39<2:46:50,  1.88s/it]

{'loss': 0.3606, 'grad_norm': 12.548709869384766, 'learning_rate': 1.4475683890577508e-05, 'epoch': 2.15}


 72%|███████▏  | 13600/18924 [7:19:58<2:45:01,  1.86s/it]

{'loss': 0.2732, 'grad_norm': 1.555963397026062, 'learning_rate': 1.4448545375597047e-05, 'epoch': 2.16}


 72%|███████▏  | 13610/18924 [7:20:16<2:43:22,  1.84s/it]

{'loss': 0.3907, 'grad_norm': 12.474706649780273, 'learning_rate': 1.4421406860616588e-05, 'epoch': 2.16}


 72%|███████▏  | 13620/18924 [7:20:35<2:43:18,  1.85s/it]

{'loss': 0.3806, 'grad_norm': 11.165250778198242, 'learning_rate': 1.4394268345636127e-05, 'epoch': 2.16}


 72%|███████▏  | 13630/18924 [7:20:53<2:42:06,  1.84s/it]

{'loss': 0.3413, 'grad_norm': 9.179829597473145, 'learning_rate': 1.4367129830655668e-05, 'epoch': 2.16}


 72%|███████▏  | 13640/18924 [7:21:12<2:42:07,  1.84s/it]

{'loss': 0.2918, 'grad_norm': 6.225919723510742, 'learning_rate': 1.4339991315675209e-05, 'epoch': 2.16}


 72%|███████▏  | 13650/18924 [7:21:30<2:42:09,  1.84s/it]

{'loss': 0.3791, 'grad_norm': 13.073716163635254, 'learning_rate': 1.4312852800694746e-05, 'epoch': 2.16}


 72%|███████▏  | 13660/18924 [7:21:49<2:41:48,  1.84s/it]

{'loss': 0.3674, 'grad_norm': 14.235342025756836, 'learning_rate': 1.4285714285714285e-05, 'epoch': 2.17}


 72%|███████▏  | 13670/18924 [7:22:07<2:40:19,  1.83s/it]

{'loss': 0.5515, 'grad_norm': 14.817288398742676, 'learning_rate': 1.4258575770733826e-05, 'epoch': 2.17}


 72%|███████▏  | 13680/18924 [7:22:25<2:39:59,  1.83s/it]

{'loss': 0.3753, 'grad_norm': 10.949044227600098, 'learning_rate': 1.4231437255753365e-05, 'epoch': 2.17}


 72%|███████▏  | 13690/18924 [7:22:44<2:40:42,  1.84s/it]

{'loss': 0.5592, 'grad_norm': 7.343600273132324, 'learning_rate': 1.4204298740772906e-05, 'epoch': 2.17}


 72%|███████▏  | 13700/18924 [7:23:02<2:43:21,  1.88s/it]

{'loss': 0.477, 'grad_norm': 8.579598426818848, 'learning_rate': 1.4177160225792445e-05, 'epoch': 2.17}


 72%|███████▏  | 13710/18924 [7:23:21<2:40:28,  1.85s/it]

{'loss': 0.4027, 'grad_norm': 13.777243614196777, 'learning_rate': 1.4150021710811983e-05, 'epoch': 2.17}


 73%|███████▎  | 13720/18924 [7:23:39<2:39:12,  1.84s/it]

{'loss': 0.4082, 'grad_norm': 6.9555816650390625, 'learning_rate': 1.4122883195831524e-05, 'epoch': 2.18}


 73%|███████▎  | 13730/18924 [7:23:57<2:35:20,  1.79s/it]

{'loss': 0.4178, 'grad_norm': 7.5771660804748535, 'learning_rate': 1.4095744680851065e-05, 'epoch': 2.18}


 73%|███████▎  | 13740/18924 [7:24:15<2:34:59,  1.79s/it]

{'loss': 0.3197, 'grad_norm': 14.405776977539062, 'learning_rate': 1.4068606165870604e-05, 'epoch': 2.18}


 73%|███████▎  | 13750/18924 [7:24:34<2:36:57,  1.82s/it]

{'loss': 0.3905, 'grad_norm': 9.243673324584961, 'learning_rate': 1.4041467650890145e-05, 'epoch': 2.18}


 73%|███████▎  | 13760/18924 [7:24:52<2:39:02,  1.85s/it]

{'loss': 0.4357, 'grad_norm': 15.93854808807373, 'learning_rate': 1.4014329135909684e-05, 'epoch': 2.18}


 73%|███████▎  | 13770/18924 [7:25:11<2:39:57,  1.86s/it]

{'loss': 0.4282, 'grad_norm': 7.445497512817383, 'learning_rate': 1.3987190620929225e-05, 'epoch': 2.18}


 73%|███████▎  | 13780/18924 [7:25:29<2:34:16,  1.80s/it]

{'loss': 0.3934, 'grad_norm': 3.4519693851470947, 'learning_rate': 1.3960052105948762e-05, 'epoch': 2.18}


 73%|███████▎  | 13790/18924 [7:25:47<2:31:08,  1.77s/it]

{'loss': 0.4197, 'grad_norm': 20.634389877319336, 'learning_rate': 1.3932913590968303e-05, 'epoch': 2.19}


 73%|███████▎  | 13800/18924 [7:26:05<2:37:48,  1.85s/it]

{'loss': 0.4779, 'grad_norm': 2.08522629737854, 'learning_rate': 1.3905775075987842e-05, 'epoch': 2.19}


 73%|███████▎  | 13810/18924 [7:26:24<2:41:05,  1.89s/it]

{'loss': 0.5507, 'grad_norm': 14.336324691772461, 'learning_rate': 1.3878636561007383e-05, 'epoch': 2.19}


 73%|███████▎  | 13820/18924 [7:26:42<2:35:17,  1.83s/it]

{'loss': 0.4105, 'grad_norm': 9.717178344726562, 'learning_rate': 1.3851498046026922e-05, 'epoch': 2.19}


 73%|███████▎  | 13830/18924 [7:27:00<2:31:17,  1.78s/it]

{'loss': 0.3684, 'grad_norm': 8.746261596679688, 'learning_rate': 1.3824359531046463e-05, 'epoch': 2.19}


 73%|███████▎  | 13840/18924 [7:27:18<2:34:46,  1.83s/it]

{'loss': 0.5322, 'grad_norm': 15.994489669799805, 'learning_rate': 1.3797221016066e-05, 'epoch': 2.19}


 73%|███████▎  | 13850/18924 [7:27:37<2:38:04,  1.87s/it]

{'loss': 0.3402, 'grad_norm': 4.241693019866943, 'learning_rate': 1.377008250108554e-05, 'epoch': 2.2}


 73%|███████▎  | 13860/18924 [7:27:56<2:37:39,  1.87s/it]

{'loss': 0.4232, 'grad_norm': 6.725394248962402, 'learning_rate': 1.374294398610508e-05, 'epoch': 2.2}


 73%|███████▎  | 13870/18924 [7:28:14<2:29:30,  1.77s/it]

{'loss': 0.3916, 'grad_norm': 6.806887149810791, 'learning_rate': 1.3715805471124622e-05, 'epoch': 2.2}


 73%|███████▎  | 13880/18924 [7:28:32<2:32:11,  1.81s/it]

{'loss': 0.3654, 'grad_norm': 9.666239738464355, 'learning_rate': 1.368866695614416e-05, 'epoch': 2.2}


 73%|███████▎  | 13890/18924 [7:28:50<2:37:16,  1.87s/it]

{'loss': 0.3698, 'grad_norm': 12.998007774353027, 'learning_rate': 1.3661528441163702e-05, 'epoch': 2.2}


 73%|███████▎  | 13900/18924 [7:29:09<2:36:17,  1.87s/it]

{'loss': 0.2723, 'grad_norm': 7.117579936981201, 'learning_rate': 1.3634389926183239e-05, 'epoch': 2.2}


 74%|███████▎  | 13910/18924 [7:29:27<2:34:11,  1.85s/it]

{'loss': 0.4377, 'grad_norm': 11.014795303344727, 'learning_rate': 1.3607251411202778e-05, 'epoch': 2.21}


 74%|███████▎  | 13920/18924 [7:29:46<2:33:05,  1.84s/it]

{'loss': 0.3923, 'grad_norm': 9.608760833740234, 'learning_rate': 1.3580112896222319e-05, 'epoch': 2.21}


 74%|███████▎  | 13930/18924 [7:30:04<2:32:59,  1.84s/it]

{'loss': 0.4477, 'grad_norm': 7.559205532073975, 'learning_rate': 1.355297438124186e-05, 'epoch': 2.21}


 74%|███████▎  | 13940/18924 [7:30:22<2:29:24,  1.80s/it]

{'loss': 0.4705, 'grad_norm': 13.732709884643555, 'learning_rate': 1.3525835866261399e-05, 'epoch': 2.21}


 74%|███████▎  | 13950/18924 [7:30:40<2:30:24,  1.81s/it]

{'loss': 0.3985, 'grad_norm': 7.808469772338867, 'learning_rate': 1.349869735128094e-05, 'epoch': 2.21}


 74%|███████▍  | 13960/18924 [7:30:59<2:31:16,  1.83s/it]

{'loss': 0.3121, 'grad_norm': 7.843575477600098, 'learning_rate': 1.3471558836300479e-05, 'epoch': 2.21}


 74%|███████▍  | 13970/18924 [7:31:17<2:30:48,  1.83s/it]

{'loss': 0.5492, 'grad_norm': 8.873856544494629, 'learning_rate': 1.3444420321320017e-05, 'epoch': 2.21}


 74%|███████▍  | 13980/18924 [7:31:35<2:31:47,  1.84s/it]

{'loss': 0.3677, 'grad_norm': 10.02334976196289, 'learning_rate': 1.3417281806339557e-05, 'epoch': 2.22}


 74%|███████▍  | 13990/18924 [7:31:54<2:32:01,  1.85s/it]

{'loss': 0.4724, 'grad_norm': 21.536304473876953, 'learning_rate': 1.3390143291359097e-05, 'epoch': 2.22}


 74%|███████▍  | 14000/18924 [7:32:12<2:32:12,  1.85s/it]

{'loss': 0.4464, 'grad_norm': 12.3179292678833, 'learning_rate': 1.3363004776378637e-05, 'epoch': 2.22}


 74%|███████▍  | 14010/18924 [7:32:32<2:31:29,  1.85s/it]

{'loss': 0.3617, 'grad_norm': 8.390129089355469, 'learning_rate': 1.3335866261398178e-05, 'epoch': 2.22}


 74%|███████▍  | 14020/18924 [7:32:50<2:29:39,  1.83s/it]

{'loss': 0.3656, 'grad_norm': 11.264503479003906, 'learning_rate': 1.3308727746417717e-05, 'epoch': 2.22}


 74%|███████▍  | 14030/18924 [7:33:08<2:29:51,  1.84s/it]

{'loss': 0.5281, 'grad_norm': 12.145230293273926, 'learning_rate': 1.3281589231437255e-05, 'epoch': 2.22}


 74%|███████▍  | 14040/18924 [7:33:27<2:29:30,  1.84s/it]

{'loss': 0.455, 'grad_norm': 7.641922950744629, 'learning_rate': 1.3254450716456796e-05, 'epoch': 2.23}


 74%|███████▍  | 14050/18924 [7:33:45<2:28:35,  1.83s/it]

{'loss': 0.3406, 'grad_norm': 6.376630783081055, 'learning_rate': 1.3227312201476335e-05, 'epoch': 2.23}


 74%|███████▍  | 14060/18924 [7:34:04<2:29:36,  1.85s/it]

{'loss': 0.4489, 'grad_norm': 16.206457138061523, 'learning_rate': 1.3200173686495876e-05, 'epoch': 2.23}


 74%|███████▍  | 14070/18924 [7:34:22<2:28:48,  1.84s/it]

{'loss': 0.3772, 'grad_norm': 6.618458271026611, 'learning_rate': 1.3173035171515417e-05, 'epoch': 2.23}


 74%|███████▍  | 14080/18924 [7:34:40<2:27:38,  1.83s/it]

{'loss': 0.299, 'grad_norm': 2.8195250034332275, 'learning_rate': 1.3145896656534956e-05, 'epoch': 2.23}


 74%|███████▍  | 14090/18924 [7:34:59<2:28:42,  1.85s/it]

{'loss': 0.494, 'grad_norm': 4.522160053253174, 'learning_rate': 1.3118758141554493e-05, 'epoch': 2.23}


 75%|███████▍  | 14100/18924 [7:35:17<2:28:44,  1.85s/it]

{'loss': 0.4745, 'grad_norm': 17.094669342041016, 'learning_rate': 1.3091619626574034e-05, 'epoch': 2.24}


 75%|███████▍  | 14110/18924 [7:35:36<2:27:09,  1.83s/it]

{'loss': 0.3716, 'grad_norm': 12.358148574829102, 'learning_rate': 1.3064481111593573e-05, 'epoch': 2.24}


 75%|███████▍  | 14120/18924 [7:35:54<2:27:40,  1.84s/it]

{'loss': 0.3826, 'grad_norm': 13.216212272644043, 'learning_rate': 1.3037342596613114e-05, 'epoch': 2.24}


 75%|███████▍  | 14130/18924 [7:36:12<2:27:10,  1.84s/it]

{'loss': 0.3818, 'grad_norm': 9.435233116149902, 'learning_rate': 1.3010204081632653e-05, 'epoch': 2.24}


 75%|███████▍  | 14140/18924 [7:36:31<2:26:08,  1.83s/it]

{'loss': 0.2707, 'grad_norm': 7.911682605743408, 'learning_rate': 1.2983065566652194e-05, 'epoch': 2.24}


 75%|███████▍  | 14150/18924 [7:36:49<2:26:31,  1.84s/it]

{'loss': 0.4179, 'grad_norm': 7.7987799644470215, 'learning_rate': 1.2955927051671732e-05, 'epoch': 2.24}


 75%|███████▍  | 14160/18924 [7:37:08<2:26:21,  1.84s/it]

{'loss': 0.4249, 'grad_norm': 4.4042768478393555, 'learning_rate': 1.2928788536691273e-05, 'epoch': 2.24}


 75%|███████▍  | 14170/18924 [7:37:26<2:25:07,  1.83s/it]

{'loss': 0.3616, 'grad_norm': 7.871657848358154, 'learning_rate': 1.2901650021710812e-05, 'epoch': 2.25}


 75%|███████▍  | 14180/18924 [7:37:44<2:25:25,  1.84s/it]

{'loss': 0.3559, 'grad_norm': 11.003766059875488, 'learning_rate': 1.2874511506730353e-05, 'epoch': 2.25}


 75%|███████▍  | 14190/18924 [7:38:03<2:23:56,  1.82s/it]

{'loss': 0.4476, 'grad_norm': 10.383554458618164, 'learning_rate': 1.2847372991749892e-05, 'epoch': 2.25}


 75%|███████▌  | 14200/18924 [7:38:21<2:23:41,  1.82s/it]

{'loss': 0.4987, 'grad_norm': 12.905147552490234, 'learning_rate': 1.2820234476769433e-05, 'epoch': 2.25}


 75%|███████▌  | 14210/18924 [7:38:39<2:24:51,  1.84s/it]

{'loss': 0.4808, 'grad_norm': 8.403719902038574, 'learning_rate': 1.2793095961788974e-05, 'epoch': 2.25}


 75%|███████▌  | 14220/18924 [7:38:58<2:24:04,  1.84s/it]

{'loss': 0.5027, 'grad_norm': 14.02796745300293, 'learning_rate': 1.2765957446808511e-05, 'epoch': 2.25}


 75%|███████▌  | 14230/18924 [7:39:16<2:23:35,  1.84s/it]

{'loss': 0.4669, 'grad_norm': 9.926518440246582, 'learning_rate': 1.273881893182805e-05, 'epoch': 2.26}


 75%|███████▌  | 14240/18924 [7:39:34<2:19:25,  1.79s/it]

{'loss': 0.5181, 'grad_norm': 8.702245712280273, 'learning_rate': 1.2711680416847591e-05, 'epoch': 2.26}


 75%|███████▌  | 14250/18924 [7:39:52<2:16:57,  1.76s/it]

{'loss': 0.4385, 'grad_norm': 2.4461214542388916, 'learning_rate': 1.268454190186713e-05, 'epoch': 2.26}


 75%|███████▌  | 14260/18924 [7:40:09<2:17:04,  1.76s/it]

{'loss': 0.4567, 'grad_norm': 13.500951766967773, 'learning_rate': 1.2657403386886671e-05, 'epoch': 2.26}


 75%|███████▌  | 14270/18924 [7:40:27<2:18:35,  1.79s/it]

{'loss': 0.465, 'grad_norm': 14.018305778503418, 'learning_rate': 1.2630264871906212e-05, 'epoch': 2.26}


 75%|███████▌  | 14280/18924 [7:40:46<2:24:38,  1.87s/it]

{'loss': 0.311, 'grad_norm': 5.615419387817383, 'learning_rate': 1.2603126356925748e-05, 'epoch': 2.26}


 76%|███████▌  | 14290/18924 [7:41:04<2:25:10,  1.88s/it]

{'loss': 0.3696, 'grad_norm': 8.548747062683105, 'learning_rate': 1.2575987841945289e-05, 'epoch': 2.27}


 76%|███████▌  | 14300/18924 [7:41:23<2:23:12,  1.86s/it]

{'loss': 0.3194, 'grad_norm': 5.471031188964844, 'learning_rate': 1.254884932696483e-05, 'epoch': 2.27}


 76%|███████▌  | 14310/18924 [7:41:42<2:21:49,  1.84s/it]

{'loss': 0.3099, 'grad_norm': 11.825716018676758, 'learning_rate': 1.2521710811984369e-05, 'epoch': 2.27}


 76%|███████▌  | 14320/18924 [7:42:00<2:21:14,  1.84s/it]

{'loss': 0.4315, 'grad_norm': 10.45743179321289, 'learning_rate': 1.249457229700391e-05, 'epoch': 2.27}


 76%|███████▌  | 14330/18924 [7:42:18<2:21:30,  1.85s/it]

{'loss': 0.4165, 'grad_norm': 7.825321674346924, 'learning_rate': 1.2467433782023449e-05, 'epoch': 2.27}


 76%|███████▌  | 14340/18924 [7:42:36<2:15:12,  1.77s/it]

{'loss': 0.5166, 'grad_norm': 12.521280288696289, 'learning_rate': 1.2440295267042988e-05, 'epoch': 2.27}


 76%|███████▌  | 14350/18924 [7:42:54<2:16:50,  1.79s/it]

{'loss': 0.4149, 'grad_norm': 5.245632171630859, 'learning_rate': 1.2413156752062529e-05, 'epoch': 2.27}


 76%|███████▌  | 14360/18924 [7:43:12<2:20:34,  1.85s/it]

{'loss': 0.4071, 'grad_norm': 7.548450469970703, 'learning_rate': 1.2386018237082068e-05, 'epoch': 2.28}


 76%|███████▌  | 14370/18924 [7:43:31<2:21:33,  1.87s/it]

{'loss': 0.4548, 'grad_norm': 11.108932495117188, 'learning_rate': 1.2358879722101607e-05, 'epoch': 2.28}


 76%|███████▌  | 14380/18924 [7:43:49<2:14:49,  1.78s/it]

{'loss': 0.5499, 'grad_norm': 17.654794692993164, 'learning_rate': 1.2331741207121148e-05, 'epoch': 2.28}


 76%|███████▌  | 14390/18924 [7:44:07<2:14:23,  1.78s/it]

{'loss': 0.4636, 'grad_norm': 11.596455574035645, 'learning_rate': 1.2304602692140687e-05, 'epoch': 2.28}


 76%|███████▌  | 14400/18924 [7:44:25<2:15:45,  1.80s/it]

{'loss': 0.3347, 'grad_norm': 12.188764572143555, 'learning_rate': 1.2277464177160226e-05, 'epoch': 2.28}


 76%|███████▌  | 14410/18924 [7:44:43<2:19:28,  1.85s/it]

{'loss': 0.4565, 'grad_norm': 11.627565383911133, 'learning_rate': 1.2250325662179767e-05, 'epoch': 2.28}


 76%|███████▌  | 14420/18924 [7:45:02<2:16:44,  1.82s/it]

{'loss': 0.3109, 'grad_norm': 10.409390449523926, 'learning_rate': 1.2223187147199305e-05, 'epoch': 2.29}


 76%|███████▋  | 14430/18924 [7:45:19<2:13:22,  1.78s/it]

{'loss': 0.4623, 'grad_norm': 17.94752311706543, 'learning_rate': 1.2196048632218845e-05, 'epoch': 2.29}


 76%|███████▋  | 14440/18924 [7:45:38<2:17:07,  1.83s/it]

{'loss': 0.3328, 'grad_norm': 2.7512667179107666, 'learning_rate': 1.2168910117238386e-05, 'epoch': 2.29}


 76%|███████▋  | 14450/18924 [7:45:56<2:20:10,  1.88s/it]

{'loss': 0.5652, 'grad_norm': 10.84040641784668, 'learning_rate': 1.2141771602257924e-05, 'epoch': 2.29}


 76%|███████▋  | 14460/18924 [7:46:15<2:15:59,  1.83s/it]

{'loss': 0.3838, 'grad_norm': 8.273688316345215, 'learning_rate': 1.2114633087277465e-05, 'epoch': 2.29}


 76%|███████▋  | 14470/18924 [7:46:32<2:11:09,  1.77s/it]

{'loss': 0.4495, 'grad_norm': 12.466541290283203, 'learning_rate': 1.2087494572297005e-05, 'epoch': 2.29}


 77%|███████▋  | 14480/18924 [7:46:51<2:14:28,  1.82s/it]

{'loss': 0.4308, 'grad_norm': 7.898915767669678, 'learning_rate': 1.2060356057316543e-05, 'epoch': 2.3}


 77%|███████▋  | 14490/18924 [7:47:09<2:18:03,  1.87s/it]

{'loss': 0.5432, 'grad_norm': 7.933945655822754, 'learning_rate': 1.2033217542336084e-05, 'epoch': 2.3}


 77%|███████▋  | 14500/18924 [7:47:28<2:16:48,  1.86s/it]

{'loss': 0.4635, 'grad_norm': 10.060988426208496, 'learning_rate': 1.2006079027355625e-05, 'epoch': 2.3}


 77%|███████▋  | 14510/18924 [7:47:46<2:10:10,  1.77s/it]

{'loss': 0.3254, 'grad_norm': 6.147611141204834, 'learning_rate': 1.1978940512375162e-05, 'epoch': 2.3}


 77%|███████▋  | 14520/18924 [7:48:04<2:08:53,  1.76s/it]

{'loss': 0.3821, 'grad_norm': 2.353344440460205, 'learning_rate': 1.1951801997394703e-05, 'epoch': 2.3}


 77%|███████▋  | 14530/18924 [7:48:22<2:11:36,  1.80s/it]

{'loss': 0.3356, 'grad_norm': 6.216646671295166, 'learning_rate': 1.1924663482414244e-05, 'epoch': 2.3}


 77%|███████▋  | 14540/18924 [7:48:40<2:15:33,  1.86s/it]

{'loss': 0.3777, 'grad_norm': 10.449320793151855, 'learning_rate': 1.1897524967433783e-05, 'epoch': 2.31}


 77%|███████▋  | 14550/18924 [7:48:58<2:09:50,  1.78s/it]

{'loss': 0.4939, 'grad_norm': 7.808910846710205, 'learning_rate': 1.1870386452453322e-05, 'epoch': 2.31}


 77%|███████▋  | 14560/18924 [7:49:16<2:12:05,  1.82s/it]

{'loss': 0.4834, 'grad_norm': 10.353964805603027, 'learning_rate': 1.1843247937472863e-05, 'epoch': 2.31}


 77%|███████▋  | 14570/18924 [7:49:35<2:14:59,  1.86s/it]

{'loss': 0.3795, 'grad_norm': 6.644855976104736, 'learning_rate': 1.1816109422492402e-05, 'epoch': 2.31}


 77%|███████▋  | 14580/18924 [7:49:53<2:15:27,  1.87s/it]

{'loss': 0.4309, 'grad_norm': 20.342573165893555, 'learning_rate': 1.1788970907511941e-05, 'epoch': 2.31}


 77%|███████▋  | 14590/18924 [7:50:12<2:08:53,  1.78s/it]

{'loss': 0.2964, 'grad_norm': 9.079448699951172, 'learning_rate': 1.176183239253148e-05, 'epoch': 2.31}


 77%|███████▋  | 14600/18924 [7:50:29<2:08:40,  1.79s/it]

{'loss': 0.4293, 'grad_norm': 11.034600257873535, 'learning_rate': 1.1734693877551021e-05, 'epoch': 2.31}


 77%|███████▋  | 14610/18924 [7:50:48<2:12:54,  1.85s/it]

{'loss': 0.2324, 'grad_norm': 7.943446636199951, 'learning_rate': 1.170755536257056e-05, 'epoch': 2.32}


 77%|███████▋  | 14620/18924 [7:51:06<2:13:46,  1.86s/it]

{'loss': 0.3235, 'grad_norm': 4.958524703979492, 'learning_rate': 1.16804168475901e-05, 'epoch': 2.32}


 77%|███████▋  | 14630/18924 [7:51:25<2:08:48,  1.80s/it]

{'loss': 0.3718, 'grad_norm': 8.61065673828125, 'learning_rate': 1.165327833260964e-05, 'epoch': 2.32}


 77%|███████▋  | 14640/18924 [7:51:42<2:07:14,  1.78s/it]

{'loss': 0.4402, 'grad_norm': 8.160645484924316, 'learning_rate': 1.162613981762918e-05, 'epoch': 2.32}


 77%|███████▋  | 14650/18924 [7:52:00<2:09:56,  1.82s/it]

{'loss': 0.5797, 'grad_norm': 12.401853561401367, 'learning_rate': 1.1599001302648719e-05, 'epoch': 2.32}


 77%|███████▋  | 14660/18924 [7:52:19<2:11:57,  1.86s/it]

{'loss': 0.3229, 'grad_norm': 10.318117141723633, 'learning_rate': 1.157186278766826e-05, 'epoch': 2.32}


 78%|███████▊  | 14670/18924 [7:52:37<2:08:21,  1.81s/it]

{'loss': 0.2286, 'grad_norm': 5.211884498596191, 'learning_rate': 1.1544724272687799e-05, 'epoch': 2.33}


 78%|███████▊  | 14680/18924 [7:52:55<2:04:53,  1.77s/it]

{'loss': 0.4245, 'grad_norm': 10.38330078125, 'learning_rate': 1.1517585757707338e-05, 'epoch': 2.33}


 78%|███████▊  | 14690/18924 [7:53:13<2:08:22,  1.82s/it]

{'loss': 0.4455, 'grad_norm': 5.659548759460449, 'learning_rate': 1.1490447242726879e-05, 'epoch': 2.33}


 78%|███████▊  | 14700/18924 [7:53:32<2:11:12,  1.86s/it]

{'loss': 0.387, 'grad_norm': 10.056389808654785, 'learning_rate': 1.1463308727746418e-05, 'epoch': 2.33}


 78%|███████▊  | 14710/18924 [7:53:50<2:09:13,  1.84s/it]

{'loss': 0.4706, 'grad_norm': 14.290715217590332, 'learning_rate': 1.1436170212765957e-05, 'epoch': 2.33}


 78%|███████▊  | 14720/18924 [7:54:08<2:03:28,  1.76s/it]

{'loss': 0.5033, 'grad_norm': 11.219157218933105, 'learning_rate': 1.1409031697785498e-05, 'epoch': 2.33}


 78%|███████▊  | 14730/18924 [7:54:26<2:06:25,  1.81s/it]

{'loss': 0.4719, 'grad_norm': 9.89171314239502, 'learning_rate': 1.1381893182805037e-05, 'epoch': 2.34}


 78%|███████▊  | 14740/18924 [7:54:44<2:09:04,  1.85s/it]

{'loss': 0.4416, 'grad_norm': 13.732376098632812, 'learning_rate': 1.1354754667824577e-05, 'epoch': 2.34}


 78%|███████▊  | 14750/18924 [7:55:03<2:09:20,  1.86s/it]

{'loss': 0.4228, 'grad_norm': 10.8245849609375, 'learning_rate': 1.1327616152844117e-05, 'epoch': 2.34}


 78%|███████▊  | 14760/18924 [7:55:21<2:02:47,  1.77s/it]

{'loss': 0.4448, 'grad_norm': 12.455323219299316, 'learning_rate': 1.1300477637863657e-05, 'epoch': 2.34}


 78%|███████▊  | 14770/18924 [7:55:39<2:04:45,  1.80s/it]

{'loss': 0.5815, 'grad_norm': 12.318638801574707, 'learning_rate': 1.1273339122883196e-05, 'epoch': 2.34}


 78%|███████▊  | 14780/18924 [7:55:57<2:09:13,  1.87s/it]

{'loss': 0.3677, 'grad_norm': 5.547032833099365, 'learning_rate': 1.1246200607902737e-05, 'epoch': 2.34}


 78%|███████▊  | 14790/18924 [7:56:16<2:08:03,  1.86s/it]

{'loss': 0.2674, 'grad_norm': 2.163823366165161, 'learning_rate': 1.1219062092922276e-05, 'epoch': 2.34}


 78%|███████▊  | 14800/18924 [7:56:34<2:01:45,  1.77s/it]

{'loss': 0.3905, 'grad_norm': 8.140281677246094, 'learning_rate': 1.1191923577941815e-05, 'epoch': 2.35}


 78%|███████▊  | 14810/18924 [7:56:52<2:01:55,  1.78s/it]

{'loss': 0.3577, 'grad_norm': 8.409294128417969, 'learning_rate': 1.1164785062961356e-05, 'epoch': 2.35}


 78%|███████▊  | 14820/18924 [7:57:10<2:03:03,  1.80s/it]

{'loss': 0.5073, 'grad_norm': 15.411453247070312, 'learning_rate': 1.1137646547980895e-05, 'epoch': 2.35}


 78%|███████▊  | 14830/18924 [7:57:27<2:01:19,  1.78s/it]

{'loss': 0.3604, 'grad_norm': 13.122659683227539, 'learning_rate': 1.1110508033000434e-05, 'epoch': 2.35}


 78%|███████▊  | 14840/18924 [7:57:45<2:01:50,  1.79s/it]

{'loss': 0.4447, 'grad_norm': 8.407740592956543, 'learning_rate': 1.1083369518019975e-05, 'epoch': 2.35}


 78%|███████▊  | 14850/18924 [7:58:03<2:02:08,  1.80s/it]

{'loss': 0.5454, 'grad_norm': 5.748809814453125, 'learning_rate': 1.1056231003039514e-05, 'epoch': 2.35}


 79%|███████▊  | 14860/18924 [7:58:22<2:05:14,  1.85s/it]

{'loss': 0.4818, 'grad_norm': 12.338706970214844, 'learning_rate': 1.1029092488059053e-05, 'epoch': 2.36}


 79%|███████▊  | 14870/18924 [7:58:40<2:06:40,  1.87s/it]

{'loss': 0.4054, 'grad_norm': 7.250087261199951, 'learning_rate': 1.1001953973078594e-05, 'epoch': 2.36}


 79%|███████▊  | 14880/18924 [7:58:59<2:03:30,  1.83s/it]

{'loss': 0.3799, 'grad_norm': 8.543843269348145, 'learning_rate': 1.0974815458098133e-05, 'epoch': 2.36}


 79%|███████▊  | 14890/18924 [7:59:17<1:58:37,  1.76s/it]

{'loss': 0.5083, 'grad_norm': 9.452570915222168, 'learning_rate': 1.0947676943117672e-05, 'epoch': 2.36}


 79%|███████▊  | 14900/18924 [7:59:35<2:01:16,  1.81s/it]

{'loss': 0.5105, 'grad_norm': 8.487109184265137, 'learning_rate': 1.0920538428137213e-05, 'epoch': 2.36}


 79%|███████▉  | 14910/18924 [7:59:53<2:05:01,  1.87s/it]

{'loss': 0.3755, 'grad_norm': 2.2546918392181396, 'learning_rate': 1.0893399913156753e-05, 'epoch': 2.36}


 79%|███████▉  | 14920/18924 [8:00:12<2:03:51,  1.86s/it]

{'loss': 0.393, 'grad_norm': 5.3881754875183105, 'learning_rate': 1.0866261398176292e-05, 'epoch': 2.37}


 79%|███████▉  | 14930/18924 [8:00:29<1:56:53,  1.76s/it]

{'loss': 0.4125, 'grad_norm': 4.628158092498779, 'learning_rate': 1.0839122883195833e-05, 'epoch': 2.37}


 79%|███████▉  | 14940/18924 [8:00:47<1:56:58,  1.76s/it]

{'loss': 0.3686, 'grad_norm': 4.736345291137695, 'learning_rate': 1.0811984368215372e-05, 'epoch': 2.37}


 79%|███████▉  | 14950/18924 [8:01:05<1:59:04,  1.80s/it]

{'loss': 0.4209, 'grad_norm': 10.512795448303223, 'learning_rate': 1.0784845853234911e-05, 'epoch': 2.37}


 79%|███████▉  | 14960/18924 [8:01:23<2:03:02,  1.86s/it]

{'loss': 0.4358, 'grad_norm': 12.108916282653809, 'learning_rate': 1.0757707338254452e-05, 'epoch': 2.37}


 79%|███████▉  | 14970/18924 [8:01:41<1:57:18,  1.78s/it]

{'loss': 0.38, 'grad_norm': 11.058220863342285, 'learning_rate': 1.0730568823273991e-05, 'epoch': 2.37}


 79%|███████▉  | 14980/18924 [8:01:59<1:58:44,  1.81s/it]

{'loss': 0.42, 'grad_norm': 7.049782752990723, 'learning_rate': 1.0703430308293532e-05, 'epoch': 2.37}


 79%|███████▉  | 14990/18924 [8:02:18<2:01:21,  1.85s/it]

{'loss': 0.3593, 'grad_norm': 10.58396053314209, 'learning_rate': 1.0676291793313071e-05, 'epoch': 2.38}


 79%|███████▉  | 15000/18924 [8:02:37<2:02:16,  1.87s/it]

{'loss': 0.3949, 'grad_norm': 11.439301490783691, 'learning_rate': 1.064915327833261e-05, 'epoch': 2.38}


 79%|███████▉  | 15010/18924 [8:02:56<1:57:25,  1.80s/it]

{'loss': 0.4979, 'grad_norm': 12.609397888183594, 'learning_rate': 1.0622014763352151e-05, 'epoch': 2.38}


 79%|███████▉  | 15020/18924 [8:03:13<1:55:56,  1.78s/it]

{'loss': 0.4516, 'grad_norm': 11.498620986938477, 'learning_rate': 1.059487624837169e-05, 'epoch': 2.38}


 79%|███████▉  | 15030/18924 [8:03:31<1:59:07,  1.84s/it]

{'loss': 0.5014, 'grad_norm': 7.767269134521484, 'learning_rate': 1.056773773339123e-05, 'epoch': 2.38}


 79%|███████▉  | 15040/18924 [8:03:50<2:00:36,  1.86s/it]

{'loss': 0.2915, 'grad_norm': 8.842796325683594, 'learning_rate': 1.054059921841077e-05, 'epoch': 2.38}


 80%|███████▉  | 15050/18924 [8:04:08<1:56:47,  1.81s/it]

{'loss': 0.3949, 'grad_norm': 7.2140350341796875, 'learning_rate': 1.0513460703430308e-05, 'epoch': 2.39}


 80%|███████▉  | 15060/18924 [8:04:26<1:54:45,  1.78s/it]

{'loss': 0.3935, 'grad_norm': 11.195730209350586, 'learning_rate': 1.0486322188449849e-05, 'epoch': 2.39}


 80%|███████▉  | 15070/18924 [8:04:44<1:57:34,  1.83s/it]

{'loss': 0.4593, 'grad_norm': 11.646347045898438, 'learning_rate': 1.045918367346939e-05, 'epoch': 2.39}


 80%|███████▉  | 15080/18924 [8:05:03<1:58:41,  1.85s/it]

{'loss': 0.3685, 'grad_norm': 4.567422866821289, 'learning_rate': 1.0432045158488927e-05, 'epoch': 2.39}


 80%|███████▉  | 15090/18924 [8:05:21<1:56:12,  1.82s/it]

{'loss': 0.2953, 'grad_norm': 7.947259426116943, 'learning_rate': 1.0404906643508468e-05, 'epoch': 2.39}


 80%|███████▉  | 15100/18924 [8:05:39<1:52:36,  1.77s/it]

{'loss': 0.396, 'grad_norm': 11.03649616241455, 'learning_rate': 1.0377768128528009e-05, 'epoch': 2.39}


 80%|███████▉  | 15110/18924 [8:05:57<1:55:58,  1.82s/it]

{'loss': 0.5946, 'grad_norm': 9.658233642578125, 'learning_rate': 1.0350629613547546e-05, 'epoch': 2.4}


 80%|███████▉  | 15120/18924 [8:06:16<1:57:09,  1.85s/it]

{'loss': 0.3089, 'grad_norm': 11.649757385253906, 'learning_rate': 1.0323491098567087e-05, 'epoch': 2.4}


 80%|███████▉  | 15130/18924 [8:06:34<1:56:10,  1.84s/it]

{'loss': 0.318, 'grad_norm': 10.504985809326172, 'learning_rate': 1.0296352583586628e-05, 'epoch': 2.4}


 80%|████████  | 15140/18924 [8:06:52<1:51:55,  1.77s/it]

{'loss': 0.298, 'grad_norm': 8.328609466552734, 'learning_rate': 1.0269214068606165e-05, 'epoch': 2.4}


 80%|████████  | 15150/18924 [8:07:10<1:53:27,  1.80s/it]

{'loss': 0.4741, 'grad_norm': 5.6711039543151855, 'learning_rate': 1.0242075553625706e-05, 'epoch': 2.4}


 80%|████████  | 15160/18924 [8:07:28<1:56:11,  1.85s/it]

{'loss': 0.3758, 'grad_norm': 12.070560455322266, 'learning_rate': 1.0214937038645247e-05, 'epoch': 2.4}


 80%|████████  | 15170/18924 [8:07:47<1:55:37,  1.85s/it]

{'loss': 0.5206, 'grad_norm': 10.46557903289795, 'learning_rate': 1.0187798523664784e-05, 'epoch': 2.4}


 80%|████████  | 15180/18924 [8:08:05<1:50:13,  1.77s/it]

{'loss': 0.3141, 'grad_norm': 4.035473823547363, 'learning_rate': 1.0160660008684325e-05, 'epoch': 2.41}


 80%|████████  | 15190/18924 [8:08:23<1:51:03,  1.78s/it]

{'loss': 0.4807, 'grad_norm': 7.3685455322265625, 'learning_rate': 1.0133521493703864e-05, 'epoch': 2.41}


 80%|████████  | 15200/18924 [8:08:41<1:54:15,  1.84s/it]

{'loss': 0.3592, 'grad_norm': 3.479032039642334, 'learning_rate': 1.0106382978723404e-05, 'epoch': 2.41}


 80%|████████  | 15210/18924 [8:08:59<1:50:25,  1.78s/it]

{'loss': 0.4105, 'grad_norm': 15.382875442504883, 'learning_rate': 1.0079244463742944e-05, 'epoch': 2.41}


 80%|████████  | 15220/18924 [8:09:17<1:49:32,  1.77s/it]

{'loss': 0.2901, 'grad_norm': 4.3414225578308105, 'learning_rate': 1.0052105948762484e-05, 'epoch': 2.41}


 80%|████████  | 15230/18924 [8:09:35<1:49:25,  1.78s/it]

{'loss': 0.3125, 'grad_norm': 10.659360885620117, 'learning_rate': 1.0024967433782025e-05, 'epoch': 2.41}


 81%|████████  | 15240/18924 [8:09:53<1:53:09,  1.84s/it]

{'loss': 0.3773, 'grad_norm': 10.931639671325684, 'learning_rate': 9.997828918801564e-06, 'epoch': 2.42}


 81%|████████  | 15250/18924 [8:10:12<1:53:49,  1.86s/it]

{'loss': 0.2634, 'grad_norm': 9.164518356323242, 'learning_rate': 9.970690403821103e-06, 'epoch': 2.42}


 81%|████████  | 15260/18924 [8:10:30<1:49:52,  1.80s/it]

{'loss': 0.4702, 'grad_norm': 14.470622062683105, 'learning_rate': 9.943551888840644e-06, 'epoch': 2.42}


 81%|████████  | 15270/18924 [8:10:47<1:48:02,  1.77s/it]

{'loss': 0.4432, 'grad_norm': 12.75557804107666, 'learning_rate': 9.916413373860183e-06, 'epoch': 2.42}


 81%|████████  | 15280/18924 [8:11:06<1:51:03,  1.83s/it]

{'loss': 0.3891, 'grad_norm': 0.5794095396995544, 'learning_rate': 9.889274858879722e-06, 'epoch': 2.42}


 81%|████████  | 15290/18924 [8:11:24<1:52:07,  1.85s/it]

{'loss': 0.5047, 'grad_norm': 3.340280055999756, 'learning_rate': 9.862136343899263e-06, 'epoch': 2.42}


 81%|████████  | 15300/18924 [8:11:43<1:50:20,  1.83s/it]

{'loss': 0.3831, 'grad_norm': 11.526440620422363, 'learning_rate': 9.834997828918802e-06, 'epoch': 2.43}


 81%|████████  | 15310/18924 [8:12:00<1:46:27,  1.77s/it]

{'loss': 0.4711, 'grad_norm': 7.145369529724121, 'learning_rate': 9.807859313938341e-06, 'epoch': 2.43}


 81%|████████  | 15320/18924 [8:12:18<1:48:57,  1.81s/it]

{'loss': 0.4051, 'grad_norm': 6.487504482269287, 'learning_rate': 9.780720798957882e-06, 'epoch': 2.43}


 81%|████████  | 15330/18924 [8:12:37<1:51:35,  1.86s/it]

{'loss': 0.41, 'grad_norm': 11.792193412780762, 'learning_rate': 9.753582283977421e-06, 'epoch': 2.43}


 81%|████████  | 15340/18924 [8:12:56<1:49:53,  1.84s/it]

{'loss': 0.5231, 'grad_norm': 17.1074275970459, 'learning_rate': 9.72644376899696e-06, 'epoch': 2.43}


 81%|████████  | 15350/18924 [8:13:13<1:45:22,  1.77s/it]

{'loss': 0.3435, 'grad_norm': 8.678940773010254, 'learning_rate': 9.699305254016501e-06, 'epoch': 2.43}


 81%|████████  | 15360/18924 [8:13:31<1:46:51,  1.80s/it]

{'loss': 0.4879, 'grad_norm': 7.6211748123168945, 'learning_rate': 9.67216673903604e-06, 'epoch': 2.44}


 81%|████████  | 15370/18924 [8:13:50<1:48:54,  1.84s/it]

{'loss': 0.5317, 'grad_norm': 10.524205207824707, 'learning_rate': 9.64502822405558e-06, 'epoch': 2.44}


 81%|████████▏ | 15380/18924 [8:14:08<1:46:20,  1.80s/it]

{'loss': 0.4216, 'grad_norm': 5.110313415527344, 'learning_rate': 9.61788970907512e-06, 'epoch': 2.44}


 81%|████████▏ | 15390/18924 [8:14:25<1:43:30,  1.76s/it]

{'loss': 0.4031, 'grad_norm': 8.612984657287598, 'learning_rate': 9.59075119409466e-06, 'epoch': 2.44}


 81%|████████▏ | 15400/18924 [8:14:43<1:43:39,  1.76s/it]

{'loss': 0.4316, 'grad_norm': 13.318014144897461, 'learning_rate': 9.563612679114199e-06, 'epoch': 2.44}


 81%|████████▏ | 15410/18924 [8:15:01<1:48:21,  1.85s/it]

{'loss': 0.3297, 'grad_norm': 12.222122192382812, 'learning_rate': 9.53647416413374e-06, 'epoch': 2.44}


 81%|████████▏ | 15420/18924 [8:15:20<1:49:19,  1.87s/it]

{'loss': 0.4241, 'grad_norm': 6.4722208976745605, 'learning_rate': 9.509335649153279e-06, 'epoch': 2.44}


 82%|████████▏ | 15430/18924 [8:15:39<1:48:14,  1.86s/it]

{'loss': 0.3078, 'grad_norm': 7.680759906768799, 'learning_rate': 9.482197134172818e-06, 'epoch': 2.45}


 82%|████████▏ | 15440/18924 [8:15:57<1:46:10,  1.83s/it]

{'loss': 0.3464, 'grad_norm': 12.355117797851562, 'learning_rate': 9.455058619192359e-06, 'epoch': 2.45}


 82%|████████▏ | 15450/18924 [8:16:15<1:45:53,  1.83s/it]

{'loss': 0.4777, 'grad_norm': 20.054523468017578, 'learning_rate': 9.427920104211898e-06, 'epoch': 2.45}


 82%|████████▏ | 15460/18924 [8:16:34<1:45:36,  1.83s/it]

{'loss': 0.2566, 'grad_norm': 10.326892852783203, 'learning_rate': 9.400781589231437e-06, 'epoch': 2.45}


 82%|████████▏ | 15470/18924 [8:16:52<1:45:42,  1.84s/it]

{'loss': 0.5862, 'grad_norm': 18.44089698791504, 'learning_rate': 9.373643074250978e-06, 'epoch': 2.45}


 82%|████████▏ | 15480/18924 [8:17:10<1:45:08,  1.83s/it]

{'loss': 0.4463, 'grad_norm': 10.94061279296875, 'learning_rate': 9.346504559270517e-06, 'epoch': 2.45}


 82%|████████▏ | 15490/18924 [8:17:28<1:43:58,  1.82s/it]

{'loss': 0.5119, 'grad_norm': 15.33207893371582, 'learning_rate': 9.319366044290056e-06, 'epoch': 2.46}


 82%|████████▏ | 15500/18924 [8:17:47<1:44:18,  1.83s/it]

{'loss': 0.3267, 'grad_norm': 8.493790626525879, 'learning_rate': 9.292227529309597e-06, 'epoch': 2.46}


 82%|████████▏ | 15510/18924 [8:18:06<1:44:36,  1.84s/it]

{'loss': 0.3845, 'grad_norm': 12.828141212463379, 'learning_rate': 9.265089014329136e-06, 'epoch': 2.46}


 82%|████████▏ | 15520/18924 [8:18:24<1:43:49,  1.83s/it]

{'loss': 0.3291, 'grad_norm': 5.639601230621338, 'learning_rate': 9.237950499348676e-06, 'epoch': 2.46}


 82%|████████▏ | 15530/18924 [8:18:42<1:42:52,  1.82s/it]

{'loss': 0.4028, 'grad_norm': 12.73695182800293, 'learning_rate': 9.210811984368216e-06, 'epoch': 2.46}


 82%|████████▏ | 15540/18924 [8:19:01<1:42:48,  1.82s/it]

{'loss': 0.4656, 'grad_norm': 4.697173595428467, 'learning_rate': 9.183673469387756e-06, 'epoch': 2.46}


 82%|████████▏ | 15550/18924 [8:19:19<1:42:06,  1.82s/it]

{'loss': 0.4998, 'grad_norm': 18.659120559692383, 'learning_rate': 9.156534954407295e-06, 'epoch': 2.47}


 82%|████████▏ | 15560/18924 [8:19:37<1:41:47,  1.82s/it]

{'loss': 0.6097, 'grad_norm': 13.134498596191406, 'learning_rate': 9.129396439426836e-06, 'epoch': 2.47}


 82%|████████▏ | 15570/18924 [8:19:55<1:41:33,  1.82s/it]

{'loss': 0.4266, 'grad_norm': 8.199503898620605, 'learning_rate': 9.102257924446375e-06, 'epoch': 2.47}


 82%|████████▏ | 15580/18924 [8:20:13<1:41:06,  1.81s/it]

{'loss': 0.4813, 'grad_norm': 11.670029640197754, 'learning_rate': 9.075119409465914e-06, 'epoch': 2.47}


 82%|████████▏ | 15590/18924 [8:20:32<1:40:56,  1.82s/it]

{'loss': 0.5134, 'grad_norm': 9.330589294433594, 'learning_rate': 9.047980894485455e-06, 'epoch': 2.47}


 82%|████████▏ | 15600/18924 [8:20:50<1:40:47,  1.82s/it]

{'loss': 0.3904, 'grad_norm': 11.544214248657227, 'learning_rate': 9.020842379504994e-06, 'epoch': 2.47}


 82%|████████▏ | 15610/18924 [8:21:08<1:40:17,  1.82s/it]

{'loss': 0.4758, 'grad_norm': 8.103041648864746, 'learning_rate': 8.993703864524533e-06, 'epoch': 2.47}


 83%|████████▎ | 15620/18924 [8:21:26<1:40:02,  1.82s/it]

{'loss': 0.2921, 'grad_norm': 3.7592995166778564, 'learning_rate': 8.966565349544074e-06, 'epoch': 2.48}


 83%|████████▎ | 15630/18924 [8:21:44<1:39:55,  1.82s/it]

{'loss': 0.5579, 'grad_norm': 12.942571640014648, 'learning_rate': 8.939426834563613e-06, 'epoch': 2.48}


 83%|████████▎ | 15640/18924 [8:22:02<1:39:26,  1.82s/it]

{'loss': 0.4445, 'grad_norm': 3.9984376430511475, 'learning_rate': 8.912288319583152e-06, 'epoch': 2.48}


 83%|████████▎ | 15650/18924 [8:22:21<1:39:20,  1.82s/it]

{'loss': 0.3555, 'grad_norm': 16.11493682861328, 'learning_rate': 8.885149804602692e-06, 'epoch': 2.48}


 83%|████████▎ | 15660/18924 [8:22:39<1:39:12,  1.82s/it]

{'loss': 0.4861, 'grad_norm': 6.7328782081604, 'learning_rate': 8.858011289622232e-06, 'epoch': 2.48}


 83%|████████▎ | 15670/18924 [8:22:57<1:38:15,  1.81s/it]

{'loss': 0.4846, 'grad_norm': 11.276899337768555, 'learning_rate': 8.830872774641773e-06, 'epoch': 2.48}


 83%|████████▎ | 15680/18924 [8:23:15<1:38:50,  1.83s/it]

{'loss': 0.3679, 'grad_norm': 11.790067672729492, 'learning_rate': 8.80373425966131e-06, 'epoch': 2.49}


 83%|████████▎ | 15690/18924 [8:23:34<1:38:59,  1.84s/it]

{'loss': 0.5234, 'grad_norm': 6.187506675720215, 'learning_rate': 8.776595744680852e-06, 'epoch': 2.49}


 83%|████████▎ | 15700/18924 [8:23:52<1:36:56,  1.80s/it]

{'loss': 0.5686, 'grad_norm': 17.13857078552246, 'learning_rate': 8.749457229700392e-06, 'epoch': 2.49}


 83%|████████▎ | 15710/18924 [8:24:10<1:37:16,  1.82s/it]

{'loss': 0.3849, 'grad_norm': 15.52028751373291, 'learning_rate': 8.72231871471993e-06, 'epoch': 2.49}


 83%|████████▎ | 15720/18924 [8:24:28<1:37:05,  1.82s/it]

{'loss': 0.4687, 'grad_norm': 13.035933494567871, 'learning_rate': 8.69518019973947e-06, 'epoch': 2.49}


 83%|████████▎ | 15730/18924 [8:24:46<1:38:04,  1.84s/it]

{'loss': 0.4112, 'grad_norm': 6.526691436767578, 'learning_rate': 8.668041684759012e-06, 'epoch': 2.49}


 83%|████████▎ | 15740/18924 [8:25:05<1:36:52,  1.83s/it]

{'loss': 0.6073, 'grad_norm': 15.10181713104248, 'learning_rate': 8.64090316977855e-06, 'epoch': 2.5}


 83%|████████▎ | 15750/18924 [8:25:23<1:36:16,  1.82s/it]

{'loss': 0.526, 'grad_norm': 9.971213340759277, 'learning_rate': 8.61376465479809e-06, 'epoch': 2.5}


 83%|████████▎ | 15760/18924 [8:25:41<1:35:24,  1.81s/it]

{'loss': 0.3548, 'grad_norm': 16.886592864990234, 'learning_rate': 8.586626139817631e-06, 'epoch': 2.5}


 83%|████████▎ | 15770/18924 [8:25:59<1:34:11,  1.79s/it]

{'loss': 0.4584, 'grad_norm': 11.635787963867188, 'learning_rate': 8.559487624837168e-06, 'epoch': 2.5}


 83%|████████▎ | 15780/18924 [8:26:17<1:34:36,  1.81s/it]

{'loss': 0.3881, 'grad_norm': 7.749602794647217, 'learning_rate': 8.53234910985671e-06, 'epoch': 2.5}


 83%|████████▎ | 15790/18924 [8:26:35<1:34:14,  1.80s/it]

{'loss': 0.4648, 'grad_norm': 10.193838119506836, 'learning_rate': 8.505210594876248e-06, 'epoch': 2.5}


 83%|████████▎ | 15800/18924 [8:26:53<1:35:05,  1.83s/it]

{'loss': 0.4297, 'grad_norm': 6.099392890930176, 'learning_rate': 8.478072079895788e-06, 'epoch': 2.5}


 84%|████████▎ | 15810/18924 [8:27:12<1:35:07,  1.83s/it]

{'loss': 0.5067, 'grad_norm': 15.808562278747559, 'learning_rate': 8.450933564915328e-06, 'epoch': 2.51}


 84%|████████▎ | 15820/18924 [8:27:30<1:34:08,  1.82s/it]

{'loss': 0.46, 'grad_norm': 7.435504913330078, 'learning_rate': 8.423795049934868e-06, 'epoch': 2.51}


 84%|████████▎ | 15830/18924 [8:27:48<1:33:58,  1.82s/it]

{'loss': 0.4143, 'grad_norm': 11.86209487915039, 'learning_rate': 8.396656534954407e-06, 'epoch': 2.51}


 84%|████████▎ | 15840/18924 [8:28:06<1:33:25,  1.82s/it]

{'loss': 0.4219, 'grad_norm': 9.93260669708252, 'learning_rate': 8.369518019973948e-06, 'epoch': 2.51}


 84%|████████▍ | 15850/18924 [8:28:24<1:32:50,  1.81s/it]

{'loss': 0.4997, 'grad_norm': 6.909438133239746, 'learning_rate': 8.342379504993487e-06, 'epoch': 2.51}


 84%|████████▍ | 15860/18924 [8:28:43<1:32:55,  1.82s/it]

{'loss': 0.3931, 'grad_norm': 21.4149227142334, 'learning_rate': 8.315240990013026e-06, 'epoch': 2.51}


 84%|████████▍ | 15870/18924 [8:29:01<1:32:24,  1.82s/it]

{'loss': 0.4661, 'grad_norm': 17.80022430419922, 'learning_rate': 8.288102475032567e-06, 'epoch': 2.52}


 84%|████████▍ | 15880/18924 [8:29:19<1:31:52,  1.81s/it]

{'loss': 0.4835, 'grad_norm': 6.3733439445495605, 'learning_rate': 8.260963960052106e-06, 'epoch': 2.52}


 84%|████████▍ | 15890/18924 [8:29:37<1:32:43,  1.83s/it]

{'loss': 0.5747, 'grad_norm': 16.61292266845703, 'learning_rate': 8.233825445071645e-06, 'epoch': 2.52}


 84%|████████▍ | 15900/18924 [8:29:55<1:32:12,  1.83s/it]

{'loss': 0.521, 'grad_norm': 5.468445777893066, 'learning_rate': 8.206686930091186e-06, 'epoch': 2.52}


 84%|████████▍ | 15910/18924 [8:30:14<1:31:27,  1.82s/it]

{'loss': 0.4283, 'grad_norm': 13.919201850891113, 'learning_rate': 8.179548415110725e-06, 'epoch': 2.52}


 84%|████████▍ | 15920/18924 [8:30:32<1:30:57,  1.82s/it]

{'loss': 0.3889, 'grad_norm': 12.817766189575195, 'learning_rate': 8.152409900130266e-06, 'epoch': 2.52}


 84%|████████▍ | 15930/18924 [8:30:50<1:31:10,  1.83s/it]

{'loss': 0.3167, 'grad_norm': 7.2557783126831055, 'learning_rate': 8.125271385149805e-06, 'epoch': 2.53}


 84%|████████▍ | 15940/18924 [8:31:08<1:27:45,  1.76s/it]

{'loss': 0.4401, 'grad_norm': 17.971054077148438, 'learning_rate': 8.098132870169344e-06, 'epoch': 2.53}


 84%|████████▍ | 15950/18924 [8:31:26<1:27:25,  1.76s/it]

{'loss': 0.4187, 'grad_norm': 8.83853816986084, 'learning_rate': 8.070994355188885e-06, 'epoch': 2.53}


 84%|████████▍ | 15960/18924 [8:31:44<1:29:50,  1.82s/it]

{'loss': 0.4477, 'grad_norm': 9.451213836669922, 'learning_rate': 8.043855840208424e-06, 'epoch': 2.53}


 84%|████████▍ | 15970/18924 [8:32:02<1:31:17,  1.85s/it]

{'loss': 0.4546, 'grad_norm': 9.29249382019043, 'learning_rate': 8.016717325227964e-06, 'epoch': 2.53}


 84%|████████▍ | 15980/18924 [8:32:20<1:27:51,  1.79s/it]

{'loss': 0.4735, 'grad_norm': 19.302209854125977, 'learning_rate': 7.989578810247504e-06, 'epoch': 2.53}


 84%|████████▍ | 15990/18924 [8:32:38<1:26:11,  1.76s/it]

{'loss': 0.4266, 'grad_norm': 12.7328462600708, 'learning_rate': 7.962440295267044e-06, 'epoch': 2.53}


 85%|████████▍ | 16000/18924 [8:32:56<1:27:57,  1.80s/it]

{'loss': 0.3427, 'grad_norm': 10.556766510009766, 'learning_rate': 7.935301780286583e-06, 'epoch': 2.54}


 85%|████████▍ | 16010/18924 [8:33:15<1:28:08,  1.81s/it]

{'loss': 0.3084, 'grad_norm': 16.636884689331055, 'learning_rate': 7.908163265306124e-06, 'epoch': 2.54}


 85%|████████▍ | 16020/18924 [8:33:33<1:25:49,  1.77s/it]

{'loss': 0.4711, 'grad_norm': 14.018489837646484, 'learning_rate': 7.881024750325663e-06, 'epoch': 2.54}


 85%|████████▍ | 16030/18924 [8:33:50<1:25:14,  1.77s/it]

{'loss': 0.4103, 'grad_norm': 15.552180290222168, 'learning_rate': 7.853886235345202e-06, 'epoch': 2.54}


 85%|████████▍ | 16040/18924 [8:34:08<1:26:47,  1.81s/it]

{'loss': 0.3374, 'grad_norm': 5.816899299621582, 'learning_rate': 7.826747720364743e-06, 'epoch': 2.54}


 85%|████████▍ | 16050/18924 [8:34:27<1:28:24,  1.85s/it]

{'loss': 0.3138, 'grad_norm': 8.402480125427246, 'learning_rate': 7.799609205384282e-06, 'epoch': 2.54}


 85%|████████▍ | 16060/18924 [8:34:45<1:28:47,  1.86s/it]

{'loss': 0.5124, 'grad_norm': 14.29540729522705, 'learning_rate': 7.772470690403821e-06, 'epoch': 2.55}


 85%|████████▍ | 16070/18924 [8:35:03<1:24:02,  1.77s/it]

{'loss': 0.452, 'grad_norm': 5.209953308105469, 'learning_rate': 7.745332175423362e-06, 'epoch': 2.55}


 85%|████████▍ | 16080/18924 [8:35:21<1:24:48,  1.79s/it]

{'loss': 0.4335, 'grad_norm': 11.478312492370605, 'learning_rate': 7.7181936604429e-06, 'epoch': 2.55}


 85%|████████▌ | 16090/18924 [8:35:39<1:26:22,  1.83s/it]

{'loss': 0.47, 'grad_norm': 3.2385191917419434, 'learning_rate': 7.69105514546244e-06, 'epoch': 2.55}


 85%|████████▌ | 16100/18924 [8:35:58<1:27:21,  1.86s/it]

{'loss': 0.4925, 'grad_norm': 10.896894454956055, 'learning_rate': 7.663916630481981e-06, 'epoch': 2.55}


 85%|████████▌ | 16110/18924 [8:36:16<1:26:03,  1.83s/it]

{'loss': 0.469, 'grad_norm': 12.075620651245117, 'learning_rate': 7.636778115501519e-06, 'epoch': 2.55}


 85%|████████▌ | 16120/18924 [8:36:34<1:24:55,  1.82s/it]

{'loss': 0.406, 'grad_norm': 11.553644180297852, 'learning_rate': 7.6096396005210595e-06, 'epoch': 2.56}


 85%|████████▌ | 16130/18924 [8:36:52<1:22:50,  1.78s/it]

{'loss': 0.409, 'grad_norm': 4.083863258361816, 'learning_rate': 7.5825010855405996e-06, 'epoch': 2.56}


 85%|████████▌ | 16140/18924 [8:37:10<1:22:35,  1.78s/it]

{'loss': 0.4933, 'grad_norm': 11.562623023986816, 'learning_rate': 7.55536257056014e-06, 'epoch': 2.56}


 85%|████████▌ | 16150/18924 [8:37:28<1:23:13,  1.80s/it]

{'loss': 0.2674, 'grad_norm': 18.102746963500977, 'learning_rate': 7.528224055579679e-06, 'epoch': 2.56}


 85%|████████▌ | 16160/18924 [8:37:46<1:24:34,  1.84s/it]

{'loss': 0.5196, 'grad_norm': 18.504484176635742, 'learning_rate': 7.501085540599219e-06, 'epoch': 2.56}


 85%|████████▌ | 16170/18924 [8:38:05<1:25:08,  1.86s/it]

{'loss': 0.3003, 'grad_norm': 11.790641784667969, 'learning_rate': 7.473947025618759e-06, 'epoch': 2.56}


 85%|████████▌ | 16180/18924 [8:38:23<1:23:47,  1.83s/it]

{'loss': 0.5494, 'grad_norm': 8.758390426635742, 'learning_rate': 7.446808510638298e-06, 'epoch': 2.56}


 86%|████████▌ | 16190/18924 [8:38:41<1:21:14,  1.78s/it]

{'loss': 0.3551, 'grad_norm': 10.364519119262695, 'learning_rate': 7.419669995657838e-06, 'epoch': 2.57}


 86%|████████▌ | 16200/18924 [8:38:59<1:20:22,  1.77s/it]

{'loss': 0.4868, 'grad_norm': 8.327863693237305, 'learning_rate': 7.392531480677378e-06, 'epoch': 2.57}


 86%|████████▌ | 16210/18924 [8:39:16<1:20:21,  1.78s/it]

{'loss': 0.524, 'grad_norm': 12.084965705871582, 'learning_rate': 7.365392965696917e-06, 'epoch': 2.57}


 86%|████████▌ | 16220/18924 [8:39:34<1:21:28,  1.81s/it]

{'loss': 0.3969, 'grad_norm': 11.779545783996582, 'learning_rate': 7.338254450716457e-06, 'epoch': 2.57}


 86%|████████▌ | 16230/18924 [8:39:53<1:22:21,  1.83s/it]

{'loss': 0.2965, 'grad_norm': 3.6364123821258545, 'learning_rate': 7.311115935735997e-06, 'epoch': 2.57}


 86%|████████▌ | 16240/18924 [8:40:11<1:22:26,  1.84s/it]

{'loss': 0.4529, 'grad_norm': 6.453751564025879, 'learning_rate': 7.283977420755536e-06, 'epoch': 2.57}


 86%|████████▌ | 16250/18924 [8:40:30<1:22:27,  1.85s/it]

{'loss': 0.3321, 'grad_norm': 11.2767972946167, 'learning_rate': 7.256838905775076e-06, 'epoch': 2.58}


 86%|████████▌ | 16260/18924 [8:40:48<1:21:09,  1.83s/it]

{'loss': 0.3246, 'grad_norm': 13.126953125, 'learning_rate': 7.229700390794616e-06, 'epoch': 2.58}


 86%|████████▌ | 16270/18924 [8:41:06<1:21:15,  1.84s/it]

{'loss': 0.4228, 'grad_norm': 9.699968338012695, 'learning_rate': 7.2025618758141555e-06, 'epoch': 2.58}


 86%|████████▌ | 16280/18924 [8:41:25<1:20:12,  1.82s/it]

{'loss': 0.3986, 'grad_norm': 21.28714942932129, 'learning_rate': 7.1754233608336955e-06, 'epoch': 2.58}


 86%|████████▌ | 16290/18924 [8:41:43<1:19:56,  1.82s/it]

{'loss': 0.3748, 'grad_norm': 11.559149742126465, 'learning_rate': 7.1482848458532356e-06, 'epoch': 2.58}


 86%|████████▌ | 16300/18924 [8:42:01<1:20:07,  1.83s/it]

{'loss': 0.4748, 'grad_norm': 10.219162940979004, 'learning_rate': 7.121146330872775e-06, 'epoch': 2.58}


 86%|████████▌ | 16310/18924 [8:42:20<1:19:36,  1.83s/it]

{'loss': 0.3048, 'grad_norm': 5.871389865875244, 'learning_rate': 7.094007815892315e-06, 'epoch': 2.59}


 86%|████████▌ | 16320/18924 [8:42:38<1:19:26,  1.83s/it]

{'loss': 0.3574, 'grad_norm': 17.131759643554688, 'learning_rate': 7.066869300911855e-06, 'epoch': 2.59}


 86%|████████▋ | 16330/18924 [8:42:56<1:18:55,  1.83s/it]

{'loss': 0.5322, 'grad_norm': 3.4731016159057617, 'learning_rate': 7.039730785931394e-06, 'epoch': 2.59}


 86%|████████▋ | 16340/18924 [8:43:15<1:18:48,  1.83s/it]

{'loss': 0.4761, 'grad_norm': 3.432556390762329, 'learning_rate': 7.012592270950934e-06, 'epoch': 2.59}


 86%|████████▋ | 16350/18924 [8:43:33<1:17:31,  1.81s/it]

{'loss': 0.4518, 'grad_norm': 10.102805137634277, 'learning_rate': 6.985453755970474e-06, 'epoch': 2.59}


 86%|████████▋ | 16360/18924 [8:43:51<1:17:35,  1.82s/it]

{'loss': 0.4695, 'grad_norm': 14.974574089050293, 'learning_rate': 6.958315240990014e-06, 'epoch': 2.59}


 87%|████████▋ | 16370/18924 [8:44:09<1:17:42,  1.83s/it]

{'loss': 0.3693, 'grad_norm': 8.000216484069824, 'learning_rate': 6.931176726009553e-06, 'epoch': 2.6}


 87%|████████▋ | 16380/18924 [8:44:27<1:17:17,  1.82s/it]

{'loss': 0.388, 'grad_norm': 2.7985756397247314, 'learning_rate': 6.904038211029093e-06, 'epoch': 2.6}


 87%|████████▋ | 16390/18924 [8:44:46<1:17:30,  1.84s/it]

{'loss': 0.4232, 'grad_norm': 10.339272499084473, 'learning_rate': 6.876899696048633e-06, 'epoch': 2.6}


 87%|████████▋ | 16400/18924 [8:45:04<1:16:40,  1.82s/it]

{'loss': 0.2851, 'grad_norm': 15.473495483398438, 'learning_rate': 6.849761181068172e-06, 'epoch': 2.6}


 87%|████████▋ | 16410/18924 [8:45:22<1:16:16,  1.82s/it]

{'loss': 0.6162, 'grad_norm': 15.9243745803833, 'learning_rate': 6.822622666087712e-06, 'epoch': 2.6}


 87%|████████▋ | 16420/18924 [8:45:41<1:16:07,  1.82s/it]

{'loss': 0.4626, 'grad_norm': 10.46176528930664, 'learning_rate': 6.795484151107252e-06, 'epoch': 2.6}


 87%|████████▋ | 16430/18924 [8:45:59<1:15:50,  1.82s/it]

{'loss': 0.5311, 'grad_norm': 10.970597267150879, 'learning_rate': 6.768345636126791e-06, 'epoch': 2.6}


 87%|████████▋ | 16440/18924 [8:46:17<1:15:37,  1.83s/it]

{'loss': 0.4154, 'grad_norm': 8.617094993591309, 'learning_rate': 6.7412071211463315e-06, 'epoch': 2.61}


 87%|████████▋ | 16450/18924 [8:46:35<1:15:27,  1.83s/it]

{'loss': 0.418, 'grad_norm': 7.627892017364502, 'learning_rate': 6.7140686061658716e-06, 'epoch': 2.61}


 87%|████████▋ | 16460/18924 [8:46:54<1:15:24,  1.84s/it]

{'loss': 0.5746, 'grad_norm': 18.562246322631836, 'learning_rate': 6.68693009118541e-06, 'epoch': 2.61}


 87%|████████▋ | 16470/18924 [8:47:12<1:14:36,  1.82s/it]

{'loss': 0.5264, 'grad_norm': 8.532742500305176, 'learning_rate': 6.659791576204951e-06, 'epoch': 2.61}


 87%|████████▋ | 16480/18924 [8:47:30<1:14:03,  1.82s/it]

{'loss': 0.3886, 'grad_norm': 18.568222045898438, 'learning_rate': 6.632653061224491e-06, 'epoch': 2.61}


 87%|████████▋ | 16490/18924 [8:47:48<1:13:11,  1.80s/it]

{'loss': 0.3495, 'grad_norm': 8.66898250579834, 'learning_rate': 6.605514546244029e-06, 'epoch': 2.61}


 87%|████████▋ | 16500/18924 [8:48:06<1:12:50,  1.80s/it]

{'loss': 0.349, 'grad_norm': 2.2289576530456543, 'learning_rate': 6.578376031263569e-06, 'epoch': 2.62}


 87%|████████▋ | 16510/18924 [8:48:25<1:13:28,  1.83s/it]

{'loss': 0.5089, 'grad_norm': 11.373425483703613, 'learning_rate': 6.55123751628311e-06, 'epoch': 2.62}


 87%|████████▋ | 16520/18924 [8:48:44<1:13:35,  1.84s/it]

{'loss': 0.4598, 'grad_norm': 5.6104559898376465, 'learning_rate': 6.524099001302648e-06, 'epoch': 2.62}


 87%|████████▋ | 16530/18924 [8:49:02<1:12:35,  1.82s/it]

{'loss': 0.4274, 'grad_norm': 8.763233184814453, 'learning_rate': 6.496960486322188e-06, 'epoch': 2.62}


 87%|████████▋ | 16540/18924 [8:49:20<1:12:26,  1.82s/it]

{'loss': 0.4705, 'grad_norm': 10.934675216674805, 'learning_rate': 6.469821971341729e-06, 'epoch': 2.62}


 87%|████████▋ | 16550/18924 [8:49:38<1:12:06,  1.82s/it]

{'loss': 0.379, 'grad_norm': 11.100253105163574, 'learning_rate': 6.4426834563612675e-06, 'epoch': 2.62}


 88%|████████▊ | 16560/18924 [8:49:57<1:11:33,  1.82s/it]

{'loss': 0.2698, 'grad_norm': 6.858494758605957, 'learning_rate': 6.4155449413808075e-06, 'epoch': 2.63}


 88%|████████▊ | 16570/18924 [8:50:15<1:10:38,  1.80s/it]

{'loss': 0.3109, 'grad_norm': 33.56159973144531, 'learning_rate': 6.388406426400348e-06, 'epoch': 2.63}


 88%|████████▊ | 16580/18924 [8:50:33<1:11:02,  1.82s/it]

{'loss': 0.5153, 'grad_norm': 12.010029792785645, 'learning_rate': 6.361267911419888e-06, 'epoch': 2.63}


 88%|████████▊ | 16590/18924 [8:50:51<1:12:15,  1.86s/it]

{'loss': 0.4569, 'grad_norm': 9.792797088623047, 'learning_rate': 6.334129396439427e-06, 'epoch': 2.63}


 88%|████████▊ | 16600/18924 [8:51:10<1:10:53,  1.83s/it]

{'loss': 0.3766, 'grad_norm': 8.646239280700684, 'learning_rate': 6.306990881458967e-06, 'epoch': 2.63}


 88%|████████▊ | 16610/18924 [8:51:28<1:10:25,  1.83s/it]

{'loss': 0.4618, 'grad_norm': 11.86951732635498, 'learning_rate': 6.2798523664785076e-06, 'epoch': 2.63}


 88%|████████▊ | 16620/18924 [8:51:46<1:10:04,  1.82s/it]

{'loss': 0.4421, 'grad_norm': 7.6870269775390625, 'learning_rate': 6.252713851498046e-06, 'epoch': 2.63}


 88%|████████▊ | 16630/18924 [8:52:05<1:09:55,  1.83s/it]

{'loss': 0.4562, 'grad_norm': 6.388749122619629, 'learning_rate': 6.225575336517586e-06, 'epoch': 2.64}


 88%|████████▊ | 16640/18924 [8:52:23<1:09:43,  1.83s/it]

{'loss': 0.4491, 'grad_norm': 10.076241493225098, 'learning_rate': 6.198436821537126e-06, 'epoch': 2.64}


 88%|████████▊ | 16650/18924 [8:52:41<1:09:34,  1.84s/it]

{'loss': 0.3641, 'grad_norm': 9.254700660705566, 'learning_rate': 6.171298306556666e-06, 'epoch': 2.64}


 88%|████████▊ | 16660/18924 [8:52:59<1:08:45,  1.82s/it]

{'loss': 0.3651, 'grad_norm': 6.601210117340088, 'learning_rate': 6.144159791576205e-06, 'epoch': 2.64}


 88%|████████▊ | 16670/18924 [8:53:18<1:08:51,  1.83s/it]

{'loss': 0.3854, 'grad_norm': 6.870410442352295, 'learning_rate': 6.117021276595745e-06, 'epoch': 2.64}


 88%|████████▊ | 16680/18924 [8:53:36<1:08:48,  1.84s/it]

{'loss': 0.4282, 'grad_norm': 11.26457691192627, 'learning_rate': 6.089882761615285e-06, 'epoch': 2.64}


 88%|████████▊ | 16690/18924 [8:53:54<1:08:02,  1.83s/it]

{'loss': 0.4404, 'grad_norm': 6.370454788208008, 'learning_rate': 6.062744246634824e-06, 'epoch': 2.65}


 88%|████████▊ | 16700/18924 [8:54:13<1:07:46,  1.83s/it]

{'loss': 0.4219, 'grad_norm': 21.650104522705078, 'learning_rate': 6.035605731654364e-06, 'epoch': 2.65}


 88%|████████▊ | 16710/18924 [8:54:31<1:07:12,  1.82s/it]

{'loss': 0.5264, 'grad_norm': 15.936850547790527, 'learning_rate': 6.008467216673904e-06, 'epoch': 2.65}


 88%|████████▊ | 16720/18924 [8:54:49<1:07:51,  1.85s/it]

{'loss': 0.3848, 'grad_norm': 8.866876602172852, 'learning_rate': 5.9813287016934435e-06, 'epoch': 2.65}


 88%|████████▊ | 16730/18924 [8:55:09<1:12:05,  1.97s/it]

{'loss': 0.5258, 'grad_norm': 17.36675262451172, 'learning_rate': 5.954190186712983e-06, 'epoch': 2.65}


 88%|████████▊ | 16740/18924 [8:55:28<1:07:15,  1.85s/it]

{'loss': 0.427, 'grad_norm': 15.381900787353516, 'learning_rate': 5.9270516717325235e-06, 'epoch': 2.65}


 89%|████████▊ | 16750/18924 [8:55:47<1:08:56,  1.90s/it]

{'loss': 0.5034, 'grad_norm': 12.952059745788574, 'learning_rate': 5.899913156752063e-06, 'epoch': 2.66}


 89%|████████▊ | 16760/18924 [8:56:05<1:06:42,  1.85s/it]

{'loss': 0.3748, 'grad_norm': 13.301850318908691, 'learning_rate': 5.872774641771603e-06, 'epoch': 2.66}


 89%|████████▊ | 16770/18924 [8:56:24<1:06:47,  1.86s/it]

{'loss': 0.4754, 'grad_norm': 10.186705589294434, 'learning_rate': 5.845636126791143e-06, 'epoch': 2.66}


 89%|████████▊ | 16780/18924 [8:56:42<1:06:28,  1.86s/it]

{'loss': 0.4533, 'grad_norm': 10.88754940032959, 'learning_rate': 5.818497611810682e-06, 'epoch': 2.66}


 89%|████████▊ | 16790/18924 [8:57:01<1:06:02,  1.86s/it]

{'loss': 0.4439, 'grad_norm': 17.50262451171875, 'learning_rate': 5.791359096830222e-06, 'epoch': 2.66}


 89%|████████▉ | 16800/18924 [8:57:20<1:05:19,  1.85s/it]

{'loss': 0.3593, 'grad_norm': 9.618699073791504, 'learning_rate': 5.764220581849761e-06, 'epoch': 2.66}


 89%|████████▉ | 16810/18924 [8:57:38<1:05:14,  1.85s/it]

{'loss': 0.4951, 'grad_norm': 11.054550170898438, 'learning_rate': 5.737082066869301e-06, 'epoch': 2.66}


 89%|████████▉ | 16820/18924 [8:57:57<1:04:50,  1.85s/it]

{'loss': 0.379, 'grad_norm': 13.085525512695312, 'learning_rate': 5.709943551888841e-06, 'epoch': 2.67}


 89%|████████▉ | 16830/18924 [8:58:15<1:04:15,  1.84s/it]

{'loss': 0.2794, 'grad_norm': 8.66058349609375, 'learning_rate': 5.68280503690838e-06, 'epoch': 2.67}


 89%|████████▉ | 16840/18924 [8:58:33<1:03:39,  1.83s/it]

{'loss': 0.3483, 'grad_norm': 7.811807632446289, 'learning_rate': 5.65566652192792e-06, 'epoch': 2.67}


 89%|████████▉ | 16850/18924 [8:58:52<1:03:21,  1.83s/it]

{'loss': 0.4253, 'grad_norm': 13.403882026672363, 'learning_rate': 5.62852800694746e-06, 'epoch': 2.67}


 89%|████████▉ | 16860/18924 [8:59:10<1:03:11,  1.84s/it]

{'loss': 0.3794, 'grad_norm': 14.389379501342773, 'learning_rate': 5.6013894919669994e-06, 'epoch': 2.67}


 89%|████████▉ | 16870/18924 [8:59:28<1:02:34,  1.83s/it]

{'loss': 0.2433, 'grad_norm': 4.229971885681152, 'learning_rate': 5.57425097698654e-06, 'epoch': 2.67}


 89%|████████▉ | 16880/18924 [8:59:47<1:02:56,  1.85s/it]

{'loss': 0.4051, 'grad_norm': 8.085517883300781, 'learning_rate': 5.5471124620060795e-06, 'epoch': 2.68}


 89%|████████▉ | 16890/18924 [9:00:05<1:03:04,  1.86s/it]

{'loss': 0.2731, 'grad_norm': 5.812002658843994, 'learning_rate': 5.519973947025619e-06, 'epoch': 2.68}


 89%|████████▉ | 16900/18924 [9:00:24<1:02:22,  1.85s/it]

{'loss': 0.5424, 'grad_norm': 14.907947540283203, 'learning_rate': 5.492835432045159e-06, 'epoch': 2.68}


 89%|████████▉ | 16910/18924 [9:00:42<1:01:32,  1.83s/it]

{'loss': 0.4481, 'grad_norm': 5.563335418701172, 'learning_rate': 5.465696917064699e-06, 'epoch': 2.68}


 89%|████████▉ | 16920/18924 [9:01:01<1:00:56,  1.82s/it]

{'loss': 0.4253, 'grad_norm': 10.790575981140137, 'learning_rate': 5.438558402084238e-06, 'epoch': 2.68}


 89%|████████▉ | 16930/18924 [9:01:19<1:00:43,  1.83s/it]

{'loss': 0.3795, 'grad_norm': 13.422586441040039, 'learning_rate': 5.411419887103778e-06, 'epoch': 2.68}


 90%|████████▉ | 16940/18924 [9:01:37<1:00:16,  1.82s/it]

{'loss': 0.471, 'grad_norm': 11.544777870178223, 'learning_rate': 5.384281372123318e-06, 'epoch': 2.69}


 90%|████████▉ | 16950/18924 [9:01:56<1:00:01,  1.82s/it]

{'loss': 0.3408, 'grad_norm': 9.021987915039062, 'learning_rate': 5.357142857142857e-06, 'epoch': 2.69}


 90%|████████▉ | 16960/18924 [9:02:14<59:52,  1.83s/it]  

{'loss': 0.4461, 'grad_norm': 9.03746223449707, 'learning_rate': 5.330004342162397e-06, 'epoch': 2.69}


 90%|████████▉ | 16970/18924 [9:02:32<59:52,  1.84s/it]  

{'loss': 0.4714, 'grad_norm': 12.273805618286133, 'learning_rate': 5.302865827181937e-06, 'epoch': 2.69}


 90%|████████▉ | 16980/18924 [9:02:51<59:26,  1.83s/it]

{'loss': 0.5007, 'grad_norm': 12.647806167602539, 'learning_rate': 5.275727312201477e-06, 'epoch': 2.69}


 90%|████████▉ | 16990/18924 [9:03:09<58:58,  1.83s/it]

{'loss': 0.3531, 'grad_norm': 6.925670623779297, 'learning_rate': 5.248588797221016e-06, 'epoch': 2.69}


 90%|████████▉ | 17000/18924 [9:03:27<58:50,  1.84s/it]

{'loss': 0.4446, 'grad_norm': 9.465085983276367, 'learning_rate': 5.221450282240556e-06, 'epoch': 2.69}


 90%|████████▉ | 17010/18924 [9:03:46<58:49,  1.84s/it]  

{'loss': 0.5472, 'grad_norm': 8.2045316696167, 'learning_rate': 5.194311767260096e-06, 'epoch': 2.7}


 90%|████████▉ | 17020/18924 [9:04:05<58:32,  1.84s/it]

{'loss': 0.3254, 'grad_norm': 10.923270225524902, 'learning_rate': 5.1671732522796354e-06, 'epoch': 2.7}


 90%|████████▉ | 17030/18924 [9:04:23<57:52,  1.83s/it]

{'loss': 0.4237, 'grad_norm': 7.406300067901611, 'learning_rate': 5.140034737299175e-06, 'epoch': 2.7}


 90%|█████████ | 17040/18924 [9:04:41<57:22,  1.83s/it]

{'loss': 0.3093, 'grad_norm': 7.737555027008057, 'learning_rate': 5.1128962223187155e-06, 'epoch': 2.7}


 90%|█████████ | 17050/18924 [9:05:00<57:13,  1.83s/it]

{'loss': 0.3437, 'grad_norm': 8.101371765136719, 'learning_rate': 5.085757707338255e-06, 'epoch': 2.7}


 90%|█████████ | 17060/18924 [9:05:18<57:20,  1.85s/it]

{'loss': 0.4822, 'grad_norm': 13.372346878051758, 'learning_rate': 5.058619192357794e-06, 'epoch': 2.7}


 90%|█████████ | 17070/18924 [9:05:36<56:27,  1.83s/it]

{'loss': 0.3862, 'grad_norm': 0.9586223363876343, 'learning_rate': 5.031480677377335e-06, 'epoch': 2.71}


 90%|█████████ | 17080/18924 [9:05:55<56:22,  1.83s/it]

{'loss': 0.5497, 'grad_norm': 16.00334358215332, 'learning_rate': 5.004342162396874e-06, 'epoch': 2.71}


 90%|█████████ | 17090/18924 [9:06:13<56:17,  1.84s/it]

{'loss': 0.4152, 'grad_norm': 14.781479835510254, 'learning_rate': 4.977203647416414e-06, 'epoch': 2.71}


 90%|█████████ | 17100/18924 [9:06:32<55:32,  1.83s/it]

{'loss': 0.3293, 'grad_norm': 12.178439140319824, 'learning_rate': 4.950065132435953e-06, 'epoch': 2.71}


 90%|█████████ | 17110/18924 [9:06:50<55:08,  1.82s/it]

{'loss': 0.36, 'grad_norm': 5.459040641784668, 'learning_rate': 4.922926617455493e-06, 'epoch': 2.71}


 90%|█████████ | 17120/18924 [9:07:08<55:04,  1.83s/it]

{'loss': 0.3352, 'grad_norm': 4.3366265296936035, 'learning_rate': 4.895788102475033e-06, 'epoch': 2.71}


 91%|█████████ | 17130/18924 [9:07:26<53:52,  1.80s/it]

{'loss': 0.5206, 'grad_norm': 8.082664489746094, 'learning_rate': 4.868649587494572e-06, 'epoch': 2.72}


 91%|█████████ | 17140/18924 [9:07:44<52:45,  1.77s/it]

{'loss': 0.4441, 'grad_norm': 4.956736087799072, 'learning_rate': 4.841511072514112e-06, 'epoch': 2.72}


 91%|█████████ | 17150/18924 [9:08:02<54:21,  1.84s/it]

{'loss': 0.4886, 'grad_norm': 18.404029846191406, 'learning_rate': 4.814372557533652e-06, 'epoch': 2.72}


 91%|█████████ | 17160/18924 [9:08:21<54:28,  1.85s/it]

{'loss': 0.3601, 'grad_norm': 15.642443656921387, 'learning_rate': 4.787234042553191e-06, 'epoch': 2.72}


 91%|█████████ | 17170/18924 [9:08:39<53:49,  1.84s/it]

{'loss': 0.5089, 'grad_norm': 8.91653823852539, 'learning_rate': 4.7600955275727314e-06, 'epoch': 2.72}


 91%|█████████ | 17180/18924 [9:08:57<51:31,  1.77s/it]

{'loss': 0.3686, 'grad_norm': 10.150729179382324, 'learning_rate': 4.7329570125922714e-06, 'epoch': 2.72}


 91%|█████████ | 17190/18924 [9:09:15<52:07,  1.80s/it]

{'loss': 0.4377, 'grad_norm': 7.050704479217529, 'learning_rate': 4.705818497611811e-06, 'epoch': 2.73}


 91%|█████████ | 17200/18924 [9:09:33<53:28,  1.86s/it]

{'loss': 0.4452, 'grad_norm': 9.471848487854004, 'learning_rate': 4.678679982631351e-06, 'epoch': 2.73}


 91%|█████████ | 17210/18924 [9:09:52<52:57,  1.85s/it]

{'loss': 0.4371, 'grad_norm': 13.908167839050293, 'learning_rate': 4.651541467650891e-06, 'epoch': 2.73}


 91%|█████████ | 17220/18924 [9:10:10<50:22,  1.77s/it]

{'loss': 0.3462, 'grad_norm': 12.593012809753418, 'learning_rate': 4.62440295267043e-06, 'epoch': 2.73}


 91%|█████████ | 17230/18924 [9:10:28<50:38,  1.79s/it]

{'loss': 0.4422, 'grad_norm': 13.547102928161621, 'learning_rate': 4.59726443768997e-06, 'epoch': 2.73}


 91%|█████████ | 17240/18924 [9:10:46<51:57,  1.85s/it]

{'loss': 0.3595, 'grad_norm': 16.084980010986328, 'learning_rate': 4.57012592270951e-06, 'epoch': 2.73}


 91%|█████████ | 17250/18924 [9:11:05<52:14,  1.87s/it]

{'loss': 0.4184, 'grad_norm': 5.672257900238037, 'learning_rate': 4.542987407729049e-06, 'epoch': 2.73}


 91%|█████████ | 17260/18924 [9:11:24<51:23,  1.85s/it]

{'loss': 0.3912, 'grad_norm': 5.786076545715332, 'learning_rate': 4.515848892748589e-06, 'epoch': 2.74}


 91%|█████████▏| 17270/18924 [9:11:42<50:47,  1.84s/it]

{'loss': 0.3804, 'grad_norm': 16.3201961517334, 'learning_rate': 4.488710377768129e-06, 'epoch': 2.74}


 91%|█████████▏| 17280/18924 [9:12:00<50:40,  1.85s/it]

{'loss': 0.4173, 'grad_norm': 12.245277404785156, 'learning_rate': 4.461571862787668e-06, 'epoch': 2.74}


 91%|█████████▏| 17290/18924 [9:12:19<49:57,  1.83s/it]

{'loss': 0.37, 'grad_norm': 12.385599136352539, 'learning_rate': 4.434433347807208e-06, 'epoch': 2.74}


 91%|█████████▏| 17300/18924 [9:12:37<49:55,  1.84s/it]

{'loss': 0.4845, 'grad_norm': 15.734179496765137, 'learning_rate': 4.407294832826748e-06, 'epoch': 2.74}


 91%|█████████▏| 17310/18924 [9:12:56<49:16,  1.83s/it]

{'loss': 0.4145, 'grad_norm': 13.109526634216309, 'learning_rate': 4.380156317846287e-06, 'epoch': 2.74}


 92%|█████████▏| 17320/18924 [9:13:14<49:12,  1.84s/it]

{'loss': 0.4392, 'grad_norm': 5.128017425537109, 'learning_rate': 4.353017802865827e-06, 'epoch': 2.75}


 92%|█████████▏| 17330/18924 [9:13:32<48:42,  1.83s/it]

{'loss': 0.3795, 'grad_norm': 14.083258628845215, 'learning_rate': 4.325879287885367e-06, 'epoch': 2.75}


 92%|█████████▏| 17340/18924 [9:13:51<48:28,  1.84s/it]

{'loss': 0.3918, 'grad_norm': 9.159965515136719, 'learning_rate': 4.2987407729049074e-06, 'epoch': 2.75}


 92%|█████████▏| 17350/18924 [9:14:09<48:25,  1.85s/it]

{'loss': 0.3329, 'grad_norm': 7.598048686981201, 'learning_rate': 4.271602257924447e-06, 'epoch': 2.75}


 92%|█████████▏| 17360/18924 [9:14:28<47:50,  1.84s/it]

{'loss': 0.4723, 'grad_norm': 4.072360038757324, 'learning_rate': 4.244463742943986e-06, 'epoch': 2.75}


 92%|█████████▏| 17370/18924 [9:14:46<47:18,  1.83s/it]

{'loss': 0.3874, 'grad_norm': 10.223723411560059, 'learning_rate': 4.217325227963527e-06, 'epoch': 2.75}


 92%|█████████▏| 17380/18924 [9:15:04<47:31,  1.85s/it]

{'loss': 0.6547, 'grad_norm': 4.109381675720215, 'learning_rate': 4.190186712983066e-06, 'epoch': 2.76}


 92%|█████████▏| 17390/18924 [9:15:23<46:47,  1.83s/it]

{'loss': 0.4866, 'grad_norm': 6.229748249053955, 'learning_rate': 4.163048198002605e-06, 'epoch': 2.76}


 92%|█████████▏| 17400/18924 [9:15:41<46:38,  1.84s/it]

{'loss': 0.4345, 'grad_norm': 16.210420608520508, 'learning_rate': 4.135909683022145e-06, 'epoch': 2.76}


 92%|█████████▏| 17410/18924 [9:16:00<46:46,  1.85s/it]

{'loss': 0.4136, 'grad_norm': 16.45209503173828, 'learning_rate': 4.108771168041685e-06, 'epoch': 2.76}


 92%|█████████▏| 17420/18924 [9:16:18<46:01,  1.84s/it]

{'loss': 0.3433, 'grad_norm': 5.768385410308838, 'learning_rate': 4.081632653061224e-06, 'epoch': 2.76}


 92%|█████████▏| 17430/18924 [9:16:36<45:39,  1.83s/it]

{'loss': 0.416, 'grad_norm': 10.752525329589844, 'learning_rate': 4.054494138080764e-06, 'epoch': 2.76}


 92%|█████████▏| 17440/18924 [9:16:55<45:15,  1.83s/it]

{'loss': 0.3298, 'grad_norm': 9.64810848236084, 'learning_rate': 4.027355623100304e-06, 'epoch': 2.76}


 92%|█████████▏| 17450/18924 [9:17:13<44:57,  1.83s/it]

{'loss': 0.461, 'grad_norm': 10.394883155822754, 'learning_rate': 4.000217108119844e-06, 'epoch': 2.77}


 92%|█████████▏| 17460/18924 [9:17:32<44:34,  1.83s/it]

{'loss': 0.3636, 'grad_norm': 8.91843318939209, 'learning_rate': 3.973078593139383e-06, 'epoch': 2.77}


 92%|█████████▏| 17470/18924 [9:17:50<44:26,  1.83s/it]

{'loss': 0.56, 'grad_norm': 19.942771911621094, 'learning_rate': 3.945940078158923e-06, 'epoch': 2.77}


 92%|█████████▏| 17480/18924 [9:18:08<44:15,  1.84s/it]

{'loss': 0.3708, 'grad_norm': 10.135037422180176, 'learning_rate': 3.918801563178463e-06, 'epoch': 2.77}


 92%|█████████▏| 17490/18924 [9:18:27<44:01,  1.84s/it]

{'loss': 0.3682, 'grad_norm': 6.206029891967773, 'learning_rate': 3.8916630481980026e-06, 'epoch': 2.77}


 92%|█████████▏| 17500/18924 [9:18:45<43:25,  1.83s/it]

{'loss': 0.5356, 'grad_norm': 13.948688507080078, 'learning_rate': 3.864524533217543e-06, 'epoch': 2.77}


 93%|█████████▎| 17510/18924 [9:19:04<42:52,  1.82s/it]

{'loss': 0.4407, 'grad_norm': 9.004393577575684, 'learning_rate': 3.837386018237083e-06, 'epoch': 2.78}


 93%|█████████▎| 17520/18924 [9:19:22<42:27,  1.81s/it]

{'loss': 0.3052, 'grad_norm': 5.028786659240723, 'learning_rate': 3.8102475032566218e-06, 'epoch': 2.78}


 93%|█████████▎| 17530/18924 [9:19:40<41:24,  1.78s/it]

{'loss': 0.4233, 'grad_norm': 13.059978485107422, 'learning_rate': 3.7831089882761614e-06, 'epoch': 2.78}


 93%|█████████▎| 17540/18924 [9:19:58<41:46,  1.81s/it]

{'loss': 0.3825, 'grad_norm': 8.847630500793457, 'learning_rate': 3.7559704732957014e-06, 'epoch': 2.78}


 93%|█████████▎| 17550/18924 [9:20:16<41:51,  1.83s/it]

{'loss': 0.3384, 'grad_norm': 8.692846298217773, 'learning_rate': 3.728831958315241e-06, 'epoch': 2.78}


 93%|█████████▎| 17560/18924 [9:20:35<41:36,  1.83s/it]

{'loss': 0.4743, 'grad_norm': 14.721792221069336, 'learning_rate': 3.7016934433347814e-06, 'epoch': 2.78}


 93%|█████████▎| 17570/18924 [9:20:53<40:58,  1.82s/it]

{'loss': 0.4446, 'grad_norm': 14.029857635498047, 'learning_rate': 3.6745549283543206e-06, 'epoch': 2.79}


 93%|█████████▎| 17580/18924 [9:21:11<40:33,  1.81s/it]

{'loss': 0.3174, 'grad_norm': 11.456928253173828, 'learning_rate': 3.64741641337386e-06, 'epoch': 2.79}


 93%|█████████▎| 17590/18924 [9:21:29<40:58,  1.84s/it]

{'loss': 0.4164, 'grad_norm': 6.49610710144043, 'learning_rate': 3.6202778983934e-06, 'epoch': 2.79}


 93%|█████████▎| 17600/18924 [9:21:48<40:15,  1.82s/it]

{'loss': 0.3835, 'grad_norm': 4.589870452880859, 'learning_rate': 3.5931393834129398e-06, 'epoch': 2.79}


 93%|█████████▎| 17610/18924 [9:22:06<39:50,  1.82s/it]

{'loss': 0.3736, 'grad_norm': 15.335224151611328, 'learning_rate': 3.5660008684324794e-06, 'epoch': 2.79}


 93%|█████████▎| 17620/18924 [9:22:24<39:23,  1.81s/it]

{'loss': 0.4166, 'grad_norm': 13.7833833694458, 'learning_rate': 3.5388623534520194e-06, 'epoch': 2.79}


 93%|█████████▎| 17630/18924 [9:22:42<39:02,  1.81s/it]

{'loss': 0.382, 'grad_norm': 7.974844932556152, 'learning_rate': 3.511723838471559e-06, 'epoch': 2.79}


 93%|█████████▎| 17640/18924 [9:23:00<38:48,  1.81s/it]

{'loss': 0.4245, 'grad_norm': 10.789726257324219, 'learning_rate': 3.4845853234910986e-06, 'epoch': 2.8}


 93%|█████████▎| 17650/18924 [9:23:18<38:32,  1.82s/it]

{'loss': 0.4713, 'grad_norm': 9.674637794494629, 'learning_rate': 3.4574468085106386e-06, 'epoch': 2.8}


 93%|█████████▎| 17660/18924 [9:23:37<38:12,  1.81s/it]

{'loss': 0.3879, 'grad_norm': 11.50180721282959, 'learning_rate': 3.430308293530178e-06, 'epoch': 2.8}


 93%|█████████▎| 17670/18924 [9:23:55<37:49,  1.81s/it]

{'loss': 0.4174, 'grad_norm': 7.867244243621826, 'learning_rate': 3.403169778549718e-06, 'epoch': 2.8}


 93%|█████████▎| 17680/18924 [9:24:13<37:38,  1.82s/it]

{'loss': 0.5407, 'grad_norm': 5.5563859939575195, 'learning_rate': 3.3760312635692578e-06, 'epoch': 2.8}


 93%|█████████▎| 17690/18924 [9:24:31<36:27,  1.77s/it]

{'loss': 0.3036, 'grad_norm': 5.097099304199219, 'learning_rate': 3.3488927485887974e-06, 'epoch': 2.8}


 94%|█████████▎| 17700/18924 [9:24:48<35:57,  1.76s/it]

{'loss': 0.2528, 'grad_norm': 10.209647178649902, 'learning_rate': 3.3217542336083374e-06, 'epoch': 2.81}


 94%|█████████▎| 17710/18924 [9:25:06<36:36,  1.81s/it]

{'loss': 0.2541, 'grad_norm': 7.1190876960754395, 'learning_rate': 3.294615718627877e-06, 'epoch': 2.81}


 94%|█████████▎| 17720/18924 [9:25:25<37:00,  1.84s/it]

{'loss': 0.3868, 'grad_norm': 14.251087188720703, 'learning_rate': 3.267477203647416e-06, 'epoch': 2.81}


 94%|█████████▎| 17730/18924 [9:25:43<35:59,  1.81s/it]

{'loss': 0.3076, 'grad_norm': 7.226181983947754, 'learning_rate': 3.2403386886669566e-06, 'epoch': 2.81}


 94%|█████████▎| 17740/18924 [9:26:01<34:38,  1.76s/it]

{'loss': 0.3157, 'grad_norm': 6.056725025177002, 'learning_rate': 3.213200173686496e-06, 'epoch': 2.81}


 94%|█████████▍| 17750/18924 [9:26:18<34:37,  1.77s/it]

{'loss': 0.4662, 'grad_norm': 8.812536239624023, 'learning_rate': 3.1860616587060353e-06, 'epoch': 2.81}


 94%|█████████▍| 17760/18924 [9:26:36<34:52,  1.80s/it]

{'loss': 0.4744, 'grad_norm': 11.132153511047363, 'learning_rate': 3.1589231437255758e-06, 'epoch': 2.82}


 94%|█████████▍| 17770/18924 [9:26:54<34:40,  1.80s/it]

{'loss': 0.4345, 'grad_norm': 11.232087135314941, 'learning_rate': 3.131784628745115e-06, 'epoch': 2.82}


 94%|█████████▍| 17780/18924 [9:27:12<33:48,  1.77s/it]

{'loss': 0.4258, 'grad_norm': 16.363759994506836, 'learning_rate': 3.104646113764655e-06, 'epoch': 2.82}


 94%|█████████▍| 17790/18924 [9:27:30<33:25,  1.77s/it]

{'loss': 0.4294, 'grad_norm': 10.733585357666016, 'learning_rate': 3.077507598784195e-06, 'epoch': 2.82}


 94%|█████████▍| 17800/18924 [9:27:48<33:42,  1.80s/it]

{'loss': 0.4343, 'grad_norm': 17.143587112426758, 'learning_rate': 3.050369083803734e-06, 'epoch': 2.82}


 94%|█████████▍| 17810/18924 [9:28:06<33:44,  1.82s/it]

{'loss': 0.5205, 'grad_norm': 21.044281005859375, 'learning_rate': 3.023230568823274e-06, 'epoch': 2.82}


 94%|█████████▍| 17820/18924 [9:28:24<33:05,  1.80s/it]

{'loss': 0.2726, 'grad_norm': 2.6609387397766113, 'learning_rate': 2.9960920538428137e-06, 'epoch': 2.82}


 94%|█████████▍| 17830/18924 [9:28:42<32:26,  1.78s/it]

{'loss': 0.6605, 'grad_norm': 16.3593807220459, 'learning_rate': 2.9689535388623538e-06, 'epoch': 2.83}


 94%|█████████▍| 17840/18924 [9:29:00<32:18,  1.79s/it]

{'loss': 0.4732, 'grad_norm': 11.343416213989258, 'learning_rate': 2.9418150238818933e-06, 'epoch': 2.83}


 94%|█████████▍| 17850/18924 [9:29:18<32:19,  1.81s/it]

{'loss': 0.5526, 'grad_norm': 9.812887191772461, 'learning_rate': 2.914676508901433e-06, 'epoch': 2.83}


 94%|█████████▍| 17860/18924 [9:29:36<32:22,  1.83s/it]

{'loss': 0.4425, 'grad_norm': 11.691917419433594, 'learning_rate': 2.887537993920973e-06, 'epoch': 2.83}


 94%|█████████▍| 17870/18924 [9:29:54<32:23,  1.84s/it]

{'loss': 0.398, 'grad_norm': 4.463249683380127, 'learning_rate': 2.8603994789405125e-06, 'epoch': 2.83}


 94%|█████████▍| 17880/18924 [9:30:12<31:31,  1.81s/it]

{'loss': 0.5195, 'grad_norm': 6.879973411560059, 'learning_rate': 2.833260963960052e-06, 'epoch': 2.83}


 95%|█████████▍| 17890/18924 [9:30:30<30:35,  1.78s/it]

{'loss': 0.4857, 'grad_norm': 16.071998596191406, 'learning_rate': 2.806122448979592e-06, 'epoch': 2.84}


 95%|█████████▍| 17900/18924 [9:30:48<29:56,  1.75s/it]

{'loss': 0.3694, 'grad_norm': 4.262146472930908, 'learning_rate': 2.7789839339991317e-06, 'epoch': 2.84}


 95%|█████████▍| 17910/18924 [9:31:06<29:49,  1.76s/it]

{'loss': 0.508, 'grad_norm': 8.956411361694336, 'learning_rate': 2.7518454190186713e-06, 'epoch': 2.84}


 95%|█████████▍| 17920/18924 [9:31:23<29:54,  1.79s/it]

{'loss': 0.4094, 'grad_norm': 9.67754077911377, 'learning_rate': 2.724706904038211e-06, 'epoch': 2.84}


 95%|█████████▍| 17930/18924 [9:31:42<30:17,  1.83s/it]

{'loss': 0.3805, 'grad_norm': 7.517499923706055, 'learning_rate': 2.697568389057751e-06, 'epoch': 2.84}


 95%|█████████▍| 17940/18924 [9:32:00<29:59,  1.83s/it]

{'loss': 0.3175, 'grad_norm': 9.272127151489258, 'learning_rate': 2.670429874077291e-06, 'epoch': 2.84}


 95%|█████████▍| 17950/18924 [9:32:18<29:38,  1.83s/it]

{'loss': 0.292, 'grad_norm': 5.295626640319824, 'learning_rate': 2.64329135909683e-06, 'epoch': 2.85}


 95%|█████████▍| 17960/18924 [9:32:37<29:30,  1.84s/it]

{'loss': 0.412, 'grad_norm': 3.0643656253814697, 'learning_rate': 2.61615284411637e-06, 'epoch': 2.85}


 95%|█████████▍| 17970/18924 [9:32:55<28:53,  1.82s/it]

{'loss': 0.3667, 'grad_norm': 6.704516410827637, 'learning_rate': 2.5890143291359097e-06, 'epoch': 2.85}


 95%|█████████▌| 17980/18924 [9:33:13<28:26,  1.81s/it]

{'loss': 0.3305, 'grad_norm': 10.038224220275879, 'learning_rate': 2.5618758141554497e-06, 'epoch': 2.85}


 95%|█████████▌| 17990/18924 [9:33:31<28:28,  1.83s/it]

{'loss': 0.4672, 'grad_norm': 14.269904136657715, 'learning_rate': 2.5347372991749893e-06, 'epoch': 2.85}


 95%|█████████▌| 18000/18924 [9:33:49<27:53,  1.81s/it]

{'loss': 0.3801, 'grad_norm': 12.780783653259277, 'learning_rate': 2.507598784194529e-06, 'epoch': 2.85}


 95%|█████████▌| 18010/18924 [9:34:08<27:40,  1.82s/it]

{'loss': 0.4299, 'grad_norm': 11.714646339416504, 'learning_rate': 2.480460269214069e-06, 'epoch': 2.86}


 95%|█████████▌| 18020/18924 [9:34:27<27:19,  1.81s/it]

{'loss': 0.4572, 'grad_norm': 17.146657943725586, 'learning_rate': 2.453321754233608e-06, 'epoch': 2.86}


 95%|█████████▌| 18030/18924 [9:34:45<27:06,  1.82s/it]

{'loss': 0.3562, 'grad_norm': 14.648330688476562, 'learning_rate': 2.426183239253148e-06, 'epoch': 2.86}


 95%|█████████▌| 18040/18924 [9:35:03<26:40,  1.81s/it]

{'loss': 0.3904, 'grad_norm': 8.949899673461914, 'learning_rate': 2.399044724272688e-06, 'epoch': 2.86}


 95%|█████████▌| 18050/18924 [9:35:21<26:21,  1.81s/it]

{'loss': 0.3151, 'grad_norm': 21.871004104614258, 'learning_rate': 2.3719062092922277e-06, 'epoch': 2.86}


 95%|█████████▌| 18060/18924 [9:35:39<25:59,  1.80s/it]

{'loss': 0.4245, 'grad_norm': 7.970520496368408, 'learning_rate': 2.3447676943117673e-06, 'epoch': 2.86}


 95%|█████████▌| 18070/18924 [9:35:57<25:46,  1.81s/it]

{'loss': 0.3651, 'grad_norm': 12.529226303100586, 'learning_rate': 2.317629179331307e-06, 'epoch': 2.86}


 96%|█████████▌| 18080/18924 [9:36:15<25:18,  1.80s/it]

{'loss': 0.4517, 'grad_norm': 8.397828102111816, 'learning_rate': 2.290490664350847e-06, 'epoch': 2.87}


 96%|█████████▌| 18090/18924 [9:36:33<25:07,  1.81s/it]

{'loss': 0.4326, 'grad_norm': 11.888290405273438, 'learning_rate': 2.263352149370387e-06, 'epoch': 2.87}


 96%|█████████▌| 18100/18924 [9:36:51<24:46,  1.80s/it]

{'loss': 0.4424, 'grad_norm': 10.738778114318848, 'learning_rate': 2.236213634389926e-06, 'epoch': 2.87}


 96%|█████████▌| 18110/18924 [9:37:09<24:37,  1.81s/it]

{'loss': 0.4592, 'grad_norm': 16.544889450073242, 'learning_rate': 2.209075119409466e-06, 'epoch': 2.87}


 96%|█████████▌| 18120/18924 [9:37:27<24:01,  1.79s/it]

{'loss': 0.5027, 'grad_norm': 8.59980583190918, 'learning_rate': 2.1819366044290057e-06, 'epoch': 2.87}


 96%|█████████▌| 18130/18924 [9:37:45<23:18,  1.76s/it]

{'loss': 0.3788, 'grad_norm': 11.74877643585205, 'learning_rate': 2.1547980894485453e-06, 'epoch': 2.87}


 96%|█████████▌| 18140/18924 [9:38:03<23:17,  1.78s/it]

{'loss': 0.3495, 'grad_norm': 4.5554280281066895, 'learning_rate': 2.1276595744680853e-06, 'epoch': 2.88}


 96%|█████████▌| 18150/18924 [9:38:21<23:16,  1.80s/it]

{'loss': 0.5509, 'grad_norm': 4.973081588745117, 'learning_rate': 2.100521059487625e-06, 'epoch': 2.88}


 96%|█████████▌| 18160/18924 [9:38:39<23:12,  1.82s/it]

{'loss': 0.3457, 'grad_norm': 10.744102478027344, 'learning_rate': 2.073382544507165e-06, 'epoch': 2.88}


 96%|█████████▌| 18170/18924 [9:38:57<22:51,  1.82s/it]

{'loss': 0.454, 'grad_norm': 12.203788757324219, 'learning_rate': 2.046244029526704e-06, 'epoch': 2.88}


 96%|█████████▌| 18180/18924 [9:39:15<22:25,  1.81s/it]

{'loss': 0.3632, 'grad_norm': 8.195579528808594, 'learning_rate': 2.019105514546244e-06, 'epoch': 2.88}


 96%|█████████▌| 18190/18924 [9:39:33<22:07,  1.81s/it]

{'loss': 0.4555, 'grad_norm': 14.490636825561523, 'learning_rate': 1.991966999565784e-06, 'epoch': 2.88}


 96%|█████████▌| 18200/18924 [9:39:52<21:53,  1.81s/it]

{'loss': 0.5689, 'grad_norm': 13.526114463806152, 'learning_rate': 1.9648284845853233e-06, 'epoch': 2.89}


 96%|█████████▌| 18210/18924 [9:40:10<21:32,  1.81s/it]

{'loss': 0.4683, 'grad_norm': 9.225664138793945, 'learning_rate': 1.9376899696048633e-06, 'epoch': 2.89}


 96%|█████████▋| 18220/18924 [9:40:28<21:14,  1.81s/it]

{'loss': 0.4141, 'grad_norm': 8.931550979614258, 'learning_rate': 1.910551454624403e-06, 'epoch': 2.89}


 96%|█████████▋| 18230/18924 [9:40:46<20:58,  1.81s/it]

{'loss': 0.2232, 'grad_norm': 9.451979637145996, 'learning_rate': 1.883412939643943e-06, 'epoch': 2.89}


 96%|█████████▋| 18240/18924 [9:41:04<20:35,  1.81s/it]

{'loss': 0.3928, 'grad_norm': 22.612071990966797, 'learning_rate': 1.8562744246634823e-06, 'epoch': 2.89}


 96%|█████████▋| 18250/18924 [9:41:22<20:19,  1.81s/it]

{'loss': 0.4288, 'grad_norm': 14.045257568359375, 'learning_rate': 1.8291359096830223e-06, 'epoch': 2.89}


 96%|█████████▋| 18260/18924 [9:41:40<19:31,  1.76s/it]

{'loss': 0.3586, 'grad_norm': 11.05777645111084, 'learning_rate': 1.801997394702562e-06, 'epoch': 2.89}


 97%|█████████▋| 18270/18924 [9:41:58<19:35,  1.80s/it]

{'loss': 0.4084, 'grad_norm': 8.712299346923828, 'learning_rate': 1.774858879722102e-06, 'epoch': 2.9}


 97%|█████████▋| 18280/18924 [9:42:16<19:36,  1.83s/it]

{'loss': 0.4754, 'grad_norm': 12.182910919189453, 'learning_rate': 1.7477203647416413e-06, 'epoch': 2.9}


 97%|█████████▋| 18290/18924 [9:42:34<19:23,  1.84s/it]

{'loss': 0.549, 'grad_norm': 5.303319931030273, 'learning_rate': 1.720581849761181e-06, 'epoch': 2.9}


 97%|█████████▋| 18300/18924 [9:42:53<19:08,  1.84s/it]

{'loss': 0.4757, 'grad_norm': 15.052942276000977, 'learning_rate': 1.6934433347807209e-06, 'epoch': 2.9}


 97%|█████████▋| 18310/18924 [9:43:11<18:38,  1.82s/it]

{'loss': 0.4255, 'grad_norm': 4.308608531951904, 'learning_rate': 1.6663048198002605e-06, 'epoch': 2.9}


 97%|█████████▋| 18320/18924 [9:43:29<18:19,  1.82s/it]

{'loss': 0.4431, 'grad_norm': 10.544967651367188, 'learning_rate': 1.6391663048198003e-06, 'epoch': 2.9}


 97%|█████████▋| 18330/18924 [9:43:47<17:56,  1.81s/it]

{'loss': 0.4663, 'grad_norm': 8.989645957946777, 'learning_rate': 1.61202778983934e-06, 'epoch': 2.91}


 97%|█████████▋| 18340/18924 [9:44:06<17:46,  1.83s/it]

{'loss': 0.4786, 'grad_norm': 6.473011493682861, 'learning_rate': 1.5848892748588799e-06, 'epoch': 2.91}


 97%|█████████▋| 18350/18924 [9:44:24<17:22,  1.82s/it]

{'loss': 0.4009, 'grad_norm': 12.750076293945312, 'learning_rate': 1.5577507598784195e-06, 'epoch': 2.91}


 97%|█████████▋| 18360/18924 [9:44:42<16:52,  1.80s/it]

{'loss': 0.3414, 'grad_norm': 10.62640380859375, 'learning_rate': 1.5306122448979593e-06, 'epoch': 2.91}


 97%|█████████▋| 18370/18924 [9:45:00<16:50,  1.82s/it]

{'loss': 0.4984, 'grad_norm': 14.301596641540527, 'learning_rate': 1.503473729917499e-06, 'epoch': 2.91}


 97%|█████████▋| 18380/18924 [9:45:18<16:27,  1.82s/it]

{'loss': 0.5492, 'grad_norm': 4.596724510192871, 'learning_rate': 1.4763352149370387e-06, 'epoch': 2.91}


 97%|█████████▋| 18390/18924 [9:45:36<16:12,  1.82s/it]

{'loss': 0.4159, 'grad_norm': 5.231854438781738, 'learning_rate': 1.4491966999565785e-06, 'epoch': 2.92}


 97%|█████████▋| 18400/18924 [9:45:55<15:53,  1.82s/it]

{'loss': 0.3955, 'grad_norm': 11.446186065673828, 'learning_rate': 1.4220581849761183e-06, 'epoch': 2.92}


 97%|█████████▋| 18410/18924 [9:46:13<15:33,  1.82s/it]

{'loss': 0.3611, 'grad_norm': 9.948134422302246, 'learning_rate': 1.3949196699956579e-06, 'epoch': 2.92}


 97%|█████████▋| 18420/18924 [9:46:31<15:12,  1.81s/it]

{'loss': 0.3215, 'grad_norm': 10.760008811950684, 'learning_rate': 1.3677811550151977e-06, 'epoch': 2.92}


 97%|█████████▋| 18430/18924 [9:46:49<14:53,  1.81s/it]

{'loss': 0.3777, 'grad_norm': 9.847575187683105, 'learning_rate': 1.3406426400347373e-06, 'epoch': 2.92}


 97%|█████████▋| 18440/18924 [9:47:07<14:43,  1.83s/it]

{'loss': 0.5774, 'grad_norm': 12.667817115783691, 'learning_rate': 1.313504125054277e-06, 'epoch': 2.92}


 97%|█████████▋| 18450/18924 [9:47:26<14:23,  1.82s/it]

{'loss': 0.4836, 'grad_norm': 14.528876304626465, 'learning_rate': 1.2863656100738169e-06, 'epoch': 2.92}


 98%|█████████▊| 18460/18924 [9:47:44<14:16,  1.85s/it]

{'loss': 0.4122, 'grad_norm': 8.967413902282715, 'learning_rate': 1.2592270950933567e-06, 'epoch': 2.93}


 98%|█████████▊| 18470/18924 [9:48:02<13:48,  1.82s/it]

{'loss': 0.3352, 'grad_norm': 8.983858108520508, 'learning_rate': 1.2320885801128963e-06, 'epoch': 2.93}


 98%|█████████▊| 18480/18924 [9:48:20<13:23,  1.81s/it]

{'loss': 0.314, 'grad_norm': 3.7838127613067627, 'learning_rate': 1.204950065132436e-06, 'epoch': 2.93}


 98%|█████████▊| 18490/18924 [9:48:38<13:07,  1.81s/it]

{'loss': 0.2782, 'grad_norm': 3.9142937660217285, 'learning_rate': 1.1778115501519757e-06, 'epoch': 2.93}


 98%|█████████▊| 18500/18924 [9:48:57<12:51,  1.82s/it]

{'loss': 0.438, 'grad_norm': 21.283897399902344, 'learning_rate': 1.1506730351715155e-06, 'epoch': 2.93}


 98%|█████████▊| 18510/18924 [9:49:16<12:34,  1.82s/it]

{'loss': 0.4353, 'grad_norm': 12.072900772094727, 'learning_rate': 1.1235345201910553e-06, 'epoch': 2.93}


 98%|█████████▊| 18520/18924 [9:49:34<12:12,  1.81s/it]

{'loss': 0.4645, 'grad_norm': 8.188762664794922, 'learning_rate': 1.0963960052105948e-06, 'epoch': 2.94}


 98%|█████████▊| 18530/18924 [9:49:52<11:57,  1.82s/it]

{'loss': 0.3607, 'grad_norm': 14.75183391571045, 'learning_rate': 1.0692574902301347e-06, 'epoch': 2.94}


 98%|█████████▊| 18540/18924 [9:50:10<11:39,  1.82s/it]

{'loss': 0.3727, 'grad_norm': 11.696983337402344, 'learning_rate': 1.0421189752496742e-06, 'epoch': 2.94}


 98%|█████████▊| 18550/18924 [9:50:29<11:19,  1.82s/it]

{'loss': 0.3332, 'grad_norm': 16.13336944580078, 'learning_rate': 1.0149804602692143e-06, 'epoch': 2.94}


 98%|█████████▊| 18560/18924 [9:50:47<11:02,  1.82s/it]

{'loss': 0.2, 'grad_norm': 5.818690299987793, 'learning_rate': 9.878419452887538e-07, 'epoch': 2.94}


 98%|█████████▊| 18570/18924 [9:51:05<10:38,  1.80s/it]

{'loss': 0.3628, 'grad_norm': 5.987581253051758, 'learning_rate': 9.607034303082937e-07, 'epoch': 2.94}


 98%|█████████▊| 18580/18924 [9:51:23<10:20,  1.81s/it]

{'loss': 0.5021, 'grad_norm': 18.52129364013672, 'learning_rate': 9.335649153278332e-07, 'epoch': 2.95}


 98%|█████████▊| 18590/18924 [9:51:41<10:08,  1.82s/it]

{'loss': 0.3069, 'grad_norm': 8.407999992370605, 'learning_rate': 9.064264003473731e-07, 'epoch': 2.95}


 98%|█████████▊| 18600/18924 [9:51:59<09:51,  1.82s/it]

{'loss': 0.3183, 'grad_norm': 12.33795166015625, 'learning_rate': 8.792878853669127e-07, 'epoch': 2.95}


 98%|█████████▊| 18610/18924 [9:52:18<09:30,  1.82s/it]

{'loss': 0.4131, 'grad_norm': 12.196638107299805, 'learning_rate': 8.521493703864524e-07, 'epoch': 2.95}


 98%|█████████▊| 18620/18924 [9:52:36<09:11,  1.82s/it]

{'loss': 0.6024, 'grad_norm': 5.274540901184082, 'learning_rate': 8.250108554059922e-07, 'epoch': 2.95}


 98%|█████████▊| 18630/18924 [9:52:54<08:56,  1.82s/it]

{'loss': 0.36, 'grad_norm': 7.4580559730529785, 'learning_rate': 7.978723404255319e-07, 'epoch': 2.95}


 98%|█████████▊| 18640/18924 [9:53:12<08:41,  1.84s/it]

{'loss': 0.5313, 'grad_norm': 13.28494644165039, 'learning_rate': 7.707338254450717e-07, 'epoch': 2.95}


 99%|█████████▊| 18650/18924 [9:53:31<08:21,  1.83s/it]

{'loss': 0.395, 'grad_norm': 11.533553123474121, 'learning_rate': 7.435953104646114e-07, 'epoch': 2.96}


 99%|█████████▊| 18660/18924 [9:53:49<08:00,  1.82s/it]

{'loss': 0.4276, 'grad_norm': 13.380048751831055, 'learning_rate': 7.164567954841511e-07, 'epoch': 2.96}


 99%|█████████▊| 18670/18924 [9:54:07<07:42,  1.82s/it]

{'loss': 0.3561, 'grad_norm': 7.44097375869751, 'learning_rate': 6.893182805036908e-07, 'epoch': 2.96}


 99%|█████████▊| 18680/18924 [9:54:25<07:23,  1.82s/it]

{'loss': 0.2607, 'grad_norm': 13.065107345581055, 'learning_rate': 6.621797655232306e-07, 'epoch': 2.96}


 99%|█████████▉| 18690/18924 [9:54:44<07:07,  1.83s/it]

{'loss': 0.4058, 'grad_norm': 6.643017768859863, 'learning_rate': 6.350412505427703e-07, 'epoch': 2.96}


 99%|█████████▉| 18700/18924 [9:55:02<06:47,  1.82s/it]

{'loss': 0.4692, 'grad_norm': 2.4679322242736816, 'learning_rate': 6.0790273556231e-07, 'epoch': 2.96}


 99%|█████████▉| 18710/18924 [9:55:20<06:29,  1.82s/it]

{'loss': 0.4776, 'grad_norm': 2.430049419403076, 'learning_rate': 5.807642205818498e-07, 'epoch': 2.97}


 99%|█████████▉| 18720/18924 [9:55:38<06:10,  1.82s/it]

{'loss': 0.492, 'grad_norm': 8.737603187561035, 'learning_rate': 5.536257056013895e-07, 'epoch': 2.97}


 99%|█████████▉| 18730/18924 [9:55:56<05:54,  1.83s/it]

{'loss': 0.4978, 'grad_norm': 7.473292827606201, 'learning_rate': 5.264871906209292e-07, 'epoch': 2.97}


 99%|█████████▉| 18740/18924 [9:56:15<05:34,  1.82s/it]

{'loss': 0.4669, 'grad_norm': 9.735034942626953, 'learning_rate': 4.99348675640469e-07, 'epoch': 2.97}


 99%|█████████▉| 18750/18924 [9:56:33<05:17,  1.83s/it]

{'loss': 0.4177, 'grad_norm': 9.02966022491455, 'learning_rate': 4.722101606600087e-07, 'epoch': 2.97}


 99%|█████████▉| 18760/18924 [9:56:51<04:57,  1.82s/it]

{'loss': 0.4946, 'grad_norm': 6.552377700805664, 'learning_rate': 4.450716456795484e-07, 'epoch': 2.97}


 99%|█████████▉| 18770/18924 [9:57:09<04:39,  1.82s/it]

{'loss': 0.3368, 'grad_norm': 7.423328876495361, 'learning_rate': 4.179331306990881e-07, 'epoch': 2.98}


 99%|█████████▉| 18780/18924 [9:57:28<04:23,  1.83s/it]

{'loss': 0.3135, 'grad_norm': 9.570555686950684, 'learning_rate': 3.9079461571862787e-07, 'epoch': 2.98}


 99%|█████████▉| 18790/18924 [9:57:46<04:06,  1.84s/it]

{'loss': 0.3518, 'grad_norm': 10.353702545166016, 'learning_rate': 3.636561007381676e-07, 'epoch': 2.98}


 99%|█████████▉| 18800/18924 [9:58:04<03:46,  1.82s/it]

{'loss': 0.44, 'grad_norm': 2.446762800216675, 'learning_rate': 3.3651758575770737e-07, 'epoch': 2.98}


 99%|█████████▉| 18810/18924 [9:58:22<03:25,  1.80s/it]

{'loss': 0.3518, 'grad_norm': 10.283665657043457, 'learning_rate': 3.0937907077724706e-07, 'epoch': 2.98}


 99%|█████████▉| 18820/18924 [9:58:40<03:03,  1.77s/it]

{'loss': 0.5023, 'grad_norm': 14.50269889831543, 'learning_rate': 2.822405557967868e-07, 'epoch': 2.98}


100%|█████████▉| 18830/18924 [9:58:58<02:47,  1.79s/it]

{'loss': 0.5406, 'grad_norm': 6.446771621704102, 'learning_rate': 2.551020408163265e-07, 'epoch': 2.99}


100%|█████████▉| 18840/18924 [9:59:16<02:34,  1.84s/it]

{'loss': 0.4567, 'grad_norm': 18.78110694885254, 'learning_rate': 2.2796352583586626e-07, 'epoch': 2.99}


100%|█████████▉| 18850/18924 [9:59:35<02:17,  1.85s/it]

{'loss': 0.435, 'grad_norm': 25.95880889892578, 'learning_rate': 2.00825010855406e-07, 'epoch': 2.99}


100%|█████████▉| 18860/18924 [9:59:53<01:53,  1.77s/it]

{'loss': 0.4178, 'grad_norm': 19.259794235229492, 'learning_rate': 1.7368649587494573e-07, 'epoch': 2.99}


100%|█████████▉| 18870/18924 [10:00:10<01:36,  1.79s/it]

{'loss': 0.3993, 'grad_norm': 18.40705108642578, 'learning_rate': 1.4654798089448546e-07, 'epoch': 2.99}


100%|█████████▉| 18880/18924 [10:00:29<01:20,  1.83s/it]

{'loss': 0.3502, 'grad_norm': 15.24200439453125, 'learning_rate': 1.1940946591402518e-07, 'epoch': 2.99}


100%|█████████▉| 18890/18924 [10:00:47<01:02,  1.84s/it]

{'loss': 0.3916, 'grad_norm': 13.71373462677002, 'learning_rate': 9.227095093356492e-08, 'epoch': 2.99}


100%|█████████▉| 18900/18924 [10:01:05<00:42,  1.78s/it]

{'loss': 0.451, 'grad_norm': 14.751548767089844, 'learning_rate': 6.513243595310464e-08, 'epoch': 3.0}


100%|█████████▉| 18910/18924 [10:01:23<00:24,  1.78s/it]

{'loss': 0.3833, 'grad_norm': 7.824601173400879, 'learning_rate': 3.799392097264438e-08, 'epoch': 3.0}


100%|█████████▉| 18920/18924 [10:01:41<00:07,  1.84s/it]

{'loss': 0.3929, 'grad_norm': 12.712198257446289, 'learning_rate': 1.0855405992184108e-08, 'epoch': 3.0}


100%|██████████| 18924/18924 [10:01:48<00:00,  1.91s/it]

{'train_runtime': 36108.6525, 'train_samples_per_second': 8.384, 'train_steps_per_second': 0.524, 'train_loss': 0.7197749201287883, 'epoch': 3.0}





TrainOutput(global_step=18924, training_loss=0.7197749201287883, metrics={'train_runtime': 36108.6525, 'train_samples_per_second': 8.384, 'train_steps_per_second': 0.524, 'total_flos': 4.011282243455386e+16, 'train_loss': 0.7197749201287883, 'epoch': 3.0})

In [23]:
# Save the model
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

# Save label encoder
import joblib
joblib.dump(le, './fine_tuned_model/label_encoder.joblib')

print("Fine-tuning complete. Model and tokenizer saved.")

Fine-tuning complete. Model and tokenizer saved.


In [26]:
# Run inference on test set
from sklearn.metrics import accuracy_score

predictions = trainer.predict(tokenized_test)
preds = np.argmax(predictions.predictions, axis=1)
accuracy = accuracy_score(test_labels, preds)
print(f"Accuracy on test set: {accuracy}")

100%|██████████| 224/224 [08:57<00:00,  2.40s/it]

Accuracy on test set: 0.7516289497652912





In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(test_labels, preds, average='micro')
print(f"Precision on test set: {precision}")

recall = recall_score(test_labels, preds, average='micro')
print(f"Recall on test set: {recall}")

f1 = f1_score(test_labels, preds, average='micro')
print(f"F1 on test set: {f1}")

Precision on test set: 0.7516289497652912
Recall on test set: 0.7516289497652912
F1 on test set: 0.7516289497652912
