In [12]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch

dataset = load_dataset("issai/kazqad")

In [13]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def preprocess_function(examples):
    outputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    print(f"Examples: {examples}")

    for i in range(len(examples['question'])):
        offset_mapping = outputs["offset_mapping"][i]
        input_ids = outputs["input_ids"][i]

        print(f"Example {i} Offset Mapping: {offset_mapping}")
        print(f"Example {i} Input IDs: {input_ids}")

        if 'answers' in examples:
            answer = examples['answers'][i]
            if 'text' in answer and 'answer_start' in answer:
                answer_texts = answer['text']
                answer_starts = answer['answer_start']

                print(f"Example {i} Answer Texts: {answer_texts}")
                print(f"Example {i} Answer Starts: {answer_starts}")

                if len(answer_texts) > 0 and len(answer_starts) > 0:
                    start_char = answer_starts[0]
                    end_char = start_char + len(answer_texts[0])

                    start_token_index = None
                    end_token_index = None

                    for idx, (start, end) in enumerate(offset_mapping):
                        if start <= start_char < end:
                            start_token_index = idx
                            break

                    for idx, (start, end) in enumerate(offset_mapping):
                        if start < end_char <= end:
                            end_token_index = idx
                            break

                    print(f"Processing example {i}:")
                    print("Answer Text:", answer_texts[0])
                    print("Start Char Index:", start_char)
                    print("End Char Index:", end_char)
                    print("Start Token Index:", start_token_index)
                    print("End Token Index:", end_token_index)

                    if start_token_index is None:
                        start_token_index = 0
                    if end_token_index is None or end_token_index >= len(input_ids):
                        end_token_index = len(input_ids) - 1

                    start_positions.append(start_token_index)
                    end_positions.append(end_token_index)
                else:
                    start_positions.append(0)
                    end_positions.append(0)
            else:
                start_positions.append(0)
                end_positions.append(0)
        else:
            start_positions.append(0)
            end_positions.append(0)

    outputs['start_positions'] = start_positions
    outputs['end_positions'] = end_positions

    outputs.pop("offset_mapping", None)
    return outputs

try:
    tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)
except Exception as e:
    print("Error during processing:", e)

In [14]:
tokenized_datasets['validation'][0]

{'input_ids': [101,
  538,
  28588,
  12670,
  24612,
  13237,
  21720,
  10993,
  16417,
  22840,
  64712,
  11571,
  10375,
  136,
  102,
  509,
  20411,
  25363,
  118,
  541,
  111085,
  71742,
  89306,
  542,
  41069,
  26205,
  28993,
  13257,
  570,
  66963,
  12095,
  88044,
  547,
  62289,
  30189,
  26367,
  36268,
  208,
  61861,
  33440,
  220,
  10234,
  10351,
  119,
  510,
  41069,
  26205,
  48097,
  10519,
  67501,
  605,
  40270,
  24519,
  10352,
  10375,
  15773,
  103116,
  10205,
  62025,
  87439,
  13237,
  557,
  10205,
  89094,
  57794,
  38074,
  108110,
  10987,
  105922,
  541,
  16417,
  15726,
  19913,
  10227,
  117,
  553,
  104808,
  13237,
  117,
  57684,
  11323,
  61947,
  31890,
  46195,
  570,
  66963,
  66013,
  28993,
  19079,
  37721,
  55655,
  11890,
  119,
  509,
  20411,
  25363,
  118,
  554,
  10292,
  208,
  38074,
  108110,
  57794,
  570,
  28588,
  12670,
  24612,
  13237,
  21384,
  40628,
  220,
  18609,
  547,
  10191,
  10352,
  10

In [15]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

True
1
Using device: cuda


In [16]:
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-multilingual-cased")
model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000, 
    save_total_limit=2, 
    warmup_steps=500, 
    fp16=True,
    gradient_accumulation_steps=2, 
    report_to="none",
    dataloader_pin_memory=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

print(f"Model device: {next(model.parameters()).device}")

trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model device: cuda:0


  0%|          | 10/5270 [00:11<53:55,  1.63it/s] 

{'loss': 6.2985, 'grad_norm': 8.222498893737793, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.02}


  0%|          | 20/5270 [00:16<47:31,  1.84it/s]

{'loss': 6.3036, 'grad_norm': 8.080987930297852, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.04}


  1%|          | 30/5270 [00:19<27:24,  3.19it/s]

{'loss': 6.2476, 'grad_norm': 10.557405471801758, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.06}


  1%|          | 40/5270 [00:22<28:53,  3.02it/s]

{'loss': 6.2191, 'grad_norm': 10.014206886291504, 'learning_rate': 1.6000000000000001e-06, 'epoch': 0.08}


  1%|          | 50/5270 [00:27<32:05,  2.71it/s]

{'loss': 6.0854, 'grad_norm': 8.849541664123535, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.09}


  1%|          | 60/5270 [00:30<27:27,  3.16it/s]

{'loss': 6.011, 'grad_norm': 8.708096504211426, 'learning_rate': 2.3600000000000003e-06, 'epoch': 0.11}


  1%|▏         | 70/5270 [00:33<28:43,  3.02it/s]

{'loss': 5.9268, 'grad_norm': 10.391244888305664, 'learning_rate': 2.7600000000000003e-06, 'epoch': 0.13}


  2%|▏         | 80/5270 [00:36<27:18,  3.17it/s]

{'loss': 5.8113, 'grad_norm': 11.554844856262207, 'learning_rate': 3.1600000000000002e-06, 'epoch': 0.15}


  2%|▏         | 90/5270 [00:39<25:12,  3.43it/s]

{'loss': 5.5528, 'grad_norm': 11.75501823425293, 'learning_rate': 3.52e-06, 'epoch': 0.17}


  2%|▏         | 100/5270 [00:42<25:51,  3.33it/s]

{'loss': 5.4656, 'grad_norm': 11.184981346130371, 'learning_rate': 3.920000000000001e-06, 'epoch': 0.19}


  2%|▏         | 110/5270 [00:45<25:52,  3.32it/s]

{'loss': 5.1506, 'grad_norm': 14.205012321472168, 'learning_rate': 4.32e-06, 'epoch': 0.21}


  2%|▏         | 120/5270 [00:48<26:04,  3.29it/s]

{'loss': 5.1378, 'grad_norm': 22.152088165283203, 'learning_rate': 4.7200000000000005e-06, 'epoch': 0.23}


  2%|▏         | 130/5270 [00:51<25:50,  3.31it/s]

{'loss': 4.7764, 'grad_norm': 12.081171989440918, 'learning_rate': 5.12e-06, 'epoch': 0.25}


  3%|▎         | 140/5270 [00:54<26:04,  3.28it/s]

{'loss': 4.4914, 'grad_norm': 11.808035850524902, 'learning_rate': 5.5200000000000005e-06, 'epoch': 0.27}


  3%|▎         | 150/5270 [00:58<26:20,  3.24it/s]

{'loss': 4.5482, 'grad_norm': 13.113334655761719, 'learning_rate': 5.92e-06, 'epoch': 0.28}


  3%|▎         | 160/5270 [01:01<26:27,  3.22it/s]

{'loss': 4.4255, 'grad_norm': 12.514299392700195, 'learning_rate': 6.3200000000000005e-06, 'epoch': 0.3}


  3%|▎         | 170/5270 [01:04<26:15,  3.24it/s]

{'loss': 4.2004, 'grad_norm': 19.307838439941406, 'learning_rate': 6.680000000000001e-06, 'epoch': 0.32}


  3%|▎         | 180/5270 [01:07<26:25,  3.21it/s]

{'loss': 4.1474, 'grad_norm': 15.618537902832031, 'learning_rate': 7.08e-06, 'epoch': 0.34}


  4%|▎         | 190/5270 [01:10<26:24,  3.21it/s]

{'loss': 3.9454, 'grad_norm': 17.45821762084961, 'learning_rate': 7.48e-06, 'epoch': 0.36}


  4%|▍         | 200/5270 [01:13<26:41,  3.17it/s]

{'loss': 4.0559, 'grad_norm': 19.733060836791992, 'learning_rate': 7.88e-06, 'epoch': 0.38}


  4%|▍         | 210/5270 [01:16<26:42,  3.16it/s]

{'loss': 3.9859, 'grad_norm': 18.550546646118164, 'learning_rate': 8.28e-06, 'epoch': 0.4}


  4%|▍         | 220/5270 [01:19<26:31,  3.17it/s]

{'loss': 3.8947, 'grad_norm': 19.810951232910156, 'learning_rate': 8.68e-06, 'epoch': 0.42}


  4%|▍         | 230/5270 [01:23<26:31,  3.17it/s]

{'loss': 3.9946, 'grad_norm': 21.822925567626953, 'learning_rate': 9.080000000000001e-06, 'epoch': 0.44}


  5%|▍         | 240/5270 [01:26<27:10,  3.08it/s]

{'loss': 3.8631, 'grad_norm': 27.79082679748535, 'learning_rate': 9.48e-06, 'epoch': 0.45}


  5%|▍         | 250/5270 [01:29<27:11,  3.08it/s]

{'loss': 4.0246, 'grad_norm': 23.02857780456543, 'learning_rate': 9.88e-06, 'epoch': 0.47}


  5%|▍         | 260/5270 [01:32<27:05,  3.08it/s]

{'loss': 3.6263, 'grad_norm': 23.287918090820312, 'learning_rate': 1.0280000000000002e-05, 'epoch': 0.49}


  5%|▌         | 270/5270 [01:36<26:51,  3.10it/s]

{'loss': 3.834, 'grad_norm': 19.360641479492188, 'learning_rate': 1.0680000000000001e-05, 'epoch': 0.51}


  5%|▌         | 280/5270 [01:39<26:49,  3.10it/s]

{'loss': 3.6454, 'grad_norm': 25.68149757385254, 'learning_rate': 1.1080000000000002e-05, 'epoch': 0.53}


  6%|▌         | 290/5270 [01:42<26:49,  3.09it/s]

{'loss': 3.6189, 'grad_norm': 36.567623138427734, 'learning_rate': 1.148e-05, 'epoch': 0.55}


  6%|▌         | 300/5270 [01:45<27:03,  3.06it/s]

{'loss': 3.269, 'grad_norm': 21.077281951904297, 'learning_rate': 1.188e-05, 'epoch': 0.57}


  6%|▌         | 310/5270 [01:49<27:02,  3.06it/s]

{'loss': 3.0566, 'grad_norm': 27.37933349609375, 'learning_rate': 1.2280000000000001e-05, 'epoch': 0.59}


  6%|▌         | 320/5270 [01:52<26:58,  3.06it/s]

{'loss': 3.4001, 'grad_norm': 27.32234001159668, 'learning_rate': 1.268e-05, 'epoch': 0.61}


  6%|▋         | 330/5270 [01:55<27:06,  3.04it/s]

{'loss': 3.3112, 'grad_norm': 26.270292282104492, 'learning_rate': 1.3080000000000002e-05, 'epoch': 0.63}


  6%|▋         | 340/5270 [01:58<27:04,  3.03it/s]

{'loss': 3.4194, 'grad_norm': 27.57077980041504, 'learning_rate': 1.3480000000000001e-05, 'epoch': 0.64}


  7%|▋         | 350/5270 [02:02<27:00,  3.04it/s]

{'loss': 3.6793, 'grad_norm': 26.32823944091797, 'learning_rate': 1.3880000000000001e-05, 'epoch': 0.66}


  7%|▋         | 360/5270 [02:05<31:25,  2.60it/s]

{'loss': 3.3906, 'grad_norm': 25.27208137512207, 'learning_rate': 1.428e-05, 'epoch': 0.68}


  7%|▋         | 370/5270 [02:10<35:11,  2.32it/s]

{'loss': 3.6089, 'grad_norm': 30.574514389038086, 'learning_rate': 1.4680000000000002e-05, 'epoch': 0.7}


  7%|▋         | 380/5270 [02:14<31:54,  2.55it/s]

{'loss': 3.1006, 'grad_norm': 40.15396499633789, 'learning_rate': 1.5080000000000001e-05, 'epoch': 0.72}


  7%|▋         | 390/5270 [02:18<27:46,  2.93it/s]

{'loss': 3.2507, 'grad_norm': 44.806060791015625, 'learning_rate': 1.548e-05, 'epoch': 0.74}


  8%|▊         | 400/5270 [02:21<26:42,  3.04it/s]

{'loss': 3.2977, 'grad_norm': 41.706138610839844, 'learning_rate': 1.588e-05, 'epoch': 0.76}


  8%|▊         | 410/5270 [02:24<26:31,  3.05it/s]

{'loss': 3.3323, 'grad_norm': 33.21181869506836, 'learning_rate': 1.628e-05, 'epoch': 0.78}


  8%|▊         | 420/5270 [02:28<28:57,  2.79it/s]

{'loss': 3.2937, 'grad_norm': 33.25291442871094, 'learning_rate': 1.668e-05, 'epoch': 0.8}


  8%|▊         | 430/5270 [02:31<27:29,  2.93it/s]

{'loss': 3.1771, 'grad_norm': 37.41890335083008, 'learning_rate': 1.7080000000000002e-05, 'epoch': 0.82}


  8%|▊         | 440/5270 [02:35<27:34,  2.92it/s]

{'loss': 3.2501, 'grad_norm': 29.240280151367188, 'learning_rate': 1.7480000000000002e-05, 'epoch': 0.83}


  9%|▊         | 450/5270 [02:38<28:38,  2.80it/s]

{'loss': 3.0356, 'grad_norm': 29.429414749145508, 'learning_rate': 1.788e-05, 'epoch': 0.85}


  9%|▊         | 460/5270 [02:42<26:49,  2.99it/s]

{'loss': 3.1785, 'grad_norm': 33.09192657470703, 'learning_rate': 1.828e-05, 'epoch': 0.87}


  9%|▉         | 470/5270 [02:45<27:45,  2.88it/s]

{'loss': 2.7923, 'grad_norm': 23.500478744506836, 'learning_rate': 1.8680000000000004e-05, 'epoch': 0.89}


  9%|▉         | 480/5270 [02:48<25:42,  3.11it/s]

{'loss': 2.8542, 'grad_norm': 27.61875343322754, 'learning_rate': 1.908e-05, 'epoch': 0.91}


  9%|▉         | 490/5270 [02:52<25:00,  3.19it/s]

{'loss': 3.0991, 'grad_norm': 37.325958251953125, 'learning_rate': 1.948e-05, 'epoch': 0.93}


  9%|▉         | 500/5270 [02:55<24:44,  3.21it/s]

{'loss': 3.0504, 'grad_norm': 24.09575080871582, 'learning_rate': 1.9880000000000003e-05, 'epoch': 0.95}


 10%|▉         | 510/5270 [02:58<24:45,  3.20it/s]

{'loss': 3.2067, 'grad_norm': 30.914180755615234, 'learning_rate': 1.9970649895178197e-05, 'epoch': 0.97}


 10%|▉         | 520/5270 [03:01<24:42,  3.20it/s]

{'loss': 3.2178, 'grad_norm': 31.52884864807129, 'learning_rate': 1.9928721174004195e-05, 'epoch': 0.99}


 10%|█         | 527/5270 [03:03<26:02,  3.04it/s]
 10%|█         | 527/5270 [03:17<26:02,  3.04it/s]

{'eval_loss': 3.6353073120117188, 'eval_runtime': 13.7205, 'eval_samples_per_second': 55.683, 'eval_steps_per_second': 18.585, 'epoch': 1.0}


 10%|█         | 530/5270 [03:18<3:07:34,  2.37s/it]

{'loss': 3.1524, 'grad_norm': 28.76982879638672, 'learning_rate': 1.989098532494759e-05, 'epoch': 1.0}


 10%|█         | 540/5270 [03:22<37:08,  2.12it/s]  

{'loss': 2.774, 'grad_norm': 28.71210289001465, 'learning_rate': 1.9849056603773588e-05, 'epoch': 1.02}


 10%|█         | 550/5270 [03:26<32:46,  2.40it/s]

{'loss': 2.7071, 'grad_norm': 31.58884620666504, 'learning_rate': 1.9807127882599582e-05, 'epoch': 1.04}


 11%|█         | 560/5270 [03:30<27:14,  2.88it/s]

{'loss': 2.9664, 'grad_norm': 26.571781158447266, 'learning_rate': 1.976519916142558e-05, 'epoch': 1.06}


 11%|█         | 570/5270 [03:33<26:49,  2.92it/s]

{'loss': 2.5948, 'grad_norm': 46.253135681152344, 'learning_rate': 1.9723270440251574e-05, 'epoch': 1.08}


 11%|█         | 580/5270 [03:37<26:49,  2.91it/s]

{'loss': 2.4506, 'grad_norm': 39.624000549316406, 'learning_rate': 1.9681341719077568e-05, 'epoch': 1.1}


 11%|█         | 590/5270 [03:40<26:42,  2.92it/s]

{'loss': 2.823, 'grad_norm': 59.5487174987793, 'learning_rate': 1.9639412997903566e-05, 'epoch': 1.12}


 11%|█▏        | 600/5270 [03:44<26:38,  2.92it/s]

{'loss': 2.891, 'grad_norm': 48.92069625854492, 'learning_rate': 1.959748427672956e-05, 'epoch': 1.14}


 12%|█▏        | 610/5270 [03:47<26:33,  2.92it/s]

{'loss': 2.3012, 'grad_norm': 31.04632568359375, 'learning_rate': 1.9555555555555557e-05, 'epoch': 1.16}


 12%|█▏        | 620/5270 [03:51<26:32,  2.92it/s]

{'loss': 2.834, 'grad_norm': 29.771831512451172, 'learning_rate': 1.9513626834381555e-05, 'epoch': 1.18}


 12%|█▏        | 630/5270 [03:54<26:26,  2.92it/s]

{'loss': 2.5343, 'grad_norm': 51.78104782104492, 'learning_rate': 1.947169811320755e-05, 'epoch': 1.19}


 12%|█▏        | 640/5270 [03:57<26:17,  2.94it/s]

{'loss': 2.5265, 'grad_norm': 39.77187728881836, 'learning_rate': 1.9429769392033543e-05, 'epoch': 1.21}


 12%|█▏        | 650/5270 [04:01<26:13,  2.94it/s]

{'loss': 2.4201, 'grad_norm': 36.045101165771484, 'learning_rate': 1.938784067085954e-05, 'epoch': 1.23}


 13%|█▎        | 660/5270 [04:04<26:16,  2.92it/s]

{'loss': 2.7666, 'grad_norm': 52.81081008911133, 'learning_rate': 1.9345911949685535e-05, 'epoch': 1.25}


 13%|█▎        | 670/5270 [04:08<26:11,  2.93it/s]

{'loss': 2.5769, 'grad_norm': 39.15768051147461, 'learning_rate': 1.9303983228511532e-05, 'epoch': 1.27}


 13%|█▎        | 680/5270 [04:11<26:01,  2.94it/s]

{'loss': 2.5776, 'grad_norm': 34.58432388305664, 'learning_rate': 1.926205450733753e-05, 'epoch': 1.29}


 13%|█▎        | 690/5270 [04:15<26:04,  2.93it/s]

{'loss': 2.6602, 'grad_norm': 28.44825553894043, 'learning_rate': 1.9220125786163524e-05, 'epoch': 1.31}


 13%|█▎        | 700/5270 [04:18<25:59,  2.93it/s]

{'loss': 2.588, 'grad_norm': 28.395526885986328, 'learning_rate': 1.9178197064989518e-05, 'epoch': 1.33}


 13%|█▎        | 710/5270 [04:21<25:54,  2.93it/s]

{'loss': 2.838, 'grad_norm': 40.25624465942383, 'learning_rate': 1.9136268343815516e-05, 'epoch': 1.35}


 14%|█▎        | 720/5270 [04:25<25:52,  2.93it/s]

{'loss': 2.518, 'grad_norm': 27.6146297454834, 'learning_rate': 1.909433962264151e-05, 'epoch': 1.36}


 14%|█▍        | 730/5270 [04:28<25:49,  2.93it/s]

{'loss': 2.896, 'grad_norm': 41.6487922668457, 'learning_rate': 1.9052410901467508e-05, 'epoch': 1.38}


 14%|█▍        | 740/5270 [04:32<25:42,  2.94it/s]

{'loss': 2.6405, 'grad_norm': 32.930267333984375, 'learning_rate': 1.9010482180293505e-05, 'epoch': 1.4}


 14%|█▍        | 750/5270 [04:35<25:37,  2.94it/s]

{'loss': 2.5528, 'grad_norm': 31.52093505859375, 'learning_rate': 1.89685534591195e-05, 'epoch': 1.42}


 14%|█▍        | 760/5270 [04:38<25:45,  2.92it/s]

{'loss': 2.4777, 'grad_norm': 36.076839447021484, 'learning_rate': 1.8926624737945493e-05, 'epoch': 1.44}


 15%|█▍        | 770/5270 [04:42<25:36,  2.93it/s]

{'loss': 2.5444, 'grad_norm': 39.13210678100586, 'learning_rate': 1.888469601677149e-05, 'epoch': 1.46}


 15%|█▍        | 780/5270 [04:45<25:36,  2.92it/s]

{'loss': 2.4993, 'grad_norm': 66.12073516845703, 'learning_rate': 1.8842767295597485e-05, 'epoch': 1.48}


 15%|█▍        | 790/5270 [04:49<25:38,  2.91it/s]

{'loss': 2.6445, 'grad_norm': 42.00129318237305, 'learning_rate': 1.8800838574423483e-05, 'epoch': 1.5}


 15%|█▌        | 800/5270 [04:52<25:22,  2.94it/s]

{'loss': 2.6071, 'grad_norm': 22.577322006225586, 'learning_rate': 1.8758909853249477e-05, 'epoch': 1.52}


 15%|█▌        | 810/5270 [04:56<25:26,  2.92it/s]

{'loss': 2.7613, 'grad_norm': 32.76530075073242, 'learning_rate': 1.8716981132075474e-05, 'epoch': 1.54}


 16%|█▌        | 820/5270 [04:59<25:21,  2.92it/s]

{'loss': 2.6908, 'grad_norm': 34.93364334106445, 'learning_rate': 1.867505241090147e-05, 'epoch': 1.55}


 16%|█▌        | 830/5270 [05:02<25:14,  2.93it/s]

{'loss': 2.4342, 'grad_norm': 36.5152702331543, 'learning_rate': 1.8633123689727466e-05, 'epoch': 1.57}


 16%|█▌        | 840/5270 [05:06<25:12,  2.93it/s]

{'loss': 2.6552, 'grad_norm': 36.49320602416992, 'learning_rate': 1.859119496855346e-05, 'epoch': 1.59}


 16%|█▌        | 850/5270 [05:09<25:07,  2.93it/s]

{'loss': 2.6846, 'grad_norm': 32.11320495605469, 'learning_rate': 1.8549266247379458e-05, 'epoch': 1.61}


 16%|█▋        | 860/5270 [05:13<25:24,  2.89it/s]

{'loss': 2.3927, 'grad_norm': 58.68850326538086, 'learning_rate': 1.8507337526205452e-05, 'epoch': 1.63}


 17%|█▋        | 870/5270 [05:16<25:53,  2.83it/s]

{'loss': 2.14, 'grad_norm': 39.305389404296875, 'learning_rate': 1.846540880503145e-05, 'epoch': 1.65}


 17%|█▋        | 880/5270 [05:20<25:58,  2.82it/s]

{'loss': 2.3274, 'grad_norm': 29.535192489624023, 'learning_rate': 1.8423480083857444e-05, 'epoch': 1.67}


 17%|█▋        | 890/5270 [05:23<25:48,  2.83it/s]

{'loss': 2.2263, 'grad_norm': 27.990266799926758, 'learning_rate': 1.838155136268344e-05, 'epoch': 1.69}


 17%|█▋        | 900/5270 [05:27<25:49,  2.82it/s]

{'loss': 2.5468, 'grad_norm': 37.89945983886719, 'learning_rate': 1.8339622641509435e-05, 'epoch': 1.71}


 17%|█▋        | 910/5270 [05:30<25:37,  2.84it/s]

{'loss': 2.4694, 'grad_norm': 25.757427215576172, 'learning_rate': 1.829769392033543e-05, 'epoch': 1.73}


 17%|█▋        | 920/5270 [05:34<25:35,  2.83it/s]

{'loss': 2.1934, 'grad_norm': 36.94843673706055, 'learning_rate': 1.8255765199161427e-05, 'epoch': 1.74}


 18%|█▊        | 930/5270 [05:38<25:26,  2.84it/s]

{'loss': 2.3936, 'grad_norm': 37.85823059082031, 'learning_rate': 1.8213836477987425e-05, 'epoch': 1.76}


 18%|█▊        | 940/5270 [05:41<25:35,  2.82it/s]

{'loss': 2.1255, 'grad_norm': 20.991437911987305, 'learning_rate': 1.817190775681342e-05, 'epoch': 1.78}


 18%|█▊        | 950/5270 [05:45<25:22,  2.84it/s]

{'loss': 2.4351, 'grad_norm': 30.09794044494629, 'learning_rate': 1.8129979035639413e-05, 'epoch': 1.8}


 18%|█▊        | 960/5270 [05:48<25:30,  2.82it/s]

{'loss': 2.4459, 'grad_norm': 39.292205810546875, 'learning_rate': 1.808805031446541e-05, 'epoch': 1.82}


 18%|█▊        | 970/5270 [05:52<25:27,  2.81it/s]

{'loss': 2.4285, 'grad_norm': 34.786006927490234, 'learning_rate': 1.8046121593291405e-05, 'epoch': 1.84}


 19%|█▊        | 980/5270 [05:55<25:16,  2.83it/s]

{'loss': 2.9857, 'grad_norm': 37.263973236083984, 'learning_rate': 1.8004192872117402e-05, 'epoch': 1.86}


 19%|█▉        | 990/5270 [05:59<25:12,  2.83it/s]

{'loss': 2.6875, 'grad_norm': 34.613250732421875, 'learning_rate': 1.79622641509434e-05, 'epoch': 1.88}


 19%|█▉        | 1000/5270 [06:02<25:01,  2.84it/s]

{'loss': 2.3858, 'grad_norm': 30.95955467224121, 'learning_rate': 1.7920335429769394e-05, 'epoch': 1.9}


 19%|█▉        | 1010/5270 [06:09<28:26,  2.50it/s]  

{'loss': 2.3707, 'grad_norm': 28.4589786529541, 'learning_rate': 1.7878406708595388e-05, 'epoch': 1.91}


 19%|█▉        | 1020/5270 [06:12<24:25,  2.90it/s]

{'loss': 2.4902, 'grad_norm': 39.89407730102539, 'learning_rate': 1.7836477987421385e-05, 'epoch': 1.93}


 20%|█▉        | 1030/5270 [06:15<23:55,  2.95it/s]

{'loss': 2.4622, 'grad_norm': 34.75053787231445, 'learning_rate': 1.779454926624738e-05, 'epoch': 1.95}


 20%|█▉        | 1040/5270 [06:19<25:32,  2.76it/s]

{'loss': 2.2136, 'grad_norm': 30.107418060302734, 'learning_rate': 1.7752620545073377e-05, 'epoch': 1.97}


 20%|█▉        | 1050/5270 [06:23<25:47,  2.73it/s]

{'loss': 2.1846, 'grad_norm': 31.41657066345215, 'learning_rate': 1.7710691823899375e-05, 'epoch': 1.99}


 20%|██        | 1055/5270 [06:24<23:59,  2.93it/s]
 20%|██        | 1055/5270 [06:37<23:59,  2.93it/s]

{'eval_loss': 3.409621000289917, 'eval_runtime': 12.8389, 'eval_samples_per_second': 59.507, 'eval_steps_per_second': 19.862, 'epoch': 2.0}


 20%|██        | 1060/5270 [06:39<1:30:10,  1.29s/it]

{'loss': 1.9035, 'grad_norm': 28.23077964782715, 'learning_rate': 1.766876310272537e-05, 'epoch': 2.01}


 20%|██        | 1070/5270 [06:43<28:36,  2.45it/s]  

{'loss': 1.9365, 'grad_norm': 41.20967102050781, 'learning_rate': 1.7626834381551363e-05, 'epoch': 2.03}


 20%|██        | 1080/5270 [06:46<24:20,  2.87it/s]

{'loss': 1.4764, 'grad_norm': 22.928932189941406, 'learning_rate': 1.758490566037736e-05, 'epoch': 2.05}


 21%|██        | 1090/5270 [06:50<23:44,  2.93it/s]

{'loss': 1.9475, 'grad_norm': 48.277042388916016, 'learning_rate': 1.7542976939203355e-05, 'epoch': 2.07}


 21%|██        | 1100/5270 [06:54<26:29,  2.62it/s]

{'loss': 1.894, 'grad_norm': 28.01272964477539, 'learning_rate': 1.7501048218029352e-05, 'epoch': 2.09}


 21%|██        | 1110/5270 [06:58<28:41,  2.42it/s]

{'loss': 1.8556, 'grad_norm': 52.50372314453125, 'learning_rate': 1.7459119496855346e-05, 'epoch': 2.1}


 21%|██▏       | 1120/5270 [07:02<30:03,  2.30it/s]

{'loss': 1.8235, 'grad_norm': 37.891815185546875, 'learning_rate': 1.7417190775681344e-05, 'epoch': 2.12}


 21%|██▏       | 1130/5270 [07:06<29:36,  2.33it/s]

{'loss': 2.3894, 'grad_norm': 27.89118003845215, 'learning_rate': 1.7375262054507338e-05, 'epoch': 2.14}


 22%|██▏       | 1140/5270 [07:11<29:34,  2.33it/s]

{'loss': 1.6901, 'grad_norm': 34.13071060180664, 'learning_rate': 1.7333333333333336e-05, 'epoch': 2.16}


 22%|██▏       | 1150/5270 [07:15<26:07,  2.63it/s]

{'loss': 1.8278, 'grad_norm': 37.69837951660156, 'learning_rate': 1.729140461215933e-05, 'epoch': 2.18}


 22%|██▏       | 1160/5270 [07:18<25:52,  2.65it/s]

{'loss': 1.6069, 'grad_norm': 34.92679214477539, 'learning_rate': 1.7249475890985324e-05, 'epoch': 2.2}


 22%|██▏       | 1170/5270 [07:22<25:33,  2.67it/s]

{'loss': 1.8752, 'grad_norm': 45.895660400390625, 'learning_rate': 1.720754716981132e-05, 'epoch': 2.22}


 22%|██▏       | 1180/5270 [07:26<25:41,  2.65it/s]

{'loss': 1.6171, 'grad_norm': 36.09885787963867, 'learning_rate': 1.716561844863732e-05, 'epoch': 2.24}


 23%|██▎       | 1190/5270 [07:30<25:46,  2.64it/s]

{'loss': 2.3034, 'grad_norm': 56.349365234375, 'learning_rate': 1.7123689727463313e-05, 'epoch': 2.26}


 23%|██▎       | 1200/5270 [07:33<25:30,  2.66it/s]

{'loss': 1.6171, 'grad_norm': 54.83940505981445, 'learning_rate': 1.708176100628931e-05, 'epoch': 2.27}


 23%|██▎       | 1210/5270 [07:37<25:28,  2.66it/s]

{'loss': 2.1647, 'grad_norm': 63.31614685058594, 'learning_rate': 1.7039832285115305e-05, 'epoch': 2.29}


 23%|██▎       | 1220/5270 [07:41<25:22,  2.66it/s]

{'loss': 1.7684, 'grad_norm': 38.47107696533203, 'learning_rate': 1.69979035639413e-05, 'epoch': 2.31}


 23%|██▎       | 1230/5270 [07:45<25:12,  2.67it/s]

{'loss': 1.8204, 'grad_norm': 24.93905258178711, 'learning_rate': 1.6955974842767297e-05, 'epoch': 2.33}


 24%|██▎       | 1240/5270 [07:48<25:17,  2.66it/s]

{'loss': 1.927, 'grad_norm': 41.92142868041992, 'learning_rate': 1.6914046121593294e-05, 'epoch': 2.35}


 24%|██▎       | 1250/5270 [07:52<25:14,  2.65it/s]

{'loss': 1.7159, 'grad_norm': 26.46210289001465, 'learning_rate': 1.687211740041929e-05, 'epoch': 2.37}


 24%|██▍       | 1260/5270 [07:56<25:09,  2.66it/s]

{'loss': 1.9562, 'grad_norm': 49.5198974609375, 'learning_rate': 1.6830188679245286e-05, 'epoch': 2.39}


 24%|██▍       | 1270/5270 [08:00<24:49,  2.68it/s]

{'loss': 1.6348, 'grad_norm': 41.83298110961914, 'learning_rate': 1.678825995807128e-05, 'epoch': 2.41}


 24%|██▍       | 1280/5270 [08:04<24:56,  2.67it/s]

{'loss': 1.7604, 'grad_norm': 37.51907730102539, 'learning_rate': 1.6746331236897274e-05, 'epoch': 2.43}


 24%|██▍       | 1290/5270 [08:07<24:51,  2.67it/s]

{'loss': 1.5588, 'grad_norm': 30.037403106689453, 'learning_rate': 1.6704402515723272e-05, 'epoch': 2.45}


 25%|██▍       | 1300/5270 [08:11<24:41,  2.68it/s]

{'loss': 1.7654, 'grad_norm': 38.69841766357422, 'learning_rate': 1.666247379454927e-05, 'epoch': 2.46}


 25%|██▍       | 1310/5270 [08:15<24:52,  2.65it/s]

{'loss': 1.8107, 'grad_norm': 30.796531677246094, 'learning_rate': 1.6620545073375263e-05, 'epoch': 2.48}


 25%|██▌       | 1320/5270 [08:19<24:47,  2.66it/s]

{'loss': 1.9866, 'grad_norm': 30.10017204284668, 'learning_rate': 1.657861635220126e-05, 'epoch': 2.5}


 25%|██▌       | 1330/5270 [08:22<24:43,  2.66it/s]

{'loss': 1.7327, 'grad_norm': 35.862892150878906, 'learning_rate': 1.6536687631027255e-05, 'epoch': 2.52}


 25%|██▌       | 1340/5270 [08:26<24:34,  2.66it/s]

{'loss': 1.7388, 'grad_norm': 49.92581558227539, 'learning_rate': 1.649475890985325e-05, 'epoch': 2.54}


 26%|██▌       | 1350/5270 [08:30<24:32,  2.66it/s]

{'loss': 1.7694, 'grad_norm': 45.859615325927734, 'learning_rate': 1.6452830188679247e-05, 'epoch': 2.56}


 26%|██▌       | 1360/5270 [08:34<24:30,  2.66it/s]

{'loss': 2.0, 'grad_norm': 51.8797721862793, 'learning_rate': 1.641090146750524e-05, 'epoch': 2.58}


 26%|██▌       | 1370/5270 [08:37<24:22,  2.67it/s]

{'loss': 1.8403, 'grad_norm': 41.124473571777344, 'learning_rate': 1.636897274633124e-05, 'epoch': 2.6}


 26%|██▌       | 1380/5270 [08:41<24:20,  2.66it/s]

{'loss': 2.058, 'grad_norm': 45.601566314697266, 'learning_rate': 1.6327044025157236e-05, 'epoch': 2.62}


 26%|██▋       | 1390/5270 [08:45<24:18,  2.66it/s]

{'loss': 1.7099, 'grad_norm': 35.92702102661133, 'learning_rate': 1.628511530398323e-05, 'epoch': 2.64}


 27%|██▋       | 1400/5270 [08:49<24:11,  2.67it/s]

{'loss': 1.8805, 'grad_norm': 46.57113265991211, 'learning_rate': 1.6243186582809224e-05, 'epoch': 2.65}


 27%|██▋       | 1410/5270 [08:52<24:06,  2.67it/s]

{'loss': 1.844, 'grad_norm': 41.13093948364258, 'learning_rate': 1.6201257861635222e-05, 'epoch': 2.67}


 27%|██▋       | 1420/5270 [08:56<24:10,  2.65it/s]

{'loss': 1.5738, 'grad_norm': 43.86971664428711, 'learning_rate': 1.6159329140461216e-05, 'epoch': 2.69}


 27%|██▋       | 1430/5270 [09:00<24:05,  2.66it/s]

{'loss': 1.8194, 'grad_norm': 44.10369110107422, 'learning_rate': 1.6117400419287214e-05, 'epoch': 2.71}


 27%|██▋       | 1440/5270 [09:04<23:50,  2.68it/s]

{'loss': 1.7349, 'grad_norm': 26.13024139404297, 'learning_rate': 1.607547169811321e-05, 'epoch': 2.73}


 28%|██▊       | 1450/5270 [09:07<23:49,  2.67it/s]

{'loss': 1.884, 'grad_norm': 25.797969818115234, 'learning_rate': 1.6033542976939205e-05, 'epoch': 2.75}


 28%|██▊       | 1460/5270 [09:11<23:48,  2.67it/s]

{'loss': 1.6514, 'grad_norm': 86.63463592529297, 'learning_rate': 1.59916142557652e-05, 'epoch': 2.77}


 28%|██▊       | 1470/5270 [09:15<23:50,  2.66it/s]

{'loss': 1.7321, 'grad_norm': 42.46574783325195, 'learning_rate': 1.5949685534591197e-05, 'epoch': 2.79}


 28%|██▊       | 1480/5270 [09:19<23:45,  2.66it/s]

{'loss': 2.0028, 'grad_norm': 72.99320220947266, 'learning_rate': 1.590775681341719e-05, 'epoch': 2.81}


 28%|██▊       | 1490/5270 [09:22<23:37,  2.67it/s]

{'loss': 1.5035, 'grad_norm': 19.850631713867188, 'learning_rate': 1.586582809224319e-05, 'epoch': 2.82}


 28%|██▊       | 1500/5270 [09:26<23:28,  2.68it/s]

{'loss': 2.0054, 'grad_norm': 64.8952407836914, 'learning_rate': 1.5823899371069186e-05, 'epoch': 2.84}


 29%|██▊       | 1510/5270 [09:30<23:34,  2.66it/s]

{'loss': 1.838, 'grad_norm': 53.14495086669922, 'learning_rate': 1.578197064989518e-05, 'epoch': 2.86}


 29%|██▉       | 1520/5270 [09:34<23:23,  2.67it/s]

{'loss': 1.7521, 'grad_norm': 47.78458023071289, 'learning_rate': 1.5740041928721175e-05, 'epoch': 2.88}


 29%|██▉       | 1530/5270 [09:37<23:20,  2.67it/s]

{'loss': 1.5398, 'grad_norm': 39.08148193359375, 'learning_rate': 1.5698113207547172e-05, 'epoch': 2.9}


 29%|██▉       | 1540/5270 [09:41<23:22,  2.66it/s]

{'loss': 1.9306, 'grad_norm': 47.258399963378906, 'learning_rate': 1.5656184486373166e-05, 'epoch': 2.92}


 29%|██▉       | 1550/5270 [09:45<23:18,  2.66it/s]

{'loss': 1.6939, 'grad_norm': 24.736770629882812, 'learning_rate': 1.5614255765199164e-05, 'epoch': 2.94}


 30%|██▉       | 1560/5270 [09:49<23:16,  2.66it/s]

{'loss': 2.1131, 'grad_norm': 56.66402816772461, 'learning_rate': 1.5572327044025158e-05, 'epoch': 2.96}


 30%|██▉       | 1570/5270 [09:52<23:05,  2.67it/s]

{'loss': 1.9269, 'grad_norm': 19.46231460571289, 'learning_rate': 1.5530398322851156e-05, 'epoch': 2.98}


 30%|██▉       | 1580/5270 [09:56<23:09,  2.66it/s]

{'loss': 1.9625, 'grad_norm': 48.19060134887695, 'learning_rate': 1.548846960167715e-05, 'epoch': 3.0}


 30%|███       | 1582/5270 [09:57<23:09,  2.66it/s]
 30%|███       | 1583/5270 [10:12<4:52:28,  4.76s/it]

{'eval_loss': 3.5362322330474854, 'eval_runtime': 14.6793, 'eval_samples_per_second': 52.046, 'eval_steps_per_second': 17.371, 'epoch': 3.0}


 30%|███       | 1590/5270 [10:15<45:15,  1.36it/s]  

{'loss': 1.6714, 'grad_norm': 30.92829704284668, 'learning_rate': 1.5446540880503147e-05, 'epoch': 3.01}


 30%|███       | 1600/5270 [10:18<23:34,  2.60it/s]

{'loss': 1.4009, 'grad_norm': 33.612518310546875, 'learning_rate': 1.540461215932914e-05, 'epoch': 3.03}


 31%|███       | 1610/5270 [10:22<22:54,  2.66it/s]

{'loss': 1.3284, 'grad_norm': 39.50798034667969, 'learning_rate': 1.5362683438155136e-05, 'epoch': 3.05}


 31%|███       | 1620/5270 [10:26<22:44,  2.68it/s]

{'loss': 1.1685, 'grad_norm': 30.13815689086914, 'learning_rate': 1.5320754716981133e-05, 'epoch': 3.07}


 31%|███       | 1630/5270 [10:30<22:50,  2.66it/s]

{'loss': 1.4623, 'grad_norm': 52.53239440917969, 'learning_rate': 1.527882599580713e-05, 'epoch': 3.09}


 31%|███       | 1640/5270 [10:33<22:38,  2.67it/s]

{'loss': 1.389, 'grad_norm': 46.60771942138672, 'learning_rate': 1.5236897274633125e-05, 'epoch': 3.11}


 31%|███▏      | 1650/5270 [10:37<22:37,  2.67it/s]

{'loss': 1.2612, 'grad_norm': 18.97163200378418, 'learning_rate': 1.5194968553459122e-05, 'epoch': 3.13}


 31%|███▏      | 1660/5270 [10:41<22:35,  2.66it/s]

{'loss': 1.298, 'grad_norm': 40.33070755004883, 'learning_rate': 1.5153039832285116e-05, 'epoch': 3.15}


 32%|███▏      | 1670/5270 [10:45<22:36,  2.65it/s]

{'loss': 1.3081, 'grad_norm': 45.375091552734375, 'learning_rate': 1.5111111111111112e-05, 'epoch': 3.17}


 32%|███▏      | 1680/5270 [10:48<22:30,  2.66it/s]

{'loss': 1.3385, 'grad_norm': 68.06037139892578, 'learning_rate': 1.506918238993711e-05, 'epoch': 3.18}


 32%|███▏      | 1690/5270 [10:52<22:29,  2.65it/s]

{'loss': 1.1668, 'grad_norm': 31.822660446166992, 'learning_rate': 1.5027253668763104e-05, 'epoch': 3.2}


 32%|███▏      | 1700/5270 [10:56<22:26,  2.65it/s]

{'loss': 1.3731, 'grad_norm': 25.139020919799805, 'learning_rate': 1.49853249475891e-05, 'epoch': 3.22}


 32%|███▏      | 1710/5270 [11:00<22:10,  2.68it/s]

{'loss': 1.3686, 'grad_norm': 25.37640380859375, 'learning_rate': 1.4943396226415094e-05, 'epoch': 3.24}


 33%|███▎      | 1720/5270 [11:03<22:08,  2.67it/s]

{'loss': 1.0795, 'grad_norm': 25.19610595703125, 'learning_rate': 1.4901467505241092e-05, 'epoch': 3.26}


 33%|███▎      | 1730/5270 [11:07<22:25,  2.63it/s]

{'loss': 1.0313, 'grad_norm': 66.18492126464844, 'learning_rate': 1.4859538784067087e-05, 'epoch': 3.28}


 33%|███▎      | 1740/5270 [11:11<22:19,  2.63it/s]

{'loss': 1.4261, 'grad_norm': 33.113224029541016, 'learning_rate': 1.4817610062893082e-05, 'epoch': 3.3}


 33%|███▎      | 1750/5270 [11:15<22:12,  2.64it/s]

{'loss': 1.4629, 'grad_norm': 48.49033737182617, 'learning_rate': 1.4775681341719079e-05, 'epoch': 3.32}


 33%|███▎      | 1760/5270 [11:19<22:03,  2.65it/s]

{'loss': 1.2678, 'grad_norm': 34.393245697021484, 'learning_rate': 1.4733752620545075e-05, 'epoch': 3.34}


 34%|███▎      | 1770/5270 [11:22<22:08,  2.63it/s]

{'loss': 1.3932, 'grad_norm': 41.851192474365234, 'learning_rate': 1.4691823899371069e-05, 'epoch': 3.36}


 34%|███▍      | 1780/5270 [11:26<22:03,  2.64it/s]

{'loss': 1.43, 'grad_norm': 42.05432891845703, 'learning_rate': 1.4649895178197067e-05, 'epoch': 3.37}


 34%|███▍      | 1790/5270 [11:30<21:48,  2.66it/s]

{'loss': 1.5128, 'grad_norm': 50.21049118041992, 'learning_rate': 1.4607966457023063e-05, 'epoch': 3.39}


 34%|███▍      | 1800/5270 [11:34<21:58,  2.63it/s]

{'loss': 1.2291, 'grad_norm': 37.14689254760742, 'learning_rate': 1.4566037735849057e-05, 'epoch': 3.41}


 34%|███▍      | 1810/5270 [11:38<21:46,  2.65it/s]

{'loss': 1.0711, 'grad_norm': 33.61949157714844, 'learning_rate': 1.4524109014675054e-05, 'epoch': 3.43}


 35%|███▍      | 1820/5270 [11:41<21:49,  2.63it/s]

{'loss': 1.0384, 'grad_norm': 27.99239158630371, 'learning_rate': 1.4482180293501048e-05, 'epoch': 3.45}


 35%|███▍      | 1830/5270 [11:45<21:47,  2.63it/s]

{'loss': 1.0475, 'grad_norm': 15.09708023071289, 'learning_rate': 1.4440251572327044e-05, 'epoch': 3.47}


 35%|███▍      | 1840/5270 [11:49<21:39,  2.64it/s]

{'loss': 1.5217, 'grad_norm': 25.392547607421875, 'learning_rate': 1.4398322851153042e-05, 'epoch': 3.49}


 35%|███▌      | 1850/5270 [11:53<21:42,  2.63it/s]

{'loss': 1.1258, 'grad_norm': 41.82175827026367, 'learning_rate': 1.4356394129979036e-05, 'epoch': 3.51}


 35%|███▌      | 1860/5270 [11:57<21:35,  2.63it/s]

{'loss': 1.314, 'grad_norm': 45.66740798950195, 'learning_rate': 1.4314465408805032e-05, 'epoch': 3.53}


 35%|███▌      | 1870/5270 [12:00<21:22,  2.65it/s]

{'loss': 1.2969, 'grad_norm': 37.4549674987793, 'learning_rate': 1.427253668763103e-05, 'epoch': 3.55}


 36%|███▌      | 1880/5270 [12:04<21:18,  2.65it/s]

{'loss': 1.2505, 'grad_norm': 53.528926849365234, 'learning_rate': 1.4230607966457023e-05, 'epoch': 3.56}


 36%|███▌      | 1890/5270 [12:08<21:22,  2.64it/s]

{'loss': 1.5155, 'grad_norm': 62.33045196533203, 'learning_rate': 1.418867924528302e-05, 'epoch': 3.58}


 36%|███▌      | 1900/5270 [12:12<21:14,  2.64it/s]

{'loss': 1.0513, 'grad_norm': 34.34291076660156, 'learning_rate': 1.4146750524109017e-05, 'epoch': 3.6}


 36%|███▌      | 1910/5270 [12:15<21:12,  2.64it/s]

{'loss': 1.2316, 'grad_norm': 30.70679473876953, 'learning_rate': 1.4104821802935011e-05, 'epoch': 3.62}


 36%|███▋      | 1920/5270 [12:19<21:02,  2.65it/s]

{'loss': 1.1894, 'grad_norm': 43.16868591308594, 'learning_rate': 1.4062893081761007e-05, 'epoch': 3.64}


 37%|███▋      | 1930/5270 [12:23<21:04,  2.64it/s]

{'loss': 1.1821, 'grad_norm': 30.654123306274414, 'learning_rate': 1.4020964360587004e-05, 'epoch': 3.66}


 37%|███▋      | 1940/5270 [12:27<21:01,  2.64it/s]

{'loss': 1.2019, 'grad_norm': 39.94017791748047, 'learning_rate': 1.3979035639412999e-05, 'epoch': 3.68}


 37%|███▋      | 1950/5270 [12:31<20:57,  2.64it/s]

{'loss': 1.0749, 'grad_norm': 49.11924362182617, 'learning_rate': 1.3937106918238994e-05, 'epoch': 3.7}


 37%|███▋      | 1960/5270 [12:34<20:52,  2.64it/s]

{'loss': 1.4195, 'grad_norm': 33.966732025146484, 'learning_rate': 1.3895178197064992e-05, 'epoch': 3.72}


 37%|███▋      | 1970/5270 [12:38<20:45,  2.65it/s]

{'loss': 1.0649, 'grad_norm': 22.869672775268555, 'learning_rate': 1.3853249475890986e-05, 'epoch': 3.73}


 38%|███▊      | 1980/5270 [12:42<20:43,  2.65it/s]

{'loss': 1.1878, 'grad_norm': 50.957481384277344, 'learning_rate': 1.3811320754716982e-05, 'epoch': 3.75}


 38%|███▊      | 1990/5270 [12:46<20:40,  2.64it/s]

{'loss': 1.4351, 'grad_norm': 42.44892501831055, 'learning_rate': 1.376939203354298e-05, 'epoch': 3.77}


 38%|███▊      | 2000/5270 [12:50<20:34,  2.65it/s]

{'loss': 1.2319, 'grad_norm': 44.8114128112793, 'learning_rate': 1.3727463312368974e-05, 'epoch': 3.79}


 38%|███▊      | 2010/5270 [12:56<22:31,  2.41it/s]  

{'loss': 1.5069, 'grad_norm': 71.9164810180664, 'learning_rate': 1.368553459119497e-05, 'epoch': 3.81}


 38%|███▊      | 2020/5270 [13:00<21:13,  2.55it/s]

{'loss': 1.3638, 'grad_norm': 65.43917083740234, 'learning_rate': 1.3643605870020965e-05, 'epoch': 3.83}


 39%|███▊      | 2030/5270 [13:04<21:11,  2.55it/s]

{'loss': 1.6892, 'grad_norm': 113.42173767089844, 'learning_rate': 1.3601677148846961e-05, 'epoch': 3.85}


 39%|███▊      | 2040/5270 [13:08<21:00,  2.56it/s]

{'loss': 1.3899, 'grad_norm': 68.457763671875, 'learning_rate': 1.3559748427672957e-05, 'epoch': 3.87}


 39%|███▉      | 2050/5270 [13:12<21:05,  2.54it/s]

{'loss': 1.4468, 'grad_norm': 42.06935501098633, 'learning_rate': 1.3517819706498953e-05, 'epoch': 3.89}


 39%|███▉      | 2060/5270 [13:15<20:50,  2.57it/s]

{'loss': 1.2669, 'grad_norm': 30.96674156188965, 'learning_rate': 1.3475890985324949e-05, 'epoch': 3.91}


 39%|███▉      | 2070/5270 [13:19<20:46,  2.57it/s]

{'loss': 1.3818, 'grad_norm': 28.97748374938965, 'learning_rate': 1.3433962264150943e-05, 'epoch': 3.92}


 39%|███▉      | 2080/5270 [13:23<20:46,  2.56it/s]

{'loss': 1.3074, 'grad_norm': 39.808448791503906, 'learning_rate': 1.339203354297694e-05, 'epoch': 3.94}


 40%|███▉      | 2090/5270 [13:27<20:38,  2.57it/s]

{'loss': 1.4654, 'grad_norm': 81.9515609741211, 'learning_rate': 1.3350104821802936e-05, 'epoch': 3.96}


 40%|███▉      | 2100/5270 [13:31<20:28,  2.58it/s]

{'loss': 1.4938, 'grad_norm': 52.34231185913086, 'learning_rate': 1.330817610062893e-05, 'epoch': 3.98}


 40%|████      | 2110/5270 [13:35<19:07,  2.75it/s]

{'loss': 1.4712, 'grad_norm': 26.592775344848633, 'learning_rate': 1.3266247379454928e-05, 'epoch': 4.0}



 40%|████      | 2110/5270 [13:49<19:07,  2.75it/s]

{'eval_loss': 4.135161876678467, 'eval_runtime': 13.6725, 'eval_samples_per_second': 55.878, 'eval_steps_per_second': 18.651, 'epoch': 4.0}


 40%|████      | 2120/5270 [13:53<30:20,  1.73it/s]  

{'loss': 0.9262, 'grad_norm': 28.511354446411133, 'learning_rate': 1.3224318658280924e-05, 'epoch': 4.02}


 40%|████      | 2130/5270 [13:57<21:51,  2.39it/s]

{'loss': 0.97, 'grad_norm': 54.836875915527344, 'learning_rate': 1.3182389937106918e-05, 'epoch': 4.04}


 41%|████      | 2140/5270 [14:00<19:09,  2.72it/s]

{'loss': 0.9231, 'grad_norm': 71.85923767089844, 'learning_rate': 1.3140461215932916e-05, 'epoch': 4.06}


 41%|████      | 2150/5270 [14:04<19:01,  2.73it/s]

{'loss': 0.7303, 'grad_norm': 26.59879493713379, 'learning_rate': 1.3098532494758911e-05, 'epoch': 4.08}


 41%|████      | 2160/5270 [14:08<18:58,  2.73it/s]

{'loss': 0.7134, 'grad_norm': 86.35844421386719, 'learning_rate': 1.3056603773584906e-05, 'epoch': 4.09}


 41%|████      | 2170/5270 [14:11<18:58,  2.72it/s]

{'loss': 1.0457, 'grad_norm': 34.03375244140625, 'learning_rate': 1.3014675052410903e-05, 'epoch': 4.11}


 41%|████▏     | 2180/5270 [14:15<18:54,  2.72it/s]

{'loss': 1.1148, 'grad_norm': 49.66153335571289, 'learning_rate': 1.2972746331236899e-05, 'epoch': 4.13}


 42%|████▏     | 2190/5270 [14:19<18:45,  2.74it/s]

{'loss': 0.9293, 'grad_norm': 42.28743362426758, 'learning_rate': 1.2930817610062893e-05, 'epoch': 4.15}


 42%|████▏     | 2200/5270 [14:23<21:17,  2.40it/s]

{'loss': 0.6961, 'grad_norm': 34.19352722167969, 'learning_rate': 1.288888888888889e-05, 'epoch': 4.17}


 42%|████▏     | 2210/5270 [14:27<21:28,  2.37it/s]

{'loss': 0.9305, 'grad_norm': 24.41004180908203, 'learning_rate': 1.2846960167714887e-05, 'epoch': 4.19}


 42%|████▏     | 2220/5270 [14:31<21:28,  2.37it/s]

{'loss': 1.0147, 'grad_norm': 53.53531265258789, 'learning_rate': 1.280503144654088e-05, 'epoch': 4.21}


 42%|████▏     | 2230/5270 [14:35<20:28,  2.48it/s]

{'loss': 0.8202, 'grad_norm': 49.09074783325195, 'learning_rate': 1.2763102725366878e-05, 'epoch': 4.23}


 43%|████▎     | 2240/5270 [14:39<20:24,  2.47it/s]

{'loss': 1.1475, 'grad_norm': 43.43282699584961, 'learning_rate': 1.2721174004192874e-05, 'epoch': 4.25}


 43%|████▎     | 2250/5270 [14:43<18:05,  2.78it/s]

{'loss': 0.9145, 'grad_norm': 47.929805755615234, 'learning_rate': 1.2679245283018868e-05, 'epoch': 4.27}


 43%|████▎     | 2260/5270 [14:47<20:26,  2.45it/s]

{'loss': 1.1149, 'grad_norm': 46.39956283569336, 'learning_rate': 1.2637316561844866e-05, 'epoch': 4.28}


 43%|████▎     | 2270/5270 [14:51<18:02,  2.77it/s]

{'loss': 0.7916, 'grad_norm': 58.200469970703125, 'learning_rate': 1.259538784067086e-05, 'epoch': 4.3}


 43%|████▎     | 2280/5270 [14:54<17:51,  2.79it/s]

{'loss': 1.206, 'grad_norm': 47.649375915527344, 'learning_rate': 1.2553459119496856e-05, 'epoch': 4.32}


 43%|████▎     | 2290/5270 [14:58<20:32,  2.42it/s]

{'loss': 0.926, 'grad_norm': 33.25457000732422, 'learning_rate': 1.2511530398322853e-05, 'epoch': 4.34}


 44%|████▎     | 2300/5270 [15:03<20:42,  2.39it/s]

{'loss': 1.0539, 'grad_norm': 40.92694091796875, 'learning_rate': 1.2469601677148847e-05, 'epoch': 4.36}


 44%|████▍     | 2310/5270 [15:06<18:14,  2.70it/s]

{'loss': 0.9351, 'grad_norm': 16.62480926513672, 'learning_rate': 1.2427672955974843e-05, 'epoch': 4.38}


 44%|████▍     | 2320/5270 [15:10<18:03,  2.72it/s]

{'loss': 0.8136, 'grad_norm': 35.47499465942383, 'learning_rate': 1.2385744234800841e-05, 'epoch': 4.4}


 44%|████▍     | 2330/5270 [15:14<18:03,  2.71it/s]

{'loss': 0.8399, 'grad_norm': 20.821746826171875, 'learning_rate': 1.2343815513626835e-05, 'epoch': 4.42}


 44%|████▍     | 2340/5270 [15:17<17:51,  2.73it/s]

{'loss': 0.9457, 'grad_norm': 43.966331481933594, 'learning_rate': 1.2301886792452831e-05, 'epoch': 4.44}


 45%|████▍     | 2350/5270 [15:21<17:45,  2.74it/s]

{'loss': 0.9916, 'grad_norm': 24.390331268310547, 'learning_rate': 1.2259958071278828e-05, 'epoch': 4.45}


 45%|████▍     | 2360/5270 [15:25<19:51,  2.44it/s]

{'loss': 0.8859, 'grad_norm': 37.031803131103516, 'learning_rate': 1.2218029350104823e-05, 'epoch': 4.47}


 45%|████▍     | 2370/5270 [15:29<20:05,  2.41it/s]

{'loss': 1.0937, 'grad_norm': 46.56423568725586, 'learning_rate': 1.2176100628930818e-05, 'epoch': 4.49}


 45%|████▌     | 2380/5270 [15:33<20:04,  2.40it/s]

{'loss': 0.7635, 'grad_norm': 27.155118942260742, 'learning_rate': 1.2134171907756816e-05, 'epoch': 4.51}


 45%|████▌     | 2390/5270 [15:38<20:02,  2.40it/s]

{'loss': 0.9289, 'grad_norm': 94.18013000488281, 'learning_rate': 1.209224318658281e-05, 'epoch': 4.53}


 46%|████▌     | 2400/5270 [15:41<17:47,  2.69it/s]

{'loss': 1.1801, 'grad_norm': 32.64531707763672, 'learning_rate': 1.2050314465408806e-05, 'epoch': 4.55}


 46%|████▌     | 2410/5270 [15:45<17:15,  2.76it/s]

{'loss': 0.9036, 'grad_norm': 51.70439910888672, 'learning_rate': 1.2008385744234804e-05, 'epoch': 4.57}


 46%|████▌     | 2420/5270 [15:49<17:12,  2.76it/s]

{'loss': 0.7306, 'grad_norm': 45.23664855957031, 'learning_rate': 1.1966457023060798e-05, 'epoch': 4.59}


 46%|████▌     | 2430/5270 [15:52<17:07,  2.76it/s]

{'loss': 1.0182, 'grad_norm': 67.11846160888672, 'learning_rate': 1.1924528301886794e-05, 'epoch': 4.61}


 46%|████▋     | 2440/5270 [15:56<18:56,  2.49it/s]

{'loss': 1.0273, 'grad_norm': 88.51338195800781, 'learning_rate': 1.1882599580712791e-05, 'epoch': 4.63}


 46%|████▋     | 2450/5270 [16:00<19:32,  2.41it/s]

{'loss': 0.9469, 'grad_norm': 42.72591781616211, 'learning_rate': 1.1840670859538785e-05, 'epoch': 4.64}


 47%|████▋     | 2460/5270 [16:05<19:29,  2.40it/s]

{'loss': 0.9494, 'grad_norm': 28.697858810424805, 'learning_rate': 1.1798742138364781e-05, 'epoch': 4.66}


 47%|████▋     | 2470/5270 [16:09<19:19,  2.42it/s]

{'loss': 1.1937, 'grad_norm': 38.50943374633789, 'learning_rate': 1.1756813417190777e-05, 'epoch': 4.68}


 47%|████▋     | 2480/5270 [16:13<19:16,  2.41it/s]

{'loss': 1.1845, 'grad_norm': 66.60672760009766, 'learning_rate': 1.1714884696016773e-05, 'epoch': 4.7}


 47%|████▋     | 2490/5270 [16:17<17:51,  2.59it/s]

{'loss': 1.188, 'grad_norm': 49.950496673583984, 'learning_rate': 1.1672955974842769e-05, 'epoch': 4.72}


 47%|████▋     | 2500/5270 [16:20<16:35,  2.78it/s]

{'loss': 0.8427, 'grad_norm': 28.19915008544922, 'learning_rate': 1.1631027253668763e-05, 'epoch': 4.74}


 48%|████▊     | 2510/5270 [16:24<16:32,  2.78it/s]

{'loss': 0.7745, 'grad_norm': 41.544185638427734, 'learning_rate': 1.158909853249476e-05, 'epoch': 4.76}


 48%|████▊     | 2520/5270 [16:28<16:52,  2.72it/s]

{'loss': 0.9899, 'grad_norm': 62.94758224487305, 'learning_rate': 1.1547169811320756e-05, 'epoch': 4.78}


 48%|████▊     | 2530/5270 [16:31<17:02,  2.68it/s]

{'loss': 0.9557, 'grad_norm': 59.67117691040039, 'learning_rate': 1.150524109014675e-05, 'epoch': 4.8}


 48%|████▊     | 2540/5270 [16:35<17:02,  2.67it/s]

{'loss': 1.1727, 'grad_norm': 58.3978157043457, 'learning_rate': 1.1463312368972748e-05, 'epoch': 4.82}


 48%|████▊     | 2550/5270 [16:39<16:59,  2.67it/s]

{'loss': 0.818, 'grad_norm': 27.66299057006836, 'learning_rate': 1.1421383647798742e-05, 'epoch': 4.83}


 49%|████▊     | 2560/5270 [16:43<16:54,  2.67it/s]

{'loss': 0.7839, 'grad_norm': 27.700653076171875, 'learning_rate': 1.1379454926624738e-05, 'epoch': 4.85}


 49%|████▉     | 2570/5270 [16:46<16:47,  2.68it/s]

{'loss': 0.8434, 'grad_norm': 34.82466125488281, 'learning_rate': 1.1337526205450735e-05, 'epoch': 4.87}


 49%|████▉     | 2580/5270 [16:50<16:42,  2.68it/s]

{'loss': 1.0108, 'grad_norm': 31.731279373168945, 'learning_rate': 1.129559748427673e-05, 'epoch': 4.89}


 49%|████▉     | 2590/5270 [16:54<16:41,  2.68it/s]

{'loss': 1.0094, 'grad_norm': 75.71648406982422, 'learning_rate': 1.1253668763102725e-05, 'epoch': 4.91}


 49%|████▉     | 2600/5270 [16:58<16:36,  2.68it/s]

{'loss': 1.0347, 'grad_norm': 24.034046173095703, 'learning_rate': 1.1211740041928723e-05, 'epoch': 4.93}


 50%|████▉     | 2610/5270 [17:01<16:33,  2.68it/s]

{'loss': 0.9791, 'grad_norm': 97.89730072021484, 'learning_rate': 1.1169811320754717e-05, 'epoch': 4.95}


 50%|████▉     | 2620/5270 [17:05<16:33,  2.67it/s]

{'loss': 0.967, 'grad_norm': 22.17144203186035, 'learning_rate': 1.1127882599580713e-05, 'epoch': 4.97}


 50%|████▉     | 2630/5270 [17:09<16:35,  2.65it/s]

{'loss': 0.8225, 'grad_norm': 60.53718185424805, 'learning_rate': 1.108595387840671e-05, 'epoch': 4.99}


 50%|█████     | 2637/5270 [17:11<16:22,  2.68it/s]
 50%|█████     | 2637/5270 [17:26<16:22,  2.68it/s]

{'eval_loss': 4.833948135375977, 'eval_runtime': 14.5379, 'eval_samples_per_second': 52.552, 'eval_steps_per_second': 17.54, 'epoch': 5.0}


 50%|█████     | 2640/5270 [17:27<1:49:35,  2.50s/it]

{'loss': 0.778, 'grad_norm': 26.87006378173828, 'learning_rate': 1.1044025157232705e-05, 'epoch': 5.0}


 50%|█████     | 2650/5270 [17:31<19:04,  2.29it/s]  

{'loss': 0.6648, 'grad_norm': 50.52385711669922, 'learning_rate': 1.10020964360587e-05, 'epoch': 5.02}


 50%|█████     | 2660/5270 [17:35<16:26,  2.65it/s]

{'loss': 0.7503, 'grad_norm': 35.10611343383789, 'learning_rate': 1.0960167714884698e-05, 'epoch': 5.04}


 51%|█████     | 2670/5270 [17:38<16:12,  2.67it/s]

{'loss': 0.6406, 'grad_norm': 42.00798034667969, 'learning_rate': 1.0918238993710692e-05, 'epoch': 5.06}


 51%|█████     | 2680/5270 [17:42<16:07,  2.68it/s]

{'loss': 0.759, 'grad_norm': 35.930625915527344, 'learning_rate': 1.0876310272536688e-05, 'epoch': 5.08}


 51%|█████     | 2690/5270 [17:46<16:01,  2.68it/s]

{'loss': 0.4697, 'grad_norm': 27.18764305114746, 'learning_rate': 1.0834381551362686e-05, 'epoch': 5.1}


 51%|█████     | 2700/5270 [17:50<16:06,  2.66it/s]

{'loss': 0.6176, 'grad_norm': 31.867021560668945, 'learning_rate': 1.079245283018868e-05, 'epoch': 5.12}


 51%|█████▏    | 2710/5270 [17:53<15:57,  2.68it/s]

{'loss': 0.6357, 'grad_norm': 24.567493438720703, 'learning_rate': 1.0750524109014676e-05, 'epoch': 5.14}


 52%|█████▏    | 2720/5270 [17:57<16:02,  2.65it/s]

{'loss': 0.8993, 'grad_norm': 60.61571502685547, 'learning_rate': 1.0708595387840673e-05, 'epoch': 5.16}


 52%|█████▏    | 2730/5270 [18:01<15:52,  2.67it/s]

{'loss': 0.7976, 'grad_norm': 45.01401138305664, 'learning_rate': 1.0666666666666667e-05, 'epoch': 5.18}


 52%|█████▏    | 2740/5270 [18:05<17:26,  2.42it/s]

{'loss': 0.6263, 'grad_norm': 69.98491668701172, 'learning_rate': 1.0624737945492663e-05, 'epoch': 5.19}


 52%|█████▏    | 2750/5270 [18:09<17:38,  2.38it/s]

{'loss': 0.7656, 'grad_norm': 76.78641510009766, 'learning_rate': 1.0582809224318659e-05, 'epoch': 5.21}


 52%|█████▏    | 2760/5270 [18:13<17:37,  2.37it/s]

{'loss': 0.6043, 'grad_norm': 64.26402282714844, 'learning_rate': 1.0540880503144655e-05, 'epoch': 5.23}


 53%|█████▎    | 2770/5270 [18:17<17:25,  2.39it/s]

{'loss': 0.7025, 'grad_norm': 24.1363525390625, 'learning_rate': 1.049895178197065e-05, 'epoch': 5.25}


 53%|█████▎    | 2780/5270 [18:21<16:19,  2.54it/s]

{'loss': 0.5582, 'grad_norm': 32.29231643676758, 'learning_rate': 1.0457023060796647e-05, 'epoch': 5.27}


 53%|█████▎    | 2790/5270 [18:25<15:37,  2.65it/s]

{'loss': 0.5826, 'grad_norm': 53.87314987182617, 'learning_rate': 1.0415094339622642e-05, 'epoch': 5.29}


 53%|█████▎    | 2800/5270 [18:29<15:28,  2.66it/s]

{'loss': 0.5917, 'grad_norm': 51.26101303100586, 'learning_rate': 1.0373165618448637e-05, 'epoch': 5.31}


 53%|█████▎    | 2810/5270 [18:33<15:24,  2.66it/s]

{'loss': 0.7237, 'grad_norm': 78.83344268798828, 'learning_rate': 1.0331236897274634e-05, 'epoch': 5.33}


 54%|█████▎    | 2820/5270 [18:37<15:16,  2.67it/s]

{'loss': 0.7605, 'grad_norm': 76.48191833496094, 'learning_rate': 1.028930817610063e-05, 'epoch': 5.35}


 54%|█████▎    | 2830/5270 [18:40<15:16,  2.66it/s]

{'loss': 0.8348, 'grad_norm': 47.848846435546875, 'learning_rate': 1.0247379454926624e-05, 'epoch': 5.36}


 54%|█████▍    | 2840/5270 [18:44<15:10,  2.67it/s]

{'loss': 0.78, 'grad_norm': 38.2759895324707, 'learning_rate': 1.0205450733752622e-05, 'epoch': 5.38}


 54%|█████▍    | 2850/5270 [18:48<15:11,  2.65it/s]

{'loss': 0.8383, 'grad_norm': 34.96602249145508, 'learning_rate': 1.0163522012578618e-05, 'epoch': 5.4}


 54%|█████▍    | 2860/5270 [18:52<14:59,  2.68it/s]

{'loss': 0.7177, 'grad_norm': 17.155046463012695, 'learning_rate': 1.0121593291404612e-05, 'epoch': 5.42}


 54%|█████▍    | 2870/5270 [18:55<15:04,  2.65it/s]

{'loss': 0.8387, 'grad_norm': inf, 'learning_rate': 1.0083857442348009e-05, 'epoch': 5.44}


 55%|█████▍    | 2880/5270 [18:59<14:51,  2.68it/s]

{'loss': 0.7809, 'grad_norm': 47.843990325927734, 'learning_rate': 1.0046121593291405e-05, 'epoch': 5.46}


 55%|█████▍    | 2890/5270 [19:03<14:54,  2.66it/s]

{'loss': 0.8309, 'grad_norm': 20.64682388305664, 'learning_rate': 1.0004192872117402e-05, 'epoch': 5.48}


 55%|█████▌    | 2900/5270 [19:07<16:24,  2.41it/s]

{'loss': 0.6885, 'grad_norm': 58.53581237792969, 'learning_rate': 9.962264150943397e-06, 'epoch': 5.5}


 55%|█████▌    | 2910/5270 [19:11<16:54,  2.33it/s]

{'loss': 0.7436, 'grad_norm': 43.04487991333008, 'learning_rate': 9.920335429769392e-06, 'epoch': 5.52}


 55%|█████▌    | 2920/5270 [19:15<16:44,  2.34it/s]

{'loss': 0.7784, 'grad_norm': 48.67704772949219, 'learning_rate': 9.878406708595388e-06, 'epoch': 5.54}


 56%|█████▌    | 2930/5270 [19:20<17:31,  2.23it/s]

{'loss': 0.7456, 'grad_norm': 44.33000183105469, 'learning_rate': 9.836477987421384e-06, 'epoch': 5.55}


 56%|█████▌    | 2940/5270 [19:24<17:54,  2.17it/s]

{'loss': 0.6789, 'grad_norm': 48.24397277832031, 'learning_rate': 9.79454926624738e-06, 'epoch': 5.57}


 56%|█████▌    | 2950/5270 [19:29<17:48,  2.17it/s]

{'loss': 0.7133, 'grad_norm': 35.143226623535156, 'learning_rate': 9.752620545073376e-06, 'epoch': 5.59}


 56%|█████▌    | 2960/5270 [19:33<16:19,  2.36it/s]

{'loss': 0.745, 'grad_norm': 68.89501190185547, 'learning_rate': 9.710691823899372e-06, 'epoch': 5.61}


 56%|█████▋    | 2970/5270 [19:38<16:46,  2.29it/s]

{'loss': 0.7021, 'grad_norm': 14.761308670043945, 'learning_rate': 9.668763102725368e-06, 'epoch': 5.63}


 57%|█████▋    | 2980/5270 [19:42<17:25,  2.19it/s]

{'loss': 0.7071, 'grad_norm': 36.409053802490234, 'learning_rate': 9.626834381551363e-06, 'epoch': 5.65}


 57%|█████▋    | 2990/5270 [19:46<15:21,  2.47it/s]

{'loss': 0.6763, 'grad_norm': 47.57148742675781, 'learning_rate': 9.58490566037736e-06, 'epoch': 5.67}


 57%|█████▋    | 3000/5270 [19:50<15:01,  2.52it/s]

{'loss': 0.7558, 'grad_norm': 21.92473030090332, 'learning_rate': 9.542976939203355e-06, 'epoch': 5.69}


 57%|█████▋    | 3010/5270 [19:57<15:13,  2.48it/s]

{'loss': 0.6411, 'grad_norm': 28.30286979675293, 'learning_rate': 9.501048218029351e-06, 'epoch': 5.71}


 57%|█████▋    | 3020/5270 [20:01<13:58,  2.68it/s]

{'loss': 0.5548, 'grad_norm': 49.04566955566406, 'learning_rate': 9.459119496855347e-06, 'epoch': 5.73}


 57%|█████▋    | 3030/5270 [20:04<13:52,  2.69it/s]

{'loss': 0.8357, 'grad_norm': 92.6123046875, 'learning_rate': 9.417190775681343e-06, 'epoch': 5.74}


 58%|█████▊    | 3040/5270 [20:08<13:51,  2.68it/s]

{'loss': 0.8319, 'grad_norm': 63.43717956542969, 'learning_rate': 9.375262054507338e-06, 'epoch': 5.76}


 58%|█████▊    | 3050/5270 [20:12<13:50,  2.67it/s]

{'loss': 0.6557, 'grad_norm': 35.172298431396484, 'learning_rate': 9.333333333333334e-06, 'epoch': 5.78}


 58%|█████▊    | 3060/5270 [20:15<13:41,  2.69it/s]

{'loss': 0.6751, 'grad_norm': 50.06849670410156, 'learning_rate': 9.29140461215933e-06, 'epoch': 5.8}


 58%|█████▊    | 3070/5270 [20:19<13:38,  2.69it/s]

{'loss': 0.7422, 'grad_norm': 22.64161491394043, 'learning_rate': 9.249475890985326e-06, 'epoch': 5.82}


 58%|█████▊    | 3080/5270 [20:23<13:36,  2.68it/s]

{'loss': 0.9691, 'grad_norm': 46.95933151245117, 'learning_rate': 9.207547169811322e-06, 'epoch': 5.84}


 59%|█████▊    | 3090/5270 [20:27<13:38,  2.66it/s]

{'loss': 0.775, 'grad_norm': 63.7152214050293, 'learning_rate': 9.165618448637316e-06, 'epoch': 5.86}


 59%|█████▉    | 3100/5270 [20:30<13:29,  2.68it/s]

{'loss': 0.6479, 'grad_norm': 36.761962890625, 'learning_rate': 9.123689727463314e-06, 'epoch': 5.88}


 59%|█████▉    | 3110/5270 [20:34<13:22,  2.69it/s]

{'loss': 0.8112, 'grad_norm': 57.48936462402344, 'learning_rate': 9.08176100628931e-06, 'epoch': 5.9}


 59%|█████▉    | 3120/5270 [20:38<13:17,  2.70it/s]

{'loss': 0.7063, 'grad_norm': 52.2878303527832, 'learning_rate': 9.039832285115304e-06, 'epoch': 5.91}


 59%|█████▉    | 3130/5270 [20:42<13:15,  2.69it/s]

{'loss': 0.5073, 'grad_norm': 54.63965606689453, 'learning_rate': 8.997903563941301e-06, 'epoch': 5.93}


 60%|█████▉    | 3140/5270 [20:45<13:09,  2.70it/s]

{'loss': 0.6127, 'grad_norm': 27.997831344604492, 'learning_rate': 8.955974842767297e-06, 'epoch': 5.95}


 60%|█████▉    | 3150/5270 [20:49<13:12,  2.67it/s]

{'loss': 0.7964, 'grad_norm': 39.005699157714844, 'learning_rate': 8.914046121593291e-06, 'epoch': 5.97}


 60%|█████▉    | 3160/5270 [20:53<13:09,  2.67it/s]

{'loss': 0.846, 'grad_norm': 53.913238525390625, 'learning_rate': 8.872117400419287e-06, 'epoch': 5.99}


 60%|██████    | 3165/5270 [20:55<12:14,  2.86it/s]
 60%|██████    | 3165/5270 [21:09<12:14,  2.86it/s]

{'eval_loss': 5.235256195068359, 'eval_runtime': 14.8224, 'eval_samples_per_second': 51.544, 'eval_steps_per_second': 17.204, 'epoch': 6.0}


 60%|██████    | 3170/5270 [21:11<50:07,  1.43s/it]  

{'loss': 0.7277, 'grad_norm': 12.441580772399902, 'learning_rate': 8.830188679245285e-06, 'epoch': 6.01}


 60%|██████    | 3180/5270 [21:15<13:57,  2.50it/s]

{'loss': 0.3833, 'grad_norm': 28.05949592590332, 'learning_rate': 8.788259958071279e-06, 'epoch': 6.03}


 61%|██████    | 3190/5270 [21:19<12:53,  2.69it/s]

{'loss': 0.4977, 'grad_norm': 53.014461517333984, 'learning_rate': 8.746331236897275e-06, 'epoch': 6.05}


 61%|██████    | 3200/5270 [21:22<12:54,  2.67it/s]

{'loss': 0.5063, 'grad_norm': 40.94393539428711, 'learning_rate': 8.704402515723272e-06, 'epoch': 6.07}


 61%|██████    | 3210/5270 [21:26<12:46,  2.69it/s]

{'loss': 0.3797, 'grad_norm': 28.442249298095703, 'learning_rate': 8.662473794549266e-06, 'epoch': 6.09}


 61%|██████    | 3220/5270 [21:30<12:44,  2.68it/s]

{'loss': 0.5061, 'grad_norm': 37.00974655151367, 'learning_rate': 8.620545073375262e-06, 'epoch': 6.1}


 61%|██████▏   | 3230/5270 [21:34<12:41,  2.68it/s]

{'loss': 0.513, 'grad_norm': 18.014482498168945, 'learning_rate': 8.57861635220126e-06, 'epoch': 6.12}


 61%|██████▏   | 3240/5270 [21:37<12:38,  2.68it/s]

{'loss': 0.5297, 'grad_norm': 35.36332321166992, 'learning_rate': 8.536687631027254e-06, 'epoch': 6.14}


 62%|██████▏   | 3250/5270 [21:41<12:34,  2.68it/s]

{'loss': 0.4756, 'grad_norm': 84.14803314208984, 'learning_rate': 8.49475890985325e-06, 'epoch': 6.16}


 62%|██████▏   | 3260/5270 [21:45<12:25,  2.70it/s]

{'loss': 0.5437, 'grad_norm': 22.297420501708984, 'learning_rate': 8.452830188679245e-06, 'epoch': 6.18}


 62%|██████▏   | 3270/5270 [21:49<12:27,  2.68it/s]

{'loss': 0.5101, 'grad_norm': 47.763301849365234, 'learning_rate': 8.410901467505241e-06, 'epoch': 6.2}


 62%|██████▏   | 3280/5270 [21:52<12:23,  2.68it/s]

{'loss': 0.5369, 'grad_norm': 66.23430633544922, 'learning_rate': 8.368972746331237e-06, 'epoch': 6.22}


 62%|██████▏   | 3290/5270 [21:56<12:19,  2.68it/s]

{'loss': 0.673, 'grad_norm': 45.32788848876953, 'learning_rate': 8.327044025157233e-06, 'epoch': 6.24}


 63%|██████▎   | 3300/5270 [22:00<12:13,  2.68it/s]

{'loss': 0.613, 'grad_norm': 48.54710006713867, 'learning_rate': 8.285115303983229e-06, 'epoch': 6.26}


 63%|██████▎   | 3310/5270 [22:03<12:08,  2.69it/s]

{'loss': 0.3389, 'grad_norm': 43.029090881347656, 'learning_rate': 8.243186582809225e-06, 'epoch': 6.27}


 63%|██████▎   | 3320/5270 [22:07<12:07,  2.68it/s]

{'loss': 0.5357, 'grad_norm': 16.174036026000977, 'learning_rate': 8.20125786163522e-06, 'epoch': 6.29}


 63%|██████▎   | 3330/5270 [22:11<12:04,  2.68it/s]

{'loss': 0.3238, 'grad_norm': 25.586517333984375, 'learning_rate': 8.159329140461216e-06, 'epoch': 6.31}


 63%|██████▎   | 3340/5270 [22:15<12:01,  2.67it/s]

{'loss': 0.4328, 'grad_norm': 21.11106300354004, 'learning_rate': 8.117400419287212e-06, 'epoch': 6.33}


 64%|██████▎   | 3350/5270 [22:18<12:02,  2.66it/s]

{'loss': 0.4279, 'grad_norm': 40.91997146606445, 'learning_rate': 8.075471698113208e-06, 'epoch': 6.35}


 64%|██████▍   | 3360/5270 [22:22<11:57,  2.66it/s]

{'loss': 0.5374, 'grad_norm': 96.04467010498047, 'learning_rate': 8.033542976939204e-06, 'epoch': 6.37}


 64%|██████▍   | 3370/5270 [22:26<11:43,  2.70it/s]

{'loss': 0.3408, 'grad_norm': 30.465999603271484, 'learning_rate': 7.9916142557652e-06, 'epoch': 6.39}


 64%|██████▍   | 3380/5270 [22:30<11:47,  2.67it/s]

{'loss': 0.634, 'grad_norm': 13.623220443725586, 'learning_rate': 7.949685534591196e-06, 'epoch': 6.41}


 64%|██████▍   | 3390/5270 [22:33<11:41,  2.68it/s]

{'loss': 0.4473, 'grad_norm': 87.14071655273438, 'learning_rate': 7.907756813417192e-06, 'epoch': 6.43}


 65%|██████▍   | 3400/5270 [22:37<11:37,  2.68it/s]

{'loss': 0.4163, 'grad_norm': 57.250221252441406, 'learning_rate': 7.865828092243187e-06, 'epoch': 6.45}


 65%|██████▍   | 3410/5270 [22:41<11:33,  2.68it/s]

{'loss': 0.6383, 'grad_norm': 21.151742935180664, 'learning_rate': 7.823899371069183e-06, 'epoch': 6.46}


 65%|██████▍   | 3420/5270 [22:45<11:32,  2.67it/s]

{'loss': 0.5051, 'grad_norm': 27.668373107910156, 'learning_rate': 7.781970649895179e-06, 'epoch': 6.48}


 65%|██████▌   | 3430/5270 [22:48<12:11,  2.51it/s]

{'loss': 0.4506, 'grad_norm': 51.977542877197266, 'learning_rate': 7.740041928721175e-06, 'epoch': 6.5}


 65%|██████▌   | 3440/5270 [22:53<12:35,  2.42it/s]

{'loss': 0.8232, 'grad_norm': 35.52547836303711, 'learning_rate': 7.69811320754717e-06, 'epoch': 6.52}


 65%|██████▌   | 3450/5270 [22:57<12:26,  2.44it/s]

{'loss': 0.7099, 'grad_norm': 29.58039093017578, 'learning_rate': 7.656184486373167e-06, 'epoch': 6.54}


 66%|██████▌   | 3460/5270 [23:01<12:21,  2.44it/s]

{'loss': 0.5119, 'grad_norm': 46.59348678588867, 'learning_rate': 7.6142557651991625e-06, 'epoch': 6.56}


 66%|██████▌   | 3470/5270 [23:05<12:20,  2.43it/s]

{'loss': 0.4146, 'grad_norm': 6.641229152679443, 'learning_rate': 7.572327044025158e-06, 'epoch': 6.58}


 66%|██████▌   | 3480/5270 [23:09<12:15,  2.43it/s]

{'loss': 0.5464, 'grad_norm': 53.847808837890625, 'learning_rate': 7.530398322851153e-06, 'epoch': 6.6}


 66%|██████▌   | 3490/5270 [23:13<12:11,  2.43it/s]

{'loss': 0.6029, 'grad_norm': 40.602603912353516, 'learning_rate': 7.48846960167715e-06, 'epoch': 6.62}


 66%|██████▋   | 3500/5270 [23:17<11:11,  2.64it/s]

{'loss': 0.3956, 'grad_norm': 5.846301555633545, 'learning_rate': 7.446540880503145e-06, 'epoch': 6.64}


 67%|██████▋   | 3510/5270 [23:21<10:57,  2.68it/s]

{'loss': 0.4985, 'grad_norm': 46.576412200927734, 'learning_rate': 7.404612159329141e-06, 'epoch': 6.65}


 67%|██████▋   | 3520/5270 [23:24<10:50,  2.69it/s]

{'loss': 0.5411, 'grad_norm': 75.6265640258789, 'learning_rate': 7.362683438155137e-06, 'epoch': 6.67}


 67%|██████▋   | 3530/5270 [23:28<10:46,  2.69it/s]

{'loss': 0.6849, 'grad_norm': 10.836095809936523, 'learning_rate': 7.3207547169811326e-06, 'epoch': 6.69}


 67%|██████▋   | 3540/5270 [23:32<10:47,  2.67it/s]

{'loss': 0.5859, 'grad_norm': 47.40731430053711, 'learning_rate': 7.2788259958071284e-06, 'epoch': 6.71}


 67%|██████▋   | 3550/5270 [23:36<10:41,  2.68it/s]

{'loss': 0.6577, 'grad_norm': 72.3969497680664, 'learning_rate': 7.236897274633124e-06, 'epoch': 6.73}


 68%|██████▊   | 3560/5270 [23:39<10:41,  2.67it/s]

{'loss': 0.5638, 'grad_norm': 37.61716842651367, 'learning_rate': 7.194968553459119e-06, 'epoch': 6.75}


 68%|██████▊   | 3570/5270 [23:43<10:34,  2.68it/s]

{'loss': 0.6802, 'grad_norm': 54.97772216796875, 'learning_rate': 7.153039832285116e-06, 'epoch': 6.77}


 68%|██████▊   | 3580/5270 [23:47<10:32,  2.67it/s]

{'loss': 0.6005, 'grad_norm': 20.675567626953125, 'learning_rate': 7.111111111111112e-06, 'epoch': 6.79}


 68%|██████▊   | 3590/5270 [23:51<10:27,  2.68it/s]

{'loss': 0.5389, 'grad_norm': 134.0134735107422, 'learning_rate': 7.069182389937107e-06, 'epoch': 6.81}


 68%|██████▊   | 3600/5270 [23:54<10:24,  2.67it/s]

{'loss': 0.616, 'grad_norm': 8.76258659362793, 'learning_rate': 7.0272536687631035e-06, 'epoch': 6.82}


 69%|██████▊   | 3610/5270 [23:58<10:18,  2.69it/s]

{'loss': 0.7267, 'grad_norm': 87.8435287475586, 'learning_rate': 6.985324947589099e-06, 'epoch': 6.84}


 69%|██████▊   | 3620/5270 [24:02<10:14,  2.69it/s]

{'loss': 0.433, 'grad_norm': 75.16143035888672, 'learning_rate': 6.943396226415094e-06, 'epoch': 6.86}


 69%|██████▉   | 3630/5270 [24:06<10:11,  2.68it/s]

{'loss': 0.3541, 'grad_norm': 10.545727729797363, 'learning_rate': 6.901467505241091e-06, 'epoch': 6.88}


 69%|██████▉   | 3640/5270 [24:09<10:06,  2.69it/s]

{'loss': 0.5562, 'grad_norm': 26.085866928100586, 'learning_rate': 6.859538784067087e-06, 'epoch': 6.9}


 69%|██████▉   | 3650/5270 [24:13<10:00,  2.70it/s]

{'loss': 0.3787, 'grad_norm': 41.895687103271484, 'learning_rate': 6.817610062893082e-06, 'epoch': 6.92}


 69%|██████▉   | 3660/5270 [24:17<10:03,  2.67it/s]

{'loss': 0.512, 'grad_norm': 39.7564697265625, 'learning_rate': 6.775681341719078e-06, 'epoch': 6.94}


 70%|██████▉   | 3670/5270 [24:21<09:59,  2.67it/s]

{'loss': 0.3901, 'grad_norm': 37.28553009033203, 'learning_rate': 6.7337526205450745e-06, 'epoch': 6.96}


 70%|██████▉   | 3680/5270 [24:24<09:52,  2.68it/s]

{'loss': 0.4678, 'grad_norm': 5.676793098449707, 'learning_rate': 6.6918238993710695e-06, 'epoch': 6.98}


 70%|███████   | 3690/5270 [24:28<09:49,  2.68it/s]

{'loss': 0.6101, 'grad_norm': 29.181608200073242, 'learning_rate': 6.649895178197065e-06, 'epoch': 7.0}


 70%|███████   | 3692/5270 [24:29<09:51,  2.67it/s]
 70%|███████   | 3693/5270 [24:44<2:06:23,  4.81s/it]

{'eval_loss': 5.564907073974609, 'eval_runtime': 14.8544, 'eval_samples_per_second': 51.433, 'eval_steps_per_second': 17.167, 'epoch': 7.0}


 70%|███████   | 3700/5270 [24:46<19:17,  1.36it/s]  

{'loss': 0.3338, 'grad_norm': 20.914474487304688, 'learning_rate': 6.607966457023062e-06, 'epoch': 7.01}


 70%|███████   | 3710/5270 [24:50<09:58,  2.61it/s]

{'loss': 0.3337, 'grad_norm': 4.971357822418213, 'learning_rate': 6.566037735849057e-06, 'epoch': 7.03}


 71%|███████   | 3720/5270 [24:54<09:38,  2.68it/s]

{'loss': 0.3681, 'grad_norm': 20.313316345214844, 'learning_rate': 6.524109014675053e-06, 'epoch': 7.05}


 71%|███████   | 3730/5270 [24:58<09:36,  2.67it/s]

{'loss': 0.4582, 'grad_norm': 13.6456937789917, 'learning_rate': 6.4821802935010496e-06, 'epoch': 7.07}


 71%|███████   | 3740/5270 [25:01<09:31,  2.68it/s]

{'loss': 0.1818, 'grad_norm': 41.790916442871094, 'learning_rate': 6.4402515723270446e-06, 'epoch': 7.09}


 71%|███████   | 3750/5270 [25:05<09:24,  2.69it/s]

{'loss': 0.3325, 'grad_norm': 19.824636459350586, 'learning_rate': 6.3983228511530404e-06, 'epoch': 7.11}


 71%|███████▏  | 3760/5270 [25:09<09:24,  2.68it/s]

{'loss': 0.4813, 'grad_norm': 28.931528091430664, 'learning_rate': 6.356394129979036e-06, 'epoch': 7.13}


 72%|███████▏  | 3770/5270 [25:13<09:19,  2.68it/s]

{'loss': 0.2656, 'grad_norm': 24.352108001708984, 'learning_rate': 6.314465408805031e-06, 'epoch': 7.15}


 72%|███████▏  | 3780/5270 [25:16<09:17,  2.67it/s]

{'loss': 0.3436, 'grad_norm': 22.044883728027344, 'learning_rate': 6.272536687631028e-06, 'epoch': 7.17}


 72%|███████▏  | 3790/5270 [25:20<09:13,  2.68it/s]

{'loss': 0.4206, 'grad_norm': 68.95362854003906, 'learning_rate': 6.230607966457024e-06, 'epoch': 7.18}


 72%|███████▏  | 3800/5270 [25:24<09:06,  2.69it/s]

{'loss': 0.2995, 'grad_norm': 8.889057159423828, 'learning_rate': 6.188679245283019e-06, 'epoch': 7.2}


 72%|███████▏  | 3810/5270 [25:28<09:02,  2.69it/s]

{'loss': 0.2661, 'grad_norm': 15.163758277893066, 'learning_rate': 6.1467505241090155e-06, 'epoch': 7.22}


 72%|███████▏  | 3820/5270 [25:31<08:59,  2.69it/s]

{'loss': 0.1976, 'grad_norm': 33.17724609375, 'learning_rate': 6.104821802935011e-06, 'epoch': 7.24}


 73%|███████▎  | 3830/5270 [25:35<08:57,  2.68it/s]

{'loss': 0.2781, 'grad_norm': 86.39572143554688, 'learning_rate': 6.062893081761006e-06, 'epoch': 7.26}


 73%|███████▎  | 3840/5270 [25:39<09:05,  2.62it/s]

{'loss': 0.4837, 'grad_norm': 27.642742156982422, 'learning_rate': 6.020964360587003e-06, 'epoch': 7.28}


 73%|███████▎  | 3850/5270 [25:43<09:45,  2.42it/s]

{'loss': 0.4928, 'grad_norm': 47.108238220214844, 'learning_rate': 5.979035639412999e-06, 'epoch': 7.3}


 73%|███████▎  | 3860/5270 [25:47<09:40,  2.43it/s]

{'loss': 0.4375, 'grad_norm': 51.1305046081543, 'learning_rate': 5.937106918238994e-06, 'epoch': 7.32}


 73%|███████▎  | 3870/5270 [25:51<09:35,  2.43it/s]

{'loss': 0.4851, 'grad_norm': 49.14829635620117, 'learning_rate': 5.89517819706499e-06, 'epoch': 7.34}


 74%|███████▎  | 3880/5270 [25:55<09:31,  2.43it/s]

{'loss': 0.5236, 'grad_norm': 51.889862060546875, 'learning_rate': 5.8532494758909865e-06, 'epoch': 7.36}


 74%|███████▍  | 3890/5270 [25:59<09:27,  2.43it/s]

{'loss': 0.3834, 'grad_norm': 47.88873291015625, 'learning_rate': 5.8113207547169815e-06, 'epoch': 7.37}


 74%|███████▍  | 3900/5270 [26:03<09:23,  2.43it/s]

{'loss': 0.4659, 'grad_norm': 37.20064163208008, 'learning_rate': 5.769392033542977e-06, 'epoch': 7.39}


 74%|███████▍  | 3910/5270 [26:07<08:27,  2.68it/s]

{'loss': 0.3473, 'grad_norm': 32.21043395996094, 'learning_rate': 5.727463312368972e-06, 'epoch': 7.41}


 74%|███████▍  | 3920/5270 [26:11<08:25,  2.67it/s]

{'loss': 0.3025, 'grad_norm': 18.165794372558594, 'learning_rate': 5.685534591194969e-06, 'epoch': 7.43}


 75%|███████▍  | 3930/5270 [26:15<08:22,  2.67it/s]

{'loss': 0.3419, 'grad_norm': 39.846736907958984, 'learning_rate': 5.643605870020965e-06, 'epoch': 7.45}


 75%|███████▍  | 3940/5270 [26:18<08:16,  2.68it/s]

{'loss': 0.4745, 'grad_norm': 61.01234436035156, 'learning_rate': 5.60167714884696e-06, 'epoch': 7.47}


 75%|███████▍  | 3950/5270 [26:22<08:10,  2.69it/s]

{'loss': 0.3596, 'grad_norm': 27.905208587646484, 'learning_rate': 5.5597484276729566e-06, 'epoch': 7.49}


 75%|███████▌  | 3960/5270 [26:26<08:07,  2.68it/s]

{'loss': 0.2583, 'grad_norm': 40.169189453125, 'learning_rate': 5.5178197064989524e-06, 'epoch': 7.51}


 75%|███████▌  | 3970/5270 [26:30<08:01,  2.70it/s]

{'loss': 0.4256, 'grad_norm': 70.45488739013672, 'learning_rate': 5.4758909853249474e-06, 'epoch': 7.53}


 76%|███████▌  | 3980/5270 [26:33<07:58,  2.69it/s]

{'loss': 0.5132, 'grad_norm': 117.97626495361328, 'learning_rate': 5.433962264150944e-06, 'epoch': 7.55}


 76%|███████▌  | 3990/5270 [26:37<07:56,  2.69it/s]

{'loss': 0.2513, 'grad_norm': 66.37491607666016, 'learning_rate': 5.39203354297694e-06, 'epoch': 7.56}


 76%|███████▌  | 4000/5270 [26:41<07:52,  2.69it/s]

{'loss': 0.3849, 'grad_norm': 25.101970672607422, 'learning_rate': 5.350104821802935e-06, 'epoch': 7.58}


 76%|███████▌  | 4010/5270 [26:47<08:57,  2.34it/s]

{'loss': 0.347, 'grad_norm': 25.87184715270996, 'learning_rate': 5.308176100628931e-06, 'epoch': 7.6}


 76%|███████▋  | 4020/5270 [26:51<07:33,  2.76it/s]

{'loss': 0.3776, 'grad_norm': 17.375768661499023, 'learning_rate': 5.2662473794549275e-06, 'epoch': 7.62}


 76%|███████▋  | 4030/5270 [26:55<07:24,  2.79it/s]

{'loss': 0.2672, 'grad_norm': 62.890079498291016, 'learning_rate': 5.2243186582809225e-06, 'epoch': 7.64}


 77%|███████▋  | 4040/5270 [26:58<07:20,  2.79it/s]

{'loss': 0.3059, 'grad_norm': 20.854034423828125, 'learning_rate': 5.182389937106918e-06, 'epoch': 7.66}


 77%|███████▋  | 4050/5270 [27:02<08:17,  2.45it/s]

{'loss': 0.5301, 'grad_norm': 24.334545135498047, 'learning_rate': 5.140461215932915e-06, 'epoch': 7.68}


 77%|███████▋  | 4060/5270 [27:06<08:19,  2.42it/s]

{'loss': 0.4057, 'grad_norm': 41.59856033325195, 'learning_rate': 5.09853249475891e-06, 'epoch': 7.7}


 77%|███████▋  | 4070/5270 [27:10<08:03,  2.48it/s]

{'loss': 0.3967, 'grad_norm': 22.95106315612793, 'learning_rate': 5.056603773584906e-06, 'epoch': 7.72}


 77%|███████▋  | 4080/5270 [27:14<07:34,  2.62it/s]

{'loss': 0.3595, 'grad_norm': 3.074491500854492, 'learning_rate': 5.014675052410903e-06, 'epoch': 7.73}


 78%|███████▊  | 4090/5270 [27:18<07:31,  2.61it/s]

{'loss': 0.3425, 'grad_norm': 59.885414123535156, 'learning_rate': 4.972746331236898e-06, 'epoch': 7.75}


 78%|███████▊  | 4100/5270 [27:22<07:26,  2.62it/s]

{'loss': 0.3294, 'grad_norm': 9.677740097045898, 'learning_rate': 4.9308176100628935e-06, 'epoch': 7.77}


 78%|███████▊  | 4110/5270 [27:26<07:21,  2.63it/s]

{'loss': 0.4933, 'grad_norm': 46.8867073059082, 'learning_rate': 4.888888888888889e-06, 'epoch': 7.79}


 78%|███████▊  | 4120/5270 [27:30<07:19,  2.61it/s]

{'loss': 0.3027, 'grad_norm': 65.63762664794922, 'learning_rate': 4.846960167714885e-06, 'epoch': 7.81}


 78%|███████▊  | 4130/5270 [27:33<07:17,  2.61it/s]

{'loss': 0.3273, 'grad_norm': 38.6510124206543, 'learning_rate': 4.805031446540881e-06, 'epoch': 7.83}


 79%|███████▊  | 4140/5270 [27:37<07:15,  2.59it/s]

{'loss': 0.5236, 'grad_norm': 31.12738037109375, 'learning_rate': 4.763102725366877e-06, 'epoch': 7.85}


 79%|███████▊  | 4150/5270 [27:41<07:10,  2.60it/s]

{'loss': 0.4766, 'grad_norm': 43.541786193847656, 'learning_rate': 4.721174004192873e-06, 'epoch': 7.87}


 79%|███████▉  | 4160/5270 [27:45<07:04,  2.61it/s]

{'loss': 0.3247, 'grad_norm': 9.57084846496582, 'learning_rate': 4.6792452830188686e-06, 'epoch': 7.89}


 79%|███████▉  | 4170/5270 [27:49<07:00,  2.61it/s]

{'loss': 0.3228, 'grad_norm': 19.136011123657227, 'learning_rate': 4.6373165618448644e-06, 'epoch': 7.91}


 79%|███████▉  | 4180/5270 [27:53<06:57,  2.61it/s]

{'loss': 0.4493, 'grad_norm': 33.054100036621094, 'learning_rate': 4.59538784067086e-06, 'epoch': 7.92}


 80%|███████▉  | 4190/5270 [27:56<06:52,  2.62it/s]

{'loss': 0.2023, 'grad_norm': 15.527181625366211, 'learning_rate': 4.553459119496856e-06, 'epoch': 7.94}


 80%|███████▉  | 4200/5270 [28:00<06:50,  2.61it/s]

{'loss': 0.4652, 'grad_norm': 14.86523723602295, 'learning_rate': 4.511530398322851e-06, 'epoch': 7.96}


 80%|███████▉  | 4210/5270 [28:04<06:45,  2.61it/s]

{'loss': 0.4832, 'grad_norm': 64.26507568359375, 'learning_rate': 4.469601677148847e-06, 'epoch': 7.98}


 80%|████████  | 4220/5270 [28:08<06:15,  2.80it/s]

{'loss': 0.4082, 'grad_norm': 35.367469787597656, 'learning_rate': 4.427672955974844e-06, 'epoch': 8.0}



 80%|████████  | 4220/5270 [28:22<06:15,  2.80it/s]

{'eval_loss': 6.074433326721191, 'eval_runtime': 14.0778, 'eval_samples_per_second': 54.27, 'eval_steps_per_second': 18.114, 'epoch': 8.0}


 80%|████████  | 4230/5270 [28:26<09:26,  1.83it/s]  

{'loss': 0.2738, 'grad_norm': 36.51899337768555, 'learning_rate': 4.385744234800839e-06, 'epoch': 8.02}


 80%|████████  | 4240/5270 [28:29<06:30,  2.64it/s]

{'loss': 0.2686, 'grad_norm': 9.177796363830566, 'learning_rate': 4.3438155136268345e-06, 'epoch': 8.04}


 81%|████████  | 4250/5270 [28:33<06:23,  2.66it/s]

{'loss': 0.3027, 'grad_norm': 36.2071647644043, 'learning_rate': 4.30188679245283e-06, 'epoch': 8.06}


 81%|████████  | 4260/5270 [28:37<06:19,  2.66it/s]

{'loss': 0.1553, 'grad_norm': 11.420278549194336, 'learning_rate': 4.259958071278826e-06, 'epoch': 8.08}


 81%|████████  | 4270/5270 [28:41<06:16,  2.66it/s]

{'loss': 0.3243, 'grad_norm': 69.94795227050781, 'learning_rate': 4.218029350104822e-06, 'epoch': 8.09}


 81%|████████  | 4280/5270 [28:44<06:11,  2.66it/s]

{'loss': 0.2459, 'grad_norm': 28.99347496032715, 'learning_rate': 4.176100628930818e-06, 'epoch': 8.11}


 81%|████████▏ | 4290/5270 [28:48<06:07,  2.67it/s]

{'loss': 0.3135, 'grad_norm': 32.72399139404297, 'learning_rate': 4.134171907756814e-06, 'epoch': 8.13}


 82%|████████▏ | 4300/5270 [28:52<06:03,  2.67it/s]

{'loss': 0.2704, 'grad_norm': 21.43511199951172, 'learning_rate': 4.09224318658281e-06, 'epoch': 8.15}


 82%|████████▏ | 4310/5270 [28:56<06:00,  2.67it/s]

{'loss': 0.2633, 'grad_norm': 1.372241735458374, 'learning_rate': 4.0503144654088055e-06, 'epoch': 8.17}


 82%|████████▏ | 4320/5270 [28:59<05:56,  2.67it/s]

{'loss': 0.2244, 'grad_norm': 60.976348876953125, 'learning_rate': 4.008385744234801e-06, 'epoch': 8.19}


 82%|████████▏ | 4330/5270 [29:03<05:52,  2.67it/s]

{'loss': 0.2591, 'grad_norm': 87.75883483886719, 'learning_rate': 3.966457023060797e-06, 'epoch': 8.21}


 82%|████████▏ | 4340/5270 [29:07<05:50,  2.65it/s]

{'loss': 0.4231, 'grad_norm': 36.59647750854492, 'learning_rate': 3.924528301886793e-06, 'epoch': 8.23}


 83%|████████▎ | 4350/5270 [29:11<05:43,  2.68it/s]

{'loss': 0.2664, 'grad_norm': 61.61928176879883, 'learning_rate': 3.882599580712789e-06, 'epoch': 8.25}


 83%|████████▎ | 4360/5270 [29:14<05:39,  2.68it/s]

{'loss': 0.3149, 'grad_norm': 36.46474075317383, 'learning_rate': 3.840670859538784e-06, 'epoch': 8.27}


 83%|████████▎ | 4370/5270 [29:18<05:59,  2.50it/s]

{'loss': 0.2607, 'grad_norm': 41.51679992675781, 'learning_rate': 3.7987421383647806e-06, 'epoch': 8.28}


 83%|████████▎ | 4380/5270 [29:23<06:20,  2.34it/s]

{'loss': 0.4107, 'grad_norm': 73.3086929321289, 'learning_rate': 3.756813417190776e-06, 'epoch': 8.3}


 83%|████████▎ | 4390/5270 [29:27<06:23,  2.30it/s]

{'loss': 0.3726, 'grad_norm': 25.462858200073242, 'learning_rate': 3.714884696016772e-06, 'epoch': 8.32}


 83%|████████▎ | 4400/5270 [29:31<06:13,  2.33it/s]

{'loss': 0.2444, 'grad_norm': 68.49214172363281, 'learning_rate': 3.6729559748427673e-06, 'epoch': 8.34}


 84%|████████▎ | 4410/5270 [29:35<06:12,  2.31it/s]

{'loss': 0.188, 'grad_norm': 36.947933197021484, 'learning_rate': 3.6310272536687636e-06, 'epoch': 8.36}


 84%|████████▍ | 4420/5270 [29:40<06:03,  2.34it/s]

{'loss': 0.346, 'grad_norm': 42.387386322021484, 'learning_rate': 3.589098532494759e-06, 'epoch': 8.38}


 84%|████████▍ | 4430/5270 [29:44<05:09,  2.71it/s]

{'loss': 0.4187, 'grad_norm': 50.45467758178711, 'learning_rate': 3.547169811320755e-06, 'epoch': 8.4}


 84%|████████▍ | 4440/5270 [29:48<05:34,  2.48it/s]

{'loss': 0.2389, 'grad_norm': 43.16492462158203, 'learning_rate': 3.505241090146751e-06, 'epoch': 8.42}


 84%|████████▍ | 4450/5270 [29:52<05:45,  2.37it/s]

{'loss': 0.2236, 'grad_norm': 23.332887649536133, 'learning_rate': 3.4633123689727465e-06, 'epoch': 8.44}


 85%|████████▍ | 4460/5270 [29:56<05:51,  2.31it/s]

{'loss': 0.3101, 'grad_norm': 56.651885986328125, 'learning_rate': 3.4213836477987424e-06, 'epoch': 8.45}


 85%|████████▍ | 4470/5270 [30:00<05:06,  2.61it/s]

{'loss': 0.2849, 'grad_norm': 41.0705451965332, 'learning_rate': 3.3794549266247382e-06, 'epoch': 8.47}


 85%|████████▌ | 4480/5270 [30:04<04:56,  2.66it/s]

{'loss': 0.2661, 'grad_norm': 11.682251930236816, 'learning_rate': 3.337526205450734e-06, 'epoch': 8.49}


 85%|████████▌ | 4490/5270 [30:08<05:21,  2.43it/s]

{'loss': 0.2982, 'grad_norm': 36.22441101074219, 'learning_rate': 3.2955974842767295e-06, 'epoch': 8.51}


 85%|████████▌ | 4500/5270 [30:12<05:34,  2.30it/s]

{'loss': 0.2951, 'grad_norm': 25.16767692565918, 'learning_rate': 3.2536687631027258e-06, 'epoch': 8.53}


 86%|████████▌ | 4510/5270 [30:17<05:33,  2.28it/s]

{'loss': 0.1829, 'grad_norm': 46.76775360107422, 'learning_rate': 3.2117400419287216e-06, 'epoch': 8.55}


 86%|████████▌ | 4520/5270 [30:21<06:07,  2.04it/s]

{'loss': 0.4614, 'grad_norm': 9.679858207702637, 'learning_rate': 3.169811320754717e-06, 'epoch': 8.57}


 86%|████████▌ | 4530/5270 [30:26<05:18,  2.32it/s]

{'loss': 0.1521, 'grad_norm': 9.053017616271973, 'learning_rate': 3.1278825995807133e-06, 'epoch': 8.59}


 86%|████████▌ | 4540/5270 [30:30<05:15,  2.32it/s]

{'loss': 0.2678, 'grad_norm': 125.67405700683594, 'learning_rate': 3.0859538784067088e-06, 'epoch': 8.61}


 86%|████████▋ | 4550/5270 [30:35<05:33,  2.16it/s]

{'loss': 0.3815, 'grad_norm': 45.834251403808594, 'learning_rate': 3.0440251572327046e-06, 'epoch': 8.63}


 87%|████████▋ | 4560/5270 [30:39<05:32,  2.13it/s]

{'loss': 0.2412, 'grad_norm': 19.7076416015625, 'learning_rate': 3.002096436058701e-06, 'epoch': 8.64}


 87%|████████▋ | 4570/5270 [30:44<05:29,  2.13it/s]

{'loss': 0.4538, 'grad_norm': 62.12559509277344, 'learning_rate': 2.9601677148846963e-06, 'epoch': 8.66}


 87%|████████▋ | 4580/5270 [30:49<05:22,  2.14it/s]

{'loss': 0.2773, 'grad_norm': 48.64904022216797, 'learning_rate': 2.918238993710692e-06, 'epoch': 8.68}


 87%|████████▋ | 4590/5270 [30:53<05:14,  2.16it/s]

{'loss': 0.2755, 'grad_norm': 56.804813385009766, 'learning_rate': 2.8763102725366876e-06, 'epoch': 8.7}


 87%|████████▋ | 4600/5270 [30:58<04:43,  2.37it/s]

{'loss': 0.2759, 'grad_norm': 69.28440856933594, 'learning_rate': 2.834381551362684e-06, 'epoch': 8.72}


 87%|████████▋ | 4610/5270 [31:01<04:20,  2.53it/s]

{'loss': 0.2295, 'grad_norm': 2.545046329498291, 'learning_rate': 2.7924528301886793e-06, 'epoch': 8.74}


 88%|████████▊ | 4620/5270 [31:05<04:17,  2.52it/s]

{'loss': 0.1861, 'grad_norm': 2.8515543937683105, 'learning_rate': 2.750524109014675e-06, 'epoch': 8.76}


 88%|████████▊ | 4630/5270 [31:09<04:12,  2.53it/s]

{'loss': 0.4039, 'grad_norm': 59.28866195678711, 'learning_rate': 2.7085953878406714e-06, 'epoch': 8.78}


 88%|████████▊ | 4640/5270 [31:13<04:08,  2.54it/s]

{'loss': 0.267, 'grad_norm': 32.58991241455078, 'learning_rate': 2.666666666666667e-06, 'epoch': 8.8}


 88%|████████▊ | 4650/5270 [31:17<04:06,  2.51it/s]

{'loss': 0.2529, 'grad_norm': 28.164365768432617, 'learning_rate': 2.6247379454926627e-06, 'epoch': 8.82}


 88%|████████▊ | 4660/5270 [31:21<04:04,  2.50it/s]

{'loss': 0.2085, 'grad_norm': 22.994142532348633, 'learning_rate': 2.5828092243186585e-06, 'epoch': 8.83}


 89%|████████▊ | 4670/5270 [31:25<03:38,  2.74it/s]

{'loss': 0.2199, 'grad_norm': 25.051433563232422, 'learning_rate': 2.5408805031446544e-06, 'epoch': 8.85}


 89%|████████▉ | 4680/5270 [31:29<03:37,  2.71it/s]

{'loss': 0.5333, 'grad_norm': 103.75084686279297, 'learning_rate': 2.4989517819706502e-06, 'epoch': 8.87}


 89%|████████▉ | 4690/5270 [31:33<03:49,  2.53it/s]

{'loss': 0.2853, 'grad_norm': 36.44264221191406, 'learning_rate': 2.4570230607966457e-06, 'epoch': 8.89}


 89%|████████▉ | 4700/5270 [31:37<03:46,  2.52it/s]

{'loss': 0.2137, 'grad_norm': 28.385759353637695, 'learning_rate': 2.415094339622642e-06, 'epoch': 8.91}


 89%|████████▉ | 4710/5270 [31:41<03:42,  2.51it/s]

{'loss': 0.3233, 'grad_norm': 35.712646484375, 'learning_rate': 2.3731656184486378e-06, 'epoch': 8.93}


 90%|████████▉ | 4720/5270 [31:45<03:35,  2.55it/s]

{'loss': 0.2555, 'grad_norm': 50.277854919433594, 'learning_rate': 2.331236897274633e-06, 'epoch': 8.95}


 90%|████████▉ | 4730/5270 [31:48<03:09,  2.85it/s]

{'loss': 0.2297, 'grad_norm': 21.23414421081543, 'learning_rate': 2.289308176100629e-06, 'epoch': 8.97}


 90%|████████▉ | 4740/5270 [31:52<03:06,  2.84it/s]

{'loss': 0.282, 'grad_norm': 72.8680648803711, 'learning_rate': 2.247379454926625e-06, 'epoch': 8.99}


 90%|█████████ | 4747/5270 [31:54<03:03,  2.85it/s]
 90%|█████████ | 4748/5270 [32:08<38:19,  4.40s/it]

{'eval_loss': 6.352036952972412, 'eval_runtime': 13.5821, 'eval_samples_per_second': 56.251, 'eval_steps_per_second': 18.775, 'epoch': 9.0}


 90%|█████████ | 4750/5270 [32:09<20:15,  2.34s/it]

{'loss': 0.3715, 'grad_norm': 27.465883255004883, 'learning_rate': 2.2054507337526208e-06, 'epoch': 9.0}


 90%|█████████ | 4760/5270 [32:12<03:49,  2.22it/s]

{'loss': 0.2756, 'grad_norm': 35.44778060913086, 'learning_rate': 2.1635220125786166e-06, 'epoch': 9.02}


 91%|█████████ | 4770/5270 [32:16<03:23,  2.45it/s]

{'loss': 0.284, 'grad_norm': 15.605679512023926, 'learning_rate': 2.121593291404612e-06, 'epoch': 9.04}


 91%|█████████ | 4780/5270 [32:20<03:18,  2.46it/s]

{'loss': 0.2199, 'grad_norm': 34.96634292602539, 'learning_rate': 2.0796645702306083e-06, 'epoch': 9.06}


 91%|█████████ | 4790/5270 [32:25<03:14,  2.47it/s]

{'loss': 0.3227, 'grad_norm': 85.12948608398438, 'learning_rate': 2.037735849056604e-06, 'epoch': 9.08}


 91%|█████████ | 4800/5270 [32:28<02:47,  2.81it/s]

{'loss': 0.2596, 'grad_norm': 91.82584381103516, 'learning_rate': 1.9958071278825996e-06, 'epoch': 9.1}


 91%|█████████▏| 4810/5270 [32:32<02:48,  2.74it/s]

{'loss': 0.2151, 'grad_norm': 11.016633033752441, 'learning_rate': 1.9538784067085954e-06, 'epoch': 9.12}


 91%|█████████▏| 4820/5270 [32:36<02:44,  2.73it/s]

{'loss': 0.2189, 'grad_norm': 26.71510887145996, 'learning_rate': 1.9119496855345913e-06, 'epoch': 9.14}


 92%|█████████▏| 4830/5270 [32:39<02:41,  2.73it/s]

{'loss': 0.1689, 'grad_norm': 30.177997589111328, 'learning_rate': 1.8700209643605871e-06, 'epoch': 9.16}


 92%|█████████▏| 4840/5270 [32:43<02:38,  2.71it/s]

{'loss': 0.2274, 'grad_norm': 54.86988830566406, 'learning_rate': 1.828092243186583e-06, 'epoch': 9.18}


 92%|█████████▏| 4850/5270 [32:47<02:34,  2.72it/s]

{'loss': 0.1659, 'grad_norm': 23.190555572509766, 'learning_rate': 1.7861635220125786e-06, 'epoch': 9.19}


 92%|█████████▏| 4860/5270 [32:50<02:30,  2.72it/s]

{'loss': 0.2685, 'grad_norm': 7.639720916748047, 'learning_rate': 1.7442348008385745e-06, 'epoch': 9.21}


 92%|█████████▏| 4870/5270 [32:54<02:28,  2.70it/s]

{'loss': 0.2727, 'grad_norm': 56.198097229003906, 'learning_rate': 1.7023060796645705e-06, 'epoch': 9.23}


 93%|█████████▎| 4880/5270 [32:58<02:23,  2.71it/s]

{'loss': 0.2664, 'grad_norm': 25.699127197265625, 'learning_rate': 1.6603773584905662e-06, 'epoch': 9.25}


 93%|█████████▎| 4890/5270 [33:01<02:20,  2.71it/s]

{'loss': 0.2352, 'grad_norm': 51.25837707519531, 'learning_rate': 1.618448637316562e-06, 'epoch': 9.27}


 93%|█████████▎| 4900/5270 [33:05<02:16,  2.71it/s]

{'loss': 0.2419, 'grad_norm': 57.93054962158203, 'learning_rate': 1.5765199161425579e-06, 'epoch': 9.29}


 93%|█████████▎| 4910/5270 [33:09<02:12,  2.71it/s]

{'loss': 0.1094, 'grad_norm': 15.574138641357422, 'learning_rate': 1.5345911949685535e-06, 'epoch': 9.31}


 93%|█████████▎| 4920/5270 [33:12<02:09,  2.70it/s]

{'loss': 0.2641, 'grad_norm': 43.29298782348633, 'learning_rate': 1.4926624737945494e-06, 'epoch': 9.33}


 94%|█████████▎| 4930/5270 [33:16<02:05,  2.70it/s]

{'loss': 0.1796, 'grad_norm': 125.38035583496094, 'learning_rate': 1.450733752620545e-06, 'epoch': 9.35}


 94%|█████████▎| 4940/5270 [33:20<02:02,  2.69it/s]

{'loss': 0.2869, 'grad_norm': 34.38068389892578, 'learning_rate': 1.408805031446541e-06, 'epoch': 9.36}


 94%|█████████▍| 4950/5270 [33:24<01:57,  2.72it/s]

{'loss': 0.2358, 'grad_norm': 75.25316619873047, 'learning_rate': 1.366876310272537e-06, 'epoch': 9.38}


 94%|█████████▍| 4960/5270 [33:27<01:54,  2.71it/s]

{'loss': 0.1636, 'grad_norm': 25.806751251220703, 'learning_rate': 1.3249475890985325e-06, 'epoch': 9.4}


 94%|█████████▍| 4970/5270 [33:31<01:50,  2.71it/s]

{'loss': 0.2215, 'grad_norm': 13.51694107055664, 'learning_rate': 1.2830188679245284e-06, 'epoch': 9.42}


 94%|█████████▍| 4980/5270 [33:35<01:46,  2.72it/s]

{'loss': 0.1778, 'grad_norm': 11.464359283447266, 'learning_rate': 1.2410901467505242e-06, 'epoch': 9.44}


 95%|█████████▍| 4990/5270 [33:38<01:43,  2.71it/s]

{'loss': 0.1762, 'grad_norm': 3.9843506813049316, 'learning_rate': 1.19916142557652e-06, 'epoch': 9.46}


 95%|█████████▍| 5000/5270 [33:42<01:40,  2.69it/s]

{'loss': 0.2055, 'grad_norm': 72.66127014160156, 'learning_rate': 1.157232704402516e-06, 'epoch': 9.48}


 95%|█████████▌| 5010/5270 [33:49<01:53,  2.30it/s]

{'loss': 0.2264, 'grad_norm': 54.32352828979492, 'learning_rate': 1.1153039832285116e-06, 'epoch': 9.5}


 95%|█████████▌| 5020/5270 [33:53<01:41,  2.47it/s]

{'loss': 0.254, 'grad_norm': 9.762683868408203, 'learning_rate': 1.0733752620545074e-06, 'epoch': 9.52}


 95%|█████████▌| 5030/5270 [33:57<01:33,  2.58it/s]

{'loss': 0.3829, 'grad_norm': 76.46906280517578, 'learning_rate': 1.0314465408805033e-06, 'epoch': 9.54}


 96%|█████████▌| 5040/5270 [34:00<01:27,  2.62it/s]

{'loss': 0.2695, 'grad_norm': 8.439477920532227, 'learning_rate': 9.895178197064991e-07, 'epoch': 9.55}


 96%|█████████▌| 5050/5270 [34:04<01:23,  2.62it/s]

{'loss': 0.1655, 'grad_norm': 18.415735244750977, 'learning_rate': 9.475890985324948e-07, 'epoch': 9.57}


 96%|█████████▌| 5060/5270 [34:08<01:19,  2.63it/s]

{'loss': 0.2446, 'grad_norm': 25.47089385986328, 'learning_rate': 9.056603773584906e-07, 'epoch': 9.59}


 96%|█████████▌| 5070/5270 [34:12<01:16,  2.62it/s]

{'loss': 0.1069, 'grad_norm': 28.8560791015625, 'learning_rate': 8.637316561844865e-07, 'epoch': 9.61}


 96%|█████████▋| 5080/5270 [34:16<01:12,  2.61it/s]

{'loss': 0.2323, 'grad_norm': 7.518230438232422, 'learning_rate': 8.218029350104822e-07, 'epoch': 9.63}


 97%|█████████▋| 5090/5270 [34:20<01:08,  2.62it/s]

{'loss': 0.2025, 'grad_norm': 35.80931854248047, 'learning_rate': 7.798742138364781e-07, 'epoch': 9.65}


 97%|█████████▋| 5100/5270 [34:23<01:04,  2.62it/s]

{'loss': 0.1135, 'grad_norm': 2.9920785427093506, 'learning_rate': 7.379454926624739e-07, 'epoch': 9.67}


 97%|█████████▋| 5110/5270 [34:27<01:01,  2.62it/s]

{'loss': 0.211, 'grad_norm': 17.56874656677246, 'learning_rate': 6.960167714884697e-07, 'epoch': 9.69}


 97%|█████████▋| 5120/5270 [34:31<00:57,  2.62it/s]

{'loss': 0.2972, 'grad_norm': 30.60441017150879, 'learning_rate': 6.540880503144655e-07, 'epoch': 9.71}


 97%|█████████▋| 5130/5270 [34:35<00:53,  2.62it/s]

{'loss': 0.225, 'grad_norm': 45.142189025878906, 'learning_rate': 6.121593291404612e-07, 'epoch': 9.73}


 98%|█████████▊| 5140/5270 [34:39<00:49,  2.61it/s]

{'loss': 0.1466, 'grad_norm': 19.80261993408203, 'learning_rate': 5.702306079664571e-07, 'epoch': 9.74}


 98%|█████████▊| 5150/5270 [34:42<00:46,  2.61it/s]

{'loss': 0.2806, 'grad_norm': 50.337890625, 'learning_rate': 5.283018867924528e-07, 'epoch': 9.76}


 98%|█████████▊| 5160/5270 [34:46<00:42,  2.60it/s]

{'loss': 0.2562, 'grad_norm': 44.69153594970703, 'learning_rate': 4.863731656184487e-07, 'epoch': 9.78}


 98%|█████████▊| 5170/5270 [34:50<00:34,  2.91it/s]

{'loss': 0.1875, 'grad_norm': 11.156797409057617, 'learning_rate': 4.444444444444445e-07, 'epoch': 9.8}


 98%|█████████▊| 5180/5270 [34:54<00:33,  2.65it/s]

{'loss': 0.4143, 'grad_norm': 12.241249084472656, 'learning_rate': 4.025157232704403e-07, 'epoch': 9.82}


 98%|█████████▊| 5190/5270 [34:57<00:30,  2.63it/s]

{'loss': 0.1653, 'grad_norm': 22.88559913635254, 'learning_rate': 3.605870020964361e-07, 'epoch': 9.84}


 99%|█████████▊| 5200/5270 [35:01<00:26,  2.63it/s]

{'loss': 0.2958, 'grad_norm': 53.45882797241211, 'learning_rate': 3.186582809224319e-07, 'epoch': 9.86}


 99%|█████████▉| 5210/5270 [35:05<00:22,  2.63it/s]

{'loss': 0.1901, 'grad_norm': 7.092888832092285, 'learning_rate': 2.7672955974842773e-07, 'epoch': 9.88}


 99%|█████████▉| 5220/5270 [35:09<00:19,  2.62it/s]

{'loss': 0.1331, 'grad_norm': 10.012152671813965, 'learning_rate': 2.348008385744235e-07, 'epoch': 9.9}


 99%|█████████▉| 5230/5270 [35:13<00:15,  2.62it/s]

{'loss': 0.1723, 'grad_norm': 9.020524978637695, 'learning_rate': 1.928721174004193e-07, 'epoch': 9.91}


 99%|█████████▉| 5240/5270 [35:16<00:11,  2.62it/s]

{'loss': 0.119, 'grad_norm': 9.776790618896484, 'learning_rate': 1.5094339622641512e-07, 'epoch': 9.93}


100%|█████████▉| 5250/5270 [35:20<00:07,  2.64it/s]

{'loss': 0.165, 'grad_norm': 39.833309173583984, 'learning_rate': 1.090146750524109e-07, 'epoch': 9.95}


100%|█████████▉| 5260/5270 [35:24<00:03,  2.63it/s]

{'loss': 0.2773, 'grad_norm': 2.685666561126709, 'learning_rate': 6.708595387840671e-08, 'epoch': 9.97}


100%|██████████| 5270/5270 [35:28<00:00,  2.64it/s]

{'loss': 0.1957, 'grad_norm': 0.601735532283783, 'learning_rate': 2.5157232704402518e-08, 'epoch': 9.99}



100%|██████████| 5270/5270 [35:41<00:00,  2.46it/s]

{'eval_loss': 6.327608108520508, 'eval_runtime': 13.205, 'eval_samples_per_second': 57.857, 'eval_steps_per_second': 19.311, 'epoch': 9.99}
{'train_runtime': 2141.5432, 'train_samples_per_second': 14.77, 'train_steps_per_second': 2.461, 'train_loss': 1.2860708624633486, 'epoch': 9.99}





TrainOutput(global_step=5270, training_loss=1.2860708624633486, metrics={'train_runtime': 2141.5432, 'train_samples_per_second': 14.77, 'train_steps_per_second': 2.461, 'total_flos': 8257500106371072.0, 'train_loss': 1.2860708624633486, 'epoch': 9.990521327014218})

In [17]:
results = trainer.evaluate()

print(results)

model.save_pretrained("./kazakh_qa_model")
tokenizer.save_pretrained("./kazakh_qa_model")

  0%|          | 0/255 [00:00<?, ?it/s]

100%|██████████| 255/255 [00:14<00:00, 17.61it/s]


{'eval_loss': 6.327608108520508, 'eval_runtime': 14.4973, 'eval_samples_per_second': 52.7, 'eval_steps_per_second': 17.59, 'epoch': 9.990521327014218}


('./kazakh_qa_model\\tokenizer_config.json',
 './kazakh_qa_model\\special_tokens_map.json',
 './kazakh_qa_model\\vocab.txt',
 './kazakh_qa_model\\added_tokens.json',
 './kazakh_qa_model\\tokenizer.json')

In [24]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

model_path = "./kazakh_qa_model(5e)"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

question = "Шоқан Уәлиханов қандай аймақтарда зерттеу жұмыстарын жүргізді?"
context = (
    "Шоқан Уәлиханов (1835–1865) — қазақтың ғалымы, тарихшысы, этнографы, географы, саяхатшысы, әрі саяси қайраткері. "
    "Ол қазақ халқының тарихын, мәдениетін және тұрмысын зерттеуде үлкен үлес қосты. Шоқан Уәлиханов Орыс Географиялық Қоғамының мүшесі болды "
    "және өзінің еңбектерімен танымал болды. Ол Жоңғария, Қашқария, Тянь-Шань аймақтарында зерттеу жұмыстарын жүргізді."
)

inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)

inputs = {key: val.to(device) for key, val in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)

start_scores = outputs.start_logits
end_scores = outputs.end_logits

start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)

if end_index < start_index:
    end_index = start_index

all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
answer_tokens = all_tokens[start_index:end_index+1]

answer_tokens = [token for token in answer_tokens if token not in tokenizer.all_special_tokens]

answer = tokenizer.convert_tokens_to_string(answer_tokens)

print(f"Question: {question}")
print(f"Context: {context}")
print(f"Start Index: {start_index}, End Index: {end_index}")
print(f"Answer Tokens: {answer_tokens}")
print(f"Answer: {answer}")


Question: Шоқан Уәлиханов қандай аймақтарда зерттеу жұмыстарын жүргізді?
Context: Шоқан Уәлиханов (1835–1865) — қазақтың ғалымы, тарихшысы, этнографы, географы, саяхатшысы, әрі саяси қайраткері. Ол қазақ халқының тарихын, мәдениетін және тұрмысын зерттеуде үлкен үлес қосты. Шоқан Уәлиханов Орыс Географиялық Қоғамының мүшесі болды және өзінің еңбектерімен танымал болды. Ол Жоңғария, Қашқария, Тянь-Шань аймақтарында зерттеу жұмыстарын жүргізді.
Start Index: 126, End Index: 141
Answer Tokens: ['Ж', '##о', '##ң', '##ға', '##рия', ',', 'Қ', '##аш', '##қа', '##рия', ',', 'Тя', '##нь', '-', 'Ш', '##ань']
Answer: Жоңғария, Қашқария, Тянь - Шань


In [19]:
dataset['validation'][713]

{'id': 'lit2252lit#31945_4_1',
 'title': 'Шалкиіз Тіленшіұлы',
 'context': 'Шалкиіз жырау (1465-1560) - ақын, жырау, батыр. Шалкиіз - туындылары орыс тіліне аударылған ақын-жыраулардың бірі. Оның шығармашылығын сол дәуірдегі орыс оқымыстылары мен зерттеушілері жоғары бағалаған. Темір биге айтқандары әртүрлі жинақтар мен басылымдарда бірнеше мәрте басылды. Жыраудың қазақ көне әдебиетіндегі орны ерекше. Шалкиіз жырау ғүмырының көп бөлігін Ноғай ордасының билеушісі Темір бидің қол астында өткізеді. Турашылдығымен, әділдігімен, даналығымен ел алдында беделді болады. Өмірінің соңғы жылдарын Хақназар хан тұсындағы Қазақ мемлекетінде өткізеді.',
 'question': 'Шалкиіз жырау қай биге толғау шығарған?',
 'answers': {'text': ['Темір биге'], 'answer_start': [203]}}