### Imports

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model


  from .autonotebook import tqdm as notebook_tqdm


### Load the Dataset

In [2]:
# Load Alpaca dataset from Hugging Face or local JSON
dataset = load_dataset("tatsu-lab/alpaca")

# Preview the dataset structure
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text'],
        num_rows: 52002
    })
})


### Format, Scale, and Tokenize the Dataset

In [3]:

# Combine instruction, input, and output into a single text field
def format_example(example):
    instruction = example['instruction']
    input_text = example['input']
    output = example['output']
    if input_text:
        return f"Instruction: {instruction}\nInput: {input_text}\nOutput: {output}"
    else:
        return f"Instruction: {instruction}\nOutput: {output}"

formatted_dataset = dataset.map(lambda x: {"text": format_example(x)})

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Set the padding token to the <eos_token> (end of sentence token)
tokenizer.pad_token = tokenizer.eos_token

# If you want to use a new token for padding
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize dataset
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()  # Copy input_ids to labels
    return tokenized


tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)


In [4]:
print(tokenized_dataset)
print(tokenized_dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 52002
    })
})
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Instruction: Give three tips for staying healthy.\nOutput: 1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'input_ids': [25464, 42, 13843, 1296, 5608, 327, 9286, 2458, 30, 198, 17597, 42, 216, 33, 30, 36693, 253, 8609, 2714, 284, 919, 2090, 288, 1453, 7568, 282, 5574, 284, 5136, 30, 3717, 34, 30, 15382, 5578, 288, 1446, 469, 1248, 3212, 284, 1837, 30, 3717, 35, 30, 5399,

In [5]:
# Scale datasets for testing
def scale_dataset(dataset, max_samples=1000):
    """Scale down a dataset to a maximum number of samples"""
    if len(dataset) > max_samples:
        scaled_indices = list(range(max_samples))
        return dataset.select(scaled_indices)
    return dataset

# Set your desired size
MAX_SAMPLES = 10000  # Adjust this number as needed

# Print original sizes
print(f"Original sizes - Train: {len(tokenized_dataset['train'])}")

# Scale the train dataset
tokenized_train = scale_dataset(tokenized_dataset["train"], MAX_SAMPLES)

# Optionally scale the eval dataset (if present or needed)
tokenized_eval = scale_dataset(
    tokenized_dataset["train"],  # Replace with tokenized_dataset["eval"] if you have a separate eval set
    max(50, int(MAX_SAMPLES * 0.1))  # Keep eval set ~10% of train, minimum 50 samples
)

# Print scaled sizes
print(f"Scaled sizes - Train: {len(tokenized_train)}, Eval: {len(tokenized_eval)}")


Original sizes - Train: 52002
Scaled sizes - Train: 10000, Eval: 1000


### Fine tuning setup

In [6]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from transformers import EarlyStoppingCallback

# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")

# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Rank for low-rank adaptation
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # Target query and value projections
    task_type="CAUSAL_LM"  # This is a causal language model
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()


trainable params: 460,800 || all params: 134,975,808 || trainable%: 0.3414


In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # Evaluate periodically
    eval_steps=2000,              # Evaluate every 500 steps
    save_steps=2000,              # Save model every 500 steps
    logging_steps=100,          # Log progress every 1000 steps
    load_best_model_at_end=True,  # Load the best model after training
    metric_for_best_model="eval_loss",  # Use evaluation loss as the metric
    greater_is_better=False,     # Lower eval_loss is better
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    num_train_epochs=10,
    save_total_limit=2,
    fp16=False,
)



### Train

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_eval,  # Replace with validation set if available
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()


  0%|          | 101/25000 [00:14<59:39,  6.96it/s] 

{'loss': 1.3415, 'grad_norm': 0.14469361305236816, 'learning_rate': 0.000498, 'epoch': 0.04}


  1%|          | 201/25000 [00:28<57:34,  7.18it/s]  

{'loss': 0.3081, 'grad_norm': 0.12462545186281204, 'learning_rate': 0.000496, 'epoch': 0.08}


  1%|          | 301/25000 [00:42<57:40,  7.14it/s]  

{'loss': 0.2723, 'grad_norm': 0.13755421340465546, 'learning_rate': 0.000494, 'epoch': 0.12}


  2%|▏         | 401/25000 [00:56<57:39,  7.11it/s]

{'loss': 0.2834, 'grad_norm': 0.12591338157653809, 'learning_rate': 0.000492, 'epoch': 0.16}


  2%|▏         | 501/25000 [01:10<57:37,  7.08it/s]

{'loss': 0.2907, 'grad_norm': 0.13759462535381317, 'learning_rate': 0.00049, 'epoch': 0.2}


  2%|▏         | 601/25000 [01:24<58:48,  6.91it/s]

{'loss': 0.2854, 'grad_norm': 0.13613665103912354, 'learning_rate': 0.000488, 'epoch': 0.24}


  3%|▎         | 701/25000 [01:38<57:04,  7.10it/s]

{'loss': 0.2577, 'grad_norm': 0.12962140142917633, 'learning_rate': 0.000486, 'epoch': 0.28}


  3%|▎         | 801/25000 [01:52<56:41,  7.11it/s]

{'loss': 0.2958, 'grad_norm': 0.16412155330181122, 'learning_rate': 0.000484, 'epoch': 0.32}


  4%|▎         | 901/25000 [02:06<56:28,  7.11it/s]

{'loss': 0.2675, 'grad_norm': 0.16023916006088257, 'learning_rate': 0.000482, 'epoch': 0.36}


  4%|▍         | 1001/25000 [02:20<56:20,  7.10it/s]

{'loss': 0.2663, 'grad_norm': 0.11346624791622162, 'learning_rate': 0.00048, 'epoch': 0.4}


  4%|▍         | 1101/25000 [02:34<55:48,  7.14it/s]

{'loss': 0.2607, 'grad_norm': 0.13346008956432343, 'learning_rate': 0.00047799999999999996, 'epoch': 0.44}


  5%|▍         | 1201/25000 [02:48<56:01,  7.08it/s]

{'loss': 0.2781, 'grad_norm': 0.13212227821350098, 'learning_rate': 0.00047599999999999997, 'epoch': 0.48}


  5%|▌         | 1301/25000 [03:02<55:46,  7.08it/s]

{'loss': 0.2697, 'grad_norm': 0.14401455223560333, 'learning_rate': 0.000474, 'epoch': 0.52}


  6%|▌         | 1401/25000 [03:16<55:20,  7.11it/s]

{'loss': 0.2794, 'grad_norm': 0.09840647131204605, 'learning_rate': 0.000472, 'epoch': 0.56}


  6%|▌         | 1501/25000 [03:30<55:16,  7.09it/s]

{'loss': 0.269, 'grad_norm': 0.11762706190347672, 'learning_rate': 0.00047, 'epoch': 0.6}


  6%|▋         | 1601/25000 [03:44<55:13,  7.06it/s]

{'loss': 0.2753, 'grad_norm': 0.11360827833414078, 'learning_rate': 0.00046800000000000005, 'epoch': 0.64}


  7%|▋         | 1701/25000 [03:58<54:54,  7.07it/s]

{'loss': 0.2754, 'grad_norm': 0.15413062274456024, 'learning_rate': 0.00046600000000000005, 'epoch': 0.68}


  7%|▋         | 1801/25000 [04:12<54:22,  7.11it/s]

{'loss': 0.288, 'grad_norm': 0.10172335803508759, 'learning_rate': 0.00046400000000000006, 'epoch': 0.72}


  8%|▊         | 1901/25000 [04:27<54:20,  7.08it/s]

{'loss': 0.28, 'grad_norm': 0.13311727344989777, 'learning_rate': 0.000462, 'epoch': 0.76}


  8%|▊         | 2000/25000 [04:40<53:59,  7.10it/s]

{'loss': 0.2562, 'grad_norm': 0.0968274399638176, 'learning_rate': 0.00046, 'epoch': 0.8}


                                                    
  8%|▊         | 2000/25000 [04:52<53:59,  7.10it/s]

{'eval_loss': 0.2575596272945404, 'eval_runtime': 11.915, 'eval_samples_per_second': 83.928, 'eval_steps_per_second': 10.491, 'epoch': 0.8}


  8%|▊         | 2101/25000 [05:07<53:48,  7.09it/s]   

{'loss': 0.2859, 'grad_norm': 0.09401190280914307, 'learning_rate': 0.000458, 'epoch': 0.84}


  9%|▉         | 2201/25000 [05:21<54:11,  7.01it/s]

{'loss': 0.2664, 'grad_norm': 0.1365087479352951, 'learning_rate': 0.000456, 'epoch': 0.88}


  9%|▉         | 2301/25000 [05:35<53:13,  7.11it/s]

{'loss': 0.2635, 'grad_norm': 0.10041838884353638, 'learning_rate': 0.00045400000000000003, 'epoch': 0.92}


 10%|▉         | 2401/25000 [05:49<52:57,  7.11it/s]

{'loss': 0.2602, 'grad_norm': 0.13723599910736084, 'learning_rate': 0.00045200000000000004, 'epoch': 0.96}


 10%|█         | 2501/25000 [06:03<52:50,  7.10it/s]

{'loss': 0.2814, 'grad_norm': 0.09916690737009048, 'learning_rate': 0.00045000000000000004, 'epoch': 1.0}


 10%|█         | 2601/25000 [06:17<52:54,  7.06it/s]

{'loss': 0.2528, 'grad_norm': 0.10376251488924026, 'learning_rate': 0.000448, 'epoch': 1.04}


 11%|█         | 2701/25000 [06:31<52:19,  7.10it/s]

{'loss': 0.2518, 'grad_norm': 0.13343612849712372, 'learning_rate': 0.000446, 'epoch': 1.08}


 11%|█         | 2801/25000 [06:45<52:02,  7.11it/s]

{'loss': 0.2674, 'grad_norm': 0.10880777984857559, 'learning_rate': 0.000444, 'epoch': 1.12}


 12%|█▏        | 2901/25000 [06:59<51:49,  7.11it/s]

{'loss': 0.2727, 'grad_norm': 0.17273685336112976, 'learning_rate': 0.000442, 'epoch': 1.16}


 12%|█▏        | 3001/25000 [07:13<51:33,  7.11it/s]

{'loss': 0.2626, 'grad_norm': 0.08675751835107803, 'learning_rate': 0.00044, 'epoch': 1.2}


 12%|█▏        | 3101/25000 [07:27<51:16,  7.12it/s]

{'loss': 0.2484, 'grad_norm': 0.07538719475269318, 'learning_rate': 0.000438, 'epoch': 1.24}


 13%|█▎        | 3201/25000 [07:41<50:55,  7.13it/s]

{'loss': 0.2677, 'grad_norm': 0.10850366204977036, 'learning_rate': 0.000436, 'epoch': 1.28}


 13%|█▎        | 3301/25000 [07:55<50:55,  7.10it/s]

{'loss': 0.2716, 'grad_norm': 0.10468541830778122, 'learning_rate': 0.00043400000000000003, 'epoch': 1.32}


 14%|█▎        | 3401/25000 [08:09<50:18,  7.16it/s]

{'loss': 0.2728, 'grad_norm': 0.09729254990816116, 'learning_rate': 0.000432, 'epoch': 1.36}


 14%|█▍        | 3501/25000 [08:23<50:21,  7.11it/s]

{'loss': 0.2598, 'grad_norm': 0.11604098975658417, 'learning_rate': 0.00043, 'epoch': 1.4}


 14%|█▍        | 3601/25000 [08:37<49:59,  7.13it/s]

{'loss': 0.2698, 'grad_norm': 0.12100200355052948, 'learning_rate': 0.000428, 'epoch': 1.44}


 15%|█▍        | 3701/25000 [08:51<49:46,  7.13it/s]

{'loss': 0.2601, 'grad_norm': 0.15736685693264008, 'learning_rate': 0.000426, 'epoch': 1.48}


 15%|█▌        | 3801/25000 [09:05<49:27,  7.14it/s]

{'loss': 0.2724, 'grad_norm': 0.13542449474334717, 'learning_rate': 0.000424, 'epoch': 1.52}


 16%|█▌        | 3901/25000 [09:19<49:21,  7.12it/s]

{'loss': 0.2844, 'grad_norm': 0.08117757737636566, 'learning_rate': 0.000422, 'epoch': 1.56}


 16%|█▌        | 4000/25000 [09:33<48:43,  7.18it/s]

{'loss': 0.2527, 'grad_norm': 0.08366314321756363, 'learning_rate': 0.00042, 'epoch': 1.6}


                                                    
 16%|█▌        | 4000/25000 [09:45<48:43,  7.18it/s]

{'eval_loss': 0.2506283223628998, 'eval_runtime': 11.9029, 'eval_samples_per_second': 84.013, 'eval_steps_per_second': 10.502, 'epoch': 1.6}


 16%|█▋        | 4101/25000 [09:59<49:09,  7.08it/s]   

{'loss': 0.2754, 'grad_norm': 0.1576196700334549, 'learning_rate': 0.00041799999999999997, 'epoch': 1.64}


 17%|█▋        | 4201/25000 [10:13<48:46,  7.11it/s]

{'loss': 0.2588, 'grad_norm': 0.08843815326690674, 'learning_rate': 0.000416, 'epoch': 1.68}


 17%|█▋        | 4301/25000 [10:27<48:35,  7.10it/s]

{'loss': 0.2727, 'grad_norm': 0.10859877616167068, 'learning_rate': 0.000414, 'epoch': 1.72}


 18%|█▊        | 4401/25000 [10:41<48:14,  7.12it/s]

{'loss': 0.2621, 'grad_norm': 0.12293968349695206, 'learning_rate': 0.000412, 'epoch': 1.76}


 18%|█▊        | 4501/25000 [10:55<48:04,  7.11it/s]

{'loss': 0.2646, 'grad_norm': 0.16898401081562042, 'learning_rate': 0.00041, 'epoch': 1.8}


 18%|█▊        | 4601/25000 [11:09<48:00,  7.08it/s]

{'loss': 0.27, 'grad_norm': 0.1256466954946518, 'learning_rate': 0.000408, 'epoch': 1.84}


 19%|█▉        | 4701/25000 [11:23<47:31,  7.12it/s]

{'loss': 0.2798, 'grad_norm': 0.09311822801828384, 'learning_rate': 0.00040600000000000006, 'epoch': 1.88}


 19%|█▉        | 4801/25000 [11:37<47:17,  7.12it/s]

{'loss': 0.2572, 'grad_norm': 0.12005351483821869, 'learning_rate': 0.000404, 'epoch': 1.92}


 20%|█▉        | 4901/25000 [11:51<47:13,  7.09it/s]

{'loss': 0.2671, 'grad_norm': 0.28704744577407837, 'learning_rate': 0.000402, 'epoch': 1.96}


 20%|██        | 5001/25000 [12:05<47:10,  7.07it/s]

{'loss': 0.265, 'grad_norm': 0.09150093048810959, 'learning_rate': 0.0004, 'epoch': 2.0}


 20%|██        | 5101/25000 [12:19<46:33,  7.12it/s]

{'loss': 0.2684, 'grad_norm': 0.14676757156848907, 'learning_rate': 0.000398, 'epoch': 2.04}


 21%|██        | 5201/25000 [12:33<46:12,  7.14it/s]

{'loss': 0.2519, 'grad_norm': 0.14036999642848969, 'learning_rate': 0.00039600000000000003, 'epoch': 2.08}


 21%|██        | 5301/25000 [12:47<46:07,  7.12it/s]

{'loss': 0.2496, 'grad_norm': 0.09744295477867126, 'learning_rate': 0.00039400000000000004, 'epoch': 2.12}


 22%|██▏       | 5401/25000 [13:01<45:57,  7.11it/s]

{'loss': 0.2585, 'grad_norm': 0.10821956396102905, 'learning_rate': 0.00039200000000000004, 'epoch': 2.16}


 22%|██▏       | 5501/25000 [13:15<45:50,  7.09it/s]

{'loss': 0.2568, 'grad_norm': 0.12008672207593918, 'learning_rate': 0.00039000000000000005, 'epoch': 2.2}


 22%|██▏       | 5601/25000 [13:30<45:38,  7.08it/s]

{'loss': 0.2668, 'grad_norm': 0.1307804435491562, 'learning_rate': 0.000388, 'epoch': 2.24}


 23%|██▎       | 5701/25000 [13:44<45:16,  7.10it/s]

{'loss': 0.2556, 'grad_norm': 0.14454469084739685, 'learning_rate': 0.000386, 'epoch': 2.28}


 23%|██▎       | 5801/25000 [13:57<44:59,  7.11it/s]

{'loss': 0.2641, 'grad_norm': 0.17274780571460724, 'learning_rate': 0.000384, 'epoch': 2.32}


 24%|██▎       | 5901/25000 [14:11<44:39,  7.13it/s]

{'loss': 0.2684, 'grad_norm': 0.11441650986671448, 'learning_rate': 0.000382, 'epoch': 2.36}


 24%|██▍       | 6000/25000 [14:25<45:07,  7.02it/s]

{'loss': 0.2547, 'grad_norm': 0.13834808766841888, 'learning_rate': 0.00038, 'epoch': 2.4}


                                                    
 24%|██▍       | 6000/25000 [14:37<45:07,  7.02it/s]

{'eval_loss': 0.24578459560871124, 'eval_runtime': 11.891, 'eval_samples_per_second': 84.097, 'eval_steps_per_second': 10.512, 'epoch': 2.4}


 24%|██▍       | 6101/25000 [14:52<44:27,  7.08it/s]   

{'loss': 0.2757, 'grad_norm': 0.11816326528787613, 'learning_rate': 0.000378, 'epoch': 2.44}


 25%|██▍       | 6201/25000 [15:06<44:05,  7.11it/s]

{'loss': 0.2358, 'grad_norm': 0.13523408770561218, 'learning_rate': 0.00037600000000000003, 'epoch': 2.48}


 25%|██▌       | 6301/25000 [15:20<43:45,  7.12it/s]

{'loss': 0.2559, 'grad_norm': 0.1471514254808426, 'learning_rate': 0.000374, 'epoch': 2.52}


 26%|██▌       | 6401/25000 [15:34<43:30,  7.12it/s]

{'loss': 0.2547, 'grad_norm': 0.16584554314613342, 'learning_rate': 0.000372, 'epoch': 2.56}


 26%|██▌       | 6501/25000 [15:48<43:40,  7.06it/s]

{'loss': 0.2619, 'grad_norm': 0.14448687434196472, 'learning_rate': 0.00037, 'epoch': 2.6}


 26%|██▋       | 6601/25000 [16:02<43:02,  7.12it/s]

{'loss': 0.2512, 'grad_norm': 0.13689646124839783, 'learning_rate': 0.000368, 'epoch': 2.64}


 27%|██▋       | 6701/25000 [16:16<42:53,  7.11it/s]

{'loss': 0.2658, 'grad_norm': 0.13217468559741974, 'learning_rate': 0.000366, 'epoch': 2.68}


 27%|██▋       | 6801/25000 [16:30<43:05,  7.04it/s]

{'loss': 0.2597, 'grad_norm': 0.12785258889198303, 'learning_rate': 0.000364, 'epoch': 2.72}


 28%|██▊       | 6901/25000 [16:44<42:43,  7.06it/s]

{'loss': 0.261, 'grad_norm': 0.11404814571142197, 'learning_rate': 0.000362, 'epoch': 2.76}


 28%|██▊       | 7001/25000 [16:58<42:13,  7.11it/s]

{'loss': 0.2552, 'grad_norm': 0.14933638274669647, 'learning_rate': 0.00035999999999999997, 'epoch': 2.8}


 28%|██▊       | 7101/25000 [17:12<41:46,  7.14it/s]

{'loss': 0.2694, 'grad_norm': 0.13640643656253815, 'learning_rate': 0.000358, 'epoch': 2.84}


 29%|██▉       | 7201/25000 [17:26<41:35,  7.13it/s]

{'loss': 0.2574, 'grad_norm': 0.155978262424469, 'learning_rate': 0.000356, 'epoch': 2.88}


 29%|██▉       | 7301/25000 [17:40<41:29,  7.11it/s]

{'loss': 0.2562, 'grad_norm': 0.1513891965150833, 'learning_rate': 0.000354, 'epoch': 2.92}


 30%|██▉       | 7401/25000 [17:54<41:28,  7.07it/s]

{'loss': 0.2664, 'grad_norm': 0.10718756914138794, 'learning_rate': 0.000352, 'epoch': 2.96}


 30%|███       | 7501/25000 [18:08<41:04,  7.10it/s]

{'loss': 0.2545, 'grad_norm': 0.13504959642887115, 'learning_rate': 0.00035, 'epoch': 3.0}


 30%|███       | 7601/25000 [18:22<40:36,  7.14it/s]

{'loss': 0.2522, 'grad_norm': 0.12322588264942169, 'learning_rate': 0.000348, 'epoch': 3.04}


 31%|███       | 7701/25000 [18:36<40:28,  7.12it/s]

{'loss': 0.2526, 'grad_norm': 0.15676802396774292, 'learning_rate': 0.000346, 'epoch': 3.08}


 31%|███       | 7801/25000 [18:50<40:15,  7.12it/s]

{'loss': 0.2522, 'grad_norm': 0.08548246324062347, 'learning_rate': 0.00034399999999999996, 'epoch': 3.12}


 32%|███▏      | 7901/25000 [19:04<40:03,  7.11it/s]

{'loss': 0.2411, 'grad_norm': 0.10010451078414917, 'learning_rate': 0.000342, 'epoch': 3.16}


 32%|███▏      | 8000/25000 [19:18<39:39,  7.15it/s]

{'loss': 0.2575, 'grad_norm': 0.09038758277893066, 'learning_rate': 0.00034, 'epoch': 3.2}


                                                    
 32%|███▏      | 8000/25000 [19:30<39:39,  7.15it/s]

{'eval_loss': 0.24124911427497864, 'eval_runtime': 12.7341, 'eval_samples_per_second': 78.53, 'eval_steps_per_second': 9.816, 'epoch': 3.2}


 32%|███▏      | 8101/25000 [19:45<39:35,  7.11it/s]   

{'loss': 0.2572, 'grad_norm': 0.10264041274785995, 'learning_rate': 0.00033800000000000003, 'epoch': 3.24}


 33%|███▎      | 8201/25000 [19:59<39:14,  7.13it/s]

{'loss': 0.2597, 'grad_norm': 0.10624061524868011, 'learning_rate': 0.00033600000000000004, 'epoch': 3.28}


 33%|███▎      | 8301/25000 [20:13<39:03,  7.12it/s]

{'loss': 0.2349, 'grad_norm': 0.11543982475996017, 'learning_rate': 0.00033400000000000004, 'epoch': 3.32}


 34%|███▎      | 8401/25000 [20:27<38:41,  7.15it/s]

{'loss': 0.2476, 'grad_norm': 0.09206828474998474, 'learning_rate': 0.00033200000000000005, 'epoch': 3.36}


 34%|███▍      | 8501/25000 [20:41<38:42,  7.10it/s]

{'loss': 0.2564, 'grad_norm': 0.13006538152694702, 'learning_rate': 0.00033, 'epoch': 3.4}


 34%|███▍      | 8601/25000 [20:55<38:30,  7.10it/s]

{'loss': 0.2435, 'grad_norm': 0.1259143054485321, 'learning_rate': 0.000328, 'epoch': 3.44}


 35%|███▍      | 8701/25000 [21:09<38:16,  7.10it/s]

{'loss': 0.2576, 'grad_norm': 0.14562195539474487, 'learning_rate': 0.000326, 'epoch': 3.48}


 35%|███▌      | 8801/25000 [21:23<37:58,  7.11it/s]

{'loss': 0.2687, 'grad_norm': 0.1500462144613266, 'learning_rate': 0.000324, 'epoch': 3.52}


 36%|███▌      | 8901/25000 [21:37<37:36,  7.14it/s]

{'loss': 0.2634, 'grad_norm': 0.12704144418239594, 'learning_rate': 0.000322, 'epoch': 3.56}


 36%|███▌      | 9001/25000 [21:51<37:31,  7.11it/s]

{'loss': 0.2596, 'grad_norm': 0.15139420330524445, 'learning_rate': 0.00032, 'epoch': 3.6}


 36%|███▋      | 9101/25000 [22:05<37:09,  7.13it/s]

{'loss': 0.2418, 'grad_norm': 0.12828586995601654, 'learning_rate': 0.00031800000000000003, 'epoch': 3.64}


 37%|███▋      | 9201/25000 [22:19<37:01,  7.11it/s]

{'loss': 0.2491, 'grad_norm': 0.1162298396229744, 'learning_rate': 0.000316, 'epoch': 3.68}


 37%|███▋      | 9301/25000 [22:33<36:43,  7.13it/s]

{'loss': 0.2485, 'grad_norm': 0.13857892155647278, 'learning_rate': 0.000314, 'epoch': 3.72}


 38%|███▊      | 9401/25000 [22:47<36:32,  7.11it/s]

{'loss': 0.2666, 'grad_norm': 0.13242611289024353, 'learning_rate': 0.000312, 'epoch': 3.76}


 38%|███▊      | 9501/25000 [23:01<36:18,  7.12it/s]

{'loss': 0.2482, 'grad_norm': 0.11194045841693878, 'learning_rate': 0.00031, 'epoch': 3.8}


 38%|███▊      | 9601/25000 [23:15<36:14,  7.08it/s]

{'loss': 0.2576, 'grad_norm': 0.13068845868110657, 'learning_rate': 0.000308, 'epoch': 3.84}


 39%|███▉      | 9701/25000 [23:29<36:27,  7.00it/s]

{'loss': 0.2479, 'grad_norm': 0.14256861805915833, 'learning_rate': 0.000306, 'epoch': 3.88}


 39%|███▉      | 9801/25000 [23:43<35:42,  7.10it/s]

{'loss': 0.257, 'grad_norm': 0.13125759363174438, 'learning_rate': 0.000304, 'epoch': 3.92}


 40%|███▉      | 9901/25000 [23:57<35:47,  7.03it/s]

{'loss': 0.2761, 'grad_norm': 0.13414610922336578, 'learning_rate': 0.000302, 'epoch': 3.96}


 40%|████      | 10000/25000 [24:11<34:55,  7.16it/s]

{'loss': 0.2405, 'grad_norm': 0.15720456838607788, 'learning_rate': 0.0003, 'epoch': 4.0}


                                                     
 40%|████      | 10000/25000 [24:23<34:55,  7.16it/s]

{'eval_loss': 0.23756161332130432, 'eval_runtime': 11.9173, 'eval_samples_per_second': 83.912, 'eval_steps_per_second': 10.489, 'epoch': 4.0}


 40%|████      | 10101/25000 [24:37<35:00,  7.09it/s]   

{'loss': 0.2464, 'grad_norm': 0.142657071352005, 'learning_rate': 0.000298, 'epoch': 4.04}


 41%|████      | 10201/25000 [24:51<34:35,  7.13it/s]

{'loss': 0.2427, 'grad_norm': 0.12289410084486008, 'learning_rate': 0.000296, 'epoch': 4.08}


 41%|████      | 10301/25000 [25:05<34:28,  7.10it/s]

{'loss': 0.2375, 'grad_norm': 0.15419510006904602, 'learning_rate': 0.000294, 'epoch': 4.12}


 42%|████▏     | 10401/25000 [25:19<34:09,  7.12it/s]

{'loss': 0.2549, 'grad_norm': 0.10871855914592743, 'learning_rate': 0.000292, 'epoch': 4.16}


 42%|████▏     | 10501/25000 [25:33<34:03,  7.09it/s]

{'loss': 0.2554, 'grad_norm': 0.15821997821331024, 'learning_rate': 0.00029, 'epoch': 4.2}


 42%|████▏     | 10601/25000 [25:47<33:49,  7.09it/s]

{'loss': 0.2591, 'grad_norm': 0.16091087460517883, 'learning_rate': 0.000288, 'epoch': 4.24}


 43%|████▎     | 10701/25000 [26:01<33:28,  7.12it/s]

{'loss': 0.2526, 'grad_norm': 0.19381000101566315, 'learning_rate': 0.00028599999999999996, 'epoch': 4.28}


 43%|████▎     | 10801/25000 [26:15<33:14,  7.12it/s]

{'loss': 0.2561, 'grad_norm': 0.18849891424179077, 'learning_rate': 0.00028399999999999996, 'epoch': 4.32}


 44%|████▎     | 10901/25000 [26:29<33:12,  7.07it/s]

{'loss': 0.2426, 'grad_norm': 0.10073820501565933, 'learning_rate': 0.00028199999999999997, 'epoch': 4.36}


 44%|████▍     | 11001/25000 [26:43<32:57,  7.08it/s]

{'loss': 0.2631, 'grad_norm': 0.20305973291397095, 'learning_rate': 0.00028000000000000003, 'epoch': 4.4}


 44%|████▍     | 11101/25000 [26:57<32:33,  7.11it/s]

{'loss': 0.2494, 'grad_norm': 0.16225440800189972, 'learning_rate': 0.00027800000000000004, 'epoch': 4.44}


 45%|████▍     | 11201/25000 [27:11<32:15,  7.13it/s]

{'loss': 0.244, 'grad_norm': 0.14550647139549255, 'learning_rate': 0.00027600000000000004, 'epoch': 4.48}


 45%|████▌     | 11301/25000 [27:25<32:32,  7.02it/s]

{'loss': 0.2438, 'grad_norm': 0.17282670736312866, 'learning_rate': 0.00027400000000000005, 'epoch': 4.52}


 46%|████▌     | 11401/25000 [27:39<31:45,  7.14it/s]

{'loss': 0.2483, 'grad_norm': 0.12041207402944565, 'learning_rate': 0.00027200000000000005, 'epoch': 4.56}


 46%|████▌     | 11501/25000 [27:53<31:56,  7.04it/s]

{'loss': 0.2444, 'grad_norm': 0.1792166531085968, 'learning_rate': 0.00027, 'epoch': 4.6}


 46%|████▋     | 11601/25000 [28:07<31:37,  7.06it/s]

{'loss': 0.2603, 'grad_norm': 0.15911120176315308, 'learning_rate': 0.000268, 'epoch': 4.64}


 47%|████▋     | 11701/25000 [28:21<31:33,  7.03it/s]

{'loss': 0.2533, 'grad_norm': 0.14706546068191528, 'learning_rate': 0.000266, 'epoch': 4.68}


 47%|████▋     | 11801/25000 [28:36<30:52,  7.12it/s]

{'loss': 0.2447, 'grad_norm': 0.1686074137687683, 'learning_rate': 0.000264, 'epoch': 4.72}


 48%|████▊     | 11901/25000 [28:50<30:38,  7.13it/s]

{'loss': 0.2523, 'grad_norm': 0.11934149265289307, 'learning_rate': 0.000262, 'epoch': 4.76}


 48%|████▊     | 12000/25000 [29:03<30:24,  7.13it/s]

{'loss': 0.2497, 'grad_norm': 0.15627354383468628, 'learning_rate': 0.00026000000000000003, 'epoch': 4.8}


                                                     
 48%|████▊     | 12000/25000 [29:15<30:24,  7.13it/s]

{'eval_loss': 0.2342473566532135, 'eval_runtime': 11.9188, 'eval_samples_per_second': 83.901, 'eval_steps_per_second': 10.488, 'epoch': 4.8}


 48%|████▊     | 12101/25000 [29:30<30:15,  7.11it/s]   

{'loss': 0.2502, 'grad_norm': 0.16795358061790466, 'learning_rate': 0.00025800000000000004, 'epoch': 4.84}


 49%|████▉     | 12201/25000 [29:44<29:59,  7.11it/s]

{'loss': 0.2577, 'grad_norm': 0.13976575434207916, 'learning_rate': 0.000256, 'epoch': 4.88}


 49%|████▉     | 12301/25000 [29:58<29:56,  7.07it/s]

{'loss': 0.2195, 'grad_norm': 0.12079060077667236, 'learning_rate': 0.000254, 'epoch': 4.92}


 50%|████▉     | 12401/25000 [30:12<29:38,  7.08it/s]

{'loss': 0.2324, 'grad_norm': 0.14391347765922546, 'learning_rate': 0.000252, 'epoch': 4.96}


 50%|█████     | 12501/25000 [30:26<29:31,  7.05it/s]

{'loss': 0.2588, 'grad_norm': 0.1594959944486618, 'learning_rate': 0.00025, 'epoch': 5.0}


 50%|█████     | 12601/25000 [30:40<29:02,  7.12it/s]

{'loss': 0.2523, 'grad_norm': 0.17180724442005157, 'learning_rate': 0.000248, 'epoch': 5.04}


 51%|█████     | 12701/25000 [30:54<28:49,  7.11it/s]

{'loss': 0.2508, 'grad_norm': 0.13249191641807556, 'learning_rate': 0.000246, 'epoch': 5.08}


 51%|█████     | 12801/25000 [31:08<28:31,  7.13it/s]

{'loss': 0.2496, 'grad_norm': 0.15301527082920074, 'learning_rate': 0.000244, 'epoch': 5.12}


 52%|█████▏    | 12901/25000 [31:22<28:15,  7.14it/s]

{'loss': 0.2349, 'grad_norm': 0.12349250912666321, 'learning_rate': 0.000242, 'epoch': 5.16}


 52%|█████▏    | 13001/25000 [31:36<28:20,  7.06it/s]

{'loss': 0.2522, 'grad_norm': 0.217119961977005, 'learning_rate': 0.00024, 'epoch': 5.2}


 52%|█████▏    | 13101/25000 [31:50<27:57,  7.09it/s]

{'loss': 0.263, 'grad_norm': 0.14139530062675476, 'learning_rate': 0.00023799999999999998, 'epoch': 5.24}


 53%|█████▎    | 13201/25000 [32:04<27:49,  7.07it/s]

{'loss': 0.2365, 'grad_norm': 0.1209568902850151, 'learning_rate': 0.000236, 'epoch': 5.28}


 53%|█████▎    | 13301/25000 [32:19<27:34,  7.07it/s]

{'loss': 0.2355, 'grad_norm': 0.16740410029888153, 'learning_rate': 0.00023400000000000002, 'epoch': 5.32}


 54%|█████▎    | 13401/25000 [32:33<27:14,  7.10it/s]

{'loss': 0.2383, 'grad_norm': 0.1390448808670044, 'learning_rate': 0.00023200000000000003, 'epoch': 5.36}


 54%|█████▍    | 13501/25000 [32:47<26:59,  7.10it/s]

{'loss': 0.2477, 'grad_norm': 0.2314240038394928, 'learning_rate': 0.00023, 'epoch': 5.4}


 54%|█████▍    | 13601/25000 [33:01<26:40,  7.12it/s]

{'loss': 0.2368, 'grad_norm': 0.13746045529842377, 'learning_rate': 0.000228, 'epoch': 5.44}


 55%|█████▍    | 13701/25000 [33:15<26:27,  7.12it/s]

{'loss': 0.256, 'grad_norm': 0.18186333775520325, 'learning_rate': 0.00022600000000000002, 'epoch': 5.48}


 55%|█████▌    | 13801/25000 [33:29<26:11,  7.12it/s]

{'loss': 0.2256, 'grad_norm': 0.16389597952365875, 'learning_rate': 0.000224, 'epoch': 5.52}


 56%|█████▌    | 13901/25000 [33:43<26:01,  7.11it/s]

{'loss': 0.251, 'grad_norm': 0.19785484671592712, 'learning_rate': 0.000222, 'epoch': 5.56}


 56%|█████▌    | 14000/25000 [33:56<25:37,  7.15it/s]

{'loss': 0.2487, 'grad_norm': 0.16709795594215393, 'learning_rate': 0.00022, 'epoch': 5.6}


                                                     
 56%|█████▌    | 14000/25000 [34:08<25:37,  7.15it/s]

{'eval_loss': 0.23157745599746704, 'eval_runtime': 11.9089, 'eval_samples_per_second': 83.971, 'eval_steps_per_second': 10.496, 'epoch': 5.6}


 56%|█████▋    | 14101/25000 [34:23<25:29,  7.12it/s]   

{'loss': 0.2464, 'grad_norm': 0.195377916097641, 'learning_rate': 0.000218, 'epoch': 5.64}


 57%|█████▋    | 14201/25000 [34:37<25:19,  7.11it/s]

{'loss': 0.2381, 'grad_norm': 0.14292693138122559, 'learning_rate': 0.000216, 'epoch': 5.68}


 57%|█████▋    | 14301/25000 [34:51<25:08,  7.09it/s]

{'loss': 0.2337, 'grad_norm': 0.20457227528095245, 'learning_rate': 0.000214, 'epoch': 5.72}


 58%|█████▊    | 14401/25000 [35:05<24:48,  7.12it/s]

{'loss': 0.235, 'grad_norm': 0.13788937032222748, 'learning_rate': 0.000212, 'epoch': 5.76}


 58%|█████▊    | 14501/25000 [35:19<24:29,  7.15it/s]

{'loss': 0.2548, 'grad_norm': 0.13527649641036987, 'learning_rate': 0.00021, 'epoch': 5.8}


 58%|█████▊    | 14601/25000 [35:33<24:16,  7.14it/s]

{'loss': 0.2434, 'grad_norm': 0.17159534990787506, 'learning_rate': 0.000208, 'epoch': 5.84}


 59%|█████▉    | 14701/25000 [35:47<24:06,  7.12it/s]

{'loss': 0.2414, 'grad_norm': 0.23352576792240143, 'learning_rate': 0.000206, 'epoch': 5.88}


 59%|█████▉    | 14801/25000 [36:01<23:53,  7.11it/s]

{'loss': 0.245, 'grad_norm': 0.14933748543262482, 'learning_rate': 0.000204, 'epoch': 5.92}


 60%|█████▉    | 14901/25000 [36:15<23:39,  7.11it/s]

{'loss': 0.2435, 'grad_norm': 0.2003670334815979, 'learning_rate': 0.000202, 'epoch': 5.96}


 60%|██████    | 15001/25000 [36:29<23:26,  7.11it/s]

{'loss': 0.2567, 'grad_norm': 0.13364483416080475, 'learning_rate': 0.0002, 'epoch': 6.0}


 60%|██████    | 15101/25000 [36:43<23:12,  7.11it/s]

{'loss': 0.2319, 'grad_norm': 0.14463748037815094, 'learning_rate': 0.00019800000000000002, 'epoch': 6.04}


 61%|██████    | 15201/25000 [36:57<22:54,  7.13it/s]

{'loss': 0.2297, 'grad_norm': 0.1998894214630127, 'learning_rate': 0.00019600000000000002, 'epoch': 6.08}


 61%|██████    | 15301/25000 [37:11<22:45,  7.10it/s]

{'loss': 0.2475, 'grad_norm': 0.16264848411083221, 'learning_rate': 0.000194, 'epoch': 6.12}


 62%|██████▏   | 15401/25000 [37:25<22:15,  7.19it/s]

{'loss': 0.2347, 'grad_norm': 0.12403597682714462, 'learning_rate': 0.000192, 'epoch': 6.16}


 62%|██████▏   | 15501/25000 [37:39<22:10,  7.14it/s]

{'loss': 0.2253, 'grad_norm': 0.1874779760837555, 'learning_rate': 0.00019, 'epoch': 6.2}


 62%|██████▏   | 15601/25000 [37:53<21:58,  7.13it/s]

{'loss': 0.265, 'grad_norm': 0.14812105894088745, 'learning_rate': 0.00018800000000000002, 'epoch': 6.24}


 63%|██████▎   | 15701/25000 [38:07<21:43,  7.13it/s]

{'loss': 0.2435, 'grad_norm': 0.17289447784423828, 'learning_rate': 0.000186, 'epoch': 6.28}


 63%|██████▎   | 15801/25000 [38:21<21:41,  7.07it/s]

{'loss': 0.2408, 'grad_norm': 0.2260292023420334, 'learning_rate': 0.000184, 'epoch': 6.32}


 64%|██████▎   | 15901/25000 [38:35<21:14,  7.14it/s]

{'loss': 0.2385, 'grad_norm': 0.14801368117332458, 'learning_rate': 0.000182, 'epoch': 6.36}


 64%|██████▍   | 16000/25000 [38:49<20:51,  7.19it/s]

{'loss': 0.2475, 'grad_norm': 0.22132186591625214, 'learning_rate': 0.00017999999999999998, 'epoch': 6.4}


                                                     
 64%|██████▍   | 16000/25000 [39:01<20:51,  7.19it/s]

{'eval_loss': 0.22907140851020813, 'eval_runtime': 11.8948, 'eval_samples_per_second': 84.071, 'eval_steps_per_second': 10.509, 'epoch': 6.4}


 64%|██████▍   | 16101/25000 [39:15<20:47,  7.13it/s]  

{'loss': 0.23, 'grad_norm': 0.13092732429504395, 'learning_rate': 0.000178, 'epoch': 6.44}


 65%|██████▍   | 16201/25000 [39:29<20:33,  7.14it/s]

{'loss': 0.2513, 'grad_norm': 0.15007756650447845, 'learning_rate': 0.000176, 'epoch': 6.48}


 65%|██████▌   | 16301/25000 [39:43<20:15,  7.16it/s]

{'loss': 0.2255, 'grad_norm': 0.16170787811279297, 'learning_rate': 0.000174, 'epoch': 6.52}


 66%|██████▌   | 16401/25000 [39:57<20:09,  7.11it/s]

{'loss': 0.2416, 'grad_norm': 0.17298009991645813, 'learning_rate': 0.00017199999999999998, 'epoch': 6.56}


 66%|██████▌   | 16501/25000 [40:11<19:54,  7.11it/s]

{'loss': 0.2357, 'grad_norm': 0.17872434854507446, 'learning_rate': 0.00017, 'epoch': 6.6}


 66%|██████▋   | 16601/25000 [40:25<19:31,  7.17it/s]

{'loss': 0.2494, 'grad_norm': 0.17506183683872223, 'learning_rate': 0.00016800000000000002, 'epoch': 6.64}


 67%|██████▋   | 16701/25000 [40:39<19:23,  7.13it/s]

{'loss': 0.2482, 'grad_norm': 0.18646016716957092, 'learning_rate': 0.00016600000000000002, 'epoch': 6.68}


 67%|██████▋   | 16801/25000 [40:53<19:16,  7.09it/s]

{'loss': 0.2522, 'grad_norm': 0.1833641231060028, 'learning_rate': 0.000164, 'epoch': 6.72}


 68%|██████▊   | 16901/25000 [41:07<19:03,  7.08it/s]

{'loss': 0.2428, 'grad_norm': 0.2255616933107376, 'learning_rate': 0.000162, 'epoch': 6.76}


 68%|██████▊   | 17001/25000 [41:21<18:42,  7.13it/s]

{'loss': 0.2465, 'grad_norm': 0.2080107480287552, 'learning_rate': 0.00016, 'epoch': 6.8}


 68%|██████▊   | 17101/25000 [41:35<18:27,  7.13it/s]

{'loss': 0.2373, 'grad_norm': 0.1919228583574295, 'learning_rate': 0.000158, 'epoch': 6.84}


 69%|██████▉   | 17201/25000 [41:49<18:16,  7.11it/s]

{'loss': 0.2343, 'grad_norm': 0.13943248987197876, 'learning_rate': 0.000156, 'epoch': 6.88}


 69%|██████▉   | 17301/25000 [42:03<18:00,  7.12it/s]

{'loss': 0.241, 'grad_norm': 0.16703081130981445, 'learning_rate': 0.000154, 'epoch': 6.92}


 70%|██████▉   | 17401/25000 [42:17<17:49,  7.11it/s]

{'loss': 0.2494, 'grad_norm': 0.18613941967487335, 'learning_rate': 0.000152, 'epoch': 6.96}


 70%|███████   | 17501/25000 [42:31<17:35,  7.11it/s]

{'loss': 0.2397, 'grad_norm': 0.11912137269973755, 'learning_rate': 0.00015, 'epoch': 7.0}


 70%|███████   | 17601/25000 [42:44<17:13,  7.16it/s]

{'loss': 0.218, 'grad_norm': 0.17043422162532806, 'learning_rate': 0.000148, 'epoch': 7.04}


 71%|███████   | 17701/25000 [42:58<17:02,  7.14it/s]

{'loss': 0.2513, 'grad_norm': 0.13277243077754974, 'learning_rate': 0.000146, 'epoch': 7.08}


 71%|███████   | 17801/25000 [43:12<16:49,  7.13it/s]

{'loss': 0.2421, 'grad_norm': 0.16952815651893616, 'learning_rate': 0.000144, 'epoch': 7.12}


 72%|███████▏  | 17901/25000 [43:26<16:36,  7.12it/s]

{'loss': 0.236, 'grad_norm': 0.1961527317762375, 'learning_rate': 0.00014199999999999998, 'epoch': 7.16}


 72%|███████▏  | 18000/25000 [43:40<16:21,  7.13it/s]

{'loss': 0.2345, 'grad_norm': 0.1899266242980957, 'learning_rate': 0.00014000000000000001, 'epoch': 7.2}


                                                     
 72%|███████▏  | 18000/25000 [43:52<16:21,  7.13it/s]

{'eval_loss': 0.2272019237279892, 'eval_runtime': 11.8828, 'eval_samples_per_second': 84.155, 'eval_steps_per_second': 10.519, 'epoch': 7.2}


 72%|███████▏  | 18101/25000 [44:07<16:07,  7.13it/s]  

{'loss': 0.2442, 'grad_norm': 0.14892379939556122, 'learning_rate': 0.00013800000000000002, 'epoch': 7.24}


 73%|███████▎  | 18201/25000 [44:21<15:58,  7.09it/s]

{'loss': 0.2515, 'grad_norm': 0.1336948424577713, 'learning_rate': 0.00013600000000000003, 'epoch': 7.28}


 73%|███████▎  | 18301/25000 [44:35<15:43,  7.10it/s]

{'loss': 0.2233, 'grad_norm': 0.23355358839035034, 'learning_rate': 0.000134, 'epoch': 7.32}


 74%|███████▎  | 18401/25000 [44:49<15:27,  7.12it/s]

{'loss': 0.2431, 'grad_norm': 0.2131851464509964, 'learning_rate': 0.000132, 'epoch': 7.36}


 74%|███████▍  | 18501/25000 [45:02<15:10,  7.14it/s]

{'loss': 0.2426, 'grad_norm': 0.1420382261276245, 'learning_rate': 0.00013000000000000002, 'epoch': 7.4}


 74%|███████▍  | 18601/25000 [45:16<14:56,  7.14it/s]

{'loss': 0.2451, 'grad_norm': 0.1215156689286232, 'learning_rate': 0.000128, 'epoch': 7.44}


 75%|███████▍  | 18701/25000 [45:30<14:44,  7.13it/s]

{'loss': 0.2265, 'grad_norm': 0.14339645206928253, 'learning_rate': 0.000126, 'epoch': 7.48}


 75%|███████▌  | 18801/25000 [45:44<14:29,  7.13it/s]

{'loss': 0.2594, 'grad_norm': 0.1808350682258606, 'learning_rate': 0.000124, 'epoch': 7.52}


 76%|███████▌  | 18901/25000 [45:58<14:18,  7.11it/s]

{'loss': 0.2304, 'grad_norm': 0.1743525266647339, 'learning_rate': 0.000122, 'epoch': 7.56}


 76%|███████▌  | 19001/25000 [46:12<14:02,  7.12it/s]

{'loss': 0.2386, 'grad_norm': 0.2551339566707611, 'learning_rate': 0.00012, 'epoch': 7.6}


 76%|███████▋  | 19101/25000 [46:26<13:48,  7.12it/s]

{'loss': 0.2449, 'grad_norm': 0.20667506754398346, 'learning_rate': 0.000118, 'epoch': 7.64}


 77%|███████▋  | 19201/25000 [46:40<13:35,  7.11it/s]

{'loss': 0.223, 'grad_norm': 0.20768015086650848, 'learning_rate': 0.00011600000000000001, 'epoch': 7.68}


 77%|███████▋  | 19301/25000 [46:54<13:23,  7.09it/s]

{'loss': 0.2327, 'grad_norm': 0.12561549246311188, 'learning_rate': 0.000114, 'epoch': 7.72}


 78%|███████▊  | 19401/25000 [47:08<13:01,  7.17it/s]

{'loss': 0.2285, 'grad_norm': 0.20265726745128632, 'learning_rate': 0.000112, 'epoch': 7.76}


 78%|███████▊  | 19501/25000 [47:22<12:52,  7.11it/s]

{'loss': 0.2343, 'grad_norm': 0.1994476616382599, 'learning_rate': 0.00011, 'epoch': 7.8}


 78%|███████▊  | 19601/25000 [47:36<12:37,  7.13it/s]

{'loss': 0.2336, 'grad_norm': 0.1509179174900055, 'learning_rate': 0.000108, 'epoch': 7.84}


 79%|███████▉  | 19701/25000 [47:50<12:21,  7.14it/s]

{'loss': 0.2443, 'grad_norm': 0.19021166861057281, 'learning_rate': 0.000106, 'epoch': 7.88}


 79%|███████▉  | 19801/25000 [48:04<12:08,  7.14it/s]

{'loss': 0.2381, 'grad_norm': 0.18866460025310516, 'learning_rate': 0.000104, 'epoch': 7.92}


 80%|███████▉  | 19901/25000 [48:18<11:56,  7.11it/s]

{'loss': 0.2527, 'grad_norm': 0.20503176748752594, 'learning_rate': 0.000102, 'epoch': 7.96}


 80%|████████  | 20000/25000 [48:32<13:53,  6.00it/s]

{'loss': 0.2374, 'grad_norm': 0.17555010318756104, 'learning_rate': 0.0001, 'epoch': 8.0}


                                                     
 80%|████████  | 20000/25000 [48:44<13:53,  6.00it/s]

{'eval_loss': 0.2254587560892105, 'eval_runtime': 11.9527, 'eval_samples_per_second': 83.663, 'eval_steps_per_second': 10.458, 'epoch': 8.0}


 80%|████████  | 20101/25000 [48:59<11:33,  7.06it/s]  

{'loss': 0.2198, 'grad_norm': 0.1494578868150711, 'learning_rate': 9.800000000000001e-05, 'epoch': 8.04}


 81%|████████  | 20201/25000 [49:13<11:17,  7.09it/s]

{'loss': 0.2467, 'grad_norm': 0.12842132151126862, 'learning_rate': 9.6e-05, 'epoch': 8.08}


 81%|████████  | 20301/25000 [49:27<10:58,  7.14it/s]

{'loss': 0.2271, 'grad_norm': 0.18613772094249725, 'learning_rate': 9.400000000000001e-05, 'epoch': 8.12}


 82%|████████▏ | 20401/25000 [49:41<10:45,  7.12it/s]

{'loss': 0.2494, 'grad_norm': 0.17336255311965942, 'learning_rate': 9.2e-05, 'epoch': 8.16}


 82%|████████▏ | 20501/25000 [49:55<10:33,  7.11it/s]

{'loss': 0.2228, 'grad_norm': 0.16373199224472046, 'learning_rate': 8.999999999999999e-05, 'epoch': 8.2}


 82%|████████▏ | 20601/25000 [50:09<10:17,  7.12it/s]

{'loss': 0.2296, 'grad_norm': 0.15265978872776031, 'learning_rate': 8.8e-05, 'epoch': 8.24}


 83%|████████▎ | 20701/25000 [50:23<10:04,  7.11it/s]

{'loss': 0.231, 'grad_norm': 0.13452988862991333, 'learning_rate': 8.599999999999999e-05, 'epoch': 8.28}


 83%|████████▎ | 20801/25000 [50:37<09:51,  7.10it/s]

{'loss': 0.2326, 'grad_norm': 0.24381165206432343, 'learning_rate': 8.400000000000001e-05, 'epoch': 8.32}


 84%|████████▎ | 20901/25000 [50:51<09:36,  7.11it/s]

{'loss': 0.2229, 'grad_norm': 0.16389210522174835, 'learning_rate': 8.2e-05, 'epoch': 8.36}


 84%|████████▍ | 21001/25000 [51:05<09:23,  7.10it/s]

{'loss': 0.2369, 'grad_norm': 0.13832597434520721, 'learning_rate': 8e-05, 'epoch': 8.4}


 84%|████████▍ | 21101/25000 [51:19<09:05,  7.14it/s]

{'loss': 0.2408, 'grad_norm': 0.17855559289455414, 'learning_rate': 7.8e-05, 'epoch': 8.44}


 85%|████████▍ | 21201/25000 [51:33<08:54,  7.11it/s]

{'loss': 0.2446, 'grad_norm': 0.19877494871616364, 'learning_rate': 7.6e-05, 'epoch': 8.48}


 85%|████████▌ | 21301/25000 [51:47<08:40,  7.10it/s]

{'loss': 0.2199, 'grad_norm': 0.17295216023921967, 'learning_rate': 7.4e-05, 'epoch': 8.52}


 86%|████████▌ | 21401/25000 [52:01<08:26,  7.11it/s]

{'loss': 0.2385, 'grad_norm': 0.2185422033071518, 'learning_rate': 7.2e-05, 'epoch': 8.56}


 86%|████████▌ | 21501/25000 [52:15<08:15,  7.07it/s]

{'loss': 0.2566, 'grad_norm': 0.25178927183151245, 'learning_rate': 7.000000000000001e-05, 'epoch': 8.6}


 86%|████████▋ | 21601/25000 [52:29<07:58,  7.11it/s]

{'loss': 0.2541, 'grad_norm': 0.15219329297542572, 'learning_rate': 6.800000000000001e-05, 'epoch': 8.64}


 87%|████████▋ | 21701/25000 [52:43<07:44,  7.11it/s]

{'loss': 0.2523, 'grad_norm': 0.14658354222774506, 'learning_rate': 6.6e-05, 'epoch': 8.68}


 87%|████████▋ | 21801/25000 [52:57<07:31,  7.09it/s]

{'loss': 0.2335, 'grad_norm': 0.19576957821846008, 'learning_rate': 6.4e-05, 'epoch': 8.72}


 88%|████████▊ | 21901/25000 [53:11<07:15,  7.12it/s]

{'loss': 0.2202, 'grad_norm': 0.14529091119766235, 'learning_rate': 6.2e-05, 'epoch': 8.76}


 88%|████████▊ | 22000/25000 [53:25<07:19,  6.82it/s]

{'loss': 0.2267, 'grad_norm': 0.14238351583480835, 'learning_rate': 6e-05, 'epoch': 8.8}


                                                     
 88%|████████▊ | 22000/25000 [53:37<07:19,  6.82it/s]

{'eval_loss': 0.2244054526090622, 'eval_runtime': 11.9542, 'eval_samples_per_second': 83.653, 'eval_steps_per_second': 10.457, 'epoch': 8.8}


 88%|████████▊ | 22101/25000 [53:51<06:47,  7.11it/s]  

{'loss': 0.2497, 'grad_norm': 0.14438217878341675, 'learning_rate': 5.800000000000001e-05, 'epoch': 8.84}


 89%|████████▉ | 22201/25000 [54:05<07:34,  6.15it/s]

{'loss': 0.2314, 'grad_norm': 0.1679127812385559, 'learning_rate': 5.6e-05, 'epoch': 8.88}


 89%|████████▉ | 22301/25000 [54:19<06:19,  7.11it/s]

{'loss': 0.2275, 'grad_norm': 0.15336230397224426, 'learning_rate': 5.4e-05, 'epoch': 8.92}


 90%|████████▉ | 22401/25000 [54:33<06:05,  7.10it/s]

{'loss': 0.2286, 'grad_norm': 0.11763247847557068, 'learning_rate': 5.2e-05, 'epoch': 8.96}


 90%|█████████ | 22501/25000 [54:47<05:54,  7.06it/s]

{'loss': 0.2502, 'grad_norm': 0.18057139217853546, 'learning_rate': 5e-05, 'epoch': 9.0}


 90%|█████████ | 22601/25000 [55:01<05:34,  7.18it/s]

{'loss': 0.2481, 'grad_norm': 0.17036515474319458, 'learning_rate': 4.8e-05, 'epoch': 9.04}


 91%|█████████ | 22701/25000 [55:15<05:22,  7.14it/s]

{'loss': 0.223, 'grad_norm': 0.15749146044254303, 'learning_rate': 4.6e-05, 'epoch': 9.08}


 91%|█████████ | 22801/25000 [55:29<05:08,  7.13it/s]

{'loss': 0.2354, 'grad_norm': 0.12153053283691406, 'learning_rate': 4.4e-05, 'epoch': 9.12}


 92%|█████████▏| 22901/25000 [55:43<04:56,  7.09it/s]

{'loss': 0.232, 'grad_norm': 0.1984056681394577, 'learning_rate': 4.2000000000000004e-05, 'epoch': 9.16}


 92%|█████████▏| 23001/25000 [55:57<04:40,  7.13it/s]

{'loss': 0.235, 'grad_norm': 0.2532311677932739, 'learning_rate': 4e-05, 'epoch': 9.2}


 92%|█████████▏| 23101/25000 [56:11<04:26,  7.13it/s]

{'loss': 0.2278, 'grad_norm': 0.1909244954586029, 'learning_rate': 3.8e-05, 'epoch': 9.24}


 93%|█████████▎| 23201/25000 [56:25<04:13,  7.11it/s]

{'loss': 0.2443, 'grad_norm': 0.2482985258102417, 'learning_rate': 3.6e-05, 'epoch': 9.28}


 93%|█████████▎| 23301/25000 [56:39<03:58,  7.13it/s]

{'loss': 0.2261, 'grad_norm': 0.1613961160182953, 'learning_rate': 3.4000000000000007e-05, 'epoch': 9.32}


 94%|█████████▎| 23401/25000 [56:53<03:44,  7.12it/s]

{'loss': 0.2385, 'grad_norm': 0.15878909826278687, 'learning_rate': 3.2e-05, 'epoch': 9.36}


 94%|█████████▍| 23501/25000 [57:07<03:30,  7.11it/s]

{'loss': 0.2332, 'grad_norm': 0.18239812552928925, 'learning_rate': 3e-05, 'epoch': 9.4}


 94%|█████████▍| 23601/25000 [57:21<03:17,  7.10it/s]

{'loss': 0.2389, 'grad_norm': 0.17465896904468536, 'learning_rate': 2.8e-05, 'epoch': 9.44}


 95%|█████████▍| 23701/25000 [57:35<03:02,  7.11it/s]

{'loss': 0.2232, 'grad_norm': 0.17677749693393707, 'learning_rate': 2.6e-05, 'epoch': 9.48}


 95%|█████████▌| 23801/25000 [57:49<02:48,  7.13it/s]

{'loss': 0.2347, 'grad_norm': 0.13468244671821594, 'learning_rate': 2.4e-05, 'epoch': 9.52}


 96%|█████████▌| 23901/25000 [58:03<02:34,  7.12it/s]

{'loss': 0.2277, 'grad_norm': 0.17951129376888275, 'learning_rate': 2.2e-05, 'epoch': 9.56}


 96%|█████████▌| 24000/25000 [58:17<02:19,  7.15it/s]

{'loss': 0.2295, 'grad_norm': 0.21820013225078583, 'learning_rate': 2e-05, 'epoch': 9.6}


                                                     
 96%|█████████▌| 24000/25000 [58:29<02:19,  7.15it/s]

{'eval_loss': 0.22365118563175201, 'eval_runtime': 11.8935, 'eval_samples_per_second': 84.079, 'eval_steps_per_second': 10.51, 'epoch': 9.6}


 96%|█████████▋| 24101/25000 [58:43<02:05,  7.15it/s]  

{'loss': 0.2367, 'grad_norm': 0.1856994330883026, 'learning_rate': 1.8e-05, 'epoch': 9.64}


 97%|█████████▋| 24201/25000 [58:57<01:52,  7.09it/s]

{'loss': 0.2495, 'grad_norm': 0.1279367059469223, 'learning_rate': 1.6e-05, 'epoch': 9.68}


 97%|█████████▋| 24301/25000 [59:11<01:37,  7.15it/s]

{'loss': 0.2494, 'grad_norm': 0.17607693374156952, 'learning_rate': 1.4e-05, 'epoch': 9.72}


 98%|█████████▊| 24401/25000 [59:25<01:24,  7.10it/s]

{'loss': 0.2178, 'grad_norm': 0.24483703076839447, 'learning_rate': 1.2e-05, 'epoch': 9.76}


 98%|█████████▊| 24501/25000 [59:39<01:10,  7.10it/s]

{'loss': 0.2341, 'grad_norm': 0.19208550453186035, 'learning_rate': 1e-05, 'epoch': 9.8}


 98%|█████████▊| 24601/25000 [59:53<00:55,  7.16it/s]

{'loss': 0.2588, 'grad_norm': 0.20165905356407166, 'learning_rate': 8e-06, 'epoch': 9.84}


 99%|█████████▉| 24701/25000 [1:00:07<00:41,  7.15it/s]

{'loss': 0.2241, 'grad_norm': 0.23251308500766754, 'learning_rate': 6e-06, 'epoch': 9.88}


 99%|█████████▉| 24801/25000 [1:00:21<00:27,  7.16it/s]

{'loss': 0.2242, 'grad_norm': 0.16620193421840668, 'learning_rate': 4e-06, 'epoch': 9.92}


100%|█████████▉| 24901/25000 [1:00:35<00:13,  7.12it/s]

{'loss': 0.2248, 'grad_norm': 0.20689702033996582, 'learning_rate': 2e-06, 'epoch': 9.96}


100%|██████████| 25000/25000 [1:00:49<00:00,  7.22it/s]

{'loss': 0.2283, 'grad_norm': 0.1532580554485321, 'learning_rate': 0.0, 'epoch': 10.0}


100%|██████████| 25000/25000 [1:00:49<00:00,  6.85it/s]

{'train_runtime': 3649.2486, 'train_samples_per_second': 27.403, 'train_steps_per_second': 6.851, 'train_loss': 0.2538819474029541, 'epoch': 10.0}





TrainOutput(global_step=25000, training_loss=0.2538819474029541, metrics={'train_runtime': 3649.2486, 'train_samples_per_second': 27.403, 'train_steps_per_second': 6.851, 'total_flos': 3.27672594432e+16, 'train_loss': 0.2538819474029541, 'epoch': 10.0})