In [1]:
# Shell out to `nvidia-smi` to record which GPU, driver and CUDA version this run sees.
!nvidia-smi

Wed May 14 16:38:11 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.247.01             Driver Version: 535.247.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:03:00.0 Off |                    0 |
| N/A   31C    P0              69W / 700W |     13MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from models.transformers import DecoderOnlyTransformer
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import yaml
import gc
import torch
from torchinfo import summary
import os
import json
import shutil
import wandb
import yaml
from data.datasets import CFGDataset, verify_dataloader
from trainers.GPT_trainer import GPT_Trainer
from trainers.utils import create_optimizer, create_scheduler
import numpy as np

# --- Runtime setup -----------------------------------------------------------
# Use the GPU when PyTorch can see one; later cells key worker counts off `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the experiment configuration. The cells below read its `data`, `model`,
# `optimizer`, `scheduler` and `training` sections.
with open("config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
print("Config loaded successfully.")

# Send wandb's local run files to a scratch directory.
# This must be set on *this* process's environment: `os.system("export ...")`
# only changes a short-lived subshell, and "\t" inside a normal Python string is
# a TAB character, so the previous call never set WANDB_DIR at all.
# NOTE(review): "/tmp" is assumed to be the intended target — confirm.
os.environ["WANDB_DIR"] = "/tmp"
# Uncomment to make CUDA kernel launches synchronous while debugging device errors.
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


ModuleNotFoundError: No module named 'models'

#### Loading Data

In [2]:
# Keyword arguments shared by the train and validation datasets; only the
# source file differs between the two splits.
dataset_kwargs = dict(
    batch_size=config["data"]["batch_size"],
    seq_len=config["data"]["seq_len"],
    eos_token=config["data"]["eos_token"],
    sos_token=config["data"]["sos_token"],
)

train_dataset = CFGDataset(data_file="cfg_sentences_train_cfg3b.npy", **dataset_kwargs)
val_dataset = CFGDataset(data_file="cfg_sentences_val_cfg3b.npy", **dataset_kwargs)

# Worker processes and pinned host memory only pay off when batches are copied
# to a GPU, so both are switched off together on a CPU-only machine.
use_cuda = device == "cuda"
loader_kwargs = dict(
    # batch_size=None disables the DataLoader's automatic batching; the batch
    # size is handed to CFGDataset above, which presumably yields whole batches.
    batch_size=None,
    num_workers=config["data"]["NUM_WORKERS"] if use_cuda else 0,
    pin_memory=use_cuda,
)

train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

# Sanity-check both loaders before building the model.
verify_dataloader(train_loader)
print("_"*50)
verify_dataloader(val_loader)

Verifying dataloader...
Number of batches:  40669
Total number of tokens:  1.998977e+09
Example batch shapes (shifted, golden):  torch.Size([96, 512]) torch.Size([96, 512])
__________________________________________________
Verifying dataloader...
Number of batches:  4518
Total number of tokens:  2.221086e+08
Example batch shapes (shifted, golden):  torch.Size([96, 512]) torch.Size([96, 512])


In [4]:
# Start from the `model` section of the config (shallow copy so the loaded
# config itself is not modified) and add the values derived from the data section.
model_config = config['model'].copy()
model_config.update({
    'num_classes': config["data"]["vocab_size"] + 2,  # vocabulary plus the SOS and EOS tokens
    'seq_len': config["data"]["seq_len"],
})

model = DecoderOnlyTransformer(**model_config)

# Fetch a single batch to confirm the tensor shapes the model will receive.
batch = next(iter(train_loader))
shifted_transcripts, golden_transcripts = batch
print("Shape of shifted_transcripts : ", shifted_transcripts.shape)
print("Shape of golden_transcripts  : ", golden_transcripts.shape)

# Optional: detailed per-layer summary via torchinfo.
#model_stats = summary(model, input_data=[shifted_transcripts])
#print(model_stats)
print(f"Total parameters in the model: {sum(p.numel() for p in model.parameters()):,}")
# Optional: compile the model (left disabled).
#torch._dynamo.config.skip_nnmodule_hook_guards = False
#model = torch.compile(model, mode="default", fullgraph=True)


Shape of shifted_transcripts :  torch.Size([96, 512])
Shape of golden_transcripts  :  torch.Size([96, 512])
Total parameters in the model: 85,059,845


In [5]:
from trainers.utils.create_scheduler import plot_lr_schedule


# Free memory left over from earlier runs of this cell. Collect unreachable
# Python objects first so the tensors they hold are released, then ask the CUDA
# caching allocator to give its now-unused blocks back to the device.
gc.collect()
torch.cuda.empty_cache()
#wandb.finish()
trainer = GPT_Trainer(
    model=model,
    config=config,
    config_file="config.yaml",
    run_name="full_rope_86",
    device=device,
)
# Have wandb track the model; log="all" records both gradients and parameters.
wandb.watch(trainer.model, log="all")

# The optimizer and scheduler are attached after the trainer is constructed.
# Each is built from its own section of the config.
trainer.optimizer = create_optimizer.create_optimizer(
    model=model,
    opt_config=config["optimizer"],
)

# The scheduler is given the loader and the accumulation factor along with its
# config section — presumably to derive the number of optimizer steps; verify
# in create_scheduler.
trainer.scheduler = create_scheduler.create_scheduler(
    optimizer=trainer.optimizer,
    scheduler_config=config['scheduler'],
    train_loader=train_loader,
    gradient_accumulation_steps=config['training']['gradient_accumulation_steps'],
)

# Optional: visualise the learning-rate schedule before training.
#plot_lr_schedule(trainer.scheduler, num_epochs=len(train_loader), train_loader=train_loader)

# Optional: resume from a saved checkpoint.
#trainer.load_checkpoint("/ocean/projects/cis250019p/sfragara/lstm/expts/test/checkpoints/checkpoint-best-metric-model.pth")

Using device: cuda


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlucasfragara[0m ([33mteamlsfr[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



🔧 Configuring Optimizer:
├── Type: ADAMW
├── Base LR: 0.0003
├── Weight Decay: 0.1
├── Parameter Groups:
│   ├── Group: self_attn
│   │   ├── LR: 0.0003
│   │   └── Patterns: []
│   ├── Group: ffn
│   │   ├── LR: 0.0003
│   │   └── Patterns: []
│   └── Default Group (unmatched parameters)
└── AdamW Specific:
    ├── Betas: [0.9, 0.98]
    ├── Epsilon: 1e-08
    └── AMSGrad: False


  from .autonotebook import tqdm as notebook_tqdm



📈 Configuring Learning Rate Scheduler:
├── Type: COSINE
├── Cosine Annealing Settings:
│   ├── T_max: 40669 steps
│   └── Min LR: 1e-08
└── Warmup: Disabled


In [None]:
# Switch the wrapped model to training mode, then run one pass over the training
# loader; the validation loader is handed to the trainer as well.
trainer.model.train()

# Optional: run validation on its own first.
#trainer._validate_epoch(val_loader)
trainer.train_epoch(train_dataloader=train_loader, val_dataloader=val_loader)

                                                                                       [12:41<2:49:25,  3.75it/s, acc_step=1/1, ce_loss_token=0.9388, lr=0.000297, perplexity_token=2.5570]

generating
Using default generation config...
Generating with greedy search...




wandb metrics being logged:  {'train/ce_loss_token': 1.1724136917890904, 'train/perplexity_token': 3.229779005050659, 'val/ce_loss_token': 1.0457908771932125, 'val/perplexity_token': 2.8456482887268066, 'learning_rate': 0.000299887684217008}

ðŸ“Š Metrics (Epoch 500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 1.1724
â”‚   â””â”€â”€ perplexity_token: 3.2298
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 1.0458
    â””â”€â”€ perplexity_token: 2.8456
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000300


[Training LM]:   1%|â–‹                                                      | 527/40669 [02:39<2:58:28,  3.75it/s, acc_step=1/1, ce_loss_token=1.1665, lr=0.000300, perplexity_token=3.2107]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9388399131295205, 'train/perplexity_token': 2.5570132732391357, 'val/ce_loss_token': 0.7947334852069616, 'val/perplexity_token': 2.213850736618042, 'learning_rate': 0.00029720941233809124}

ðŸ“Š Metrics (Epoch 2500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.9388
â”‚   â””â”€â”€ perplexity_token: 2.5570
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7947
    â””â”€â”€ perplexity_token: 2.2139
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000297


                                                                                       [15:12<4:59:15,  2.10it/s, acc_step=1/1, ce_loss_token=0.9139, lr=0.000296, perplexity_token=2.4939]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9138626235201136, 'train/perplexity_token': 2.4939372539520264, 'val/ce_loss_token': 0.7819392960518599, 'val/perplexity_token': 2.185706853866577, 'learning_rate': 0.00029598758619104335}

ðŸ“Š Metrics (Epoch 3000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.9139
â”‚   â””â”€â”€ perplexity_token: 2.4939
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7819
    â””â”€â”€ perplexity_token: 2.1857
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000296


                                                                                       [17:48<2:45:09,  3.75it/s, acc_step=1/1, ce_loss_token=0.8945, lr=0.000295, perplexity_token=2.4462]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8945223544366494, 'train/perplexity_token': 2.446167230606079, 'val/ce_loss_token': 0.7714245095849037, 'val/perplexity_token': 2.1628451347351074, 'learning_rate': 0.0002945480092091429}

ðŸ“Š Metrics (Epoch 3500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8945
â”‚   â””â”€â”€ perplexity_token: 2.4462
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7714
    â””â”€â”€ perplexity_token: 2.1628
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000295


                                                                                       [20:45<4:57:10,  2.06it/s, acc_step=1/1, ce_loss_token=0.8790, lr=0.000293, perplexity_token=2.4084]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8789582620916054, 'train/perplexity_token': 2.4083895683288574, 'val/ce_loss_token': 0.7662643883377314, 'val/perplexity_token': 2.1517131328582764, 'learning_rate': 0.0002928928286970629}

ðŸ“Š Metrics (Epoch 4000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8790
â”‚   â””â”€â”€ perplexity_token: 2.4084
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7663
    â””â”€â”€ perplexity_token: 2.1517
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000293


                                                                                       [23:40<2:40:39,  3.75it/s, acc_step=1/1, ce_loss_token=0.8657, lr=0.000291, perplexity_token=2.3768]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8657384133026405, 'train/perplexity_token': 2.376760482788086, 'val/ce_loss_token': 0.758600115776062, 'val/perplexity_token': 2.135284900665283, 'learning_rate': 0.00029102451355844287}

ðŸ“Š Metrics (Epoch 4500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8657
â”‚   â””â”€â”€ perplexity_token: 2.3768
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7586
    â””â”€â”€ perplexity_token: 2.1353
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000291


                                                                                       [26:24<4:34:07,  2.17it/s, acc_step=1/1, ce_loss_token=0.8549, lr=0.000289, perplexity_token=2.3512]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8549123917596623, 'train/perplexity_token': 2.351168394088745, 'val/ce_loss_token': 0.7580171376466751, 'val/perplexity_token': 2.134040594100952, 'learning_rate': 0.0002889458506132172}

ðŸ“Š Metrics (Epoch 5000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8549
â”‚   â””â”€â”€ perplexity_token: 2.3512
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7580
    â””â”€â”€ perplexity_token: 2.1340
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000289


                                                                                       [28:54<2:36:12,  3.75it/s, acc_step=1/1, ce_loss_token=0.8459, lr=0.000287, perplexity_token=2.3301]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.845910506321287, 'train/perplexity_token': 2.3300983905792236, 'val/ce_loss_token': 0.7575489040464163, 'val/perplexity_token': 2.1330416202545166, 'learning_rate': 0.00028665994044073183}

ðŸ“Š Metrics (Epoch 5500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8459
â”‚   â””â”€â”€ perplexity_token: 2.3301
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7575
    â””â”€â”€ perplexity_token: 2.1330
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000287


                                                                                       [31:28<4:22:10,  2.20it/s, acc_step=1/1, ce_loss_token=0.8383, lr=0.000284, perplexity_token=2.3124]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.838273763785738, 'train/perplexity_token': 2.3123717308044434, 'val/ce_loss_token': 0.7527783960103989, 'val/perplexity_token': 2.122889995574951, 'learning_rate': 0.0002841701927548544}

ðŸ“Š Metrics (Epoch 6000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8383
â”‚   â””â”€â”€ perplexity_token: 2.3124
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7528
    â””â”€â”€ perplexity_token: 2.1229
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000284


                                                                                       [34:12<2:31:39,  3.75it/s, acc_step=1/1, ce_loss_token=0.8315, lr=0.000281, perplexity_token=2.2969]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8315406388052609, 'train/perplexity_token': 2.2968547344207764, 'val/ce_loss_token': 0.7505773119628429, 'val/perplexity_token': 2.11822247505188, 'learning_rate': 0.00028148032131796764}

ðŸ“Š Metrics (Epoch 6500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8315
â”‚   â””â”€â”€ perplexity_token: 2.2969
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7506
    â””â”€â”€ perplexity_token: 2.1182
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000281


                                                                                       [37:23<4:16:48,  2.19it/s, acc_step=1/1, ce_loss_token=0.8257, lr=0.000279, perplexity_token=2.2835]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8257084031936663, 'train/perplexity_token': 2.2834978103637695, 'val/ce_loss_token': 0.7494777161628008, 'val/perplexity_token': 2.1158947944641113, 'learning_rate': 0.0002785943384014407}

ðŸ“Š Metrics (Epoch 7000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8257
â”‚   â””â”€â”€ perplexity_token: 2.2835
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7495
    â””â”€â”€ perplexity_token: 2.1159
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000279


                                                                                       [40:21<2:27:11,  3.76it/s, acc_step=1/1, ce_loss_token=0.8206, lr=0.000276, perplexity_token=2.2719]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8206191904273705, 'train/perplexity_token': 2.2719061374664307, 'val/ce_loss_token': 0.7484979405999184, 'val/perplexity_token': 2.1138226985931396, 'learning_rate': 0.00027551654880083806}

ðŸ“Š Metrics (Epoch 7500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8206
â”‚   â””â”€â”€ perplexity_token: 2.2719
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7485
    â””â”€â”€ perplexity_token: 2.1138
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000276


                                                                                       [42:54<4:15:44,  2.13it/s, acc_step=1/1, ce_loss_token=0.8161, lr=0.000272, perplexity_token=2.2618]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8161448356792668, 'train/perplexity_token': 2.261763572692871, 'val/ce_loss_token': 0.7484059482812881, 'val/perplexity_token': 2.1136281490325928, 'learning_rate': 0.0002722515434147901}

ðŸ“Š Metrics (Epoch 8000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8161
â”‚   â””â”€â”€ perplexity_token: 2.2618
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7484
    â””â”€â”€ perplexity_token: 2.1136
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000272


                                                                                       [45:58<2:22:46,  3.76it/s, acc_step=1/1, ce_loss_token=0.8121, lr=0.000269, perplexity_token=2.2526]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8120821012831929, 'train/perplexity_token': 2.2525932788848877, 'val/ce_loss_token': 0.7465215474367142, 'val/perplexity_token': 2.1096489429473877, 'learning_rate': 0.00026880419239710864}

ðŸ“Š Metrics (Epoch 8500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8121
â”‚   â””â”€â”€ perplexity_token: 2.2526
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7465
    â””â”€â”€ perplexity_token: 2.1096
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000269


                                                                                       [48:48<4:12:28,  2.09it/s, acc_step=1/1, ce_loss_token=0.8085, lr=0.000265, perplexity_token=2.2444]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8084527337276965, 'train/perplexity_token': 2.2444324493408203, 'val/ce_loss_token': 0.7460612263530493, 'val/perplexity_token': 2.108678102493286, 'learning_rate': 0.00026517963789235823}

ðŸ“Š Metrics (Epoch 9000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8085
â”‚   â””â”€â”€ perplexity_token: 2.2444
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7461
    â””â”€â”€ perplexity_token: 2.1087
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000265


                                                                                       [51:29<2:18:25,  3.75it/s, acc_step=1/1, ce_loss_token=0.8052, lr=0.000261, perplexity_token=2.2371]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8051897767794381, 'train/perplexity_token': 2.237121105194092, 'val/ce_loss_token': 0.7472098730504513, 'val/perplexity_token': 2.1111013889312744, 'learning_rate': 0.00026138328636572217}

ðŸ“Š Metrics (Epoch 9500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8052
â”‚   â””â”€â”€ perplexity_token: 2.2371
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7472
    â””â”€â”€ perplexity_token: 2.1111
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000261


                                                                                       [54:39<4:03:29,  2.10it/s, acc_step=1/1, ce_loss_token=0.8022, lr=0.000257, perplexity_token=2.2305]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8022471547734676, 'train/perplexity_token': 2.2305476665496826, 'val/ce_loss_token': 0.7456682417541742, 'val/perplexity_token': 2.107849597930908, 'learning_rate': 0.00025742080053859436}

ðŸ“Š Metrics (Epoch 10000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.8022
â”‚   â””â”€â”€ perplexity_token: 2.2305
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7457
    â””â”€â”€ perplexity_token: 2.1078
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000257


                                                                                       [57:21<2:13:55,  3.75it/s, acc_step=1/1, ce_loss_token=0.7995, lr=0.000253, perplexity_token=2.2245]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7995331451252816, 'train/perplexity_token': 2.2245020866394043, 'val/ce_loss_token': 0.7450180519372225, 'val/perplexity_token': 2.1064794063568115, 'learning_rate': 0.0002532980909419475}

ðŸ“Š Metrics (Epoch 10500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7995
â”‚   â””â”€â”€ perplexity_token: 2.2245
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7450
    â””â”€â”€ perplexity_token: 2.1065
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000253


                                                                                       [59:51<3:46:53,  2.18it/s, acc_step=1/1, ce_loss_token=0.7971, lr=0.000249, perplexity_token=2.2190]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7970585772453921, 'train/perplexity_token': 2.2190043926239014, 'val/ce_loss_token': 0.7449665665626526, 'val/perplexity_token': 2.1063709259033203, 'learning_rate': 0.00024902130710005013}

ðŸ“Š Metrics (Epoch 11000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7971
â”‚   â””â”€â”€ perplexity_token: 2.2190
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7450
    â””â”€â”€ perplexity_token: 2.1064
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000249


                                                                                       :02:21<2:09:33,  3.75it/s, acc_step=1/1, ce_loss_token=0.7948, lr=0.000245, perplexity_token=2.2140]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.794793989873743, 'train/perplexity_token': 2.213984727859497, 'val/ce_loss_token': 0.7452561482787132, 'val/perplexity_token': 2.106981039047241, 'learning_rate': 0.0002445968283577009}

ðŸ“Š Metrics (Epoch 11500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7948
â”‚   â””â”€â”€ perplexity_token: 2.2140
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7453
    â””â”€â”€ perplexity_token: 2.1070
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000245


                                                                                       :04:51<3:44:09,  2.13it/s, acc_step=1/1, ce_loss_token=0.7927, lr=0.000240, perplexity_token=2.2094]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7927151624187511, 'train/perplexity_token': 2.2093873023986816, 'val/ce_loss_token': 0.7449426092207432, 'val/perplexity_token': 2.10632061958313, 'learning_rate': 0.00024003125436466066}

ðŸ“Š Metrics (Epoch 12000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7927
â”‚   â””â”€â”€ perplexity_token: 2.2094
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7449
    â””â”€â”€ perplexity_token: 2.1063
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000240


                                                                                       :07:23<2:05:45,  3.73it/s, acc_step=1/1, ce_loss_token=0.7908, lr=0.000235, perplexity_token=2.2051]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7907791928085038, 'train/perplexity_token': 2.2051138877868652, 'val/ce_loss_token': 0.7442158851772547, 'val/perplexity_token': 2.104790449142456, 'learning_rate': 0.00023533139523146718}

ðŸ“Š Metrics (Epoch 12500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7908
â”‚   â””â”€â”€ perplexity_token: 2.2051
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7442
    â””â”€â”€ perplexity_token: 2.1048
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000235


                                                                                       :10:32<3:42:30,  2.07it/s, acc_step=1/1, ce_loss_token=0.7890, lr=0.000231, perplexity_token=2.2012]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7889876150459558, 'train/perplexity_token': 2.201166868209839, 'val/ce_loss_token': 0.7440009228885174, 'val/perplexity_token': 2.1043379306793213, 'learning_rate': 0.00023050426137132527}

ðŸ“Š Metrics (Epoch 13000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7890
â”‚   â””â”€â”€ perplexity_token: 2.2012
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7440
    â””â”€â”€ perplexity_token: 2.1043
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000231


                                                                                       :13:34<2:00:39,  3.75it/s, acc_step=1/1, ce_loss_token=0.7873, lr=0.000226, perplexity_token=2.1975]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7873290717341301, 'train/perplexity_token': 2.197519063949585, 'val/ce_loss_token': 0.7443340253084898, 'val/perplexity_token': 2.105039119720459, 'learning_rate': 0.00022555705304322176}

ðŸ“Š Metrics (Epoch 13500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7873
â”‚   â””â”€â”€ perplexity_token: 2.1975
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7443
    â””â”€â”€ perplexity_token: 2.1050
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000226


                                                                                       :16:35<3:42:15,  2.00it/s, acc_step=1/1, ce_loss_token=0.7858, lr=0.000220, perplexity_token=2.1941]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7857875585530487, 'train/perplexity_token': 2.19413423538208, 'val/ce_loss_token': 0.7440696936100721, 'val/perplexity_token': 2.104482650756836, 'learning_rate': 0.0002204971496118572}

ðŸ“Š Metrics (Epoch 14000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7858
â”‚   â””â”€â”€ perplexity_token: 2.1941
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7441
    â””â”€â”€ perplexity_token: 2.1045
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000220


                                                                                       :19:05<1:56:15,  3.75it/s, acc_step=1/1, ce_loss_token=0.7843, lr=0.000215, perplexity_token=2.1910]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7843432238279363, 'train/perplexity_token': 2.190967559814453, 'val/ce_loss_token': 0.7439249865710735, 'val/perplexity_token': 2.1041781902313232, 'learning_rate': 0.00021533209854042792}

ðŸ“Š Metrics (Epoch 14500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7843
â”‚   â””â”€â”€ perplexity_token: 2.1910
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7439
    â””â”€â”€ perplexity_token: 2.1042
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000215


                                                                                       :21:40<3:20:09,  2.14it/s, acc_step=1/1, ce_loss_token=0.7830, lr=0.000210, perplexity_token=2.1880]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7830059347721062, 'train/perplexity_token': 2.188039541244507, 'val/ce_loss_token': 0.7442951034754515, 'val/perplexity_token': 2.104957103729248, 'learning_rate': 0.00021006960413266058}

ðŸ“Š Metrics (Epoch 15000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7830
â”‚   â””â”€â”€ perplexity_token: 2.1880
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7443
    â””â”€â”€ perplexity_token: 2.1050
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000210


                                                                                       :24:16<1:51:48,  3.75it/s, acc_step=1/1, ce_loss_token=0.7818, lr=0.000205, perplexity_token=2.1853]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7817552338668696, 'train/perplexity_token': 2.185304641723633, 'val/ce_loss_token': 0.7442290838807821, 'val/perplexity_token': 2.104818105697632, 'learning_rate': 0.00020471751604090498}

ðŸ“Š Metrics (Epoch 15500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7818
â”‚   â””â”€â”€ perplexity_token: 2.1853
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7442
    â””â”€â”€ perplexity_token: 2.1048
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000205


                                                                                       :27:19<3:12:48,  2.13it/s, acc_step=1/1, ce_loss_token=0.7806, lr=0.000199, perplexity_token=2.1827]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7805819633222953, 'train/perplexity_token': 2.182742118835449, 'val/ce_loss_token': 0.7440232280641794, 'val/perplexity_token': 2.1043848991394043, 'learning_rate': 0.00019928381755741895}

ðŸ“Š Metrics (Epoch 16000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7806
â”‚   â””â”€â”€ perplexity_token: 2.1827
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7440
    â””â”€â”€ perplexity_token: 2.1044
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000199


                                                                                       :30:10<1:47:22,  3.75it/s, acc_step=1/1, ce_loss_token=0.7795, lr=0.000194, perplexity_token=2.1803]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7794800440648578, 'train/perplexity_token': 2.1803383827209473, 'val/ce_loss_token': 0.7442835960537195, 'val/perplexity_token': 2.104933023452759, 'learning_rate': 0.0001937766137063156}

ðŸ“Š Metrics (Epoch 16500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7795
â”‚   â””â”€â”€ perplexity_token: 2.1803
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7443
    â””â”€â”€ perplexity_token: 2.1049
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000194


                                                                                       :32:57<3:03:24,  2.15it/s, acc_step=1/1, ce_loss_token=0.7784, lr=0.000188, perplexity_token=2.1781]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7784453987493268, 'train/perplexity_token': 2.178083658218384, 'val/ce_loss_token': 0.7443148251622915, 'val/perplexity_token': 2.1049985885620117, 'learning_rate': 0.00018820411915392997}

ðŸ“Š Metrics (Epoch 17000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7784
â”‚   â””â”€â”€ perplexity_token: 2.1781
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7443
    â””â”€â”€ perplexity_token: 2.1050
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000188


                                                                                       :35:55<1:42:58,  3.75it/s, acc_step=1/1, ce_loss_token=0.7775, lr=0.000183, perplexity_token=2.1760]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7774689505968181, 'train/perplexity_token': 2.1759579181671143, 'val/ce_loss_token': 0.7441829107701778, 'val/perplexity_token': 2.1047208309173584, 'learning_rate': 0.00018257464595564273}

ðŸ“Š Metrics (Epoch 17500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7775
â”‚   â””â”€â”€ perplexity_token: 2.1760
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7442
    â””â”€â”€ perplexity_token: 2.1047
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000183


                                                                                       :38:40<3:06:25,  2.03it/s, acc_step=1/1, ce_loss_token=0.7765, lr=0.000177, perplexity_token=2.1740]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7765478872370769, 'train/perplexity_token': 2.173954725265503, 'val/ce_loss_token': 0.7441919967532158, 'val/perplexity_token': 2.1047401428222656, 'learning_rate': 0.00017689659115743435}

ðŸ“Š Metrics (Epoch 18000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7765
â”‚   â””â”€â”€ perplexity_token: 2.1740
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7442
    â””â”€â”€ perplexity_token: 2.1047
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000177


                                                                                       :41:47<1:38:26,  3.75it/s, acc_step=1/1, ce_loss_token=0.7757, lr=0.000171, perplexity_token=2.1721]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7756752551687671, 'train/perplexity_token': 2.172058343887329, 'val/ce_loss_token': 0.7442692648619413, 'val/perplexity_token': 2.104902744293213, 'learning_rate': 0.00017117842427066934}

ðŸ“Š Metrics (Epoch 18500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7757
â”‚   â””â”€â”€ perplexity_token: 2.1721
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7443
    â””â”€â”€ perplexity_token: 2.1049
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000171


                                                                                       :44:24<2:53:54,  2.08it/s, acc_step=1/1, ce_loss_token=0.7748, lr=0.000165, perplexity_token=2.1703]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7748481147691179, 'train/perplexity_token': 2.170262336730957, 'val/ce_loss_token': 0.7445396985858679, 'val/perplexity_token': 2.1054720878601074, 'learning_rate': 0.00016542867463878417}

ðŸ“Š Metrics (Epoch 19000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7748
â”‚   â””â”€â”€ perplexity_token: 2.1703
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7445
    â””â”€â”€ perplexity_token: 2.1055
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000165


                                                                                       :47:25<1:34:03,  3.75it/s, acc_step=1/1, ce_loss_token=0.7741, lr=0.000160, perplexity_token=2.1686]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7740661632304962, 'train/perplexity_token': 2.1685659885406494, 'val/ce_loss_token': 0.7447597943246365, 'val/perplexity_token': 2.105935573577881, 'learning_rate': 0.00015965591871473303}

ðŸ“Š Metrics (Epoch 19500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7741
â”‚   â””â”€â”€ perplexity_token: 2.1686
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7448
    â””â”€â”€ perplexity_token: 2.1059
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000160


                                                                                       :50:11<2:47:15,  2.06it/s, acc_step=1/1, ce_loss_token=0.7733, lr=0.000154, perplexity_token=2.1670]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7733246837369884, 'train/perplexity_token': 2.166958808898926, 'val/ce_loss_token': 0.7444193474948406, 'val/perplexity_token': 2.1052186489105225, 'learning_rate': 0.00015386876726816031}

ðŸ“Š Metrics (Epoch 20000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7733
â”‚   â””â”€â”€ perplexity_token: 2.1670
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7444
    â””â”€â”€ perplexity_token: 2.1052
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000154


                                                                                       :53:03<1:29:38,  3.75it/s, acc_step=1/1, ce_loss_token=0.7726, lr=0.000148, perplexity_token=2.1654]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7726187477559091, 'train/perplexity_token': 2.1654295921325684, 'val/ce_loss_token': 0.7442657630890608, 'val/perplexity_token': 2.1048953533172607, 'learning_rate': 0.0001480758525413892}

ðŸ“Š Metrics (Epoch 20500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7726
â”‚   â””â”€â”€ perplexity_token: 2.1654
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7443
    â””â”€â”€ perplexity_token: 2.1049
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000148


                                                                                       :55:57<2:42:49,  2.01it/s, acc_step=1/1, ce_loss_token=0.7719, lr=0.000142, perplexity_token=2.1640]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7719428700711873, 'train/perplexity_token': 2.163966417312622, 'val/ce_loss_token': 0.744071738794446, 'val/perplexity_token': 2.1044869422912598, 'learning_rate': 0.00014228581537337803}

ðŸ“Š Metrics (Epoch 21000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7719
â”‚   â””â”€â”€ perplexity_token: 2.1640
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7441
    â””â”€â”€ perplexity_token: 2.1045
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000142


                                                                                       :58:33<1:25:06,  3.75it/s, acc_step=1/1, ce_loss_token=0.7713, lr=0.000137, perplexity_token=2.1626]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7712966299995112, 'train/perplexity_token': 2.1625685691833496, 'val/ce_loss_token': 0.7442003693431616, 'val/perplexity_token': 2.104757785797119, 'learning_rate': 0.00013650729231085454}

ðŸ“Š Metrics (Epoch 21500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7713
â”‚   â””â”€â”€ perplexity_token: 2.1626
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7442
    â””â”€â”€ perplexity_token: 2.1048
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000137


                                                                                       :01:03<2:24:42,  2.15it/s, acc_step=1/1, ce_loss_token=0.7707, lr=0.000131, perplexity_token=2.1613]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7706894571876196, 'train/perplexity_token': 2.1612558364868164, 'val/ce_loss_token': 0.7448767181485891, 'val/perplexity_token': 2.1061818599700928, 'learning_rate': 0.00013074890272585493}

ðŸ“Š Metrics (Epoch 22000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7707
â”‚   â””â”€â”€ perplexity_token: 2.1613
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7449
    â””â”€â”€ perplexity_token: 2.1062
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000131


                                                                                       :03:47<1:20:14,  3.77it/s, acc_step=1/1, ce_loss_token=0.7701, lr=0.000125, perplexity_token=2.1600]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7701094367176601, 'train/perplexity_token': 2.1600024700164795, 'val/ce_loss_token': 0.7446757908910513, 'val/perplexity_token': 2.1057586669921875, 'learning_rate': 0.00012501923595887674}

ðŸ“Š Metrics (Epoch 22500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7701
â”‚   â””â”€â”€ perplexity_token: 2.1600
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7447
    â””â”€â”€ perplexity_token: 2.1058
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000125


                                                                                       :06:31<2:12:36,  2.22it/s, acc_step=1/1, ce_loss_token=0.7696, lr=0.000119, perplexity_token=2.1588]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7695551685346561, 'train/perplexity_token': 2.1588056087493896, 'val/ce_loss_token': 0.7445095330476761, 'val/perplexity_token': 2.1054084300994873, 'learning_rate': 0.00011932683850683092}

ðŸ“Š Metrics (Epoch 23000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7696
â”‚   â””â”€â”€ perplexity_token: 2.1588
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7445
    â””â”€â”€ perplexity_token: 2.1054
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000119


                                                                                       :09:47<1:15:44,  3.78it/s, acc_step=1/1, ce_loss_token=0.7690, lr=0.000114, perplexity_token=2.1577]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.769024476726625, 'train/perplexity_token': 2.157660484313965, 'val/ce_loss_token': 0.7447206024080515, 'val/perplexity_token': 2.1058528423309326, 'learning_rate': 0.0001136802012748989}

ðŸ“Š Metrics (Epoch 23500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7690
â”‚   â””â”€â”€ perplexity_token: 2.1577
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7447
    â””â”€â”€ perplexity_token: 2.1059
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000114


                                                                                       :12:36<2:04:45,  2.23it/s, acc_step=1/1, ce_loss_token=0.7685, lr=0.000108, perplexity_token=2.1566]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7685141470560327, 'train/perplexity_token': 2.156559467315674, 'val/ce_loss_token': 0.744417043402791, 'val/perplexity_token': 2.1052138805389404, 'learning_rate': 0.00010808774691131223}

ðŸ“Š Metrics (Epoch 24000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7685
â”‚   â””â”€â”€ perplexity_token: 2.1566
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7444
    â””â”€â”€ perplexity_token: 2.1052
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000108


                                                                                       :15:06<1:11:19,  3.78it/s, acc_step=1/1, ce_loss_token=0.7680, lr=0.000103, perplexity_token=2.1555]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7680268160985745, 'train/perplexity_token': 2.1555087566375732, 'val/ce_loss_token': 0.7446798030287027, 'val/perplexity_token': 2.105767011642456, 'learning_rate': 0.000102557817243944}

ðŸ“Š Metrics (Epoch 24500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7680
â”‚   â””â”€â”€ perplexity_token: 2.1555
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7447
    â””â”€â”€ perplexity_token: 2.1058
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000103


                                                                                       :18:00<1:57:29,  2.22it/s, acc_step=1/1, ce_loss_token=0.7676, lr=0.000097, perplexity_token=2.1545]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7675575485687619, 'train/perplexity_token': 2.1544976234436035, 'val/ce_loss_token': 0.744890658184886, 'val/perplexity_token': 2.1062111854553223, 'learning_rate': 9.709866083745676e-05}

ðŸ“Š Metrics (Epoch 25000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7676
â”‚   â””â”€â”€ perplexity_token: 2.1545
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7449
    â””â”€â”€ perplexity_token: 2.1062
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000097


                                                                                       :20:28<1:06:54,  3.78it/s, acc_step=1/1, ce_loss_token=0.7671, lr=0.000092, perplexity_token=2.1535]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7671069154028453, 'train/perplexity_token': 2.153526782989502, 'val/ce_loss_token': 0.7446696162223816, 'val/perplexity_token': 2.105745553970337, 'learning_rate': 9.171842068956086e-05}

ðŸ“Š Metrics (Epoch 25500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7671
â”‚   â””â”€â”€ perplexity_token: 2.1535
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7447
    â””â”€â”€ perplexity_token: 2.1057
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000092


                                                                                       :23:14<1:48:28,  2.25it/s, acc_step=1/1, ce_loss_token=0.7667, lr=0.000086, perplexity_token=2.1526]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7666748706654701, 'train/perplexity_token': 2.1525967121124268, 'val/ce_loss_token': 0.7447612006217241, 'val/perplexity_token': 2.10593843460083, 'learning_rate': 8.642512208474032e-05}

ðŸ“Š Metrics (Epoch 26000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7667
â”‚   â””â”€â”€ perplexity_token: 2.1526
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7448
    â””â”€â”€ perplexity_token: 2.1059
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000086


                                                                                       :25:55<1:02:30,  3.78it/s, acc_step=1/1, ce_loss_token=0.7663, lr=0.000081, perplexity_token=2.1517]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7662578086678313, 'train/perplexity_token': 2.1516990661621094, 'val/ce_loss_token': 0.7450754418969154, 'val/perplexity_token': 2.106600284576416, 'learning_rate': 8.122666062356196e-05}

ðŸ“Š Metrics (Epoch 26500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7663
â”‚   â””â”€â”€ perplexity_token: 2.1517
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7451
    â””â”€â”€ perplexity_token: 2.1066
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000081


                                                                                       :28:36<1:40:50,  2.26it/s, acc_step=1/1, ce_loss_token=0.7659, lr=0.000076, perplexity_token=2.1508]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7658566588790526, 'train/perplexity_token': 2.150836229324341, 'val/ce_loss_token': 0.744797708466649, 'val/perplexity_token': 2.10601544380188, 'learning_rate': 7.613079044542494e-05}

ðŸ“Š Metrics (Epoch 27000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7659
â”‚   â””â”€â”€ perplexity_token: 2.1508
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7448
    â””â”€â”€ perplexity_token: 2.1060
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000076


                                                                                       [2:31:05<58:07,  3.78it/s, acc_step=1/1, ce_loss_token=0.7655, lr=0.000071, perplexity_token=2.1500]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.765468982554649, 'train/perplexity_token': 2.1500024795532227, 'val/ce_loss_token': 0.7446234282106161, 'val/perplexity_token': 2.1056482791900635, 'learning_rate': 7.114511266231505e-05}

ðŸ“Š Metrics (Epoch 27500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7655
â”‚   â””â”€â”€ perplexity_token: 2.1500
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7446
    â””â”€â”€ perplexity_token: 2.1056
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000071


                                                                                       :34:06<1:33:49,  2.25it/s, acc_step=1/1, ce_loss_token=0.7651, lr=0.000066, perplexity_token=2.1492]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7650921539763741, 'train/perplexity_token': 2.1491923332214355, 'val/ce_loss_token': 0.7447954900562763, 'val/perplexity_token': 2.106010675430298, 'learning_rate': 6.62770640208187e-05}

ðŸ“Š Metrics (Epoch 28000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7651
â”‚   â””â”€â”€ perplexity_token: 2.1492
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7448
    â””â”€â”€ perplexity_token: 2.1060
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000066


                                                                                       [2:36:46<53:48,  3.77it/s, acc_step=1/1, ce_loss_token=0.7647, lr=0.000062, perplexity_token=2.1484]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7647335577009435, 'train/perplexity_token': 2.1484217643737793, 'val/ce_loss_token': 0.7448504995554686, 'val/perplexity_token': 2.106126546859741, 'learning_rate': 6.153390580930932e-05}

ðŸ“Š Metrics (Epoch 28500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7647
â”‚   â””â”€â”€ perplexity_token: 2.1484
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7449
    â””â”€â”€ perplexity_token: 2.1061
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000062


                                                                                       :39:15<1:30:49,  2.14it/s, acc_step=1/1, ce_loss_token=0.7644, lr=0.000057, perplexity_token=2.1477]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7643853710304519, 'train/perplexity_token': 2.1476738452911377, 'val/ce_loss_token': 0.7446599788963795, 'val/perplexity_token': 2.1057252883911133, 'learning_rate': 5.692271302684908e-05}

ðŸ“Š Metrics (Epoch 29000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7644
â”‚   â””â”€â”€ perplexity_token: 2.1477
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7447
    â””â”€â”€ perplexity_token: 2.1057
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000057


                                                                                       [2:41:45<49:23,  3.77it/s, acc_step=1/1, ce_loss_token=0.7640, lr=0.000052, perplexity_token=2.1470]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7640493892620411, 'train/perplexity_token': 2.1469526290893555, 'val/ce_loss_token': 0.7446473240852356, 'val/perplexity_token': 2.105698585510254, 'learning_rate': 5.245036382996493e-05}

ðŸ“Š Metrics (Epoch 29500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7640
â”‚   â””â”€â”€ perplexity_token: 2.1470
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7446
    â””â”€â”€ perplexity_token: 2.1057
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000052


                                                                                       :44:25<1:19:50,  2.23it/s, acc_step=1/1, ce_loss_token=0.7637, lr=0.000048, perplexity_token=2.1463]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7637246501377823, 'train/perplexity_token': 2.1462552547454834, 'val/ce_loss_token': 0.7448520623147488, 'val/perplexity_token': 2.1061298847198486, 'learning_rate': 4.812352927303952e-05}

ðŸ“Š Metrics (Epoch 30000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7637
â”‚   â””â”€â”€ perplexity_token: 2.1463
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7449
    â””â”€â”€ perplexity_token: 2.1061
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000048


                                                                                       [2:47:07<44:54,  3.77it/s, acc_step=1/1, ce_loss_token=0.7634, lr=0.000044, perplexity_token=2.1456]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7634102411201511, 'train/perplexity_token': 2.145580768585205, 'val/ce_loss_token': 0.7445234842598438, 'val/perplexity_token': 2.105437994003296, 'learning_rate': 4.394866335761917e-05}

ðŸ“Š Metrics (Epoch 30500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7634
â”‚   â””â”€â”€ perplexity_token: 2.1456
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7445
    â””â”€â”€ perplexity_token: 2.1054
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000044


                                                                                       :49:47<1:10:35,  2.28it/s, acc_step=1/1, ce_loss_token=0.7631, lr=0.000040, perplexity_token=2.1449]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7631043527019489, 'train/perplexity_token': 2.1449246406555176, 'val/ce_loss_token': 0.7447542231529951, 'val/perplexity_token': 2.105923652648926, 'learning_rate': 3.9931993405483926e-05}

ðŸ“Š Metrics (Epoch 31000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7631
â”‚   â””â”€â”€ perplexity_token: 2.1449
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7448
    â””â”€â”€ perplexity_token: 2.1059
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000040


                                                                                       [2:52:15<40:26,  3.78it/s, acc_step=1/1, ce_loss_token=0.7628, lr=0.000036, perplexity_token=2.1443]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7628082929386085, 'train/perplexity_token': 2.144289493560791, 'val/ce_loss_token': 0.7446843553334475, 'val/perplexity_token': 2.10577654838562, 'learning_rate': 3.607951076983703e-05}

ðŸ“Š Metrics (Epoch 31500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7628
â”‚   â””â”€â”€ perplexity_token: 2.1443
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7447
    â””â”€â”€ perplexity_token: 2.1058
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000036


                                                                                       :55:22<1:05:00,  2.22it/s, acc_step=1/1, ce_loss_token=0.7625, lr=0.000032, perplexity_token=2.1437]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7625204513681736, 'train/perplexity_token': 2.143672466278076, 'val/ce_loss_token': 0.7444878835231066, 'val/perplexity_token': 2.105362892150879, 'learning_rate': 3.239696189847127e-05}

ðŸ“Š Metrics (Epoch 32000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7625
â”‚   â””â”€â”€ perplexity_token: 2.1437
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7445
    â””â”€â”€ perplexity_token: 2.1054
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000032


                                                                                       [2:57:53<36:01,  3.78it/s, acc_step=1/1, ce_loss_token=0.7622, lr=0.000029, perplexity_token=2.1431]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7622413607524845, 'train/perplexity_token': 2.1430742740631104, 'val/ce_loss_token': 0.7445158381015062, 'val/perplexity_token': 2.105421781539917, 'learning_rate': 2.888983976224031e-05}

ðŸ“Š Metrics (Epoch 32500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7622
â”‚   â””â”€â”€ perplexity_token: 2.1431
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7445
    â””â”€â”€ perplexity_token: 2.1054
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000029


                                                                                       [3:00:29<59:08,  2.16it/s, acc_step=1/1, ce_loss_token=0.7620, lr=0.000026, perplexity_token=2.1425]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7619701941383018, 'train/perplexity_token': 2.14249324798584, 'val/ce_loss_token': 0.7445282526314259, 'val/perplexity_token': 2.105448007583618, 'learning_rate': 2.5563375661623586e-05}

ðŸ“Š Metrics (Epoch 33000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7620
â”‚   â””â”€â”€ perplexity_token: 2.1425
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7445
    â””â”€â”€ perplexity_token: 2.1054
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000026


                                                                                       [3:02:58<31:37,  3.78it/s, acc_step=1/1, ce_loss_token=0.7617, lr=0.000022, perplexity_token=2.1419]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7617066172762652, 'train/perplexity_token': 2.1419284343719482, 'val/ce_loss_token': 0.7446348164230585, 'val/perplexity_token': 2.1056723594665527, 'learning_rate': 2.242253142360269e-05}

ðŸ“Š Metrics (Epoch 33500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7617
â”‚   â””â”€â”€ perplexity_token: 2.1419
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7446
    â””â”€â”€ perplexity_token: 2.1057
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000022


                                                                                       [3:05:26<50:54,  2.18it/s, acc_step=1/1, ce_loss_token=0.7615, lr=0.000019, perplexity_token=2.1414]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7614509520914684, 'train/perplexity_token': 2.141381025314331, 'val/ce_loss_token': 0.7443985622376204, 'val/perplexity_token': 2.1051747798919678, 'learning_rate': 1.947199200049152e-05}

ðŸ“Š Metrics (Epoch 34000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7615
â”‚   â””â”€â”€ perplexity_token: 2.1414
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7444
    â””â”€â”€ perplexity_token: 2.1052
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000019


                                                                                       [3:08:18<27:12,  3.78it/s, acc_step=1/1, ce_loss_token=0.7612, lr=0.000017, perplexity_token=2.1408]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7612013642723873, 'train/perplexity_token': 2.1408467292785645, 'val/ce_loss_token': 0.744394788518548, 'val/perplexity_token': 2.1051669120788574, 'learning_rate': 1.6716158481758408e-05}

ðŸ“Š Metrics (Epoch 34500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7612
â”‚   â””â”€â”€ perplexity_token: 2.1408
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7444
    â””â”€â”€ perplexity_token: 2.1052
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000017


                                                                                       [3:11:19<42:00,  2.25it/s, acc_step=1/1, ce_loss_token=0.7610, lr=0.000014, perplexity_token=2.1403]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.760959879294017, 'train/perplexity_token': 2.140329599380493, 'val/ce_loss_token': 0.7445674203336239, 'val/perplexity_token': 2.105530261993408, 'learning_rate': 1.4159141529264404e-05}

ðŸ“Š Metrics (Epoch 35000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7610
â”‚   â””â”€â”€ perplexity_token: 2.1403
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7446
    â””â”€â”€ perplexity_token: 2.1055
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000014


                                                                                       [3:13:50<22:47,  3.78it/s, acc_step=1/1, ce_loss_token=0.7607, lr=0.000012, perplexity_token=2.1398]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.760726687273594, 'train/perplexity_token': 2.1398305892944336, 'val/ce_loss_token': 0.7445052731782198, 'val/perplexity_token': 2.1053996086120605, 'learning_rate': 1.1804755245709455e-05}

ðŸ“Š Metrics (Epoch 35500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7607
â”‚   â””â”€â”€ perplexity_token: 2.1398
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7445
    â””â”€â”€ perplexity_token: 2.1054
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000012


                                                                                       [3:16:21<34:06,  2.28it/s, acc_step=1/1, ce_loss_token=0.7605, lr=0.000010, perplexity_token=2.1393]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7604975875346304, 'train/perplexity_token': 2.139340400695801, 'val/ce_loss_token': 0.7443153038620949, 'val/perplexity_token': 2.1049997806549072, 'learning_rate': 9.656511485433034e-06}

ðŸ“Š Metrics (Epoch 36000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7605
â”‚   â””â”€â”€ perplexity_token: 2.1393
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7443
    â””â”€â”€ perplexity_token: 2.1050
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000010


                                                                                       [3:19:01<18:23,  3.78it/s, acc_step=1/1, ce_loss_token=0.7603, lr=0.000008, perplexity_token=2.1389]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7602735502383947, 'train/perplexity_token': 2.1388611793518066, 'val/ce_loss_token': 0.7442119158804417, 'val/perplexity_token': 2.1047821044921875, 'learning_rate': 7.717614616055162e-06}

ðŸ“Š Metrics (Epoch 36500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7603
â”‚   â””â”€â”€ perplexity_token: 2.1389
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7442
    â””â”€â”€ perplexity_token: 2.1048
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000008


                                                                                       [3:21:34<27:27,  2.23it/s, acc_step=1/1, ce_loss_token=0.7601, lr=0.000006, perplexity_token=2.1384]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7600555353988032, 'train/perplexity_token': 2.138395071029663, 'val/ce_loss_token': 0.7441726382821798, 'val/perplexity_token': 2.1046993732452393, 'learning_rate': 5.990956738771035e-06}

ðŸ“Š Metrics (Epoch 37000):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7601
â”‚   â””â”€â”€ perplexity_token: 2.1384
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7442
    â””â”€â”€ perplexity_token: 2.1047
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000006


                                                                                       [3:24:05<13:58,  3.78it/s, acc_step=1/1, ce_loss_token=0.7598, lr=0.000004, perplexity_token=2.1379]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.7598410588271306, 'train/perplexity_token': 2.1379363536834717, 'val/ce_loss_token': 0.7441199235618114, 'val/perplexity_token': 2.104588508605957, 'learning_rate': 4.47911337442937e-06}

ðŸ“Š Metrics (Epoch 37500):
â”œâ”€â”€ TRAIN:
â”‚   â”œâ”€â”€ ce_loss_token: 0.7598
â”‚   â””â”€â”€ perplexity_token: 2.1379
â””â”€â”€ VAL:
    â”œâ”€â”€ ce_loss_token: 0.7441
    â””â”€â”€ perplexity_token: 2.1046
â””â”€â”€ TRAINING:
    â””â”€â”€ learning_rate: 0.000004


[Training LM]:  93%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Ž   | 37861/40669 [3:26:02<12:23,  3.78it/s, acc_step=1/1, ce_loss_token=0.7597, lr=0.000004, perplexity_token=2.1376]