In [None]:
from models.transformers import DecoderOnlyTransformer
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import yaml
import gc
import torch
from torchinfo import summary
import os
import json
import shutil
import wandb
import yaml
from data.datasets import CFGDataset, verify_dataloader
from trainers.GPT_trainer import GPT_Trainer
from trainers.utils import create_optimizer, create_scheduler
import numpy as np

# Select the accelerator once; later cells reuse `device` for the loaders and the trainer.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the experiment configuration. The cells below read its `data`, `model`,
# `optimizer`, `scheduler` and `training` sections.
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)
print("Config loaded successfully.")

# Store Weights & Biases run files under /tmp.
# The variable has to be set on this process: `os.system("export ...")` only
# changes the environment of a short-lived subshell, and "\t" inside a normal
# string literal is a tab character rather than a path separator.
os.environ["WANDB_DIR"] = "/tmp"
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


Using device: cuda
Config loaded successfully.


0

In [3]:
from data.CFG_parsers import CFGParser
from data.grammars import GRAMMAR_SIMPLE

# Quick sanity check: ask the parser built from GRAMMAR_SIMPLE whether a
# hand-written digit string is accepted. The result is the cell's output.
sample_sentence = "3213212123"
parser = CFGParser(GRAMMAR_SIMPLE)
parser.is_valid(sample_sentence)


False

#### Loading Data

In [2]:
# Keyword arguments shared by both splits; only the source file differs.
dataset_kwargs = dict(
    batch_size=config["data"]["batch_size"],
    seq_len=config["data"]["seq_len"],
    eos_token=config["data"]["eos_token"],
    sos_token=config["data"]["sos_token"],
)

train_dataset = CFGDataset(data_file="cfg_sentences_train_cfg3b.npy", **dataset_kwargs)
val_dataset = CFGDataset(data_file="cfg_sentences_val_cfg3b.npy", **dataset_kwargs)

# batch_size=None disables the DataLoader's automatic batching, so every item
# produced by the dataset is passed through as-is.
# NOTE(review): CFGDataset receives the batch size itself and appears to yield
# whole batches already - confirm in data/datasets.
# Worker processes and pinned host memory are only requested on CUDA; asking
# for pinned memory on a CPU-only machine just triggers a PyTorch warning.
use_cuda = device == "cuda"
loader_kwargs = dict(
    batch_size=None,
    num_workers=config["data"]["NUM_WORKERS"] if use_cuda else 0,
    pin_memory=use_cuda,
)

train_loader = DataLoader(train_dataset, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)

# Print batch/token counts and example batch shapes for both splits.
verify_dataloader(train_loader)
print("_"*50)
verify_dataloader(val_loader)

Verifying dataloader...
Number of batches:  40669
Total number of tokens:  1.998977e+09


Example batch shapes (shifted, golden):  torch.Size([96, 512]) torch.Size([96, 512])
__________________________________________________
Verifying dataloader...
Number of batches:  4518
Total number of tokens:  2.221086e+08
Example batch shapes (shifted, golden):  torch.Size([96, 512]) torch.Size([96, 512])


In [3]:
# Start from a (shallow) copy of the `model` section of the config so the
# loaded config itself is not modified, then add the values derived from the
# `data` section.
model_config = config['model'].copy()
model_config.update({
    'num_classes': config["data"]["vocab_size"] + 2,  # vocabulary plus the SOS and EOS tokens
    'seq_len': config["data"]["seq_len"],
})

model = DecoderOnlyTransformer(**model_config)

# Fetch a single batch to confirm the tensor shapes the model will be fed.
batch = next(iter(train_loader))
shifted_transcripts, golden_transcripts = batch
print("Shape of shifted_transcripts : ", shifted_transcripts.shape)
print("Shape of golden_transcripts  : ", golden_transcripts.shape)

# Optional: detailed per-layer summary via torchinfo.
#model_stats = summary(model, input_data=[shifted_transcripts])
#print(model_stats)
print(f"Total parameters in the model: {sum(p.numel() for p in model.parameters()):,}")
# Optional: compile the model.
#torch._dynamo.config.skip_nnmodule_hook_guards = False
#model = torch.compile(model, mode="default", fullgraph=True)


Shape of shifted_transcripts :  torch.Size([96, 512])
Shape of golden_transcripts  :  torch.Size([96, 512])
Total parameters in the model: 42,532,613


In [4]:
from trainers.utils.create_scheduler import plot_lr_schedule


# Free memory left over from earlier runs of this cell. Collect Python garbage
# first so tensors kept alive only by unreachable objects are released, then
# hand the now-unoccupied cached CUDA blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()
#wandb.finish()
trainer = GPT_Trainer(
    model = model,
    config = config,
    config_file = "config.yaml",
    run_name = "full_rope",
    device = device
)
# Log gradients and parameters of the model to Weights & Biases.
# NOTE(review): wandb.watch needs an active run; presumably GPT_Trainer starts
# one in its constructor - confirm.
wandb.watch(trainer.model, log="all")

# The optimizer and scheduler are built from their config sections and attached
# to the trainer after construction.
trainer.optimizer = create_optimizer.create_optimizer(
    model = model,
    opt_config = config["optimizer"]
)

trainer.scheduler = create_scheduler.create_scheduler(
    optimizer=trainer.optimizer,
    scheduler_config=config['scheduler'],
    train_loader=train_loader,
    gradient_accumulation_steps=config['training']['gradient_accumulation_steps']
)

# Optional: visualise the learning-rate schedule.
#plot_lr_schedule(trainer.scheduler, num_epochs=len(train_loader), train_loader=train_loader)

# Optional: resume from a saved checkpoint.
#trainer.load_checkpoint("/ocean/projects/cis250019p/sfragara/lstm/expts/test/checkpoints/checkpoint-best-metric-model.pth")

Using device: cuda


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlucasfragara[0m ([33mteamlsfr[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



🔧 Configuring Optimizer:
├── Type: ADAMW
├── Base LR: 0.0007
├── Weight Decay: 0.1
├── Parameter Groups:
│   ├── Group: self_attn
│   │   ├── LR: 0.0007
│   │   └── Patterns: []
│   ├── Group: ffn
│   │   ├── LR: 0.0007
│   │   └── Patterns: []
│   └── Default Group (unmatched parameters)
└── AdamW Specific:
    ├── Betas: [0.9, 0.98]
    ├── Epsilon: 1e-08
    └── AMSGrad: False


  from .autonotebook import tqdm as notebook_tqdm



📈 Configuring Learning Rate Scheduler:
├── Type: COSINE
├── Cosine Annealing Settings:
│   ├── T_max: 40669 steps
│   └── Min LR: 1e-08
└── Warmup: Disabled


In [5]:
# Switch the model to training mode, then run one pass over the training
# loader; the validation loader is handed to the trainer as well.
# NOTE(review): the recorded log prints "Epoch 500", "Epoch 1000", ... within a
# single pass of 40669 batches, so that label appears to count training steps
# rather than epochs - confirm in GPT_Trainer.
trainer.model.train()

#trainer._validate_epoch(val_loader)
trainer.train_epoch(train_dataloader=train_loader, val_dataloader=val_loader)

                                                                                       1:30:17,  7.42it/s, acc_step=1/1, ce_loss_token=1.3246, lr=0.000700, perplexity_token=3.7608]

generating
Using default generation config...
Generating with greedy search...




wandb metrics being logged:  {'train/ce_loss_token': 1.3246285346691717, 'train/perplexity_token': 3.7607882022857666, 'val/ce_loss_token': 1.1964159235358238, 'val/perplexity_token': 3.3082385063171387, 'learning_rate': 0.0006997379248477051}

📊 Metrics (Epoch 500):
├── TRAIN:
│   ├── ce_loss_token: 1.3246
│   └── perplexity_token: 3.7608
└── VAL:
    ├── ce_loss_token: 1.1964
    └── perplexity_token: 3.3082
└── TRAINING:
    └── learning_rate: 0.000700


                                                                                       2:42:45,  4.06it/s, acc_step=1/1, ce_loss_token=1.2014, lr=0.000699, perplexity_token=3.3247]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 1.201375137556802, 'train/perplexity_token': 3.324685573577881, 'val/ce_loss_token': 1.0131168588995934, 'val/perplexity_token': 2.754171848297119, 'learning_rate': 0.0006989541814204076}

📊 Metrics (Epoch 1000):
├── TRAIN:
│   ├── ce_loss_token: 1.2014
│   └── perplexity_token: 3.3247
└── VAL:
    ├── ce_loss_token: 1.0131
    └── perplexity_token: 2.7542
└── TRAINING:
    └── learning_rate: 0.000699


                                                                                         7.40it/s, acc_step=1/1, ce_loss_token=1.1280, lr=0.000698, perplexity_token=3.0894]        

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 1.1279729470501416, 'train/perplexity_token': 3.089387893676758, 'val/ce_loss_token': 0.9538229033350945, 'val/perplexity_token': 2.595613479614258, 'learning_rate': 0.0006976499377227822}

📊 Metrics (Epoch 1500):
├── TRAIN:
│   ├── ce_loss_token: 1.1280
│   └── perplexity_token: 3.0894
└── VAL:
    ├── ce_loss_token: 0.9538
    └── perplexity_token: 2.5956
└── TRAINING:
    └── learning_rate: 0.000698


                                                                                         4.37it/s, acc_step=1/1, ce_loss_token=1.0752, lr=0.000696, perplexity_token=2.9305]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 1.0751766070075657, 'train/perplexity_token': 2.9305102825164795, 'val/ce_loss_token': 0.910547249019146, 'val/perplexity_token': 2.485682487487793, 'learning_rate': 0.0006958271391933999}

📊 Metrics (Epoch 2000):
├── TRAIN:
│   ├── ce_loss_token: 1.0752
│   └── perplexity_token: 2.9305
└── VAL:
    ├── ce_loss_token: 0.9105
    └── perplexity_token: 2.4857
└── TRAINING:
    └── learning_rate: 0.000696


                                                                                         7.40it/s, acc_step=1/1, ce_loss_token=1.0326, lr=0.000693, perplexity_token=2.8085]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 1.0326489400072414, 'train/perplexity_token': 2.80849552154541, 'val/ce_loss_token': 0.8485343288630247, 'val/perplexity_token': 2.3362202644348145, 'learning_rate': 0.0006934885047586219}

📊 Metrics (Epoch 2500):
├── TRAIN:
│   ├── ce_loss_token: 1.0326
│   └── perplexity_token: 2.8085
└── VAL:
    ├── ce_loss_token: 0.8485
    └── perplexity_token: 2.3362
└── TRAINING:
    └── learning_rate: 0.000693


                                                                                         4.39it/s, acc_step=1/1, ce_loss_token=1.0010, lr=0.000691, perplexity_token=2.7210]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 1.0010097592284544, 'train/perplexity_token': 2.7210278511047363, 'val/ce_loss_token': 0.8329426404088736, 'val/perplexity_token': 2.300077199935913, 'learning_rate': 0.0006906375227769823}

📊 Metrics (Epoch 3000):
├── TRAIN:
│   ├── ce_loss_token: 1.0010
│   └── perplexity_token: 2.7210
└── VAL:
    ├── ce_loss_token: 0.8329
    └── perplexity_token: 2.3001
└── TRAINING:
    └── learning_rate: 0.000691


                                                                                         7.40it/s, acc_step=1/1, ce_loss_token=0.9764, lr=0.000687, perplexity_token=2.6550]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9764482704137809, 'train/perplexity_token': 2.6550097465515137, 'val/ce_loss_token': 0.8235246017575264, 'val/perplexity_token': 2.2785165309906006, 'learning_rate': 0.0006872784458358806}

📊 Metrics (Epoch 3500):
├── TRAIN:
│   ├── ce_loss_token: 0.9764
│   └── perplexity_token: 2.6550
└── VAL:
    ├── ce_loss_token: 0.8235
    └── perplexity_token: 2.2785
└── TRAINING:
    └── learning_rate: 0.000687


                                                                                         4.36it/s, acc_step=1/1, ce_loss_token=0.9569, lr=0.000683, perplexity_token=2.6036]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9568777784232169, 'train/perplexity_token': 2.6035549640655518, 'val/ce_loss_token': 0.8162614721804857, 'val/perplexity_token': 2.2620272636413574, 'learning_rate': 0.0006834162844083303}

📊 Metrics (Epoch 4000):
├── TRAIN:
│   ├── ce_loss_token: 0.9569
│   └── perplexity_token: 2.6036
└── VAL:
    ├── ce_loss_token: 0.8163
    └── perplexity_token: 2.2620
└── TRAINING:
    └── learning_rate: 0.000683


                                                                                         7.41it/s, acc_step=1/1, ce_loss_token=0.9396, lr=0.000679, perplexity_token=2.5590]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9396313394315771, 'train/perplexity_token': 2.559037923812866, 'val/ce_loss_token': 0.7962020896375179, 'val/perplexity_token': 2.217104434967041, 'learning_rate': 0.0006790567993792218}

📊 Metrics (Epoch 4500):
├── TRAIN:
│   ├── ce_loss_token: 0.9396
│   └── perplexity_token: 2.5590
└── VAL:
    ├── ce_loss_token: 0.7962
    └── perplexity_token: 2.2171
└── TRAINING:
    └── learning_rate: 0.000679


                                                                                         4.34it/s, acc_step=1/1, ce_loss_token=0.9253, lr=0.000674, perplexity_token=2.5226]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9252805511037533, 'train/perplexity_token': 2.522575855255127, 'val/ce_loss_token': 0.8026994336396456, 'val/perplexity_token': 2.2315568923950195, 'learning_rate': 0.0006742064934522636}

📊 Metrics (Epoch 5000):
├── TRAIN:
│   ├── ce_loss_token: 0.9253
│   └── perplexity_token: 2.5226
└── VAL:
    ├── ce_loss_token: 0.8027
    └── perplexity_token: 2.2316
└── TRAINING:
    └── learning_rate: 0.000674


                                                                                         7.40it/s, acc_step=1/1, ce_loss_token=0.9131, lr=0.000669, perplexity_token=2.4920]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9130734207087963, 'train/perplexity_token': 2.491969585418701, 'val/ce_loss_token': 0.7884537652134895, 'val/perplexity_token': 2.1999921798706055, 'learning_rate': 0.0006688726014504025}

📊 Metrics (Epoch 5500):
├── TRAIN:
│   ├── ce_loss_token: 0.9131
│   └── perplexity_token: 2.4920
└── VAL:
    ├── ce_loss_token: 0.7885
    └── perplexity_token: 2.2000
└── TRAINING:
    └── learning_rate: 0.000669


                                                                                         4.29it/s, acc_step=1/1, ce_loss_token=0.9025, lr=0.000663, perplexity_token=2.4658]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.9025196654561003, 'train/perplexity_token': 2.465808153152466, 'val/ce_loss_token': 0.7842774074524641, 'val/perplexity_token': 2.1908233165740967, 'learning_rate': 0.0006630630795242126}

📊 Metrics (Epoch 6000):
├── TRAIN:
│   ├── ce_loss_token: 0.9025
│   └── perplexity_token: 2.4658
└── VAL:
    ├── ce_loss_token: 0.7843
    └── perplexity_token: 2.1908
└── TRAINING:
    └── learning_rate: 0.000663


                                                                                         7.40it/s, acc_step=1/1, ce_loss_token=0.8929, lr=0.000657, perplexity_token=2.4423]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8929395340640918, 'train/perplexity_token': 2.442298173904419, 'val/ce_loss_token': 0.7755730114877224, 'val/perplexity_token': 2.1718363761901855, 'learning_rate': 0.0006567865932843148}

📊 Metrics (Epoch 6500):
├── TRAIN:
│   ├── ce_loss_token: 0.8929
│   └── perplexity_token: 2.4423
└── VAL:
    ├── ce_loss_token: 0.7756
    └── perplexity_token: 2.1718
└── TRAINING:
    └── learning_rate: 0.000657


                                                                                         4.37it/s, acc_step=1/1, ce_loss_token=0.8846, lr=0.000650, perplexity_token=2.4220]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8845842305446859, 'train/perplexity_token': 2.4219772815704346, 'val/ce_loss_token': 0.7744367402046919, 'val/perplexity_token': 2.169369697570801, 'learning_rate': 0.0006500525048755697}

📊 Metrics (Epoch 7000):
├── TRAIN:
│   ├── ce_loss_token: 0.8846
│   └── perplexity_token: 2.4220
└── VAL:
    ├── ce_loss_token: 0.7744
    └── perplexity_token: 2.1694
└── TRAINING:
    └── learning_rate: 0.000650


                                                                                         7.39it/s, acc_step=1/1, ce_loss_token=0.8773, lr=0.000643, perplexity_token=2.4043]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8772654486821343, 'train/perplexity_token': 2.404315948486328, 'val/ce_loss_token': 0.774933610111475, 'val/perplexity_token': 2.170448064804077, 'learning_rate': 0.0006428708590122884}

📊 Metrics (Epoch 7500):
├── TRAIN:
│   ├── ce_loss_token: 0.8773
│   └── perplexity_token: 2.4043
└── VAL:
    ├── ce_loss_token: 0.7749
    └── perplexity_token: 2.1704
└── TRAINING:
    └── learning_rate: 0.000643


                                                                                       ss_token=0.8708, lr=0.000635, perplexity_token=2.3888]                               

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8707922700568596, 'train/perplexity_token': 2.3888027667999268, 'val/ce_loss_token': 0.7708772197365761, 'val/perplexity_token': 2.1616616249084473, 'learning_rate': 0.0006352523679953224}

📊 Metrics (Epoch 8000):
├── TRAIN:
│   ├── ce_loss_token: 0.8708
│   └── perplexity_token: 2.3888
└── VAL:
    ├── ce_loss_token: 0.7709
    └── perplexity_token: 2.1617
└── TRAINING:
    └── learning_rate: 0.000635


                                                                                       ss_token=0.8648, lr=0.000627, perplexity_token=2.3744]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8647592300780309, 'train/perplexity_token': 2.374434232711792, 'val/ce_loss_token': 0.7669652830809355, 'val/perplexity_token': 2.153221845626831, 'learning_rate': 0.0006272083957333581}

📊 Metrics (Epoch 8500):
├── TRAIN:
│   ├── ce_loss_token: 0.8648
│   └── perplexity_token: 2.3744
└── VAL:
    ├── ce_loss_token: 0.7670
    └── perplexity_token: 2.1532
└── TRAINING:
    └── learning_rate: 0.000627


                                                                                       ss_token=0.8594, lr=0.000619, perplexity_token=2.3617]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8593781714359399, 'train/perplexity_token': 2.36169171333313, 'val/ce_loss_token': 0.7659981604665518, 'val/perplexity_token': 2.1511404514312744, 'learning_rate': 0.0006187509407922598}

📊 Metrics (Epoch 9000):
├── TRAIN:
│   ├── ce_loss_token: 0.8594
│   └── perplexity_token: 2.3617
└── VAL:
    ├── ce_loss_token: 0.7660
    └── perplexity_token: 2.1511
└── TRAINING:
    └── learning_rate: 0.000619


                                                                                       ss_token=0.8546, lr=0.000610, perplexity_token=2.3503]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8545562778193252, 'train/perplexity_token': 2.3503313064575195, 'val/ce_loss_token': 0.7700695339590311, 'val/perplexity_token': 2.159916400909424, 'learning_rate': 0.0006098926184977486}

📊 Metrics (Epoch 9500):
├── TRAIN:
│   ├── ce_loss_token: 0.8546
│   └── perplexity_token: 2.3503
└── VAL:
    ├── ce_loss_token: 0.7701
    └── perplexity_token: 2.1599
└── TRAINING:
    └── learning_rate: 0.000610


                                                                                       ss_token=0.8502, lr=0.000601, perplexity_token=2.3401]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8502063772020644, 'train/perplexity_token': 2.340129852294922, 'val/ce_loss_token': 0.7697925828397274, 'val/perplexity_token': 2.159318208694458, 'learning_rate': 0.000600646642118101}

📊 Metrics (Epoch 10000):
├── TRAIN:
│   ├── ce_loss_token: 0.8502
│   └── perplexity_token: 2.3401
└── VAL:
    ├── ce_loss_token: 0.7698
    └── perplexity_token: 2.1593
└── TRAINING:
    └── learning_rate: 0.000601


                                                                                       ss_token=0.8461, lr=0.000591, perplexity_token=2.3306]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8461049551986964, 'train/perplexity_token': 2.3305516242980957, 'val/ce_loss_token': 0.7635428719222546, 'val/perplexity_token': 2.1458654403686523, 'learning_rate': 0.0005910268031549461}

📊 Metrics (Epoch 10500):
├── TRAIN:
│   ├── ce_loss_token: 0.8461
│   └── perplexity_token: 2.3306
└── VAL:
    ├── ce_loss_token: 0.7635
    └── perplexity_token: 2.1459
└── TRAINING:
    └── learning_rate: 0.000591


                                                                                       ss_token=0.8424, lr=0.000581, perplexity_token=2.3219]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8423731838309887, 'train/perplexity_token': 2.3218705654144287, 'val/ce_loss_token': 0.7654754463583231, 'val/perplexity_token': 2.1500163078308105, 'learning_rate': 0.0005810474507715675}

📊 Metrics (Epoch 11000):
├── TRAIN:
│   ├── ce_loss_token: 0.8424
│   └── perplexity_token: 2.3219
└── VAL:
    ├── ce_loss_token: 0.7655
    └── perplexity_token: 2.1500
└── TRAINING:
    └── learning_rate: 0.000581


                                                                                       ss_token=0.8390, lr=0.000571, perplexity_token=2.3141]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8390003393115338, 'train/perplexity_token': 2.3140525817871094, 'val/ce_loss_token': 0.7642270661890507, 'val/perplexity_token': 2.147334098815918, 'learning_rate': 0.0005707234703893648}

📊 Metrics (Epoch 11500):
├── TRAIN:
│   ├── ce_loss_token: 0.8390
│   └── perplexity_token: 2.3141
└── VAL:
    ├── ce_loss_token: 0.7642
    └── perplexity_token: 2.1473
└── TRAINING:
    └── learning_rate: 0.000571


                                                                                       ss_token=0.8361, lr=0.000560, perplexity_token=2.3074]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.836119909031611, 'train/perplexity_token': 2.307396650314331, 'val/ce_loss_token': 0.7679433897137642, 'val/perplexity_token': 2.1553289890289307, 'learning_rate': 0.0005600702614844415}

📊 Metrics (Epoch 12000):
├── TRAIN:
│   ├── ce_loss_token: 0.8361
│   └── perplexity_token: 2.3074
└── VAL:
    ├── ce_loss_token: 0.7679
    └── perplexity_token: 2.1553
└── TRAINING:
    └── learning_rate: 0.000560


                                                                                       ss_token=0.8337, lr=0.000549, perplexity_token=2.3019]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8337310784512086, 'train/perplexity_token': 2.301891326904297, 'val/ce_loss_token': 0.774030301719904, 'val/perplexity_token': 2.1684885025024414, 'learning_rate': 0.000549103714617398}

📊 Metrics (Epoch 12500):
├── TRAIN:
│   ├── ce_loss_token: 0.8337
│   └── perplexity_token: 2.3019
└── VAL:
    ├── ce_loss_token: 0.7740
    └── perplexity_token: 2.1685
└── TRAINING:
    └── learning_rate: 0.000549


                                                                                       ss_token=0.8316, lr=0.000538, perplexity_token=2.2970]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8315836658625224, 'train/perplexity_token': 2.2969534397125244, 'val/ce_loss_token': 0.7748826462775469, 'val/perplexity_token': 2.170337438583374, 'learning_rate': 0.0005378401877306342}

📊 Metrics (Epoch 13000):
├── TRAIN:
│   ├── ce_loss_token: 0.8316
│   └── perplexity_token: 2.2970
└── VAL:
    ├── ce_loss_token: 0.7749
    └── perplexity_token: 2.1703
└── TRAINING:
    └── learning_rate: 0.000538


                                                                                       ss_token=0.8295, lr=0.000526, perplexity_token=2.2922]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8295305043262479, 'train/perplexity_token': 2.2922422885894775, 'val/ce_loss_token': 0.7758742179721594, 'val/perplexity_token': 2.1724905967712402, 'learning_rate': 0.0005262964817484706}

📊 Metrics (Epoch 13500):
├── TRAIN:
│   ├── ce_loss_token: 0.8295
│   └── perplexity_token: 2.2922
└── VAL:
    ├── ce_loss_token: 0.7759
    └── perplexity_token: 2.1725
└── TRAINING:
    └── learning_rate: 0.000526


                                                                                       ss_token=0.8275, lr=0.000514, perplexity_token=2.2877]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8275469545423708, 'train/perplexity_token': 2.2876999378204346, 'val/ce_loss_token': 0.7722153514623642, 'val/perplexity_token': 2.1645562648773193, 'learning_rate': 0.0005144898155165267}

📊 Metrics (Epoch 14000):
├── TRAIN:
│   ├── ce_loss_token: 0.8275
│   └── perplexity_token: 2.2877
└── VAL:
    ├── ce_loss_token: 0.7722
    └── perplexity_token: 2.1646
└── TRAINING:
    └── learning_rate: 0.000514


                                                                                       ss_token=0.8256, lr=0.000502, perplexity_token=2.2832]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8255689957332434, 'train/perplexity_token': 2.283179521560669, 'val/ce_loss_token': 0.765829760581255, 'val/perplexity_token': 2.150778293609619, 'learning_rate': 0.0005024378001177138}

📊 Metrics (Epoch 14500):
├── TRAIN:
│   ├── ce_loss_token: 0.8256
│   └── perplexity_token: 2.2832
└── VAL:
    ├── ce_loss_token: 0.7658
    └── perplexity_token: 2.1508
└── TRAINING:
    └── learning_rate: 0.000502


                                                                                       ss_token=0.8236, lr=0.000490, perplexity_token=2.2787]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8236229158935447, 'train/perplexity_token': 2.278740644454956, 'val/ce_loss_token': 0.7687706556171179, 'val/perplexity_token': 2.1571128368377686, 'learning_rate': 0.0004901584126031537}

📊 Metrics (Epoch 15000):
├── TRAIN:
│   ├── ce_loss_token: 0.8236
│   └── perplexity_token: 2.2787
└── VAL:
    ├── ce_loss_token: 0.7688
    └── perplexity_token: 2.1571
└── TRAINING:
    └── learning_rate: 0.000490


                                                                                       ss_token=0.8220, lr=0.000478, perplexity_token=2.2751]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8220145363291189, 'train/perplexity_token': 2.275078296661377, 'val/ce_loss_token': 0.7676709219813347, 'val/perplexity_token': 2.1547417640686035, 'learning_rate': 0.0004776699691772117}

📊 Metrics (Epoch 15500):
├── TRAIN:
│   ├── ce_loss_token: 0.8220
│   └── perplexity_token: 2.2751
└── VAL:
    ├── ce_loss_token: 0.7677
    └── perplexity_token: 2.1547
└── TRAINING:
    └── learning_rate: 0.000478


                                                                                       ss_token=0.8205, lr=0.000465, perplexity_token=2.2716]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8204981055746048, 'train/perplexity_token': 2.2716310024261475, 'val/ce_loss_token': 0.7692995630204678, 'val/perplexity_token': 2.1582539081573486, 'learning_rate': 0.00046499109787665133}

📊 Metrics (Epoch 16000):
├── TRAIN:
│   ├── ce_loss_token: 0.8205
│   └── perplexity_token: 2.2716
└── VAL:
    ├── ce_loss_token: 0.7693
    └── perplexity_token: 2.1583
└── TRAINING:
    └── learning_rate: 0.000465


                                                                                       ss_token=0.8190, lr=0.000452, perplexity_token=2.2683]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8190128750081829, 'train/perplexity_token': 2.2682597637176514, 'val/ce_loss_token': 0.7699208240956068, 'val/perplexity_token': 2.159595251083374, 'learning_rate': 0.00045214071078463635}

📊 Metrics (Epoch 16500):
├── TRAIN:
│   ├── ce_loss_token: 0.8190
│   └── perplexity_token: 2.2683
└── VAL:
    ├── ce_loss_token: 0.7699
    └── perplexity_token: 2.1596
└── TRAINING:
    └── learning_rate: 0.000452


                                                                                       ss_token=0.8176, lr=0.000439, perplexity_token=2.2651]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8176024276139799, 'train/perplexity_token': 2.2650628089904785, 'val/ce_loss_token': 0.7754874136298895, 'val/perplexity_token': 2.1716504096984863, 'learning_rate': 0.000439137975821056}

📊 Metrics (Epoch 17000):
├── TRAIN:
│   ├── ce_loss_token: 0.8176
│   └── perplexity_token: 2.2651
└── VAL:
    ├── ce_loss_token: 0.7755
    └── perplexity_token: 2.1717
└── TRAINING:
    └── learning_rate: 0.000439


                                                                                       ss_token=0.8161, lr=0.000426, perplexity_token=2.2618]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8161478764663035, 'train/perplexity_token': 2.261770486831665, 'val/ce_loss_token': 0.7698382511734962, 'val/perplexity_token': 2.159416913986206, 'learning_rate': 0.0004260022881512363}

📊 Metrics (Epoch 17500):
├── TRAIN:
│   ├── ce_loss_token: 0.8161
│   └── perplexity_token: 2.2618
└── VAL:
    ├── ce_loss_token: 0.7698
    └── perplexity_token: 2.1594
└── TRAINING:
    └── learning_rate: 0.000426


                                                                                       ss_token=0.8147, lr=0.000413, perplexity_token=2.2586]

generating
Using default generation config...
Generating with greedy search...
wandb metrics being logged:  {'train/ce_loss_token': 0.8147241976696759, 'train/perplexity_token': 2.2585527896881104, 'val/ce_loss_token': 0.7988059241324663, 'val/perplexity_token': 2.2228851318359375, 'learning_rate': 0.00041275324125568017}

📊 Metrics (Epoch 18000):
├── TRAIN:
│   ├── ce_loss_token: 0.8147
│   └── perplexity_token: 2.2586
└── VAL:
    ├── ce_loss_token: 0.7988
    └── perplexity_token: 2.2229
└── TRAINING:
    └── learning_rate: 0.000413


[Training LM]:  44%|███▉     | 18016/40669 [45:49<58:11,  6.49it/s, acc_step=1/1, ce_loss_token=0.8147, lr=0.000412, perplexity_token=2.2585]

KeyboardInterrupt: 