## LLM

In [1]:
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer_alt import Transformer
from src.optimization import train_step, forward_and_loss, group_decay_parameters, save_checkpoint, load_checkpoint
from src.utils import saver, loader
from torch.utils.data import TensorDataset, DataLoader
from IPython.display import clear_output
import matplotlib.pyplot as plt

print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 
print("cuda available:", torch.cuda.is_available())

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


## Load Data

In [2]:
tokenizer = loader("tokenizers/cnn_tokenizer.pkl")

In [3]:
corpus_train1 = torch.tensor(loader("corpus/cnn_dailymail_article_train_tokens1.pkl"))
corpus_train2 = torch.tensor(loader("corpus/cnn_dailymail_article_train_tokens2.pkl"))
corpus_train3 = torch.tensor(loader("corpus/cnn_dailymail_article_train_tokens3.pkl"))
corpus_train4 = torch.tensor(loader("corpus/cnn_dailymail_article_train_tokens4.pkl"))
corpus_train = torch.cat((corpus_train1, corpus_train2, corpus_train3, corpus_train4), dim=0)

corpus_test = torch.tensor(loader("corpus/cnn_dailymail_article_test_tokens.pkl"))

In [4]:
def batch_data(corpus, batch_length=1024, offset=None):
    """
    Splits the corpus into batches of size batch_size.
    """
    length = len(corpus)
    batches = length // batch_length
    corpus_truncated = corpus[:batches * batch_length]  # trim to a multiple of batch_length
    corpus_batched = corpus_truncated.view(-1, batch_length)  # reshape into batches

    # overlapping batches augmentation
    if offset is not None:
        corpus_offset = corpus_truncated[offset : offset - batch_length]
        corpus_offset = corpus_offset.view(-1, batch_length)  # reshape into batches
        corpus_batched = torch.cat((corpus_batched, corpus_offset), dim=0)  # concatenate the offset batches

    return corpus_batched

In [5]:
corpus_train_batched = batch_data(corpus_train, batch_length=1024, offset=None)
corpus_test_batched = batch_data(corpus_test, batch_length=1024, offset=None)

In [6]:
loader_train = DataLoader(
    corpus_train_batched,
    batch_size=6,
    shuffle=True,       # shuffle every epoch
    drop_last=True      # drop the last incomplete batch
)

loader_test = DataLoader(
    corpus_test_batched,
    batch_size=6,
    shuffle=True,      # no need to shuffle test data
    drop_last=True      # drop the last incomplete batch
)

## Initialize Model

In [7]:
torch.manual_seed(42)

embed_dim = 119*10
ff_dim = 4*embed_dim
heads = 10
tf_blocks = 10

model = Transformer(
    embed_dim=embed_dim,
    embed_dim_ratio=0.87,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=tokenizer.vocab_size,
    max_seq_len=1024,
    dropout=0.1,
    start_token_id=tokenizer.token_to_idx["<s>"],
    use_weight_tying=False
).to(device)

optimizer_grouped_parameters = group_decay_parameters(
    model,
    weight_decay=0.1,
    no_decay=["bias", "LayerNorm.weight"],
    )

loss_train_list = []
loss_eval_list = []

filename = "checkpoint_transformer_alt3.pth"

In [8]:
def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

print("Number of parameters:", get_n_params(model))

Number of parameters: 99126214


In [9]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)
scaler = torch.amp.GradScaler("cuda")

num_epochs      = 3
steps_per_epoch = len(loader_train)
warmup_steps    = 250

def lr_lambda(step):
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return 1.0

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

### Load Model

In [10]:
#model, optimizer, scheduler = load_checkpoint("checkpoint_transformer_alt.pth", model, optimizer, scheduler)
#loss_train_list = loader("loss_train_alt.pkl")
#loss_eval_list = loader("loss_eval_alt.pkl")

In [11]:
optimizer.zero_grad()
model.train()
device = next(model.parameters()).device
accum_steps = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    for step, batch in enumerate(tqdm(loader_train, desc="Training")):
        batch = batch.to(device)
        loss_train = train_step(model, 
                          batch, 
                          criterion, 
                          optimizer, 
                          scaler, 
                          scheduler, 
                          accum_steps,
                          step).item()
        if (step+1) % 500 == 0:
            model.eval()
            lr = scheduler.get_last_lr()[0]
            iter_test = iter(loader_test)
            with torch.no_grad():
                loss_eval = np.mean([forward_and_loss(model, next(iter_test).to(device), criterion).item() for _ in range(accum_steps)])
                print(f"Step {step+1}, Loss: {loss_train:<.4f}, Loss_eval: {loss_eval:<.4f}, Learning Rate: {lr:4e}")

            loss_train_list.append(loss_train)
            loss_eval_list.append(loss_eval)

            model.train()

        if (step+1) % 5000 == 0:
            save_checkpoint(model, 
                            optimizer, 
                            scheduler, 
                            filename=filename)
            saver("loss_train_alt3.pkl", loss_train_list)
            saver("loss_eval_alt3.pkl", loss_eval_list)

    save_checkpoint(model, 
                    optimizer, 
                    scheduler, 
                    filename=filename)
    saver("loss_train_alt3.pkl", loss_train_list)
    saver("loss_eval_alt3.pkl", loss_eval_list)


Epoch 1/3


Training:   0%|          | 0/43829 [00:00<?, ?it/s]

Step 500, Loss: 9.3667, Loss_eval: 9.3376, Learning Rate: 1.000000e-05


Step 1000, Loss: 8.9361, Loss_eval: 9.0260, Learning Rate: 2.000000e-05


Step 1500, Loss: 8.4515, Loss_eval: 8.4702, Learning Rate: 3.000000e-05


Step 2000, Loss: 7.9103, Loss_eval: 7.9415, Learning Rate: 4.000000e-05


Step 2500, Loss: 7.4134, Loss_eval: 7.4125, Learning Rate: 5.000000e-05


Step 3000, Loss: 7.0445, Loss_eval: 7.0288, Learning Rate: 5.000000e-05


Step 3500, Loss: 6.7654, Loss_eval: 6.7880, Learning Rate: 5.000000e-05


Step 4000, Loss: 6.5757, Loss_eval: 6.5902, Learning Rate: 5.000000e-05


Step 4500, Loss: 6.4692, Loss_eval: 6.5003, Learning Rate: 5.000000e-05


Step 5000, Loss: 6.5438, Loss_eval: 6.3889, Learning Rate: 5.000000e-05


Step 5500, Loss: 6.5348, Loss_eval: 6.2855, Learning Rate: 5.000000e-05


Step 6000, Loss: 6.3522, Loss_eval: 6.2232, Learning Rate: 5.000000e-05


Step 6500, Loss: 6.1399, Loss_eval: 6.2001, Learning Rate: 5.000000e-05


Step 7000, Loss: 6.0664, Loss_eval: 6.1402, Learning Rate: 5.000000e-05


Step 7500, Loss: 5.8955, Loss_eval: 6.0802, Learning Rate: 5.000000e-05


Step 8000, Loss: 5.8227, Loss_eval: 6.0468, Learning Rate: 5.000000e-05


Step 8500, Loss: 5.9902, Loss_eval: 5.9790, Learning Rate: 5.000000e-05


Step 9000, Loss: 5.9950, Loss_eval: 5.9146, Learning Rate: 5.000000e-05


Step 9500, Loss: 5.8534, Loss_eval: 5.8838, Learning Rate: 5.000000e-05


Step 10000, Loss: 5.9240, Loss_eval: 5.8552, Learning Rate: 5.000000e-05


Step 10500, Loss: 5.8381, Loss_eval: 5.7950, Learning Rate: 5.000000e-05


Step 11000, Loss: 5.8869, Loss_eval: 5.8318, Learning Rate: 5.000000e-05


Step 11500, Loss: 5.7796, Loss_eval: 5.7453, Learning Rate: 5.000000e-05


Step 12000, Loss: 5.6888, Loss_eval: 5.7054, Learning Rate: 5.000000e-05


Step 12500, Loss: 5.8466, Loss_eval: 5.7135, Learning Rate: 5.000000e-05


Step 13000, Loss: 5.8209, Loss_eval: 5.6726, Learning Rate: 5.000000e-05


Step 13500, Loss: 5.5241, Loss_eval: 5.6800, Learning Rate: 5.000000e-05


Step 14000, Loss: 5.6369, Loss_eval: 5.6414, Learning Rate: 5.000000e-05


Step 14500, Loss: 5.4143, Loss_eval: 5.5839, Learning Rate: 5.000000e-05


Step 15000, Loss: 5.6843, Loss_eval: 5.5397, Learning Rate: 5.000000e-05


Step 15500, Loss: 5.4163, Loss_eval: 5.5188, Learning Rate: 5.000000e-05


Step 16000, Loss: 5.5330, Loss_eval: 5.5354, Learning Rate: 5.000000e-05


Step 16500, Loss: 5.4136, Loss_eval: 5.4850, Learning Rate: 5.000000e-05


Step 17000, Loss: 5.3676, Loss_eval: 5.4442, Learning Rate: 5.000000e-05


Step 17500, Loss: 5.3852, Loss_eval: 5.4431, Learning Rate: 5.000000e-05


Step 18000, Loss: 5.3464, Loss_eval: 5.4068, Learning Rate: 5.000000e-05


Step 18500, Loss: 5.4232, Loss_eval: 5.3851, Learning Rate: 5.000000e-05


Step 19000, Loss: 5.4049, Loss_eval: 5.3368, Learning Rate: 5.000000e-05


Step 19500, Loss: 5.3214, Loss_eval: 5.3190, Learning Rate: 5.000000e-05


Step 20000, Loss: 5.3814, Loss_eval: 5.3468, Learning Rate: 5.000000e-05


Step 20500, Loss: 5.1502, Loss_eval: 5.3124, Learning Rate: 5.000000e-05


Step 21000, Loss: 5.3061, Loss_eval: 5.2892, Learning Rate: 5.000000e-05


Step 21500, Loss: 5.3370, Loss_eval: 5.3233, Learning Rate: 5.000000e-05


Step 22000, Loss: 5.1882, Loss_eval: 5.2155, Learning Rate: 5.000000e-05


Step 22500, Loss: 5.0756, Loss_eval: 5.2257, Learning Rate: 5.000000e-05


Step 23000, Loss: 5.2217, Loss_eval: 5.2019, Learning Rate: 5.000000e-05


Step 23500, Loss: 5.2266, Loss_eval: 5.2116, Learning Rate: 5.000000e-05


Step 24000, Loss: 5.0531, Loss_eval: 5.2302, Learning Rate: 5.000000e-05


Step 24500, Loss: 5.2094, Loss_eval: 5.1163, Learning Rate: 5.000000e-05


Step 25000, Loss: 5.1747, Loss_eval: 5.1622, Learning Rate: 5.000000e-05


Step 25500, Loss: 5.0011, Loss_eval: 5.1183, Learning Rate: 5.000000e-05


Step 26000, Loss: 4.9237, Loss_eval: 5.1025, Learning Rate: 5.000000e-05


Step 26500, Loss: 5.0771, Loss_eval: 5.1122, Learning Rate: 5.000000e-05


Step 27000, Loss: 4.9692, Loss_eval: 5.1210, Learning Rate: 5.000000e-05


Step 27500, Loss: 5.0677, Loss_eval: 5.0614, Learning Rate: 5.000000e-05


Step 28000, Loss: 5.0386, Loss_eval: 5.1457, Learning Rate: 5.000000e-05


Step 28500, Loss: 4.9332, Loss_eval: 5.0886, Learning Rate: 5.000000e-05


Step 29000, Loss: 4.8899, Loss_eval: 5.0523, Learning Rate: 5.000000e-05


Step 29500, Loss: 4.9576, Loss_eval: 5.0133, Learning Rate: 5.000000e-05


Step 30000, Loss: 4.8808, Loss_eval: 5.0822, Learning Rate: 5.000000e-05


Step 30500, Loss: 4.9660, Loss_eval: 5.0176, Learning Rate: 5.000000e-05


Step 31000, Loss: 4.8916, Loss_eval: 4.9917, Learning Rate: 5.000000e-05


Step 31500, Loss: 4.7353, Loss_eval: 4.9674, Learning Rate: 5.000000e-05


Step 32000, Loss: 4.9753, Loss_eval: 4.9478, Learning Rate: 5.000000e-05


Step 32500, Loss: 4.9479, Loss_eval: 4.9443, Learning Rate: 5.000000e-05


Step 33000, Loss: 4.9807, Loss_eval: 4.9272, Learning Rate: 5.000000e-05


Step 33500, Loss: 4.9793, Loss_eval: 4.9511, Learning Rate: 5.000000e-05


Step 34000, Loss: 5.0835, Loss_eval: 4.9315, Learning Rate: 5.000000e-05


Step 34500, Loss: 4.7566, Loss_eval: 4.9078, Learning Rate: 5.000000e-05


Step 35000, Loss: 4.8588, Loss_eval: 4.9271, Learning Rate: 5.000000e-05


Step 35500, Loss: 4.7638, Loss_eval: 4.9366, Learning Rate: 5.000000e-05


Step 36000, Loss: 4.8090, Loss_eval: 4.9191, Learning Rate: 5.000000e-05


Step 36500, Loss: 4.9008, Loss_eval: 4.8632, Learning Rate: 5.000000e-05


Step 37000, Loss: 4.6274, Loss_eval: 4.8510, Learning Rate: 5.000000e-05


Step 37500, Loss: 4.5076, Loss_eval: 4.8267, Learning Rate: 5.000000e-05


Step 38000, Loss: 4.8720, Loss_eval: 4.8428, Learning Rate: 5.000000e-05


Step 38500, Loss: 4.8183, Loss_eval: 4.8428, Learning Rate: 5.000000e-05


Step 39000, Loss: 4.8497, Loss_eval: 4.8249, Learning Rate: 5.000000e-05


Step 39500, Loss: 4.7107, Loss_eval: 4.7928, Learning Rate: 5.000000e-05


Step 40000, Loss: 4.7418, Loss_eval: 4.7909, Learning Rate: 5.000000e-05


Step 40500, Loss: 4.7148, Loss_eval: 4.8281, Learning Rate: 5.000000e-05


Step 41000, Loss: 4.6348, Loss_eval: 4.7896, Learning Rate: 5.000000e-05


Step 41500, Loss: 4.8395, Loss_eval: 4.8301, Learning Rate: 5.000000e-05


Step 42000, Loss: 4.5523, Loss_eval: 4.7504, Learning Rate: 5.000000e-05


Step 42500, Loss: 4.7390, Loss_eval: 4.7748, Learning Rate: 5.000000e-05


Step 43000, Loss: 4.8599, Loss_eval: 4.7591, Learning Rate: 5.000000e-05


Step 43500, Loss: 4.8385, Loss_eval: 4.6786, Learning Rate: 5.000000e-05


Epoch 2/3


Training:   0%|          | 0/43829 [00:00<?, ?it/s]

Step 500, Loss: 4.6418, Loss_eval: 4.7093, Learning Rate: 5.000000e-05


Step 1000, Loss: 4.5445, Loss_eval: 4.7000, Learning Rate: 5.000000e-05


Step 1500, Loss: 4.8421, Loss_eval: 4.6505, Learning Rate: 5.000000e-05


Step 2000, Loss: 4.7785, Loss_eval: 4.6890, Learning Rate: 5.000000e-05


Step 2500, Loss: 4.5841, Loss_eval: 4.6952, Learning Rate: 5.000000e-05


Step 3000, Loss: 4.5343, Loss_eval: 4.6701, Learning Rate: 5.000000e-05


Step 3500, Loss: 4.4426, Loss_eval: 4.6530, Learning Rate: 5.000000e-05


Step 4000, Loss: 4.2805, Loss_eval: 4.6533, Learning Rate: 5.000000e-05


Step 4500, Loss: 4.5427, Loss_eval: 4.6504, Learning Rate: 5.000000e-05


Step 5000, Loss: 4.5645, Loss_eval: 4.6579, Learning Rate: 5.000000e-05


Step 5500, Loss: 4.4972, Loss_eval: 4.5713, Learning Rate: 5.000000e-05


Step 6000, Loss: 4.7271, Loss_eval: 4.6568, Learning Rate: 5.000000e-05


Step 6500, Loss: 4.4472, Loss_eval: 4.6054, Learning Rate: 5.000000e-05


Step 7000, Loss: 4.5327, Loss_eval: 4.6318, Learning Rate: 5.000000e-05


Step 7500, Loss: 4.4541, Loss_eval: 4.6317, Learning Rate: 5.000000e-05


Step 8000, Loss: 4.5879, Loss_eval: 4.6393, Learning Rate: 5.000000e-05


Step 8500, Loss: 4.2811, Loss_eval: 4.5933, Learning Rate: 5.000000e-05


Step 9000, Loss: 4.4849, Loss_eval: 4.6064, Learning Rate: 5.000000e-05


Step 9500, Loss: 4.6040, Loss_eval: 4.6278, Learning Rate: 5.000000e-05


Step 10000, Loss: 4.4452, Loss_eval: 4.5585, Learning Rate: 5.000000e-05


Step 10500, Loss: 4.5813, Loss_eval: 4.5830, Learning Rate: 5.000000e-05


Step 11000, Loss: 4.5194, Loss_eval: 4.5319, Learning Rate: 5.000000e-05


Step 11500, Loss: 4.6969, Loss_eval: 4.5729, Learning Rate: 5.000000e-05


Step 12000, Loss: 4.4067, Loss_eval: 4.5306, Learning Rate: 5.000000e-05


Step 12500, Loss: 4.4117, Loss_eval: 4.5525, Learning Rate: 5.000000e-05


Step 13000, Loss: 4.5126, Loss_eval: 4.4848, Learning Rate: 5.000000e-05


Step 13500, Loss: 4.4291, Loss_eval: 4.4792, Learning Rate: 5.000000e-05


Step 14000, Loss: 4.5562, Loss_eval: 4.4582, Learning Rate: 5.000000e-05


Step 14500, Loss: 4.4303, Loss_eval: 4.4860, Learning Rate: 5.000000e-05


Step 15000, Loss: 4.3187, Loss_eval: 4.4562, Learning Rate: 5.000000e-05


Step 15500, Loss: 4.3788, Loss_eval: 4.5009, Learning Rate: 5.000000e-05


Step 16000, Loss: 4.3446, Loss_eval: 4.5264, Learning Rate: 5.000000e-05


Step 16500, Loss: 4.5087, Loss_eval: 4.4685, Learning Rate: 5.000000e-05


Step 17000, Loss: 4.4165, Loss_eval: 4.4936, Learning Rate: 5.000000e-05


Step 17500, Loss: 4.3580, Loss_eval: 4.4886, Learning Rate: 5.000000e-05


Step 18000, Loss: 4.3260, Loss_eval: 4.5177, Learning Rate: 5.000000e-05


Step 18500, Loss: 4.3462, Loss_eval: 4.4160, Learning Rate: 5.000000e-05


Step 19000, Loss: 4.2525, Loss_eval: 4.4409, Learning Rate: 5.000000e-05


Step 19500, Loss: 4.4934, Loss_eval: 4.3694, Learning Rate: 5.000000e-05


Step 20000, Loss: 4.3772, Loss_eval: 4.4764, Learning Rate: 5.000000e-05


Step 20500, Loss: 4.4858, Loss_eval: 4.4150, Learning Rate: 5.000000e-05


Step 21000, Loss: 4.4002, Loss_eval: 4.4798, Learning Rate: 5.000000e-05


Step 21500, Loss: 4.2762, Loss_eval: 4.4394, Learning Rate: 5.000000e-05


Step 22000, Loss: 4.3100, Loss_eval: 4.4083, Learning Rate: 5.000000e-05


Step 22500, Loss: 4.2778, Loss_eval: 4.4021, Learning Rate: 5.000000e-05


Step 23000, Loss: 4.2381, Loss_eval: 4.3747, Learning Rate: 5.000000e-05


Step 23500, Loss: 4.4424, Loss_eval: 4.4222, Learning Rate: 5.000000e-05


Step 24000, Loss: 4.3230, Loss_eval: 4.4356, Learning Rate: 5.000000e-05


Step 24500, Loss: 4.1513, Loss_eval: 4.3888, Learning Rate: 5.000000e-05


Step 25000, Loss: 4.1028, Loss_eval: 4.3680, Learning Rate: 5.000000e-05


Step 25500, Loss: 4.1608, Loss_eval: 4.3462, Learning Rate: 5.000000e-05


Step 26000, Loss: 4.3256, Loss_eval: 4.3575, Learning Rate: 5.000000e-05


Step 26500, Loss: 4.1563, Loss_eval: 4.3037, Learning Rate: 5.000000e-05


Step 27000, Loss: 4.1078, Loss_eval: 4.3304, Learning Rate: 5.000000e-05


Step 27500, Loss: 4.3701, Loss_eval: 4.3589, Learning Rate: 5.000000e-05


Step 28000, Loss: 4.5522, Loss_eval: 4.3418, Learning Rate: 5.000000e-05


Step 28500, Loss: 4.1444, Loss_eval: 4.3206, Learning Rate: 5.000000e-05


Step 29000, Loss: 4.1466, Loss_eval: 4.3475, Learning Rate: 5.000000e-05


Step 29500, Loss: 4.3654, Loss_eval: 4.3228, Learning Rate: 5.000000e-05


Step 30000, Loss: 4.1351, Loss_eval: 4.3192, Learning Rate: 5.000000e-05


Step 30500, Loss: 4.2851, Loss_eval: 4.2503, Learning Rate: 5.000000e-05


Step 31000, Loss: 4.1487, Loss_eval: 4.3347, Learning Rate: 5.000000e-05


Step 31500, Loss: 3.9185, Loss_eval: 4.3263, Learning Rate: 5.000000e-05


Step 32000, Loss: 4.1729, Loss_eval: 4.2328, Learning Rate: 5.000000e-05


Step 32500, Loss: 4.1677, Loss_eval: 4.2644, Learning Rate: 5.000000e-05


Step 33000, Loss: 3.9048, Loss_eval: 4.2809, Learning Rate: 5.000000e-05


Step 33500, Loss: 4.2221, Loss_eval: 4.2723, Learning Rate: 5.000000e-05


Step 34000, Loss: 4.1387, Loss_eval: 4.2828, Learning Rate: 5.000000e-05


Step 34500, Loss: 4.3447, Loss_eval: 4.2905, Learning Rate: 5.000000e-05


Step 35000, Loss: 4.1521, Loss_eval: 4.1941, Learning Rate: 5.000000e-05


Step 35500, Loss: 4.1881, Loss_eval: 4.2623, Learning Rate: 5.000000e-05


Step 36000, Loss: 4.0618, Loss_eval: 4.2482, Learning Rate: 5.000000e-05


Step 36500, Loss: 4.3086, Loss_eval: 4.2828, Learning Rate: 5.000000e-05


Step 37000, Loss: 4.0928, Loss_eval: 4.2958, Learning Rate: 5.000000e-05


Step 37500, Loss: 3.9477, Loss_eval: 4.2248, Learning Rate: 5.000000e-05


Step 38000, Loss: 4.1828, Loss_eval: 4.2396, Learning Rate: 5.000000e-05


Step 38500, Loss: 4.1296, Loss_eval: 4.2882, Learning Rate: 5.000000e-05


Step 39000, Loss: 4.0847, Loss_eval: 4.2780, Learning Rate: 5.000000e-05


Step 39500, Loss: 4.2053, Loss_eval: 4.2475, Learning Rate: 5.000000e-05


Step 40000, Loss: 4.0484, Loss_eval: 4.2257, Learning Rate: 5.000000e-05


Step 40500, Loss: 4.0340, Loss_eval: 4.2155, Learning Rate: 5.000000e-05


Step 41000, Loss: 3.9563, Loss_eval: 4.2585, Learning Rate: 5.000000e-05


Step 41500, Loss: 4.1184, Loss_eval: 4.2072, Learning Rate: 5.000000e-05


Step 42000, Loss: 3.9571, Loss_eval: 4.2182, Learning Rate: 5.000000e-05


Step 42500, Loss: 4.1777, Loss_eval: 4.2276, Learning Rate: 5.000000e-05


Step 43000, Loss: 3.9517, Loss_eval: 4.2079, Learning Rate: 5.000000e-05


Step 43500, Loss: 4.0016, Loss_eval: 4.2327, Learning Rate: 5.000000e-05


Epoch 3/3


Training:   0%|          | 0/43829 [00:00<?, ?it/s]

Step 500, Loss: 4.2672, Loss_eval: 4.1771, Learning Rate: 5.000000e-05


Step 1000, Loss: 3.9845, Loss_eval: 4.2392, Learning Rate: 5.000000e-05


Step 1500, Loss: 4.0898, Loss_eval: 4.1906, Learning Rate: 5.000000e-05


Step 2000, Loss: 4.0428, Loss_eval: 4.1655, Learning Rate: 5.000000e-05


Step 2500, Loss: 3.8855, Loss_eval: 4.1604, Learning Rate: 5.000000e-05


Step 3000, Loss: 4.1697, Loss_eval: 4.1243, Learning Rate: 5.000000e-05


Step 3500, Loss: 4.1055, Loss_eval: 4.1796, Learning Rate: 5.000000e-05


Step 4000, Loss: 3.9769, Loss_eval: 4.1168, Learning Rate: 5.000000e-05


Step 4500, Loss: 4.1548, Loss_eval: 4.1679, Learning Rate: 5.000000e-05


Step 5000, Loss: 3.9806, Loss_eval: 4.2212, Learning Rate: 5.000000e-05


Step 5500, Loss: 3.9581, Loss_eval: 4.1952, Learning Rate: 5.000000e-05


Step 6000, Loss: 4.0820, Loss_eval: 4.1211, Learning Rate: 5.000000e-05


Step 6500, Loss: 4.1856, Loss_eval: 4.1519, Learning Rate: 5.000000e-05


Step 7000, Loss: 4.0747, Loss_eval: 4.1640, Learning Rate: 5.000000e-05


Step 7500, Loss: 4.2589, Loss_eval: 4.1910, Learning Rate: 5.000000e-05


Step 8000, Loss: 3.9743, Loss_eval: 4.1010, Learning Rate: 5.000000e-05


Step 8500, Loss: 4.0042, Loss_eval: 4.2219, Learning Rate: 5.000000e-05


Step 9000, Loss: 3.9052, Loss_eval: 4.1251, Learning Rate: 5.000000e-05


Step 9500, Loss: 3.9488, Loss_eval: 4.1004, Learning Rate: 5.000000e-05


Step 10000, Loss: 4.0766, Loss_eval: 4.1115, Learning Rate: 5.000000e-05


Step 10500, Loss: 4.2130, Loss_eval: 4.1241, Learning Rate: 5.000000e-05


Step 11000, Loss: 3.7738, Loss_eval: 4.1146, Learning Rate: 5.000000e-05


Step 11500, Loss: 3.7793, Loss_eval: 4.1402, Learning Rate: 5.000000e-05


Step 12000, Loss: 4.1905, Loss_eval: 4.1808, Learning Rate: 5.000000e-05


Step 12500, Loss: 3.9797, Loss_eval: 4.1628, Learning Rate: 5.000000e-05


Step 13000, Loss: 4.1650, Loss_eval: 4.1191, Learning Rate: 5.000000e-05


Step 13500, Loss: 4.0206, Loss_eval: 4.1191, Learning Rate: 5.000000e-05


Step 14000, Loss: 3.6280, Loss_eval: 4.1402, Learning Rate: 5.000000e-05


Step 14500, Loss: 3.8559, Loss_eval: 4.1116, Learning Rate: 5.000000e-05


Step 15000, Loss: 3.9599, Loss_eval: 4.0565, Learning Rate: 5.000000e-05


Step 15500, Loss: 4.1237, Loss_eval: 4.1064, Learning Rate: 5.000000e-05


Step 16000, Loss: 4.0132, Loss_eval: 4.1105, Learning Rate: 5.000000e-05


Step 16500, Loss: 3.9136, Loss_eval: 4.0795, Learning Rate: 5.000000e-05


Step 17000, Loss: 4.0945, Loss_eval: 4.0882, Learning Rate: 5.000000e-05


Step 17500, Loss: 4.2119, Loss_eval: 4.1412, Learning Rate: 5.000000e-05


Step 18000, Loss: 4.1412, Loss_eval: 4.0939, Learning Rate: 5.000000e-05


Step 18500, Loss: 3.9438, Loss_eval: 4.0703, Learning Rate: 5.000000e-05


Step 19000, Loss: 3.9838, Loss_eval: 4.0958, Learning Rate: 5.000000e-05


Step 19500, Loss: 4.0591, Loss_eval: 4.1148, Learning Rate: 5.000000e-05


Step 20000, Loss: 4.0843, Loss_eval: 4.1214, Learning Rate: 5.000000e-05


Step 20500, Loss: 3.9188, Loss_eval: 4.1105, Learning Rate: 5.000000e-05


Step 21000, Loss: 3.6908, Loss_eval: 4.0646, Learning Rate: 5.000000e-05


Step 21500, Loss: 3.9989, Loss_eval: 4.1028, Learning Rate: 5.000000e-05


Step 22000, Loss: 3.9729, Loss_eval: 4.0440, Learning Rate: 5.000000e-05


Step 22500, Loss: 3.7571, Loss_eval: 4.1139, Learning Rate: 5.000000e-05


Step 23000, Loss: 3.8349, Loss_eval: 4.1048, Learning Rate: 5.000000e-05


Step 23500, Loss: 3.8821, Loss_eval: 4.0910, Learning Rate: 5.000000e-05


Step 24000, Loss: 3.9247, Loss_eval: 4.0810, Learning Rate: 5.000000e-05


Step 24500, Loss: 3.8892, Loss_eval: 4.0287, Learning Rate: 5.000000e-05


Step 25000, Loss: 3.6838, Loss_eval: 4.0216, Learning Rate: 5.000000e-05


Step 25500, Loss: 3.8452, Loss_eval: 4.0387, Learning Rate: 5.000000e-05


Step 26000, Loss: 3.8430, Loss_eval: 4.0082, Learning Rate: 5.000000e-05


Step 26500, Loss: 4.0894, Loss_eval: 4.0964, Learning Rate: 5.000000e-05


Step 27000, Loss: 3.8528, Loss_eval: 4.0047, Learning Rate: 5.000000e-05


Step 27500, Loss: 3.7685, Loss_eval: 4.0337, Learning Rate: 5.000000e-05


Step 28000, Loss: 4.0506, Loss_eval: 4.0738, Learning Rate: 5.000000e-05


Step 28500, Loss: 3.9945, Loss_eval: 4.0581, Learning Rate: 5.000000e-05


Step 29000, Loss: 3.9281, Loss_eval: 4.0635, Learning Rate: 5.000000e-05


Step 29500, Loss: 3.8333, Loss_eval: 4.0672, Learning Rate: 5.000000e-05


Step 30000, Loss: 3.7986, Loss_eval: 4.0324, Learning Rate: 5.000000e-05


Step 30500, Loss: 3.9659, Loss_eval: 4.0108, Learning Rate: 5.000000e-05


Step 31000, Loss: 3.8813, Loss_eval: 3.9804, Learning Rate: 5.000000e-05


Step 31500, Loss: 4.0449, Loss_eval: 4.0726, Learning Rate: 5.000000e-05


Step 32000, Loss: 3.6909, Loss_eval: 4.0466, Learning Rate: 5.000000e-05


Step 32500, Loss: 4.1008, Loss_eval: 4.0534, Learning Rate: 5.000000e-05


Step 33000, Loss: 3.8260, Loss_eval: 4.0340, Learning Rate: 5.000000e-05


Step 33500, Loss: 3.6845, Loss_eval: 4.0109, Learning Rate: 5.000000e-05


Step 34000, Loss: 3.9018, Loss_eval: 4.0285, Learning Rate: 5.000000e-05


Step 34500, Loss: 3.6609, Loss_eval: 4.0485, Learning Rate: 5.000000e-05


Step 35000, Loss: 3.9068, Loss_eval: 4.0906, Learning Rate: 5.000000e-05


Step 35500, Loss: 3.6617, Loss_eval: 4.0867, Learning Rate: 5.000000e-05


Step 36000, Loss: 3.6750, Loss_eval: 4.0092, Learning Rate: 5.000000e-05


Step 36500, Loss: 3.9094, Loss_eval: 3.9788, Learning Rate: 5.000000e-05


Step 37000, Loss: 3.9971, Loss_eval: 3.9992, Learning Rate: 5.000000e-05


Step 37500, Loss: 3.9370, Loss_eval: 3.9971, Learning Rate: 5.000000e-05


Step 38000, Loss: 3.6858, Loss_eval: 4.0309, Learning Rate: 5.000000e-05


Step 38500, Loss: 3.8422, Loss_eval: 3.9990, Learning Rate: 5.000000e-05


Step 39000, Loss: 4.1292, Loss_eval: 3.9477, Learning Rate: 5.000000e-05


Step 39500, Loss: 3.8350, Loss_eval: 3.9974, Learning Rate: 5.000000e-05


Step 40000, Loss: 3.8578, Loss_eval: 4.0031, Learning Rate: 5.000000e-05


Step 40500, Loss: 4.2051, Loss_eval: 3.9700, Learning Rate: 5.000000e-05


Step 41000, Loss: 4.0388, Loss_eval: 4.0424, Learning Rate: 5.000000e-05


Step 41500, Loss: 3.6959, Loss_eval: 4.0481, Learning Rate: 5.000000e-05


Step 42000, Loss: 3.9138, Loss_eval: 4.0283, Learning Rate: 5.000000e-05


Step 42500, Loss: 3.9592, Loss_eval: 4.0408, Learning Rate: 5.000000e-05


Step 43000, Loss: 3.8775, Loss_eval: 3.9750, Learning Rate: 5.000000e-05


Step 43500, Loss: 3.9609, Loss_eval: 3.9706, Learning Rate: 5.000000e-05
