## LLM

In [1]:
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer_alt import Transformer
from src.optimization import train_step, forward_and_loss, group_decay_parameters, save_checkpoint, load_checkpoint
from src.utils import saver, loader
from torch.utils.data import TensorDataset, DataLoader
from IPython.display import clear_output
import matplotlib.pyplot as plt

# Report the PyTorch / CUDA / cuDNN environment so the recorded outputs below
# can be matched to the software stack that produced them.
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
):
    print(label, value)

# Select the float32 matmul precision mode; per the PyTorch docs, 'high' lets
# float32 matmuls use faster, lower-precision internal computation.
torch.set_float32_matmul_precision('high')

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


## Load Data

In [2]:
# Load the previously built tokenizer from disk with the project's `loader` helper.
# Later cells read `tokenizer.vocab_size` and `tokenizer.token_to_idx` from it.
# NOTE(review): the .pkl extension suggests a pickle file — only load it from a trusted source.
tokenizer = loader("tokenizers/cnn_tokenizer.pkl")

In [3]:
# Training shards, listed in the order in which they are concatenated.
# Judging by the file names these hold pre-tokenized CNN/DailyMail articles.
train_shard_paths = (
    "corpus/cnn_dailymail_article_train_tokens1.pkl",
    "corpus/cnn_dailymail_article_train_tokens2.pkl",
    "corpus/cnn_dailymail_article_train_tokens3.pkl",
    "corpus/cnn_dailymail_article_train_tokens4.pkl",
)

# Load every shard through the project's `loader` helper and wrap it in a tensor.
corpus_train1, corpus_train2, corpus_train3, corpus_train4 = (
    torch.tensor(loader(shard_path)) for shard_path in train_shard_paths
)

# Join the shards end to end along dim 0 into a single training tensor.
corpus_train = torch.cat(
    (corpus_train1, corpus_train2, corpus_train3, corpus_train4),
    dim=0,
)

# The test split is stored in a single file and is loaded the same way.
corpus_test = torch.tensor(loader("corpus/cnn_dailymail_article_test_tokens.pkl"))

In [4]:
def batch_data(corpus, batch_length=1024, offset=None):
    """
    Split a 1-D token tensor into non-overlapping rows of `batch_length` tokens.

    Trailing tokens that do not fill a complete row are dropped. When `offset`
    is given, a second set of rows is appended whose windows start `offset`
    tokens later than the regular ones, so they straddle the boundaries of the
    first set (overlapping-window augmentation).

    Args:
        corpus: 1-D torch tensor holding the token stream.
        batch_length: number of tokens per row; must be positive.
        offset: None to disable the augmentation, otherwise an int with
            0 <= offset < batch_length.

    Returns:
        Tensor of shape (rows, batch_length). Without `offset` there is one row
        per complete window; with `offset` one fewer shifted row is appended
        after the regular ones. A corpus shorter than `batch_length` yields a
        tensor with zero rows.

    Raises:
        ValueError: if `batch_length` is not positive or `offset` lies outside
            [0, batch_length).
    """
    if batch_length <= 0:
        raise ValueError(f"batch_length must be positive, got {batch_length}")
    if offset is not None and not 0 <= offset < batch_length:
        # Outside this range the shifted slice would be empty or misaligned.
        raise ValueError(
            f"offset must satisfy 0 <= offset < batch_length ({batch_length}), got {offset}"
        )

    n_rows = len(corpus) // batch_length
    corpus_truncated = corpus[:n_rows * batch_length]  # trim to a multiple of batch_length
    # Pass the row count explicitly: view(-1, ...) cannot infer it for an empty tensor.
    corpus_batched = corpus_truncated.view(n_rows, batch_length)

    # overlapping batches augmentation
    if offset is not None and n_rows > 1:
        # Shifting the window start by `offset` leaves room for n_rows - 1 full rows.
        shifted_end = offset + (n_rows - 1) * batch_length
        corpus_offset = corpus_truncated[offset:shifted_end]
        corpus_offset = corpus_offset.view(n_rows - 1, batch_length)
        corpus_batched = torch.cat((corpus_batched, corpus_offset), dim=0)  # append the shifted rows

    return corpus_batched

In [5]:
# Cut both token tensors into rows of 1024 tokens, the same length that is
# passed to the model as max_seq_len. The overlapping-window augmentation of
# batch_data is left off (offset=None).
corpus_train_batched = batch_data(corpus_train, batch_length=1024, offset=None)
corpus_test_batched = batch_data(corpus_test, batch_length=1024, offset=None)

In [6]:
# Settings shared by both loaders: 6 rows per batch, a new random order on
# every pass, and the last incomplete batch dropped.
loader_kwargs = dict(batch_size=6, shuffle=True, drop_last=True)

loader_train = DataLoader(corpus_train_batched, **loader_kwargs)

# The test loader is shuffled too: the training loop opens a fresh iterator
# over it at every evaluation and reads only its first few batches, so
# shuffling makes each evaluation use a different sample of the test rows.
loader_test = DataLoader(corpus_test_batched, **loader_kwargs)

## Initialize Model

In [7]:
# Fix PyTorch's RNG seed before the model is constructed, so that anything
# drawing from it afterwards (presumably weight initialisation) is repeatable.
torch.manual_seed(42)

# Model width is 730. With heads = 10 that is 73 per head if the width is split
# evenly across heads — TODO confirm against src.transformer_alt.
embed_dim = 73*10
# Feed-forward width is four times the model width.
ff_dim = 4*embed_dim
heads = 10
tf_blocks = 10

# max_seq_len matches the batch_length of 1024 used when the corpus was cut
# into rows. The remaining arguments are passed straight to the project's
# Transformer class; their exact meaning is defined in src.transformer_alt.
model = Transformer(
    embed_dim=embed_dim,
    embed_dim_ratio=1.,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=tokenizer.vocab_size,
    max_seq_len=1024,
    dropout=0.1,
    start_token_id=tokenizer.token_to_idx["<s>"],
    use_weight_tying=False
).to(device)

# Build the parameter groups that are later handed to AdamW.
# NOTE(review): presumably parameters whose names match `no_decay` are exempt
# from weight decay, which only works if the model's parameter names contain
# these substrings — verify against src.optimization and the model.
optimizer_grouped_parameters = group_decay_parameters(
    model,
    weight_decay=0.1,
    no_decay=["bias", "LayerNorm.weight"],
    )

# Loss histories; the training loop appends one value to each every 500 steps.
# Re-running this cell resets them.
loss_train_list = []
loss_eval_list = []

# Checkpoint path used by every save_checkpoint call in the training loop.
filename = "checkpoint_transformer_alt2.pth"

In [8]:
def get_n_params(model):
    """
    Return the total number of scalar elements across all parameters of `model`.

    Every tensor yielded by `model.parameters()` is counted, whether or not it
    requires gradients.

    Args:
        model: a torch.nn.Module (anything exposing `.parameters()`).

    Returns:
        int: the summed element count; 0 for a model without parameters.
    """
    # Tensor.numel() is the product of the tensor's dimensions (1 for a scalar).
    return sum(p.numel() for p in model.parameters())

# Report the model size: the total element count over all of model.parameters().
print("Number of parameters:", get_n_params(model))

Number of parameters: 99933334


In [9]:
# Cross-entropy loss with PyTorch's default settings.
criterion = torch.nn.CrossEntropyLoss()
# AdamW over the parameter groups built above; 5e-5 is the base learning rate
# that the scheduler below scales.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)
# Gradient scaler handed to train_step.
# NOTE(review): the device type is hard-coded to "cuda" although `device` can
# fall back to the CPU — confirm this is intended for CPU runs.
scaler = torch.amp.GradScaler("cuda")

num_epochs      = 3
# Number of batches the training loader yields per epoch.
# NOTE(review): not referenced by any code visible in this notebook section.
steps_per_epoch = len(loader_train)
# Number of scheduler steps over which the learning rate ramps up.
warmup_steps    = 250

def lr_lambda(step):
    """Return the LR multiplier: step / warmup_steps during warmup, then 1.0."""
    if step < warmup_steps:
        # max(1, ...) guards against a division by zero when warmup_steps is 0.
        return float(step) / float(max(1, warmup_steps))
    return 1.0

# LambdaLR multiplies the optimizer's base learning rate by lr_lambda(step).
# NOTE(review): in the recorded run the logged LR reaches 5e-05 at loader step
# 2500, which suggests the scheduler advances once per accum_steps (10) batches
# inside train_step — confirm in src.optimization.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

### Load Model

In [10]:
#model, optimizer, scheduler = load_checkpoint("checkpoint_transformer_alt.pth", model, optimizer, scheduler)
#loss_train_list = loader("loss_train_alt.pkl")
#loss_eval_list = loader("loss_eval_alt.pkl")

In [11]:
# Training loop: runs num_epochs passes over loader_train, evaluates on a small
# sample of the test set every 500 steps, and checkpoints every 5000 steps and
# at the end of every epoch.
optimizer.zero_grad()
model.train()
# Rebind `device` to wherever the model's parameters actually live.
device = next(model.parameters()).device
# Passed to train_step (presumably the number of batches whose gradients are
# accumulated per optimizer update — confirm in src.optimization); also reused
# below as the number of test batches averaged per evaluation.
accum_steps = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    # `step` restarts at 0 in every epoch.
    # NOTE(review): the recorded run has 43829 batches per epoch, which is not a
    # multiple of accum_steps. If train_step uses `step` to decide when to apply
    # an update, the last few batches of an epoch may leave gradients that carry
    # over into the next epoch — confirm in src.optimization.
    for step, batch in enumerate(tqdm(loader_train, desc="Training")):
        batch = batch.to(device)
        # loss_train is the value returned for this single batch.
        loss_train = train_step(model, 
                          batch, 
                          criterion, 
                          optimizer, 
                          scaler, 
                          scheduler, 
                          accum_steps,
                          step).item()
        if (step+1) % 500 == 0:
            model.eval()
            lr = scheduler.get_last_lr()[0]
            # Fresh iterator over the shuffled test loader for this evaluation.
            iter_test = iter(loader_test)
            with torch.no_grad():
                # Mean loss over accum_steps test batches; next() raises
                # StopIteration if loader_test yields fewer batches than that.
                loss_eval = np.mean([forward_and_loss(model, next(iter_test).to(device), criterion).item() for _ in range(accum_steps)])
                # `{lr:4e}` is a minimum field width of 4 with default precision,
                # which is why the log shows six decimals (e.g. 5.000000e-05).
                print(f"Step {step+1}, Loss: {loss_train:<.4f}, Loss_eval: {loss_eval:<.4f}, Learning Rate: {lr:4e}")

            # Record the single-batch training loss next to the averaged eval loss.
            loss_train_list.append(loss_train)
            loss_eval_list.append(loss_eval)

            # Switch back to training mode after the evaluation.
            model.train()

        # Periodic checkpoint; the same file names are reused on every save.
        if (step+1) % 5000 == 0:
            save_checkpoint(model, 
                            optimizer, 
                            scheduler, 
                            filename=filename)
            saver("loss_train_alt2.pkl", loss_train_list)
            saver("loss_eval_alt2.pkl", loss_eval_list)

    # End-of-epoch checkpoint, written to the same files as the periodic one.
    save_checkpoint(model, 
                    optimizer, 
                    scheduler, 
                    filename=filename)
    saver("loss_train_alt2.pkl", loss_train_list)
    saver("loss_eval_alt2.pkl", loss_eval_list)


Epoch 1/3


Training:   0%|          | 0/43829 [00:00<?, ?it/s]

Step 500, Loss: 8.9040, Loss_eval: 8.8116, Learning Rate: 1.000000e-05


Step 1000, Loss: 8.1723, Loss_eval: 8.1909, Learning Rate: 2.000000e-05


Step 1500, Loss: 7.5327, Loss_eval: 7.4881, Learning Rate: 3.000000e-05


Step 2000, Loss: 7.1068, Loss_eval: 6.9722, Learning Rate: 4.000000e-05


Step 2500, Loss: 6.7723, Loss_eval: 6.7194, Learning Rate: 5.000000e-05


Step 3000, Loss: 6.5310, Loss_eval: 6.4667, Learning Rate: 5.000000e-05


Step 3500, Loss: 6.3408, Loss_eval: 6.2988, Learning Rate: 5.000000e-05


Step 4000, Loss: 6.2234, Loss_eval: 6.1614, Learning Rate: 5.000000e-05


Step 4500, Loss: 5.9725, Loss_eval: 6.0965, Learning Rate: 5.000000e-05


Step 5000, Loss: 6.0379, Loss_eval: 6.0003, Learning Rate: 5.000000e-05


Step 5500, Loss: 5.9854, Loss_eval: 5.9905, Learning Rate: 5.000000e-05


Step 6000, Loss: 6.0520, Loss_eval: 5.9152, Learning Rate: 5.000000e-05


Step 6500, Loss: 5.8638, Loss_eval: 5.8341, Learning Rate: 5.000000e-05


Step 7000, Loss: 5.8925, Loss_eval: 5.7942, Learning Rate: 5.000000e-05


Step 7500, Loss: 5.8458, Loss_eval: 5.7645, Learning Rate: 5.000000e-05


Step 8000, Loss: 5.7862, Loss_eval: 5.7758, Learning Rate: 5.000000e-05


Step 8500, Loss: 5.6004, Loss_eval: 5.6691, Learning Rate: 5.000000e-05


Step 9000, Loss: 5.5850, Loss_eval: 5.6497, Learning Rate: 5.000000e-05


Step 9500, Loss: 5.6313, Loss_eval: 5.5901, Learning Rate: 5.000000e-05


Step 10000, Loss: 5.3719, Loss_eval: 5.5884, Learning Rate: 5.000000e-05


Step 10500, Loss: 5.5549, Loss_eval: 5.5420, Learning Rate: 5.000000e-05


Step 11000, Loss: 5.6511, Loss_eval: 5.5116, Learning Rate: 5.000000e-05


Step 11500, Loss: 5.4649, Loss_eval: 5.4802, Learning Rate: 5.000000e-05


Step 12000, Loss: 5.7195, Loss_eval: 5.4471, Learning Rate: 5.000000e-05


Step 12500, Loss: 5.4865, Loss_eval: 5.4027, Learning Rate: 5.000000e-05


Step 13000, Loss: 5.1855, Loss_eval: 5.4452, Learning Rate: 5.000000e-05


Step 13500, Loss: 5.3405, Loss_eval: 5.3627, Learning Rate: 5.000000e-05


Step 14000, Loss: 5.4360, Loss_eval: 5.3077, Learning Rate: 5.000000e-05


Step 14500, Loss: 5.1565, Loss_eval: 5.2611, Learning Rate: 5.000000e-05


Step 15000, Loss: 5.2531, Loss_eval: 5.2931, Learning Rate: 5.000000e-05


Step 15500, Loss: 5.4962, Loss_eval: 5.2839, Learning Rate: 5.000000e-05


Step 16000, Loss: 5.3334, Loss_eval: 5.2554, Learning Rate: 5.000000e-05


Step 16500, Loss: 5.1167, Loss_eval: 5.1910, Learning Rate: 5.000000e-05


Step 17000, Loss: 5.2602, Loss_eval: 5.2474, Learning Rate: 5.000000e-05


Step 17500, Loss: 5.1225, Loss_eval: 5.1509, Learning Rate: 5.000000e-05


Step 18000, Loss: 5.1320, Loss_eval: 5.1220, Learning Rate: 5.000000e-05


Step 18500, Loss: 5.0314, Loss_eval: 5.1539, Learning Rate: 5.000000e-05


Step 19000, Loss: 5.2081, Loss_eval: 5.1271, Learning Rate: 5.000000e-05


Step 19500, Loss: 5.1967, Loss_eval: 5.1361, Learning Rate: 5.000000e-05


Step 20000, Loss: 4.9996, Loss_eval: 5.1308, Learning Rate: 5.000000e-05


Step 20500, Loss: 4.9950, Loss_eval: 5.0854, Learning Rate: 5.000000e-05


Step 21000, Loss: 4.9739, Loss_eval: 4.9996, Learning Rate: 5.000000e-05


Step 21500, Loss: 4.8742, Loss_eval: 5.0170, Learning Rate: 5.000000e-05


Step 22000, Loss: 4.9530, Loss_eval: 5.0154, Learning Rate: 5.000000e-05


Step 22500, Loss: 4.9381, Loss_eval: 4.9675, Learning Rate: 5.000000e-05


Step 23000, Loss: 4.8272, Loss_eval: 4.9909, Learning Rate: 5.000000e-05


Step 23500, Loss: 4.8659, Loss_eval: 4.9905, Learning Rate: 5.000000e-05


Step 24000, Loss: 5.0879, Loss_eval: 4.9093, Learning Rate: 5.000000e-05


Step 24500, Loss: 4.8107, Loss_eval: 4.9326, Learning Rate: 5.000000e-05


Step 25000, Loss: 4.8687, Loss_eval: 4.8916, Learning Rate: 5.000000e-05


Step 25500, Loss: 4.7932, Loss_eval: 4.8335, Learning Rate: 5.000000e-05


Step 26000, Loss: 4.9045, Loss_eval: 4.8502, Learning Rate: 5.000000e-05


Step 26500, Loss: 4.7965, Loss_eval: 4.8958, Learning Rate: 5.000000e-05


Step 27000, Loss: 4.7792, Loss_eval: 4.8547, Learning Rate: 5.000000e-05


Step 27500, Loss: 4.9436, Loss_eval: 4.8086, Learning Rate: 5.000000e-05


Step 28000, Loss: 4.8723, Loss_eval: 4.7795, Learning Rate: 5.000000e-05


Step 28500, Loss: 4.8295, Loss_eval: 4.8022, Learning Rate: 5.000000e-05


Step 29000, Loss: 4.9499, Loss_eval: 4.8416, Learning Rate: 5.000000e-05


Step 29500, Loss: 4.7187, Loss_eval: 4.7603, Learning Rate: 5.000000e-05


Step 30000, Loss: 4.7246, Loss_eval: 4.7672, Learning Rate: 5.000000e-05


Step 30500, Loss: 4.7570, Loss_eval: 4.7540, Learning Rate: 5.000000e-05


Step 31000, Loss: 4.6337, Loss_eval: 4.7397, Learning Rate: 5.000000e-05


Step 31500, Loss: 4.6807, Loss_eval: 4.7251, Learning Rate: 5.000000e-05


Step 32000, Loss: 4.6212, Loss_eval: 4.7251, Learning Rate: 5.000000e-05


Step 32500, Loss: 4.4807, Loss_eval: 4.7213, Learning Rate: 5.000000e-05


Step 33000, Loss: 4.6455, Loss_eval: 4.6865, Learning Rate: 5.000000e-05


Step 33500, Loss: 4.6759, Loss_eval: 4.7071, Learning Rate: 5.000000e-05


Step 34000, Loss: 4.6447, Loss_eval: 4.6962, Learning Rate: 5.000000e-05


Step 34500, Loss: 4.6186, Loss_eval: 4.6758, Learning Rate: 5.000000e-05


Step 35000, Loss: 4.5664, Loss_eval: 4.6456, Learning Rate: 5.000000e-05


Step 35500, Loss: 4.5195, Loss_eval: 4.6461, Learning Rate: 5.000000e-05


Step 36000, Loss: 4.7337, Loss_eval: 4.6189, Learning Rate: 5.000000e-05


Step 36500, Loss: 4.4476, Loss_eval: 4.6440, Learning Rate: 5.000000e-05


Step 37000, Loss: 4.4613, Loss_eval: 4.6312, Learning Rate: 5.000000e-05


Step 37500, Loss: 4.6457, Loss_eval: 4.5857, Learning Rate: 5.000000e-05


Step 38000, Loss: 4.4831, Loss_eval: 4.6431, Learning Rate: 5.000000e-05


Step 38500, Loss: 4.3168, Loss_eval: 4.5873, Learning Rate: 5.000000e-05


Step 39000, Loss: 4.3765, Loss_eval: 4.5835, Learning Rate: 5.000000e-05


Step 39500, Loss: 4.6367, Loss_eval: 4.5676, Learning Rate: 5.000000e-05


Step 40000, Loss: 4.6316, Loss_eval: 4.5941, Learning Rate: 5.000000e-05


Step 40500, Loss: 4.3546, Loss_eval: 4.5840, Learning Rate: 5.000000e-05


Step 41000, Loss: 4.7713, Loss_eval: 4.5826, Learning Rate: 5.000000e-05


Step 41500, Loss: 4.5722, Loss_eval: 4.5155, Learning Rate: 5.000000e-05


Step 42000, Loss: 4.3492, Loss_eval: 4.5637, Learning Rate: 5.000000e-05


Step 42500, Loss: 4.2942, Loss_eval: 4.5669, Learning Rate: 5.000000e-05


Step 43000, Loss: 4.6994, Loss_eval: 4.5550, Learning Rate: 5.000000e-05


Step 43500, Loss: 4.3649, Loss_eval: 4.5375, Learning Rate: 5.000000e-05


Epoch 2/3


Training:   0%|          | 0/43829 [00:00<?, ?it/s]

Step 500, Loss: 4.4295, Loss_eval: 4.5012, Learning Rate: 5.000000e-05


Step 1000, Loss: 4.3737, Loss_eval: 4.4995, Learning Rate: 5.000000e-05


Step 1500, Loss: 4.3032, Loss_eval: 4.5424, Learning Rate: 5.000000e-05


Step 2000, Loss: 4.3883, Loss_eval: 4.5278, Learning Rate: 5.000000e-05


Step 2500, Loss: 4.3775, Loss_eval: 4.4602, Learning Rate: 5.000000e-05


Step 3000, Loss: 4.5187, Loss_eval: 4.4878, Learning Rate: 5.000000e-05


Step 3500, Loss: 4.4398, Loss_eval: 4.4523, Learning Rate: 5.000000e-05


Step 4000, Loss: 4.4055, Loss_eval: 4.4550, Learning Rate: 5.000000e-05


Step 4500, Loss: 4.5226, Loss_eval: 4.4149, Learning Rate: 5.000000e-05


Step 5000, Loss: 4.4129, Loss_eval: 4.4369, Learning Rate: 5.000000e-05


Step 5500, Loss: 4.3217, Loss_eval: 4.4807, Learning Rate: 5.000000e-05


Step 6000, Loss: 4.6252, Loss_eval: 4.4610, Learning Rate: 5.000000e-05


Step 6500, Loss: 4.3536, Loss_eval: 4.4602, Learning Rate: 5.000000e-05


Step 7000, Loss: 4.3111, Loss_eval: 4.4109, Learning Rate: 5.000000e-05


Step 7500, Loss: 4.4772, Loss_eval: 4.4066, Learning Rate: 5.000000e-05


Step 8000, Loss: 4.5573, Loss_eval: 4.4258, Learning Rate: 5.000000e-05


Step 8500, Loss: 4.2901, Loss_eval: 4.4382, Learning Rate: 5.000000e-05


Step 9000, Loss: 4.2873, Loss_eval: 4.3843, Learning Rate: 5.000000e-05


Step 9500, Loss: 4.3039, Loss_eval: 4.3926, Learning Rate: 5.000000e-05


Step 10000, Loss: 4.4860, Loss_eval: 4.3623, Learning Rate: 5.000000e-05


Step 10500, Loss: 4.2400, Loss_eval: 4.3700, Learning Rate: 5.000000e-05


Step 11000, Loss: 4.1858, Loss_eval: 4.3294, Learning Rate: 5.000000e-05


Step 11500, Loss: 4.3512, Loss_eval: 4.3846, Learning Rate: 5.000000e-05


Step 12000, Loss: 4.1778, Loss_eval: 4.3898, Learning Rate: 5.000000e-05


Step 12500, Loss: 4.3893, Loss_eval: 4.3037, Learning Rate: 5.000000e-05


Step 13000, Loss: 4.3858, Loss_eval: 4.2808, Learning Rate: 5.000000e-05


Step 13500, Loss: 4.4144, Loss_eval: 4.3671, Learning Rate: 5.000000e-05


Step 14000, Loss: 4.5626, Loss_eval: 4.3346, Learning Rate: 5.000000e-05


Step 14500, Loss: 4.2271, Loss_eval: 4.3645, Learning Rate: 5.000000e-05


Step 15000, Loss: 4.4288, Loss_eval: 4.3094, Learning Rate: 5.000000e-05


Step 15500, Loss: 4.2845, Loss_eval: 4.3231, Learning Rate: 5.000000e-05


Step 16000, Loss: 4.0761, Loss_eval: 4.3176, Learning Rate: 5.000000e-05


Step 16500, Loss: 4.3528, Loss_eval: 4.2958, Learning Rate: 5.000000e-05


Step 17000, Loss: 4.1051, Loss_eval: 4.2628, Learning Rate: 5.000000e-05


Step 17500, Loss: 4.2334, Loss_eval: 4.2460, Learning Rate: 5.000000e-05


Step 18000, Loss: 4.0458, Loss_eval: 4.2945, Learning Rate: 5.000000e-05


Step 18500, Loss: 4.3382, Loss_eval: 4.3386, Learning Rate: 5.000000e-05


Step 19000, Loss: 4.2532, Loss_eval: 4.2609, Learning Rate: 5.000000e-05


Step 19500, Loss: 4.0943, Loss_eval: 4.2722, Learning Rate: 5.000000e-05


Step 20000, Loss: 4.1766, Loss_eval: 4.2790, Learning Rate: 5.000000e-05


Step 20500, Loss: 4.1705, Loss_eval: 4.2079, Learning Rate: 5.000000e-05


Step 21000, Loss: 4.2142, Loss_eval: 4.2239, Learning Rate: 5.000000e-05


Step 21500, Loss: 4.0439, Loss_eval: 4.2399, Learning Rate: 5.000000e-05


Step 22000, Loss: 4.4342, Loss_eval: 4.2481, Learning Rate: 5.000000e-05


Step 22500, Loss: 4.3639, Loss_eval: 4.2383, Learning Rate: 5.000000e-05


Step 23000, Loss: 4.0829, Loss_eval: 4.2325, Learning Rate: 5.000000e-05


Step 23500, Loss: 4.1851, Loss_eval: 4.2433, Learning Rate: 5.000000e-05


Step 24000, Loss: 3.9355, Loss_eval: 4.2249, Learning Rate: 5.000000e-05


Step 24500, Loss: 4.1010, Loss_eval: 4.2102, Learning Rate: 5.000000e-05


Step 25000, Loss: 4.1785, Loss_eval: 4.2210, Learning Rate: 5.000000e-05


Step 25500, Loss: 4.1764, Loss_eval: 4.1882, Learning Rate: 5.000000e-05


Step 26000, Loss: 3.9074, Loss_eval: 4.2490, Learning Rate: 5.000000e-05


Step 26500, Loss: 4.1034, Loss_eval: 4.1915, Learning Rate: 5.000000e-05


Step 27000, Loss: 3.9955, Loss_eval: 4.1733, Learning Rate: 5.000000e-05


Step 27500, Loss: 4.2833, Loss_eval: 4.1785, Learning Rate: 5.000000e-05


Step 28000, Loss: 4.0944, Loss_eval: 4.2002, Learning Rate: 5.000000e-05


Step 28500, Loss: 4.1964, Loss_eval: 4.1579, Learning Rate: 5.000000e-05


Step 29000, Loss: 4.1207, Loss_eval: 4.1373, Learning Rate: 5.000000e-05


Step 29500, Loss: 4.2304, Loss_eval: 4.1603, Learning Rate: 5.000000e-05


Step 30000, Loss: 4.3149, Loss_eval: 4.1531, Learning Rate: 5.000000e-05


Step 30500, Loss: 4.1108, Loss_eval: 4.1063, Learning Rate: 5.000000e-05


Step 31000, Loss: 4.0999, Loss_eval: 4.1336, Learning Rate: 5.000000e-05


Step 31500, Loss: 3.9035, Loss_eval: 4.1374, Learning Rate: 5.000000e-05


Step 32000, Loss: 3.9289, Loss_eval: 4.1335, Learning Rate: 5.000000e-05


Step 32500, Loss: 4.1522, Loss_eval: 4.1386, Learning Rate: 5.000000e-05


Step 33000, Loss: 4.2066, Loss_eval: 4.1201, Learning Rate: 5.000000e-05


Step 33500, Loss: 3.8598, Loss_eval: 4.1917, Learning Rate: 5.000000e-05


Step 34000, Loss: 3.9975, Loss_eval: 4.1540, Learning Rate: 5.000000e-05


Step 34500, Loss: 4.2082, Loss_eval: 4.1479, Learning Rate: 5.000000e-05


Step 35000, Loss: 4.0363, Loss_eval: 4.1285, Learning Rate: 5.000000e-05


Step 35500, Loss: 3.8399, Loss_eval: 4.1113, Learning Rate: 5.000000e-05


Step 36000, Loss: 4.0533, Loss_eval: 4.1469, Learning Rate: 5.000000e-05


Step 36500, Loss: 3.8747, Loss_eval: 4.1198, Learning Rate: 5.000000e-05


Step 37000, Loss: 3.9014, Loss_eval: 4.1403, Learning Rate: 5.000000e-05


Step 37500, Loss: 3.9277, Loss_eval: 4.0786, Learning Rate: 5.000000e-05


Step 38000, Loss: 3.7770, Loss_eval: 4.0587, Learning Rate: 5.000000e-05


Step 38500, Loss: 3.9337, Loss_eval: 4.0698, Learning Rate: 5.000000e-05


Step 39000, Loss: 3.9823, Loss_eval: 4.0988, Learning Rate: 5.000000e-05


Step 39500, Loss: 3.9524, Loss_eval: 4.1186, Learning Rate: 5.000000e-05


Step 40000, Loss: 4.1888, Loss_eval: 4.0380, Learning Rate: 5.000000e-05


Step 40500, Loss: 3.7979, Loss_eval: 4.0724, Learning Rate: 5.000000e-05


Step 41000, Loss: 3.7169, Loss_eval: 4.0599, Learning Rate: 5.000000e-05


Step 41500, Loss: 3.9460, Loss_eval: 4.0568, Learning Rate: 5.000000e-05


Step 42000, Loss: 4.1348, Loss_eval: 4.1087, Learning Rate: 5.000000e-05


Step 42500, Loss: 4.1423, Loss_eval: 4.0481, Learning Rate: 5.000000e-05


Step 43000, Loss: 3.9486, Loss_eval: 4.0623, Learning Rate: 5.000000e-05


Step 43500, Loss: 3.9295, Loss_eval: 3.9814, Learning Rate: 5.000000e-05


Epoch 3/3


Training:   0%|          | 0/43829 [00:00<?, ?it/s]

Step 500, Loss: 3.8193, Loss_eval: 4.0422, Learning Rate: 5.000000e-05


Step 1000, Loss: 3.7839, Loss_eval: 4.0809, Learning Rate: 5.000000e-05


Step 1500, Loss: 3.9106, Loss_eval: 4.0744, Learning Rate: 5.000000e-05


Step 2000, Loss: 3.9299, Loss_eval: 4.0761, Learning Rate: 5.000000e-05


Step 2500, Loss: 3.8737, Loss_eval: 4.0926, Learning Rate: 5.000000e-05


Step 3000, Loss: 3.9174, Loss_eval: 4.0856, Learning Rate: 5.000000e-05


Step 3500, Loss: 3.9082, Loss_eval: 4.0295, Learning Rate: 5.000000e-05


Step 4000, Loss: 4.1155, Loss_eval: 4.0134, Learning Rate: 5.000000e-05


Step 4500, Loss: 3.7963, Loss_eval: 3.9753, Learning Rate: 5.000000e-05


Step 5000, Loss: 3.7856, Loss_eval: 3.9826, Learning Rate: 5.000000e-05


Step 5500, Loss: 3.7037, Loss_eval: 4.0109, Learning Rate: 5.000000e-05


Step 6000, Loss: 4.1112, Loss_eval: 4.0412, Learning Rate: 5.000000e-05


Step 6500, Loss: 3.7108, Loss_eval: 3.9972, Learning Rate: 5.000000e-05


Step 7000, Loss: 3.8451, Loss_eval: 4.0317, Learning Rate: 5.000000e-05


Step 7500, Loss: 3.8105, Loss_eval: 3.9873, Learning Rate: 5.000000e-05


Step 8000, Loss: 3.7187, Loss_eval: 4.0681, Learning Rate: 5.000000e-05


Step 8500, Loss: 3.7293, Loss_eval: 4.0080, Learning Rate: 5.000000e-05


Step 9000, Loss: 3.8941, Loss_eval: 4.0149, Learning Rate: 5.000000e-05


Step 9500, Loss: 4.0022, Loss_eval: 4.0375, Learning Rate: 5.000000e-05


Step 10000, Loss: 4.0091, Loss_eval: 3.9835, Learning Rate: 5.000000e-05


Step 10500, Loss: 3.8466, Loss_eval: 3.9966, Learning Rate: 5.000000e-05


Step 11000, Loss: 3.9740, Loss_eval: 4.0203, Learning Rate: 5.000000e-05


Step 11500, Loss: 3.9396, Loss_eval: 4.0037, Learning Rate: 5.000000e-05


Step 12000, Loss: 3.8340, Loss_eval: 3.9729, Learning Rate: 5.000000e-05


Step 12500, Loss: 3.8190, Loss_eval: 4.0143, Learning Rate: 5.000000e-05


Step 13000, Loss: 3.7191, Loss_eval: 3.9923, Learning Rate: 5.000000e-05


Step 13500, Loss: 3.8161, Loss_eval: 3.9242, Learning Rate: 5.000000e-05


Step 14000, Loss: 3.8894, Loss_eval: 3.9849, Learning Rate: 5.000000e-05


Step 14500, Loss: 4.0159, Loss_eval: 3.9551, Learning Rate: 5.000000e-05


Step 15000, Loss: 3.8045, Loss_eval: 3.9920, Learning Rate: 5.000000e-05


Step 15500, Loss: 3.9330, Loss_eval: 4.0308, Learning Rate: 5.000000e-05


Step 16000, Loss: 3.8246, Loss_eval: 3.9817, Learning Rate: 5.000000e-05


Step 16500, Loss: 3.7952, Loss_eval: 3.9392, Learning Rate: 5.000000e-05


Step 17000, Loss: 3.7230, Loss_eval: 3.9489, Learning Rate: 5.000000e-05


Step 17500, Loss: 3.8379, Loss_eval: 3.9945, Learning Rate: 5.000000e-05


Step 18000, Loss: 3.6229, Loss_eval: 3.9447, Learning Rate: 5.000000e-05


Step 18500, Loss: 4.0099, Loss_eval: 3.8995, Learning Rate: 5.000000e-05


Step 19000, Loss: 3.8395, Loss_eval: 3.9560, Learning Rate: 5.000000e-05


Step 19500, Loss: 3.7351, Loss_eval: 3.9446, Learning Rate: 5.000000e-05


Step 20000, Loss: 3.8076, Loss_eval: 3.9896, Learning Rate: 5.000000e-05


Step 20500, Loss: 3.8363, Loss_eval: 3.9917, Learning Rate: 5.000000e-05


Step 21000, Loss: 3.9068, Loss_eval: 3.9639, Learning Rate: 5.000000e-05


Step 21500, Loss: 3.6975, Loss_eval: 3.9984, Learning Rate: 5.000000e-05


Step 22000, Loss: 3.9070, Loss_eval: 3.9625, Learning Rate: 5.000000e-05


Step 22500, Loss: 3.8752, Loss_eval: 3.9771, Learning Rate: 5.000000e-05


Step 23000, Loss: 3.8687, Loss_eval: 3.9399, Learning Rate: 5.000000e-05


Step 23500, Loss: 3.7699, Loss_eval: 3.9240, Learning Rate: 5.000000e-05


Step 24000, Loss: 3.8042, Loss_eval: 3.9316, Learning Rate: 5.000000e-05


Step 24500, Loss: 3.7641, Loss_eval: 3.9258, Learning Rate: 5.000000e-05


Step 25000, Loss: 3.8413, Loss_eval: 3.9241, Learning Rate: 5.000000e-05


Step 25500, Loss: 3.6127, Loss_eval: 3.9422, Learning Rate: 5.000000e-05


Step 26000, Loss: 3.8909, Loss_eval: 3.9179, Learning Rate: 5.000000e-05


Step 26500, Loss: 3.6227, Loss_eval: 3.8812, Learning Rate: 5.000000e-05


Step 27000, Loss: 3.8742, Loss_eval: 3.9398, Learning Rate: 5.000000e-05


Step 27500, Loss: 3.7681, Loss_eval: 3.9343, Learning Rate: 5.000000e-05


Step 28000, Loss: 3.5509, Loss_eval: 3.8994, Learning Rate: 5.000000e-05


Step 28500, Loss: 3.5064, Loss_eval: 3.9058, Learning Rate: 5.000000e-05


Step 29000, Loss: 3.7391, Loss_eval: 3.9095, Learning Rate: 5.000000e-05


Step 29500, Loss: 3.6449, Loss_eval: 3.8433, Learning Rate: 5.000000e-05


Step 30000, Loss: 3.7030, Loss_eval: 3.9308, Learning Rate: 5.000000e-05


Step 30500, Loss: 3.8782, Loss_eval: 3.9332, Learning Rate: 5.000000e-05


Step 31000, Loss: 3.8303, Loss_eval: 3.9034, Learning Rate: 5.000000e-05


Step 31500, Loss: 3.6351, Loss_eval: 3.9049, Learning Rate: 5.000000e-05


Step 32000, Loss: 3.5957, Loss_eval: 3.8967, Learning Rate: 5.000000e-05


Step 32500, Loss: 3.8118, Loss_eval: 3.9235, Learning Rate: 5.000000e-05


Step 33000, Loss: 3.6458, Loss_eval: 3.8797, Learning Rate: 5.000000e-05


Step 33500, Loss: 3.6332, Loss_eval: 3.8454, Learning Rate: 5.000000e-05


Step 34000, Loss: 3.8299, Loss_eval: 3.8882, Learning Rate: 5.000000e-05


Step 34500, Loss: 3.6770, Loss_eval: 3.8773, Learning Rate: 5.000000e-05


Step 35000, Loss: 3.7932, Loss_eval: 3.8383, Learning Rate: 5.000000e-05


Step 35500, Loss: 3.5732, Loss_eval: 3.8602, Learning Rate: 5.000000e-05


Step 36000, Loss: 3.6815, Loss_eval: 3.8860, Learning Rate: 5.000000e-05


Step 36500, Loss: 3.6766, Loss_eval: 3.9188, Learning Rate: 5.000000e-05


Step 37000, Loss: 3.8250, Loss_eval: 3.8587, Learning Rate: 5.000000e-05


Step 37500, Loss: 4.1378, Loss_eval: 3.8290, Learning Rate: 5.000000e-05


Step 38000, Loss: 3.8365, Loss_eval: 3.8683, Learning Rate: 5.000000e-05


Step 38500, Loss: 3.7461, Loss_eval: 3.8512, Learning Rate: 5.000000e-05


Step 39000, Loss: 3.5301, Loss_eval: 3.9043, Learning Rate: 5.000000e-05


Step 39500, Loss: 3.7674, Loss_eval: 3.9325, Learning Rate: 5.000000e-05


Step 40000, Loss: 3.9077, Loss_eval: 3.8176, Learning Rate: 5.000000e-05


Step 40500, Loss: 3.7946, Loss_eval: 3.8622, Learning Rate: 5.000000e-05


Step 41000, Loss: 3.7797, Loss_eval: 3.8884, Learning Rate: 5.000000e-05


Step 41500, Loss: 3.6415, Loss_eval: 3.8390, Learning Rate: 5.000000e-05


Step 42000, Loss: 3.7882, Loss_eval: 3.8880, Learning Rate: 5.000000e-05


Step 42500, Loss: 3.7014, Loss_eval: 3.8025, Learning Rate: 5.000000e-05


Step 43000, Loss: 3.7732, Loss_eval: 3.9424, Learning Rate: 5.000000e-05


Step 43500, Loss: 3.9184, Loss_eval: 3.8089, Learning Rate: 5.000000e-05
