## LLM

In [1]:
#import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
#from src.tokenizer import TokenizerBPE, fuse_tokenized_corpus, chunk_corpus

import os
import time
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import Transformer
from torch.utils.data import TensorDataset, DataLoader

print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701


In [3]:
corpus_train = pkl.load(open('corpus/corpus_CNN_24k_whitespace_train_numpy', 'rb'))
corpus_train = torch.tensor(corpus_train, dtype=torch.int64)
corpus_train = TensorDataset(corpus_train)


corpus_test = pkl.load(open('corpus/corpus_CNN_24k_whitespace_test_numpy', 'rb'))
corpus_test = torch.tensor(corpus_test, dtype=torch.int64)
corpus_test = TensorDataset(corpus_test)

In [8]:
loader_train = DataLoader(
    corpus_train,
    batch_size=3,
    shuffle=True,       # shuffle every epoch
    drop_last=False     # whether to drop the tail batch if smaller than batch_size
)

loader_test = DataLoader(
    corpus_test,
    batch_size=8,
    shuffle=True,      
    drop_last=False
)     

In [9]:
embed_dim = 64*10
ff_dim = 4*embed_dim
heads = 10
tf_blocks = 10


transformer = Transformer(
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=24072,
    max_seq_len=1024,
    dropout=0.1,
    pad_token_id=None,
    start_token_id=24070
).to(device)

In [10]:
def forward_and_loss(model, batch, criterion):
    model.train()
    #batch is a tensor of shape [batch, seq]
    src, tgt = batch[:, :-1].to(device), batch[:, 1:].to(device)
    logits = model(src)
    return criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))

# 2) Do the zero_grad/backward/step around it
def train_step(model, batch, optimizer, criterion):
    # zero the grads in Python (not inside the compiled graph)
    optimizer.zero_grad()

    # call compiled forward+loss
    loss = forward_and_loss(model, batch, criterion)

    # backward & step in Python
    loss.backward()
    optimizer.step()

    return loss

In [11]:
optimizer = torch.optim.AdamW(transformer.parameters(), lr=5e-5, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()

for i, (batch,) in tqdm(enumerate(loader_train), total=len(loader_train), desc="Training"):
    batch = batch.to(device)
    loss = train_step(transformer, batch, optimizer, criterion)
    
    print(f"Step {i}, Loss: {loss:.4f}")

Training:   0%|          | 0/51794 [00:00<?, ?it/s]

Step 0, Loss: 637.0493
Step 1, Loss: 602.8871
Step 2, Loss: 571.5277
Step 3, Loss: 542.0153
Step 4, Loss: 520.7668
Step 5, Loss: 508.2817
Step 6, Loss: 490.9936
Step 7, Loss: 496.5493
Step 8, Loss: 483.6722
Step 9, Loss: 470.3539
Step 10, Loss: 450.7548
Step 11, Loss: 450.7498
Step 12, Loss: 459.8244
Step 13, Loss: 479.2814
Step 14, Loss: 477.1370
Step 15, Loss: 472.9950
Step 16, Loss: 463.4001
Step 17, Loss: 453.2903
Step 18, Loss: 445.7467
Step 19, Loss: 408.5549
Step 20, Loss: 414.5872
Step 21, Loss: 412.7249
Step 22, Loss: 403.9278
Step 23, Loss: 395.9747
Step 24, Loss: 416.9826
Step 25, Loss: 400.8383
Step 26, Loss: 392.5642
Step 27, Loss: 384.4294
Step 28, Loss: 374.0632
Step 29, Loss: 380.4855
Step 30, Loss: 374.5624
Step 31, Loss: 365.6567
Step 32, Loss: 365.1854
Step 33, Loss: 359.2446
Step 34, Loss: 356.8309
Step 35, Loss: 351.9932
Step 36, Loss: 349.5576
Step 37, Loss: 341.1428
Step 38, Loss: 338.7044
Step 39, Loss: 339.5231
Step 40, Loss: 352.5169
Step 41, Loss: 338.5050
St

KeyboardInterrupt: 