## LLM

In [1]:
#import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
#from src.tokenizer import TokenizerBPE, fuse_tokenized_corpus, chunk_corpus

import os
import time
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import Transformer
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Categorical

print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701


In [2]:
corpus_train = pkl.load(open('corpus/corpus_CNN_24k_whitespace_train_numpy', 'rb'))
corpus_train = torch.tensor(corpus_train, dtype=torch.int64)
corpus_train = TensorDataset(corpus_train)


corpus_test = pkl.load(open('corpus/corpus_CNN_24k_whitespace_test_numpy', 'rb'))
corpus_test = torch.tensor(corpus_test, dtype=torch.int64)
corpus_test = TensorDataset(corpus_test)

In [3]:
loader_train = DataLoader(
    corpus_train,
    batch_size=8,
    shuffle=True,       # shuffle every epoch
    drop_last=False     # whether to drop the tail batch if smaller than batch_size
)

loader_test = DataLoader(
    corpus_test,
    batch_size=8,
    shuffle=True,      
    drop_last=False
)     

In [16]:
embed_dim = 64*10
ff_dim = 4*embed_dim
heads = 10
tf_blocks = 10


transformer = Transformer(
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=24072,
    max_seq_len=1024,
    dropout=0.1,
    pad_token_id=None,
    start_token_id=24070
).to(device)

In [4]:
transformer = pkl.load(open("model.model","rb"))

In [19]:
@torch.compile
def forward_and_loss(model, batch, criterion):
    model.train()
    #batch is a tensor of shape [batch, seq]
    src, tgt = batch[:, :-1].to(device), batch[:, 1:].to(device)
    logits = model(src)
    return criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))

# 2) Do the zero_grad/backward/step around it
def train_step(model, batch, optimizer, criterion, i):
    # zero the grads in Python (not inside the compiled graph)
    #optimizer.zero_grad()

    # call compiled forward+loss
    loss = forward_and_loss(model, batch, criterion)

    # backward & step in Python
    loss.backward()

    if i%8 == 0:
        optimizer.step()
        optimizer.zero_grad()

    return loss

In [None]:
optimizer = torch.optim.AdamW(transformer.parameters(), lr=5e-5, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()
optimizer.zero_grad()
for e in range(2):
    for i, (batch,) in tqdm(enumerate(loader_train), total=len(loader_train), desc="Training"):
        batch = batch.to(device)
        loss = train_step(transformer, batch, optimizer, criterion, i)
        if i%8 == 0:
            print(f"Step {i}, Loss: {loss:.4f}")

Training:   0%|          | 0/19423 [00:00<?, ?it/s]

Step 0, Loss: 0.0629
Step 8, Loss: 0.0609
Step 16, Loss: 0.0563
Step 24, Loss: 0.0448
Step 32, Loss: 0.0447
Step 40, Loss: 0.0353
Step 48, Loss: 0.0747
Step 56, Loss: 0.0514
Step 64, Loss: 0.0742
Step 72, Loss: 0.0405
Step 80, Loss: 0.0695
Step 88, Loss: 0.0502
Step 96, Loss: 0.0578
Step 104, Loss: 0.0441
Step 112, Loss: 0.0408
Step 120, Loss: 0.0635
Step 128, Loss: 0.0340
Step 136, Loss: 0.0678
Step 144, Loss: 0.0366
Step 152, Loss: 0.0400
Step 160, Loss: 0.0544
Step 168, Loss: 0.0496
Step 176, Loss: 0.0514
Step 184, Loss: 0.0583
Step 192, Loss: 0.0602
Step 200, Loss: 0.0463
Step 208, Loss: 0.0559
Step 216, Loss: 0.0444
Step 224, Loss: 0.0427
Step 232, Loss: 0.0439
Step 240, Loss: 0.0509
Step 248, Loss: 0.0626
Step 256, Loss: 0.0613
Step 264, Loss: 0.0817
Step 272, Loss: 0.0603
Step 280, Loss: 0.0496
Step 288, Loss: 0.0512
Step 296, Loss: 0.0532
Step 304, Loss: 0.0514
Step 312, Loss: 0.0541
Step 320, Loss: 0.0532
Step 328, Loss: 0.0448
Step 336, Loss: 0.0672
Step 344, Loss: 0.0435
Ste

In [11]:
table = pkl.load(open("table_CNN_24k_whitespace", "rb"))

In [15]:
tokens = torch.tensor([[24070]], dtype=torch.long).to(device)

In [None]:
import textwrap
import ipywidgets as widgets
from IPython.display import display

wrapper = textwrap.TextWrapper(width=80)

# create a read-only text area
ta = widgets.Textarea(
    value="",
    layout=widgets.Layout(width='80ch', height='20em'),
    disabled=True
)
display(ta)


T = 1
k = 50

torch.random.seed(42)

for i in range(512):
    logits = transformer(tokens)[0, -1:]
    topk_vals, _    = torch.topk(logits, k=k)
    #print(topk_vals)
    kth_value       = topk_vals[:,-1]

    logits = torch.where(logits >= kth_value, logits, -torch.inf)
    dist = Categorical(logits=logits/T)
    idx = dist.sample()
    tokens = torch.cat([tokens, idx.reshape(1,1)], dim=1)
    #print(tokens.shape)
    text = ""
    for i in tokens[0]:
        s = table[int(i)]
        text = text + s

    ta.value = wrapper.fill(text)  # this updates in-place

    if idx[0] == 24071:
        break

Textarea(value='', disabled=True, layout=Layout(height='20em', width='80ch'))

In [20]:
pkl.dump(transformer, open("model.model","wb"))