## LLM

In [1]:
#import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
#from src.tokenizer import TokenizerBPE, fuse_tokenized_corpus, chunk_corpus

#import matplotlib as plt
import os
import time
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import Transformer
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Categorical

print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701


In [2]:
corpus_train = pkl.load(open('corpus/corpus_CNN_24k_whitespace_train_numpy', 'rb'))
corpus_train = torch.tensor(corpus_train, dtype=torch.int64)
data_train = TensorDataset(corpus_train)


corpus_test = pkl.load(open('corpus/corpus_CNN_24k_whitespace_test_numpy', 'rb'))
corpus_test = torch.tensor(corpus_test, dtype=torch.int64)
data_test = TensorDataset(corpus_test)

In [3]:
corpus_train.shape[0]*corpus_train.shape[1]

159109120

In [4]:
loader_train = DataLoader(
    data_train,
    batch_size=3,
    shuffle=True,       # shuffle every epoch
    drop_last=False     # whether to drop the tail batch if smaller than batch_size
)

loader_test = DataLoader(
    data_test,
    batch_size=8,
    shuffle=True,      
    drop_last=False
)     

In [12]:
embed_dim = 64*14
ff_dim = 4*embed_dim
heads = 14
tf_blocks = 14

transformer = Transformer(
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=24072,
    max_seq_len=1024,
    dropout=0.1,
    start_token_id=24070,
    use_weight_tying=True
).to(device)

In [13]:
num_params = 0
for parameter in transformer.parameters():
    temp = 1
    for dim in parameter.shape:
        temp *= dim

    num_params += temp

print(num_params)

157496072


In [7]:
#transformer = pkl.load(open("corpus/model_large.model","rb"))

In [14]:
accum_steps = 40

@torch.compile
def forward_and_loss(model, batch, criterion):
    model.train()
    #batch is a tensor of shape [batch, seq]
    src, tgt = batch[:, :-1].to(device), batch[:, 1:].to(device)
    logits = model(src)
    return criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))


def train_step(model, batch, criterion, optimizer, scaler, scheduler, i):
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = forward_and_loss(model, batch, criterion)

    scaler.scale(loss/accum_steps).backward()

    if (i+1)%accum_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

    return loss

In [15]:
param_optimizer = list(transformer.named_parameters())

no_decay = ['bias', 'LayerNorm.weight'] # Parameters to exclude from weight decay

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01}, # Apply weight decay to these parameters
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0} # No weight decay for these parameters
]

In [16]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)
scaler = torch.amp.GradScaler("cuda")

loss_train = []

num_epochs      = 1
steps_per_epoch = len(loader_train)
total_steps     = num_epochs * steps_per_epoch
warmup_steps    = 100

def lr_lambda(step):
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return 1.0

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In [None]:
optimizer.zero_grad()
for e in range(4):
    loss_temp = 0
    for i, (batch,) in tqdm(enumerate(loader_train), total=len(loader_train), desc="Training"):
        
            batch = batch.to(device)
            loss = train_step(transformer, batch, criterion, optimizer, scaler, scheduler, i)
            loss_temp += loss
            if (i+1)%accum_steps == 0:
                lr = scheduler.get_last_lr()[0]
                print(f"Step {i}, Loss: {loss_temp/accum_steps:.4f}, LR: {lr}")
                loss_train.append(loss_temp/accum_steps)
                loss_temp = 0

            if (i+1)%1000 == 0:
                 pkl.dump(transformer, open(f"corpus/model_larger.model","wb"))

Training:   0%|          | 0/51794 [00:00<?, ?it/s]

Step 39, Loss: 10.5428, LR: 5.000000000000001e-07
Step 79, Loss: 10.5535, LR: 1.0000000000000002e-06
Step 119, Loss: 10.5162, LR: 1.5e-06
Step 159, Loss: 10.4400, LR: 2.0000000000000003e-06
Step 199, Loss: 10.3382, LR: 2.5e-06
Step 239, Loss: 10.1983, LR: 3e-06
Step 279, Loss: 10.0502, LR: 3.5000000000000004e-06
Step 319, Loss: 9.8515, LR: 4.000000000000001e-06
Step 359, Loss: 9.6711, LR: 4.5e-06
Step 399, Loss: 9.4906, LR: 5e-06
Step 439, Loss: 9.3000, LR: 5.500000000000001e-06
Step 479, Loss: 9.1879, LR: 6e-06
Step 519, Loss: 9.1196, LR: 6.5000000000000004e-06
Step 559, Loss: 9.0372, LR: 7.000000000000001e-06
Step 599, Loss: 8.8906, LR: 7.5e-06
Step 639, Loss: 8.7289, LR: 8.000000000000001e-06
Step 679, Loss: 8.7236, LR: 8.500000000000002e-06
Step 719, Loss: 8.6652, LR: 9e-06
Step 759, Loss: 8.6429, LR: 9.5e-06
Step 799, Loss: 8.5536, LR: 1e-05
Step 839, Loss: 8.4744, LR: 1.05e-05
Step 879, Loss: 8.4199, LR: 1.1000000000000001e-05
Step 919, Loss: 8.3764, LR: 1.1500000000000002e-05
St

In [None]:
tokens = torch.tensor([[24070]], dtype=torch.long).to(device)

In [None]:
import textwrap
import ipywidgets as widgets
from IPython.display import display

wrapper = textwrap.TextWrapper(width=80)

# create a read-only text area
ta = widgets.Textarea(
    value="",
    layout=widgets.Layout(width='80ch', height='20em'),
    disabled=True
)
display(ta)


T = 1
k = 50

#torch.random.torch.manual_seed(42) 

for i in range(1024):
    logits = transformer(tokens)[0, -1:]
    topk_vals, _    = torch.topk(logits, k=k)
    #print(topk_vals)
    kth_value       = topk_vals[:,-1]

    logits = torch.where(logits >= kth_value, logits, -torch.inf)
    dist = Categorical(logits=logits/T)
    idx = dist.sample()
    tokens = torch.cat([tokens, idx.reshape(1,1)], dim=1)
    #print(tokens.shape)
    text = ""
    for i in tokens[0]:
        s = table[int(i)]
        text = text + s

    ta.value = wrapper.fill(text.replace("\n", " "))  # this updates in-place

    if idx[0] == 24071:
        break

Textarea(value='', disabled=True, layout=Layout(height='20em', width='80ch'))

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 15.69 GiB of which 72.75 MiB is free. Including non-PyTorch memory, this process has 14.95 GiB memory in use. Of the allocated memory 13.84 GiB is allocated by PyTorch, and 835.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
text = ""
for i in corpus_train[4]:
    s = table[int(i)]
    text = text + s
print(text)

 york and minnesota will be able to see him in action at 8 p.m. et august 26 on nbc, he's not making yet another comeback to the nfl. favre was on the field in grant park in downtown chicago recently with other nfl greats such as jerry rice, michael irvin, deion sanders, marcus allen and barry sanders, playing in the annual ea sports "madden" pigskin pro am flag football game. favre said he was happy to be back on the gridiron in organized play. "it's kind of exciting," favre said. "obviously, it's different. some of these guys i've played against and had some battles (with) over the years, but it's always nice to regroup and swap some old stories. it's always fun to play football." before the high-scoring affair, which included video game-style gold balls for 12-point scores and special bonuses for touchdowns thrown in certain spots in the end zone, the 11-time pro bowler wasn't promising a return to his glory days. "i wouldn't expect too much, but we've got a lot of pride. no one's a