## LLM

In [1]:
#import matplotlib.pyplot as plt
import random
from IPython.display import clear_output

#import matplotlib as plt
import os
import time
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import Transformer
from src.utils import saver, loader
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Categorical

print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701


## Load Data

In [10]:
highlight_test_tokens = loader("corpus/cnn_dailymail_highlight_test_tokens.pkl")

In [None]:
def add_start_stop(corpus_list):
    corpus_list_new = []
    for line in tqdm(corpus_list, desc="Adding start and stop tokens"):
        corpus_list_new.append("<s>" + line + "</s>")

    return corpus_list_new

In [4]:
def batch_data(corpus, batch_length=1024, offset=None):
    """
    Splits the corpus into batches of size batch_size.
    """
    length = len(corpus)
    batches = length // batch_length
    corpus_truncated = corpus[:batches * batch_length]  # trim to a multiple of batch_length
    corpus_batched = corpus_truncated.view(-1, batch_length)  # reshape into batches

    # offset, typically set to half the batch length, allows for overlapping batches augmentation
    if offset is not None:
        corpus_offset = corpus_truncated[offset : offset - batch_length]
        corpus_offset = corpus_offset.view(-1, batch_length)  # reshape into batches
        corpus_batched = torch.cat((corpus_batched, corpus_offset), dim=0)  # concatenate the offset batches

    return corpus_batched

In [6]:
corpus_train_batched = batch_data(corpus_train, batch_length=1024, offset=512)
corpus_test_batched = batch_data(corpus_test, batch_length=1024, offset=None)

In [None]:
loader_train = DataLoader(
    corpus_train,
    batch_size=3,
    shuffle=True,       # shuffle every epoch
)
 
loader_test = DataLoader(
    corpus_test,
    batch_size=3,
    shuffle=False,      # no need to shuffle test data
)

In [29]:
embed_dim = 64*18
ff_dim = 4*embed_dim
heads = 18
tf_blocks = 18

transformer = Transformer(
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=24072,
    max_seq_len=1024,
    dropout=0.1,
    start_token_id=24070,
    use_weight_tying=True
).to(device)

In [30]:
num_params = 0
for parameter in transformer.parameters():
    temp = 1
    for dim in parameter.shape:
        temp *= dim

    num_params += temp

print(num_params)

315775752


In [31]:
#transformer = pkl.load(open("corpus/model_large.model","rb"))

In [32]:
accum_steps = 20

@torch.compile
def forward_and_loss(model, batch, criterion):
    model.train()
    #batch is a tensor of shape [batch, seq]
    src, tgt = batch[:, :-1].to(device), batch[:, 1:].to(device)
    logits = model(src)
    return criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))


def train_step(model, batch, criterion, optimizer, scaler, scheduler, i):
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = forward_and_loss(model, batch, criterion)

    scaler.scale(loss/accum_steps).backward()

    if (i+1)%accum_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

    return loss

In [33]:
param_optimizer = list(transformer.named_parameters())

no_decay = ['bias', 'LayerNorm.weight'] # Parameters to exclude from weight decay

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01}, # Apply weight decay to these parameters
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0} # No weight decay for these parameters
]

In [None]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)
scaler = torch.amp.GradScaler("cuda")

loss_train = []

num_epochs      = 1
steps_per_epoch = len(loader_train)
total_steps     = num_epochs * steps_per_epoch
warmup_steps    = 100

def lr_lambda(step):
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return 1.0

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

: 

In [None]:
optimizer.zero_grad()
for e in range(2):
    loss_temp = 0
    for i, batch in tqdm(enumerate(loader_train), total=len(loader_train), desc="Training"):
        
            batch = batch.to(device)
            loss = train_step(transformer, batch, criterion, optimizer, scaler, scheduler, i)
            loss_temp += loss
            if (i+1)%accum_steps == 0:
                lr = scheduler.get_last_lr()[0]
                print(f"Step {i}, Loss: {loss_temp/accum_steps:.4f}, LR: {lr}")
                loss_train.append(loss_temp/accum_steps)
                loss_temp = 0

            if (i+1)%1000 == 0:
                 pkl.dump(transformer, open(f"corpus/model_larger.model","wb"))

Training:   0%|          | 0/175293 [00:00<?, ?it/s]

Step 19, Loss: 10.8941, LR: 5.000000000000001e-07
Step 39, Loss: 10.8964, LR: 1.0000000000000002e-06
Step 59, Loss: 10.8320, LR: 1.5e-06
Step 79, Loss: 10.6827, LR: 2.0000000000000003e-06
Step 99, Loss: 10.4687, LR: 2.5e-06
Step 119, Loss: 10.2041, LR: 3e-06
Step 139, Loss: 9.9248, LR: 3.5000000000000004e-06
Step 159, Loss: 9.6602, LR: 4.000000000000001e-06
Step 179, Loss: 9.4812, LR: 4.5e-06
Step 199, Loss: 9.4451, LR: 5e-06
Step 219, Loss: 9.2535, LR: 5.500000000000001e-06
Step 239, Loss: 9.1591, LR: 6e-06
Step 259, Loss: 9.1408, LR: 6.5000000000000004e-06
Step 279, Loss: 9.0960, LR: 7.000000000000001e-06
Step 299, Loss: 8.9789, LR: 7.5e-06
Step 319, Loss: 8.8434, LR: 8.000000000000001e-06
Step 339, Loss: 8.7913, LR: 8.500000000000002e-06
Step 359, Loss: 8.7536, LR: 9e-06
Step 379, Loss: 8.7074, LR: 9.5e-06
Step 399, Loss: 8.6413, LR: 1e-05
Step 419, Loss: 8.5095, LR: 1.05e-05
Step 439, Loss: 8.4994, LR: 1.1000000000000001e-05
Step 459, Loss: 8.3902, LR: 1.1500000000000002e-05
Step 4

In [3]:
transformer = pkl.load(open("corpus/model_larger.model","rb"))

EOFError: Ran out of input

In [None]:
tokens = torch.tensor([[24070]], dtype=torch.long).to(device)

In [None]:
import textwrap
import ipywidgets as widgets
from IPython.display import display

wrapper = textwrap.TextWrapper(width=80)

# create a read-only text area
ta = widgets.Textarea(
    value="",
    layout=widgets.Layout(width='80ch', height='20em'),
    disabled=True
)
display(ta)


T = 1
k = 50

#torch.random.torch.manual_seed(42) 

for i in range(1024):
    logits = transformer(tokens)[0, -1:]
    topk_vals, _    = torch.topk(logits, k=k)
    #print(topk_vals)
    kth_value       = topk_vals[:,-1]

    logits = torch.where(logits >= kth_value, logits, -torch.inf)
    dist = Categorical(logits=logits/T)
    idx = dist.sample()
    tokens = torch.cat([tokens, idx.reshape(1,1)], dim=1)
    #print(tokens.shape)
    text = ""
    for i in tokens[0]:
        s = table[int(i)]
        text = text + s

    ta.value = wrapper.fill(text.replace("\n", " "))  # this updates in-place

    if idx[0] == 24071:
        break

Textarea(value='', disabled=True, layout=Layout(height='20em', width='80ch'))

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 15.69 GiB of which 72.75 MiB is free. Including non-PyTorch memory, this process has 14.95 GiB memory in use. Of the allocated memory 13.84 GiB is allocated by PyTorch, and 835.50 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
text = ""
for i in corpus_train[4]:
    s = table[int(i)]
    text = text + s
print(text)

 york and minnesota will be able to see him in action at 8 p.m. et august 26 on nbc, he's not making yet another comeback to the nfl. favre was on the field in grant park in downtown chicago recently with other nfl greats such as jerry rice, michael irvin, deion sanders, marcus allen and barry sanders, playing in the annual ea sports "madden" pigskin pro am flag football game. favre said he was happy to be back on the gridiron in organized play. "it's kind of exciting," favre said. "obviously, it's different. some of these guys i've played against and had some battles (with) over the years, but it's always nice to regroup and swap some old stories. it's always fun to play football." before the high-scoring affair, which included video game-style gold balls for 12-point scores and special bonuses for touchdowns thrown in certain spots in the end zone, the 11-time pro bowler wasn't promising a return to his glory days. "i wouldn't expect too much, but we've got a lot of pride. no one's a