## LLM

In [None]:
#import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
#from src.tokenizer import TokenizerBPE, fuse_tokenized_corpus, chunk_corpus

import matplotlib as plt
import os
import time
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import Transformer
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Categorical

print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701


In [39]:
corpus_train = pkl.load(open('corpus/corpus_CNN_24k_whitespace_train_numpy', 'rb'))
corpus_train = torch.tensor(corpus_train, dtype=torch.int64)
data_train = TensorDataset(corpus_train)


corpus_test = pkl.load(open('corpus/corpus_CNN_24k_whitespace_test_numpy', 'rb'))
corpus_test = torch.tensor(corpus_test, dtype=torch.int64)
data_test = TensorDataset(corpus_test)

In [40]:
loader_train = DataLoader(
    data_train,
    batch_size=7,
    shuffle=True,       # shuffle every epoch
    drop_last=False     # whether to drop the tail batch if smaller than batch_size
)

loader_test = DataLoader(
    data_test,
    batch_size=8,
    shuffle=True,      
    drop_last=False
)     

In [41]:
embed_dim = 64*14
ff_dim = 4*embed_dim
heads = 14
tf_blocks = 14


transformer = Transformer(
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=24072,
    max_seq_len=1024,
    dropout=0.1,
    pad_token_id=None,
    start_token_id=24070
).to(device)

with torch.no_grad():
    # Initialize embedding weights properly
    torch.nn.init.normal_(transformer.word_embed.weight, mean=0.0, std=0.02)
    #torch.nn.init.normal_(transformer.pos_embed.weight, mean=0.0, std=0.02)

In [47]:
transformer = pkl.load(open("corpus/model.model","rb"))

In [48]:
accum_steps = 5

@torch.compile
def forward_and_loss(model, batch, criterion):
    model.train()
    #batch is a tensor of shape [batch, seq]
    src, tgt = batch[:, :-1].to(device), batch[:, 1:].to(device)
    logits = model(src)
    return criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))


def train_step(model, batch, criterion, optimizer, scaler, scheduler, i):
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = forward_and_loss(model, batch, criterion)

    scaler.scale(loss/accum_steps).backward()

    if (i+1)%accum_steps == 0:
        scaler.step(optimizer)
        scheduler.step()
        scaler.update()
        optimizer.zero_grad()

    return loss

In [49]:
param_optimizer = list(transformer.named_parameters())

no_decay = ['bias', 'LayerNorm.weight'] # Parameters to exclude from weight decay

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01}, # Apply weight decay to these parameters
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0} # No weight decay for these parameters
]

In [50]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-4)
scaler = torch.amp.GradScaler("cuda")

loss_train = []

num_epochs      = 1
steps_per_epoch = len(loader_train)
total_steps     = num_epochs * steps_per_epoch
warmup_steps    = 100

def lr_lambda(step):
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return 1.0

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In [None]:
optimizer.zero_grad()
for e in range(num_epochs):
    loss_temp = 0
    for i, (batch,) in tqdm(enumerate(loader_train), total=len(loader_train), desc="Training"):
        
            batch = batch.to(device)
            loss = train_step(transformer, batch, criterion, optimizer, scaler, scheduler, i)
            loss_temp += loss
            if (i+1)%accum_steps == 0:
                lr = scheduler.get_last_lr()[0]
                print(f"Step {i}, Loss: {loss_temp/accum_steps:.4f}, LR: {lr}")
                loss_train.append(loss_temp/accum_steps)
                loss_temp = 0


    pkl.dump(transformer, open("corpus/model2_{e}.model","wb"))

Training:   0%|          | 0/22198 [00:00<?, ?it/s]

Step 4, Loss: 7.1332, LR: 1.0000000000000002e-06
Step 9, Loss: 7.1331, LR: 2.0000000000000003e-06
Step 14, Loss: 7.1919, LR: 3e-06
Step 19, Loss: 7.2069, LR: 4.000000000000001e-06
Step 24, Loss: 7.1373, LR: 5e-06
Step 29, Loss: 7.1270, LR: 6e-06
Step 34, Loss: 7.1841, LR: 7.000000000000001e-06
Step 39, Loss: 7.1668, LR: 8.000000000000001e-06
Step 44, Loss: 7.1412, LR: 9e-06
Step 49, Loss: 7.1865, LR: 1e-05
Step 54, Loss: 7.2160, LR: 1.1000000000000001e-05
Step 59, Loss: 7.1578, LR: 1.2e-05
Step 64, Loss: 7.1456, LR: 1.3000000000000001e-05
Step 69, Loss: 7.1491, LR: 1.4000000000000001e-05
Step 74, Loss: 7.1941, LR: 1.5e-05
Step 79, Loss: 7.1642, LR: 1.6000000000000003e-05
Step 84, Loss: 7.1610, LR: 1.7000000000000003e-05
Step 89, Loss: 7.2016, LR: 1.8e-05
Step 94, Loss: 7.1664, LR: 1.9e-05
Step 99, Loss: 7.1426, LR: 2e-05
Step 104, Loss: 7.1359, LR: 2.1e-05
Step 109, Loss: 7.1659, LR: 2.2000000000000003e-05
Step 114, Loss: 7.1963, LR: 2.3000000000000003e-05
Step 119, Loss: 7.1938, LR: 2

In [None]:
plt.plot(loss_train)
plt.show()

In [12]:
table = pkl.load(open("table_CNN_24k_whitespace", "rb"))

In [33]:
tokens = torch.tensor([[24070]], dtype=torch.long).to(device)

In [35]:
import textwrap
import ipywidgets as widgets
from IPython.display import display

wrapper = textwrap.TextWrapper(width=80)

# create a read-only text area
ta = widgets.Textarea(
    value="",
    layout=widgets.Layout(width='80ch', height='20em'),
    disabled=True
)
display(ta)


T = 3
k = 50

torch.random.torch.manual_seed(43) 

for i in range(1024):
    logits = transformer(tokens)[0, -1:]
    topk_vals, _    = torch.topk(logits, k=k)
    #print(topk_vals)
    kth_value       = topk_vals[:,-1]

    logits = torch.where(logits >= kth_value, logits, -torch.inf)
    dist = Categorical(logits=logits/T)
    idx = dist.sample()
    tokens = torch.cat([tokens, idx.reshape(1,1)], dim=1)
    #print(tokens.shape)
    text = ""
    for i in tokens[0]:
        s = table[int(i)]
        text = text + s

    ta.value = wrapper.fill(text.replace("\n", " "))  # this updates in-place

    if idx[0] == 24071:
        break

Textarea(value='', disabled=True, layout=Layout(height='20em', width='80ch'))

KeyboardInterrupt: 

In [26]:
text = ""
for i in corpus_train[4]:
    s = table[int(i)]
    text = text + s
print(text)

 york and minnesota will be able to see him in action at 8 p.m. et august 26 on nbc, he's not making yet another comeback to the nfl. favre was on the field in grant park in downtown chicago recently with other nfl greats such as jerry rice, michael irvin, deion sanders, marcus allen and barry sanders, playing in the annual ea sports "madden" pigskin pro am flag football game. favre said he was happy to be back on the gridiron in organized play. "it's kind of exciting," favre said. "obviously, it's different. some of these guys i've played against and had some battles (with) over the years, but it's always nice to regroup and swap some old stories. it's always fun to play football." before the high-scoring affair, which included video game-style gold balls for 12-point scores and special bonuses for touchdowns thrown in certain spots in the end zone, the 11-time pro bowler wasn't promising a return to his glory days. "i wouldn't expect too much, but we've got a lot of pride. no one's a

In [20]:
data_train.shape

AttributeError: 'TensorDataset' object has no attribute 'shape'