## LLM

In [52]:
#import matplotlib.pyplot as plt
import random
from IPython.display import clear_output
#from src.tokenizer import TokenizerBPE, fuse_tokenized_corpus, chunk_corpus

import matplotlib as plt
import os
import time
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from src.transformer import Transformer
from torch.utils.data import TensorDataset, DataLoader
from torch.distributions import Categorical

print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

ModuleNotFoundError: No module named 'matplotlib'

In [39]:
corpus_train = pkl.load(open('corpus/corpus_CNN_24k_whitespace_train_numpy', 'rb'))
corpus_train = torch.tensor(corpus_train, dtype=torch.int64)
data_train = TensorDataset(corpus_train)


corpus_test = pkl.load(open('corpus/corpus_CNN_24k_whitespace_test_numpy', 'rb'))
corpus_test = torch.tensor(corpus_test, dtype=torch.int64)
data_test = TensorDataset(corpus_test)

In [40]:
loader_train = DataLoader(
    data_train,
    batch_size=7,
    shuffle=True,       # shuffle every epoch
    drop_last=False     # whether to drop the tail batch if smaller than batch_size
)

loader_test = DataLoader(
    data_test,
    batch_size=8,
    shuffle=True,      
    drop_last=False
)     

In [41]:
embed_dim = 64*14
ff_dim = 4*embed_dim
heads = 14
tf_blocks = 14


transformer = Transformer(
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=24072,
    max_seq_len=1024,
    dropout=0.1,
    pad_token_id=None,
    start_token_id=24070
).to(device)

with torch.no_grad():
    # Initialize embedding weights properly
    torch.nn.init.normal_(transformer.word_embed.weight, mean=0.0, std=0.02)
    #torch.nn.init.normal_(transformer.pos_embed.weight, mean=0.0, std=0.02)

In [47]:
transformer = pkl.load(open("corpus/model.model","rb"))

In [48]:
accum_steps = 5

@torch.compile
def forward_and_loss(model, batch, criterion):
    model.train()
    #batch is a tensor of shape [batch, seq]
    src, tgt = batch[:, :-1].to(device), batch[:, 1:].to(device)
    logits = model(src)
    return criterion(logits.reshape(-1, logits.size(-1)), tgt.reshape(-1))


def train_step(model, batch, criterion, optimizer, scaler, scheduler, i):
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = forward_and_loss(model, batch, criterion)

    scaler.scale(loss/accum_steps).backward()

    if (i+1)%accum_steps == 0:
        scaler.step(optimizer)
        scheduler.step()
        scaler.update()
        optimizer.zero_grad()

    return loss

In [49]:
param_optimizer = list(transformer.named_parameters())

no_decay = ['bias', 'LayerNorm.weight'] # Parameters to exclude from weight decay

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01}, # Apply weight decay to these parameters
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0} # No weight decay for these parameters
]

In [50]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-4)
scaler = torch.amp.GradScaler("cuda")

loss_train = []

num_epochs      = 1
steps_per_epoch = len(loader_train)
total_steps     = num_epochs * steps_per_epoch
warmup_steps    = 100

def lr_lambda(step):
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return 1.0

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In [69]:
optimizer.zero_grad()
for e in range(4):
    loss_temp = 0
    for i, (batch,) in tqdm(enumerate(loader_train), total=len(loader_train), desc="Training"):
        
            batch = batch.to(device)
            loss = train_step(transformer, batch, criterion, optimizer, scaler, scheduler, i)
            loss_temp += loss
            if (i+1)%accum_steps == 0:
                lr = scheduler.get_last_lr()[0]
                print(f"Step {i}, Loss: {loss_temp/accum_steps:.4f}, LR: {lr}")
                loss_train.append(loss_temp/accum_steps)
                loss_temp = 0


    pkl.dump(transformer, open("corpus/model2_{e}.model","wb"))

Training:   0%|          | 0/22198 [00:00<?, ?it/s]

Step 4, Loss: 4.2911, LR: 0.0001
Step 9, Loss: 4.3756, LR: 0.0001
Step 14, Loss: 4.4192, LR: 0.0001
Step 19, Loss: 4.4016, LR: 0.0001
Step 24, Loss: 4.3856, LR: 0.0001
Step 29, Loss: 4.4217, LR: 0.0001
Step 34, Loss: 4.3214, LR: 0.0001
Step 39, Loss: 4.3073, LR: 0.0001
Step 44, Loss: 4.3313, LR: 0.0001
Step 49, Loss: 4.2804, LR: 0.0001
Step 54, Loss: 4.4113, LR: 0.0001
Step 59, Loss: 4.3454, LR: 0.0001
Step 64, Loss: 4.4119, LR: 0.0001
Step 69, Loss: 4.4291, LR: 0.0001
Step 74, Loss: 4.3351, LR: 0.0001
Step 79, Loss: 4.4778, LR: 0.0001
Step 84, Loss: 4.3495, LR: 0.0001
Step 89, Loss: 4.3970, LR: 0.0001
Step 94, Loss: 4.3498, LR: 0.0001
Step 99, Loss: 4.4093, LR: 0.0001
Step 104, Loss: 4.3656, LR: 0.0001
Step 109, Loss: 4.1914, LR: 0.0001
Step 114, Loss: 4.4090, LR: 0.0001
Step 119, Loss: 4.3286, LR: 0.0001
Step 124, Loss: 4.2889, LR: 0.0001
Step 129, Loss: 4.4265, LR: 0.0001
Step 134, Loss: 4.4269, LR: 0.0001
Step 139, Loss: 4.3939, LR: 0.0001
Step 144, Loss: 4.4140, LR: 0.0001
Step 14

Training:   0%|          | 0/22198 [00:00<?, ?it/s]

Step 4, Loss: 4.1797, LR: 0.0001
Step 9, Loss: 4.1445, LR: 0.0001
Step 14, Loss: 4.1533, LR: 0.0001
Step 19, Loss: 4.2724, LR: 0.0001
Step 24, Loss: 4.0642, LR: 0.0001
Step 29, Loss: 4.0982, LR: 0.0001
Step 34, Loss: 4.1544, LR: 0.0001
Step 39, Loss: 4.1611, LR: 0.0001
Step 44, Loss: 4.2572, LR: 0.0001
Step 49, Loss: 4.2715, LR: 0.0001
Step 54, Loss: 4.2386, LR: 0.0001
Step 59, Loss: 4.1207, LR: 0.0001
Step 64, Loss: 4.2192, LR: 0.0001
Step 69, Loss: 4.2209, LR: 0.0001
Step 74, Loss: 4.2206, LR: 0.0001
Step 79, Loss: 4.1516, LR: 0.0001
Step 84, Loss: 4.2395, LR: 0.0001
Step 89, Loss: 4.1159, LR: 0.0001
Step 94, Loss: 4.1858, LR: 0.0001
Step 99, Loss: 4.1541, LR: 0.0001
Step 104, Loss: 4.2363, LR: 0.0001
Step 109, Loss: 4.1556, LR: 0.0001
Step 114, Loss: 4.1884, LR: 0.0001
Step 119, Loss: 4.1546, LR: 0.0001
Step 124, Loss: 4.1577, LR: 0.0001
Step 129, Loss: 4.1705, LR: 0.0001
Step 134, Loss: 4.2270, LR: 0.0001
Step 139, Loss: 4.2190, LR: 0.0001
Step 144, Loss: 4.1708, LR: 0.0001
Step 14

Training:   0%|          | 0/22198 [00:00<?, ?it/s]

Step 4, Loss: 3.9678, LR: 0.0001
Step 9, Loss: 4.1038, LR: 0.0001
Step 14, Loss: 4.1154, LR: 0.0001
Step 19, Loss: 4.0038, LR: 0.0001


KeyboardInterrupt: 

In [53]:
plt.plot(loss_train)
plt.show()

NameError: name 'plt' is not defined

In [12]:
table = pkl.load(open("table_CNN_24k_whitespace", "rb"))

In [73]:
tokens = torch.tensor([[24070]], dtype=torch.long).to(device)

In [75]:
import textwrap
import ipywidgets as widgets
from IPython.display import display

wrapper = textwrap.TextWrapper(width=80)

# create a read-only text area
ta = widgets.Textarea(
    value="",
    layout=widgets.Layout(width='80ch', height='20em'),
    disabled=True
)
display(ta)


T = 1
k = 50

#torch.random.torch.manual_seed(42) 

for i in range(1024):
    logits = transformer(tokens)[0, -1:]
    topk_vals, _    = torch.topk(logits, k=k)
    #print(topk_vals)
    kth_value       = topk_vals[:,-1]

    logits = torch.where(logits >= kth_value, logits, -torch.inf)
    dist = Categorical(logits=logits/T)
    idx = dist.sample()
    tokens = torch.cat([tokens, idx.reshape(1,1)], dim=1)
    #print(tokens.shape)
    text = ""
    for i in tokens[0]:
        s = table[int(i)]
        text = text + s

    ta.value = wrapper.fill(text.replace("\n", " "))  # this updates in-place

    if idx[0] == 24071:
        break

Textarea(value='', disabled=True, layout=Layout(height='20em', width='80ch'))

KeyboardInterrupt: 

In [26]:
text = ""
for i in corpus_train[4]:
    s = table[int(i)]
    text = text + s
print(text)

 york and minnesota will be able to see him in action at 8 p.m. et august 26 on nbc, he's not making yet another comeback to the nfl. favre was on the field in grant park in downtown chicago recently with other nfl greats such as jerry rice, michael irvin, deion sanders, marcus allen and barry sanders, playing in the annual ea sports "madden" pigskin pro am flag football game. favre said he was happy to be back on the gridiron in organized play. "it's kind of exciting," favre said. "obviously, it's different. some of these guys i've played against and had some battles (with) over the years, but it's always nice to regroup and swap some old stories. it's always fun to play football." before the high-scoring affair, which included video game-style gold balls for 12-point scores and special bonuses for touchdowns thrown in certain spots in the end zone, the 11-time pro bowler wasn't promising a return to his glory days. "i wouldn't expect too much, but we've got a lot of pride. no one's a

In [20]:
data_train.shape

AttributeError: 'TensorDataset' object has no attribute 'shape'