## LLM

In [1]:
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from transformer_kristianwold.transformer import Transformer, Inference
from transformer_kristianwold.optimization import train_step, forward_and_loss, group_decay_parameters, save_checkpoint, load_checkpoint
from transformer_kristianwold.utils import saver, loader
from torch.utils.data import TensorDataset, DataLoader
from IPython.display import clear_output
import matplotlib.pyplot as plt
from transformer_kristianwold.analysis import EmbeddingClustering

# Report the PyTorch build and the CUDA/cuDNN environment, one "label value"
# line per entry, in the same order as before.
for _label, _value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
):
    print(_label, _value)

# Everything in this notebook is placed on the CPU, regardless of whether
# CUDA is reported as available above.
device = torch.device("cpu")

# Select the 'high' float32 matmul precision setting (see the PyTorch docs for
# the internal formats this permits).
torch.set_float32_matmul_precision('high')

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


## Load

In [12]:
# Load the tokenizer object from disk through the project's `loader` helper.
# NOTE(review): the .pkl extension suggests a pickle file — if so, only load
# it from a trusted source; confirm against the implementation of `loader`.
tokenizer = loader("../../tokenizers/cnn_tokenizer3.pkl")
# Bind `model` to None at notebook scope before `load_model` is called.
# NOTE(review): presumably this releases a previously loaded model when the
# cell is re-run — confirm intent.
model = None
def load_model(filename=None):
    """Build the Transformer and optionally restore it from a checkpoint.

    The architecture is fixed inside this function: 18 blocks, 18 heads, an
    embedding width of 64*18, a feed-forward width of 4x the embedding width,
    a maximum sequence length of 1024, dropout 0.1 and weight tying enabled.
    The vocabulary size and the start-token id are read from the
    notebook-level ``tokenizer``; the model is moved to the notebook-level
    ``device``.

    Args:
        filename: Path handed to ``load_checkpoint``. When ``None`` no
            checkpoint is read and the freshly constructed model is returned.

    Returns:
        A ``(model, loss_train_list, loss_test_list)`` tuple. Both lists are
        empty when ``filename`` is ``None``; otherwise they are the values
        returned by ``load_checkpoint``.
    """
    embed_dim = 64 * 18
    ff_dim = 4 * embed_dim
    heads = 18
    tf_blocks = 18

    model = Transformer(
        embed_dim=embed_dim,
        ff_dim=ff_dim,
        heads=heads,
        tf_blocks=tf_blocks,
        vocab_size=tokenizer.vocab_size,
        max_seq_len=1024,
        dropout=0.1,
        start_token_id=tokenizer.token_to_idx["<s>"],
        use_weight_tying=True,
    ).to(device)
    # NOTE(review): the model is returned in whatever train/eval mode
    # Transformer and load_checkpoint leave it in. With dropout=0.1 configured,
    # confirm that eval mode is set before running inference or analysis.

    loss_train_list = []
    loss_test_list = []

    # Nothing to restore: hand back the untrained model and empty histories.
    if filename is None:
        return model, loss_train_list, loss_test_list

    # load_checkpoint takes an optimizer and a scheduler as arguments, so they
    # are built here only to satisfy that call; the optimizer and scheduler it
    # returns are discarded below.
    optimizer_grouped_parameters = group_decay_parameters(
        model,
        weight_decay=0.1,
        no_decay=["bias", "LayerNorm.weight"],
    )
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)

    warmup_steps = 1000

    def lr_lambda(step):
        # Linear ramp from 0 to 1 over `warmup_steps`, then a constant factor.
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        return 1.0

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    model, _, _, loss_train_list, loss_test_list = load_checkpoint(
        filename,
        model,
        optimizer,
        scheduler,
        loss_train_list,
        loss_test_list,
    )

    return model, loss_train_list, loss_test_list

## Load Model

In [None]:
# Build the model and restore it from the checkpoint at the given path; the
# two loss lists are the ones returned by load_checkpoint inside load_model.
model, loss_train_list, loss_test_list = load_model("../../models/checkpoint_transformer_5epoch.pth")

In [None]:
# Encode a single sample word with the loaded tokenizer and print the input
# next to whatever `tokenizer.encode` returns for it.
text = "obama"
token = tokenizer.encode(text)
print("Word:", text)
print("Tokenized:", token)
