## 

# Profiling

In [None]:
import sys
import pathlib

# Make the project root importable so the `src.*` imports below resolve.
# NOTE: '../' is resolved against the current working directory, which is the
# notebook's directory only if the kernel was started there.
project_root = pathlib.Path('../').resolve()

# Notebook cells get re-run: drop any entry left by a previous execution so
# sys.path does not accumulate duplicates, then put the root back at the
# front so it keeps precedence over other locations.
root_entry = str(project_root)
while root_entry in sys.path:
    sys.path.remove(root_entry)
sys.path.insert(0, root_entry)

import torch as torch
import numpy as np
import pickle as pkl

from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from IPython.display import clear_output

from src.transformer import Transformer
from src.optimization import train_step, forward_and_loss, group_decay_parameters, save_checkpoint, load_checkpoint
from src.utils import loader


# Report the PyTorch build in use together with the CUDA and cuDNN versions
# it exposes, one line per item.
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
):
    print(label, value)

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701


In [5]:
# Load the saved tokenizer; its `vocab_size` and "<s>" token id are passed to
# the model constructor further down.
tokenizer = loader("cnn_tokenizer.pkl")

# Seed PyTorch's global RNG before the model is constructed.
torch.manual_seed(42)

# Model size hyperparameters.
heads = 10
tf_blocks = 10
# 640 in total; written as 64 per attention head, which is the assumed intent
# of the original `64*10` — TODO confirm.
embed_dim = 64 * heads
# Feed-forward width is four times the embedding width.
ff_dim = embed_dim * 4

# Build the Transformer from the hyperparameters defined above, taking the
# vocabulary size and start-token id from the tokenizer.
model = Transformer(
    vocab_size=tokenizer.vocab_size,
    start_token_id=tokenizer.token_to_idx["<s>"],
    max_seq_len=1024,
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    dropout=0.1,
    use_weight_tying=True,
)
# Move the model onto the selected device.
model = model.to(device)

# Split the model's parameters into optimizer groups. Presumably parameters
# whose names match a `no_decay` entry are exempt from the 0.1 weight decay —
# confirm against src.optimization.group_decay_parameters.
optimizer_grouped_parameters = group_decay_parameters(
    model, weight_decay=0.1, no_decay=["bias", "LayerNorm.weight"]
)