## 

# Profiling

In [1]:
import sys, pathlib
project_root = pathlib.Path('../').resolve()   # one level up from the notebook
sys.path.insert(0, str(project_root))

import torch as torch
from torch.profiler import profile, record_function, ProfilerActivity


from src.transformer import Transformer
from src.optimization import train_step, forward_and_loss, group_decay_parameters, save_checkpoint, load_checkpoint
from src.utils import loader


print("PyTorch version:", torch.__version__)  
print("CUDA toolkit version PyTorch was built with:", torch.version.cuda)  
print("cuDNN version:", torch.backends.cudnn.version()) 

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701


In [2]:
tokenizer = loader("cnn_tokenizer.pkl")
torch.manual_seed(42)

embed_dim = 64*10
ff_dim = 4*embed_dim
heads = 10
tf_blocks = 10

model = Transformer(
    embed_dim=embed_dim,
    ff_dim=ff_dim,
    heads=heads,
    tf_blocks=tf_blocks,
    vocab_size=tokenizer.vocab_size,
    max_seq_len=1024,
    dropout=0.1,
    start_token_id=tokenizer.token_to_idx["<s>"],
    use_weight_tying=True
).to(device)

inputs = torch.randint(0, tokenizer.vocab_size, (10, 1024)).to(device)

In [5]:
with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],          # add CUDA/XPU if needed
        record_shapes=True,
        with_stack=True                            # capture Python stack frames
) as prof:
    with record_function("model_inference"):
        model(inputs)

print(
    prof.key_averages(                   # aggregate the events …
        group_by_stack_n=5               # … and group by the top 5 stack frames
    ).table(sort_by="cpu_time_total", row_limit=12)
)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                  cudaDeviceSynchronize        88.64%     147.529ms        88.64%     147.529ms     147.529ms       0.000us         0.00%       0.000us       0.000us             1  
                                        model_inference         1.64%       2.735ms        11.36%      18.914ms      18.914ms       0.000us         0.00%     161.555ms     161.555ms             1  
         