# profiling.ipynb

Code for profiling pytorch dataloaders

* Created on Thursday May 15th, 2025
* Created by Jacob A Rose

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.profiler import profile, record_function, ProfilerActivity


class MyDataset(Dataset):
    def __init__(self):
        self.data = torch.randn(10, 3, 224, 224)
        self.target = torch.randint(0, 10, (10,))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]

        with record_function("transform1"):
            x = x * 2

        with record_function("transform2"):
            y = y + 1

        return x, y


dataset = MyDataset()
loader = DataLoader(dataset, batch_size=5, num_workers=0)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 loader        10.56%      11.637ms       100.00%     110.171ms     110.171ms             1  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        14.05%      15.478ms        77.07%      84.905ms      28.302ms             3  
                                             transform1        11.56%      12.737ms        28.93%      31.876ms       3.188ms            10  
                                              aten::mul        12.55%      13.831ms        17.37%      19.139ms       1.914ms            10  
      

ERROR:2025-05-16 01:46:28 53381:53381 DeviceProperties.cpp:47] gpuGetDeviceCount failed with code 35


In [None]:
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("loader"):
        for batch in loader:
            pass

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=15))

In [4]:
from plantclef.pytorch.data_catalog import make_dataset


ds = make_dataset(name="plantclef2024", load_all_subsets=False)

100%|██████████| 2/2 [00:00<00:00,  6.93it/s]


In [19]:
batch_size = 16
cpu_count = 0


loader = DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=cpu_count,
    pin_memory=True,
)

In [20]:
batch = next(iter(loader))
type(batch)

dict

In [23]:
import torch.autograd.profiler as profiler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


with profiler.profile(use_cuda=("cuda" in device.type)) as prof:
    # Run your training loop for several iterations
    i = 0
    for batch in loader:
        images, labels = batch["image"], batch["label_idx"]
        images, labels = images.to(device), labels.to(device)
        i += 1
        if i > 10:
            break

    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))




In [24]:
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         0.04%       2.421ms         0.04%       2.421ms      12.880us           188  
                                          aten::random_         0.00%      23.131us         0.00%      23.131us      23.131us             1  
                                             aten::item         0.00%       4.147us         0.00%       6.627us       6.627us             1  
                              aten::_local_scalar_dense         0.00%       2.480us         0.00%       2.480us       2.480us             1  
enumer

In [3]:
import cProfile

cProfile.run('re.compile("foo|bar")')

         216 function calls (209 primitive calls) in 0.002 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        2    0.000    0.000    0.000    0.000 enum.py:359(__call__)
        2    0.000    0.000    0.000    0.000 enum.py:678(__new__)
        1    0.000    0.000    0.000    0.000 enum.py:986(__and__)
        1    0.000    0.000    0.000    0.000 re.py:249(compile)
        1    0.000    0.000    0.000    0.000 re.py:288(_compile)
        1    0.000    0.000    0.000    0.000 sre_compile.py:265(_compile_charset)
        1    0.000    0.000    0.000    0.000 sre_compile.py:292(_optimize_charset)
        2    0.000    0.000    0.000    0.000 sre_compile.py:477(_get_iscased)
        1    0.000    0.000    0.000    0.000 sre_compile.py:485(_get_literal_prefix)
        1    0.000    0.000    0.000    0.000 sre_compile.py:516(_get_charset_prefix)
        1   