# profiling.ipynb

Code for profiling pytorch dataloaders

* Created on Thursday May 15th, 2025
* Created by Jacob A Rose

In [2]:
%load_ext autoreload
%autoreload 2

In [None]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# from torch.profiler import profile, record_function, ProfilerActivity


# class MyDataset(Dataset):
#     def __init__(self):
#         self.data = torch.randn(10, 3, 224, 224)
#         self.target = torch.randint(0, 10, (10,))

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index):
#         x = self.data[index]
#         y = self.target[index]

#         with record_function("transform1"):
#             x = x * 2

#         with record_function("transform2"):
#             y = y + 1

#         return x, y


# dataset = MyDataset()
# loader = DataLoader(dataset, batch_size=5, num_workers=0)
# with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
#     with record_function("loader"):
#         for batch in loader:
#             pass

# print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=15))

In [None]:
import torch
from torch.utils.data import DataLoader
from plantclef.pytorch.data_catalog import make_dataset


ds = make_dataset(name="plantclef2024", load_all_subsets=False)

In [9]:
batch_size = 64
cpu_count = 0


loader = DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=cpu_count,
    pin_memory=True,
)
# batch = next(iter(loader))
# type(batch)

In [10]:
import torch.autograd.profiler as profiler
from tqdm import tqdm
import torch


warmup_iterations = 5
num_iterations = 100


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

i = 0
for batch in tqdm(loader, total=warmup_iterations):
    images, labels = batch["image"], batch["label_idx"]
    i += 1
    if i > warmup_iterations:
        break

with profiler.profile(use_device=device.type) as prof:
    # Run your training loop for several iterations
    i = 0
    for batch in tqdm(loader, total=num_iterations):
        images, labels = batch["image"], batch["label_idx"]
        images, labels = images.to(device), labels.to(device)
        i += 1
        if i > num_iterations:
            break

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15, max_name_column_width=90))

100%|██████████| 5/5 [00:06<00:00,  1.39s/it]
100%|██████████| 100/100 [01:44<00:00,  1.05s/it]


---------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                           Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
    enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__         2.48%        2.597s        98.50%      103.129s        1.021s        2.470s         2.36%      103.130s        1.021s           101  
                                              _get_label_tensor        28.86%       30.212s        28.86%       30.212s       4.674ms       30.294s        28.94%       30.294s

In [15]:
dir(prof)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_ensure_function_events',
 '_function_events',
 '_needs_processing',
 '_old_function_events',
 '_parse_kineto_results',
 '_prepare_trace',
 '_start_trace',
 '_stats',
 'acc_events',
 'config',
 'create_trace_id',
 'custom_trace_id_callback',
 'default_trace_id',
 'enabled',
 'entered',
 'experimental_config',
 'export_chrome_trace',
 'export_stacks',
 'function_events',
 'key_averages',
 'kineto_activities',
 'kineto_results',
 'profile_memory',
 'profiler_kind',
 'profiling_end_time_ns',
 'profiling_start_time_ns',
 'record_shapes',
 'self_cpu_time_total',
 'table',
 'toggle_collection_dynamic

In [11]:
prof.export_chrome_trace("trace.json")

In [1]:
print(
    prof.key_averages().table(
        sort_by="cuda_time_total", row_limit=10, max_name_column_width=90
    )
)

NameError: name 'prof' is not defined

In [None]:
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        90.02%        2.589s        98.34%        2.828s     257.114ms        2.590s        90.12%        2.828s     257.131ms            11  
                                            aten::copy_         3.22%      92.693ms         4.70%     135.118ms     614.174us     136.342ms         4.74%     136.342ms     619.736us           220  
         

In [3]:
import cProfile

cProfile.run('re.compile("foo|bar")')

         216 function calls (209 primitive calls) in 0.002 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        2    0.000    0.000    0.000    0.000 enum.py:359(__call__)
        2    0.000    0.000    0.000    0.000 enum.py:678(__new__)
        1    0.000    0.000    0.000    0.000 enum.py:986(__and__)
        1    0.000    0.000    0.000    0.000 re.py:249(compile)
        1    0.000    0.000    0.000    0.000 re.py:288(_compile)
        1    0.000    0.000    0.000    0.000 sre_compile.py:265(_compile_charset)
        1    0.000    0.000    0.000    0.000 sre_compile.py:292(_optimize_charset)
        2    0.000    0.000    0.000    0.000 sre_compile.py:477(_get_iscased)
        1    0.000    0.000    0.000    0.000 sre_compile.py:485(_get_literal_prefix)
        1    0.000    0.000    0.000    0.000 sre_compile.py:516(_get_charset_prefix)
        1   