# profiling.ipynb

Code for profiling pytorch dataloaders

* Created on Thursday May 15th, 2025
* Created by Jacob A Rose

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# import torch
# from torch.utils.data import Dataset, DataLoader
# from torch.profiler import profile, record_function, ProfilerActivity


# class MyDataset(Dataset):
#     def __init__(self):
#         self.data = torch.randn(10, 3, 224, 224)
#         self.target = torch.randint(0, 10, (10,))

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, index):
#         x = self.data[index]
#         y = self.target[index]

#         with record_function("transform1"):
#             x = x * 2

#         with record_function("transform2"):
#             y = y + 1

#         return x, y


# dataset = MyDataset()
# loader = DataLoader(dataset, batch_size=5, num_workers=0)
# with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
#     with record_function("loader"):
#         for batch in loader:
#             pass

# print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=15))

In [10]:
batch_size = 256
cpu_count = 0

warmup_iterations = 2  # 20
num_iterations = 50

In [11]:
import torch
from torch.utils.data import DataLoader
import torch.autograd.profiler as profiler
from tqdm import tqdm, trange
from plantclef.pytorch.data_catalog import make_dataset


ds = make_dataset(name="plantclef2024", load_all_subsets=False, subset="val")

tx = ds.get_transforms(is_training=True, crop_size=518)
ds.set_transform(tx)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loader = DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=cpu_count,
    pin_memory=True,
)
total_samples = batch_size * num_iterations
print(f"Total samples: {total_samples}")

100%|██████████| 2/2 [00:00<00:00, 17.63it/s]

Total samples: 12800





In [12]:
i = 0
for batch in tqdm(loader, total=warmup_iterations):
    images, labels = batch["image"], batch["label_idx"]
    i += 1
    if i > warmup_iterations:
        break

profile_device = device.type  # "cpu" #

with profiler.profile(
    use_device=profile_device, profile_memory=True, with_stack=True, with_modules=True
) as prof:
    # Run your training loop for several iterations
    i = 0
    data_iter = iter(loader)
    with profiler.record_function("loader"):
        for i in trange(num_iterations):
            batch = next(data_iter)
            images, labels = batch["image"], batch["label_idx"]
            images, labels = (
                images.to(device, non_blocking=True),
                labels.to(device, non_blocking=True),
            )
    assert images.device.type == device.type
    assert labels.device.type == device.type
    print(f"images.shape: {images.shape}")
    print(f"labels.shape: {labels.shape}")
    print(f"Processed {i} batches")

print("Finished")

print(
    prof.key_averages().table(
        sort_by="cuda_time_total", row_limit=15, max_name_column_width=90
    )
)

  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:08<00:00,  4.50s/it]
100%|██████████| 50/50 [02:07<00:00,  2.55s/it]


images.shape: torch.Size([256, 3, 518, 518])
labels.shape: torch.Size([256])
Processed 49 batches
Finished
---------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                           Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
---------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                         loader         0.06%      78.403ms        99.95%

In [13]:
256 * 50

12800

In [14]:
12800 / 127.786

100.16746748470098

In [10]:
assert images.device.type == device.type
assert labels.device.type == device.type

print(f"images.shape: {images.shape}")
print(f"labels.shape: {labels.shape}")
print(f"Processed {i} batches")

images.shape: torch.Size([128, 3, 518, 518])
labels.shape: torch.Size([128])
Processed 4 batches


In [11]:
# prof.export_chrome_trace("trace.json")

In [9]:
print(
    prof.key_averages().table(
        sort_by="cpu_time_total", row_limit=10, max_name_column_width=90
    )
)
# print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-----------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-----------------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                           loader         0.04%      19.918ms       100.00%       55.146s       55.146s           0 b           0 b             1  
    enumerate(DataLoader)#_MultiProcessingDataLoaderIter.__next__        99.96%       55.124s        99.96%       55.124s        5.512s           0 b           0 b            10  
                                                         aten::to         0.00%     124.861us       

In [3]:
# import cProfile

# cProfile.run('re.compile("foo|bar")')

         216 function calls (209 primitive calls) in 0.002 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
        2    0.000    0.000    0.000    0.000 enum.py:359(__call__)
        2    0.000    0.000    0.000    0.000 enum.py:678(__new__)
        1    0.000    0.000    0.000    0.000 enum.py:986(__and__)
        1    0.000    0.000    0.000    0.000 re.py:249(compile)
        1    0.000    0.000    0.000    0.000 re.py:288(_compile)
        1    0.000    0.000    0.000    0.000 sre_compile.py:265(_compile_charset)
        1    0.000    0.000    0.000    0.000 sre_compile.py:292(_optimize_charset)
        2    0.000    0.000    0.000    0.000 sre_compile.py:477(_get_iscased)
        1    0.000    0.000    0.000    0.000 sre_compile.py:485(_get_literal_prefix)
        1    0.000    0.000    0.000    0.000 sre_compile.py:516(_get_charset_prefix)
        1   