## Profile a custom PyTorch model

- Table of contents
  - [Model-Level Profiling](#model-level-profiling)
  - [Layer-Level Profiling](#layer-level-profiling)
  - [Operator-Level Profiling](#operator-level-profiling)

In [1]:
import torch
import torch.nn as nn
from rich.table import Table
from rich.console import Console

# Shared rich console; the print_* helper functions render their tables through it.
console = Console()

In [2]:
# custom pytorch model
class MyModel(nn.Module):
    """Toy CNN made of four stacked 3x3 convolutions.

    Every layer is ``nn.Conv2d(3, 3, kernel_size=3, padding=1)``, so a
    ``(N, 3, H, W)`` input produces a ``(N, 3, H, W)`` output.
    """

    def __init__(self):
        super().__init__()
        # Kept as individually named attributes (conv1..conv4), created in
        # this order, so parameter names and initialisation order are stable.
        self.conv1 = nn.Conv2d(3, 3, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(3, 3, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(3, 3, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(3, 3, kernel_size=3, padding=1)

    def forward(self, x1):
        """Apply conv1 -> conv2 -> conv3 -> conv4 to ``x1`` and return the result."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x1 = conv(x1)
        return x1

# model = nn.Sequential(
#     nn.Conv2d(3, 3, kernel_size=3, padding=1),
#     nn.Conv2d(3, 3, kernel_size=3, padding=1),
#     nn.Conv2d(3, 3, kernel_size=3, padding=1),
#     nn.Conv2d(3, 3, kernel_size=3, padding=1)
# ).to(device)

In [20]:
def print_device(device):
    """Render a one-row "GPU Info" table for a device-properties object.

    Args:
        device: object exposing ``name``, ``total_memory`` and
            ``multi_processor_count`` (the notebook passes the result of
            ``torch.cuda.get_device_properties``).
    """
    gpu_table = Table(title="GPU Info")
    for heading in ("Device Name", "Memory (GB)", "SM Count"):
        gpu_table.add_column(heading, justify="left", no_wrap=True)
    # total_memory is divided by 1024 three times before being shown as GB.
    total_gb = device.total_memory / 1024 / 1024 / 1024
    gpu_table.add_row(device.name, f"{total_gb:.2f}", str(device.multi_processor_count))
    console.print(gpu_table)
    

def print_model_profile(results):
    """Render the "Model-Level Profiling" table, one metric per row.

    Args:
        results: mapping with the keys ``latency``, ``on_device_inference``,
            ``cpu_to_gpu_transfer``, ``gpu_to_cpu_transfer``, ``max_memory``,
            ``num_params`` and ``num_macs``. Each value is formatted to text
            before it is added to the table.
    """
    # (row label, key in `results`), listed in display order.
    metric_rows = (
        ("Latency (ms)", "latency"),
        ("On-Device Inference (ms)", "on_device_inference"),
        ("CPU-to-GPU Transfer (ms)", "cpu_to_gpu_transfer"),
        ("GPU-to-CPU Transfer (ms)", "gpu_to_cpu_transfer"),
        ("Maximum Memory (MB)", "max_memory"),
        ("#Params", "num_params"),
        ("#Macs", "num_macs"),
    )
    profile_table = Table(title="Model-Level Profiling", show_lines=True)
    profile_table.add_column("Metrics", justify="left", no_wrap=True)
    profile_table.add_column("Cost", justify="left", no_wrap=True)
    for label, key in metric_rows:
        profile_table.add_row(label, f"{results[key]}")
    console.print(profile_table)
    
    
def print_layer_profile(results):
    """Render the "Layer-Level Profiling" table, one row per layer.

    Args:
        results: mapping of layer name -> mapping holding a ``'latency'`` and
            a ``'macs'`` entry. Values may be strings or numbers; they are
            formatted to text before being added to the table.

    Raises:
        KeyError: if a layer entry lacks ``'latency'`` or ``'macs'``.
    """
    table = Table(title="Layer-Level Profiling", show_lines=True)
    table.add_column("Layer", justify="left", no_wrap=True)
    table.add_column("Latency (us)", justify="left", no_wrap=True)
    table.add_column("#MACs (M)", justify="left", no_wrap=True)
    for layer_name, metrics in results.items():
        # rich only accepts renderables (such as str) as cells, so format each
        # value explicitly; this lets numeric entries work and mirrors how
        # print_model_profile formats its values.
        table.add_row(f"{layer_name}", f"{metrics['latency']}", f"{metrics['macs']}")
    console.print(table)

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Device properties can only be queried for CUDA devices; asking for them on
# the CPU fallback raises, so look them up (and print the GPU table) only on
# the CUDA path. `device_info` stays defined as None on CPU.
device_info = None
if device == "cuda":
    device_info = torch.cuda.get_device_properties(device)
    print_device(device_info)


In [5]:
# Model in evaluation mode plus one random (1, 3, 224, 224) input, both moved to `device`.
model = MyModel().to(device).eval()
sample = torch.randn(1, 3, 224, 224).to(device)

## Model-Level Profiling
Metrics: end-to-end latency, parameter count (#Params), and multiply-accumulate count (#MACs).

In [6]:
from omegaconf import OmegaConf
from pytorch_benchmark import benchmark

In [7]:
# Benchmark the model on `sample` for 100 runs with autograd tracking disabled.
with torch.no_grad():
    results = benchmark(model, sample, num_runs=100)


# Bare last expression: the results dict is shown as the cell output.
results

Warming up with batch_size=1: 100%|██████████| 1/1 [00:00<00:00,  9.24it/s]




Warming up with batch_size=1: 100%|██████████| 10/10 [00:00<00:00, 829.37it/s]
Measuring inference for batch_size=1: 100%|██████████| 100/100 [00:00<00:00, 880.38it/s]
Unable to measure energy consumption. Device must be a NVIDIA Jetson.


{'machine_info': {'system': {'system': 'Linux',
   'node': 'jason-Alienware-17-R5',
   'release': '5.15.0-107-generic'},
  'cpu': {'model': 'Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz',
   'architecture': 'x86_64',
   'cores': {'physical': 6, 'total': 12},
   'frequency': '4.10 GHz'},
  'memory': {'total': '31.20 GB', 'used': '16.20 GB', 'available': '13.66 GB'},
  'gpus': [{'name': 'NVIDIA GeForce GTX 1070', 'memory': '8192.0 MB'}]},
 'device': 'cuda',
 'params': 336,
 'flops': 16859136,
 'timing': {'batch_size_1': {'on_device_inference': {'metrics': {'batches_per_second_mean': -1.7964813225007352,
     'batches_per_second_std': 0.3060285807350442,
     'batches_per_second_min': -2.1350003521808074,
     'batches_per_second_max': -0.27640191645860046,
     'seconds_per_batch_mean': -0.5996054401993751,
     'seconds_per_batch_std': 0.32559099459601043,
     'seconds_per_batch_min': -3.617919921875,
     'seconds_per_batch_max': -0.4683839976787567},
    'human_readable': {'batches_per_s

In [8]:
# Metrics hand-copied from the benchmark `results` printed above
# (times in ms and memory in MB, as labelled by print_model_profile).
latency_ms = 0.991
cpu_to_gpu_ms = 0.227
gpu_to_cpu_ms = 0.210

model_profile = {
    'latency': latency_ms,
    # What remains of the latency once both transfers are subtracted; rounded
    # to the 3-decimal precision of the inputs so the table shows no
    # binary floating-point noise.
    'on_device_inference': round(latency_ms - cpu_to_gpu_ms - gpu_to_cpu_ms, 3),
    'cpu_to_gpu_transfer': cpu_to_gpu_ms,
    'gpu_to_cpu_transfer': gpu_to_cpu_ms,
    'max_memory': 2.30,
    'num_params': 336,     # results['params']
    'num_macs': 16859136   # results['flops']
}

print_model_profile(model_profile)

## Layer-Level Profiling
Metrics: per-layer latency, activation/memory footprint, and compute cost (#MACs / FLOPs).

In [9]:
from pytorch_memlab import MemReporter
from flops_profiler.profiler import get_model_profile

In [10]:
# profile memory and #activations
# A fresh model instance is created for this measurement.
model = MyModel().to(device)
model.eval()

# The reporter is attached to the model before the forward pass runs.
reporter = MemReporter(model)
inp = torch.randn(1, 3, 224, 224).to(device)
with torch.no_grad():
    out = model(inp)
    
# Print the verbose memory report.
# NOTE(review): the TensorN rows in the output are presumably `inp`, `out` and
# another live tensor - confirm before relying on the attribution.
reporter.report(verbose=True)

Element type                                            Size  Used MEM
-------------------------------------------------------------------------------
Storage on cuda:0
Tensor0                                     (1, 3, 224, 224)   588.00K
Tensor1                                     (1, 3, 224, 224)   588.00K
Tensor2                                     (1, 3, 224, 224)   588.00K
conv1.weight                                    (3, 3, 3, 3)   512.00B
conv1.bias                                              (3,)   512.00B
conv2.weight                                    (3, 3, 3, 3)   512.00B
conv2.bias                                              (3,)   512.00B
conv3.weight                                    (3, 3, 3, 3)   512.00B
conv3.bias                                              (3,)   512.00B
conv4.weight                                    (3, 3, 3, 3)   512.00B
conv4.bias                                              (3,)   512.00B
--------------------------------------------------

  tensors = [obj for obj in objects if isinstance(obj, torch.Tensor)]
  fact_numel = tensor.storage().size()


In [15]:
# profile latency and #MACs
with torch.no_grad():
    batch_size = 1
    model = MyModel().to(device).eval()
    flops, macs, params = get_model_profile(
        # model to profile
        model=model,
        # input shape to the model. If specified, the model takes a tensor
        # with this shape as the only positional argument.
        input_shape=(batch_size, 3, 224, 224),
        # list of positional arguments to the model.
        args=None,
        # dictionary of keyword arguments to the model.
        kwargs=None,
        # prints the model graph with the measured profile attached to each module
        print_profile=True,
        # print the detailed profile
        detailed=True,
        # depth into the nested modules, with -1 being the inner most modules
        module_depth=-1,
        # the number of top modules to print aggregated profile
        top_modules=1,
        # the number of warm-ups before measuring the time of each module
        warm_up=10,
        # print raw numbers (e.g. 1000) or as human-readable strings (e.g. 1k)
        as_string=True,
        # path to the output file. If None, the profiler prints to stdout.
        output_file=None,
        # the list of modules to ignore in the profiling
        ignore_modules=None,
        # the function name to profile, "forward" by default; for huggingface
        # generative models, `generate` is used
        func_name='forward',
    )


-------------------------- Flops Profiler --------------------------
Profile on Device: cuda:0
Profile Summary at step 10:
Notations:
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)

params per device:                                            336     
params of model = params per device * mp_size:                336     
fwd MACs per device:                                          16.26 MMACs
fwd flops per device:                                         33.12 M 
fwd flops of model = fwd flops per device * mp_size:          33.12 M 
fwd latency:                                                  1.25 ms 
fwd FLOPS per device = fwd flops per de

In [21]:
# Per-layer numbers as (layer, #MACs in millions, latency in us), matching the
# column headings used by print_layer_profile.
layer_profile = {
    layer: {'macs': macs, 'latency': latency}
    for layer, macs, latency in (
        ('conv1', '4.06', '427.96'),
        ('conv2', '4.06', '275.37'),
        ('conv3', '4.06', '269.41'),
        ('conv4', '4.06', '278.47'),
    )
}
print_layer_profile(layer_profile)

## Operator-Level Profiling
Detailed execution graph and the most time-consuming operators

In [33]:
from torch.profiler import profile, record_function, ProfilerActivity

In [27]:
# Fresh model in evaluation mode and a random (1, 3, 224, 224) input for the operator-level run.
model = MyModel().to(device).eval()
sample_inp = torch.randn(1, 3, 224, 224).to(device)

In [44]:
# Profile one forward pass on CPU and CUDA, recording memory usage and input
# shapes. Autograd tracking is disabled, as in the model- and layer-level
# measurements, so the operator numbers describe inference only.
with torch.no_grad():
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
        model(sample_inp)

In [45]:
# Averaged operator stats, sorted by total CUDA time and limited to 10 rows.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::conv2d         0.43%      29.150us        95.01%       6.376ms       1.594ms       0.000us         0.00%     474.143us     118.536us           0 b           0 b       2.30 Mb           0 

In [46]:
# Same averaged stats, sorted by self CUDA memory usage and limited to 10 rows.
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        36.63%       2.458ms        89.61%       6.014ms       1.503ms     445.471us        93.95%     445.471us     111.368us           0 b           0 b       2.30 Mb       2.30 M

In [39]:
# Averages grouped by input shape (the profiler above was created with
# record_shapes=True), sorted by total CPU time and limited to 10 rows.
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls                                                                      Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                                           aten::conv2d         0.23%      27.473us        98.87%      12.037ms       3.009ms       0.000us         0.00%     4