Skip to content

Make FusionProfile object not a singleton and allow copying #3771

@kshitij12345

Description

@kshitij12345

When thunder is used on a model, several fusions are generated with Fuser. Profiling the generated regions is easy with fd.profile(). However, the problem is that fd.profile() returns a singleton object, which is overwritten when we profile another fusion definition in the same script (see the example script). It would be great if fd.profile() returned a separate object. It would also be great if that object could be copied and serialized.

Example Script

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id12(fd : FusionDefinition) -> None :
    """Record the first repro fusion on ``fd``.

    Defines three BFloat16 input tensors, then registers two outputs:

    * the first input reshaped to ``[1, 4096, 3584]`` and added to the
      second input, with the addition done in Float and the sum cast back
      to BFloat16;
    * the third input reshaped to ``[4096, 18944]``.

    The order of the ``fd`` calls is kept exactly as in the generated
    definition.
    """
    bf16 = DataType.BFloat16

    # Inputs, in the order they must be passed to fd.execute().
    flat_in = fd.define_tensor(
        shape=[4096, 3584],
        contiguity=[True, True],
        dtype=bf16,
        is_cpu=False,
        stride_order=[1, 0],
    )
    batched_in = fd.define_tensor(
        shape=[1, 4096, 3584],
        contiguity=[None, True, True],
        dtype=bf16,
        is_cpu=False,
        stride_order=[2, 1, 0],
    )
    wide_in = fd.define_tensor(
        shape=[1, 4096, 18944],
        contiguity=[None, True, True],
        dtype=bf16,
        is_cpu=False,
        stride_order=[2, 1, 0],
    )

    # Reshape the 2-D input, then add it to the 3-D input in Float.
    flat_as_batched = fd.ops.reshape(flat_in, new_shape=[1, 4096, 3584])
    batched_f32 = fd.ops.cast(batched_in, dtype=DataType.Float)
    flat_f32 = fd.ops.cast(flat_as_batched, dtype=DataType.Float)
    summed_f32 = fd.ops.add(batched_f32, flat_f32)
    summed_bf16 = fd.ops.cast(summed_f32, dtype=bf16)

    # The third input is only reshaped.
    wide_flattened = fd.ops.reshape(wide_in, new_shape=[4096, 18944])

    fd.add_output(summed_bf16)
    fd.add_output(wide_flattened)

# Build the first fusion definition from nvfuser_fusion_id12.
with FusionDefinition() as fd:
    nvfuser_fusion_id12(fd)

# One CUDA tensor per fd.define_tensor call in nvfuser_fusion_id12,
# with the same shapes and dtype.
inputs = [
    torch.testing.make_tensor((4096, 3584), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 3584), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 18944), dtype=torch.bfloat16, device='cuda:0'),
]
# Execute with profiling enabled before querying fd.profile().
fd.execute(inputs, profile=True)

# Profile object for the first fusion; it is printed again after fd2 is profiled.
prof_data = fd.profile()

print("BEFORE PROFILING FD2")
print(prof_data.kernel_time_ms)

def nvfuser_fusion_id0(fd : FusionDefinition) -> None :
    """Record the second repro fusion on ``fd``.

    Defines one BFloat16 input tensor of shape ``[64]`` and three Int
    scalars (4096, 0, 1), then registers two Float outputs:

    * the input broadcast to ``[1, 64, 1]`` and cast to Float;
    * an Int ``iota`` built from the three scalars, broadcast to
      ``[1, 4096]`` and then to ``[1, 1, 4096]``, and cast to Float.

    The order of the ``fd`` calls is kept exactly as in the generated
    definition.
    """
    int_dtype = DataType.Int
    float_dtype = DataType.Float

    vec_in = fd.define_tensor(
        shape=[64],
        contiguity=[True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[0],
    )

    # Scalar arguments for iota, passed positionally in this order.
    iota_arg0 = fd.define_scalar(4096, dtype=int_dtype)
    iota_arg1 = fd.define_scalar(0, dtype=int_dtype)
    iota_arg2 = fd.define_scalar(1, dtype=int_dtype)
    positions = fd.ops.iota(iota_arg0, iota_arg1, iota_arg2, dtype=int_dtype)
    positions_2d = fd.ops.broadcast_in_dim(
        positions, shape=[1, 4096], broadcast_dims=[1]
    )

    # Input branch: broadcast, cast to Float, broadcast again.
    vec_3d = fd.ops.broadcast_in_dim(
        vec_in, shape=[1, 64, 1], broadcast_dims=[1]
    )
    vec_3d_f32 = fd.ops.cast(vec_3d, dtype=float_dtype)
    vec_out = fd.ops.broadcast_in_dim(
        vec_3d_f32, shape=[1, 64, 1], broadcast_dims=[0, 1, 2]
    )

    # Iota branch: broadcast to three dimensions, then cast to Float.
    positions_3d = fd.ops.broadcast_in_dim(
        positions_2d, shape=[1, 1, 4096], broadcast_dims=[0, 2]
    )
    positions_out = fd.ops.cast(positions_3d, dtype=float_dtype)

    fd.add_output(vec_out)
    fd.add_output(positions_out)

# Build a second, independent fusion definition from nvfuser_fusion_id0.
with FusionDefinition() as fd2:
    nvfuser_fusion_id0(fd2)

# Single CUDA input matching the one tensor defined in nvfuser_fusion_id0.
inputs = [
    torch.testing.make_tensor((64,), dtype=torch.bfloat16, device='cuda:0'),
]
fd2.execute(inputs, profile=True)

# Running profile on different fusion definition.
prof_data2 = fd2.profile()  # data in prof_data is overwritten.

# Both names now refer to the same profile object, so the kernel time
# printed for prof_data is the one measured for fd2.
print("AFTER PROFILING FD2")
print(prof_data.kernel_time_ms)
print(prof_data2.kernel_time_ms)
print(prof_data is prof_data2)  # True
print(prof_data)
print(prof_data2)


import copy
# Copying the profile object is not possible either; the deepcopy call
# below fails with the following traceback:
# Traceback (most recent call last):
#   File "/home/kkalambarkar/lightning-thunder/scratchpad/test_nvfuser_profile.py", line 61, in <module>
#     copy.deepcopy(prof_data2)
#   File "/home/kkalambarkar/miniconda3/envs/pytorch-dev/lib/python3.10/copy.py", line 161, in deepcopy
#     rv = reductor(4)
# TypeError: cannot pickle 'nvfuser._C.FusionProfile' object
copy.deepcopy(prof_data2)

Output

BEFORE PROFILING FD2
0.08934399999999999
AFTER PROFILING FD2
0.002048
0.002048
True
<nvfuser._C.FusionProfile object at 0x7f4a5243d470>
<nvfuser._C.FusionProfile object at 0x7f4a5243d470>

Metadata

Metadata

Labels

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions