-
Notifications
You must be signed in to change notification settings - Fork 78
Open
Labels
Description
When using Thunder on a model, several fusions are generated with Fuser. Profiling the generated regions is easy with `fd.profile()`. However, the problem is that `fd.profile()` returns a singleton object, which is overwritten when another fusion definition is profiled in the same script (see the example script below). It would be great if `fd.profile()` returned a separate object for each call. It would also be great if that object could be serialized.
Example Script
import torch
from nvfuser import FusionDefinition, DataType
def nvfuser_fusion_id12(fd: FusionDefinition) -> None:
    """Populate ``fd`` with the first example fusion.

    Declares three BFloat16 tensor inputs, adds the first input (reshaped
    to ``[1, 4096, 3584]``) to the second input in Float, and casts the sum
    back to BFloat16. The third input is reshaped to ``[4096, 18944]``.
    Both results are registered as outputs of the fusion.
    """
    # Inputs.
    T0 = fd.define_tensor(
        shape=[4096, 3584],
        contiguity=[True, True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[1, 0],
    )
    T1 = fd.define_tensor(
        shape=[1, 4096, 3584],
        contiguity=[None, True, True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[2, 1, 0],
    )
    T2 = fd.define_tensor(
        shape=[1, 4096, 18944],
        contiguity=[None, True, True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[2, 1, 0],
    )

    # Add T0 (reshaped to match T1) to T1 in Float, then cast back down.
    T7 = fd.ops.reshape(T0, new_shape=[1, 4096, 3584])
    T8 = fd.ops.cast(T1, dtype=DataType.Float)
    T9 = fd.ops.cast(T7, dtype=DataType.Float)
    T10 = fd.ops.add(T8, T9)
    T11 = fd.ops.cast(T10, dtype=DataType.BFloat16)

    # Independent reshape of the third input.
    T15 = fd.ops.reshape(T2, new_shape=[4096, 18944])

    fd.add_output(T11)
    fd.add_output(T15)
# Build the first fusion definition.
with FusionDefinition() as fd:
    nvfuser_fusion_id12(fd)

# One input tensor per define_tensor call in nvfuser_fusion_id12.
inputs = [
    torch.testing.make_tensor((4096, 3584), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 3584), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 4096, 18944), dtype=torch.bfloat16, device='cuda:0'),
]

# Execute with profiling enabled, then fetch the profile for this fusion.
fd.execute(inputs, profile=True)
prof_data = fd.profile()

print("BEFORE PROFILING FD2")
print(prof_data.kernel_time_ms)
def nvfuser_fusion_id0(fd: FusionDefinition) -> None:
    """Populate ``fd`` with the second example fusion.

    Declares one BFloat16 tensor input of shape ``[64]`` and three Int
    scalars, then registers two Float outputs: the input broadcast to
    ``[1, 64, 1]``, and an ``iota`` result broadcast to ``[1, 1, 4096]``.
    """
    T0 = fd.define_tensor(
        shape=[64],
        contiguity=[True],
        dtype=DataType.BFloat16,
        is_cpu=False,
        stride_order=[0],
    )

    # Scalar arguments for iota.
    # NOTE(review): presumably (length, start, step) -- confirm against the
    # nvFuser iota signature.
    S1 = fd.define_scalar(4096, dtype=DataType.Int)
    S2 = fd.define_scalar(0, dtype=DataType.Int)
    S3 = fd.define_scalar(1, dtype=DataType.Int)
    T4 = fd.ops.iota(S1, S2, S3, dtype=DataType.Int)

    T8 = fd.ops.broadcast_in_dim(T4, shape=[1, 4096], broadcast_dims=[1])
    T13 = fd.ops.broadcast_in_dim(T0, shape=[1, 64, 1], broadcast_dims=[1])
    T14 = fd.ops.cast(T13, dtype=DataType.Float)
    T19 = fd.ops.broadcast_in_dim(T14, shape=[1, 64, 1], broadcast_dims=[0, 1, 2])
    T24 = fd.ops.broadcast_in_dim(T8, shape=[1, 1, 4096], broadcast_dims=[0, 2])
    T25 = fd.ops.cast(T24, dtype=DataType.Float)

    fd.add_output(T19)
    fd.add_output(T25)
# Build a second, unrelated fusion definition.
with FusionDefinition() as fd2:
    nvfuser_fusion_id0(fd2)

inputs = [
    torch.testing.make_tensor((64,), dtype=torch.bfloat16, device='cuda:0'),
]

fd2.execute(inputs, profile=True)

# Running profile on a different fusion definition.
prof_data2 = fd2.profile()  # data in prof_data is overwritten.

print("AFTER PROFILING FD2")
print(prof_data.kernel_time_ms)  # now reports fd2's kernel time, not fd's
print(prof_data2.kernel_time_ms)
print(prof_data is prof_data2)  # True

# Both names print the same object address.
print(prof_data)
print(prof_data2)
import copy
# Traceback (most recent call last):
# File "/home/kkalambarkar/lightning-thunder/scratchpad/test_nvfuser_profile.py", line 61, in <module>
# copy.deepcopy(prof_data2)
# File "/home/kkalambarkar/miniconda3/envs/pytorch-dev/lib/python3.10/copy.py", line 161, in deepcopy
# rv = reductor(4)
# TypeError: cannot pickle 'nvfuser._C.FusionProfile' object
copy.deepcopy(prof_data2)

Output
BEFORE PROFILING FD2
0.08934399999999999
AFTER PROFILING FD2
0.002048
0.002048
True
<nvfuser._C.FusionProfile object at 0x7f4a5243d470>
<nvfuser._C.FusionProfile object at 0x7f4a5243d470>

Reactions are currently unavailable