Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
339731c
M8 Task 1a: Move PolymorphicValue alias to detail namespace
csarofeen Dec 26, 2025
8bbb51f
Revert "M8 Task 1a: Move PolymorphicValue alias to detail namespace"
csarofeen Dec 30, 2025
b2999ee
M8 Task 2a: Create DynamicType split header structure (decl.h, impl.h…
csarofeen Dec 30, 2025
a33f855
M8 Task 3: Add extern template for PolymorphicValue DynamicType
csarofeen Dec 30, 2025
2a056e1
M8 Task 4: Move operator<< to impl.h
csarofeen Dec 30, 2025
7bcace5
M8 Task 5: Move unary operators (+, -, ~, !) to impl.h
csarofeen Dec 30, 2025
7666a7f
M8 Task 6: Move operator* (dereference) to impl.h
csarofeen Dec 30, 2025
95d755f
M8 Task 7: Move prefix ++/-- operators to impl.h
csarofeen Dec 30, 2025
3d45bdb
M8 Task 8: Move postfix ++/-- operators to impl.h
csarofeen Dec 30, 2025
2e4be09
M8 Task 9: Move compound assignment operators to impl.h
csarofeen Dec 30, 2025
6e5f789
M8 Task 10a: Move operator+ to impl.h (binary op pattern validation)
csarofeen Dec 30, 2025
10e1dff
M8 Task 10b: Move all binary operators to impl.h
csarofeen Dec 30, 2025
473ae6b
M8 Task 12: Convert 22 binary operators to friend function pattern fo…
csarofeen Dec 30, 2025
fecdd77
M8 Task 12: Convert remaining operators (unary, ++/--, compound assig…
csarofeen Dec 30, 2025
36a8879
Refactor DynamicType operators to non-template friends with recursion…
csarofeen Dec 31, 2025
4366828
Move getDataType and castToDtype from type.h to type.cpp
csarofeen Dec 31, 2025
1c4484a
Enable narrow PCH for polymorphic_value.h
csarofeen Dec 31, 2025
d9ab351
Extend PCH to test targets
csarofeen Dec 31, 2025
3211649
Expand PCH to include top nvFuser headers
csarofeen Jan 1, 2026
89ae03c
Implement index-based switch dispatch for operator== in DynamicType
csarofeen Jan 1, 2026
d2fdb38
Convert comparison operators to switch-based dispatch to eliminate Fo…
csarofeen Jan 2, 2026
4e50e4a
Extend switch-based dispatch to all binary operators (arithmetic, bit…
csarofeen Jan 2, 2026
518198f
Fix symbol visibility for DynamicType by compiling polymorphic_value.…
csarofeen Jan 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,42 @@ endif()

target_link_libraries(codegen_internal PUBLIC LLVM_JIT)

# Precompiled headers (PCH) for the heaviest nvFuser headers.
# After M8 reduced template instantiation by 81%, header parsing became a
# significant fraction of build cost. The PCH covers the top 10
# nvFuser-controllable headers ranked by exclusive parse time (M9 Task 4
# analysis) and provides ~50% build time improvement.
#
# NVFUSER_USE_POLYMORPHIC_PCH: BOOL cache option, ON by default only when
# CMAKE_BUILD_TYPE is "Release", OFF for every other build type.
set(nvfuser_pch_default OFF)
if(CMAKE_BUILD_TYPE STREQUAL "Release")
  set(nvfuser_pch_default ON)
endif()
option(NVFUSER_USE_POLYMORPHIC_PCH "Use PCH for top nvFuser headers to reduce parse time" ${nvfuser_pch_default})
unset(nvfuser_pch_default)

# polymorphic_value.cpp contains the explicit instantiation of the
# PolymorphicValue DynamicType, while polymorphic_value.h declares it
# `extern template` for every other translation unit. The instantiated symbols
# therefore have to be exported from the shared library in every configuration,
# so the visibility override must not depend on whether PCH is enabled.
set_source_files_properties(
  "${NVFUSER_SRCS_DIR}/polymorphic_value.cpp"
  PROPERTIES
    COMPILE_OPTIONS "-fvisibility=default"
)

if(NVFUSER_USE_POLYMORPHIC_PCH)
  message(STATUS "Enabling PCH for top 10 nvFuser headers")
  target_precompile_headers(codegen_internal PRIVATE
    # Top 10 nvFuser headers by exclusive parse time (M9 Task 4 analysis)
    "${NVFUSER_SRCS_DIR}/polymorphic_value.h" # 1675s (27.9m)
    "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h" # 473.6s (7.9m)
    "${NVFUSER_SRCS_DIR}/ir/base_nodes.h" # 284.5s (4.7m)
    "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h" # 162.1s (2.7m)
    "${NVFUSER_SRCS_DIR}/type.h" # 81.6s (1.4m)
    "${NVFUSER_SRCS_DIR}/ir/container.h" # 51.6s (0.9m)
    "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h" # 44.1s (0.7m)
    "${NVFUSER_SRCS_DIR}/iter_visitor.h" # 38.2s (0.6m)
    "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h" # 33.3s (0.6m)
    "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h" # 29.6s (0.5m)
  )
  # Build polymorphic_value.cpp without the PCH so the visibility override
  # above takes effect (the PCH caches the type with hidden visibility).
  set_source_files_properties(
    "${NVFUSER_SRCS_DIR}/polymorphic_value.cpp"
    PROPERTIES
      SKIP_PRECOMPILE_HEADERS ON
  )
endif()

add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)

if (BUILD_CUTLASS AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
Expand Down Expand Up @@ -1109,6 +1145,35 @@ function(add_test_without_main TEST_NAME TEST_SRC ADDITIONAL_LINK)
add_executable(${TEST_NAME} ${TEST_SRC})
set_property(TARGET ${TEST_NAME} PROPERTY CXX_STANDARD ${NVFUSER_CPP_STANDARD})
target_compile_definitions(${TEST_NAME} PRIVATE USE_GTEST)

# Shared PCH for test executables.
# Whichever test target is registered first builds the PCH from the top 10
# nvFuser headers and records its own name in the global property
# NVFUSER_TEST_PCH_TARGET; every later test target reuses that PCH through
# REUSE_FROM instead of compiling its own copy.
# Note: the PCH of codegen_internal cannot be reused here because of the
# -fPIC flag difference.
if(NVFUSER_USE_POLYMORPHIC_PCH)
  get_property(pch_owner GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET)
  if(pch_owner)
    # A PCH owner already exists: attach this target to it.
    target_precompile_headers(${TEST_NAME} REUSE_FROM ${pch_owner})
  else()
    # No owner yet: this target creates the PCH and becomes the owner.
    message(STATUS "Creating shared test PCH on target: ${TEST_NAME}")
    target_precompile_headers(${TEST_NAME} PRIVATE
      "${NVFUSER_SRCS_DIR}/polymorphic_value.h"
      "${NVFUSER_ROOT}/lib/dynamic_type/src/dynamic_type/type_traits.h"
      "${NVFUSER_SRCS_DIR}/ir/base_nodes.h"
      "${NVFUSER_SRCS_DIR}/scheduler/tools/abstract_tensor.h"
      "${NVFUSER_SRCS_DIR}/type.h"
      "${NVFUSER_SRCS_DIR}/ir/container.h"
      "${NVFUSER_SRCS_DIR}/serde/fusion_cache_generated.h"
      "${NVFUSER_SRCS_DIR}/iter_visitor.h"
      "${NVFUSER_SRCS_DIR}/ir/internal_nodes.h"
      "${NVFUSER_SRCS_DIR}/ir/interface_nodes.h"
    )
    set_property(GLOBAL PROPERTY NVFUSER_TEST_PCH_TARGET ${TEST_NAME})
  endif()
endif()

target_include_directories(${TEST_NAME} PRIVATE "${NVFUSER_ROOT}")
target_include_directories(${TEST_NAME} SYSTEM PRIVATE
${NVFUSER_ROOT}/third_party/googletest/googletest/include
Expand Down
8 changes: 4 additions & 4 deletions csrc/multidevice/symmetric_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@ class SymmetricTensor {
size_t aligned_size_;
bool are_remote_tensors_setup_ = false;
bool is_multicast_setup_ = false;
CUmemGenericAllocationHandle mcast_handle_{};
CUdevice cu_dev_{};
[[maybe_unused]] CUmemGenericAllocationHandle mcast_handle_{};
[[maybe_unused]] CUdevice cu_dev_{};
void* mc_ptr_{nullptr};
int exporter_rank_{-1};
int peer_fd_{-1};
[[maybe_unused]] int exporter_rank_{-1};
[[maybe_unused]] int peer_fd_{-1};
bool is_contiguous_view_setup_ = false;
at::Tensor contiguous_view_;
};
Expand Down
15 changes: 15 additions & 0 deletions csrc/polymorphic_value.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,18 @@ c10::IValue toIValue(const PolymorphicValue& x) {
} // namespace PolymorphicValue_functions

} // namespace nvfuser

// Explicit instantiation of DynamicType for PolymorphicValue.
// This is the single point where the template is fully instantiated; other
// translation units see the matching `extern template` declaration in
// polymorphic_value.h and do not instantiate it themselves.
// The template argument list below must stay identical to that declaration,
// otherwise the two would name different specializations.
// Note: This file is compiled with -fvisibility=default (set in CMakeLists.txt)
// to ensure all DynamicType symbols are exported from the shared library.
template struct dynamic_type::DynamicType<
dynamic_type::Containers<std::vector>,
nvfuser::StructHandle,
nvfuser::Pointer,
nvfuser::Opaque,
at::Tensor,
std::complex<double>,
double,
int64_t,
bool>;
13 changes: 13 additions & 0 deletions csrc/polymorphic_value.h
Original file line number Diff line number Diff line change
Expand Up @@ -544,4 +544,17 @@ c10::IValue toIValue(const PolymorphicValue& x);

} // namespace nvfuser

// Prevent implicit instantiation in other TUs - use explicit instantiation from
// polymorphic_value.cpp.
// The template argument list must stay identical to the explicit instantiation
// in polymorphic_value.cpp; if the two lists diverge they refer to different
// specializations and this declaration no longer suppresses anything useful.
extern template struct dynamic_type::DynamicType<
dynamic_type::Containers<std::vector>,
nvfuser::StructHandle,
nvfuser::Pointer,
nvfuser::Opaque,
at::Tensor,
std::complex<double>,
double,
int64_t,
bool>;

#include <struct.inl>
61 changes: 61 additions & 0 deletions csrc/type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,67 @@

namespace nvfuser {

// Returns the DataType describing the alternative currently held by `value`.
//
// Defined here rather than inline in type.h to reduce template instantiation
// costs: PolymorphicValue::for_all_types() triggers the ForAllTypes dispatch.
//
// The visitor is invoked once per candidate type T and records a result only
// for the type that `value` actually holds:
//   - primitive native types map through NativeTypeToDataType<T>
//   - std::vector<PolymorphicValue> becomes an ArrayType whose element type is
//     taken from the first element (an empty vector is rejected)
//   - StructHandle reports its own type()
//   - Opaque becomes an OpaqueType built from its type_info and size
//   - Pointer is rejected, since the pointee type cannot be inferred
// Fails via NVF_CHECK when no branch produced a result.
DataType getDataType(const PolymorphicValue& value) {
  std::optional<DataType> result;
  PolymorphicValue::for_all_types([&](auto type_tag) {
    using T = typename decltype(type_tag)::type;
    if constexpr (IsPrimitiveNativeType<T>::value) {
      if (value.is<T>()) {
        result = NativeTypeToDataType<T>::type;
      }
    } else if constexpr (std::is_same_v<T, std::vector<PolymorphicValue>>) {
      if (value.is<T>()) {
        const auto& elements = value.as<T>();
        NVF_CHECK(!elements.empty(), "Empty array is not supported");
        // The element type of the array is inferred from its first entry.
        auto element_type =
            std::make_shared<DataType>(getDataType(elements.front()));
        result = ArrayType{element_type, elements.size()};
      }
    } else if constexpr (std::is_same_v<T, Pointer>) {
      // A Pointer inside a PolymorphicValue only stores the size of the
      // pointee, so the pointer type cannot be recovered from it.
      NVF_CHECK(!value.is<T>(), "Can not infer pointer type.");
    } else if constexpr (std::is_same_v<T, StructHandle>) {
      if (value.is<T>()) {
        result = value.as<T>().type();
      }
    } else if constexpr (std::is_same_v<T, Opaque>) {
      if (value.is<T>()) {
        const auto& held = value.as<T>();
        result = DataType(
            OpaqueType{.type_info = held.any().type(), .size = held.size()});
      }
    }
  });
  NVF_CHECK(result.has_value(), "Unknown dtype for ", value.type().name());
  return result.value();
}

// Casts `value` so that it matches `dtype` and returns the result.
//
// Defined here rather than inline in type.h to reduce template instantiation
// costs: PolymorphicValue::for_all_types() triggers the ForAllTypes dispatch.
//
// This enables interfaces like IrBuilder::create<Val>(0, DataType::Double),
// where the value is an integer but the desired data type is double.
// A value that holds nothing, or whose type is already compatible with
// `dtype`, is returned unchanged.
PolymorphicValue castToDtype(PolymorphicValue value, const DataType& dtype) {
  // Nothing to convert: either there is no value or it already fits.
  if (!value.hasValue() || hasCompatibleDataType(value, dtype)) {
    return value;
  }
  PolymorphicValue::for_all_types([&value, &dtype](auto type_tag) {
    using T = typename decltype(type_tag)::type;
    if constexpr (IsPrimitiveNativeType<T>::value) {
      // Convert through each primitive native type whose DataType is
      // compatible with the requested one.
      if (isCompatibleDataType(NativeTypeToDataType<T>::type, dtype)) {
        value = PolymorphicValue(static_cast<T>(value));
      }
    }
    // TODO: support arrays and pointers
  });
  return value;
}

StructType NotImplementedStruct::type() const {
NVF_THROW("Not implemented");
}
Expand Down
63 changes: 7 additions & 56 deletions csrc/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -414,41 +414,9 @@ DEFINE_DATATYPE_TO_NATIVE_TYPE(DataType::ComplexDouble, std::complex<double>);

#undef DEFINE_DATATYPE_TO_NATIVE_TYPE

inline DataType getDataType(const PolymorphicValue& value) {
std::optional<DataType> dtype = std::nullopt;
PolymorphicValue::for_all_types([&value, &dtype](auto _) {
using T = typename decltype(_)::type;
if constexpr (IsPrimitiveNativeType<T>::value) {
if (value.is<T>()) {
dtype = NativeTypeToDataType<T>::type;
}
} else if constexpr (std::is_same_v<T, std::vector<PolymorphicValue>>) {
if (value.is<T>()) {
const auto& vec = value.as<T>();
size_t size = vec.size();
NVF_CHECK(size > 0, "Empty array is not supported");
dtype =
ArrayType{std::make_shared<DataType>(getDataType(vec[0])), size};
}
} else if constexpr (std::is_same_v<T, Pointer>) {
// For pointers in polymorphic value, we only store the data size of the
// pointee, so it is impossible to infer the pointer type.
NVF_CHECK(!value.is<T>(), "Can not infer pointer type.");
} else if constexpr (std::is_same_v<T, StructHandle>) {
if (value.is<T>()) {
dtype = value.as<T>().type();
}
} else if constexpr (std::is_same_v<T, Opaque>) {
if (value.is<T>()) {
const auto& opaque = value.as<T>();
dtype = DataType(OpaqueType{
.type_info = opaque.any().type(), .size = opaque.size()});
}
}
});
NVF_CHECK(dtype.has_value(), "Unknown dtype for ", value.type().name());
return dtype.value();
}
// Get the DataType corresponding to the runtime type held in a PolymorphicValue.
// Implementation moved to type.cpp to reduce template instantiation costs.
NVF_API DataType getDataType(const PolymorphicValue& value);

inline bool isCompatibleDataType(DataType dtype, DataType dtype2) {
if (dtype == dtype2) {
Expand Down Expand Up @@ -1128,28 +1096,11 @@ Pointer::Pointer(void* ptr, DataType dtype)
: ptr_(reinterpret_cast<std::byte*>(ptr)),
size_bit_(dataTypeSizeBit(dtype)) {}

inline PolymorphicValue castToDtype(
// Cast a PolymorphicValue to match the specified DataType.
// Implementation moved to type.cpp to reduce template instantiation costs.
NVF_API PolymorphicValue castToDtype(
PolymorphicValue value,
const DataType& dtype) {
if (!value.hasValue()) {
return value;
}
// Cast the given value to the given data type. This enables interface
// like: IrBuilder::create<Val>(0, DataType::Double) where value is
// an integer but the desired data type is double.
if (!hasCompatibleDataType(value, dtype)) {
PolymorphicValue::for_all_types([&](auto _) {
using T = typename decltype(_)::type;
if constexpr (IsPrimitiveNativeType<T>::value) {
if (isCompatibleDataType(NativeTypeToDataType<T>::type, dtype)) {
value = PolymorphicValue(static_cast<T>(value));
}
}
// TODO: support arrays and pointers
});
}
return value;
}
const DataType& dtype);

// Converts an enum to its underlying type.
// It corresponds with std::to_underlying introduced in c++23
Expand Down
2 changes: 1 addition & 1 deletion lib/dynamic_type/benchmark/knn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ static StructVecDouble kNN_Dictionary(
sum += distances_and_values.top().second;
distances_and_values.pop();
}
return sum / k;
return sum / static_cast<double>(k);
}

static void kNN_Dictionary(benchmark::State& state) {
Expand Down
Loading
Loading