Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ option(MATX_EN_CUDA_LINEINFO "Enable line information for CUDA kernels via -line
option(MATX_EN_EXTENDED_LAMBDA "Enable extended lambda support for device/host lambdas" ON)
option(MATX_EN_MATHDX "Enable MathDx support for kernel fusion" OFF)
option(MATX_EN_UNSAFE_ALIAS_DETECTION "Enable aliased memory detection" OFF)
option(MATX_DISABLE_EXCEPTIONS "Disable C++ exceptions and log errors instead" OFF)

set(MATX_EN_PYBIND11 OFF CACHE BOOL "Enable pybind11 support")

Expand Down Expand Up @@ -206,6 +207,9 @@ if (MATX_NVTX_FLAGS)
add_definitions(-DMATX_NVTX_FLAGS)
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
endif()
if (MATX_DISABLE_EXCEPTIONS)
target_compile_definitions(matx INTERFACE MATX_DISABLE_EXCEPTIONS)
endif()
if (MATX_BUILD_32_BIT)
set(MATX_NVPL_INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT)
Expand Down
2 changes: 2 additions & 0 deletions docs_input/build.rst
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ By default, all of these options are OFF.
- ``-DMATX_EN_MATHDX=ON``
* - Enable pybind11 Support. This option is usually not explicitly set, but is enabled by other options.
- ``-DMATX_EN_PYBIND11=ON``
* - Disable Exceptions
- ``-DMATX_DISABLE_EXCEPTIONS=ON``


NVTX Flags
Expand Down
21 changes: 9 additions & 12 deletions examples/black_scholes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
index_t input_size = 100'000'000;
constexpr uint32_t num_iterations = 100;
float time_ms;

tensor_t<dtype, 1> K_tensor{{input_size}};
tensor_t<dtype, 1> S_tensor{{input_size}};
tensor_t<dtype, 1> V_tensor{{input_size}};
tensor_t<dtype, 1> r_tensor{{input_size}};
tensor_t<dtype, 1> T_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor2{{input_size}};
tensor_t<dtype, 1> output_tensor3{{input_size}};
tensor_t<dtype, 1> output_tensor4{{input_size}};
auto K_tensor = make_tensor<dtype>({input_size});
auto S_tensor = make_tensor<dtype>({input_size});
auto V_tensor = make_tensor<dtype>({input_size});
auto r_tensor = make_tensor<dtype>({input_size});
auto T_tensor = make_tensor<dtype>({input_size});
auto output_tensor = make_tensor<dtype>({input_size});
auto output_tensor2 = make_tensor<dtype>({input_size});
auto output_tensor3 = make_tensor<dtype>({input_size});
auto output_tensor4 = make_tensor<dtype>({input_size});

(K_tensor = random<float>({input_size}, UNIFORM)).run();
(S_tensor = random<float>({input_size}, UNIFORM)).run();
Expand All @@ -171,8 +170,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaStreamCreate(&stream);
cudaExecutor exec{stream};

//compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
Expand Down
8 changes: 8 additions & 0 deletions include/matx/core/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

#include "matx/core/error.h"
#include "matx/core/nvtx.h"
#include "matx/core/log.h"
#include <cuda/std/functional>
#include <cuda/std/__algorithm/max.h>

Expand Down Expand Up @@ -122,6 +123,9 @@ struct MemTracker {

size_t bytes = iter->second.size;

MATX_LOG_DEBUG("Deallocating memory: ptr={}, {} bytes, space={}, remaining={} bytes",
ptr, bytes, static_cast<int>(iter->second.kind), matxMemoryStats.currentBytesAllocated - bytes);

matxMemoryStats.currentBytesAllocated -= bytes;

switch (iter->second.kind) {
Expand Down Expand Up @@ -187,6 +191,8 @@ struct MemTracker {
}
}

MATX_LOG_DEBUG("Allocating memory: {} bytes, space={}, stream={}", bytes, static_cast<int>(space), reinterpret_cast<void*>(stream));

switch (space) {
case MATX_MANAGED_MEMORY:
err = cudaMallocManaged(ptr, bytes);
Expand Down Expand Up @@ -214,6 +220,8 @@ struct MemTracker {
MATX_THROW(matxOutOfMemory, "Failed to allocate memory");
}

MATX_LOG_DEBUG("Allocated memory: ptr={}, {} bytes, total_current={} bytes", *ptr, bytes, matxMemoryStats.currentBytesAllocated + bytes);

[[maybe_unused]] std::unique_lock lck(memory_mtx);
matxMemoryStats.currentBytesAllocated += bytes;
matxMemoryStats.totalBytesAllocated += bytes;
Expand Down
4 changes: 4 additions & 0 deletions include/matx/core/cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,15 @@ class matxCache_t {
auto &common_params_cache = rmap[key];
auto cache_el = common_params_cache.find(params);
if (cache_el == common_params_cache.end()) {
MATX_LOG_DEBUG("Cache MISS for transform: id={}, device={}, thread={}",
id, key.device_id, reinterpret_cast<void*>(std::hash<std::thread::id>{}(key.thread_id)));
std::any tmp = mfun();
common_params_cache.insert({params, tmp});
efun(std::any_cast<decltype(mfun())>(tmp));
}
else {
MATX_LOG_DEBUG("Cache HIT for transform: id={}, device={}, thread={}",
id, key.device_id, reinterpret_cast<void*>(std::hash<std::thread::id>{}(key.thread_id)));
efun(std::any_cast<decltype(mfun())>(cache_el->second));
}
}
Expand Down
43 changes: 32 additions & 11 deletions include/matx/core/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#endif

#include "matx/core/stacktrace.h"
#include "matx/core/log.h"
#endif

namespace matx
Expand Down Expand Up @@ -150,6 +151,23 @@ namespace matx
};
}

#ifdef MATX_DISABLE_EXCEPTIONS

// With exceptions disabled, ENTER/EXIT still open and close a plain block so
// that code written inside the handler keeps the same scoping behavior as the
// try { ... } catch version below (locals declared inside do not leak out).
#define MATX_ENTER_HANDLER() {
#define MATX_EXIT_HANDLER() }

// Exception-free MATX_THROW: log the error (mirroring matxException's
// "type: message - file:line" text), dump a stack trace, then terminate via
// std::abort() since there is no caller-visible error channel here.
// do/while(0) makes the macro safe as a single statement (e.g. in un-braced if).
// NOTE(review): requires <sstream> and detail::printStackTrace to be in scope
// at every expansion site — presumably guaranteed by this header's includes.
#define MATX_THROW(e, str_arg) \
do { \
MATX_LOG_FATAL("matxException ({}: {}) - {}:{}", matxErrorString(e), str_arg, __FILE__, __LINE__); \
std::stringstream matx_stack_trace; \
detail::printStackTrace(matx_stack_trace); \
std::string matx_stack_str = matx_stack_trace.str(); \
MATX_LOG_FATAL("Stack Trace:\n{}", matx_stack_str); \
std::abort(); \
} while(0)

#else

#define MATX_ENTER_HANDLER() \
try \
{
Expand All @@ -158,8 +176,8 @@ namespace matx
} \
catch (matx::detail::matxException & e) \
{ \
fprintf(stderr, "%s\n", e.what()); \
fprintf(stderr, "Stack Trace:\n%s", e.stack.str().c_str()); \
MATX_LOG_FATAL("{}", e.what()); \
MATX_LOG_FATAL("Stack Trace:\n{}", e.stack.str()); \
exit(1); \
}

Expand All @@ -168,6 +186,8 @@ namespace matx
throw matx::detail::matxException(e, str, __FILE__, __LINE__); \
}

#endif

#if !defined(NDEBUG) && !defined(__CUDA_ARCH__)
#define MATX_ASSERT(a, error) \
{ \
Expand All @@ -190,7 +210,7 @@ namespace matx
auto tmp = a; \
if ((tmp != expected)) \
{ \
std::cout << #a ": " << str << "(" << tmp << " != " << expected << ")\n";\
MATX_LOG_ERROR("{}: {} ({} != {})", #a, str, static_cast<int>(tmp), static_cast<int>(expected)); \
MATX_THROW(error, ""); \
} \
}
Expand All @@ -217,7 +237,7 @@ namespace matx
const auto e_ = (e); \
if (e_ != cudaSuccess) \
{ \
fprintf(stderr, "%s:%d CUDA Error: %s (%d)\n", __FILE__,__LINE__, cudaGetErrorString(e_), e_); \
MATX_LOG_ERROR("{}:{} CUDA Error: {} ({})", __FILE__, __LINE__, cudaGetErrorString(e_), static_cast<int>(e_)); \
MATX_THROW(matx::matxCudaError, cudaGetErrorString(e_)); \
} \
} while (0)
Expand All @@ -239,21 +259,22 @@ namespace matx
compatible = (size == 0 || size == Size(i)); \
} \
if (!compatible) { \
std::cerr << "Incompatible operator sizes: ("; \
std::string msg = "Incompatible operator sizes: ("; \
for (int32_t i = 0; i < Rank(); i++) { \
std::cerr << Size(i); \
msg += std::to_string(Size(i)); \
if (i != Rank() - 1) { \
std::cerr << ","; \
msg += ","; \
} \
} \
std::cerr << ") not compatible with ("; \
msg += ") not compatible with ("; \
for (int32_t i = 0; i < Rank(); i++) { \
std::cerr << matx::detail::get_expanded_size<Rank()>(op, i); \
msg += std::to_string(matx::detail::get_expanded_size<Rank()>(op, i)); \
if (i != Rank() - 1) { \
std::cerr << ","; \
msg += ","; \
} \
} \
std::cerr << ")" << std::endl; \
msg += ")"; \
MATX_LOG_ERROR("{}", msg); \
MATX_THROW(matxInvalidSize, "Incompatible operator sizes"); \
} \
}
Expand Down
79 changes: 78 additions & 1 deletion include/matx/core/log.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,83 @@
#include <memory>
#include <mutex>

// Include MatX type traits and complex types for formatting support
#include "matx/core/half.h"
#include "matx/core/half_complex.h"
#include <complex>
#include <cuda/std/complex>

// Helper for formatting complex types
namespace matx {
namespace detail {
// Render any complex-like value (anything exposing real()/imag()) as a
// Python-style literal such as "(1.5-2j)". Both components are widened to
// double first so half/bf16 complex types format through the same path; the
// "{:g}" / "{:+g}" presentations give compact printf-%g-style output with an
// explicit sign on the imaginary part.
template<typename ComplexType>
inline std::string format_complex(const ComplexType& value) {
  const double real_part = static_cast<double>(value.real());
  const double imag_part = static_cast<double>(value.imag());
  return std::format("({:g}{:+g}j)", real_part, imag_part);
}
} // namespace detail
} // namespace matx

// Formatter specializations for all types supported by MatX, so complex and
// reduced-precision values can appear directly in MATX_LOG_* format strings.
// NOTE(review): specializing std::formatter for std::complex<T> (and, to the
// extent cuda::std::complex is treated as vendor/"std" code, that one too)
// involves no program-defined type, which [namespace.std] makes formally
// undefined behavior — consider confirming this is acceptable or routing
// std::complex through a program-defined wrapper. The matx::* specializations
// below are fine, since they depend on program-defined types.
// NOTE(review): every parse() here returns ctx.begin(), i.e. only the empty
// format-spec "{}" is supported; a spec like "{:>10}" will not be honored —
// TODO confirm that is the intended contract.
namespace std {
// Formatter for std::complex<T>
template<typename T>
struct formatter<std::complex<T>> {
// Accept only an empty spec (see NOTE above).
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

// Delegate to the shared helper so all complex types share one textual form.
template<typename FormatContext>
auto format(const std::complex<T>& c, FormatContext& ctx) const {
return format_to(ctx.out(), "{}", matx::detail::format_complex(c));
}
};

// Formatter for cuda::std::complex<T>
template<typename T>
struct formatter<cuda::std::complex<T>> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const cuda::std::complex<T>& c, FormatContext& ctx) const {
return format_to(ctx.out(), "{}", matx::detail::format_complex(c));
}
};

// Formatter for matxHalfComplex (fp16/bf16 complex)
template<typename T>
struct formatter<matx::matxHalfComplex<T>> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const matx::matxHalfComplex<T>& c, FormatContext& ctx) const {
return format_to(ctx.out(), "{}", matx::detail::format_complex(c));
}
};

// Formatter for matxFp16 (half-precision float): widen to float and print
// with printf-%g-style compact notation.
template<>
struct formatter<matx::matxFp16> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const matx::matxFp16& val, FormatContext& ctx) const {
return format_to(ctx.out(), "{:g}", static_cast<float>(val));
}
};

// Formatter for matxBf16 (bfloat16): same widening strategy as matxFp16.
template<>
struct formatter<matx::matxBf16> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const matx::matxBf16& val, FormatContext& ctx) const {
return format_to(ctx.out(), "{:g}", static_cast<float>(val));
}
};
}

namespace matx {
namespace detail {

Expand Down Expand Up @@ -139,7 +216,7 @@ class Logger {
std::mutex mutex_;
bool show_function_;

Logger() : min_level_(LogLevel::OFF), output_stream_(&std::cout), show_function_(false) {
Logger() : min_level_(LogLevel::ERROR), output_stream_(&std::cout), show_function_(false) {
// Read log level from environment
const char* level_env = std::getenv("MATX_LOG_LEVEL");
if (level_env) {
Expand Down
Loading