Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ option(MATX_EN_CUDA_LINEINFO "Enable line information for CUDA kernels via -line
option(MATX_EN_EXTENDED_LAMBDA "Enable extended lambda support for device/host lambdas" ON)
option(MATX_EN_MATHDX "Enable MathDx support for kernel fusion" OFF)
option(MATX_EN_UNSAFE_ALIAS_DETECTION "Enable aliased memory detection" OFF)
option(MATX_DISABLE_EXCEPTIONS "Disable C++ exceptions and log errors instead" OFF)

set(MATX_EN_PYBIND11 OFF CACHE BOOL "Enable pybind11 support")

Expand Down Expand Up @@ -206,6 +207,9 @@ if (MATX_NVTX_FLAGS)
add_definitions(-DMATX_NVTX_FLAGS)
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
endif()
if (MATX_DISABLE_EXCEPTIONS)
target_compile_definitions(matx INTERFACE MATX_DISABLE_EXCEPTIONS)
endif()
if (MATX_BUILD_32_BIT)
set(MATX_NVPL_INT_TYPE "lp64")
target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT)
Expand Down
2 changes: 2 additions & 0 deletions docs_input/build.rst
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ By default, all of these options are OFF.
- ``-DMATX_EN_MATHDX=ON``
* - Enable pybind11 Support. This option is usually not explicitly set, but is enabled by other options.
- ``-DMATX_EN_PYBIND11=ON``
* - Disable Exceptions
- ``-DMATX_DISABLE_EXCEPTIONS=ON``


NVTX Flags
Expand Down
21 changes: 9 additions & 12 deletions examples/black_scholes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,16 +150,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
index_t input_size = 100'000'000;
constexpr uint32_t num_iterations = 100;
float time_ms;

tensor_t<dtype, 1> K_tensor{{input_size}};
tensor_t<dtype, 1> S_tensor{{input_size}};
tensor_t<dtype, 1> V_tensor{{input_size}};
tensor_t<dtype, 1> r_tensor{{input_size}};
tensor_t<dtype, 1> T_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor{{input_size}};
tensor_t<dtype, 1> output_tensor2{{input_size}};
tensor_t<dtype, 1> output_tensor3{{input_size}};
tensor_t<dtype, 1> output_tensor4{{input_size}};
auto K_tensor = make_tensor<dtype>({input_size});
auto S_tensor = make_tensor<dtype>({input_size});
auto V_tensor = make_tensor<dtype>({input_size});
auto r_tensor = make_tensor<dtype>({input_size});
auto T_tensor = make_tensor<dtype>({input_size});
auto output_tensor = make_tensor<dtype>({input_size});
auto output_tensor2 = make_tensor<dtype>({input_size});
auto output_tensor3 = make_tensor<dtype>({input_size});
auto output_tensor4 = make_tensor<dtype>({input_size});

(K_tensor = random<float>({input_size}, UNIFORM)).run();
(S_tensor = random<float>({input_size}, UNIFORM)).run();
Expand All @@ -171,8 +170,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaStreamCreate(&stream);
cudaExecutor exec{stream};

//compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
Expand Down
8 changes: 8 additions & 0 deletions include/matx/core/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

#include "matx/core/error.h"
#include "matx/core/nvtx.h"
#include "matx/core/log.h"
#include <cuda/std/functional>
#include <cuda/std/__algorithm/max.h>

Expand Down Expand Up @@ -122,6 +123,9 @@ struct MemTracker {

size_t bytes = iter->second.size;

MATX_LOG_DEBUG("Deallocating memory: ptr={}, {} bytes, space={}, remaining={} bytes",
ptr, bytes, static_cast<int>(iter->second.kind), matxMemoryStats.currentBytesAllocated - bytes);

matxMemoryStats.currentBytesAllocated -= bytes;

switch (iter->second.kind) {
Expand Down Expand Up @@ -187,6 +191,8 @@ struct MemTracker {
}
}

MATX_LOG_DEBUG("Allocating memory: {} bytes, space={}, stream={}", bytes, static_cast<int>(space), reinterpret_cast<void*>(stream));

switch (space) {
case MATX_MANAGED_MEMORY:
err = cudaMallocManaged(ptr, bytes);
Expand Down Expand Up @@ -214,6 +220,8 @@ struct MemTracker {
MATX_THROW(matxOutOfMemory, "Failed to allocate memory");
}

MATX_LOG_DEBUG("Allocated memory: ptr={}, {} bytes, total_current={} bytes", *ptr, bytes, matxMemoryStats.currentBytesAllocated + bytes);

[[maybe_unused]] std::unique_lock lck(memory_mtx);
matxMemoryStats.currentBytesAllocated += bytes;
matxMemoryStats.totalBytesAllocated += bytes;
Expand Down
4 changes: 4 additions & 0 deletions include/matx/core/cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,15 @@ class matxCache_t {
auto &common_params_cache = rmap[key];
auto cache_el = common_params_cache.find(params);
if (cache_el == common_params_cache.end()) {
MATX_LOG_DEBUG("Cache MISS for transform: id={}, device={}, thread={}",
id, key.device_id, reinterpret_cast<void*>(std::hash<std::thread::id>{}(key.thread_id)));
std::any tmp = mfun();
common_params_cache.insert({params, tmp});
efun(std::any_cast<decltype(mfun())>(tmp));
}
else {
MATX_LOG_DEBUG("Cache HIT for transform: id={}, device={}, thread={}",
id, key.device_id, reinterpret_cast<void*>(std::hash<std::thread::id>{}(key.thread_id)));
efun(std::any_cast<decltype(mfun())>(cache_el->second));
}
}
Expand Down
43 changes: 32 additions & 11 deletions include/matx/core/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#endif

#include "matx/core/stacktrace.h"
#include "matx/core/log.h"
#endif

namespace matx
Expand Down Expand Up @@ -150,6 +151,23 @@ namespace matx
};
}

#ifdef MATX_DISABLE_EXCEPTIONS

// With exceptions disabled, ENTER/EXIT still open and close a plain block so
// that code written inside the handler keeps the same scoping behavior as the
// try { ... } catch version below (locals declared inside do not leak out).
#define MATX_ENTER_HANDLER() {
#define MATX_EXIT_HANDLER() }

// Exception-free MATX_THROW: log the error (mirroring matxException's
// "type: message - file:line" text), dump a stack trace, then terminate via
// std::abort() since there is no caller-visible error channel here.
// do/while(0) makes the macro safe as a single statement (e.g. in un-braced if).
// NOTE(review): requires <sstream> and detail::printStackTrace to be in scope
// at every expansion site — presumably guaranteed by this header's includes.
#define MATX_THROW(e, str_arg) \
do { \
MATX_LOG_FATAL("matxException ({}: {}) - {}:{}", matxErrorString(e), str_arg, __FILE__, __LINE__); \
std::stringstream matx_stack_trace; \
detail::printStackTrace(matx_stack_trace); \
std::string matx_stack_str = matx_stack_trace.str(); \
MATX_LOG_FATAL("Stack Trace:\n{}", matx_stack_str); \
std::abort(); \
} while(0)

#else

#define MATX_ENTER_HANDLER() \
try \
{
Expand All @@ -158,8 +176,8 @@ namespace matx
} \
catch (matx::detail::matxException & e) \
{ \
fprintf(stderr, "%s\n", e.what()); \
fprintf(stderr, "Stack Trace:\n%s", e.stack.str().c_str()); \
MATX_LOG_FATAL("{}", e.what()); \
MATX_LOG_FATAL("Stack Trace:\n{}", e.stack.str()); \
exit(1); \
}

Expand All @@ -168,6 +186,8 @@ namespace matx
throw matx::detail::matxException(e, str, __FILE__, __LINE__); \
}

#endif

#if !defined(NDEBUG) && !defined(__CUDA_ARCH__)
#define MATX_ASSERT(a, error) \
{ \
Expand All @@ -190,7 +210,7 @@ namespace matx
auto tmp = a; \
if ((tmp != expected)) \
{ \
std::cout << #a ": " << str << "(" << tmp << " != " << expected << ")\n";\
MATX_LOG_ERROR("{}: {} ({} != {})", #a, str, static_cast<int>(tmp), static_cast<int>(expected)); \
MATX_THROW(error, ""); \
} \
}
Expand All @@ -217,7 +237,7 @@ namespace matx
const auto e_ = (e); \
if (e_ != cudaSuccess) \
{ \
fprintf(stderr, "%s:%d CUDA Error: %s (%d)\n", __FILE__,__LINE__, cudaGetErrorString(e_), e_); \
MATX_LOG_ERROR("{}:{} CUDA Error: {} ({})", __FILE__, __LINE__, cudaGetErrorString(e_), static_cast<int>(e_)); \
MATX_THROW(matx::matxCudaError, cudaGetErrorString(e_)); \
} \
} while (0)
Expand All @@ -239,21 +259,22 @@ namespace matx
compatible = (size == 0 || size == Size(i)); \
} \
if (!compatible) { \
std::cerr << "Incompatible operator sizes: ("; \
std::string msg = "Incompatible operator sizes: ("; \
for (int32_t i = 0; i < Rank(); i++) { \
std::cerr << Size(i); \
msg += std::to_string(Size(i)); \
if (i != Rank() - 1) { \
std::cerr << ","; \
msg += ","; \
} \
} \
std::cerr << ") not compatible with ("; \
msg += ") not compatible with ("; \
for (int32_t i = 0; i < Rank(); i++) { \
std::cerr << matx::detail::get_expanded_size<Rank()>(op, i); \
msg += std::to_string(matx::detail::get_expanded_size<Rank()>(op, i)); \
if (i != Rank() - 1) { \
std::cerr << ","; \
msg += ","; \
} \
} \
std::cerr << ")" << std::endl; \
msg += ")"; \
MATX_LOG_ERROR("{}", msg); \
MATX_THROW(matxInvalidSize, "Incompatible operator sizes"); \
} \
}
Expand Down
79 changes: 78 additions & 1 deletion include/matx/core/log.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,83 @@
#include <memory>
#include <mutex>

// Include MatX type traits and complex types for formatting support
#include "matx/core/half.h"
#include "matx/core/half_complex.h"
#include <complex>
#include <cuda/std/complex>

// Helper for formatting complex types
namespace matx {
namespace detail {
// Render any complex-like value (anything exposing real()/imag()) as a
// Python-style literal such as "(1.5-2j)". Both components are widened to
// double first so half/bf16 complex types format through the same path; the
// "{:g}" / "{:+g}" presentations give compact printf-%g-style output with an
// explicit sign on the imaginary part.
template<typename ComplexType>
inline std::string format_complex(const ComplexType& value) {
  const double real_part = static_cast<double>(value.real());
  const double imag_part = static_cast<double>(value.imag());
  return std::format("({:g}{:+g}j)", real_part, imag_part);
}
} // namespace detail
} // namespace matx

// Formatter specializations for all types supported by MatX, so complex and
// reduced-precision values can appear directly in MATX_LOG_* format strings.
// NOTE(review): specializing std::formatter for std::complex<T> (and, to the
// extent cuda::std::complex is treated as vendor/"std" code, that one too)
// involves no program-defined type, which [namespace.std] makes formally
// undefined behavior — consider confirming this is acceptable or routing
// std::complex through a program-defined wrapper. The matx::* specializations
// below are fine, since they depend on program-defined types.
// NOTE(review): every parse() here returns ctx.begin(), i.e. only the empty
// format-spec "{}" is supported; a spec like "{:>10}" will not be honored —
// TODO confirm that is the intended contract.
namespace std {
// Formatter for std::complex<T>
template<typename T>
struct formatter<std::complex<T>> {
// Accept only an empty spec (see NOTE above).
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

// Delegate to the shared helper so all complex types share one textual form.
template<typename FormatContext>
auto format(const std::complex<T>& c, FormatContext& ctx) const {
return format_to(ctx.out(), "{}", matx::detail::format_complex(c));
}
};

// Formatter for cuda::std::complex<T>
template<typename T>
struct formatter<cuda::std::complex<T>> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const cuda::std::complex<T>& c, FormatContext& ctx) const {
return format_to(ctx.out(), "{}", matx::detail::format_complex(c));
}
};

// Formatter for matxHalfComplex (fp16/bf16 complex)
template<typename T>
struct formatter<matx::matxHalfComplex<T>> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const matx::matxHalfComplex<T>& c, FormatContext& ctx) const {
return format_to(ctx.out(), "{}", matx::detail::format_complex(c));
}
};

// Formatter for matxFp16 (half-precision float): widen to float and print
// with printf-%g-style compact notation.
template<>
struct formatter<matx::matxFp16> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const matx::matxFp16& val, FormatContext& ctx) const {
return format_to(ctx.out(), "{:g}", static_cast<float>(val));
}
};

// Formatter for matxBf16 (bfloat16): same widening strategy as matxFp16.
template<>
struct formatter<matx::matxBf16> {
constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }

template<typename FormatContext>
auto format(const matx::matxBf16& val, FormatContext& ctx) const {
return format_to(ctx.out(), "{:g}", static_cast<float>(val));
}
};
}

namespace matx {
namespace detail {

Expand Down Expand Up @@ -139,7 +216,7 @@ class Logger {
std::mutex mutex_;
bool show_function_;

Logger() : min_level_(LogLevel::OFF), output_stream_(&std::cout), show_function_(false) {
Logger() : min_level_(LogLevel::ERROR), output_stream_(&std::cout), show_function_(false) {
// Read log level from environment
const char* level_env = std::getenv("MATX_LOG_LEVEL");
if (level_env) {
Expand Down
Loading