diff --git a/CMakeLists.txt b/CMakeLists.txt index 91c2b01..2e10db2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,12 +18,21 @@ option(WITH_ASCEND "Enable Ascend backend" OFF) option(WITH_TORCH "Enable PyTorch C++ backend" OFF) -# Default OFF until CANN's `extract_host_stub.py` path handling is fixed for -# `scikit-build-core` temp-dir builds (triggers `KeyError` on the preprocessed -# object path). Enable explicitly with `-DBUILD_CUSTOM_KERNEL=ON` when the -# toolchain is compatible or when building via the standalone -# `src/ascend/custom/build.sh` script. -option(BUILD_CUSTOM_KERNEL "Build custom AscendC kernel PyTorch extension (requires `torch_npu`)" OFF) +# Custom `AscendC` kernels under `src/ascend/custom/`. `ON` by default +# so CI and routine dev builds always exercise `implementation_index=1/2` +# for `RmsNorm` / `AddRmsNorm`. Gated by `WITH_ASCEND` in +# `src/CMakeLists.txt` — non-Ascend builds ignore it. Pass +# `-DBUILD_ASCEND_CUSTOM=OFF` to skip the `ccec` build on Ascend +# machines where the custom kernels aren't needed. +# +# When `ON`, `src/CMakeLists.txt` drives the standalone +# `src/ascend/custom/build.sh` via `execute_process` at configure time +# (sidesteps a `CANN` `extract_host_stub.py` path bug that breaks +# in-tree `ascendc_library()` under `scikit-build-core` temp-dir builds) +# and links the produced `libno_workspace_kernel.a` into the `ops` +# module with `--whole-archive`. Requires `torch_npu` and the +# `AscendC` toolchain (`ccec`). +option(BUILD_ASCEND_CUSTOM "Build custom AscendC kernels" ON) option(AUTO_DETECT_DEVICES "Automatically detect available devices" OFF) option(AUTO_DETECT_BACKENDS "Automatically detect available backends" OFF) diff --git a/pyproject.toml b/pyproject.toml index 959699f..6b51702 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,15 @@ name = "InfiniOps" version = "0.1.0" [project.optional-dependencies] +# TODO: `torch` here is unconstrained. On Ascend hosts, the working +# torch is the Ascend-matched `torch 2.9.0+cpu` paired with +# `torch_npu 2.9.0.post1+…`. A `pip install -e .[dev] --force-reinstall` +# will re-resolve `torch` to the latest PyPI version (currently +# `torch 2.11.0`), which now declares `cuda-toolkit` / `nvidia-cublas` / +# `nvidia-cudnn` / … as hard deps — downloads GBs of CUDA wheels and +# kills the `torch_npu` / `vllm-ascend` pairing. Needs a platform-aware +# split (e.g. `torch; platform_machine != 'aarch64'`, or move `torch` +# out of `dev` and require it pre-installed in the container image). dev = ["pytest", "pytest-cov", "pytest-xdist", "ruff", "torch", "pyyaml"] [tool.scikit-build.wheel] diff --git a/scripts/generate_wrappers.py b/scripts/generate_wrappers.py index 49b6c19..9810404 100644 --- a/scripts/generate_wrappers.py +++ b/scripts/generate_wrappers.py @@ -112,9 +112,29 @@ def _find_vector_tensor_params(op_name): return set(re.findall(r"std::vector\s+(\w+)", source)) +def _find_params_with_defaults(op_name): + """Return ``{param_name: default_literal}`` for base-header params that + carry a `= ` default value. `libclang`'s cursor API does not + expose defaults reliably, so we regex-scan the source. Only used for + plain scalar defaults such as ``bool pre_gathered = false``. 
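+
+    Illustrative example (hypothetical signature, not taken from a real
+    base header): scanning
+    ``bool pre_gathered = false, int64_t page_size = 128`` yields
+    ``{"pre_gathered": "false", "page_size": "128"}``.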
+ """ + source = (_BASE_DIR / f"{op_name}.h").read_text() + + mapping = {} + + for name, default in re.findall( + r"\b(?:bool|int(?:64_t|32_t|8_t|16_t)?|std::size_t|std::uint\w+_t|float|double)\s+(\w+)\s*=\s*([^,\)]+?)\s*(?:,|\))", + source, + ): + mapping[name] = default.strip() + + return mapping + + def _generate_pybind11(operator): optional_tensor_params = _find_optional_tensor_params(operator.name) vector_tensor_params = _find_vector_tensor_params(operator.name) + params_with_defaults = _find_params_with_defaults(operator.name) def _is_optional_tensor(arg): if arg.spelling in optional_tensor_params: @@ -186,6 +206,10 @@ def _generate_py_args(node): if _is_optional(arg): parts.append(f'py::arg("{arg.spelling}") = py::none()') + elif arg.spelling in params_with_defaults: + parts.append( + f'py::arg("{arg.spelling}") = {params_with_defaults[arg.spelling]}' + ) else: parts.append(f'py::arg("{arg.spelling}")') @@ -257,8 +281,7 @@ def _generate_call(op_name, call, method=True): }}) .def_static("clear_cache", &Self::clear_cache); -{callers} -}} +{callers}}} }} // namespace infini::ops diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 32c9294..1e2eeea 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -241,8 +241,66 @@ if(WITH_ASCEND) list(APPEND DEVICE_LIST "ascend") # Custom `AscendC` kernels (PyTorch extension, requires `torch_npu`). - if(BUILD_CUSTOM_KERNEL) - add_subdirectory(ascend/custom) + if(BUILD_ASCEND_CUSTOM) + # In-tree `ascendc_library()` trips the `CANN` `extract_host_stub.py` + # path-handling bug under `scikit-build-core`'s temp-dir builds + # (`KeyError` on `/./workspace/...` paths in `$`). + # Work around it by driving the standalone `src/ascend/custom/build.sh` + # — that script invokes a separate `cmake` with + # `src/ascend/custom/` as its `SOURCE_DIR`, avoiding the buggy + # path shape. The produced `.a` is imported and linked into + # `ops` with `--whole-archive`. + set(_custom_build_dir "${CMAKE_SOURCE_DIR}/build/build_ascend_custom") + set(_custom_lib "${_custom_build_dir}/lib/libno_workspace_kernel.a") + + if(NOT DEFINED SOC_VERSION OR "${SOC_VERSION}" STREQUAL "") + include(${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/cmake/detect_soc.cmake) + infiniops_detect_soc(SOC_VERSION) + endif() + + # Drive `build.sh` as a build-phase target with explicit source + # dependencies so that editing any `op_host/` or `op_kernel/` + # source re-triggers the build (plain `execute_process` at + # configure time would only gate on file existence and leave + # stale `.a` files in place). + file(GLOB_RECURSE _custom_srcs CONFIGURE_DEPENDS + "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/*.h" + "${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh") + + # Scrub env inherited from the outer `scikit-build-core` invocation + # before handing control to `build.sh`: + # * `CMAKE_GENERATOR` / `CMAKE_EXPORT_COMPILE_COMMANDS` leaking + # into the inner `cmake` change the path format passed to + # `ninja`'s `_host_cpp` rule and re-trigger the `CANN` + # `extract_host_stub.py` `KeyError` (`/./workspace/...`) that + # standalone `build.sh` avoids. + # * `PYTHONPATH` from `pip`'s build-isolation overlay makes the + # child `python3` skip the system `site-packages` — child + # `cmake` modules that `import torch` (`config_envs.cmake`) + # then fail with `ModuleNotFoundError` even though `torch` is + # installed. 
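+    #
+    # Rough manual equivalent for debugging outside CMake (illustrative;
+    # the `SOC_VERSION` value is machine-specific, see
+    # `ascend/custom/cmake/detect_soc.cmake`):
+    #   cd src/ascend/custom && \
+    #     BUILD_DIR=<repo>/build/build_ascend_custom bash build.sh Ascend910_9382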
+ add_custom_command( + OUTPUT ${_custom_lib} + COMMAND ${CMAKE_COMMAND} -E env + --unset=CMAKE_GENERATOR + --unset=CMAKE_EXPORT_COMPILE_COMMANDS + --unset=CMAKE_BUILD_PARALLEL_LEVEL + --unset=PYTHONPATH + "BUILD_DIR=${_custom_build_dir}" + "CMAKE_EXE=${CMAKE_COMMAND}" + bash ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom/build.sh ${SOC_VERSION} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ascend/custom + DEPENDS ${_custom_srcs} + COMMENT "Building custom AscendC kernels (SOC_VERSION=${SOC_VERSION})" + VERBATIM) + + add_custom_target(no_workspace_kernel_build ALL DEPENDS ${_custom_lib}) + + add_library(no_workspace_kernel STATIC IMPORTED GLOBAL) + set_target_properties(no_workspace_kernel PROPERTIES + IMPORTED_LOCATION "${_custom_lib}") + add_dependencies(no_workspace_kernel no_workspace_kernel_build) # Link the compiled `AscendC` kernel objects into `infiniops` so that # custom kernel implementations (e.g. `RmsNorm` index 1) can call @@ -379,9 +437,17 @@ if(GENERATE_PYTHON_BINDINGS) # The `Operator<..., 1>` template instantiations that call # `aclrtlaunch_*` live in `ops.cc`, so link here with # `--whole-archive` to ensure all launch functions are available. - if(BUILD_CUSTOM_KERNEL) + # `$` works for both real `ascendc_library()` targets and + # `IMPORTED` targets pointing at a pre-built `.a`. The + # `no_workspace_kernel` target is only created inside the + # `WITH_ASCEND` block above, so this branch must mirror that gate; + # otherwise non-Ascend builds error out with "No target + # no_workspace_kernel". + if(WITH_ASCEND AND BUILD_ASCEND_CUSTOM) target_link_libraries(ops PRIVATE - -Wl,--whole-archive no_workspace_kernel -Wl,--no-whole-archive) + -Wl,--whole-archive $ -Wl,--no-whole-archive) + # `ops` link step must wait for `build.sh` to produce the `.a`. + add_dependencies(ops no_workspace_kernel_build) endif() set_target_properties(infiniops PROPERTIES INSTALL_RPATH "$ORIGIN") diff --git a/src/ascend/add_rms_norm/kernel.h b/src/ascend/add_rms_norm/kernel.h new file mode 100644 index 0000000..38b0a5a --- /dev/null +++ b/src/ascend/add_rms_norm/kernel.h @@ -0,0 +1,144 @@ +#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_ +#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_H_ + +#include + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnn_add.h" +#include "aclnn_rms_norm.h" +#include "ascend/common.h" +#include "ascend/workspace_pool_.h" +#include "base/add_rms_norm.h" +#include "operator.h" + +namespace infini::ops { + +// Decomposed implementation: `aclnnAdd` + `aclnnRmsNorm`. +// +// The fused `aclnnAddRmsNorm` API has ~200 us host-side launch overhead that +// dominates small-tensor dispatch. Decomposing into two fast ACLNN calls +// reduces host dispatch from ~224 us to ~56 us (4x faster) with negligible +// NPU-side impact for inference tensor sizes. +template <> +class Operator : public AddRmsNorm { + public: + Operator(const Tensor input, const Tensor residual, const Tensor weight, + float eps, Tensor out, Tensor residual_out) + : AddRmsNorm(input, residual, weight, eps, out, residual_out), + input_cache_(input), + residual_cache_(residual), + weight_cache_(weight), + out_cache_(out), + residual_out_cache_(residual_out) { + // Alpha scalar for `aclnnAdd` (`residual_out = input + 1.0 * residual`). + alpha_ = aclCreateScalar(&alpha_storage_, ACL_FLOAT); + + // `aclnnRmsNorm` writes `rstd` as a required side output. Size is + // computed here; the buffer is obtained from the pool in `operator()`. 
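+    // Illustrative sizing (made-up shapes): `batch_size_ = 4`, `nhead_ = 32`
+    // gives a (4, 32) fp32 `rstd`, i.e. 4 * 32 * 4 B = 512 B requested from
+    // the "temp" pool per call.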
+ rstd_shape_ = {static_cast(batch_size_), + static_cast(nhead_)}; + rstd_size_ = batch_size_ * nhead_ * sizeof(float); + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + input_cache_.release(); + residual_cache_.release(); + weight_cache_.release(); + out_cache_.release(); + residual_out_cache_.release(); + + // `rstd_tensor_` leaks with `norm_exec_` at shutdown (see `64c367c`). + if (alpha_) aclDestroyScalar(alpha_); + } + + void operator()(const Tensor input, const Tensor residual, + const Tensor weight, float eps, Tensor out, + Tensor residual_out) const override { + auto t_input = input_cache_.get(const_cast(input.data())); + auto t_residual = residual_cache_.get(const_cast(residual.data())); + auto t_weight = weight_cache_.get(const_cast(weight.data())); + auto t_out = out_cache_.get(out.data()); + auto t_residual_out = residual_out_cache_.get(residual_out.data()); + auto stream = static_cast(stream_); + + // Step 1: `residual_out = input + residual`. + if (!add_exec_) { + aclnnAddGetWorkspaceSize(t_input, t_residual, alpha_, t_residual_out, + &add_ws_, &add_exec_); + aclSetAclOpExecutorRepeatable(add_exec_); + } else { + aclSetInputTensorAddr(add_exec_, 0, t_input, + const_cast(input.data())); + aclSetInputTensorAddr(add_exec_, 1, t_residual, + const_cast(residual.data())); + aclSetOutputTensorAddr(add_exec_, 0, t_residual_out, residual_out.data()); + } + auto& add_arena = ascend::GetWorkspacePool().Ensure(stream, add_ws_); + aclnnAdd(add_arena.buf, add_ws_, add_exec_, stream); + + // Obtain shared `rstd` buffer from pool. + auto& rstd_arena = + ascend::GetWorkspacePool().Ensure(stream, rstd_size_, "temp"); + + // Lazily create the `rstd` tensor descriptor on first call. + if (!rstd_tensor_) { + rstd_tensor_ = aclCreateTensor(rstd_shape_.data(), 2, ACL_FLOAT, + /*strides=*/nullptr, 0, ACL_FORMAT_ND, + rstd_shape_.data(), 2, rstd_arena.buf); + } else { + aclSetRawTensorAddr(rstd_tensor_, rstd_arena.buf); + } + + // Step 2: `out = rms_norm(residual_out, weight, eps)`. 
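+    // (RMSNorm, i.e. out = residual_out * weight
+    //  / sqrt(mean(residual_out^2 over the last dim) + eps); the same
+    //  per-row math as the custom AscendC kernel elsewhere in this change.)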
+ if (!norm_exec_) { + aclnnRmsNormGetWorkspaceSize(t_residual_out, t_weight, eps, t_out, + rstd_tensor_, &norm_ws_, &norm_exec_); + aclSetAclOpExecutorRepeatable(norm_exec_); + } else { + aclSetInputTensorAddr(norm_exec_, 0, t_residual_out, residual_out.data()); + aclSetInputTensorAddr(norm_exec_, 1, t_weight, + const_cast(weight.data())); + aclSetOutputTensorAddr(norm_exec_, 0, t_out, out.data()); + aclSetOutputTensorAddr(norm_exec_, 1, rstd_tensor_, rstd_arena.buf); + } + auto& norm_arena = ascend::GetWorkspacePool().Ensure(stream, norm_ws_); + aclnnRmsNorm(norm_arena.buf, norm_ws_, norm_exec_, stream); + } + + private: + mutable ascend::AclTensorCache input_cache_; + + mutable ascend::AclTensorCache residual_cache_; + + mutable ascend::AclTensorCache weight_cache_; + + mutable ascend::AclTensorCache out_cache_; + + mutable ascend::AclTensorCache residual_out_cache_; + + float alpha_storage_ = 1.0f; + + aclScalar* alpha_ = nullptr; + + std::vector rstd_shape_; + + uint64_t rstd_size_ = 0; + + mutable aclTensor* rstd_tensor_ = nullptr; + + mutable aclOpExecutor* add_exec_ = nullptr; + + mutable uint64_t add_ws_ = 0; + + mutable aclOpExecutor* norm_exec_ = nullptr; + + mutable uint64_t norm_ws_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/ascend/add_rms_norm/kernel_custom.h b/src/ascend/add_rms_norm/kernel_custom.h new file mode 100644 index 0000000..daaa8c3 --- /dev/null +++ b/src/ascend/add_rms_norm/kernel_custom.h @@ -0,0 +1,171 @@ +#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ +#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ + +#ifdef INFINI_HAS_CUSTOM_KERNELS + +#include +#include + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnnop/aclnn_cast.h" +#include "ascend/common.h" +#include "ascend/workspace_pool_.h" +#include "base/add_rms_norm.h" +#include "operator.h" + +// Forward-declare the `aclrtlaunch_AddRmsNorm` launch symbol defined +// by the AscendC toolchain from `custom/add_rms_norm/op_kernel/`. +extern "C" uint32_t aclrtlaunch_AddRmsNorm( + uint32_t block_dim, void* stream, void* input, void* residual, void* weight, + int64_t total_rows, int64_t dim_length, int64_t dim_length_align, + int64_t former_num, int64_t former_length, int64_t tail_length, float eps, + int64_t dtype_code, void* out, void* residual_out); + +namespace infini::ops { + +// Custom AscendC fused `AddRmsNorm` kernel (implementation index 2). +// +// A single-kernel implementation that computes `residual_out = input + +// residual` followed by `out = rms_norm(residual_out, weight, eps)` in one +// launch, avoiding the decomposed `aclnnAdd` + `aclnnRmsNorm` calls (index 0) +// or the fused `aclnnAddRmsNorm` call (index 1). Migrated from the custom +// `RmsNorm` kernel (index 1 of `RmsNorm`). +// +// Select via `implementation_index=2` in Python: +// `infini.ops.add_rms_norm(input, residual, weight, eps, out, residual_out, +// implementation_index=2, stream=s)`. +// +// Requirements: +// - Input last dimension must be 32-byte aligned (divisible by 16 for +// `float16` or 8 for `float32`). All standard LLM hidden dimensions +// satisfy this. +// - `weight` must have the same dtype as `input`. +// - The custom kernel binary must be linked (`BUILD_ASCEND_CUSTOM=ON`). 
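+//
+// Alignment example (illustrative numbers): a hidden dim of 4096 in fp16
+// passes (4096 % 16 == 0); a hypothetical dim of 1000 in fp16 does not
+// (1000 % 16 == 8) and trips the constructor assert.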
+template <> +class Operator : public AddRmsNorm { + public: + Operator(const Tensor input, const Tensor residual, const Tensor weight, + float eps, Tensor out, Tensor residual_out) + : AddRmsNorm(input, residual, weight, eps, out, residual_out), + dtype_{input.dtype()} { + assert((dtype_ == DataType::kFloat16 || dtype_ == DataType::kBFloat16 || + dtype_ == DataType::kFloat32) && + "`AddRmsNorm` custom kernel: `input` must be `fp16`, `bf16`, or " + "`fp32`"); + + // 32-byte alignment on the last dimension — kernel relies on aligned + // `DataCopyPad` loads/stores. + int64_t align_elems = 32 / static_cast(kDataTypeToSize.at(dtype_)); + dim_length_align_ = + ((static_cast(dim_) + align_elems - 1) / align_elems) * + align_elems; + assert(static_cast(dim_) == dim_length_align_ && + "`AddRmsNorm` custom kernel: last dimension must be 32-byte " + "aligned"); + + total_rows_ = + static_cast(batch_size_) * static_cast(nhead_); + + // The custom kernel always reads `weight` as fp32. fp16 / bf16 inputs + // trigger a lazy cast in `operator()` (guarded by `last_weight_ptr_` + // so that the cast runs only when the weight pointer changes — model + // weights are typically fixed after loading). + if (dtype_ != DataType::kFloat32) { + size_t fp32_bytes = static_cast(dim_) * sizeof(float); + aclrtMalloc(&weight_fp32_data_, fp32_bytes, ACL_MEM_MALLOC_NORMAL_ONLY); + + weight_src_cache_ = ascend::AclTensorCache( + {static_cast(dim_)}, ascend::ToAclDtype(dtype_), nullptr); + weight_dst_cache_ = ascend::AclTensorCache({static_cast(dim_)}, + ACL_FLOAT, weight_fp32_data_); + } + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + weight_src_cache_.release(); + weight_dst_cache_.release(); + + if (weight_fp32_data_) aclrtFree(weight_fp32_data_); + } + + void operator()(const Tensor input, const Tensor residual, + const Tensor weight, float eps, Tensor out, + Tensor residual_out) const override { + auto stream = static_cast(stream_); + + void* weight_fp32; + + if (dtype_ != DataType::kFloat32) { + const void* cur_weight = weight.data(); + + // Model weights are fixed after loading, so the cast typically runs + // once on the first call and is skipped on all subsequent calls. + if (cur_weight != last_weight_ptr_) { + auto t_src = weight_src_cache_.get(const_cast(cur_weight)); + auto t_dst = weight_dst_cache_.get(weight_fp32_data_); + + if (!cast_exec_) { + aclnnCastGetWorkspaceSize(t_src, ACL_FLOAT, t_dst, &cast_ws_, + &cast_exec_); + aclSetAclOpExecutorRepeatable(cast_exec_); + } else { + aclSetInputTensorAddr(cast_exec_, 0, t_src, + const_cast(cur_weight)); + aclSetOutputTensorAddr(cast_exec_, 0, t_dst, weight_fp32_data_); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, cast_ws_); + aclnnCast(arena.buf, cast_ws_, cast_exec_, stream); + last_weight_ptr_ = cur_weight; + } + + weight_fp32 = weight_fp32_data_; + } else { + weight_fp32 = const_cast(weight.data()); + } + + // Block-level tiling. Ascend 910B has 20–40 AIV cores; over-subscribing + // is safe (runtime multiplexes) but wastes one weight load per block. 
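+    // Worked example (illustrative): `total_rows_ = 50` -> `used_cores = 40`,
+    // `former_length = 2`, `tail_length = 1`, `former_num = 10`; 10 cores
+    // take 2 rows each and the other 30 take 1 row (10*2 + 30*1 = 50).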
+ static constexpr int64_t kMaxBlockDim = 40; + int64_t used_cores = std::min(total_rows_, kMaxBlockDim); + int64_t former_length = (total_rows_ + used_cores - 1) / used_cores; + int64_t tail_length = former_length - 1; + int64_t former_num = total_rows_ - tail_length * used_cores; + uint32_t block_dim = static_cast(used_cores); + + aclrtlaunch_AddRmsNorm(block_dim, stream, const_cast(input.data()), + const_cast(residual.data()), weight_fp32, + total_rows_, static_cast(dim_), + dim_length_align_, former_num, former_length, + tail_length, eps, static_cast(dtype_), + out.data(), residual_out.data()); + } + + private: + DataType dtype_; + + int64_t dim_length_align_; + + int64_t total_rows_; + + void* weight_fp32_data_ = nullptr; + + mutable ascend::AclTensorCache weight_src_cache_; + + mutable ascend::AclTensorCache weight_dst_cache_; + + mutable const void* last_weight_ptr_ = nullptr; + + mutable aclOpExecutor* cast_exec_ = nullptr; + + mutable uint64_t cast_ws_ = 0; +}; + +} // namespace infini::ops + +#endif // INFINI_HAS_CUSTOM_KERNELS +#endif // INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_CUSTOM_H_ diff --git a/src/ascend/add_rms_norm/kernel_fused.h b/src/ascend/add_rms_norm/kernel_fused.h new file mode 100644 index 0000000..e28d7c2 --- /dev/null +++ b/src/ascend/add_rms_norm/kernel_fused.h @@ -0,0 +1,132 @@ +#ifndef INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_FUSED_H_ +#define INFINI_OPS_ASCEND_ADD_RMS_NORM_KERNEL_FUSED_H_ + +#include + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnnop/aclnn_add_rms_norm.h" +#include "ascend/common.h" +#include "ascend/workspace_pool_.h" +#include "base/add_rms_norm.h" +#include "operator.h" + +namespace infini::ops { + +// Fused implementation via `aclnnAddRmsNorm` (implementation index 1). +// +// Computes `residual_out = input + residual` and `out = rms_norm(residual_out, +// weight, eps)` in a single CANN launch. The fused API has higher host-side +// launch overhead (~200 us) compared to the decomposed `aclnnAdd` + +// `aclnnRmsNorm` path (~39 us), but may offer better NPU-side efficiency for +// large tensors where kernel fusion reduces memory traffic. +// +// Select via `implementation_index=1` in Python: +// `infini.ops.add_rms_norm(..., implementation_index=1, stream=s)`. +template <> +class Operator : public AddRmsNorm { + public: + Operator(const Tensor input, const Tensor residual, const Tensor weight, + float eps, Tensor out, Tensor residual_out) + : AddRmsNorm(input, residual, weight, eps, out, residual_out), + input_cache_(input), + residual_cache_(residual), + weight_cache_(weight), + out_cache_(out), + residual_out_cache_(residual_out) { + // `aclnnAddRmsNorm` requires `rstdOut` to have the same ndim as `input`, + // with the last `weight.ndim()` dimensions set to 1. For example: + // `input` (2, 32, 128), `weight` (128) -> `rstdOut` (2, 32, 1). + // `input` (64, 128), `weight` (128) -> `rstdOut` (64, 1). 
+ fused_rstd_shape_.reserve(ndim_); + for (size_t i = 0; i < ndim_ - weight.ndim(); ++i) { + fused_rstd_shape_.push_back(static_cast(input.size(i))); + } + for (size_t i = 0; i < weight.ndim(); ++i) { + fused_rstd_shape_.push_back(1); + } + + size_t rstd_elems = 1; + for (auto d : fused_rstd_shape_) { + rstd_elems *= static_cast(d); + } + size_t rstd_bytes = rstd_elems * sizeof(float); + aclrtMalloc(&rstd_data_, rstd_bytes, ACL_MEM_MALLOC_NORMAL_ONLY); + + rstd_tensor_ = aclCreateTensor( + fused_rstd_shape_.data(), + static_cast(fused_rstd_shape_.size()), ACL_FLOAT, + /*strides=*/nullptr, 0, ACL_FORMAT_ND, fused_rstd_shape_.data(), + static_cast(fused_rstd_shape_.size()), rstd_data_); + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + input_cache_.release(); + residual_cache_.release(); + weight_cache_.release(); + out_cache_.release(); + residual_out_cache_.release(); + + // `rstd_tensor_` leaks with the executor at shutdown (see `64c367c`). + if (rstd_data_) aclrtFree(rstd_data_); + } + + void operator()(const Tensor input, const Tensor residual, + const Tensor weight, float eps, Tensor out, + Tensor residual_out) const override { + auto t_input = input_cache_.get(const_cast(input.data())); + auto t_residual = residual_cache_.get(const_cast(residual.data())); + auto t_weight = weight_cache_.get(const_cast(weight.data())); + auto t_out = out_cache_.get(out.data()); + auto t_residual_out = residual_out_cache_.get(residual_out.data()); + auto stream = static_cast(stream_); + + if (!executor_) { + aclnnAddRmsNormGetWorkspaceSize( + t_input, t_residual, t_weight, static_cast(eps), t_out, + rstd_tensor_, t_residual_out, &ws_size_, &executor_); + aclSetAclOpExecutorRepeatable(executor_); + } else { + aclSetInputTensorAddr(executor_, 0, t_input, + const_cast(input.data())); + aclSetInputTensorAddr(executor_, 1, t_residual, + const_cast(residual.data())); + aclSetInputTensorAddr(executor_, 2, t_weight, + const_cast(weight.data())); + aclSetOutputTensorAddr(executor_, 0, t_out, out.data()); + // `rstd` at output index 1 has a stable address — no update needed. 
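+      // The output indices follow the output-tensor order of the
+      // `aclnnAddRmsNormGetWorkspaceSize` call above:
+      // 0 = `t_out`, 1 = `rstd_tensor_`, 2 = `t_residual_out`.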
+ aclSetOutputTensorAddr(executor_, 2, t_residual_out, residual_out.data()); + } + + auto& arena = ascend::GetWorkspacePool().Ensure(stream, ws_size_); + aclnnAddRmsNorm(arena.buf, ws_size_, executor_, stream); + } + + private: + mutable ascend::AclTensorCache input_cache_; + + mutable ascend::AclTensorCache residual_cache_; + + mutable ascend::AclTensorCache weight_cache_; + + mutable ascend::AclTensorCache out_cache_; + + mutable ascend::AclTensorCache residual_out_cache_; + + std::vector fused_rstd_shape_; + + void* rstd_data_ = nullptr; + + aclTensor* rstd_tensor_ = nullptr; + + mutable aclOpExecutor* executor_ = nullptr; + + mutable uint64_t ws_size_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/ascend/causal_softmax/kernel.h b/src/ascend/causal_softmax/kernel.h new file mode 100644 index 0000000..975a034 --- /dev/null +++ b/src/ascend/causal_softmax/kernel.h @@ -0,0 +1,173 @@ +#ifndef INFINI_OPS_ASCEND_CAUSAL_SOFTMAX_KERNEL_H_ +#define INFINI_OPS_ASCEND_CAUSAL_SOFTMAX_KERNEL_H_ + +#include +#include + +#include "acl/acl.h" +#include "aclnn/aclnn_base.h" +#include "aclnn_copy.h" +#include "aclnn_masked_fill_scalar.h" +#include "aclnn_softmax.h" +#include "ascend/common.h" +#include "ascend/workspace_pool_.h" +#include "base/causal_softmax.h" +#include "data_type.h" +#include "operator.h" + +namespace infini::ops { + +// CANN 8.5 has no single API covering causal-mask-then-softmax: the nearest +// candidates (`aclnnSoftmaxV2`, `aclnnScaledSoftmaxGrad`) do not accept a +// boolean mask argument, and `aclnnScaledMaskedSoftmax` requires a +// pre-scaled attention-score tensor produced inside flash-attention, not a +// standalone softmax input. Decomposing into three ACLNN calls is therefore +// unavoidable until a `aclnnCausalSoftmax` ships: +// 1. `aclnnInplaceCopy(temp, input)` — stride-aware copy to a contiguous +// `temp` buffer. +// 2. `aclnnInplaceMaskedFillScalar(temp, mask, -inf)` — apply the +// upper-triangle mask. +// 3. `aclnnSoftmax(temp, dim=-1, out)` — softmax over the last dimension. +// +// The boolean causal mask is pre-computed and uploaded to device once in the +// constructor. Its shape `(seq_len, total_seq_len)` broadcasts over the +// batch dimension. +template <> +class Operator : public CausalSoftmax { + public: + Operator(const Tensor input, Tensor out) + : CausalSoftmax(input, out), in_cache_(input), out_cache_(out) { + // Compute `temp` buffer size — allocated lazily from the pool in + // `operator()`. + size_t n_elems = input.numel(); + size_t elem_bytes = kDataTypeToSize.at(dtype_); + temp_size_ = n_elems * elem_bytes; + + // Build a contiguous `Tensor` descriptor — data pointer set on first use. + Tensor temp_t{nullptr, input.shape(), input.dtype(), input.device()}; + temp_cache_ = ascend::AclTensorCache(temp_t); + + // Causal mask: `mask[i][j] = 1` when position `j` must be masked for + // query `i`. Shape `(seq_len, total_seq_len)` broadcasts over the batch + // dimension. 
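+    // Example (illustrative): `seq_len_ = 2`, `total_seq_len_ = 4` gives
+    //   row 0: 0 0 0 1   (query 0 may attend to positions 0..2)
+    //   row 1: 0 0 0 0   (query 1, the newest token, sees everything)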
+ size_t mask_elems = seq_len_ * total_seq_len_; + std::vector mask_host(mask_elems, 0); + + for (size_t i = 0; i < seq_len_; ++i) { + auto vis_end = static_cast(total_seq_len_ - seq_len_ + i); + + for (auto j = vis_end + 1; j < static_cast(total_seq_len_); + ++j) { + mask_host[i * total_seq_len_ + j] = 1; + } + } + + aclrtMalloc(&mask_buf_, mask_elems, ACL_MEM_MALLOC_NORMAL_ONLY); + aclrtMemcpy(mask_buf_, mask_elems, mask_host.data(), mask_elems, + ACL_MEMCPY_HOST_TO_DEVICE); + + std::vector mshape = {static_cast(seq_len_), + static_cast(total_seq_len_)}; + std::vector mstrides = {static_cast(total_seq_len_), 1}; + mask_tensor_ = aclCreateTensor(mshape.data(), mshape.size(), ACL_BOOL, + mstrides.data(), 0, ACL_FORMAT_ND, + mshape.data(), mshape.size(), mask_buf_); + + // Scalar `-inf` for the masked-fill step. `aclCreateScalar` stores the + // pointer rather than copying, so `neg_inf_storage_` must stay alive + // with the object. + neg_inf_ = aclCreateScalar(&neg_inf_storage_, ACL_FLOAT); + // Workspaces are allocated lazily on the first `operator()` call. + } + + ~Operator() { + if (!ascend::IsAclRuntimeAlive()) return; + + // Null cached descriptors — see `AclTensorCache::release()`. + in_cache_.release(); + out_cache_.release(); + temp_cache_.release(); + + // `mask_tensor_` leaks with `fill_exec_` at shutdown (see `64c367c`). + if (mask_buf_) aclrtFree(mask_buf_); + if (neg_inf_) aclDestroyScalar(neg_inf_); + } + + void operator()(const Tensor input, Tensor out) const override { + auto t_in = in_cache_.get(const_cast(input.data())); + auto t_out = out_cache_.get(out.data()); + auto stream = static_cast(stream_); + + // Obtain shared `temp` buffer from the pool. + auto& temp = ascend::GetWorkspacePool().Ensure(stream, temp_size_, "temp"); + auto t_temp = temp_cache_.get(temp.buf); + + // Step 1: copy `input` (possibly non-contiguous) into a contiguous `temp`. + if (!copy_exec_) { + aclnnInplaceCopyGetWorkspaceSize(t_temp, t_in, ©_ws_, ©_exec_); + aclSetAclOpExecutorRepeatable(copy_exec_); + } else { + aclSetInputTensorAddr(copy_exec_, 0, t_temp, temp.buf); + aclSetInputTensorAddr(copy_exec_, 1, t_in, + const_cast(input.data())); + } + auto& copy_arena = ascend::GetWorkspacePool().Ensure(stream, copy_ws_); + aclnnInplaceCopy(copy_arena.buf, copy_ws_, copy_exec_, stream); + + // Step 2: mask upper-triangle positions with `-inf` in-place. + // `mask_tensor_` and `neg_inf_` have stable addresses — first-call only. + if (!fill_exec_) { + aclnnInplaceMaskedFillScalarGetWorkspaceSize( + t_temp, mask_tensor_, neg_inf_, &fill_ws_, &fill_exec_); + aclSetAclOpExecutorRepeatable(fill_exec_); + } + auto& fill_arena = ascend::GetWorkspacePool().Ensure(stream, fill_ws_); + aclnnInplaceMaskedFillScalar(fill_arena.buf, fill_ws_, fill_exec_, stream); + + // Step 3: softmax over the last dimension -> `out`. 
+ if (!softmax_exec_) { + constexpr int64_t kLastDim = -1; + aclnnSoftmaxGetWorkspaceSize(t_temp, kLastDim, t_out, &softmax_ws_, + &softmax_exec_); + aclSetAclOpExecutorRepeatable(softmax_exec_); + } else { + aclSetOutputTensorAddr(softmax_exec_, 0, t_out, out.data()); + } + auto& softmax_arena = + ascend::GetWorkspacePool().Ensure(stream, softmax_ws_); + aclnnSoftmax(softmax_arena.buf, softmax_ws_, softmax_exec_, stream); + } + + private: + mutable ascend::AclTensorCache in_cache_; + + mutable ascend::AclTensorCache out_cache_; + + mutable ascend::AclTensorCache temp_cache_; + + float neg_inf_storage_ = -std::numeric_limits::infinity(); + + uint64_t temp_size_ = 0; + + void* mask_buf_ = nullptr; + + aclTensor* mask_tensor_ = nullptr; + + aclScalar* neg_inf_ = nullptr; + + mutable aclOpExecutor* copy_exec_ = nullptr; + + mutable uint64_t copy_ws_ = 0; + + mutable aclOpExecutor* fill_exec_ = nullptr; + + mutable uint64_t fill_ws_ = 0; + + mutable aclOpExecutor* softmax_exec_ = nullptr; + + mutable uint64_t softmax_ws_ = 0; +}; + +} // namespace infini::ops + +#endif diff --git a/src/ascend/custom/CMakeLists.txt b/src/ascend/custom/CMakeLists.txt index ca6e688..fb90041 100644 --- a/src/ascend/custom/CMakeLists.txt +++ b/src/ascend/custom/CMakeLists.txt @@ -30,8 +30,6 @@ else() endif() set(PROJECT_OP_SRC_BASE ${PROJECT_SOURCE_DIR}) -set(PROJECT_BUILD_PATH ${PROJECT_SOURCE_DIR}/build) -set(PROJECT_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/output) include(cmake/config_envs.cmake) include(cmake/config_ascend.cmake) @@ -43,13 +41,15 @@ if(CCACHE_PROGRAM) set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") endif() -# Shared library output location. -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_OUTPUT_PATH}) +# `CMAKE_LIBRARY_OUTPUT_DIRECTORY` is set by `build.sh` so that the +# standalone `libascend_kernel.so` lands next to `libno_workspace_kernel.a` +# under `/build/build_ascend_custom/output/`. # Host-side files. file(GLOB OP_SRCS ${PROJECT_OP_SRC_BASE}/torch_binding.cpp ${PROJECT_OP_SRC_BASE}/rms_norm/op_host/rms_norm.cpp + ${PROJECT_OP_SRC_BASE}/add_rms_norm/op_host/add_rms_norm.cpp ) # Shared library name — consumed by `kernel_custom.h` variants and by the @@ -59,8 +59,18 @@ set(OP_PLUGIN_NAME ascend_kernel) # Kernel-side files (device code compiled by the `AscendC` toolchain). ascendc_library(no_workspace_kernel STATIC ${PROJECT_OP_SRC_BASE}/rms_norm/op_kernel/rms_norm.cpp + ${PROJECT_OP_SRC_BASE}/add_rms_norm/op_kernel/add_rms_norm.cpp ) +# The kernel translation units include `"data_type_enum.h"` from the main +# project's `src/` so that launcher and device code share one `DataType` +# enum. `ascendc_library` forwards the interface target's `INCLUDES` +# property to the nested `ExternalProject_Add` (see +# `${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake/legacy_modules/function.cmake`), +# so append the main `src/` dir here. +set_property(TARGET no_workspace_kernel_interface APPEND PROPERTY + INCLUDES ${PROJECT_OP_SRC_BASE}/../..) + # Create the shared library `libascend_kernel.so`. 
add_library(${OP_PLUGIN_NAME} SHARED ${OP_SRCS}) diff --git a/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp b/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp index b8e0d50..b561eaa 100644 --- a/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp +++ b/src/ascend/custom/add_rms_norm/op_host/add_rms_norm.cpp @@ -1,4 +1,4 @@ -#include "aclrtlaunch_add_rms_norm.h" +#include "aclrtlaunch_AddRmsNorm.h" #include "tiling/platform/platform_ascendc.h" #include "torch_kernel_helper.h" @@ -105,16 +105,13 @@ std::vector AddRmsNorm(const at::Tensor& x1, const at::Tensor& x2, float eps_float = static_cast(eps); int64_t dtype_size_val = dtype_size; - // The first arg `add_rms_norm` is the AscendC kernel entry-point name — it - // must match `ascendc_add_operator(OP_NAME add_rms_norm)` in `CMakeLists.txt`, - // the `__global__ __aicore__ void add_rms_norm(...)` definition in - // `op_kernel/`, and the generated `aclrtlaunch_add_rms_norm.h` header. - // Google C++ Style's PascalCase rule does NOT apply: this identifier is - // dictated by the AscendC toolchain's symbol convention. - EXEC_KERNEL_CMD(add_rms_norm, block_dim, kernel_input1, kernel_input2, - weight_float, kernel_output_y, kernel_output_x_out, - total_rows, dim_length, dim_length_align, former_num, - former_length, tail_length, eps_float, dtype_size_val); + // The first arg `AddRmsNorm` is the AscendC kernel entry-point name — it + // must match the `__global__ __aicore__ void AddRmsNorm(...)` definition + // in `op_kernel/` and the generated `aclrtlaunch_AddRmsNorm.h` header. + EXEC_KERNEL_CMD(AddRmsNorm, block_dim, kernel_input1, kernel_input2, + weight_float, total_rows, dim_length, dim_length_align, + former_num, former_length, tail_length, eps_float, + dtype_size_val, kernel_output_y, kernel_output_x_out); // Remove padding and reshape back to original shape. at::Tensor output_y = kernel_output_y; diff --git a/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy b/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy new file mode 100644 index 0000000..ccf1397 --- /dev/null +++ b/src/ascend/custom/add_rms_norm/op_kernel/.clang-tidy @@ -0,0 +1,9 @@ +--- +# `op_kernel/*.cpp` is `AscendC` device code compiled by `ccec`, not by +# the host toolchain, so it has no entry in `compile_commands.json` and +# `clang-tidy` cannot parse it correctly (the `__aicore__` macro expands +# unexpectedly when `kernel_operator.h` is absent). Disable all checks +# here — the `op_host/` side and the `kernel_custom.h` launcher still +# enforce the full ruleset. 
+ +Checks: '-*' diff --git a/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp b/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp index e2a08e5..4b677d3 100644 --- a/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp +++ b/src/ascend/custom/add_rms_norm/op_kernel/add_rms_norm.cpp @@ -1,98 +1,102 @@ +#include "data_type.h" #include "kernel_operator.h" -constexpr int32_t BUFFER_NUM = 2; +constexpr int32_t kBufferNum = 2; template class KernelAddRmsNorm { public: __aicore__ inline KernelAddRmsNorm() {} - __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, - GM_ADDR x_out, int64_t totalRows, - int64_t dimLength, int64_t dimLengthAlign, - int64_t formerNum, int64_t formerLength, - int64_t tailLength, float eps) { - this->dimLength = dimLength; - this->dimLengthAlign = dimLengthAlign; - this->eps = eps; + __aicore__ inline void Init(GM_ADDR input, GM_ADDR residual, GM_ADDR weight, + int64_t total_rows, int64_t dim_length, + int64_t dim_length_align, int64_t former_num, + int64_t former_length, int64_t tail_length, + float eps, GM_ADDR out, GM_ADDR residual_out) { + dim_length_ = dim_length; + dim_length_align_ = dim_length_align; + eps_ = eps; // Block-level tiling: determine row range for this core. - int64_t blockIdx = AscendC::GetBlockIdx(); - int64_t rowOffset; + int64_t block_idx = AscendC::GetBlockIdx(); + int64_t row_offset; - if (blockIdx < formerNum) { - this->blockRows = formerLength; - rowOffset = formerLength * blockIdx; + if (block_idx < former_num) { + block_rows_ = former_length; + row_offset = former_length * block_idx; } else { - this->blockRows = tailLength; - int64_t tailIdx = blockIdx - formerNum; - rowOffset = formerLength * formerNum + tailLength * tailIdx; + block_rows_ = tail_length; + int64_t tail_idx = block_idx - former_num; + row_offset = former_length * former_num + tail_length * tail_idx; } // Global memory pointers. - x1Gm.SetGlobalBuffer((__gm__ T*)x1 + rowOffset * dimLengthAlign, - this->blockRows * dimLengthAlign); - x2Gm.SetGlobalBuffer((__gm__ T*)x2 + rowOffset * dimLengthAlign, - this->blockRows * dimLengthAlign); - yGm.SetGlobalBuffer((__gm__ T*)y + rowOffset * dimLengthAlign, - this->blockRows * dimLengthAlign); - xOutGm.SetGlobalBuffer((__gm__ T*)x_out + rowOffset * dimLengthAlign, - this->blockRows * dimLengthAlign); - weightGm.SetGlobalBuffer((__gm__ float*)weight, dimLengthAlign); - - int32_t dimLenAlign = static_cast(this->dimLengthAlign); + input_gm_.SetGlobalBuffer((__gm__ T*)input + row_offset * dim_length_align, + block_rows_ * dim_length_align); + residual_gm_.SetGlobalBuffer( + (__gm__ T*)residual + row_offset * dim_length_align, + block_rows_ * dim_length_align); + out_gm_.SetGlobalBuffer((__gm__ T*)out + row_offset * dim_length_align, + block_rows_ * dim_length_align); + residual_out_gm_.SetGlobalBuffer( + (__gm__ T*)residual_out + row_offset * dim_length_align, + block_rows_ * dim_length_align); + weight_gm_.SetGlobalBuffer((__gm__ float*)weight, dim_length_align); + + int32_t dim_len_align = static_cast(dim_length_align_); // I/O queues (double-buffered). 
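+    // Rough UB footprint (illustrative numbers: fp16, dim_length_align =
+    // 4096): each double-buffered queue holds 2 * 4096 * 2 B = 16 KiB, so
+    // the four queues take ~64 KiB, plus 16 KiB for the fp32 weight buffer
+    // and 2 * 16 KiB for the fp32 compute buffers.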
- pipe.InitBuffer(inQueueX1, BUFFER_NUM, - dimLenAlign * static_cast(sizeof(T))); - pipe.InitBuffer(inQueueX2, BUFFER_NUM, - dimLenAlign * static_cast(sizeof(T))); - pipe.InitBuffer(outQueueY, BUFFER_NUM, - dimLenAlign * static_cast(sizeof(T))); - pipe.InitBuffer(outQueueXOut, BUFFER_NUM, - dimLenAlign * static_cast(sizeof(T))); + pipe_.InitBuffer(in_queue_input_, kBufferNum, + dim_len_align * static_cast(sizeof(T))); + pipe_.InitBuffer(in_queue_residual_, kBufferNum, + dim_len_align * static_cast(sizeof(T))); + pipe_.InitBuffer(out_queue_out_, kBufferNum, + dim_len_align * static_cast(sizeof(T))); + pipe_.InitBuffer(out_queue_residual_out_, kBufferNum, + dim_len_align * static_cast(sizeof(T))); // Weight buffer (fp32, loaded once, reused for all rows). - pipe.InitBuffer(weightBuf, - dimLenAlign * static_cast(sizeof(float))); + pipe_.InitBuffer(weight_buf_, + dim_len_align * static_cast(sizeof(float))); - // FP16 path needs extra fp32 compute buffers. - // buf1: holds x_out in fp32 (reused from x1_fp32 after Add). - // buf2: holds x2_fp32 initially, then x_out^2, then final result. + // FP16/BF16 path needs extra fp32 compute buffers. + // `fp32_buf1_`: holds `x_out` in fp32 (reused from `x1_fp32` after Add). + // `fp32_buf2_`: holds `x2_fp32` initially, then `x_out^2`, then final + // result. if constexpr (sizeof(T) == 2) { - pipe.InitBuffer(fp32Buf1, - dimLenAlign * static_cast(sizeof(float))); - pipe.InitBuffer(fp32Buf2, - dimLenAlign * static_cast(sizeof(float))); + pipe_.InitBuffer(fp32_buf1_, + dim_len_align * static_cast(sizeof(float))); + pipe_.InitBuffer(fp32_buf2_, + dim_len_align * static_cast(sizeof(float))); } - // ReduceSum temporary buffer (size per API formula). - constexpr int32_t ELEMS_PER_REPEAT = 256 / sizeof(float); - constexpr int32_t ELEMS_PER_BLOCK = 32 / sizeof(float); - int32_t firstMaxRepeat = - (dimLenAlign + ELEMS_PER_REPEAT - 1) / ELEMS_PER_REPEAT; - int32_t reduceTmpSize = - ((firstMaxRepeat + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK) * - ELEMS_PER_BLOCK; - pipe.InitBuffer(reduceTmpBuf, - reduceTmpSize * static_cast(sizeof(float))); + // `ReduceSum` temporary buffer (size per API formula). + constexpr int32_t kElemsPerRepeat = 256 / sizeof(float); + constexpr int32_t kElemsPerBlock = 32 / sizeof(float); + int32_t first_max_repeat = + (dim_len_align + kElemsPerRepeat - 1) / kElemsPerRepeat; + int32_t reduce_tmp_size = + ((first_max_repeat + kElemsPerBlock - 1) / kElemsPerBlock) * + kElemsPerBlock; + pipe_.InitBuffer(reduce_tmp_buf_, + reduce_tmp_size * static_cast(sizeof(float))); // Scalar buffer for reduction result (8 floats = 32 bytes). - pipe.InitBuffer(sumBuf, 32); + pipe_.InitBuffer(sum_buf_, 32); - // Load weight (fp32) from GM into `weightBuf`. - AscendC::LocalTensor wLocal = weightBuf.Get(); - AscendC::DataCopyExtParams wParams{ - 1, static_cast(dimLenAlign * sizeof(float)), 0, 0, 0}; - AscendC::DataCopyPadExtParams wPad{false, 0, 0, 0.0f}; - AscendC::DataCopyPad(wLocal, weightGm, wParams, wPad); + // Load weight (fp32) from GM into `weight_buf_`. + AscendC::LocalTensor w_local = weight_buf_.Get(); + AscendC::DataCopyExtParams w_params{ + 1, static_cast(dim_len_align * sizeof(float)), 0, 0, 0}; + AscendC::DataCopyPadExtParams w_pad{false, 0, 0, 0.0f}; + AscendC::DataCopyPad(w_local, weight_gm_, w_params, w_pad); // Ensure weight DMA completes before compute. 
AscendC::PipeBarrier(); } __aicore__ inline void Process() { - for (int64_t row = 0; row < this->blockRows; ++row) { + for (int64_t row = 0; row < block_rows_; ++row) { CopyIn(row); Compute(row); CopyOut(row); @@ -101,149 +105,175 @@ class KernelAddRmsNorm { private: __aicore__ inline void CopyIn(int64_t row) { - AscendC::LocalTensor x1Local = inQueueX1.AllocTensor(); - AscendC::LocalTensor x2Local = inQueueX2.AllocTensor(); + AscendC::LocalTensor input_local = in_queue_input_.AllocTensor(); + AscendC::LocalTensor residual_local = + in_queue_residual_.AllocTensor(); AscendC::DataCopyExtParams params{ - 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; + 1, static_cast(dim_length_align_ * sizeof(T)), 0, 0, 0}; AscendC::DataCopyPadExtParams pad{false, 0, 0, static_cast(0)}; - AscendC::DataCopyPad(x1Local, x1Gm[row * this->dimLengthAlign], params, - pad); - AscendC::DataCopyPad(x2Local, x2Gm[row * this->dimLengthAlign], params, - pad); - inQueueX1.EnQue(x1Local); - inQueueX2.EnQue(x2Local); + AscendC::DataCopyPad(input_local, input_gm_[row * dim_length_align_], + params, pad); + AscendC::DataCopyPad(residual_local, residual_gm_[row * dim_length_align_], + params, pad); + in_queue_input_.EnQue(input_local); + in_queue_residual_.EnQue(residual_local); } __aicore__ inline void Compute(int64_t row) { - AscendC::LocalTensor x1Local = inQueueX1.DeQue(); - AscendC::LocalTensor x2Local = inQueueX2.DeQue(); - AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); - AscendC::LocalTensor xOutLocal = outQueueXOut.AllocTensor(); + AscendC::LocalTensor input_local = in_queue_input_.DeQue(); + AscendC::LocalTensor residual_local = in_queue_residual_.DeQue(); + AscendC::LocalTensor out_local = out_queue_out_.AllocTensor(); + AscendC::LocalTensor residual_out_local = + out_queue_residual_out_.AllocTensor(); - AscendC::LocalTensor wLocal = weightBuf.Get(); - AscendC::LocalTensor rTmp = reduceTmpBuf.Get(); - AscendC::LocalTensor sLocal = sumBuf.Get(); + AscendC::LocalTensor w_local = weight_buf_.Get(); + AscendC::LocalTensor r_tmp = reduce_tmp_buf_.Get(); + AscendC::LocalTensor s_local = sum_buf_.Get(); - int32_t dimLen = static_cast(this->dimLength); - int32_t dimLenAlign = static_cast(this->dimLengthAlign); + int32_t dim_len = static_cast(dim_length_); + int32_t dim_len_align = static_cast(dim_length_align_); if constexpr (sizeof(T) == 4) { // ---- FP32 path: compute directly. ---- // Step 1: x_out = x1 + x2. - AscendC::Add(xOutLocal, x1Local, x2Local, dimLenAlign); + AscendC::Add(residual_out_local, input_local, residual_local, + dim_len_align); - // Step 2: x_out^2 into yLocal (reuse output buffer temporarily). - AscendC::Mul(yLocal, xOutLocal, xOutLocal, dimLenAlign); + // Step 2: x_out^2 into out_local (reuse output buffer temporarily). + AscendC::Mul(out_local, residual_out_local, residual_out_local, + dim_len_align); - // Step 3: ReduceSum(x_out^2) -> sLocal[0]. - // ReduceSum may modify yLocal, but we overwrite it below. - AscendC::ReduceSum(sLocal, yLocal, rTmp, dimLenAlign); + // Step 3: ReduceSum(x_out^2) -> s_local[0]. + // `ReduceSum` may modify `out_local`, but we overwrite it below. + AscendC::ReduceSum(s_local, out_local, r_tmp, dim_len_align); // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). 
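+      // e.g. (illustrative values) x_out = [1, 2, 3, 4], dim_len = 4,
+      // eps ~ 0: sum = 30, mean = 7.5, scale = 1 / sqrt(7.5) ~ 0.365.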
- float sumVal = sLocal.GetValue(0); - float meanVal = sumVal / static_cast(dimLen) + this->eps; - sLocal.SetValue(0, meanVal); - AscendC::Sqrt(sLocal, sLocal, 8); - float scale = 1.0f / sLocal.GetValue(0); + float sum_val = s_local.GetValue(0); + float mean_val = sum_val / static_cast(dim_len) + eps_; + s_local.SetValue(0, mean_val); + AscendC::Sqrt(s_local, s_local, 8); + float scale = 1.0f / s_local.GetValue(0); // Step 6: y = x_out * scale. - AscendC::Muls(yLocal, xOutLocal, scale, dimLenAlign); + AscendC::Muls(out_local, residual_out_local, scale, dim_len_align); // Step 7: y = y * weight. - AscendC::Mul(yLocal, yLocal, wLocal, dimLenAlign); + AscendC::Mul(out_local, out_local, w_local, dim_len_align); } else { - // ---- FP16 path: cast → fp32 compute → cast back. ---- - AscendC::LocalTensor b1 = fp32Buf1.Get(); - AscendC::LocalTensor b2 = fp32Buf2.Get(); + // ---- FP16/BF16 path: cast → fp32 compute → cast back. ---- + AscendC::LocalTensor b1 = fp32_buf1_.Get(); + AscendC::LocalTensor b2 = fp32_buf2_.Get(); - // Cast inputs fp16 → fp32. - AscendC::Cast(b1, x1Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); - AscendC::Cast(b2, x2Local, AscendC::RoundMode::CAST_NONE, dimLenAlign); + // Cast inputs fp16/bf16 → fp32. + AscendC::Cast(b1, input_local, AscendC::RoundMode::CAST_NONE, + dim_len_align); + AscendC::Cast(b2, residual_local, AscendC::RoundMode::CAST_NONE, + dim_len_align); // Step 1: x_out = x1 + x2 (fp32), stored in b1. - AscendC::Add(b1, b1, b2, dimLenAlign); + AscendC::Add(b1, b1, b2, dim_len_align); - // Cast x_out fp32 → fp16 for the x_out output. - AscendC::Cast(xOutLocal, b1, AscendC::RoundMode::CAST_ROUND, dimLenAlign); + // Cast `x_out` fp32 → fp16/bf16 for the residual output. + AscendC::Cast(residual_out_local, b1, AscendC::RoundMode::CAST_RINT, + dim_len_align); // Step 2: x_out^2 in fp32, stored in b2. - AscendC::Mul(b2, b1, b1, dimLenAlign); + AscendC::Mul(b2, b1, b1, dim_len_align); - // Step 3: ReduceSum(x_out^2) -> sLocal[0]. - AscendC::ReduceSum(sLocal, b2, rTmp, dimLenAlign); + // Step 3: ReduceSum(x_out^2) -> s_local[0]. + AscendC::ReduceSum(s_local, b2, r_tmp, dim_len_align); // Step 4-5: scale = 1 / sqrt(mean(x_out^2) + eps). - float sumVal = sLocal.GetValue(0); - float meanVal = sumVal / static_cast(dimLen) + this->eps; - sLocal.SetValue(0, meanVal); - AscendC::Sqrt(sLocal, sLocal, 8); - float scale = 1.0f / sLocal.GetValue(0); + float sum_val = s_local.GetValue(0); + float mean_val = sum_val / static_cast(dim_len) + eps_; + s_local.SetValue(0, mean_val); + AscendC::Sqrt(s_local, s_local, 8); + float scale = 1.0f / s_local.GetValue(0); // Step 6: y = x_out * scale (fp32), reuse b2. - AscendC::Muls(b2, b1, scale, dimLenAlign); + AscendC::Muls(b2, b1, scale, dim_len_align); // Step 7: y = y * weight (fp32). - AscendC::Mul(b2, b2, wLocal, dimLenAlign); + AscendC::Mul(b2, b2, w_local, dim_len_align); - // Cast result fp32 → fp16. 
- AscendC::Cast(yLocal, b2, AscendC::RoundMode::CAST_ROUND, dimLenAlign); + AscendC::Cast(out_local, b2, AscendC::RoundMode::CAST_RINT, + dim_len_align); } - inQueueX1.FreeTensor(x1Local); - inQueueX2.FreeTensor(x2Local); - outQueueY.EnQue(yLocal); - outQueueXOut.EnQue(xOutLocal); + in_queue_input_.FreeTensor(input_local); + in_queue_residual_.FreeTensor(residual_local); + out_queue_out_.EnQue(out_local); + out_queue_residual_out_.EnQue(residual_out_local); } __aicore__ inline void CopyOut(int64_t row) { - AscendC::LocalTensor yLocal = outQueueY.DeQue(); - AscendC::LocalTensor xOutLocal = outQueueXOut.DeQue(); + AscendC::LocalTensor out_local = out_queue_out_.DeQue(); + AscendC::LocalTensor residual_out_local = + out_queue_residual_out_.DeQue(); AscendC::DataCopyExtParams params{ - 1, static_cast(this->dimLengthAlign * sizeof(T)), 0, 0, 0}; - AscendC::DataCopyPad(yGm[row * this->dimLengthAlign], yLocal, params); - AscendC::DataCopyPad(xOutGm[row * this->dimLengthAlign], xOutLocal, params); - outQueueY.FreeTensor(yLocal); - outQueueXOut.FreeTensor(xOutLocal); + 1, static_cast(dim_length_align_ * sizeof(T)), 0, 0, 0}; + AscendC::DataCopyPad(out_gm_[row * dim_length_align_], out_local, params); + AscendC::DataCopyPad(residual_out_gm_[row * dim_length_align_], + residual_out_local, params); + out_queue_out_.FreeTensor(out_local); + out_queue_residual_out_.FreeTensor(residual_out_local); } private: - AscendC::TPipe pipe; - AscendC::TQue inQueueX1; - AscendC::TQue inQueueX2; - AscendC::TQue outQueueY; - AscendC::TQue outQueueXOut; - - AscendC::TBuf weightBuf; - AscendC::TBuf fp32Buf1; - AscendC::TBuf fp32Buf2; - AscendC::TBuf reduceTmpBuf; - AscendC::TBuf sumBuf; - - AscendC::GlobalTensor x1Gm, x2Gm, yGm, xOutGm; - AscendC::GlobalTensor weightGm; - - int64_t blockRows; - int64_t dimLength; - int64_t dimLengthAlign; - float eps; + AscendC::TPipe pipe_; + AscendC::TQue in_queue_input_; + AscendC::TQue in_queue_residual_; + AscendC::TQue out_queue_out_; + AscendC::TQue out_queue_residual_out_; + + AscendC::TBuf weight_buf_; + AscendC::TBuf fp32_buf1_; + AscendC::TBuf fp32_buf2_; + AscendC::TBuf reduce_tmp_buf_; + AscendC::TBuf sum_buf_; + + AscendC::GlobalTensor input_gm_, residual_gm_, out_gm_, residual_out_gm_; + AscendC::GlobalTensor weight_gm_; + + int64_t block_rows_; + int64_t dim_length_; + int64_t dim_length_align_; + float eps_; }; -extern "C" __global__ __aicore__ void add_rms_norm( - GM_ADDR x1, GM_ADDR x2, GM_ADDR weight, GM_ADDR y, GM_ADDR x_out, - int64_t totalRows, int64_t dimLength, int64_t dimLengthAlign, - int64_t formerNum, int64_t formerLength, int64_t tailLength, float eps, - int64_t dtypeSize) { - if (dtypeSize == 2) { - KernelAddRmsNorm op; - op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, - formerNum, formerLength, tailLength, eps); - op.Process(); - } else { - KernelAddRmsNorm op; - op.Init(x1, x2, weight, y, x_out, totalRows, dimLength, dimLengthAlign, - formerNum, formerLength, tailLength, eps); - op.Process(); +// `dtype_code` is `static_cast(infini::ops::DataType)` forwarded +// by the host launcher. fp16 and bf16 both have `sizeof == 2` but need +// distinct numeric paths, so dispatch is on the `DataType` tag rather +// than the byte size. 
+extern "C" __global__ __aicore__ void AddRmsNorm( + GM_ADDR input, GM_ADDR residual, GM_ADDR weight, int64_t total_rows, + int64_t dim_length, int64_t dim_length_align, int64_t former_num, + int64_t former_length, int64_t tail_length, float eps, int64_t dtype_code, + GM_ADDR out, GM_ADDR residual_out) { + switch (static_cast(dtype_code)) { + case infini::ops::DataType::kFloat16: { + KernelAddRmsNorm op; + op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, + former_num, former_length, tail_length, eps, out, residual_out); + op.Process(); + break; + } + case infini::ops::DataType::kBFloat16: { + KernelAddRmsNorm op; + op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, + former_num, former_length, tail_length, eps, out, residual_out); + op.Process(); + break; + } + case infini::ops::DataType::kFloat32: + default: { + KernelAddRmsNorm op; + op.Init(input, residual, weight, total_rows, dim_length, dim_length_align, + former_num, former_length, tail_length, eps, out, residual_out); + op.Process(); + break; + } } } diff --git a/src/ascend/custom/build.sh b/src/ascend/custom/build.sh index 258a88e..8374088 100755 --- a/src/ascend/custom/build.sh +++ b/src/ascend/custom/build.sh @@ -1,30 +1,45 @@ #!/bin/bash -# Build custom `AscendC` kernels into `libascend_kernel.so`. +# Build custom `AscendC` kernels into `libno_workspace_kernel.a` (+ the +# standalone `libascend_kernel.so`). +# +# Intermediate artefacts default to `/build/build_ascend_custom/` +# so the source tree under `src/` stays free of build output. Override +# via `BUILD_DIR= bash build.sh …` if needed. set -e SOC_VERSION="${1:-Ascend910_9382}" +# Use the same `cmake` the caller resolved (default: first `cmake` on +# PATH). The outer `src/CMakeLists.txt` forwards `${CMAKE_COMMAND}` +# via `CMAKE_EXE` so the child build doesn't accidentally pick up the +# PyPI `cmake` shim whose Python package only exists in `pip`'s +# build-isolation overlay. +CMAKE_EXE="${CMAKE_EXE:-cmake}" + # Detect CANN toolkit path. _CANN_TOOLKIT_INSTALL_PATH=$(grep "Toolkit_InstallPath" /etc/Ascend/ascend_cann_install.info | awk -F'=' '{print $2}') source "${_CANN_TOOLKIT_INSTALL_PATH}/set_env.sh" echo "CANN: ${ASCEND_TOOLKIT_HOME}" ASCEND_INCLUDE_DIR=${ASCEND_TOOLKIT_HOME}/$(arch)-linux/include -CURRENT_DIR=$(pwd) -OUTPUT_DIR=${CURRENT_DIR}/output -mkdir -p "${OUTPUT_DIR}" -BUILD_DIR=build +# Resolve build directory. `