From f0c5a569f40bdb9e573eb5cf4759fbc43d25c6a3 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Tue, 1 Jul 2025 10:00:07 +0800 Subject: [PATCH 1/8] =?UTF-8?q?issue/291/build:=20nvidia=20=E5=92=8C=20met?= =?UTF-8?q?ax=20=E5=85=B1=E7=94=A8=20softmax=20kernel=20=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- .../cuda/causal_softmax_cuda.cuh | 8 --- .../{causal_softmax_kernel.cuh => kernel.cuh} | 17 ++--- .../maca/causal_softmax_kernel.h | 60 ---------------- .../causal_softmax/maca/causal_softmax_maca.h | 8 --- .../metax/causal_softmax_metax.h | 8 +++ .../causal_softmax_metax.maca} | 23 +++++-- .../causal_softmax_nvidia.cu} | 23 +++++-- .../nvidia/causal_softmax_nvidia.cuh | 8 +++ src/infiniop/ops/causal_softmax/operator.cc | 68 +++++++------------ .../ops/rms_norm/cuda/rms_norm_kernel.cuh | 3 + .../rms_norm_metax.cuh} | 0 .../rms_norm_metax.maca} | 2 +- src/infiniop/ops/rms_norm/operator.cc | 2 +- src/infiniop/reduce/cuda/reduce.cuh | 2 - src/infiniop/reduce/maca/reduce.h | 63 ----------------- xmake.lua | 2 +- xmake/cuda.lua | 2 +- xmake/{maca.lua => metax.lua} | 4 +- 18 files changed, 94 insertions(+), 209 deletions(-) delete mode 100644 src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh rename src/infiniop/ops/causal_softmax/cuda/{causal_softmax_kernel.cuh => kernel.cuh} (86%) delete mode 100644 src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h delete mode 100644 src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h create mode 100644 src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h rename src/infiniop/ops/causal_softmax/{maca/causal_softmax_maca.maca => metax/causal_softmax_metax.maca} (83%) rename src/infiniop/ops/causal_softmax/{cuda/causal_softmax_cuda.cu => nvidia/causal_softmax_nvidia.cu} (84%) create mode 100644 src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh rename src/infiniop/ops/rms_norm/{maca/rms_norm_maca.cuh => metax/rms_norm_metax.cuh} (100%) rename src/infiniop/ops/rms_norm/{maca/rms_norm_maca.maca => metax/rms_norm_metax.maca} (99%) delete mode 100644 src/infiniop/reduce/maca/reduce.h rename xmake/{maca.lua => metax.lua} (94%) diff --git a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh b/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh deleted file mode 100644 index cbb2f8a42..000000000 --- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __CAUSAL_SOFTMAX_CUDA_H__ -#define __CAUSAL_SOFTMAX_CUDA_H__ - -#include "../causal_softmax.h" - -DESCRIPTOR(cuda) - -#endif diff --git a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh similarity index 86% rename from src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh rename to src/infiniop/ops/causal_softmax/cuda/kernel.cuh index 16b23291c..4dce83a6b 100644 --- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh +++ b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh @@ -1,11 +1,8 @@ -#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__ +#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__ #define __CAUSAL_SOFTMAX_KERNEL_CUH__ -#include "../../../devices/cuda/cuda_kernel_common.cuh" -#include "../../../reduce/cuda/reduce.cuh" - template -INFINIOP_CUDA_KERNEL causalSoftmax( +__device__ void causalSoftmaxKernel( Tdata *y_, const Tdata *x_, size_t batch, size_t height, size_t width, ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, @@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax( // 2 | * * * ... * * * | // height: 3 col_id-> if (width + blockIdx.x >= threadIdx.x + height) { -#ifdef ENABLE_NVIDIA_API - y[col] = exp_(x[col] - max_); -#else - y[col] = exp(x[col] - max_); -#endif + if constexpr (std::is_same_v) { + y[col] = hexp(x[col] - max_); + } else { + y[col] = exp(x[col] - max_); + } } else { y[col] = Tdata(0); } diff --git a/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h b/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h deleted file mode 100644 index 4ecc8f0da..000000000 --- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef __CAUSAL_SOFTMAX_KERNEL_H__ -#define __CAUSAL_SOFTMAX_KERNEL_H__ - -#include "../../../devices/maca/maca_kernel_common.h" -#include "../../../reduce/maca/reduce.h" - -template -INFINIOP_MACA_KERNEL causalSoftmax( - Tdata *y_, const Tdata *x_, - size_t batch, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) { - - Tdata *y = y_ // threadIdx.x for col_id - + blockIdx.y * y_stride_b // gridDim.y for batch_id - + blockIdx.x * y_stride_h; // gridDim.x for row_id - const Tdata *x = x_ + blockIdx.y * x_stride_b + blockIdx.x * x_stride_h; - - // [Reduce] Find max value in each row and store in shared memory - __shared__ Tdata max_; - Tdata max_0 = op::common_maca::reduce_op::max(x, width - height + 1 + blockIdx.x); - if (threadIdx.x == 0) { - max_ = max_0; - } - __syncthreads(); - - // [Elementwise] Subtract max value from each element and apply causal mask - for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) { - // row_id ↓ |<- width ->| - // 0 | * * * ... * | - // 1 | * * * ... * * | - // 2 | * * * ... * * * | - // height: 3 col_id-> - if (width + blockIdx.x >= threadIdx.x + height) { -#ifdef ENABLE_MACA_API - y[col] = exp_(x[col] - max_); -#else - y[col] = exp(x[col] - max_); -#endif - } else { - y[col] = Tdata(0); - } - } - __syncthreads(); - - // [Reduce] Find the sum of each updated row and store in shared memory - __shared__ Tcompute sum_; - Tcompute sum_0 = op::common_maca::reduce_op::sum(y, width); - if (threadIdx.x == 0) { - sum_ = sum_0; - } - __syncthreads(); - - // [Elementwise] Divide each element by the sum and store in shared memory - for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) { - y[col] /= Tdata(sum_); - } -} - -#endif // __CAUSAL_SOFTMAX_KERNEL_H__ diff --git a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h b/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h deleted file mode 100644 index 0f0075db7..000000000 --- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __CAUSAL_SOFTMAX_MACA_H__ -#define __CAUSAL_SOFTMAX_MACA_H__ - -#include "../causal_softmax.h" - -DESCRIPTOR(maca) - -#endif diff --git a/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h new file mode 100644 index 000000000..1b68a8f5f --- /dev/null +++ b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h @@ -0,0 +1,8 @@ +#ifndef __CAUSAL_SOFTMAX_METAX_H__ +#define __CAUSAL_SOFTMAX_METAX_H__ + +#include "../causal_softmax.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca similarity index 83% rename from src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca rename to src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca index c48f1f49b..4823c04ba 100644 --- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca +++ b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca @@ -1,8 +1,23 @@ #include "../../../devices/maca/common_maca.h" -#include "causal_softmax_kernel.h" -#include "causal_softmax_maca.h" +#include "../../../devices/maca/maca_kernel_common.h" +#include "causal_softmax_metax.h" -namespace op::causal_softmax::maca { +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +template +INFINIOP_MACA_KERNEL causalSoftmax( + Tdata *y, const Tdata *x, + size_t batch, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) { + causalSoftmaxKernel(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h); +} + +namespace op::causal_softmax::metax { struct Descriptor::Opaque { std::shared_ptr internal; @@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, return INFINI_STATUS_SUCCESS; } -} // namespace op::causal_softmax::maca +} // namespace op::causal_softmax::metax diff --git a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu similarity index 84% rename from src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu rename to src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu index e3f3eca2f..dce85428d 100644 --- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu +++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu @@ -1,8 +1,23 @@ #include "../../../devices/cuda/cuda_common.cuh" -#include "causal_softmax_cuda.cuh" -#include "causal_softmax_kernel.cuh" +#include "../../../devices/cuda/cuda_kernel_common.cuh" +#include "causal_softmax_nvidia.cuh" -namespace op::causal_softmax::cuda { +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +template +INFINIOP_CUDA_KERNEL causalSoftmax( + Tdata *y, const Tdata *x, + size_t batch, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) { + causalSoftmaxKernel(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h); +} + +namespace op::causal_softmax::nvidia { struct Descriptor::Opaque { std::shared_ptr internal; @@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, return INFINI_STATUS_SUCCESS; } -} // namespace op::causal_softmax::cuda +} // namespace op::causal_softmax::nvidia diff --git a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh new file mode 100644 index 000000000..0362c43b1 --- /dev/null +++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CAUSAL_SOFTMAX_NVIDIA_H__ +#define __CAUSAL_SOFTMAX_NVIDIA_H__ + +#include "../causal_softmax.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/causal_softmax/operator.cc b/src/infiniop/ops/causal_softmax/operator.cc index 2f3b34fbe..c2051e28a 100644 --- a/src/infiniop/ops/causal_softmax/operator.cc +++ b/src/infiniop/ops/causal_softmax/operator.cc @@ -6,10 +6,10 @@ #include "cpu/causal_softmax_cpu.h" #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/causal_softmax_cuda.cuh" +#include "nvidia/causal_softmax_nvidia.cuh" #endif #ifdef ENABLE_METAX_API -#include "maca/causal_softmax_maca.h" +#include "metax/causal_softmax_metax.h" #endif #ifdef ENABLE_ASCEND_API #include "ascend/causal_softmax_ascend.h" @@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor( CREATE(INFINI_DEVICE_CPU, cpu) #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda) + CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, maca) + CREATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend) #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor( // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); } #endif -#ifdef ENABLE_ASCEND_API - CREATE(INFINI_DEVICE_ASCEND, ascend) -#endif -#ifdef ENABLE_METAX_GPU - case DevMetaxGpu: { - return macaCreateCausalSoftmaxDescriptor((MacaHandle_t)handle, (CausalSoftmaxMacaDescriptor_t *)desc_ptr, y_desc); - } -#endif #ifdef ENABLE_MTHREADS_GPU case DevMthreadsGpu: { return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc); @@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe GET(INFINI_DEVICE_CPU, cpu) #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda) + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend) #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe } #endif -#ifdef ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend) -#endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, maca) -#endif -#ifdef ENABLE_METAX_GPU - case DevMetaxGpu: { - return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t)desc, size); - } -#endif #ifdef ENABLE_MTHREADS_GPU case DevMthreadsGpu: { return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size); @@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax( CALCULATE(INFINI_DEVICE_CPU, cpu) #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda) + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, maca) + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend) #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax( // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); } #endif -#ifdef ENABLE_ASCEND_API - CALCULATE(INFINI_DEVICE_ASCEND, ascend) -#endif -#ifdef ENABLE_METAX_GPU - case DevMetaxGpu: { - return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t)desc, workspace, workspace_size, data, stream); - } -#endif #ifdef ENABLE_MTHREADS_GPU case DevMthreadsGpu: { return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream); @@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD DESTROY(INFINI_DEVICE_CPU, cpu) #endif #ifdef ENABLE_NVIDIA_API - DESTROY(INFINI_DEVICE_NVIDIA, cuda) + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif #ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, maca) + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +#ifdef ENABLE_ASCEND_API + DESTROY(INFINI_DEVICE_ASCEND, ascend) #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); } #endif -#ifdef ENABLE_ASCEND_API - DESTROY(INFINI_DEVICE_ASCEND, ascend) -#endif -#ifdef ENABLE_METAX_GPU - case DevMetaxGpu: { - return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t)desc); - } -#endif #ifdef ENABLE_MTHREADS_GPU case DevMthreadsGpu: return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc); diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh b/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh index 99c2bb105..c2690817e 100644 --- a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh +++ b/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh @@ -2,6 +2,9 @@ #define __RMS_NORM_CUDA_KERNEL_H__ #include "../../../devices/cuda/cuda_kernel_common.cuh" + +#include + #include "../../../reduce/cuda/reduce.cuh" template diff --git a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.cuh similarity index 100% rename from src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh rename to src/infiniop/ops/rms_norm/metax/rms_norm_metax.cuh diff --git a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca similarity index 99% rename from src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca rename to src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca index 9ed931404..42e70d812 100644 --- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca +++ b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca @@ -1,6 +1,6 @@ #include "../../../devices/maca/common_maca.h" #include "../cuda/rms_norm_kernel.cuh" -#include "rms_norm_maca.cuh" +#include "rms_norm_metax.cuh" namespace op::rms_norm::maca { diff --git a/src/infiniop/ops/rms_norm/operator.cc b/src/infiniop/ops/rms_norm/operator.cc index 7b460dd1b..571582e82 100644 --- a/src/infiniop/ops/rms_norm/operator.cc +++ b/src/infiniop/ops/rms_norm/operator.cc @@ -12,7 +12,7 @@ #include "ascend/rms_norm_aclnn.h" #endif #ifdef ENABLE_METAX_API -#include "maca/rms_norm_maca.cuh" +#include "metax/rms_norm_metax.cuh" #endif #ifdef ENABLE_MOORE_API #include "musa/rms_norm_musa.cuh" diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index 1fab64016..89a0dd18b 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -1,8 +1,6 @@ #ifndef __INFINIOP_REDUCE_CUDA_H__ #define __INFINIOP_REDUCE_CUDA_H__ -#include - /* * Device functions for reduction operations on CUDA. * diff --git a/src/infiniop/reduce/maca/reduce.h b/src/infiniop/reduce/maca/reduce.h deleted file mode 100644 index 21ea7a83f..000000000 --- a/src/infiniop/reduce/maca/reduce.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef __INFINIOP_REDUCE_MACA_H__ -#define __INFINIOP_REDUCE_MACA_H__ - -#include - -/* - * Device functions for reduction operations on MACA. - * - * Note: Only local result on thread 0 is guranteed to be correct. - * A manual broadcast is needed for other threads. - */ -namespace op::common_maca::reduce_op { - -// Sum(x^2) on contiguous data of length count -template -__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) { - Tcompute ss = 0; - - // Each thread computes its partial sum - for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { - ss += Tcompute(data_ptr[i]) * Tcompute(data_ptr[i]); - } - - // Use CUB block-level reduction - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - return BlockReduce(temp_storage).Sum(ss); -} - -// Sum(x) on contiguous data of length count -template -__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) { - Tcompute s = 0; - - for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { - s += Tcompute(data_ptr[i]); - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - return BlockReduce(temp_storage).Sum(s); -} - -// Max(x) on contiguous data of length count -template -__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { - Tdata max_ = data_ptr[0]; - - for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { - max_ = cub::Max()(max_, data_ptr[i]); - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); -} - -} // namespace op::common_maca::reduce_op - -#endif diff --git a/xmake.lua b/xmake.lua index e6bab1700..753d7a588 100644 --- a/xmake.lua +++ b/xmake.lua @@ -108,7 +108,7 @@ option_end() if has_config("metax-gpu") then add_defines("ENABLE_METAX_API") - includes("xmake/maca.lua") + includes("xmake/metax.lua") end -- 摩尔线程 diff --git a/xmake/cuda.lua b/xmake/cuda.lua index 4703cb69f..bd162624e 100644 --- a/xmake/cuda.lua +++ b/xmake/cuda.lua @@ -46,7 +46,7 @@ target("infiniop-cuda") add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations") set_languages("cxx17") - add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../build/ninetoothed/*.c") + add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../build/ninetoothed/*.c") target_end() target("infinirt-cuda") diff --git a/xmake/maca.lua b/xmake/metax.lua similarity index 94% rename from xmake/maca.lua rename to xmake/metax.lua index c08d67e74..7a17f54e6 100644 --- a/xmake/maca.lua +++ b/xmake/metax.lua @@ -35,7 +35,7 @@ target("infiniop-metax") set_warnings("all", "error") add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing") add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc") - add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"}) + add_files("../src/infiniop/ops/*/maca/*.maca", "../src/infiniop/ops/*/metax/*.maca", {rule = "maca"}) target_end() target("infinirt-metax") @@ -61,5 +61,5 @@ target("infiniccl-metax") add_files("../src/infiniccl/maca/*.cc") end set_languages("cxx17") - + target_end() From 27a13460dd9608f3f11cfa8af9f0de4c426eb0d8 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Wed, 9 Jul 2025 15:53:34 +0800 Subject: [PATCH 2/8] =?UTF-8?q?issue/291/fix:=20=E5=85=BC=E5=AE=B9=20bf16?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- src/infiniop/devices/cuda/cuda_kernel_common.cuh | 9 +++++---- src/infiniop/devices/maca/maca_kernel_common.h | 9 ++++++--- src/infiniop/ops/causal_softmax/cuda/kernel.cuh | 2 +- .../ops/causal_softmax/metax/causal_softmax_metax.maca | 2 +- .../ops/causal_softmax/nvidia/causal_softmax_nvidia.cu | 2 +- 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/src/infiniop/devices/cuda/cuda_kernel_common.cuh b/src/infiniop/devices/cuda/cuda_kernel_common.cuh index 88e9fdaa0..cad778d9a 100644 --- a/src/infiniop/devices/cuda/cuda_kernel_common.cuh +++ b/src/infiniop/devices/cuda/cuda_kernel_common.cuh @@ -4,6 +4,9 @@ #define INFINIOP_CUDA_KERNEL __global__ void #endif +#include +#include + // Posible maximum number of threads per block for CUDA architectures // Used for picking correct kernel launch configuration #define CUDA_BLOCK_SIZE_4096 4096 @@ -12,8 +15,9 @@ #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess) -namespace device::cuda { +using cuda_bfloat16 = nv_bfloat16; +namespace device::cuda { // return the memory offset of original tensor, given the flattened index of broadcasted tensor __forceinline__ __device__ __host__ size_t indexToReducedOffset( @@ -45,8 +49,6 @@ indexToOffset( } } // namespace device::cuda -#ifdef ENABLE_NVIDIA_API -#include __forceinline__ __device__ float exp_(const float val) { return expf(val); @@ -73,4 +75,3 @@ __forceinline__ __device__ __nv_bfloat16 exp_(const __nv_bfloat16 x) { return hexp(x); } -#endif diff --git a/src/infiniop/devices/maca/maca_kernel_common.h b/src/infiniop/devices/maca/maca_kernel_common.h index 0e2b376aa..af76f78fe 100644 --- a/src/infiniop/devices/maca/maca_kernel_common.h +++ b/src/infiniop/devices/maca/maca_kernel_common.h @@ -1,4 +1,8 @@ #define INFINIOP_MACA_KERNEL __global__ void + +#include +#include + // Posible maximum number of threads per block for MACA architectures // Used for picking correct kernel launch configuration #define MACA_BLOCK_SIZE_1024 1024 @@ -6,6 +10,8 @@ #define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess) +using cuda_bfloat16 = maca_bfloat16; + namespace device::maca { // return the memory offset of original tensor, given the flattened index of broadcasted tensor @@ -39,8 +45,6 @@ indexToOffset( } } // namespace device::maca -#ifdef ENABLE_MACA_API -#include __forceinline__ __device__ float exp_(const float val) { return expf(val); @@ -65,4 +69,3 @@ __forceinline__ __device__ __hpcc_bfloat16; exp_(const __hpcc_bfloat16; x) { return hexp(x); } -#endif diff --git a/src/infiniop/ops/causal_softmax/cuda/kernel.cuh b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh index 4dce83a6b..fa8d1dd1d 100644 --- a/src/infiniop/ops/causal_softmax/cuda/kernel.cuh +++ b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh @@ -29,7 +29,7 @@ __device__ void causalSoftmaxKernel( // 2 | * * * ... * * * | // height: 3 col_id-> if (width + blockIdx.x >= threadIdx.x + height) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v || std::is_same_v) { y[col] = hexp(x[col] - max_); } else { y[col] = exp(x[col] - max_); diff --git a/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca index 4823c04ba..e23130923 100644 --- a/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca +++ b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca @@ -1,8 +1,8 @@ #include "../../../devices/maca/common_maca.h" -#include "../../../devices/maca/maca_kernel_common.h" #include "causal_softmax_metax.h" #include +#include "../../../devices/maca/maca_kernel_common.h" #include "../../../reduce/cuda/reduce.cuh" diff --git a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu index dce85428d..523645686 100644 --- a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu +++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu @@ -1,7 +1,7 @@ #include "../../../devices/cuda/cuda_common.cuh" -#include "../../../devices/cuda/cuda_kernel_common.cuh" #include "causal_softmax_nvidia.cuh" +#include "../../../devices/cuda/cuda_kernel_common.cuh" #include #include "../../../reduce/cuda/reduce.cuh" From c235afb160106e1800379afbda8a02b20929ebe6 Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 10 Jul 2025 05:34:25 +0800 Subject: [PATCH 3/8] =?UTF-8?q?issue/291/refactor:=20=E6=94=B9=E9=80=A0=20?= =?UTF-8?q?clip=20=E5=92=8C=20element-wise?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- src/infiniop/elementwise/elementwise.h | 78 +++++++++---------- src/infiniop/ops/add/cpu/add_cpu.h | 2 +- src/infiniop/ops/add/cuda/add_cuda.cuh | 2 +- src/infiniop/ops/clip/cpu/clip_cpu.h | 2 +- .../{clip_cuda_internal.cuh => kernel.cuh} | 0 .../clip_cuda.cu => nvidia/clip_nvidia.cu} | 16 ++-- .../clip_cuda.cuh => nvidia/clip_nvidia.cuh} | 2 +- src/infiniop/ops/clip/operator.cc | 10 +-- src/infiniop/ops/mul/cpu/mul_cpu.h | 2 +- src/infiniop/ops/mul/cuda/mul_cuda.cuh | 2 +- src/infiniop/ops/relu/cpu/relu_cpu.h | 2 +- src/infiniop/ops/sub/cpu/sub_cpu.h | 2 +- src/infiniop/ops/sub/cuda/sub_cuda.cuh | 2 +- src/infiniop/ops/swiglu/cpu/swiglu_cpu.h | 2 +- src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh | 2 +- 15 files changed, 63 insertions(+), 63 deletions(-) rename src/infiniop/ops/clip/cuda/{clip_cuda_internal.cuh => kernel.cuh} (100%) rename src/infiniop/ops/clip/{cuda/clip_cuda.cu => nvidia/clip_nvidia.cu} (73%) rename src/infiniop/ops/clip/{cuda/clip_cuda.cuh => nvidia/clip_nvidia.cuh} (80%) diff --git a/src/infiniop/elementwise/elementwise.h b/src/infiniop/elementwise/elementwise.h index a6a5477f4..02e19a0da 100644 --- a/src/infiniop/elementwise/elementwise.h +++ b/src/infiniop/elementwise/elementwise.h @@ -12,45 +12,45 @@ #include #include -#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ - \ - namespace op::OP::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - infiniDtype_t _dtype; \ - op::elementwise::ElementwiseInfo _info; \ - std::unique_ptr _device_info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - infiniDtype_t dtype, \ - op::elementwise::ElementwiseInfo info, \ - op::elementwise::NAMESPACE::DeviceImpl *device_info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _dtype(dtype), \ - _info(std::move(info)), \ - _device_info(std::move(device_info)), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t output_desc, \ - std::vector input_descs); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - void *output, \ - std::vector inputs, \ - void *stream) const; \ - }; \ +#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON) \ + \ + namespace op::OP::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + infiniDtype_t _dtype; \ + op::elementwise::ElementwiseInfo _info; \ + std::unique_ptr _device_info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + op::elementwise::ElementwiseInfo info, \ + op::elementwise::KERNEL_COMMON::DeviceImpl *device_info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _dtype(dtype), \ + _info(std::move(info)), \ + _device_info(std::move(device_info)), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + std::vector input_descs); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const; \ + }; \ } namespace op::elementwise { diff --git a/src/infiniop/ops/add/cpu/add_cpu.h b/src/infiniop/ops/add/cpu/add_cpu.h index 736468acf..d95ee05c8 100644 --- a/src/infiniop/ops/add/cpu/add_cpu.h +++ b/src/infiniop/ops/add/cpu/add_cpu.h @@ -3,7 +3,7 @@ #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(add, cpu) +ELEMENTWISE_DESCRIPTOR(add, cpu, cpu) namespace op::add::cpu { typedef struct AddOp { diff --git a/src/infiniop/ops/add/cuda/add_cuda.cuh b/src/infiniop/ops/add/cuda/add_cuda.cuh index 2e81480b9..8a0680553 100644 --- a/src/infiniop/ops/add/cuda/add_cuda.cuh +++ b/src/infiniop/ops/add/cuda/add_cuda.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(add, cuda) +ELEMENTWISE_DESCRIPTOR(add, cuda, cuda) #endif // __ADD_CUDA_API_H__ diff --git a/src/infiniop/ops/clip/cpu/clip_cpu.h b/src/infiniop/ops/clip/cpu/clip_cpu.h index ab42fc6f4..6d37954cb 100644 --- a/src/infiniop/ops/clip/cpu/clip_cpu.h +++ b/src/infiniop/ops/clip/cpu/clip_cpu.h @@ -4,7 +4,7 @@ #include "../../../elementwise/cpu/elementwise_cpu.h" #include "infiniop/ops/clip.h" -ELEMENTWISE_DESCRIPTOR(clip, cpu) +ELEMENTWISE_DESCRIPTOR(clip, cpu, cpu) namespace op::clip::cpu { diff --git a/src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh b/src/infiniop/ops/clip/cuda/kernel.cuh similarity index 100% rename from src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh rename to src/infiniop/ops/clip/cuda/kernel.cuh diff --git a/src/infiniop/ops/clip/cuda/clip_cuda.cu b/src/infiniop/ops/clip/nvidia/clip_nvidia.cu similarity index 73% rename from src/infiniop/ops/clip/cuda/clip_cuda.cu rename to src/infiniop/ops/clip/nvidia/clip_nvidia.cu index f9618aeed..c2e43b456 100644 --- a/src/infiniop/ops/clip/cuda/clip_cuda.cu +++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cu @@ -1,7 +1,7 @@ -#include "clip_cuda.cuh" -#include "clip_cuda_internal.cuh" +#include "../cuda/kernel.cuh" +#include "clip_nvidia.cuh" -namespace op::clip::cuda { +namespace op::clip::nvidia { Descriptor::~Descriptor() = default; @@ -45,17 +45,17 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_F16: - return _device_info->calculate<256, ClipOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ClipOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, ClipOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ClipOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, ClipOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, ClipOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::ClipOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } return INFINI_STATUS_SUCCESS; } -} // namespace op::clip::cuda +} // namespace op::clip::nvidia diff --git a/src/infiniop/ops/clip/cuda/clip_cuda.cuh b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh similarity index 80% rename from src/infiniop/ops/clip/cuda/clip_cuda.cuh rename to src/infiniop/ops/clip/nvidia/clip_nvidia.cuh index 87e8a068d..6a44e6176 100644 --- a/src/infiniop/ops/clip/cuda/clip_cuda.cuh +++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh @@ -4,6 +4,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" #include "infiniop/ops/clip.h" -ELEMENTWISE_DESCRIPTOR(clip, cuda) +ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda) #endif // __CLIP_CUDA_API_H__ diff --git a/src/infiniop/ops/clip/operator.cc b/src/infiniop/ops/clip/operator.cc index bac56b3d0..e1398440d 100644 --- a/src/infiniop/ops/clip/operator.cc +++ b/src/infiniop/ops/clip/operator.cc @@ -6,7 +6,7 @@ #include "cpu/clip_cpu.h" #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/clip_cuda.cuh" +#include "nvidia/clip_nvidia.cuh" #endif __C infiniStatus_t infiniopCreateClipDescriptor( @@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateClipDescriptor( CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda); + CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -53,7 +53,7 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s GET(INFINI_DEVICE_CPU, cpu) #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda) + GET(INFINI_DEVICE_NVIDIA, nvidia) #endif } @@ -83,7 +83,7 @@ __C infiniStatus_t infiniopClip( CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda); + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -107,7 +107,7 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) { DELETE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, cuda); + DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: diff --git a/src/infiniop/ops/mul/cpu/mul_cpu.h b/src/infiniop/ops/mul/cpu/mul_cpu.h index a60d1ee94..26b381340 100644 --- a/src/infiniop/ops/mul/cpu/mul_cpu.h +++ b/src/infiniop/ops/mul/cpu/mul_cpu.h @@ -3,7 +3,7 @@ #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(mul, cpu) +ELEMENTWISE_DESCRIPTOR(mul, cpu, cpu) namespace op::mul::cpu { typedef struct MulOp { diff --git a/src/infiniop/ops/mul/cuda/mul_cuda.cuh b/src/infiniop/ops/mul/cuda/mul_cuda.cuh index 00219e34d..5bd420c56 100644 --- a/src/infiniop/ops/mul/cuda/mul_cuda.cuh +++ b/src/infiniop/ops/mul/cuda/mul_cuda.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(mul, cuda) +ELEMENTWISE_DESCRIPTOR(mul, cuda, cuda) #endif // __MUL_CUDA_API_H__ diff --git a/src/infiniop/ops/relu/cpu/relu_cpu.h b/src/infiniop/ops/relu/cpu/relu_cpu.h index b507a5d18..66dec3da8 100644 --- a/src/infiniop/ops/relu/cpu/relu_cpu.h +++ b/src/infiniop/ops/relu/cpu/relu_cpu.h @@ -5,7 +5,7 @@ #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(relu, cpu) +ELEMENTWISE_DESCRIPTOR(relu, cpu, cpu) namespace op::relu::cpu { typedef struct ReluOp { diff --git a/src/infiniop/ops/sub/cpu/sub_cpu.h b/src/infiniop/ops/sub/cpu/sub_cpu.h index af460466e..2277a8e07 100644 --- a/src/infiniop/ops/sub/cpu/sub_cpu.h +++ b/src/infiniop/ops/sub/cpu/sub_cpu.h @@ -3,7 +3,7 @@ #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(sub, cpu) +ELEMENTWISE_DESCRIPTOR(sub, cpu, cpu) namespace op::sub::cpu { typedef struct SubOp { diff --git a/src/infiniop/ops/sub/cuda/sub_cuda.cuh b/src/infiniop/ops/sub/cuda/sub_cuda.cuh index 4590ba787..515c43d26 100644 --- a/src/infiniop/ops/sub/cuda/sub_cuda.cuh +++ b/src/infiniop/ops/sub/cuda/sub_cuda.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(sub, cuda) +ELEMENTWISE_DESCRIPTOR(sub, cuda, cuda) #endif // __SUB_CUDA_API_H__ diff --git a/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h b/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h index 65c1c7c33..f41d46cd5 100644 --- a/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h +++ b/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h @@ -3,7 +3,7 @@ #include "../../../elementwise/cpu/elementwise_cpu.h" -ELEMENTWISE_DESCRIPTOR(swiglu, cpu) +ELEMENTWISE_DESCRIPTOR(swiglu, cpu, cpu) namespace op::swiglu::cpu { typedef struct SwiGLUOp { diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh b/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh index 75e529ab1..e91f9dbc0 100644 --- a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh +++ b/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(swiglu, cuda) +ELEMENTWISE_DESCRIPTOR(swiglu, cuda, cuda) #endif // __SWIGLU_CUDA_API_H__ From f06eb359e0c8e18c74eda757b2ade68b7d8af78a Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 10 Jul 2025 08:55:14 +0800 Subject: [PATCH 4/8] =?UTF-8?q?issue/291/refactor:=20=E6=94=B9=E9=80=A0=20?= =?UTF-8?q?+-*?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- .../cuda/{add_cuda_internal.cuh => kernel.cuh} | 0 .../{cuda/add_cuda.cu => nvidia/add_nvidia.cu} | 14 +++++++------- .../{cuda/add_cuda.cuh => nvidia/add_nvidia.cuh} | 2 +- src/infiniop/ops/add/operator.cc | 16 ++++++++-------- src/infiniop/ops/clip/nvidia/clip_nvidia.cuh | 1 - .../cuda/{mul_cuda_internal.cuh => kernel.cuh} | 0 .../{cuda/mul_cuda.cu => nvidia/mul_nvidia.cu} | 16 ++++++++-------- .../{cuda/mul_cuda.cuh => nvidia/mul_nvidia.cuh} | 2 +- src/infiniop/ops/mul/operator.cc | 14 +++++++------- .../cuda/{sub_cuda_internal.cuh => kernel.cuh} | 0 .../{cuda/sub_cuda.cu => nvidia/sub_nvidia.cu} | 14 +++++++------- .../{cuda/sub_cuda.cuh => nvidia/sub_nvidia.cuh} | 2 +- src/infiniop/ops/sub/operator.cc | 14 +++++++------- 13 files changed, 47 insertions(+), 48 deletions(-) rename src/infiniop/ops/add/cuda/{add_cuda_internal.cuh => kernel.cuh} (100%) rename src/infiniop/ops/add/{cuda/add_cuda.cu => nvidia/add_nvidia.cu} (73%) rename src/infiniop/ops/add/{cuda/add_cuda.cuh => nvidia/add_nvidia.cuh} (77%) rename src/infiniop/ops/mul/cuda/{mul_cuda_internal.cuh => kernel.cuh} (100%) rename src/infiniop/ops/mul/{cuda/mul_cuda.cu => nvidia/mul_nvidia.cu} (71%) rename src/infiniop/ops/mul/{cuda/mul_cuda.cuh => nvidia/mul_nvidia.cuh} (77%) rename src/infiniop/ops/sub/cuda/{sub_cuda_internal.cuh => kernel.cuh} (100%) rename src/infiniop/ops/sub/{cuda/sub_cuda.cu => nvidia/sub_nvidia.cu} (73%) rename src/infiniop/ops/sub/{cuda/sub_cuda.cuh => nvidia/sub_nvidia.cuh} (77%) diff --git a/src/infiniop/ops/add/cuda/add_cuda_internal.cuh b/src/infiniop/ops/add/cuda/kernel.cuh similarity index 100% rename from src/infiniop/ops/add/cuda/add_cuda_internal.cuh rename to src/infiniop/ops/add/cuda/kernel.cuh diff --git a/src/infiniop/ops/add/cuda/add_cuda.cu b/src/infiniop/ops/add/nvidia/add_nvidia.cu similarity index 73% rename from src/infiniop/ops/add/cuda/add_cuda.cu rename to src/infiniop/ops/add/nvidia/add_nvidia.cu index 93ae0dd7d..b31d23b63 100644 --- a/src/infiniop/ops/add/cuda/add_cuda.cu +++ b/src/infiniop/ops/add/nvidia/add_nvidia.cu @@ -1,7 +1,7 @@ -#include "add_cuda.cuh" -#include "add_cuda_internal.cuh" +#include "add_nvidia.cuh" +#include "../cuda/kernel.cuh" -namespace op::add::cuda { +namespace op::add::nvidia { Descriptor::~Descriptor() = default; @@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_F16: - return _device_info->calculate<256, AddOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, AddOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, AddOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/add/cuda/add_cuda.cuh b/src/infiniop/ops/add/nvidia/add_nvidia.cuh similarity index 77% rename from src/infiniop/ops/add/cuda/add_cuda.cuh rename to src/infiniop/ops/add/nvidia/add_nvidia.cuh index 8a0680553..04e8ecd39 100644 --- a/src/infiniop/ops/add/cuda/add_cuda.cuh +++ b/src/infiniop/ops/add/nvidia/add_nvidia.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(add, cuda, cuda) +ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda) #endif // __ADD_CUDA_API_H__ diff --git a/src/infiniop/ops/add/operator.cc b/src/infiniop/ops/add/operator.cc index b9a00064c..49aa8f922 100644 --- a/src/infiniop/ops/add/operator.cc +++ b/src/infiniop/ops/add/operator.cc @@ -6,7 +6,7 @@ #include "cpu/add_cpu.h" #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/add_cuda.cuh" +#include "nvidia/add_nvidia.cuh" #endif __C infiniStatus_t infiniopCreateAddDescriptor( @@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor( CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda); + CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz #define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; + return INFINI_STATUS_SUCCESS switch (desc->device_type) { #ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) + GET(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda) + GET(INFINI_DEVICE_NVIDIA, nvidia); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd( CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda); + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -99,7 +99,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { #define DELETE(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; + return INFINI_STATUS_SUCCESS switch (desc->device_type) { @@ -107,7 +107,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { DELETE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, cuda); + DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: diff --git a/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh index 6a44e6176..45358f706 100644 --- a/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh +++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh @@ -2,7 +2,6 @@ #define __CLIP_CUDA_API_H__ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -#include "infiniop/ops/clip.h" ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda) diff --git a/src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh b/src/infiniop/ops/mul/cuda/kernel.cuh similarity index 100% rename from src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh rename to src/infiniop/ops/mul/cuda/kernel.cuh diff --git a/src/infiniop/ops/mul/cuda/mul_cuda.cu b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu similarity index 71% rename from src/infiniop/ops/mul/cuda/mul_cuda.cu rename to src/infiniop/ops/mul/nvidia/mul_nvidia.cu index e98878352..9d71f31e8 100644 --- a/src/infiniop/ops/mul/cuda/mul_cuda.cu +++ b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu @@ -1,7 +1,7 @@ -#include "mul_cuda.cuh" -#include "mul_cuda_internal.cuh" +#include "mul_nvidia.cuh" +#include "../cuda/kernel.cuh" -namespace op::mul::cuda { +namespace op::mul::nvidia { Descriptor::~Descriptor() = default; @@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_F16: - return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::MulOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::MulOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, MulOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::MulOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } return INFINI_STATUS_SUCCESS; } -} // namespace op::mul::cuda +} // namespace op::mul::nvidia diff --git a/src/infiniop/ops/mul/cuda/mul_cuda.cuh b/src/infiniop/ops/mul/nvidia/mul_nvidia.cuh similarity index 77% rename from src/infiniop/ops/mul/cuda/mul_cuda.cuh rename to src/infiniop/ops/mul/nvidia/mul_nvidia.cuh index 5bd420c56..fd59ed904 100644 --- a/src/infiniop/ops/mul/cuda/mul_cuda.cuh +++ b/src/infiniop/ops/mul/nvidia/mul_nvidia.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(mul, cuda, cuda) +ELEMENTWISE_DESCRIPTOR(mul, nvidia, cuda) #endif // __MUL_CUDA_API_H__ diff --git a/src/infiniop/ops/mul/operator.cc b/src/infiniop/ops/mul/operator.cc index 89c9586c4..cb965924b 100644 --- a/src/infiniop/ops/mul/operator.cc +++ b/src/infiniop/ops/mul/operator.cc @@ -7,7 +7,7 @@ #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/mul_cuda.cuh" +#include "nvidia/mul_nvidia.cuh" #endif __C infiniStatus_t infiniopCreateMulDescriptor( @@ -32,7 +32,7 @@ __C infiniStatus_t infiniopCreateMulDescriptor( CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda); + CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -47,14 +47,14 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz #define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; + return INFINI_STATUS_SUCCESS switch (desc->device_type) { #ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) + GET(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda) + GET(INFINI_DEVICE_NVIDIA, nvidia); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -84,7 +84,7 @@ __C infiniStatus_t infiniopMul( CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda); + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -108,7 +108,7 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) { DELETE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, cuda); + DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: diff --git a/src/infiniop/ops/sub/cuda/sub_cuda_internal.cuh b/src/infiniop/ops/sub/cuda/kernel.cuh similarity index 100% rename from src/infiniop/ops/sub/cuda/sub_cuda_internal.cuh rename to src/infiniop/ops/sub/cuda/kernel.cuh diff --git a/src/infiniop/ops/sub/cuda/sub_cuda.cu b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu similarity index 73% rename from src/infiniop/ops/sub/cuda/sub_cuda.cu rename to src/infiniop/ops/sub/nvidia/sub_nvidia.cu index 9aedb7c3e..4460a2019 100644 --- a/src/infiniop/ops/sub/cuda/sub_cuda.cu +++ b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu @@ -1,7 +1,7 @@ -#include "sub_cuda.cuh" -#include "sub_cuda_internal.cuh" +#include "sub_nvidia.cuh" +#include "../cuda/kernel.cuh" -namespace op::sub::cuda { +namespace op::sub::nvidia { Descriptor::~Descriptor() = default; @@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_F16: - return _device_info->calculate<256, SubOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SubOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, SubOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SubOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, SubOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, SubOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SubOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/sub/cuda/sub_cuda.cuh b/src/infiniop/ops/sub/nvidia/sub_nvidia.cuh similarity index 77% rename from src/infiniop/ops/sub/cuda/sub_cuda.cuh rename to src/infiniop/ops/sub/nvidia/sub_nvidia.cuh index 515c43d26..caebe1051 100644 --- a/src/infiniop/ops/sub/cuda/sub_cuda.cuh +++ b/src/infiniop/ops/sub/nvidia/sub_nvidia.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(sub, cuda, cuda) +ELEMENTWISE_DESCRIPTOR(sub, nvidia, cuda) #endif // __SUB_CUDA_API_H__ diff --git a/src/infiniop/ops/sub/operator.cc b/src/infiniop/ops/sub/operator.cc index 826efbd7e..ca4a8f0ab 100644 --- a/src/infiniop/ops/sub/operator.cc +++ b/src/infiniop/ops/sub/operator.cc @@ -6,7 +6,7 @@ #include "cpu/sub_cpu.h" #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/sub_cuda.cuh" +#include "nvidia/sub_nvidia.cuh" #endif __C infiniStatus_t infiniopCreateSubDescriptor( @@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateSubDescriptor( CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda); + CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz #define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; + return INFINI_STATUS_SUCCESS switch (desc->device_type) { #ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) + GET(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda) + GET(INFINI_DEVICE_NVIDIA, nvidia); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -83,7 +83,7 @@ __C infiniStatus_t infiniopSub( CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda); + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: @@ -107,7 +107,7 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) { DELETE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, cuda); + DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif default: From abf1e0219e951caca72c3cd42b3aa33ae37a667c Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Thu, 10 Jul 2025 09:01:48 +0800 Subject: [PATCH 5/8] =?UTF-8?q?issue/291/refactor:=20=E6=94=B9=E9=80=A0=20?= =?UTF-8?q?rms=5Fnorm=E3=80=81rope=E3=80=81swiglu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- .../cuda/{rms_norm_kernel.cuh => kernel.cuh} | 6 -- .../rms_norm_nvidia.cu} | 12 +++- .../rms_norm_nvidia.cuh} | 2 +- src/infiniop/ops/rms_norm/operator.cc | 58 +++++++++---------- .../cuda/{rope_cuda_kernel.cuh => kernel.cuh} | 2 - .../rope_cuda.cu => nvidia/rope_nvidia.cu} | 9 ++- .../rope_cuda.cuh => nvidia/rope_nvidia.cuh} | 2 +- src/infiniop/ops/rope/operator.cc | 10 ++-- .../{swiglu_cuda_internal.cuh => kernel.cuh} | 0 .../swiglu_nvidia.cu} | 14 ++--- .../swiglu_nvidia.cuh} | 2 +- src/infiniop/ops/swiglu/operator.cc | 23 +++----- 12 files changed, 68 insertions(+), 72 deletions(-) rename src/infiniop/ops/rms_norm/cuda/{rms_norm_kernel.cuh => kernel.cuh} (88%) rename src/infiniop/ops/rms_norm/{cuda/rms_norm_cuda.cu => nvidia/rms_norm_nvidia.cu} (94%) rename src/infiniop/ops/rms_norm/{cuda/rms_norm_cuda.cuh => nvidia/rms_norm_nvidia.cuh} (82%) rename src/infiniop/ops/rope/cuda/{rope_cuda_kernel.cuh => kernel.cuh} (97%) rename src/infiniop/ops/rope/{cuda/rope_cuda.cu => nvidia/rope_nvidia.cu} (96%) rename src/infiniop/ops/rope/{cuda/rope_cuda.cuh => nvidia/rope_nvidia.cuh} (86%) rename src/infiniop/ops/swiglu/cuda/{swiglu_cuda_internal.cuh => kernel.cuh} (100%) rename src/infiniop/ops/swiglu/{cuda/swiglu_cuda.cu => nvidia/swiglu_nvidia.cu} (72%) rename src/infiniop/ops/swiglu/{cuda/swiglu_cuda.cuh => nvidia/swiglu_nvidia.cuh} (77%) diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh b/src/infiniop/ops/rms_norm/cuda/kernel.cuh similarity index 88% rename from src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh rename to src/infiniop/ops/rms_norm/cuda/kernel.cuh index c2690817e..3185672e2 100644 --- a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh +++ b/src/infiniop/ops/rms_norm/cuda/kernel.cuh @@ -1,12 +1,6 @@ #ifndef __RMS_NORM_CUDA_KERNEL_H__ #define __RMS_NORM_CUDA_KERNEL_H__ -#include "../../../devices/cuda/cuda_kernel_common.cuh" - -#include - -#include "../../../reduce/cuda/reduce.cuh" - template INFINIOP_CUDA_KERNEL rmsnormBlock( Tdata *__restrict__ y, diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu similarity index 94% rename from src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu rename to src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu index 48fe0a3aa..12435df72 100644 --- a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu +++ b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -1,8 +1,14 @@ #include "../../../devices/cuda/cuda_common.cuh" -#include "rms_norm_cuda.cuh" -#include "rms_norm_kernel.cuh" +#include "rms_norm_nvidia.cuh" -namespace op::rms_norm::cuda { +#include "../../../devices/cuda/cuda_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::rms_norm::nvidia { struct Descriptor::Opaque { std::shared_ptr internal; diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh similarity index 82% rename from src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh rename to src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh index c56b6bf23..e42d41c39 100644 --- a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh +++ b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh @@ -3,6 +3,6 @@ #include "../rms_norm.h" -DESCRIPTOR(cuda) +DESCRIPTOR(nvidia) #endif diff --git a/src/infiniop/ops/rms_norm/operator.cc b/src/infiniop/ops/rms_norm/operator.cc index 571582e82..9c7fbab27 100644 --- a/src/infiniop/ops/rms_norm/operator.cc +++ b/src/infiniop/ops/rms_norm/operator.cc @@ -6,7 +6,7 @@ #include "cpu/rms_norm_cpu.h" #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/rms_norm_cuda.cuh" +#include "nvidia/rms_norm_nvidia.cuh" #endif #ifdef ENABLE_ASCEND_API #include "ascend/rms_norm_aclnn.h" @@ -37,17 +37,17 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor( y_desc, \ x_desc, \ w_desc, \ - epsilon); + epsilon) switch (handle->device) { #ifdef ENABLE_CPU_API - CREATE(INFINI_DEVICE_CPU, cpu) + CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda) + CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API - CREATE(INFINI_DEVICE_KUNLUN, kunlun) + CREATE(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -55,13 +55,13 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor( } #endif #ifdef ENABLE_ASCEND_API - CREATE(INFINI_DEVICE_ASCEND, ascend) + CREATE(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, maca) + CREATE(INFINI_DEVICE_METAX, maca); #endif #ifdef ENABLE_MOORE_API - CREATE(INFINI_DEVICE_MOORE, musa) + CREATE(INFINI_DEVICE_MOORE, musa); #endif } @@ -75,17 +75,17 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d #define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; + return INFINI_STATUS_SUCCESS switch (desc->device_type) { #ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) + GET(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda) + GET(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun) + GET(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -93,13 +93,13 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d } #endif #ifdef ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend) + GET(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, maca) + GET(INFINI_DEVICE_METAX, maca); #endif #ifdef ENABLE_MOORE_API - GET(INFINI_DEVICE_MOORE, musa) + GET(INFINI_DEVICE_MOORE, musa); #endif } @@ -114,17 +114,17 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works #define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast(desc)->calculate( \ - workspace, workspace_size, y, x, w, stream); + workspace, workspace_size, y, x, w, stream) switch (desc->device_type) { #ifdef ENABLE_CPU_API - CALCULATE(INFINI_DEVICE_CPU, cpu) + CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda) + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API - CALCULATE(INFINI_DEVICE_KUNLUN, kunlun) + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -132,13 +132,13 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works } #endif #ifdef ENABLE_ASCEND_API - CALCULATE(INFINI_DEVICE_ASCEND, ascend) + CALCULATE(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, maca) + CALCULATE(INFINI_DEVICE_METAX, maca); #endif #ifdef ENABLE_MOORE_API - CALCULATE(INFINI_DEVICE_MOORE, musa) + CALCULATE(INFINI_DEVICE_MOORE, musa); #endif } @@ -152,17 +152,17 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t #define DESTROY(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ - return INFINI_STATUS_SUCCESS; + return INFINI_STATUS_SUCCESS switch (desc->device_type) { #ifdef ENABLE_CPU_API - DESTROY(INFINI_DEVICE_CPU, cpu) + DESTROY(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - DESTROY(INFINI_DEVICE_NVIDIA, cuda) + DESTROY(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API - DESTROY(INFINI_DEVICE_KUNLUN, kunlun) + DESTROY(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -170,13 +170,13 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t } #endif #ifdef ENABLE_ASCEND_API - DESTROY(INFINI_DEVICE_ASCEND, ascend) + DESTROY(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, maca) + DESTROY(INFINI_DEVICE_METAX, maca); #endif #ifdef ENABLE_MOORE_API - DESTROY(INFINI_DEVICE_MOORE, musa) + DESTROY(INFINI_DEVICE_MOORE, musa); #endif } diff --git a/src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh b/src/infiniop/ops/rope/cuda/kernel.cuh similarity index 97% rename from src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh rename to src/infiniop/ops/rope/cuda/kernel.cuh index c2a36a377..3d584ac76 100644 --- a/src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh +++ b/src/infiniop/ops/rope/cuda/kernel.cuh @@ -1,8 +1,6 @@ #ifndef __INFINIOP_ROPE_CUDA_KERNEL_CUH__ #define __INFINIOP_ROPE_CUDA_KERNEL_CUH__ -#include "../../../devices/cuda/cuda_kernel_common.cuh" - template INFINIOP_CUDA_KERNEL ropeThreadPerItem( Tdata *y_, diff --git a/src/infiniop/ops/rope/cuda/rope_cuda.cu b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu similarity index 96% rename from src/infiniop/ops/rope/cuda/rope_cuda.cu rename to src/infiniop/ops/rope/nvidia/rope_nvidia.cu index 6f35f394f..c2833dcf8 100644 --- a/src/infiniop/ops/rope/cuda/rope_cuda.cu +++ b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu @@ -1,8 +1,11 @@ #include "../../../devices/cuda/cuda_common.cuh" -#include "rope_cuda.cuh" -#include "rope_cuda_kernel.cuh" +#include "rope_nvidia.cuh" -namespace op::rope::cuda { +#include "../../../devices/cuda/cuda_kernel_common.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::rope::nvidia { struct Descriptor::Opaque { std::shared_ptr internal; diff --git a/src/infiniop/ops/rope/cuda/rope_cuda.cuh b/src/infiniop/ops/rope/nvidia/rope_nvidia.cuh similarity index 86% rename from src/infiniop/ops/rope/cuda/rope_cuda.cuh rename to src/infiniop/ops/rope/nvidia/rope_nvidia.cuh index 003a961c2..6b2455600 100644 --- a/src/infiniop/ops/rope/cuda/rope_cuda.cuh +++ b/src/infiniop/ops/rope/nvidia/rope_nvidia.cuh @@ -3,6 +3,6 @@ #include "../rope.h" -DESCRIPTOR(cuda) +DESCRIPTOR(nvidia) #endif // __INFINIOP_ROPE_CUDA_H__ diff --git a/src/infiniop/ops/rope/operator.cc b/src/infiniop/ops/rope/operator.cc index 95c42d06d..077f6e86b 100644 --- a/src/infiniop/ops/rope/operator.cc +++ b/src/infiniop/ops/rope/operator.cc @@ -6,7 +6,7 @@ #include "cpu/rope_cpu.h" #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/rope_cuda.cuh" +#include "nvidia/rope_nvidia.cuh" #endif #ifdef ENABLE_ASCEND_API #include "ascend/rope_ascend.h" @@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor( CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda); + CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, maca); @@ -81,7 +81,7 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, GET(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda); + GET(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, maca); @@ -132,7 +132,7 @@ __C infiniStatus_t infiniopRoPE( CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda); + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, maca); @@ -178,7 +178,7 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) { DELETE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, cuda); + DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API DELETE(INFINI_DEVICE_METAX, maca); diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda_internal.cuh b/src/infiniop/ops/swiglu/cuda/kernel.cuh similarity index 100% rename from src/infiniop/ops/swiglu/cuda/swiglu_cuda_internal.cuh rename to src/infiniop/ops/swiglu/cuda/kernel.cuh diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cu b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu similarity index 72% rename from src/infiniop/ops/swiglu/cuda/swiglu_cuda.cu rename to src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu index f124a671f..ae744fdc7 100644 --- a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cu +++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -1,7 +1,7 @@ -#include "swiglu_cuda.cuh" -#include "swiglu_cuda_internal.cuh" +#include "swiglu_nvidia.cuh" +#include "../cuda/kernel.cuh" -namespace op::swiglu::cuda { +namespace op::swiglu::nvidia { Descriptor::~Descriptor() = default; @@ -42,13 +42,13 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_F16: - return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh similarity index 77% rename from src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh rename to src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh index e91f9dbc0..c3b26821d 100644 --- a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh +++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh @@ -3,6 +3,6 @@ #include "../../../elementwise/cuda/elementwise_cuda_api.cuh" -ELEMENTWISE_DESCRIPTOR(swiglu, cuda, cuda) +ELEMENTWISE_DESCRIPTOR(swiglu, nvidia, cuda) #endif // __SWIGLU_CUDA_API_H__ diff --git a/src/infiniop/ops/swiglu/operator.cc b/src/infiniop/ops/swiglu/operator.cc index 018ec3bd2..2b831016d 100644 --- a/src/infiniop/ops/swiglu/operator.cc +++ b/src/infiniop/ops/swiglu/operator.cc @@ -6,7 +6,7 @@ #include "cpu/swiglu_cpu.h" #endif #ifdef ENABLE_NVIDIA_API -#include "cuda/swiglu_cuda.cuh" +#include "nvidia/swiglu_nvidia.cuh" #endif #ifdef ENABLE_KUNLUN_API #include "kunlun/swiglu_kunlun.h" @@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor( CREATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CREATE(INFINI_DEVICE_NVIDIA, cuda); + CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API CREATE(INFINI_DEVICE_KUNLUN, kunlun); @@ -83,17 +83,17 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des #define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ - return INFINI_STATUS_SUCCESS; + return INFINI_STATUS_SUCCESS switch (desc->device_type) { #ifdef ENABLE_CPU_API - GET(INFINI_DEVICE_CPU, cpu) + GET(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - GET(INFINI_DEVICE_NVIDIA, cuda) + GET(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API - GET(INFINI_DEVICE_KUNLUN, kunlun) + GET(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, maca); @@ -104,12 +104,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des } #endif #ifdef ENABLE_ASCEND_API - GET(INFINI_DEVICE_ASCEND, ascend) -#endif -#ifdef ENABLE_METAX_GPU - case DevMetaxGpu: { - return macaGetSwiGLUWorkspaceSize((SwiGLUMacaDescriptor_t)desc, size); - } + GET(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_MTHREADS_GPU case DevMthreadsGpu: { @@ -143,7 +138,7 @@ __C infiniStatus_t infiniopSwiGLU( CALCULATE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - CALCULATE(INFINI_DEVICE_NVIDIA, cuda); + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); @@ -189,7 +184,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) { DELETE(INFINI_DEVICE_CPU, cpu); #endif #ifdef ENABLE_NVIDIA_API - DELETE(INFINI_DEVICE_NVIDIA, cuda); + DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_KUNLUN_API DELETE(INFINI_DEVICE_KUNLUN, kunlun); From 05247bb7c0fa766024190490b8485f957e56484d Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Thu, 10 Jul 2025 09:06:47 +0000 Subject: [PATCH 6/8] =?UTF-8?q?issue/291/refactor:=20=E9=80=82=E9=85=8D?= =?UTF-8?q?=E6=B2=90=E6=9B=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: PanZezhong --- .../devices/cuda/cuda_kernel_common.cuh | 1 + .../devices/maca/maca_kernel_common.h | 12 +++--- src/infiniop/ops/gemm/maca/gemm_maca.cc | 9 ++-- src/infiniop/ops/rms_norm/cuda/kernel.cuh | 4 +- .../ops/rms_norm/metax/rms_norm_metax.maca | 40 +++++++++++++----- .../ops/rms_norm/nvidia/rms_norm_nvidia.cu | 30 +++++++++---- src/infiniop/ops/rope/cuda/kernel.cuh | 8 ++-- src/infiniop/ops/rope/maca/rope_maca_kernel.h | 42 ------------------- .../{maca/rope_maca.h => metax/rope_metax.h} | 2 +- .../rope_maca.maca => metax/rope_metax.maca} | 33 +++++++++++++-- src/infiniop/ops/rope/nvidia/rope_nvidia.cu | 26 ++++++++++-- src/infiniop/ops/rope/operator.cc | 10 ++--- src/infiniop/ops/swiglu/cuda/kernel.cuh | 16 +++---- .../ops/swiglu/maca/swiglu_maca_internal.h | 40 ------------------ .../swiglu_maca.h => metax/swiglu_metax.h} | 2 +- .../swiglu_metax.maca} | 21 ++++++---- .../ops/swiglu/nvidia/swiglu_nvidia.cu | 5 ++- src/infiniop/ops/swiglu/operator.cc | 10 ++--- xmake.lua | 2 +- 19 files changed, 155 insertions(+), 158 deletions(-) delete mode 100644 src/infiniop/ops/rope/maca/rope_maca_kernel.h rename src/infiniop/ops/rope/{maca/rope_maca.h => metax/rope_metax.h} (87%) rename src/infiniop/ops/rope/{maca/rope_maca.maca => metax/rope_metax.maca} (81%) delete mode 100644 src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h rename src/infiniop/ops/swiglu/{maca/swiglu_maca.h => metax/swiglu_metax.h} (77%) rename src/infiniop/ops/swiglu/{maca/swiglu_maca.maca => metax/swiglu_metax.maca} (62%) diff --git a/src/infiniop/devices/cuda/cuda_kernel_common.cuh b/src/infiniop/devices/cuda/cuda_kernel_common.cuh index cad778d9a..41b448069 100644 --- a/src/infiniop/devices/cuda/cuda_kernel_common.cuh +++ b/src/infiniop/devices/cuda/cuda_kernel_common.cuh @@ -16,6 +16,7 @@ #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess) using cuda_bfloat16 = nv_bfloat16; +using cuda_bfloat162 = nv_bfloat162; namespace device::cuda { // return the memory offset of original tensor, given the flattened index of broadcasted tensor diff --git a/src/infiniop/devices/maca/maca_kernel_common.h b/src/infiniop/devices/maca/maca_kernel_common.h index af76f78fe..5e2a7a254 100644 --- a/src/infiniop/devices/maca/maca_kernel_common.h +++ b/src/infiniop/devices/maca/maca_kernel_common.h @@ -1,8 +1,5 @@ #define INFINIOP_MACA_KERNEL __global__ void -#include -#include - // Posible maximum number of threads per block for MACA architectures // Used for picking correct kernel launch configuration #define MACA_BLOCK_SIZE_1024 1024 @@ -10,7 +7,8 @@ #define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess) -using cuda_bfloat16 = maca_bfloat16; +using cuda_bfloat16 = hpcc_bfloat16; +using cuda_bfloat162 = hpcc_bfloat162; namespace device::maca { @@ -52,7 +50,7 @@ exp_(const float val) { __forceinline__ __device__ long double exp_(const long double val) { - return expl(val); + return exp(val); } __forceinline__ __device__ double @@ -65,7 +63,7 @@ exp_(const __half x) { return hexp(x); } -__forceinline__ __device__ __hpcc_bfloat16; -exp_(const __hpcc_bfloat16; x) { +__forceinline__ __device__ __hpcc_bfloat16 +exp_(const __hpcc_bfloat16 x) { return hexp(x); } diff --git a/src/infiniop/ops/gemm/maca/gemm_maca.cc b/src/infiniop/ops/gemm/maca/gemm_maca.cc index e1f1b9674..2c824a377 100644 --- a/src/infiniop/ops/gemm/maca/gemm_maca.cc +++ b/src/infiniop/ops/gemm/maca/gemm_maca.cc @@ -21,9 +21,7 @@ infiniStatus_t Descriptor::create( auto handle = reinterpret_cast(handle_); auto dtype = c_desc->dtype(); - if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) { - return INFINI_STATUS_BAD_TENSOR_DTYPE; - } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR); CHECK_RESULT(result); @@ -53,7 +51,10 @@ infiniStatus_t Descriptor::calculate( a_type = b_type = c_type = HPCC_R_16F; compute_type = HCBLAS_COMPUTE_32F; break; - + case INFINI_DTYPE_BF16: + a_type = b_type = c_type = HPCC_R_16BF; + compute_type = HCBLAS_COMPUTE_32F; + break; case INFINI_DTYPE_F32: a_type = b_type = c_type = HPCC_R_32F; compute_type = HCBLAS_COMPUTE_32F_FAST_TF32; diff --git a/src/infiniop/ops/rms_norm/cuda/kernel.cuh b/src/infiniop/ops/rms_norm/cuda/kernel.cuh index 3185672e2..53c4a5587 100644 --- a/src/infiniop/ops/rms_norm/cuda/kernel.cuh +++ b/src/infiniop/ops/rms_norm/cuda/kernel.cuh @@ -1,8 +1,8 @@ #ifndef __RMS_NORM_CUDA_KERNEL_H__ #define __RMS_NORM_CUDA_KERNEL_H__ -template -INFINIOP_CUDA_KERNEL rmsnormBlock( +template +__device__ void rmsnormBlock( Tdata *__restrict__ y, ptrdiff_t stride_y, const Tdata *__restrict__ x, diff --git a/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca index 42e70d812..2e76303f3 100644 --- a/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca +++ b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca @@ -1,7 +1,25 @@ #include "../../../devices/maca/common_maca.h" -#include "../cuda/rms_norm_kernel.cuh" #include "rms_norm_metax.cuh" +#include "../../../devices/maca/maca_kernel_common.h" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +template +INFINIOP_MACA_KERNEL rmsnormKernel( + Tdata *__restrict__ y, + ptrdiff_t stride_y, + const Tdata *__restrict__ x, + ptrdiff_t stride_x, + const Tweight *__restrict__ w, + size_t dim, + float epsilon) { + rmsnormBlock(y, stride_y, x, stride_x, w, dim, epsilon); +} + namespace op::rms_norm::maca { struct Descriptor::Opaque { @@ -46,14 +64,14 @@ infiniStatus_t launchKernel( float epsilon, hcStream_t maca_stream) { -#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \ - rmsnormBlock<<>>( \ - reinterpret_cast(y), \ - stride_y, \ - reinterpret_cast(x), \ - stride_x, \ - reinterpret_cast(w), \ - dim, \ +#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \ + rmsnormKernel<<>>( \ + reinterpret_cast(y), \ + stride_y, \ + reinterpret_cast(x), \ + stride_x, \ + reinterpret_cast(w), \ + dim, \ epsilon) if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) { @@ -91,8 +109,8 @@ infiniStatus_t Descriptor::calculate( auto maca_stream = reinterpret_cast(stream); // launch kernel with different block sizes - if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { - CHECK_STATUS(launchKernel(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream)); + if (_opaque->internal->maxThreadsPerBlock() == MACA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream)); } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu index 12435df72..bc6f9fe20 100644 --- a/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu +++ b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu @@ -8,6 +8,18 @@ #include "../cuda/kernel.cuh" +template +INFINIOP_CUDA_KERNEL rmsnormKernel( + Tdata *__restrict__ y, + ptrdiff_t stride_y, + const Tdata *__restrict__ x, + ptrdiff_t stride_x, + const Tweight *__restrict__ w, + size_t dim, + float epsilon) { + rmsnormBlock(y, stride_y, x, stride_x, w, dim, epsilon); +} + namespace op::rms_norm::nvidia { struct Descriptor::Opaque { @@ -52,14 +64,14 @@ infiniStatus_t launchKernel( float epsilon, cudaStream_t cuda_stream) { -#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \ - rmsnormBlock<<>>( \ - reinterpret_cast(y), \ - stride_y, \ - reinterpret_cast(x), \ - stride_x, \ - reinterpret_cast(w), \ - dim, \ +#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute) \ + rmsnormKernel<<>>( \ + reinterpret_cast(y), \ + stride_y, \ + reinterpret_cast(x), \ + stride_x, \ + reinterpret_cast(w), \ + dim, \ epsilon) if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) { @@ -108,4 +120,4 @@ infiniStatus_t Descriptor::calculate( } return INFINI_STATUS_SUCCESS; } -} // namespace op::rms_norm::cuda +} // namespace op::rms_norm::nvidia diff --git a/src/infiniop/ops/rope/cuda/kernel.cuh b/src/infiniop/ops/rope/cuda/kernel.cuh index 3d584ac76..01f2bc9d1 100644 --- a/src/infiniop/ops/rope/cuda/kernel.cuh +++ b/src/infiniop/ops/rope/cuda/kernel.cuh @@ -2,7 +2,7 @@ #define __INFINIOP_ROPE_CUDA_KERNEL_CUH__ template -INFINIOP_CUDA_KERNEL ropeThreadPerItem( +__device__ void ropeThreadPerItemBlock( Tdata *y_, const Tdata *x_, const Tindex *__restrict__ pos_ids, @@ -28,9 +28,9 @@ INFINIOP_CUDA_KERNEL ropeThreadPerItem( Tangle y0 = x.x * cos__ - x.y * sin__, y1 = x.x * sin__ + x.y * cos__; y = half2(y0, y1); - } else if constexpr (std::is_same::value) { - auto &y = reinterpret_cast<__nv_bfloat162 &>(y_[y_offset + 2 * i]); - auto &x = reinterpret_cast(x_[x_offset + 2 * i]); + } else if constexpr (std::is_same::value) { + auto &y = reinterpret_cast(y_[y_offset + 2 * i]); + auto &x = reinterpret_cast(x_[x_offset + 2 * i]); Tangle x0 = __low2bfloat16(x); Tangle x1 = __high2bfloat16(x); diff --git a/src/infiniop/ops/rope/maca/rope_maca_kernel.h b/src/infiniop/ops/rope/maca/rope_maca_kernel.h deleted file mode 100644 index 8579df1fb..000000000 --- a/src/infiniop/ops/rope/maca/rope_maca_kernel.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef __INFINIOP_ROPE_MACA_KERNEL_H__ -#define __INFINIOP_ROPE_MACA_KERNEL_H__ - -#include "../../../devices/maca/maca_kernel_common.h" - -template -INFINIOP_MACA_KERNEL ropeThreadPerItem( - Tdata *y_, - const Tdata *x_, - const Tindex *__restrict__ pos_ids, - const Tangle *__restrict__ sin_table, - const Tangle *__restrict__ cos_table, - size_t table_dim, - ptrdiff_t y_stride_seqlen, - ptrdiff_t y_stride_nhead, - ptrdiff_t x_stride_seqlen, - ptrdiff_t x_stride_nhead) { - - auto y_offset = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead; - auto x_offset = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead; - size_t pos_id = size_t(pos_ids[blockIdx.x]); - auto table_offset = pos_id * table_dim; - - for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) { - Tangle sin__ = sin_table[table_offset + i], - cos__ = cos_table[table_offset + i]; - if constexpr (std::is_same::value) { - auto &y = reinterpret_cast(y_[y_offset + 2 * i]); - auto &x = reinterpret_cast(x_[x_offset + 2 * i]); - Tangle y0 = x.x * cos__ - x.y * sin__, - y1 = x.x * sin__ + x.y * cos__; - y = half2(y0, y1); - } else { - Tangle x0 = x_[x_offset + 2 * i], - x1 = x_[x_offset + 2 * i + 1]; - y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__); - y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__); - } - } -} - -#endif diff --git a/src/infiniop/ops/rope/maca/rope_maca.h b/src/infiniop/ops/rope/metax/rope_metax.h similarity index 87% rename from src/infiniop/ops/rope/maca/rope_maca.h rename to src/infiniop/ops/rope/metax/rope_metax.h index 106ed2cc0..40908f092 100644 --- a/src/infiniop/ops/rope/maca/rope_maca.h +++ b/src/infiniop/ops/rope/metax/rope_metax.h @@ -3,6 +3,6 @@ #include "../rope.h" -DESCRIPTOR(maca) +DESCRIPTOR(metax) #endif // __INFINIOP_ROPE_MACA_H__ diff --git a/src/infiniop/ops/rope/maca/rope_maca.maca b/src/infiniop/ops/rope/metax/rope_metax.maca similarity index 81% rename from src/infiniop/ops/rope/maca/rope_maca.maca rename to src/infiniop/ops/rope/metax/rope_metax.maca index 7c079e6d7..d7a3b9e82 100644 --- a/src/infiniop/ops/rope/maca/rope_maca.maca +++ b/src/infiniop/ops/rope/metax/rope_metax.maca @@ -1,8 +1,31 @@ #include "../../../devices/maca/common_maca.h" -#include "rope_maca.h" -#include "rope_maca_kernel.h" +#include "rope_metax.h" + +#include "../../../devices/maca/maca_kernel_common.h" + +#include "../cuda/kernel.cuh" + +template +INFINIOP_MACA_KERNEL ropeThreadPerItemKernel( + Tdata *y_, + const Tdata *x_, + const Tindex *__restrict__ pos_ids, + const Tangle *__restrict__ sin_table, + const Tangle *__restrict__ cos_table, + size_t table_dim, + ptrdiff_t y_stride_seqlen, + ptrdiff_t y_stride_nhead, + ptrdiff_t x_stride_seqlen, + ptrdiff_t x_stride_nhead) { + ropeThreadPerItemBlock( + y_, x_, pos_ids, + sin_table, cos_table, + table_dim, + y_stride_seqlen, y_stride_nhead, + x_stride_seqlen, x_stride_nhead); +} -namespace op::rope::maca { +namespace op::rope::metax { struct Descriptor::Opaque { std::shared_ptr internal; @@ -50,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info, dimy = uint32_t(info.nhead); int nthreads = std::max(int(info.table_dim), block_size); - ropeThreadPerItem<<>>( + ropeThreadPerItemKernel<<>>( y, x, pos_ids, sin_table, cos_table, info.table_dim, info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead); @@ -102,6 +125,8 @@ infiniStatus_t Descriptor::calculate( switch (_info.data_type) { case INFINI_DTYPE_F16: ROPE_TYPE(half); + case INFINI_DTYPE_BF16: + ROPE_TYPE(cuda_bfloat16); case INFINI_DTYPE_F32: ROPE_TYPE(float); case INFINI_DTYPE_F64: diff --git a/src/infiniop/ops/rope/nvidia/rope_nvidia.cu b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu index c2833dcf8..d4d5f0f9a 100644 --- a/src/infiniop/ops/rope/nvidia/rope_nvidia.cu +++ b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu @@ -5,6 +5,26 @@ #include "../cuda/kernel.cuh" +template +INFINIOP_CUDA_KERNEL ropeThreadPerItemKernel( + Tdata *y_, + const Tdata *x_, + const Tindex *__restrict__ pos_ids, + const Tangle *__restrict__ sin_table, + const Tangle *__restrict__ cos_table, + size_t table_dim, + ptrdiff_t y_stride_seqlen, + ptrdiff_t y_stride_nhead, + ptrdiff_t x_stride_seqlen, + ptrdiff_t x_stride_nhead) { + ropeThreadPerItemBlock( + y_, x_, pos_ids, + sin_table, cos_table, + table_dim, + y_stride_seqlen, y_stride_nhead, + x_stride_seqlen, x_stride_nhead); +} + namespace op::rope::nvidia { struct Descriptor::Opaque { @@ -53,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info, dimy = uint32_t(info.nhead); int nthreads = std::max(int(info.table_dim), block_size); - ropeThreadPerItem<<>>( + ropeThreadPerItemKernel<<>>( y, x, pos_ids, sin_table, cos_table, info.table_dim, info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead); @@ -106,7 +126,7 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_F16: ROPE_TYPE(half); case INFINI_DTYPE_BF16: - ROPE_TYPE(__nv_bfloat16); + ROPE_TYPE(cuda_bfloat16); case INFINI_DTYPE_F32: ROPE_TYPE(float); case INFINI_DTYPE_F64: @@ -121,4 +141,4 @@ infiniStatus_t Descriptor::calculate( #undef ROPE_TYPE #undef CALCULATE_ROPE -} // namespace op::rope::cuda +} // namespace op::rope::nvidia diff --git a/src/infiniop/ops/rope/operator.cc b/src/infiniop/ops/rope/operator.cc index 077f6e86b..e5bbbe3b7 100644 --- a/src/infiniop/ops/rope/operator.cc +++ b/src/infiniop/ops/rope/operator.cc @@ -12,7 +12,7 @@ #include "ascend/rope_ascend.h" #endif #ifdef ENABLE_METAX_API -#include "maca/rope_maca.h" +#include "metax/rope_metax.h" #endif __C infiniStatus_t infiniopCreateRoPEDescriptor( @@ -43,7 +43,7 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor( CREATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, maca); + CREATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_ASCEND_API CREATE(INFINI_DEVICE_ASCEND, ascend); @@ -84,7 +84,7 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc, GET(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, maca); + GET(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -135,7 +135,7 @@ __C infiniStatus_t infiniopRoPE( CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, maca); + CALCULATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -181,7 +181,7 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) { DELETE(INFINI_DEVICE_NVIDIA, nvidia); #endif #ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, maca); + DELETE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { diff --git a/src/infiniop/ops/swiglu/cuda/kernel.cuh b/src/infiniop/ops/swiglu/cuda/kernel.cuh index a75fd53e9..3ce26a8fe 100644 --- a/src/infiniop/ops/swiglu/cuda/kernel.cuh +++ b/src/infiniop/ops/swiglu/cuda/kernel.cuh @@ -1,10 +1,6 @@ #ifndef __SWIGLU_CUDA_H__ #define __SWIGLU_CUDA_H__ -#include "../../../elementwise/cuda/elementwise_cuda.cuh" -#include -#include - namespace op::swiglu::cuda { typedef struct SwiGLUOp { private: @@ -14,13 +10,13 @@ private: return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x)))); } else if constexpr (std::is_same_v) { return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x)))))); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { float x0 = __bfloat162float(__low2bfloat16(x)); float x1 = __bfloat162float(__high2bfloat16(x)); float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0))); float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1))); return __floats2bfloat162_rn(sig0, sig1); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { float xf = __bfloat162float(x); return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf)))); } else if constexpr (std::is_same_v) { @@ -38,8 +34,8 @@ public: return __hmul2(__hmul2(gate, sigmoid(gate)), up); } else if constexpr (std::is_same_v) { return __hmul(__hmul(gate, sigmoid(gate)), up); - } else if constexpr (std::is_same_v) { - __nv_bfloat162 sig = sigmoid(gate); + } else if constexpr (std::is_same_v) { + cuda_bfloat162 sig = sigmoid(gate); float gate0 = __bfloat162float(__low2bfloat16(gate)); float gate1 = __bfloat162float(__high2bfloat16(gate)); float sig0 = __bfloat162float(__low2bfloat16(sig)); @@ -49,8 +45,8 @@ public: float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0); float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1); return __floats2bfloat162_rn(res0, res1); - } else if constexpr (std::is_same_v) { - __nv_bfloat16 sig = sigmoid(gate); + } else if constexpr (std::is_same_v) { + cuda_bfloat16 sig = sigmoid(gate); float gatef = __bfloat162float(gate); float sigf = __bfloat162float(sig); float upf = __bfloat162float(up); diff --git a/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h b/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h deleted file mode 100644 index 8af1068f4..000000000 --- a/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __SWIGLU_MACA_H__ -#define __SWIGLU_MACA_H__ - -#include "../../../elementwise/maca/elementwise_maca.h" -#include - -namespace op::swiglu::maca { -typedef struct SwiGLUOp { -private: - template - __device__ __forceinline__ T sigmoid(const T &x) const { - if constexpr (std::is_same_v) { - return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x)))); - } else if constexpr (std::is_same_v) { - return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x)))))); - } else if constexpr (std::is_same_v) { - return __frcp_rn(__fadd_rn(1, __expf(-x))); - } else { - return 1 / (1 + std::exp(-x)); - } - } - -public: - static constexpr size_t num_inputs = 2; - template - __device__ __forceinline__ T operator()(const T &up, const T &gate) const { - if constexpr (std::is_same_v) { - return __hmul2(__hmul2(gate, sigmoid(gate)), up); - } else if constexpr (std::is_same_v) { - return __hmul(__hmul(gate, sigmoid(gate)), up); - } else if constexpr (std::is_same_v) { - return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up); - } else { - return gate * sigmoid(gate) * up; - } - } -} SwiGLUOp; -} // namespace op::swiglu::maca - -#endif diff --git a/src/infiniop/ops/swiglu/maca/swiglu_maca.h b/src/infiniop/ops/swiglu/metax/swiglu_metax.h similarity index 77% rename from src/infiniop/ops/swiglu/maca/swiglu_maca.h rename to src/infiniop/ops/swiglu/metax/swiglu_metax.h index d261d6e6e..830275fb8 100644 --- a/src/infiniop/ops/swiglu/maca/swiglu_maca.h +++ b/src/infiniop/ops/swiglu/metax/swiglu_metax.h @@ -3,6 +3,6 @@ #include "../../../elementwise/maca/elementwise_maca_api.h" -ELEMENTWISE_DESCRIPTOR(swiglu, maca) +ELEMENTWISE_DESCRIPTOR(swiglu, metax, maca) #endif // __SWIGLU_MACA_API_H__ diff --git a/src/infiniop/ops/swiglu/maca/swiglu_maca.maca b/src/infiniop/ops/swiglu/metax/swiglu_metax.maca similarity index 62% rename from src/infiniop/ops/swiglu/maca/swiglu_maca.maca rename to src/infiniop/ops/swiglu/metax/swiglu_metax.maca index c2515b580..cc511f467 100644 --- a/src/infiniop/ops/swiglu/maca/swiglu_maca.maca +++ b/src/infiniop/ops/swiglu/metax/swiglu_metax.maca @@ -1,7 +1,10 @@ -#include "swiglu_maca.h" -#include "swiglu_maca_internal.h" +#include "swiglu_metax.h" -namespace op::swiglu::maca { +#include "../../../elementwise/maca/elementwise_maca.h" + +#include "../cuda/kernel.cuh" + +namespace op::swiglu::metax { Descriptor::~Descriptor() = default; @@ -20,7 +23,7 @@ infiniStatus_t Descriptor::create( const auto &up_shape = up_desc->shape(); const auto &gate_shape = gate_desc->shape(); - CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape); // create MACA elementwise descriptor @@ -42,15 +45,17 @@ infiniStatus_t Descriptor::calculate( switch (_dtype) { case INFINI_DTYPE_F16: - return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: - return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: - return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream); default: return INFINI_STATUS_BAD_TENSOR_DTYPE; } return INFINI_STATUS_SUCCESS; } -} // namespace op::swiglu::maca +} // namespace op::swiglu::metax diff --git a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu index ae744fdc7..682077e13 100644 --- a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu +++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -1,4 +1,7 @@ #include "swiglu_nvidia.cuh" + +#include "../../../elementwise/cuda/elementwise_cuda.cuh" + #include "../cuda/kernel.cuh" namespace op::swiglu::nvidia { @@ -44,7 +47,7 @@ infiniStatus_t Descriptor::calculate( case INFINI_DTYPE_F16: return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_BF16: - return _device_info->calculate<256, cuda::SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F32: return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream); case INFINI_DTYPE_F64: diff --git a/src/infiniop/ops/swiglu/operator.cc b/src/infiniop/ops/swiglu/operator.cc index 2b831016d..838276521 100644 --- a/src/infiniop/ops/swiglu/operator.cc +++ b/src/infiniop/ops/swiglu/operator.cc @@ -12,7 +12,7 @@ #include "kunlun/swiglu_kunlun.h" #endif #ifdef ENABLE_METAX_API -#include "maca/swiglu_maca.h" +#include "metax/swiglu_metax.h" #endif #ifdef ENABLE_ASCEND_API #include "ascend/swiglu_ascend.h" @@ -46,7 +46,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor( CREATE(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, maca); + CREATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -96,7 +96,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des GET(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, maca); + GET(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -144,7 +144,7 @@ __C infiniStatus_t infiniopSwiGLU( CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, maca); + CALCULATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { @@ -190,7 +190,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) { DELETE(INFINI_DEVICE_KUNLUN, kunlun); #endif #ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, maca); + DELETE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_CAMBRICON_MLU case DevCambriconMlu: { diff --git a/xmake.lua b/xmake.lua index 753d7a588..e0c610dad 100644 --- a/xmake.lua +++ b/xmake.lua @@ -174,7 +174,7 @@ target("infini-utils") add_cxflags("-fPIC", "-Wno-unknown-pragmas") if has_config("omp") then add_cxflags("-fopenmp") - add_ldflags("-fopenmp") + add_ldflags("-fopenmp", {force = true}) end end From d15aaa3b20717efa6cc31731b76c417d0a66d3c1 Mon Sep 17 00:00:00 2001 From: PanZezhong Date: Fri, 11 Jul 2025 05:48:53 +0000 Subject: [PATCH 7/8] =?UTF-8?q?issue/291/style:=20=E4=B8=80=E7=B3=BB?= =?UTF-8?q?=E5=88=97=20maca=20=E6=94=B9=E4=B8=BA=20metax=20=E5=B9=B6?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: PanZezhong --- src/infiniop/ops/add/nvidia/add_nvidia.cu | 4 ++-- .../gemm/{maca/gemm_maca.cc => metax/gemm_metax.cc} | 6 +++--- .../ops/gemm/{maca/gemm_maca.h => metax/gemm_metax.h} | 2 +- src/infiniop/ops/gemm/operator.cc | 10 +++++----- src/infiniop/ops/mul/nvidia/mul_nvidia.cu | 2 +- .../{maca => metax}/random_sample_kernel.h | 4 ++-- .../random_sample_metax.h} | 2 +- .../random_sample_metax.maca} | 4 ++-- src/infiniop/ops/random_sample/operator.cc | 10 +++++----- .../ops/rearrange/{maca => metax}/rearrange_kernel.h | 0 .../{maca/rearrange_maca.h => metax/rearrange_metax.h} | 2 +- .../rearrange_maca.maca => metax/rearrange_metax.maca} | 6 +++--- src/infiniop/ops/rearrange/operator.cc | 8 ++++---- src/infiniop/ops/sub/nvidia/sub_nvidia.cu | 4 ++-- src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu | 2 +- xmake/metax.lua | 4 ++-- 16 files changed, 35 insertions(+), 35 deletions(-) rename src/infiniop/ops/gemm/{maca/gemm_maca.cc => metax/gemm_metax.cc} (97%) rename src/infiniop/ops/gemm/{maca/gemm_maca.h => metax/gemm_metax.h} (84%) rename src/infiniop/ops/random_sample/{maca => metax}/random_sample_kernel.h (98%) rename src/infiniop/ops/random_sample/{maca/random_sample_maca.h => metax/random_sample_metax.h} (88%) rename src/infiniop/ops/random_sample/{maca/random_sample_maca.maca => metax/random_sample_metax.maca} (97%) rename src/infiniop/ops/rearrange/{maca => metax}/rearrange_kernel.h (100%) rename src/infiniop/ops/rearrange/{maca/rearrange_maca.h => metax/rearrange_metax.h} (86%) rename src/infiniop/ops/rearrange/{maca/rearrange_maca.maca => metax/rearrange_metax.maca} (99%) diff --git a/src/infiniop/ops/add/nvidia/add_nvidia.cu b/src/infiniop/ops/add/nvidia/add_nvidia.cu index b31d23b63..7b3764482 100644 --- a/src/infiniop/ops/add/nvidia/add_nvidia.cu +++ b/src/infiniop/ops/add/nvidia/add_nvidia.cu @@ -1,5 +1,5 @@ -#include "add_nvidia.cuh" #include "../cuda/kernel.cuh" +#include "add_nvidia.cuh" namespace op::add::nvidia { @@ -56,4 +56,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::add::cuda +} // namespace op::add::nvidia diff --git a/src/infiniop/ops/gemm/maca/gemm_maca.cc b/src/infiniop/ops/gemm/metax/gemm_metax.cc similarity index 97% rename from src/infiniop/ops/gemm/maca/gemm_maca.cc rename to src/infiniop/ops/gemm/metax/gemm_metax.cc index 2c824a377..65b16b077 100644 --- a/src/infiniop/ops/gemm/maca/gemm_maca.cc +++ b/src/infiniop/ops/gemm/metax/gemm_metax.cc @@ -1,8 +1,8 @@ -#include "gemm_maca.h" +#include "gemm_metax.h" #include "../../../devices/maca/common_maca.h" #include "../../../devices/maca/maca_handle.h" -namespace op::gemm::maca { +namespace op::gemm::metax { struct Descriptor::Opaque { std::shared_ptr internal; @@ -104,4 +104,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::gemm::maca +} // namespace op::gemm::metax diff --git a/src/infiniop/ops/gemm/maca/gemm_maca.h b/src/infiniop/ops/gemm/metax/gemm_metax.h similarity index 84% rename from src/infiniop/ops/gemm/maca/gemm_maca.h rename to src/infiniop/ops/gemm/metax/gemm_metax.h index 2c1decd9a..947fb6e83 100644 --- a/src/infiniop/ops/gemm/maca/gemm_maca.h +++ b/src/infiniop/ops/gemm/metax/gemm_metax.h @@ -3,6 +3,6 @@ #include "../gemm.h" -DESCRIPTOR(maca) +DESCRIPTOR(metax) #endif // __GEMM_MACA_H__ diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc index ce067cf25..5ecbdd93a 100644 --- a/src/infiniop/ops/gemm/operator.cc +++ b/src/infiniop/ops/gemm/operator.cc @@ -15,7 +15,7 @@ #include "ascend/gemm_ascend.h" #endif #ifdef ENABLE_METAX_API -#include "maca/gemm_maca.h" +#include "metax/gemm_metax.h" #endif #ifdef ENABLE_MOORE_API #include "musa/gemm_musa.h" @@ -55,7 +55,7 @@ __C infiniStatus_t infiniopCreateGemmDescriptor( CREATE(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, maca); + CREATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_MOORE_API CREATE(INFINI_DEVICE_MOORE, musa); @@ -97,7 +97,7 @@ infiniopGetGemmWorkspaceSize( GET(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, maca); + GET(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_MOORE_API GET(INFINI_DEVICE_MOORE, musa); @@ -146,7 +146,7 @@ __C infiniStatus_t infiniopGemm( CALCULATE(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, maca); + CALCULATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_MOORE_API CALCULATE(INFINI_DEVICE_MOORE, musa); @@ -185,7 +185,7 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) { DELETE(INFINI_DEVICE_ASCEND, ascend); #endif #ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, maca); + DELETE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_MOORE_API DELETE(INFINI_DEVICE_MOORE, musa); diff --git a/src/infiniop/ops/mul/nvidia/mul_nvidia.cu b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu index 9d71f31e8..fa0af5f7a 100644 --- a/src/infiniop/ops/mul/nvidia/mul_nvidia.cu +++ b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu @@ -1,5 +1,5 @@ -#include "mul_nvidia.cuh" #include "../cuda/kernel.cuh" +#include "mul_nvidia.cuh" namespace op::mul::nvidia { diff --git a/src/infiniop/ops/random_sample/maca/random_sample_kernel.h b/src/infiniop/ops/random_sample/metax/random_sample_kernel.h similarity index 98% rename from src/infiniop/ops/random_sample/maca/random_sample_kernel.h rename to src/infiniop/ops/random_sample/metax/random_sample_kernel.h index 7a347c897..011580673 100644 --- a/src/infiniop/ops/random_sample/maca/random_sample_kernel.h +++ b/src/infiniop/ops/random_sample/metax/random_sample_kernel.h @@ -4,7 +4,7 @@ #include #include -namespace op::random_sample::maca { +namespace op::random_sample::metax { // ↓↓↓ 重新封装 cub api,减少模板参数,方便调用 @@ -256,4 +256,4 @@ struct Algo { } }; -} // namespace op::random_sample::maca +} // namespace op::random_sample::metax diff --git a/src/infiniop/ops/random_sample/maca/random_sample_maca.h b/src/infiniop/ops/random_sample/metax/random_sample_metax.h similarity index 88% rename from src/infiniop/ops/random_sample/maca/random_sample_maca.h rename to src/infiniop/ops/random_sample/metax/random_sample_metax.h index b35248d6f..cc961479e 100644 --- a/src/infiniop/ops/random_sample/maca/random_sample_maca.h +++ b/src/infiniop/ops/random_sample/metax/random_sample_metax.h @@ -3,6 +3,6 @@ #include "../random_sample.h" -DESCRIPTOR(maca) +DESCRIPTOR(metax) #endif // __RANDOM_SAMPLE_MACA_H__ diff --git a/src/infiniop/ops/random_sample/maca/random_sample_maca.maca b/src/infiniop/ops/random_sample/metax/random_sample_metax.maca similarity index 97% rename from src/infiniop/ops/random_sample/maca/random_sample_maca.maca rename to src/infiniop/ops/random_sample/metax/random_sample_metax.maca index db5bfd05e..e61c1b0a8 100644 --- a/src/infiniop/ops/random_sample/maca/random_sample_maca.maca +++ b/src/infiniop/ops/random_sample/metax/random_sample_metax.maca @@ -2,9 +2,9 @@ #include "../../../devices/maca/maca_handle.h" #include "../info.h" #include "random_sample_kernel.h" -#include "random_sample_maca.h" +#include "random_sample_metax.h" -namespace op::random_sample::maca { +namespace op::random_sample::metax { struct Descriptor::Opaque { std::shared_ptr internal; diff --git a/src/infiniop/ops/random_sample/operator.cc b/src/infiniop/ops/random_sample/operator.cc index cf55024eb..2918b684f 100644 --- a/src/infiniop/ops/random_sample/operator.cc +++ b/src/infiniop/ops/random_sample/operator.cc @@ -9,7 +9,7 @@ #include "cuda/random_sample_cuda.cuh" #endif #ifdef ENABLE_METAX_API -#include "maca/random_sample_maca.h" +#include "metax/random_sample_metax.h" #endif #ifdef ENABLE_ASCEND_API #include "ascend/random_sample_aclnn.h" @@ -39,7 +39,7 @@ infiniopCreateRandomSampleDescriptor( CREATE(INFINI_DEVICE_NVIDIA, cuda); #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, maca); + CREATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_ASCEND_API CREATE(INFINI_DEVICE_ASCEND, ascend); @@ -72,7 +72,7 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize( GET(INFINI_DEVICE_NVIDIA, cuda); #endif #ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, maca); + GET(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_ASCEND_API GET(INFINI_DEVICE_ASCEND, ascend); @@ -115,7 +115,7 @@ __C infiniStatus_t infiniopRandomSample( CALCULATE(INFINI_DEVICE_NVIDIA, cuda); #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, maca); + CALCULATE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_ASCEND_API CALCULATE(INFINI_DEVICE_ASCEND, ascend); @@ -145,7 +145,7 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor( DELETE(INFINI_DEVICE_NVIDIA, cuda); #endif #ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, maca); + DELETE(INFINI_DEVICE_METAX, metax); #endif #ifdef ENABLE_ASCEND_API DELETE(INFINI_DEVICE_ASCEND, ascend); diff --git a/src/infiniop/ops/rearrange/maca/rearrange_kernel.h b/src/infiniop/ops/rearrange/metax/rearrange_kernel.h similarity index 100% rename from src/infiniop/ops/rearrange/maca/rearrange_kernel.h rename to src/infiniop/ops/rearrange/metax/rearrange_kernel.h diff --git a/src/infiniop/ops/rearrange/maca/rearrange_maca.h b/src/infiniop/ops/rearrange/metax/rearrange_metax.h similarity index 86% rename from src/infiniop/ops/rearrange/maca/rearrange_maca.h rename to src/infiniop/ops/rearrange/metax/rearrange_metax.h index b0a712ea4..93a45bda3 100644 --- a/src/infiniop/ops/rearrange/maca/rearrange_maca.h +++ b/src/infiniop/ops/rearrange/metax/rearrange_metax.h @@ -3,6 +3,6 @@ #include "../rearrange.h" -DESCRIPTOR(maca) +DESCRIPTOR(metax) #endif // __REARRANGE_MACA_H__ diff --git a/src/infiniop/ops/rearrange/maca/rearrange_maca.maca b/src/infiniop/ops/rearrange/metax/rearrange_metax.maca similarity index 99% rename from src/infiniop/ops/rearrange/maca/rearrange_maca.maca rename to src/infiniop/ops/rearrange/metax/rearrange_metax.maca index 29c50f524..9149d0c5c 100644 --- a/src/infiniop/ops/rearrange/maca/rearrange_maca.maca +++ b/src/infiniop/ops/rearrange/metax/rearrange_metax.maca @@ -1,13 +1,13 @@ #include "../../../tensor.h" #include "rearrange_kernel.h" -#include "rearrange_maca.h" +#include "rearrange_metax.h" #include #include #include #include #include -namespace op::rearrange::maca { +namespace op::rearrange::metax { struct Descriptor::Opaque { std::shared_ptr internal; @@ -480,4 +480,4 @@ infiniStatus_t Descriptor::calculate( return status; } -} // namespace op::rearrange::maca +} // namespace op::rearrange::metax diff --git a/src/infiniop/ops/rearrange/operator.cc b/src/infiniop/ops/rearrange/operator.cc index 9e467ad84..a892cfb9d 100644 --- a/src/infiniop/ops/rearrange/operator.cc +++ b/src/infiniop/ops/rearrange/operator.cc @@ -13,7 +13,7 @@ #include "cuda/rearrange_cuda.cuh" #endif #ifdef ENABLE_METAX_API -#include "maca/rearrange_maca.h" +#include "metax/rearrange_metax.h" #endif __C infiniStatus_t infiniopCreateRearrangeDescriptor( @@ -43,7 +43,7 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor( CREATE(INFINI_DEVICE_NVIDIA, cuda); #endif #ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, maca); + CREATE(INFINI_DEVICE_METAX, metax); #endif default: return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; @@ -76,7 +76,7 @@ __C infiniStatus_t infiniopRearrange( CALCULATE(INFINI_DEVICE_NVIDIA, cuda); #endif #ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, maca); + CALCULATE(INFINI_DEVICE_METAX, metax); #endif default: @@ -107,7 +107,7 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor( DELETE(INFINI_DEVICE_NVIDIA, cuda); #endif #ifdef ENABLE_METAX_API - DELETE(INFINI_DEVICE_METAX, maca); + DELETE(INFINI_DEVICE_METAX, metax); #endif default: diff --git a/src/infiniop/ops/sub/nvidia/sub_nvidia.cu b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu index 4460a2019..0032f5d74 100644 --- a/src/infiniop/ops/sub/nvidia/sub_nvidia.cu +++ b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu @@ -1,5 +1,5 @@ -#include "sub_nvidia.cuh" #include "../cuda/kernel.cuh" +#include "sub_nvidia.cuh" namespace op::sub::nvidia { @@ -56,4 +56,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::sub::cuda +} // namespace op::sub::nvidia diff --git a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu index 682077e13..d73247a01 100644 --- a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu +++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu @@ -58,4 +58,4 @@ infiniStatus_t Descriptor::calculate( return INFINI_STATUS_SUCCESS; } -} // namespace op::swiglu::cuda +} // namespace op::swiglu::nvidia diff --git a/xmake/metax.lua b/xmake/metax.lua index 7a17f54e6..48a0a8562 100644 --- a/xmake/metax.lua +++ b/xmake/metax.lua @@ -34,8 +34,8 @@ target("infiniop-metax") set_languages("cxx17") set_warnings("all", "error") add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing") - add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc") - add_files("../src/infiniop/ops/*/maca/*.maca", "../src/infiniop/ops/*/metax/*.maca", {rule = "maca"}) + add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/metax/*.cc") + add_files("../src/infiniop/ops/*/metax/*.maca", {rule = "maca"}) target_end() target("infinirt-metax") From eac2b0cac6e10a5c5dc08bbfbef00c5f63b4a15b Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Fri, 11 Jul 2025 14:17:47 +0800 Subject: [PATCH 8/8] =?UTF-8?q?issue/291/docs:=20=E8=A1=A5=E5=85=85?= =?UTF-8?q?=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- src/infiniop/reduce/cuda/reduce.cuh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index 89a0dd18b..a1d2c2501 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -6,6 +6,11 @@ * * Note: Only local result on thread 0 is guranteed to be correct. * A manual broadcast is needed for other threads. + * + * Important Note: This is a device-independent header file containing reduce kernels + * for all cuda-supporting platforms. Include device-specific headers + * (such as for nvidia) in your source file + * and then include this file for proper usage. */ namespace op::common_cuda::reduce_op {