From f0c5a569f40bdb9e573eb5cf4759fbc43d25c6a3 Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Tue, 1 Jul 2025 10:00:07 +0800
Subject: [PATCH 1/8] =?UTF-8?q?issue/291/build:=20nvidia=20=E5=92=8C=20met?=
 =?UTF-8?q?ax=20=E5=85=B1=E7=94=A8=20softmax=20kernel=20=E5=AE=9E=E7=8E=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 .../cuda/causal_softmax_cuda.cuh              |  8 ---
 .../{causal_softmax_kernel.cuh => kernel.cuh} | 17 ++---
 .../maca/causal_softmax_kernel.h              | 60 ----------------
 .../causal_softmax/maca/causal_softmax_maca.h |  8 ---
 .../metax/causal_softmax_metax.h              |  8 +++
 .../causal_softmax_metax.maca}                | 23 +++++--
 .../causal_softmax_nvidia.cu}                 | 23 +++++--
 .../nvidia/causal_softmax_nvidia.cuh          |  8 +++
 src/infiniop/ops/causal_softmax/operator.cc   | 68 +++++++------------
 .../ops/rms_norm/cuda/rms_norm_kernel.cuh     |  3 +
 .../rms_norm_metax.cuh}                       |  0
 .../rms_norm_metax.maca}                      |  2 +-
 src/infiniop/ops/rms_norm/operator.cc         |  2 +-
 src/infiniop/reduce/cuda/reduce.cuh           |  2 -
 src/infiniop/reduce/maca/reduce.h             | 63 -----------------
 xmake.lua                                     |  2 +-
 xmake/cuda.lua                                |  2 +-
 xmake/{maca.lua => metax.lua}                 |  4 +-
 18 files changed, 94 insertions(+), 209 deletions(-)
 delete mode 100644 src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh
 rename src/infiniop/ops/causal_softmax/cuda/{causal_softmax_kernel.cuh => kernel.cuh} (86%)
 delete mode 100644 src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
 delete mode 100644 src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h
 create mode 100644 src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
 rename src/infiniop/ops/causal_softmax/{maca/causal_softmax_maca.maca => metax/causal_softmax_metax.maca} (83%)
 rename src/infiniop/ops/causal_softmax/{cuda/causal_softmax_cuda.cu => nvidia/causal_softmax_nvidia.cu} (84%)
 create mode 100644 src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh
 rename src/infiniop/ops/rms_norm/{maca/rms_norm_maca.cuh => metax/rms_norm_metax.cuh} (100%)
 rename src/infiniop/ops/rms_norm/{maca/rms_norm_maca.maca => metax/rms_norm_metax.maca} (99%)
 delete mode 100644 src/infiniop/reduce/maca/reduce.h
 rename xmake/{maca.lua => metax.lua} (94%)
diff --git a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh b/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh
deleted file mode 100644
index cbb2f8a42..000000000
--- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CAUSAL_SOFTMAX_CUDA_H__
-#define __CAUSAL_SOFTMAX_CUDA_H__
-
-#include "../causal_softmax.h"
-
-DESCRIPTOR(cuda)
-
-#endif
diff --git a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh
similarity index 86%
rename from src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh
rename to src/infiniop/ops/causal_softmax/cuda/kernel.cuh
index 16b23291c..4dce83a6b 100644
--- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh
+++ b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh
@@ -1,11 +1,8 @@
-#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
+﻿#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
 #define __CAUSAL_SOFTMAX_KERNEL_CUH__
 
-#include "../../../devices/cuda/cuda_kernel_common.cuh"
-#include "../../../reduce/cuda/reduce.cuh"
-
 template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-INFINIOP_CUDA_KERNEL causalSoftmax(
+__device__ void causalSoftmaxKernel(
     Tdata *y_, const Tdata *x_,
     size_t batch, size_t height, size_t width,
     ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
         //          2 | * * * ... * * * |
         //  height: 3  col_id->
         if (width + blockIdx.x >= threadIdx.x + height) {
-#ifdef ENABLE_NVIDIA_API
-            y[col] = exp_(x[col] - max_);
-#else
-            y[col] = exp(x[col] - max_);
-#endif
+            if constexpr (std::is_same_v<Tdata, half>) {
+                y[col] = hexp(x[col] - max_);
+            } else {
+                y[col] = exp(x[col] - max_);
+            }
         } else {
             y[col] = Tdata(0);
         }
diff --git a/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h b/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
deleted file mode 100644
index 4ecc8f0da..000000000
--- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef __CAUSAL_SOFTMAX_KERNEL_H__
-#define __CAUSAL_SOFTMAX_KERNEL_H__
-
-#include "../../../devices/maca/maca_kernel_common.h"
-#include "../../../reduce/maca/reduce.h"
-
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-INFINIOP_MACA_KERNEL causalSoftmax(
-    Tdata *y_, const Tdata *x_,
-    size_t batch, size_t height, size_t width,
-    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
-    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
-
-    Tdata *y = y_                       // threadIdx.x for col_id
-             + blockIdx.y * y_stride_b  // gridDim.y for batch_id
-             + blockIdx.x * y_stride_h; // gridDim.x for row_id
-    const Tdata *x = x_ + blockIdx.y * x_stride_b + blockIdx.x * x_stride_h;
-
-    // [Reduce] Find max value in each row and store in shared memory
-    __shared__ Tdata max_;
-    Tdata max_0 = op::common_maca::reduce_op::max<BLOCK_SIZE, Tdata>(x, width - height + 1 + blockIdx.x);
-    if (threadIdx.x == 0) {
-        max_ = max_0;
-    }
-    __syncthreads();
-
-    // [Elementwise] Subtract max value from each element and apply causal mask
-    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
-        //   row_id ↓ |<-     width   ->|
-        //          0 | * * * ... *     |
-        //          1 | * * * ... * *   |
-        //          2 | * * * ... * * * |
-        //  height: 3  col_id->
-        if (width + blockIdx.x >= threadIdx.x + height) {
-#ifdef ENABLE_MACA_API
-            y[col] = exp_(x[col] - max_);
-#else
-            y[col] = exp(x[col] - max_);
-#endif
-        } else {
-            y[col] = Tdata(0);
-        }
-    }
-    __syncthreads();
-
-    // [Reduce] Find the sum of each updated row and store in shared memory
-    __shared__ Tcompute sum_;
-    Tcompute sum_0 = op::common_maca::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(y, width);
-    if (threadIdx.x == 0) {
-        sum_ = sum_0;
-    }
-    __syncthreads();
-
-    // [Elementwise] Divide each element by the sum and store in shared memory
-    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
-        y[col] /= Tdata(sum_);
-    }
-}
-
-#endif // __CAUSAL_SOFTMAX_KERNEL_H__
diff --git a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h b/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h
deleted file mode 100644
index 0f0075db7..000000000
--- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CAUSAL_SOFTMAX_MACA_H__
-#define __CAUSAL_SOFTMAX_MACA_H__
-
-#include "../causal_softmax.h"
-
-DESCRIPTOR(maca)
-
-#endif
diff --git a/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
new file mode 100644
index 000000000..1b68a8f5f
--- /dev/null
+++ b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
@@ -0,0 +1,8 @@
+#ifndef __CAUSAL_SOFTMAX_METAX_H__
+#define __CAUSAL_SOFTMAX_METAX_H__
+
+#include "../causal_softmax.h"
+
+DESCRIPTOR(metax)
+
+#endif
diff --git a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
similarity index 83%
rename from src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca
rename to src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
index c48f1f49b..4823c04ba 100644
--- a/src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.maca
+++ b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
@@ -1,8 +1,23 @@
 #include "../../../devices/maca/common_maca.h"
-#include "causal_softmax_kernel.h"
-#include "causal_softmax_maca.h"
+#include "../../../devices/maca/maca_kernel_common.h"
+#include "causal_softmax_metax.h"
 
-namespace op::causal_softmax::maca {
+#include <hccub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
+INFINIOP_MACA_KERNEL causalSoftmax(
+    Tdata *y, const Tdata *x,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
+    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h);
+}
+
+namespace op::causal_softmax::metax {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::maca::Handle::Internal> internal;
@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::causal_softmax::maca
+} // namespace op::causal_softmax::metax
diff --git a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
similarity index 84%
rename from src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu
rename to src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
index e3f3eca2f..dce85428d 100644
--- a/src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cu
+++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
@@ -1,8 +1,23 @@
 #include "../../../devices/cuda/cuda_common.cuh"
-#include "causal_softmax_cuda.cuh"
-#include "causal_softmax_kernel.cuh"
+#include "../../../devices/cuda/cuda_kernel_common.cuh"
+#include "causal_softmax_nvidia.cuh"
 
-namespace op::causal_softmax::cuda {
+#include <cub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
+INFINIOP_CUDA_KERNEL causalSoftmax(
+    Tdata *y, const Tdata *x,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
+    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h);
+}
+
+namespace op::causal_softmax::nvidia {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::cuda::Handle::Internal> internal;
@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::causal_softmax::cuda
+} // namespace op::causal_softmax::nvidia
diff --git a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh
new file mode 100644
index 000000000..0362c43b1
--- /dev/null
+++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __CAUSAL_SOFTMAX_NVIDIA_H__
+#define __CAUSAL_SOFTMAX_NVIDIA_H__
+
+#include "../causal_softmax.h"
+
+DESCRIPTOR(nvidia)
+
+#endif
diff --git a/src/infiniop/ops/causal_softmax/operator.cc b/src/infiniop/ops/causal_softmax/operator.cc
index 2f3b34fbe..c2051e28a 100644
--- a/src/infiniop/ops/causal_softmax/operator.cc
+++ b/src/infiniop/ops/causal_softmax/operator.cc
@@ -6,10 +6,10 @@
 #include "cpu/causal_softmax_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/causal_softmax_cuda.cuh"
+#include "nvidia/causal_softmax_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/causal_softmax_maca.h"
+#include "metax/causal_softmax_metax.h"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/causal_softmax_ascend.h"
@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda)
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca)
+        CREATE(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
         // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_API
-        CREATE(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaCreateCausalSoftmaxDescriptor((MacaHandle_t)handle, (CausalSoftmaxMacaDescriptor_t *)desc_ptr, y_desc);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
     case DevMthreadsGpu: {
         return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc);
@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
         GET(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
     }
 
 #endif
-#ifdef ENABLE_ASCEND_API
-        GET(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t)desc, size);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
     case DevMthreadsGpu: {
         return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size);
@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax(
         CALCULATE(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda)
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca)
+        CALCULATE(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax(
         // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
     }
 #endif
-#ifdef ENABLE_ASCEND_API
-        CALCULATE(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t)desc, workspace, workspace_size, data, stream);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
     case DevMthreadsGpu: {
         return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream);
@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
         DESTROY(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DESTROY(INFINI_DEVICE_NVIDIA, cuda)
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
 #endif
 #ifdef ENABLE_METAX_API
-        DESTROY(INFINI_DEVICE_METAX, maca)
+        DESTROY(INFINI_DEVICE_METAX, metax)
+#endif
+#ifdef ENABLE_ASCEND_API
+        DESTROY(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
         // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_API
-        DESTROY(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t)desc);
-    }
-#endif
 #ifdef ENABLE_MTHREADS_GPU
     case DevMthreadsGpu:
         return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc);
diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh b/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
index 99c2bb105..c2690817e 100644
--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
+++ b/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
@@ -2,6 +2,9 @@
 #define __RMS_NORM_CUDA_KERNEL_H__
 
 #include "../../../devices/cuda/cuda_kernel_common.cuh"
+
+#include <cub/block/block_reduce.cuh>
+
 #include "../../../reduce/cuda/reduce.cuh"
 
 template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
diff --git a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.cuh
similarity index 100%
rename from src/infiniop/ops/rms_norm/maca/rms_norm_maca.cuh
rename to src/infiniop/ops/rms_norm/metax/rms_norm_metax.cuh
diff --git a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
similarity index 99%
rename from src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
rename to src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
index 9ed931404..42e70d812 100644
--- a/src/infiniop/ops/rms_norm/maca/rms_norm_maca.maca
+++ b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
@@ -1,6 +1,6 @@
 #include "../../../devices/maca/common_maca.h"
 #include "../cuda/rms_norm_kernel.cuh"
-#include "rms_norm_maca.cuh"
+#include "rms_norm_metax.cuh"
 
 namespace op::rms_norm::maca {
 
diff --git a/src/infiniop/ops/rms_norm/operator.cc b/src/infiniop/ops/rms_norm/operator.cc
index 7b460dd1b..571582e82 100644
--- a/src/infiniop/ops/rms_norm/operator.cc
+++ b/src/infiniop/ops/rms_norm/operator.cc
@@ -12,7 +12,7 @@
 #include "ascend/rms_norm_aclnn.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/rms_norm_maca.cuh"
+#include "metax/rms_norm_metax.cuh"
 #endif
 #ifdef ENABLE_MOORE_API
 #include "musa/rms_norm_musa.cuh"
diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh
index 1fab64016..89a0dd18b 100644
--- a/src/infiniop/reduce/cuda/reduce.cuh
+++ b/src/infiniop/reduce/cuda/reduce.cuh
@@ -1,8 +1,6 @@
 #ifndef __INFINIOP_REDUCE_CUDA_H__
 #define __INFINIOP_REDUCE_CUDA_H__
 
-#include <cub/block/block_reduce.cuh>
-
 /*
  * Device functions for reduction operations on CUDA.
  *
diff --git a/src/infiniop/reduce/maca/reduce.h b/src/infiniop/reduce/maca/reduce.h
deleted file mode 100644
index 21ea7a83f..000000000
--- a/src/infiniop/reduce/maca/reduce.h
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef __INFINIOP_REDUCE_MACA_H__
-#define __INFINIOP_REDUCE_MACA_H__
-
-#include <hccub/block/block_reduce.cuh>
-
-/*
- * Device functions for reduction operations on MACA.
- *
- * Note: Only local result on thread 0 is guranteed to be correct.
- *       A manual broadcast is needed for other threads.
- */
-namespace op::common_maca::reduce_op {
-
-// Sum(x^2) on contiguous data of length count
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-__device__ __forceinline__ Tcompute sumSquared(const Tdata *data_ptr, size_t count) {
-    Tcompute ss = 0;
-
-    // Each thread computes its partial sum
-    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
-        ss += Tcompute(data_ptr[i]) * Tcompute(data_ptr[i]);
-    }
-
-    // Use CUB block-level reduction
-    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-
-    return BlockReduce(temp_storage).Sum(ss);
-}
-
-// Sum(x) on contiguous data of length count
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, size_t count) {
-    Tcompute s = 0;
-
-    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
-        s += Tcompute(data_ptr[i]);
-    }
-
-    using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-
-    return BlockReduce(temp_storage).Sum(s);
-}
-
-// Max(x) on contiguous data of length count
-template <unsigned int BLOCK_SIZE, typename Tdata>
-__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
-    Tdata max_ = data_ptr[0];
-
-    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
-        max_ = cub::Max()(max_, data_ptr[i]);
-    }
-
-    using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
-
-    return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
-}
-
-} // namespace op::common_maca::reduce_op
-
-#endif
diff --git a/xmake.lua b/xmake.lua
index e6bab1700..753d7a588 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -108,7 +108,7 @@ option_end()
 
 if has_config("metax-gpu") then
     add_defines("ENABLE_METAX_API")
-    includes("xmake/maca.lua")
+    includes("xmake/metax.lua")
 end
 
 -- 摩尔线程
diff --git a/xmake/cuda.lua b/xmake/cuda.lua
index 4703cb69f..bd162624e 100644
--- a/xmake/cuda.lua
+++ b/xmake/cuda.lua
@@ -46,7 +46,7 @@ target("infiniop-cuda")
     add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
 
     set_languages("cxx17")
-    add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../build/ninetoothed/*.c")
+    add_files("../src/infiniop/devices/cuda/*.cu", "../src/infiniop/ops/*/cuda/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../build/ninetoothed/*.c")
 target_end()
 
 target("infinirt-cuda")
diff --git a/xmake/maca.lua b/xmake/metax.lua
similarity index 94%
rename from xmake/maca.lua
rename to xmake/metax.lua
index c08d67e74..7a17f54e6 100644
--- a/xmake/maca.lua
+++ b/xmake/metax.lua
@@ -35,7 +35,7 @@ target("infiniop-metax")
     set_warnings("all", "error")
     add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
     add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
-    add_files("../src/infiniop/ops/*/maca/*.maca", {rule = "maca"})
+    add_files("../src/infiniop/ops/*/maca/*.maca", "../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
 target_end()
 
 target("infinirt-metax")
@@ -61,5 +61,5 @@ target("infiniccl-metax")
         add_files("../src/infiniccl/maca/*.cc")
     end
     set_languages("cxx17")
-    
+
 target_end()

From 27a13460dd9608f3f11cfa8af9f0de4c426eb0d8 Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Wed, 9 Jul 2025 15:53:34 +0800
Subject: [PATCH 2/8] =?UTF-8?q?issue/291/fix:=20=E5=85=BC=E5=AE=B9=20bf16?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 src/infiniop/devices/cuda/cuda_kernel_common.cuh         | 9 +++++----
 src/infiniop/devices/maca/maca_kernel_common.h           | 9 ++++++---
 src/infiniop/ops/causal_softmax/cuda/kernel.cuh          | 2 +-
 .../ops/causal_softmax/metax/causal_softmax_metax.maca   | 2 +-
 .../ops/causal_softmax/nvidia/causal_softmax_nvidia.cu   | 2 +-
 5 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/infiniop/devices/cuda/cuda_kernel_common.cuh b/src/infiniop/devices/cuda/cuda_kernel_common.cuh
index 88e9fdaa0..cad778d9a 100644
--- a/src/infiniop/devices/cuda/cuda_kernel_common.cuh
+++ b/src/infiniop/devices/cuda/cuda_kernel_common.cuh
@@ -4,6 +4,9 @@
 #define INFINIOP_CUDA_KERNEL __global__ void
 #endif
 
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
 // Posible maximum number of threads per block for CUDA architectures
 // Used for picking correct kernel launch configuration
 #define CUDA_BLOCK_SIZE_4096 4096
@@ -12,8 +15,9 @@
 
 #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
 
-namespace device::cuda {
+using cuda_bfloat16 = nv_bfloat16;
 
+namespace device::cuda {
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
 __forceinline__ __device__ __host__ size_t
 indexToReducedOffset(
@@ -45,8 +49,6 @@ indexToOffset(
 }
 } // namespace device::cuda
 
-#ifdef ENABLE_NVIDIA_API
-#include <cuda_fp16.h>
 __forceinline__ __device__ float
 exp_(const float val) {
     return expf(val);
@@ -73,4 +75,3 @@ __forceinline__ __device__ __nv_bfloat16
 exp_(const __nv_bfloat16 x) {
     return hexp(x);
 }
-#endif
diff --git a/src/infiniop/devices/maca/maca_kernel_common.h b/src/infiniop/devices/maca/maca_kernel_common.h
index 0e2b376aa..af76f78fe 100644
--- a/src/infiniop/devices/maca/maca_kernel_common.h
+++ b/src/infiniop/devices/maca/maca_kernel_common.h
@@ -1,4 +1,8 @@
 #define INFINIOP_MACA_KERNEL __global__ void
+
+#include <maca_bf16.h>
+#include <maca_fp16.h>
+
 // Posible maximum number of threads per block for MACA architectures
 // Used for picking correct kernel launch configuration
 #define MACA_BLOCK_SIZE_1024 1024
@@ -6,6 +10,8 @@
 
 #define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
 
+using cuda_bfloat16 = maca_bfloat16;
+
 namespace device::maca {
 
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
@@ -39,8 +45,6 @@ indexToOffset(
 }
 } // namespace device::maca
 
-#ifdef ENABLE_MACA_API
-#include <maca_fp16.h>
 __forceinline__ __device__ float
 exp_(const float val) {
     return expf(val);
@@ -65,4 +69,3 @@ __forceinline__ __device__ __hpcc_bfloat16;
 exp_(const __hpcc_bfloat16; x) {
     return hexp(x);
 }
-#endif
diff --git a/src/infiniop/ops/causal_softmax/cuda/kernel.cuh b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh
index 4dce83a6b..fa8d1dd1d 100644
--- a/src/infiniop/ops/causal_softmax/cuda/kernel.cuh
+++ b/src/infiniop/ops/causal_softmax/cuda/kernel.cuh
@@ -29,7 +29,7 @@ __device__ void causalSoftmaxKernel(
         //          2 | * * * ... * * * |
         //  height: 3  col_id->
         if (width + blockIdx.x >= threadIdx.x + height) {
-            if constexpr (std::is_same_v<Tdata, half>) {
+            if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
                 y[col] = hexp(x[col] - max_);
             } else {
                 y[col] = exp(x[col] - max_);
diff --git a/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
index 4823c04ba..e23130923 100644
--- a/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
+++ b/src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.maca
@@ -1,8 +1,8 @@
 #include "../../../devices/maca/common_maca.h"
-#include "../../../devices/maca/maca_kernel_common.h"
 #include "causal_softmax_metax.h"
 
 #include <hccub/block/block_reduce.cuh>
+#include "../../../devices/maca/maca_kernel_common.h"
 
 #include "../../../reduce/cuda/reduce.cuh"
 
diff --git a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
index dce85428d..523645686 100644
--- a/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
+++ b/src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu
@@ -1,7 +1,7 @@
 #include "../../../devices/cuda/cuda_common.cuh"
-#include "../../../devices/cuda/cuda_kernel_common.cuh"
 #include "causal_softmax_nvidia.cuh"
 
+#include "../../../devices/cuda/cuda_kernel_common.cuh"
 #include <cub/block/block_reduce.cuh>
 
 #include "../../../reduce/cuda/reduce.cuh"

From c235afb160106e1800379afbda8a02b20929ebe6 Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Thu, 10 Jul 2025 05:34:25 +0800
Subject: [PATCH 3/8] =?UTF-8?q?issue/291/refactor:=20=E6=94=B9=E9=80=A0=20?=
 =?UTF-8?q?clip=20=E5=92=8C=20element-wise?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 src/infiniop/elementwise/elementwise.h        | 78 +++++++++----------
 src/infiniop/ops/add/cpu/add_cpu.h            |  2 +-
 src/infiniop/ops/add/cuda/add_cuda.cuh        |  2 +-
 src/infiniop/ops/clip/cpu/clip_cpu.h          |  2 +-
 .../{clip_cuda_internal.cuh => kernel.cuh}    |  0
 .../clip_cuda.cu => nvidia/clip_nvidia.cu}    | 16 ++--
 .../clip_cuda.cuh => nvidia/clip_nvidia.cuh}  |  2 +-
 src/infiniop/ops/clip/operator.cc             | 10 +--
 src/infiniop/ops/mul/cpu/mul_cpu.h            |  2 +-
 src/infiniop/ops/mul/cuda/mul_cuda.cuh        |  2 +-
 src/infiniop/ops/relu/cpu/relu_cpu.h          |  2 +-
 src/infiniop/ops/sub/cpu/sub_cpu.h            |  2 +-
 src/infiniop/ops/sub/cuda/sub_cuda.cuh        |  2 +-
 src/infiniop/ops/swiglu/cpu/swiglu_cpu.h      |  2 +-
 src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh  |  2 +-
 15 files changed, 63 insertions(+), 63 deletions(-)
 rename src/infiniop/ops/clip/cuda/{clip_cuda_internal.cuh => kernel.cuh} (100%)
 rename src/infiniop/ops/clip/{cuda/clip_cuda.cu => nvidia/clip_nvidia.cu} (73%)
 rename src/infiniop/ops/clip/{cuda/clip_cuda.cuh => nvidia/clip_nvidia.cuh} (80%)

diff --git a/src/infiniop/elementwise/elementwise.h b/src/infiniop/elementwise/elementwise.h
index a6a5477f4..02e19a0da 100644
--- a/src/infiniop/elementwise/elementwise.h
+++ b/src/infiniop/elementwise/elementwise.h
@@ -12,45 +12,45 @@
 #include <numeric>
 #include <vector>
 
-#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE)                                 \
-                                                                              \
-    namespace op::OP::NAMESPACE {                                             \
-    class Descriptor final : public InfiniopDescriptor {                      \
-        infiniDtype_t _dtype;                                                 \
-        op::elementwise::ElementwiseInfo _info;                               \
-        std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \
-        size_t _workspace_size;                                               \
-                                                                              \
-        Descriptor(                                                           \
-            infiniDtype_t dtype,                                              \
-            op::elementwise::ElementwiseInfo info,                            \
-            op::elementwise::NAMESPACE::DeviceImpl *device_info,              \
-            size_t workspace_size,                                            \
-            infiniDevice_t device_type,                                       \
-            int device_id)                                                    \
-            : InfiniopDescriptor{device_type, device_id},                     \
-              _dtype(dtype),                                                  \
-              _info(std::move(info)),                                         \
-              _device_info(std::move(device_info)),                           \
-              _workspace_size(workspace_size) {}                              \
-                                                                              \
-    public:                                                                   \
-        ~Descriptor();                                                        \
-                                                                              \
-        size_t workspaceSize() const { return _workspace_size; }              \
-                                                                              \
-        static infiniStatus_t create(                                         \
-            infiniopHandle_t handle,                                          \
-            Descriptor **desc_ptr,                                            \
-            infiniopTensorDescriptor_t output_desc,                           \
-            std::vector<infiniopTensorDescriptor_t> input_descs);             \
-                                                                              \
-        infiniStatus_t calculate(                                             \
-            void *workspace, size_t workspace_size,                           \
-            void *output,                                                     \
-            std::vector<const void *> inputs,                                 \
-            void *stream) const;                                              \
-    };                                                                        \
+#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON)                      \
+                                                                                  \
+    namespace op::OP::NAMESPACE {                                                 \
+    class Descriptor final : public InfiniopDescriptor {                          \
+        infiniDtype_t _dtype;                                                     \
+        op::elementwise::ElementwiseInfo _info;                                   \
+        std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \
+        size_t _workspace_size;                                                   \
+                                                                                  \
+        Descriptor(                                                               \
+            infiniDtype_t dtype,                                                  \
+            op::elementwise::ElementwiseInfo info,                                \
+            op::elementwise::KERNEL_COMMON::DeviceImpl *device_info,              \
+            size_t workspace_size,                                                \
+            infiniDevice_t device_type,                                           \
+            int device_id)                                                        \
+            : InfiniopDescriptor{device_type, device_id},                         \
+              _dtype(dtype),                                                      \
+              _info(std::move(info)),                                             \
+              _device_info(std::move(device_info)),                               \
+              _workspace_size(workspace_size) {}                                  \
+                                                                                  \
+    public:                                                                       \
+        ~Descriptor();                                                            \
+                                                                                  \
+        size_t workspaceSize() const { return _workspace_size; }                  \
+                                                                                  \
+        static infiniStatus_t create(                                             \
+            infiniopHandle_t handle,                                              \
+            Descriptor **desc_ptr,                                                \
+            infiniopTensorDescriptor_t output_desc,                               \
+            std::vector<infiniopTensorDescriptor_t> input_descs);                 \
+                                                                                  \
+        infiniStatus_t calculate(                                                 \
+            void *workspace, size_t workspace_size,                               \
+            void *output,                                                         \
+            std::vector<const void *> inputs,                                     \
+            void *stream) const;                                                  \
+    };                                                                            \
     }
 
 namespace op::elementwise {
diff --git a/src/infiniop/ops/add/cpu/add_cpu.h b/src/infiniop/ops/add/cpu/add_cpu.h
index 736468acf..d95ee05c8 100644
--- a/src/infiniop/ops/add/cpu/add_cpu.h
+++ b/src/infiniop/ops/add/cpu/add_cpu.h
@@ -3,7 +3,7 @@
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(add, cpu)
+ELEMENTWISE_DESCRIPTOR(add, cpu, cpu)
 
 namespace op::add::cpu {
 typedef struct AddOp {
diff --git a/src/infiniop/ops/add/cuda/add_cuda.cuh b/src/infiniop/ops/add/cuda/add_cuda.cuh
index 2e81480b9..8a0680553 100644
--- a/src/infiniop/ops/add/cuda/add_cuda.cuh
+++ b/src/infiniop/ops/add/cuda/add_cuda.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(add, cuda)
+ELEMENTWISE_DESCRIPTOR(add, cuda, cuda)
 
 #endif // __ADD_CUDA_API_H__
diff --git a/src/infiniop/ops/clip/cpu/clip_cpu.h b/src/infiniop/ops/clip/cpu/clip_cpu.h
index ab42fc6f4..6d37954cb 100644
--- a/src/infiniop/ops/clip/cpu/clip_cpu.h
+++ b/src/infiniop/ops/clip/cpu/clip_cpu.h
@@ -4,7 +4,7 @@
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 #include "infiniop/ops/clip.h"
 
-ELEMENTWISE_DESCRIPTOR(clip, cpu)
+ELEMENTWISE_DESCRIPTOR(clip, cpu, cpu)
 
 namespace op::clip::cpu {
 
diff --git a/src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh b/src/infiniop/ops/clip/cuda/kernel.cuh
similarity index 100%
rename from src/infiniop/ops/clip/cuda/clip_cuda_internal.cuh
rename to src/infiniop/ops/clip/cuda/kernel.cuh
diff --git a/src/infiniop/ops/clip/cuda/clip_cuda.cu b/src/infiniop/ops/clip/nvidia/clip_nvidia.cu
similarity index 73%
rename from src/infiniop/ops/clip/cuda/clip_cuda.cu
rename to src/infiniop/ops/clip/nvidia/clip_nvidia.cu
index f9618aeed..c2e43b456 100644
--- a/src/infiniop/ops/clip/cuda/clip_cuda.cu
+++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cu
@@ -1,7 +1,7 @@
-#include "clip_cuda.cuh"
-#include "clip_cuda_internal.cuh"
+#include "../cuda/kernel.cuh"
+#include "clip_nvidia.cuh"
 
-namespace op::clip::cuda {
+namespace op::clip::nvidia {
 
 Descriptor::~Descriptor() = default;
 
@@ -45,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, ClipOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ClipOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, ClipOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ClipOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, ClipOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, ClipOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::ClipOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::clip::cuda
+} // namespace op::clip::nvidia
diff --git a/src/infiniop/ops/clip/cuda/clip_cuda.cuh b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
similarity index 80%
rename from src/infiniop/ops/clip/cuda/clip_cuda.cuh
rename to src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
index 87e8a068d..6a44e6176 100644
--- a/src/infiniop/ops/clip/cuda/clip_cuda.cuh
+++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
@@ -4,6 +4,6 @@
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 #include "infiniop/ops/clip.h"
 
-ELEMENTWISE_DESCRIPTOR(clip, cuda)
+ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda)
 
 #endif // __CLIP_CUDA_API_H__
diff --git a/src/infiniop/ops/clip/operator.cc b/src/infiniop/ops/clip/operator.cc
index bac56b3d0..e1398440d 100644
--- a/src/infiniop/ops/clip/operator.cc
+++ b/src/infiniop/ops/clip/operator.cc
@@ -6,7 +6,7 @@
 #include "cpu/clip_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/clip_cuda.cuh"
+#include "nvidia/clip_nvidia.cuh"
 #endif
 
 __C infiniStatus_t infiniopCreateClipDescriptor(
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -53,7 +53,7 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
         GET(INFINI_DEVICE_CPU, cpu)
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
 #endif
     }
 
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopClip(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -107,7 +107,7 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
diff --git a/src/infiniop/ops/mul/cpu/mul_cpu.h b/src/infiniop/ops/mul/cpu/mul_cpu.h
index a60d1ee94..26b381340 100644
--- a/src/infiniop/ops/mul/cpu/mul_cpu.h
+++ b/src/infiniop/ops/mul/cpu/mul_cpu.h
@@ -3,7 +3,7 @@
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(mul, cpu)
+ELEMENTWISE_DESCRIPTOR(mul, cpu, cpu)
 
 namespace op::mul::cpu {
 typedef struct MulOp {
diff --git a/src/infiniop/ops/mul/cuda/mul_cuda.cuh b/src/infiniop/ops/mul/cuda/mul_cuda.cuh
index 00219e34d..5bd420c56 100644
--- a/src/infiniop/ops/mul/cuda/mul_cuda.cuh
+++ b/src/infiniop/ops/mul/cuda/mul_cuda.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(mul, cuda)
+ELEMENTWISE_DESCRIPTOR(mul, cuda, cuda)
 
 #endif // __MUL_CUDA_API_H__
diff --git a/src/infiniop/ops/relu/cpu/relu_cpu.h b/src/infiniop/ops/relu/cpu/relu_cpu.h
index b507a5d18..66dec3da8 100644
--- a/src/infiniop/ops/relu/cpu/relu_cpu.h
+++ b/src/infiniop/ops/relu/cpu/relu_cpu.h
@@ -5,7 +5,7 @@
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(relu, cpu)
+ELEMENTWISE_DESCRIPTOR(relu, cpu, cpu)
 
 namespace op::relu::cpu {
 typedef struct ReluOp {
diff --git a/src/infiniop/ops/sub/cpu/sub_cpu.h b/src/infiniop/ops/sub/cpu/sub_cpu.h
index af460466e..2277a8e07 100644
--- a/src/infiniop/ops/sub/cpu/sub_cpu.h
+++ b/src/infiniop/ops/sub/cpu/sub_cpu.h
@@ -3,7 +3,7 @@
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(sub, cpu)
+ELEMENTWISE_DESCRIPTOR(sub, cpu, cpu)
 
 namespace op::sub::cpu {
 typedef struct SubOp {
diff --git a/src/infiniop/ops/sub/cuda/sub_cuda.cuh b/src/infiniop/ops/sub/cuda/sub_cuda.cuh
index 4590ba787..515c43d26 100644
--- a/src/infiniop/ops/sub/cuda/sub_cuda.cuh
+++ b/src/infiniop/ops/sub/cuda/sub_cuda.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(sub, cuda)
+ELEMENTWISE_DESCRIPTOR(sub, cuda, cuda)
 
 #endif // __SUB_CUDA_API_H__
diff --git a/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h b/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h
index 65c1c7c33..f41d46cd5 100644
--- a/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h
+++ b/src/infiniop/ops/swiglu/cpu/swiglu_cpu.h
@@ -3,7 +3,7 @@
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(swiglu, cpu)
+ELEMENTWISE_DESCRIPTOR(swiglu, cpu, cpu)
 
 namespace op::swiglu::cpu {
 typedef struct SwiGLUOp {
diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh b/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh
index 75e529ab1..e91f9dbc0 100644
--- a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh
+++ b/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(swiglu, cuda)
+ELEMENTWISE_DESCRIPTOR(swiglu, cuda, cuda)
 
 #endif // __SWIGLU_CUDA_API_H__

From f06eb359e0c8e18c74eda757b2ade68b7d8af78a Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Thu, 10 Jul 2025 08:55:14 +0800
Subject: [PATCH 4/8] =?UTF-8?q?issue/291/refactor:=20=E6=94=B9=E9=80=A0=20?=
 =?UTF-8?q?+-*?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 .../cuda/{add_cuda_internal.cuh => kernel.cuh}   |  0
 .../{cuda/add_cuda.cu => nvidia/add_nvidia.cu}   | 14 +++++++-------
 .../{cuda/add_cuda.cuh => nvidia/add_nvidia.cuh} |  2 +-
 src/infiniop/ops/add/operator.cc                 | 16 ++++++++--------
 src/infiniop/ops/clip/nvidia/clip_nvidia.cuh     |  1 -
 .../cuda/{mul_cuda_internal.cuh => kernel.cuh}   |  0
 .../{cuda/mul_cuda.cu => nvidia/mul_nvidia.cu}   | 16 ++++++++--------
 .../{cuda/mul_cuda.cuh => nvidia/mul_nvidia.cuh} |  2 +-
 src/infiniop/ops/mul/operator.cc                 | 14 +++++++-------
 .../cuda/{sub_cuda_internal.cuh => kernel.cuh}   |  0
 .../{cuda/sub_cuda.cu => nvidia/sub_nvidia.cu}   | 14 +++++++-------
 .../{cuda/sub_cuda.cuh => nvidia/sub_nvidia.cuh} |  2 +-
 src/infiniop/ops/sub/operator.cc                 | 14 +++++++-------
 13 files changed, 47 insertions(+), 48 deletions(-)
 rename src/infiniop/ops/add/cuda/{add_cuda_internal.cuh => kernel.cuh} (100%)
 rename src/infiniop/ops/add/{cuda/add_cuda.cu => nvidia/add_nvidia.cu} (73%)
 rename src/infiniop/ops/add/{cuda/add_cuda.cuh => nvidia/add_nvidia.cuh} (77%)
 rename src/infiniop/ops/mul/cuda/{mul_cuda_internal.cuh => kernel.cuh} (100%)
 rename src/infiniop/ops/mul/{cuda/mul_cuda.cu => nvidia/mul_nvidia.cu} (71%)
 rename src/infiniop/ops/mul/{cuda/mul_cuda.cuh => nvidia/mul_nvidia.cuh} (77%)
 rename src/infiniop/ops/sub/cuda/{sub_cuda_internal.cuh => kernel.cuh} (100%)
 rename src/infiniop/ops/sub/{cuda/sub_cuda.cu => nvidia/sub_nvidia.cu} (73%)
 rename src/infiniop/ops/sub/{cuda/sub_cuda.cuh => nvidia/sub_nvidia.cuh} (77%)

diff --git a/src/infiniop/ops/add/cuda/add_cuda_internal.cuh b/src/infiniop/ops/add/cuda/kernel.cuh
similarity index 100%
rename from src/infiniop/ops/add/cuda/add_cuda_internal.cuh
rename to src/infiniop/ops/add/cuda/kernel.cuh
diff --git a/src/infiniop/ops/add/cuda/add_cuda.cu b/src/infiniop/ops/add/nvidia/add_nvidia.cu
similarity index 73%
rename from src/infiniop/ops/add/cuda/add_cuda.cu
rename to src/infiniop/ops/add/nvidia/add_nvidia.cu
index 93ae0dd7d..b31d23b63 100644
--- a/src/infiniop/ops/add/cuda/add_cuda.cu
+++ b/src/infiniop/ops/add/nvidia/add_nvidia.cu
@@ -1,7 +1,7 @@
-#include "add_cuda.cuh"
-#include "add_cuda_internal.cuh"
+#include "add_nvidia.cuh"
+#include "../cuda/kernel.cuh"
 
-namespace op::add::cuda {
+namespace op::add::nvidia {
 
 Descriptor::~Descriptor() = default;
 
@@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, AddOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, AddOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, AddOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
diff --git a/src/infiniop/ops/add/cuda/add_cuda.cuh b/src/infiniop/ops/add/nvidia/add_nvidia.cuh
similarity index 77%
rename from src/infiniop/ops/add/cuda/add_cuda.cuh
rename to src/infiniop/ops/add/nvidia/add_nvidia.cuh
index 8a0680553..04e8ecd39 100644
--- a/src/infiniop/ops/add/cuda/add_cuda.cuh
+++ b/src/infiniop/ops/add/nvidia/add_nvidia.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(add, cuda, cuda)
+ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda)
 
 #endif // __ADD_CUDA_API_H__
diff --git a/src/infiniop/ops/add/operator.cc b/src/infiniop/ops/add/operator.cc
index b9a00064c..49aa8f922 100644
--- a/src/infiniop/ops/add/operator.cc
+++ b/src/infiniop/ops/add/operator.cc
@@ -6,7 +6,7 @@
 #include "cpu/add_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/add_cuda.cuh"
+#include "nvidia/add_nvidia.cuh"
 #endif
 
 __C infiniStatus_t infiniopCreateAddDescriptor(
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
 #define GET(CASE, NAMESPACE)                                                               \
     case CASE:                                                                             \
         *size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
+        GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -99,7 +99,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
 #define DELETE(CASE, NAMESPACE)                                                \
     case CASE:                                                                 \
         delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 
@@ -107,7 +107,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
diff --git a/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
index 6a44e6176..45358f706 100644
--- a/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
+++ b/src/infiniop/ops/clip/nvidia/clip_nvidia.cuh
@@ -2,7 +2,6 @@
 #define __CLIP_CUDA_API_H__
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
-#include "infiniop/ops/clip.h"
 
 ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda)
 
diff --git a/src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh b/src/infiniop/ops/mul/cuda/kernel.cuh
similarity index 100%
rename from src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh
rename to src/infiniop/ops/mul/cuda/kernel.cuh
diff --git a/src/infiniop/ops/mul/cuda/mul_cuda.cu b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu
similarity index 71%
rename from src/infiniop/ops/mul/cuda/mul_cuda.cu
rename to src/infiniop/ops/mul/nvidia/mul_nvidia.cu
index e98878352..9d71f31e8 100644
--- a/src/infiniop/ops/mul/cuda/mul_cuda.cu
+++ b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu
@@ -1,7 +1,7 @@
-#include "mul_cuda.cuh"
-#include "mul_cuda_internal.cuh"
+#include "mul_nvidia.cuh"
+#include "../cuda/kernel.cuh"
 
-namespace op::mul::cuda {
+namespace op::mul::nvidia {
 
 Descriptor::~Descriptor() = default;
 
@@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::MulOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::MulOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, MulOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::MulOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::mul::cuda
+} // namespace op::mul::nvidia
diff --git a/src/infiniop/ops/mul/cuda/mul_cuda.cuh b/src/infiniop/ops/mul/nvidia/mul_nvidia.cuh
similarity index 77%
rename from src/infiniop/ops/mul/cuda/mul_cuda.cuh
rename to src/infiniop/ops/mul/nvidia/mul_nvidia.cuh
index 5bd420c56..fd59ed904 100644
--- a/src/infiniop/ops/mul/cuda/mul_cuda.cuh
+++ b/src/infiniop/ops/mul/nvidia/mul_nvidia.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(mul, cuda, cuda)
+ELEMENTWISE_DESCRIPTOR(mul, nvidia, cuda)
 
 #endif // __MUL_CUDA_API_H__
diff --git a/src/infiniop/ops/mul/operator.cc b/src/infiniop/ops/mul/operator.cc
index 89c9586c4..cb965924b 100644
--- a/src/infiniop/ops/mul/operator.cc
+++ b/src/infiniop/ops/mul/operator.cc
@@ -7,7 +7,7 @@
 #endif
 
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/mul_cuda.cuh"
+#include "nvidia/mul_nvidia.cuh"
 #endif
 
 __C infiniStatus_t infiniopCreateMulDescriptor(
@@ -32,7 +32,7 @@ __C infiniStatus_t infiniopCreateMulDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -47,14 +47,14 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz
 #define GET(CASE, NAMESPACE)                                                               \
     case CASE:                                                                             \
         *size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
+        GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -84,7 +84,7 @@ __C infiniStatus_t infiniopMul(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -108,7 +108,7 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
diff --git a/src/infiniop/ops/sub/cuda/sub_cuda_internal.cuh b/src/infiniop/ops/sub/cuda/kernel.cuh
similarity index 100%
rename from src/infiniop/ops/sub/cuda/sub_cuda_internal.cuh
rename to src/infiniop/ops/sub/cuda/kernel.cuh
diff --git a/src/infiniop/ops/sub/cuda/sub_cuda.cu b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu
similarity index 73%
rename from src/infiniop/ops/sub/cuda/sub_cuda.cu
rename to src/infiniop/ops/sub/nvidia/sub_nvidia.cu
index 9aedb7c3e..4460a2019 100644
--- a/src/infiniop/ops/sub/cuda/sub_cuda.cu
+++ b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu
@@ -1,7 +1,7 @@
-#include "sub_cuda.cuh"
-#include "sub_cuda_internal.cuh"
+#include "sub_nvidia.cuh"
+#include "../cuda/kernel.cuh"
 
-namespace op::sub::cuda {
+namespace op::sub::nvidia {
 
 Descriptor::~Descriptor() = default;
 
@@ -43,13 +43,13 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, SubOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SubOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, SubOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SubOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, SubOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, SubOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SubOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
diff --git a/src/infiniop/ops/sub/cuda/sub_cuda.cuh b/src/infiniop/ops/sub/nvidia/sub_nvidia.cuh
similarity index 77%
rename from src/infiniop/ops/sub/cuda/sub_cuda.cuh
rename to src/infiniop/ops/sub/nvidia/sub_nvidia.cuh
index 515c43d26..caebe1051 100644
--- a/src/infiniop/ops/sub/cuda/sub_cuda.cuh
+++ b/src/infiniop/ops/sub/nvidia/sub_nvidia.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(sub, cuda, cuda)
+ELEMENTWISE_DESCRIPTOR(sub, nvidia, cuda)
 
 #endif // __SUB_CUDA_API_H__
diff --git a/src/infiniop/ops/sub/operator.cc b/src/infiniop/ops/sub/operator.cc
index 826efbd7e..ca4a8f0ab 100644
--- a/src/infiniop/ops/sub/operator.cc
+++ b/src/infiniop/ops/sub/operator.cc
@@ -6,7 +6,7 @@
 #include "cpu/sub_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/sub_cuda.cuh"
+#include "nvidia/sub_nvidia.cuh"
 #endif
 
 __C infiniStatus_t infiniopCreateSubDescriptor(
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
 #define GET(CASE, NAMESPACE)                                                               \
     case CASE:                                                                             \
         *size = reinterpret_cast<op::sub::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
+        GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopSub(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -107,7 +107,7 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:

From abf1e0219e951caca72c3cd42b3aa33ae37a667c Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Thu, 10 Jul 2025 09:01:48 +0800
Subject: [PATCH 5/8] =?UTF-8?q?issue/291/refactor:=20=E6=94=B9=E9=80=A0=20?=
 =?UTF-8?q?rms=5Fnorm=E3=80=81rope=E3=80=81swiglu?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 .../cuda/{rms_norm_kernel.cuh => kernel.cuh}  |  6 --
 .../rms_norm_nvidia.cu}                       | 12 +++-
 .../rms_norm_nvidia.cuh}                      |  2 +-
 src/infiniop/ops/rms_norm/operator.cc         | 58 +++++++++----------
 .../cuda/{rope_cuda_kernel.cuh => kernel.cuh} |  2 -
 .../rope_cuda.cu => nvidia/rope_nvidia.cu}    |  9 ++-
 .../rope_cuda.cuh => nvidia/rope_nvidia.cuh}  |  2 +-
 src/infiniop/ops/rope/operator.cc             | 10 ++--
 .../{swiglu_cuda_internal.cuh => kernel.cuh}  |  0
 .../swiglu_nvidia.cu}                         | 14 ++---
 .../swiglu_nvidia.cuh}                        |  2 +-
 src/infiniop/ops/swiglu/operator.cc           | 23 +++-----
 12 files changed, 68 insertions(+), 72 deletions(-)
 rename src/infiniop/ops/rms_norm/cuda/{rms_norm_kernel.cuh => kernel.cuh} (88%)
 rename src/infiniop/ops/rms_norm/{cuda/rms_norm_cuda.cu => nvidia/rms_norm_nvidia.cu} (94%)
 rename src/infiniop/ops/rms_norm/{cuda/rms_norm_cuda.cuh => nvidia/rms_norm_nvidia.cuh} (82%)
 rename src/infiniop/ops/rope/cuda/{rope_cuda_kernel.cuh => kernel.cuh} (97%)
 rename src/infiniop/ops/rope/{cuda/rope_cuda.cu => nvidia/rope_nvidia.cu} (96%)
 rename src/infiniop/ops/rope/{cuda/rope_cuda.cuh => nvidia/rope_nvidia.cuh} (86%)
 rename src/infiniop/ops/swiglu/cuda/{swiglu_cuda_internal.cuh => kernel.cuh} (100%)
 rename src/infiniop/ops/swiglu/{cuda/swiglu_cuda.cu => nvidia/swiglu_nvidia.cu} (72%)
 rename src/infiniop/ops/swiglu/{cuda/swiglu_cuda.cuh => nvidia/swiglu_nvidia.cuh} (77%)

diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh b/src/infiniop/ops/rms_norm/cuda/kernel.cuh
similarity index 88%
rename from src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
rename to src/infiniop/ops/rms_norm/cuda/kernel.cuh
index c2690817e..3185672e2 100644
--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_kernel.cuh
+++ b/src/infiniop/ops/rms_norm/cuda/kernel.cuh
@@ -1,12 +1,6 @@
 #ifndef __RMS_NORM_CUDA_KERNEL_H__
 #define __RMS_NORM_CUDA_KERNEL_H__
 
-#include "../../../devices/cuda/cuda_kernel_common.cuh"
-
-#include <cub/block/block_reduce.cuh>
-
-#include "../../../reduce/cuda/reduce.cuh"
-
 template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
 INFINIOP_CUDA_KERNEL rmsnormBlock(
     Tdata *__restrict__ y,
diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
similarity index 94%
rename from src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
rename to src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
index 48fe0a3aa..12435df72 100644
--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cu
+++ b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
@@ -1,8 +1,14 @@
 #include "../../../devices/cuda/cuda_common.cuh"
-#include "rms_norm_cuda.cuh"
-#include "rms_norm_kernel.cuh"
+#include "rms_norm_nvidia.cuh"
 
-namespace op::rms_norm::cuda {
+#include "../../../devices/cuda/cuda_kernel_common.cuh"
+#include <cub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::rms_norm::nvidia {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::cuda::Handle::Internal> internal;
diff --git a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
similarity index 82%
rename from src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
rename to src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
index c56b6bf23..e42d41c39 100644
--- a/src/infiniop/ops/rms_norm/cuda/rms_norm_cuda.cuh
+++ b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cuh
@@ -3,6 +3,6 @@
 
 #include "../rms_norm.h"
 
-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)
 
 #endif
diff --git a/src/infiniop/ops/rms_norm/operator.cc b/src/infiniop/ops/rms_norm/operator.cc
index 571582e82..9c7fbab27 100644
--- a/src/infiniop/ops/rms_norm/operator.cc
+++ b/src/infiniop/ops/rms_norm/operator.cc
@@ -6,7 +6,7 @@
 #include "cpu/rms_norm_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/rms_norm_cuda.cuh"
+#include "nvidia/rms_norm_nvidia.cuh"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/rms_norm_aclnn.h"
@@ -37,17 +37,17 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
             y_desc,                                                             \
             x_desc,                                                             \
             w_desc,                                                             \
-            epsilon);
+            epsilon)
 
     switch (handle->device) {
 #ifdef ENABLE_CPU_API
-        CREATE(INFINI_DEVICE_CPU, cpu)
+        CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda)
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        CREATE(INFINI_DEVICE_KUNLUN, kunlun)
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -55,13 +55,13 @@ __C infiniStatus_t infiniopCreateRMSNormDescriptor(
     }
 #endif
 #ifdef ENABLE_ASCEND_API
-        CREATE(INFINI_DEVICE_ASCEND, ascend)
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca)
+        CREATE(INFINI_DEVICE_METAX, maca);
 #endif
 #ifdef ENABLE_MOORE_API
-        CREATE(INFINI_DEVICE_MOORE, musa)
+        CREATE(INFINI_DEVICE_MOORE, musa);
 #endif
     }
 
@@ -75,17 +75,17 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
 #define GET(CASE, NAMESPACE)                                                                    \
     case CASE:                                                                                  \
         *size = reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
+        GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun)
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -93,13 +93,13 @@ __C infiniStatus_t infiniopGetRMSNormWorkspaceSize(infiniopRMSNormDescriptor_t d
     }
 #endif
 #ifdef ENABLE_ASCEND_API
-        GET(INFINI_DEVICE_ASCEND, ascend)
+        GET(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca)
+        GET(INFINI_DEVICE_METAX, maca);
 #endif
 #ifdef ENABLE_MOORE_API
-        GET(INFINI_DEVICE_MOORE, musa)
+        GET(INFINI_DEVICE_MOORE, musa);
 #endif
     }
 
@@ -114,17 +114,17 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
 #define CALCULATE(CASE, NAMESPACE)                                                       \
     case CASE:                                                                           \
         return reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc)->calculate( \
-            workspace, workspace_size, y, x, w, stream);
+            workspace, workspace_size, y, x, w, stream)
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        CALCULATE(INFINI_DEVICE_CPU, cpu)
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda)
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun)
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -132,13 +132,13 @@ __C infiniStatus_t infiniopRMSNorm(infiniopRMSNormDescriptor_t desc, void *works
     }
 #endif
 #ifdef ENABLE_ASCEND_API
-        CALCULATE(INFINI_DEVICE_ASCEND, ascend)
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca)
+        CALCULATE(INFINI_DEVICE_METAX, maca);
 #endif
 #ifdef ENABLE_MOORE_API
-        CALCULATE(INFINI_DEVICE_MOORE, musa)
+        CALCULATE(INFINI_DEVICE_MOORE, musa);
 #endif
     }
 
@@ -152,17 +152,17 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
 #define DESTROY(CASE, NAMESPACE)                                              \
     case CASE:                                                                \
         delete reinterpret_cast<op::rms_norm::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        DESTROY(INFINI_DEVICE_CPU, cpu)
+        DESTROY(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DESTROY(INFINI_DEVICE_NVIDIA, cuda)
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        DESTROY(INFINI_DEVICE_KUNLUN, kunlun)
+        DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -170,13 +170,13 @@ __C infiniStatus_t infiniopDestroyRMSNormDescriptor(infiniopRMSNormDescriptor_t
     }
 #endif
 #ifdef ENABLE_ASCEND_API
-        DESTROY(INFINI_DEVICE_ASCEND, ascend)
+        DESTROY(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        DESTROY(INFINI_DEVICE_METAX, maca)
+        DESTROY(INFINI_DEVICE_METAX, maca);
 #endif
 #ifdef ENABLE_MOORE_API
-        DESTROY(INFINI_DEVICE_MOORE, musa)
+        DESTROY(INFINI_DEVICE_MOORE, musa);
 #endif
     }
 
diff --git a/src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh b/src/infiniop/ops/rope/cuda/kernel.cuh
similarity index 97%
rename from src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh
rename to src/infiniop/ops/rope/cuda/kernel.cuh
index c2a36a377..3d584ac76 100644
--- a/src/infiniop/ops/rope/cuda/rope_cuda_kernel.cuh
+++ b/src/infiniop/ops/rope/cuda/kernel.cuh
@@ -1,8 +1,6 @@
 #ifndef __INFINIOP_ROPE_CUDA_KERNEL_CUH__
 #define __INFINIOP_ROPE_CUDA_KERNEL_CUH__
 
-#include "../../../devices/cuda/cuda_kernel_common.cuh"
-
 template <typename Tdata, typename Tindex, typename Tangle>
 INFINIOP_CUDA_KERNEL ropeThreadPerItem(
     Tdata *y_,
diff --git a/src/infiniop/ops/rope/cuda/rope_cuda.cu b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu
similarity index 96%
rename from src/infiniop/ops/rope/cuda/rope_cuda.cu
rename to src/infiniop/ops/rope/nvidia/rope_nvidia.cu
index 6f35f394f..c2833dcf8 100644
--- a/src/infiniop/ops/rope/cuda/rope_cuda.cu
+++ b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu
@@ -1,8 +1,11 @@
 #include "../../../devices/cuda/cuda_common.cuh"
-#include "rope_cuda.cuh"
-#include "rope_cuda_kernel.cuh"
+#include "rope_nvidia.cuh"
 
-namespace op::rope::cuda {
+#include "../../../devices/cuda/cuda_kernel_common.cuh"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::rope::nvidia {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::cuda::Handle::Internal> internal;
diff --git a/src/infiniop/ops/rope/cuda/rope_cuda.cuh b/src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
similarity index 86%
rename from src/infiniop/ops/rope/cuda/rope_cuda.cuh
rename to src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
index 003a961c2..6b2455600 100644
--- a/src/infiniop/ops/rope/cuda/rope_cuda.cuh
+++ b/src/infiniop/ops/rope/nvidia/rope_nvidia.cuh
@@ -3,6 +3,6 @@
 
 #include "../rope.h"
 
-DESCRIPTOR(cuda)
+DESCRIPTOR(nvidia)
 
 #endif // __INFINIOP_ROPE_CUDA_H__
diff --git a/src/infiniop/ops/rope/operator.cc b/src/infiniop/ops/rope/operator.cc
index 95c42d06d..077f6e86b 100644
--- a/src/infiniop/ops/rope/operator.cc
+++ b/src/infiniop/ops/rope/operator.cc
@@ -6,7 +6,7 @@
 #include "cpu/rope_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/rope_cuda.cuh"
+#include "nvidia/rope_nvidia.cuh"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/rope_ascend.h"
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
         CREATE(INFINI_DEVICE_METAX, maca);
@@ -81,7 +81,7 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
         GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda);
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
         GET(INFINI_DEVICE_METAX, maca);
@@ -132,7 +132,7 @@ __C infiniStatus_t infiniopRoPE(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
         CALCULATE(INFINI_DEVICE_METAX, maca);
@@ -178,7 +178,7 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
         DELETE(INFINI_DEVICE_METAX, maca);
diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda_internal.cuh b/src/infiniop/ops/swiglu/cuda/kernel.cuh
similarity index 100%
rename from src/infiniop/ops/swiglu/cuda/swiglu_cuda_internal.cuh
rename to src/infiniop/ops/swiglu/cuda/kernel.cuh
diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cu b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
similarity index 72%
rename from src/infiniop/ops/swiglu/cuda/swiglu_cuda.cu
rename to src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
index f124a671f..ae744fdc7 100644
--- a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cu
+++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
@@ -1,7 +1,7 @@
-#include "swiglu_cuda.cuh"
-#include "swiglu_cuda_internal.cuh"
+#include "swiglu_nvidia.cuh"
+#include "../cuda/kernel.cuh"
 
-namespace op::swiglu::cuda {
+namespace op::swiglu::nvidia {
 
 Descriptor::~Descriptor() = default;
 
@@ -42,13 +42,13 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
diff --git a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
similarity index 77%
rename from src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh
rename to src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
index e91f9dbc0..c3b26821d 100644
--- a/src/infiniop/ops/swiglu/cuda/swiglu_cuda.cuh
+++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cuh
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(swiglu, cuda, cuda)
+ELEMENTWISE_DESCRIPTOR(swiglu, nvidia, cuda)
 
 #endif // __SWIGLU_CUDA_API_H__
diff --git a/src/infiniop/ops/swiglu/operator.cc b/src/infiniop/ops/swiglu/operator.cc
index 018ec3bd2..2b831016d 100644
--- a/src/infiniop/ops/swiglu/operator.cc
+++ b/src/infiniop/ops/swiglu/operator.cc
@@ -6,7 +6,7 @@
 #include "cpu/swiglu_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/swiglu_cuda.cuh"
+#include "nvidia/swiglu_nvidia.cuh"
 #endif
 #ifdef ENABLE_KUNLUN_API
 #include "kunlun/swiglu_kunlun.h"
@@ -40,7 +40,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
         CREATE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -83,17 +83,17 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
 #define GET(CASE, NAMESPACE)                                                                  \
     case CASE:                                                                                \
         *size = reinterpret_cast<op::swiglu::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
+        GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
-        GET(INFINI_DEVICE_KUNLUN, kunlun)
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_METAX_API
         GET(INFINI_DEVICE_METAX, maca);
@@ -104,12 +104,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
     }
 #endif
 #ifdef ENABLE_ASCEND_API
-        GET(INFINI_DEVICE_ASCEND, ascend)
-#endif
-#ifdef ENABLE_METAX_GPU
-    case DevMetaxGpu: {
-        return macaGetSwiGLUWorkspaceSize((SwiGLUMacaDescriptor_t)desc, size);
-    }
+        GET(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_MTHREADS_GPU
     case DevMthreadsGpu: {
@@ -143,7 +138,7 @@ __C infiniStatus_t infiniopSwiGLU(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
         CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
@@ -189,7 +184,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_KUNLUN_API
         DELETE(INFINI_DEVICE_KUNLUN, kunlun);

From 05247bb7c0fa766024190490b8485f957e56484d Mon Sep 17 00:00:00 2001
From: PanZezhong <panzezhong@qiyuanlab.com>
Date: Thu, 10 Jul 2025 09:06:47 +0000
Subject: [PATCH 6/8] =?UTF-8?q?issue/291/refactor:=20=E9=80=82=E9=85=8D?=
 =?UTF-8?q?=E6=B2=90=E6=9B=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: PanZezhong <panzezhong@qiyuanlab.com>
---
 .../devices/cuda/cuda_kernel_common.cuh       |  1 +
 .../devices/maca/maca_kernel_common.h         | 12 +++---
 src/infiniop/ops/gemm/maca/gemm_maca.cc       |  9 ++--
 src/infiniop/ops/rms_norm/cuda/kernel.cuh     |  4 +-
 .../ops/rms_norm/metax/rms_norm_metax.maca    | 40 +++++++++++++-----
 .../ops/rms_norm/nvidia/rms_norm_nvidia.cu    | 30 +++++++++----
 src/infiniop/ops/rope/cuda/kernel.cuh         |  8 ++--
 src/infiniop/ops/rope/maca/rope_maca_kernel.h | 42 -------------------
 .../{maca/rope_maca.h => metax/rope_metax.h}  |  2 +-
 .../rope_maca.maca => metax/rope_metax.maca}  | 33 +++++++++++++--
 src/infiniop/ops/rope/nvidia/rope_nvidia.cu   | 26 ++++++++++--
 src/infiniop/ops/rope/operator.cc             | 10 ++---
 src/infiniop/ops/swiglu/cuda/kernel.cuh       | 16 +++----
 .../ops/swiglu/maca/swiglu_maca_internal.h    | 40 ------------------
 .../swiglu_maca.h => metax/swiglu_metax.h}    |  2 +-
 .../swiglu_metax.maca}                        | 21 ++++++----
 .../ops/swiglu/nvidia/swiglu_nvidia.cu        |  5 ++-
 src/infiniop/ops/swiglu/operator.cc           | 10 ++---
 xmake.lua                                     |  2 +-
 19 files changed, 155 insertions(+), 158 deletions(-)
 delete mode 100644 src/infiniop/ops/rope/maca/rope_maca_kernel.h
 rename src/infiniop/ops/rope/{maca/rope_maca.h => metax/rope_metax.h} (87%)
 rename src/infiniop/ops/rope/{maca/rope_maca.maca => metax/rope_metax.maca} (81%)
 delete mode 100644 src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
 rename src/infiniop/ops/swiglu/{maca/swiglu_maca.h => metax/swiglu_metax.h} (77%)
 rename src/infiniop/ops/swiglu/{maca/swiglu_maca.maca => metax/swiglu_metax.maca} (62%)

diff --git a/src/infiniop/devices/cuda/cuda_kernel_common.cuh b/src/infiniop/devices/cuda/cuda_kernel_common.cuh
index cad778d9a..41b448069 100644
--- a/src/infiniop/devices/cuda/cuda_kernel_common.cuh
+++ b/src/infiniop/devices/cuda/cuda_kernel_common.cuh
@@ -16,6 +16,7 @@
 #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
 
 using cuda_bfloat16 = nv_bfloat16;
+using cuda_bfloat162 = nv_bfloat162;
 
 namespace device::cuda {
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
diff --git a/src/infiniop/devices/maca/maca_kernel_common.h b/src/infiniop/devices/maca/maca_kernel_common.h
index af76f78fe..5e2a7a254 100644
--- a/src/infiniop/devices/maca/maca_kernel_common.h
+++ b/src/infiniop/devices/maca/maca_kernel_common.h
@@ -1,8 +1,5 @@
 #define INFINIOP_MACA_KERNEL __global__ void
 
-#include <maca_bf16.h>
-#include <maca_fp16.h>
-
 // Posible maximum number of threads per block for MACA architectures
 // Used for picking correct kernel launch configuration
 #define MACA_BLOCK_SIZE_1024 1024
@@ -10,7 +7,8 @@
 
 #define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
 
-using cuda_bfloat16 = maca_bfloat16;
+using cuda_bfloat16 = hpcc_bfloat16;
+using cuda_bfloat162 = hpcc_bfloat162;
 
 namespace device::maca {
 
@@ -52,7 +50,7 @@ exp_(const float val) {
 
 __forceinline__ __device__ long double
 exp_(const long double val) {
-    return expl(val);
+    return exp(val);
 }
 
 __forceinline__ __device__ double
@@ -65,7 +63,7 @@ exp_(const __half x) {
     return hexp(x);
 }
 
-__forceinline__ __device__ __hpcc_bfloat16;
-exp_(const __hpcc_bfloat16; x) {
+__forceinline__ __device__ __hpcc_bfloat16
+exp_(const __hpcc_bfloat16 x) {
     return hexp(x);
 }
diff --git a/src/infiniop/ops/gemm/maca/gemm_maca.cc b/src/infiniop/ops/gemm/maca/gemm_maca.cc
index e1f1b9674..2c824a377 100644
--- a/src/infiniop/ops/gemm/maca/gemm_maca.cc
+++ b/src/infiniop/ops/gemm/maca/gemm_maca.cc
@@ -21,9 +21,7 @@ infiniStatus_t Descriptor::create(
     auto handle = reinterpret_cast<device::maca::Handle *>(handle_);
     auto dtype = c_desc->dtype();
 
-    if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32) {
-        return INFINI_STATUS_BAD_TENSOR_DTYPE;
-    }
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
 
     auto result = MatmulInfo::create(c_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
     CHECK_RESULT(result);
@@ -53,7 +51,10 @@ infiniStatus_t Descriptor::calculate(
         a_type = b_type = c_type = HPCC_R_16F;
         compute_type = HCBLAS_COMPUTE_32F;
         break;
-
+    case INFINI_DTYPE_BF16:
+        a_type = b_type = c_type = HPCC_R_16BF;
+        compute_type = HCBLAS_COMPUTE_32F;
+        break;
     case INFINI_DTYPE_F32:
         a_type = b_type = c_type = HPCC_R_32F;
         compute_type = HCBLAS_COMPUTE_32F_FAST_TF32;
diff --git a/src/infiniop/ops/rms_norm/cuda/kernel.cuh b/src/infiniop/ops/rms_norm/cuda/kernel.cuh
index 3185672e2..53c4a5587 100644
--- a/src/infiniop/ops/rms_norm/cuda/kernel.cuh
+++ b/src/infiniop/ops/rms_norm/cuda/kernel.cuh
@@ -1,8 +1,8 @@
 #ifndef __RMS_NORM_CUDA_KERNEL_H__
 #define __RMS_NORM_CUDA_KERNEL_H__
 
-template <unsigned int BLOCK_SIZE, typename Tdata, typename Tweight, typename Tcompute>
-INFINIOP_CUDA_KERNEL rmsnormBlock(
+template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
+__device__ void rmsnormBlock(
     Tdata *__restrict__ y,
     ptrdiff_t stride_y,
     const Tdata *__restrict__ x,
diff --git a/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
index 42e70d812..2e76303f3 100644
--- a/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
+++ b/src/infiniop/ops/rms_norm/metax/rms_norm_metax.maca
@@ -1,7 +1,25 @@
 #include "../../../devices/maca/common_maca.h"
-#include "../cuda/rms_norm_kernel.cuh"
 #include "rms_norm_metax.cuh"
 
+#include "../../../devices/maca/maca_kernel_common.h"
+#include <cub/block/block_reduce.cuh>
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "../cuda/kernel.cuh"
+
+template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
+INFINIOP_MACA_KERNEL rmsnormKernel(
+    Tdata *__restrict__ y,
+    ptrdiff_t stride_y,
+    const Tdata *__restrict__ x,
+    ptrdiff_t stride_x,
+    const Tweight *__restrict__ w,
+    size_t dim,
+    float epsilon) {
+    rmsnormBlock<BLOCK_SIZE, Tcompute>(y, stride_y, x, stride_x, w, dim, epsilon);
+}
+
 namespace op::rms_norm::maca {
 
 struct Descriptor::Opaque {
@@ -46,14 +64,14 @@ infiniStatus_t launchKernel(
     float epsilon,
     hcStream_t maca_stream) {
 
-#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                     \
-    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
-        reinterpret_cast<Tdata *>(y),                                                               \
-        stride_y,                                                                                   \
-        reinterpret_cast<const Tdata *>(x),                                                         \
-        stride_x,                                                                                   \
-        reinterpret_cast<const Tweight *>(w),                                                       \
-        dim,                                                                                        \
+#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                      \
+    rmsnormKernel<BLOCK_SIZE, Tcompute, Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, maca_stream>>>( \
+        reinterpret_cast<Tdata *>(y),                                                                \
+        stride_y,                                                                                    \
+        reinterpret_cast<const Tdata *>(x),                                                          \
+        stride_x,                                                                                    \
+        reinterpret_cast<const Tweight *>(w),                                                        \
+        dim,                                                                                         \
         epsilon)
 
     if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
@@ -91,8 +109,8 @@ infiniStatus_t Descriptor::calculate(
     auto maca_stream = reinterpret_cast<hcStream_t>(stream);
 
     // launch kernel with different block sizes
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
-        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
+    if (_opaque->internal->maxThreadsPerBlock() == MACA_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<MACA_BLOCK_SIZE_1024>(batch_size, dim, y, _info.atype, stride_y, x, stride_x, w, _info.wtype, _info.epsilon, maca_stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
diff --git a/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
index 12435df72..bc6f9fe20 100644
--- a/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
+++ b/src/infiniop/ops/rms_norm/nvidia/rms_norm_nvidia.cu
@@ -8,6 +8,18 @@
 
 #include "../cuda/kernel.cuh"
 
+template <unsigned int BLOCK_SIZE, typename Tcompute, typename Tdata, typename Tweight>
+INFINIOP_CUDA_KERNEL rmsnormKernel(
+    Tdata *__restrict__ y,
+    ptrdiff_t stride_y,
+    const Tdata *__restrict__ x,
+    ptrdiff_t stride_x,
+    const Tweight *__restrict__ w,
+    size_t dim,
+    float epsilon) {
+    rmsnormBlock<BLOCK_SIZE, Tcompute>(y, stride_y, x, stride_x, w, dim, epsilon);
+}
+
 namespace op::rms_norm::nvidia {
 
 struct Descriptor::Opaque {
@@ -52,14 +64,14 @@ infiniStatus_t launchKernel(
     float epsilon,
     cudaStream_t cuda_stream) {
 
-#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                     \
-    rmsnormBlock<BLOCK_SIZE, Tdata, Tweight, Tcompute><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
-        reinterpret_cast<Tdata *>(y),                                                               \
-        stride_y,                                                                                   \
-        reinterpret_cast<const Tdata *>(x),                                                         \
-        stride_x,                                                                                   \
-        reinterpret_cast<const Tweight *>(w),                                                       \
-        dim,                                                                                        \
+#define LAUNCH_KERNEL(Tdata, Tweight, Tcompute)                                                      \
+    rmsnormKernel<BLOCK_SIZE, Tcompute, Tdata, Tweight><<<batch_size, BLOCK_SIZE, 0, cuda_stream>>>( \
+        reinterpret_cast<Tdata *>(y),                                                                \
+        stride_y,                                                                                    \
+        reinterpret_cast<const Tdata *>(x),                                                          \
+        stride_x,                                                                                    \
+        reinterpret_cast<const Tweight *>(w),                                                        \
+        dim,                                                                                         \
         epsilon)
 
     if (atype == INFINI_DTYPE_F16 && wtype == INFINI_DTYPE_F16) {
@@ -108,4 +120,4 @@ infiniStatus_t Descriptor::calculate(
     }
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::rms_norm::cuda
+} // namespace op::rms_norm::nvidia
diff --git a/src/infiniop/ops/rope/cuda/kernel.cuh b/src/infiniop/ops/rope/cuda/kernel.cuh
index 3d584ac76..01f2bc9d1 100644
--- a/src/infiniop/ops/rope/cuda/kernel.cuh
+++ b/src/infiniop/ops/rope/cuda/kernel.cuh
@@ -2,7 +2,7 @@
 #define __INFINIOP_ROPE_CUDA_KERNEL_CUH__
 
 template <typename Tdata, typename Tindex, typename Tangle>
-INFINIOP_CUDA_KERNEL ropeThreadPerItem(
+__device__ void ropeThreadPerItemBlock(
     Tdata *y_,
     const Tdata *x_,
     const Tindex *__restrict__ pos_ids,
@@ -28,9 +28,9 @@ INFINIOP_CUDA_KERNEL ropeThreadPerItem(
             Tangle y0 = x.x * cos__ - x.y * sin__,
                    y1 = x.x * sin__ + x.y * cos__;
             y = half2(y0, y1);
-        } else if constexpr (std::is_same<Tdata, __nv_bfloat16>::value) {
-            auto &y = reinterpret_cast<__nv_bfloat162 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const __nv_bfloat162 &>(x_[x_offset + 2 * i]);
+        } else if constexpr (std::is_same<Tdata, cuda_bfloat16>::value) {
+            auto &y = reinterpret_cast<cuda_bfloat162 &>(y_[y_offset + 2 * i]);
+            auto &x = reinterpret_cast<const cuda_bfloat162 &>(x_[x_offset + 2 * i]);
 
             Tangle x0 = __low2bfloat16(x);
             Tangle x1 = __high2bfloat16(x);
diff --git a/src/infiniop/ops/rope/maca/rope_maca_kernel.h b/src/infiniop/ops/rope/maca/rope_maca_kernel.h
deleted file mode 100644
index 8579df1fb..000000000
--- a/src/infiniop/ops/rope/maca/rope_maca_kernel.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef __INFINIOP_ROPE_MACA_KERNEL_H__
-#define __INFINIOP_ROPE_MACA_KERNEL_H__
-
-#include "../../../devices/maca/maca_kernel_common.h"
-
-template <typename Tdata, typename Tindex, typename Tangle>
-INFINIOP_MACA_KERNEL ropeThreadPerItem(
-    Tdata *y_,
-    const Tdata *x_,
-    const Tindex *__restrict__ pos_ids,
-    const Tangle *__restrict__ sin_table,
-    const Tangle *__restrict__ cos_table,
-    size_t table_dim,
-    ptrdiff_t y_stride_seqlen,
-    ptrdiff_t y_stride_nhead,
-    ptrdiff_t x_stride_seqlen,
-    ptrdiff_t x_stride_nhead) {
-
-    auto y_offset = blockIdx.x * y_stride_seqlen + blockIdx.y * y_stride_nhead;
-    auto x_offset = blockIdx.x * x_stride_seqlen + blockIdx.y * x_stride_nhead;
-    size_t pos_id = size_t(pos_ids[blockIdx.x]);
-    auto table_offset = pos_id * table_dim;
-
-    for (size_t i = threadIdx.x; i < table_dim; i += blockDim.x) {
-        Tangle sin__ = sin_table[table_offset + i],
-               cos__ = cos_table[table_offset + i];
-        if constexpr (std::is_same<Tdata, half>::value) {
-            auto &y = reinterpret_cast<half2 &>(y_[y_offset + 2 * i]);
-            auto &x = reinterpret_cast<const half2 &>(x_[x_offset + 2 * i]);
-            Tangle y0 = x.x * cos__ - x.y * sin__,
-                   y1 = x.x * sin__ + x.y * cos__;
-            y = half2(y0, y1);
-        } else {
-            Tangle x0 = x_[x_offset + 2 * i],
-                   x1 = x_[x_offset + 2 * i + 1];
-            y_[y_offset + 2 * i] = Tdata(x0 * cos__ - x1 * sin__);
-            y_[y_offset + 2 * i + 1] = Tdata(x0 * sin__ + x1 * cos__);
-        }
-    }
-}
-
-#endif
diff --git a/src/infiniop/ops/rope/maca/rope_maca.h b/src/infiniop/ops/rope/metax/rope_metax.h
similarity index 87%
rename from src/infiniop/ops/rope/maca/rope_maca.h
rename to src/infiniop/ops/rope/metax/rope_metax.h
index 106ed2cc0..40908f092 100644
--- a/src/infiniop/ops/rope/maca/rope_maca.h
+++ b/src/infiniop/ops/rope/metax/rope_metax.h
@@ -3,6 +3,6 @@
 
 #include "../rope.h"
 
-DESCRIPTOR(maca)
+DESCRIPTOR(metax)
 
 #endif // __INFINIOP_ROPE_MACA_H__
diff --git a/src/infiniop/ops/rope/maca/rope_maca.maca b/src/infiniop/ops/rope/metax/rope_metax.maca
similarity index 81%
rename from src/infiniop/ops/rope/maca/rope_maca.maca
rename to src/infiniop/ops/rope/metax/rope_metax.maca
index 7c079e6d7..d7a3b9e82 100644
--- a/src/infiniop/ops/rope/maca/rope_maca.maca
+++ b/src/infiniop/ops/rope/metax/rope_metax.maca
@@ -1,8 +1,31 @@
 #include "../../../devices/maca/common_maca.h"
-#include "rope_maca.h"
-#include "rope_maca_kernel.h"
+#include "rope_metax.h"
+
+#include "../../../devices/maca/maca_kernel_common.h"
+
+#include "../cuda/kernel.cuh"
+
+template <typename Tdata, typename Tindex, typename Tangle>
+INFINIOP_MACA_KERNEL ropeThreadPerItemKernel(
+    Tdata *y_,
+    const Tdata *x_,
+    const Tindex *__restrict__ pos_ids,
+    const Tangle *__restrict__ sin_table,
+    const Tangle *__restrict__ cos_table,
+    size_t table_dim,
+    ptrdiff_t y_stride_seqlen,
+    ptrdiff_t y_stride_nhead,
+    ptrdiff_t x_stride_seqlen,
+    ptrdiff_t x_stride_nhead) {
+    ropeThreadPerItemBlock(
+        y_, x_, pos_ids,
+        sin_table, cos_table,
+        table_dim,
+        y_stride_seqlen, y_stride_nhead,
+        x_stride_seqlen, x_stride_nhead);
+}
 
-namespace op::rope::maca {
+namespace op::rope::metax {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::maca::Handle::Internal> internal;
@@ -50,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
          dimy = uint32_t(info.nhead);
     int nthreads = std::max(int(info.table_dim), block_size);
 
-    ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
         y, x, pos_ids, sin_table, cos_table, info.table_dim,
         info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
 
@@ -102,6 +125,8 @@ infiniStatus_t Descriptor::calculate(
     switch (_info.data_type) {
     case INFINI_DTYPE_F16:
         ROPE_TYPE(half);
+    case INFINI_DTYPE_BF16:
+        ROPE_TYPE(cuda_bfloat16);
     case INFINI_DTYPE_F32:
         ROPE_TYPE(float);
     case INFINI_DTYPE_F64:
diff --git a/src/infiniop/ops/rope/nvidia/rope_nvidia.cu b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu
index c2833dcf8..d4d5f0f9a 100644
--- a/src/infiniop/ops/rope/nvidia/rope_nvidia.cu
+++ b/src/infiniop/ops/rope/nvidia/rope_nvidia.cu
@@ -5,6 +5,26 @@
 
 #include "../cuda/kernel.cuh"
 
+template <typename Tdata, typename Tindex, typename Tangle>
+INFINIOP_CUDA_KERNEL ropeThreadPerItemKernel(
+    Tdata *y_,
+    const Tdata *x_,
+    const Tindex *__restrict__ pos_ids,
+    const Tangle *__restrict__ sin_table,
+    const Tangle *__restrict__ cos_table,
+    size_t table_dim,
+    ptrdiff_t y_stride_seqlen,
+    ptrdiff_t y_stride_nhead,
+    ptrdiff_t x_stride_seqlen,
+    ptrdiff_t x_stride_nhead) {
+    ropeThreadPerItemBlock(
+        y_, x_, pos_ids,
+        sin_table, cos_table,
+        table_dim,
+        y_stride_seqlen, y_stride_nhead,
+        x_stride_seqlen, x_stride_nhead);
+}
+
 namespace op::rope::nvidia {
 
 struct Descriptor::Opaque {
@@ -53,7 +73,7 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
          dimy = uint32_t(info.nhead);
     int nthreads = std::max(int(info.table_dim), block_size);
 
-    ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
+    ropeThreadPerItemKernel<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
         y, x, pos_ids, sin_table, cos_table, info.table_dim,
         info.y_stride_seqlen, info.y_stride_nhead, info.x_stride_seqlen, info.x_stride_nhead);
 
@@ -106,7 +126,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F16:
         ROPE_TYPE(half);
     case INFINI_DTYPE_BF16:
-        ROPE_TYPE(__nv_bfloat16);
+        ROPE_TYPE(cuda_bfloat16);
     case INFINI_DTYPE_F32:
         ROPE_TYPE(float);
     case INFINI_DTYPE_F64:
@@ -121,4 +141,4 @@ infiniStatus_t Descriptor::calculate(
 #undef ROPE_TYPE
 #undef CALCULATE_ROPE
 
-} // namespace op::rope::cuda
+} // namespace op::rope::nvidia
diff --git a/src/infiniop/ops/rope/operator.cc b/src/infiniop/ops/rope/operator.cc
index 077f6e86b..e5bbbe3b7 100644
--- a/src/infiniop/ops/rope/operator.cc
+++ b/src/infiniop/ops/rope/operator.cc
@@ -12,7 +12,7 @@
 #include "ascend/rope_ascend.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/rope_maca.h"
+#include "metax/rope_metax.h"
 #endif
 
 __C infiniStatus_t infiniopCreateRoPEDescriptor(
@@ -43,7 +43,7 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
         CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca);
+        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
         CREATE(INFINI_DEVICE_ASCEND, ascend);
@@ -84,7 +84,7 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
         GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca);
+        GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -135,7 +135,7 @@ __C infiniStatus_t infiniopRoPE(
         CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca);
+        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -181,7 +181,7 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
         DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 #ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, maca);
+        DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
diff --git a/src/infiniop/ops/swiglu/cuda/kernel.cuh b/src/infiniop/ops/swiglu/cuda/kernel.cuh
index a75fd53e9..3ce26a8fe 100644
--- a/src/infiniop/ops/swiglu/cuda/kernel.cuh
+++ b/src/infiniop/ops/swiglu/cuda/kernel.cuh
@@ -1,10 +1,6 @@
 #ifndef __SWIGLU_CUDA_H__
 #define __SWIGLU_CUDA_H__
 
-#include "../../../elementwise/cuda/elementwise_cuda.cuh"
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
-
 namespace op::swiglu::cuda {
 typedef struct SwiGLUOp {
 private:
@@ -14,13 +10,13 @@ private:
             return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
         } else if constexpr (std::is_same_v<T, half>) {
             return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
-        } else if constexpr (std::is_same_v<T, __nv_bfloat162>) {
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
             float x0 = __bfloat162float(__low2bfloat16(x));
             float x1 = __bfloat162float(__high2bfloat16(x));
             float sig0 = __frcp_rn(__fadd_rn(1.0f, __expf(-x0)));
             float sig1 = __frcp_rn(__fadd_rn(1.0f, __expf(-x1)));
             return __floats2bfloat162_rn(sig0, sig1);
-        } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
             float xf = __bfloat162float(x);
             return __float2bfloat16_rn(__frcp_rn(__fadd_rn(1.0f, __expf(-xf))));
         } else if constexpr (std::is_same_v<T, float>) {
@@ -38,8 +34,8 @@ public:
             return __hmul2(__hmul2(gate, sigmoid(gate)), up);
         } else if constexpr (std::is_same_v<T, half>) {
             return __hmul(__hmul(gate, sigmoid(gate)), up);
-        } else if constexpr (std::is_same_v<T, __nv_bfloat162>) {
-            __nv_bfloat162 sig = sigmoid(gate);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat162>) {
+            cuda_bfloat162 sig = sigmoid(gate);
             float gate0 = __bfloat162float(__low2bfloat16(gate));
             float gate1 = __bfloat162float(__high2bfloat16(gate));
             float sig0 = __bfloat162float(__low2bfloat16(sig));
@@ -49,8 +45,8 @@ public:
             float res0 = __fmul_rn(__fmul_rn(gate0, sig0), up0);
             float res1 = __fmul_rn(__fmul_rn(gate1, sig1), up1);
             return __floats2bfloat162_rn(res0, res1);
-        } else if constexpr (std::is_same_v<T, __nv_bfloat16>) {
-            __nv_bfloat16 sig = sigmoid(gate);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            cuda_bfloat16 sig = sigmoid(gate);
             float gatef = __bfloat162float(gate);
             float sigf = __bfloat162float(sig);
             float upf = __bfloat162float(up);
diff --git a/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h b/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
deleted file mode 100644
index 8af1068f4..000000000
--- a/src/infiniop/ops/swiglu/maca/swiglu_maca_internal.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __SWIGLU_MACA_H__
-#define __SWIGLU_MACA_H__
-
-#include "../../../elementwise/maca/elementwise_maca.h"
-#include <hctlass/half.h>
-
-namespace op::swiglu::maca {
-typedef struct SwiGLUOp {
-private:
-    template <typename T>
-    __device__ __forceinline__ T sigmoid(const T &x) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return h2rcp(__hadd2(make_half2(1, 1), h2exp(__hneg2(x))));
-        } else if constexpr (std::is_same_v<T, half>) {
-            return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __frcp_rn(__fadd_rn(1, __expf(-x)));
-        } else {
-            return 1 / (1 + std::exp(-x));
-        }
-    }
-
-public:
-    static constexpr size_t num_inputs = 2;
-    template <typename T>
-    __device__ __forceinline__ T operator()(const T &up, const T &gate) const {
-        if constexpr (std::is_same_v<T, half2>) {
-            return __hmul2(__hmul2(gate, sigmoid(gate)), up);
-        } else if constexpr (std::is_same_v<T, half>) {
-            return __hmul(__hmul(gate, sigmoid(gate)), up);
-        } else if constexpr (std::is_same_v<T, float>) {
-            return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
-        } else {
-            return gate * sigmoid(gate) * up;
-        }
-    }
-} SwiGLUOp;
-} // namespace op::swiglu::maca
-
-#endif
diff --git a/src/infiniop/ops/swiglu/maca/swiglu_maca.h b/src/infiniop/ops/swiglu/metax/swiglu_metax.h
similarity index 77%
rename from src/infiniop/ops/swiglu/maca/swiglu_maca.h
rename to src/infiniop/ops/swiglu/metax/swiglu_metax.h
index d261d6e6e..830275fb8 100644
--- a/src/infiniop/ops/swiglu/maca/swiglu_maca.h
+++ b/src/infiniop/ops/swiglu/metax/swiglu_metax.h
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/maca/elementwise_maca_api.h"
 
-ELEMENTWISE_DESCRIPTOR(swiglu, maca)
+ELEMENTWISE_DESCRIPTOR(swiglu, metax, maca)
 
 #endif // __SWIGLU_MACA_API_H__
diff --git a/src/infiniop/ops/swiglu/maca/swiglu_maca.maca b/src/infiniop/ops/swiglu/metax/swiglu_metax.maca
similarity index 62%
rename from src/infiniop/ops/swiglu/maca/swiglu_maca.maca
rename to src/infiniop/ops/swiglu/metax/swiglu_metax.maca
index c2515b580..cc511f467 100644
--- a/src/infiniop/ops/swiglu/maca/swiglu_maca.maca
+++ b/src/infiniop/ops/swiglu/metax/swiglu_metax.maca
@@ -1,7 +1,10 @@
-#include "swiglu_maca.h"
-#include "swiglu_maca_internal.h"
+#include "swiglu_metax.h"
 
-namespace op::swiglu::maca {
+#include "../../../elementwise/maca/elementwise_maca.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::swiglu::metax {
 
 Descriptor::~Descriptor() = default;
 
@@ -20,7 +23,7 @@ infiniStatus_t Descriptor::create(
     const auto &up_shape = up_desc->shape();
     const auto &gate_shape = gate_desc->shape();
 
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
     CHECK_SAME_SHAPE(out_shape, up_shape, gate_shape);
 
     // create MACA elementwise descriptor
@@ -42,15 +45,17 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, SwiGLUOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, SwiGLUOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, SwiGLUOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, double>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::swiglu::maca
+} // namespace op::swiglu::metax
diff --git a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
index ae744fdc7..682077e13 100644
--- a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
+++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
@@ -1,4 +1,7 @@
 #include "swiglu_nvidia.cuh"
+
+#include "../../../elementwise/cuda/elementwise_cuda.cuh"
+
 #include "../cuda/kernel.cuh"
 
 namespace op::swiglu::nvidia {
@@ -44,7 +47,7 @@ infiniStatus_t Descriptor::calculate(
     case INFINI_DTYPE_F16:
         return _device_info->calculate<256, cuda::SwiGLUOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::SwiGLUOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::SwiGLUOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
         return _device_info->calculate<256, cuda::SwiGLUOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
diff --git a/src/infiniop/ops/swiglu/operator.cc b/src/infiniop/ops/swiglu/operator.cc
index 2b831016d..838276521 100644
--- a/src/infiniop/ops/swiglu/operator.cc
+++ b/src/infiniop/ops/swiglu/operator.cc
@@ -12,7 +12,7 @@
 #include "kunlun/swiglu_kunlun.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/swiglu_maca.h"
+#include "metax/swiglu_metax.h"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/swiglu_ascend.h"
@@ -46,7 +46,7 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
         CREATE(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca);
+        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -96,7 +96,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
         GET(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca);
+        GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -144,7 +144,7 @@ __C infiniStatus_t infiniopSwiGLU(
         CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca);
+        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
@@ -190,7 +190,7 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
         DELETE(INFINI_DEVICE_KUNLUN, kunlun);
 #endif
 #ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, maca);
+        DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_CAMBRICON_MLU
     case DevCambriconMlu: {
diff --git a/xmake.lua b/xmake.lua
index 753d7a588..e0c610dad 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -174,7 +174,7 @@ target("infini-utils")
         add_cxflags("-fPIC", "-Wno-unknown-pragmas")
         if has_config("omp") then
             add_cxflags("-fopenmp")
-            add_ldflags("-fopenmp")
+            add_ldflags("-fopenmp", {force = true})
         end
     end
 

From d15aaa3b20717efa6cc31731b76c417d0a66d3c1 Mon Sep 17 00:00:00 2001
From: PanZezhong <panzezhong@qiyuanlab.com>
Date: Fri, 11 Jul 2025 05:48:53 +0000
Subject: [PATCH 7/8] =?UTF-8?q?issue/291/style:=20=E4=B8=80=E7=B3=BB?=
 =?UTF-8?q?=E5=88=97=20maca=20=E6=94=B9=E4=B8=BA=20metax=20=E5=B9=B6?=
 =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: PanZezhong <panzezhong@qiyuanlab.com>
---
 src/infiniop/ops/add/nvidia/add_nvidia.cu              |  4 ++--
 .../gemm/{maca/gemm_maca.cc => metax/gemm_metax.cc}    |  6 +++---
 .../ops/gemm/{maca/gemm_maca.h => metax/gemm_metax.h}  |  2 +-
 src/infiniop/ops/gemm/operator.cc                      | 10 +++++-----
 src/infiniop/ops/mul/nvidia/mul_nvidia.cu              |  2 +-
 .../{maca => metax}/random_sample_kernel.h             |  4 ++--
 .../random_sample_metax.h}                             |  2 +-
 .../random_sample_metax.maca}                          |  4 ++--
 src/infiniop/ops/random_sample/operator.cc             | 10 +++++-----
 .../ops/rearrange/{maca => metax}/rearrange_kernel.h   |  0
 .../{maca/rearrange_maca.h => metax/rearrange_metax.h} |  2 +-
 .../rearrange_maca.maca => metax/rearrange_metax.maca} |  6 +++---
 src/infiniop/ops/rearrange/operator.cc                 |  8 ++++----
 src/infiniop/ops/sub/nvidia/sub_nvidia.cu              |  4 ++--
 src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu        |  2 +-
 xmake/metax.lua                                        |  4 ++--
 16 files changed, 35 insertions(+), 35 deletions(-)
 rename src/infiniop/ops/gemm/{maca/gemm_maca.cc => metax/gemm_metax.cc} (97%)
 rename src/infiniop/ops/gemm/{maca/gemm_maca.h => metax/gemm_metax.h} (84%)
 rename src/infiniop/ops/random_sample/{maca => metax}/random_sample_kernel.h (98%)
 rename src/infiniop/ops/random_sample/{maca/random_sample_maca.h => metax/random_sample_metax.h} (88%)
 rename src/infiniop/ops/random_sample/{maca/random_sample_maca.maca => metax/random_sample_metax.maca} (97%)
 rename src/infiniop/ops/rearrange/{maca => metax}/rearrange_kernel.h (100%)
 rename src/infiniop/ops/rearrange/{maca/rearrange_maca.h => metax/rearrange_metax.h} (86%)
 rename src/infiniop/ops/rearrange/{maca/rearrange_maca.maca => metax/rearrange_metax.maca} (99%)

diff --git a/src/infiniop/ops/add/nvidia/add_nvidia.cu b/src/infiniop/ops/add/nvidia/add_nvidia.cu
index b31d23b63..7b3764482 100644
--- a/src/infiniop/ops/add/nvidia/add_nvidia.cu
+++ b/src/infiniop/ops/add/nvidia/add_nvidia.cu
@@ -1,5 +1,5 @@
-#include "add_nvidia.cuh"
 #include "../cuda/kernel.cuh"
+#include "add_nvidia.cuh"
 
 namespace op::add::nvidia {
 
@@ -56,4 +56,4 @@ infiniStatus_t Descriptor::calculate(
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::add::cuda
+} // namespace op::add::nvidia
diff --git a/src/infiniop/ops/gemm/maca/gemm_maca.cc b/src/infiniop/ops/gemm/metax/gemm_metax.cc
similarity index 97%
rename from src/infiniop/ops/gemm/maca/gemm_maca.cc
rename to src/infiniop/ops/gemm/metax/gemm_metax.cc
index 2c824a377..65b16b077 100644
--- a/src/infiniop/ops/gemm/maca/gemm_maca.cc
+++ b/src/infiniop/ops/gemm/metax/gemm_metax.cc
@@ -1,8 +1,8 @@
-#include "gemm_maca.h"
+#include "gemm_metax.h"
 #include "../../../devices/maca/common_maca.h"
 #include "../../../devices/maca/maca_handle.h"
 
-namespace op::gemm::maca {
+namespace op::gemm::metax {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::maca::Handle::Internal> internal;
@@ -104,4 +104,4 @@ infiniStatus_t Descriptor::calculate(
     return INFINI_STATUS_SUCCESS;
 }
 
-} // namespace op::gemm::maca
+} // namespace op::gemm::metax
diff --git a/src/infiniop/ops/gemm/maca/gemm_maca.h b/src/infiniop/ops/gemm/metax/gemm_metax.h
similarity index 84%
rename from src/infiniop/ops/gemm/maca/gemm_maca.h
rename to src/infiniop/ops/gemm/metax/gemm_metax.h
index 2c1decd9a..947fb6e83 100644
--- a/src/infiniop/ops/gemm/maca/gemm_maca.h
+++ b/src/infiniop/ops/gemm/metax/gemm_metax.h
@@ -3,6 +3,6 @@
 
 #include "../gemm.h"
 
-DESCRIPTOR(maca)
+DESCRIPTOR(metax)
 
 #endif // __GEMM_MACA_H__
diff --git a/src/infiniop/ops/gemm/operator.cc b/src/infiniop/ops/gemm/operator.cc
index ce067cf25..5ecbdd93a 100644
--- a/src/infiniop/ops/gemm/operator.cc
+++ b/src/infiniop/ops/gemm/operator.cc
@@ -15,7 +15,7 @@
 #include "ascend/gemm_ascend.h"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/gemm_maca.h"
+#include "metax/gemm_metax.h"
 #endif
 #ifdef ENABLE_MOORE_API
 #include "musa/gemm_musa.h"
@@ -55,7 +55,7 @@ __C infiniStatus_t infiniopCreateGemmDescriptor(
         CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca);
+        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
         CREATE(INFINI_DEVICE_MOORE, musa);
@@ -97,7 +97,7 @@ infiniopGetGemmWorkspaceSize(
         GET(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca);
+        GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
         GET(INFINI_DEVICE_MOORE, musa);
@@ -146,7 +146,7 @@ __C infiniStatus_t infiniopGemm(
         CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca);
+        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
         CALCULATE(INFINI_DEVICE_MOORE, musa);
@@ -185,7 +185,7 @@ infiniopDestroyGemmDescriptor(infiniopGemmDescriptor_t desc) {
         DELETE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, maca);
+        DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_MOORE_API
         DELETE(INFINI_DEVICE_MOORE, musa);
diff --git a/src/infiniop/ops/mul/nvidia/mul_nvidia.cu b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu
index 9d71f31e8..fa0af5f7a 100644
--- a/src/infiniop/ops/mul/nvidia/mul_nvidia.cu
+++ b/src/infiniop/ops/mul/nvidia/mul_nvidia.cu
@@ -1,5 +1,5 @@
-#include "mul_nvidia.cuh"
 #include "../cuda/kernel.cuh"
+#include "mul_nvidia.cuh"
 
 namespace op::mul::nvidia {
 
diff --git a/src/infiniop/ops/random_sample/maca/random_sample_kernel.h b/src/infiniop/ops/random_sample/metax/random_sample_kernel.h
similarity index 98%
rename from src/infiniop/ops/random_sample/maca/random_sample_kernel.h
rename to src/infiniop/ops/random_sample/metax/random_sample_kernel.h
index 7a347c897..011580673 100644
--- a/src/infiniop/ops/random_sample/maca/random_sample_kernel.h
+++ b/src/infiniop/ops/random_sample/metax/random_sample_kernel.h
@@ -4,7 +4,7 @@
 #include <hccub/device/device_reduce.cuh>
 #include <hccub/device/device_scan.cuh>
 
-namespace op::random_sample::maca {
+namespace op::random_sample::metax {
 
 // ↓↓↓ 重新封装 cub api，减少模板参数，方便调用
 
@@ -256,4 +256,4 @@ struct Algo {
     }
 };
 
-} // namespace op::random_sample::maca
+} // namespace op::random_sample::metax
diff --git a/src/infiniop/ops/random_sample/maca/random_sample_maca.h b/src/infiniop/ops/random_sample/metax/random_sample_metax.h
similarity index 88%
rename from src/infiniop/ops/random_sample/maca/random_sample_maca.h
rename to src/infiniop/ops/random_sample/metax/random_sample_metax.h
index b35248d6f..cc961479e 100644
--- a/src/infiniop/ops/random_sample/maca/random_sample_maca.h
+++ b/src/infiniop/ops/random_sample/metax/random_sample_metax.h
@@ -3,6 +3,6 @@
 
 #include "../random_sample.h"
 
-DESCRIPTOR(maca)
+DESCRIPTOR(metax)
 
 #endif // __RANDOM_SAMPLE_MACA_H__
diff --git a/src/infiniop/ops/random_sample/maca/random_sample_maca.maca b/src/infiniop/ops/random_sample/metax/random_sample_metax.maca
similarity index 97%
rename from src/infiniop/ops/random_sample/maca/random_sample_maca.maca
rename to src/infiniop/ops/random_sample/metax/random_sample_metax.maca
index db5bfd05e..e61c1b0a8 100644
--- a/src/infiniop/ops/random_sample/maca/random_sample_maca.maca
+++ b/src/infiniop/ops/random_sample/metax/random_sample_metax.maca
@@ -2,9 +2,9 @@
 #include "../../../devices/maca/maca_handle.h"
 #include "../info.h"
 #include "random_sample_kernel.h"
-#include "random_sample_maca.h"
+#include "random_sample_metax.h"
 
-namespace op::random_sample::maca {
+namespace op::random_sample::metax {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::maca::Handle::Internal> internal;
diff --git a/src/infiniop/ops/random_sample/operator.cc b/src/infiniop/ops/random_sample/operator.cc
index cf55024eb..2918b684f 100644
--- a/src/infiniop/ops/random_sample/operator.cc
+++ b/src/infiniop/ops/random_sample/operator.cc
@@ -9,7 +9,7 @@
 #include "cuda/random_sample_cuda.cuh"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/random_sample_maca.h"
+#include "metax/random_sample_metax.h"
 #endif
 #ifdef ENABLE_ASCEND_API
 #include "ascend/random_sample_aclnn.h"
@@ -39,7 +39,7 @@ infiniopCreateRandomSampleDescriptor(
         CREATE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca);
+        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
         CREATE(INFINI_DEVICE_ASCEND, ascend);
@@ -72,7 +72,7 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
         GET(INFINI_DEVICE_NVIDIA, cuda);
 #endif
 #ifdef ENABLE_METAX_API
-        GET(INFINI_DEVICE_METAX, maca);
+        GET(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
         GET(INFINI_DEVICE_ASCEND, ascend);
@@ -115,7 +115,7 @@ __C infiniStatus_t infiniopRandomSample(
         CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca);
+        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
         CALCULATE(INFINI_DEVICE_ASCEND, ascend);
@@ -145,7 +145,7 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
         DELETE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
 #ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, maca);
+        DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 #ifdef ENABLE_ASCEND_API
         DELETE(INFINI_DEVICE_ASCEND, ascend);
diff --git a/src/infiniop/ops/rearrange/maca/rearrange_kernel.h b/src/infiniop/ops/rearrange/metax/rearrange_kernel.h
similarity index 100%
rename from src/infiniop/ops/rearrange/maca/rearrange_kernel.h
rename to src/infiniop/ops/rearrange/metax/rearrange_kernel.h
diff --git a/src/infiniop/ops/rearrange/maca/rearrange_maca.h b/src/infiniop/ops/rearrange/metax/rearrange_metax.h
similarity index 86%
rename from src/infiniop/ops/rearrange/maca/rearrange_maca.h
rename to src/infiniop/ops/rearrange/metax/rearrange_metax.h
index b0a712ea4..93a45bda3 100644
--- a/src/infiniop/ops/rearrange/maca/rearrange_maca.h
+++ b/src/infiniop/ops/rearrange/metax/rearrange_metax.h
@@ -3,6 +3,6 @@
 
 #include "../rearrange.h"
 
-DESCRIPTOR(maca)
+DESCRIPTOR(metax)
 
 #endif // __REARRANGE_MACA_H__
diff --git a/src/infiniop/ops/rearrange/maca/rearrange_maca.maca b/src/infiniop/ops/rearrange/metax/rearrange_metax.maca
similarity index 99%
rename from src/infiniop/ops/rearrange/maca/rearrange_maca.maca
rename to src/infiniop/ops/rearrange/metax/rearrange_metax.maca
index 29c50f524..9149d0c5c 100644
--- a/src/infiniop/ops/rearrange/maca/rearrange_maca.maca
+++ b/src/infiniop/ops/rearrange/metax/rearrange_metax.maca
@@ -1,13 +1,13 @@
 #include "../../../tensor.h"
 #include "rearrange_kernel.h"
-#include "rearrange_maca.h"
+#include "rearrange_metax.h"
 #include <algorithm>
 #include <cmath>
 #include <memory>
 #include <stdint.h>
 #include <vector>
 
-namespace op::rearrange::maca {
+namespace op::rearrange::metax {
 
 struct Descriptor::Opaque {
     std::shared_ptr<device::maca::Handle::Internal> internal;
@@ -480,4 +480,4 @@ infiniStatus_t Descriptor::calculate(
     return status;
 }
 
-} // namespace op::rearrange::maca
+} // namespace op::rearrange::metax
diff --git a/src/infiniop/ops/rearrange/operator.cc b/src/infiniop/ops/rearrange/operator.cc
index 9e467ad84..a892cfb9d 100644
--- a/src/infiniop/ops/rearrange/operator.cc
+++ b/src/infiniop/ops/rearrange/operator.cc
@@ -13,7 +13,7 @@
 #include "cuda/rearrange_cuda.cuh"
 #endif
 #ifdef ENABLE_METAX_API
-#include "maca/rearrange_maca.h"
+#include "metax/rearrange_metax.h"
 #endif
 
 __C infiniStatus_t infiniopCreateRearrangeDescriptor(
@@ -43,7 +43,7 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
         CREATE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
 #ifdef ENABLE_METAX_API
-        CREATE(INFINI_DEVICE_METAX, maca);
+        CREATE(INFINI_DEVICE_METAX, metax);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -76,7 +76,7 @@ __C infiniStatus_t infiniopRearrange(
         CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
 #ifdef ENABLE_METAX_API
-        CALCULATE(INFINI_DEVICE_METAX, maca);
+        CALCULATE(INFINI_DEVICE_METAX, metax);
 #endif
 
     default:
@@ -107,7 +107,7 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
         DELETE(INFINI_DEVICE_NVIDIA, cuda);
 #endif
 #ifdef ENABLE_METAX_API
-        DELETE(INFINI_DEVICE_METAX, maca);
+        DELETE(INFINI_DEVICE_METAX, metax);
 #endif
 
     default:
diff --git a/src/infiniop/ops/sub/nvidia/sub_nvidia.cu b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu
index 4460a2019..0032f5d74 100644
--- a/src/infiniop/ops/sub/nvidia/sub_nvidia.cu
+++ b/src/infiniop/ops/sub/nvidia/sub_nvidia.cu
@@ -1,5 +1,5 @@
-#include "sub_nvidia.cuh"
 #include "../cuda/kernel.cuh"
+#include "sub_nvidia.cuh"
 
 namespace op::sub::nvidia {
 
@@ -56,4 +56,4 @@ infiniStatus_t Descriptor::calculate(
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::sub::cuda
+} // namespace op::sub::nvidia
diff --git a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
index 682077e13..d73247a01 100644
--- a/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
+++ b/src/infiniop/ops/swiglu/nvidia/swiglu_nvidia.cu
@@ -58,4 +58,4 @@ infiniStatus_t Descriptor::calculate(
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::swiglu::cuda
+} // namespace op::swiglu::nvidia
diff --git a/xmake/metax.lua b/xmake/metax.lua
index 7a17f54e6..48a0a8562 100644
--- a/xmake/metax.lua
+++ b/xmake/metax.lua
@@ -34,8 +34,8 @@ target("infiniop-metax")
     set_languages("cxx17")
     set_warnings("all", "error")
     add_cxflags("-lstdc++", "-fPIC", "-Wno-defaulted-function-deleted", "-Wno-strict-aliasing")
-    add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/maca/*.cc")
-    add_files("../src/infiniop/ops/*/maca/*.maca", "../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
+    add_files("../src/infiniop/devices/maca/*.cc", "../src/infiniop/ops/*/metax/*.cc")
+    add_files("../src/infiniop/ops/*/metax/*.maca", {rule = "maca"})
 target_end()
 
 target("infinirt-metax")

From eac2b0cac6e10a5c5dc08bbfbef00c5f63b4a15b Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Fri, 11 Jul 2025 14:17:47 +0800
Subject: [PATCH 8/8] =?UTF-8?q?issue/291/docs:=20=E8=A1=A5=E5=85=85?=
 =?UTF-8?q?=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 src/infiniop/reduce/cuda/reduce.cuh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh
index 89a0dd18b..a1d2c2501 100644
--- a/src/infiniop/reduce/cuda/reduce.cuh
+++ b/src/infiniop/reduce/cuda/reduce.cuh
@@ -6,6 +6,11 @@
  *
  * Note: Only local result on thread 0 is guranteed to be correct.
  *       A manual broadcast is needed for other threads.
+ *
+ * Important Note: This is a device-independent header file containing reduce kernels
+ *                 for all cuda-supporting platforms. Include device-specific headers
+ *                 (such as <cub/block/block_reduce.cuh> for nvidia) in your source file
+ *                 and then include this file for proper usage.
  */
 namespace op::common_cuda::reduce_op {