diff --git a/include/infiniop.h b/include/infiniop.h
index d51b8d92e..ce720703d 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -3,17 +3,27 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
+#include "infiniop/ops/all_equal.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
+#include "infiniop/ops/cross_entropy_loss_backward.h"
+#include "infiniop/ops/div.h"
+#include "infiniop/ops/equal.h"
+#include "infiniop/ops/gelu.h"
+#include "infiniop/ops/gelu_backward.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/logical_and.h"
+#include "infiniop/ops/logical_or.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
 #include "infiniop/ops/relu.h"
+#include "infiniop/ops/relu_backward.h"
 #include "infiniop/ops/rms_norm.h"
 #include "infiniop/ops/rope.h"
+#include "infiniop/ops/silu.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/tensor_descriptor.h"
diff --git a/include/infiniop/ops/all_equal.h b/include/infiniop/ops/all_equal.h
new file mode 100644
index 000000000..b260b49b5
--- /dev/null
+++ b/include/infiniop/ops/all_equal.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_ALL_EQUAL_API_H__
+#define __INFINIOP_ALL_EQUAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAllEqualDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAllEqualDescriptor(infiniopHandle_t handle,
+                                                             infiniopAllEqualDescriptor_t *desc_ptr,
+                                                             infiniopTensorDescriptor_t c,
+                                                             infiniopTensorDescriptor_t a,
+                                                             infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAllEqual(infiniopAllEqualDescriptor_t desc,
+                                             void *workspace,
+                                             size_t workspace_size,
+                                             void *c,
+                                             const void *a,
+                                             const void *b,
+                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/cross_entropy_loss_backward.h b/include/infiniop/ops/cross_entropy_loss_backward.h
new file mode 100644
index 000000000..edc2821ac
--- /dev/null
+++ b/include/infiniop/ops/cross_entropy_loss_backward.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_BACKWARD_API_H__
+#define __INFINIOP_CROSS_ENTROPY_LOSS_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCrossEntropyLossBackwardDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateCrossEntropyLossBackwardDescriptor(infiniopHandle_t handle,
+                                                                             infiniopCrossEntropyLossBackwardDescriptor_t *desc_ptr,
+                                                                             infiniopTensorDescriptor_t grad_logits,
+                                                                             infiniopTensorDescriptor_t probs,
+                                                                             infiniopTensorDescriptor_t target);
+
+__C __export infiniStatus_t infiniopGetCrossEntropyLossBackwardWorkspaceSize(infiniopCrossEntropyLossBackwardDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopCrossEntropyLossBackward(infiniopCrossEntropyLossBackwardDescriptor_t desc,
+                                                             void *workspace,
+                                                             size_t workspace_size,
+                                                             void *grad_logits,
+                                                             const void *probs,
+                                                             const void *target,
+                                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroyCrossEntropyLossBackwardDescriptor(infiniopCrossEntropyLossBackwardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h
new file mode 100644
index 000000000..e539b440c
--- /dev/null
+++ b/include/infiniop/ops/div.h
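Each of the new operators exposes the same four-step descriptor lifecycle used by the existing InfiniOp operators: create a descriptor from tensor descriptors, query the workspace size, launch the operator on a stream, destroy the descriptor. The following host-side sketch shows that lifecycle for `infiniopAllEqual`; it is illustrative only and not part of this diff. It assumes the handle, tensor descriptors, and device buffers were created elsewhere, uses `infinirtMalloc` for the workspace (as the test harness below does), and elides workspace cleanup and error reporting. `run_all_equal_example` is a hypothetical helper name.

    /* Illustrative sketch only - not part of this diff. */
    static infiniStatus_t run_all_equal_example(infiniopHandle_t handle,
                                                infiniopTensorDescriptor_t c_desc,
                                                infiniopTensorDescriptor_t a_desc,
                                                infiniopTensorDescriptor_t b_desc,
                                                void *c, const void *a, const void *b,
                                                void *stream) {
        /* 1. Create the operator descriptor from the tensor descriptors. */
        infiniopAllEqualDescriptor_t desc;
        infiniStatus_t status = infiniopCreateAllEqualDescriptor(handle, &desc, c_desc, a_desc, b_desc);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }

        /* 2. Query and allocate the workspace (same allocator the tests use). */
        size_t workspace_size = 0;
        status = infiniopGetAllEqualWorkspaceSize(desc, &workspace_size);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }
        void *workspace = NULL;
        infinirtMalloc(&workspace, workspace_size);

        /* 3. Launch: per the CPU implementation, c receives a single boolean result. */
        status = infiniopAllEqual(desc, workspace, workspace_size, c, a, b, stream);

        /* 4. Destroy the descriptor (workspace cleanup omitted here). */
        infiniopDestroyAllEqualDescriptor(desc);
        return status;
    }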
@@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..3ac071eb4 --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__C __export infiniStatus_t infiniopCreateEqualDescriptor(infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopEqual(infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gelu.h b/include/infiniop/ops/gelu.h new file mode 100644 index 000000000..444092b6a --- /dev/null +++ b/include/infiniop/ops/gelu.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_GELU_API_H__ +#define __INFINIOP_GELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGeluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle, + infiniopGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); + +__C __export infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); + +__C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gelu_backward.h b/include/infiniop/ops/gelu_backward.h new file mode 100644 index 000000000..9516df81b --- /dev/null +++ b/include/infiniop/ops/gelu_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_GELU_BACKWARD_API_H__ +#define __INFINIOP_GELU_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGeluBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGeluBackwardDescriptor(infiniopHandle_t handle, + infiniopGeluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t intput, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetGeluBackwardWorkspaceSize(infiniopGeluBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t 
infiniopGeluBackward(infiniopGeluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroyGeluBackwardDescriptor(infiniopGeluBackwardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_and.h b/include/infiniop/ops/logical_and.h new file mode 100644 index 000000000..5c237f79c --- /dev/null +++ b/include/infiniop/ops/logical_and.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_AND_API_H__ +#define __INFINIOP_LOGICAL_AND_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalAndDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalAndDescriptor(infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalAnd(infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_or.h b/include/infiniop/ops/logical_or.h new file mode 100644 index 000000000..1c0066139 --- /dev/null +++ b/include/infiniop/ops/logical_or.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_OR_API_H__ +#define __INFINIOP_LOGICAL_OR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalOrDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalOrDescriptor(infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalOr(infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/relu_backward.h b/include/infiniop/ops/relu_backward.h new file mode 100644 index 000000000..e1c13fec3 --- /dev/null +++ b/include/infiniop/ops/relu_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_RELU_BACKWARD_API_H__ +#define __INFINIOP_RELU_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReluBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReluBackwardDescriptor(infiniopHandle_t handle, + infiniopReluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t intput, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetReluBackwardWorkspaceSize(infiniopReluBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReluBackward(infiniopReluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReluBackwardDescriptor(infiniopReluBackwardDescriptor_t desc); + +#endif diff --git 
a/include/infiniop/ops/silu.h b/include/infiniop/ops/silu.h new file mode 100644 index 000000000..037d6323f --- /dev/null +++ b/include/infiniop/ops/silu.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SILU_API_H__ +#define __INFINIOP_SILU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSiluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSiluDescriptor(infiniopHandle_t handle, + infiniopSiluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); + +__C __export infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSilu(infiniopSiluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); + +__C __export infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..3234a22da 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -12,21 +12,31 @@ def run_tests(args): failed = [] for test in [ - "add.py", - "attention.py", - "causal_softmax.py", - "clip.py", - "gemm.py", - "mul.py", - "random_sample.py", - "rearrange.py", - "rms_norm.py", - "rope.py", - "sub.py", - "swiglu.py", + # "add.py", + # "attention.py", + # "causal_softmax.py", + # "clip.py", + # "gemm.py", + # "mul.py", + # "random_sample.py", + # "rearrange.py", + # "rms_norm.py", + # "rope.py", + # "sub.py", + # "swiglu.py", + "silu.py", + "div.py", + "logical_and.py", + "logical_or.py", + "equal.py", + "all_equal.py", + "relu_backward.py", + "gelu.py", + "gelu_backward.py", + "cross_entropy_loss_backward.py" ]: result = subprocess.run( - f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True + f"python {test} {args} --profile", text=True, encoding="utf-8", shell=True ) if result.returncode != 0: failed.append(test) diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..d571d4b55 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,16 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(silu) +DECLARE_INFINIOP_TEST(div) +DECLARE_INFINIOP_TEST(logical_and) +DECLARE_INFINIOP_TEST(logical_or) +DECLARE_INFINIOP_TEST(equal) +DECLARE_INFINIOP_TEST(all_equal) +DECLARE_INFINIOP_TEST(relu_backward) +DECLARE_INFINIOP_TEST(gelu) +DECLARE_INFINIOP_TEST(gelu_backward) +DECLARE_INFINIOP_TEST(cross_entropy_loss_backward) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +40,29 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + 
REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(silu) \ + REGISTER_INFINIOP_TEST(div) \ + REGISTER_INFINIOP_TEST(logical_and) \ + REGISTER_INFINIOP_TEST(logical_or) \ + REGISTER_INFINIOP_TEST(equal) \ + REGISTER_INFINIOP_TEST(all_equal) \ + REGISTER_INFINIOP_TEST(relu_backward) \ + REGISTER_INFINIOP_TEST(gelu) \ + REGISTER_INFINIOP_TEST(gelu_backward) \ + REGISTER_INFINIOP_TEST(cross_entropy_loss_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/all_equal.cpp b/src/infiniop-test/src/ops/all_equal.cpp new file mode 100644 index 000000000..616f8d8b5 --- /dev/null +++ b/src/infiniop-test/src/ops/all_equal.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::all_equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopAllEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopAllEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" 
<< _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::all_equal diff --git a/src/infiniop-test/src/ops/cross_entropy_loss_backward.cpp b/src/infiniop-test/src/ops/cross_entropy_loss_backward.cpp new file mode 100644 index 000000000..972d3656b --- /dev/null +++ b/src/infiniop-test/src/ops/cross_entropy_loss_backward.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cross_entropy_loss_backward { +struct Test::Attributes { + std::shared_ptr probs; + std::shared_ptr target; + std::shared_ptr grad_logits; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("probs") == tensors.end() + || tensors.find("target") == tensors.end() + || tensors.find("grad_logits") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->probs = tensors["probs"]; + test->_attributes->target = tensors["target"]; + test->_attributes->grad_logits = tensors["grad_logits"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCrossEntropyLossBackwardDescriptor_t op_desc; + auto probs = _attributes->probs->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto grad_logits = _attributes->grad_logits->to(device, device_id); + CHECK_OR(infiniopCreateCrossEntropyLossBackwardDescriptor(handle, &op_desc, + grad_logits->desc(), + probs->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCrossEntropyLossBackward(op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_logits, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLossBackward( + op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"probs", "target", "grad_logits", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_logits"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- probs: " << _attributes->probs->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- grad_logits: " << _attributes->grad_logits->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", 
atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cross_entropy_loss_backward diff --git a/src/infiniop-test/src/ops/div.cpp b/src/infiniop-test/src/ops/div.cpp new file mode 100644 index 000000000..c1f49bda6 --- /dev/null +++ b/src/infiniop-test/src/ops/div.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::div { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopDivDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateDivDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetDivWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopDiv(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopDiv( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::div diff --git a/src/infiniop-test/src/ops/equal.cpp b/src/infiniop-test/src/ops/equal.cpp new file mode 100644 index 000000000..a4c236410 --- /dev/null +++ b/src/infiniop-test/src/ops/equal.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + 
+namespace infiniop_test::equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::equal diff --git a/src/infiniop-test/src/ops/gelu.cpp b/src/infiniop-test/src/ops/gelu.cpp new file mode 100644 index 000000000..ab0bbfa5f --- /dev/null +++ b/src/infiniop-test/src/ops/gelu.cpp @@ -0,0 +1,101 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == 
tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGeluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateGeluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGeluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu diff --git a/src/infiniop-test/src/ops/gelu_backward.cpp b/src/infiniop-test/src/ops/gelu_backward.cpp new file mode 100644 index 000000000..2fc7882a1 --- /dev/null +++ b/src/infiniop-test/src/ops/gelu_backward.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, 
size_t iterations) { + infiniopGeluBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateGeluBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGeluBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGeluBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGeluBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu_backward diff --git a/src/infiniop-test/src/ops/logical_and.cpp b/src/infiniop-test/src/ops/logical_and.cpp new file mode 100644 index 000000000..ac65f984b --- /dev/null +++ b/src/infiniop-test/src/ops/logical_and.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_and { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalAndDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, 
device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalAndDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalAndWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalAnd(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalAnd( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_and diff --git a/src/infiniop-test/src/ops/logical_or.cpp b/src/infiniop-test/src/ops/logical_or.cpp new file mode 100644 index 000000000..ec37f1d78 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_or.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_or { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalOrDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalOrDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalOrWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, 
"Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalOr(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalOr( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_or diff --git a/src/infiniop-test/src/ops/relu_backward.cpp b/src/infiniop-test/src/ops/relu_backward.cpp new file mode 100644 index 000000000..38f300ab6 --- /dev/null +++ b/src/infiniop-test/src/ops/relu_backward.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::relu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopReluBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateReluBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetReluBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + 
CHECK_OR(infiniopReluBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReluBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::relu_backward diff --git a/src/infiniop-test/src/ops/silu.cpp b/src/infiniop-test/src/ops/silu.cpp new file mode 100644 index 000000000..75684503c --- /dev/null +++ b/src/infiniop-test/src/ops/silu.cpp @@ -0,0 +1,101 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::silu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSiluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSiluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSiluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSilu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = 
benchmark( + [=]() { + infiniopSilu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::silu diff --git a/src/infiniop/ops/all_equal/all_equal.h b/src/infiniop/ops/all_equal/all_equal.h new file mode 100644 index 000000000..634856aae --- /dev/null +++ b/src/infiniop/ops/all_equal/all_equal.h @@ -0,0 +1,47 @@ +#ifndef __ALL_EQUAL_H +#define __ALL_EQUAL_H + +#include "../../handle.h" +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::all_equal::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + op::all_equal::AllEqualInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + AllEqualInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + std::vector input_descs); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const; \ + }; \ + } + +#endif // __ALL_EQUAL_H diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc new file mode 100644 index 000000000..b280a6963 --- /dev/null +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc @@ -0,0 +1,112 @@ +#include "all_equal_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::all_equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(a_shape, b_shape); + + auto info_result = AllEqualInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(nullptr, info_result.take(), 0, handle_->device, handle_->device_id); + + return INFINI_STATUS_SUCCESS; +} + +// Perform elementwise operation when all inputs have the same type +template +void calculate_impl(const 
op::all_equal::AllEqualInfo &info, + void *output, + const std::vector &inputs) { + Tout *out = reinterpret_cast(output); + auto input_a = reinterpret_cast(inputs[0]); + auto input_b = reinterpret_cast(inputs[1]); + const ptrdiff_t input_numel = static_cast(info.getInputNumel()); + + bool all_equal = true; + +#pragma omp parallel for + for (ptrdiff_t i = 0; i < input_numel; ++i) { + auto get_input_idx = [&](size_t input_id) { + return info.getInputContiguous()[input_id] + ? i + : (info.getInputBroadcasted()[input_id] + ? op::common_cpu::indexToReducedOffset(i, info.getNdim(), info.getDefaultStrides(), info.getInputStrides(input_id)) + : op::common_cpu::indexToOffset(i, info.getNdim(), info.getInputShape(input_id), info.getInputStrides(input_id))); + }; + if constexpr (std::is_same_v || std::is_same_v) { + Tout elem = Op{}.template operator()(utils::cast(input_a[get_input_idx(0)]), utils::cast(input_b[get_input_idx(1)])); + if (elem == false) { + all_equal = false; + } + } else { + Tout elem = Op{}.template operator()(input_a[(get_input_idx(0))], input_b[get_input_idx(1)]); + if (elem == false) { + all_equal = false; + } + } + } + *out = all_equal ? true : false; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_info._dtype) { + case INFINI_DTYPE_BOOL: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I8: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I16: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I32: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I64: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_BF16: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_F16: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_F32: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_F64: + calculate_impl(_info, output, inputs); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::cpu diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h new file mode 100644 index 000000000..7aab41a6e --- /dev/null +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h @@ -0,0 +1,19 @@ +#ifndef __ALL_EQUAL_CPU_H__ +#define __ALL_EQUAL_CPU_H__ + +#include "../all_equal.h" + +DESCRIPTOR(cpu) + +namespace op::all_equal::cpu { +typedef struct AllEqualOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Tin &a, const Tin &b) const { + return a == b; + } +} AllEqualOp; +} // namespace op::all_equal::cpu + +#endif // __ALL_EQUAL_CPU_H__ diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh new file mode 100644 index 000000000..82e5c321a --- /dev/null +++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh @@ -0,0 +1,42 @@ +#ifndef __ALL_EQUAL_CUDA_H__ +#define __ALL_EQUAL_CUDA_H__ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +template +__global__ void compareKernel(size_t input_numel, size_t ndim, const bool *__restrict__ input_contiguous, const bool *__restrict__ input_broadcasted, const size_t *__restrict__ input_shapes, const ptrdiff_t *__restrict__ output_strides, const ptrdiff_t *__restrict__ input_strides, const void *const *inputs, uint8_t *flags) { + const Tdata *const a = 
reinterpret_cast(inputs)[0]; + const Tdata *const b = reinterpret_cast(inputs)[1]; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < input_numel) { + op::elementwise::nvidia::InputIndexer indexer{idx, ndim, input_contiguous, input_broadcasted, input_shapes, input_strides, output_strides}; + size_t idx_a = indexer(0); + size_t idx_b = indexer(1); + flags[idx] = (a[idx_a] != b[idx_b]) ? 1 : 0; + } +} + +template +__global__ void countKernel(uint8_t *flags, unsigned int *count, int input_numel) { + __shared__ unsigned int s_data[BLOCK_SIZE]; + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + tid; + + s_data[tid] = (idx < input_numel) ? flags[idx] : 0; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + s_data[tid] += s_data[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(count, s_data[0]); + } +} + +#endif // __ALL_EQUAL_CUDA_H__ diff --git a/src/infiniop/ops/all_equal/info.h b/src/infiniop/ops/all_equal/info.h new file mode 100644 index 000000000..847386dab --- /dev/null +++ b/src/infiniop/ops/all_equal/info.h @@ -0,0 +1,148 @@ +#ifndef __ALL_EQUAL_INFO_H__ +#define __ALL_EQUAL_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include "infinicore.h" +#include +#include +#include + +namespace op::all_equal { +struct AllEqualInfo { +private: + std::vector _meta; + size_t _input_size; + size_t _input_numel; + size_t _ndim; + + AllEqualInfo(std::vector meta, + size_t input_size, + size_t input_numel, + size_t ndim, + infiniDtype_t dtype) + : _meta(std::move(meta)), + _input_size(input_size), _input_numel(input_numel), _ndim(ndim), _dtype(dtype) {} + +public: + infiniDtype_t _dtype; + +public: + // Get the Memory size of the meta data in bytes + inline size_t getMetaMemSize() const { + return _meta.size() * sizeof(size_t); + } + inline const int8_t *getMetaStart() const { + return reinterpret_cast(_meta.data()); + } + inline size_t getInputSize() const { + return _input_size; + } + inline size_t getInputNumel() const { + return _input_numel; + } + inline size_t getNdim() const { + return _ndim; + } + inline const ptrdiff_t *getDefaultStrides() const { + return reinterpret_cast(_meta.data()); + } + inline const size_t *getAllInputShapes() const { + return reinterpret_cast(getDefaultStrides() + _ndim); + } + inline const size_t *getInputShape(const size_t &index) const { + if (index < _input_size) { + return reinterpret_cast(getAllInputShapes() + index * _ndim); + } + return nullptr; + } + inline const ptrdiff_t *getAllInputStrides() const { + return reinterpret_cast(getAllInputShapes() + _input_size * _ndim); + } + inline const ptrdiff_t *getInputStrides(const size_t &index) const { + if (index < _input_size) { + return reinterpret_cast(getAllInputStrides() + index * _ndim); + } + return nullptr; + } + inline const bool *getInputContiguous() const { + return reinterpret_cast(getAllInputStrides() + _input_size * _ndim); + } + inline const bool *getInputBroadcasted() const { + return reinterpret_cast(getInputContiguous() + _input_size); + } + + using ResultType = utils::Result; + + /** + * @brief Construct ElementwiseInfo from output and input tensor descriptors. + * @param output_desc Descriptor of the output tensor. + * @param input_descs Descriptors of the input tensors. + * @return Result with the successfully constructed ElementwiseInfo, + * or the status code. 
+ */ + static ResultType create( + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + + if (!output_desc || input_descs.empty()) { + return INFINI_STATUS_BAD_PARAM; + } + + // Destination cannot have broadcast setup + if (output_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + auto input_size = input_descs.size(); + auto input_a_desc = input_descs[0]; + auto input_numel = input_a_desc->numel(); + auto ndim = input_a_desc->ndim(); + auto dtype = input_a_desc->dtype(); + + // Allocate memory for meta + auto shape_unit = input_a_desc->dim(0); + auto stride_unit = input_a_desc->stride(0); + size_t meta_mem_size = ndim * sizeof(stride_unit) + input_size * ndim * sizeof(shape_unit) + + input_size * ndim * sizeof(stride_unit) + + 2 * input_size * sizeof(bool); + std::vector meta(CEIL_DIV(meta_mem_size, sizeof(size_t))); + int8_t *meta_ptr = reinterpret_cast(meta.data()); + + std::vector default_strides(ndim); + auto default_shape = input_a_desc->shape(); + ptrdiff_t dsize = 1; + for (int i = (int)ndim - 1; i >= 0; i--) { + default_strides[i] = dsize; + dsize *= default_shape[i]; + } + + // Pointers to the sections within _meta + size_t *default_strides_p = reinterpret_cast(meta_ptr); + size_t *input_shapes = reinterpret_cast(default_strides_p + ndim); + ptrdiff_t *input_strides = reinterpret_cast(input_shapes + input_size * ndim); + bool *input_contiguous = reinterpret_cast(input_strides + input_size * ndim); + bool *input_broadcasted = input_contiguous + input_size; + + // Copy default strides + std::memcpy(default_strides_p, default_strides.data(), ndim * sizeof(*default_strides_p)); + + // Copy input shapes, strides, contiguous, and broadcasted flags + for (size_t i = 0; i < input_size; ++i) { + auto &desc = input_descs[i]; + const auto in_shape = desc->shape(); + const auto in_strides = desc->strides(); + std::memcpy(input_shapes + i * ndim, in_shape.data(), ndim * sizeof(*input_shapes)); + std::memcpy(input_strides + i * ndim, in_strides.data(), ndim * sizeof(*input_strides)); + input_contiguous[i] = desc->isContiguous(); + input_broadcasted[i] = !input_contiguous[i] && desc->hasBroadcastDim(); + } + + AllEqualInfo info(std::move(meta), input_size, input_numel, ndim, dtype); + return ResultType(std::move(info)); + } +}; + +} // namespace op::all_equal + +#endif // __ALL_EQUAL_INFO_H__ diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.h b/src/infiniop/ops/all_equal/metax/all_equal_metax.h new file mode 100644 index 000000000..82813b997 --- /dev/null +++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __ALL_EQUAL_METAX_API_H__ +#define __ALL_EQUAL_METAX_API_H__ + +#include "../all_equal.h" + +DESCRIPTOR(metax) + +#endif // __ALL_EQUAL_METAX_API_H__ diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.maca b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca new file mode 100644 index 000000000..53d1da778 --- /dev/null +++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca @@ -0,0 +1,213 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "kernel.cuh" +#include "all_equal_metax.h" +#include +namespace op::all_equal::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = 
input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(a_shape, b_shape); + + auto info_result = AllEqualInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto opaque_ptr = new Opaque{reinterpret_cast(handle_)->internal()}; + info_result.take(); + + *desc_ptr = new Descriptor(opaque_ptr, std::move(info), workspace_size, handle_->device, handle_->device_id); + + return INFINI_STATUS_SUCCESS; +} +template +infiniStatus_t infoToDevice( + const op::all_equal::AllEqualInfo &info, + void *workspace, + const void *const *h_inputs_arr, + const void **&d_inputs_arr, + const bool *&d_input_contiguous, + const bool *&d_input_broadcasted, + const ptrdiff_t *&d_output_strides, + const size_t *&d_input_shapes, + const ptrdiff_t *&d_input_strides, + hcStream_t stream) { + + constexpr auto input_size = N; + const auto ndim = info.getNdim(); + constexpr auto input_arr_size = N * sizeof(*h_inputs_arr); + const int8_t *info_meta_start = info.getMetaStart(); + const int8_t *d_meta_start = reinterpret_cast(workspace) + input_arr_size; + + // copy the input pointer array and meta to device + CHECK_METAX(hcMemcpyAsync(workspace, h_inputs_arr, input_arr_size, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), hcMemcpyHostToDevice, stream)); + + // offset/assign the pointers + d_inputs_arr = reinterpret_cast(workspace); + d_output_strides = reinterpret_cast(d_meta_start); + d_input_shapes = reinterpret_cast(d_output_strides + ndim); + d_input_strides = reinterpret_cast(d_input_shapes + input_size * ndim); + d_input_contiguous = reinterpret_cast(d_input_strides + input_size * ndim); + d_input_broadcasted = reinterpret_cast(d_input_contiguous + input_size); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCompareKernel(const AllEqualInfo &info, void *workspace, std::vector inputs, uint8_t *flags, void *stream) { + // Device pointers + const void **d_inputs_arr = nullptr; + const bool *d_input_contiguous = nullptr; + const bool *d_input_broadcasted = nullptr; + const ptrdiff_t *d_output_strides = nullptr; + const size_t *d_input_shapes = nullptr; + const ptrdiff_t *d_input_strides = nullptr; + + CHECK_STATUS(infoToDevice<2>(info, workspace, inputs.data(), d_inputs_arr, + d_input_contiguous, d_input_broadcasted, + d_output_strides, + d_input_shapes, d_input_strides, reinterpret_cast(stream))); + + const std::size_t input_numel = info.getInputNumel(); + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + compareKernel<<(stream)>>>(input_numel, info.getNdim(), d_input_contiguous, d_input_broadcasted, d_input_shapes, d_output_strides, d_input_strides, reinterpret_cast(d_inputs_arr), flags); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCountKernel(uint8_t *flags, unsigned int *count, std::size_t input_numel, void *stream) { + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + countKernel<<(stream)>>>(flags, count, input_numel); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const std::size_t input_numel = _info.getInputNumel(); + std::vector flags(input_numel, 0); + uint8_t *h_flags = flags.data(); + uint8_t *d_flags = nullptr; + hcMalloc(&d_flags, input_numel * sizeof(uint8_t)); + hcMemcpy(d_flags, h_flags, input_numel * sizeof(uint8_t), hcMemcpyHostToDevice); + unsigned int h_count{}; + unsigned int *d_count; + hcMalloc(&d_count, sizeof(unsigned int)); + hcMemset(d_count, 0, sizeof(unsigned int)); + + const std::size_t BLOCK_SIZE = 256; + bool ans{false}; + switch (_info._dtype) { + case INFINI_DTYPE_BOOL: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + case INFINI_DTYPE_I8: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + case INFINI_DTYPE_I16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_BF16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned 
int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + hcFree(d_count); + hcFree(d_flags); + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::metax diff --git a/src/infiniop/ops/all_equal/metax/kernel.cuh b/src/infiniop/ops/all_equal/metax/kernel.cuh new file mode 100644 index 000000000..d03d989ae --- /dev/null +++ b/src/infiniop/ops/all_equal/metax/kernel.cuh @@ -0,0 +1,42 @@ +#ifndef __ALL_EQUAL_METAX_H__ +#define __ALL_EQUAL_METAX_H__ +#include "../../../elementwise/metax/elementwise_metax.h" +#include +#include + +template +__global__ void compareKernel(size_t input_numel, size_t ndim, const bool *__restrict__ input_contiguous, const bool *__restrict__ input_broadcasted, const size_t *__restrict__ input_shapes, const ptrdiff_t *__restrict__ output_strides, const ptrdiff_t *__restrict__ input_strides, const void *const *inputs, uint8_t *flags) { + const Tdata *const a = reinterpret_cast(inputs)[0]; + const Tdata *const b = reinterpret_cast(inputs)[1]; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < input_numel) { + op::elementwise::metax::InputIndexer indexer{idx, ndim, input_contiguous, input_broadcasted, input_shapes, input_strides, output_strides}; + size_t idx_a = indexer(0); + size_t idx_b = indexer(1); + flags[idx] = (a[idx_a] != b[idx_b]) ? 1 : 0; + } +} + +template +__global__ void countKernel(uint8_t *flags, unsigned int *count, int input_numel) { + __shared__ unsigned int s_data[BLOCK_SIZE]; + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + tid; + + s_data[tid] = (idx < input_numel) ? flags[idx] : 0; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + s_data[tid] += s_data[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(count, s_data[0]); + } +} + +#endif // __ALL_EQUAL_METAX_H__ diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu new file mode 100644 index 000000000..4b3fbb72d --- /dev/null +++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu @@ -0,0 +1,214 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "all_equal_nvidia.cuh" +#include +#include +namespace op::all_equal::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(a_shape, b_shape); + + auto info_result = AllEqualInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto opaque_ptr = new Opaque{reinterpret_cast(handle_)->internal()}; + info_result.take(); + + *desc_ptr = new Descriptor(opaque_ptr, 
std::move(info), workspace_size, handle_->device, handle_->device_id); + + return INFINI_STATUS_SUCCESS; +} +template +infiniStatus_t infoToDevice( + const op::all_equal::AllEqualInfo &info, + void *workspace, + const void *const *h_inputs_arr, + const void **&d_inputs_arr, + const bool *&d_input_contiguous, + const bool *&d_input_broadcasted, + const ptrdiff_t *&d_output_strides, + const size_t *&d_input_shapes, + const ptrdiff_t *&d_input_strides, + cudaStream_t stream) { + + constexpr auto input_size = N; + const auto ndim = info.getNdim(); + constexpr auto input_arr_size = N * sizeof(*h_inputs_arr); + const int8_t *info_meta_start = info.getMetaStart(); + const int8_t *d_meta_start = reinterpret_cast(workspace) + input_arr_size; + + // copy the input pointer array and meta to device + CHECK_CUDA(cudaMemcpyAsync(workspace, h_inputs_arr, input_arr_size, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), cudaMemcpyHostToDevice, stream)); + + // offset/assign the pointers + d_inputs_arr = reinterpret_cast(workspace); + d_output_strides = reinterpret_cast(d_meta_start); + d_input_shapes = reinterpret_cast(d_output_strides + ndim); + d_input_strides = reinterpret_cast(d_input_shapes + input_size * ndim); + d_input_contiguous = reinterpret_cast(d_input_strides + input_size * ndim); + d_input_broadcasted = reinterpret_cast(d_input_contiguous + input_size); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCompareKernel(const AllEqualInfo &info, void *workspace, std::vector inputs, uint8_t *flags, void *stream) { + // Device pointers + const void **d_inputs_arr = nullptr; + const bool *d_input_contiguous = nullptr; + const bool *d_input_broadcasted = nullptr; + const ptrdiff_t *d_output_strides = nullptr; + const size_t *d_input_shapes = nullptr; + const ptrdiff_t *d_input_strides = nullptr; + + CHECK_STATUS(infoToDevice<2>(info, workspace, inputs.data(), d_inputs_arr, + d_input_contiguous, d_input_broadcasted, + d_output_strides, + d_input_shapes, d_input_strides, reinterpret_cast(stream))); + + const std::size_t input_numel = info.getInputNumel(); + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + compareKernel<<(stream)>>>(input_numel, info.getNdim(), d_input_contiguous, d_input_broadcasted, d_input_shapes, d_output_strides, d_input_strides, reinterpret_cast(d_inputs_arr), flags); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCountKernel(uint8_t *flags, unsigned int *count, std::size_t input_numel, void *stream) { + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + countKernel<<(stream)>>>(flags, count, input_numel); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const std::size_t input_numel = _info.getInputNumel(); + std::vector flags(input_numel, 0); + uint8_t *h_flags = flags.data(); + uint8_t *d_flags = nullptr; + cudaMalloc(&d_flags, input_numel * sizeof(uint8_t)); + cudaMemcpy(d_flags, h_flags, input_numel * sizeof(uint8_t), cudaMemcpyHostToDevice); + unsigned int h_count{}; + unsigned int *d_count; + cudaMalloc(&d_count, sizeof(unsigned int)); + cudaMemset(d_count, 0, sizeof(unsigned int)); + + const std::size_t BLOCK_SIZE = 256; + bool ans{false}; + switch (_info._dtype) { + case 
INFINI_DTYPE_BOOL: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + case INFINI_DTYPE_I8: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + case INFINI_DTYPE_I16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_BF16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + cudaFree(d_count); + cudaFree(d_flags); + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::nvidia diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh new file mode 100644 index 000000000..ca7018b9b --- /dev/null +++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ALL_EQUAL_CUDA_API_H__ +#define __ALL_EQUAL_CUDA_API_H__ + +#include "../all_equal.h" + +DESCRIPTOR(nvidia) 
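+
+// Minimal usage sketch for the exported all_equal C API. It assumes a valid
+// `handle`, tensor descriptors `out_desc`/`a_desc`/`b_desc`, device buffers
+// `d_out`/`d_a`/`d_b`, a `stream`, and a `device_malloc` helper; none of these
+// are part of this patch.
+//
+//     infiniopAllEqualDescriptor_t desc;
+//     infiniopCreateAllEqualDescriptor(handle, &desc, out_desc, a_desc, b_desc);
+//     size_t workspace_size = 0;
+//     infiniopGetAllEqualWorkspaceSize(desc, &workspace_size);
+//     void *workspace = device_malloc(workspace_size);
+//     infiniopAllEqual(desc, workspace, workspace_size, d_out, d_a, d_b, stream);
+//     infiniopDestroyAllEqualDescriptor(desc);
+//
+// `d_out` receives a single bool on the device: true when every element of `a`
+// equals the corresponding element of `b`, false otherwise.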
+ +#endif // __ALL_EQUAL_CUDA_API_H__ diff --git a/src/infiniop/ops/all_equal/operator.cc b/src/infiniop/ops/all_equal/operator.cc new file mode 100644 index 000000000..f4f91b981 --- /dev/null +++ b/src/infiniop/ops/all_equal/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/all_equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/all_equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/all_equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/all_equal_metax.h" +#endif + +__C infiniStatus_t infiniopCreateAllEqualDescriptor( + infiniopHandle_t handle, + infiniopAllEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::all_equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAllEqual( + infiniopAllEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git 
a/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward.cc b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward.cc new file mode 100644 index 000000000..9bab4c14a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward.cc @@ -0,0 +1,62 @@ +#include "cross_entropy_loss_backward_cpu.h" +#include + +namespace op::cross_entropy_loss_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &probs_desc = input_desc_vec.at(0); + const auto &target_desc = input_desc_vec.at(1); + const auto &grad_logits_shape = out_desc->shape(); + const auto &probs_shape = probs_desc->shape(); + const auto &target_shape = target_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_logits_shape, probs_shape, target_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const size_t *output_shape = _info.getOutputShape(); + const size_t dim = _info.getNdim(); + size_t N{1}; + for (size_t i = 0; i < dim - 1; ++i) { + N *= output_shape[i]; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream, N); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cross_entropy_loss_backward::cpu diff --git a/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward_cpu.h b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward_cpu.h new file mode 100644 index 000000000..11972075f --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward_cpu.h @@ -0,0 +1,19 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_CPU_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(cross_entropy_loss_backward, cpu) + +namespace op::cross_entropy_loss_backward::cpu { +typedef struct CrossEntropyLossBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b, const size_t N) const { + return (a - b) / static_cast(N); + } +} CrossEntropyLossBackwardOp; +} // namespace op::cross_entropy_loss_backward::cpu + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy_loss_backward/cuda/kernel.cuh new file mode 100644 index 000000000..169889e67 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/cuda/kernel.cuh @@ -0,0 +1,27 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_H__ + +namespace op::cross_entropy_loss_backward::cuda { 
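+
+// The functor below implements the softmax-cross-entropy gradient with mean
+// reduction: given probabilities p (softmax output) and targets y,
+//     dL/dlogits = (p - y) / N,
+// where N is the product of all output dimensions except the last (i.e. the
+// number of rows reduced over), as computed in the backends' calculate().
+// The half2 branch applies the same formula to two packed fp16 values at once.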
+typedef struct CrossEntropyLossBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const size_t N) const { + float f_N = static_cast(N); + if constexpr (std::is_same_v) { + half2 h2_N = __float2half2_rn(f_N); + return __h2div(__hsub2(a, b), h2_N); + } else if constexpr (std::is_same_v) { + return __hdiv(__hsub(a, b), __float2bfloat16(f_N)); + } else if constexpr (std::is_same_v) { + return __hdiv(__hsub(a, b), __float2half(f_N)); + } else if constexpr (std::is_same_v) { + return __fdiv_rn(__fsub_rn(a, b), f_N); + } else { + return (a - b) / static_cast(N); + } + } +} CrossEntropyLossBackwardOp; +} // namespace op::cross_entropy_loss_backward::cuda + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.h b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.h new file mode 100644 index 000000000..d5821c81a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_METAX_API_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(cross_entropy_loss_backward, metax) + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.maca b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.maca new file mode 100644 index 000000000..b30b13404 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.maca @@ -0,0 +1,69 @@ +#include "cross_entropy_loss_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cross_entropy_loss_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &probs_desc = input_desc_vec.at(0); + const auto &target_desc = input_desc_vec.at(1); + const auto &grad_logits_shape = out_desc->shape(); + const auto &probs_shape = probs_desc->shape(); + const auto &target_shape = target_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_logits_shape, probs_shape, target_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const size_t *output_shape = _info.getOutputShape(); + const size_t dim = _info.getNdim(); + size_t N{1}; + for (size_t i = 0; i < dim - 1; ++i) { + N *= output_shape[i]; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, 
cuda::CrossEntropyLossBackwardOp, half>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, float>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, double>(_info, workspace, output, inputs, stream, std::move(N)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cross_entropy_loss_backward::metax diff --git a/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cu b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cu new file mode 100644 index 000000000..d6aab249c --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cu @@ -0,0 +1,71 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cross_entropy_loss_backward_nvidia.cuh" +#include +#include +#include + +namespace op::cross_entropy_loss_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &probs_desc = input_desc_vec.at(0); + const auto &target_desc = input_desc_vec.at(1); + const auto &grad_logits_shape = out_desc->shape(); + const auto &probs_shape = probs_desc->shape(); + const auto &target_shape = target_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_logits_shape, probs_shape, target_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const size_t *output_shape = _info.getOutputShape(); + const size_t dim = _info.getNdim(); + size_t N{1}; + for (size_t i = 0; i < dim - 1; ++i) { + N *= output_shape[i]; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, half>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, float>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, double>(_info, workspace, output, inputs, stream, std::move(N)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cross_entropy_loss_backward::nvidia diff --git a/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cuh new file mode 100644 index 000000000..8f8a7a2be --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_API_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cross_entropy_loss_backward, nvidia) + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/operator.cc b/src/infiniop/ops/cross_entropy_loss_backward/operator.cc new file mode 100644 index 000000000..aba99f46c --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy_loss_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_loss_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cross_entropy_loss_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_loss_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossBackwardDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossBackwardWorkspaceSize(infiniopCrossEntropyLossBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCrossEntropyLossBackward( + infiniopCrossEntropyLossBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef 
CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCrossEntropyLossBackwardDescriptor(infiniopCrossEntropyLossBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..6b5edef36 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,54 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file mode 100644 index 000000000..0373b766f --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,19 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..cefbf0141 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __h2div(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hdiv(a, 
b);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fdiv_rn(a, b);
+        } else {
+            return a / b;
+        }
+    }
+} DivOp;
+} // namespace op::div::cuda
+
+#endif // __DIV_CUDA_H__
diff --git a/src/infiniop/ops/div/metax/div_metax.h b/src/infiniop/ops/div/metax/div_metax.h
new file mode 100644
index 000000000..1e56a7d44
--- /dev/null
+++ b/src/infiniop/ops/div/metax/div_metax.h
@@ -0,0 +1,8 @@
+#ifndef __DIV_METAX_API_H__
+#define __DIV_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(div, metax)
+
+#endif // __DIV_METAX_API_H__
diff --git a/src/infiniop/ops/div/metax/div_metax.maca b/src/infiniop/ops/div/metax/div_metax.maca
new file mode 100644
index 000000000..a8ecd8643
--- /dev/null
+++ b/src/infiniop/ops/div/metax/div_metax.maca
@@ -0,0 +1,62 @@
+#include "div_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::div::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create METAX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::DivOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::div::metax
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu
new file mode 100644
index 000000000..4cb64af63
--- /dev/null
+++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu
@@ -0,0 +1,61 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "div_nvidia.cuh"
+
+namespace op::div::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
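+    // Inputs must match the output shape exactly; none of the binary
+    // elementwise ops in this patch broadcast (see CHECK_SAME_SHAPE below).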
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::DivOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..162156887 --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc new file mode 100644 index 000000000..aea021ed1 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc @@ -0,0 +1,66 @@ +#include "equal_cpu.h" +#include "infinicore.h" + +namespace op::equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + 
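+        // Note: comparisons are exact (operator==). Half-precision inputs are
+        // cast to float inside EqualOp before comparing; no tolerance is applied.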
case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::cpu diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h new file mode 100644 index 000000000..c09a276d7 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.h @@ -0,0 +1,29 @@ +#ifndef __EQUAL_CPU_H__ +#define __EQUAL_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(equal, cpu) + +namespace op::equal::cpu { +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a == f_b; + } else { + return a == b; + } + } +} EqualOp; +} // namespace op::equal::cpu + +#endif // __EQUAL_CPU_H__ diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh new file mode 100644 index 000000000..636913b26 --- /dev/null +++ b/src/infiniop/ops/equal/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __EQUAL_CUDA_H__ +#define __EQUAL_CUDA_H__ + +namespace op::equal::cuda { +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a == b; + } +} EqualOp; +} // namespace op::equal::cuda + +#endif // __EQUAL_CUDA_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h new file mode 100644 index 000000000..6e4cd64b9 --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __EQUAL_METAX_API_H__ +#define __EQUAL_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(equal, metax) + +#endif // __EQUAL_METAX_API_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca new file mode 100644 index 000000000..7629cf6aa --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.maca @@ -0,0 +1,73 @@ +#include "equal_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::equal::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + 
void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::EqualOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::EqualOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::EqualOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::metax diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..6e8f7444c --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "equal_nvidia.cuh" +#include "infinicore.h" + +namespace op::equal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::EqualOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::EqualOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::EqualOp, bool, int16_t, 
int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..361e54b02 --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EQUAL_CUDA_API_H__ +#define __EQUAL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(equal, nvidia) + +#endif // __EQUAL_CUDA_API_H__ diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..2c46c28cd --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif + +__C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.cc b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc new file mode 100644 index 000000000..a057ca4bc --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc @@ -0,0 +1,52 @@ +#include "gelu_cpu.h" + +namespace op::gelu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::cpu diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.h b/src/infiniop/ops/gelu/cpu/gelu_cpu.h new file mode 100644 index 000000000..9c8713ef3 --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.h @@ -0,0 +1,27 @@ +#ifndef __GELU_CPU_H__ +#define __GELU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(gelu, cpu) + +#include + +namespace op::gelu::cpu { +typedef struct GeluOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + constexpr double Alpha = 
0.7978845608028654; + constexpr double Beta = 0.044715; + double inner = x + Beta * x * x * x; + double tanh_term = std::tanh(Alpha * inner); + return static_cast<T>(0.5 * x * (1.0 + tanh_term)); + } +} GeluOp; + +} // namespace op::gelu::cpu + +#endif // __GELU_CPU_H__
diff --git a/src/infiniop/ops/gelu/cuda/kernel.cuh b/src/infiniop/ops/gelu/cuda/kernel.cuh new file mode 100644 index 000000000..6673662f5 --- /dev/null +++ b/src/infiniop/ops/gelu/cuda/kernel.cuh @@ -0,0 +1,61 @@ +#ifndef __GELU_CUDA_H__ +#define __GELU_CUDA_H__ + +#include + +namespace op::gelu::cuda { + +typedef struct GeluOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + constexpr float Alpha = 0.7978845608; + constexpr float Beta = 0.044715; + + if constexpr (std::is_same_v<T, half2>) { + const half2 alpha = __float2half2_rn(Alpha); + const half2 beta = __float2half2_rn(Beta); + const half2 one = __float2half2_rn(1.0f); + const half2 half_val = __float2half2_rn(0.5f); + + half2 x_cubed = __hmul2(x, __hmul2(x, x)); // x³ + half2 inner = __hfma2(beta, x_cubed, x); // x + βx³ + half2 tanh_in = __hmul2(alpha, inner); // α(x + βx³) + + // vectorized tanh approximation (avoids unpacking the half2) + float2 f_val = __half22float2(tanh_in); + f_val.x = tanhf(f_val.x); + f_val.y = tanhf(f_val.y); + half2 tanh_val = __float22half2_rn(f_val); + + return __hmul2(__hmul2(half_val, x), __hadd2(one, tanh_val)); // 0.5*x*(1+tanh) + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + float x_f = __bfloat162float(x); + float result = 0.5f * x_f * (1.0f + tanhf(Alpha * (x_f + Beta * x_f * x_f * x_f))); + + return __float2bfloat16(result); + } else if constexpr (std::is_same_v<T, half>) { + float x_f = __half2float(x); + float result = 0.5f * x_f * (1.0f + tanhf(Alpha * (x_f + Beta * x_f * x_f * x_f))); + + return __float2half(result); + } else if constexpr (std::is_same_v<T, float>) { + float x_cubed = x * x * x; + float inner = x + Beta * x_cubed; + float tanh_val = tanhf(Alpha * inner); + + return 0.5f * x * (1.0f + tanh_val); + } else { + double x_cubed = x * x * x; + double inner = x + static_cast<double>(Beta) * x_cubed; + double tanh_val = tanh(static_cast<double>(Alpha) * inner); + + return 0.5 * x * (1 + tanh_val); + } + } +} GeluOp; + +} // namespace op::gelu::cuda + +#endif // __GELU_CUDA_H__
diff --git a/src/infiniop/ops/gelu/metax/gelu_meta.maca b/src/infiniop/ops/gelu/metax/gelu_meta.maca new file mode 100644 index 000000000..3a311530a --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_meta.maca @@ -0,0 +1,60 @@ +#include "gelu_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::gelu::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if
(workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::metax diff --git a/src/infiniop/ops/gelu/metax/gelu_metax.h b/src/infiniop/ops/gelu/metax/gelu_metax.h new file mode 100644 index 000000000..9385b7a27 --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_METAX_API_H__ +#define __GELU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu, metax) + +#endif // __GELU_METAX_API_H__ diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu new file mode 100644 index 000000000..4d42cf2df --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_nvidia.cuh" + +namespace op::gelu::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::nvidia diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh new file mode 100644 index 000000000..72dbbd4f0 --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_CUDA_API_H__ +#define __GELU_CUDA_API_H__ + +#include 
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu, nvidia) + +#endif // __GELU_CUDA_API_H__ diff --git a/src/infiniop/ops/gelu/operator.cc b/src/infiniop/ops/gelu/operator.cc new file mode 100644 index 000000000..115a1c2fd --- /dev/null +++ b/src/infiniop/ops/gelu/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluDescriptor( + infiniopHandle_t handle, + infiniopGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGelu( + infiniopGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc 
b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc new file mode 100644 index 000000000..f9e6ca660 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "gelu_backward_cpu.h" + +namespace op::gelu_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::cpu
diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h new file mode 100644 index 000000000..b008ef101 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h @@ -0,0 +1,38 @@ +#ifndef __GELU_BACKWARD_CPU_H__ +#define __GELU_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, cpu) + +namespace op::gelu_backward::cpu { +typedef struct GeluBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + constexpr double alpha = 0.7978845608028654; + constexpr double beta = 0.044714998453855515; + + // intermediate variable u = α(x + βx³) + const double x_cubed = a * a * a; + const double u = alpha * (a + beta * x_cubed); + + // tanh(u) and its derivative sech²(u) = 1 - tanh²(u) + const double tanh_u = std::tanh(u); + const double sech2_u = 1.0 - tanh_u * tanh_u; + + // du/dx = α(1 + 3βx²) + const double du_dx = alpha * (1.0 + 3.0 * beta * a * a); + + // derivative of GELU, dy/dx + const double dy_dx = 0.5 * (1.0 + tanh_u) + 0.5 * a * sech2_u * du_dx; + + // chain rule: dL/dx = dL/dy * dy/dx + const double ans = static_cast<double>(b) * dy_dx; + return static_cast<T>(ans); + } +} GeluBackwardOp; +} // namespace op::gelu_backward::cpu + +#endif // __GELU_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/gelu_backward/cuda/kernel.cuh b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh new file mode 100644 index 000000000..6dfbd7fbb --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh @@ -0,0 +1,88 @@ +#ifndef __GELU_BACKWARD_CUDA_H__ +#define __GELU_BACKWARD_CUDA_H__ + +namespace op::gelu_backward::cuda { +typedef struct GeluBackwardOp { +public: + static
constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + constexpr float alpha = 0.7978845608028654f; + constexpr float beta = 0.044715f; + constexpr float beta3 = 3.0f * beta; + + if constexpr (std::is_same_v<T, half2>) { + // vectorized half2 path + float2 x_f = __half22float2(a); + float2 grad_output_f = __half22float2(b); + + float2 u = { + alpha * (x_f.x + beta * x_f.x * x_f.x * x_f.x), + alpha * (x_f.y + beta * x_f.y * x_f.y * x_f.y)}; + // tanh and sech² per lane + float tanh_u_x = tanhf(u.x); + float tanh_u_y = tanhf(u.y); + float sech2_u_x = 1.0f - tanh_u_x * tanh_u_x; + float sech2_u_y = 1.0f - tanh_u_y * tanh_u_y; + // derivative components per lane + float du_dx_x = alpha * (1.0f + beta3 * x_f.x * x_f.x); + float du_dx_y = alpha * (1.0f + beta3 * x_f.y * x_f.y); + float dy_dx_x = 0.5f * (1.0f + tanh_u_x) + 0.5f * x_f.x * sech2_u_x * du_dx_x; + float dy_dx_y = 0.5f * (1.0f + tanh_u_y) + 0.5f * x_f.y * sech2_u_y * du_dx_y; + + float2 grad_input_f = { + grad_output_f.x * dy_dx_x, + grad_output_f.y * dy_dx_y}; + return __float22half2_rn(grad_input_f); + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + // BF16 path + float x_f = __bfloat162float(a); + float grad_output_f = __bfloat162float(b); + + float u = alpha * (x_f + beta * x_f * x_f * x_f); + float tanh_u = tanhf(u); + float sech2_u = 1.0f - tanh_u * tanh_u; + float du_dx = alpha * (1.0f + beta3 * x_f * x_f); + float dy_dx = 0.5f * (1.0f + tanh_u) + 0.5f * x_f * sech2_u * du_dx; + float ans = __fmul_rn(grad_output_f, dy_dx); + + return __float2bfloat16(ans); + } else if constexpr (std::is_same_v<T, half>) { + // FP16 path + float x_f = __half2float(a); + float grad_output_f = __half2float(b); + + float u = alpha * (x_f + beta * x_f * x_f * x_f); + float tanh_u = tanhf(u); + float sech2_u = 1.0f - tanh_u * tanh_u; + float du_dx = alpha * (1.0f + beta3 * x_f * x_f); + float dy_dx = 0.5f * (1.0f + tanh_u) + 0.5f * x_f * sech2_u * du_dx; + float ans = __fmul_rn(grad_output_f, dy_dx); + + return __float2half(ans); + } else if constexpr (std::is_same_v<T, float>) { + // FP32 path + float x = a; + float u = alpha * (x + beta * x * x * x); + float tanh_u = tanhf(u); + float sech2_u = 1.0f - tanh_u * tanh_u; + float du_dx = alpha * (1.0f + beta3 * x * x); + float dy_dx = 0.5f * (1.0f + tanh_u) + 0.5f * x * sech2_u * du_dx; + return __fmul_rn(b, dy_dx); + } else { + // FP64 (or other) path + constexpr double alpha_d = 0.7978845608028654; + constexpr double beta_d = 0.044715; + double x = a; + double u = alpha_d * (x + beta_d * x * x * x); + double tanh_u = tanh(u); + double sech2_u = 1.0 - tanh_u * tanh_u; + double du_dx = alpha_d * (1.0 + 3.0 * beta_d * x * x); + double dy_dx = 0.5 * (1.0 + tanh_u) + 0.5 * x * sech2_u * du_dx; + return static_cast<T>(b * dy_dx); + } + } +} GeluBackwardOp; +} // namespace op::gelu_backward::cuda + +#endif // __GELU_BACKWARD_CUDA_H__
diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h new file mode 100644 index 000000000..b4213e977 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_METAX_API_H__ +#define __GELU_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, metax) + +#endif // __GELU_BACKWARD_METAX_API_H__
diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca new file mode 100644 index 000000000..a79004e4c --- /dev/null +++
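[Review note, not part of the patch] Every branch of GeluBackwardOp above, CPU and CUDA alike, evaluates the same derivative of the tanh-approximated GELU; stated compactly, using the constants from the code:

    y = \tfrac{1}{2} x \bigl(1 + \tanh u\bigr), \qquad u = \alpha \bigl(x + \beta x^{3}\bigr), \quad \alpha = \sqrt{2/\pi} \approx 0.7978845608, \quad \beta = 0.044715
    \frac{dy}{dx} = \tfrac{1}{2}\bigl(1 + \tanh u\bigr) + \tfrac{1}{2}\, x \,\mathrm{sech}^{2}(u)\, \frac{du}{dx}, \qquad \mathrm{sech}^{2}(u) = 1 - \tanh^{2} u, \qquad \frac{du}{dx} = \alpha \bigl(1 + 3\beta x^{2}\bigr)
    \text{grad\_input} = \text{grad\_output} \cdot \frac{dy}{dx}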
b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca @@ -0,0 +1,62 @@ +#include "gelu_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::gelu_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::metax diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu new file mode 100644 index 000000000..2ff128567 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_backward_nvidia.cuh" + +namespace op::gelu_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { 
+ + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::nvidia diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh new file mode 100644 index 000000000..82e6c2ae8 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_CUDA_API_H__ +#define __GELU_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, nvidia) + +#endif // __GELU_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/gelu_backward/operator.cc b/src/infiniop/ops/gelu_backward/operator.cc new file mode 100644 index 000000000..b74d71510 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gelu_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gelu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluBackwardDescriptor( + infiniopHandle_t handle, + infiniopGeluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluBackwardWorkspaceSize(infiniopGeluBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGeluBackward( + 
infiniopGeluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGeluBackwardDescriptor(infiniopGeluBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..c81f0a539 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_and_cpu.h" +#include "infinicore.h" + +namespace op::logical_and::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + 
return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..701960bd5 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, cpu) + +namespace op::logical_and::cpu { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a && f_b; + } else { + return a && b; + } + } +} LogicalAndOp; +} // namespace op::logical_and::cpu + +#endif // __LOGICAL_AND_CPU_H__ diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..0b763d951 --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +namespace op::logical_and::cuda { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cuda + +#endif // __LOGICAL_AND_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.h b/src/infiniop/ops/logical_and/metax/logical_and_metax.h new file mode 100644 index 000000000..696697322 --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_METAX_API_H__ +#define __LOGICAL_AND_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, metax) + +#endif // __LOGICAL_AND_METAX_API_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.maca b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca new file mode 100644 index 000000000..68e3f0ddb --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca @@ -0,0 +1,73 @@ +#include "logical_and_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_and::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create 
METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::metax diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..7f0680a57 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_nvidia.cuh" + +namespace op::logical_and::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return 
_device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..9d68754bf --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif // __LOGICAL_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..a029c3678 --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_and.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_and_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_and_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalAndDescriptor( + infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_and::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalAnd( + infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..1324c98f1 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_or_cpu.h" +#include "infinicore.h" + +namespace op::logical_or::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, 
inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..7c26c4d37 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, cpu) + +namespace op::logical_or::cpu { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a || f_b; + } else { + return a || b; + } + } +} LogicalOrOp; +} // namespace op::logical_or::cpu + +#endif // __LOGICAL_OR_CPU_H__ diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..3c705428e --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +namespace op::logical_or::cuda { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.h b/src/infiniop/ops/logical_or/metax/logical_or_metax.h new file mode 100644 index 000000000..e530d9ed5 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_METAX_API_H__ +#define __LOGICAL_OR_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, metax) + +#endif // __LOGICAL_OR_METAX_API_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.maca b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca new file mode 100644 index 000000000..17d1f8ed0 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca @@ -0,0 +1,73 @@ +#include "logical_or_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_or::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + 
const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::metax diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..151079f07 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_nvidia.cuh" + +namespace op::logical_or::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise 
descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..a70bd8da7 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif // __LOGICAL_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..8f258e3e0 --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_or.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_or_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_or_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalOrDescriptor( + infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_or::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalOr( + infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc new file mode 100644 index 000000000..1022657c5 --- /dev/null +++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "relu_backward_cpu.h" + +namespace op::relu_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t 
workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::relu_backward::cpu diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h new file mode 100644 index 000000000..84e232184 --- /dev/null +++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h @@ -0,0 +1,24 @@ +#ifndef __RELU_BACKWARD_CPU_H__ +#define __RELU_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(relu_backward, cpu) + +namespace op::relu_backward::cpu { +typedef struct ReluBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + T zero{0}; + if (a > zero) { + return b; + } else { + return zero; + } + } +} ReluBackwardOp; +} // namespace op::relu_backward::cpu + +#endif // __RELU_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/relu_backward/cuda/kernel.cuh b/src/infiniop/ops/relu_backward/cuda/kernel.cuh new file mode 100644 index 000000000..3ead42756 --- /dev/null +++ b/src/infiniop/ops/relu_backward/cuda/kernel.cuh @@ -0,0 +1,20 @@ +#ifndef __RELU_BACKWARD_CUDA_H__ +#define __RELU_BACKWARD_CUDA_H__ + +namespace op::relu_backward::cuda { +typedef struct ReluBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + T zero{0}; + if (a > zero) { + return b; + } else { + return zero; + } + } +} ReluBackwardOp; +} // namespace op::relu_backward::cuda + +#endif // __RELU_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h new file mode 100644 index 000000000..13c41b83a --- /dev/null +++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __RELU_BACKWARD_METAX_API_H__ +#define __RELU_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(relu_backward, metax) + +#endif // __RELU_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca new file mode 100644 index 000000000..b4b934f01 --- /dev/null +++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca @@ -0,0 +1,62 @@ +#include "relu_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::relu_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape 
= grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ReluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ReluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::relu_backward::metax diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu new file mode 100644 index 000000000..2eed8e443 --- /dev/null +++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "relu_backward_nvidia.cuh" + +namespace op::relu_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ReluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ReluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // 
namespace op::relu_backward::nvidia diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh new file mode 100644 index 000000000..1a743b7b6 --- /dev/null +++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RELU_BACKWARD_CUDA_API_H__ +#define __RELU_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(relu_backward, nvidia) + +#endif // __RELU_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/relu_backward/operator.cc b/src/infiniop/ops/relu_backward/operator.cc new file mode 100644 index 000000000..ffca7a7bd --- /dev/null +++ b/src/infiniop/ops/relu_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/relu_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/relu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/relu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/relu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateReluBackwardDescriptor( + infiniopHandle_t handle, + infiniopReluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::relu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReluBackwardWorkspaceSize(infiniopReluBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReluBackward( + infiniopReluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C 
infiniStatus_t +infiniopDestroyReluBackwardDescriptor(infiniopReluBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.cc b/src/infiniop/ops/silu/cpu/silu_cpu.cc new file mode 100644 index 000000000..c8466d227 --- /dev/null +++ b/src/infiniop/ops/silu/cpu/silu_cpu.cc @@ -0,0 +1,52 @@ +#include "silu_cpu.h" + +namespace op::silu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::cpu diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.h b/src/infiniop/ops/silu/cpu/silu_cpu.h new file mode 100644 index 000000000..e1e9da4e3 --- /dev/null +++ b/src/infiniop/ops/silu/cpu/silu_cpu.h @@ -0,0 +1,23 @@ +#ifndef __SILU_CPU_H__ +#define __SILU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(silu, cpu) + +#include + +namespace op::silu::cpu { +typedef struct SiluOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x / (static_cast(1) + std::exp(-x)); + } +} SiluOp; + +} // namespace op::silu::cpu + +#endif // __SILU_CPU_H__ diff --git a/src/infiniop/ops/silu/cuda/kernel.cuh b/src/infiniop/ops/silu/cuda/kernel.cuh new file mode 100644 index 000000000..5cb8616b0 --- /dev/null +++ b/src/infiniop/ops/silu/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __SILU_CUDA_H__ +#define __SILU_CUDA_H__ + +#include + +namespace op::silu::cuda { + +typedef struct SiluOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // half2向量化优化 + return __hmul2(x, __h2div(__float2half2_rn(1.0f), + __hadd2(__float2half2_rn(1.0f), h2exp(__hneg2(x))))); + } else if constexpr 
(std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f / (1.0f + __expf(-x_f))); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f / (1.0f + __expf(-x_f))); + } else if constexpr (std::is_same_v) { + // FP32 + return x * (1.0f / (1.0f + __expf(-x))); + } else if constexpr (std::is_same_v) { + // FP64 + return x / (1.0 + exp(-x)); + } + } +} SiluOp; + +} // namespace op::silu::cuda + +#endif // __SILU_CUDA_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.h b/src/infiniop/ops/silu/metax/silu_metax.h new file mode 100644 index 000000000..a9717ccd0 --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.h @@ -0,0 +1,8 @@ +#ifndef __SILU_METAX_API_H__ +#define __SILU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(silu, metax) + +#endif // __SILU_METAX_API_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.maca b/src/infiniop/ops/silu/metax/silu_metax.maca new file mode 100644 index 000000000..73408bfc6 --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.maca @@ -0,0 +1,60 @@ +#include "silu_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::silu::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SiluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::metax diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cu b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu new file mode 100644 index 000000000..291b9835f --- /dev/null +++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "silu_nvidia.cuh" + +namespace op::silu::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { 
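+    // Validates dtype (BF16/F16/F32/F64) and matching input/output shapes, then
+    // builds the generic CUDA elementwise descriptor; the SiLU math itself lives
+    // in cuda::SiluOp (../cuda/kernel.cuh), computing x * sigmoid(x) = x / (1 + exp(-x)).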
+ + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SiluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::nvidia diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh new file mode 100644 index 000000000..b13c7fd44 --- /dev/null +++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SILU_CUDA_API_H__ +#define __SILU_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(silu, nvidia) + +#endif // __SILU_CUDA_API_H__ diff --git a/src/infiniop/ops/silu/operator.cc b/src/infiniop/ops/silu/operator.cc new file mode 100644 index 000000000..5ae6ea4ff --- /dev/null +++ b/src/infiniop/ops/silu/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/silu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/silu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/silu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/silu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSiluDescriptor( + infiniopHandle_t handle, + infiniopSiluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::silu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + 
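+    // The GET macro casts the opaque descriptor back to the backend-specific
+    // Descriptor type and reports its workspaceSize().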
GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSilu( + infiniopSiluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/all_equal.py b/test/infiniop/all_equal.py new file mode 100644 index 000000000..9c96f166b --- /dev/null +++ b/test/infiniop/all_equal.py @@ -0,0 +1,206 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, + torch_device_map, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), None), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), None), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), None), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [Inplace.OUT_OF_PLACE] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, 
+ InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor((1,), c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing AllEqual on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + all_equal = torch.equal(a.torch_tensor(), b.torch_tensor()) + new_tensor = torch.tensor([all_equal], device=torch_device_map[device]) + c.update_torch_tensor(new_tensor) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAllEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAllEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_all_equal(): + check_error( + LIBINFINIOP.infiniopAllEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_all_equal() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.equal(a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_all_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyAllEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss_backward.py b/test/infiniop/cross_entropy_loss_backward.py new file mode 100644 index 
000000000..ec1296ef5 --- /dev/null +++ b/test/infiniop/cross_entropy_loss_backward.py @@ -0,0 +1,210 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import numpy as np +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.25e-15, "rtol": 2.25e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_backward( + probs: torch.Tensor, target: torch.Tensor, shape +) -> torch.Tensor: + grad_logits = probs - target + shape = np.array(shape) + batch_size = np.prod(shape) // shape[-1] + grad_logits = grad_logits / batch_size + return grad_logits + + +def test( + handle, + device, + shape, + probs_stride=None, + target_stride=None, + grad_logits_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + probs = TestTensor(shape, probs_stride, dtype, device) + target = TestTensor(shape, target_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if probs_stride != grad_logits_stride: + return + grad_logits = probs + elif inplace == Inplace.INPLACE_B: + if target_stride != grad_logits_stride: + return + grad_logits = target + else: + grad_logits = TestTensor(shape, grad_logits_stride, dtype, device, mode="ones") + + if grad_logits.is_broadcast(): + return + + print( + f"Testing CrossEntropyLossBackward on {InfiniDeviceNames[device]} with shape:{shape} probs_stride:{probs_stride} target_stride:{target_stride} grad_logits_stride:{grad_logits_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + new_grad_logits = cross_entropy_loss_backward( + probs.torch_tensor(), target.torch_tensor(), shape + ) + grad_logits.update_torch_tensor(new_grad_logits) + + if sync is not None: + sync() + + 
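+    # Standard library-test flow: create the operator descriptor, query the
+    # workspace size, run the kernel, then compare against the PyTorch
+    # reference (grad_logits = (probs - target) / batch_size) computed above.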
descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_logits.descriptor, + probs.descriptor, + target.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [probs, target, grad_logits]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_logits.device) + + def lib_cross_entropy_loss_backward(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLossBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_logits.data(), + probs.data(), + target.data(), + None, + ) + ) + + lib_cross_entropy_loss_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_logits.actual_tensor(), + grad_logits.torch_tensor(), + atol=atol, + rtol=rtol, + ) + assert torch.allclose( + grad_logits.actual_tensor(), grad_logits.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cross_entropy_loss_backward(probs.torch_tensor(), target.torch_tensor(), shape), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy_loss_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error( + LIBINFINIOP.infiniopDestroyCrossEntropyLossBackwardDescriptor(descriptor) + ) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..826f1141c --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,191 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + 
Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device, bias=1e-6) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py new file mode 100644 index 000000000..7ba6b6949 --- /dev/null +++ b/test/infiniop/equal.py @@ 
-0,0 +1,209 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def eq(c, a, b): + torch.eq(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + eq(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + 
tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_equal(): + check_error( + LIBINFINIOP.infiniopEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_equal() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: eq(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gelu.py b/test/infiniop/gelu.py new file mode 100644 index 000000000..1a3328432 --- /dev/null +++ b/test/infiniop/gelu.py @@ -0,0 +1,188 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, + to_torch_dtype, + torch_device_map, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-8, "rtol": 1e-8}, +} + 
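+# Note: the reference in test() below uses PyTorch's tanh approximation,
+# GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))),
+# which the library kernel is assumed to match within the tolerances above.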
+DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Gelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}" + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # ans的shape对齐至input,而input可能存在广播维度 + ans = torch.nn.functional.gelu(input.torch_tensor(), approximate="tanh") + # 利用add(+)计算的自动广播机制,确保ouput的torch_tensor与actual_tensor shape一致,以通过debug模式的shape检查 + zero = torch.zeros( + *shape, dtype=to_torch_dtype(dtype), device=torch_device_map[device] + ) + new_output = ans + zero + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGeluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGeluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_gelu(): + check_error( + LIBINFINIOP.infiniopGelu( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_gelu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.gelu(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gelu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGeluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gelu_backward.py b/test/infiniop/gelu_backward.py new file mode 100644 index 000000000..7e2f170a7 --- /dev/null +++ b/test/infiniop/gelu_backward.py @@ -0,0 +1,213 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# 
============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-8, "rtol": 1e-8}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def gelu_backward(input: torch.Tensor, grad_output: torch.Tensor) -> torch.Tensor: + sqrt_2_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi, device=input.device)) + kappa = 0.044715 + + # 计算中间变量 u = √(2/π)(x + κx³) + x_cubed = input.pow(3) + u = sqrt_2_over_pi * (input + kappa * x_cubed) + + # 计算 tanh(u) 及其导数 sech²(u) = 1 - tanh²(u) + tanh_u = torch.tanh(u) + sech2_u = 1.0 - tanh_u.square() + + # 计算 du/dx = √(2/π)(1 + 3κx²) + du_dx = sqrt_2_over_pi * (1.0 + 3 * kappa * input.square()) + + # 局部梯度 dy/dx = 0.5*(1 + tanh_u) + 0.5*x*sech2_u*du_dx + dy_dx = 0.5 * (1.0 + tanh_u) + 0.5 * input * sech2_u * du_dx + + return grad_output * dy_dx + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, grad_output_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if input_stride != grad_input_stride: + return + grad_input = input + elif inplace == Inplace.INPLACE_B: + if grad_output_stride != grad_input_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + return + + print( + f"Testing GeluBackward on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + new_grad_input = gelu_backward(input.torch_tensor(), grad_output.torch_tensor()) + grad_input.update_torch_tensor(new_grad_input) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGeluBackwardDescriptor( + handle, + ctypes.byref(descriptor), + 
grad_input.descriptor, + input.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGeluBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_gelu_backward(): + check_error( + LIBINFINIOP.infiniopGeluBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input.data(), + grad_output.data(), + None, + ) + ) + + lib_gelu_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + assert torch.allclose( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: gelu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gelu_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGeluBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..7b6a06c40 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -162,11 +162,6 @@ def clip_(lib): ] -@OpRegister.operator -def conv_(lib): - pass - - @OpRegister.operator def gemm_(lib): lib.infiniopCreateGemmDescriptor.restype = c_int32 @@ -454,6 +449,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -489,3 +485,340 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def silu_(lib): + lib.infiniopCreateSiluDescriptor.restype = c_int32 + lib.infiniopCreateSiluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSiluWorkspaceSize.restype = c_int32 + lib.infiniopGetSiluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSilu.restype = c_int32 + lib.infiniopSilu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySiluDescriptor.restype = c_int32 + lib.infiniopDestroySiluDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): + lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + 
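+    # The remaining bindings follow the common binary elementwise signature:
+    # get_workspace_size(desc, size*), calc(desc, workspace, size, c, a, b, stream),
+    # destroy(desc).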
lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_and_(lib): + lib.infiniopCreateLogicalAndDescriptor.restype = c_int32 + lib.infiniopCreateLogicalAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalAndWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalAnd.restype = c_int32 + lib.infiniopLogicalAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_or_(lib): + lib.infiniopCreateLogicalOrDescriptor.restype = c_int32 + lib.infiniopCreateLogicalOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalOr.restype = c_int32 + lib.infiniopLogicalOr.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalOrDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def equal_(lib): + lib.infiniopCreateEqualDescriptor.restype = c_int32 + lib.infiniopCreateEqualDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopEqual.restype = c_int32 + lib.infiniopEqual.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyEqualDescriptor.restype = c_int32 + lib.infiniopDestroyEqualDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def all_equal_(lib): + lib.infiniopCreateAllEqualDescriptor.restype = c_int32 + lib.infiniopCreateAllEqualDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAllEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetAllEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAllEqual.restype = c_int32 + lib.infiniopAllEqual.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + 
c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAllEqualDescriptor.restype = c_int32 + lib.infiniopDestroyAllEqualDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + + +@OpRegister.operator +def relu_backward_(lib): + lib.infiniopCreateReluBackwardDescriptor.restype = c_int32 + lib.infiniopCreateReluBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetReluBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetReluBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReluBackward.restype = c_int32 + lib.infiniopReluBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReluBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyReluBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gelu_(lib): + lib.infiniopCreateGeluDescriptor.restype = c_int32 + lib.infiniopCreateGeluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGeluWorkspaceSize.restype = c_int32 + lib.infiniopGetGeluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGelu.restype = c_int32 + lib.infiniopGelu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGeluDescriptor.restype = c_int32 + lib.infiniopDestroyGeluDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gelu_backward_(lib): + lib.infiniopCreateGeluBackwardDescriptor.restype = c_int32 + lib.infiniopCreateGeluBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGeluBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetGeluBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGeluBackward.restype = c_int32 + lib.infiniopGeluBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGeluBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyGeluBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cross_entropy_loss_backward_(lib): + lib.infiniopCreateCrossEntropyLossBackwardDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCrossEntropyLossBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCrossEntropyLossBackward.restype = c_int32 + lib.infiniopCrossEntropyLossBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCrossEntropyLossBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossBackwardDescriptor.argtypes = 
[ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 5c8e7f80a..1a8eaf505 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,33 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + + is_bool = dt == InfiniDtype.BOOL + if is_bool: + dt = InfiniDtype.F32 + + is_int = ( + dt == InfiniDtype.I8 + or dt == InfiniDtype.I16 + or dt == InfiniDtype.I32 + or dt == InfiniDtype.I64 + ) + if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if is_int: + self._torch_tensor = torch.randint( + 0, + 100, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -88,6 +111,12 @@ def __init__( else: raise ValueError("Unsupported mode") + if is_bool: + self._torch_tensor = self._torch_tensor > 0.5 + if scale is not None: self._torch_tensor *= scale if bias is not None: @@ -103,6 +132,9 @@ def __init__( def torch_tensor(self): return self._torch_tensor + def update_torch_tensor(self, new_tensor: torch.Tensor): + self._torch_tensor = new_tensor + def actual_tensor(self): return self._data_tensor @@ -120,6 +152,9 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): shape_, strides_, dt, device, mode="manual", set_tensor=torch_tensor ) + def update_torch_tensor(self, new_tensor: torch.Tensor): + self._torch_tensor = new_tensor + def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): if dt == InfiniDtype.I8: @@ -140,6 +175,8 @@ def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): return torch.float32 elif dt == InfiniDtype.F64: return torch.float64 + elif dt == InfiniDtype.BOOL: + return torch.bool # TODO: These following types may not be supported by older # versions of PyTorch. Use compatability mode to convert them.
elif dt == InfiniDtype.U16: @@ -330,6 +367,11 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True): actual = actual.to(torch.float32) desired = desired.to(torch.float32) + # If either tensor is BOOL, cast both to FP32 before comparing + if actual.dtype == torch.bool or desired.dtype == torch.bool: + actual = actual.to(torch.float32) + desired = desired.to(torch.float32) + print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose) np.testing.assert_allclose( actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True @@ -523,7 +565,7 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS): # Timed execution elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device) - print(f" {desc} time: {elapsed * 1000 :6f} ms") + print(f" {desc} time: {elapsed * 1000:6f} ms") def test_operator(device, test_func, test_cases, tensor_dtypes): diff --git a/test/infiniop/logical_and.py b/test/infiniop/logical_and.py new file mode 100644 index 000000000..02369745b --- /dev/null +++ b/test/infiniop/logical_and.py @@ -0,0 +1,209 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_and(c, a, b): + torch.logical_and(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, +
dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalAnd on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalAndDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalAndWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_and(): + check_error( + LIBINFINIOP.infiniopLogicalAnd( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_and() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_and(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalAndDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/logical_or.py b/test/infiniop/logical_or.py new file mode 100644 index 000000000..abfff16f2 --- /dev/null +++ b/test/infiniop/logical_or.py @@ -0,0 +1,209 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 
4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_or(c, a, b): + torch.logical_or(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalOr on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalOrDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalOrWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_or(): + check_error( + LIBINFINIOP.infiniopLogicalOr( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_or() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_or(c.torch_tensor(), 
a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_or(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalOrDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/relu_backward.py b/test/infiniop/relu_backward.py new file mode 100644 index 000000000..ffb737476 --- /dev/null +++ b/test/infiniop/relu_backward.py @@ -0,0 +1,199 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def relu_backward(input: torch.Tensor, grad_output: torch.Tensor) -> torch.Tensor: + mask = input > 0 + grad_input = torch.where(mask, grad_output, torch.zeros_like(grad_output)) + return grad_input + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, 
grad_output_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if input_stride != grad_input_stride: + return + grad_input = input + elif inplace == Inplace.INPLACE_B: + if grad_output_stride != grad_input_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + return + + print( + f"Testing ReluBackward on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + new_grad_input = relu_backward(input.torch_tensor(), grad_output.torch_tensor()) + grad_input.update_torch_tensor(new_grad_input) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReluBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReluBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_relu_backward(): + check_error( + LIBINFINIOP.infiniopReluBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input.data(), + grad_output.data(), + None, + ) + ) + + lib_relu_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + assert torch.allclose( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: relu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_relu_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyReluBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/silu.py b/test/infiniop/silu.py new file mode 100644 index 000000000..dcc7fdf2b --- /dev/null +++ b/test/infiniop/silu.py @@ -0,0 +1,188 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, + to_torch_dtype, + torch_device_map, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# 
============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Silu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # ans follows input's shape, and input may contain broadcast dimensions + ans = torch.nn.functional.silu(input.torch_tensor()) + # Use the automatic broadcasting of add (+) so that output's torch_tensor matches the shape of actual_tensor and passes the shape check in debug mode + zero = torch.zeros( + *shape, dtype=to_torch_dtype(dtype), device=torch_device_map[device] + ) + new_output = ans + zero + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSiluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSiluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_silu(): + check_error( + LIBINFINIOP.infiniopSilu( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_silu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.silu(input.torch_tensor()), device, NUM_PRERUN,
NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_silu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySiluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m")
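Illustrative sketches (not part of the patch above; plain PyTorch, shapes and sizes chosen only for demonstration):

The reference in test/infiniop/relu_backward.py passes the upstream gradient through where the input is positive and zeroes it elsewhere, which matches PyTorch autograd's own ReLU backward exactly:

import torch

x = torch.randn(64, requires_grad=True)
g = torch.randn(64)
torch.relu(x).backward(g)                            # autograd gradient into x.grad
ref = torch.where(x > 0, g, torch.zeros_like(g))     # same rule the test file uses
assert torch.equal(x.grad, ref)

The broadcast-alignment trick in test/infiniop/silu.py (adding a zero tensor of the full output shape so the PyTorch reference takes the same shape as the library output) can likewise be checked in isolation:

import torch

shape = (13, 4, 4)                    # full logical output shape (illustrative)
x = torch.rand(13, 1, 4)              # reference input with a broadcast (size-1) dimension
ans = torch.nn.functional.silu(x)     # shape (13, 1, 4)
aligned = ans + torch.zeros(shape)    # adding zeros broadcasts the result to (13, 4, 4) exactly
assert aligned.shape == shape
assert torch.equal(aligned, ans.expand(shape))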