diff --git a/include/infiniop.h b/include/infiniop.h
index d51b8d92e..ce720703d 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -3,17 +3,27 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
+#include "infiniop/ops/all_equal.h"
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
+#include "infiniop/ops/cross_entropy_loss_backward.h"
+#include "infiniop/ops/div.h"
+#include "infiniop/ops/equal.h"
+#include "infiniop/ops/gelu.h"
+#include "infiniop/ops/gelu_backward.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/logical_and.h"
+#include "infiniop/ops/logical_or.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
 #include "infiniop/ops/relu.h"
+#include "infiniop/ops/relu_backward.h"
 #include "infiniop/ops/rms_norm.h"
 #include "infiniop/ops/rope.h"
+#include "infiniop/ops/silu.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/tensor_descriptor.h"
diff --git a/include/infiniop/ops/all_equal.h b/include/infiniop/ops/all_equal.h
new file mode 100644
index 000000000..b260b49b5
--- /dev/null
+++ b/include/infiniop/ops/all_equal.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_ALL_EQUAL_API_H__
+#define __INFINIOP_ALL_EQUAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAllEqualDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAllEqualDescriptor(infiniopHandle_t handle,
+                                                             infiniopAllEqualDescriptor_t *desc_ptr,
+                                                             infiniopTensorDescriptor_t c,
+                                                             infiniopTensorDescriptor_t a,
+                                                             infiniopTensorDescriptor_t b);
+
+__C __export infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAllEqual(infiniopAllEqualDescriptor_t desc,
+                                             void *workspace,
+                                             size_t workspace_size,
+                                             void *c,
+                                             const void *a,
+                                             const void *b,
+                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/cross_entropy_loss_backward.h b/include/infiniop/ops/cross_entropy_loss_backward.h
new file mode 100644
index 000000000..edc2821ac
--- /dev/null
+++ b/include/infiniop/ops/cross_entropy_loss_backward.h
@@ -0,0 +1,26 @@
+#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_BACKWARD_API_H__
+#define __INFINIOP_CROSS_ENTROPY_LOSS_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCrossEntropyLossBackwardDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateCrossEntropyLossBackwardDescriptor(infiniopHandle_t handle,
+                                                                             infiniopCrossEntropyLossBackwardDescriptor_t *desc_ptr,
+                                                                             infiniopTensorDescriptor_t grad_logits,
+                                                                             infiniopTensorDescriptor_t probs,
+                                                                             infiniopTensorDescriptor_t target);
+
+__C __export infiniStatus_t infiniopGetCrossEntropyLossBackwardWorkspaceSize(infiniopCrossEntropyLossBackwardDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopCrossEntropyLossBackward(infiniopCrossEntropyLossBackwardDescriptor_t desc,
+                                                             void *workspace,
+                                                             size_t workspace_size,
+                                                             void *grad_logits,
+                                                             const void *probs,
+                                                             const void *target,
+                                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroyCrossEntropyLossBackwardDescriptor(infiniopCrossEntropyLossBackwardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/div.h b/include/infiniop/ops/div.h
new file mode 100644
index 000000000..e539b440c
--- /dev/null
+++ b/include/infiniop/ops/div.h
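Each of the new operators exposes the same four-step descriptor lifecycle used by the existing InfiniOp operators: create a descriptor from tensor descriptors, query the workspace size, launch the operator on a stream, destroy the descriptor. The following host-side sketch shows that lifecycle for `infiniopAllEqual`; it is illustrative only and not part of this diff. It assumes the handle, tensor descriptors, and device buffers were created elsewhere, uses `infinirtMalloc` for the workspace (as the test harness below does), and elides workspace cleanup and error reporting. `run_all_equal_example` is a hypothetical helper name.

    /* Illustrative sketch only - not part of this diff. */
    static infiniStatus_t run_all_equal_example(infiniopHandle_t handle,
                                                infiniopTensorDescriptor_t c_desc,
                                                infiniopTensorDescriptor_t a_desc,
                                                infiniopTensorDescriptor_t b_desc,
                                                void *c, const void *a, const void *b,
                                                void *stream) {
        /* 1. Create the operator descriptor from the tensor descriptors. */
        infiniopAllEqualDescriptor_t desc;
        infiniStatus_t status = infiniopCreateAllEqualDescriptor(handle, &desc, c_desc, a_desc, b_desc);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }

        /* 2. Query and allocate the workspace (same allocator the tests use). */
        size_t workspace_size = 0;
        status = infiniopGetAllEqualWorkspaceSize(desc, &workspace_size);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }
        void *workspace = NULL;
        infinirtMalloc(&workspace, workspace_size);

        /* 3. Launch: per the CPU implementation, c receives a single boolean result. */
        status = infiniopAllEqual(desc, workspace, workspace_size, c, a, b, stream);

        /* 4. Destroy the descriptor (workspace cleanup omitted here). */
        infiniopDestroyAllEqualDescriptor(desc);
        return status;
    }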
@@ -0,0 +1,26 @@ +#ifndef __INFINIOP_DIV_API_H__ +#define __INFINIOP_DIV_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopDivDescriptor_t; + +__C __export infiniStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/equal.h b/include/infiniop/ops/equal.h new file mode 100644 index 000000000..3ac071eb4 --- /dev/null +++ b/include/infiniop/ops/equal.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_EQUAL_API_H__ +#define __INFINIOP_EQUAL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopEqualDescriptor_t; + +__C __export infiniStatus_t infiniopCreateEqualDescriptor(infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopEqual(infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gelu.h b/include/infiniop/ops/gelu.h new file mode 100644 index 000000000..444092b6a --- /dev/null +++ b/include/infiniop/ops/gelu.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_GELU_API_H__ +#define __INFINIOP_GELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGeluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGeluDescriptor(infiniopHandle_t handle, + infiniopGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); + +__C __export infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGelu(infiniopGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); + +__C __export infiniStatus_t infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/gelu_backward.h b/include/infiniop/ops/gelu_backward.h new file mode 100644 index 000000000..9516df81b --- /dev/null +++ b/include/infiniop/ops/gelu_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_GELU_BACKWARD_API_H__ +#define __INFINIOP_GELU_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGeluBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGeluBackwardDescriptor(infiniopHandle_t handle, + infiniopGeluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t intput, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetGeluBackwardWorkspaceSize(infiniopGeluBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t 
infiniopGeluBackward(infiniopGeluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroyGeluBackwardDescriptor(infiniopGeluBackwardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_and.h b/include/infiniop/ops/logical_and.h new file mode 100644 index 000000000..5c237f79c --- /dev/null +++ b/include/infiniop/ops/logical_and.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_AND_API_H__ +#define __INFINIOP_LOGICAL_AND_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalAndDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalAndDescriptor(infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalAnd(infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/logical_or.h b/include/infiniop/ops/logical_or.h new file mode 100644 index 000000000..1c0066139 --- /dev/null +++ b/include/infiniop/ops/logical_or.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_LOGICAL_OR_API_H__ +#define __INFINIOP_LOGICAL_OR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLogicalOrDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLogicalOrDescriptor(infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLogicalOr(infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/relu_backward.h b/include/infiniop/ops/relu_backward.h new file mode 100644 index 000000000..e1c13fec3 --- /dev/null +++ b/include/infiniop/ops/relu_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_RELU_BACKWARD_API_H__ +#define __INFINIOP_RELU_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReluBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReluBackwardDescriptor(infiniopHandle_t handle, + infiniopReluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t intput, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetReluBackwardWorkspaceSize(infiniopReluBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReluBackward(infiniopReluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReluBackwardDescriptor(infiniopReluBackwardDescriptor_t desc); + +#endif diff --git 
a/include/infiniop/ops/silu.h b/include/infiniop/ops/silu.h new file mode 100644 index 000000000..037d6323f --- /dev/null +++ b/include/infiniop/ops/silu.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SILU_API_H__ +#define __INFINIOP_SILU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSiluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSiluDescriptor(infiniopHandle_t handle, + infiniopSiluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t intput); + +__C __export infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSilu(infiniopSiluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *intput, + void *stream); + +__C __export infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..3234a22da 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -12,21 +12,31 @@ def run_tests(args): failed = [] for test in [ - "add.py", - "attention.py", - "causal_softmax.py", - "clip.py", - "gemm.py", - "mul.py", - "random_sample.py", - "rearrange.py", - "rms_norm.py", - "rope.py", - "sub.py", - "swiglu.py", + # "add.py", + # "attention.py", + # "causal_softmax.py", + # "clip.py", + # "gemm.py", + # "mul.py", + # "random_sample.py", + # "rearrange.py", + # "rms_norm.py", + # "rope.py", + # "sub.py", + # "swiglu.py", + "silu.py", + "div.py", + "logical_and.py", + "logical_or.py", + "equal.py", + "all_equal.py", + "relu_backward.py", + "gelu.py", + "gelu_backward.py", + "cross_entropy_loss_backward.py" ]: result = subprocess.run( - f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True + f"python {test} {args} --profile", text=True, encoding="utf-8", shell=True ) if result.returncode != 0: failed.append(test) diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..d571d4b55 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,16 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(silu) +DECLARE_INFINIOP_TEST(div) +DECLARE_INFINIOP_TEST(logical_and) +DECLARE_INFINIOP_TEST(logical_or) +DECLARE_INFINIOP_TEST(equal) +DECLARE_INFINIOP_TEST(all_equal) +DECLARE_INFINIOP_TEST(relu_backward) +DECLARE_INFINIOP_TEST(gelu) +DECLARE_INFINIOP_TEST(gelu_backward) +DECLARE_INFINIOP_TEST(cross_entropy_loss_backward) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +40,29 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + 
REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(silu) \ + REGISTER_INFINIOP_TEST(div) \ + REGISTER_INFINIOP_TEST(logical_and) \ + REGISTER_INFINIOP_TEST(logical_or) \ + REGISTER_INFINIOP_TEST(equal) \ + REGISTER_INFINIOP_TEST(all_equal) \ + REGISTER_INFINIOP_TEST(relu_backward) \ + REGISTER_INFINIOP_TEST(gelu) \ + REGISTER_INFINIOP_TEST(gelu_backward) \ + REGISTER_INFINIOP_TEST(cross_entropy_loss_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/all_equal.cpp b/src/infiniop-test/src/ops/all_equal.cpp new file mode 100644 index 000000000..616f8d8b5 --- /dev/null +++ b/src/infiniop-test/src/ops/all_equal.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::all_equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopAllEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateAllEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetAllEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopAllEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopAllEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" 
<< _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::all_equal diff --git a/src/infiniop-test/src/ops/cross_entropy_loss_backward.cpp b/src/infiniop-test/src/ops/cross_entropy_loss_backward.cpp new file mode 100644 index 000000000..972d3656b --- /dev/null +++ b/src/infiniop-test/src/ops/cross_entropy_loss_backward.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cross_entropy_loss_backward { +struct Test::Attributes { + std::shared_ptr probs; + std::shared_ptr target; + std::shared_ptr grad_logits; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("probs") == tensors.end() + || tensors.find("target") == tensors.end() + || tensors.find("grad_logits") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->probs = tensors["probs"]; + test->_attributes->target = tensors["target"]; + test->_attributes->grad_logits = tensors["grad_logits"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCrossEntropyLossBackwardDescriptor_t op_desc; + auto probs = _attributes->probs->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto grad_logits = _attributes->grad_logits->to(device, device_id); + CHECK_OR(infiniopCreateCrossEntropyLossBackwardDescriptor(handle, &op_desc, + grad_logits->desc(), + probs->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCrossEntropyLossBackward(op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_logits, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLossBackward( + op_desc, workspace, workspace_size, + grad_logits->data(), + probs->data(), + target->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"probs", "target", "grad_logits", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_logits"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- probs: " << _attributes->probs->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- grad_logits: " << _attributes->grad_logits->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", 
atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cross_entropy_loss_backward diff --git a/src/infiniop-test/src/ops/div.cpp b/src/infiniop-test/src/ops/div.cpp new file mode 100644 index 000000000..c1f49bda6 --- /dev/null +++ b/src/infiniop-test/src/ops/div.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::div { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopDivDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateDivDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetDivWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopDiv(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopDiv( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::div diff --git a/src/infiniop-test/src/ops/equal.cpp b/src/infiniop-test/src/ops/equal.cpp new file mode 100644 index 000000000..a4c236410 --- /dev/null +++ b/src/infiniop-test/src/ops/equal.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + 
+namespace infiniop_test::equal { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopEqualDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateEqualDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetEqualWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopEqual(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopEqual( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::equal diff --git a/src/infiniop-test/src/ops/gelu.cpp b/src/infiniop-test/src/ops/gelu.cpp new file mode 100644 index 000000000..ab0bbfa5f --- /dev/null +++ b/src/infiniop-test/src/ops/gelu.cpp @@ -0,0 +1,101 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == 
tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGeluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateGeluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGeluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu diff --git a/src/infiniop-test/src/ops/gelu_backward.cpp b/src/infiniop-test/src/ops/gelu_backward.cpp new file mode 100644 index 000000000..2fc7882a1 --- /dev/null +++ b/src/infiniop-test/src/ops/gelu_backward.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gelu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, 
size_t iterations) { + infiniopGeluBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateGeluBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGeluBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGeluBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGeluBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gelu_backward diff --git a/src/infiniop-test/src/ops/logical_and.cpp b/src/infiniop-test/src/ops/logical_and.cpp new file mode 100644 index 000000000..ac65f984b --- /dev/null +++ b/src/infiniop-test/src/ops/logical_and.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_and { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalAndDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, 
device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalAndDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalAndWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalAnd(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalAnd( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_and diff --git a/src/infiniop-test/src/ops/logical_or.cpp b/src/infiniop-test/src/ops/logical_or.cpp new file mode 100644 index 000000000..ec37f1d78 --- /dev/null +++ b/src/infiniop-test/src/ops/logical_or.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::logical_or { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLogicalOrDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateLogicalOrDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLogicalOrWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, 
"Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLogicalOr(op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLogicalOr( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::logical_or diff --git a/src/infiniop-test/src/ops/relu_backward.cpp b/src/infiniop-test/src/ops/relu_backward.cpp new file mode 100644 index 000000000..38f300ab6 --- /dev/null +++ b/src/infiniop-test/src/ops/relu_backward.cpp @@ -0,0 +1,109 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::relu_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopReluBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateReluBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetReluBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + 
CHECK_OR(infiniopReluBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReluBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::relu_backward diff --git a/src/infiniop-test/src/ops/silu.cpp b/src/infiniop-test/src/ops/silu.cpp new file mode 100644 index 000000000..75684503c --- /dev/null +++ b/src/infiniop-test/src/ops/silu.cpp @@ -0,0 +1,101 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::silu { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSiluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSiluDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSiluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSilu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = 
benchmark( + [=]() { + infiniopSilu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::silu diff --git a/src/infiniop/ops/all_equal/all_equal.h b/src/infiniop/ops/all_equal/all_equal.h new file mode 100644 index 000000000..634856aae --- /dev/null +++ b/src/infiniop/ops/all_equal/all_equal.h @@ -0,0 +1,47 @@ +#ifndef __ALL_EQUAL_H +#define __ALL_EQUAL_H + +#include "../../handle.h" +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::all_equal::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + op::all_equal::AllEqualInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + AllEqualInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + std::vector input_descs); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + std::vector inputs, \ + void *stream) const; \ + }; \ + } + +#endif // __ALL_EQUAL_H diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc new file mode 100644 index 000000000..b280a6963 --- /dev/null +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.cc @@ -0,0 +1,112 @@ +#include "all_equal_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::all_equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(a_shape, b_shape); + + auto info_result = AllEqualInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + + *desc_ptr = new Descriptor(nullptr, info_result.take(), 0, handle_->device, handle_->device_id); + + return INFINI_STATUS_SUCCESS; +} + +// Perform elementwise operation when all inputs have the same type +template +void calculate_impl(const 
op::all_equal::AllEqualInfo &info, + void *output, + const std::vector &inputs) { + Tout *out = reinterpret_cast(output); + auto input_a = reinterpret_cast(inputs[0]); + auto input_b = reinterpret_cast(inputs[1]); + const ptrdiff_t input_numel = static_cast(info.getInputNumel()); + + bool all_equal = true; + +#pragma omp parallel for + for (ptrdiff_t i = 0; i < input_numel; ++i) { + auto get_input_idx = [&](size_t input_id) { + return info.getInputContiguous()[input_id] + ? i + : (info.getInputBroadcasted()[input_id] + ? op::common_cpu::indexToReducedOffset(i, info.getNdim(), info.getDefaultStrides(), info.getInputStrides(input_id)) + : op::common_cpu::indexToOffset(i, info.getNdim(), info.getInputShape(input_id), info.getInputStrides(input_id))); + }; + if constexpr (std::is_same_v || std::is_same_v) { + Tout elem = Op{}.template operator()(utils::cast(input_a[get_input_idx(0)]), utils::cast(input_b[get_input_idx(1)])); + if (elem == false) { + all_equal = false; + } + } else { + Tout elem = Op{}.template operator()(input_a[(get_input_idx(0))], input_b[get_input_idx(1)]); + if (elem == false) { + all_equal = false; + } + } + } + *out = all_equal ? true : false; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_info._dtype) { + case INFINI_DTYPE_BOOL: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I8: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I16: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I32: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_I64: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_BF16: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_F16: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_F32: + calculate_impl(_info, output, inputs); + break; + case INFINI_DTYPE_F64: + calculate_impl(_info, output, inputs); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::cpu diff --git a/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h new file mode 100644 index 000000000..7aab41a6e --- /dev/null +++ b/src/infiniop/ops/all_equal/cpu/all_equal_cpu.h @@ -0,0 +1,19 @@ +#ifndef __ALL_EQUAL_CPU_H__ +#define __ALL_EQUAL_CPU_H__ + +#include "../all_equal.h" + +DESCRIPTOR(cpu) + +namespace op::all_equal::cpu { +typedef struct AllEqualOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Tin &a, const Tin &b) const { + return a == b; + } +} AllEqualOp; +} // namespace op::all_equal::cpu + +#endif // __ALL_EQUAL_CPU_H__ diff --git a/src/infiniop/ops/all_equal/cuda/kernel.cuh b/src/infiniop/ops/all_equal/cuda/kernel.cuh new file mode 100644 index 000000000..82e5c321a --- /dev/null +++ b/src/infiniop/ops/all_equal/cuda/kernel.cuh @@ -0,0 +1,42 @@ +#ifndef __ALL_EQUAL_CUDA_H__ +#define __ALL_EQUAL_CUDA_H__ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +template +__global__ void compareKernel(size_t input_numel, size_t ndim, const bool *__restrict__ input_contiguous, const bool *__restrict__ input_broadcasted, const size_t *__restrict__ input_shapes, const ptrdiff_t *__restrict__ output_strides, const ptrdiff_t *__restrict__ input_strides, const void *const *inputs, uint8_t *flags) { + const Tdata *const a = 
reinterpret_cast(inputs)[0]; + const Tdata *const b = reinterpret_cast(inputs)[1]; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < input_numel) { + op::elementwise::nvidia::InputIndexer indexer{idx, ndim, input_contiguous, input_broadcasted, input_shapes, input_strides, output_strides}; + size_t idx_a = indexer(0); + size_t idx_b = indexer(1); + flags[idx] = (a[idx_a] != b[idx_b]) ? 1 : 0; + } +} + +template +__global__ void countKernel(uint8_t *flags, unsigned int *count, int input_numel) { + __shared__ unsigned int s_data[BLOCK_SIZE]; + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + tid; + + s_data[tid] = (idx < input_numel) ? flags[idx] : 0; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + s_data[tid] += s_data[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(count, s_data[0]); + } +} + +#endif // __ALL_EQUAL_CUDA_H__ diff --git a/src/infiniop/ops/all_equal/info.h b/src/infiniop/ops/all_equal/info.h new file mode 100644 index 000000000..847386dab --- /dev/null +++ b/src/infiniop/ops/all_equal/info.h @@ -0,0 +1,148 @@ +#ifndef __ALL_EQUAL_INFO_H__ +#define __ALL_EQUAL_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include "infinicore.h" +#include +#include +#include + +namespace op::all_equal { +struct AllEqualInfo { +private: + std::vector _meta; + size_t _input_size; + size_t _input_numel; + size_t _ndim; + + AllEqualInfo(std::vector meta, + size_t input_size, + size_t input_numel, + size_t ndim, + infiniDtype_t dtype) + : _meta(std::move(meta)), + _input_size(input_size), _input_numel(input_numel), _ndim(ndim), _dtype(dtype) {} + +public: + infiniDtype_t _dtype; + +public: + // Get the Memory size of the meta data in bytes + inline size_t getMetaMemSize() const { + return _meta.size() * sizeof(size_t); + } + inline const int8_t *getMetaStart() const { + return reinterpret_cast(_meta.data()); + } + inline size_t getInputSize() const { + return _input_size; + } + inline size_t getInputNumel() const { + return _input_numel; + } + inline size_t getNdim() const { + return _ndim; + } + inline const ptrdiff_t *getDefaultStrides() const { + return reinterpret_cast(_meta.data()); + } + inline const size_t *getAllInputShapes() const { + return reinterpret_cast(getDefaultStrides() + _ndim); + } + inline const size_t *getInputShape(const size_t &index) const { + if (index < _input_size) { + return reinterpret_cast(getAllInputShapes() + index * _ndim); + } + return nullptr; + } + inline const ptrdiff_t *getAllInputStrides() const { + return reinterpret_cast(getAllInputShapes() + _input_size * _ndim); + } + inline const ptrdiff_t *getInputStrides(const size_t &index) const { + if (index < _input_size) { + return reinterpret_cast(getAllInputStrides() + index * _ndim); + } + return nullptr; + } + inline const bool *getInputContiguous() const { + return reinterpret_cast(getAllInputStrides() + _input_size * _ndim); + } + inline const bool *getInputBroadcasted() const { + return reinterpret_cast(getInputContiguous() + _input_size); + } + + using ResultType = utils::Result; + + /** + * @brief Construct ElementwiseInfo from output and input tensor descriptors. + * @param output_desc Descriptor of the output tensor. + * @param input_descs Descriptors of the input tensors. + * @return Result with the successfully constructed ElementwiseInfo, + * or the status code. 
+ */ + static ResultType create( + infiniopTensorDescriptor_t output_desc, + std::vector input_descs) { + + if (!output_desc || input_descs.empty()) { + return INFINI_STATUS_BAD_PARAM; + } + + // Destination cannot have broadcast setup + if (output_desc->hasBroadcastDim()) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + auto input_size = input_descs.size(); + auto input_a_desc = input_descs[0]; + auto input_numel = input_a_desc->numel(); + auto ndim = input_a_desc->ndim(); + auto dtype = input_a_desc->dtype(); + + // Allocate memory for meta + auto shape_unit = input_a_desc->dim(0); + auto stride_unit = input_a_desc->stride(0); + size_t meta_mem_size = ndim * sizeof(stride_unit) + input_size * ndim * sizeof(shape_unit) + + input_size * ndim * sizeof(stride_unit) + + 2 * input_size * sizeof(bool); + std::vector meta(CEIL_DIV(meta_mem_size, sizeof(size_t))); + int8_t *meta_ptr = reinterpret_cast(meta.data()); + + std::vector default_strides(ndim); + auto default_shape = input_a_desc->shape(); + ptrdiff_t dsize = 1; + for (int i = (int)ndim - 1; i >= 0; i--) { + default_strides[i] = dsize; + dsize *= default_shape[i]; + } + + // Pointers to the sections within _meta + size_t *default_strides_p = reinterpret_cast(meta_ptr); + size_t *input_shapes = reinterpret_cast(default_strides_p + ndim); + ptrdiff_t *input_strides = reinterpret_cast(input_shapes + input_size * ndim); + bool *input_contiguous = reinterpret_cast(input_strides + input_size * ndim); + bool *input_broadcasted = input_contiguous + input_size; + + // Copy default strides + std::memcpy(default_strides_p, default_strides.data(), ndim * sizeof(*default_strides_p)); + + // Copy input shapes, strides, contiguous, and broadcasted flags + for (size_t i = 0; i < input_size; ++i) { + auto &desc = input_descs[i]; + const auto in_shape = desc->shape(); + const auto in_strides = desc->strides(); + std::memcpy(input_shapes + i * ndim, in_shape.data(), ndim * sizeof(*input_shapes)); + std::memcpy(input_strides + i * ndim, in_strides.data(), ndim * sizeof(*input_strides)); + input_contiguous[i] = desc->isContiguous(); + input_broadcasted[i] = !input_contiguous[i] && desc->hasBroadcastDim(); + } + + AllEqualInfo info(std::move(meta), input_size, input_numel, ndim, dtype); + return ResultType(std::move(info)); + } +}; + +} // namespace op::all_equal + +#endif // __ALL_EQUAL_INFO_H__ diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.h b/src/infiniop/ops/all_equal/metax/all_equal_metax.h new file mode 100644 index 000000000..82813b997 --- /dev/null +++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __ALL_EQUAL_METAX_API_H__ +#define __ALL_EQUAL_METAX_API_H__ + +#include "../all_equal.h" + +DESCRIPTOR(metax) + +#endif // __ALL_EQUAL_METAX_API_H__ diff --git a/src/infiniop/ops/all_equal/metax/all_equal_metax.maca b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca new file mode 100644 index 000000000..53d1da778 --- /dev/null +++ b/src/infiniop/ops/all_equal/metax/all_equal_metax.maca @@ -0,0 +1,213 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "kernel.cuh" +#include "all_equal_metax.h" +#include +namespace op::all_equal::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = 
input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(a_shape, b_shape); + + auto info_result = AllEqualInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto opaque_ptr = new Opaque{reinterpret_cast(handle_)->internal()}; + info_result.take(); + + *desc_ptr = new Descriptor(opaque_ptr, std::move(info), workspace_size, handle_->device, handle_->device_id); + + return INFINI_STATUS_SUCCESS; +} +template +infiniStatus_t infoToDevice( + const op::all_equal::AllEqualInfo &info, + void *workspace, + const void *const *h_inputs_arr, + const void **&d_inputs_arr, + const bool *&d_input_contiguous, + const bool *&d_input_broadcasted, + const ptrdiff_t *&d_output_strides, + const size_t *&d_input_shapes, + const ptrdiff_t *&d_input_strides, + hcStream_t stream) { + + constexpr auto input_size = N; + const auto ndim = info.getNdim(); + constexpr auto input_arr_size = N * sizeof(*h_inputs_arr); + const int8_t *info_meta_start = info.getMetaStart(); + const int8_t *d_meta_start = reinterpret_cast(workspace) + input_arr_size; + + // copy the input pointer array and meta to device + CHECK_METAX(hcMemcpyAsync(workspace, h_inputs_arr, input_arr_size, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), hcMemcpyHostToDevice, stream)); + + // offset/assign the pointers + d_inputs_arr = reinterpret_cast(workspace); + d_output_strides = reinterpret_cast(d_meta_start); + d_input_shapes = reinterpret_cast(d_output_strides + ndim); + d_input_strides = reinterpret_cast(d_input_shapes + input_size * ndim); + d_input_contiguous = reinterpret_cast(d_input_strides + input_size * ndim); + d_input_broadcasted = reinterpret_cast(d_input_contiguous + input_size); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCompareKernel(const AllEqualInfo &info, void *workspace, std::vector inputs, uint8_t *flags, void *stream) { + // Device pointers + const void **d_inputs_arr = nullptr; + const bool *d_input_contiguous = nullptr; + const bool *d_input_broadcasted = nullptr; + const ptrdiff_t *d_output_strides = nullptr; + const size_t *d_input_shapes = nullptr; + const ptrdiff_t *d_input_strides = nullptr; + + CHECK_STATUS(infoToDevice<2>(info, workspace, inputs.data(), d_inputs_arr, + d_input_contiguous, d_input_broadcasted, + d_output_strides, + d_input_shapes, d_input_strides, reinterpret_cast(stream))); + + const std::size_t input_numel = info.getInputNumel(); + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + compareKernel<<(stream)>>>(input_numel, info.getNdim(), d_input_contiguous, d_input_broadcasted, d_input_shapes, d_output_strides, d_input_strides, reinterpret_cast(d_inputs_arr), flags); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCountKernel(uint8_t *flags, unsigned int *count, std::size_t input_numel, void *stream) { + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + countKernel<<(stream)>>>(flags, count, input_numel); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t 
Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const std::size_t input_numel = _info.getInputNumel(); + std::vector flags(input_numel, 0); + uint8_t *h_flags = flags.data(); + uint8_t *d_flags = nullptr; + hcMalloc(&d_flags, input_numel * sizeof(uint8_t)); + hcMemcpy(d_flags, h_flags, input_numel * sizeof(uint8_t), hcMemcpyHostToDevice); + unsigned int h_count{}; + unsigned int *d_count; + hcMalloc(&d_count, sizeof(unsigned int)); + hcMemset(d_count, 0, sizeof(unsigned int)); + + const std::size_t BLOCK_SIZE = 256; + bool ans{false}; + switch (_info._dtype) { + case INFINI_DTYPE_BOOL: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + case INFINI_DTYPE_I8: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + case INFINI_DTYPE_I16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_BF16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + hcMemcpy(&h_count, d_count, sizeof(unsigned 
int), hcMemcpyDeviceToHost); + ans = (h_count == 0); + hcMemcpy(output, &ans, sizeof(bool), hcMemcpyHostToDevice); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + hcFree(d_count); + hcFree(d_flags); + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::metax diff --git a/src/infiniop/ops/all_equal/metax/kernel.cuh b/src/infiniop/ops/all_equal/metax/kernel.cuh new file mode 100644 index 000000000..d03d989ae --- /dev/null +++ b/src/infiniop/ops/all_equal/metax/kernel.cuh @@ -0,0 +1,42 @@ +#ifndef __ALL_EQUAL_METAX_H__ +#define __ALL_EQUAL_METAX_H__ +#include "../../../elementwise/metax/elementwise_metax.h" +#include +#include + +template +__global__ void compareKernel(size_t input_numel, size_t ndim, const bool *__restrict__ input_contiguous, const bool *__restrict__ input_broadcasted, const size_t *__restrict__ input_shapes, const ptrdiff_t *__restrict__ output_strides, const ptrdiff_t *__restrict__ input_strides, const void *const *inputs, uint8_t *flags) { + const Tdata *const a = reinterpret_cast(inputs)[0]; + const Tdata *const b = reinterpret_cast(inputs)[1]; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < input_numel) { + op::elementwise::metax::InputIndexer indexer{idx, ndim, input_contiguous, input_broadcasted, input_shapes, input_strides, output_strides}; + size_t idx_a = indexer(0); + size_t idx_b = indexer(1); + flags[idx] = (a[idx_a] != b[idx_b]) ? 1 : 0; + } +} + +template +__global__ void countKernel(uint8_t *flags, unsigned int *count, int input_numel) { + __shared__ unsigned int s_data[BLOCK_SIZE]; + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + tid; + + s_data[tid] = (idx < input_numel) ? flags[idx] : 0; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + s_data[tid] += s_data[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(count, s_data[0]); + } +} + +#endif // __ALL_EQUAL_METAX_H__ diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu new file mode 100644 index 000000000..4b3fbb72d --- /dev/null +++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cu @@ -0,0 +1,214 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "all_equal_nvidia.cuh" +#include +#include +namespace op::all_equal::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(a_shape, b_shape); + + auto info_result = AllEqualInfo::create(out_desc, input_desc_vec); + CHECK_RESULT(info_result); + auto info = info_result.take(); + auto workspace_size = info.getMetaMemSize() + info.getInputSize() * sizeof(void *); + + auto opaque_ptr = new Opaque{reinterpret_cast(handle_)->internal()}; + info_result.take(); + + *desc_ptr = new Descriptor(opaque_ptr, 
std::move(info), workspace_size, handle_->device, handle_->device_id); + + return INFINI_STATUS_SUCCESS; +} +template +infiniStatus_t infoToDevice( + const op::all_equal::AllEqualInfo &info, + void *workspace, + const void *const *h_inputs_arr, + const void **&d_inputs_arr, + const bool *&d_input_contiguous, + const bool *&d_input_broadcasted, + const ptrdiff_t *&d_output_strides, + const size_t *&d_input_shapes, + const ptrdiff_t *&d_input_strides, + cudaStream_t stream) { + + constexpr auto input_size = N; + const auto ndim = info.getNdim(); + constexpr auto input_arr_size = N * sizeof(*h_inputs_arr); + const int8_t *info_meta_start = info.getMetaStart(); + const int8_t *d_meta_start = reinterpret_cast(workspace) + input_arr_size; + + // copy the input pointer array and meta to device + CHECK_CUDA(cudaMemcpyAsync(workspace, h_inputs_arr, input_arr_size, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync((void *)d_meta_start, info_meta_start, info.getMetaMemSize(), cudaMemcpyHostToDevice, stream)); + + // offset/assign the pointers + d_inputs_arr = reinterpret_cast(workspace); + d_output_strides = reinterpret_cast(d_meta_start); + d_input_shapes = reinterpret_cast(d_output_strides + ndim); + d_input_strides = reinterpret_cast(d_input_shapes + input_size * ndim); + d_input_contiguous = reinterpret_cast(d_input_strides + input_size * ndim); + d_input_broadcasted = reinterpret_cast(d_input_contiguous + input_size); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCompareKernel(const AllEqualInfo &info, void *workspace, std::vector inputs, uint8_t *flags, void *stream) { + // Device pointers + const void **d_inputs_arr = nullptr; + const bool *d_input_contiguous = nullptr; + const bool *d_input_broadcasted = nullptr; + const ptrdiff_t *d_output_strides = nullptr; + const size_t *d_input_shapes = nullptr; + const ptrdiff_t *d_input_strides = nullptr; + + CHECK_STATUS(infoToDevice<2>(info, workspace, inputs.data(), d_inputs_arr, + d_input_contiguous, d_input_broadcasted, + d_output_strides, + d_input_shapes, d_input_strides, reinterpret_cast(stream))); + + const std::size_t input_numel = info.getInputNumel(); + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + compareKernel<<(stream)>>>(input_numel, info.getNdim(), d_input_contiguous, d_input_broadcasted, d_input_shapes, d_output_strides, d_input_strides, reinterpret_cast(d_inputs_arr), flags); + + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchCountKernel(uint8_t *flags, unsigned int *count, std::size_t input_numel, void *stream) { + const std::size_t GRID_SIZE = (input_numel + BLOCK_SIZE - 1) / BLOCK_SIZE; + countKernel<<(stream)>>>(flags, count, input_numel); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const std::size_t input_numel = _info.getInputNumel(); + std::vector flags(input_numel, 0); + uint8_t *h_flags = flags.data(); + uint8_t *d_flags = nullptr; + cudaMalloc(&d_flags, input_numel * sizeof(uint8_t)); + cudaMemcpy(d_flags, h_flags, input_numel * sizeof(uint8_t), cudaMemcpyHostToDevice); + unsigned int h_count{}; + unsigned int *d_count; + cudaMalloc(&d_count, sizeof(unsigned int)); + cudaMemset(d_count, 0, sizeof(unsigned int)); + + const std::size_t BLOCK_SIZE = 256; + bool ans{false}; + switch (_info._dtype) { + case 
INFINI_DTYPE_BOOL: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + case INFINI_DTYPE_I8: + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + case INFINI_DTYPE_I16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_I64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_BF16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F16: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F32: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + case INFINI_DTYPE_F64: { + launchCompareKernel(_info, workspace, inputs, d_flags, stream); + launchCountKernel(d_flags, d_count, input_numel, stream); + cudaMemcpy(&h_count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost); + ans = (h_count == 0); + cudaMemcpy(output, &ans, sizeof(bool), cudaMemcpyHostToDevice); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + cudaFree(d_count); + cudaFree(d_flags); + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::all_equal::nvidia diff --git a/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh new file mode 100644 index 000000000..ca7018b9b --- /dev/null +++ b/src/infiniop/ops/all_equal/nvidia/all_equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ALL_EQUAL_CUDA_API_H__ +#define __ALL_EQUAL_CUDA_API_H__ + +#include "../all_equal.h" + +DESCRIPTOR(nvidia) 
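+
+// Minimal usage sketch for the exported all_equal C API. It assumes a valid
+// `handle`, tensor descriptors `out_desc`/`a_desc`/`b_desc`, device buffers
+// `d_out`/`d_a`/`d_b`, a `stream`, and a `device_malloc` helper; none of these
+// are part of this patch.
+//
+//     infiniopAllEqualDescriptor_t desc;
+//     infiniopCreateAllEqualDescriptor(handle, &desc, out_desc, a_desc, b_desc);
+//     size_t workspace_size = 0;
+//     infiniopGetAllEqualWorkspaceSize(desc, &workspace_size);
+//     void *workspace = device_malloc(workspace_size);
+//     infiniopAllEqual(desc, workspace, workspace_size, d_out, d_a, d_b, stream);
+//     infiniopDestroyAllEqualDescriptor(desc);
+//
+// `d_out` receives a single bool on the device: true when every element of `a`
+// equals the corresponding element of `b`, false otherwise.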
+ +#endif // __ALL_EQUAL_CUDA_API_H__ diff --git a/src/infiniop/ops/all_equal/operator.cc b/src/infiniop/ops/all_equal/operator.cc new file mode 100644 index 000000000..f4f91b981 --- /dev/null +++ b/src/infiniop/ops/all_equal/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/all_equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/all_equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/all_equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/all_equal_metax.h" +#endif + +__C infiniStatus_t infiniopCreateAllEqualDescriptor( + infiniopHandle_t handle, + infiniopAllEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::all_equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAllEqualWorkspaceSize(infiniopAllEqualDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopAllEqual( + infiniopAllEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAllEqualDescriptor(infiniopAllEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git 
a/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward.cc b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward.cc new file mode 100644 index 000000000..9bab4c14a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward.cc @@ -0,0 +1,62 @@ +#include "cross_entropy_loss_backward_cpu.h" +#include + +namespace op::cross_entropy_loss_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &probs_desc = input_desc_vec.at(0); + const auto &target_desc = input_desc_vec.at(1); + const auto &grad_logits_shape = out_desc->shape(); + const auto &probs_shape = probs_desc->shape(); + const auto &target_shape = target_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_logits_shape, probs_shape, target_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + const size_t *output_shape = _info.getOutputShape(); + const size_t dim = _info.getNdim(); + size_t N{1}; + for (size_t i = 0; i < dim - 1; ++i) { + N *= output_shape[i]; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream, N); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream, N); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cross_entropy_loss_backward::cpu diff --git a/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward_cpu.h b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward_cpu.h new file mode 100644 index 000000000..11972075f --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/cpu/cross_entropy_loss_backward_cpu.h @@ -0,0 +1,19 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_CPU_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(cross_entropy_loss_backward, cpu) + +namespace op::cross_entropy_loss_backward::cpu { +typedef struct CrossEntropyLossBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b, const size_t N) const { + return (a - b) / static_cast(N); + } +} CrossEntropyLossBackwardOp; +} // namespace op::cross_entropy_loss_backward::cpu + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy_loss_backward/cuda/kernel.cuh new file mode 100644 index 000000000..169889e67 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/cuda/kernel.cuh @@ -0,0 +1,27 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_H__ + +namespace op::cross_entropy_loss_backward::cuda { 
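+
+// The functor below implements the softmax-cross-entropy gradient with mean
+// reduction: given probabilities p (softmax output) and targets y,
+//     dL/dlogits = (p - y) / N,
+// where N is the product of all output dimensions except the last (i.e. the
+// number of rows reduced over), as computed in the backends' calculate().
+// The half2 branch applies the same formula to two packed fp16 values at once.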
+typedef struct CrossEntropyLossBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const size_t N) const { + float f_N = static_cast(N); + if constexpr (std::is_same_v) { + half2 h2_N = __float2half2_rn(f_N); + return __h2div(__hsub2(a, b), h2_N); + } else if constexpr (std::is_same_v) { + return __hdiv(__hsub(a, b), __float2bfloat16(f_N)); + } else if constexpr (std::is_same_v) { + return __hdiv(__hsub(a, b), __float2half(f_N)); + } else if constexpr (std::is_same_v) { + return __fdiv_rn(__fsub_rn(a, b), f_N); + } else { + return (a - b) / static_cast(N); + } + } +} CrossEntropyLossBackwardOp; +} // namespace op::cross_entropy_loss_backward::cuda + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.h b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.h new file mode 100644 index 000000000..d5821c81a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_METAX_API_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(cross_entropy_loss_backward, metax) + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.maca b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.maca new file mode 100644 index 000000000..b30b13404 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/metax/cross_entropy_loss_backward_metax.maca @@ -0,0 +1,69 @@ +#include "cross_entropy_loss_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cross_entropy_loss_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &probs_desc = input_desc_vec.at(0); + const auto &target_desc = input_desc_vec.at(1); + const auto &grad_logits_shape = out_desc->shape(); + const auto &probs_shape = probs_desc->shape(); + const auto &target_shape = target_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_logits_shape, probs_shape, target_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const size_t *output_shape = _info.getOutputShape(); + const size_t dim = _info.getNdim(); + size_t N{1}; + for (size_t i = 0; i < dim - 1; ++i) { + N *= output_shape[i]; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, 
cuda::CrossEntropyLossBackwardOp, half>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, float>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, double>(_info, workspace, output, inputs, stream, std::move(N)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cross_entropy_loss_backward::metax diff --git a/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cu b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cu new file mode 100644 index 000000000..d6aab249c --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cu @@ -0,0 +1,71 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cross_entropy_loss_backward_nvidia.cuh" +#include +#include +#include + +namespace op::cross_entropy_loss_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &probs_desc = input_desc_vec.at(0); + const auto &target_desc = input_desc_vec.at(1); + const auto &grad_logits_shape = out_desc->shape(); + const auto &probs_shape = probs_desc->shape(); + const auto &target_shape = target_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_logits_shape, probs_shape, target_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + const size_t *output_shape = _info.getOutputShape(); + const size_t dim = _info.getNdim(); + size_t N{1}; + for (size_t i = 0; i < dim - 1; ++i) { + N *= output_shape[i]; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, half>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, float>(_info, workspace, output, inputs, stream, std::move(N)); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CrossEntropyLossBackwardOp, double>(_info, workspace, output, inputs, stream, std::move(N)); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cross_entropy_loss_backward::nvidia diff --git a/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cuh new file mode 100644 index 000000000..8f8a7a2be --- /dev/null +++ 
b/src/infiniop/ops/cross_entropy_loss_backward/nvidia/cross_entropy_loss_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_API_H__ +#define __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cross_entropy_loss_backward, nvidia) + +#endif // __CROSS_ENTROPY_LOSS_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/cross_entropy_loss_backward/operator.cc b/src/infiniop/ops/cross_entropy_loss_backward/operator.cc new file mode 100644 index 000000000..aba99f46c --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy_loss_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_loss_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cross_entropy_loss_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_loss_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossBackwardDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossBackwardWorkspaceSize(infiniopCrossEntropyLossBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCrossEntropyLossBackward( + infiniopCrossEntropyLossBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef 
CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCrossEntropyLossBackwardDescriptor(infiniopCrossEntropyLossBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/div/cpu/div_cpu.cc b/src/infiniop/ops/div/cpu/div_cpu.cc new file mode 100644 index 000000000..6b5edef36 --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.cc @@ -0,0 +1,54 @@ +#include "div_cpu.h" + +namespace op::div::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::cpu diff --git a/src/infiniop/ops/div/cpu/div_cpu.h b/src/infiniop/ops/div/cpu/div_cpu.h new file mode 100644 index 000000000..0373b766f --- /dev/null +++ b/src/infiniop/ops/div/cpu/div_cpu.h @@ -0,0 +1,19 @@ +#ifndef __DIV_CPU_H__ +#define __DIV_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(div, cpu) + +namespace op::div::cpu { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + return a / b; + } +} DivOp; +} // namespace op::div::cpu + +#endif // __DIV_CPU_H__ diff --git a/src/infiniop/ops/div/cuda/kernel.cuh b/src/infiniop/ops/div/cuda/kernel.cuh new file mode 100644 index 000000000..cefbf0141 --- /dev/null +++ b/src/infiniop/ops/div/cuda/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __DIV_CUDA_H__ +#define __DIV_CUDA_H__ + +namespace op::div::cuda { +typedef struct DivOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + if constexpr (std::is_same_v) { + return __h2div(a, b); + } else if constexpr (std::is_same_v || std::is_same_v) { + return __hdiv(a, 
b);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __fdiv_rn(a, b);
+        } else {
+            return a / b;
+        }
+    }
+} DivOp;
+} // namespace op::div::cuda
+
+#endif // __DIV_CUDA_H__
diff --git a/src/infiniop/ops/div/metax/div_metax.h b/src/infiniop/ops/div/metax/div_metax.h
new file mode 100644
index 000000000..1e56a7d44
--- /dev/null
+++ b/src/infiniop/ops/div/metax/div_metax.h
@@ -0,0 +1,8 @@
+#ifndef __DIV_METAX_API_H__
+#define __DIV_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(div, metax)
+
+#endif // __DIV_METAX_API_H__
diff --git a/src/infiniop/ops/div/metax/div_metax.maca b/src/infiniop/ops/div/metax/div_metax.maca
new file mode 100644
index 000000000..a8ecd8643
--- /dev/null
+++ b/src/infiniop/ops/div/metax/div_metax.maca
@@ -0,0 +1,62 @@
+#include "div_metax.h"
+
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "../cuda/kernel.cuh"
+
+namespace op::div::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create METAX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::DivOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::div::metax
diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cu b/src/infiniop/ops/div/nvidia/div_nvidia.cu
new file mode 100644
index 000000000..4cb64af63
--- /dev/null
+++ b/src/infiniop/ops/div/nvidia/div_nvidia.cu
@@ -0,0 +1,61 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "div_nvidia.cuh"
+
+namespace op::div::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
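+    // Inputs must match the output shape exactly; none of the binary
+    // elementwise ops in this patch broadcast (see CHECK_SAME_SHAPE below).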
CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::DivOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::DivOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::DivOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::DivOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::div::nvidia diff --git a/src/infiniop/ops/div/nvidia/div_nvidia.cuh b/src/infiniop/ops/div/nvidia/div_nvidia.cuh new file mode 100644 index 000000000..1ad8af94e --- /dev/null +++ b/src/infiniop/ops/div/nvidia/div_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __DIV_CUDA_API_H__ +#define __DIV_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(div, nvidia) + +#endif // __DIV_CUDA_API_H__ diff --git a/src/infiniop/ops/div/operator.cc b/src/infiniop/ops/div/operator.cc new file mode 100644 index 000000000..162156887 --- /dev/null +++ b/src/infiniop/ops/div/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/div.h" + +#ifdef ENABLE_CPU_API +#include "cpu/div_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/div_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/div_metax.h" +#endif + +__C infiniStatus_t infiniopCreateDivDescriptor( + infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::div::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetDivWorkspaceSize(infiniopDivDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDiv( + infiniopDivDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.cc b/src/infiniop/ops/equal/cpu/equal_cpu.cc new file mode 100644 index 000000000..aea021ed1 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.cc @@ -0,0 +1,66 @@ +#include "equal_cpu.h" +#include "infinicore.h" + +namespace op::equal::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + 
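+        // Note: comparisons are exact (operator==). Half-precision inputs are
+        // cast to float inside EqualOp before comparing; no tolerance is applied.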
case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::cpu diff --git a/src/infiniop/ops/equal/cpu/equal_cpu.h b/src/infiniop/ops/equal/cpu/equal_cpu.h new file mode 100644 index 000000000..c09a276d7 --- /dev/null +++ b/src/infiniop/ops/equal/cpu/equal_cpu.h @@ -0,0 +1,29 @@ +#ifndef __EQUAL_CPU_H__ +#define __EQUAL_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(equal, cpu) + +namespace op::equal::cpu { +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a == f_b; + } else { + return a == b; + } + } +} EqualOp; +} // namespace op::equal::cpu + +#endif // __EQUAL_CPU_H__ diff --git a/src/infiniop/ops/equal/cuda/kernel.cuh b/src/infiniop/ops/equal/cuda/kernel.cuh new file mode 100644 index 000000000..636913b26 --- /dev/null +++ b/src/infiniop/ops/equal/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __EQUAL_CUDA_H__ +#define __EQUAL_CUDA_H__ + +namespace op::equal::cuda { +typedef struct EqualOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a == b; + } +} EqualOp; +} // namespace op::equal::cuda + +#endif // __EQUAL_CUDA_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.h b/src/infiniop/ops/equal/metax/equal_metax.h new file mode 100644 index 000000000..6e4cd64b9 --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.h @@ -0,0 +1,8 @@ +#ifndef __EQUAL_METAX_API_H__ +#define __EQUAL_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(equal, metax) + +#endif // __EQUAL_METAX_API_H__ diff --git a/src/infiniop/ops/equal/metax/equal_metax.maca b/src/infiniop/ops/equal/metax/equal_metax.maca new file mode 100644 index 000000000..7629cf6aa --- /dev/null +++ b/src/infiniop/ops/equal/metax/equal_metax.maca @@ -0,0 +1,73 @@ +#include "equal_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::equal::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + 
void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::EqualOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::EqualOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::EqualOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::metax diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cu b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu new file mode 100644 index 000000000..6e8f7444c --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "equal_nvidia.cuh" +#include "infinicore.h" + +namespace op::equal::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::EqualOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::EqualOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::EqualOp, bool, int16_t, 
int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::EqualOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::EqualOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::EqualOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::EqualOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::EqualOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::EqualOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::equal::nvidia diff --git a/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh new file mode 100644 index 000000000..361e54b02 --- /dev/null +++ b/src/infiniop/ops/equal/nvidia/equal_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EQUAL_CUDA_API_H__ +#define __EQUAL_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(equal, nvidia) + +#endif // __EQUAL_CUDA_API_H__ diff --git a/src/infiniop/ops/equal/operator.cc b/src/infiniop/ops/equal/operator.cc new file mode 100644 index 000000000..2c46c28cd --- /dev/null +++ b/src/infiniop/ops/equal/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/equal.h" + +#ifdef ENABLE_CPU_API +#include "cpu/equal_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/equal_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/equal_metax.h" +#endif + +__C infiniStatus_t infiniopCreateEqualDescriptor( + infiniopHandle_t handle, + infiniopEqualDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::equal::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetEqualWorkspaceSize(infiniopEqualDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t 
infiniopEqual( + infiniopEqualDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyEqualDescriptor(infiniopEqualDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.cc b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc new file mode 100644 index 000000000..a057ca4bc --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.cc @@ -0,0 +1,52 @@ +#include "gelu_cpu.h" + +namespace op::gelu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::cpu diff --git a/src/infiniop/ops/gelu/cpu/gelu_cpu.h b/src/infiniop/ops/gelu/cpu/gelu_cpu.h new file mode 100644 index 000000000..9c8713ef3 --- /dev/null +++ b/src/infiniop/ops/gelu/cpu/gelu_cpu.h @@ -0,0 +1,27 @@ +#ifndef __GELU_CPU_H__ +#define __GELU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(gelu, cpu) + +#include + +namespace op::gelu::cpu { +typedef struct GeluOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + constexpr double Alpha = 
0.7978845608028654; + constexpr double Beta = 0.044715; + double inner = x + Beta * x * x * x; + double tanh_term = std::tanh(Alpha * inner); + return static_cast<T>(0.5 * x * (1.0 + tanh_term)); + } +} GeluOp; + +} // namespace op::gelu::cpu + +#endif // __GELU_CPU_H__
diff --git a/src/infiniop/ops/gelu/cuda/kernel.cuh b/src/infiniop/ops/gelu/cuda/kernel.cuh new file mode 100644 index 000000000..6673662f5 --- /dev/null +++ b/src/infiniop/ops/gelu/cuda/kernel.cuh @@ -0,0 +1,61 @@ +#ifndef __GELU_CUDA_H__ +#define __GELU_CUDA_H__ + +#include + +namespace op::gelu::cuda { + +typedef struct GeluOp { +public: + static constexpr size_t num_inputs = 1; + template <typename T> + __device__ __forceinline__ T operator()(const T &x) const { + constexpr float Alpha = 0.7978845608; + constexpr float Beta = 0.044715; + + if constexpr (std::is_same_v<T, half2>) { + const half2 alpha = __float2half2_rn(Alpha); + const half2 beta = __float2half2_rn(Beta); + const half2 one = __float2half2_rn(1.0f); + const half2 half_val = __float2half2_rn(0.5f); + + half2 x_cubed = __hmul2(x, __hmul2(x, x)); // x³ + half2 inner = __hfma2(beta, x_cubed, x); // x + βx³ + half2 tanh_in = __hmul2(alpha, inner); // α(x + βx³) + + // vectorized tanh approximation (avoids unpacking the half2) + float2 f_val = __half22float2(tanh_in); + f_val.x = tanhf(f_val.x); + f_val.y = tanhf(f_val.y); + half2 tanh_val = __float22half2_rn(f_val); + + return __hmul2(__hmul2(half_val, x), __hadd2(one, tanh_val)); // 0.5*x*(1+tanh) + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + float x_f = __bfloat162float(x); + float result = 0.5f * x_f * (1.0f + tanhf(Alpha * (x_f + Beta * x_f * x_f * x_f))); + + return __float2bfloat16(result); + } else if constexpr (std::is_same_v<T, half>) { + float x_f = __half2float(x); + float result = 0.5f * x_f * (1.0f + tanhf(Alpha * (x_f + Beta * x_f * x_f * x_f))); + + return __float2half(result); + } else if constexpr (std::is_same_v<T, float>) { + float x_cubed = x * x * x; + float inner = x + Beta * x_cubed; + float tanh_val = tanhf(Alpha * inner); + + return 0.5f * x * (1.0f + tanh_val); + } else { + double x_cubed = x * x * x; + double inner = x + static_cast<double>(Beta) * x_cubed; + double tanh_val = tanh(static_cast<double>(Alpha) * inner); + + return 0.5 * x * (1 + tanh_val); + } + } +} GeluOp; + +} // namespace op::gelu::cuda + +#endif // __GELU_CUDA_H__
diff --git a/src/infiniop/ops/gelu/metax/gelu_meta.maca b/src/infiniop/ops/gelu/metax/gelu_meta.maca new file mode 100644 index 000000000..3a311530a --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_meta.maca @@ -0,0 +1,60 @@ +#include "gelu_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::gelu::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if
(workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::metax diff --git a/src/infiniop/ops/gelu/metax/gelu_metax.h b/src/infiniop/ops/gelu/metax/gelu_metax.h new file mode 100644 index 000000000..9385b7a27 --- /dev/null +++ b/src/infiniop/ops/gelu/metax/gelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_METAX_API_H__ +#define __GELU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu, metax) + +#endif // __GELU_METAX_API_H__ diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu new file mode 100644 index 000000000..4d42cf2df --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_nvidia.cuh" + +namespace op::gelu::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu::nvidia diff --git a/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh new file mode 100644 index 000000000..72dbbd4f0 --- /dev/null +++ b/src/infiniop/ops/gelu/nvidia/gelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_CUDA_API_H__ +#define __GELU_CUDA_API_H__ + +#include 
"../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu, nvidia) + +#endif // __GELU_CUDA_API_H__ diff --git a/src/infiniop/ops/gelu/operator.cc b/src/infiniop/ops/gelu/operator.cc new file mode 100644 index 000000000..115a1c2fd --- /dev/null +++ b/src/infiniop/ops/gelu/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluDescriptor( + infiniopHandle_t handle, + infiniopGeluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluWorkspaceSize(infiniopGeluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGelu( + infiniopGeluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGeluDescriptor(infiniopGeluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc 
b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc new file mode 100644 index 000000000..f9e6ca660 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "gelu_backward_cpu.h" + +namespace op::gelu_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::cpu
diff --git a/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h new file mode 100644 index 000000000..b008ef101 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cpu/gelu_backward_cpu.h @@ -0,0 +1,38 @@ +#ifndef __GELU_BACKWARD_CPU_H__ +#define __GELU_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, cpu) + +namespace op::gelu_backward::cpu { +typedef struct GeluBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template <typename T> + T operator()(const T &a, const T &b) const { + constexpr double alpha = 0.7978845608028654; + constexpr double beta = 0.044714998453855515; + + // intermediate variable u = α(x + βx³) + const double x_cubed = a * a * a; + const double u = alpha * (a + beta * x_cubed); + + // tanh(u) and its derivative sech²(u) = 1 - tanh²(u) + const double tanh_u = std::tanh(u); + const double sech2_u = 1.0 - tanh_u * tanh_u; + + // du/dx = α(1 + 3βx²) + const double du_dx = alpha * (1.0 + 3.0 * beta * a * a); + + // derivative of GELU, dy/dx + const double dy_dx = 0.5 * (1.0 + tanh_u) + 0.5 * a * sech2_u * du_dx; + + // chain rule: dL/dx = dL/dy * dy/dx + const double ans = static_cast<double>(b) * dy_dx; + return static_cast<T>(ans); + } +} GeluBackwardOp; +} // namespace op::gelu_backward::cpu + +#endif // __GELU_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/gelu_backward/cuda/kernel.cuh b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh new file mode 100644 index 000000000..6dfbd7fbb --- /dev/null +++ b/src/infiniop/ops/gelu_backward/cuda/kernel.cuh @@ -0,0 +1,88 @@ +#ifndef __GELU_BACKWARD_CUDA_H__ +#define __GELU_BACKWARD_CUDA_H__ + +namespace op::gelu_backward::cuda { +typedef struct GeluBackwardOp { +public: + static
constexpr size_t num_inputs = 2; + template <typename T> + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + constexpr float alpha = 0.7978845608028654f; + constexpr float beta = 0.044715f; + constexpr float beta3 = 3.0f * beta; + + if constexpr (std::is_same_v<T, half2>) { + // vectorized half2 path + float2 x_f = __half22float2(a); + float2 grad_output_f = __half22float2(b); + + float2 u = { + alpha * (x_f.x + beta * x_f.x * x_f.x * x_f.x), + alpha * (x_f.y + beta * x_f.y * x_f.y * x_f.y)}; + // tanh and sech² per lane + float tanh_u_x = tanhf(u.x); + float tanh_u_y = tanhf(u.y); + float sech2_u_x = 1.0f - tanh_u_x * tanh_u_x; + float sech2_u_y = 1.0f - tanh_u_y * tanh_u_y; + // derivative components per lane + float du_dx_x = alpha * (1.0f + beta3 * x_f.x * x_f.x); + float du_dx_y = alpha * (1.0f + beta3 * x_f.y * x_f.y); + float dy_dx_x = 0.5f * (1.0f + tanh_u_x) + 0.5f * x_f.x * sech2_u_x * du_dx_x; + float dy_dx_y = 0.5f * (1.0f + tanh_u_y) + 0.5f * x_f.y * sech2_u_y * du_dx_y; + + float2 grad_input_f = { + grad_output_f.x * dy_dx_x, + grad_output_f.y * dy_dx_y}; + return __float22half2_rn(grad_input_f); + } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { + // BF16 path + float x_f = __bfloat162float(a); + float grad_output_f = __bfloat162float(b); + + float u = alpha * (x_f + beta * x_f * x_f * x_f); + float tanh_u = tanhf(u); + float sech2_u = 1.0f - tanh_u * tanh_u; + float du_dx = alpha * (1.0f + beta3 * x_f * x_f); + float dy_dx = 0.5f * (1.0f + tanh_u) + 0.5f * x_f * sech2_u * du_dx; + float ans = __fmul_rn(grad_output_f, dy_dx); + + return __float2bfloat16(ans); + } else if constexpr (std::is_same_v<T, half>) { + // FP16 path + float x_f = __half2float(a); + float grad_output_f = __half2float(b); + + float u = alpha * (x_f + beta * x_f * x_f * x_f); + float tanh_u = tanhf(u); + float sech2_u = 1.0f - tanh_u * tanh_u; + float du_dx = alpha * (1.0f + beta3 * x_f * x_f); + float dy_dx = 0.5f * (1.0f + tanh_u) + 0.5f * x_f * sech2_u * du_dx; + float ans = __fmul_rn(grad_output_f, dy_dx); + + return __float2half(ans); + } else if constexpr (std::is_same_v<T, float>) { + // FP32 path + float x = a; + float u = alpha * (x + beta * x * x * x); + float tanh_u = tanhf(u); + float sech2_u = 1.0f - tanh_u * tanh_u; + float du_dx = alpha * (1.0f + beta3 * x * x); + float dy_dx = 0.5f * (1.0f + tanh_u) + 0.5f * x * sech2_u * du_dx; + return __fmul_rn(b, dy_dx); + } else { + // FP64 (or other) path + constexpr double alpha_d = 0.7978845608028654; + constexpr double beta_d = 0.044715; + double x = a; + double u = alpha_d * (x + beta_d * x * x * x); + double tanh_u = tanh(u); + double sech2_u = 1.0 - tanh_u * tanh_u; + double du_dx = alpha_d * (1.0 + 3.0 * beta_d * x * x); + double dy_dx = 0.5 * (1.0 + tanh_u) + 0.5 * x * sech2_u * du_dx; + return static_cast<T>(b * dy_dx); + } + } +} GeluBackwardOp; +} // namespace op::gelu_backward::cuda + +#endif // __GELU_BACKWARD_CUDA_H__
diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h new file mode 100644 index 000000000..b4213e977 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_METAX_API_H__ +#define __GELU_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, metax) + +#endif // __GELU_BACKWARD_METAX_API_H__
diff --git a/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca new file mode 100644 index 000000000..a79004e4c --- /dev/null +++
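[Review note, not part of the patch] Every branch of GeluBackwardOp above, CPU and CUDA alike, evaluates the same derivative of the tanh-approximated GELU; stated compactly, using the constants from the code:

    y = \tfrac{1}{2} x \bigl(1 + \tanh u\bigr), \qquad u = \alpha \bigl(x + \beta x^{3}\bigr), \quad \alpha = \sqrt{2/\pi} \approx 0.7978845608, \quad \beta = 0.044715
    \frac{dy}{dx} = \tfrac{1}{2}\bigl(1 + \tanh u\bigr) + \tfrac{1}{2}\, x \,\mathrm{sech}^{2}(u)\, \frac{du}{dx}, \qquad \mathrm{sech}^{2}(u) = 1 - \tanh^{2} u, \qquad \frac{du}{dx} = \alpha \bigl(1 + 3\beta x^{2}\bigr)
    \text{grad\_input} = \text{grad\_output} \cdot \frac{dy}{dx}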
b/src/infiniop/ops/gelu_backward/metax/gelu_backward_metax.maca @@ -0,0 +1,62 @@ +#include "gelu_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::gelu_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::metax diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu new file mode 100644 index 000000000..2ff128567 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "gelu_backward_nvidia.cuh" + +namespace op::gelu_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { 
+ + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::GeluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::GeluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::GeluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::GeluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gelu_backward::nvidia diff --git a/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh new file mode 100644 index 000000000..82e6c2ae8 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/nvidia/gelu_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __GELU_BACKWARD_CUDA_API_H__ +#define __GELU_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(gelu_backward, nvidia) + +#endif // __GELU_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/gelu_backward/operator.cc b/src/infiniop/ops/gelu_backward/operator.cc new file mode 100644 index 000000000..b74d71510 --- /dev/null +++ b/src/infiniop/ops/gelu_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gelu_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gelu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gelu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gelu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGeluBackwardDescriptor( + infiniopHandle_t handle, + infiniopGeluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gelu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGeluBackwardWorkspaceSize(infiniopGeluBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGeluBackward( + 
infiniopGeluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGeluBackwardDescriptor(infiniopGeluBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc new file mode 100644 index 000000000..c81f0a539 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_and_cpu.h" +#include "infinicore.h" + +namespace op::logical_and::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + 
return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::cpu diff --git a/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h new file mode 100644 index 000000000..701960bd5 --- /dev/null +++ b/src/infiniop/ops/logical_and/cpu/logical_and_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_AND_CPU_H__ +#define __LOGICAL_AND_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, cpu) + +namespace op::logical_and::cpu { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a && f_b; + } else { + return a && b; + } + } +} LogicalAndOp; +} // namespace op::logical_and::cpu + +#endif // __LOGICAL_AND_CPU_H__ diff --git a/src/infiniop/ops/logical_and/cuda/kernel.cuh b/src/infiniop/ops/logical_and/cuda/kernel.cuh new file mode 100644 index 000000000..0b763d951 --- /dev/null +++ b/src/infiniop/ops/logical_and/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_AND_CUDA_H__ +#define __LOGICAL_AND_CUDA_H__ + +namespace op::logical_and::cuda { +typedef struct LogicalAndOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a && b; + } +} LogicalAndOp; +} // namespace op::logical_and::cuda + +#endif // __LOGICAL_AND_CUDA_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.h b/src/infiniop/ops/logical_and/metax/logical_and_metax.h new file mode 100644 index 000000000..696697322 --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_METAX_API_H__ +#define __LOGICAL_AND_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_and, metax) + +#endif // __LOGICAL_AND_METAX_API_H__ diff --git a/src/infiniop/ops/logical_and/metax/logical_and_metax.maca b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca new file mode 100644 index 000000000..68e3f0ddb --- /dev/null +++ b/src/infiniop/ops/logical_and/metax/logical_and_metax.maca @@ -0,0 +1,73 @@ +#include "logical_and_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_and::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create 
METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::metax diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu new file mode 100644 index 000000000..7f0680a57 --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_and_nvidia.cuh" + +namespace op::logical_and::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return 
_device_info->calculate<256, cuda::LogicalAndOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalAndOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_and::nvidia diff --git a/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh new file mode 100644 index 000000000..9d68754bf --- /dev/null +++ b/src/infiniop/ops/logical_and/nvidia/logical_and_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_AND_CUDA_API_H__ +#define __LOGICAL_AND_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_and, nvidia) + +#endif // __LOGICAL_AND_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_and/operator.cc b/src/infiniop/ops/logical_and/operator.cc new file mode 100644 index 000000000..a029c3678 --- /dev/null +++ b/src/infiniop/ops/logical_and/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_and.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_and_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_and_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_and_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalAndDescriptor( + infiniopHandle_t handle, + infiniopLogicalAndDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_and::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalAndWorkspaceSize(infiniopLogicalAndDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalAnd( + infiniopLogicalAndDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalAndDescriptor(infiniopLogicalAndDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc new file mode 100644 index 000000000..1324c98f1 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.cc @@ -0,0 +1,66 @@ +#include "logical_or_cpu.h" +#include "infinicore.h" + +namespace op::logical_or::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, 
inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::cpu diff --git a/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h new file mode 100644 index 000000000..7c26c4d37 --- /dev/null +++ b/src/infiniop/ops/logical_or/cpu/logical_or_cpu.h @@ -0,0 +1,29 @@ +#ifndef __LOGICAL_OR_CPU_H__ +#define __LOGICAL_OR_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, cpu) + +namespace op::logical_or::cpu { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + if constexpr (std::is_same_v || std::is_same_v) { + float f_a = utils::cast(a); + float f_b = utils::cast(b); + return f_a || f_b; + } else { + return a || b; + } + } +} LogicalOrOp; +} // namespace op::logical_or::cpu + +#endif // __LOGICAL_OR_CPU_H__ diff --git a/src/infiniop/ops/logical_or/cuda/kernel.cuh b/src/infiniop/ops/logical_or/cuda/kernel.cuh new file mode 100644 index 000000000..3c705428e --- /dev/null +++ b/src/infiniop/ops/logical_or/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __LOGICAL_OR_CUDA_H__ +#define __LOGICAL_OR_CUDA_H__ + +namespace op::logical_or::cuda { +typedef struct LogicalOrOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ Tout operator()(const Ta &a, const Tb &b) const { + if constexpr (!std::is_same_v) { + printf("Ta and Tb must be the same type!\n"); + std::abort(); + } + return a || b; + } +} LogicalOrOp; +} // namespace op::logical_or::cuda + +#endif // __LOGICAL_OR_CUDA_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.h b/src/infiniop/ops/logical_or/metax/logical_or_metax.h new file mode 100644 index 000000000..e530d9ed5 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.h @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_METAX_API_H__ +#define __LOGICAL_OR_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(logical_or, metax) + +#endif // __LOGICAL_OR_METAX_API_H__ diff --git a/src/infiniop/ops/logical_or/metax/logical_or_metax.maca b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca new file mode 100644 index 000000000..17d1f8ed0 --- /dev/null +++ b/src/infiniop/ops/logical_or/metax/logical_or_metax.maca @@ -0,0 +1,73 @@ +#include "logical_or_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::logical_or::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + 
const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::metax diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu new file mode 100644 index 000000000..151079f07 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cu @@ -0,0 +1,73 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "infinicore.h" +#include "logical_or_nvidia.cuh" + +namespace op::logical_or::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + + auto dtype = a_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BOOL, INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape); + + // create CUDA elementwise 
descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, bool, bool>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int8_t, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int16_t, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int32_t, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, int64_t, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, cuda_bfloat16, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, half, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, float, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::LogicalOrOp, bool, double, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::logical_or::nvidia diff --git a/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh new file mode 100644 index 000000000..a70bd8da7 --- /dev/null +++ b/src/infiniop/ops/logical_or/nvidia/logical_or_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LOGICAL_OR_CUDA_API_H__ +#define __LOGICAL_OR_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(logical_or, nvidia) + +#endif // __LOGICAL_OR_CUDA_API_H__ diff --git a/src/infiniop/ops/logical_or/operator.cc b/src/infiniop/ops/logical_or/operator.cc new file mode 100644 index 000000000..8f258e3e0 --- /dev/null +++ b/src/infiniop/ops/logical_or/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/logical_or.h" + +#ifdef ENABLE_CPU_API +#include "cpu/logical_or_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/logical_or_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/logical_or_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLogicalOrDescriptor( + infiniopHandle_t handle, + infiniopLogicalOrDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::logical_or::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLogicalOrWorkspaceSize(infiniopLogicalOrDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLogicalOr( + infiniopLogicalOrDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLogicalOrDescriptor(infiniopLogicalOrDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc new file mode 100644 index 000000000..1022657c5 --- /dev/null +++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "relu_backward_cpu.h" + +namespace op::relu_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t 
workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::relu_backward::cpu diff --git a/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h new file mode 100644 index 000000000..84e232184 --- /dev/null +++ b/src/infiniop/ops/relu_backward/cpu/relu_backward_cpu.h @@ -0,0 +1,24 @@ +#ifndef __RELU_BACKWARD_CPU_H__ +#define __RELU_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(relu_backward, cpu) + +namespace op::relu_backward::cpu { +typedef struct ReluBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &a, const T &b) const { + T zero{0}; + if (a > zero) { + return b; + } else { + return zero; + } + } +} ReluBackwardOp; +} // namespace op::relu_backward::cpu + +#endif // __RELU_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/relu_backward/cuda/kernel.cuh b/src/infiniop/ops/relu_backward/cuda/kernel.cuh new file mode 100644 index 000000000..3ead42756 --- /dev/null +++ b/src/infiniop/ops/relu_backward/cuda/kernel.cuh @@ -0,0 +1,20 @@ +#ifndef __RELU_BACKWARD_CUDA_H__ +#define __RELU_BACKWARD_CUDA_H__ + +namespace op::relu_backward::cuda { +typedef struct ReluBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + T zero{0}; + if (a > zero) { + return b; + } else { + return zero; + } + } +} ReluBackwardOp; +} // namespace op::relu_backward::cuda + +#endif // __RELU_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h new file mode 100644 index 000000000..13c41b83a --- /dev/null +++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __RELU_BACKWARD_METAX_API_H__ +#define __RELU_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(relu_backward, metax) + +#endif // __RELU_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca new file mode 100644 index 000000000..b4b934f01 --- /dev/null +++ b/src/infiniop/ops/relu_backward/metax/relu_backward_metax.maca @@ -0,0 +1,62 @@ +#include "relu_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::relu_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape 
= grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ReluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ReluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::relu_backward::metax diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu new file mode 100644 index 000000000..2eed8e443 --- /dev/null +++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "relu_backward_nvidia.cuh" + +namespace op::relu_backward::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ReluBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ReluBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ReluBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ReluBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // 
namespace op::relu_backward::nvidia diff --git a/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh new file mode 100644 index 000000000..1a743b7b6 --- /dev/null +++ b/src/infiniop/ops/relu_backward/nvidia/relu_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __RELU_BACKWARD_CUDA_API_H__ +#define __RELU_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(relu_backward, nvidia) + +#endif // __RELU_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/relu_backward/operator.cc b/src/infiniop/ops/relu_backward/operator.cc new file mode 100644 index 000000000..ffca7a7bd --- /dev/null +++ b/src/infiniop/ops/relu_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/relu_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/relu_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/relu_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/relu_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateReluBackwardDescriptor( + infiniopHandle_t handle, + infiniopReluBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::relu_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetReluBackwardWorkspaceSize(infiniopReluBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReluBackward( + infiniopReluBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C 
infiniStatus_t +infiniopDestroyReluBackwardDescriptor(infiniopReluBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.cc b/src/infiniop/ops/silu/cpu/silu_cpu.cc new file mode 100644 index 000000000..c8466d227 --- /dev/null +++ b/src/infiniop/ops/silu/cpu/silu_cpu.cc @@ -0,0 +1,52 @@ +#include "silu_cpu.h" + +namespace op::silu::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::cpu diff --git a/src/infiniop/ops/silu/cpu/silu_cpu.h b/src/infiniop/ops/silu/cpu/silu_cpu.h new file mode 100644 index 000000000..e1e9da4e3 --- /dev/null +++ b/src/infiniop/ops/silu/cpu/silu_cpu.h @@ -0,0 +1,23 @@ +#ifndef __SILU_CPU_H__ +#define __SILU_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(silu, cpu) + +#include + +namespace op::silu::cpu { +typedef struct SiluOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &x) const { + return x / (static_cast(1) + std::exp(-x)); + } +} SiluOp; + +} // namespace op::silu::cpu + +#endif // __SILU_CPU_H__ diff --git a/src/infiniop/ops/silu/cuda/kernel.cuh b/src/infiniop/ops/silu/cuda/kernel.cuh new file mode 100644 index 000000000..5cb8616b0 --- /dev/null +++ b/src/infiniop/ops/silu/cuda/kernel.cuh @@ -0,0 +1,37 @@ +#ifndef __SILU_CUDA_H__ +#define __SILU_CUDA_H__ + +#include + +namespace op::silu::cuda { + +typedef struct SiluOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T operator()(const T &x) const { + if constexpr (std::is_same_v) { + // half2向量化优化 + return __hmul2(x, __h2div(__float2half2_rn(1.0f), + __hadd2(__float2half2_rn(1.0f), h2exp(__hneg2(x))))); + } else if constexpr 
(std::is_same_v) { + // BF16 + const float x_f = __bfloat162float(x); + return __float2bfloat16(x_f / (1.0f + __expf(-x_f))); + } else if constexpr (std::is_same_v) { + // FP16 + const float x_f = __half2float(x); + return __float2half(x_f / (1.0f + __expf(-x_f))); + } else if constexpr (std::is_same_v) { + // FP32 + return x * (1.0f / (1.0f + __expf(-x))); + } else if constexpr (std::is_same_v) { + // FP64 + return x / (1.0 + exp(-x)); + } + } +} SiluOp; + +} // namespace op::silu::cuda + +#endif // __SILU_CUDA_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.h b/src/infiniop/ops/silu/metax/silu_metax.h new file mode 100644 index 000000000..a9717ccd0 --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.h @@ -0,0 +1,8 @@ +#ifndef __SILU_METAX_API_H__ +#define __SILU_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(silu, metax) + +#endif // __SILU_METAX_API_H__ diff --git a/src/infiniop/ops/silu/metax/silu_metax.maca b/src/infiniop/ops/silu/metax/silu_metax.maca new file mode 100644 index 000000000..73408bfc6 --- /dev/null +++ b/src/infiniop/ops/silu/metax/silu_metax.maca @@ -0,0 +1,60 @@ +#include "silu_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::silu::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create METAX elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SiluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::metax diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cu b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu new file mode 100644 index 000000000..291b9835f --- /dev/null +++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "silu_nvidia.cuh" + +namespace op::silu::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { 
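+    // Validates dtype (BF16/F16/F32/F64) and matching input/output shapes, then
+    // builds the generic CUDA elementwise descriptor; the SiLU math itself lives
+    // in cuda::SiluOp (../cuda/kernel.cuh), computing x * sigmoid(x) = x / (1 + exp(-x)).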
+ + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SiluOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::silu::nvidia diff --git a/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh new file mode 100644 index 000000000..b13c7fd44 --- /dev/null +++ b/src/infiniop/ops/silu/nvidia/silu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SILU_CUDA_API_H__ +#define __SILU_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(silu, nvidia) + +#endif // __SILU_CUDA_API_H__ diff --git a/src/infiniop/ops/silu/operator.cc b/src/infiniop/ops/silu/operator.cc new file mode 100644 index 000000000..5ae6ea4ff --- /dev/null +++ b/src/infiniop/ops/silu/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/silu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/silu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/silu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/silu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSiluDescriptor( + infiniopHandle_t handle, + infiniopSiluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::silu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + 
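+    // The GET macro casts the opaque descriptor back to the backend-specific
+    // Descriptor type and reports its workspaceSize().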
GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSilu( + infiniopSiluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/all_equal.py b/test/infiniop/all_equal.py new file mode 100644 index 000000000..9c96f166b --- /dev/null +++ b/test/infiniop/all_equal.py @@ -0,0 +1,206 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, + torch_device_map, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), None), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), None), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), None), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [Inplace.OUT_OF_PLACE] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, 
+ InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor((1,), c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing AllEqual on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + all_equal = torch.equal(a.torch_tensor(), b.torch_tensor()) + new_tensor = torch.tensor([all_equal], device=torch_device_map[device]) + c.update_torch_tensor(new_tensor) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAllEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAllEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_all_equal(): + check_error( + LIBINFINIOP.infiniopAllEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_all_equal() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.equal(a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_all_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyAllEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss_backward.py b/test/infiniop/cross_entropy_loss_backward.py new file mode 100644 index 
000000000..ec1296ef5 --- /dev/null +++ b/test/infiniop/cross_entropy_loss_backward.py @@ -0,0 +1,210 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import numpy as np +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.25e-15, "rtol": 2.25e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_backward( + probs: torch.Tensor, target: torch.Tensor, shape +) -> torch.Tensor: + grad_logits = probs - target + shape = np.array(shape) + batch_size = np.prod(shape) // shape[-1] + grad_logits = grad_logits / batch_size + return grad_logits + + +def test( + handle, + device, + shape, + probs_stride=None, + target_stride=None, + grad_logits_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + probs = TestTensor(shape, probs_stride, dtype, device) + target = TestTensor(shape, target_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if probs_stride != grad_logits_stride: + return + grad_logits = probs + elif inplace == Inplace.INPLACE_B: + if target_stride != grad_logits_stride: + return + grad_logits = target + else: + grad_logits = TestTensor(shape, grad_logits_stride, dtype, device, mode="ones") + + if grad_logits.is_broadcast(): + return + + print( + f"Testing CrossEntropyLossBackward on {InfiniDeviceNames[device]} with shape:{shape} probs_stride:{probs_stride} target_stride:{target_stride} grad_logits_stride:{grad_logits_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + new_grad_logits = cross_entropy_loss_backward( + probs.torch_tensor(), target.torch_tensor(), shape + ) + grad_logits.update_torch_tensor(new_grad_logits) + + if sync is not None: + sync() + + 
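+    # Standard library-test flow: create the operator descriptor, query the
+    # workspace size, run the kernel, then compare against the PyTorch
+    # reference (grad_logits = (probs - target) / batch_size) computed above.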
descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_logits.descriptor, + probs.descriptor, + target.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [probs, target, grad_logits]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_logits.device) + + def lib_cross_entropy_loss_backward(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLossBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_logits.data(), + probs.data(), + target.data(), + None, + ) + ) + + lib_cross_entropy_loss_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_logits.actual_tensor(), + grad_logits.torch_tensor(), + atol=atol, + rtol=rtol, + ) + assert torch.allclose( + grad_logits.actual_tensor(), grad_logits.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cross_entropy_loss_backward(probs.torch_tensor(), target.torch_tensor(), shape), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy_loss_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error( + LIBINFINIOP.infiniopDestroyCrossEntropyLossBackwardDescriptor(descriptor) + ) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/div.py b/test/infiniop/div.py new file mode 100644 index 000000000..826f1141c --- /dev/null +++ b/test/infiniop/div.py @@ -0,0 +1,191 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + 
Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def div(c, a, b): + torch.div(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device, bias=1e-6) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, dtype, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Div on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetDivWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_div(): + check_error( + LIBINFINIOP.infiniopDiv( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_div() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: div(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_div(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyDivDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/equal.py b/test/infiniop/equal.py new file mode 100644 index 000000000..7ba6b6949 --- /dev/null +++ b/test/infiniop/equal.py @@ 
-0,0 +1,209 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def eq(c, a, b): + torch.eq(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Equal on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + eq(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateEqualDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + 
tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetEqualWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_equal(): + check_error( + LIBINFINIOP.infiniopEqual( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_equal() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: eq(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_equal(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyEqualDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gelu.py b/test/infiniop/gelu.py new file mode 100644 index 000000000..1a3328432 --- /dev/null +++ b/test/infiniop/gelu.py @@ -0,0 +1,188 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, + to_torch_dtype, + torch_device_map, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 1e-8, "rtol": 1e-8}, +} + 
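+# Note: the reference in test() below uses PyTorch's tanh approximation,
+# GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))),
+# which the library kernel is assumed to match within the tolerances above.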
+DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Gelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride}" + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # ans的shape对齐至input,而input可能存在广播维度 + ans = torch.nn.functional.gelu(input.torch_tensor(), approximate="tanh") + # 利用add(+)计算的自动广播机制,确保ouput的torch_tensor与actual_tensor shape一致,以通过debug模式的shape检查 + zero = torch.zeros( + *shape, dtype=to_torch_dtype(dtype), device=torch_device_map[device] + ) + new_output = ans + zero + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGeluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGeluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_gelu(): + check_error( + LIBINFINIOP.infiniopGelu( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_gelu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.gelu(input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gelu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGeluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/gelu_backward.py b/test/infiniop/gelu_backward.py new file mode 100644 index 000000000..7e2f170a7 --- /dev/null +++ b/test/infiniop/gelu_backward.py @@ -0,0 +1,213 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# 
============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-8, "rtol": 1e-8}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def gelu_backward(input: torch.Tensor, grad_output: torch.Tensor) -> torch.Tensor: + sqrt_2_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi, device=input.device)) + kappa = 0.044715 + + # 计算中间变量 u = √(2/π)(x + κx³) + x_cubed = input.pow(3) + u = sqrt_2_over_pi * (input + kappa * x_cubed) + + # 计算 tanh(u) 及其导数 sech²(u) = 1 - tanh²(u) + tanh_u = torch.tanh(u) + sech2_u = 1.0 - tanh_u.square() + + # 计算 du/dx = √(2/π)(1 + 3κx²) + du_dx = sqrt_2_over_pi * (1.0 + 3 * kappa * input.square()) + + # 局部梯度 dy/dx = 0.5*(1 + tanh_u) + 0.5*x*sech2_u*du_dx + dy_dx = 0.5 * (1.0 + tanh_u) + 0.5 * input * sech2_u * du_dx + + return grad_output * dy_dx + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, grad_output_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if input_stride != grad_input_stride: + return + grad_input = input + elif inplace == Inplace.INPLACE_B: + if grad_output_stride != grad_input_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + return + + print( + f"Testing GeluBackward on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + new_grad_input = gelu_backward(input.torch_tensor(), grad_output.torch_tensor()) + grad_input.update_torch_tensor(new_grad_input) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGeluBackwardDescriptor( + handle, + ctypes.byref(descriptor), + 
grad_input.descriptor, + input.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGeluBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_gelu_backward(): + check_error( + LIBINFINIOP.infiniopGeluBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input.data(), + grad_output.data(), + None, + ) + ) + + lib_gelu_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + assert torch.allclose( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: gelu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gelu_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGeluBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..7b6a06c40 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -162,11 +162,6 @@ def clip_(lib): ] -@OpRegister.operator -def conv_(lib): - pass - - @OpRegister.operator def gemm_(lib): lib.infiniopCreateGemmDescriptor.restype = c_int32 @@ -454,6 +449,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -489,3 +485,340 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def silu_(lib): + lib.infiniopCreateSiluDescriptor.restype = c_int32 + lib.infiniopCreateSiluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSiluWorkspaceSize.restype = c_int32 + lib.infiniopGetSiluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSilu.restype = c_int32 + lib.infiniopSilu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySiluDescriptor.restype = c_int32 + lib.infiniopDestroySiluDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def div_(lib): + lib.infiniopCreateDivDescriptor.restype = c_int32 + lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + 
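+    # The remaining bindings follow the common binary elementwise signature:
+    # get_workspace_size(desc, size*), calc(desc, workspace, size, c, a, b, stream),
+    # destroy(desc).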
lib.infiniopGetDivWorkspaceSize.restype = c_int32 + lib.infiniopGetDivWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_and_(lib): + lib.infiniopCreateLogicalAndDescriptor.restype = c_int32 + lib.infiniopCreateLogicalAndDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalAndWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalAndWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalAnd.restype = c_int32 + lib.infiniopLogicalAnd.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalAndDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalAndDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def logical_or_(lib): + lib.infiniopCreateLogicalOrDescriptor.restype = c_int32 + lib.infiniopCreateLogicalOrDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetLogicalOrWorkspaceSize.restype = c_int32 + lib.infiniopGetLogicalOrWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLogicalOr.restype = c_int32 + lib.infiniopLogicalOr.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLogicalOrDescriptor.restype = c_int32 + lib.infiniopDestroyLogicalOrDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def equal_(lib): + lib.infiniopCreateEqualDescriptor.restype = c_int32 + lib.infiniopCreateEqualDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopEqual.restype = c_int32 + lib.infiniopEqual.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyEqualDescriptor.restype = c_int32 + lib.infiniopDestroyEqualDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def all_equal_(lib): + lib.infiniopCreateAllEqualDescriptor.restype = c_int32 + lib.infiniopCreateAllEqualDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetAllEqualWorkspaceSize.restype = c_int32 + lib.infiniopGetAllEqualWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAllEqual.restype = c_int32 + lib.infiniopAllEqual.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + 
c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyAllEqualDescriptor.restype = c_int32 + lib.infiniopDestroyAllEqualDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + + +@OpRegister.operator +def relu_backward_(lib): + lib.infiniopCreateReluBackwardDescriptor.restype = c_int32 + lib.infiniopCreateReluBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetReluBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetReluBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReluBackward.restype = c_int32 + lib.infiniopReluBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReluBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyReluBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gelu_(lib): + lib.infiniopCreateGeluDescriptor.restype = c_int32 + lib.infiniopCreateGeluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGeluWorkspaceSize.restype = c_int32 + lib.infiniopGetGeluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGelu.restype = c_int32 + lib.infiniopGelu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGeluDescriptor.restype = c_int32 + lib.infiniopDestroyGeluDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def gelu_backward_(lib): + lib.infiniopCreateGeluBackwardDescriptor.restype = c_int32 + lib.infiniopCreateGeluBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetGeluBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetGeluBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopGeluBackward.restype = c_int32 + lib.infiniopGeluBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGeluBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyGeluBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cross_entropy_loss_backward_(lib): + lib.infiniopCreateCrossEntropyLossBackwardDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCrossEntropyLossBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCrossEntropyLossBackward.restype = c_int32 + lib.infiniopCrossEntropyLossBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCrossEntropyLossBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossBackwardDescriptor.argtypes = 
[ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 5c8e7f80a..1a8eaf505 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,33 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + + is_bool = dt == InfiniDtype.BOOL + if is_bool: + dt = InfiniDtype.F32 + + is_int = ( + dt == InfiniDtype.I8 + or dt == InfiniDtype.I16 + or dt == InfiniDtype.I32 + or dt == InfiniDtype.I64 + ) + if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if is_int: + self._torch_tensor = torch.randint( + 0, + 100, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -88,6 +111,12 @@ def __init__( else: raise ValueError("Unsupported mode") + if is_bool: + self._torch_tensor = self._torch_tensor > 0.5 + if scale is not None: self._torch_tensor *= scale if bias is not None: @@ -103,6 +132,9 @@ def __init__( def torch_tensor(self): return self._torch_tensor + def update_torch_tensor(self, new_tensor: torch.Tensor): + self._torch_tensor = new_tensor + def actual_tensor(self): return self._data_tensor @@ -120,6 +152,9 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): shape_, strides_, dt, device, mode="manual", set_tensor=torch_tensor ) + def update_torch_tensor(self, new_tensor: torch.Tensor): + self._torch_tensor = new_tensor + def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): if dt == InfiniDtype.I8: @@ -140,6 +175,8 @@ def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): return torch.float32 elif dt == InfiniDtype.F64: return torch.float64 + elif dt == InfiniDtype.BOOL: + return torch.bool # TODO: These following types may not be supported by older # versions of PyTorch. Use compatability mode to convert them.
elif dt == InfiniDtype.U16: @@ -330,6 +367,11 @@ def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True): actual = actual.to(torch.float32) desired = desired.to(torch.float32) + # If either tensor is BOOL, cast both to FP32 before comparing + if actual.dtype == torch.bool or desired.dtype == torch.bool: + actual = actual.to(torch.float32) + desired = desired.to(torch.float32) + print_discrepancy(actual, desired, atol, rtol, equal_nan, verbose) np.testing.assert_allclose( actual.cpu(), desired.cpu(), rtol, atol, equal_nan, verbose=True @@ -523,7 +565,7 @@ def profile_operation(desc, func, torch_device, NUM_PRERUN, NUM_ITERATIONS): # Timed execution elapsed = timed_op(lambda: func(), NUM_ITERATIONS, torch_device) - print(f" {desc} time: {elapsed * 1000 :6f} ms") + print(f" {desc} time: {elapsed * 1000:6f} ms") def test_operator(device, test_func, test_cases, tensor_dtypes): diff --git a/test/infiniop/logical_and.py b/test/infiniop/logical_and.py new file mode 100644 index 000000000..02369745b --- /dev/null +++ b/test/infiniop/logical_and.py @@ -0,0 +1,209 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_and(c, a, b): + torch.logical_and(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, +
dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalAnd on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalAndDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalAndWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_and(): + check_error( + LIBINFINIOP.infiniopLogicalAnd( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_and() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_and(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_and(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalAndDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/logical_or.py b/test/infiniop/logical_or.py new file mode 100644 index 000000000..abfff16f2 --- /dev/null +++ b/test/infiniop/logical_or.py @@ -0,0 +1,209 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 
4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BOOL: {"atol": 0, "rtol": 0}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, + InfiniDtype.I16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.F64: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def logical_or(c, a, b): + torch.logical_or(a, b, out=c) + + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + a = TestTensor(shape, a_stride, dtype, device) + b = TestTensor(shape, b_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + else: + c = TestTensor(shape, c_stride, InfiniDtype.BOOL, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing LogicalOr on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + logical_or(c.torch_tensor(), a.torch_tensor(), b.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLogicalOrDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [a, b, c]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLogicalOrWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_logical_or(): + check_error( + LIBINFINIOP.infiniopLogicalOr( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + None, + ) + ) + + lib_logical_or() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + assert torch.equal(c.actual_tensor(), c.torch_tensor()) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: logical_or(c.torch_tensor(), 
a.torch_tensor(), b.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_logical_or(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLogicalOrDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BOOL, + InfiniDtype.I8, + InfiniDtype.I16, + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/relu_backward.py b/test/infiniop/relu_backward.py new file mode 100644 index 000000000..ffb737476 --- /dev/null +++ b/test/infiniop/relu_backward.py @@ -0,0 +1,199 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, a_stride, b_stride, c_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def relu_backward(input: torch.Tensor, grad_output: torch.Tensor) -> torch.Tensor: + mask = input > 0 + grad_input = torch.where(mask, grad_output, torch.zeros_like(grad_output)) + return grad_input + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, 
grad_output_stride, dtype, device) + if inplace == Inplace.INPLACE_A: + if input_stride != grad_input_stride: + return + grad_input = input + elif inplace == Inplace.INPLACE_B: + if grad_output_stride != grad_input_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + return + + print( + f"Testing ReluBackward on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + new_grad_input = relu_backward(input.torch_tensor(), grad_output.torch_tensor()) + grad_input.update_torch_tensor(new_grad_input) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReluBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input.descriptor, + grad_output.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReluBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_relu_backward(): + check_error( + LIBINFINIOP.infiniopReluBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input.data(), + grad_output.data(), + None, + ) + ) + + lib_relu_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + assert torch.allclose( + grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: relu_backward(input.torch_tensor(), grad_output.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_relu_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyReluBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/silu.py b/test/infiniop/silu.py new file mode 100644 index 000000000..dcc7fdf2b --- /dev/null +++ b/test/infiniop/silu.py @@ -0,0 +1,188 @@ +import ctypes +from ctypes import c_uint64 +from enum import Enum, auto + +import torch +from libinfiniop import ( + LIBINFINIOP, + InfiniDeviceEnum, + InfiniDeviceNames, + InfiniDtype, + InfiniDtypeNames, + TestTensor, + TestWorkspace, + check_error, + debug, + get_args, + get_test_devices, + get_tolerance, + infiniopOperatorDescriptor_t, + profile_operation, + test_operator, + to_torch_dtype, + torch_device_map, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# 
============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.F64] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.F64: {"atol": 2.22e-15, "rtol": 2.22e-15}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Silu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # ans follows input's shape, and input may contain broadcast dimensions + ans = torch.nn.functional.silu(input.torch_tensor()) + # Use the automatic broadcasting of add (+) so that output's torch_tensor matches the shape of actual_tensor and passes the shape check in debug mode + zero = torch.zeros( + *shape, dtype=to_torch_dtype(dtype), device=torch_device_map[device] + ) + new_output = ans + zero + output.update_torch_tensor(new_output) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSiluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSiluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_silu(): + check_error( + LIBINFINIOP.infiniopSilu( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_silu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose( + output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol + ) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch.nn.functional.silu(input.torch_tensor()), device, NUM_PRERUN,
NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_silu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySiluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + if device == InfiniDeviceEnum.ILUVATAR: + _TENSOR_DTYPES = [ + InfiniDtype.BF16, + InfiniDtype.F16, + InfiniDtype.F32, + ] + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m")
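Illustrative sketches (not part of the patch above; plain PyTorch, shapes and sizes chosen only for demonstration):

The reference in test/infiniop/relu_backward.py passes the upstream gradient through where the input is positive and zeroes it elsewhere, which matches PyTorch autograd's own ReLU backward exactly:

import torch

x = torch.randn(64, requires_grad=True)
g = torch.randn(64)
torch.relu(x).backward(g)                            # autograd gradient into x.grad
ref = torch.where(x > 0, g, torch.zeros_like(g))     # same rule the test file uses
assert torch.equal(x.grad, ref)

The broadcast-alignment trick in test/infiniop/silu.py (adding a zero tensor of the full output shape so the PyTorch reference takes the same shape as the library output) can likewise be checked in isolation:

import torch

shape = (13, 4, 4)                    # full logical output shape (illustrative)
x = torch.rand(13, 1, 4)              # reference input with a broadcast (size-1) dimension
ans = torch.nn.functional.silu(x)     # shape (13, 1, 4)
aligned = ans + torch.zeros(shape)    # adding zeros broadcasts the result to (13, 4, 4) exactly
assert aligned.shape == shape
assert torch.equal(aligned, ans.expand(shape))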