From ed530e111ce058f421304dab05796def49e8ce9e Mon Sep 17 00:00:00 2001 From: pengcheng888 Date: Mon, 29 Sep 2025 16:02:23 +0800 Subject: [PATCH] issue/427 - the sigmoid, topksoftmax, and topkrouter ops --- include/infiniop.h | 2 + include/infiniop/ops/sigmoid.h | 24 ++ include/infiniop/ops/topkrouter.h | 21 +- include/infiniop/ops/topksoftmax.h | 26 ++ scripts/python_test.py | 3 + src/infiniop-test/include/ops.hpp | 7 +- src/infiniop-test/src/ops/sigmoid.cpp | 103 ++++++++ src/infiniop-test/src/ops/topkrouter.cpp | 130 ++++++++++ src/infiniop-test/src/ops/topksoftmax.cpp | 122 ++++++++++ src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc | 51 ++++ src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h | 19 ++ src/infiniop/ops/sigmoid/cuda/kernel.cuh | 34 +++ .../ops/sigmoid/nvidia/sigmoid_nvidia.cu | 58 +++++ .../ops/sigmoid/nvidia/sigmoid_nvidia.cuh | 8 + src/infiniop/ops/sigmoid/operator.cc | 115 +++++++++ .../ops/topkrouter/cpu/topkrouter_cpu.cc | 217 +++++++++++++++-- .../ops/topkrouter/cpu/topkrouter_cpu.h | 5 +- src/infiniop/ops/topkrouter/cuda/kernel.cuh | 42 ++-- src/infiniop/ops/topkrouter/info.h | 4 +- .../topkrouter/nvidia/topkrouter_nvidia.cu | 21 +- .../topkrouter/nvidia/topkrouter_nvidia.cuh | 4 +- src/infiniop/ops/topkrouter/operator.cc | 34 ++- src/infiniop/ops/topkrouter/topkrouter.h | 87 +++---- .../ops/topksoftmax/cpu/topksoftmax_cpu.cc | 143 +++++++++++ .../ops/topksoftmax/cpu/topksoftmax_cpu.h | 7 + src/infiniop/ops/topksoftmax/cuda/kernel.cuh | 145 +++++++++++ src/infiniop/ops/topksoftmax/info.h | 48 ++++ .../topksoftmax/nvidia/topksoftmax_nvidia.cu | 92 +++++++ .../topksoftmax/nvidia/topksoftmax_nvidia.cuh | 8 + src/infiniop/ops/topksoftmax/operator.cc | 101 ++++++++ src/infiniop/ops/topksoftmax/topksoftmax.h | 34 +++ .../test_generate/testcases/sigmoid.py | 136 +++++++++++ .../test_generate/testcases/topkrouter.py | 225 ++++++++++++++++++ .../test_generate/testcases/topksoftmax.py | 148 ++++++++++++ test/infiniop/libinfiniop/op_register.py | 64 ++++- 
test/infiniop/sigmoid.py | 172 +++++++++++++ test/infiniop/topkrouter.py | 34 +-- test/infiniop/topksoftmax.py | 170 +++++++++++++ 38 files changed, 2523 insertions(+), 141 deletions(-) create mode 100644 include/infiniop/ops/sigmoid.h create mode 100644 include/infiniop/ops/topksoftmax.h create mode 100644 src/infiniop-test/src/ops/sigmoid.cpp create mode 100644 src/infiniop-test/src/ops/topkrouter.cpp create mode 100644 src/infiniop-test/src/ops/topksoftmax.cpp create mode 100644 src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc create mode 100644 src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h create mode 100644 src/infiniop/ops/sigmoid/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu create mode 100644 src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh create mode 100644 src/infiniop/ops/sigmoid/operator.cc create mode 100644 src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc create mode 100644 src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h create mode 100644 src/infiniop/ops/topksoftmax/cuda/kernel.cuh create mode 100644 src/infiniop/ops/topksoftmax/info.h create mode 100644 src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu create mode 100644 src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh create mode 100644 src/infiniop/ops/topksoftmax/operator.cc create mode 100644 src/infiniop/ops/topksoftmax/topksoftmax.h create mode 100644 test/infiniop-test/test_generate/testcases/sigmoid.py create mode 100644 test/infiniop-test/test_generate/testcases/topkrouter.py create mode 100644 test/infiniop-test/test_generate/testcases/topksoftmax.py create mode 100644 test/infiniop/sigmoid.py create mode 100644 test/infiniop/topksoftmax.py diff --git a/include/infiniop.h b/include/infiniop.h index b3cf8b6ca..038a06ade 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -19,6 +19,8 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/ops/topkrouter.h" +#include "infiniop/ops/topksoftmax.h" 
+#include "infiniop/ops/sigmoid.h" #include "infiniop/tensor_descriptor.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/sigmoid.h b/include/infiniop/ops/sigmoid.h new file mode 100644 index 000000000..4fa0f6604 --- /dev/null +++ b/include/infiniop/ops/sigmoid.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIGMOID_API_H__ +#define __INFINIOP_SIGMOID_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidDescriptor(infiniopHandle_t handle, + infiniopSigmoidDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoid(infiniopSigmoidDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/topkrouter.h b/include/infiniop/ops/topkrouter.h index d85b6b5ff..da6aaced5 100644 --- a/include/infiniop/ops/topkrouter.h +++ b/include/infiniop/ops/topkrouter.h @@ -5,16 +5,23 @@ typedef struct InfiniopDescriptor *infiniopTopkrouterDescriptor_t; -__C __export infiniStatus_t infiniopCreateTopkrouterDescriptor( - infiniopHandle_t handle, - infiniopTopkrouterDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t correction_bias_desc); +__C __export infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, + infiniopTopkrouterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t correction_bias_desc); __C __export infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void 
*workspace, size_t workspace_size, - void *values, void *indices, void *x, void *correction_bias, float routed_scaling_factor, size_t topk, void *stream); +__C __export infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *values, + void *indices, + const void *x, + const void *correction_bias, + const float routed_scaling_factor, + const size_t topk, + void *stream); __C __export infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc); diff --git a/include/infiniop/ops/topksoftmax.h b/include/infiniop/ops/topksoftmax.h new file mode 100644 index 000000000..696eb50f3 --- /dev/null +++ b/include/infiniop/ops/topksoftmax.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_TOPKSOFTMAX_API_H__ +#define __INFINIOP_TOPKSOFTMAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTopksoftmaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, + infiniopTopksoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc); + +__C __export infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *values, + void *indices, + const void *x, + const size_t topk, + const int norm, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index 5348c8c69..00cc1293a 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -25,6 +25,9 @@ def run_tests(args): "sub.py", "swiglu.py", "softplus.py", + "sigmoid.py", + "topkrouter.py", + "topksoftmax.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git 
a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..0685847b5 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,7 +16,9 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) - +DECLARE_INFINIOP_TEST(sigmoid) +DECLARE_INFINIOP_TEST(topkrouter) +DECLARE_INFINIOP_TEST(topksoftmax) #define REGISTER_INFINIOP_TEST(name) \ { \ #name, \ @@ -43,6 +45,9 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(sigmoid) \ + REGISTER_INFINIOP_TEST(topkrouter) \ + REGISTER_INFINIOP_TEST(topksoftmax) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/sigmoid.cpp b/src/infiniop-test/src/ops/sigmoid.cpp new file mode 100644 index 000000000..bb3a0f70a --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid.cpp @@ -0,0 +1,103 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sigmoid { +struct Test::Attributes { + std::shared_ptr x; + std::shared_ptr y; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("x") == tensors.end() + || tensors.find("y") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->x = tensors["x"]; + test->_attributes->y = tensors["y"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSigmoidDescriptor_t op_desc; + auto x = _attributes->x->to(device, device_id); + auto y = _attributes->y->to(device, 
device_id); + CHECK_OR(infiniopCreateSigmoidDescriptor(handle, &op_desc, + y->desc(), + x->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSigmoid(op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(y, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSigmoid( + op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySigmoidDescriptor(op_desc); + infinirtFree(workspace); + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"x", "y", "ans"}; +} + +std::vector Test::output_names() { + return {"y"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- y: " << _attributes->y->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::sigmoid diff --git a/src/infiniop-test/src/ops/topkrouter.cpp b/src/infiniop-test/src/ops/topkrouter.cpp new file mode 100644 index 000000000..367087fd6 --- /dev/null +++ b/src/infiniop-test/src/ops/topkrouter.cpp @@ -0,0 +1,130 @@ +#include "ops.hpp" 
+#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::topkrouter { +struct Test::Attributes { + std::shared_ptr values; + std::shared_ptr indices; + std::shared_ptr x; + std::shared_ptr correction_bias; + float routed_scaling_factor; + int topk; + std::shared_ptr lable_values; + std::shared_ptr lable_indices; +}; + +std::shared_ptr Test::build(std::unordered_map> attributes, + std::unordered_map> tensors, double rtol, + double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (attributes.find("routed_scaling_factor") == attributes.end() || attributes.find("topk") == attributes.end() || tensors.find("values") == tensors.end() || tensors.find("indices") == tensors.end() || tensors.find("x") == tensors.end() || tensors.find("correction_bias") == tensors.end() || tensors.find("lable_values") == tensors.end() || tensors.find("lable_indices") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->values = tensors["values"]; + test->_attributes->indices = tensors["indices"]; + test->_attributes->x = tensors["x"]; + test->_attributes->correction_bias = tensors["correction_bias"]; + + test->_attributes->routed_scaling_factor = *reinterpret_cast(attributes["routed_scaling_factor"].data()); + test->_attributes->topk = *reinterpret_cast(attributes["topk"].data()); + + test->_attributes->lable_values = tensors["lable_values"]; + test->_attributes->lable_indices = tensors["lable_indices"]; + + return test; +} + +std::shared_ptr Test::run(infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + infiniopTopkrouterDescriptor_t op_desc; + CHECK_OR(infiniopCreateTopkrouterDescriptor(handle, &op_desc, _attributes->x->desc(), + _attributes->correction_bias->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create Topkrouter descriptor")); + + // + auto values = 
_attributes->values->to(device, device_id); + auto indices = _attributes->indices->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + auto correction_bias = _attributes->correction_bias->to(device, device_id); + + float routed_scaling_factor = _attributes->routed_scaling_factor; + int topk = _attributes->topk; + + size_t workspace_size; + CHECK_OR(infiniopGetTopkrouterWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopTopkrouter(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), + correction_bias->data(), routed_scaling_factor, topk, nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Topkrouter execution failed")); + + try { + allClose(values, _attributes->lable_values, _rtol, _atol); + allClose(indices, _attributes->lable_indices, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTopkrouter(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), + correction_bias->data(), routed_scaling_factor, topk, nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"routed_scaling_factor", "topk"}; +} + +std::vector Test::tensor_names() { + return {"values", "indices", "x", "correction_bias", "lable_values", "lable_indices"}; +} + +std::vector Test::output_names() { + return {"values", "indices"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- routed_scaling_factor=" << 
_attributes->routed_scaling_factor << std::endl; + oss << "- topk=" << _attributes->topk << std::endl; + + oss << "- values: " << _attributes->values->info() << std::endl; + oss << "- indices: " << _attributes->indices->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- correction_bias: " << _attributes->correction_bias->info() << std::endl; + + oss << "- lable_values: " << _attributes->lable_values->info() << std::endl; + oss << "- lable_indices: " << _attributes->lable_indices->info() << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::topkrouter diff --git a/src/infiniop-test/src/ops/topksoftmax.cpp b/src/infiniop-test/src/ops/topksoftmax.cpp new file mode 100644 index 000000000..f6b0af7a4 --- /dev/null +++ b/src/infiniop-test/src/ops/topksoftmax.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::topksoftmax { +struct Test::Attributes { + std::shared_ptr values; + std::shared_ptr indices; + std::shared_ptr x; + int topk; + bool norm; + std::shared_ptr lable_values; + std::shared_ptr lable_indices; +}; + +std::shared_ptr Test::build(std::unordered_map> attributes, + std::unordered_map> tensors, double rtol, + double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (attributes.find("topk") == attributes.end() || attributes.find("norm") == attributes.end() || tensors.find("values") == tensors.end() || tensors.find("indices") == tensors.end() || tensors.find("x") == tensors.end() || tensors.find("lable_values") == tensors.end() || tensors.find("lable_indices") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->values = tensors["values"]; + 
test->_attributes->indices = tensors["indices"]; + test->_attributes->x = tensors["x"]; + + test->_attributes->topk = *reinterpret_cast(attributes["topk"].data()); + test->_attributes->norm = *reinterpret_cast(attributes["norm"].data()); + test->_attributes->lable_values = tensors["lable_values"]; + test->_attributes->lable_indices = tensors["lable_indices"]; + + return test; +} + +std::shared_ptr Test::run(infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + infiniopTopksoftmaxDescriptor_t op_desc; + CHECK_OR(infiniopCreateTopksoftmaxDescriptor(handle, &op_desc, _attributes->x->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create Topksoftmax descriptor")); + + // + auto values = _attributes->values->to(device, device_id); + auto indices = _attributes->indices->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + + int topk = _attributes->topk; + bool norm = _attributes->norm; + + size_t workspace_size; + CHECK_OR(infiniopGetTopksoftmaxWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopTopksoftmax(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), topk, norm, nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Topksoftmax execution failed")); + + try { + allClose(values, _attributes->lable_values, _rtol, _atol); + allClose(indices, _attributes->lable_indices, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + elapsed_time = benchmark( + [=]() { + infiniopTopksoftmax(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), topk, norm, nullptr); + }, + warm_ups, 
iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"topk", "norm"}; +} + +std::vector Test::tensor_names() { + return {"values", "indices", "x", "lable_values", "lable_indices"}; +} + +std::vector Test::output_names() { + return {"values", "indices"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + + oss << "- topk=" << _attributes->topk << std::endl; + oss << "- norm=" << _attributes->norm << std::endl; + + oss << "- values: " << _attributes->values->info() << std::endl; + oss << "- indices: " << _attributes->indices->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + + oss << "- lable_values: " << _attributes->lable_values->info() << std::endl; + oss << "- lable_indices: " << _attributes->lable_indices->info() << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::topksoftmax diff --git a/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc new file mode 100644 index 000000000..c335bba60 --- /dev/null +++ b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc @@ -0,0 +1,51 @@ +#include "sigmoid_cpu.h" + +namespace op::sigmoid::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_SAME_SHAPE(y_shape, 
x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid::cpu diff --git a/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h new file mode 100644 index 000000000..49c963f44 --- /dev/null +++ b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h @@ -0,0 +1,19 @@ +#ifndef __SIGMOID_CPU_H__ +#define __SIGMOID_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid, cpu) + +namespace op::sigmoid::cpu { +typedef struct SigmoidOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return T(1) / (T(1) + std::exp(-x)); + } +} SigmoidOp; +} // namespace op::sigmoid::cpu + +#endif // __SIGMOID_CPU_H__ diff --git a/src/infiniop/ops/sigmoid/cuda/kernel.cuh b/src/infiniop/ops/sigmoid/cuda/kernel.cuh new file mode 100644 index 000000000..6661d1909 --- /dev/null +++ b/src/infiniop/ops/sigmoid/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __SIDMOID_CUDA_H__ +#define __SIDMOID_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +namespace op::sigmoid::cuda { +typedef struct SigmoidOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T 
operator()(const T &x) const { + // sigmoid(x) = 1 / (1 + exp(-x)) + if constexpr (std::is_same_v) { + half2 denominator = __hadd2(make_half2(1, 1), h2exp(__hneg2(x))); + return h2rcp(denominator); + } else if constexpr (std::is_same_v) { + half denominator = __hadd(__float2half(1.0f), hexp(__hneg(x))); + return hrcp(denominator); + } else if constexpr (std::is_same_v) { + __nv_bfloat16 denominator = __float2bfloat16(__fadd_rn(1.0f, __expf(__bfloat162float(-x)))); + return __float2bfloat16(1.0f) / denominator; + } else if constexpr (std::is_same_v) { + float denominator = __fadd_rn(1.0f, __expf(-x)); + return __frcp_rn(denominator); + } else { // double + return 1.0 / (1.0 + exp(-x)); + } + } +} SigmoidOp; +} // namespace op::sigmoid::cuda + +#endif // __SIDMOID_CUDA_H__ diff --git a/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu new file mode 100644 index 000000000..43f6df9e6 --- /dev/null +++ b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu @@ -0,0 +1,58 @@ +#include "../cuda/kernel.cuh" +#include "sigmoid_nvidia.cuh" + +namespace op::sigmoid::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if 
(workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidOp, double>(_info, workspace, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid::nvidia diff --git a/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh new file mode 100644 index 000000000..5084a99d1 --- /dev/null +++ b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_CUDA_API_H__ +#define __SIGMOID_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid, nvidia) + +#endif // __SIGMOID_CUDA_API_H__ diff --git a/src/infiniop/ops/sigmoid/operator.cc b/src/infiniop/ops/sigmoid/operator.cc new file mode 100644 index 000000000..3f2f95067 --- /dev/null +++ b/src/infiniop/ops/sigmoid/operator.cc @@ -0,0 +1,115 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/sigmoid_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSigmoidDescriptor( + infiniopHandle_t handle, + infiniopSigmoidDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoid( + infiniopSigmoidDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc 
b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc index 7d4555da2..2d2b36d6b 100644 --- a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc +++ b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc @@ -1,28 +1,215 @@ #include "topkrouter_cpu.h" +#include "../../../../utils.h" #include "../../../devices/cpu/common_cpu.h" #include "../../../reduce/cpu/reduce.h" +#include namespace op::topkrouter::cpu { +Descriptor::~Descriptor() { +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t correction_bias_desc) { + auto result = TopkrouterInfo::create(x_desc); + CHECK_RESULT(result); + auto info = result.take(); + + if (info.x_strides[1] != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(nullptr, std::move(info), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +inline float sigmoid_func(T x) { + float value; + if constexpr (std::is_same::value) { + value = _f16_to_f32(x); + } else if constexpr (std::is_same::value) { + value = _bf16_to_f32(x); + } else { + value = x; + } + return 1.0f / (1.0f + std::exp(-value)); +} + +template +void topkrouter_cpu_one_token(float *values_input, // 输出数据 + int *indices_input, // 输出索引 + const T *x_input, // 输入数据 + std::vector> &value_index_arr, // 输入数据 + const float *correction_bias, + const float routed_scaling_factor, const size_t topk, + const size_t width, const size_t n_routed_experts, const size_t n_group, + const size_t topk_group, const bool norm_topk_prob) { + + // ------------------------------------------------------ // + // 对输入数据做 sigmoid // + // ------------------------------------------------------ // + for (size_t i = 0; i < width; ++i) { + value_index_arr[i].first = sigmoid_func(value_index_arr[i].first); + } + + // ------------------------------------------------------ // + // 再加偏置 // + // ------------------------------------------------------ // + for 
(size_t i = 0; i < width; ++i) { + value_index_arr[i].first += correction_bias[i]; + } + + // ----------------------------------------------------------- // + // 分为 n_group 组,找出每组的最大值 // + // ----------------------------------------------------------- // + std::vector> value_index_group; + value_index_group.resize(n_group); + + const size_t group_size = width / n_group; // group_size表示每个组的元素数量 + for (size_t igroup = 0; igroup < n_group; ++igroup) { + std::vector> value_index_warp; + value_index_warp.resize(group_size); + + auto it = value_index_arr.begin() + igroup * group_size; + for (size_t i = 0; i < group_size; ++i) { + value_index_warp[i] = {(it++)->first, i}; + } + + // 每个group中的数据,进行排序 + std::sort(value_index_warp.begin(), value_index_warp.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); + + // 取前两个的和,作为最大值 + value_index_group[igroup] = {value_index_warp[0].first + value_index_warp[1].first, igroup}; + } -Descriptor::~Descriptor() {} + // ------------------------------------------------------------------ // + // 对 value_index_group 的数据, 再选前 topk_group 个 // + // ------------------------------------------------------------------ // + std::sort(value_index_group.begin(), value_index_group.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t correction_bias_desc) { + std::vector group_mask; + group_mask.resize(n_group, false); + for (size_t i = 0; i < topk_group; ++i) { + size_t index = value_index_group[i].second; + group_mask[index] = true; + } - return INFINI_STATUS_NOT_IMPLEMENTED; + // ------------------------------------------------------------------ // + // 根据group_mask,false的组的数值置0 // + // ------------------------------------------------------------------ // + for (size_t igroup = 0; igroup < n_group; ++igroup) { + if (group_mask[igroup]) 
{ + continue; + } + + auto it = value_index_arr.begin() + igroup * group_size; + for (size_t i = 0; i < group_size; ++i) { + (it++)->first = 0.0f; + } + } + + // ------------------------------------------------------------------ // + // 最后整体做topk // + // ------------------------------------------------------------------ // + std::sort(value_index_arr.begin(), value_index_arr.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); + + // ----------------------------------------------------------- // + // 取topk个数据 // + // ----------------------------------------------------------- // + float exp_sum = 1e-9f; + for (size_t i = 0; i < topk; ++i) { + size_t index = value_index_arr[i].second; + float exp_value = sigmoid_func(x_input[index]); + + values_input[i] = exp_value; + indices_input[i] = static_cast(index); + + exp_sum += exp_value; + } + + // ----------------------------------------------------------- // + // 归一化 // + // ----------------------------------------------------------- // + if (norm_topk_prob) { + for (size_t i = 0; i < topk; ++i) { + values_input[i] = routed_scaling_factor * values_input[i] / exp_sum; + } + } } -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - float *values, int *indices, void *x, float *correction_bias, - float routed_scaling_factor, - size_t topk, - void *stream) const { +template +infiniStatus_t topkrouter_cpu_func(float *values, int *indices, + const T *x, + const float *correction_bias, const float routed_scaling_factor, const size_t topk, + const size_t N, const size_t width, const size_t n_routed_experts = 256, + const size_t n_group = 8, const size_t topk_group = 4, + const bool norm_topk_prob = true) { + /* + O-----------> width 地址连续 + | + | + N + */ + for (size_t n = 0; n < N; ++n) { + float *values_input = values + n * topk; + int *indices_input = indices + n * topk; + const T *x_input = x + n * width; + + std::vector> value_index_arr; + 
value_index_arr.resize(width); + + for (size_t i = 0; i < width; ++i) { + float temp; + if constexpr (std::is_same::value) { + temp = _f16_to_f32(x_input[i]); + } else if constexpr (std::is_same::value) { + temp = _bf16_to_f32(x_input[i]); + } else { + temp = x_input[i]; + } + value_index_arr[i] = {temp, i}; + } + + topkrouter_cpu_one_token(values_input, indices_input, x_input, value_index_arr, + correction_bias, routed_scaling_factor, + topk, width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } + + return INFINI_STATUS_SUCCESS; +} // namespace op::topkrouter::cpu + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x, + const float *correction_bias, const float routed_scaling_factor, const size_t topk, + void *stream) const { + size_t N = _info.N; + size_t width = _info.width; + + // 下面是 deepseek的config.json的超参数 + const size_t n_routed_experts = 256; + const size_t n_group = 8; + const size_t topk_group = 4; + const bool norm_topk_prob = true; + + if ((width != n_routed_experts) || (width % n_group != 0) || (256 != width)) { + return INFINI_STATUS_BAD_PARAM; + } + + if (_info.xtype == INFINI_DTYPE_F32) { + topkrouter_cpu_func(values, indices, static_cast(x), correction_bias, routed_scaling_factor, topk, N, + width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } else if (_info.xtype == INFINI_DTYPE_F16) { + topkrouter_cpu_func(values, indices, static_cast(x), correction_bias, routed_scaling_factor, topk, N, + width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } else if (_info.xtype == INFINI_DTYPE_BF16) { + topkrouter_cpu_func(values, indices, static_cast(x), correction_bias, routed_scaling_factor, topk, N, + width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } - return INFINI_STATUS_NOT_IMPLEMENTED; + return INFINI_STATUS_SUCCESS; } } // namespace op::topkrouter::cpu diff --git 
a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h index f327a2ab6..996fd9d2f 100644 --- a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h +++ b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h @@ -1,7 +1,6 @@ -#ifndef __Topkrouter_CPU_H__ -#define __Topkrouter_CPU_H__ +#ifndef __TOPKTOUTER_CPU_H__ +#define __TOPKTOUTER_CPU_H__ #include "../topkrouter.h" - DESCRIPTOR(cpu) #endif diff --git a/src/infiniop/ops/topkrouter/cuda/kernel.cuh b/src/infiniop/ops/topkrouter/cuda/kernel.cuh index f81048d3d..0832c5b93 100644 --- a/src/infiniop/ops/topkrouter/cuda/kernel.cuh +++ b/src/infiniop/ops/topkrouter/cuda/kernel.cuh @@ -1,5 +1,5 @@ -#ifndef _Topkrouter_KERNEL_CUH__ -#define _Topkrouter_KERNEL_CUH__ +#ifndef _TOPKROUTER_KERNEL_CUH__ +#define _TOPKROUTER_KERNEL_CUH__ #include #include #include @@ -36,17 +36,14 @@ struct CustomLess { } }; -// -// deepseek的topk -// template -__global__ void topkrouter_kernel(float *values_topk, // 输出值, 形状[N, topk] - int *indices_topk, // 输出索引, 形状[N, topk] - T *input, // 输入数据 [N, width] - float *d_correction_bias, // 输入数据 [width] - float routed_scaling_factor, // - const size_t N, // 总行数,toen数量 - const size_t width, // 每行元素数量 +__global__ void topkrouter_kernel(float *values_topk, // 输出数据, 形状[N, topk] + int *indices_topk, // 输出索引, 形状[N, topk] + const T *input, // 输入数据 [N, width] + const float *d_correction_bias, // 输入数据 [width] + const float routed_scaling_factor, + const size_t N, + const size_t width, const size_t topk ) { @@ -99,7 +96,6 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出值, 形 // ----------------------------------------------------------- // // 每个组中,前两个数据的和 // // ----------------------------------------------------------- // - __syncthreads(); if (0 == lane_id) { share_data_group[warp_id] = share_data[warp_id * warp_threads] + share_data[warp_id * warp_threads + 1]; @@ -116,8 +112,10 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出值, 形 
thread_indices[0] = lane_id; } - __shared__ typename WarpMergeSortT::TempStorage temp_storage[1]; - WarpMergeSortT(temp_storage[0]).Sort(thread_values, thread_indices, CustomLess()); + { + __shared__ typename WarpMergeSortT::TempStorage temp_storage[1]; + WarpMergeSortT(temp_storage[0]).Sort(thread_values, thread_indices, CustomLess()); + } if (lane_id < 4) { int indices = thread_indices[0]; share_data_group_mask[indices] = 1.0f; @@ -147,13 +145,13 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出值, 形 int index = thread_indices[0]; value = sigmoid_func(data_input[index]); } - - typedef cub::WarpReduce WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage; - // 使用有效项group 进行部分归约 - float warp_sum = WarpReduce(temp_storage).Sum(value); - if (0 == tid) { - share_sum = warp_sum + 1e-20; + { + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + float warp_sum = WarpReduce(temp_storage).Sum(value); + if (0 == tid) { + share_sum = warp_sum + 1e-9f; + } } __syncwarp(); diff --git a/src/infiniop/ops/topkrouter/info.h b/src/infiniop/ops/topkrouter/info.h index 4df0a9298..08a4751b8 100644 --- a/src/infiniop/ops/topkrouter/info.h +++ b/src/infiniop/ops/topkrouter/info.h @@ -1,5 +1,5 @@ -#ifndef __topkrouter_INFO_H__ -#define __topkrouter_INFO_H__ +#ifndef __TOPKROUTER_INFO_H__ +#define __TOPKROUTER_INFO_H__ #include "../../../utils.h" #include "../../tensor.h" diff --git a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu index e44872fcc..fe8cfeb72 100644 --- a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu +++ b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu @@ -40,9 +40,9 @@ infiniStatus_t Descriptor::create( namespace { template -infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, void *d_input, float *d_correction_bias, float routed_scaling_factor, - size_t N, size_t width, size_t topk, 
infiniDtype_t xtype, cudaStream_t stream) { - +infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, const void *d_input, const float *d_correction_bias, + const float routed_scaling_factor, const size_t N, const size_t width, const size_t topk, infiniDtype_t xtype, + cudaStream_t stream) { const int block_threads = BLOCK_SIZE; dim3 blocks(N); dim3 threads(block_threads); @@ -63,9 +63,15 @@ infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, void * }; // namespace infiniStatus_t Descriptor::calculate( - void *workspace, size_t workspace_size, - float *values, int *indices, void *x, float *correction_bias, float routed_scaling_factor, size_t topk, void *stream) const { - + void *workspace, + size_t workspace_size, + float *values, + int *indices, + const void *x, + const float *correction_bias, + const float routed_scaling_factor, + const size_t topk, + void *stream) const { if (workspace_size < _workspace_size) { return INFINI_STATUS_INSUFFICIENT_WORKSPACE; } @@ -76,13 +82,12 @@ infiniStatus_t Descriptor::calculate( // size_t n_routed_experts = 256; // size_t n_group = 8; // size_t topk_group = 4; - auto cuda_stream = reinterpret_cast(stream); if (256 == width) { launch_topkrouter<256>(values, indices, x, correction_bias, routed_scaling_factor, N, width, topk, _info.xtype, cuda_stream); } else { - return INFINI_STATUS_INTERNAL_ERROR; + return INFINI_STATUS_BAD_PARAM; } return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh index 1aab4e2ff..9b3a81c37 100644 --- a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh +++ b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh @@ -1,5 +1,5 @@ -#ifndef __Topkrouter_CUDA_H__ -#define __Topkrouter_CUDA_H__ +#ifndef __TOPKROUTER_CUDA_H__ +#define __TOPKROUTER_CUDA_H__ #include "../topkrouter.h" diff --git a/src/infiniop/ops/topkrouter/operator.cc 
b/src/infiniop/ops/topkrouter/operator.cc index 4d43c77ce..c61c9c07a 100644 --- a/src/infiniop/ops/topkrouter/operator.cc +++ b/src/infiniop/ops/topkrouter/operator.cc @@ -9,18 +9,13 @@ #include "nvidia/topkrouter_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateTopkrouterDescriptor( - infiniopHandle_t handle, - infiniopTopkrouterDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t correction_bias_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::topkrouter::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - x_desc, correction_bias_desc) +__C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, infiniopTopkrouterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t correction_bias_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::topkrouter::NAMESPACE::Descriptor::create( \ + handle, reinterpret_cast(desc_ptr), x_desc, correction_bias_desc) switch (handle->device) { #ifdef ENABLE_CPU_API @@ -37,7 +32,6 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor( } __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size) { - #define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ @@ -58,12 +52,13 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript } __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void *workspace, size_t workspace_size, - void *values, void *indices, void *x, void *correction_bias, float routed_scaling_factor, size_t topk, void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc)->calculate( \ - workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, topk, stream) + void *values, void *indices, const void *x, const void *correction_bias, + 
const float routed_scaling_factor, const size_t topk, void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, \ + topk, stream) switch (desc->device_type) { #ifdef ENABLE_CPU_API @@ -80,7 +75,6 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void } __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc) { - #define DESTROY(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ diff --git a/src/infiniop/ops/topkrouter/topkrouter.h b/src/infiniop/ops/topkrouter/topkrouter.h index 3e6baae13..fbd763763 100644 --- a/src/infiniop/ops/topkrouter/topkrouter.h +++ b/src/infiniop/ops/topkrouter/topkrouter.h @@ -1,50 +1,51 @@ -#ifndef _Topkrouter_H_ -#define _Topkrouter_H_ +#ifndef _TOPKTOUTER_H_ +#define _TOPKTOUTER_H_ #include "../../operator.h" #include "info.h" -#define DESCRIPTOR(NAMESPACE) \ - \ - namespace op::topkrouter::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - TopkrouterInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - TopkrouterInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t x_desc, \ - infiniopTensorDescriptor_t correction_bias_desc); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - float *values, \ - int *indices, \ - void *x, \ - float *correction_bias, \ - float routed_scaling_factor, \ - size_t 
topk, \ - void *stream) const; \ - }; \ +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::topkrouter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TopkrouterInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, \ + TopkrouterInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) { \ + } \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { \ + return _workspace_size; \ + } \ + \ + static infiniStatus_t create(infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t correction_bias_desc); \ + \ + infiniStatus_t calculate(void *workspace, \ + size_t workspace_size, \ + float *values, \ + int *indices, \ + const void *x, \ + const float *correction_bias, \ + const float routed_scaling_factor, \ + const size_t topk, \ + void *stream) const; \ + }; \ } #endif // _Topkrouter_H_ diff --git a/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc new file mode 100644 index 000000000..53584e182 --- /dev/null +++ b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc @@ -0,0 +1,143 @@ +#include "topksoftmax_cpu.h" +#include "../../../../utils.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "topksoftmax_cpu.h" +#include + +namespace op::topksoftmax::cpu { +Descriptor::~Descriptor() { +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc) { + auto result = TopksoftmaxInfo::create(x_desc); + CHECK_RESULT(result); + + auto info = result.take(); + if (info.x_strides[1] != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(nullptr, std::move(info), 0, 
handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +void topksoftmax_cpu_one_token(float *values_input, // 输出数据 + int *indices_input, // 输出索引 + std::vector> &value_index_arr, // 输入数据 + size_t topk, + bool norm, + size_t width) { + + // ------------------------------------------------ // + // 第一步:计算最大值 // + // ------------------------------------------------ // + float value_max = value_index_arr[0].first; + for (size_t i = 1; i < width; ++i) { + value_max = value_index_arr[i].first > value_max ? value_index_arr[i].first : value_max; + } + + // ------------------------------------------------ // + // 第二步: 指数计算 // + // ------------------------------------------------ // + float exp_sum = 0.0f; + for (size_t i = 0; i < width; ++i) { + float value = std::exp(value_index_arr[i].first - value_max); + value_index_arr[i].first = value; + exp_sum += value; + } + + // ------------------------------------------------ // + // 第三步:计算 Softmax // + // ------------------------------------------------ // + for (size_t i = 0; i < width; ++i) { + value_index_arr[i].first /= exp_sum; + } + + // ------------------------------------------------ // + // 第四步:计算 排序 // + // ------------------------------------------------ // + std::sort(value_index_arr.begin(), value_index_arr.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); + + // ------------------------------------------------ // + // 第五步: topk // + // ------------------------------------------------ // + exp_sum = 0.0f; + for (size_t i = 0; i < topk; ++i) { + values_input[i] = value_index_arr[i].first; + indices_input[i] = static_cast(value_index_arr[i].second); + exp_sum += values_input[i]; + } + + // ------------------------------------------------ // + // 第6步: norm归一化 // + // ------------------------------------------------ // + if (norm) { + for (size_t i = 0; i < topk; ++i) { + values_input[i] /= exp_sum; + } + } +} + +template +infiniStatus_t topksoftmax_cpu_func(float 
*values, int *indices, + const T *x, + size_t topk, bool norm, size_t N, size_t width) { + /* + O-----------> width 地址连续 + | + | + N + */ + + for (size_t n = 0; n < N; ++n) { + + float *values_input = values + n * topk; + int *indices_input = indices + n * topk; + const T *x_input = x + n * width; + + std::vector> value_index_arr; + value_index_arr.resize(width); + + // ------------------------------------------------ // + // 第0步: 数据先转换到 float // + // ------------------------------------------------ // + float temp; + for (size_t i = 0; i < width; ++i) { + if constexpr (std::is_same::value) { + temp = _f16_to_f32(x_input[i]); + } else if constexpr (std::is_same::value) { + temp = _bf16_to_f32(x_input[i]); + } else { + temp = x_input[i]; + } + value_index_arr[i] = {temp, i}; + } + + topksoftmax_cpu_one_token(values_input, + indices_input, + value_index_arr, + topk, + norm, + width); + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x, + const size_t topk, const bool norm, void *stream) const { + size_t N = _info.N; + size_t width = _info.width; + if (_info.xtype == INFINI_DTYPE_F32) { + topksoftmax_cpu_func(values, indices, (const float *)x, topk, norm, N, width); + } else if (_info.xtype == INFINI_DTYPE_F16) { + topksoftmax_cpu_func(values, indices, (const fp16_t *)x, topk, norm, N, width); + } else if (_info.xtype == INFINI_DTYPE_BF16) { + topksoftmax_cpu_func(values, indices, (const bf16_t *)x, topk, norm, N, width); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::topksoftmax::cpu diff --git a/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h new file mode 100644 index 000000000..c874f4997 --- /dev/null +++ b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h @@ -0,0 +1,7 @@ +#ifndef __TOPKSOFTMAX_CPU_H__ +#define __TOPKSOFTMAX_CPU_H__ 
+#include "../topksoftmax.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/topksoftmax/cuda/kernel.cuh b/src/infiniop/ops/topksoftmax/cuda/kernel.cuh new file mode 100644 index 000000000..b8bd05833 --- /dev/null +++ b/src/infiniop/ops/topksoftmax/cuda/kernel.cuh @@ -0,0 +1,145 @@ +#ifndef _TOPKSOFTMAX_KERNEL_CUH__ +#define _TOPKSOFTMAX_KERNEL_CUH__ +#include +#include +#include +#include +#include +#include +#include +#include + +template +inline __device__ float exp_func(T x) { + float data; + if constexpr (std::is_same_v) { + data = x; + } else if constexpr (std::is_same_v) { + data = __bfloat162float(x); + } else if constexpr (std::is_same_v) { + data = __half2float(x); + } + return __expf(data); +} + +template +__global__ void softmax_topk_row_kernel(float *values_topk, // 输出数据, 形状[N, topk] + int *indices_topk, // 输出索引, 形状[N, topk] + const T *input, // 输入数据 [N, width] + const size_t N, + const size_t width, + const size_t topk, + bool norm + +) { + const int bid = blockIdx.x; + if (bid >= N) { + return; + } + + const int tid = threadIdx.x; + const T *data_input = input + bid * width; + float *values_topk_output = values_topk + bid * topk; + int *indices_topk_output = indices_topk + bid * topk; + + const int warp_id = tid / 32; + + __shared__ T shared_max; + __shared__ float shared_sum; + typedef cub::BlockReduce BlockReduce; + + // ------------------------------------------------ // + // 第一步:计算最大值 // + // ------------------------------------------------ // + T thread_max = data_input[0]; + if (tid < width) { + thread_max = thread_max > data_input[tid] ? 
thread_max : data_input[tid]; + } + + { + __shared__ typename BlockReduce::TempStorage temp_storage_max; + T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, cub::Max()); + if (tid == 0) { + shared_max = value_max; + } + } + __syncthreads(); + + // ------------------------------------------------ // + // 第二步:计算指数和 // + // ------------------------------------------------ // + float exp_val = 0.0f; + if (tid < width) { + T temp_val = data_input[tid] - shared_max; + exp_val = exp_func(temp_val); + } + + { + __shared__ typename BlockReduce::TempStorage temp_storage_sum; + float value_sum = BlockReduce(temp_storage_sum).Sum(exp_val); + if (tid == 0) { + shared_sum = value_sum; + } + } + __syncthreads(); + + // ------------------------------------------------ // + // 第三步:计算 Softmax // + // ------------------------------------------------ // + exp_val /= shared_sum; + + // ------------------------------------------------ // + // 第四步:计算 排序 // + // ------------------------------------------------ // + float thread_values[1] = {-FLT_MAX}; + int thread_indices[1] = {-1}; + if ((tid < width) && (exp_val > thread_values[0])) { + thread_values[0] = exp_val; + thread_indices[0] = tid; + } + { + typedef cub::BlockRadixSort BlockRadixSort; + __shared__ typename BlockRadixSort::TempStorage temp_storage; + BlockRadixSort(temp_storage).SortDescending(thread_values, thread_indices); + } + __syncthreads(); + + if (0 == warp_id) { + int indice = -1; + float value = 0.0f; + if (tid < topk) { + indice = thread_indices[0]; + value = thread_values[0]; + } + + // ------------------------------------------------ // + // 第五步: topk的和 // + // ------------------------------------------------ // + { + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + float warp_sum = WarpReduce(temp_storage).Sum(value); + if (0 == tid) { + shared_sum = warp_sum + 1e-9f; + } + } + __syncwarp(); + + // ------------------------------------------------ // + // 
第6步: norm归一化 // + // ------------------------------------------------ // + if (norm && (tid < topk)) { + value /= shared_sum; + } + + // ------------------------------------------------ // + // 第7步: 最终的返回值 // + // ------------------------------------------------ // + if (tid < topk) { + values_topk_output[tid] = value; + indices_topk_output[tid] = indice; + } + } +} + +#endif // _TOPKSOFTMAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/topksoftmax/info.h b/src/infiniop/ops/topksoftmax/info.h new file mode 100644 index 000000000..4e8eb9d7c --- /dev/null +++ b/src/infiniop/ops/topksoftmax/info.h @@ -0,0 +1,48 @@ +#ifndef __TOPKSOFTMAX_INFO_H__ +#define __TOPKSOFTMAX_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::topksoftmax { + +class TopksoftmaxInfo { + TopksoftmaxInfo() = default; + +public: + infiniDtype_t xtype; + std::vector shape; + std::vector x_strides; + size_t N; + size_t width; + +public: + size_t ndim() const { return shape.size(); } + size_t dim() const { return shape[ndim() - 1]; } + + static utils::Result create(infiniopTensorDescriptor_t x_desc) { + + auto xtype = x_desc->dtype(); + if ((xtype != infiniDtype_t::INFINI_DTYPE_F32) && (xtype != infiniDtype_t::INFINI_DTYPE_F16) && (xtype != infiniDtype_t::INFINI_DTYPE_BF16)) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (x_desc->ndim() != 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t N = x_desc->shape()[0]; // token数量 + size_t width = x_desc->shape()[1]; // 专家数量 + + return utils::Result(TopksoftmaxInfo{xtype, + x_desc->shape(), + x_desc->strides(), + N, + width}); + } +}; + +} // namespace op::topksoftmax + +#endif // __TOPKSOFTMAX_INFO_H__ diff --git a/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu new file mode 100644 index 000000000..b7a9d7c4c --- /dev/null +++ b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu @@ -0,0 +1,92 @@ +#include 
"../../../devices/nvidia/nvidia_common.cuh" +#include "topksoftmax_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::topksoftmax::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto result = TopksoftmaxInfo::create(x_desc); + CHECK_RESULT(result); + auto info = result.take(); + + if (info.x_strides[1] != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + std::move(info), + 0, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launch_topksoftmax(float *d_values_out, int *d_indices_out, const void *d_input, const size_t N, const size_t width, const size_t topk, const bool norm, infiniDtype_t xtype, cudaStream_t stream) { + const int block_threads = BLOCK_SIZE; + dim3 blocks(static_cast(N)); + dim3 threads(block_threads); + + if (xtype == INFINI_DTYPE_F32) { + softmax_topk_row_kernel<<>>(d_values_out, d_indices_out, (float *)d_input, N, width, topk, norm); + } else if (xtype == INFINI_DTYPE_F16) { + softmax_topk_row_kernel<<>>(d_values_out, d_indices_out, (half *)d_input, N, width, topk, norm); + } else if (xtype == INFINI_DTYPE_BF16) { + softmax_topk_row_kernel<__nv_bfloat16, BLOCK_SIZE><<>>(d_values_out, d_indices_out, (__nv_bfloat16 *)d_input, N, width, topk, norm); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +}; // namespace + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + float *values, + int *indices, + const void *x, + const size_t topk, + const bool norm, + void *stream) const { + if 
(workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + size_t N = _info.N; + size_t width = _info.width; + auto cuda_stream = reinterpret_cast(stream); + if (width <= 128) { + launch_topksoftmax<128>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream); + } else if (width <= 256) { + launch_topksoftmax<256>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream); + } else if (width <= 512) { + launch_topksoftmax<512>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream); + } else { + return INFINI_STATUS_INTERNAL_ERROR; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::topksoftmax::nvidia diff --git a/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh new file mode 100644 index 000000000..fea0d0f2a --- /dev/null +++ b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TOPKSOFTMAX_CUDA_H__ +#define __TOPKSOFTMAX_CUDA_H__ + +#include "../topksoftmax.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/topksoftmax/operator.cc b/src/infiniop/ops/topksoftmax/operator.cc new file mode 100644 index 000000000..ad3c91dcd --- /dev/null +++ b/src/infiniop/ops/topksoftmax/operator.cc @@ -0,0 +1,101 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/topksoftmax.h" + +#ifdef ENABLE_CPU_API +#include "cpu/topksoftmax_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) +#include "nvidia/topksoftmax_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, + infiniopTopksoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::topksoftmax::NAMESPACE::Descriptor::create( \ + handle, reinterpret_cast(desc_ptr), x_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef 
ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, void *workspace, size_t workspace_size, + void *values, void *indices, const void *x, const size_t topk, const int norm, + void *stream) { + if (topk > 32) { + return INFINI_STATUS_BAD_PARAM; + } + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, (float *)values, (int *)indices, x, topk, static_cast(norm), stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef CALCULATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef DESTROY + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/topksoftmax/topksoftmax.h b/src/infiniop/ops/topksoftmax/topksoftmax.h new file mode 100644 index 000000000..29d438f3f --- /dev/null +++ 
b/src/infiniop/ops/topksoftmax/topksoftmax.h @@ -0,0 +1,34 @@ +#ifndef _TOPKSOFTMAX_H_ +#define _TOPKSOFTMAX_H_ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::topksoftmax::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TopksoftmaxInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, TopksoftmaxInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, _opaque(opaque), _info(info), _workspace_size(workspace_size) { \ + } \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { \ + return _workspace_size; \ + } \ + \ + static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x, \ + const size_t topk, const bool norm, void *stream) const; \ + }; \ + } + +#endif // _TOPKSOFTMAX_H_ diff --git a/test/infiniop-test/test_generate/testcases/sigmoid.py b/test/infiniop-test/test_generate/testcases/sigmoid.py new file mode 100644 index 000000000..f622a4d6e --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/sigmoid.py @@ -0,0 +1,136 @@ +import numpy as np +from numpy.lib.stride_tricks import as_strided +import gguf +from typing import List + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor + + +def sigmoid( + x: np.ndarray, +): + return 1 / (1 + np.exp(-x)) + + +def random_tensor(shape, dtype): + rate = 1e-3 + var = 0.5 * rate + return rate * np.random.rand(*shape).astype(dtype) - var + + +def process_tensors(a, b, stride_a=None, stride_b=None): + def normalize_stride(tensor, stride): + if stride: + slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride) + return tensor[slices] + else: + return tensor + + a_unique = normalize_stride(a, stride_a) + b_unique = normalize_stride(b, stride_b) + return a_unique, b_unique + + +def process_tensor(a, stride_a=None): + def normalize_stride(tensor, stride): + if stride: + slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride) + return tensor[slices] + else: + return tensor + + a_unique = normalize_stride(a, stride_a) + return a_unique + + +class SigmoidTestCase(InfiniopTestCase): + def __init__( + self, + x: np.ndarray, + shape_x: List[int] | None, + stride_x: List[int] | None, + y: np.ndarray, + shape_y: List[int] | None, + stride_y: List[int] | None, + ): + super().__init__("sigmoid") + self.x = x + self.shape_x = shape_x + self.stride_x = stride_x + + self.y = y + self.shape_y = shape_y + self.stride_y = stride_y + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + + if self.shape_x is not None: + test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) + if self.shape_y is not None: + test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) + + if self.stride_x is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + + test_writer.add_array( + test_writer.gguf_key("y.strides"), + gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + ) + + test_writer.add_tensor( + 
test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) + ) + + input_x = self.x.astype(np.float64) + if (self.stride_x is not None) and (0 in self.stride_x): + typesize = np.dtype(input_x.dtype).itemsize + new_strides_bytes = tuple(x * typesize for x in self.stride_x) + input_x = as_strided(x=input_x, shape=self.shape_x, strides=new_strides_bytes) + + ans = sigmoid(input_x) + + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == '__main__': + test_writer = InfiniopTestWriter("sigmoid.gguf") + + test_cases = [] + _TEST_CASES_ = [ + # shape, x_stride, y_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape, stride_x, stride_y in _TEST_CASES_: + x = np.random.rand(*shape).astype(dtype) + y = np.empty(tuple(0 for _ in shape), dtype=dtype) + + x = process_zero_stride_tensor(x, stride_x) + test_case = SigmoidTestCase(x=x, + shape_x=shape, + stride_x=stride_x, + y=y, + shape_y=shape, + stride_y=stride_y) + + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/topkrouter.py b/test/infiniop-test/test_generate/testcases/topkrouter.py new file mode 100644 index 000000000..54d647154 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/topkrouter.py @@ -0,0 +1,225 @@ +import numpy as np +from typing import List +import torch +import torch.nn as nn + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +class DeepseekV3TopkRouter(nn.Module): + def __init__(self, correction_bias, + routed_scaling_factor: float, + topk: int, config=None): + super().__init__() + self.config = config + self.top_k = 8 # config.num_experts_per_tok + self.n_routed_experts = 256 # config.n_routed_experts + self.routed_scaling_factor = 2.5 # config.routed_scaling_factor + self.n_group = 8 # config.n_group + self.topk_group = 4 # config.topk_group + self.norm_topk_prob = True # config.norm_topk_prob + + self.routed_scaling_factor = routed_scaling_factor + self.top_k = topk + + # self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size))) + # self.weight = torch.rand(256, 7168) * 2 - 1 + + # self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts)) + self.e_score_correction_bias = torch.zeros(256, ) + self.e_score_correction_bias[:] = correction_bias[:] + + @torch.no_grad() + def get_topk_indices(self, scores): + scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) # Size([1, 256]) + group_scores = ( + scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) + .topk(2, dim=-1)[0] + .sum(dim=-1) + ) + + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=True)[1] # Size([1, 4]) + group_mask = torch.zeros_like(group_scores) # Size([1, 8]) + group_mask.scatter_(1, group_idx, 1) # Size([1, 8]) + + score_mask = ( + group_mask.unsqueeze(-1) + .expand(-1, self.n_group, self.n_routed_experts // self.n_group) + .reshape(-1, self.n_routed_experts) + ) + + scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # Size([1, 256]) + topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, 
sorted=True)[1] # Size([1, 8]) + + return topk_indices + + def forward(self, router_logits): + # hidden_states = hidden_states.view(-1, 7168) + # router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32)) + + scores = router_logits.sigmoid() # (1,256) + scores = scores.to(torch.float32) + + topk_indices = self.get_topk_indices(scores) # (1,8) + topk_weights = scores.gather(1, topk_indices) + + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights /= denominator + topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights + + +def python_topkrouter(x: np.ndarray, + correction_bias: np.ndarray, + routed_scaling_factor: float, + topk: int): + x = torch.from_numpy(x) + correction_bias = torch.from_numpy(correction_bias) + + router_logits = x + lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias, routed_scaling_factor=routed_scaling_factor, topk=topk)(router_logits) + lable_indices = lable_indices.to(torch.int32) + + return lable_values.numpy(), lable_indices.numpy() + + +class TopkrouterTestCase(InfiniopTestCase): + def __init__(self, + values: np.ndarray, # 传出参数 + indices: np.ndarray, # 传出参数 + x: np.ndarray, # 传入参数 + correction_bias: np.ndarray, # 传入参数 + routed_scaling_factor: float, + topk: int, + values_shape: List[int] | None, + values_strides: List[int] | None, + indices_shape: List[int] | None, + indices_strides: List[int] | None, + x_shape: List[int] | None, + x_strides: List[int] | None, + correction_bias_shape: List[int] | None, + correction_bias_strides: List[int] | None, + ): + super().__init__("topkrouter") + self.values = values + self.indices = indices + self.x = x + self.correction_bias = correction_bias + + self.routed_scaling_factor = routed_scaling_factor + self.topk = topk + + self.values_shape = values_shape + self.values_strides = values_strides + self.indices_shape = indices_shape + self.indices_strides = 
indices_strides + self.x_shape = x_shape + self.x_strides = x_strides + self.correction_bias_shape = correction_bias_shape + self.correction_bias_strides = correction_bias_strides + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + + if self.values_shape is not None: + print("self.values_shape: ", self.values_shape) + test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape) + if self.indices_shape is not None: + test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape) + if self.x_shape is not None: + test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + if self.correction_bias_shape is not None: + test_writer.add_array(test_writer.gguf_key("correction_bias.shape"), self.correction_bias_shape) + + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + if self.correction_bias_strides is not None: + test_writer.add_array(test_writer.gguf_key("correction_bias_strides.strides"), gguf_strides(*self.correction_bias_strides)) + test_writer.add_array( + test_writer.gguf_key("values.strides"), + gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape)) + ) + test_writer.add_array( + test_writer.gguf_key("indices.strides"), + gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape)) + ) + + test_writer.add_tensor(test_writer.gguf_key("values"), + self.values, + raw_dtype=np_dtype_to_ggml(self.values.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("indices"), + self.indices, + raw_dtype=np_dtype_to_ggml(self.indices.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("x"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("correction_bias"), + self.correction_bias, + 
raw_dtype=np_dtype_to_ggml(self.correction_bias.dtype)) + + test_writer.add_float32(test_writer.gguf_key("routed_scaling_factor"), self.routed_scaling_factor) + test_writer.add_int32(test_writer.gguf_key("topk"), self.topk) + + lable_values, lable_indices = python_topkrouter(self.x.copy(), self.correction_bias.copy(), self.routed_scaling_factor, self.topk) + + test_writer.add_tensor( + test_writer.gguf_key("lable_values"), + lable_values, + raw_dtype=np_dtype_to_ggml(lable_values.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("lable_indices"), + lable_indices, + raw_dtype=np_dtype_to_ggml(lable_indices.dtype) + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("topkrouter.gguf") + test_cases = [] + + _TEST_CASES_ = [ + # x_shape, x_strides, correction_bias_shape, correction_bias_stride, routed_scaling_factor, topk + ((1, 256), None, (256,), None, 2.5, 8), + ((2, 256), None, (256,), None, 1.5, 8), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for x_shape, x_strides, correction_bias_shape, b_stride, routed_scaling_factor, topk in _TEST_CASES_: + ntoken = x_shape[0] + + values_indices_shape = (ntoken, topk) + values = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.float32) + indices = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.int32) + + x = np.random.rand(*x_shape).astype(dtype) + correction_bias = np.random.rand(*correction_bias_shape).astype(np.float32) + test_case = TopkrouterTestCase( + values=values, + indices=indices, + x=x, + correction_bias=correction_bias, + routed_scaling_factor=routed_scaling_factor, + topk=topk, + values_shape=list(values_indices_shape), + values_strides=None, + indices_shape=list(values_indices_shape), + indices_strides=None, + x_shape=list(x_shape), + x_strides=None, + correction_bias_shape=list(correction_bias_shape), + correction_bias_strides=None, + ) + + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + 
test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/topksoftmax.py b/test/infiniop-test/test_generate/testcases/topksoftmax.py new file mode 100644 index 000000000..337c4b9e9 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/topksoftmax.py @@ -0,0 +1,148 @@ +import numpy as np +from typing import List +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +def torch_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool): + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + if norm_topk_prob: # only diff with mixtral sparse moe block! + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(torch.float) + return routing_weights, selected_experts + + +def python_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool): + router_logits = torch.from_numpy(router_logits) + lable_values, lable_indices = torch_Topksoftmax(router_logits, top_k, norm_topk_prob) + return lable_values.numpy(), lable_indices.numpy() + + +class TopksoftmaxTestCase(InfiniopTestCase): + def __init__(self, + values: np.ndarray, # 传出参数 + indices: np.ndarray, # 传出参数 + x: np.ndarray, # 传入参数 + topk: np.ndarray, + norm: bool, + values_shape: List[int] | None, + values_strides: List[int] | None, + indices_shape: List[int] | None, + indices_strides: List[int] | None, + x_shape: List[int] | None, + x_strides: List[int] | None, + ): + super().__init__("topksoftmax") + self.values = values + self.indices = indices + self.x = x + self.topk = topk + self.norm = norm + + self.values_shape = values_shape + self.values_strides = values_strides + 
self.indices_shape = indices_shape + self.indices_strides = indices_strides + self.x_shape = x_shape + self.x_strides = x_strides + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + + if self.values_shape is not None: + print("self.values_shape: ", self.values_shape) + test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape) + if self.indices_shape is not None: + test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape) + if self.x_shape is not None: + test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + + test_writer.add_array( + test_writer.gguf_key("values.strides"), + gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape)) + ) + + test_writer.add_array( + test_writer.gguf_key("indices.strides"), + gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape)) + ) + + test_writer.add_tensor(test_writer.gguf_key("values"), + self.values, + raw_dtype=np_dtype_to_ggml(self.values.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("indices"), + self.indices, + raw_dtype=np_dtype_to_ggml(self.indices.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("x"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype)) + + test_writer.add_int32(test_writer.gguf_key("topk"), self.topk) + test_writer.add_bool(test_writer.gguf_key("norm"), self.norm) + + lable_values, lable_indices = python_Topksoftmax(self.x.copy(), self.topk, self.norm) + + test_writer.add_tensor( + test_writer.gguf_key("lable_values"), + lable_values, + raw_dtype=np_dtype_to_ggml(lable_values.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("lable_indices"), + lable_indices, + raw_dtype=np_dtype_to_ggml(lable_indices.dtype) + ) + + +if 
__name__ == "__main__": + test_writer = InfiniopTestWriter("topksoftmax.gguf") + test_cases = [] + + _TEST_CASES_ = [ + # x_shape, x_strides, topk, norm + ((1, 32), None, 4, True), + ((8, 20), None, 8, False), + ((2, 128), None, 10, True) + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for x_shape, x_strides, topk, norm in _TEST_CASES_: + ntoken = x_shape[0] + + values_indices_shape = (ntoken, topk) + values = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.float32) + indices = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.int32) + + x = np.random.rand(*x_shape).astype(dtype) + + test_case = TopksoftmaxTestCase( + values=values, + indices=indices, + x=x, + topk=topk, + norm=norm, + values_shape=list(values_indices_shape), + values_strides=None, + indices_shape=list(values_indices_shape), + indices_strides=None, + x_shape=list(x_shape), + x_strides=None + ) + + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index ba1ce33df..c47da4ea4 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -495,6 +495,66 @@ def conv_(lib): ] +@OpRegister.operator +def sigmoid_(lib): + lib.infiniopCreateSigmoidDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSigmoidWorkspaceSize.restype = c_int32 + lib.infiniopGetSigmoidWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSigmoid.restype = c_int32 + lib.infiniopSigmoid.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySigmoidDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidDescriptor.argtypes = [ 
+ infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def topksoftmax_(lib): + lib.infiniopCreateTopksoftmaxDescriptor.restype = c_int32 + lib.infiniopCreateTopksoftmaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + ] + lib.infiniopGetTopksoftmaxWorkspaceSize.restype = c_int32 + lib.infiniopGetTopksoftmaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTopksoftmax.restype = c_int32 + lib.infiniopTopksoftmax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_size_t, + c_int32, + c_void_p, + ] + lib.infiniopDestroyTopksoftmaxDescriptor.restype = c_int32 + lib.infiniopDestroyTopksoftmaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def topkrouter_(lib): lib.infiniopCreateTopkrouterDescriptor.restype = c_int32 @@ -502,7 +562,7 @@ def topkrouter_(lib): infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, - infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t ] lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32 @@ -510,7 +570,6 @@ def topkrouter_(lib): infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopTopkrouter.restype = c_int32 lib.infiniopTopkrouter.argtypes = [ infiniopOperatorDescriptor_t, @@ -524,7 +583,6 @@ def topkrouter_(lib): c_size_t, c_void_p, ] - lib.infiniopDestroyTopkrouterDescriptor.restype = c_int32 lib.infiniopDestroyTopkrouterDescriptor.argtypes = [ infiniopOperatorDescriptor_t, diff --git a/test/infiniop/sigmoid.py b/test/infiniop/sigmoid.py new file mode 100644 index 000000000..589900fac --- /dev/null +++ b/test/infiniop/sigmoid.py @@ -0,0 +1,172 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + 
TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, x_stride, y_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), (0, 1)), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (4, 0, 1)), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ((4, 4, 56320), None, None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def torch_sigmoid(y, x): + torch.sigmoid(x, out=y) + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_stride != y_stride: + return + y = x + else: + y = TestTensor(shape, y_stride, dtype, 
device, mode="ones") + + if y.is_broadcast(): + return + + print( + f"Testing Sigmoid on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + torch_sigmoid(y.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sigmoid(): + check_error( + LIBINFINIOP.infiniopSigmoid( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_sigmoid() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_sigmoid(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sigmoid(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySigmoidDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") + diff --git a/test/infiniop/topkrouter.py b/test/infiniop/topkrouter.py index 6f851a89f..97b9e37e8 100644 --- a/test/infiniop/topkrouter.py +++ b/test/infiniop/topkrouter.py @@ -27,14 +27,13 @@ # ============================================================================== # These are not meant to be imported from other modules _TEST_CASES_ = [ - # x_shape, x_stride, select_experts - ((1, 256), None, 8), - ((3, 256), None, 8), + # x_shape, x_stride, topk, routed_scaling_factor + ((1, 256), None, 8, 2.5), ] # w (weight) types # Note: 'None' means the same as input dtype -_X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] # +_X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] # x types used for testing _VALUE_DTYPES = [InfiniDtype.F32] @@ -46,6 +45,8 @@ # Tolerance map for different data types _TOLERANCE_MAP = { InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, } DEBUG = False @@ -59,12 +60,13 @@ def tensorInfo(data): class DeepseekV3TopkRouter(nn.Module): - def __init__(self, correction_bias, config=None): + def __init__(self, correction_bias, routed_scaling_factor: float = 2.5, topk: int = 8, config=None): super().__init__() self.config = config - self.top_k = 8 # config.num_experts_per_tok + self.top_k = topk # config.num_experts_per_tok 8 + assert topk == 8 self.n_routed_experts = 256 # config.n_routed_experts - self.routed_scaling_factor = 2.5 # config.routed_scaling_factor + self.routed_scaling_factor = routed_scaling_factor # config.routed_scaling_factor 2.5 self.n_group = 8 # config.n_group self.topk_group = 4 # config.topk_group self.norm_topk_prob = True # config.norm_topk_prob @@ -73,7 +75,7 @@ def __init__(self, correction_bias, config=None): # self.weight = torch.rand(256, 7168) * 2 - 1 # self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts)) - self.e_score_correction_bias = 
torch.zeros(256, device="cuda") + self.e_score_correction_bias = torch.zeros(256, device=correction_bias.device) self.e_score_correction_bias[:] = correction_bias[:] @torch.no_grad() @@ -114,11 +116,12 @@ def forward(self, router_logits): denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 topk_weights /= denominator topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights -def torch_topkrouter(router_logits, correction_bias): - lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias)(router_logits) +def torch_topkrouter(router_logits, correction_bias, routed_scaling_factor, topk): + lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias, routed_scaling_factor, topk)(router_logits) lable_indices = lable_indices.to(torch.int32) return lable_values, lable_indices @@ -129,12 +132,13 @@ def test( x_shape, x_stride, topk, + routed_scaling_factor, x_dtype=InfiniDtype.F32, dtype=InfiniDtype.F16, sync=None, ): print( - f"Testing topkrouter on {InfiniDeviceNames[device]} with x_shape:{x_shape}" + f"Testing topkrouter on {InfiniDeviceNames[device]} with x_shape:{x_shape} " f"x_stride:{x_stride} w_dtype:{InfiniDtypeNames[x_dtype]} dtype:{InfiniDtypeNames[dtype]}" ) @@ -182,27 +186,27 @@ def lib_topkrouter(): indices.data_ptr(), x.data(), correction_bias.data(), - 2.5, + routed_scaling_factor, topk, None, ) ) lib_topkrouter() - lable_values, lable_indices = torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor()) + lable_values, lable_indices = torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor(), routed_scaling_factor, topk) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: debug(lable_values, values, atol=atol, rtol=rtol) debug(lable_indices, indices, atol=atol, rtol=rtol) assert torch.allclose(lable_values, values, atol=atol, rtol=rtol) - assert torch.allclose(lable_indices, lable_indices, atol=atol, rtol=rtol) + assert torch.allclose(lable_indices, indices, atol=atol, 
rtol=rtol) # Profiling workflow if PROFILE: # fmt: off - profile_operation("PyTorch", lambda: torch_topkrouter(x.actual_tensor().clone(), tokp), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation("PyTorch", lambda: torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor(), routed_scaling_factor, topk), device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_topkrouter(), device, NUM_PRERUN, NUM_ITERATIONS) # fmt: on check_error(LIBINFINIOP.infiniopDestroyTopkrouterDescriptor(descriptor)) diff --git a/test/infiniop/topksoftmax.py b/test/infiniop/topksoftmax.py new file mode 100644 index 000000000..8529b8ffd --- /dev/null +++ b/test/infiniop/topksoftmax.py @@ -0,0 +1,170 @@ +import torch +import ctypes +from ctypes import c_uint64 +import torch.nn.functional as F + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + torch_device_map +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # x_shape, x_stride, topk, norm + ((1, 10), None, 7, True), + ((2, 20), None, 4, True), + ((1, 128), None, 10, True), +] + +# w (weight) types +# Note: 'None' means the same as input dtype +_X_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] # +# x types used for testing +_VALUE_DTYPES = [InfiniDtype.F32] + +# Form the test cases by appending each element of _X_DTYPES to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (x_dtype,) for test_case in _TEST_CASES_ for x_dtype in _X_DTYPES +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F32: {"atol": 1e-3, 
"rtol": 1e-3}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tensorInfo(data): + print("data: ", data.is_contiguous(), data.device, data.dtype, data.shape, data.stride(), data.data_ptr(), hex(data.data_ptr())) + + +def torch_topksoftmax(router_logits, top_k, norm_topk_prob=False): + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + if norm_topk_prob: # only diff with mixtral sparse moe block! + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + return routing_weights, selected_experts + + +def test( + handle, + device, + x_shape, + x_stride, + topk, + norm_topk_prob, + x_dtype=InfiniDtype.F32, + dtype=InfiniDtype.F16, + sync=None, +): + print( + f"Testing topksoftmax on {InfiniDeviceNames[device]} with x_shape:{x_shape}" + f"x_stride:{x_stride} w_dtype:{InfiniDtypeNames[x_dtype]} dtype:{InfiniDtypeNames[dtype]}" + ) + + data = torch.arange(0, x_shape[0] * x_shape[1]).reshape(x_shape) + + N, width = x_shape + x = TestTensor(x_shape, data.stride(), x_dtype, device, scale=0.5, mode="manual", set_tensor=data) + + # print(x.torch_tensor()) + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTopksoftmaxDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTopksoftmaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + values = torch.zeros((N, topk), dtype=torch.float32, device=torch_device_map[x.device]) + indices = torch.zeros((N, 
topk), dtype=torch.int32, device=torch_device_map[x.device]) + + def lib_topksoftmax(): + check_error( + LIBINFINIOP.infiniopTopksoftmax( + descriptor, + workspace.data(), + workspace_size.value, + values.data_ptr(), + indices.data_ptr(), + x.data(), + topk, + norm_topk_prob, + None, + ) + ) + + lable_values, lable_indices = torch_topksoftmax(x.torch_tensor().clone(), topk, norm_topk_prob=norm_topk_prob) + lable_indices = lable_indices.to(dtype=torch.int32) + lib_topksoftmax() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(lable_values, values, atol=atol, rtol=rtol) + debug(lable_indices, indices, atol=atol, rtol=rtol) + + assert torch.allclose(lable_values, values, atol=atol, rtol=rtol) + assert torch.allclose(lable_indices, indices, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_topksoftmax(x.actual_tensor().clone(), topk), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_topksoftmax(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTopksoftmaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _VALUE_DTYPES) + + print("\033[92mTest passed!\033[0m") +