From ed530e111ce058f421304dab05796def49e8ce9e Mon Sep 17 00:00:00 2001 From: pengcheng888 Date: Mon, 29 Sep 2025 16:02:23 +0800 Subject: [PATCH] issue/427 - the sigmoid, topksoftmax, and topkrouter ops --- include/infiniop.h | 2 + include/infiniop/ops/sigmoid.h | 24 ++ include/infiniop/ops/topkrouter.h | 21 +- include/infiniop/ops/topksoftmax.h | 26 ++ scripts/python_test.py | 3 + src/infiniop-test/include/ops.hpp | 7 +- src/infiniop-test/src/ops/sigmoid.cpp | 103 ++++++++ src/infiniop-test/src/ops/topkrouter.cpp | 130 ++++++++++ src/infiniop-test/src/ops/topksoftmax.cpp | 122 ++++++++++ src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc | 51 ++++ src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h | 19 ++ src/infiniop/ops/sigmoid/cuda/kernel.cuh | 34 +++ .../ops/sigmoid/nvidia/sigmoid_nvidia.cu | 58 +++++ .../ops/sigmoid/nvidia/sigmoid_nvidia.cuh | 8 + src/infiniop/ops/sigmoid/operator.cc | 115 +++++++++ .../ops/topkrouter/cpu/topkrouter_cpu.cc | 217 +++++++++++++++-- .../ops/topkrouter/cpu/topkrouter_cpu.h | 5 +- src/infiniop/ops/topkrouter/cuda/kernel.cuh | 42 ++-- src/infiniop/ops/topkrouter/info.h | 4 +- .../topkrouter/nvidia/topkrouter_nvidia.cu | 21 +- .../topkrouter/nvidia/topkrouter_nvidia.cuh | 4 +- src/infiniop/ops/topkrouter/operator.cc | 34 ++- src/infiniop/ops/topkrouter/topkrouter.h | 87 +++---- .../ops/topksoftmax/cpu/topksoftmax_cpu.cc | 143 +++++++++++ .../ops/topksoftmax/cpu/topksoftmax_cpu.h | 7 + src/infiniop/ops/topksoftmax/cuda/kernel.cuh | 145 +++++++++++ src/infiniop/ops/topksoftmax/info.h | 48 ++++ .../topksoftmax/nvidia/topksoftmax_nvidia.cu | 92 +++++++ .../topksoftmax/nvidia/topksoftmax_nvidia.cuh | 8 + src/infiniop/ops/topksoftmax/operator.cc | 101 ++++++++ src/infiniop/ops/topksoftmax/topksoftmax.h | 34 +++ .../test_generate/testcases/sigmoid.py | 136 +++++++++++ .../test_generate/testcases/topkrouter.py | 225 ++++++++++++++++++ .../test_generate/testcases/topksoftmax.py | 148 ++++++++++++ test/infiniop/libinfiniop/op_register.py | 64 ++++- 
test/infiniop/sigmoid.py | 172 +++++++++++++ test/infiniop/topkrouter.py | 34 +-- test/infiniop/topksoftmax.py | 170 +++++++++++++ 38 files changed, 2523 insertions(+), 141 deletions(-) create mode 100644 include/infiniop/ops/sigmoid.h create mode 100644 include/infiniop/ops/topksoftmax.h create mode 100644 src/infiniop-test/src/ops/sigmoid.cpp create mode 100644 src/infiniop-test/src/ops/topkrouter.cpp create mode 100644 src/infiniop-test/src/ops/topksoftmax.cpp create mode 100644 src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc create mode 100644 src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h create mode 100644 src/infiniop/ops/sigmoid/cuda/kernel.cuh create mode 100644 src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu create mode 100644 src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh create mode 100644 src/infiniop/ops/sigmoid/operator.cc create mode 100644 src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc create mode 100644 src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h create mode 100644 src/infiniop/ops/topksoftmax/cuda/kernel.cuh create mode 100644 src/infiniop/ops/topksoftmax/info.h create mode 100644 src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu create mode 100644 src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh create mode 100644 src/infiniop/ops/topksoftmax/operator.cc create mode 100644 src/infiniop/ops/topksoftmax/topksoftmax.h create mode 100644 test/infiniop-test/test_generate/testcases/sigmoid.py create mode 100644 test/infiniop-test/test_generate/testcases/topkrouter.py create mode 100644 test/infiniop-test/test_generate/testcases/topksoftmax.py create mode 100644 test/infiniop/sigmoid.py create mode 100644 test/infiniop/topksoftmax.py diff --git a/include/infiniop.h b/include/infiniop.h index b3cf8b6ca..038a06ade 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -19,6 +19,8 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/ops/topkrouter.h" +#include "infiniop/ops/topksoftmax.h" 
+#include "infiniop/ops/sigmoid.h" #include "infiniop/tensor_descriptor.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/sigmoid.h b/include/infiniop/ops/sigmoid.h new file mode 100644 index 000000000..4fa0f6604 --- /dev/null +++ b/include/infiniop/ops/sigmoid.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIGMOID_API_H__ +#define __INFINIOP_SIGMOID_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidDescriptor(infiniopHandle_t handle, + infiniopSigmoidDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x); + +__C __export infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoid(infiniopSigmoidDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/topkrouter.h b/include/infiniop/ops/topkrouter.h index d85b6b5ff..da6aaced5 100644 --- a/include/infiniop/ops/topkrouter.h +++ b/include/infiniop/ops/topkrouter.h @@ -5,16 +5,23 @@ typedef struct InfiniopDescriptor *infiniopTopkrouterDescriptor_t; -__C __export infiniStatus_t infiniopCreateTopkrouterDescriptor( - infiniopHandle_t handle, - infiniopTopkrouterDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t correction_bias_desc); +__C __export infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, + infiniopTopkrouterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t correction_bias_desc); __C __export infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size); -__C __export infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void 
*workspace, size_t workspace_size, - void *values, void *indices, void *x, void *correction_bias, float routed_scaling_factor, size_t topk, void *stream); +__C __export infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *values, + void *indices, + const void *x, + const void *correction_bias, + const float routed_scaling_factor, + const size_t topk, + void *stream); __C __export infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc); diff --git a/include/infiniop/ops/topksoftmax.h b/include/infiniop/ops/topksoftmax.h new file mode 100644 index 000000000..696eb50f3 --- /dev/null +++ b/include/infiniop/ops/topksoftmax.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_TOPKSOFTMAX_API_H__ +#define __INFINIOP_TOPKSOFTMAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTopksoftmaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, + infiniopTopksoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc); + +__C __export infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *values, + void *indices, + const void *x, + const size_t topk, + const int norm, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index 5348c8c69..00cc1293a 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -25,6 +25,9 @@ def run_tests(args): "sub.py", "swiglu.py", "softplus.py", + "sigmoid.py", + "topkrouter.py", + "topksoftmax.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git 
a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..0685847b5 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,7 +16,9 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) - +DECLARE_INFINIOP_TEST(sigmoid) +DECLARE_INFINIOP_TEST(topkrouter) +DECLARE_INFINIOP_TEST(topksoftmax) #define REGISTER_INFINIOP_TEST(name) \ { \ #name, \ @@ -43,6 +45,9 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(sigmoid) \ + REGISTER_INFINIOP_TEST(topkrouter) \ + REGISTER_INFINIOP_TEST(topksoftmax) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/sigmoid.cpp b/src/infiniop-test/src/ops/sigmoid.cpp new file mode 100644 index 000000000..bb3a0f70a --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid.cpp @@ -0,0 +1,103 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sigmoid { +struct Test::Attributes { + std::shared_ptr x; + std::shared_ptr y; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("x") == tensors.end() + || tensors.find("y") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->x = tensors["x"]; + test->_attributes->y = tensors["y"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSigmoidDescriptor_t op_desc; + auto x = _attributes->x->to(device, device_id); + auto y = _attributes->y->to(device, 
device_id); + CHECK_OR(infiniopCreateSigmoidDescriptor(handle, &op_desc, + y->desc(), + x->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSigmoid(op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(y, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSigmoid( + op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr); + }, + warm_ups, iterations); + + infiniopDestroySigmoidDescriptor(op_desc); + infinirtFree(workspace); + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"x", "y", "ans"}; +} + +std::vector Test::output_names() { + return {"y"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- y: " << _attributes->y->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::sigmoid diff --git a/src/infiniop-test/src/ops/topkrouter.cpp b/src/infiniop-test/src/ops/topkrouter.cpp new file mode 100644 index 000000000..367087fd6 --- /dev/null +++ b/src/infiniop-test/src/ops/topkrouter.cpp @@ -0,0 +1,130 @@ +#include "ops.hpp" 
+#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::topkrouter { +struct Test::Attributes { + std::shared_ptr values; + std::shared_ptr indices; + std::shared_ptr x; + std::shared_ptr correction_bias; + float routed_scaling_factor; + int topk; + std::shared_ptr lable_values; + std::shared_ptr lable_indices; +}; + +std::shared_ptr Test::build(std::unordered_map> attributes, + std::unordered_map> tensors, double rtol, + double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (attributes.find("routed_scaling_factor") == attributes.end() || attributes.find("topk") == attributes.end() || tensors.find("values") == tensors.end() || tensors.find("indices") == tensors.end() || tensors.find("x") == tensors.end() || tensors.find("correction_bias") == tensors.end() || tensors.find("lable_values") == tensors.end() || tensors.find("lable_indices") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->values = tensors["values"]; + test->_attributes->indices = tensors["indices"]; + test->_attributes->x = tensors["x"]; + test->_attributes->correction_bias = tensors["correction_bias"]; + + test->_attributes->routed_scaling_factor = *reinterpret_cast(attributes["routed_scaling_factor"].data()); + test->_attributes->topk = *reinterpret_cast(attributes["topk"].data()); + + test->_attributes->lable_values = tensors["lable_values"]; + test->_attributes->lable_indices = tensors["lable_indices"]; + + return test; +} + +std::shared_ptr Test::run(infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + infiniopTopkrouterDescriptor_t op_desc; + CHECK_OR(infiniopCreateTopkrouterDescriptor(handle, &op_desc, _attributes->x->desc(), + _attributes->correction_bias->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create Topkrouter descriptor")); + + // + auto values = 
_attributes->values->to(device, device_id); + auto indices = _attributes->indices->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + auto correction_bias = _attributes->correction_bias->to(device, device_id); + + float routed_scaling_factor = _attributes->routed_scaling_factor; + int topk = _attributes->topk; + + size_t workspace_size; + CHECK_OR(infiniopGetTopkrouterWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopTopkrouter(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), + correction_bias->data(), routed_scaling_factor, topk, nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Topkrouter execution failed")); + + try { + allClose(values, _attributes->lable_values, _rtol, _atol); + allClose(indices, _attributes->lable_indices, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTopkrouter(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), + correction_bias->data(), routed_scaling_factor, topk, nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"routed_scaling_factor", "topk"}; +} + +std::vector Test::tensor_names() { + return {"values", "indices", "x", "correction_bias", "lable_values", "lable_indices"}; +} + +std::vector Test::output_names() { + return {"values", "indices"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- routed_scaling_factor=" << 
_attributes->routed_scaling_factor << std::endl; + oss << "- topk=" << _attributes->topk << std::endl; + + oss << "- values: " << _attributes->values->info() << std::endl; + oss << "- indices: " << _attributes->indices->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- correction_bias: " << _attributes->correction_bias->info() << std::endl; + + oss << "- lable_values: " << _attributes->lable_values->info() << std::endl; + oss << "- lable_indices: " << _attributes->lable_indices->info() << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::topkrouter diff --git a/src/infiniop-test/src/ops/topksoftmax.cpp b/src/infiniop-test/src/ops/topksoftmax.cpp new file mode 100644 index 000000000..f6b0af7a4 --- /dev/null +++ b/src/infiniop-test/src/ops/topksoftmax.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::topksoftmax { +struct Test::Attributes { + std::shared_ptr values; + std::shared_ptr indices; + std::shared_ptr x; + int topk; + bool norm; + std::shared_ptr lable_values; + std::shared_ptr lable_indices; +}; + +std::shared_ptr Test::build(std::unordered_map> attributes, + std::unordered_map> tensors, double rtol, + double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (attributes.find("topk") == attributes.end() || attributes.find("norm") == attributes.end() || tensors.find("values") == tensors.end() || tensors.find("indices") == tensors.end() || tensors.find("x") == tensors.end() || tensors.find("lable_values") == tensors.end() || tensors.find("lable_indices") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->values = tensors["values"]; + 
test->_attributes->indices = tensors["indices"]; + test->_attributes->x = tensors["x"]; + + test->_attributes->topk = *reinterpret_cast(attributes["topk"].data()); + test->_attributes->norm = *reinterpret_cast(attributes["norm"].data()); + test->_attributes->lable_values = tensors["lable_values"]; + test->_attributes->lable_indices = tensors["lable_indices"]; + + return test; +} + +std::shared_ptr Test::run(infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + infiniopTopksoftmaxDescriptor_t op_desc; + CHECK_OR(infiniopCreateTopksoftmaxDescriptor(handle, &op_desc, _attributes->x->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create Topksoftmax descriptor")); + + // + auto values = _attributes->values->to(device, device_id); + auto indices = _attributes->indices->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + + int topk = _attributes->topk; + bool norm = _attributes->norm; + + size_t workspace_size; + CHECK_OR(infiniopGetTopksoftmaxWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopTopksoftmax(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), topk, norm, nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Topksoftmax execution failed")); + + try { + allClose(values, _attributes->lable_values, _rtol, _atol); + allClose(indices, _attributes->lable_indices, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + elapsed_time = benchmark( + [=]() { + infiniopTopksoftmax(op_desc, workspace, workspace_size, values->data(), indices->data(), x->data(), topk, norm, nullptr); + }, + warm_ups, 
iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"topk", "norm"}; +} + +std::vector Test::tensor_names() { + return {"values", "indices", "x", "lable_values", "lable_indices"}; +} + +std::vector Test::output_names() { + return {"values", "indices"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + + oss << "- topk=" << _attributes->topk << std::endl; + oss << "- norm=" << _attributes->norm << std::endl; + + oss << "- values: " << _attributes->values->info() << std::endl; + oss << "- indices: " << _attributes->indices->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + + oss << "- lable_values: " << _attributes->lable_values->info() << std::endl; + oss << "- lable_indices: " << _attributes->lable_indices->info() << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::topksoftmax diff --git a/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc new file mode 100644 index 000000000..c335bba60 --- /dev/null +++ b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.cc @@ -0,0 +1,51 @@ +#include "sigmoid_cpu.h" + +namespace op::sigmoid::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + CHECK_SAME_SHAPE(y_shape, 
x_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid::cpu diff --git a/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h new file mode 100644 index 000000000..49c963f44 --- /dev/null +++ b/src/infiniop/ops/sigmoid/cpu/sigmoid_cpu.h @@ -0,0 +1,19 @@ +#ifndef __SIGMOID_CPU_H__ +#define __SIGMOID_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid, cpu) + +namespace op::sigmoid::cpu { +typedef struct SigmoidOp { +public: + static constexpr size_t num_inputs = 1; + template + T operator()(const T &x) const { + return T(1) / (T(1) + std::exp(-x)); + } +} SigmoidOp; +} // namespace op::sigmoid::cpu + +#endif // __SIGMOID_CPU_H__ diff --git a/src/infiniop/ops/sigmoid/cuda/kernel.cuh b/src/infiniop/ops/sigmoid/cuda/kernel.cuh new file mode 100644 index 000000000..6661d1909 --- /dev/null +++ b/src/infiniop/ops/sigmoid/cuda/kernel.cuh @@ -0,0 +1,34 @@ +#ifndef __SIDMOID_CUDA_H__ +#define __SIDMOID_CUDA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include +#include + +namespace op::sigmoid::cuda { +typedef struct SigmoidOp { +public: + static constexpr size_t num_inputs = 1; + template + __device__ __forceinline__ T 
operator()(const T &x) const { + // sigmoid(x) = 1 / (1 + exp(-x)) + if constexpr (std::is_same_v) { + half2 denominator = __hadd2(make_half2(1, 1), h2exp(__hneg2(x))); + return h2rcp(denominator); + } else if constexpr (std::is_same_v) { + half denominator = __hadd(__float2half(1.0f), hexp(__hneg(x))); + return hrcp(denominator); + } else if constexpr (std::is_same_v) { + __nv_bfloat16 denominator = __float2bfloat16(__fadd_rn(1.0f, __expf(__bfloat162float(-x)))); + return __float2bfloat16(1.0f) / denominator; + } else if constexpr (std::is_same_v) { + float denominator = __fadd_rn(1.0f, __expf(-x)); + return __frcp_rn(denominator); + } else { // double + return 1.0 / (1.0 + exp(-x)); + } + } +} SigmoidOp; +} // namespace op::sigmoid::cuda + +#endif // __SIDMOID_CUDA_H__ diff --git a/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu new file mode 100644 index 000000000..43f6df9e6 --- /dev/null +++ b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cu @@ -0,0 +1,58 @@ +#include "../cuda/kernel.cuh" +#include "sigmoid_nvidia.cuh" + +namespace op::sigmoid::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &x_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &x_shape = x_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(y_shape, x_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if 
(workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidOp, double>(_info, workspace, output, inputs, stream); + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid::nvidia diff --git a/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh new file mode 100644 index 000000000..5084a99d1 --- /dev/null +++ b/src/infiniop/ops/sigmoid/nvidia/sigmoid_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_CUDA_API_H__ +#define __SIGMOID_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid, nvidia) + +#endif // __SIGMOID_CUDA_API_H__ diff --git a/src/infiniop/ops/sigmoid/operator.cc b/src/infiniop/ops/sigmoid/operator.cc new file mode 100644 index 000000000..3f2f95067 --- /dev/null +++ b/src/infiniop/ops/sigmoid/operator.cc @@ -0,0 +1,115 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_cpu.h" +#endif +#ifdef ENABLE_NVIDIA_API +#include "nvidia/sigmoid_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateSigmoidDescriptor( + infiniopHandle_t handle, + infiniopSigmoidDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid::NAMESPACE::Descriptor::create( \ + handle, \ + 
reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidWorkspaceSize(infiniopSigmoidDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoid( + infiniopSigmoidDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidDescriptor(infiniopSigmoidDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc 
b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc index 7d4555da2..2d2b36d6b 100644 --- a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc +++ b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.cc @@ -1,28 +1,215 @@ #include "topkrouter_cpu.h" +#include "../../../../utils.h" #include "../../../devices/cpu/common_cpu.h" #include "../../../reduce/cpu/reduce.h" +#include namespace op::topkrouter::cpu { +Descriptor::~Descriptor() { +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t correction_bias_desc) { + auto result = TopkrouterInfo::create(x_desc); + CHECK_RESULT(result); + auto info = result.take(); + + if (info.x_strides[1] != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(nullptr, std::move(info), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +inline float sigmoid_func(T x) { + float value; + if constexpr (std::is_same::value) { + value = _f16_to_f32(x); + } else if constexpr (std::is_same::value) { + value = _bf16_to_f32(x); + } else { + value = x; + } + return 1.0f / (1.0f + std::exp(-value)); +} + +template +void topkrouter_cpu_one_token(float *values_input, // 输出数据 + int *indices_input, // 输出索引 + const T *x_input, // 输入数据 + std::vector> &value_index_arr, // 输入数据 + const float *correction_bias, + const float routed_scaling_factor, const size_t topk, + const size_t width, const size_t n_routed_experts, const size_t n_group, + const size_t topk_group, const bool norm_topk_prob) { + + // ------------------------------------------------------ // + // 对输入数据做 sigmoid // + // ------------------------------------------------------ // + for (size_t i = 0; i < width; ++i) { + value_index_arr[i].first = sigmoid_func(value_index_arr[i].first); + } + + // ------------------------------------------------------ // + // 再加偏置 // + // ------------------------------------------------------ // + for 
(size_t i = 0; i < width; ++i) { + value_index_arr[i].first += correction_bias[i]; + } + + // ----------------------------------------------------------- // + // 分为 n_group 组,找出每组的最大值 // + // ----------------------------------------------------------- // + std::vector> value_index_group; + value_index_group.resize(n_group); + + const size_t group_size = width / n_group; // group_size表示每个组的元素数量 + for (size_t igroup = 0; igroup < n_group; ++igroup) { + std::vector> value_index_warp; + value_index_warp.resize(group_size); + + auto it = value_index_arr.begin() + igroup * group_size; + for (size_t i = 0; i < group_size; ++i) { + value_index_warp[i] = {(it++)->first, i}; + } + + // 每个group中的数据,进行排序 + std::sort(value_index_warp.begin(), value_index_warp.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); + + // 取前两个的和,作为最大值 + value_index_group[igroup] = {value_index_warp[0].first + value_index_warp[1].first, igroup}; + } -Descriptor::~Descriptor() {} + // ------------------------------------------------------------------ // + // 对 value_index_group 的数据, 再选前 topk_group 个 // + // ------------------------------------------------------------------ // + std::sort(value_index_group.begin(), value_index_group.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); -infiniStatus_t Descriptor::create( - infiniopHandle_t handle, - Descriptor **desc_ptr, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t correction_bias_desc) { + std::vector group_mask; + group_mask.resize(n_group, false); + for (size_t i = 0; i < topk_group; ++i) { + size_t index = value_index_group[i].second; + group_mask[index] = true; + } - return INFINI_STATUS_NOT_IMPLEMENTED; + // ------------------------------------------------------------------ // + // 根据group_mask,false的组的数值置0 // + // ------------------------------------------------------------------ // + for (size_t igroup = 0; igroup < n_group; ++igroup) { + if (group_mask[igroup]) 
{ + continue; + } + + auto it = value_index_arr.begin() + igroup * group_size; + for (size_t i = 0; i < group_size; ++i) { + (it++)->first = 0.0f; + } + } + + // ------------------------------------------------------------------ // + // 最后整体做topk // + // ------------------------------------------------------------------ // + std::sort(value_index_arr.begin(), value_index_arr.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); + + // ----------------------------------------------------------- // + // 取topk个数据 // + // ----------------------------------------------------------- // + float exp_sum = 1e-9f; + for (size_t i = 0; i < topk; ++i) { + size_t index = value_index_arr[i].second; + float exp_value = sigmoid_func(x_input[index]); + + values_input[i] = exp_value; + indices_input[i] = static_cast(index); + + exp_sum += exp_value; + } + + // ----------------------------------------------------------- // + // 归一化 // + // ----------------------------------------------------------- // + if (norm_topk_prob) { + for (size_t i = 0; i < topk; ++i) { + values_input[i] = routed_scaling_factor * values_input[i] / exp_sum; + } + } } -infiniStatus_t Descriptor::calculate( - void *workspace, - size_t workspace_size, - float *values, int *indices, void *x, float *correction_bias, - float routed_scaling_factor, - size_t topk, - void *stream) const { +template +infiniStatus_t topkrouter_cpu_func(float *values, int *indices, + const T *x, + const float *correction_bias, const float routed_scaling_factor, const size_t topk, + const size_t N, const size_t width, const size_t n_routed_experts = 256, + const size_t n_group = 8, const size_t topk_group = 4, + const bool norm_topk_prob = true) { + /* + O-----------> width 地址连续 + | + | + N + */ + for (size_t n = 0; n < N; ++n) { + float *values_input = values + n * topk; + int *indices_input = indices + n * topk; + const T *x_input = x + n * width; + + std::vector> value_index_arr; + 
value_index_arr.resize(width); + + for (size_t i = 0; i < width; ++i) { + float temp; + if constexpr (std::is_same::value) { + temp = _f16_to_f32(x_input[i]); + } else if constexpr (std::is_same::value) { + temp = _bf16_to_f32(x_input[i]); + } else { + temp = x_input[i]; + } + value_index_arr[i] = {temp, i}; + } + + topkrouter_cpu_one_token(values_input, indices_input, x_input, value_index_arr, + correction_bias, routed_scaling_factor, + topk, width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } + + return INFINI_STATUS_SUCCESS; +} // namespace op::topkrouter::cpu + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x, + const float *correction_bias, const float routed_scaling_factor, const size_t topk, + void *stream) const { + size_t N = _info.N; + size_t width = _info.width; + + // 下面是 deepseek的config.json的超参数 + const size_t n_routed_experts = 256; + const size_t n_group = 8; + const size_t topk_group = 4; + const bool norm_topk_prob = true; + + if ((width != n_routed_experts) || (width % n_group != 0) || (256 != width)) { + return INFINI_STATUS_BAD_PARAM; + } + + if (_info.xtype == INFINI_DTYPE_F32) { + topkrouter_cpu_func(values, indices, static_cast(x), correction_bias, routed_scaling_factor, topk, N, + width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } else if (_info.xtype == INFINI_DTYPE_F16) { + topkrouter_cpu_func(values, indices, static_cast(x), correction_bias, routed_scaling_factor, topk, N, + width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } else if (_info.xtype == INFINI_DTYPE_BF16) { + topkrouter_cpu_func(values, indices, static_cast(x), correction_bias, routed_scaling_factor, topk, N, + width, n_routed_experts, n_group, topk_group, norm_topk_prob); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } - return INFINI_STATUS_NOT_IMPLEMENTED; + return INFINI_STATUS_SUCCESS; } } // namespace op::topkrouter::cpu diff --git 
a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h index f327a2ab6..996fd9d2f 100644 --- a/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h +++ b/src/infiniop/ops/topkrouter/cpu/topkrouter_cpu.h @@ -1,7 +1,6 @@ -#ifndef __Topkrouter_CPU_H__ -#define __Topkrouter_CPU_H__ +#ifndef __TOPKTOUTER_CPU_H__ +#define __TOPKTOUTER_CPU_H__ #include "../topkrouter.h" - DESCRIPTOR(cpu) #endif diff --git a/src/infiniop/ops/topkrouter/cuda/kernel.cuh b/src/infiniop/ops/topkrouter/cuda/kernel.cuh index f81048d3d..0832c5b93 100644 --- a/src/infiniop/ops/topkrouter/cuda/kernel.cuh +++ b/src/infiniop/ops/topkrouter/cuda/kernel.cuh @@ -1,5 +1,5 @@ -#ifndef _Topkrouter_KERNEL_CUH__ -#define _Topkrouter_KERNEL_CUH__ +#ifndef _TOPKROUTER_KERNEL_CUH__ +#define _TOPKROUTER_KERNEL_CUH__ #include #include #include @@ -36,17 +36,14 @@ struct CustomLess { } }; -// -// deepseek的topk -// template -__global__ void topkrouter_kernel(float *values_topk, // 输出值, 形状[N, topk] - int *indices_topk, // 输出索引, 形状[N, topk] - T *input, // 输入数据 [N, width] - float *d_correction_bias, // 输入数据 [width] - float routed_scaling_factor, // - const size_t N, // 总行数,toen数量 - const size_t width, // 每行元素数量 +__global__ void topkrouter_kernel(float *values_topk, // 输出数据, 形状[N, topk] + int *indices_topk, // 输出索引, 形状[N, topk] + const T *input, // 输入数据 [N, width] + const float *d_correction_bias, // 输入数据 [width] + const float routed_scaling_factor, + const size_t N, + const size_t width, const size_t topk ) { @@ -99,7 +96,6 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出值, 形 // ----------------------------------------------------------- // // 每个组中,前两个数据的和 // // ----------------------------------------------------------- // - __syncthreads(); if (0 == lane_id) { share_data_group[warp_id] = share_data[warp_id * warp_threads] + share_data[warp_id * warp_threads + 1]; @@ -116,8 +112,10 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出值, 形 
thread_indices[0] = lane_id; } - __shared__ typename WarpMergeSortT::TempStorage temp_storage[1]; - WarpMergeSortT(temp_storage[0]).Sort(thread_values, thread_indices, CustomLess()); + { + __shared__ typename WarpMergeSortT::TempStorage temp_storage[1]; + WarpMergeSortT(temp_storage[0]).Sort(thread_values, thread_indices, CustomLess()); + } if (lane_id < 4) { int indices = thread_indices[0]; share_data_group_mask[indices] = 1.0f; @@ -147,13 +145,13 @@ __global__ void topkrouter_kernel(float *values_topk, // 输出值, 形 int index = thread_indices[0]; value = sigmoid_func(data_input[index]); } - - typedef cub::WarpReduce WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage; - // 使用有效项group 进行部分归约 - float warp_sum = WarpReduce(temp_storage).Sum(value); - if (0 == tid) { - share_sum = warp_sum + 1e-20; + { + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + float warp_sum = WarpReduce(temp_storage).Sum(value); + if (0 == tid) { + share_sum = warp_sum + 1e-9f; + } } __syncwarp(); diff --git a/src/infiniop/ops/topkrouter/info.h b/src/infiniop/ops/topkrouter/info.h index 4df0a9298..08a4751b8 100644 --- a/src/infiniop/ops/topkrouter/info.h +++ b/src/infiniop/ops/topkrouter/info.h @@ -1,5 +1,5 @@ -#ifndef __topkrouter_INFO_H__ -#define __topkrouter_INFO_H__ +#ifndef __TOPKROUTER_INFO_H__ +#define __TOPKROUTER_INFO_H__ #include "../../../utils.h" #include "../../tensor.h" diff --git a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu index e44872fcc..fe8cfeb72 100644 --- a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu +++ b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cu @@ -40,9 +40,9 @@ infiniStatus_t Descriptor::create( namespace { template -infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, void *d_input, float *d_correction_bias, float routed_scaling_factor, - size_t N, size_t width, size_t topk, 
infiniDtype_t xtype, cudaStream_t stream) { - +infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, const void *d_input, const float *d_correction_bias, + const float routed_scaling_factor, const size_t N, const size_t width, const size_t topk, infiniDtype_t xtype, + cudaStream_t stream) { const int block_threads = BLOCK_SIZE; dim3 blocks(N); dim3 threads(block_threads); @@ -63,9 +63,15 @@ infiniStatus_t launch_topkrouter(float *d_values_out, int *d_indices_out, void * }; // namespace infiniStatus_t Descriptor::calculate( - void *workspace, size_t workspace_size, - float *values, int *indices, void *x, float *correction_bias, float routed_scaling_factor, size_t topk, void *stream) const { - + void *workspace, + size_t workspace_size, + float *values, + int *indices, + const void *x, + const float *correction_bias, + const float routed_scaling_factor, + const size_t topk, + void *stream) const { if (workspace_size < _workspace_size) { return INFINI_STATUS_INSUFFICIENT_WORKSPACE; } @@ -76,13 +82,12 @@ infiniStatus_t Descriptor::calculate( // size_t n_routed_experts = 256; // size_t n_group = 8; // size_t topk_group = 4; - auto cuda_stream = reinterpret_cast(stream); if (256 == width) { launch_topkrouter<256>(values, indices, x, correction_bias, routed_scaling_factor, N, width, topk, _info.xtype, cuda_stream); } else { - return INFINI_STATUS_INTERNAL_ERROR; + return INFINI_STATUS_BAD_PARAM; } return INFINI_STATUS_SUCCESS; diff --git a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh index 1aab4e2ff..9b3a81c37 100644 --- a/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh +++ b/src/infiniop/ops/topkrouter/nvidia/topkrouter_nvidia.cuh @@ -1,5 +1,5 @@ -#ifndef __Topkrouter_CUDA_H__ -#define __Topkrouter_CUDA_H__ +#ifndef __TOPKROUTER_CUDA_H__ +#define __TOPKROUTER_CUDA_H__ #include "../topkrouter.h" diff --git a/src/infiniop/ops/topkrouter/operator.cc 
b/src/infiniop/ops/topkrouter/operator.cc index 4d43c77ce..c61c9c07a 100644 --- a/src/infiniop/ops/topkrouter/operator.cc +++ b/src/infiniop/ops/topkrouter/operator.cc @@ -9,18 +9,13 @@ #include "nvidia/topkrouter_nvidia.cuh" #endif -__C infiniStatus_t infiniopCreateTopkrouterDescriptor( - infiniopHandle_t handle, - infiniopTopkrouterDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t x_desc, - infiniopTensorDescriptor_t correction_bias_desc) { - -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::topkrouter::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - x_desc, correction_bias_desc) +__C infiniStatus_t infiniopCreateTopkrouterDescriptor(infiniopHandle_t handle, infiniopTopkrouterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t correction_bias_desc) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::topkrouter::NAMESPACE::Descriptor::create( \ + handle, reinterpret_cast(desc_ptr), x_desc, correction_bias_desc) switch (handle->device) { #ifdef ENABLE_CPU_API @@ -37,7 +32,6 @@ __C infiniStatus_t infiniopCreateTopkrouterDescriptor( } __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescriptor_t desc, size_t *size) { - #define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ @@ -58,12 +52,13 @@ __C infiniStatus_t infiniopGetTopkrouterWorkspaceSize(infiniopTopkrouterDescript } __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void *workspace, size_t workspace_size, - void *values, void *indices, void *x, void *correction_bias, float routed_scaling_factor, size_t topk, void *stream) { - -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc)->calculate( \ - workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, topk, stream) + void *values, void *indices, const void *x, const void *correction_bias, + 
const float routed_scaling_factor, const size_t topk, void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, (float *)values, (int *)indices, x, (float *)correction_bias, routed_scaling_factor, \ + topk, stream) switch (desc->device_type) { #ifdef ENABLE_CPU_API @@ -80,7 +75,6 @@ __C infiniStatus_t infiniopTopkrouter(infiniopTopkrouterDescriptor_t desc, void } __C infiniStatus_t infiniopDestroyTopkrouterDescriptor(infiniopTopkrouterDescriptor_t desc) { - #define DESTROY(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast(desc); \ diff --git a/src/infiniop/ops/topkrouter/topkrouter.h b/src/infiniop/ops/topkrouter/topkrouter.h index 3e6baae13..fbd763763 100644 --- a/src/infiniop/ops/topkrouter/topkrouter.h +++ b/src/infiniop/ops/topkrouter/topkrouter.h @@ -1,50 +1,51 @@ -#ifndef _Topkrouter_H_ -#define _Topkrouter_H_ +#ifndef _TOPKTOUTER_H_ +#define _TOPKTOUTER_H_ #include "../../operator.h" #include "info.h" -#define DESCRIPTOR(NAMESPACE) \ - \ - namespace op::topkrouter::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ - struct Opaque; \ - Opaque *_opaque; \ - TopkrouterInfo _info; \ - size_t _workspace_size; \ - \ - Descriptor( \ - Opaque *opaque, \ - TopkrouterInfo info, \ - size_t workspace_size, \ - infiniDevice_t device_type, \ - int device_id) \ - : InfiniopDescriptor{device_type, device_id}, \ - _opaque(opaque), \ - _info(info), \ - _workspace_size(workspace_size) {} \ - \ - public: \ - ~Descriptor(); \ - \ - size_t workspaceSize() const { return _workspace_size; } \ - \ - static infiniStatus_t create( \ - infiniopHandle_t handle, \ - Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t x_desc, \ - infiniopTensorDescriptor_t correction_bias_desc); \ - \ - infiniStatus_t calculate( \ - void *workspace, size_t workspace_size, \ - float *values, \ - int *indices, \ - void *x, \ - float *correction_bias, \ - float routed_scaling_factor, \ - size_t 
topk, \ - void *stream) const; \ - }; \ +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::topkrouter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TopkrouterInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, \ + TopkrouterInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) { \ + } \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { \ + return _workspace_size; \ + } \ + \ + static infiniStatus_t create(infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t correction_bias_desc); \ + \ + infiniStatus_t calculate(void *workspace, \ + size_t workspace_size, \ + float *values, \ + int *indices, \ + const void *x, \ + const float *correction_bias, \ + const float routed_scaling_factor, \ + const size_t topk, \ + void *stream) const; \ + }; \ } #endif // _Topkrouter_H_ diff --git a/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc new file mode 100644 index 000000000..53584e182 --- /dev/null +++ b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.cc @@ -0,0 +1,143 @@ +#include "topksoftmax_cpu.h" +#include "../../../../utils.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "topksoftmax_cpu.h" +#include + +namespace op::topksoftmax::cpu { +Descriptor::~Descriptor() { +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc) { + auto result = TopksoftmaxInfo::create(x_desc); + CHECK_RESULT(result); + + auto info = result.take(); + if (info.x_strides[1] != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor(nullptr, std::move(info), 0, 
handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +void topksoftmax_cpu_one_token(float *values_input, // 输出数据 + int *indices_input, // 输出索引 + std::vector> &value_index_arr, // 输入数据 + size_t topk, + bool norm, + size_t width) { + + // ------------------------------------------------ // + // 第一步:计算最大值 // + // ------------------------------------------------ // + float value_max = value_index_arr[0].first; + for (size_t i = 1; i < width; ++i) { + value_max = value_index_arr[i].first > value_max ? value_index_arr[i].first : value_max; + } + + // ------------------------------------------------ // + // 第二步: 指数计算 // + // ------------------------------------------------ // + float exp_sum = 0.0f; + for (size_t i = 0; i < width; ++i) { + float value = std::exp(value_index_arr[i].first - value_max); + value_index_arr[i].first = value; + exp_sum += value; + } + + // ------------------------------------------------ // + // 第三步:计算 Softmax // + // ------------------------------------------------ // + for (size_t i = 0; i < width; ++i) { + value_index_arr[i].first /= exp_sum; + } + + // ------------------------------------------------ // + // 第四步:计算 排序 // + // ------------------------------------------------ // + std::sort(value_index_arr.begin(), value_index_arr.end(), + [](const std::pair &a, const std::pair &b) { return a.first > b.first; }); + + // ------------------------------------------------ // + // 第五步: topk // + // ------------------------------------------------ // + exp_sum = 0.0f; + for (size_t i = 0; i < topk; ++i) { + values_input[i] = value_index_arr[i].first; + indices_input[i] = static_cast(value_index_arr[i].second); + exp_sum += values_input[i]; + } + + // ------------------------------------------------ // + // 第6步: norm归一化 // + // ------------------------------------------------ // + if (norm) { + for (size_t i = 0; i < topk; ++i) { + values_input[i] /= exp_sum; + } + } +} + +template +infiniStatus_t topksoftmax_cpu_func(float 
*values, int *indices, + const T *x, + size_t topk, bool norm, size_t N, size_t width) { + /* + O-----------> width 地址连续 + | + | + N + */ + + for (size_t n = 0; n < N; ++n) { + + float *values_input = values + n * topk; + int *indices_input = indices + n * topk; + const T *x_input = x + n * width; + + std::vector> value_index_arr; + value_index_arr.resize(width); + + // ------------------------------------------------ // + // 第0步: 数据先转换到 float // + // ------------------------------------------------ // + float temp; + for (size_t i = 0; i < width; ++i) { + if constexpr (std::is_same::value) { + temp = _f16_to_f32(x_input[i]); + } else if constexpr (std::is_same::value) { + temp = _bf16_to_f32(x_input[i]); + } else { + temp = x_input[i]; + } + value_index_arr[i] = {temp, i}; + } + + topksoftmax_cpu_one_token(values_input, + indices_input, + value_index_arr, + topk, + norm, + width); + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x, + const size_t topk, const bool norm, void *stream) const { + size_t N = _info.N; + size_t width = _info.width; + if (_info.xtype == INFINI_DTYPE_F32) { + topksoftmax_cpu_func(values, indices, (const float *)x, topk, norm, N, width); + } else if (_info.xtype == INFINI_DTYPE_F16) { + topksoftmax_cpu_func(values, indices, (const fp16_t *)x, topk, norm, N, width); + } else if (_info.xtype == INFINI_DTYPE_BF16) { + topksoftmax_cpu_func(values, indices, (const bf16_t *)x, topk, norm, N, width); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::topksoftmax::cpu diff --git a/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h new file mode 100644 index 000000000..c874f4997 --- /dev/null +++ b/src/infiniop/ops/topksoftmax/cpu/topksoftmax_cpu.h @@ -0,0 +1,7 @@ +#ifndef __TOPKSOFTMAX_CPU_H__ +#define __TOPKSOFTMAX_CPU_H__ 
+#include "../topksoftmax.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/topksoftmax/cuda/kernel.cuh b/src/infiniop/ops/topksoftmax/cuda/kernel.cuh new file mode 100644 index 000000000..b8bd05833 --- /dev/null +++ b/src/infiniop/ops/topksoftmax/cuda/kernel.cuh @@ -0,0 +1,145 @@ +#ifndef _TOPKSOFTMAX_KERNEL_CUH__ +#define _TOPKSOFTMAX_KERNEL_CUH__ +#include +#include +#include +#include +#include +#include +#include +#include + +template +inline __device__ float exp_func(T x) { + float data; + if constexpr (std::is_same_v) { + data = x; + } else if constexpr (std::is_same_v) { + data = __bfloat162float(x); + } else if constexpr (std::is_same_v) { + data = __half2float(x); + } + return __expf(data); +} + +template +__global__ void softmax_topk_row_kernel(float *values_topk, // 输出数据, 形状[N, topk] + int *indices_topk, // 输出索引, 形状[N, topk] + const T *input, // 输入数据 [N, width] + const size_t N, + const size_t width, + const size_t topk, + bool norm + +) { + const int bid = blockIdx.x; + if (bid >= N) { + return; + } + + const int tid = threadIdx.x; + const T *data_input = input + bid * width; + float *values_topk_output = values_topk + bid * topk; + int *indices_topk_output = indices_topk + bid * topk; + + const int warp_id = tid / 32; + + __shared__ T shared_max; + __shared__ float shared_sum; + typedef cub::BlockReduce BlockReduce; + + // ------------------------------------------------ // + // 第一步:计算最大值 // + // ------------------------------------------------ // + T thread_max = data_input[0]; + if (tid < width) { + thread_max = thread_max > data_input[tid] ? 
thread_max : data_input[tid]; + } + + { + __shared__ typename BlockReduce::TempStorage temp_storage_max; + T value_max = BlockReduce(temp_storage_max).Reduce(thread_max, cub::Max()); + if (tid == 0) { + shared_max = value_max; + } + } + __syncthreads(); + + // ------------------------------------------------ // + // 第二步:计算指数和 // + // ------------------------------------------------ // + float exp_val = 0.0f; + if (tid < width) { + T temp_val = data_input[tid] - shared_max; + exp_val = exp_func(temp_val); + } + + { + __shared__ typename BlockReduce::TempStorage temp_storage_sum; + float value_sum = BlockReduce(temp_storage_sum).Sum(exp_val); + if (tid == 0) { + shared_sum = value_sum; + } + } + __syncthreads(); + + // ------------------------------------------------ // + // 第三步:计算 Softmax // + // ------------------------------------------------ // + exp_val /= shared_sum; + + // ------------------------------------------------ // + // 第四步:计算 排序 // + // ------------------------------------------------ // + float thread_values[1] = {-FLT_MAX}; + int thread_indices[1] = {-1}; + if ((tid < width) && (exp_val > thread_values[0])) { + thread_values[0] = exp_val; + thread_indices[0] = tid; + } + { + typedef cub::BlockRadixSort BlockRadixSort; + __shared__ typename BlockRadixSort::TempStorage temp_storage; + BlockRadixSort(temp_storage).SortDescending(thread_values, thread_indices); + } + __syncthreads(); + + if (0 == warp_id) { + int indice = -1; + float value = 0.0f; + if (tid < topk) { + indice = thread_indices[0]; + value = thread_values[0]; + } + + // ------------------------------------------------ // + // 第五步: topk的和 // + // ------------------------------------------------ // + { + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + float warp_sum = WarpReduce(temp_storage).Sum(value); + if (0 == tid) { + shared_sum = warp_sum + 1e-9f; + } + } + __syncwarp(); + + // ------------------------------------------------ // + // 
第6步: norm归一化 // + // ------------------------------------------------ // + if (norm && (tid < topk)) { + value /= shared_sum; + } + + // ------------------------------------------------ // + // 第7步: 最终的返回值 // + // ------------------------------------------------ // + if (tid < topk) { + values_topk_output[tid] = value; + indices_topk_output[tid] = indice; + } + } +} + +#endif // _TOPKSOFTMAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/topksoftmax/info.h b/src/infiniop/ops/topksoftmax/info.h new file mode 100644 index 000000000..4e8eb9d7c --- /dev/null +++ b/src/infiniop/ops/topksoftmax/info.h @@ -0,0 +1,48 @@ +#ifndef __TOPKSOFTMAX_INFO_H__ +#define __TOPKSOFTMAX_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::topksoftmax { + +class TopksoftmaxInfo { + TopksoftmaxInfo() = default; + +public: + infiniDtype_t xtype; + std::vector shape; + std::vector x_strides; + size_t N; + size_t width; + +public: + size_t ndim() const { return shape.size(); } + size_t dim() const { return shape[ndim() - 1]; } + + static utils::Result create(infiniopTensorDescriptor_t x_desc) { + + auto xtype = x_desc->dtype(); + if ((xtype != infiniDtype_t::INFINI_DTYPE_F32) && (xtype != infiniDtype_t::INFINI_DTYPE_F16) && (xtype != infiniDtype_t::INFINI_DTYPE_BF16)) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (x_desc->ndim() != 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t N = x_desc->shape()[0]; // token数量 + size_t width = x_desc->shape()[1]; // 专家数量 + + return utils::Result(TopksoftmaxInfo{xtype, + x_desc->shape(), + x_desc->strides(), + N, + width}); + } +}; + +} // namespace op::topksoftmax + +#endif // __TOPKSOFTMAX_INFO_H__ diff --git a/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu new file mode 100644 index 000000000..b7a9d7c4c --- /dev/null +++ b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cu @@ -0,0 +1,92 @@ +#include 
"../../../devices/nvidia/nvidia_common.cuh" +#include "topksoftmax_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../cuda/kernel.cuh" + +namespace op::topksoftmax::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t x_desc) { + auto result = TopksoftmaxInfo::create(x_desc); + CHECK_RESULT(result); + auto info = result.take(); + + if (info.x_strides[1] != 1) { + return INFINI_STATUS_BAD_TENSOR_STRIDES; + } + + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + std::move(info), + 0, + handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +namespace { + +template +infiniStatus_t launch_topksoftmax(float *d_values_out, int *d_indices_out, const void *d_input, const size_t N, const size_t width, const size_t topk, const bool norm, infiniDtype_t xtype, cudaStream_t stream) { + const int block_threads = BLOCK_SIZE; + dim3 blocks(static_cast(N)); + dim3 threads(block_threads); + + if (xtype == INFINI_DTYPE_F32) { + softmax_topk_row_kernel<<>>(d_values_out, d_indices_out, (float *)d_input, N, width, topk, norm); + } else if (xtype == INFINI_DTYPE_F16) { + softmax_topk_row_kernel<<>>(d_values_out, d_indices_out, (half *)d_input, N, width, topk, norm); + } else if (xtype == INFINI_DTYPE_BF16) { + softmax_topk_row_kernel<__nv_bfloat16, BLOCK_SIZE><<>>(d_values_out, d_indices_out, (__nv_bfloat16 *)d_input, N, width, topk, norm); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +}; // namespace + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + float *values, + int *indices, + const void *x, + const size_t topk, + const bool norm, + void *stream) const { + if 
(workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + size_t N = _info.N; + size_t width = _info.width; + auto cuda_stream = reinterpret_cast(stream); + if (width <= 128) { + launch_topksoftmax<128>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream); + } else if (width <= 256) { + launch_topksoftmax<256>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream); + } else if (width <= 512) { + launch_topksoftmax<512>(values, indices, x, N, width, topk, norm, _info.xtype, cuda_stream); + } else { + return INFINI_STATUS_INTERNAL_ERROR; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::topksoftmax::nvidia diff --git a/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh new file mode 100644 index 000000000..fea0d0f2a --- /dev/null +++ b/src/infiniop/ops/topksoftmax/nvidia/topksoftmax_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TOPKSOFTMAX_CUDA_H__ +#define __TOPKSOFTMAX_CUDA_H__ + +#include "../topksoftmax.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/topksoftmax/operator.cc b/src/infiniop/ops/topksoftmax/operator.cc new file mode 100644 index 000000000..ad3c91dcd --- /dev/null +++ b/src/infiniop/ops/topksoftmax/operator.cc @@ -0,0 +1,101 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/topksoftmax.h" + +#ifdef ENABLE_CPU_API +#include "cpu/topksoftmax_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) +#include "nvidia/topksoftmax_nvidia.cuh" +#endif + +__C infiniStatus_t infiniopCreateTopksoftmaxDescriptor(infiniopHandle_t handle, + infiniopTopksoftmaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t x_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::topksoftmax::NAMESPACE::Descriptor::create( \ + handle, reinterpret_cast(desc_ptr), x_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef 
ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef CREATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetTopksoftmaxWorkspaceSize(infiniopTopksoftmaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTopksoftmax(infiniopTopksoftmaxDescriptor_t desc, void *workspace, size_t workspace_size, + void *values, void *indices, const void *x, const size_t topk, const int norm, + void *stream) { + if (topk > 32) { + return INFINI_STATUS_BAD_PARAM; + } + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, (float *)values, (int *)indices, x, topk, static_cast(norm), stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef CALCULATE + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyTopksoftmaxDescriptor(infiniopTopksoftmaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia); +#endif + } + +#undef DESTROY + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/topksoftmax/topksoftmax.h b/src/infiniop/ops/topksoftmax/topksoftmax.h new file mode 100644 index 000000000..29d438f3f --- /dev/null +++ 
b/src/infiniop/ops/topksoftmax/topksoftmax.h @@ -0,0 +1,34 @@ +#ifndef _TOPKSOFTMAX_H_ +#define _TOPKSOFTMAX_H_ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::topksoftmax::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TopksoftmaxInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor(Opaque *opaque, TopksoftmaxInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id) \ + : InfiniopDescriptor{device_type, device_id}, _opaque(opaque), _info(info), _workspace_size(workspace_size) { \ + } \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { \ + return _workspace_size; \ + } \ + \ + static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr, infiniopTensorDescriptor_t x_desc); \ + \ + infiniStatus_t calculate(void *workspace, size_t workspace_size, float *values, int *indices, const void *x, \ + const size_t topk, const bool norm, void *stream) const; \ + }; \ + } + +#endif // _TOPKSOFTMAX_H_ diff --git a/test/infiniop-test/test_generate/testcases/sigmoid.py b/test/infiniop-test/test_generate/testcases/sigmoid.py new file mode 100644 index 000000000..f622a4d6e --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/sigmoid.py @@ -0,0 +1,136 @@ +import numpy as np +from numpy.lib.stride_tricks import as_strided +import gguf +from typing import List + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor + + +def sigmoid( + x: np.ndarray, +): + return 1 / (1 + np.exp(-x)) + + +def random_tensor(shape, dtype): + rate = 1e-3 + var = 0.5 * rate + return rate * np.random.rand(*shape).astype(dtype) - var + + +def process_tensors(a, b, stride_a=None, stride_b=None): + def normalize_stride(tensor, stride): + if stride: + slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride) + return tensor[slices] + else: + return tensor + + a_unique = normalize_stride(a, stride_a) + b_unique = normalize_stride(b, stride_b) + return a_unique, b_unique + + +def process_tensor(a, stride_a=None): + def normalize_stride(tensor, stride): + if stride: + slices = tuple(slice(0, 1) if s == 0 else slice(None) for s in stride) + return tensor[slices] + else: + return tensor + + a_unique = normalize_stride(a, stride_a) + return a_unique + + +class SigmoidTestCase(InfiniopTestCase): + def __init__( + self, + x: np.ndarray, + shape_x: List[int] | None, + stride_x: List[int] | None, + y: np.ndarray, + shape_y: List[int] | None, + stride_y: List[int] | None, + ): + super().__init__("sigmoid") + self.x = x + self.shape_x = shape_x + self.stride_x = stride_x + + self.y = y + self.shape_y = shape_y + self.stride_y = stride_y + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + + if self.shape_x is not None: + test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) + if self.shape_y is not None: + test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) + + if self.stride_x is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + + test_writer.add_array( + test_writer.gguf_key("y.strides"), + gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + ) + + test_writer.add_tensor( + 
test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) + ) + + input_x = self.x.astype(np.float64) + if (self.stride_x is not None) and (0 in self.stride_x): + typesize = np.dtype(input_x.dtype).itemsize + new_strides_bytes = tuple(x * typesize for x in self.stride_x) + input_x = as_strided(x=input_x, shape=self.shape_x, strides=new_strides_bytes) + + ans = sigmoid(input_x) + + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == '__main__': + test_writer = InfiniopTestWriter("sigmoid.gguf") + + test_cases = [] + _TEST_CASES_ = [ + # shape, x_stride, y_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape, stride_x, stride_y in _TEST_CASES_: + x = np.random.rand(*shape).astype(dtype) + y = np.empty(tuple(0 for _ in shape), dtype=dtype) + + x = process_zero_stride_tensor(x, stride_x) + test_case = SigmoidTestCase(x=x, + shape_x=shape, + stride_x=stride_x, + y=y, + shape_y=shape, + stride_y=stride_y) + + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/topkrouter.py b/test/infiniop-test/test_generate/testcases/topkrouter.py new file mode 100644 index 000000000..54d647154 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/topkrouter.py @@ -0,0 +1,225 @@ +import numpy as np +from typing import List +import torch +import torch.nn as nn + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +class DeepseekV3TopkRouter(nn.Module): + def __init__(self, correction_bias, + routed_scaling_factor: float, + topk: int, config=None): + super().__init__() + self.config = config + self.top_k = 8 # config.num_experts_per_tok + self.n_routed_experts = 256 # config.n_routed_experts + self.routed_scaling_factor = 2.5 # config.routed_scaling_factor + self.n_group = 8 # config.n_group + self.topk_group = 4 # config.topk_group + self.norm_topk_prob = True # config.norm_topk_prob + + self.routed_scaling_factor = routed_scaling_factor + self.top_k = topk + + # self.weight = nn.Parameter(torch.empty((self.n_routed_experts, config.hidden_size))) + # self.weight = torch.rand(256, 7168) * 2 - 1 + + # self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts)) + self.e_score_correction_bias = torch.zeros(256, ) + self.e_score_correction_bias[:] = correction_bias[:] + + @torch.no_grad() + def get_topk_indices(self, scores): + scores_for_choice = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0) # Size([1, 256]) + group_scores = ( + scores_for_choice.view(-1, self.n_group, self.n_routed_experts // self.n_group) + .topk(2, dim=-1)[0] + .sum(dim=-1) + ) + + group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=True)[1] # Size([1, 4]) + group_mask = torch.zeros_like(group_scores) # Size([1, 8]) + group_mask.scatter_(1, group_idx, 1) # Size([1, 8]) + + score_mask = ( + group_mask.unsqueeze(-1) + .expand(-1, self.n_group, self.n_routed_experts // self.n_group) + .reshape(-1, self.n_routed_experts) + ) + + scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # Size([1, 256]) + topk_indices = torch.topk(scores_for_choice, k=self.top_k, dim=-1, 
sorted=True)[1] # Size([1, 8]) + + return topk_indices + + def forward(self, router_logits): + # hidden_states = hidden_states.view(-1, 7168) + # router_logits = F.linear(hidden_states.type(torch.float32), self.weight.type(torch.float32)) + + scores = router_logits.sigmoid() # (1,256) + scores = scores.to(torch.float32) + + topk_indices = self.get_topk_indices(scores) # (1,8) + topk_weights = scores.gather(1, topk_indices) + + if self.norm_topk_prob: + denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 + topk_weights /= denominator + topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights + + +def python_topkrouter(x: np.ndarray, + correction_bias: np.ndarray, + routed_scaling_factor: float, + topk: int): + x = torch.from_numpy(x) + correction_bias = torch.from_numpy(correction_bias) + + router_logits = x + lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias, routed_scaling_factor=routed_scaling_factor, topk=topk)(router_logits) + lable_indices = lable_indices.to(torch.int32) + + return lable_values.numpy(), lable_indices.numpy() + + +class TopkrouterTestCase(InfiniopTestCase): + def __init__(self, + values: np.ndarray, # 传出参数 + indices: np.ndarray, # 传出参数 + x: np.ndarray, # 传入参数 + correction_bias: np.ndarray, # 传入参数 + routed_scaling_factor: float, + topk: int, + values_shape: List[int] | None, + values_strides: List[int] | None, + indices_shape: List[int] | None, + indices_strides: List[int] | None, + x_shape: List[int] | None, + x_strides: List[int] | None, + correction_bias_shape: List[int] | None, + correction_bias_strides: List[int] | None, + ): + super().__init__("topkrouter") + self.values = values + self.indices = indices + self.x = x + self.correction_bias = correction_bias + + self.routed_scaling_factor = routed_scaling_factor + self.topk = topk + + self.values_shape = values_shape + self.values_strides = values_strides + self.indices_shape = indices_shape + self.indices_strides = 
indices_strides + self.x_shape = x_shape + self.x_strides = x_strides + self.correction_bias_shape = correction_bias_shape + self.correction_bias_strides = correction_bias_strides + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + + if self.values_shape is not None: + print("self.values_shape: ", self.values_shape) + test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape) + if self.indices_shape is not None: + test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape) + if self.x_shape is not None: + test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + if self.correction_bias_shape is not None: + test_writer.add_array(test_writer.gguf_key("correction_bias.shape"), self.correction_bias_shape) + + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + if self.correction_bias_strides is not None: + test_writer.add_array(test_writer.gguf_key("correction_bias_strides.strides"), gguf_strides(*self.correction_bias_strides)) + test_writer.add_array( + test_writer.gguf_key("values.strides"), + gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape)) + ) + test_writer.add_array( + test_writer.gguf_key("indices.strides"), + gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape)) + ) + + test_writer.add_tensor(test_writer.gguf_key("values"), + self.values, + raw_dtype=np_dtype_to_ggml(self.values.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("indices"), + self.indices, + raw_dtype=np_dtype_to_ggml(self.indices.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("x"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("correction_bias"), + self.correction_bias, + 
raw_dtype=np_dtype_to_ggml(self.correction_bias.dtype)) + + test_writer.add_float32(test_writer.gguf_key("routed_scaling_factor"), self.routed_scaling_factor) + test_writer.add_int32(test_writer.gguf_key("topk"), self.topk) + + lable_values, lable_indices = python_topkrouter(self.x.copy(), self.correction_bias.copy(), self.routed_scaling_factor, self.topk) + + test_writer.add_tensor( + test_writer.gguf_key("lable_values"), + lable_values, + raw_dtype=np_dtype_to_ggml(lable_values.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("lable_indices"), + lable_indices, + raw_dtype=np_dtype_to_ggml(lable_indices.dtype) + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("topkrouter.gguf") + test_cases = [] + + _TEST_CASES_ = [ + # x_shape, x_strides, correction_bias_shape, correction_bias_stride, routed_scaling_factor, topk + ((1, 256), None, (256,), None, 2.5, 8), + ((2, 256), None, (256,), None, 1.5, 8), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for x_shape, x_strides, correction_bias_shape, b_stride, routed_scaling_factor, topk in _TEST_CASES_: + ntoken = x_shape[0] + + values_indices_shape = (ntoken, topk) + values = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.float32) + indices = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.int32) + + x = np.random.rand(*x_shape).astype(dtype) + correction_bias = np.random.rand(*correction_bias_shape).astype(np.float32) + test_case = TopkrouterTestCase( + values=values, + indices=indices, + x=x, + correction_bias=correction_bias, + routed_scaling_factor=routed_scaling_factor, + topk=topk, + values_shape=list(values_indices_shape), + values_strides=None, + indices_shape=list(values_indices_shape), + indices_strides=None, + x_shape=list(x_shape), + x_strides=None, + correction_bias_shape=list(correction_bias_shape), + correction_bias_strides=None, + ) + + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + 
test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/topksoftmax.py b/test/infiniop-test/test_generate/testcases/topksoftmax.py new file mode 100644 index 000000000..337c4b9e9 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/topksoftmax.py @@ -0,0 +1,148 @@ +import numpy as np +from typing import List +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +def torch_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool): + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + if norm_topk_prob: # only diff with mixtral sparse moe block! + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(torch.float) + return routing_weights, selected_experts + + +def python_Topksoftmax(router_logits, top_k: int, norm_topk_prob: bool): + router_logits = torch.from_numpy(router_logits) + lable_values, lable_indices = torch_Topksoftmax(router_logits, top_k, norm_topk_prob) + return lable_values.numpy(), lable_indices.numpy() + + +class TopksoftmaxTestCase(InfiniopTestCase): + def __init__(self, + values: np.ndarray, # 传出参数 + indices: np.ndarray, # 传出参数 + x: np.ndarray, # 传入参数 + topk: np.ndarray, + norm: bool, + values_shape: List[int] | None, + values_strides: List[int] | None, + indices_shape: List[int] | None, + indices_strides: List[int] | None, + x_shape: List[int] | None, + x_strides: List[int] | None, + ): + super().__init__("topksoftmax") + self.values = values + self.indices = indices + self.x = x + self.topk = topk + self.norm = norm + + self.values_shape = values_shape + self.values_strides = values_strides + 
self.indices_shape = indices_shape + self.indices_strides = indices_strides + self.x_shape = x_shape + self.x_strides = x_strides + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + + if self.values_shape is not None: + print("self.values_shape: ", self.values_shape) + test_writer.add_array(test_writer.gguf_key("values.shape"), self.values_shape) + if self.indices_shape is not None: + test_writer.add_array(test_writer.gguf_key("indices.shape"), self.indices_shape) + if self.x_shape is not None: + test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + + test_writer.add_array( + test_writer.gguf_key("values.strides"), + gguf_strides(*self.values_strides if self.values_strides is not None else contiguous_gguf_strides(self.values_shape)) + ) + + test_writer.add_array( + test_writer.gguf_key("indices.strides"), + gguf_strides(*self.indices_strides if self.indices_strides is not None else contiguous_gguf_strides(self.indices_shape)) + ) + + test_writer.add_tensor(test_writer.gguf_key("values"), + self.values, + raw_dtype=np_dtype_to_ggml(self.values.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("indices"), + self.indices, + raw_dtype=np_dtype_to_ggml(self.indices.dtype)) + + test_writer.add_tensor(test_writer.gguf_key("x"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype)) + + test_writer.add_int32(test_writer.gguf_key("topk"), self.topk) + test_writer.add_bool(test_writer.gguf_key("norm"), self.norm) + + lable_values, lable_indices = python_Topksoftmax(self.x.copy(), self.topk, self.norm) + + test_writer.add_tensor( + test_writer.gguf_key("lable_values"), + lable_values, + raw_dtype=np_dtype_to_ggml(lable_values.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("lable_indices"), + lable_indices, + raw_dtype=np_dtype_to_ggml(lable_indices.dtype) + ) + + +if 
__name__ == "__main__": + test_writer = InfiniopTestWriter("topksoftmax.gguf") + test_cases = [] + + _TEST_CASES_ = [ + # x_shape, x_strides, topk, norm + ((1, 32), None, 4, True), + ((8, 20), None, 8, False), + ((2, 128), None, 10, True) + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for x_shape, x_strides, topk, norm in _TEST_CASES_: + ntoken = x_shape[0] + + values_indices_shape = (ntoken, topk) + values = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.float32) + indices = np.empty(tuple(0 for _ in values_indices_shape), dtype=np.int32) + + x = np.random.rand(*x_shape).astype(dtype) + + test_case = TopksoftmaxTestCase( + values=values, + indices=indices, + x=x, + topk=topk, + norm=norm, + values_shape=list(values_indices_shape), + values_strides=None, + indices_shape=list(values_indices_shape), + indices_strides=None, + x_shape=list(x_shape), + x_strides=None + ) + + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index ba1ce33df..c47da4ea4 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -495,6 +495,66 @@ def conv_(lib): ] +@OpRegister.operator +def sigmoid_(lib): + lib.infiniopCreateSigmoidDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetSigmoidWorkspaceSize.restype = c_int32 + lib.infiniopGetSigmoidWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopSigmoid.restype = c_int32 + lib.infiniopSigmoid.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySigmoidDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidDescriptor.argtypes = [ 
+ infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def topksoftmax_(lib): + lib.infiniopCreateTopksoftmaxDescriptor.restype = c_int32 + lib.infiniopCreateTopksoftmaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + ] + lib.infiniopGetTopksoftmaxWorkspaceSize.restype = c_int32 + lib.infiniopGetTopksoftmaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTopksoftmax.restype = c_int32 + lib.infiniopTopksoftmax.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_size_t, + c_int32, + c_void_p, + ] + lib.infiniopDestroyTopksoftmaxDescriptor.restype = c_int32 + lib.infiniopDestroyTopksoftmaxDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + @OpRegister.operator def topkrouter_(lib): lib.infiniopCreateTopkrouterDescriptor.restype = c_int32 @@ -502,7 +562,7 @@ def topkrouter_(lib): infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), infiniopTensorDescriptor_t, - infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t ] lib.infiniopGetTopkrouterWorkspaceSize.restype = c_int32 @@ -510,7 +570,6 @@ def topkrouter_(lib): infiniopOperatorDescriptor_t, POINTER(c_size_t), ] - lib.infiniopTopkrouter.restype = c_int32 lib.infiniopTopkrouter.argtypes = [ infiniopOperatorDescriptor_t, @@ -524,7 +583,6 @@ def topkrouter_(lib): c_size_t, c_void_p, ] - lib.infiniopDestroyTopkrouterDescriptor.restype = c_int32 lib.infiniopDestroyTopkrouterDescriptor.argtypes = [ infiniopOperatorDescriptor_t, diff --git a/test/infiniop/sigmoid.py b/test/infiniop/sigmoid.py new file mode 100644 index 000000000..589900fac --- /dev/null +++ b/test/infiniop/sigmoid.py @@ -0,0 +1,172 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + 
TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, x_stride, y_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), (0, 1)), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (4, 0, 1)), + ((16, 5632), None, None), + ((16, 5632), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), + ((4, 4, 56320), None, None), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_X, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def torch_sigmoid(y, x): + torch.sigmoid(x, out=y) + +def test( + handle, + device, + shape, + x_stride=None, + y_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + x = TestTensor(shape, x_stride, dtype, device) + if inplace == Inplace.INPLACE_X: + if x_stride != y_stride: + return + y = x + else: + y = TestTensor(shape, y_stride, dtype, 
device, mode="ones") + + if y.is_broadcast(): + return + + print( + f"Testing Sigmoid on {InfiniDeviceNames[device]} with shape:{shape} x_stride:{x_stride} y_stride:{y_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + torch_sigmoid(y.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x, y]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_sigmoid(): + check_error( + LIBINFINIOP.infiniopSigmoid( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_sigmoid() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_sigmoid(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sigmoid(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySigmoidDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92m Test passed! 
\033[0m") + diff --git a/test/infiniop/topkrouter.py b/test/infiniop/topkrouter.py index 6f851a89f..97b9e37e8 100644 --- a/test/infiniop/topkrouter.py +++ b/test/infiniop/topkrouter.py @@ -27,14 +27,13 @@ # ============================================================================== # These are not meant to be imported from other modules _TEST_CASES_ = [ - # x_shape, x_stride, select_experts - ((1, 256), None, 8), - ((3, 256), None, 8), + # x_shape, x_stride, topk, routed_scaling_factor + ((1, 256), None, 8, 2.5), ] # w (weight) types # Note: 'None' means the same as input dtype -_X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] # +_X_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] # x types used for testing _VALUE_DTYPES = [InfiniDtype.F32] @@ -46,6 +45,8 @@ # Tolerance map for different data types _TOLERANCE_MAP = { InfiniDtype.F32: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, } DEBUG = False @@ -59,12 +60,13 @@ def tensorInfo(data): class DeepseekV3TopkRouter(nn.Module): - def __init__(self, correction_bias, config=None): + def __init__(self, correction_bias, routed_scaling_factor: float = 2.5, topk: int = 8, config=None): super().__init__() self.config = config - self.top_k = 8 # config.num_experts_per_tok + self.top_k = topk # config.num_experts_per_tok 8 + assert topk == 8 self.n_routed_experts = 256 # config.n_routed_experts - self.routed_scaling_factor = 2.5 # config.routed_scaling_factor + self.routed_scaling_factor = routed_scaling_factor # config.routed_scaling_factor 2.5 self.n_group = 8 # config.n_group self.topk_group = 4 # config.topk_group self.norm_topk_prob = True # config.norm_topk_prob @@ -73,7 +75,7 @@ def __init__(self, correction_bias, config=None): # self.weight = torch.rand(256, 7168) * 2 - 1 # self.register_buffer("e_score_correction_bias", torch.zeros(self.n_routed_experts)) - self.e_score_correction_bias = 
torch.zeros(256, device="cuda") + self.e_score_correction_bias = torch.zeros(256, device=correction_bias.device) self.e_score_correction_bias[:] = correction_bias[:] @torch.no_grad() @@ -114,11 +116,12 @@ def forward(self, router_logits): denominator = topk_weights.sum(dim=-1, keepdim=True) + 1e-20 topk_weights /= denominator topk_weights = topk_weights * self.routed_scaling_factor + return topk_indices, topk_weights -def torch_topkrouter(router_logits, correction_bias): - lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias)(router_logits) +def torch_topkrouter(router_logits, correction_bias, routed_scaling_factor, topk): + lable_indices, lable_values = DeepseekV3TopkRouter(correction_bias, routed_scaling_factor, topk)(router_logits) lable_indices = lable_indices.to(torch.int32) return lable_values, lable_indices @@ -129,12 +132,13 @@ def test( x_shape, x_stride, topk, + routed_scaling_factor, x_dtype=InfiniDtype.F32, dtype=InfiniDtype.F16, sync=None, ): print( - f"Testing topkrouter on {InfiniDeviceNames[device]} with x_shape:{x_shape}" + f"Testing topkrouter on {InfiniDeviceNames[device]} with x_shape:{x_shape} " f"x_stride:{x_stride} w_dtype:{InfiniDtypeNames[x_dtype]} dtype:{InfiniDtypeNames[dtype]}" ) @@ -182,27 +186,27 @@ def lib_topkrouter(): indices.data_ptr(), x.data(), correction_bias.data(), - 2.5, + routed_scaling_factor, topk, None, ) ) lib_topkrouter() - lable_values, lable_indices = torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor()) + lable_values, lable_indices = torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor(), routed_scaling_factor, topk) atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) if DEBUG: debug(lable_values, values, atol=atol, rtol=rtol) debug(lable_indices, indices, atol=atol, rtol=rtol) assert torch.allclose(lable_values, values, atol=atol, rtol=rtol) - assert torch.allclose(lable_indices, lable_indices, atol=atol, rtol=rtol) + assert torch.allclose(lable_indices, indices, atol=atol, 
rtol=rtol) # Profiling workflow if PROFILE: # fmt: off - profile_operation("PyTorch", lambda: torch_topkrouter(x.actual_tensor().clone(), tokp), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation("PyTorch", lambda: torch_topkrouter(x.actual_tensor(), correction_bias.actual_tensor(), routed_scaling_factor, topk), device, NUM_PRERUN, NUM_ITERATIONS) profile_operation(" lib", lambda: lib_topkrouter(), device, NUM_PRERUN, NUM_ITERATIONS) # fmt: on check_error(LIBINFINIOP.infiniopDestroyTopkrouterDescriptor(descriptor)) diff --git a/test/infiniop/topksoftmax.py b/test/infiniop/topksoftmax.py new file mode 100644 index 000000000..8529b8ffd --- /dev/null +++ b/test/infiniop/topksoftmax.py @@ -0,0 +1,170 @@ +import torch +import ctypes +from ctypes import c_uint64 +import torch.nn.functional as F + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + torch_device_map +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # x_shape, x_stride, topk, norm + ((1, 10), None, 7, True), + ((2, 20), None, 4, True), + ((1, 128), None, 10, True), +] + +# w (weight) types +# Note: 'None' means the same as input dtype +_X_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] # +# x types used for testing +_VALUE_DTYPES = [InfiniDtype.F32] + +# Form the test cases by appending each element of _X_DTYPES to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (x_dtype,) for test_case in _TEST_CASES_ for x_dtype in _X_DTYPES +] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F32: {"atol": 1e-3, 
"rtol": 1e-3}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tensorInfo(data): + print("data: ", data.is_contiguous(), data.device, data.dtype, data.shape, data.stride(), data.data_ptr(), hex(data.data_ptr())) + + +def torch_topksoftmax(router_logits, top_k, norm_topk_prob=False): + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1) + if norm_topk_prob: # only diff with mixtral sparse moe block! + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + return routing_weights, selected_experts + + +def test( + handle, + device, + x_shape, + x_stride, + topk, + norm_topk_prob, + x_dtype=InfiniDtype.F32, + dtype=InfiniDtype.F16, + sync=None, +): + print( + f"Testing topksoftmax on {InfiniDeviceNames[device]} with x_shape:{x_shape}" + f"x_stride:{x_stride} w_dtype:{InfiniDtypeNames[x_dtype]} dtype:{InfiniDtypeNames[dtype]}" + ) + + data = torch.arange(0, x_shape[0] * x_shape[1]).reshape(x_shape) + + N, width = x_shape + x = TestTensor(x_shape, data.stride(), x_dtype, device, scale=0.5, mode="manual", set_tensor=data) + + # print(x.torch_tensor()) + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTopksoftmaxDescriptor( + handle, + ctypes.byref(descriptor), + x.descriptor + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [x]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTopksoftmaxWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + values = torch.zeros((N, topk), dtype=torch.float32, device=torch_device_map[x.device]) + indices = torch.zeros((N, 
topk), dtype=torch.int32, device=torch_device_map[x.device]) + + def lib_topksoftmax(): + check_error( + LIBINFINIOP.infiniopTopksoftmax( + descriptor, + workspace.data(), + workspace_size.value, + values.data_ptr(), + indices.data_ptr(), + x.data(), + topk, + norm_topk_prob, + None, + ) + ) + + lable_values, lable_indices = torch_topksoftmax(x.torch_tensor().clone(), topk, norm_topk_prob=norm_topk_prob) + lable_indices = lable_indices.to(dtype=torch.int32) + lib_topksoftmax() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(lable_values, values, atol=atol, rtol=rtol) + debug(lable_indices, indices, atol=atol, rtol=rtol) + + assert torch.allclose(lable_values, values, atol=atol, rtol=rtol) + assert torch.allclose(lable_indices, indices, atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_topksoftmax(x.actual_tensor().clone(), topk), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_topksoftmax(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTopksoftmaxDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + # Execute tests + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _VALUE_DTYPES) + + print("\033[92mTest passed!\033[0m") +