Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/infinicore/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include "ops/paged_attention.hpp"
#include "ops/paged_attention_prefill.hpp"
#include "ops/paged_caching.hpp"
#include "ops/per_tensor_dequant_i8.hpp"
#include "ops/per_tensor_quant_i8.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
#include "ops/reciprocal.hpp"
Expand Down
11 changes: 11 additions & 0 deletions include/infinicore/ops/per_tensor_dequant_i8.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Graph-op wrapper for per-tensor int8 dequantization.
// Argument order mirrors per_tensor_dequant_i8_ below: (x /*output*/, x_packed, x_scale, x_zero).
INFINICORE_GRAPH_OP_CLASS(PerTensorDequantI8, Tensor, const Tensor &, const Tensor &, const Tensor &);

// Dequantizes the int8 tensor `x_packed` into `x` using the per-tensor scale
// `x_scale` and zero point `x_zero`. Trailing underscore marks the in-place
// convention: `x` is written in place.
void per_tensor_dequant_i8_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero);
} // namespace infinicore::op
11 changes: 11 additions & 0 deletions include/infinicore/ops/per_tensor_quant_i8.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Graph-op wrapper for per-tensor int8 quantization.
// Argument order mirrors per_tensor_quant_i8_ below: (x, x_packed /*out*/, x_scale /*out*/, x_zero /*out*/, is_static).
INFINICORE_GRAPH_OP_CLASS(PerTensorQuantI8, const Tensor &, Tensor, Tensor, Tensor, bool);

// Quantizes `x` into the int8 tensor `x_packed`, producing the per-tensor
// scale `x_scale` and zero point `x_zero`.
// `is_static` presumably selects static (pre-computed scale) vs. dynamic
// (computed from `x`) quantization — TODO confirm against the backend kernel.
void per_tensor_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static);
} // namespace infinicore::op
5 changes: 5 additions & 0 deletions include/infinicore/quantization/quantization_scheme.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,9 @@ enum class QuantScheme {
AWQ_W4A16,
};

// Quantization scheme applied to the KV cache.
enum class KVQuantScheme {
NONE, // KV cache kept in its original dtype (no quantization)
INT8, // KV cache quantized to int8
};

} // namespace infinicore::quantization
2 changes: 2 additions & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/cross_entropy.h"
#include "infiniop/ops/dequant/per_tensor_dequant_int8.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/embedding.h"
#include "infiniop/ops/equal.h"
Expand All @@ -34,6 +35,7 @@
#include "infiniop/ops/paged_attention_prefill.h"
#include "infiniop/ops/paged_caching.h"
#include "infiniop/ops/quant/per_channel_quant_int8.h"
#include "infiniop/ops/quant/per_tensor_quant_int8.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/reciprocal.h"
Expand Down
28 changes: 28 additions & 0 deletions include/infiniop/ops/dequant/per_tensor_dequant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle to a configured per-tensor int8 dequantization operator.
typedef InfiniopDescriptor *infiniopPerTensorDequantI8Descriptor_t;

// Creates a descriptor for dequantizing an int8 tensor (x_packed) into a
// floating-point tensor (x) with a single per-tensor scale and zero point.
// x_desc describes the output; x_packed_desc must have the same shape.
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorDequantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc);

// Queries the scratch-workspace size required by infiniopPerTensorDequantI8
// (presumably in bytes, matching other infiniop workspace queries — confirm).
__INFINI_C __export infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size);

// Executes the dequantization: x = dequant(x_packed, x_scale, x_zero),
// asynchronously on `stream`.
__INFINI_C __export infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream);

// Releases the descriptor and any resources it owns.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc);

#endif
29 changes: 29 additions & 0 deletions include/infiniop/ops/quant/per_tensor_quant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle to a configured per-tensor int8 quantization operator.
typedef InfiniopDescriptor *infiniopPerTensorQuantI8Descriptor_t;

// Creates a descriptor for quantizing a floating-point tensor (x) into an
// int8 tensor (x_packed) with a single per-tensor scale and zero point.
// NOTE: output descriptors come first here (x_packed, x_scale, x_zero, x) —
// the reverse of the dequant API's ordering.
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

// Queries the scratch-workspace size required by infiniopPerTensorQuantI8
// (presumably in bytes, matching other infiniop workspace queries — confirm).
__INFINI_C __export infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size);

// Executes the quantization: (x_packed, x_scale, x_zero) = quant(x),
// asynchronously on `stream`. `is_static` presumably selects static
// (pre-supplied scale) vs. dynamic quantization — confirm with the kernel.
__INFINI_C __export infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
const bool is_static,
void *stream);

// Releases the descriptor and any resources it owns.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc);

#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "../../../utils.hpp"
#include "infinicore/ops/per_tensor_dequant_i8.hpp"

namespace infinicore::op {

// Instantiates the per-device dispatcher table for this op.
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerTensorDequantI8);

// Validates device placement and dispatches to the backend registered for
// x's device type. Construction triggers the dispatch.
PerTensorDequantI8::PerTensorDequantI8(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zero);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zero);
}

// Either records the op into the graph being captured or runs it eagerly,
// depending on the current graph-recording state (macro decides).
void PerTensorDequantI8::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerTensorDequantI8, x, x_packed, x_scale, x_zero);
}

// Public entry point: dequantizes int8 `x_packed` into `x` in place.
void per_tensor_dequant_i8_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
PerTensorDequantI8::execute(x, x_packed, x_scale, x_zero);
}
} // namespace infinicore::op
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "../../infiniop_impl.hpp"
#include "infinicore/ops/per_tensor_dequant_i8.hpp"

namespace infinicore::op::per_tensor_dequant_i8_impl::infiniop {

// Cached infiniop descriptor wrapper (cache capacity 100).
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerTensorDequantI8, 100);

// Everything `run` needs, captured once at plan time.
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale, x_zero;
};

// Builds (or fetches from cache) the infiniop descriptor and snapshots the
// tensors as graph tensors for later execution.
// NOTE(review): the cache seed hashes only x, x_packed and x_scale — x_zero
// does not participate, so plans differing only in x_zero could share a
// descriptor; confirm this is intentional (symmetric-only path?).
void *plan(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
size_t seed = hash_combine(x, x_packed, x_scale);

INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PerTensorDequantI8,
seed,
x->desc(), x_packed->desc(), x_scale->desc(), (x_zero ? x_zero->desc() : nullptr));

INFINIOP_WORKSPACE_TENSOR(workspace, PerTensorDequantI8, descriptor);

return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale),
graph::GraphTensor(x_zero)};
}

// Executes the planned op on the current stream.
// NOTE(review): x_zero is captured in PlannedMeta but `nullptr` is passed to
// the C call below — dead capture or missing `planned->x_zero->data()`?
// NOTE(review): workspace->numel() is passed as workspace_size; correct only
// if the workspace tensor is byte-typed — confirm.
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);

INFINICORE_CHECK_ERROR(infiniopPerTensorDequantI8(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x->data(),
planned->x_packed->data(),
planned->x_scale->data(),
nullptr,
context::getStream()));
}

// Frees the plan and nulls the caller's pointer to prevent double-free.
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}

// Registers this infiniop-backed implementation for every device type.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerTensorDequantI8, &plan, &run, &cleanup);

} // namespace infinicore::op::per_tensor_dequant_i8_impl::infiniop
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "../../../utils.hpp"
#include "infinicore/ops/per_tensor_quant_i8.hpp"

namespace infinicore::op {

// Instantiates the per-device dispatcher table for this op.
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerTensorQuantI8);

// Validates device placement and dispatches to the backend registered for
// x's device type. Construction triggers the dispatch.
PerTensorQuantI8::PerTensorQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zero);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zero, is_static);
}

// Either records the op into the graph being captured or runs it eagerly,
// depending on the current graph-recording state (macro decides).
void PerTensorQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerTensorQuantI8, x, x_packed, x_scale, x_zero, is_static);
}

// Public entry point: quantizes `x` into int8 `x_packed` with per-tensor
// scale/zero outputs. `is_static` presumably selects static vs. dynamic
// quantization — TODO confirm against the backend kernel.
void per_tensor_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
PerTensorQuantI8::execute(x, x_packed, x_scale, x_zero, is_static);
}
} // namespace infinicore::op
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../../infiniop_impl.hpp"
#include "infinicore/ops/per_tensor_quant_i8.hpp"

namespace infinicore::op::per_tensor_quant_i8_impl::infiniop {

// Cached infiniop descriptor wrapper (cache capacity 100).
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerTensorQuantI8, 100);

// Everything `run` needs, captured once at plan time.
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale, x_zero;
const bool is_static;
};

// Builds (or fetches from cache) the infiniop descriptor and snapshots the
// tensors as graph tensors for later execution.
// NOTE(review): the cache seed hashes only x, x_packed and x_scale — x_zero
// does not participate, so plans differing only in x_zero could share a
// descriptor; confirm this is intentional (symmetric-only path?).
void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
size_t seed = hash_combine(x, x_packed, x_scale);

INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PerTensorQuantI8,
seed,
x_packed->desc(), x_scale->desc(), (x_zero ? x_zero->desc() : nullptr), x->desc());

INFINIOP_WORKSPACE_TENSOR(workspace, PerTensorQuantI8, descriptor);

return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale),
graph::GraphTensor(x_zero),
is_static};
}

// Executes the planned op on the current stream.
// NOTE(review): x_zero is captured in PlannedMeta but `nullptr` is passed to
// the C call below — dead capture or missing `planned->x_zero->data()`?
// NOTE(review): workspace->numel() is passed as workspace_size; correct only
// if the workspace tensor is byte-typed — confirm.
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
const bool is_static = planned->is_static;
INFINICORE_CHECK_ERROR(infiniopPerTensorQuantI8(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x_packed->data(),
planned->x_scale->data(),
nullptr,
planned->x->data(),
is_static,
context::getStream()));
}

// Frees the plan and nulls the caller's pointer to prevent double-free.
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}

// Registers this infiniop-backed implementation for every device type.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerTensorQuantI8, &plan, &run, &cleanup);

} // namespace infinicore::op::per_tensor_quant_i8_impl::infiniop
36 changes: 36 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
#define __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__

template <typename Tin, typename Tout>
__device__ void perTensorDequantI8SymKernel(
    Tout *x, const Tin *x_packed, const float *x_scale,
    size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
    ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
    ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3,
    int num_elements) {
    // Symmetric per-tensor int8 dequantization of a strided (up to 4-D) tensor:
    //   x[b, c, h, w] = (Tout)(x_packed[b, c, h, w] * scale)
    // Grid-stride loop: each thread processes every `step`-th flattened element.
    const int step = blockDim.x * gridDim.x;
    const float scale = x_scale[0]; // single scale shared by the whole tensor

    for (int flat = blockIdx.x * blockDim.x + threadIdx.x; flat < num_elements; flat += step) {
        // Unravel the flat index into (b, c, h, w) coordinates.
        int rest = flat;
        const int w = rest % (int)width;
        rest /= (int)width;
        const int h = rest % (int)hidden_dim;
        rest /= (int)hidden_dim;
        const int c = rest % (int)channel;
        rest /= (int)channel;
        const int b = rest % (int)batch_size;

        // Apply per-tensor strides separately for output and packed input,
        // so both may be non-contiguous.
        const int out_off = b * (int)strides_0 + c * (int)strides_1 + h * (int)strides_2 + w * (int)strides_3;
        const int in_off = b * (int)p_strides_0 + c * (int)p_strides_1 + h * (int)p_strides_2 + w * (int)p_strides_3;

        x[out_off] = static_cast<Tout>(static_cast<float>(x_packed[in_off]) * scale);
    }
}

#endif // __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
76 changes: 76 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_INFO_H__
#define __PER_TENSOR_DEQUANT_INT8_INFO_H__

#include "../../../../utils.h"
#include "../../../operator.h"
#include "../../../tensor.h"

#include <climits>

namespace op::per_tensor_dequant_int8 {

class PerTensorDequantI8Info {
private:
    PerTensorDequantI8Info() = default;

public:
    // Output dtype (F16/BF16/F32) and packed input dtype (always I8).
    infiniDtype_t dtype, packed_type;
    // Logical dims, left-padded with 1 so every tensor is treated as 4-D.
    size_t batch_size, channel, hidden_dim, width;
    // Output strides (elements); padded dims get stride 0.
    ptrdiff_t strides_0, strides_1, strides_2, strides_3;
    // Packed-input strides (elements); padded dims get stride 0.
    ptrdiff_t p_strides_0, p_strides_1, p_strides_2, p_strides_3;
    // Total element count; the factory guarantees it fits in int.
    int num_elements;

    /// Validates the tensor descriptors and builds the kernel metadata.
    /// @param x_desc        output tensor (F16/BF16/F32), 1-4 dims
    /// @param x_packed_desc int8 input, same shape as x
    /// @param x_scale_desc  per-tensor scale
    /// @param x_zero_desc   per-tensor zero point; not validated here
    ///                      (NOTE(review): symmetric-only path? confirm)
    /// @return populated info, or a status on invalid input
    static utils::Result<PerTensorDequantI8Info> createPerTensorDequantI8Info(
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t x_packed_desc,
        infiniopTensorDescriptor_t x_scale_desc,
        infiniopTensorDescriptor_t x_zero_desc) {

        CHECK_OR_RETURN(
            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = x_desc->dtype();
        const infiniDtype_t packed_type = x_packed_desc->dtype();

        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);

        auto shape = x_desc->shape();
        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());

        auto ndim = x_desc->ndim();
        // Fix: also reject 0-D tensors — the reads below index shape[ndim - 1]
        // and strides()[ndim - 1], which underflow when ndim == 0.
        CHECK_OR_RETURN(ndim >= 1 && ndim <= 4,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        // Left-pad missing leading dims with size 1 / stride 0 so the kernel
        // can always iterate a fixed 4-D (b, c, h, w) layout.
        size_t width = shape[ndim - 1];
        size_t hidden_dim = (ndim > 1 ? shape[ndim - 2] : 1);
        size_t channel = (ndim > 2 ? shape[ndim - 3] : 1);
        size_t batch_size = (ndim > 3 ? shape[ndim - 4] : 1);

        ptrdiff_t strides_3 = x_desc->strides()[ndim - 1];
        ptrdiff_t strides_2 = (ndim > 1 ? x_desc->strides()[ndim - 2] : 0);
        ptrdiff_t strides_1 = (ndim > 2 ? x_desc->strides()[ndim - 3] : 0);
        ptrdiff_t strides_0 = (ndim > 3 ? x_desc->strides()[ndim - 4] : 0);

        ptrdiff_t p_strides_3 = x_packed_desc->strides()[ndim - 1];
        ptrdiff_t p_strides_2 = (ndim > 1 ? x_packed_desc->strides()[ndim - 2] : 0);
        ptrdiff_t p_strides_1 = (ndim > 2 ? x_packed_desc->strides()[ndim - 3] : 0);
        ptrdiff_t p_strides_0 = (ndim > 3 ? x_packed_desc->strides()[ndim - 4] : 0);

        // Fix: accumulate the element count in size_t and bounds-check it
        // instead of multiplying into int, which silently overflows (UB)
        // for tensors with more than INT_MAX elements.
        size_t total = 1;
        for (size_t i = 0; i < static_cast<size_t>(ndim); i++) {
            total *= shape[i];
        }
        CHECK_OR_RETURN(total <= static_cast<size_t>(INT_MAX),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        int num_elements = static_cast<int>(total);

        return utils::Result<PerTensorDequantI8Info>(PerTensorDequantI8Info{
            dtype,
            packed_type,
            batch_size, channel, hidden_dim, width,
            strides_0, strides_1, strides_2, strides_3,
            p_strides_0, p_strides_1, p_strides_2, p_strides_3,
            num_elements});
    }
};

} // namespace op::per_tensor_dequant_int8

#endif // __PER_TENSOR_DEQUANT_INT8_INFO_H__
Loading
Loading