Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/infinicore/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
#include "ops/paged_attention.hpp"
#include "ops/paged_attention_prefill.hpp"
#include "ops/paged_caching.hpp"
#include "ops/per_tensor_dequant_i8.hpp"
#include "ops/per_tensor_quant_i8.hpp"
#include "ops/random_sample.hpp"
#include "ops/rearrange.hpp"
#include "ops/reciprocal.hpp"
Expand Down
11 changes: 11 additions & 0 deletions include/infinicore/ops/per_tensor_dequant_i8.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Graph-op wrapper for per-tensor int8 dequantization.
// Argument order mirrors per_tensor_dequant_i8_ below: (x /*output*/, x_packed, x_scale, x_zero).
INFINICORE_GRAPH_OP_CLASS(PerTensorDequantI8, Tensor, const Tensor &, const Tensor &, const Tensor &);

// Dequantizes the int8 tensor `x_packed` into `x` using the per-tensor scale
// `x_scale` and zero point `x_zero`. Trailing underscore marks the in-place
// convention: `x` is written in place.
void per_tensor_dequant_i8_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero);
} // namespace infinicore::op
11 changes: 11 additions & 0 deletions include/infinicore/ops/per_tensor_quant_i8.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"

namespace infinicore::op {

// Graph-op wrapper for per-tensor int8 quantization.
// Argument order mirrors per_tensor_quant_i8_ below: (x, x_packed /*out*/, x_scale /*out*/, x_zero /*out*/, is_static).
INFINICORE_GRAPH_OP_CLASS(PerTensorQuantI8, const Tensor &, Tensor, Tensor, Tensor, bool);

// Quantizes `x` into the int8 tensor `x_packed`, producing the per-tensor
// scale `x_scale` and zero point `x_zero`.
// `is_static` presumably selects static (pre-computed scale) vs. dynamic
// (computed from `x`) quantization — TODO confirm against the backend kernel.
void per_tensor_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static);
} // namespace infinicore::op
5 changes: 5 additions & 0 deletions include/infinicore/quantization/quantization_scheme.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,9 @@ enum class QuantScheme {
AWQ_W4A16,
};

// Quantization scheme applied to the KV cache.
enum class KVQuantScheme {
NONE, // KV cache kept in its original dtype (no quantization)
INT8, // KV cache quantized to int8
};

} // namespace infinicore::quantization
2 changes: 2 additions & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "infiniop/ops/clip.h"
#include "infiniop/ops/conv.h"
#include "infiniop/ops/cross_entropy.h"
#include "infiniop/ops/dequant/per_tensor_dequant_int8.h"
#include "infiniop/ops/dequantize_awq.h"
#include "infiniop/ops/embedding.h"
#include "infiniop/ops/equal.h"
Expand All @@ -34,6 +35,7 @@
#include "infiniop/ops/paged_attention_prefill.h"
#include "infiniop/ops/paged_caching.h"
#include "infiniop/ops/quant/per_channel_quant_int8.h"
#include "infiniop/ops/quant/per_tensor_quant_int8.h"
#include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h"
#include "infiniop/ops/reciprocal.h"
Expand Down
28 changes: 28 additions & 0 deletions include/infiniop/ops/dequant/per_tensor_dequant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_DEQUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle to a configured per-tensor int8 dequantization operator.
typedef InfiniopDescriptor *infiniopPerTensorDequantI8Descriptor_t;

// Creates a descriptor for dequantizing an int8 tensor (x_packed) into a
// floating-point tensor (x) with a single per-tensor scale and zero point.
// x_desc describes the output; x_packed_desc must have the same shape.
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorDequantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorDequantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_desc,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc);

// Queries the scratch-workspace size required by infiniopPerTensorDequantI8
// (presumably in bytes, matching other infiniop workspace queries — confirm).
__INFINI_C __export infiniStatus_t infiniopGetPerTensorDequantI8WorkspaceSize(infiniopPerTensorDequantI8Descriptor_t desc, size_t *size);

// Executes the dequantization: x = dequant(x_packed, x_scale, x_zero),
// asynchronously on `stream`.
__INFINI_C __export infiniStatus_t infiniopPerTensorDequantI8(infiniopPerTensorDequantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x,
const void *x_packed,
const void *x_scale,
const void *x_zero,
void *stream);

// Releases the descriptor and any resources it owns.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorDequantI8Descriptor(infiniopPerTensorDequantI8Descriptor_t desc);

#endif
29 changes: 29 additions & 0 deletions include/infiniop/ops/quant/per_tensor_quant_int8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__
#define __INFINIOP_PER_TENSOR_QUANT_INT8_API_H__

#include "../../operator_descriptor.h"

// Opaque handle to a configured per-tensor int8 quantization operator.
typedef InfiniopDescriptor *infiniopPerTensorQuantI8Descriptor_t;

// Creates a descriptor for quantizing a floating-point tensor (x) into an
// int8 tensor (x_packed) with a single per-tensor scale and zero point.
// NOTE: output descriptors come first here (x_packed, x_scale, x_zero, x) —
// the reverse of the dequant API's ordering.
__INFINI_C __export infiniStatus_t infiniopCreatePerTensorQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerTensorQuantI8Descriptor_t *desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc);

// Queries the scratch-workspace size required by infiniopPerTensorQuantI8
// (presumably in bytes, matching other infiniop workspace queries — confirm).
__INFINI_C __export infiniStatus_t infiniopGetPerTensorQuantI8WorkspaceSize(infiniopPerTensorQuantI8Descriptor_t desc, size_t *size);

// Executes the quantization: (x_packed, x_scale, x_zero) = quant(x),
// asynchronously on `stream`. `is_static` presumably selects static
// (pre-supplied scale) vs. dynamic quantization — confirm with the kernel.
__INFINI_C __export infiniStatus_t infiniopPerTensorQuantI8(infiniopPerTensorQuantI8Descriptor_t desc,
void *workspace,
size_t workspace_size,
void *x_packed,
void *x_scale,
void *x_zero,
const void *x,
const bool is_static,
void *stream);

// Releases the descriptor and any resources it owns.
__INFINI_C __export infiniStatus_t infiniopDestroyPerTensorQuantI8Descriptor(infiniopPerTensorQuantI8Descriptor_t desc);

#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "../../../utils.hpp"
#include "infinicore/ops/per_tensor_dequant_i8.hpp"

namespace infinicore::op {

// Instantiates the per-device dispatcher table for this op.
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerTensorDequantI8);

// Validates device placement and dispatches to the backend registered for
// x's device type. Construction triggers the dispatch.
PerTensorDequantI8::PerTensorDequantI8(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zero);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zero);
}

// Either records the op into the graph being captured or runs it eagerly,
// depending on the current graph-recording state (macro decides).
void PerTensorDequantI8::execute(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerTensorDequantI8, x, x_packed, x_scale, x_zero);
}

// Public entry point: dequantizes int8 `x_packed` into `x` in place.
void per_tensor_dequant_i8_(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
PerTensorDequantI8::execute(x, x_packed, x_scale, x_zero);
}
} // namespace infinicore::op
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#include "../../infiniop_impl.hpp"
#include "infinicore/ops/per_tensor_dequant_i8.hpp"

namespace infinicore::op::per_tensor_dequant_i8_impl::infiniop {

// Cached infiniop descriptor wrapper (cache capacity 100).
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerTensorDequantI8, 100);

// Everything `run` needs, captured once at plan time.
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale, x_zero;
};

// Builds (or fetches from cache) the infiniop descriptor and snapshots the
// tensors as graph tensors for later execution.
// NOTE(review): the cache seed hashes only x, x_packed and x_scale — x_zero
// does not participate, so plans differing only in x_zero could share a
// descriptor; confirm this is intentional (symmetric-only path?).
void *plan(Tensor x, const Tensor &x_packed, const Tensor &x_scale, const Tensor &x_zero) {
size_t seed = hash_combine(x, x_packed, x_scale);

INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PerTensorDequantI8,
seed,
x->desc(), x_packed->desc(), x_scale->desc(), (x_zero ? x_zero->desc() : nullptr));

INFINIOP_WORKSPACE_TENSOR(workspace, PerTensorDequantI8, descriptor);

return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale),
graph::GraphTensor(x_zero)};
}

// Executes the planned op on the current stream.
// NOTE(review): x_zero is captured in PlannedMeta but `nullptr` is passed to
// the C call below — dead capture or missing `planned->x_zero->data()`?
// NOTE(review): workspace->numel() is passed as workspace_size; correct only
// if the workspace tensor is byte-typed — confirm.
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);

INFINICORE_CHECK_ERROR(infiniopPerTensorDequantI8(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x->data(),
planned->x_packed->data(),
planned->x_scale->data(),
nullptr,
context::getStream()));
}

// Frees the plan and nulls the caller's pointer to prevent double-free.
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}

// Registers this infiniop-backed implementation for every device type.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerTensorDequantI8, &plan, &run, &cleanup);

} // namespace infinicore::op::per_tensor_dequant_i8_impl::infiniop
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include "../../../utils.hpp"
#include "infinicore/ops/per_tensor_quant_i8.hpp"

namespace infinicore::op {

// Instantiates the per-device dispatcher table for this op.
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerTensorQuantI8);

// Validates device placement and dispatches to the backend registered for
// x's device type. Construction triggers the dispatch.
PerTensorQuantI8::PerTensorQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale, x_zero);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale, x_zero, is_static);
}

// Either records the op into the graph being captured or runs it eagerly,
// depending on the current graph-recording state (macro decides).
void PerTensorQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerTensorQuantI8, x, x_packed, x_scale, x_zero, is_static);
}

// Public entry point: quantizes `x` into int8 `x_packed` with per-tensor
// scale/zero outputs. `is_static` presumably selects static vs. dynamic
// quantization — TODO confirm against the backend kernel.
void per_tensor_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
PerTensorQuantI8::execute(x, x_packed, x_scale, x_zero, is_static);
}
} // namespace infinicore::op
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "../../infiniop_impl.hpp"
#include "infinicore/ops/per_tensor_quant_i8.hpp"

namespace infinicore::op::per_tensor_quant_i8_impl::infiniop {

// Cached infiniop descriptor wrapper (cache capacity 100).
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerTensorQuantI8, 100);

// Everything `run` needs, captured once at plan time.
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale, x_zero;
const bool is_static;
};

// Builds (or fetches from cache) the infiniop descriptor and snapshots the
// tensors as graph tensors for later execution.
// NOTE(review): the cache seed hashes only x, x_packed and x_scale — x_zero
// does not participate, so plans differing only in x_zero could share a
// descriptor; confirm this is intentional (symmetric-only path?).
void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale, Tensor x_zero, bool is_static) {
size_t seed = hash_combine(x, x_packed, x_scale);

INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PerTensorQuantI8,
seed,
x_packed->desc(), x_scale->desc(), (x_zero ? x_zero->desc() : nullptr), x->desc());

INFINIOP_WORKSPACE_TENSOR(workspace, PerTensorQuantI8, descriptor);

return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale),
graph::GraphTensor(x_zero),
is_static};
}

// Executes the planned op on the current stream.
// NOTE(review): x_zero is captured in PlannedMeta but `nullptr` is passed to
// the C call below — dead capture or missing `planned->x_zero->data()`?
// NOTE(review): workspace->numel() is passed as workspace_size; correct only
// if the workspace tensor is byte-typed — confirm.
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
const bool is_static = planned->is_static;
INFINICORE_CHECK_ERROR(infiniopPerTensorQuantI8(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x_packed->data(),
planned->x_scale->data(),
nullptr,
planned->x->data(),
is_static,
context::getStream()));
}

// Frees the plan and nulls the caller's pointer to prevent double-free.
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}

// Registers this infiniop-backed implementation for every device type.
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerTensorQuantI8, &plan, &run, &cleanup);

} // namespace infinicore::op::per_tensor_quant_i8_impl::infiniop
36 changes: 36 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
#define __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__

template <typename Tin, typename Tout>
__device__ void perTensorDequantI8SymKernel(
    Tout *x, const Tin *x_packed, const float *x_scale,
    size_t batch_size, size_t channel, size_t hidden_dim, size_t width,
    ptrdiff_t strides_0, ptrdiff_t strides_1, ptrdiff_t strides_2, ptrdiff_t strides_3,
    ptrdiff_t p_strides_0, ptrdiff_t p_strides_1, ptrdiff_t p_strides_2, ptrdiff_t p_strides_3,
    int num_elements) {
    // Symmetric per-tensor int8 dequantization of a strided (up to 4-D) tensor:
    //   x[b, c, h, w] = (Tout)(x_packed[b, c, h, w] * scale)
    // Grid-stride loop: each thread processes every `step`-th flattened element.
    const int step = blockDim.x * gridDim.x;
    const float scale = x_scale[0]; // single scale shared by the whole tensor

    for (int flat = blockIdx.x * blockDim.x + threadIdx.x; flat < num_elements; flat += step) {
        // Unravel the flat index into (b, c, h, w) coordinates.
        int rest = flat;
        const int w = rest % (int)width;
        rest /= (int)width;
        const int h = rest % (int)hidden_dim;
        rest /= (int)hidden_dim;
        const int c = rest % (int)channel;
        rest /= (int)channel;
        const int b = rest % (int)batch_size;

        // Apply per-tensor strides separately for output and packed input,
        // so both may be non-contiguous.
        const int out_off = b * (int)strides_0 + c * (int)strides_1 + h * (int)strides_2 + w * (int)strides_3;
        const int in_off = b * (int)p_strides_0 + c * (int)p_strides_1 + h * (int)p_strides_2 + w * (int)p_strides_3;

        x[out_off] = static_cast<Tout>(static_cast<float>(x_packed[in_off]) * scale);
    }
}

#endif // __PER_TENSOR_DEQUANT_INT8_KERNEL_CUH__
76 changes: 76 additions & 0 deletions src/infiniop/ops/dequant/per_tensor_dequant_int8/info.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#ifndef __PER_TENSOR_DEQUANT_INT8_INFO_H__
#define __PER_TENSOR_DEQUANT_INT8_INFO_H__

#include "../../../../utils.h"
#include "../../../operator.h"
#include "../../../tensor.h"

#include <climits>

namespace op::per_tensor_dequant_int8 {

class PerTensorDequantI8Info {
private:
    PerTensorDequantI8Info() = default;

public:
    // Output dtype (F16/BF16/F32) and packed input dtype (always I8).
    infiniDtype_t dtype, packed_type;
    // Logical dims, left-padded with 1 so every tensor is treated as 4-D.
    size_t batch_size, channel, hidden_dim, width;
    // Output strides (elements); padded dims get stride 0.
    ptrdiff_t strides_0, strides_1, strides_2, strides_3;
    // Packed-input strides (elements); padded dims get stride 0.
    ptrdiff_t p_strides_0, p_strides_1, p_strides_2, p_strides_3;
    // Total element count; the factory guarantees it fits in int.
    int num_elements;

    /// Validates the tensor descriptors and builds the kernel metadata.
    /// @param x_desc        output tensor (F16/BF16/F32), 1-4 dims
    /// @param x_packed_desc int8 input, same shape as x
    /// @param x_scale_desc  per-tensor scale
    /// @param x_zero_desc   per-tensor zero point; not validated here
    ///                      (NOTE(review): symmetric-only path? confirm)
    /// @return populated info, or a status on invalid input
    static utils::Result<PerTensorDequantI8Info> createPerTensorDequantI8Info(
        infiniopTensorDescriptor_t x_desc,
        infiniopTensorDescriptor_t x_packed_desc,
        infiniopTensorDescriptor_t x_scale_desc,
        infiniopTensorDescriptor_t x_zero_desc) {

        CHECK_OR_RETURN(
            x_packed_desc != nullptr && x_scale_desc != nullptr && x_desc != nullptr,
            INFINI_STATUS_NULL_POINTER);

        const infiniDtype_t dtype = x_desc->dtype();
        const infiniDtype_t packed_type = x_packed_desc->dtype();

        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
        CHECK_DTYPE(packed_type, INFINI_DTYPE_I8);

        auto shape = x_desc->shape();
        CHECK_SAME_SHAPE(shape, x_packed_desc->shape());

        auto ndim = x_desc->ndim();
        // Fix: also reject 0-D tensors — the reads below index shape[ndim - 1]
        // and strides()[ndim - 1], which underflow when ndim == 0.
        CHECK_OR_RETURN(ndim >= 1 && ndim <= 4,
                        INFINI_STATUS_BAD_TENSOR_SHAPE);

        // Left-pad missing leading dims with size 1 / stride 0 so the kernel
        // can always iterate a fixed 4-D (b, c, h, w) layout.
        size_t width = shape[ndim - 1];
        size_t hidden_dim = (ndim > 1 ? shape[ndim - 2] : 1);
        size_t channel = (ndim > 2 ? shape[ndim - 3] : 1);
        size_t batch_size = (ndim > 3 ? shape[ndim - 4] : 1);

        ptrdiff_t strides_3 = x_desc->strides()[ndim - 1];
        ptrdiff_t strides_2 = (ndim > 1 ? x_desc->strides()[ndim - 2] : 0);
        ptrdiff_t strides_1 = (ndim > 2 ? x_desc->strides()[ndim - 3] : 0);
        ptrdiff_t strides_0 = (ndim > 3 ? x_desc->strides()[ndim - 4] : 0);

        ptrdiff_t p_strides_3 = x_packed_desc->strides()[ndim - 1];
        ptrdiff_t p_strides_2 = (ndim > 1 ? x_packed_desc->strides()[ndim - 2] : 0);
        ptrdiff_t p_strides_1 = (ndim > 2 ? x_packed_desc->strides()[ndim - 3] : 0);
        ptrdiff_t p_strides_0 = (ndim > 3 ? x_packed_desc->strides()[ndim - 4] : 0);

        // Fix: accumulate the element count in size_t and bounds-check it
        // instead of multiplying into int, which silently overflows (UB)
        // for tensors with more than INT_MAX elements.
        size_t total = 1;
        for (size_t i = 0; i < static_cast<size_t>(ndim); i++) {
            total *= shape[i];
        }
        CHECK_OR_RETURN(total <= static_cast<size_t>(INT_MAX),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        int num_elements = static_cast<int>(total);

        return utils::Result<PerTensorDequantI8Info>(PerTensorDequantI8Info{
            dtype,
            packed_type,
            batch_size, channel, hidden_dim, width,
            strides_0, strides_1, strides_2, strides_3,
            p_strides_0, p_strides_1, p_strides_2, p_strides_3,
            num_elements});
    }
};

} // namespace op::per_tensor_dequant_int8

#endif // __PER_TENSOR_DEQUANT_INT8_INFO_H__
Loading
Loading