Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/infiniop.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "infiniop/ops/relu.h"
#include "infiniop/ops/rms_norm.h"
#include "infiniop/ops/rope.h"
#include "infiniop/ops/silu.h"
#include "infiniop/ops/softplus.h"
#include "infiniop/ops/sub.h"
#include "infiniop/ops/swiglu.h"
Expand Down
24 changes: 24 additions & 0 deletions include/infiniop/ops/silu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_SILU_API_H__
#define __INFINIOP_SILU_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSiluDescriptor_t;

__C __export infiniStatus_t infiniopCreateSiluDescriptor(infiniopHandle_t handle,
infiniopSiluDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t output,
infiniopTensorDescriptor_t intput);

__C __export infiniStatus_t infiniopGetSiluWorkspaceSize(infiniopSiluDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopSilu(infiniopSiluDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *output,
const void *intput,
void *stream);

__C __export infiniStatus_t infiniopDestroySiluDescriptor(infiniopSiluDescriptor_t desc);

#endif
2 changes: 2 additions & 0 deletions src/infiniop-test/include/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ DECLARE_INFINIOP_TEST(swiglu)
DECLARE_INFINIOP_TEST(add)
DECLARE_INFINIOP_TEST(causal_softmax)
DECLARE_INFINIOP_TEST(rearrange)
DECLARE_INFINIOP_TEST(silu)
DECLARE_INFINIOP_TEST(sub)
DECLARE_INFINIOP_TEST(zeros)
DECLARE_INFINIOP_TEST(ones)
Expand Down Expand Up @@ -53,6 +54,7 @@ DECLARE_INFINIOP_TEST(topksoftmax)
REGISTER_INFINIOP_TEST(sigmoid) \
REGISTER_INFINIOP_TEST(topkrouter) \
REGISTER_INFINIOP_TEST(topksoftmax) \
REGISTER_INFINIOP_TEST(silu) \
}

namespace infiniop_test {
Expand Down
101 changes: 101 additions & 0 deletions src/infiniop-test/src/ops/silu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>

namespace infiniop_test::silu {
struct Test::Attributes {
std::shared_ptr<Tensor> input;
std::shared_ptr<Tensor> output;
std::shared_ptr<Tensor> ans;
};

std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("input") == tensors.end()
|| tensors.find("output") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}

test->_attributes->input = tensors["input"];
test->_attributes->output = tensors["output"];
test->_attributes->ans = tensors["ans"];

return test;
}

std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopSiluDescriptor_t op_desc;
auto input = _attributes->input->to(device, device_id);
auto output = _attributes->output->to(device, device_id);
CHECK_OR(infiniopCreateSiluDescriptor(handle, &op_desc,
output->desc(),
input->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetSiluWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopSilu(op_desc, workspace, workspace_size,
output->data(),
input->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));

try {
allClose(output, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}

double elapsed_time = 0.;

elapsed_time = benchmark(
[=]() {
infiniopSilu(
op_desc, workspace, workspace_size,
output->data(),
input->data(),
nullptr);
},
warm_ups, iterations);

return TEST_PASSED(elapsed_time);
}

std::vector<std::string> Test::attribute_names() {
return {};
}

std::vector<std::string> Test::tensor_names() {
return {"input", "output", "ans"};
}

std::vector<std::string> Test::output_names() {
return {"output"};
}

std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- input: " << _attributes->input->info() << std::endl;
oss << "- output: " << _attributes->output->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}

Test::~Test() {
delete _attributes;
}

} // namespace infiniop_test::silu
52 changes: 52 additions & 0 deletions src/infiniop/ops/silu/cpu/silu_cpu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "silu_cpu.h"

namespace op::silu::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();

const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();

CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

CHECK_SAME_SHAPE(output_shape, input_shape);

// create CPU elementwise descriptor
CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {

switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<SiluOp, bf16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<SiluOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<SiluOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<SiluOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}
} // namespace op::silu::cpu
23 changes: 23 additions & 0 deletions src/infiniop/ops/silu/cpu/silu_cpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#ifndef __SILU_CPU_H__
#define __SILU_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(silu, cpu)

#include <cmath>

namespace op::silu::cpu {
typedef struct SiluOp {
public:
static constexpr size_t num_inputs = 1;

template <typename T>
T operator()(const T &x) const {
return x / (static_cast<T>(1) + std::exp(-x));
}
} SiluOp;

} // namespace op::silu::cpu

#endif // __SILU_CPU_H__
37 changes: 37 additions & 0 deletions src/infiniop/ops/silu/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#ifndef __SILU_CUDA_H__
#define __SILU_CUDA_H__

#include <cmath>

namespace op::silu::cuda {

typedef struct SiluOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
__device__ __forceinline__ T operator()(const T &x) const {
if constexpr (std::is_same_v<T, half2>) {
// half2向量化优化
return __hmul2(x, __h2div(__float2half2_rn(1.0f),
__hadd2(__float2half2_rn(1.0f), h2exp(__hneg2(x)))));
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// BF16
const float x_f = __bfloat162float(x);
return __float2bfloat16(x_f / (1.0f + __expf(-x_f)));
} else if constexpr (std::is_same_v<T, half>) {
// FP16
const float x_f = __half2float(x);
return __float2half(x_f / (1.0f + __expf(-x_f)));
} else if constexpr (std::is_same_v<T, float>) {
// FP32
return x * (1.0f / (1.0f + __expf(-x)));
} else if constexpr (std::is_same_v<T, double>) {
// FP64
return x / (1.0 + exp(-x));
}
}
} SiluOp;

} // namespace op::silu::cuda

#endif // __SILU_CUDA_H__
8 changes: 8 additions & 0 deletions src/infiniop/ops/silu/metax/silu_metax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __SILU_METAX_API_H__
#define __SILU_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

ELEMENTWISE_DESCRIPTOR(silu, metax)

#endif // __SILU_METAX_API_H__
60 changes: 60 additions & 0 deletions src/infiniop/ops/silu/metax/silu_metax.maca
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include "silu_metax.h"

#include "../../../elementwise/metax/elementwise_metax.h"

#include "../cuda/kernel.cuh"

namespace op::silu::metax {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();

const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();

CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

CHECK_SAME_SHAPE(output_shape, input_shape);

// create METAX elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {

if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}

switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::SiluOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}
} // namespace op::silu::metax
59 changes: 59 additions & 0 deletions src/infiniop/ops/silu/nvidia/silu_nvidia.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"

#include "../cuda/kernel.cuh"
#include "silu_nvidia.cuh"

namespace op::silu::nvidia {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
auto dtype = out_desc->dtype();

const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();

CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

CHECK_SAME_SHAPE(output_shape, input_shape);

// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {

if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}

switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SiluOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::SiluOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::SiluOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::SiluOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}
} // namespace op::silu::nvidia
Loading