Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions src/infiniop/devices/cuda/cuda_kernel_common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void
#endif

#include <cuda_bf16.h>
#include <cuda_fp16.h>

// Posible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096
Expand All @@ -12,8 +15,10 @@

#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)

namespace device::cuda {
using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;

namespace device::cuda {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
Expand Down Expand Up @@ -45,8 +50,6 @@ indexToOffset(
}
} // namespace device::cuda

#ifdef ENABLE_NVIDIA_API
#include <cuda_fp16.h>
__forceinline__ __device__ float
exp_(const float val) {
return expf(val);
Expand All @@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16
exp_(const __nv_bfloat16 x) {
return hexp(x);
}
#endif
13 changes: 7 additions & 6 deletions src/infiniop/devices/maca/maca_kernel_common.h
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
#define INFINIOP_MACA_KERNEL __global__ void

// Posible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024
#define MACA_BLOCK_SIZE_512 512

#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)

using cuda_bfloat16 = hpcc_bfloat16;
using cuda_bfloat162 = hpcc_bfloat162;

namespace device::maca {

// return the memory offset of original tensor, given the flattened index of broadcasted tensor
Expand Down Expand Up @@ -39,16 +43,14 @@ indexToOffset(
}
} // namespace device::maca

#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
__forceinline__ __device__ float
exp_(const float val) {
return expf(val);
}

__forceinline__ __device__ long double
exp_(const long double val) {
return expl(val);
return exp(val);
}

__forceinline__ __device__ double
Expand All @@ -61,8 +63,7 @@ exp_(const __half x) {
return hexp(x);
}

__forceinline__ __device__ __hpcc_bfloat16;
exp_(const __hpcc_bfloat16; x) {
__forceinline__ __device__ __hpcc_bfloat16
exp_(const __hpcc_bfloat16 x) {
return hexp(x);
}
#endif
78 changes: 39 additions & 39 deletions src/infiniop/elementwise/elementwise.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,45 +12,45 @@
#include <numeric>
#include <vector>

#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \
\
namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \
size_t _workspace_size; \
\
Descriptor( \
infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info, \
op::elementwise::NAMESPACE::DeviceImpl *device_info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype), \
_info(std::move(info)), \
_device_info(std::move(device_info)), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
std::vector<const void *> inputs, \
void *stream) const; \
}; \
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON) \
\
namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \
size_t _workspace_size; \
\
Descriptor( \
infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info, \
op::elementwise::KERNEL_COMMON::DeviceImpl *device_info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype), \
_info(std::move(info)), \
_device_info(std::move(device_info)), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
\
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *output, \
std::vector<const void *> inputs, \
void *stream) const; \
}; \
}

namespace op::elementwise {
Expand Down
2 changes: 1 addition & 1 deletion src/infiniop/ops/add/cpu/add_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(add, cpu)
ELEMENTWISE_DESCRIPTOR(add, cpu, cpu)

namespace op::add::cpu {
typedef struct AddOp {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "add_cuda.cuh"
#include "add_cuda_internal.cuh"
#include "../cuda/kernel.cuh"
#include "add_nvidia.cuh"

namespace op::add::cuda {
namespace op::add::nvidia {

Descriptor::~Descriptor() = default;

Expand Down Expand Up @@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate(

switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, AddOp, half>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, AddOp, float>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, AddOp, double>(_info, workspace, output, inputs, stream);
return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}
} // namespace op::add::cuda
} // namespace op::add::nvidia
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@

#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"

ELEMENTWISE_DESCRIPTOR(add, cuda)
ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda)

#endif // __ADD_CUDA_API_H__
16 changes: 8 additions & 8 deletions src/infiniop/ops/add/operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "cpu/add_cpu.h"
#endif
#ifdef ENABLE_NVIDIA_API
#include "cuda/add_cuda.cuh"
#include "nvidia/add_nvidia.cuh"
#endif

__C infiniStatus_t infiniopCreateAddDescriptor(
Expand All @@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda);
CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif

default:
Expand All @@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS

switch (desc->device_type) {
#ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu)
GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda)
GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
Expand Down Expand Up @@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd(
CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif

default:
Expand All @@ -99,15 +99,15 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
return INFINI_STATUS_SUCCESS

switch (desc->device_type) {

#ifdef ENABLE_CPU_API
DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda);
DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif

default:
Expand Down
8 changes: 0 additions & 8 deletions src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
#define __CAUSAL_SOFTMAX_KERNEL_CUH__

#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"

template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL causalSoftmax(
__device__ void causalSoftmaxKernel(
Tdata *y_, const Tdata *x_,
size_t batch, size_t height, size_t width,
ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
Expand All @@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
// 2 | * * * ... * * * |
// height: 3 col_id->
if (width + blockIdx.x >= threadIdx.x + height) {
#ifdef ENABLE_NVIDIA_API
y[col] = exp_(x[col] - max_);
#else
y[col] = exp(x[col] - max_);
#endif
if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
y[col] = hexp(x[col] - max_);
} else {
y[col] = exp(x[col] - max_);
}
} else {
y[col] = Tdata(0);
}
Expand Down
60 changes: 0 additions & 60 deletions src/infiniop/ops/causal_softmax/maca/causal_softmax_kernel.h

This file was deleted.

8 changes: 0 additions & 8 deletions src/infiniop/ops/causal_softmax/maca/causal_softmax_maca.h

This file was deleted.

8 changes: 8 additions & 0 deletions src/infiniop/ops/causal_softmax/metax/causal_softmax_metax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __CAUSAL_SOFTMAX_METAX_H__
#define __CAUSAL_SOFTMAX_METAX_H__

#include "../causal_softmax.h"

DESCRIPTOR(metax)

#endif
Loading