Dev #57 (merged)

Changes from all commits
4 changes: 2 additions & 2 deletions src/04kernel/cuda/include/kernel/cuda/slice.cuh
@@ -6,8 +6,8 @@
namespace refactor::kernel::cuda {

struct DimInfo {
- unsigned int countStride, sizeStart;
- int sizeStride;
+ unsigned int strideO, skip;
+ int strideI;
};

void launchSlice(
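Note: the renamed fields trade the old count/size vocabulary for explicit input/output strides. A minimal host-side sketch (mine, not part of the PR) of how the three fields decode a flat output block index into an input block index — the same per-thread arithmetic the kernel below performs:

#include <vector>

// Field meanings as inferred from this diff:
//   strideO — stride of this dimension in the output, in blocks
//   skip    — constant input offset contributed by the slice start, in blocks
//   strideI — stride of this dimension in the input, in blocks
//             (signed, so negative slice steps stay representable)
struct DimInfo {
    unsigned int strideO, skip;
    int strideI;
};

// Decode flat output block index `tid` into the matching input block index.
long inputBlockOf(long tid, std::vector<DimInfo> const &dims) {
    long rem = tid, j = 0;
    for (auto const &dim : dims) {
        j += rem / dim.strideO * dim.strideI + dim.skip;
        rem %= dim.strideO;
    }
    return j;
}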
12 changes: 5 additions & 7 deletions src/04kernel/cuda/src/slice.cu
@@ -6,7 +6,7 @@ namespace refactor::kernel::cuda {

__global__ static void sliceKernel(
unsigned long long n,
- uint8_t const *src, DimInfo const *dims, uint8_t *output,
+ uint8_t const *src, DimInfo const *dims, uint8_t *dst,
unsigned int rank,
unsigned int blockSize) {
extern __shared__ DimInfo dimInfo[];
@@ -18,15 +18,13 @@
step = blockDim.x * gridDim.x;
tid < n;
tid += step) {
- long rem = tid;
- auto src_ = src;
- auto dst_ = output + rem * blockSize;
+ long rem = tid, j = 0;
for (auto i = 0; i < rank; ++i) {
auto const &dim = dimInfo[i];
- src_ += rem / dim.countStride * dim.sizeStride + dim.sizeStart;
- rem %= dim.countStride;
+ j += rem / dim.strideO * dim.strideI + dim.skip;
+ rem %= dim.strideO;
}
- optimizedMemcpy(dst_, src_, blockSize);
+ optimizedMemcpy(dst + tid * blockSize, src + j * blockSize, blockSize);
}
}

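The rewritten loop keeps a single flat block index j instead of mutating a pointer pair per dimension: each grid-stride iteration is now pure integer arithmetic followed by one optimizedMemcpy, with the destination addressed directly as dst + tid * blockSize and the source as src + j * blockSize.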
11 changes: 5 additions & 6 deletions src/04kernel/include/kernel/attributes/slice_info.h
@@ -15,20 +15,19 @@
/// @brief An optimized description of Slice for computation.
struct SliceInfo {
struct Dim {
- dim_t countStride, sizeStart;
- sdim_t sizeStride;
+ dim_t strideO, skip;
+ sdim_t strideI;

bool operator==(Dim const &) const noexcept;
bool operator!=(Dim const &) const noexcept;
};
std::vector<Dim> dims;
- dim_t blockCount, blockSize, baseOffset;
+ dim_t blockCount, blockSize;

SliceInfo(decltype(dims),
decltype(blockCount),
- decltype(blockSize),
- decltype(baseOffset)) noexcept;
- SliceInfo(Dimensions const &, Tensor const &);
+ decltype(blockSize)) noexcept;
+ SliceInfo(Dimensions, Tensor const &);
SliceInfo reform(dim_t maxblockSize) const noexcept;
void reformAssign(dim_t maxblockSize) noexcept;
};
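Two API notes on this header: baseOffset disappears because the constant offset from each dimension's start is now folded into that dimension's skip, and the converting constructor takes Dimensions by value so it can compact the descriptor in place while squeezing out size-1 axes (see the constructor below).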
118 changes: 58 additions & 60 deletions src/04kernel/src/attributes/slice_info.cc
@@ -4,9 +4,9 @@
namespace refactor::kernel {

bool SliceInfo::Dim::operator==(Dim const &rhs) const noexcept {
- return countStride == rhs.countStride &&
- sizeStart == rhs.sizeStart &&
- sizeStride == rhs.sizeStride;
+ return strideO == rhs.strideO &&
+ strideI == rhs.strideI &&
+ skip == rhs.skip;
}
bool SliceInfo::Dim::operator!=(Dim const &rhs) const noexcept {
return !operator==(rhs);
@@ -15,74 +15,70 @@ namespace refactor::kernel {
SliceInfo::SliceInfo(
std::vector<Dim> dims_,
dim_t blockCount_,
- dim_t blockSize_,
- dim_t baseOffset_) noexcept
+ dim_t blockSize_) noexcept
: dims(std::move(dims_)),
blockCount(blockCount_),
- blockSize(blockSize_),
- baseOffset(baseOffset_) {}
+ blockSize(blockSize_) {}

- SliceInfo::SliceInfo(Dimensions const &dims_, Tensor const &input)
- : dims(1),
+ SliceInfo::SliceInfo(Dimensions dims_, Tensor const &input)
+ : dims{},
blockCount(1),
- blockSize(input.dataType.size()),
- baseOffset(0) {
- ASSERT(dims_.size() == static_cast<size_t>(input.rank()), "Unreachable");
+ blockSize(input.dataType.size()) {
+ size_t rank = input.rank();
+ if (!rank) { return; }// scalar input
+ ASSERT(dims_.size() == rank, "unreachable");

- auto continuous = true;
- auto stride = blockSize;
- dims[0] = {1, 0, static_cast<sdim_t>(stride)};
- for (auto i : range0_(input.rank()).rev()) {
- auto l = input.shape[i];
- auto const &d = dims_[i];
- if (auto &it = dims.back(); continuous && d.step == 1) {
- it.countStride *= d.length;
- it.sizeStart = d.start * stride;
- it.sizeStride *= l;
- } else {
- dims.push_back(Dim{
- static_cast<dim_t>(it.countStride * d.length),
- static_cast<dim_t>(d.start * stride),
- static_cast<sdim_t>(d.step * stride),
- });
+ std::vector<dim_t> shape;
+ {// squeeze out dimensions of size 1
+ shape.reserve(rank);
+ for (auto i : range0_(rank)) {
+ if (auto l = input.shape[i]; l != 1) {
+ if (auto j = shape.size(); j < i) { dims_[j] = dims_[i]; }
+ shape.push_back(l);
+ }
+ }
- continuous = d.length == l;
- stride *= l;
+ dims_.resize(rank = shape.size());
}
- baseOffset = dims[0].sizeStart;
- auto elementCount = dims[0].countStride;
- blockSize *= elementCount;
- for (auto &d : dims) {
- d.countStride /= elementCount;
+ dims.reserve(rank);
+ dim_t strideI = 1;
+ for (auto i : range0_(rank).rev()) {
+ auto const &dim = dims_[i];
+ dims.push_back({
+ .strideO = blockCount,
+ .skip = static_cast<dim_t>(strideI * dim.start),
+ .strideI = static_cast<sdim_t>(strideI * dim.step),
+ });
+ blockCount *= dim.length;
+ strideI *= shape[i];
}
std::reverse(dims.begin(), dims.end());
- blockCount = dims[0].countStride;
- for (auto i : range(1ul, dims.size())) {
- dims[i - 1].countStride = dims[i].countStride;

+ while (!dims.empty()) {
+ auto const &dim = dims.back();
+ if (dim.strideI == static_cast<sdim_t>(dim.strideO) && !dim.skip) {
+ dims.pop_back();
+ } else {
+ long times = std::gcd(std::gcd(dim.strideI, dim.strideO), dim.skip);
+ blockCount /= times;
+ blockSize *= times;
+ if (!dims.empty()) {
+ for (auto &dim : dims) {
+ dim.strideO /= times;
+ dim.skip /= times;
+ dim.strideI /= times;
+ }
+ if (dims.back().strideO != 1) {
+ dims.push_back({1, 0, 1});
+ }
+ }
+ break;
+ }
+ }
- dims.pop_back();
+ dims.shrink_to_fit();
}

SliceInfo SliceInfo::reform(dim_t maxblockSize) const noexcept {
- auto blockSize_ = std::gcd(blockSize, maxblockSize);
- if (blockSize_ == blockSize) { return *this; }
- auto times = blockSize / blockSize_;
- SliceInfo ans{
- std::vector<Dim>(dims.size() + 1),
- blockCount * times,
- blockSize_,
- baseOffset,
- };
- for (auto i : range0_(dims.size())) {
- auto const &d = dims[i];
- ans.dims[i] = {
- d.countStride * times,
- d.sizeStart,
- d.sizeStride,
- };
- }
- ans.dims.back() = {1, 0, static_cast<sdim_t>(blockSize_)};
+ auto ans = *this;
+ ans.reformAssign(maxblockSize);
return ans;
}

@@ -93,10 +93,12 @@ namespace refactor::kernel {
blockCount *= times;
blockSize = blockSize_;
for (auto &d : dims) {
- d.countStride *= times;
+ d.strideO *= times;
+ d.strideI *= times;
+ d.skip *= times;
}
dims.resize(dims.size() + 1);
- dims.back() = {1, 0, static_cast<sdim_t>(blockSize_)};
+ dims.back() = {1, 0, 1};
}


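To see what the new coalescing loop buys, here is a standalone trace (my own sketch, with plain long standing in for dim_t/sdim_t): a 4×4 f32 tensor sliced to rows 2–3 with all columns kept enters the loop as two dims over eight 4-byte blocks; the contiguous inner dim is popped, and the gcd fold merges four blocks into one, leaving two 16-byte row copies.

#include <cassert>
#include <numeric>
#include <vector>

struct Dim { long strideO, skip, strideI; };

int main() {
    // State after the stride-building pass for the example above.
    std::vector<Dim> dims{{4, 8, 4}, {1, 0, 1}};
    long blockCount = 8, blockSize = 4;// bytes per f32 element
    // The coalescing pass, condensed from the new constructor.
    while (!dims.empty()) {
        auto const &dim = dims.back();
        if (dim.strideI == dim.strideO && !dim.skip) {
            dims.pop_back();// inner dim is contiguous and unshifted: drop it
        } else {
            auto times = std::gcd(std::gcd(dim.strideI, dim.strideO), dim.skip);
            blockCount /= times;// fold `times` blocks into one larger block
            blockSize *= times;
            for (auto &d : dims) {
                d.strideO /= times;
                d.skip /= times;
                d.strideI /= times;
            }
            if (dims.back().strideO != 1) { dims.push_back({1, 0, 1}); }
            break;
        }
    }
    // Two 16-byte copies, each skipping two row-sized blocks into the input.
    assert(blockCount == 2 && blockSize == 16);
    assert(dims.size() == 1 && dims[0].skip == 2);
}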
10 changes: 6 additions & 4 deletions src/04kernel/src/generator/nvrtc_repo.cc
@@ -29,7 +29,7 @@ namespace refactor::kernel::nvrtc {
}
NVRTC_ASSERT(nvrtcCreateProgram(&prog, code.data(), name.data(), 0, nullptr, nullptr));

- std::vector<std::string> opts{"--std=c++20", "--gpu-architecture=compute_80"};
+ std::vector<std::string> opts{"--std=c++17", "--gpu-architecture=compute_80"};
#ifdef CUDA_INCLUDE_PATH
opts.emplace_back(fmt::format("-I{}", CUDA_INCLUDE_PATH));
#endif
@@ -42,9 +42,11 @@
{
size_t logSize;
NVRTC_ASSERT(nvrtcGetProgramLogSize(prog, &logSize));
- std::vector<char> log(logSize);
- NVRTC_ASSERT(nvrtcGetProgramLog(prog, log.data()));
- fmt::println("{}", log.data());
+ if (logSize > 1) {
+ std::vector<char> log(logSize);
+ NVRTC_ASSERT(nvrtcGetProgramLog(prog, log.data()));
+ fmt::println("{}", log.data());
+ }
}
if (compileResult != NVRTC_SUCCESS) {
fmt::println("wrong code:");
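Both hunks here are compatibility guards: --std=c++17 keeps the generated kernels compilable on NVRTC versions that predate C++20 support (added in CUDA 12), and logSize > 1 skips printing when the log contains only its trailing '\0', which nvrtcGetProgramLogSize counts in the size it reports.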
12 changes: 5 additions & 7 deletions src/04kernel/src/kernels/slice/cpu_kernel.cc
@@ -25,20 +25,18 @@ namespace refactor::kernel {
auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
using namespace runtime;
return [info = this->info](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
- auto src = reinterpret_cast<uint8_t const *>(inputs[0]) + info.baseOffset;
+ auto src = reinterpret_cast<uint8_t const *>(inputs[0]);
auto dst = reinterpret_cast<uint8_t *>(outputs[0]);
std::for_each_n(std::execution::par_unseq,
natural_t(0), info.blockCount,
[=, &info](auto i) {
- long rem = i;
- auto src_ = src;
- auto dst_ = dst + rem * info.blockSize;
+ long rem = i, j = 0;
for (auto const &dim : info.dims) {
- auto d = std::div(rem, dim.countStride);
- src_ += d.quot * dim.sizeStride + dim.sizeStart;
+ auto d = std::div(rem, dim.strideO);
+ j += d.quot * dim.strideI + dim.skip;
rem = d.rem;
}
- std::memcpy(dst_, src_, info.blockSize);
+ std::memcpy(dst + i * info.blockSize, src + j * info.blockSize, info.blockSize);
});
};
}
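The CPU routine now performs the same decode as the CUDA kernel: one std::div per dimension accumulating a flat input block index j, then a single std::memcpy addressed from i and j. This keeps the two backends line-for-line comparable and removes the per-iteration pointer state from the parallel std::for_each_n body.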
11 changes: 5 additions & 6 deletions src/04kernel/src/kernels/slice/cuda_kernel.cu
@@ -12,16 +12,15 @@ namespace refactor::kernel {
dims.begin(),
[](auto const &d) {
return cuda::DimInfo{
- d.countStride,
- d.sizeStart,
- d.sizeStride,
+ d.strideO,
+ d.skip,
+ d.strideI,
};
});
return [dims = thrust::device_vector<cuda::DimInfo>(dims),
params = cuda::ThreadsDistributer()(info.blockCount),
- blockSize = info.blockSize,
- baseOffset = info.baseOffset](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
- auto src = reinterpret_cast<uint8_t const *>(inputs[0]) + baseOffset;
+ blockSize = info.blockSize](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+ auto src = reinterpret_cast<uint8_t const *>(inputs[0]);
cuda::launchSlice(params, src, dims.data().get(), outputs[0],
dims.size(),
blockSize);
43 changes: 0 additions & 43 deletions src/04kernel/test/attributes/test_slice_info.cpp

This file was deleted.
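With baseOffset gone and the Dim fields renamed, the old field-level expectations no longer apply; SliceInfo is now exercised end-to-end through the kernel tests instead, including the new SliceCpu_with1 case below.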

42 changes: 42 additions & 0 deletions src/04kernel/test/kernels/slice/test_cpu.cpp
@@ -5,6 +5,48 @@
using namespace refactor;
using namespace kernel;

TEST(kernel, SliceCpu_with1) {
// build routine
Dimensions dims{
{0, 1, 1},
{0, 1, 2},
{0, 1, 1},
{2, 1, 2},
};
auto input = Tensor::share(DataType::F32, Shape{1, 2, 1, 4}),
output = Tensor::share(DataType::F32, Shape{1, 2, 1, 2});
auto kernel = SliceCpu::build(SliceInfo(dims, *input));
ASSERT_TRUE(kernel);
auto res = runtime::Resources();
auto routine = kernel->lower(res).routine;
// put input data
std::vector<float>
data(input->elementsSize()),
result(output->elementsSize());
std::iota(data.begin(), data.end(), 0);
// inference
{
void const *inputs[]{data.data()};
void *outputs[]{result.data()};
routine(res, nullptr, inputs, outputs);
}
// check
std::vector<float> ans{2, 3, 6, 7};
EXPECT_EQ(result, ans);
// test reform
auto kernelReformed = SliceCpu::build(SliceInfo(dims, *input).reform(16));
ASSERT_TRUE(kernelReformed);
auto routineReformed = kernelReformed->lower(res).routine;
std::vector<float> resultReformed(result.size());
{
void const *inputs[]{data.data()};
void *outputs[]{resultReformed.data()};
routineReformed(res, nullptr, inputs, outputs);
}
// check
EXPECT_EQ(resultReformed, ans);
}
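Tracing this case through the new constructor (my own arithmetic, not part of the PR): squeezing the size-1 axes leaves shape {2, 4} with dims {start 0, step 1, length 2} and {start 2, step 1, length 2}, which build to {strideO 2, skip 0, strideI 4} and {strideO 1, skip 2, strideI 1} with blockCount 4 and blockSize 4; decoding output blocks 0–3 then yields input blocks 2, 3, 6, 7, matching ans. reform(16) computes gcd(4, 16) = 4, so the block size is unchanged and the reformed kernel must reproduce the same result.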

TEST(kernel, SliceCpu) {
// build routine
Dimensions dims{