From 01ac6b5c20478cdfb67aaf80b24ad9da9e7e142e Mon Sep 17 00:00:00 2001 From: crydsch Date: Thu, 17 Aug 2023 10:38:25 +0200 Subject: [PATCH 1/4] Enable TestOpTensorSync Signed-off-by: crydsch --- test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0564458c..a967e2a4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,6 +17,7 @@ add_executable(kompute_tests TestAsyncOperations.cpp TestMultipleAlgoExecutions.cpp TestOpShadersFromStringAndFile.cpp TestOpTensorCopy.cpp + TestOpTensorSync.cpp TestOpTensorCreate.cpp TestPushConstant.cpp TestSequence.cpp From 99cafa27d315c51cd9c36558a0d249afb6639d1a Mon Sep 17 00:00:00 2001 From: crydsch Date: Sat, 26 Aug 2023 11:55:36 +0200 Subject: [PATCH 2/4] Expose copy region for tensor copy operations Signed-off-by: crydsch --- src/Tensor.cpp | 37 ++++++++++++++++++++++++++++--- src/include/kompute/Tensor.hpp | 40 +++++++++++++++++++++++++++++++--- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/Tensor.cpp b/src/Tensor.cpp index ad5cac9a..3d488981 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -202,7 +202,20 @@ Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); - KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize); + this->recordCopyFrom(commandBuffer, + copyFromTensor, + copyRegion); +} + +void +Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, + std::shared_ptr copyFromTensor, + const vk::BufferCopy copyRegion) +{ + + vk::DeviceSize bufferSize(this->memorySize()); + + KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", copyRegion.size); this->recordCopyBuffer(commandBuffer, copyFromTensor->mPrimaryBuffer, @@ -217,7 +230,15 @@ Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer) vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); - KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + this->recordCopyFromStagingToDevice(commandBuffer, copyRegion); +} + +void +Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion) +{ + vk::DeviceSize bufferSize(this->memorySize()); + + KP_LOG_DEBUG("Kompute Tensor copying data size {}.", copyRegion.size); this->recordCopyBuffer(commandBuffer, this->mStagingBuffer, @@ -232,7 +253,17 @@ Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer) vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); - KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + this->recordCopyFromDeviceToStaging(commandBuffer, + copyRegion); +} + +void +Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, + const vk::BufferCopy copyRegion) +{ + vk::DeviceSize bufferSize(this->memorySize()); + + KP_LOG_DEBUG("Kompute Tensor copying data size {}.", copyRegion.size); this->recordCopyBuffer(commandBuffer, this->mPrimaryBuffer, diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index a2bcd187..6aaa3edf 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -99,8 +99,9 @@ class Tensor /** * Records a copy from the memory of the tensor provided to the current - * thensor. This is intended to pass memory into a processing, to perform + * tensor. 
This is intended to pass memory into a processing, to perform * a staging buffer transfer, or to gather output (between others). + * Copies the entire tensor. * * @param commandBuffer Vulkan Command Buffer to record the commands into * @param copyFromTensor Tensor to copy the data from @@ -108,23 +109,56 @@ class Tensor void recordCopyFrom(const vk::CommandBuffer& commandBuffer, std::shared_ptr copyFromTensor); + /** + * Records a copy from the memory of the tensor provided to the current + * tensor. This is intended to pass memory into a processing, to perform + * a staging buffer transfer, or to gather output (between others). + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param copyFromTensor Tensor to copy the data from + * @param copyRegion The buffer region to copy + */ + void recordCopyFrom(const vk::CommandBuffer& commandBuffer, + std::shared_ptr copyFromTensor, + const vk::BufferCopy copyRegion); + /** * Records a copy from the internal staging memory to the device memory * using an optional barrier to wait for the operation. This function would - * only be relevant for kp::Tensors of type eDevice. + * only be relevant for kp::Tensors of type eDevice. Copies the entire tensor. * * @param commandBuffer Vulkan Command Buffer to record the commands into */ void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer); /** - * Records a copy from the internal device memory to the staging memory + * Records a copy from the internal staging memory to the device memory * using an optional barrier to wait for the operation. This function would * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param copyRegion The buffer region to copy + */ + void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion); + + /** + * Records a copy from the internal device memory to the staging memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. Copies the entire tensor. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into */ void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer); + + /** + * Records a copy from the internal device memory to the staging memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. 
+ * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param copyRegion The buffer region to copy + */ + void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion); /** * Records the buffer memory barrier into the primary buffer and command From 63b7c10f9c7b573117e3ebd541303f43b9f3dc5b Mon Sep 17 00:00:00 2001 From: crydsch Date: Sat, 26 Aug 2023 12:30:29 +0200 Subject: [PATCH 3/4] Support custom parameter in sequence template functions Signed-off-by: crydsch --- src/include/kompute/Sequence.hpp | 84 ++++--------------- .../kompute/operations/OpAlgoDispatch.hpp | 2 + src/include/kompute/operations/OpBase.hpp | 2 + .../kompute/operations/OpMemoryBarrier.hpp | 2 + src/include/kompute/operations/OpMult.hpp | 2 + .../kompute/operations/OpTensorCopy.hpp | 2 + .../kompute/operations/OpTensorSyncDevice.hpp | 2 + .../kompute/operations/OpTensorSyncLocal.hpp | 2 + 8 files changed, 31 insertions(+), 67 deletions(-) diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index de9b9f69..0787f74c 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -41,7 +41,7 @@ class Sequence : public std::enable_shared_from_this * function also requires the Sequence to be recording, otherwise it will * not be able to add the operation. * - * @param op Object derived from kp::BaseOp that will be recoreded by the + * @param op Object derived from kp::BaseOp that will be recorded by the * sequence which will be used when the operation is evaluated. * @return shared_ptr of the Sequence class itself */ @@ -53,37 +53,18 @@ class Sequence : public std::enable_shared_from_this * function also requires the Sequence to be recording, otherwise it will * not be able to add the operation. * - * @param tensors Vector of tensors to use for the operation + * @param param Template parameter that is used to initialise the operation. * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template std::shared_ptr record( - std::vector> tensors, + typename T::ConstructorParameterType param, TArgs&&... params) { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->record(op); - } - /** - * Record function for operation to be added to the GPU queue in batch. This - * template requires classes to be derived from the OpBase class. This - * function also requires the Sequence to be recording, otherwise it will - * not be able to add the operation. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr record(std::shared_ptr algorithm, - TArgs&&... params) - { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; + static_assert(std::is_base_of::value, "T must derive from OpBase"); + std::shared_ptr op{ new T(param, std::forward(params)...) }; return this->record(op); } @@ -108,34 +89,18 @@ class Sequence : public std::enable_shared_from_this * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. 
* - * @param tensors Vector of tensors to use for the operation + * @param param Template parameter that is used to initialise the operation. * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template - std::shared_ptr eval(std::vector> tensors, - TArgs&&... params) - { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->eval(op); - } - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr eval(std::shared_ptr algorithm, - TArgs&&... params) + std::shared_ptr eval( + typename T::ConstructorParameterType param, + TArgs&&... params) { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; + static_assert(std::is_base_of::value, "T must derive from OpBase"); + std::shared_ptr op{ new T(param, std::forward(params)...) }; return this->eval(op); } @@ -148,6 +113,7 @@ class Sequence : public std::enable_shared_from_this * @return Boolean stating whether execution was successful. */ std::shared_ptr evalAsync(); + /** * Clears currnet operations to record provided one in the vector of * operations into the gpu as a submit job without a barrier. EvalAwait() @@ -157,39 +123,23 @@ class Sequence : public std::enable_shared_from_this * @return Boolean stating whether execution was successful. */ std::shared_ptr evalAsync(std::shared_ptr op); + /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. * - * @param tensors Vector of tensors to use for the operation + * @param param Template parameter that is used to initialise the operation. * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template std::shared_ptr evalAsync( - std::vector> tensors, + typename T::ConstructorParameterType param, TArgs&&... params) { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->evalAsync(op); - } - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr evalAsync(std::shared_ptr algorithm, - TArgs&&... params) - { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; + static_assert(std::is_base_of::value, "T must derive from OpBase"); + std::shared_ptr op{ new T(param, std::forward(params)...) 
}; return this->evalAsync(op); } diff --git a/src/include/kompute/operations/OpAlgoDispatch.hpp b/src/include/kompute/operations/OpAlgoDispatch.hpp index e91598f0..bd58fb6d 100644 --- a/src/include/kompute/operations/OpAlgoDispatch.hpp +++ b/src/include/kompute/operations/OpAlgoDispatch.hpp @@ -17,6 +17,8 @@ namespace kp { class OpAlgoDispatch : public OpBase { public: + using ConstructorParameterType = std::shared_ptr; + /** * Constructor that stores the algorithm to use as well as the relevant * push constants to override when recording. diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp index 73767084..23a217a8 100644 --- a/src/include/kompute/operations/OpBase.hpp +++ b/src/include/kompute/operations/OpBase.hpp @@ -18,6 +18,8 @@ namespace kp { class OpBase { public: + using ConstructorParameterType = void; + /** * Default destructor for OpBase class. This OpBase destructor class should * always be called to destroy and free owned resources unless it is diff --git a/src/include/kompute/operations/OpMemoryBarrier.hpp b/src/include/kompute/operations/OpMemoryBarrier.hpp index 4a232232..35a23113 100644 --- a/src/include/kompute/operations/OpMemoryBarrier.hpp +++ b/src/include/kompute/operations/OpMemoryBarrier.hpp @@ -18,6 +18,8 @@ namespace kp { class OpMemoryBarrier : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Constructor that stores tensors as well as memory barrier parameters to * be used to create a pipeline barrier on the respective primary or staging diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp index f75ccc4f..2d4f0eca 100644 --- a/src/include/kompute/operations/OpMult.hpp +++ b/src/include/kompute/operations/OpMult.hpp @@ -21,6 +21,8 @@ namespace kp { class OpMult : public OpAlgoDispatch { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the bare minimum * requirements for the operations to be able to create and manage their diff --git a/src/include/kompute/operations/OpTensorCopy.hpp b/src/include/kompute/operations/OpTensorCopy.hpp index 968c1065..6438b108 100644 --- a/src/include/kompute/operations/OpTensorCopy.hpp +++ b/src/include/kompute/operations/OpTensorCopy.hpp @@ -18,6 +18,8 @@ namespace kp { class OpTensorCopy : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the core vulkan * resources and the tensors that will be used in the operation. diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp index 3a1792ac..7460a6ea 100644 --- a/src/include/kompute/operations/OpTensorSyncDevice.hpp +++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp @@ -18,6 +18,8 @@ namespace kp { class OpTensorSyncDevice : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the core vulkan * resources and the tensors that will be used in the operation. 
The tensos diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp index 4216003e..95426775 100644 --- a/src/include/kompute/operations/OpTensorSyncLocal.hpp +++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp @@ -20,6 +20,8 @@ namespace kp { class OpTensorSyncLocal : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the core vulkan * resources and the tensors that will be used in the operation. The tensors From 62ac892d224132cc96bbb75f7c4a5151731e2a0b Mon Sep 17 00:00:00 2001 From: crydsch Date: Sat, 26 Aug 2023 13:07:29 +0200 Subject: [PATCH 4/4] Add tensor copy operations with region support Signed-off-by: crydsch --- src/CMakeLists.txt | 3 + src/OpTensorCopyRegion.cpp | 75 ++++++++++++++++ src/OpTensorSyncRegionDevice.cpp | 72 +++++++++++++++ src/OpTensorSyncRegionLocal.cpp | 89 +++++++++++++++++++ src/include/CMakeLists.txt | 3 + src/include/kompute/Kompute.hpp | 3 + src/include/kompute/Tensor.hpp | 7 ++ .../kompute/operations/OpTensorCopyRegion.hpp | 69 ++++++++++++++ .../operations/OpTensorSyncRegionDevice.hpp | 67 ++++++++++++++ .../operations/OpTensorSyncRegionLocal.hpp | 68 ++++++++++++++ test/TestOpTensorSync.cpp | 48 ++++++++++ 11 files changed, 504 insertions(+) create mode 100644 src/OpTensorCopyRegion.cpp create mode 100644 src/OpTensorSyncRegionDevice.cpp create mode 100644 src/OpTensorSyncRegionLocal.cpp create mode 100644 src/include/kompute/operations/OpTensorCopyRegion.hpp create mode 100644 src/include/kompute/operations/OpTensorSyncRegionDevice.hpp create mode 100644 src/include/kompute/operations/OpTensorSyncRegionLocal.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dbb47dbe..8978d595 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,9 @@ add_library(kompute Algorithm.cpp OpTensorCopy.cpp OpTensorSyncDevice.cpp OpTensorSyncLocal.cpp + OpTensorCopyRegion.cpp + OpTensorSyncRegionDevice.cpp + OpTensorSyncRegionLocal.cpp Sequence.cpp Tensor.cpp Core.cpp) diff --git a/src/OpTensorCopyRegion.cpp b/src/OpTensorCopyRegion.cpp new file mode 100644 index 00000000..9b52c382 --- /dev/null +++ b/src/OpTensorCopyRegion.cpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "kompute/operations/OpTensorCopyRegion.hpp" +#include "kompute/Tensor.hpp" + +namespace kp { + +OpTensorCopyRegion::OpTensorCopyRegion(const TensorCopyRegions regions) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion constructor with params"); + + if (regions.dstRegions.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with no destination region"); + } + + kp::Tensor::TensorDataTypes dataType = regions.srcTensor->dataType(); + for (const TensorRegion& region : regions.dstRegions) { + if (region.tensor->dataType() != dataType) { + throw std::runtime_error(fmt::format( + "Kompute OpTensorCopyRegion called with different types from {} to {}", + Tensor::toString(dataType), + Tensor::toString(region.tensor->dataType()))); + } + if (region.elemCount == 0) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with elemCount == 0"); + } + if (region.srcIndex + region.elemCount > regions.srcTensor->size()) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with out of bounds source region"); + } + if (region.dstIndex + region.elemCount > region.tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with out of bounds destination 
region"); + } + } + + this->mRegions = regions; +} + +OpTensorCopyRegion::~OpTensorCopyRegion() +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion destructor started"); +} + +void +OpTensorCopyRegion::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion record called"); + + for (size_t i = 0; i < this->mRegions.dstRegions.size(); i++) { + const uint32_t dataTypeMemorySize = this->mRegions.dstRegions[i].tensor->dataTypeMemorySize(); + const vk::BufferCopy copy = { + dataTypeMemorySize * this->mRegions.dstRegions[i].srcIndex, + dataTypeMemorySize * this->mRegions.dstRegions[i].dstIndex, + dataTypeMemorySize * this->mRegions.dstRegions[i].elemCount, + }; + this->mRegions.dstRegions[i].tensor->recordCopyFrom(commandBuffer, this->mRegions.srcTensor, copy); + } +} + +void +OpTensorCopyRegion::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion preEval called"); +} + +void +OpTensorCopyRegion::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion postEval called"); +} + +} diff --git a/src/OpTensorSyncRegionDevice.cpp b/src/OpTensorSyncRegionDevice.cpp new file mode 100644 index 00000000..91af1153 --- /dev/null +++ b/src/OpTensorSyncRegionDevice.cpp @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "kompute/operations/OpTensorSyncRegionDevice.hpp" + +namespace kp { + +OpTensorSyncRegionDevice::OpTensorSyncRegionDevice( + const std::vector& regions) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice constructor with params"); + + if (regions.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with less than 1 tensor region"); + } + + for (size_t i = 0; i < regions.size(); i++) { + if (regions[i].elemCount == 0) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with elemCount == 0"); + } + if (regions[i].srcIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with out of bounds source region"); + } + if (regions[i].dstIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with out of bounds destination region"); + } + } + + this->mRegions = regions; +} + +OpTensorSyncRegionDevice::~OpTensorSyncRegionDevice() +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice destructor started"); + + this->mRegions.clear(); +} + +void +OpTensorSyncRegionDevice::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice record called"); + + for (size_t i = 0; i < this->mRegions.size(); i++) { + if (this->mRegions[i].tensor->tensorType() == Tensor::TensorTypes::eDevice) { + const uint32_t dataTypeMemorySize = this->mRegions[i].tensor->dataTypeMemorySize(); + const vk::BufferCopy copy = { + dataTypeMemorySize * this->mRegions[i].srcIndex, + dataTypeMemorySize * this->mRegions[i].dstIndex, + dataTypeMemorySize * this->mRegions[i].elemCount, + }; + this->mRegions[i].tensor->recordCopyFromStagingToDevice(commandBuffer, copy); + } + } +} + +void +OpTensorSyncRegionDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice preEval called"); +} + +void +OpTensorSyncRegionDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice postEval called"); +} + +} diff --git a/src/OpTensorSyncRegionLocal.cpp 
b/src/OpTensorSyncRegionLocal.cpp new file mode 100644 index 00000000..a426a519 --- /dev/null +++ b/src/OpTensorSyncRegionLocal.cpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpTensorSyncRegionLocal.hpp" + +namespace kp { + +OpTensorSyncRegionLocal::OpTensorSyncRegionLocal( + const std::vector& regions) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal constructor with params"); + + if (regions.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with less than 1 region"); + } + + for (size_t i = 0; i < regions.size(); i++) { + if (regions[i].elemCount == 0) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with elemCount == 0"); + } + if (regions[i].srcIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with out of bounds source region"); + } + if (regions[i].dstIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with out of bounds destination region"); + } + } + + this->mRegions = regions; +} + +OpTensorSyncRegionLocal::~OpTensorSyncRegionLocal() +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal destructor started"); +} + +void +OpTensorSyncRegionLocal::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal record called"); + + for (size_t i = 0; i < this->mRegions.size(); i++) { + if (this->mRegions[i].tensor->tensorType() == Tensor::TensorTypes::eDevice) { + + this->mRegions[i].tensor->recordPrimaryBufferMemoryBarrier( + commandBuffer, + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); + + const uint32_t dataTypeMemorySize = this->mRegions[i].tensor->dataTypeMemorySize(); + const vk::BufferCopy copy = { + dataTypeMemorySize * this->mRegions[i].srcIndex, + dataTypeMemorySize * this->mRegions[i].dstIndex, + dataTypeMemorySize * this->mRegions[i].elemCount, + }; + this->mRegions[i].tensor->recordCopyFromDeviceToStaging(commandBuffer, copy); + + this->mRegions[i].tensor->recordPrimaryBufferMemoryBarrier( + commandBuffer, + vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead, + vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost); + } + } +} + +void +OpTensorSyncRegionLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal preEval called"); +} + +void +OpTensorSyncRegionLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal postEval called"); + + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal mapping data into tensor local"); +} + +} diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt index e1652fdd..f4b475d9 100644 --- a/src/include/CMakeLists.txt +++ b/src/include/CMakeLists.txt @@ -23,6 +23,9 @@ target_sources(kompute PRIVATE kompute/operations/OpTensorCopy.hpp kompute/operations/OpTensorSyncDevice.hpp kompute/operations/OpTensorSyncLocal.hpp + kompute/operations/OpTensorCopyRegion.hpp + kompute/operations/OpTensorSyncRegionDevice.hpp + kompute/operations/OpTensorSyncRegionLocal.hpp kompute/logger/Logger.hpp ) diff --git a/src/include/kompute/Kompute.hpp b/src/include/kompute/Kompute.hpp index e54adc1b..466d242e 100644 --- a/src/include/kompute/Kompute.hpp +++ b/src/include/kompute/Kompute.hpp 
@@ -13,6 +13,9 @@ #include "operations/OpTensorCopy.hpp" #include "operations/OpTensorSyncDevice.hpp" #include "operations/OpTensorSyncLocal.hpp" +#include "operations/OpTensorCopyRegion.hpp" +#include "operations/OpTensorSyncRegionDevice.hpp" +#include "operations/OpTensorSyncRegionLocal.hpp" // Will be build by CMake and placed inside the build directory #include "ShaderLogisticRegression.hpp" diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index 6aaa3edf..ace24dcf 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -378,4 +378,11 @@ class TensorT : public Tensor TensorDataTypes dataType(); }; +struct TensorRegion { + std::shared_ptr tensor; + uint32_t srcIndex; + uint32_t dstIndex; + uint32_t elemCount; +}; + } // End namespace kp diff --git a/src/include/kompute/operations/OpTensorCopyRegion.hpp b/src/include/kompute/operations/OpTensorCopyRegion.hpp new file mode 100644 index 00000000..d5520cc7 --- /dev/null +++ b/src/include/kompute/operations/OpTensorCopyRegion.hpp @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +struct TensorCopyRegions { + std::shared_ptr srcTensor; + std::vector dstRegions; +}; + +/** + * Operation that copies the data from the first tensor to the rest of the + * tensors provided, using a record command for all the vectors. This operation + * does not own/manage the memory of the tensors passed to it. The operation + * must only receive tensors of type + */ +class OpTensorCopyRegion : public OpBase +{ + public: + using ConstructorParameterType = TensorCopyRegions; + + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorCopyRegion(const TensorCopyRegions regions); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorCopyRegion() override; + + /** + * Records the copy commands from the first tensor into all the other + * tensors provided. Also optionally records a barrier. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + TensorCopyRegions mRegions; +}; + +} // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncRegionDevice.hpp b/src/include/kompute/operations/OpTensorSyncRegionDevice.hpp new file mode 100644 index 00000000..7b7e5ff4 --- /dev/null +++ b/src/include/kompute/operations/OpTensorSyncRegionDevice.hpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" +#include "kompute/Tensor.hpp" +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that syncs tensor's device by mapping local data into the device + * memory. 
For TensorTypes::eDevice it will use a record operation for the + * memory to be syncd into GPU memory which means that the operation will be + * done in sync with GPU commands. For TensorTypes::eHost it will only map the + * data into host memory which will happen during preEval before the recorded + * commands are dispatched. + */ +class OpTensorSyncRegionDevice : public OpBase +{ + public: + using ConstructorParameterType = std::vector; + + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. The tensors + * provided cannot be of type TensorTypes::eStorage. + * For each tensor a region to copy is specified. + * + * @param tensors Tensors that will be used to create in operation. + * @param copyRegions The regions to copy. + */ + OpTensorSyncRegionDevice(const std::vector& regions); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorSyncRegionDevice() override; + + /** + * For device tensors, it records the copy command for the tensor to copy + * the data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector mRegions; +}; + +} // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncRegionLocal.hpp b/src/include/kompute/operations/OpTensorSyncRegionLocal.hpp new file mode 100644 index 00000000..0b8b41b8 --- /dev/null +++ b/src/include/kompute/operations/OpTensorSyncRegionLocal.hpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that syncs tensor's local memory by mapping device data into the + * local CPU memory. For TensorTypes::eDevice it will use a record operation + * for the memory to be syncd into GPU memory which means that the operation + * will be done in sync with GPU commands. For TensorTypes::eHost it will + * only map the data into host memory which will happen during preEval before + * the recorded commands are dispatched. + */ +class OpTensorSyncRegionLocal : public OpBase +{ + public: + using ConstructorParameterType = std::vector; + + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. The tensors + * provided cannot be of type TensorTypes::eStorage. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorSyncRegionLocal(const std::vector& tensors); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorSyncRegionLocal() override; + + /** + * For device tensors, it records the copy command for the tensor to copy + * the data from its device to staging memory. 
+ * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * For host tensors it performs the map command from the host memory into + * local memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector mRegions; +}; + +} // End namespace kp diff --git a/test/TestOpTensorSync.cpp b/test/TestOpTensorSync.cpp index b59ee148..045d792d 100644 --- a/test/TestOpTensorSync.cpp +++ b/test/TestOpTensorSync.cpp @@ -53,3 +53,51 @@ TEST(TestOpTensorSync, SyncToDeviceMemoryMultiTensor) EXPECT_EQ(tensorB->vector(), testVec); EXPECT_EQ(tensorC->vector(), testVec); } + +TEST(TestOpTensorSync, SyncToDeviceMemoryCopyRegion) +{ + kp::Manager mgr; + + std::vector testVecPreA{ 1, 2, 3, 4 }; + std::vector testVecPostA{ 1, 1, 1, 4 }; + std::vector testVecPostB{ 0, 0, 0, 1 }; + + std::shared_ptr> tensorA = mgr.tensor({ 0, 0, 0, 0 }); + std::shared_ptr> tensorB = mgr.tensor({ 0, 0, 0, 0 }); + + EXPECT_TRUE(tensorA->isInit()); + EXPECT_TRUE(tensorB->isInit()); + + mgr.sequence()->eval({ tensorA, tensorB }); + + // tensorA local: { 0, 0, 0, 0 } | tensorA device: { 0, 0, 0, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 0, 0 } + + tensorA->setData(testVecPreA); + + // tensorA local: { 1, 2, 3, 4 } | tensorA device: { 0, 0, 0, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 0, 0 } + + // Copy from tensorA local index 0 to tensorA device index 1 (1 element) + mgr.sequence()->eval({{ tensorA, 0, 1, 1 }}); + + // tensorA local: { 1, 2, 3, 4 } | tensorA device: { 0, 1, 0, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 0, 0 } + + // Copy from tensorA device index 1 to tensorA device index 2 (1 element) + // Copy from tensorA device index 1 to tensorB device index 2 (1 element) + mgr.sequence()->eval({ tensorA, {{ tensorA, 1, 2, 1 }, { tensorB, 1, 2, 1 }}}); + + // tensorA local: { 1, 2, 3, 4 } | tensorA device: { 0, 1, 1, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 1, 0 } + + // Copy from tensorA device index 1 to tensorA local index 1 (2 elements) + // Copy from tensorB device index 2 to tensorB local index 3 (1 element) + mgr.sequence()->eval({{ tensorA, 1, 1, 2 }, { tensorB, 2, 3, 1 }}); + + // tensorA local: { 1, 1, 1, 4 } | tensorA device: { 0, 1, 1, 0 } + // tensorB local: { 0, 0, 1, 0 } | tensorB device: { 0, 0, 1, 0 } + + EXPECT_EQ(tensorA->vector(), testVecPostA); + EXPECT_EQ(tensorB->vector(), testVecPostB); +}
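
Usage sketch for the unified record/eval/evalAsync templates introduced in patch 3. The snippet below is illustrative only and is not part of the patch series; it assumes the existing kp::Manager / kp::Sequence API, and the kp::Algorithm named algo is a placeholder created elsewhere. It shows that a single template now covers both the tensor-vector and the algorithm call styles, because the first parameter is typename T::ConstructorParameterType rather than a fixed set of overloads, and that the new region operations plug into the same mechanism through their own typedefs.

#include "kompute/Kompute.hpp"

// Minimal sketch under the assumptions stated above.
void regionCopySketch(kp::Manager& mgr, std::shared_ptr<kp::Algorithm> algo)
{
    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 1, 2, 3, 4 });
    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 0, 0, 0, 0 });

    std::shared_ptr<kp::Sequence> sq = mgr.sequence();

    // OpTensorSyncDevice::ConstructorParameterType is
    // std::vector<std::shared_ptr<Tensor>>, so the established call style
    // compiles unchanged.
    sq->record<kp::OpTensorSyncDevice>({ tensorA, tensorB });

    // OpAlgoDispatch::ConstructorParameterType is std::shared_ptr<Algorithm>;
    // any further constructor arguments are still forwarded through TArgs.
    // (Shown only for the call style; algo's tensor bindings are not relevant here.)
    sq->record<kp::OpAlgoDispatch>(algo);

    // The region ops reuse the same template through their own typedefs:
    // OpTensorCopyRegion takes a TensorCopyRegions value, the sync region ops
    // take a std::vector<TensorRegion>. Copy elements [0,2) of tensorA into
    // elements [2,4) of tensorB in device memory ...
    sq->record<kp::OpTensorCopyRegion>({ tensorA, { { tensorB, 0, 2, 2 } } });

    // ... and bring only that destination region back into tensorB's local memory.
    sq->record<kp::OpTensorSyncRegionLocal>({ { tensorB, 2, 2, 2 } });

    sq->eval();
}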
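
The region operations in patch 4 all express a TensorRegion in elements and convert it to the byte-based vk::BufferCopy consumed by the recordCopyFrom, recordCopyFromStagingToDevice and recordCopyFromDeviceToStaging overloads added in patch 2. A hypothetical helper (not part of the patches) restating that mapping:

#include "kompute/Tensor.hpp"

// Hypothetical helper: element indices in, byte offsets out, mirroring the
// computation done in the record() methods of the region ops.
vk::BufferCopy
makeRegion(const std::shared_ptr<kp::Tensor>& tensor,
           uint32_t srcIndex,  // element index in the source buffer
           uint32_t dstIndex,  // element index in the destination buffer
           uint32_t elemCount) // number of elements to copy
{
    const uint32_t elemSize = tensor->dataTypeMemorySize();
    return vk::BufferCopy(elemSize * srcIndex,   // srcOffset in bytes
                          elemSize * dstIndex,   // dstOffset in bytes
                          elemSize * elemCount); // size in bytes
}

For the sync operations the same TensorRegion refers to a single tensor: OpTensorSyncRegionDevice reads srcIndex from the host-visible staging buffer and writes dstIndex in device memory, while OpTensorSyncRegionLocal reads srcIndex from device memory and writes dstIndex in the staging buffer that backs the local data.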