From 01ac6b5c20478cdfb67aaf80b24ad9da9e7e142e Mon Sep 17 00:00:00 2001 From: crydsch Date: Thu, 17 Aug 2023 10:38:25 +0200 Subject: [PATCH 1/4] Enable TestOpTensorSync Signed-off-by: crydsch --- test/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0564458c..a967e2a4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -17,6 +17,7 @@ add_executable(kompute_tests TestAsyncOperations.cpp TestMultipleAlgoExecutions.cpp TestOpShadersFromStringAndFile.cpp TestOpTensorCopy.cpp + TestOpTensorSync.cpp TestOpTensorCreate.cpp TestPushConstant.cpp TestSequence.cpp From 99cafa27d315c51cd9c36558a0d249afb6639d1a Mon Sep 17 00:00:00 2001 From: crydsch Date: Sat, 26 Aug 2023 11:55:36 +0200 Subject: [PATCH 2/4] Expose copy region for tensor copy operations Signed-off-by: crydsch --- src/Tensor.cpp | 37 ++++++++++++++++++++++++++++--- src/include/kompute/Tensor.hpp | 40 +++++++++++++++++++++++++++++++--- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/src/Tensor.cpp b/src/Tensor.cpp index ad5cac9a..3d488981 100644 --- a/src/Tensor.cpp +++ b/src/Tensor.cpp @@ -202,7 +202,20 @@ Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); - KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize); + this->recordCopyFrom(commandBuffer, + copyFromTensor, + copyRegion); +} + +void +Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, + std::shared_ptr copyFromTensor, + const vk::BufferCopy copyRegion) +{ + + vk::DeviceSize bufferSize(this->memorySize()); + + KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", copyRegion.size); this->recordCopyBuffer(commandBuffer, copyFromTensor->mPrimaryBuffer, @@ -217,7 +230,15 @@ Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer) vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); - KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + this->recordCopyFromStagingToDevice(commandBuffer, copyRegion); +} + +void +Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion) +{ + vk::DeviceSize bufferSize(this->memorySize()); + + KP_LOG_DEBUG("Kompute Tensor copying data size {}.", copyRegion.size); this->recordCopyBuffer(commandBuffer, this->mStagingBuffer, @@ -232,7 +253,17 @@ Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer) vk::DeviceSize bufferSize(this->memorySize()); vk::BufferCopy copyRegion(0, 0, bufferSize); - KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + this->recordCopyFromDeviceToStaging(commandBuffer, + copyRegion); +} + +void +Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, + const vk::BufferCopy copyRegion) +{ + vk::DeviceSize bufferSize(this->memorySize()); + + KP_LOG_DEBUG("Kompute Tensor copying data size {}.", copyRegion.size); this->recordCopyBuffer(commandBuffer, this->mPrimaryBuffer, diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index a2bcd187..6aaa3edf 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -99,8 +99,9 @@ class Tensor /** * Records a copy from the memory of the tensor provided to the current - * thensor. This is intended to pass memory into a processing, to perform + * tensor. 
This is intended to pass memory into a processing, to perform * a staging buffer transfer, or to gather output (between others). + * Copies the entire tensor. * * @param commandBuffer Vulkan Command Buffer to record the commands into * @param copyFromTensor Tensor to copy the data from @@ -108,23 +109,56 @@ class Tensor void recordCopyFrom(const vk::CommandBuffer& commandBuffer, std::shared_ptr copyFromTensor); + /** + * Records a copy from the memory of the tensor provided to the current + * tensor. This is intended to pass memory into a processing, to perform + * a staging buffer transfer, or to gather output (between others). + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param copyFromTensor Tensor to copy the data from + * @param copyRegion The buffer region to copy + */ + void recordCopyFrom(const vk::CommandBuffer& commandBuffer, + std::shared_ptr copyFromTensor, + const vk::BufferCopy copyRegion); + /** * Records a copy from the internal staging memory to the device memory * using an optional barrier to wait for the operation. This function would - * only be relevant for kp::Tensors of type eDevice. + * only be relevant for kp::Tensors of type eDevice. Copies the entire tensor. * * @param commandBuffer Vulkan Command Buffer to record the commands into */ void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer); /** - * Records a copy from the internal device memory to the staging memory + * Records a copy from the internal staging memory to the device memory * using an optional barrier to wait for the operation. This function would * only be relevant for kp::Tensors of type eDevice. * * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param copyRegion The buffer region to copy + */ + void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion); + + /** + * Records a copy from the internal device memory to the staging memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. Copies the entire tensor. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into */ void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer); + + /** + * Records a copy from the internal device memory to the staging memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. 
+ * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param copyRegion The buffer region to copy + */ + void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer, const vk::BufferCopy copyRegion); /** * Records the buffer memory barrier into the primary buffer and command From 63b7c10f9c7b573117e3ebd541303f43b9f3dc5b Mon Sep 17 00:00:00 2001 From: crydsch Date: Sat, 26 Aug 2023 12:30:29 +0200 Subject: [PATCH 3/4] Support custom parameter in sequence template functions Signed-off-by: crydsch --- src/include/kompute/Sequence.hpp | 84 ++++--------------- .../kompute/operations/OpAlgoDispatch.hpp | 2 + src/include/kompute/operations/OpBase.hpp | 2 + .../kompute/operations/OpMemoryBarrier.hpp | 2 + src/include/kompute/operations/OpMult.hpp | 2 + .../kompute/operations/OpTensorCopy.hpp | 2 + .../kompute/operations/OpTensorSyncDevice.hpp | 2 + .../kompute/operations/OpTensorSyncLocal.hpp | 2 + 8 files changed, 31 insertions(+), 67 deletions(-) diff --git a/src/include/kompute/Sequence.hpp b/src/include/kompute/Sequence.hpp index de9b9f69..0787f74c 100644 --- a/src/include/kompute/Sequence.hpp +++ b/src/include/kompute/Sequence.hpp @@ -41,7 +41,7 @@ class Sequence : public std::enable_shared_from_this * function also requires the Sequence to be recording, otherwise it will * not be able to add the operation. * - * @param op Object derived from kp::BaseOp that will be recoreded by the + * @param op Object derived from kp::BaseOp that will be recorded by the * sequence which will be used when the operation is evaluated. * @return shared_ptr of the Sequence class itself */ @@ -53,37 +53,18 @@ class Sequence : public std::enable_shared_from_this * function also requires the Sequence to be recording, otherwise it will * not be able to add the operation. * - * @param tensors Vector of tensors to use for the operation + * @param param Template parameter that is used to initialise the operation. * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template std::shared_ptr record( - std::vector> tensors, + typename T::ConstructorParameterType param, TArgs&&... params) { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->record(op); - } - /** - * Record function for operation to be added to the GPU queue in batch. This - * template requires classes to be derived from the OpBase class. This - * function also requires the Sequence to be recording, otherwise it will - * not be able to add the operation. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr record(std::shared_ptr algorithm, - TArgs&&... params) - { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; + static_assert(std::is_base_of::value, "T must derive from OpBase"); + std::shared_ptr op{ new T(param, std::forward(params)...) }; return this->record(op); } @@ -108,34 +89,18 @@ class Sequence : public std::enable_shared_from_this * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. 
* - * @param tensors Vector of tensors to use for the operation + * @param param Template parameter that is used to initialise the operation. * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template - std::shared_ptr eval(std::vector> tensors, - TArgs&&... params) - { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->eval(op); - } - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr eval(std::shared_ptr algorithm, - TArgs&&... params) + std::shared_ptr eval( + typename T::ConstructorParameterType param, + TArgs&&... params) { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; + static_assert(std::is_base_of::value, "T must derive from OpBase"); + std::shared_ptr op{ new T(param, std::forward(params)...) }; return this->eval(op); } @@ -148,6 +113,7 @@ class Sequence : public std::enable_shared_from_this * @return Boolean stating whether execution was successful. */ std::shared_ptr evalAsync(); + /** * Clears currnet operations to record provided one in the vector of * operations into the gpu as a submit job without a barrier. EvalAwait() @@ -157,39 +123,23 @@ class Sequence : public std::enable_shared_from_this * @return Boolean stating whether execution was successful. */ std::shared_ptr evalAsync(std::shared_ptr op); + /** * Eval sends all the recorded and stored operations in the vector of * operations into the gpu as a submit job with a barrier. * - * @param tensors Vector of tensors to use for the operation + * @param param Template parameter that is used to initialise the operation. * @param TArgs Template parameters that are used to initialise operation * which allows for extensible configurations on initialisation. * @return shared_ptr of the Sequence class itself */ template std::shared_ptr evalAsync( - std::vector> tensors, + typename T::ConstructorParameterType param, TArgs&&... params) { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->evalAsync(op); - } - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr evalAsync(std::shared_ptr algorithm, - TArgs&&... params) - { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; + static_assert(std::is_base_of::value, "T must derive from OpBase"); + std::shared_ptr op{ new T(param, std::forward(params)...) 
}; return this->evalAsync(op); } diff --git a/src/include/kompute/operations/OpAlgoDispatch.hpp b/src/include/kompute/operations/OpAlgoDispatch.hpp index e91598f0..bd58fb6d 100644 --- a/src/include/kompute/operations/OpAlgoDispatch.hpp +++ b/src/include/kompute/operations/OpAlgoDispatch.hpp @@ -17,6 +17,8 @@ namespace kp { class OpAlgoDispatch : public OpBase { public: + using ConstructorParameterType = std::shared_ptr; + /** * Constructor that stores the algorithm to use as well as the relevant * push constants to override when recording. diff --git a/src/include/kompute/operations/OpBase.hpp b/src/include/kompute/operations/OpBase.hpp index 73767084..23a217a8 100644 --- a/src/include/kompute/operations/OpBase.hpp +++ b/src/include/kompute/operations/OpBase.hpp @@ -18,6 +18,8 @@ namespace kp { class OpBase { public: + using ConstructorParameterType = void; + /** * Default destructor for OpBase class. This OpBase destructor class should * always be called to destroy and free owned resources unless it is diff --git a/src/include/kompute/operations/OpMemoryBarrier.hpp b/src/include/kompute/operations/OpMemoryBarrier.hpp index 4a232232..35a23113 100644 --- a/src/include/kompute/operations/OpMemoryBarrier.hpp +++ b/src/include/kompute/operations/OpMemoryBarrier.hpp @@ -18,6 +18,8 @@ namespace kp { class OpMemoryBarrier : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Constructor that stores tensors as well as memory barrier parameters to * be used to create a pipeline barrier on the respective primary or staging diff --git a/src/include/kompute/operations/OpMult.hpp b/src/include/kompute/operations/OpMult.hpp index f75ccc4f..2d4f0eca 100644 --- a/src/include/kompute/operations/OpMult.hpp +++ b/src/include/kompute/operations/OpMult.hpp @@ -21,6 +21,8 @@ namespace kp { class OpMult : public OpAlgoDispatch { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the bare minimum * requirements for the operations to be able to create and manage their diff --git a/src/include/kompute/operations/OpTensorCopy.hpp b/src/include/kompute/operations/OpTensorCopy.hpp index 968c1065..6438b108 100644 --- a/src/include/kompute/operations/OpTensorCopy.hpp +++ b/src/include/kompute/operations/OpTensorCopy.hpp @@ -18,6 +18,8 @@ namespace kp { class OpTensorCopy : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the core vulkan * resources and the tensors that will be used in the operation. diff --git a/src/include/kompute/operations/OpTensorSyncDevice.hpp b/src/include/kompute/operations/OpTensorSyncDevice.hpp index 3a1792ac..7460a6ea 100644 --- a/src/include/kompute/operations/OpTensorSyncDevice.hpp +++ b/src/include/kompute/operations/OpTensorSyncDevice.hpp @@ -18,6 +18,8 @@ namespace kp { class OpTensorSyncDevice : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the core vulkan * resources and the tensors that will be used in the operation. 
The tensos diff --git a/src/include/kompute/operations/OpTensorSyncLocal.hpp b/src/include/kompute/operations/OpTensorSyncLocal.hpp index 4216003e..95426775 100644 --- a/src/include/kompute/operations/OpTensorSyncLocal.hpp +++ b/src/include/kompute/operations/OpTensorSyncLocal.hpp @@ -20,6 +20,8 @@ namespace kp { class OpTensorSyncLocal : public OpBase { public: + using ConstructorParameterType = std::vector>; + /** * Default constructor with parameters that provides the core vulkan * resources and the tensors that will be used in the operation. The tensors From 62ac892d224132cc96bbb75f7c4a5151731e2a0b Mon Sep 17 00:00:00 2001 From: crydsch Date: Sat, 26 Aug 2023 13:07:29 +0200 Subject: [PATCH 4/4] Add tensor copy operations with region support Signed-off-by: crydsch --- src/CMakeLists.txt | 3 + src/OpTensorCopyRegion.cpp | 75 ++++++++++++++++ src/OpTensorSyncRegionDevice.cpp | 72 +++++++++++++++ src/OpTensorSyncRegionLocal.cpp | 89 +++++++++++++++++++ src/include/CMakeLists.txt | 3 + src/include/kompute/Kompute.hpp | 3 + src/include/kompute/Tensor.hpp | 7 ++ .../kompute/operations/OpTensorCopyRegion.hpp | 69 ++++++++++++++ .../operations/OpTensorSyncRegionDevice.hpp | 67 ++++++++++++++ .../operations/OpTensorSyncRegionLocal.hpp | 68 ++++++++++++++ test/TestOpTensorSync.cpp | 48 ++++++++++ 11 files changed, 504 insertions(+) create mode 100644 src/OpTensorCopyRegion.cpp create mode 100644 src/OpTensorSyncRegionDevice.cpp create mode 100644 src/OpTensorSyncRegionLocal.cpp create mode 100644 src/include/kompute/operations/OpTensorCopyRegion.hpp create mode 100644 src/include/kompute/operations/OpTensorSyncRegionDevice.hpp create mode 100644 src/include/kompute/operations/OpTensorSyncRegionLocal.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dbb47dbe..8978d595 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,9 @@ add_library(kompute Algorithm.cpp OpTensorCopy.cpp OpTensorSyncDevice.cpp OpTensorSyncLocal.cpp + OpTensorCopyRegion.cpp + OpTensorSyncRegionDevice.cpp + OpTensorSyncRegionLocal.cpp Sequence.cpp Tensor.cpp Core.cpp) diff --git a/src/OpTensorCopyRegion.cpp b/src/OpTensorCopyRegion.cpp new file mode 100644 index 00000000..9b52c382 --- /dev/null +++ b/src/OpTensorCopyRegion.cpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "kompute/operations/OpTensorCopyRegion.hpp" +#include "kompute/Tensor.hpp" + +namespace kp { + +OpTensorCopyRegion::OpTensorCopyRegion(const TensorCopyRegions regions) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion constructor with params"); + + if (regions.dstRegions.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with no destination region"); + } + + kp::Tensor::TensorDataTypes dataType = regions.srcTensor->dataType(); + for (const TensorRegion& region : regions.dstRegions) { + if (region.tensor->dataType() != dataType) { + throw std::runtime_error(fmt::format( + "Kompute OpTensorCopyRegion called with different types from {} to {}", + Tensor::toString(dataType), + Tensor::toString(region.tensor->dataType()))); + } + if (region.elemCount == 0) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with elemCount == 0"); + } + if (region.srcIndex + region.elemCount > regions.srcTensor->size()) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with out of bounds source region"); + } + if (region.dstIndex + region.elemCount > region.tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorCopyRegion called with out of bounds destination 
region"); + } + } + + this->mRegions = regions; +} + +OpTensorCopyRegion::~OpTensorCopyRegion() +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion destructor started"); +} + +void +OpTensorCopyRegion::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion record called"); + + for (size_t i = 0; i < this->mRegions.dstRegions.size(); i++) { + const uint32_t dataTypeMemorySize = this->mRegions.dstRegions[i].tensor->dataTypeMemorySize(); + const vk::BufferCopy copy = { + dataTypeMemorySize * this->mRegions.dstRegions[i].srcIndex, + dataTypeMemorySize * this->mRegions.dstRegions[i].dstIndex, + dataTypeMemorySize * this->mRegions.dstRegions[i].elemCount, + }; + this->mRegions.dstRegions[i].tensor->recordCopyFrom(commandBuffer, this->mRegions.srcTensor, copy); + } +} + +void +OpTensorCopyRegion::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion preEval called"); +} + +void +OpTensorCopyRegion::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorCopyRegion postEval called"); +} + +} diff --git a/src/OpTensorSyncRegionDevice.cpp b/src/OpTensorSyncRegionDevice.cpp new file mode 100644 index 00000000..91af1153 --- /dev/null +++ b/src/OpTensorSyncRegionDevice.cpp @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "kompute/operations/OpTensorSyncRegionDevice.hpp" + +namespace kp { + +OpTensorSyncRegionDevice::OpTensorSyncRegionDevice( + const std::vector& regions) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice constructor with params"); + + if (regions.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with less than 1 tensor region"); + } + + for (size_t i = 0; i < regions.size(); i++) { + if (regions[i].elemCount == 0) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with elemCount == 0"); + } + if (regions[i].srcIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with out of bounds source region"); + } + if (regions[i].dstIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionDevice called with out of bounds destination region"); + } + } + + this->mRegions = regions; +} + +OpTensorSyncRegionDevice::~OpTensorSyncRegionDevice() +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice destructor started"); + + this->mRegions.clear(); +} + +void +OpTensorSyncRegionDevice::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice record called"); + + for (size_t i = 0; i < this->mRegions.size(); i++) { + if (this->mRegions[i].tensor->tensorType() == Tensor::TensorTypes::eDevice) { + const uint32_t dataTypeMemorySize = this->mRegions[i].tensor->dataTypeMemorySize(); + const vk::BufferCopy copy = { + dataTypeMemorySize * this->mRegions[i].srcIndex, + dataTypeMemorySize * this->mRegions[i].dstIndex, + dataTypeMemorySize * this->mRegions[i].elemCount, + }; + this->mRegions[i].tensor->recordCopyFromStagingToDevice(commandBuffer, copy); + } + } +} + +void +OpTensorSyncRegionDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice preEval called"); +} + +void +OpTensorSyncRegionDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionDevice postEval called"); +} + +} diff --git a/src/OpTensorSyncRegionLocal.cpp 
b/src/OpTensorSyncRegionLocal.cpp new file mode 100644 index 00000000..a426a519 --- /dev/null +++ b/src/OpTensorSyncRegionLocal.cpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpTensorSyncRegionLocal.hpp" + +namespace kp { + +OpTensorSyncRegionLocal::OpTensorSyncRegionLocal( + const std::vector& regions) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal constructor with params"); + + if (regions.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with less than 1 region"); + } + + for (size_t i = 0; i < regions.size(); i++) { + if (regions[i].elemCount == 0) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with elemCount == 0"); + } + if (regions[i].srcIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with out of bounds source region"); + } + if (regions[i].dstIndex + regions[i].elemCount > regions[i].tensor->size()) { + throw std::runtime_error( + "Kompute OpTensorSyncRegionLocal called with out of bounds destination region"); + } + } + + this->mRegions = regions; +} + +OpTensorSyncRegionLocal::~OpTensorSyncRegionLocal() +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal destructor started"); +} + +void +OpTensorSyncRegionLocal::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal record called"); + + for (size_t i = 0; i < this->mRegions.size(); i++) { + if (this->mRegions[i].tensor->tensorType() == Tensor::TensorTypes::eDevice) { + + this->mRegions[i].tensor->recordPrimaryBufferMemoryBarrier( + commandBuffer, + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); + + const uint32_t dataTypeMemorySize = this->mRegions[i].tensor->dataTypeMemorySize(); + const vk::BufferCopy copy = { + dataTypeMemorySize * this->mRegions[i].srcIndex, + dataTypeMemorySize * this->mRegions[i].dstIndex, + dataTypeMemorySize * this->mRegions[i].elemCount, + }; + this->mRegions[i].tensor->recordCopyFromDeviceToStaging(commandBuffer, copy); + + this->mRegions[i].tensor->recordPrimaryBufferMemoryBarrier( + commandBuffer, + vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead, + vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost); + } + } +} + +void +OpTensorSyncRegionLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal preEval called"); +} + +void +OpTensorSyncRegionLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal postEval called"); + + KP_LOG_DEBUG("Kompute OpTensorSyncRegionLocal mapping data into tensor local"); +} + +} diff --git a/src/include/CMakeLists.txt b/src/include/CMakeLists.txt index e1652fdd..f4b475d9 100644 --- a/src/include/CMakeLists.txt +++ b/src/include/CMakeLists.txt @@ -23,6 +23,9 @@ target_sources(kompute PRIVATE kompute/operations/OpTensorCopy.hpp kompute/operations/OpTensorSyncDevice.hpp kompute/operations/OpTensorSyncLocal.hpp + kompute/operations/OpTensorCopyRegion.hpp + kompute/operations/OpTensorSyncRegionDevice.hpp + kompute/operations/OpTensorSyncRegionLocal.hpp kompute/logger/Logger.hpp ) diff --git a/src/include/kompute/Kompute.hpp b/src/include/kompute/Kompute.hpp index e54adc1b..466d242e 100644 --- a/src/include/kompute/Kompute.hpp +++ b/src/include/kompute/Kompute.hpp 
@@ -13,6 +13,9 @@ #include "operations/OpTensorCopy.hpp" #include "operations/OpTensorSyncDevice.hpp" #include "operations/OpTensorSyncLocal.hpp" +#include "operations/OpTensorCopyRegion.hpp" +#include "operations/OpTensorSyncRegionDevice.hpp" +#include "operations/OpTensorSyncRegionLocal.hpp" // Will be build by CMake and placed inside the build directory #include "ShaderLogisticRegression.hpp" diff --git a/src/include/kompute/Tensor.hpp b/src/include/kompute/Tensor.hpp index 6aaa3edf..ace24dcf 100644 --- a/src/include/kompute/Tensor.hpp +++ b/src/include/kompute/Tensor.hpp @@ -378,4 +378,11 @@ class TensorT : public Tensor TensorDataTypes dataType(); }; +struct TensorRegion { + std::shared_ptr tensor; + uint32_t srcIndex; + uint32_t dstIndex; + uint32_t elemCount; +}; + } // End namespace kp diff --git a/src/include/kompute/operations/OpTensorCopyRegion.hpp b/src/include/kompute/operations/OpTensorCopyRegion.hpp new file mode 100644 index 00000000..d5520cc7 --- /dev/null +++ b/src/include/kompute/operations/OpTensorCopyRegion.hpp @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +struct TensorCopyRegions { + std::shared_ptr srcTensor; + std::vector dstRegions; +}; + +/** + * Operation that copies the data from the first tensor to the rest of the + * tensors provided, using a record command for all the vectors. This operation + * does not own/manage the memory of the tensors passed to it. The operation + * must only receive tensors of type + */ +class OpTensorCopyRegion : public OpBase +{ + public: + using ConstructorParameterType = TensorCopyRegions; + + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorCopyRegion(const TensorCopyRegions regions); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorCopyRegion() override; + + /** + * Records the copy commands from the first tensor into all the other + * tensors provided. Also optionally records a barrier. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + TensorCopyRegions mRegions; +}; + +} // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncRegionDevice.hpp b/src/include/kompute/operations/OpTensorSyncRegionDevice.hpp new file mode 100644 index 00000000..7b7e5ff4 --- /dev/null +++ b/src/include/kompute/operations/OpTensorSyncRegionDevice.hpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" +#include "kompute/Tensor.hpp" +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that syncs tensor's device by mapping local data into the device + * memory. 
For TensorTypes::eDevice it will use a record operation for the + * memory to be syncd into GPU memory which means that the operation will be + * done in sync with GPU commands. For TensorTypes::eHost it will only map the + * data into host memory which will happen during preEval before the recorded + * commands are dispatched. + */ +class OpTensorSyncRegionDevice : public OpBase +{ + public: + using ConstructorParameterType = std::vector; + + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. The tensors + * provided cannot be of type TensorTypes::eStorage. + * For each tensor a region to copy is specified. + * + * @param tensors Tensors that will be used to create in operation. + * @param copyRegions The regions to copy. + */ + OpTensorSyncRegionDevice(const std::vector& regions); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorSyncRegionDevice() override; + + /** + * For device tensors, it records the copy command for the tensor to copy + * the data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector mRegions; +}; + +} // End namespace kp diff --git a/src/include/kompute/operations/OpTensorSyncRegionLocal.hpp b/src/include/kompute/operations/OpTensorSyncRegionLocal.hpp new file mode 100644 index 00000000..0b8b41b8 --- /dev/null +++ b/src/include/kompute/operations/OpTensorSyncRegionLocal.hpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that syncs tensor's local memory by mapping device data into the + * local CPU memory. For TensorTypes::eDevice it will use a record operation + * for the memory to be syncd into GPU memory which means that the operation + * will be done in sync with GPU commands. For TensorTypes::eHost it will + * only map the data into host memory which will happen during preEval before + * the recorded commands are dispatched. + */ +class OpTensorSyncRegionLocal : public OpBase +{ + public: + using ConstructorParameterType = std::vector; + + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. The tensors + * provided cannot be of type TensorTypes::eStorage. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorSyncRegionLocal(const std::vector& tensors); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorSyncRegionLocal() override; + + /** + * For device tensors, it records the copy command for the tensor to copy + * the data from its device to staging memory. 
+ * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * For host tensors it performs the map command from the host memory into + * local memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector mRegions; +}; + +} // End namespace kp diff --git a/test/TestOpTensorSync.cpp b/test/TestOpTensorSync.cpp index b59ee148..045d792d 100644 --- a/test/TestOpTensorSync.cpp +++ b/test/TestOpTensorSync.cpp @@ -53,3 +53,51 @@ TEST(TestOpTensorSync, SyncToDeviceMemoryMultiTensor) EXPECT_EQ(tensorB->vector(), testVec); EXPECT_EQ(tensorC->vector(), testVec); } + +TEST(TestOpTensorSync, SyncToDeviceMemoryCopyRegion) +{ + kp::Manager mgr; + + std::vector testVecPreA{ 1, 2, 3, 4 }; + std::vector testVecPostA{ 1, 1, 1, 4 }; + std::vector testVecPostB{ 0, 0, 0, 1 }; + + std::shared_ptr> tensorA = mgr.tensor({ 0, 0, 0, 0 }); + std::shared_ptr> tensorB = mgr.tensor({ 0, 0, 0, 0 }); + + EXPECT_TRUE(tensorA->isInit()); + EXPECT_TRUE(tensorB->isInit()); + + mgr.sequence()->eval({ tensorA, tensorB }); + + // tensorA local: { 0, 0, 0, 0 } | tensorA device: { 0, 0, 0, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 0, 0 } + + tensorA->setData(testVecPreA); + + // tensorA local: { 1, 2, 3, 4 } | tensorA device: { 0, 0, 0, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 0, 0 } + + // Copy from tensorA local index 0 to tensorA device index 1 (1 element) + mgr.sequence()->eval({{ tensorA, 0, 1, 1 }}); + + // tensorA local: { 1, 2, 3, 4 } | tensorA device: { 0, 1, 0, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 0, 0 } + + // Copy from tensorA device index 1 to tensorA device index 2 (1 element) + // Copy from tensorA device index 1 to tensorB device index 2 (1 element) + mgr.sequence()->eval({ tensorA, {{ tensorA, 1, 2, 1 }, { tensorB, 1, 2, 1 }}}); + + // tensorA local: { 1, 2, 3, 4 } | tensorA device: { 0, 1, 1, 0 } + // tensorB local: { 0, 0, 0, 0 } | tensorB device: { 0, 0, 1, 0 } + + // Copy from tensorA device index 1 to tensorA local index 1 (2 elements) + // Copy from tensorB device index 2 to tensorB local index 3 (1 element) + mgr.sequence()->eval({{ tensorA, 1, 1, 2 }, { tensorB, 2, 3, 1 }}); + + // tensorA local: { 1, 1, 1, 4 } | tensorA device: { 0, 1, 1, 0 } + // tensorB local: { 0, 0, 1, 0 } | tensorB device: { 0, 0, 1, 0 } + + EXPECT_EQ(tensorA->vector(), testVecPostA); + EXPECT_EQ(tensorB->vector(), testVecPostB); +}
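
Usage sketch for the unified record/eval/evalAsync templates introduced in patch 3. The snippet below is illustrative only and is not part of the patch series; it assumes the existing kp::Manager / kp::Sequence API, and the kp::Algorithm named algo is a placeholder created elsewhere. It shows that a single template now covers both the tensor-vector and the algorithm call styles, because the first parameter is typename T::ConstructorParameterType rather than a fixed set of overloads, and that the new region operations plug into the same mechanism through their own typedefs.

#include "kompute/Kompute.hpp"

// Minimal sketch under the assumptions stated above.
void regionCopySketch(kp::Manager& mgr, std::shared_ptr<kp::Algorithm> algo)
{
    std::shared_ptr<kp::TensorT<float>> tensorA = mgr.tensor({ 1, 2, 3, 4 });
    std::shared_ptr<kp::TensorT<float>> tensorB = mgr.tensor({ 0, 0, 0, 0 });

    std::shared_ptr<kp::Sequence> sq = mgr.sequence();

    // OpTensorSyncDevice::ConstructorParameterType is
    // std::vector<std::shared_ptr<Tensor>>, so the established call style
    // compiles unchanged.
    sq->record<kp::OpTensorSyncDevice>({ tensorA, tensorB });

    // OpAlgoDispatch::ConstructorParameterType is std::shared_ptr<Algorithm>;
    // any further constructor arguments are still forwarded through TArgs.
    // (Shown only for the call style; algo's tensor bindings are not relevant here.)
    sq->record<kp::OpAlgoDispatch>(algo);

    // The region ops reuse the same template through their own typedefs:
    // OpTensorCopyRegion takes a TensorCopyRegions value, the sync region ops
    // take a std::vector<TensorRegion>. Copy elements [0,2) of tensorA into
    // elements [2,4) of tensorB in device memory ...
    sq->record<kp::OpTensorCopyRegion>({ tensorA, { { tensorB, 0, 2, 2 } } });

    // ... and bring only that destination region back into tensorB's local memory.
    sq->record<kp::OpTensorSyncRegionLocal>({ { tensorB, 2, 2, 2 } });

    sq->eval();
}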
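
The region operations in patch 4 all express a TensorRegion in elements and convert it to the byte-based vk::BufferCopy consumed by the recordCopyFrom, recordCopyFromStagingToDevice and recordCopyFromDeviceToStaging overloads added in patch 2. A hypothetical helper (not part of the patches) restating that mapping:

#include "kompute/Tensor.hpp"

// Hypothetical helper: element indices in, byte offsets out, mirroring the
// computation done in the record() methods of the region ops.
vk::BufferCopy
makeRegion(const std::shared_ptr<kp::Tensor>& tensor,
           uint32_t srcIndex,  // element index in the source buffer
           uint32_t dstIndex,  // element index in the destination buffer
           uint32_t elemCount) // number of elements to copy
{
    const uint32_t elemSize = tensor->dataTypeMemorySize();
    return vk::BufferCopy(elemSize * srcIndex,   // srcOffset in bytes
                          elemSize * dstIndex,   // dstOffset in bytes
                          elemSize * elemCount); // size in bytes
}

For the sync operations the same TensorRegion refers to a single tensor: OpTensorSyncRegionDevice reads srcIndex from the host-visible staging buffer and writes dstIndex in device memory, while OpTensorSyncRegionLocal reads srcIndex from device memory and writes dstIndex in the staging buffer that backs the local data.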