From a595aeaa065f3106d3e743e11889cef2241b90da Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Wed, 30 Nov 2022 23:18:31 +0100
Subject: [PATCH 01/20] Initial implementation of Deformable Convolution
 workload

---
 dpbench/benchmarks/CMakeLists.txt             |   1 +
 .../deformable_convolution/CMakeLists.txt     |   1 +
 .../deformable_convolution/__init__.py        |  37 +++
 .../deformable_convolution_initialize.py      |  32 ++
 .../deformable_convolution_numba_npr.py       |  81 +++++
 .../deformable_convolution_numpy.py           |  81 +++++
 .../CMakeLists.txt                            |  20 ++
 .../__init__.py                               |   7 +
 .../deformable_convolution_sycl/impl.cpp      | 297 ++++++++++++++++++
 .../deformable_convolution_sycl/utils.hpp     |  94 ++++++
 .../bench_info/deformable_convolution.json    | 135 ++++++++
 11 files changed, 786 insertions(+)
 create mode 100644 dpbench/benchmarks/deformable_convolution/CMakeLists.txt
 create mode 100644 dpbench/benchmarks/deformable_convolution/__init__.py
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
 create mode 100644 dpbench/configs/bench_info/deformable_convolution.json

diff --git a/dpbench/benchmarks/CMakeLists.txt b/dpbench/benchmarks/CMakeLists.txt
index 6869220b..8032000b 100644
--- a/dpbench/benchmarks/CMakeLists.txt
+++ b/dpbench/benchmarks/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(rambo)
 add_subdirectory(kmeans)
 add_subdirectory(knn)
 add_subdirectory(gpairs)
+add_subdirectory(deformable_convolution)
 add_subdirectory(dbscan)
 
 # generate dpcpp version into config
diff --git a/dpbench/benchmarks/deformable_convolution/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt
new file mode 100644
index 00000000..d06d5c73
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(deformable_convolution_sycl_native_ext)
diff --git a/dpbench/benchmarks/deformable_convolution/__init__.py b/dpbench/benchmarks/deformable_convolution/__init__.py
new file mode 100644
index 00000000..2dedbb33
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/__init__.py
@@ -0,0 +1,37 @@
+# Copyright 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache 2.0
+
+from .deformable_convolution_initialize import initialize
+from .deformable_convolution_numba_npr import deformable_convolution as deformable_convolution_numba_npr
+from .deformable_convolution_numpy import deformable_convolution as deformable_convolution_numpy
+from .deformable_convolution_sycl_native_ext import deformable_convolution_sycl
+
+__all__ = [
+    "initialize",
+    "deformable_convolution_numba_npr",
+    "deformable_convolution_numba_numpy",
+    "deformable_convolution_sycl",
+]
+
+"""l2-norm calculation of n vectors
+
+Input
+---------
+npoints: int
+    number of vectors
+dims: int
+    dimension of single vector
+seed: int
+    random seed to generate random number
+
+Output
+-------
+d: array
+    l2 norm of each vector
+
+Method
+------
+    ||Vj||2=sqrt(sum(Xj[i]*Xj[i]))
+    here i is 0->dims, j is 0->npoints
+"""
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
new file mode 100644
index 00000000..3a52ef0c
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
@@ -0,0 +1,32 @@
+# Copyright 2022 Intel Corp.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+def initialize(batch, 
+               in_channels, in_height, in_width, 
+               out_channels, out_height, out_width, 
+               kernel_height, kernel_width, 
+               stride_y, stride_x,
+               dilation_y, dilation_x,
+               pad_y, pad_x,
+               groups,
+               deformable_groups,
+               dtype,
+               seed):
+    import numpy as np
+    import numpy.random as default_rng
+    default_rng.seed(seed)
+
+    return (
+        default_rng.random((batch, in_channels, in_height, in_width)).astype(dtype),
+        # np.ones((batch, in_channels, in_height, in_width)).astype(dtype),
+        np.zeros((batch, out_channels, out_height, out_width)).astype(dtype),
+        # np.zeros((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype),
+        2*default_rng.random((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype) - 1,
+        # default_rng.random((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype),
+        np.ones((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype),
+        default_rng.random(out_channels).astype(dtype),
+        # np.ones((out_channels,)).astype(dtype),
+        np.zeros((in_channels, kernel_height, kernel_width, out_height, out_width)).astype(dtype)
+    )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
new file mode 100644
index 00000000..6e891c3f
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
@@ -0,0 +1,81 @@
+# Copyright 2022 Intel Corp.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import math
+import numpy as np
+from numba import njit, prange
+
+@njit(parallel=True, inline='always', fastmath=True)
+def bilinear(input, offset_y, offset_x):
+    height, width = input.shape
+    start_x = int(math.floor(offset_x))
+    start_x_weight = 1 - (offset_x - start_x)
+    start_y = int(math.floor(offset_y))
+    start_y_weight = 1 - (offset_y - start_y)
+
+    output = 0
+    if offset_x >= width or offset_y >= height or offset_x <= -1 or offset_y <= -1:
+        return output
+
+    if start_x >= 0 and start_y >= 0:
+        w = start_x_weight*start_y_weight
+        output += w*input[start_y, start_x]
+
+    if start_x + 1 < width and start_y >= 0:
+        w = (1 - start_x_weight)*start_y_weight
+        output += w*input[start_y, start_x + 1]
+
+    if start_x >=0 and start_y + 1 < height:
+        w = start_x_weight*(1 - start_y_weight)
+        output += w*input[start_y + 1, start_x]
+
+    if start_x + 1 < width and start_y + 1 < height:
+        w = (1 - start_x_weight)*(1 - start_y_weight)
+        output += w*input[start_y + 1, start_x + 1]
+
+    return output
+
+@njit(parallel=True, fastmath=True)
+def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups):
+    k_height, k_width, _, out_height, out_width = offset.shape
+    channels, _, _ = input.shape
+
+    k_h_m = (k_height - 1)//2
+    k_w_m = (k_width - 1)//2
+
+    for ckhkw in prange(channels*k_height*k_width):
+        for h in prange(out_height):
+            for w in prange(out_width):
+                c = ckhkw//(k_height*k_width)
+                khkw = ckhkw%(k_height*k_width)
+                kh = khkw//k_width
+                kw = khkw%k_width
+
+                offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m)
+                offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m)
+
+                output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
+
+@njit(parallel=True, fastmath=True)
+def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups):
+    out_channels, height, width = output.shape
+    _, in_channels, k_height, k_width = weights.shape
+    
+    deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
+
+    tmp = tmp.reshape((in_channels*k_height*k_width, height*width))
+
+    _weights = weights.reshape((out_channels, in_channels*k_height*k_width))
+    _output = output.reshape((out_channels, height*width))
+    np.dot(_weights, tmp, _output)
+
+    _bias = bias.reshape((out_channels, 1))
+    _output[:] = _output + _bias
+
+@njit(parallel=True)
+def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups):
+    batch, _, _, _ = input.shape
+    for b in range(batch):
+        deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups)
+
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
new file mode 100644
index 00000000..59897585
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
@@ -0,0 +1,81 @@
+# Copyright 2022 Intel Corp.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import math
+import numpy as np
+from numba import njit, prange
+
+@njit(parallel=True, inline='always', fastmath=True)
+def bilinear(input, offset_y, offset_x):
+    height, width = input.shape
+    start_x = int(math.floor(offset_x))
+    start_x_weight = 1 - (offset_x - start_x)
+    start_y = int(math.floor(offset_y))
+    start_y_weight = 1 - (offset_y - start_y)
+
+    output = 0
+    if start_x >= width or start_y >= height or start_x < -1 or start_y < -1:
+        return output
+
+    if start_x >= 0 and start_y >= 0:
+        w = start_x_weight*start_y_weight
+        output += w*input[start_y, start_x]
+
+    if start_x + 1 < width and start_y >= 0:
+        w = (1 - start_x_weight)*start_y_weight
+        output += w*input[start_y, start_x + 1]
+
+    if start_x >=0 and start_y + 1 < height:
+        w = start_x_weight*(1 - start_y_weight)
+        output += w*input[start_y + 1, start_x]
+
+    if start_x + 1 < width and start_y + 1 < height:
+        w = (1 - start_x_weight)*(1 - start_y_weight)
+        output += w*input[start_y + 1, start_x + 1]
+
+    return output
+
+@njit(parallel=True, fastmath=True)
+def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups):
+    k_height, k_width, _, out_height, out_width = offset.shape
+    channels, _, _ = input.shape
+
+    k_h_m = (k_height - 1)//2
+    k_w_m = (k_width - 1)//2
+
+    for ckhkw in prange(channels*k_height*k_width):
+        for h in prange(out_height):
+            for w in prange(out_width):
+                c = ckhkw//(k_height*k_width)
+                khkw = ckhkw%(k_height*k_width)
+                kh = khkw//k_width
+                kw = khkw%k_width
+
+                offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m)
+                offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m)
+
+                output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
+
+# @njit(parallel=True, fastmath=True)
+def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups):
+    out_channels, height, width = output.shape
+    _, in_channels, k_height, k_width = weights.shape
+    
+    deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
+
+    tmp = tmp.reshape((in_channels*k_height*k_width, height*width))
+
+    _weights = weights.reshape((out_channels, in_channels*k_height*k_width))
+    _output = output.reshape((out_channels, height*width))
+    np.dot(_weights, tmp, _output)
+
+    _bias = bias.reshape((out_channels, 1))
+    _output[:] = _output + _bias
+
+# @njit(parallel=True)
+def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups):
+    batch, _, _, _ = input.shape
+    for b in range(batch):
+        deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups)
+
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
new file mode 100644
index 00000000..e874e715
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(py_module_name _deformable_convolution_sycl)
+pybind11_add_module(${py_module_name}
+    MODULE
+    deformable_convolution_sycl/impl.cpp
+)
+find_package(MKL CONFIG REQUIRED)
+
+# target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $)
+target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
+target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
+target_link_libraries(${py_module_name} PUBLIC $<LINK_ONLY:MKL::MKL> ${MKL_SYCL})
+
+message(STATUS "${MKL_IMPORTED_TARGETS}")
+message(STATUS "${INTERFACE_COMPILE_OPTIONS}")
+message(STATUS "${INTERFACE_INCLUDE_DIRECTORIES}")
+
+file(RELATIVE_PATH py_module_dest ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+install(TARGETS ${py_module_name}
+  DESTINATION ${py_module_dest}/deformable_convolution_sycl
+)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
new file mode 100644
index 00000000..cd4c9bbd
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
@@ -0,0 +1,7 @@
+# Copyright 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache 2.0
+
+from .deformable_convolution_sycl._deformable_convolution_sycl import deformable_convolution as deformable_convolution_sycl
+
+__all__ = ["deformable_convolution_sycl"]
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
new file mode 100644
index 00000000..cc6ecac7
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -0,0 +1,297 @@
+//==- impl.cpp - Python native extension of deformable convolution   ===//
+//
+// Copyright 2022 Intel Corp.
+//
+// SPDX - License - Identifier : Apache 2.0
+///
+/// \file
+/// The files implements a SYCL-based Python native extension for the
+/// deformable convolution benchmark.
+
+#include "utils.hpp"
+#include "CL/sycl.hpp"
+#include "dpctl4pybind11.hpp"
+#include "oneapi/mkl.hpp"
+#include "cmath"
+
+using namespace sycl;
+namespace py = pybind11;
+
+template<class DataType>
+__attribute__((always_inline)) DataType bilinear(const DataType* input,
+                                                 int height,
+                                                 int width,
+                                                 float offset_y,
+                                                 float offset_x)
+{
+    auto start_x = int(std::floor(offset_x));
+    auto start_x_weight = 1 - (offset_x - start_x);
+    auto start_y = int(std::floor(offset_y));
+    auto start_y_weight = 1 - (offset_y - start_y);
+
+    DataType result = 0;
+    if (offset_x >= width || offset_y >= height || offset_x <= -1 || offset_y <= -1)
+        return result;
+
+    if (start_x >= 0 && start_y >= 0)
+    {
+        auto w0 = start_x_weight*start_y_weight;
+        auto v0 = *get_ptr_2d(input, height, width, start_y, start_x);
+
+        result += w0*v0;
+    }
+
+    if (start_x + 1 < width && start_y >= 0)
+    {
+        auto w1 = (1 - start_x_weight)*start_y_weight;
+        auto v1 = *get_ptr_2d(input, height, width, start_y, start_x + 1);
+
+        result += w1*v1;
+    }
+
+    if (start_x >=0 && start_y + 1 < height)
+    {
+        auto w2 = start_x_weight*(1 - start_y_weight);
+        auto v2 = *get_ptr_2d(input, height, width, start_y + 1, start_x);
+
+        result += w2*v2;
+    }
+
+    if (start_x + 1 < width && start_y + 1 < height)
+    {
+        auto w3 = (1 - start_x_weight)*(1 - start_y_weight);
+        auto v3 = *get_ptr_2d(input, height, width, start_y + 1, start_x + 1);
+
+        result += w3*v3;
+    }
+
+    return result;
+}
+
+class deform;
+
+template<class DataType>
+inline auto deform_input(cl::sycl::queue& queue,
+                         const DataType* input, const Shape3D in_shape,
+                         DataType* output, const Shape5D out_shape,
+                         const DataType* offset,
+                         int stride_y, int stride_x,
+                         int pad_y, int pad_x,
+                         int dilation_y, int dilation_x)
+{
+    auto in_channels = in_shape[CHW::C];
+    auto in_height   = in_shape[CHW::H];
+    auto in_width    = in_shape[CHW::W];
+
+    auto k_height   = out_shape[CKHW::KH];
+    auto k_width    = out_shape[CKHW::KW];
+    auto out_height = out_shape[CKHW::H];
+    auto out_width  = out_shape[CKHW::W];
+
+    assert(out_shape[CKHW::C] == in_channels);
+
+    auto wsize = sycl::range<3>(in_channels*k_height*k_width, out_height, out_width);
+    return queue.parallel_for<deform>(wsize, [=](sycl::id<3> idx)
+    {
+        auto ckhkw = static_cast<int>(idx[0]);
+        auto h = static_cast<int>(idx[1]);
+        auto w = static_cast<int>(idx[2]);
+
+        auto c = ckhkw/(k_height*k_width);
+        auto khkw = ckhkw%(k_height*k_width);
+
+        auto kh = khkw/k_width;
+        auto kw = khkw%k_width;
+
+        auto k_h_m = (k_height - 1)/2;
+        auto k_w_m = (k_width - 1)/2;
+
+        auto _output = get_ptr_5d(output,
+                                  in_channels, k_height, k_width, out_height, out_width,
+                                  c, kh, kw, h, 0);
+
+        auto offset_y = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 1, h, w) + h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m);
+        auto offset_x = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 0, h, w) + w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m);
+        // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m);
+        // auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m);
+
+        auto _input = get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0);
+
+        _output[w] = bilinear(_input, in_height, in_width, offset_y, offset_x);
+        // _output[w] = offset_y;
+    });
+}
+
+class fill_output;
+
+template<class DataType>
+auto output_fill_with_bias(cl::sycl::queue& queue, DataType* output, const Shape3D out_shape, DataType* bias)
+{
+    auto out_c = out_shape[CHW::C];
+    auto out_h = out_shape[CHW::H];
+    auto out_w = out_shape[CHW::W];
+
+    return queue.parallel_for<fill_output>(sycl::range<3>(out_c, out_h, out_w),[=](sycl::id<3> idx)
+    {
+        auto c = static_cast<int>(idx[0]);
+        auto h = static_cast<int>(idx[1]);
+        auto w = static_cast<int>(idx[2]);
+
+        auto out_ptr = get_ptr_3d(output, out_c, out_h, out_w, c, h, w);
+        *out_ptr = bias[c];
+    });
+}
+
+template<class DataType>
+void deformable_convolution_b1_impl(cl::sycl::queue& queue,
+                                    const DataType* input,
+                                    const Shape3D in_shape,
+                                    DataType* output,
+                                    const Shape3D out_shape,
+                                    DataType* tmp,
+                                    const float* offset,
+                                    DataType* weights,
+                                    const Shape4D weights_shape,
+                                    DataType* bias,
+                                    int stride_y, int stride_x,
+                                    int pad_y, int pad_x,
+                                    int dilation_y, int dilation_x,
+                                    int groups, int deformable_groups)
+{
+    using oneapi::mkl::blas::row_major::gemm;
+    using oneapi::mkl::transpose;
+
+    auto in_c = in_shape[CHW::C];
+    auto in_h = in_shape[CHW::H];
+    auto in_w = in_shape[CHW::W];
+
+    auto out_c = out_shape[CHW::C];
+    auto out_h = out_shape[CHW::H];
+    auto out_w = out_shape[CHW::W];
+
+    assert(out_c == weights_shape[OIHW::OC]);
+    assert(in_c == weights_shape[OIHW::IC]);
+    auto ker_h = weights_shape[OIHW::H];
+    auto ker_w = weights_shape[OIHW::W];
+
+    auto edeform = deform_input(queue,
+                                input, in_shape,
+                                tmp, {in_c, ker_h, ker_w, out_h, out_w},
+                                offset,
+                                stride_y, stride_x,
+                                pad_y, pad_x,
+                                dilation_y, dilation_x);
+
+    auto efill = output_fill_with_bias(queue, output, out_shape, bias);
+    auto egemm = gemm(queue,
+                      transpose::N, transpose::N, /*transpose a, b*/
+                      out_c, out_h*out_w, in_c*ker_h*ker_w, /*m, n, k*/
+                      1, /*alpha*/
+                      weights, in_c*ker_h*ker_w, /*a, lda*/
+                      tmp, out_h*out_w, /*b, ldb*/
+                      1, /*beta*/
+                      output, out_h*out_w, /*c, ldc*/
+                      {edeform, efill} /*events*/);
+    egemm.wait();
+}
+
+template<class DataType>
+void deformable_convolution_impl(cl::sycl::queue& queue,
+                                 const DataType* input,
+                                 const Shape4D in_shape,
+                                 DataType* output,
+                                 const Shape4D out_shape,
+                                 DataType* tmp,
+                                 const float* offset,
+                                 DataType* weights,
+                                 const Shape4D weights_shape,
+                                 DataType* bias,
+                                 int stride_y, int stride_x,
+                                 int pad_y, int pad_x,
+                                 int dilation_y, int dilation_x,
+                                 int groups, int deformable_groups)
+{
+    auto in_b = in_shape[NCHW::N];
+    auto in_c = in_shape[NCHW::C];
+    auto in_h = in_shape[NCHW::H];
+    auto in_w = in_shape[NCHW::W];
+
+    assert(in_b == out_shape[NCHW::N]);
+    auto out_c = out_shape[NCHW::C];
+    auto out_h = out_shape[NCHW::H];
+    auto out_w = out_shape[NCHW::W];
+
+    assert(out_c == weights_shape[OIHW::OC]);
+    assert(in_c == weights_shape[OIHW::IC]);
+
+    for (auto b = 0; b < in_b; ++b)
+    {
+        auto input_ptr = get_ptr_4d(input, in_b, in_c, in_h, in_w, b, 0, 0, 0);
+        auto output_ptr = get_ptr_4d(output, in_b, out_c, out_h, out_w, b, 0, 0, 0);
+        deformable_convolution_b1_impl(queue,
+                                       input_ptr, {in_c, in_h, in_w},
+                                       output_ptr, {out_c, out_h, out_w},
+                                       tmp,
+                                       offset,
+                                       weights, weights_shape,
+                                       bias,
+                                       stride_y, stride_x,
+                                       pad_y, pad_x,
+                                       dilation_y, dilation_x,
+                                       groups, deformable_groups);
+
+    }
+}
+
+void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm_ndarray output, dpctl::tensor::usm_ndarray offset,
+                            dpctl::tensor::usm_ndarray weights, dpctl::tensor::usm_ndarray bias, dpctl::tensor::usm_ndarray tmp,
+                            int stride_y, int stride_x, int pad_y, int pad_x, int dilation_y, int dilation_x, int groups, int deformable_groups)
+{
+    auto queue = input.get_queue();
+
+    if (input.get_typenum() != UAR_FLOAT) {
+        throw std::runtime_error("Expected a single precision FP array.");
+    }
+
+    int batch = input.get_shape(0);
+
+    int in_channels = input.get_shape(1);
+    int in_height = input.get_shape(2);
+    int in_width = input.get_shape(3);
+
+    int out_channels = output.get_shape(1);
+    int out_height = output.get_shape(2);
+    int out_width = output.get_shape(3);
+
+    int kernel_height = weights.get_shape(2);
+    int kernel_width = weights.get_shape(3);
+
+    auto input_shape = Shape4D({batch, in_channels, in_height, in_width});
+    auto output_shape = Shape4D({batch, out_channels, out_height, out_width});
+    auto weights_shape = Shape4D({out_channels, in_channels, kernel_height, kernel_width});
+
+    deformable_convolution_impl(queue,
+                                input.get_data<float>(), input_shape,
+                                output.get_data<float>(), output_shape,
+                                tmp.get_data<float>(),
+                                offset.get_data<float>(),
+                                weights.get_data<float>(), weights_shape,
+                                bias.get_data<float>(),
+                                stride_y, stride_x,
+                                pad_y, pad_x,
+                                dilation_y, dilation_x,
+                                groups, deformable_groups);
+}
+
+PYBIND11_MODULE(_deformable_convolution_sycl, m)
+{
+    import_dpctl();
+
+    m.def("deformable_convolution", &deformable_convolution,
+          "Defromable convolution",
+          py::arg("input"), py::arg("output"), py::arg("offset"),
+          py::arg("weights"), py::arg("bias"), py::arg("tmp"),
+          py::arg("stride_y"), py::arg("stride_x"), py::arg("pad_y"), py::arg("pad_x"),
+          py::arg("dilation_y"), py::arg("dilation_x"),
+          py::arg("groups"), py::arg("deformable_groups"));
+}
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
new file mode 100644
index 00000000..447a0571
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
@@ -0,0 +1,94 @@
+//
+// Copyright 2022 Intel Corp.
+//
+// SPDX - License - Identifier : Apache 2.0
+///
+
+#include <array>
+
+template<int Dims>
+class TensorShape: public std::array<int, Dims>
+{
+public:
+    template<class DimType>
+    const int& operator[](DimType dim) const
+    {
+        return std::array<int, Dims>::operator[](static_cast<int>(dim));
+    }
+
+    template<class DimType>
+    int& operator[](DimType dim)
+    {
+        return std::array<int, Dims>::operator[](static_cast<int>(dim));
+    }
+
+};
+
+enum class CHW: int
+{
+    C,
+    H,
+    W,
+};
+
+enum class NCHW: int
+{
+    N,
+    C,
+    H,
+    W,
+};
+
+enum class OIHW: int
+{
+    OC,
+    IC,
+    H,
+    W,
+};
+
+enum class HWCK: int
+{
+    H,
+    W,
+    C,
+    KH,
+    KW
+};
+
+enum class CKHW: int
+{
+    C,
+    KH,
+    KW,
+    H,
+    W,
+};
+
+using Shape1D = TensorShape<1>;
+using Shape2D = TensorShape<2>;
+using Shape3D = TensorShape<3>;
+using Shape4D = TensorShape<4>;
+using Shape5D = TensorShape<5>;
+using DType = float;
+
+#define get_ptr_1d(data_ptr, max_dim_0, dim_0) (data_ptr + (dim_0))
+#define get_ptr_2d(data_ptr, max_dim_0, max_dim_1, dim_0, dim_1) \
+(\
+    get_ptr_1d(data_ptr + (dim_0)*(max_dim_1), max_dim_1, dim_1) \
+)
+#define get_ptr_3d(data_ptr, max_dim_0, max_dim_1, max_dim_2, dim_0, dim_1, dim_2) \
+(\
+    get_ptr_2d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2), max_dim_1, max_dim_2, dim_1, dim_2) \
+)
+#define get_ptr_4d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, dim_0, dim_1, dim_2, dim_3) \
+(\
+    get_ptr_3d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3), max_dim_1, max_dim_2, max_dim_3, dim_1, dim_2, dim_3) \
+)
+
+#define get_ptr_5d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, max_dim_4, dim_0, dim_1, dim_2, dim_3, dim_4) \
+(\
+    get_ptr_4d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3)*(max_dim_4), \
+               max_dim_1, max_dim_2, max_dim_3, max_dim_4, \
+               dim_1, dim_2, dim_3, dim_4) \
+)
diff --git a/dpbench/configs/bench_info/deformable_convolution.json b/dpbench/configs/bench_info/deformable_convolution.json
new file mode 100644
index 00000000..e18e7771
--- /dev/null
+++ b/dpbench/configs/bench_info/deformable_convolution.json
@@ -0,0 +1,135 @@
+{
+    "benchmark": {
+        "name": "Deformable convolution",
+        "short_name": "deformable-convolution",
+        "relative_path": "deformable_convolution",
+        "module_name": "deformable_convolution",
+        "func_name": "deformable_convolution",
+        "kind": "microbenchmark",
+        "domain": "Deep learning",
+        "parameters": {
+            "S": {
+                "batch":1, 
+                "in_channels":16, 
+                "in_height":32, 
+                "in_width":32, 
+                "out_channels":16, 
+                "out_height":32, 
+                "out_width":32, 
+                "kernel_height":3, 
+                "kernel_width":3, 
+                "stride_y":1, 
+                "stride_x":1,
+                "dilation_y":1, 
+                "dilation_x":1,
+                "pad_y":1, 
+                "pad_x":1,
+                "groups":1,
+                "deformable_groups":1,
+                "dtype":"float32",
+                "seed": 7777777
+            },
+            "M": {
+                "batch":1, 
+                "in_channels":64, 
+                "in_height":64, 
+                "in_width":64, 
+                "out_channels":64, 
+                "out_height":64, 
+                "out_width":64, 
+                "kernel_height":3, 
+                "kernel_width":3, 
+                "stride_y":1, 
+                "stride_x":1,
+                "dilation_y":1, 
+                "dilation_x":1,
+                "pad_y":1, 
+                "pad_x":1,
+                "groups":1,
+                "deformable_groups":1,
+                "dtype":"float32",
+                "seed": 7777777
+            },
+            "L": {
+                "batch":2, 
+                "in_channels":64, 
+                "in_height":128, 
+                "in_width":128, 
+                "out_channels":128, 
+                "out_height":128, 
+                "out_width":128, 
+                "kernel_height":3, 
+                "kernel_width":3, 
+                "stride_y":1, 
+                "stride_x":1,
+                "dilation_y":1, 
+                "dilation_x":1,
+                "pad_y":1, 
+                "pad_x":1,
+                "groups":1,
+                "deformable_groups":1,
+                "dtype":"float32",
+                "seed": 7777777
+            }
+        },
+        "init": {
+            "func_name": "initialize",
+            "input_args": [
+                "batch", 
+                "in_channels", 
+                "in_height", 
+                "in_width", 
+                "out_channels", 
+                "out_height", 
+                "out_width", 
+                "kernel_height", 
+                "kernel_width", 
+                "stride_y", 
+                "stride_x",
+                "dilation_y", 
+                "dilation_x",
+                "pad_y", 
+                "pad_x",
+                "groups",
+                "deformable_groups",
+                "dtype",
+                "seed"
+            ],
+            "output_args": [
+                "input",
+                "output",
+                "offset",
+                "weights",
+                "bias",
+                "tmp"
+            ]
+        },
+        "input_args": [
+            "input",
+            "output",
+            "offset",
+            "weights",
+            "bias",
+            "tmp",
+            "stride_y", 
+            "stride_x",
+            "dilation_y", 
+            "dilation_x",
+            "pad_y", 
+            "pad_x",
+            "groups",
+            "deformable_groups"
+        ],
+        "array_args": [
+            "input",
+            "output",
+            "offset",
+            "weights",
+            "bias",
+            "tmp"
+        ],
+        "output_args": [
+            "output"
+        ]
+    }
+}

From 29639449262a0a949654dc003444784af1eb031c Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Mon, 10 Apr 2023 23:49:37 +0200
Subject: [PATCH 02/20] Fix pre-commit

---
 .../deformable_convolution/__init__.py        |   8 +-
 .../deformable_convolution_initialize.py      |  51 ++-
 .../deformable_convolution_numba_npr.py       | 120 ++++++--
 .../deformable_convolution_numpy.py           | 113 +++++--
 .../__init__.py                               |   4 +-
 .../deformable_convolution_sycl/impl.cpp      | 291 +++++++++---------
 .../deformable_convolution_sycl/utils.hpp     |  54 ++--
 .../bench_info/deformable_convolution.json    | 102 +++---
 8 files changed, 447 insertions(+), 296 deletions(-)

diff --git a/dpbench/benchmarks/deformable_convolution/__init__.py b/dpbench/benchmarks/deformable_convolution/__init__.py
index 2dedbb33..a5db8b1d 100644
--- a/dpbench/benchmarks/deformable_convolution/__init__.py
+++ b/dpbench/benchmarks/deformable_convolution/__init__.py
@@ -3,8 +3,12 @@
 # SPDX-License-Identifier: Apache 2.0
 
 from .deformable_convolution_initialize import initialize
-from .deformable_convolution_numba_npr import deformable_convolution as deformable_convolution_numba_npr
-from .deformable_convolution_numpy import deformable_convolution as deformable_convolution_numpy
+from .deformable_convolution_numba_npr import (
+    deformable_convolution as deformable_convolution_numba_npr,
+)
+from .deformable_convolution_numpy import (
+    deformable_convolution as deformable_convolution_numpy,
+)
 from .deformable_convolution_sycl_native_ext import deformable_convolution_sycl
 
 __all__ = [
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
index 3a52ef0c..896a2224 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
@@ -3,30 +3,51 @@
 # SPDX-License-Identifier: Apache-2.0
 
 
-def initialize(batch, 
-               in_channels, in_height, in_width, 
-               out_channels, out_height, out_width, 
-               kernel_height, kernel_width, 
-               stride_y, stride_x,
-               dilation_y, dilation_x,
-               pad_y, pad_x,
-               groups,
-               deformable_groups,
-               dtype,
-               seed):
+def initialize(
+    batch,
+    in_channels,
+    in_height,
+    in_width,
+    out_channels,
+    out_height,
+    out_width,
+    kernel_height,
+    kernel_width,
+    stride_y,
+    stride_x,
+    dilation_y,
+    dilation_x,
+    pad_y,
+    pad_x,
+    groups,
+    deformable_groups,
+    dtype,
+    seed,
+):
     import numpy as np
     import numpy.random as default_rng
+
     default_rng.seed(seed)
 
     return (
-        default_rng.random((batch, in_channels, in_height, in_width)).astype(dtype),
+        default_rng.random((batch, in_channels, in_height, in_width)).astype(
+            dtype
+        ),
         # np.ones((batch, in_channels, in_height, in_width)).astype(dtype),
         np.zeros((batch, out_channels, out_height, out_width)).astype(dtype),
         # np.zeros((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype),
-        2*default_rng.random((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype) - 1,
+        2
+        * default_rng.random(
+            (kernel_height, kernel_width, 2, out_height, out_width)
+        ).astype(dtype)
+        - 1,
         # default_rng.random((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype),
-        np.ones((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype),
+        np.ones(
+            (out_channels, in_channels, kernel_height, kernel_width)
+        ).astype(dtype),
         default_rng.random(out_channels).astype(dtype),
         # np.ones((out_channels,)).astype(dtype),
-        np.zeros((in_channels, kernel_height, kernel_width, out_height, out_width)).astype(dtype)
+        np.zeros(
+            (in_channels, kernel_height, kernel_width, out_height, out_width)
+        ).astype(dtype),
     )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
index 6e891c3f..dea7236e 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
@@ -3,10 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import math
+
 import numpy as np
 from numba import njit, prange
 
-@njit(parallel=True, inline='always', fastmath=True)
+
+@njit(parallel=True, inline="always", fastmath=True)
 def bilinear(input, offset_y, offset_x):
     height, width = input.shape
     start_x = int(math.floor(offset_x))
@@ -15,67 +17,125 @@ def bilinear(input, offset_y, offset_x):
     start_y_weight = 1 - (offset_y - start_y)
 
     output = 0
-    if offset_x >= width or offset_y >= height or offset_x <= -1 or offset_y <= -1:
+    if (
+        offset_x >= width
+        or offset_y >= height
+        or offset_x <= -1
+        or offset_y <= -1
+    ):
         return output
 
     if start_x >= 0 and start_y >= 0:
-        w = start_x_weight*start_y_weight
-        output += w*input[start_y, start_x]
+        w = start_x_weight * start_y_weight
+        output += w * input[start_y, start_x]
 
     if start_x + 1 < width and start_y >= 0:
-        w = (1 - start_x_weight)*start_y_weight
-        output += w*input[start_y, start_x + 1]
+        w = (1 - start_x_weight) * start_y_weight
+        output += w * input[start_y, start_x + 1]
 
-    if start_x >=0 and start_y + 1 < height:
-        w = start_x_weight*(1 - start_y_weight)
-        output += w*input[start_y + 1, start_x]
+    if start_x >= 0 and start_y + 1 < height:
+        w = start_x_weight * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x]
 
     if start_x + 1 < width and start_y + 1 < height:
-        w = (1 - start_x_weight)*(1 - start_y_weight)
-        output += w*input[start_y + 1, start_x + 1]
+        w = (1 - start_x_weight) * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x + 1]
 
     return output
 
+
 @njit(parallel=True, fastmath=True)
-def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups):
+def deform(
+    input, offset, output, stride, pad, dilation, groups, deformable_groups
+):
     k_height, k_width, _, out_height, out_width = offset.shape
     channels, _, _ = input.shape
 
-    k_h_m = (k_height - 1)//2
-    k_w_m = (k_width - 1)//2
+    k_h_m = (k_height - 1) // 2
+    k_w_m = (k_width - 1) // 2
 
-    for ckhkw in prange(channels*k_height*k_width):
+    for ckhkw in prange(channels * k_height * k_width):
         for h in prange(out_height):
             for w in prange(out_width):
-                c = ckhkw//(k_height*k_width)
-                khkw = ckhkw%(k_height*k_width)
-                kh = khkw//k_width
-                kw = khkw%k_width
-
-                offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m)
-                offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m)
+                c = ckhkw // (k_height * k_width)
+                khkw = ckhkw % (k_height * k_width)
+                kh = khkw // k_width
+                kw = khkw % k_width
+
+                offset_y = (
+                    offset[kh, kw, 1, h, w]
+                    + h * stride[0]
+                    + (kh - k_h_m) * dilation[0]
+                    - (pad[0] - k_h_m)
+                )
+                offset_x = (
+                    offset[kh, kw, 0, h, w]
+                    + w * stride[1]
+                    + (kw - k_w_m) * dilation[1]
+                    - (pad[1] - k_w_m)
+                )
 
                 output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
 
+
 @njit(parallel=True, fastmath=True)
-def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups):
+def deformable_convolution_b1(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride,
+    pad,
+    dilation,
+    groups,
+    deformable_groups,
+):
     out_channels, height, width = output.shape
     _, in_channels, k_height, k_width = weights.shape
-    
+
     deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
 
-    tmp = tmp.reshape((in_channels*k_height*k_width, height*width))
+    tmp = tmp.reshape((in_channels * k_height * k_width, height * width))
 
-    _weights = weights.reshape((out_channels, in_channels*k_height*k_width))
-    _output = output.reshape((out_channels, height*width))
+    _weights = weights.reshape((out_channels, in_channels * k_height * k_width))
+    _output = output.reshape((out_channels, height * width))
     np.dot(_weights, tmp, _output)
 
     _bias = bias.reshape((out_channels, 1))
     _output[:] = _output + _bias
 
+
 @njit(parallel=True)
-def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups):
+def deformable_convolution(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride_y,
+    stride_x,
+    pad_y,
+    pad_x,
+    dilation_y,
+    dilation_x,
+    groups,
+    deformable_groups,
+):
     batch, _, _, _ = input.shape
     for b in range(batch):
-        deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups)
-
+        deformable_convolution_b1(
+            input[b],
+            output[b],
+            offset,
+            weights,
+            bias,
+            tmp,
+            (stride_y, stride_x),
+            (pad_y, pad_x),
+            (dilation_y, dilation_x),
+            groups,
+            deformable_groups,
+        )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
index 59897585..d0ec3d5f 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
@@ -3,10 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import math
+
 import numpy as np
 from numba import njit, prange
 
-@njit(parallel=True, inline='always', fastmath=True)
+
+@njit(parallel=True, inline="always", fastmath=True)
 def bilinear(input, offset_y, offset_x):
     height, width = input.shape
     start_x = int(math.floor(offset_x))
@@ -19,63 +21,116 @@ def bilinear(input, offset_y, offset_x):
         return output
 
     if start_x >= 0 and start_y >= 0:
-        w = start_x_weight*start_y_weight
-        output += w*input[start_y, start_x]
+        w = start_x_weight * start_y_weight
+        output += w * input[start_y, start_x]
 
     if start_x + 1 < width and start_y >= 0:
-        w = (1 - start_x_weight)*start_y_weight
-        output += w*input[start_y, start_x + 1]
+        w = (1 - start_x_weight) * start_y_weight
+        output += w * input[start_y, start_x + 1]
 
-    if start_x >=0 and start_y + 1 < height:
-        w = start_x_weight*(1 - start_y_weight)
-        output += w*input[start_y + 1, start_x]
+    if start_x >= 0 and start_y + 1 < height:
+        w = start_x_weight * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x]
 
     if start_x + 1 < width and start_y + 1 < height:
-        w = (1 - start_x_weight)*(1 - start_y_weight)
-        output += w*input[start_y + 1, start_x + 1]
+        w = (1 - start_x_weight) * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x + 1]
 
     return output
 
+
 @njit(parallel=True, fastmath=True)
-def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups):
+def deform(
+    input, offset, output, stride, pad, dilation, groups, deformable_groups
+):
     k_height, k_width, _, out_height, out_width = offset.shape
     channels, _, _ = input.shape
 
-    k_h_m = (k_height - 1)//2
-    k_w_m = (k_width - 1)//2
+    k_h_m = (k_height - 1) // 2
+    k_w_m = (k_width - 1) // 2
 
-    for ckhkw in prange(channels*k_height*k_width):
+    for ckhkw in prange(channels * k_height * k_width):
         for h in prange(out_height):
             for w in prange(out_width):
-                c = ckhkw//(k_height*k_width)
-                khkw = ckhkw%(k_height*k_width)
-                kh = khkw//k_width
-                kw = khkw%k_width
-
-                offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m)
-                offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m)
+                c = ckhkw // (k_height * k_width)
+                khkw = ckhkw % (k_height * k_width)
+                kh = khkw // k_width
+                kw = khkw % k_width
+
+                offset_y = (
+                    offset[kh, kw, 1, h, w]
+                    + h * stride[0]
+                    + (kh - k_h_m) * dilation[0]
+                    - (pad[0] - k_h_m)
+                )
+                offset_x = (
+                    offset[kh, kw, 0, h, w]
+                    + w * stride[1]
+                    + (kw - k_w_m) * dilation[1]
+                    - (pad[1] - k_w_m)
+                )
 
                 output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
 
+
 # @njit(parallel=True, fastmath=True)
-def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups):
+def deformable_convolution_b1(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride,
+    pad,
+    dilation,
+    groups,
+    deformable_groups,
+):
     out_channels, height, width = output.shape
     _, in_channels, k_height, k_width = weights.shape
-    
+
     deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
 
-    tmp = tmp.reshape((in_channels*k_height*k_width, height*width))
+    tmp = tmp.reshape((in_channels * k_height * k_width, height * width))
 
-    _weights = weights.reshape((out_channels, in_channels*k_height*k_width))
-    _output = output.reshape((out_channels, height*width))
+    _weights = weights.reshape((out_channels, in_channels * k_height * k_width))
+    _output = output.reshape((out_channels, height * width))
     np.dot(_weights, tmp, _output)
 
     _bias = bias.reshape((out_channels, 1))
     _output[:] = _output + _bias
 
+
 # @njit(parallel=True)
-def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups):
+def deformable_convolution(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride_y,
+    stride_x,
+    pad_y,
+    pad_x,
+    dilation_y,
+    dilation_x,
+    groups,
+    deformable_groups,
+):
     batch, _, _, _ = input.shape
     for b in range(batch):
-        deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups)
-
+        deformable_convolution_b1(
+            input[b],
+            output[b],
+            offset,
+            weights,
+            bias,
+            tmp,
+            (stride_y, stride_x),
+            (pad_y, pad_x),
+            (dilation_y, dilation_x),
+            groups,
+            deformable_groups,
+        )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
index cd4c9bbd..01a4191d 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
@@ -2,6 +2,8 @@
 #
 # SPDX-License-Identifier: Apache 2.0
 
-from .deformable_convolution_sycl._deformable_convolution_sycl import deformable_convolution as deformable_convolution_sycl
+from .deformable_convolution_sycl._deformable_convolution_sycl import (
+    deformable_convolution as deformable_convolution_sycl,
+)
 
 __all__ = ["deformable_convolution_sycl"]
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
index cc6ecac7..005155bd 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -8,17 +8,17 @@
 /// The files implements a SYCL-based Python native extension for the
 /// deformable convolution benchmark.
 
-#include "utils.hpp"
 #include "CL/sycl.hpp"
+#include "cmath"
 #include "dpctl4pybind11.hpp"
 #include "oneapi/mkl.hpp"
-#include "cmath"
+#include "utils.hpp"
 
 using namespace sycl;
 namespace py = pybind11;
 
-template<class DataType>
-__attribute__((always_inline)) DataType bilinear(const DataType* input,
+template <class DataType>
+__attribute__((always_inline)) DataType bilinear(const DataType *input,
                                                  int height,
                                                  int width,
                                                  float offset_y,
@@ -30,39 +30,36 @@ __attribute__((always_inline)) DataType bilinear(const DataType* input,
     auto start_y_weight = 1 - (offset_y - start_y);
 
     DataType result = 0;
-    if (offset_x >= width || offset_y >= height || offset_x <= -1 || offset_y <= -1)
+    if (offset_x >= width || offset_y >= height || offset_x <= -1 ||
+        offset_y <= -1)
         return result;
 
-    if (start_x >= 0 && start_y >= 0)
-    {
-        auto w0 = start_x_weight*start_y_weight;
+    if (start_x >= 0 && start_y >= 0) {
+        auto w0 = start_x_weight * start_y_weight;
         auto v0 = *get_ptr_2d(input, height, width, start_y, start_x);
 
-        result += w0*v0;
+        result += w0 * v0;
     }
 
-    if (start_x + 1 < width && start_y >= 0)
-    {
-        auto w1 = (1 - start_x_weight)*start_y_weight;
+    if (start_x + 1 < width && start_y >= 0) {
+        auto w1 = (1 - start_x_weight) * start_y_weight;
         auto v1 = *get_ptr_2d(input, height, width, start_y, start_x + 1);
 
-        result += w1*v1;
+        result += w1 * v1;
     }
 
-    if (start_x >=0 && start_y + 1 < height)
-    {
-        auto w2 = start_x_weight*(1 - start_y_weight);
+    if (start_x >= 0 && start_y + 1 < height) {
+        auto w2 = start_x_weight * (1 - start_y_weight);
         auto v2 = *get_ptr_2d(input, height, width, start_y + 1, start_x);
 
-        result += w2*v2;
+        result += w2 * v2;
     }
 
-    if (start_x + 1 < width && start_y + 1 < height)
-    {
-        auto w3 = (1 - start_x_weight)*(1 - start_y_weight);
+    if (start_x + 1 < width && start_y + 1 < height) {
+        auto w3 = (1 - start_x_weight) * (1 - start_y_weight);
         auto v3 = *get_ptr_2d(input, height, width, start_y + 1, start_x + 1);
 
-        result += w3*v3;
+        result += w3 * v3;
     }
 
     return result;
@@ -70,52 +67,64 @@ __attribute__((always_inline)) DataType bilinear(const DataType* input,
 
 class deform;
 
-template<class DataType>
-inline auto deform_input(cl::sycl::queue& queue,
-                         const DataType* input, const Shape3D in_shape,
-                         DataType* output, const Shape5D out_shape,
-                         const DataType* offset,
-                         int stride_y, int stride_x,
-                         int pad_y, int pad_x,
-                         int dilation_y, int dilation_x)
+template <class DataType>
+inline auto deform_input(cl::sycl::queue &queue,
+                         const DataType *input,
+                         const Shape3D in_shape,
+                         DataType *output,
+                         const Shape5D out_shape,
+                         const DataType *offset,
+                         int stride_y,
+                         int stride_x,
+                         int pad_y,
+                         int pad_x,
+                         int dilation_y,
+                         int dilation_x)
 {
     auto in_channels = in_shape[CHW::C];
-    auto in_height   = in_shape[CHW::H];
-    auto in_width    = in_shape[CHW::W];
+    auto in_height = in_shape[CHW::H];
+    auto in_width = in_shape[CHW::W];
 
-    auto k_height   = out_shape[CKHW::KH];
-    auto k_width    = out_shape[CKHW::KW];
+    auto k_height = out_shape[CKHW::KH];
+    auto k_width = out_shape[CKHW::KW];
     auto out_height = out_shape[CKHW::H];
-    auto out_width  = out_shape[CKHW::W];
+    auto out_width = out_shape[CKHW::W];
 
     assert(out_shape[CKHW::C] == in_channels);
 
-    auto wsize = sycl::range<3>(in_channels*k_height*k_width, out_height, out_width);
-    return queue.parallel_for<deform>(wsize, [=](sycl::id<3> idx)
-    {
+    auto wsize =
+        sycl::range<3>(in_channels * k_height * k_width, out_height, out_width);
+    return queue.parallel_for<deform>(wsize, [=](sycl::id<3> idx) {
         auto ckhkw = static_cast<int>(idx[0]);
         auto h = static_cast<int>(idx[1]);
         auto w = static_cast<int>(idx[2]);
 
-        auto c = ckhkw/(k_height*k_width);
-        auto khkw = ckhkw%(k_height*k_width);
+        auto c = ckhkw / (k_height * k_width);
+        auto khkw = ckhkw % (k_height * k_width);
 
-        auto kh = khkw/k_width;
-        auto kw = khkw%k_width;
+        auto kh = khkw / k_width;
+        auto kw = khkw % k_width;
 
-        auto k_h_m = (k_height - 1)/2;
-        auto k_w_m = (k_width - 1)/2;
+        auto k_h_m = (k_height - 1) / 2;
+        auto k_w_m = (k_width - 1) / 2;
 
-        auto _output = get_ptr_5d(output,
-                                  in_channels, k_height, k_width, out_height, out_width,
-                                  c, kh, kw, h, 0);
+        auto _output = get_ptr_5d(output, in_channels, k_height, k_width,
+                                  out_height, out_width, c, kh, kw, h, 0);
 
-        auto offset_y = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 1, h, w) + h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m);
-        auto offset_x = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 0, h, w) + w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m);
-        // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m);
-        // auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m);
+        auto offset_y = *get_ptr_5d(offset, k_height, k_width, 2, out_height,
+                                    out_width, kh, kw, 1, h, w) +
+                        h * stride_y + (kh - k_h_m) * dilation_y -
+                        (pad_y - k_h_m);
+        auto offset_x = *get_ptr_5d(offset, k_height, k_width, 2, out_height,
+                                    out_width, kh, kw, 0, h, w) +
+                        w * stride_x + (kw - k_w_m) * dilation_x -
+                        (pad_x - k_w_m);
+        // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y -
+        // k_h_m); auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x
+        // - k_w_m);
 
-        auto _input = get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0);
+        auto _input =
+            get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0);
 
         _output[w] = bilinear(_input, in_height, in_width, offset_y, offset_x);
         // _output[w] = offset_y;
@@ -124,42 +133,49 @@ inline auto deform_input(cl::sycl::queue& queue,
 
 class fill_output;
 
-template<class DataType>
-auto output_fill_with_bias(cl::sycl::queue& queue, DataType* output, const Shape3D out_shape, DataType* bias)
+template <class DataType>
+auto output_fill_with_bias(cl::sycl::queue &queue,
+                           DataType *output,
+                           const Shape3D out_shape,
+                           DataType *bias)
 {
     auto out_c = out_shape[CHW::C];
     auto out_h = out_shape[CHW::H];
     auto out_w = out_shape[CHW::W];
 
-    return queue.parallel_for<fill_output>(sycl::range<3>(out_c, out_h, out_w),[=](sycl::id<3> idx)
-    {
-        auto c = static_cast<int>(idx[0]);
-        auto h = static_cast<int>(idx[1]);
-        auto w = static_cast<int>(idx[2]);
+    return queue.parallel_for<fill_output>(
+        sycl::range<3>(out_c, out_h, out_w), [=](sycl::id<3> idx) {
+            auto c = static_cast<int>(idx[0]);
+            auto h = static_cast<int>(idx[1]);
+            auto w = static_cast<int>(idx[2]);
 
-        auto out_ptr = get_ptr_3d(output, out_c, out_h, out_w, c, h, w);
-        *out_ptr = bias[c];
-    });
+            auto out_ptr = get_ptr_3d(output, out_c, out_h, out_w, c, h, w);
+            *out_ptr = bias[c];
+        });
 }
 
-template<class DataType>
-void deformable_convolution_b1_impl(cl::sycl::queue& queue,
-                                    const DataType* input,
+template <class DataType>
+void deformable_convolution_b1_impl(cl::sycl::queue &queue,
+                                    const DataType *input,
                                     const Shape3D in_shape,
-                                    DataType* output,
+                                    DataType *output,
                                     const Shape3D out_shape,
-                                    DataType* tmp,
-                                    const float* offset,
-                                    DataType* weights,
+                                    DataType *tmp,
+                                    const float *offset,
+                                    DataType *weights,
                                     const Shape4D weights_shape,
-                                    DataType* bias,
-                                    int stride_y, int stride_x,
-                                    int pad_y, int pad_x,
-                                    int dilation_y, int dilation_x,
-                                    int groups, int deformable_groups)
+                                    DataType *bias,
+                                    int stride_y,
+                                    int stride_x,
+                                    int pad_y,
+                                    int pad_x,
+                                    int dilation_y,
+                                    int dilation_x,
+                                    int groups,
+                                    int deformable_groups)
 {
-    using oneapi::mkl::blas::row_major::gemm;
     using oneapi::mkl::transpose;
+    using oneapi::mkl::blas::row_major::gemm;
 
     auto in_c = in_shape[CHW::C];
     auto in_h = in_shape[CHW::H];
@@ -174,42 +190,41 @@ void deformable_convolution_b1_impl(cl::sycl::queue& queue,
     auto ker_h = weights_shape[OIHW::H];
     auto ker_w = weights_shape[OIHW::W];
 
-    auto edeform = deform_input(queue,
-                                input, in_shape,
-                                tmp, {in_c, ker_h, ker_w, out_h, out_w},
-                                offset,
-                                stride_y, stride_x,
-                                pad_y, pad_x,
-                                dilation_y, dilation_x);
+    auto edeform = deform_input(
+        queue, input, in_shape, tmp, {in_c, ker_h, ker_w, out_h, out_w}, offset,
+        stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x);
 
     auto efill = output_fill_with_bias(queue, output, out_shape, bias);
-    auto egemm = gemm(queue,
-                      transpose::N, transpose::N, /*transpose a, b*/
-                      out_c, out_h*out_w, in_c*ker_h*ker_w, /*m, n, k*/
-                      1, /*alpha*/
-                      weights, in_c*ker_h*ker_w, /*a, lda*/
-                      tmp, out_h*out_w, /*b, ldb*/
-                      1, /*beta*/
-                      output, out_h*out_w, /*c, ldc*/
+    auto egemm = gemm(queue, transpose::N, transpose::N, /*transpose a, b*/
+                      out_c, out_h * out_w, in_c * ker_h * ker_w, /*m, n, k*/
+                      1,                                          /*alpha*/
+                      weights, in_c * ker_h * ker_w,              /*a, lda*/
+                      tmp, out_h * out_w,                         /*b, ldb*/
+                      1,                                          /*beta*/
+                      output, out_h * out_w,                      /*c, ldc*/
                       {edeform, efill} /*events*/);
     egemm.wait();
 }
 
-template<class DataType>
-void deformable_convolution_impl(cl::sycl::queue& queue,
-                                 const DataType* input,
+template <class DataType>
+void deformable_convolution_impl(cl::sycl::queue &queue,
+                                 const DataType *input,
                                  const Shape4D in_shape,
-                                 DataType* output,
+                                 DataType *output,
                                  const Shape4D out_shape,
-                                 DataType* tmp,
-                                 const float* offset,
-                                 DataType* weights,
+                                 DataType *tmp,
+                                 const float *offset,
+                                 DataType *weights,
                                  const Shape4D weights_shape,
-                                 DataType* bias,
-                                 int stride_y, int stride_x,
-                                 int pad_y, int pad_x,
-                                 int dilation_y, int dilation_x,
-                                 int groups, int deformable_groups)
+                                 DataType *bias,
+                                 int stride_y,
+                                 int stride_x,
+                                 int pad_y,
+                                 int pad_x,
+                                 int dilation_y,
+                                 int dilation_x,
+                                 int groups,
+                                 int deformable_groups)
 {
     auto in_b = in_shape[NCHW::N];
     auto in_c = in_shape[NCHW::C];
@@ -224,28 +239,32 @@ void deformable_convolution_impl(cl::sycl::queue& queue,
     assert(out_c == weights_shape[OIHW::OC]);
     assert(in_c == weights_shape[OIHW::IC]);
 
-    for (auto b = 0; b < in_b; ++b)
-    {
+    for (auto b = 0; b < in_b; ++b) {
         auto input_ptr = get_ptr_4d(input, in_b, in_c, in_h, in_w, b, 0, 0, 0);
-        auto output_ptr = get_ptr_4d(output, in_b, out_c, out_h, out_w, b, 0, 0, 0);
-        deformable_convolution_b1_impl(queue,
-                                       input_ptr, {in_c, in_h, in_w},
-                                       output_ptr, {out_c, out_h, out_w},
-                                       tmp,
-                                       offset,
-                                       weights, weights_shape,
-                                       bias,
-                                       stride_y, stride_x,
-                                       pad_y, pad_x,
-                                       dilation_y, dilation_x,
-                                       groups, deformable_groups);
-
+        auto output_ptr =
+            get_ptr_4d(output, in_b, out_c, out_h, out_w, b, 0, 0, 0);
+        deformable_convolution_b1_impl(
+            queue, input_ptr, {in_c, in_h, in_w}, output_ptr,
+            {out_c, out_h, out_w}, tmp, offset, weights, weights_shape, bias,
+            stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups,
+            deformable_groups);
     }
 }
 
-void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm_ndarray output, dpctl::tensor::usm_ndarray offset,
-                            dpctl::tensor::usm_ndarray weights, dpctl::tensor::usm_ndarray bias, dpctl::tensor::usm_ndarray tmp,
-                            int stride_y, int stride_x, int pad_y, int pad_x, int dilation_y, int dilation_x, int groups, int deformable_groups)
+void deformable_convolution(dpctl::tensor::usm_ndarray input,
+                            dpctl::tensor::usm_ndarray output,
+                            dpctl::tensor::usm_ndarray offset,
+                            dpctl::tensor::usm_ndarray weights,
+                            dpctl::tensor::usm_ndarray bias,
+                            dpctl::tensor::usm_ndarray tmp,
+                            int stride_y,
+                            int stride_x,
+                            int pad_y,
+                            int pad_x,
+                            int dilation_y,
+                            int dilation_x,
+                            int groups,
+                            int deformable_groups)
 {
     auto queue = input.get_queue();
 
@@ -268,19 +287,15 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm
 
     auto input_shape = Shape4D({batch, in_channels, in_height, in_width});
     auto output_shape = Shape4D({batch, out_channels, out_height, out_width});
-    auto weights_shape = Shape4D({out_channels, in_channels, kernel_height, kernel_width});
-
-    deformable_convolution_impl(queue,
-                                input.get_data<float>(), input_shape,
-                                output.get_data<float>(), output_shape,
-                                tmp.get_data<float>(),
-                                offset.get_data<float>(),
-                                weights.get_data<float>(), weights_shape,
-                                bias.get_data<float>(),
-                                stride_y, stride_x,
-                                pad_y, pad_x,
-                                dilation_y, dilation_x,
-                                groups, deformable_groups);
+    auto weights_shape =
+        Shape4D({out_channels, in_channels, kernel_height, kernel_width});
+
+    deformable_convolution_impl(
+        queue, input.get_data<float>(), input_shape, output.get_data<float>(),
+        output_shape, tmp.get_data<float>(), offset.get_data<float>(),
+        weights.get_data<float>(), weights_shape, bias.get_data<float>(),
+        stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups,
+        deformable_groups);
 }
 
 PYBIND11_MODULE(_deformable_convolution_sycl, m)
@@ -288,10 +303,10 @@ PYBIND11_MODULE(_deformable_convolution_sycl, m)
     import_dpctl();
 
     m.def("deformable_convolution", &deformable_convolution,
-          "Defromable convolution",
-          py::arg("input"), py::arg("output"), py::arg("offset"),
-          py::arg("weights"), py::arg("bias"), py::arg("tmp"),
-          py::arg("stride_y"), py::arg("stride_x"), py::arg("pad_y"), py::arg("pad_x"),
-          py::arg("dilation_y"), py::arg("dilation_x"),
-          py::arg("groups"), py::arg("deformable_groups"));
+          "Defromable convolution", py::arg("input"), py::arg("output"),
+          py::arg("offset"), py::arg("weights"), py::arg("bias"),
+          py::arg("tmp"), py::arg("stride_y"), py::arg("stride_x"),
+          py::arg("pad_y"), py::arg("pad_x"), py::arg("dilation_y"),
+          py::arg("dilation_x"), py::arg("groups"),
+          py::arg("deformable_groups"));
 }
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
index 447a0571..603c4144 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
@@ -6,32 +6,28 @@
 
 #include <array>
 
-template<int Dims>
-class TensorShape: public std::array<int, Dims>
+template <int Dims> class TensorShape : public std::array<int, Dims>
 {
 public:
-    template<class DimType>
-    const int& operator[](DimType dim) const
+    template <class DimType> const int &operator[](DimType dim) const
     {
         return std::array<int, Dims>::operator[](static_cast<int>(dim));
     }
 
-    template<class DimType>
-    int& operator[](DimType dim)
+    template <class DimType> int &operator[](DimType dim)
     {
         return std::array<int, Dims>::operator[](static_cast<int>(dim));
     }
-
 };
 
-enum class CHW: int
+enum class CHW : int
 {
     C,
     H,
     W,
 };
 
-enum class NCHW: int
+enum class NCHW : int
 {
     N,
     C,
@@ -39,7 +35,7 @@ enum class NCHW: int
     W,
 };
 
-enum class OIHW: int
+enum class OIHW : int
 {
     OC,
     IC,
@@ -47,7 +43,7 @@ enum class OIHW: int
     W,
 };
 
-enum class HWCK: int
+enum class HWCK : int
 {
     H,
     W,
@@ -56,7 +52,7 @@ enum class HWCK: int
     KW
 };
 
-enum class CKHW: int
+enum class CKHW : int
 {
     C,
     KH,
@@ -73,22 +69,20 @@ using Shape5D = TensorShape<5>;
 using DType = float;
 
 #define get_ptr_1d(data_ptr, max_dim_0, dim_0) (data_ptr + (dim_0))
-#define get_ptr_2d(data_ptr, max_dim_0, max_dim_1, dim_0, dim_1) \
-(\
-    get_ptr_1d(data_ptr + (dim_0)*(max_dim_1), max_dim_1, dim_1) \
-)
-#define get_ptr_3d(data_ptr, max_dim_0, max_dim_1, max_dim_2, dim_0, dim_1, dim_2) \
-(\
-    get_ptr_2d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2), max_dim_1, max_dim_2, dim_1, dim_2) \
-)
-#define get_ptr_4d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, dim_0, dim_1, dim_2, dim_3) \
-(\
-    get_ptr_3d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3), max_dim_1, max_dim_2, max_dim_3, dim_1, dim_2, dim_3) \
-)
+#define get_ptr_2d(data_ptr, max_dim_0, max_dim_1, dim_0, dim_1)               \
+    (get_ptr_1d(data_ptr + (dim_0) * (max_dim_1), max_dim_1, dim_1))
+#define get_ptr_3d(data_ptr, max_dim_0, max_dim_1, max_dim_2, dim_0, dim_1,    \
+                   dim_2)                                                      \
+    (get_ptr_2d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2), max_dim_1,     \
+                max_dim_2, dim_1, dim_2))
+#define get_ptr_4d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3,       \
+                   dim_0, dim_1, dim_2, dim_3)                                 \
+    (get_ptr_3d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2) * (max_dim_3),  \
+                max_dim_1, max_dim_2, max_dim_3, dim_1, dim_2, dim_3))
 
-#define get_ptr_5d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, max_dim_4, dim_0, dim_1, dim_2, dim_3, dim_4) \
-(\
-    get_ptr_4d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3)*(max_dim_4), \
-               max_dim_1, max_dim_2, max_dim_3, max_dim_4, \
-               dim_1, dim_2, dim_3, dim_4) \
-)
+#define get_ptr_5d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3,       \
+                   max_dim_4, dim_0, dim_1, dim_2, dim_3, dim_4)               \
+    (get_ptr_4d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2) * (max_dim_3) * \
+                               (max_dim_4),                                    \
+                max_dim_1, max_dim_2, max_dim_3, max_dim_4, dim_1, dim_2,      \
+                dim_3, dim_4))
diff --git a/dpbench/configs/bench_info/deformable_convolution.json b/dpbench/configs/bench_info/deformable_convolution.json
index e18e7771..8fabbb57 100644
--- a/dpbench/configs/bench_info/deformable_convolution.json
+++ b/dpbench/configs/bench_info/deformable_convolution.json
@@ -9,20 +9,20 @@
         "domain": "Deep learning",
         "parameters": {
             "S": {
-                "batch":1, 
-                "in_channels":16, 
-                "in_height":32, 
-                "in_width":32, 
-                "out_channels":16, 
-                "out_height":32, 
-                "out_width":32, 
-                "kernel_height":3, 
-                "kernel_width":3, 
-                "stride_y":1, 
+                "batch":1,
+                "in_channels":16,
+                "in_height":32,
+                "in_width":32,
+                "out_channels":16,
+                "out_height":32,
+                "out_width":32,
+                "kernel_height":3,
+                "kernel_width":3,
+                "stride_y":1,
                 "stride_x":1,
-                "dilation_y":1, 
+                "dilation_y":1,
                 "dilation_x":1,
-                "pad_y":1, 
+                "pad_y":1,
                 "pad_x":1,
                 "groups":1,
                 "deformable_groups":1,
@@ -30,20 +30,20 @@
                 "seed": 7777777
             },
             "M": {
-                "batch":1, 
-                "in_channels":64, 
-                "in_height":64, 
-                "in_width":64, 
-                "out_channels":64, 
-                "out_height":64, 
-                "out_width":64, 
-                "kernel_height":3, 
-                "kernel_width":3, 
-                "stride_y":1, 
+                "batch":1,
+                "in_channels":64,
+                "in_height":64,
+                "in_width":64,
+                "out_channels":64,
+                "out_height":64,
+                "out_width":64,
+                "kernel_height":3,
+                "kernel_width":3,
+                "stride_y":1,
                 "stride_x":1,
-                "dilation_y":1, 
+                "dilation_y":1,
                 "dilation_x":1,
-                "pad_y":1, 
+                "pad_y":1,
                 "pad_x":1,
                 "groups":1,
                 "deformable_groups":1,
@@ -51,20 +51,20 @@
                 "seed": 7777777
             },
             "L": {
-                "batch":2, 
-                "in_channels":64, 
-                "in_height":128, 
-                "in_width":128, 
-                "out_channels":128, 
-                "out_height":128, 
-                "out_width":128, 
-                "kernel_height":3, 
-                "kernel_width":3, 
-                "stride_y":1, 
+                "batch":2,
+                "in_channels":64,
+                "in_height":128,
+                "in_width":128,
+                "out_channels":128,
+                "out_height":128,
+                "out_width":128,
+                "kernel_height":3,
+                "kernel_width":3,
+                "stride_y":1,
                 "stride_x":1,
-                "dilation_y":1, 
+                "dilation_y":1,
                 "dilation_x":1,
-                "pad_y":1, 
+                "pad_y":1,
                 "pad_x":1,
                 "groups":1,
                 "deformable_groups":1,
@@ -75,20 +75,20 @@
         "init": {
             "func_name": "initialize",
             "input_args": [
-                "batch", 
-                "in_channels", 
-                "in_height", 
-                "in_width", 
-                "out_channels", 
-                "out_height", 
-                "out_width", 
-                "kernel_height", 
-                "kernel_width", 
-                "stride_y", 
+                "batch",
+                "in_channels",
+                "in_height",
+                "in_width",
+                "out_channels",
+                "out_height",
+                "out_width",
+                "kernel_height",
+                "kernel_width",
+                "stride_y",
                 "stride_x",
-                "dilation_y", 
+                "dilation_y",
                 "dilation_x",
-                "pad_y", 
+                "pad_y",
                 "pad_x",
                 "groups",
                 "deformable_groups",
@@ -111,11 +111,11 @@
             "weights",
             "bias",
             "tmp",
-            "stride_y", 
+            "stride_y",
             "stride_x",
-            "dilation_y", 
+            "dilation_y",
             "dilation_x",
-            "pad_y", 
+            "pad_y",
             "pad_x",
             "groups",
             "deformable_groups"

From 809047daf6040b00fa492ba1441e280f18da9af7 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Mon, 29 May 2023 13:25:41 +0300
Subject: [PATCH 03/20] refactor

---
 .../deformable_convolution_initialize.py      |   4 -
 .../deformable_convolution_numba_npr.py       | 141 ------------------
 .../deformable_convolution_numpy.py           | 136 -----------------
 .../CMakeLists.txt                            |   7 +-
 .../deformable_convolution_sycl/impl.cpp      |  14 +-
 .../bench_info/deformable_convolution.json    | 135 -----------------
 6 files changed, 7 insertions(+), 430 deletions(-)
 delete mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
 delete mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
 delete mode 100644 dpbench/configs/bench_info/deformable_convolution.json

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
index 896a2224..205f57be 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
@@ -33,20 +33,16 @@ def initialize(
         default_rng.random((batch, in_channels, in_height, in_width)).astype(
             dtype
         ),
-        # np.ones((batch, in_channels, in_height, in_width)).astype(dtype),
         np.zeros((batch, out_channels, out_height, out_width)).astype(dtype),
-        # np.zeros((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype),
         2
         * default_rng.random(
             (kernel_height, kernel_width, 2, out_height, out_width)
         ).astype(dtype)
         - 1,
-        # default_rng.random((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype),
         np.ones(
             (out_channels, in_channels, kernel_height, kernel_width)
         ).astype(dtype),
         default_rng.random(out_channels).astype(dtype),
-        # np.ones((out_channels,)).astype(dtype),
         np.zeros(
             (in_channels, kernel_height, kernel_width, out_height, out_width)
         ).astype(dtype),
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
deleted file mode 100644
index dea7236e..00000000
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2022 Intel Corp.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import math
-
-import numpy as np
-from numba import njit, prange
-
-
-@njit(parallel=True, inline="always", fastmath=True)
-def bilinear(input, offset_y, offset_x):
-    height, width = input.shape
-    start_x = int(math.floor(offset_x))
-    start_x_weight = 1 - (offset_x - start_x)
-    start_y = int(math.floor(offset_y))
-    start_y_weight = 1 - (offset_y - start_y)
-
-    output = 0
-    if (
-        offset_x >= width
-        or offset_y >= height
-        or offset_x <= -1
-        or offset_y <= -1
-    ):
-        return output
-
-    if start_x >= 0 and start_y >= 0:
-        w = start_x_weight * start_y_weight
-        output += w * input[start_y, start_x]
-
-    if start_x + 1 < width and start_y >= 0:
-        w = (1 - start_x_weight) * start_y_weight
-        output += w * input[start_y, start_x + 1]
-
-    if start_x >= 0 and start_y + 1 < height:
-        w = start_x_weight * (1 - start_y_weight)
-        output += w * input[start_y + 1, start_x]
-
-    if start_x + 1 < width and start_y + 1 < height:
-        w = (1 - start_x_weight) * (1 - start_y_weight)
-        output += w * input[start_y + 1, start_x + 1]
-
-    return output
-
-
-@njit(parallel=True, fastmath=True)
-def deform(
-    input, offset, output, stride, pad, dilation, groups, deformable_groups
-):
-    k_height, k_width, _, out_height, out_width = offset.shape
-    channels, _, _ = input.shape
-
-    k_h_m = (k_height - 1) // 2
-    k_w_m = (k_width - 1) // 2
-
-    for ckhkw in prange(channels * k_height * k_width):
-        for h in prange(out_height):
-            for w in prange(out_width):
-                c = ckhkw // (k_height * k_width)
-                khkw = ckhkw % (k_height * k_width)
-                kh = khkw // k_width
-                kw = khkw % k_width
-
-                offset_y = (
-                    offset[kh, kw, 1, h, w]
-                    + h * stride[0]
-                    + (kh - k_h_m) * dilation[0]
-                    - (pad[0] - k_h_m)
-                )
-                offset_x = (
-                    offset[kh, kw, 0, h, w]
-                    + w * stride[1]
-                    + (kw - k_w_m) * dilation[1]
-                    - (pad[1] - k_w_m)
-                )
-
-                output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
-
-
-@njit(parallel=True, fastmath=True)
-def deformable_convolution_b1(
-    input,
-    output,
-    offset,
-    weights,
-    bias,
-    tmp,
-    stride,
-    pad,
-    dilation,
-    groups,
-    deformable_groups,
-):
-    out_channels, height, width = output.shape
-    _, in_channels, k_height, k_width = weights.shape
-
-    deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
-
-    tmp = tmp.reshape((in_channels * k_height * k_width, height * width))
-
-    _weights = weights.reshape((out_channels, in_channels * k_height * k_width))
-    _output = output.reshape((out_channels, height * width))
-    np.dot(_weights, tmp, _output)
-
-    _bias = bias.reshape((out_channels, 1))
-    _output[:] = _output + _bias
-
-
-@njit(parallel=True)
-def deformable_convolution(
-    input,
-    output,
-    offset,
-    weights,
-    bias,
-    tmp,
-    stride_y,
-    stride_x,
-    pad_y,
-    pad_x,
-    dilation_y,
-    dilation_x,
-    groups,
-    deformable_groups,
-):
-    batch, _, _, _ = input.shape
-    for b in range(batch):
-        deformable_convolution_b1(
-            input[b],
-            output[b],
-            offset,
-            weights,
-            bias,
-            tmp,
-            (stride_y, stride_x),
-            (pad_y, pad_x),
-            (dilation_y, dilation_x),
-            groups,
-            deformable_groups,
-        )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
deleted file mode 100644
index d0ec3d5f..00000000
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright 2022 Intel Corp.
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import math
-
-import numpy as np
-from numba import njit, prange
-
-
-@njit(parallel=True, inline="always", fastmath=True)
-def bilinear(input, offset_y, offset_x):
-    height, width = input.shape
-    start_x = int(math.floor(offset_x))
-    start_x_weight = 1 - (offset_x - start_x)
-    start_y = int(math.floor(offset_y))
-    start_y_weight = 1 - (offset_y - start_y)
-
-    output = 0
-    if start_x >= width or start_y >= height or start_x < -1 or start_y < -1:
-        return output
-
-    if start_x >= 0 and start_y >= 0:
-        w = start_x_weight * start_y_weight
-        output += w * input[start_y, start_x]
-
-    if start_x + 1 < width and start_y >= 0:
-        w = (1 - start_x_weight) * start_y_weight
-        output += w * input[start_y, start_x + 1]
-
-    if start_x >= 0 and start_y + 1 < height:
-        w = start_x_weight * (1 - start_y_weight)
-        output += w * input[start_y + 1, start_x]
-
-    if start_x + 1 < width and start_y + 1 < height:
-        w = (1 - start_x_weight) * (1 - start_y_weight)
-        output += w * input[start_y + 1, start_x + 1]
-
-    return output
-
-
-@njit(parallel=True, fastmath=True)
-def deform(
-    input, offset, output, stride, pad, dilation, groups, deformable_groups
-):
-    k_height, k_width, _, out_height, out_width = offset.shape
-    channels, _, _ = input.shape
-
-    k_h_m = (k_height - 1) // 2
-    k_w_m = (k_width - 1) // 2
-
-    for ckhkw in prange(channels * k_height * k_width):
-        for h in prange(out_height):
-            for w in prange(out_width):
-                c = ckhkw // (k_height * k_width)
-                khkw = ckhkw % (k_height * k_width)
-                kh = khkw // k_width
-                kw = khkw % k_width
-
-                offset_y = (
-                    offset[kh, kw, 1, h, w]
-                    + h * stride[0]
-                    + (kh - k_h_m) * dilation[0]
-                    - (pad[0] - k_h_m)
-                )
-                offset_x = (
-                    offset[kh, kw, 0, h, w]
-                    + w * stride[1]
-                    + (kw - k_w_m) * dilation[1]
-                    - (pad[1] - k_w_m)
-                )
-
-                output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
-
-
-# @njit(parallel=True, fastmath=True)
-def deformable_convolution_b1(
-    input,
-    output,
-    offset,
-    weights,
-    bias,
-    tmp,
-    stride,
-    pad,
-    dilation,
-    groups,
-    deformable_groups,
-):
-    out_channels, height, width = output.shape
-    _, in_channels, k_height, k_width = weights.shape
-
-    deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
-
-    tmp = tmp.reshape((in_channels * k_height * k_width, height * width))
-
-    _weights = weights.reshape((out_channels, in_channels * k_height * k_width))
-    _output = output.reshape((out_channels, height * width))
-    np.dot(_weights, tmp, _output)
-
-    _bias = bias.reshape((out_channels, 1))
-    _output[:] = _output + _bias
-
-
-# @njit(parallel=True)
-def deformable_convolution(
-    input,
-    output,
-    offset,
-    weights,
-    bias,
-    tmp,
-    stride_y,
-    stride_x,
-    pad_y,
-    pad_x,
-    dilation_y,
-    dilation_x,
-    groups,
-    deformable_groups,
-):
-    batch, _, _, _ = input.shape
-    for b in range(batch):
-        deformable_convolution_b1(
-            input[b],
-            output[b],
-            offset,
-            weights,
-            bias,
-            tmp,
-            (stride_y, stride_x),
-            (pad_y, pad_x),
-            (dilation_y, dilation_x),
-            groups,
-            deformable_groups,
-        )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index e874e715..d4284907 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -1,3 +1,5 @@
+cmake_minimum_required(VERSION 3.23)
+
 set(py_module_name _deformable_convolution_sycl)
 pybind11_add_module(${py_module_name}
     MODULE
@@ -5,15 +7,10 @@ pybind11_add_module(${py_module_name}
 )
 find_package(MKL CONFIG REQUIRED)
 
-# target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $)
 target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
 target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
 target_link_libraries(${py_module_name} PUBLIC $<LINK_ONLY:MKL::MKL> ${MKL_SYCL})
 
-message(STATUS "${MKL_IMPORTED_TARGETS}")
-message(STATUS "${INTERFACE_COMPILE_OPTIONS}")
-message(STATUS "${INTERFACE_INCLUDE_DIRECTORIES}")
-
 file(RELATIVE_PATH py_module_dest ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 install(TARGETS ${py_module_name}
   DESTINATION ${py_module_dest}/deformable_convolution_sycl
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
index 005155bd..eedd9799 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -119,15 +119,11 @@ inline auto deform_input(cl::sycl::queue &queue,
                                     out_width, kh, kw, 0, h, w) +
                         w * stride_x + (kw - k_w_m) * dilation_x -
                         (pad_x - k_w_m);
-        // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y -
-        // k_h_m); auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x
-        // - k_w_m);
 
         auto _input =
             get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0);
 
         _output[w] = bilinear(_input, in_height, in_width, offset_y, offset_x);
-        // _output[w] = offset_y;
     });
 }
 
@@ -197,11 +193,11 @@ void deformable_convolution_b1_impl(cl::sycl::queue &queue,
     auto efill = output_fill_with_bias(queue, output, out_shape, bias);
     auto egemm = gemm(queue, transpose::N, transpose::N, /*transpose a, b*/
                       out_c, out_h * out_w, in_c * ker_h * ker_w, /*m, n, k*/
-                      1,                                          /*alpha*/
-                      weights, in_c * ker_h * ker_w,              /*a, lda*/
-                      tmp, out_h * out_w,                         /*b, ldb*/
-                      1,                                          /*beta*/
-                      output, out_h * out_w,                      /*c, ldc*/
+                      1, /*alpha*/
+                      weights, in_c * ker_h * ker_w, /*a, lda*/
+                      tmp, out_h * out_w, /*b, ldb*/
+                      1, /*beta*/
+                      output, out_h * out_w, /*c, ldc*/
                       {edeform, efill} /*events*/);
     egemm.wait();
 }
diff --git a/dpbench/configs/bench_info/deformable_convolution.json b/dpbench/configs/bench_info/deformable_convolution.json
deleted file mode 100644
index 8fabbb57..00000000
--- a/dpbench/configs/bench_info/deformable_convolution.json
+++ /dev/null
@@ -1,135 +0,0 @@
-{
-    "benchmark": {
-        "name": "Deformable convolution",
-        "short_name": "deformable-convolution",
-        "relative_path": "deformable_convolution",
-        "module_name": "deformable_convolution",
-        "func_name": "deformable_convolution",
-        "kind": "microbenchmark",
-        "domain": "Deep learning",
-        "parameters": {
-            "S": {
-                "batch":1,
-                "in_channels":16,
-                "in_height":32,
-                "in_width":32,
-                "out_channels":16,
-                "out_height":32,
-                "out_width":32,
-                "kernel_height":3,
-                "kernel_width":3,
-                "stride_y":1,
-                "stride_x":1,
-                "dilation_y":1,
-                "dilation_x":1,
-                "pad_y":1,
-                "pad_x":1,
-                "groups":1,
-                "deformable_groups":1,
-                "dtype":"float32",
-                "seed": 7777777
-            },
-            "M": {
-                "batch":1,
-                "in_channels":64,
-                "in_height":64,
-                "in_width":64,
-                "out_channels":64,
-                "out_height":64,
-                "out_width":64,
-                "kernel_height":3,
-                "kernel_width":3,
-                "stride_y":1,
-                "stride_x":1,
-                "dilation_y":1,
-                "dilation_x":1,
-                "pad_y":1,
-                "pad_x":1,
-                "groups":1,
-                "deformable_groups":1,
-                "dtype":"float32",
-                "seed": 7777777
-            },
-            "L": {
-                "batch":2,
-                "in_channels":64,
-                "in_height":128,
-                "in_width":128,
-                "out_channels":128,
-                "out_height":128,
-                "out_width":128,
-                "kernel_height":3,
-                "kernel_width":3,
-                "stride_y":1,
-                "stride_x":1,
-                "dilation_y":1,
-                "dilation_x":1,
-                "pad_y":1,
-                "pad_x":1,
-                "groups":1,
-                "deformable_groups":1,
-                "dtype":"float32",
-                "seed": 7777777
-            }
-        },
-        "init": {
-            "func_name": "initialize",
-            "input_args": [
-                "batch",
-                "in_channels",
-                "in_height",
-                "in_width",
-                "out_channels",
-                "out_height",
-                "out_width",
-                "kernel_height",
-                "kernel_width",
-                "stride_y",
-                "stride_x",
-                "dilation_y",
-                "dilation_x",
-                "pad_y",
-                "pad_x",
-                "groups",
-                "deformable_groups",
-                "dtype",
-                "seed"
-            ],
-            "output_args": [
-                "input",
-                "output",
-                "offset",
-                "weights",
-                "bias",
-                "tmp"
-            ]
-        },
-        "input_args": [
-            "input",
-            "output",
-            "offset",
-            "weights",
-            "bias",
-            "tmp",
-            "stride_y",
-            "stride_x",
-            "dilation_y",
-            "dilation_x",
-            "pad_y",
-            "pad_x",
-            "groups",
-            "deformable_groups"
-        ],
-        "array_args": [
-            "input",
-            "output",
-            "offset",
-            "weights",
-            "bias",
-            "tmp"
-        ],
-        "output_args": [
-            "output"
-        ]
-    }
-}

From 7d39e0ef2e76b5a6a8f01bca724708f880acf6c9 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Thu, 1 Jun 2023 11:01:10 +0200
Subject: [PATCH 04/20] Small fixes

---
 .../deformable_convolution/__init__.py        |  37 +----
 .../deformable_convolution_numba_mlir_p.py    | 142 ++++++++++++++++++
 dpbench/config/reader.py                      |   2 +-
 3 files changed, 144 insertions(+), 37 deletions(-)
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py

diff --git a/dpbench/benchmarks/deformable_convolution/__init__.py b/dpbench/benchmarks/deformable_convolution/__init__.py
index a5db8b1d..a94a9e6b 100644
--- a/dpbench/benchmarks/deformable_convolution/__init__.py
+++ b/dpbench/benchmarks/deformable_convolution/__init__.py
@@ -2,40 +2,5 @@
 #
 # SPDX-License-Identifier: Apache 2.0
 
-from .deformable_convolution_initialize import initialize
-from .deformable_convolution_numba_npr import (
-    deformable_convolution as deformable_convolution_numba_npr,
-)
-from .deformable_convolution_numpy import (
-    deformable_convolution as deformable_convolution_numpy,
-)
-from .deformable_convolution_sycl_native_ext import deformable_convolution_sycl
-
-__all__ = [
-    "initialize",
-    "deformable_convolution_numba_npr",
-    "deformable_convolution_numba_numpy",
-    "deformable_convolution_sycl",
-]
-
-"""l2-norm calculation of n vectors
-
-Input
----------
-npoints: int
-    number of vectors
-dims: int
-    dimension of single vector
-seed: int
-    random seed to generate random number
-
-Output
--------
-d: array
-    l2 norm of each vector
-
-Method
-------
-    ||Vj||2=sqrt(sum(Xj[i]*Xj[i]))
-    here i is 0->dims, j is 0->npoints
+"""Deformable convolution
 """
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
new file mode 100644
index 00000000..76d354d0
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
@@ -0,0 +1,142 @@
+# Copyright 2022 Intel Corp.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import math
+
+import numpy as np
+from numba import prange
+from numba_mlir import njit
+
+
+@njit(parallel=True, inline="always", fastmath=True)
+def bilinear(input, offset_y, offset_x):
+    height, width = input.shape
+    start_x = int(math.floor(offset_x))
+    start_x_weight = 1 - (offset_x - start_x)
+    start_y = int(math.floor(offset_y))
+    start_y_weight = 1 - (offset_y - start_y)
+
+    output = 0
+    if (
+        offset_x >= width
+        or offset_y >= height
+        or offset_x <= -1
+        or offset_y <= -1
+    ):
+        return output
+
+    if start_x >= 0 and start_y >= 0:
+        w = start_x_weight * start_y_weight
+        output += w * input[start_y, start_x]
+
+    if start_x + 1 < width and start_y >= 0:
+        w = (1 - start_x_weight) * start_y_weight
+        output += w * input[start_y, start_x + 1]
+
+    if start_x >= 0 and start_y + 1 < height:
+        w = start_x_weight * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x]
+
+    if start_x + 1 < width and start_y + 1 < height:
+        w = (1 - start_x_weight) * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x + 1]
+
+    return output
+
+
+@njit(parallel=True, fastmath=True)
+def deform(
+    input, offset, output, stride, pad, dilation, groups, deformable_groups
+):
+    k_height, k_width, _, out_height, out_width = offset.shape
+    channels, _, _ = input.shape
+
+    k_h_m = (k_height - 1) // 2
+    k_w_m = (k_width - 1) // 2
+
+    for ckhkw in prange(channels * k_height * k_width):
+        for h in prange(out_height):
+            for w in prange(out_width):
+                c = ckhkw // (k_height * k_width)
+                khkw = ckhkw % (k_height * k_width)
+                kh = khkw // k_width
+                kw = khkw % k_width
+
+                offset_y = (
+                    offset[kh, kw, 1, h, w]
+                    + h * stride[0]
+                    + (kh - k_h_m) * dilation[0]
+                    - (pad[0] - k_h_m)
+                )
+                offset_x = (
+                    offset[kh, kw, 0, h, w]
+                    + w * stride[1]
+                    + (kw - k_w_m) * dilation[1]
+                    - (pad[1] - k_w_m)
+                )
+
+                output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
+
+
+@njit(parallel=True, fastmath=True)
+def deformable_convolution_b1(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride,
+    pad,
+    dilation,
+    groups,
+    deformable_groups,
+):
+    out_channels, height, width = output.shape
+    _, in_channels, k_height, k_width = weights.shape
+
+    deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
+
+    tmp = tmp.reshape((in_channels * k_height * k_width, height * width))
+
+    _weights = weights.reshape((out_channels, in_channels * k_height * k_width))
+    _output = output.reshape((out_channels, height * width))
+    np.dot(_weights, tmp, _output)
+
+    _bias = bias.reshape((out_channels, 1))
+    _output[:] = _output + _bias
+
+
+@njit(parallel=True)
+def deformable_convolution(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride_y,
+    stride_x,
+    pad_y,
+    pad_x,
+    dilation_y,
+    dilation_x,
+    groups,
+    deformable_groups,
+):
+    batch, _, _, _ = input.shape
+    for b in range(batch):
+        deformable_convolution_b1(
+            input[b],
+            output[b],
+            offset,
+            weights,
+            bias,
+            tmp,
+            (stride_y, stride_x),
+            (pad_y, pad_x),
+            (dilation_y, dilation_x),
+            groups,
+            deformable_groups,
+        )
diff --git a/dpbench/config/reader.py b/dpbench/config/reader.py
index bc549653..58b5edf8 100644
--- a/dpbench/config/reader.py
+++ b/dpbench/config/reader.py
@@ -19,7 +19,7 @@
 from .implementation_postfix import Implementation
 from .module import Module
 
-_REFERENCE_IMPLEMENTATIONS = {"numpy", "python"}
+_REFERENCE_IMPLEMENTATIONS = {"numpy", "python", "sycl"}
 
 
 def read_configs(  # noqa: C901: TODO: move modules into config

From 0de01b1fd3ec3811a6d987815ea02f73556c0671 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Fri, 9 Jun 2023 20:35:44 +0200
Subject: [PATCH 05/20] Add config & pre-commit

---
 .../deformable_convolution/CMakeLists.txt     |   4 +
 .../CMakeLists.txt                            |   4 +
 .../deformable_convolution_sycl/impl.cpp      |  10 +-
 .../bench_info/deformable_convolution.toml    | 136 ++++++++++++++++++
 4 files changed, 149 insertions(+), 5 deletions(-)
 create mode 100644 dpbench/configs/bench_info/deformable_convolution.toml

diff --git a/dpbench/benchmarks/deformable_convolution/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt
index d06d5c73..820b345a 100644
--- a/dpbench/benchmarks/deformable_convolution/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt
@@ -1 +1,5 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
 add_subdirectory(deformable_convolution_sycl_native_ext)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index d4284907..86cd122f 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
 cmake_minimum_required(VERSION 3.23)
 
 set(py_module_name _deformable_convolution_sycl)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
index eedd9799..211c0b4b 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -193,11 +193,11 @@ void deformable_convolution_b1_impl(cl::sycl::queue &queue,
     auto efill = output_fill_with_bias(queue, output, out_shape, bias);
     auto egemm = gemm(queue, transpose::N, transpose::N, /*transpose a, b*/
                       out_c, out_h * out_w, in_c * ker_h * ker_w, /*m, n, k*/
-                      1, /*alpha*/
-                      weights, in_c * ker_h * ker_w, /*a, lda*/
-                      tmp, out_h * out_w, /*b, ldb*/
-                      1, /*beta*/
-                      output, out_h * out_w, /*c, ldc*/
+                      1,                                          /*alpha*/
+                      weights, in_c * ker_h * ker_w,              /*a, lda*/
+                      tmp, out_h * out_w,                         /*b, ldb*/
+                      1,                                          /*beta*/
+                      output, out_h * out_w,                      /*c, ldc*/
                       {edeform, efill} /*events*/);
     egemm.wait();
 }
diff --git a/dpbench/configs/bench_info/deformable_convolution.toml b/dpbench/configs/bench_info/deformable_convolution.toml
new file mode 100644
index 00000000..fd339b1a
--- /dev/null
+++ b/dpbench/configs/bench_info/deformable_convolution.toml
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+[benchmark]
+name = "Deformable convolution"
+short_name = "deformable-convolution"
+relative_path = "deformable_convolution"
+module_name = "deformable_convolution"
+func_name = "deformable_convolution"
+kind = "microbenchmark"
+domain = "Deep learning"
+input_args = [
+    "input",
+    "output",
+    "offset",
+    "weights",
+    "bias",
+    "tmp",
+    "stride_y",
+    "stride_x",
+    "dilation_y",
+    "dilation_x",
+    "pad_y",
+    "pad_x",
+    "groups",
+    "deformable_groups"
+]
+array_args = [
+    "input",
+    "output",
+    "offset",
+    "weights",
+    "bias",
+    "tmp"
+]
+output_args = [
+    "output"
+]
+
+[benchmark.parameters.S]
+batch = 1
+in_channels = 16
+in_height = 32
+in_width = 32
+out_channels = 16
+out_height = 32
+out_width = 32
+kernel_height = 3
+kernel_width = 3
+stride_y = 1
+stride_x = 1
+dilation_y = 1
+dilation_x = 1
+pad_y = 1
+pad_x = 1
+groups = 1
+deformable_groups = 1
+dtype = "float32"
+seed =  7777777
+
+[benchmark.parameters.M]
+batch = 1
+in_channels = 64
+in_height = 64
+in_width = 64
+out_channels = 64
+out_height = 64
+out_width = 64
+kernel_height = 3
+kernel_width = 3
+stride_y = 1
+stride_x = 1
+dilation_y = 1
+dilation_x = 1
+pad_y = 1
+pad_x = 1
+groups = 1
+deformable_groups = 1
+dtype = "float32"
+seed =  7777777
+
+[benchmark.parameters.L]
+batch = 2
+in_channels = 64
+in_height = 128
+in_width = 128
+out_channels = 128
+out_height = 128
+out_width = 128
+kernel_height = 3
+kernel_width = 3
+stride_y = 1
+stride_x = 1
+dilation_y = 1
+dilation_x = 1
+pad_y = 1
+pad_x = 1
+groups = 1
+deformable_groups = 1
+dtype = "float32"
+seed =  7777777
+
+[benchmark.init]
+func_name = "initialize"
+types_dict_name="types_dict"
+precision="single"
+input_args =  [
+    "batch",
+    "in_channels",
+    "in_height",
+    "in_width",
+    "out_channels",
+    "out_height",
+    "out_width",
+    "kernel_height",
+    "kernel_width",
+    "stride_y",
+    "stride_x",
+    "dilation_y",
+    "dilation_x",
+    "pad_y",
+    "pad_x",
+    "groups",
+    "deformable_groups",
+    "dtype",
+    "seed"
+]
+output_args = [
+    "input",
+    "output",
+    "offset",
+    "weights",
+    "bias",
+    "tmp"
+]

From 0e1b858ca274a07883bf0debb1b23958b9097750 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Mon, 19 Jun 2023 23:48:00 +0200
Subject: [PATCH 06/20] Fix mlir_p version & result

---
 .../deformable_convolution_numba_mlir_p.py            | 11 +++++------
 .../deformable_convolution_sycl/impl.cpp              |  2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
index 76d354d0..55e7246b 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
@@ -9,7 +9,7 @@
 from numba_mlir import njit
 
 
-@njit(parallel=True, inline="always", fastmath=True)
+@njit(parallel=True, inline="always", fastmath=True, gpu_fp64_truncate="auto")
 def bilinear(input, offset_y, offset_x):
     height, width = input.shape
     start_x = int(math.floor(offset_x))
@@ -42,10 +42,10 @@ def bilinear(input, offset_y, offset_x):
         w = (1 - start_x_weight) * (1 - start_y_weight)
         output += w * input[start_y + 1, start_x + 1]
 
-    return output
+    return output/2
 
 
-@njit(parallel=True, fastmath=True)
+@njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def deform(
     input, offset, output, stride, pad, dilation, groups, deformable_groups
 ):
@@ -54,7 +54,6 @@ def deform(
 
     k_h_m = (k_height - 1) // 2
     k_w_m = (k_width - 1) // 2
-
     for ckhkw in prange(channels * k_height * k_width):
         for h in prange(out_height):
             for w in prange(out_width):
@@ -79,7 +78,7 @@ def deform(
                 output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
 
 
-@njit(parallel=True, fastmath=True)
+@njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
 def deformable_convolution_b1(
     input,
     output,
@@ -108,7 +107,7 @@ def deformable_convolution_b1(
     _output[:] = _output + _bias
 
 
-@njit(parallel=True)
+@njit(parallel=True, gpu_fp64_truncate="auto")
 def deformable_convolution(
     input,
     output,
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
index 211c0b4b..ad4e9ef2 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -62,7 +62,7 @@ __attribute__((always_inline)) DataType bilinear(const DataType *input,
         result += w3 * v3;
     }
 
-    return result;
+    return result/2;
 }
 
 class deform;

From ff457c058c5815b665e0165b82178e79ac18309f Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sat, 24 Jun 2023 18:14:56 +0200
Subject: [PATCH 07/20] Add TBBConfig.cmake

---
 .../deformable_convolution_numba_mlir_p.py    |   2 +-
 .../CMakeLists.txt                            |   1 +
 .../cmake/TBBConfig.cmake                     | 193 ++++++++++++++++++
 .../deformable_convolution_sycl/impl.cpp      |   2 +-
 4 files changed, 196 insertions(+), 2 deletions(-)
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
index 55e7246b..b99ff53b 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
@@ -42,7 +42,7 @@ def bilinear(input, offset_y, offset_x):
         w = (1 - start_x_weight) * (1 - start_y_weight)
         output += w * input[start_y + 1, start_x + 1]
 
-    return output/2
+    return output / 2
 
 
 @njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index 86cd122f..f7efe986 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -9,6 +9,7 @@ pybind11_add_module(${py_module_name}
     MODULE
     deformable_convolution_sycl/impl.cpp
 )
+find_package(TBB REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 find_package(MKL CONFIG REQUIRED)
 
 target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake
new file mode 100644
index 00000000..5363c1d3
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake
@@ -0,0 +1,193 @@
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# It defines the following variables:
+#     TBB_<component>_FOUND
+#     TBB_IMPORTED_TARGETS
+#
+# TBBConfigVersion.cmake defines TBB_VERSION
+#
+# Initialize to default values
+if (NOT TBB_IMPORTED_TARGETS)
+    set(TBB_IMPORTED_TARGETS "")
+endif()
+
+if (NOT TBB_FIND_COMPONENTS)
+    set(TBB_FIND_COMPONENTS "tbb;tbbmalloc;tbbmalloc_proxy")
+    foreach (_tbb_component ${TBB_FIND_COMPONENTS})
+        set(TBB_FIND_REQUIRED_${_tbb_component} 1)
+    endforeach()
+endif()
+
+get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
+get_filename_component(_tbb_root "${_tbb_root}/../../.." ABSOLUTE)
+
+set(TBB_INTERFACE_VERSION )
+
+set(_tbb_bin_version 12)
+set(_tbbmalloc_bin_version 2)
+set(_tbbmalloc_proxy_bin_version 2)
+set(_tbbbind_bin_version 3)
+
+# Add components with internal dependencies: tbbmalloc_proxy -> tbbmalloc
+list(FIND TBB_FIND_COMPONENTS tbbmalloc_proxy _tbbmalloc_proxy_ix)
+if (NOT _tbbmalloc_proxy_ix EQUAL -1)
+    list(APPEND TBB_FIND_COMPONENTS tbbmalloc)
+    list(REMOVE_DUPLICATES TBB_FIND_COMPONENTS)
+    set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy})
+endif()
+unset(_tbbmalloc_proxy_ix)
+
+if (CMAKE_SIZEOF_VOID_P STREQUAL "8")
+    set(_tbb_subdir intel64/gcc4.8)
+else ()
+    set(_tbb_subdir ia32/gcc4.8)
+endif()
+
+if (UNIX)
+  set(_tbb_lib_ext ".so")
+  set(_tbb_lib_prefix "lib")
+  set(_tbb_lib_dir_conda "lib")
+  set(_bin_version "")
+elseif (WIN32)
+  set(_bin_version "")
+  set(_tbb_lib_prefix "")
+  set(_tbb_lib_ext ".dll")
+  set(_tbb_impllib_ext ".lib")
+  set(_tbb_lib_dir_conda "bin")
+  set(_tbb_impllib_dir_conda "lib")
+else()
+    message(FATAL_ERROR "Unsupported platform. Only Unix and Windows are supported.")
+endif()
+
+foreach (_tbb_component ${TBB_FIND_COMPONENTS})
+    set(TBB_${_tbb_component}_FOUND 0)
+
+if(WIN32)
+    unset(_bin_version)
+    if (_tbb_component STREQUAL tbb)
+        set(_bin_version ${_tbb_bin_version})
+    endif()
+endif()
+
+    if(UNIX)
+       find_library(_tbb_release_lib
+                    NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_lib_ext}
+                    PATHS ${_tbb_root}
+                    HINTS ENV TBB_ROOT_HINT
+                    PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}")
+
+    else()
+       find_file(_tbb_release_lib
+                 NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_lib_ext}
+                 PATHS ${_tbb_root}
+                 HINTS ENV TBB_ROOT_HINT
+                 PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}")
+
+       if (EXISTS "${_tbb_release_lib}")
+          find_library(_tbb_release_impllib
+                       NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_impllib_ext}
+                       PATHS ${_tbb_root}
+                       HINTS ENV TBB_ROOT_HINT
+                       PATH_SUFFIXES "${_tbb_impllib_dir_conda}" "lib/${_tbb_subdir}")
+       endif()
+    endif()
+
+    if (NOT TBB_FIND_RELEASE_ONLY)
+        find_library(_tbb_debug_lib
+                     NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}_debug.${_tbb_lib_ext}
+                     PATHS ${_tbb_root}
+                     HINTS ENV TBB_ROOT_HINT
+                     PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}")
+        if(WIN32  AND EXISTS "${_tbb_debug_lib}")
+           find_library(_tbb_debug_impllib
+                        NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}_debug.${_tbb_impllib_ext}
+                        PATHS ${_tbb_root}
+                        HINTS ENV TBB_ROOT_HINT
+                        PATH_SUFFIXES "${_tbb_impllib_dir_conda}" "lib/${_tbb_subdir}")
+        endif()
+    endif()
+
+    if (EXISTS "${_tbb_release_lib}" OR EXISTS "${_tbb_debug_lib}")
+        if (NOT TARGET TBB::${_tbb_component})
+            add_library(TBB::${_tbb_component} SHARED IMPORTED)
+
+	    find_path(_tbb_include_dir
+	      oneapi/tbb.h
+	      PATHS ${_tbb_root}
+	      PATH_SUFFIXES include
+	      HINTS ENV TBB_ROOT_HINT
+	      )
+
+if(WIN32)
+            set_target_properties(
+                TBB::${_tbb_component} PROPERTIES
+                INTERFACE_INCLUDE_DIRECTORIES "${_tbb_include_dir}"
+                INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1"
+                )
+else()
+            set_target_properties(
+                TBB::${_tbb_component} PROPERTIES
+                INTERFACE_INCLUDE_DIRECTORIES "${_tbb_include_dir}"
+                )
+endif()
+            unset(_tbb_current_realpath)
+            unset(_tbb_include_dir)
+
+            if (EXISTS "${_tbb_release_lib}")
+if(WIN32)
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}"
+                                      IMPORTED_IMPLIB_RELEASE "${_tbb_release_impllib}")
+else()
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}")
+endif()
+                set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+            endif()
+
+            if (EXISTS "${_tbb_debug_lib}")
+if(WIN32)
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}"
+                                      IMPORTED_IMPLIB_DEBUG "${_tbb_debug_impllib}"
+                )
+else()
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}")
+endif()
+                set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)
+            endif()
+
+            # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc
+            if (_tbb_component STREQUAL tbbmalloc_proxy)
+                set_target_properties(TBB::tbbmalloc_proxy PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbbmalloc)
+            endif()
+        endif()
+        list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component})
+        set(TBB_${_tbb_component}_FOUND 1)
+    elseif (TBB_FIND_REQUIRED AND TBB_FIND_REQUIRED_${_tbb_component})
+        message(STATUS "Missed required oneTBB component: ${_tbb_component}")
+        if (TBB_FIND_RELEASE_ONLY)
+            message(STATUS "  ${_tbb_release_lib} must exist.")
+        else()
+            message(STATUS "  one or both of:\n   ${_tbb_release_lib}\n    ${_tbb_debug_lib}\n   files must exist.")
+        endif()
+        set(TBB_FOUND FALSE)
+    endif()
+endforeach()
+list(REMOVE_DUPLICATES TBB_IMPORTED_TARGETS)
+unset(_tbb_release_lib)
+unset(_tbb_debug_lib)
+unset(_tbb_root)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
index ad4e9ef2..12ac11de 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -62,7 +62,7 @@ __attribute__((always_inline)) DataType bilinear(const DataType *input,
         result += w3 * v3;
     }
 
-    return result/2;
+    return result / 2;
 }
 
 class deform;

From 95fa4a90234b4b319e184d0908d832f1b21ba978 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sat, 24 Jun 2023 18:26:09 +0200
Subject: [PATCH 08/20] Fix TBB discovery

---
 .../deformable_convolution_sycl_native_ext/CMakeLists.txt     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index f7efe986..701a07ab 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -9,7 +9,9 @@ pybind11_add_module(${py_module_name}
     MODULE
     deformable_convolution_sycl/impl.cpp
 )
-find_package(TBB REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
+
+message(${CMAKE_CURRENT_SOURCE_DIR})
+find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 find_package(MKL CONFIG REQUIRED)
 
 target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)

From 00a0944dccc589793aef92273c4d70735e258051 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sun, 25 Jun 2023 00:03:31 +0200
Subject: [PATCH 09/20] Add tbb and mkl to the environment

---
 dpbench/config/reader.py                               | 2 +-
 dpbench/configs/bench_info/deformable_convolution.toml | 1 +
 environments/conda-linux-sycl.yml                      | 2 ++
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/dpbench/config/reader.py b/dpbench/config/reader.py
index 58b5edf8..bc549653 100644
--- a/dpbench/config/reader.py
+++ b/dpbench/config/reader.py
@@ -19,7 +19,7 @@
 from .implementation_postfix import Implementation
 from .module import Module
 
-_REFERENCE_IMPLEMENTATIONS = {"numpy", "python", "sycl"}
+_REFERENCE_IMPLEMENTATIONS = {"numpy", "python"}
 
 
 def read_configs(  # noqa: C901: TODO: move modules into config
diff --git a/dpbench/configs/bench_info/deformable_convolution.toml b/dpbench/configs/bench_info/deformable_convolution.toml
index fd339b1a..4c22313c 100644
--- a/dpbench/configs/bench_info/deformable_convolution.toml
+++ b/dpbench/configs/bench_info/deformable_convolution.toml
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 [benchmark]
+reference_implementation_postfix = "sycl"
 name = "Deformable convolution"
 short_name = "deformable-convolution"
 relative_path = "deformable_convolution"
diff --git a/environments/conda-linux-sycl.yml b/environments/conda-linux-sycl.yml
index 8d34ee73..b77a2348 100644
--- a/environments/conda-linux-sycl.yml
+++ b/environments/conda-linux-sycl.yml
@@ -34,3 +34,5 @@ dependencies:
   - libgcc-ng
   - libstdcxx-ng
   - libgomp
+  - tbb-devel >= 2021.6.0
+  - mkl-devel-dpcpp>=2023.0.0
\ No newline at end of file

From 46d90dc3a532a6fc8e731406150a5e3e08872094 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sun, 25 Jun 2023 00:36:57 +0200
Subject: [PATCH 10/20] pre-commit

---
 environments/conda-linux-sycl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/conda-linux-sycl.yml b/environments/conda-linux-sycl.yml
index b77a2348..7fe49532 100644
--- a/environments/conda-linux-sycl.yml
+++ b/environments/conda-linux-sycl.yml
@@ -35,4 +35,4 @@ dependencies:
   - libstdcxx-ng
   - libgomp
   - tbb-devel >= 2021.6.0
-  - mkl-devel-dpcpp>=2023.0.0
\ No newline at end of file
+  - mkl-devel-dpcpp>=2023.0.0

From 5badbc47da700112237c643b597791dfc13561b6 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sun, 25 Jun 2023 01:09:52 +0200
Subject: [PATCH 11/20] Add MKLConfig.cmake

---
 .../CMakeLists.txt                            |   2 +-
 .../cmake/MKLConfig.cmake                     | 851 ++++++++++++++++++
 2 files changed, 852 insertions(+), 1 deletion(-)
 create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index 701a07ab..bf233286 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -12,7 +12,7 @@ pybind11_add_module(${py_module_name}
 
 message(${CMAKE_CURRENT_SOURCE_DIR})
 find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
-find_package(MKL CONFIG REQUIRED)
+find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 
 target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
 target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake
new file mode 100644
index 00000000..07290e8e
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake
@@ -0,0 +1,851 @@
+#===============================================================================
+# Copyright 2021-2022 Intel Corporation.
+#
+# This software and the related documents are Intel copyrighted  materials,  and
+# your use of  them is  governed by the  express license  under which  they were
+# provided to you (License).  Unless the License provides otherwise, you may not
+# use, modify, copy, publish, distribute,  disclose or transmit this software or
+# the related documents without Intel's prior written permission.
+#
+# This software and the related documents  are provided as  is,  with no express
+# or implied  warranties,  other  than those  that are  expressly stated  in the
+# License.
+#===============================================================================
+
+#===================================================================
+# CMake Config file for Intel(R) oneAPI Math Kernel Library (oneMKL)
+#===================================================================
+
+#===============================================================================
+# Input parameters
+#=================
+#-------------
+# Main options
+#-------------
+# MKL_ROOT: oneMKL root directory (May be required for non-standard install locations. Optional otherwise.)
+#    Default: use location from MKLROOT environment variable or <Full path to this file>/../../../ if MKLROOT is not defined
+# MKL_ARCH
+#    Values:  ia32 intel64
+#    Default: intel64
+# MKL_LINK
+#    Values:  static, dynamic, sdl
+#    Default: dynamic
+#       Exceptions:- DPC++ doesn't support sdl
+# MKL_THREADING
+#    Values:  sequential,
+#             intel_thread (Intel OpenMP),
+#             gnu_thread (GNU OpenMP),
+#             pgi_thread (PGI OpenMP),
+#             tbb_thread
+#    Default: intel_thread
+#       Exceptions:- DPC++ defaults to tbb, PGI compiler on Windows defaults to pgi_thread
+# MKL_INTERFACE (for MKL_ARCH=intel64 only)
+#    Values:  lp64, ilp64
+#       GNU or INTEL interface will be selected based on Compiler.
+#    Default: ilp64
+# MKL_MPI
+#    Values:  intelmpi, mpich, openmpi, msmpi, mshpc
+#    Default: intelmpi
+#-----------------------------------
+# Special options (OFF by default)
+#-----------------------------------
+# ENABLE_BLAS95:      Enables BLAS Fortran95 API
+# ENABLE_LAPACK95:    Enables LAPACK Fortran95 API
+# ENABLE_BLACS:       Enables cluster BLAS library
+# ENABLE_CDFT:        Enables cluster DFT library
+# ENABLE_CPARDISO:    Enables cluster PARDISO functionality
+# ENABLE_SCALAPACK:   Enables cluster LAPACK library
+# ENABLE_OMP_OFFLOAD: Enables OpenMP Offload functionality
+#
+#==================
+# Output parameters
+#==================
+# MKL_ROOT
+#     oneMKL root directory.
+# MKL_INCLUDE
+#     Use of target_include_directories() is recommended.
+#     INTERFACE_INCLUDE_DIRECTORIES property is set on mkl_core and mkl_rt libraries.
+#     Alternatively, this variable can be used directly (not recommended as per Modern CMake)
+# MKL_ENV
+#     Provides all environment variables based on input parameters.
+#     Currently useful for mkl_rt linking and BLACS on Windows.
+#     Must be set as an ENVIRONMENT property.
+# Example:
+#     add_test(NAME mytest COMMAND myexe)
+#     if(MKL_ENV)
+#       set_tests_properties(mytest PROPERTIES ENVIRONMENT "${MKL_ENV}")
+#     endif()
+#
+# MKL::<library name>
+#     IMPORTED targets to link MKL libraries individually or when using a custom link-line.
+#     mkl_core and mkl_rt have INTERFACE_* properties set to them.
+#     Please refer to Intel(R) oneMKL Link Line Advisor for help with linking.
+#
+# Below INTERFACE targets provide full link-lines for direct use.
+# Example:
+#     target_link_options(<my_linkable_target> PUBLIC $<LINK_ONLY:MKL::MKL>)
+#
+# MKL::MKL
+#     Link line for C and Fortran API
+# MKL::MKL_DPCPP
+#     Link line for DPC++ API
+#
+# Note: For Device API, library linking is not required.
+#       Compile options can be added from the INTERFACE_COMPILE_OPTIONS property on MKL::MKL_DPCPP
+#       Include directories can be added from the INTERFACE_INCLUDE_DIRECTORIES property on MKL::MKL_DPCPP
+#
+# Note: Output parameters' and targets' availability can change
+# based on Input parameters and application project languages.
+#===============================================================================
+
+function(mkl_message MSG_MODE MSG_TEXT)
+  if(MSG_MODE STREQUAL "FATAL_ERROR")
+    message(${MSG_MODE} ${MSG_TEXT})
+  else()
+    if(NOT MKL_FIND_QUIETLY)
+      message(${MSG_MODE} ${MSG_TEXT})
+    endif()
+  endif()
+endfunction()
+
+if(${CMAKE_VERSION} VERSION_LESS "3.13")
+  mkl_message(FATAL_ERROR "The minimum supported CMake version is 3.13. You are running version ${CMAKE_VERSION}")
+endif()
+
+include_guard()
+include(FindPackageHandleStandardArgs)
+
+if(NOT MKL_LIBRARIES)
+
+# Set CMake policies for well-defined behavior across CMake versions
+cmake_policy(SET CMP0011 NEW)
+cmake_policy(SET CMP0057 NEW)
+
+# Project Languages
+get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES)
+list(APPEND MKL_LANGS C CXX Fortran)
+foreach(lang ${languages})
+  if(${lang} IN_LIST MKL_LANGS)
+    list(APPEND CURR_LANGS ${lang})
+  endif()
+endforeach()
+list(REMOVE_DUPLICATES CURR_LANGS)
+
+option(ENABLE_BLAS95      "Enables BLAS Fortran95 API"            OFF)
+option(ENABLE_LAPACK95    "Enables LAPACK Fortran95 API"          OFF)
+option(ENABLE_BLACS       "Enables cluster BLAS library"          OFF)
+option(ENABLE_CDFT        "Enables cluster DFT library"           OFF)
+option(ENABLE_CPARDISO    "Enables cluster PARDISO functionality" OFF)
+option(ENABLE_SCALAPACK   "Enables cluster LAPACK library"        OFF)
+option(ENABLE_OMP_OFFLOAD "Enables OpenMP Offload functionality"  OFF)
+
+# Use MPI if any of these are enabled
+if(ENABLE_BLACS OR ENABLE_CDFT OR ENABLE_SCALAPACK OR ENABLE_CPARDISO)
+  set(USE_MPI ON)
+endif()
+
+# Check Parameters
+function(define_param TARGET_PARAM DEFAULT_PARAM SUPPORTED_LIST)
+  if(NOT DEFINED ${TARGET_PARAM} AND NOT DEFINED ${DEFAULT_PARAM})
+    mkl_message(STATUS "${TARGET_PARAM}: Undefined")
+  elseif(NOT DEFINED ${TARGET_PARAM} AND DEFINED ${DEFAULT_PARAM})
+    set(${TARGET_PARAM} "${${DEFAULT_PARAM}}" CACHE STRING "Choose ${TARGET_PARAM} options are: ${${SUPPORTED_LIST}}")
+    foreach(opt ${${DEFAULT_PARAM}})
+      set(STR_LIST "${STR_LIST} ${opt}")
+    endforeach()
+    mkl_message(STATUS "${TARGET_PARAM}: None, set to `${STR_LIST}` by default")
+  elseif(${SUPPORTED_LIST})
+    set(ITEM_FOUND 1)
+    foreach(opt ${${TARGET_PARAM}})
+      if(NOT ${opt} IN_LIST ${SUPPORTED_LIST})
+        set(ITEM_FOUND 0)
+      endif()
+    endforeach()
+    if(ITEM_FOUND EQUAL 0)
+      foreach(opt ${${SUPPORTED_LIST}})
+        set(STR_LIST "${STR_LIST} ${opt}")
+      endforeach()
+      mkl_message(FATAL_ERROR "Invalid ${TARGET_PARAM} `${${TARGET_PARAM}}`, options are: ${STR_LIST}")
+    else()
+      mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}")
+    endif()
+  else()
+    mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}")
+  endif()
+endfunction()
+
+#================
+# Compiler checks
+#================
+
+if(CMAKE_C_COMPILER)
+  get_filename_component(C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME)
+endif()
+if(CMAKE_CXX_COMPILER)
+  get_filename_component(CXX_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME)
+endif()
+if(CMAKE_Fortran_COMPILER)
+  get_filename_component(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME)
+endif()
+
+# Determine Compiler Family
+if(CXX_COMPILER_NAME STREQUAL "dpcpp" OR CXX_COMPILER_NAME STREQUAL "dpcpp.exe"
+    OR CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe")
+  set(DPCPP_COMPILER ON)
+endif()
+if(C_COMPILER_NAME MATCHES "^clang")
+  set(CLANG_COMPILER ON)
+endif()
+if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+  set(PGI_COMPILER ON)
+elseif(CMAKE_C_COMPILER_ID STREQUAL "Intel" OR CMAKE_Fortran_COMPILER_ID STREQUAL "Intel"
+        OR CMAKE_C_COMPILER_ID STREQUAL "IntelLLVM" OR CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+  set(INTEL_COMPILER ON)
+else()
+  if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
+    set(GNU_C_COMPILER ON)
+  endif()
+  if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+    set(GNU_Fortran_COMPILER ON)
+  endif()
+endif()
+
+if(USE_MPI AND (C_COMPILER_NAME MATCHES "^mpi" OR Fortran_COMPILER_NAME MATCHES "^mpi"))
+  set(USE_MPI_SCRIPT ON)
+endif()
+
+#================
+
+#================
+# System-specific
+#================
+
+# Extensions
+if(UNIX)
+  set(LIB_PREFIX "lib")
+  set(LIB_EXT ".a")
+  set(DLL_EXT ".so")
+  if(APPLE)
+    set(DLL_EXT ".dylib")
+  endif()
+  set(LINK_PREFIX "-l")
+  set(LINK_SUFFIX "")
+else()
+  set(LIB_PREFIX "")
+  set(LIB_EXT ".lib")
+  set(DLL_EXT "_dll.lib")
+  set(LINK_PREFIX "")
+  set(LINK_SUFFIX ".lib")
+endif()
+
+# Set target system architecture
+set(DEFAULT_MKL_ARCH intel64)
+if(DPCPP_COMPILER OR PGI_COMPILER OR ENABLE_OMP_OFFLOAD OR USE_MPI)
+  set(MKL_ARCH_LIST intel64)
+else()
+  set(MKL_ARCH_LIST ia32 intel64)
+endif()
+define_param(MKL_ARCH DEFAULT_MKL_ARCH MKL_ARCH_LIST)
+
+#================
+
+#==========
+# Setup MKL
+#==========
+
+# Set MKL_ROOT directory
+if(NOT DEFINED MKL_ROOT)
+  if(DEFINED ENV{MKLROOT})
+    set(MKL_ROOT $ENV{MKLROOT})
+  else()
+    get_filename_component(MKL_CMAKE_PATH "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
+    get_filename_component(MKL_ROOT "${MKL_CMAKE_PATH}/../../../" ABSOLUTE)
+    mkl_message(STATUS "MKL_ROOT ${MKL_ROOT}")
+  endif()
+endif()
+string(REPLACE "\\" "/" MKL_ROOT ${MKL_ROOT})
+
+# Define MKL_LINK
+set(DEFAULT_MKL_LINK dynamic)
+if(DPCPP_COMPILER OR USE_MPI)
+  set(MKL_LINK_LIST static dynamic)
+else()
+  set(MKL_LINK_LIST static dynamic sdl)
+endif()
+define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST)
+
+# Define MKL_INTERFACE
+if(MKL_ARCH STREQUAL "intel64")
+  set(IFACE_TYPE intel)
+  if(GNU_Fortran_COMPILER)
+    set(IFACE_TYPE gf)
+  endif()
+  if(DPCPP_COMPILER)
+    if(MKL_INTERFACE)
+      set(MKL_INTERFACE_FULL intel_${MKL_INTERFACE})
+    endif()
+    set(DEFAULT_MKL_INTERFACE intel_ilp64)
+    set(MKL_INTERFACE_LIST intel_ilp64)
+  else()
+    if(MKL_INTERFACE)
+      set(MKL_INTERFACE_FULL ${IFACE_TYPE}_${MKL_INTERFACE})
+    endif()
+    set(DEFAULT_MKL_INTERFACE ${IFACE_TYPE}_ilp64)
+    set(MKL_INTERFACE_LIST ${IFACE_TYPE}_ilp64 ${IFACE_TYPE}_lp64)
+  endif()
+  define_param(MKL_INTERFACE_FULL DEFAULT_MKL_INTERFACE MKL_INTERFACE_LIST)
+else()
+  if(WIN32)
+    set(MKL_INTERFACE_FULL intel_c)
+  elseif(NOT APPLE)
+    if(GNU_Fortran_COMPILER)
+      set(MKL_INTERFACE_FULL gf)
+    else()
+      set(MKL_INTERFACE_FULL intel)
+    endif()
+  else()
+    mkl_message(FATAL_ERROR "OSX does not support MKL_ARCH ia32.")
+  endif()
+endif()
+if(MKL_INTERFACE_FULL MATCHES "ilp64")
+  set(MKL_INTERFACE "ilp64")
+else()
+  set(MKL_INTERFACE "lp64")
+endif()
+
+# Define MKL headers
+find_path(MKL_H mkl.h
+  HINTS ${MKL_ROOT}
+  PATH_SUFFIXES include)
+list(APPEND MKL_INCLUDE ${MKL_H})
+
+# Add pre-built F95 Interface Modules
+if(INTEL_COMPILER AND (ENABLE_BLAS95 OR ENABLE_LAPACK95))
+  if(MKL_ARCH STREQUAL "intel64")
+    list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/${MKL_ARCH}/${MKL_INTERFACE}")
+  else()
+    list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/${MKL_ARCH}")
+  endif()
+endif()
+
+# Define MKL_THREADING
+# All APIs support sequential threading
+set(MKL_THREADING_LIST "sequential" "intel_thread" "tbb_thread")
+set(DEFAULT_MKL_THREADING intel_thread)
+# DPC++ API supports TBB threading, but not OpenMP threading
+if(DPCPP_COMPILER)
+  set(DEFAULT_MKL_THREADING tbb_thread)
+  list(REMOVE_ITEM MKL_THREADING_LIST intel_thread)
+# C, Fortran API
+elseif(PGI_COMPILER)
+  # PGI compiler supports PGI OpenMP threading, additionally
+  list(APPEND MKL_THREADING_LIST pgi_thread)
+  # PGI compiler does not support TBB threading
+  list(REMOVE_ITEM MKL_THREADING_LIST tbb_thread)
+  if(WIN32)
+    # PGI 19.10 and 20.1 on Windows, do not support Intel OpenMP threading
+    list(REMOVE_ITEM MKL_THREADING_LIST intel_thread)
+    set(DEFAULT_MKL_THREADING pgi_thread)
+  endif()
+elseif(GNU_C_COMPILER OR GNU_Fortran_COMPILER OR CLANG_COMPILER)
+  list(APPEND MKL_THREADING_LIST gnu_thread)
+else()
+  # Intel and Microsoft compilers
+  # Nothing to do, only for completeness
+endif()
+define_param(MKL_THREADING DEFAULT_MKL_THREADING MKL_THREADING_LIST)
+
+# Define MKL_MPI
+set(DEFAULT_MKL_MPI intelmpi)
+if(UNIX)
+  if(APPLE)
+    # Override defaults for OSX
+    set(DEFAULT_MKL_MPI mpich)
+    set(MKL_MPI_LIST mpich)
+  else()
+    set(MKL_MPI_LIST intelmpi openmpi mpich mpich2)
+  endif()
+else()
+  # Windows
+  set(MKL_MPI_LIST intelmpi mshpc msmpi)
+endif()
+define_param(MKL_MPI DEFAULT_MKL_MPI MKL_MPI_LIST)
+# MSMPI is now called MSHPC. MSMPI option exists for backward compatibility.
+if(MKL_MPI STREQUAL "mshpc")
+  set(MKL_MPI msmpi)
+endif()
+find_package_handle_standard_args(MKL REQUIRED_VARS MKL_MPI)
+
+# Checkpoint - Verify if required options are defined
+find_package_handle_standard_args(MKL REQUIRED_VARS MKL_ROOT MKL_ARCH MKL_INCLUDE MKL_LINK MKL_THREADING MKL_INTERFACE_FULL)
+
+# Provides a list of IMPORTED targets for the project
+if(NOT DEFINED MKL_IMPORTED_TARGETS)
+  set(MKL_IMPORTED_TARGETS "")
+endif()
+
+# Clear temporary variables
+set(MKL_C_COPT "")
+set(MKL_F_COPT "")
+set(MKL_SDL_COPT "")
+set(MKL_CXX_COPT "")
+set(MKL_DPCPP_COPT "")
+set(MKL_DPCPP_LOPT "")
+set(MKL_OFFLOAD_COPT "")
+set(MKL_OFFLOAD_LOPT "")
+
+set(MKL_SUPP_LINK "")    # Other link options. Usually at the end of the link-line.
+set(MKL_LINK_LINE)       # For MPI only
+set(MKL_ENV_PATH "")     # Temporary variable to work with PATH
+set(MKL_ENV "")          # Exported environment variables
+
+# Modify PATH variable to make it CMake-friendly
+set(OLD_PATH $ENV{PATH})
+string(REPLACE ";" "\;" OLD_PATH "${OLD_PATH}")
+
+# Compiler options
+if(GNU_C_COMPILER OR GNU_Fortran_COMPILER)
+  if(MKL_ARCH STREQUAL "ia32")
+    list(APPEND MKL_C_COPT -m32)
+    list(APPEND MKL_F_COPT -m32)
+  else()
+    list(APPEND MKL_C_COPT -m64)
+    list(APPEND MKL_F_COPT -m64)
+  endif()
+endif()
+
+# Additonal compiler & linker options
+if(CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe")
+  list(APPEND MKL_DPCPP_COPT "-fsycl")
+  list(APPEND MKL_DPCPP_LOPT "-fsycl")
+endif()
+if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD)
+  if(MKL_LINK STREQUAL "static")
+    list(APPEND MKL_DPCPP_LOPT "-fsycl-device-code-split=per_kernel")
+    list(APPEND MKL_OFFLOAD_LOPT "-fsycl-device-code-split=per_kernel")
+  endif()
+endif()
+
+# For OpenMP Offload
+if(ENABLE_OMP_OFFLOAD)
+  if(WIN32)
+    if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1")
+      if("Fortran" IN_LIST CURR_LANGS)
+        list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -DONEMKL_USE_OPENMP_VERSION=202011)
+      else()
+        list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -Qopenmp-version:51 -DONEMKL_USE_OPENMP_VERSION=202011)
+      endif()
+    else()
+      list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64)
+    endif()
+    # -MD and -MDd are manually added here because offload functionality uses DPC++ runtime.
+    if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
+      list(APPEND MKL_OFFLOAD_COPT -MDd)
+    else()
+      list(APPEND MKL_OFFLOAD_COPT -MD)
+    endif()
+    list(APPEND MKL_OFFLOAD_LOPT -Qiopenmp -Qopenmp-targets:spir64 -fsycl)
+    set(SKIP_LIBPATH ON)
+  else()
+    if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1")
+      if("Fortran" IN_LIST CURR_LANGS)
+        list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -DONEMKL_USE_OPENMP_VERSION=202011)
+      else()
+        list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -fopenmp-version=51 -DONEMKL_USE_OPENMP_VERSION=202011)
+      endif()
+    else ()
+      list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64)
+    endif()
+    list(APPEND MKL_OFFLOAD_LOPT -fiopenmp -fopenmp-targets=spir64 -fsycl)
+    if(APPLE)
+      list(APPEND MKL_SUPP_LINK -lc++)
+    else()
+      list(APPEND MKL_SUPP_LINK -lstdc++)
+    endif()
+  endif()
+endif()
+
+# For selected Interface
+if(MKL_INTERFACE_FULL)
+  if(MKL_ARCH STREQUAL "ia32")
+    if(GNU_Fortran_COMPILER)
+      set(MKL_SDL_IFACE_ENV "GNU")
+    endif()
+  else()
+    if(GNU_Fortran_COMPILER)
+      set(MKL_SDL_IFACE_ENV "GNU,${MKL_INTERFACE}")
+    else()
+      set(MKL_SDL_IFACE_ENV "${MKL_INTERFACE}")
+    endif()
+    if(MKL_INTERFACE STREQUAL "ilp64")
+      if("Fortran" IN_LIST CURR_LANGS)
+        if(INTEL_COMPILER)
+          if(WIN32)
+            list(APPEND MKL_F_COPT "-4I8")
+          else()
+            list(APPEND MKL_F_COPT "-i8")
+          endif()
+        elseif(GNU_Fortran_COMPILER)
+          list(APPEND MKL_F_COPT "-fdefault-integer-8")
+        elseif(PGI_COMPILER)
+          list(APPEND MKL_F_COPT "-i8")
+        endif()
+      endif()
+      list(INSERT MKL_C_COPT 0 "-DMKL_ILP64")
+      list(INSERT MKL_SDL_COPT 0 "-DMKL_ILP64")
+      list(INSERT MKL_CXX_COPT 0 "-DMKL_ILP64")
+      list(INSERT MKL_OFFLOAD_COPT 0 "-DMKL_ILP64")
+    else()
+      # lp64
+    endif()
+  endif()
+  if(MKL_SDL_IFACE_ENV)
+    string(TOUPPER ${MKL_SDL_IFACE_ENV} MKL_SDL_IFACE_ENV)
+  endif()
+endif() # MKL_INTERFACE_FULL
+
+# All MKL Libraries
+if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
+  set(MKL_SYCL          mkl_sycld)
+else()
+  set(MKL_SYCL          mkl_sycl)
+endif()
+set(MKL_IFACE_LIB     mkl_${MKL_INTERFACE_FULL})
+set(MKL_CORE          mkl_core)
+if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo" AND MKL_THREADING STREQUAL "tbb_thread")
+  set(MKL_THREAD        mkl_tbb_threadd)
+else()
+  set(MKL_THREAD        mkl_${MKL_THREADING})
+endif()
+set(MKL_SDL           mkl_rt)
+if(MKL_ARCH STREQUAL "ia32")
+  set(MKL_BLAS95      mkl_blas95)
+  set(MKL_LAPACK95    mkl_lapack95)
+else()
+  set(MKL_BLAS95      mkl_blas95_${MKL_INTERFACE})
+  set(MKL_LAPACK95    mkl_lapack95_${MKL_INTERFACE})
+endif()
+# BLACS
+set(MKL_BLACS mkl_blacs_${MKL_MPI}_${MKL_INTERFACE})
+if(UNIX AND NOT APPLE AND MKL_MPI MATCHES "mpich")
+  # MPICH is compatible with INTELMPI Wrappers on Linux
+  set(MKL_BLACS mkl_blacs_intelmpi_${MKL_INTERFACE})
+endif()
+if(WIN32)
+  if(MKL_MPI STREQUAL "msmpi")
+    if("Fortran" IN_LIST CURR_LANGS)
+      list(APPEND MKL_SUPP_LINK "msmpifec.lib")
+    endif()
+    # MSMPI and MSHPC are supported with the same BLACS library
+    set(MKL_BLACS mkl_blacs_msmpi_${MKL_INTERFACE})
+    if(NOT MKL_LINK STREQUAL "static")
+      set(MKL_BLACS mkl_blacs_${MKL_INTERFACE})
+      set(MKL_BLACS_ENV MSMPI)
+    endif()
+  elseif(MKL_MPI STREQUAL "intelmpi" AND NOT MKL_LINK STREQUAL "static")
+    set(MKL_BLACS mkl_blacs_${MKL_INTERFACE})
+    set(MKL_BLACS_ENV INTELMPI)
+  endif()
+endif()
+# CDFT & SCALAPACK
+set(MKL_CDFT      mkl_cdft_core)
+set(MKL_SCALAPACK mkl_scalapack_${MKL_INTERFACE})
+
+
+if (UNIX)
+  if(NOT APPLE)
+    if(MKL_LINK STREQUAL "static")
+      set(START_GROUP "-Wl,--start-group")
+      set(END_GROUP "-Wl,--end-group")
+      if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD)
+        set(EXPORT_DYNAMIC "-Wl,-export-dynamic")
+      endif()
+    elseif(MKL_LINK STREQUAL "dynamic")
+      set(MKL_RPATH "-Wl,-rpath=$<TARGET_FILE_DIR:MKL::${MKL_CORE}>")
+      if((GNU_Fortran_COMPILER OR PGI_COMPILER) AND "Fortran" IN_LIST CURR_LANGS)
+        set(NO_AS_NEEDED -Wl,--no-as-needed)
+      endif()
+    else()
+      set(MKL_RPATH "-Wl,-rpath=$<TARGET_FILE_DIR:MKL::${MKL_SDL}>")
+    endif()
+  endif()
+endif()
+
+# Create a list of requested libraries, based on input options (MKL_LIBRARIES)
+# Create full link-line in MKL_LINK_LINE
+list(APPEND MKL_LINK_LINE $<IF:$<BOOL:${ENABLE_OMP_OFFLOAD}>,${MKL_OFFLOAD_LOPT},>
+    $<IF:$<BOOL:${DPCPP_COMPILER}>,${MKL_DPCPP_LOPT},> ${EXPORT_DYNAMIC} ${NO_AS_NEEDED} ${MKL_RPATH})
+if(ENABLE_BLAS95)
+  list(APPEND MKL_LIBRARIES ${MKL_BLAS95})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_BLAS95})
+endif()
+if(ENABLE_LAPACK95)
+  list(APPEND MKL_LIBRARIES ${MKL_LAPACK95})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_LAPACK95})
+endif()
+if(ENABLE_SCALAPACK)
+  list(APPEND MKL_LIBRARIES ${MKL_SCALAPACK})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_SCALAPACK})
+endif()
+if(DPCPP_COMPILER OR (ENABLE_OMP_OFFLOAD AND NOT MKL_LINK STREQUAL "sdl"))
+  list(APPEND MKL_LIBRARIES ${MKL_SYCL})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_SYCL})
+endif()
+list(APPEND MKL_LINK_LINE ${START_GROUP})
+if(ENABLE_CDFT)
+  list(APPEND MKL_LIBRARIES ${MKL_CDFT})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_CDFT})
+endif()
+if(MKL_LINK STREQUAL "sdl")
+  list(APPEND MKL_LIBRARIES ${MKL_SDL})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_SDL})
+else()
+  list(APPEND MKL_LIBRARIES ${MKL_IFACE_LIB} ${MKL_THREAD} ${MKL_CORE})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_IFACE_LIB} MKL::${MKL_THREAD} MKL::${MKL_CORE})
+endif()
+if(USE_MPI)
+  list(APPEND MKL_LIBRARIES ${MKL_BLACS})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_BLACS})
+endif()
+list(APPEND MKL_LINK_LINE ${END_GROUP})
+
+# Find all requested libraries
+foreach(lib ${MKL_LIBRARIES})
+  unset(${lib}_file CACHE)
+  if(MKL_LINK STREQUAL "static" AND NOT ${lib} STREQUAL ${MKL_SDL})
+    find_library(${lib}_file ${LIB_PREFIX}${lib}${LIB_EXT}
+                  PATHS ${MKL_ROOT}
+                  PATH_SUFFIXES "lib" "lib/${MKL_ARCH}")
+    add_library(MKL::${lib} STATIC IMPORTED)
+  else()
+    find_library(${lib}_file NAMES ${LIB_PREFIX}${lib}${DLL_EXT} ${lib}
+                  PATHS ${MKL_ROOT}
+                  PATH_SUFFIXES "lib" "lib/${MKL_ARCH}")
+    add_library(MKL::${lib} SHARED IMPORTED)
+  endif()
+  find_package_handle_standard_args(MKL REQUIRED_VARS ${lib}_file)
+  # CMP0111, implemented in CMake 3.20+ requires a shared library target on Windows
+  # to be defined with IMPLIB and LOCATION property.
+  # It also requires a static library target to be defined with LOCATION property.
+  # Setting the policy to OLD usage, using cmake_policy() does not work as of 3.20.0, hence the if-else below.
+  if(WIN32 AND NOT MKL_LINK STREQUAL "static")
+    set_target_properties(MKL::${lib} PROPERTIES IMPORTED_IMPLIB "${${lib}_file}")
+    # Find corresponding DLL
+    set(MKL_DLL_GLOB ${lib}.*.dll)
+    file(GLOB MKL_DLL_FILE "${MKL_ROOT}/redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
+        "${MKL_ROOT}/../redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
+        "${MKL_ROOT}/../redist/${MKL_ARCH}/mkl/${MKL_DLL_GLOB}"
+	"${MKL_ROOT}/bin/${MKL_DLL_GLOB}")
+    if(NOT ${lib} STREQUAL ${MKL_IFACE_LIB} AND NOT ${lib} STREQUAL ${MKL_BLAS95} AND NOT ${lib} STREQUAL ${MKL_LAPACK95})  # Windows IFACE libs are static only
+      list(LENGTH MKL_DLL_FILE MKL_DLL_FILE_LEN)
+      if(MKL_DLL_FILE_LEN)
+        # in case multiple versions of the same dll are found, select the highest version
+        list(SORT MKL_DLL_FILE)
+        list(REVERSE MKL_DLL_FILE)
+        list(GET MKL_DLL_FILE 0 MKL_DLL_FILE)
+
+        mkl_message(STATUS "Found DLL: ${MKL_DLL_FILE}")
+        set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${MKL_DLL_FILE}")
+      else()
+        mkl_message(FATAL_ERROR "${MKL_DLL_GLOB} not found. MKL_ROOT was '${MKL_ROOT}'. MKL_DLL_FILE is '${MKL_DLL_FILE}'")
+      endif()
+    endif()
+  else()
+    set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${${lib}_file}")
+  endif()
+  list(APPEND MKL_IMPORTED_TARGETS MKL::${lib})
+endforeach()
+
+# Threading selection
+if(MKL_THREADING)
+  if(MKL_THREADING STREQUAL "tbb_thread")
+    find_package(TBB REQUIRED CONFIG COMPONENTS tbb)
+    set(MKL_THREAD_LIB $<TARGET_LINKER_FILE:TBB::tbb>)
+    set(MKL_SDL_THREAD_ENV "TBB")
+    get_property(TBB_LIB TARGET TBB::tbb PROPERTY IMPORTED_LOCATION_RELEASE)
+    get_filename_component(TBB_LIB_DIR ${TBB_LIB} DIRECTORY)
+    if(UNIX)
+      if(CMAKE_SKIP_BUILD_RPATH)
+        set(TBB_LINK "-L${TBB_LIB_DIR} -ltbb")
+      else()
+        set(TBB_LINK "-Wl,-rpath,${TBB_LIB_DIR} -L${TBB_LIB_DIR} -ltbb")
+      endif()
+      list(APPEND MKL_SUPP_LINK ${TBB_LINK})
+      if(APPLE)
+        list(APPEND MKL_SUPP_LINK -lc++)
+      else()
+        list(APPEND MKL_SUPP_LINK -lstdc++)
+      endif()
+    endif()
+    if(WIN32 OR APPLE)
+      set(MKL_ENV_PATH ${TBB_LIB_DIR})
+    endif()
+  elseif(MKL_THREADING MATCHES "_thread")
+    if(MKL_THREADING STREQUAL "pgi_thread")
+      list(APPEND MKL_SUPP_LINK -mp -pgf90libs)
+      set(MKL_SDL_THREAD_ENV "PGI")
+    elseif(MKL_THREADING STREQUAL "gnu_thread")
+      list(APPEND MKL_SUPP_LINK -lgomp)
+      set(MKL_SDL_THREAD_ENV "GNU")
+    else()
+      # intel_thread
+      if(UNIX)
+        set(MKL_OMP_LIB iomp5)
+        set(LIB_EXT ".so")
+        if(APPLE)
+          set(LIB_EXT ".dylib")
+        endif()
+      else()
+        set(MKL_OMP_LIB libiomp5md)
+      endif()
+      set(MKL_SDL_THREAD_ENV "INTEL")
+      set(OMP_LIBNAME ${LIB_PREFIX}${MKL_OMP_LIB}${LIB_EXT})
+
+      find_library(OMP_LIBRARY ${OMP_LIBNAME}
+        HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} ${CMPLR_ROOT}
+        PATH_SUFFIXES "lib" "lib/${MKL_ARCH}"
+               "lib/${MKL_ARCH}_lin" "lib/${MKL_ARCH}_win"
+               "linux/compiler/lib/${MKL_ARCH}"
+               "linux/compiler/lib/${MKL_ARCH}_lin"
+               "windows/compiler/lib/${MKL_ARCH}"
+               "windows/compiler/lib/${MKL_ARCH}_win"
+               "../compiler/lib/${MKL_ARCH}_lin" "../compiler/lib/${MKL_ARCH}_win"
+               "../compiler/lib/${MKL_ARCH}" "../compiler/lib"
+               "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}"
+               "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}_lin"
+               "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}"
+               "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}_win"
+               "../../compiler/latest/mac/compiler/lib")
+      if(WIN32)
+        set(OMP_DLLNAME ${LIB_PREFIX}${MKL_OMP_LIB}.dll)
+        find_path(OMP_DLL_DIR ${OMP_DLLNAME}
+          HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} ${CMPLR_ROOT}
+          PATH_SUFFIXES "redist/${MKL_ARCH}"
+               "redist/${MKL_ARCH}_win" "redist/${MKL_ARCH}_win/compiler"
+               "../redist/${MKL_ARCH}/compiler" "../compiler/lib"
+               "../../compiler/latest/windows/redist/${MKL_ARCH}_win"
+               "../../compiler/latest/windows/redist/${MKL_ARCH}_win/compiler"
+               "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win"
+               "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win/compiler")
+        find_package_handle_standard_args(MKL REQUIRED_VARS OMP_DLL_DIR)
+        set(MKL_ENV_PATH "${OMP_DLL_DIR}")
+      endif()
+
+      if(WIN32 AND SKIP_LIBPATH)
+        # Only for Intel OpenMP Offload
+        set(OMP_LINK "libiomp5md.lib")
+      else()
+        set(OMP_LINK "${OMP_LIBRARY}")
+        if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+          # Disable PGI OpenMP runtime for correct work of Intel OpenMP runtime
+          list(APPEND MKL_SUPP_LINK -nomp)
+        endif()
+      endif()
+      find_package_handle_standard_args(MKL REQUIRED_VARS OMP_LIBRARY OMP_LINK)
+      set(MKL_THREAD_LIB ${OMP_LINK})
+    endif()
+  else()
+    # Sequential threading
+    set(MKL_SDL_THREAD_ENV "SEQUENTIAL")
+  endif()
+endif() # MKL_THREADING
+
+if (UNIX)
+  list(APPEND MKL_SUPP_LINK -lm -ldl -lpthread)
+endif()
+
+if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD)
+  if(WIN32)
+    # Detect sycl library version
+    if(NOT DEFINED SYCL_LIB_VER_CACHE)
+      set(SYCL_LIB_VER "")
+      find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${LIB_EXT}
+        HINTS $ENV{LIB} $ENV{CMPLR_ROOT}
+        PATH_SUFFIXES "windows/lib")
+      if(NOT SYCL_LIB_DIR)
+        foreach(ver RANGE 6 99)
+          find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${ver}${LIB_EXT}
+            HINTS $ENV{LIB} $ENV{CMPLR_ROOT}
+            PATH_SUFFIXES "windows/lib")
+          if(SYCL_LIB_DIR)
+            set(SYCL_LIB_VER ${ver})
+            break()
+          endif()
+        endforeach()
+      endif()
+      set(SYCL_LIB_VER_CACHE ${SYCL_LIB_VER} CACHE STRING "")
+    endif()
+
+    if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
+      list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}d${LINK_SUFFIX})
+    else()
+      list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}${LINK_SUFFIX})
+    endif()
+  else()
+    list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${LINK_SUFFIX})
+  endif()
+  list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}OpenCL${LINK_SUFFIX})
+endif()
+
+# Setup link types based on input options
+set(LINK_TYPES "")
+
+if(DPCPP_COMPILER)
+  add_library(MKL::MKL_DPCPP INTERFACE IMPORTED GLOBAL)
+  target_compile_options(MKL::MKL_DPCPP INTERFACE ${MKL_DPCPP_COPT})
+  target_link_libraries(MKL::MKL_DPCPP INTERFACE ${MKL_LINK_LINE} ${MKL_THREAD_LIB} ${MKL_SUPP_LINK})
+  list(APPEND LINK_TYPES MKL::MKL_DPCPP)
+endif()
+# Single target for all C, Fortran link-lines
+add_library(MKL::MKL INTERFACE IMPORTED GLOBAL)
+target_compile_options(MKL::MKL INTERFACE
+    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,C>:${MKL_C_COPT}>
+    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,Fortran>:${MKL_F_COPT}>
+    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,CXX>:${MKL_CXX_COPT}>
+    $<IF:$<BOOL:${ENABLE_OMP_OFFLOAD}>,${MKL_OFFLOAD_COPT},>)
+target_link_libraries(MKL::MKL INTERFACE ${MKL_LINK_LINE} ${MKL_THREAD_LIB} ${MKL_SUPP_LINK})
+list(APPEND LINK_TYPES MKL::MKL)
+
+foreach(link ${LINK_TYPES})
+  # Set properties on all INTERFACE targets
+  target_include_directories(${link} BEFORE INTERFACE "${MKL_INCLUDE}")
+  list(APPEND MKL_IMPORTED_TARGETS ${link})
+endforeach(link) # LINK_TYPES
+
+if(MKL_LINK STREQUAL "sdl")
+  list(APPEND MKL_ENV "MKL_INTERFACE_LAYER=${MKL_SDL_IFACE_ENV}" "MKL_THREADING_LAYER=${MKL_SDL_THREAD_ENV}")
+endif()
+if(WIN32 AND NOT MKL_LINK STREQUAL "static")
+  list(APPEND MKL_ENV "MKL_BLACS_MPI=${MKL_BLACS_ENV}")
+endif()
+
+# Add MKL dynamic libraries if RPATH is not defined on Unix
+if(UNIX AND CMAKE_SKIP_BUILD_RPATH)
+  if(MKL_LINK STREQUAL "sdl")
+    set(MKL_LIB_DIR $<TARGET_FILE_DIR:MKL::${MKL_SDL}>)
+  else()
+    set(MKL_LIB_DIR $<TARGET_FILE_DIR:MKL::${MKL_CORE}>)
+  endif()
+  if(APPLE)
+    list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{DYLD_LIBRARY_PATH}")
+  else()
+    list(APPEND MKL_ENV "LD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{LD_LIBRARY_PATH}")
+  endif()
+endif()
+
+# Add MKL dynamic libraries to PATH on Windows
+if(WIN32 AND NOT MKL_LINK STREQUAL "static")
+  get_filename_component(MKL_DLL_DIR ${MKL_DLL_FILE} DIRECTORY)
+  set(MKL_ENV_PATH "${MKL_DLL_DIR}\;${MKL_ENV_PATH}")
+endif()
+
+if(MKL_ENV_PATH)
+  list(APPEND MKL_ENV "PATH=${MKL_ENV_PATH}\;${OLD_PATH}")
+  if(APPLE)
+    list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_ENV_PATH}\:${OLD_PATH}")
+  endif()
+endif()
+
+unset(MKL_DLL_FILE)
+
+endif() # MKL_LIBRARIES

From a1a88a5327fbf76f914e188a8503c2bf47940fd4 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sun, 25 Jun 2023 13:53:36 +0200
Subject: [PATCH 12/20] Fix mkl discovery

---
 .../deformable_convolution_sycl_native_ext/CMakeLists.txt   | 6 +++++-
 .../cmake/MKLConfig.cmake                                   | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index bf233286..b93e00fb 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -10,8 +10,12 @@ pybind11_add_module(${py_module_name}
     deformable_convolution_sycl/impl.cpp
 )
 
-message(${CMAKE_CURRENT_SOURCE_DIR})
 find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
+
+if (DEFINED ENV{CONDA_PREFIX})
+  set(MKL_ROOT $ENV{CONDA_PREFIX})
+endif()
+
 find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 
 target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake
index 07290e8e..e83661c4 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake
@@ -635,7 +635,8 @@ foreach(lib ${MKL_LIBRARIES})
     file(GLOB MKL_DLL_FILE "${MKL_ROOT}/redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
         "${MKL_ROOT}/../redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
         "${MKL_ROOT}/../redist/${MKL_ARCH}/mkl/${MKL_DLL_GLOB}"
-	"${MKL_ROOT}/bin/${MKL_DLL_GLOB}")
+        "${MKL_ROOT}/bin/${MKL_DLL_GLOB}"
+        "${MKL_ROOT}/lib")
     if(NOT ${lib} STREQUAL ${MKL_IFACE_LIB} AND NOT ${lib} STREQUAL ${MKL_BLAS95} AND NOT ${lib} STREQUAL ${MKL_LAPACK95})  # Windows IFACE libs are static only
       list(LENGTH MKL_DLL_FILE MKL_DLL_FILE_LEN)
       if(MKL_DLL_FILE_LEN)

From bce5efaad54a0803106e3c6a270375242efac852 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sun, 25 Jun 2023 14:08:31 +0200
Subject: [PATCH 13/20] Add tbb and mkl-devel to win conda env

---
 environments/conda-win-sycl.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/environments/conda-win-sycl.yml b/environments/conda-win-sycl.yml
index e4ac46e5..0c5d3032 100644
--- a/environments/conda-win-sycl.yml
+++ b/environments/conda-win-sycl.yml
@@ -30,3 +30,5 @@ dependencies:
   # https://github.com/scikit-build/scikit-build/issues/981
   - setuptools>=42,<64
   - pybind11
+  - tbb-devel >= 2021.6.0
+  - mkl-devel-dpcpp>=2023.0.0

From 2642393a2be3ce7a6c172fe76e2aba8bb4b72fdf Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sun, 25 Jun 2023 14:45:50 +0200
Subject: [PATCH 14/20] Test cmake changes

---
 .../deformable_convolution_sycl_native_ext/CMakeLists.txt | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index b93e00fb..b3ffd9fe 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -10,12 +10,8 @@ pybind11_add_module(${py_module_name}
     deformable_convolution_sycl/impl.cpp
 )
 
-find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
-
-if (DEFINED ENV{CONDA_PREFIX})
-  set(MKL_ROOT $ENV{CONDA_PREFIX})
-endif()
-
+find_package(TBB CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
+find_package(IntelSYCL CONFIG REQUIRED)
 find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 
 target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)

From a85dcf4ef7e9c6def9eb8e7a91c82553e850e33c Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Sun, 25 Jun 2023 18:16:32 +0200
Subject: [PATCH 15/20] Revert some changes

---
 .../deformable_convolution_sycl_native_ext/CMakeLists.txt     | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
index b3ffd9fe..e5911eb6 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -12,6 +12,10 @@ pybind11_add_module(${py_module_name}
 
 find_package(TBB CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 find_package(IntelSYCL CONFIG REQUIRED)
+
+if (DEFINED ENV{CONDA_PREFIX})
+  set(MKL_ROOT $ENV{CONDA_PREFIX})
+endif()
 find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 
 target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)

From 4e2b42b329854260dd0ff19e3461c5c73f2c757d Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Mon, 26 Jun 2023 01:53:46 +0200
Subject: [PATCH 16/20] Fix hang

---
 dpbench/infrastructure/benchmark_runner.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/dpbench/infrastructure/benchmark_runner.py b/dpbench/infrastructure/benchmark_runner.py
index 1fe53357..a00a4221 100644
--- a/dpbench/infrastructure/benchmark_runner.py
+++ b/dpbench/infrastructure/benchmark_runner.py
@@ -314,8 +314,6 @@ def run_benchmark_in_sub_process(
 
             return results
 
-        _, conn = self.get_process(rc.framework)
-
         brc = BaseRunConfig.from_instance(rc)
 
         brc.ref_framework: cfg.Framework = [
@@ -325,6 +323,7 @@ def run_benchmark_in_sub_process(
             in {p.postfix for p in f.postfixes}
         ][0]
 
+        _, conn = self.get_process(rc.framework)
         conn.send(brc)
 
         if conn.poll(rc.timeout if rc.timeout else self._default_timeout):

From 909d1032d8e6468c46d6894994cce0a967c15e73 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Mon, 26 Jun 2023 03:17:06 +0200
Subject: [PATCH 17/20] Fix hang

---
 dpbench/infrastructure/benchmark_runner.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/dpbench/infrastructure/benchmark_runner.py b/dpbench/infrastructure/benchmark_runner.py
index a00a4221..24715a0b 100644
--- a/dpbench/infrastructure/benchmark_runner.py
+++ b/dpbench/infrastructure/benchmark_runner.py
@@ -316,6 +316,7 @@ def run_benchmark_in_sub_process(
 
         brc = BaseRunConfig.from_instance(rc)
 
+        _, conn = self.get_process(rc.framework)
         brc.ref_framework: cfg.Framework = [
             f
             for f in cfg.GLOBAL.frameworks
@@ -323,7 +324,6 @@ def run_benchmark_in_sub_process(
             in {p.postfix for p in f.postfixes}
         ][0]
 
-        _, conn = self.get_process(rc.framework)
         conn.send(brc)
 
         if conn.poll(rc.timeout if rc.timeout else self._default_timeout):
@@ -357,7 +357,14 @@ def run_benchmark_and_save(
         Args:
             rc: runtime configuration.
         """
-        results = self.run_benchmark_in_sub_process(rc)
+        try:
+            results = self.run_benchmark_in_sub_process(rc)
+        except Exception as e:
+            # self.kill_process(rc.framework)
+
+            results = BenchmarkResults(0, rc.implementation, rc.preset)
+            results.error_state = ErrorCodes.FAILED_EXECUTION
+            results.error_msg = f"Unexpected failure. {str(e)}"
 
         if rc.conn:
             framework = build_framework(rc.framework)
@@ -377,6 +384,7 @@ def run_benchmark_and_save(
             )
 
 
+
 def _set_input_args(
     bench: Benchmark, framework: Framework, np_input_data: dict
 ):

From e7face29011a23f18776e20bb67209a274fce0b9 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Mon, 26 Jun 2023 03:18:53 +0200
Subject: [PATCH 18/20] pre-commit

---
 dpbench/infrastructure/benchmark_runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dpbench/infrastructure/benchmark_runner.py b/dpbench/infrastructure/benchmark_runner.py
index 24715a0b..9c95a43e 100644
--- a/dpbench/infrastructure/benchmark_runner.py
+++ b/dpbench/infrastructure/benchmark_runner.py
@@ -384,7 +384,6 @@ def run_benchmark_and_save(
             )
 
 
-
 def _set_input_args(
     bench: Benchmark, framework: Framework, np_input_data: dict
 ):

From c5bddb32572f7d6a63e6937b8e69d414ef37e20f Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Wed, 28 Jun 2023 00:51:25 +0200
Subject: [PATCH 19/20] Some cleanup

---
 .../deformable_convolution_initialize.py      | 66 +++++++------
 .../deformable_convolution_numba_mlir_p.py    | 45 +++++++--
 .../deformable_convolution_sycl/impl.cpp      | 97 +++++++++++++------
 .../bench_info/deformable_convolution.toml    | 96 ++++++------------
 4 files changed, 169 insertions(+), 135 deletions(-)

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
index 205f57be..9903e9d8 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
@@ -5,45 +5,49 @@
 
 def initialize(
     batch,
-    in_channels,
-    in_height,
-    in_width,
-    out_channels,
-    out_height,
-    out_width,
-    kernel_height,
-    kernel_width,
-    stride_y,
-    stride_x,
-    dilation_y,
-    dilation_x,
-    pad_y,
-    pad_x,
+    in_chw,
+    out_chw,
+    kernel_hw,
+    stride_hw,
+    dilation_hw,
+    pad_hw,
     groups,
     deformable_groups,
-    dtype,
     seed,
+    types_dict,
 ):
     import numpy as np
     import numpy.random as default_rng
 
+    dtype: np.dtype = types_dict["float"]
+
     default_rng.seed(seed)
 
+    input_size = [batch] + in_chw  # nchw
+    output_size = [batch] + out_chw  # nchw
+    offset_size = kernel_hw + [2, out_chw[1], out_chw[2]]  # kh, kw, 2, oh, ow
+    weights_size = [out_chw[0], in_chw[0]] + kernel_hw  # oc, ic, kh, kw
+    bias_size = out_chw[0]  # oc
+    tmp_size = [
+        in_chw[0],
+        kernel_hw[0],
+        kernel_hw[1],
+        out_chw[1],
+        out_chw[2],
+    ]  # ic, kh, kw, oh, ow
+
+    input = default_rng.random(input_size).astype(dtype)
+    output = np.empty(output_size, dtype=dtype)
+    offset = 2 * default_rng.random(offset_size).astype("float32") - 1
+    weights = default_rng.random(weights_size).astype(dtype)
+    bias = default_rng.random(bias_size).astype(dtype)
+    tmp = np.empty(tmp_size, dtype=dtype)
+
     return (
-        default_rng.random((batch, in_channels, in_height, in_width)).astype(
-            dtype
-        ),
-        np.zeros((batch, out_channels, out_height, out_width)).astype(dtype),
-        2
-        * default_rng.random(
-            (kernel_height, kernel_width, 2, out_height, out_width)
-        ).astype(dtype)
-        - 1,
-        np.ones(
-            (out_channels, in_channels, kernel_height, kernel_width)
-        ).astype(dtype),
-        default_rng.random(out_channels).astype(dtype),
-        np.zeros(
-            (in_channels, kernel_height, kernel_width, out_height, out_width)
-        ).astype(dtype),
+        input,
+        output,
+        offset,
+        weights,
+        bias,
+        tmp,
     )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
index b99ff53b..77cbc5a9 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
@@ -108,19 +108,16 @@ def deformable_convolution_b1(
 
 
 @njit(parallel=True, gpu_fp64_truncate="auto")
-def deformable_convolution(
+def jdeformable_convolution(
     input,
     output,
     offset,
     weights,
     bias,
     tmp,
-    stride_y,
-    stride_x,
-    pad_y,
-    pad_x,
-    dilation_y,
-    dilation_x,
+    stride,
+    pad,
+    dilation,
     groups,
     deformable_groups,
 ):
@@ -133,9 +130,37 @@ def deformable_convolution(
             weights,
             bias,
             tmp,
-            (stride_y, stride_x),
-            (pad_y, pad_x),
-            (dilation_y, dilation_x),
+            stride,
+            pad,
+            dilation,
             groups,
             deformable_groups,
         )
+
+
+def deformable_convolution(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride_hw,
+    pad_hw,
+    dilation_hw,
+    groups,
+    deformable_groups,
+):
+    jdeformable_convolution(
+        input,
+        output,
+        offset,
+        weights,
+        bias,
+        tmp,
+        tuple(stride_hw),
+        tuple(pad_hw),
+        tuple(dilation_hw),
+        groups,
+        deformable_groups,
+    )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
index 12ac11de..9cff7a98 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -14,6 +14,8 @@
 #include "oneapi/mkl.hpp"
 #include "utils.hpp"
 
+#include <pybind11/stl.h>
+
 using namespace sycl;
 namespace py = pybind11;
 
@@ -65,7 +67,7 @@ __attribute__((always_inline)) DataType bilinear(const DataType *input,
     return result / 2;
 }
 
-class deform;
+template <class DataType> class deform;
 
 template <class DataType>
 inline auto deform_input(cl::sycl::queue &queue,
@@ -73,7 +75,7 @@ inline auto deform_input(cl::sycl::queue &queue,
                          const Shape3D in_shape,
                          DataType *output,
                          const Shape5D out_shape,
-                         const DataType *offset,
+                         const float *offset,
                          int stride_y,
                          int stride_x,
                          int pad_y,
@@ -94,7 +96,7 @@ inline auto deform_input(cl::sycl::queue &queue,
 
     auto wsize =
         sycl::range<3>(in_channels * k_height * k_width, out_height, out_width);
-    return queue.parallel_for<deform>(wsize, [=](sycl::id<3> idx) {
+    return queue.parallel_for<deform<DataType>>(wsize, [=](sycl::id<3> idx) {
         auto ckhkw = static_cast<int>(idx[0]);
         auto h = static_cast<int>(idx[1]);
         auto w = static_cast<int>(idx[2]);
@@ -127,19 +129,19 @@ inline auto deform_input(cl::sycl::queue &queue,
     });
 }
 
-class fill_output;
+template <class DataType> class fill_output;
 
 template <class DataType>
 auto output_fill_with_bias(cl::sycl::queue &queue,
                            DataType *output,
                            const Shape3D out_shape,
-                           DataType *bias)
+                           const DataType *bias)
 {
     auto out_c = out_shape[CHW::C];
     auto out_h = out_shape[CHW::H];
     auto out_w = out_shape[CHW::W];
 
-    return queue.parallel_for<fill_output>(
+    return queue.parallel_for<fill_output<DataType>>(
         sycl::range<3>(out_c, out_h, out_w), [=](sycl::id<3> idx) {
             auto c = static_cast<int>(idx[0]);
             auto h = static_cast<int>(idx[1]);
@@ -158,9 +160,9 @@ void deformable_convolution_b1_impl(cl::sycl::queue &queue,
                                     const Shape3D out_shape,
                                     DataType *tmp,
                                     const float *offset,
-                                    DataType *weights,
+                                    const DataType *weights,
                                     const Shape4D weights_shape,
-                                    DataType *bias,
+                                    const DataType *bias,
                                     int stride_y,
                                     int stride_x,
                                     int pad_y,
@@ -210,9 +212,9 @@ void deformable_convolution_impl(cl::sycl::queue &queue,
                                  const Shape4D out_shape,
                                  DataType *tmp,
                                  const float *offset,
-                                 DataType *weights,
+                                 const DataType *weights,
                                  const Shape4D weights_shape,
-                                 DataType *bias,
+                                 const DataType *bias,
                                  int stride_y,
                                  int stride_x,
                                  int pad_y,
@@ -247,25 +249,45 @@ void deformable_convolution_impl(cl::sycl::queue &queue,
     }
 }
 
+template <typename... Args> bool ensure_compatibility(const Args &...args)
+{
+    std::vector<dpctl::tensor::usm_ndarray> arrays = {args...};
+
+    auto arr = arrays.at(0);
+
+    for (auto &arr : arrays) {
+        if (!(arr.get_flags() & (USM_ARRAY_C_CONTIGUOUS))) {
+            std::cerr << "All arrays need to be C contiguous.\n";
+            return false;
+        }
+    }
+    return true;
+}
+
 void deformable_convolution(dpctl::tensor::usm_ndarray input,
                             dpctl::tensor::usm_ndarray output,
                             dpctl::tensor::usm_ndarray offset,
                             dpctl::tensor::usm_ndarray weights,
                             dpctl::tensor::usm_ndarray bias,
                             dpctl::tensor::usm_ndarray tmp,
-                            int stride_y,
-                            int stride_x,
-                            int pad_y,
-                            int pad_x,
-                            int dilation_y,
-                            int dilation_x,
+                            std::vector<int> stride_hw,
+                            std::vector<int> pad_hw,
+                            std::vector<int> dilation_hw,
                             int groups,
                             int deformable_groups)
 {
     auto queue = input.get_queue();
 
-    if (input.get_typenum() != UAR_FLOAT) {
-        throw std::runtime_error("Expected a single precision FP array.");
+    if (!ensure_compatibility(input, output, offset, weights, bias, tmp))
+        throw std::runtime_error("Input arrays are not acceptable.");
+
+    if (input.get_typenum() != output.get_typenum() or
+        input.get_typenum() != offset.get_typenum() or
+        input.get_typenum() != weights.get_typenum() or
+        input.get_typenum() != bias.get_typenum() or
+        input.get_typenum() != tmp.get_typenum())
+    {
+        throw std::runtime_error("All arrays must have the same precision");
     }
 
     int batch = input.get_shape(0);
@@ -281,17 +303,39 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input,
     int kernel_height = weights.get_shape(2);
     int kernel_width = weights.get_shape(3);
 
+    auto stride_y = stride_hw[0];
+    auto stride_x = stride_hw[1];
+
+    auto pad_y = pad_hw[0];
+    auto pad_x = pad_hw[1];
+
+    auto dilation_y = pad_hw[0];
+    auto dilation_x = pad_hw[1];
+
     auto input_shape = Shape4D({batch, in_channels, in_height, in_width});
     auto output_shape = Shape4D({batch, out_channels, out_height, out_width});
     auto weights_shape =
         Shape4D({out_channels, in_channels, kernel_height, kernel_width});
 
-    deformable_convolution_impl(
-        queue, input.get_data<float>(), input_shape, output.get_data<float>(),
-        output_shape, tmp.get_data<float>(), offset.get_data<float>(),
-        weights.get_data<float>(), weights_shape, bias.get_data<float>(),
-        stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups,
-        deformable_groups);
+#define dispatch_dc(typ)                                                       \
+    deformable_convolution_impl<typ>(                                          \
+        queue, input.get_data<typ>(), input_shape, output.get_data<typ>(),     \
+        output_shape, tmp.get_data<typ>(), offset.get_data<float>(),           \
+        weights.get_data<typ>(), weights_shape, bias.get_data<typ>(),          \
+        stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups,      \
+        deformable_groups)
+
+    if (input.get_typenum() == UAR_FLOAT) {
+        dispatch_dc(float);
+    }
+    else if (input.get_typenum() == UAR_DOUBLE) {
+        dispatch_dc(double);
+    }
+    else {
+        throw std::runtime_error("Unsupported type");
+    }
+
+#undef dispatch_dc
 }
 
 PYBIND11_MODULE(_deformable_convolution_sycl, m)
@@ -301,8 +345,7 @@ PYBIND11_MODULE(_deformable_convolution_sycl, m)
     m.def("deformable_convolution", &deformable_convolution,
           "Defromable convolution", py::arg("input"), py::arg("output"),
           py::arg("offset"), py::arg("weights"), py::arg("bias"),
-          py::arg("tmp"), py::arg("stride_y"), py::arg("stride_x"),
-          py::arg("pad_y"), py::arg("pad_x"), py::arg("dilation_y"),
-          py::arg("dilation_x"), py::arg("groups"),
+          py::arg("tmp"), py::arg("stride_hw"), py::arg("pad_hw"),
+          py::arg("dilation_hw"), py::arg("groups"),
           py::arg("deformable_groups"));
 }
diff --git a/dpbench/configs/bench_info/deformable_convolution.toml b/dpbench/configs/bench_info/deformable_convolution.toml
index 4c22313c..93c2e50a 100644
--- a/dpbench/configs/bench_info/deformable_convolution.toml
+++ b/dpbench/configs/bench_info/deformable_convolution.toml
@@ -18,12 +18,9 @@ input_args = [
     "weights",
     "bias",
     "tmp",
-    "stride_y",
-    "stride_x",
-    "dilation_y",
-    "dilation_x",
-    "pad_y",
-    "pad_x",
+    "stride_hw",
+    "dilation_hw",
+    "pad_hw",
     "groups",
     "deformable_groups"
 ]
@@ -41,65 +38,38 @@ output_args = [
 
 [benchmark.parameters.S]
 batch = 1
-in_channels = 16
-in_height = 32
-in_width = 32
-out_channels = 16
-out_height = 32
-out_width = 32
-kernel_height = 3
-kernel_width = 3
-stride_y = 1
-stride_x = 1
-dilation_y = 1
-dilation_x = 1
-pad_y = 1
-pad_x = 1
+in_chw = [16, 32, 32]
+out_chw = [16, 32, 32]
+kernel_hw = [3, 3]
+stride_hw = [1, 1]
+dilation_hw = [1, 1]
+pad_hw = [1, 1]
 groups = 1
 deformable_groups = 1
-dtype = "float32"
 seed =  7777777
 
 [benchmark.parameters.M]
 batch = 1
-in_channels = 64
-in_height = 64
-in_width = 64
-out_channels = 64
-out_height = 64
-out_width = 64
-kernel_height = 3
-kernel_width = 3
-stride_y = 1
-stride_x = 1
-dilation_y = 1
-dilation_x = 1
-pad_y = 1
-pad_x = 1
+in_chw = [64, 64, 64]
+out_chw = [64, 64, 64]
+kernel_hw = [3, 3]
+stride_hw = [1, 1]
+dilation_hw = [1, 1]
+pad_hw = [1, 1]
 groups = 1
 deformable_groups = 1
-dtype = "float32"
 seed =  7777777
 
 [benchmark.parameters.L]
 batch = 2
-in_channels = 64
-in_height = 128
-in_width = 128
-out_channels = 128
-out_height = 128
-out_width = 128
-kernel_height = 3
-kernel_width = 3
-stride_y = 1
-stride_x = 1
-dilation_y = 1
-dilation_x = 1
-pad_y = 1
-pad_x = 1
+in_chw = [64, 128, 128]
+out_chw = [128, 128, 128]
+kernel_hw = [3, 3]
+stride_hw = [1, 1]
+dilation_hw = [1, 1]
+pad_hw = [1, 1]
 groups = 1
 deformable_groups = 1
-dtype = "float32"
 seed =  7777777
 
 [benchmark.init]
@@ -108,24 +78,16 @@ types_dict_name="types_dict"
 precision="single"
 input_args =  [
     "batch",
-    "in_channels",
-    "in_height",
-    "in_width",
-    "out_channels",
-    "out_height",
-    "out_width",
-    "kernel_height",
-    "kernel_width",
-    "stride_y",
-    "stride_x",
-    "dilation_y",
-    "dilation_x",
-    "pad_y",
-    "pad_x",
+    "in_chw",
+    "out_chw",
+    "kernel_hw",
+    "stride_hw",
+    "dilation_hw",
+    "pad_hw",
     "groups",
     "deformable_groups",
-    "dtype",
-    "seed"
+    "seed",
+    "types_dict",
 ]
 output_args = [
     "input",

From 0c4f1f27a686b79a7b0994da2599e8c655588177 Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Wed, 28 Jun 2023 01:39:56 +0200
Subject: [PATCH 20/20] Remove auto convert list to vector

---
 .../deformable_convolution_sycl/impl.cpp      | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
index 9cff7a98..3bc49f39 100644
--- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -14,8 +14,6 @@
 #include "oneapi/mkl.hpp"
 #include "utils.hpp"
 
-#include <pybind11/stl.h>
-
 using namespace sycl;
 namespace py = pybind11;
 
@@ -270,9 +268,9 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input,
                             dpctl::tensor::usm_ndarray weights,
                             dpctl::tensor::usm_ndarray bias,
                             dpctl::tensor::usm_ndarray tmp,
-                            std::vector<int> stride_hw,
-                            std::vector<int> pad_hw,
-                            std::vector<int> dilation_hw,
+                            py::list stride_hw,
+                            py::list pad_hw,
+                            py::list dilation_hw,
                             int groups,
                             int deformable_groups)
 {
@@ -303,14 +301,14 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input,
     int kernel_height = weights.get_shape(2);
     int kernel_width = weights.get_shape(3);
 
-    auto stride_y = stride_hw[0];
-    auto stride_x = stride_hw[1];
+    auto stride_y = stride_hw[0].cast<int>();
+    auto stride_x = stride_hw[1].cast<int>();
 
-    auto pad_y = pad_hw[0];
-    auto pad_x = pad_hw[1];
+    auto pad_y = pad_hw[0].cast<int>();
+    auto pad_x = pad_hw[1].cast<int>();
 
-    auto dilation_y = pad_hw[0];
-    auto dilation_x = pad_hw[1];
+    auto dilation_y = pad_hw[0].cast<int>();
+    auto dilation_x = pad_hw[1].cast<int>();
 
     auto input_shape = Shape4D({batch, in_channels, in_height, in_width});
     auto output_shape = Shape4D({batch, out_channels, out_height, out_width});