diff --git a/dpbench/benchmarks/CMakeLists.txt b/dpbench/benchmarks/CMakeLists.txt
index 6869220b..8032000b 100644
--- a/dpbench/benchmarks/CMakeLists.txt
+++ b/dpbench/benchmarks/CMakeLists.txt
@@ -9,6 +9,7 @@ add_subdirectory(rambo)
 add_subdirectory(kmeans)
 add_subdirectory(knn)
 add_subdirectory(gpairs)
+add_subdirectory(deformable_convolution)
 add_subdirectory(dbscan)
 
 # generate dpcpp version into config
diff --git a/dpbench/benchmarks/deformable_convolution/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt
new file mode 100644
index 00000000..820b345a
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt
@@ -0,0 +1,5 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+add_subdirectory(deformable_convolution_sycl_native_ext)
diff --git a/dpbench/benchmarks/deformable_convolution/__init__.py b/dpbench/benchmarks/deformable_convolution/__init__.py
new file mode 100644
index 00000000..a94a9e6b
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/__init__.py
@@ -0,0 +1,6 @@
+# Copyright 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache 2.0
+
+"""Deformable convolution
+"""
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
new file mode 100644
index 00000000..9903e9d8
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py
@@ -0,0 +1,53 @@
+# Copyright 2022 Intel Corp.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+def initialize(
+    batch,
+    in_chw,
+    out_chw,
+    kernel_hw,
+    stride_hw,
+    dilation_hw,
+    pad_hw,
+    groups,
+    deformable_groups,
+    seed,
+    types_dict,
+):
+    import numpy as np
+    import numpy.random as default_rng
+
+    dtype: np.dtype = types_dict["float"]
+
+    default_rng.seed(seed)
+
+    input_size = [batch] + in_chw  # nchw
+    output_size = [batch] + out_chw  # nchw
+    offset_size = kernel_hw + [2, out_chw[1], out_chw[2]]  # kh, kw, 2, oh, ow
+    weights_size = [out_chw[0], in_chw[0]] + kernel_hw  # oc, ic, kh, kw
+    bias_size = out_chw[0]  # oc
+    tmp_size = [
+        in_chw[0],
+        kernel_hw[0],
+        kernel_hw[1],
+        out_chw[1],
+        out_chw[2],
+    ]  # ic, kh, kw, oh, ow
+
+    input = default_rng.random(input_size).astype(dtype)
+    output = np.empty(output_size, dtype=dtype)
+    offset = 2 * default_rng.random(offset_size).astype("float32") - 1
+    weights = default_rng.random(weights_size).astype(dtype)
+    bias = default_rng.random(bias_size).astype(dtype)
+    tmp = np.empty(tmp_size, dtype=dtype)
+
+    return (
+        input,
+        output,
+        offset,
+        weights,
+        bias,
+        tmp,
+    )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
new file mode 100644
index 00000000..77cbc5a9
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py
@@ -0,0 +1,166 @@
+# Copyright 2022 Intel Corp.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import math
+
+import numpy as np
+from numba import prange
+from numba_mlir import njit
+
+
+@njit(parallel=True, inline="always", fastmath=True, gpu_fp64_truncate="auto")
+def bilinear(input, offset_y, offset_x):
+    height, width = input.shape
+    start_x = int(math.floor(offset_x))
+    start_x_weight = 1 - (offset_x - start_x)
+    start_y = int(math.floor(offset_y))
+    start_y_weight = 1 - (offset_y - start_y)
+
+    output = 0
+    if (
+        offset_x >= width
+        or offset_y >= height
+        or offset_x <= -1
+        or offset_y <= -1
+    ):
+        return output
+
+    if start_x >= 0 and start_y >= 0:
+        w = start_x_weight * start_y_weight
+        output += w * input[start_y, start_x]
+
+    if start_x + 1 < width and start_y >= 0:
+        w = (1 - start_x_weight) * start_y_weight
+        output += w * input[start_y, start_x + 1]
+
+    if start_x >= 0 and start_y + 1 < height:
+        w = start_x_weight * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x]
+
+    if start_x + 1 < width and start_y + 1 < height:
+        w = (1 - start_x_weight) * (1 - start_y_weight)
+        output += w * input[start_y + 1, start_x + 1]
+
+    return output / 2
+
+
+@njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
+def deform(
+    input, offset, output, stride, pad, dilation, groups, deformable_groups
+):
+    k_height, k_width, _, out_height, out_width = offset.shape
+    channels, _, _ = input.shape
+
+    k_h_m = (k_height - 1) // 2
+    k_w_m = (k_width - 1) // 2
+    for ckhkw in prange(channels * k_height * k_width):
+        for h in prange(out_height):
+            for w in prange(out_width):
+                c = ckhkw // (k_height * k_width)
+                khkw = ckhkw % (k_height * k_width)
+                kh = khkw // k_width
+                kw = khkw % k_width
+
+                offset_y = (
+                    offset[kh, kw, 1, h, w]
+                    + h * stride[0]
+                    + (kh - k_h_m) * dilation[0]
+                    - (pad[0] - k_h_m)
+                )
+                offset_x = (
+                    offset[kh, kw, 0, h, w]
+                    + w * stride[1]
+                    + (kw - k_w_m) * dilation[1]
+                    - (pad[1] - k_w_m)
+                )
+
+                output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x)
+
+
+@njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto")
+def deformable_convolution_b1(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride,
+    pad,
+    dilation,
+    groups,
+    deformable_groups,
+):
+    out_channels, height, width = output.shape
+    _, in_channels, k_height, k_width = weights.shape
+
+    deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups)
+
+    tmp = tmp.reshape((in_channels * k_height * k_width, height * width))
+
+    _weights = weights.reshape((out_channels, in_channels * k_height * k_width))
+    _output = output.reshape((out_channels, height * width))
+    np.dot(_weights, tmp, _output)
+
+    _bias = bias.reshape((out_channels, 1))
+    _output[:] = _output + _bias
+
+
+@njit(parallel=True, gpu_fp64_truncate="auto")
+def jdeformable_convolution(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride,
+    pad,
+    dilation,
+    groups,
+    deformable_groups,
+):
+    batch, _, _, _ = input.shape
+    for b in range(batch):
+        deformable_convolution_b1(
+            input[b],
+            output[b],
+            offset,
+            weights,
+            bias,
+            tmp,
+            stride,
+            pad,
+            dilation,
+            groups,
+            deformable_groups,
+        )
+
+
+def deformable_convolution(
+    input,
+    output,
+    offset,
+    weights,
+    bias,
+    tmp,
+    stride_hw,
+    pad_hw,
+    dilation_hw,
+    groups,
+    deformable_groups,
+):
+    jdeformable_convolution(
+        input,
+        output,
+        offset,
+        weights,
+        bias,
+        tmp,
+        tuple(stride_hw),
+        tuple(pad_hw),
+        tuple(dilation_hw),
+        groups,
+        deformable_groups,
+    )
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
new file mode 100644
index 00000000..e5911eb6
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.23)
+
+set(py_module_name _deformable_convolution_sycl)
+pybind11_add_module(${py_module_name}
+    MODULE
+    deformable_convolution_sycl/impl.cpp
+)
+
+find_package(TBB CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
+find_package(IntelSYCL CONFIG REQUIRED)
+
+if (DEFINED ENV{CONDA_PREFIX})
+  set(MKL_ROOT $ENV{CONDA_PREFIX})
+endif()
+find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
+
+target_compile_options(${py_module_name} PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
+target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
+target_link_libraries(${py_module_name} PUBLIC $<LINK_ONLY:MKL::MKL> ${MKL_SYCL})
+
+file(RELATIVE_PATH py_module_dest ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+install(TARGETS ${py_module_name}
+  DESTINATION ${py_module_dest}/deformable_convolution_sycl
+)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
new file mode 100644
index 00000000..01a4191d
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py
@@ -0,0 +1,9 @@
+# Copyright 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache 2.0
+
+from .deformable_convolution_sycl._deformable_convolution_sycl import (
+    deformable_convolution as deformable_convolution_sycl,
+)
+
+__all__ = ["deformable_convolution_sycl"]
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake
new file mode 100644
index 00000000..e83661c4
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake
@@ -0,0 +1,852 @@
+#===============================================================================
+# Copyright 2021-2022 Intel Corporation.
+#
+# This software and the related documents are Intel copyrighted  materials,  and
+# your use of  them is  governed by the  express license  under which  they were
+# provided to you (License).  Unless the License provides otherwise, you may not
+# use, modify, copy, publish, distribute,  disclose or transmit this software or
+# the related documents without Intel's prior written permission.
+#
+# This software and the related documents  are provided as  is,  with no express
+# or implied  warranties,  other  than those  that are  expressly stated  in the
+# License.
+#===============================================================================
+
+#===================================================================
+# CMake Config file for Intel(R) oneAPI Math Kernel Library (oneMKL)
+#===================================================================
+
+#===============================================================================
+# Input parameters
+#=================
+#-------------
+# Main options
+#-------------
+# MKL_ROOT: oneMKL root directory (May be required for non-standard install locations. Optional otherwise.)
+#    Default: use location from MKLROOT environment variable or <Full path to this file>/../../../ if MKLROOT is not defined
+# MKL_ARCH
+#    Values:  ia32 intel64
+#    Default: intel64
+# MKL_LINK
+#    Values:  static, dynamic, sdl
+#    Default: dynamic
+#       Exceptions:- DPC++ doesn't support sdl
+# MKL_THREADING
+#    Values:  sequential,
+#             intel_thread (Intel OpenMP),
+#             gnu_thread (GNU OpenMP),
+#             pgi_thread (PGI OpenMP),
+#             tbb_thread
+#    Default: intel_thread
+#       Exceptions:- DPC++ defaults to tbb, PGI compiler on Windows defaults to pgi_thread
+# MKL_INTERFACE (for MKL_ARCH=intel64 only)
+#    Values:  lp64, ilp64
+#       GNU or INTEL interface will be selected based on Compiler.
+#    Default: ilp64
+# MKL_MPI
+#    Values:  intelmpi, mpich, openmpi, msmpi, mshpc
+#    Default: intelmpi
+#-----------------------------------
+# Special options (OFF by default)
+#-----------------------------------
+# ENABLE_BLAS95:      Enables BLAS Fortran95 API
+# ENABLE_LAPACK95:    Enables LAPACK Fortran95 API
+# ENABLE_BLACS:       Enables cluster BLAS library
+# ENABLE_CDFT:        Enables cluster DFT library
+# ENABLE_CPARDISO:    Enables cluster PARDISO functionality
+# ENABLE_SCALAPACK:   Enables cluster LAPACK library
+# ENABLE_OMP_OFFLOAD: Enables OpenMP Offload functionality
+#
+#==================
+# Output parameters
+#==================
+# MKL_ROOT
+#     oneMKL root directory.
+# MKL_INCLUDE
+#     Use of target_include_directories() is recommended.
+#     INTERFACE_INCLUDE_DIRECTORIES property is set on mkl_core and mkl_rt libraries.
+#     Alternatively, this variable can be used directly (not recommended as per Modern CMake)
+# MKL_ENV
+#     Provides all environment variables based on input parameters.
+#     Currently useful for mkl_rt linking and BLACS on Windows.
+#     Must be set as an ENVIRONMENT property.
+# Example:
+#     add_test(NAME mytest COMMAND myexe)
+#     if(MKL_ENV)
+#       set_tests_properties(mytest PROPERTIES ENVIRONMENT "${MKL_ENV}")
+#     endif()
+#
+# MKL::<library name>
+#     IMPORTED targets to link MKL libraries individually or when using a custom link-line.
+#     mkl_core and mkl_rt have INTERFACE_* properties set to them.
+#     Please refer to Intel(R) oneMKL Link Line Advisor for help with linking.
+#
+# Below INTERFACE targets provide full link-lines for direct use.
+# Example:
+#     target_link_options(<my_linkable_target> PUBLIC $<LINK_ONLY:MKL::MKL>)
+#
+# MKL::MKL
+#     Link line for C and Fortran API
+# MKL::MKL_DPCPP
+#     Link line for DPC++ API
+#
+# Note: For Device API, library linking is not required.
+#       Compile options can be added from the INTERFACE_COMPILE_OPTIONS property on MKL::MKL_DPCPP
+#       Include directories can be added from the INTERFACE_INCLUDE_DIRECTORIES property on MKL::MKL_DPCPP
+#
+# Note: Output parameters' and targets' availability can change
+# based on Input parameters and application project languages.
+#===============================================================================
+
+function(mkl_message MSG_MODE MSG_TEXT)
+  if(MSG_MODE STREQUAL "FATAL_ERROR")
+    message(${MSG_MODE} ${MSG_TEXT})
+  else()
+    if(NOT MKL_FIND_QUIETLY)
+      message(${MSG_MODE} ${MSG_TEXT})
+    endif()
+  endif()
+endfunction()
+
+if(${CMAKE_VERSION} VERSION_LESS "3.13")
+  mkl_message(FATAL_ERROR "The minimum supported CMake version is 3.13. You are running version ${CMAKE_VERSION}")
+endif()
+
+include_guard()
+include(FindPackageHandleStandardArgs)
+
+if(NOT MKL_LIBRARIES)
+
+# Set CMake policies for well-defined behavior across CMake versions
+cmake_policy(SET CMP0011 NEW)
+cmake_policy(SET CMP0057 NEW)
+
+# Project Languages
+get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES)
+list(APPEND MKL_LANGS C CXX Fortran)
+foreach(lang ${languages})
+  if(${lang} IN_LIST MKL_LANGS)
+    list(APPEND CURR_LANGS ${lang})
+  endif()
+endforeach()
+list(REMOVE_DUPLICATES CURR_LANGS)
+
+option(ENABLE_BLAS95      "Enables BLAS Fortran95 API"            OFF)
+option(ENABLE_LAPACK95    "Enables LAPACK Fortran95 API"          OFF)
+option(ENABLE_BLACS       "Enables cluster BLAS library"          OFF)
+option(ENABLE_CDFT        "Enables cluster DFT library"           OFF)
+option(ENABLE_CPARDISO    "Enables cluster PARDISO functionality" OFF)
+option(ENABLE_SCALAPACK   "Enables cluster LAPACK library"        OFF)
+option(ENABLE_OMP_OFFLOAD "Enables OpenMP Offload functionality"  OFF)
+
+# Use MPI if any of these are enabled
+if(ENABLE_BLACS OR ENABLE_CDFT OR ENABLE_SCALAPACK OR ENABLE_CPARDISO)
+  set(USE_MPI ON)
+endif()
+
+# Check Parameters
+function(define_param TARGET_PARAM DEFAULT_PARAM SUPPORTED_LIST)
+  if(NOT DEFINED ${TARGET_PARAM} AND NOT DEFINED ${DEFAULT_PARAM})
+    mkl_message(STATUS "${TARGET_PARAM}: Undefined")
+  elseif(NOT DEFINED ${TARGET_PARAM} AND DEFINED ${DEFAULT_PARAM})
+    set(${TARGET_PARAM} "${${DEFAULT_PARAM}}" CACHE STRING "Choose ${TARGET_PARAM} options are: ${${SUPPORTED_LIST}}")
+    foreach(opt ${${DEFAULT_PARAM}})
+      set(STR_LIST "${STR_LIST} ${opt}")
+    endforeach()
+    mkl_message(STATUS "${TARGET_PARAM}: None, set to `${STR_LIST}` by default")
+  elseif(${SUPPORTED_LIST})
+    set(ITEM_FOUND 1)
+    foreach(opt ${${TARGET_PARAM}})
+      if(NOT ${opt} IN_LIST ${SUPPORTED_LIST})
+        set(ITEM_FOUND 0)
+      endif()
+    endforeach()
+    if(ITEM_FOUND EQUAL 0)
+      foreach(opt ${${SUPPORTED_LIST}})
+        set(STR_LIST "${STR_LIST} ${opt}")
+      endforeach()
+      mkl_message(FATAL_ERROR "Invalid ${TARGET_PARAM} `${${TARGET_PARAM}}`, options are: ${STR_LIST}")
+    else()
+      mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}")
+    endif()
+  else()
+    mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}")
+  endif()
+endfunction()
+
+#================
+# Compiler checks
+#================
+
+if(CMAKE_C_COMPILER)
+  get_filename_component(C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME)
+endif()
+if(CMAKE_CXX_COMPILER)
+  get_filename_component(CXX_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME)
+endif()
+if(CMAKE_Fortran_COMPILER)
+  get_filename_component(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME)
+endif()
+
+# Determine Compiler Family
+if(CXX_COMPILER_NAME STREQUAL "dpcpp" OR CXX_COMPILER_NAME STREQUAL "dpcpp.exe"
+    OR CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe")
+  set(DPCPP_COMPILER ON)
+endif()
+if(C_COMPILER_NAME MATCHES "^clang")
+  set(CLANG_COMPILER ON)
+endif()
+if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+  set(PGI_COMPILER ON)
+elseif(CMAKE_C_COMPILER_ID STREQUAL "Intel" OR CMAKE_Fortran_COMPILER_ID STREQUAL "Intel"
+        OR CMAKE_C_COMPILER_ID STREQUAL "IntelLLVM" OR CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")
+  set(INTEL_COMPILER ON)
+else()
+  if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
+    set(GNU_C_COMPILER ON)
+  endif()
+  if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+    set(GNU_Fortran_COMPILER ON)
+  endif()
+endif()
+
+if(USE_MPI AND (C_COMPILER_NAME MATCHES "^mpi" OR Fortran_COMPILER_NAME MATCHES "^mpi"))
+  set(USE_MPI_SCRIPT ON)
+endif()
+
+#================
+
+#================
+# System-specific
+#================
+
+# Extensions
+if(UNIX)
+  set(LIB_PREFIX "lib")
+  set(LIB_EXT ".a")
+  set(DLL_EXT ".so")
+  if(APPLE)
+    set(DLL_EXT ".dylib")
+  endif()
+  set(LINK_PREFIX "-l")
+  set(LINK_SUFFIX "")
+else()
+  set(LIB_PREFIX "")
+  set(LIB_EXT ".lib")
+  set(DLL_EXT "_dll.lib")
+  set(LINK_PREFIX "")
+  set(LINK_SUFFIX ".lib")
+endif()
+
+# Set target system architecture
+set(DEFAULT_MKL_ARCH intel64)
+if(DPCPP_COMPILER OR PGI_COMPILER OR ENABLE_OMP_OFFLOAD OR USE_MPI)
+  set(MKL_ARCH_LIST intel64)
+else()
+  set(MKL_ARCH_LIST ia32 intel64)
+endif()
+define_param(MKL_ARCH DEFAULT_MKL_ARCH MKL_ARCH_LIST)
+
+#================
+
+#==========
+# Setup MKL
+#==========
+
+# Set MKL_ROOT directory
+if(NOT DEFINED MKL_ROOT)
+  if(DEFINED ENV{MKLROOT})
+    set(MKL_ROOT $ENV{MKLROOT})
+  else()
+    get_filename_component(MKL_CMAKE_PATH "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
+    get_filename_component(MKL_ROOT "${MKL_CMAKE_PATH}/../../../" ABSOLUTE)
+    mkl_message(STATUS "MKL_ROOT ${MKL_ROOT}")
+  endif()
+endif()
+string(REPLACE "\\" "/" MKL_ROOT ${MKL_ROOT})
+
+# Define MKL_LINK
+set(DEFAULT_MKL_LINK dynamic)
+if(DPCPP_COMPILER OR USE_MPI)
+  set(MKL_LINK_LIST static dynamic)
+else()
+  set(MKL_LINK_LIST static dynamic sdl)
+endif()
+define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST)
+
+# Define MKL_INTERFACE
+if(MKL_ARCH STREQUAL "intel64")
+  set(IFACE_TYPE intel)
+  if(GNU_Fortran_COMPILER)
+    set(IFACE_TYPE gf)
+  endif()
+  if(DPCPP_COMPILER)
+    if(MKL_INTERFACE)
+      set(MKL_INTERFACE_FULL intel_${MKL_INTERFACE})
+    endif()
+    set(DEFAULT_MKL_INTERFACE intel_ilp64)
+    set(MKL_INTERFACE_LIST intel_ilp64)
+  else()
+    if(MKL_INTERFACE)
+      set(MKL_INTERFACE_FULL ${IFACE_TYPE}_${MKL_INTERFACE})
+    endif()
+    set(DEFAULT_MKL_INTERFACE ${IFACE_TYPE}_ilp64)
+    set(MKL_INTERFACE_LIST ${IFACE_TYPE}_ilp64 ${IFACE_TYPE}_lp64)
+  endif()
+  define_param(MKL_INTERFACE_FULL DEFAULT_MKL_INTERFACE MKL_INTERFACE_LIST)
+else()
+  if(WIN32)
+    set(MKL_INTERFACE_FULL intel_c)
+  elseif(NOT APPLE)
+    if(GNU_Fortran_COMPILER)
+      set(MKL_INTERFACE_FULL gf)
+    else()
+      set(MKL_INTERFACE_FULL intel)
+    endif()
+  else()
+    mkl_message(FATAL_ERROR "OSX does not support MKL_ARCH ia32.")
+  endif()
+endif()
+if(MKL_INTERFACE_FULL MATCHES "ilp64")
+  set(MKL_INTERFACE "ilp64")
+else()
+  set(MKL_INTERFACE "lp64")
+endif()
+
+# Define MKL headers
+find_path(MKL_H mkl.h
+  HINTS ${MKL_ROOT}
+  PATH_SUFFIXES include)
+list(APPEND MKL_INCLUDE ${MKL_H})
+
+# Add pre-built F95 Interface Modules
+if(INTEL_COMPILER AND (ENABLE_BLAS95 OR ENABLE_LAPACK95))
+  if(MKL_ARCH STREQUAL "intel64")
+    list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/${MKL_ARCH}/${MKL_INTERFACE}")
+  else()
+    list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/${MKL_ARCH}")
+  endif()
+endif()
+
+# Define MKL_THREADING
+# All APIs support sequential threading
+set(MKL_THREADING_LIST "sequential" "intel_thread" "tbb_thread")
+set(DEFAULT_MKL_THREADING intel_thread)
+# DPC++ API supports TBB threading, but not OpenMP threading
+if(DPCPP_COMPILER)
+  set(DEFAULT_MKL_THREADING tbb_thread)
+  list(REMOVE_ITEM MKL_THREADING_LIST intel_thread)
+# C, Fortran API
+elseif(PGI_COMPILER)
+  # PGI compiler supports PGI OpenMP threading, additionally
+  list(APPEND MKL_THREADING_LIST pgi_thread)
+  # PGI compiler does not support TBB threading
+  list(REMOVE_ITEM MKL_THREADING_LIST tbb_thread)
+  if(WIN32)
+    # PGI 19.10 and 20.1 on Windows, do not support Intel OpenMP threading
+    list(REMOVE_ITEM MKL_THREADING_LIST intel_thread)
+    set(DEFAULT_MKL_THREADING pgi_thread)
+  endif()
+elseif(GNU_C_COMPILER OR GNU_Fortran_COMPILER OR CLANG_COMPILER)
+  list(APPEND MKL_THREADING_LIST gnu_thread)
+else()
+  # Intel and Microsoft compilers
+  # Nothing to do, only for completeness
+endif()
+define_param(MKL_THREADING DEFAULT_MKL_THREADING MKL_THREADING_LIST)
+
+# Define MKL_MPI
+set(DEFAULT_MKL_MPI intelmpi)
+if(UNIX)
+  if(APPLE)
+    # Override defaults for OSX
+    set(DEFAULT_MKL_MPI mpich)
+    set(MKL_MPI_LIST mpich)
+  else()
+    set(MKL_MPI_LIST intelmpi openmpi mpich mpich2)
+  endif()
+else()
+  # Windows
+  set(MKL_MPI_LIST intelmpi mshpc msmpi)
+endif()
+define_param(MKL_MPI DEFAULT_MKL_MPI MKL_MPI_LIST)
+# MSMPI is now called MSHPC. MSMPI option exists for backward compatibility.
+if(MKL_MPI STREQUAL "mshpc")
+  set(MKL_MPI msmpi)
+endif()
+find_package_handle_standard_args(MKL REQUIRED_VARS MKL_MPI)
+
+# Checkpoint - Verify if required options are defined
+find_package_handle_standard_args(MKL REQUIRED_VARS MKL_ROOT MKL_ARCH MKL_INCLUDE MKL_LINK MKL_THREADING MKL_INTERFACE_FULL)
+
+# Provides a list of IMPORTED targets for the project
+if(NOT DEFINED MKL_IMPORTED_TARGETS)
+  set(MKL_IMPORTED_TARGETS "")
+endif()
+
+# Clear temporary variables
+set(MKL_C_COPT "")
+set(MKL_F_COPT "")
+set(MKL_SDL_COPT "")
+set(MKL_CXX_COPT "")
+set(MKL_DPCPP_COPT "")
+set(MKL_DPCPP_LOPT "")
+set(MKL_OFFLOAD_COPT "")
+set(MKL_OFFLOAD_LOPT "")
+
+set(MKL_SUPP_LINK "")    # Other link options. Usually at the end of the link-line.
+set(MKL_LINK_LINE)       # For MPI only
+set(MKL_ENV_PATH "")     # Temporary variable to work with PATH
+set(MKL_ENV "")          # Exported environment variables
+
+# Modify PATH variable to make it CMake-friendly
+set(OLD_PATH $ENV{PATH})
+string(REPLACE ";" "\;" OLD_PATH "${OLD_PATH}")
+
+# Compiler options
+if(GNU_C_COMPILER OR GNU_Fortran_COMPILER)
+  if(MKL_ARCH STREQUAL "ia32")
+    list(APPEND MKL_C_COPT -m32)
+    list(APPEND MKL_F_COPT -m32)
+  else()
+    list(APPEND MKL_C_COPT -m64)
+    list(APPEND MKL_F_COPT -m64)
+  endif()
+endif()
+
+# Additonal compiler & linker options
+if(CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe")
+  list(APPEND MKL_DPCPP_COPT "-fsycl")
+  list(APPEND MKL_DPCPP_LOPT "-fsycl")
+endif()
+if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD)
+  if(MKL_LINK STREQUAL "static")
+    list(APPEND MKL_DPCPP_LOPT "-fsycl-device-code-split=per_kernel")
+    list(APPEND MKL_OFFLOAD_LOPT "-fsycl-device-code-split=per_kernel")
+  endif()
+endif()
+
+# For OpenMP Offload
+if(ENABLE_OMP_OFFLOAD)
+  if(WIN32)
+    if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1")
+      if("Fortran" IN_LIST CURR_LANGS)
+        list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -DONEMKL_USE_OPENMP_VERSION=202011)
+      else()
+        list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -Qopenmp-version:51 -DONEMKL_USE_OPENMP_VERSION=202011)
+      endif()
+    else()
+      list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64)
+    endif()
+    # -MD and -MDd are manually added here because offload functionality uses DPC++ runtime.
+    if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
+      list(APPEND MKL_OFFLOAD_COPT -MDd)
+    else()
+      list(APPEND MKL_OFFLOAD_COPT -MD)
+    endif()
+    list(APPEND MKL_OFFLOAD_LOPT -Qiopenmp -Qopenmp-targets:spir64 -fsycl)
+    set(SKIP_LIBPATH ON)
+  else()
+    if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1")
+      if("Fortran" IN_LIST CURR_LANGS)
+        list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -DONEMKL_USE_OPENMP_VERSION=202011)
+      else()
+        list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -fopenmp-version=51 -DONEMKL_USE_OPENMP_VERSION=202011)
+      endif()
+    else ()
+      list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64)
+    endif()
+    list(APPEND MKL_OFFLOAD_LOPT -fiopenmp -fopenmp-targets=spir64 -fsycl)
+    if(APPLE)
+      list(APPEND MKL_SUPP_LINK -lc++)
+    else()
+      list(APPEND MKL_SUPP_LINK -lstdc++)
+    endif()
+  endif()
+endif()
+
+# For selected Interface
+if(MKL_INTERFACE_FULL)
+  if(MKL_ARCH STREQUAL "ia32")
+    if(GNU_Fortran_COMPILER)
+      set(MKL_SDL_IFACE_ENV "GNU")
+    endif()
+  else()
+    if(GNU_Fortran_COMPILER)
+      set(MKL_SDL_IFACE_ENV "GNU,${MKL_INTERFACE}")
+    else()
+      set(MKL_SDL_IFACE_ENV "${MKL_INTERFACE}")
+    endif()
+    if(MKL_INTERFACE STREQUAL "ilp64")
+      if("Fortran" IN_LIST CURR_LANGS)
+        if(INTEL_COMPILER)
+          if(WIN32)
+            list(APPEND MKL_F_COPT "-4I8")
+          else()
+            list(APPEND MKL_F_COPT "-i8")
+          endif()
+        elseif(GNU_Fortran_COMPILER)
+          list(APPEND MKL_F_COPT "-fdefault-integer-8")
+        elseif(PGI_COMPILER)
+          list(APPEND MKL_F_COPT "-i8")
+        endif()
+      endif()
+      list(INSERT MKL_C_COPT 0 "-DMKL_ILP64")
+      list(INSERT MKL_SDL_COPT 0 "-DMKL_ILP64")
+      list(INSERT MKL_CXX_COPT 0 "-DMKL_ILP64")
+      list(INSERT MKL_OFFLOAD_COPT 0 "-DMKL_ILP64")
+    else()
+      # lp64
+    endif()
+  endif()
+  if(MKL_SDL_IFACE_ENV)
+    string(TOUPPER ${MKL_SDL_IFACE_ENV} MKL_SDL_IFACE_ENV)
+  endif()
+endif() # MKL_INTERFACE_FULL
+
+# All MKL Libraries
+if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
+  set(MKL_SYCL          mkl_sycld)
+else()
+  set(MKL_SYCL          mkl_sycl)
+endif()
+set(MKL_IFACE_LIB     mkl_${MKL_INTERFACE_FULL})
+set(MKL_CORE          mkl_core)
+if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo" AND MKL_THREADING STREQUAL "tbb_thread")
+  set(MKL_THREAD        mkl_tbb_threadd)
+else()
+  set(MKL_THREAD        mkl_${MKL_THREADING})
+endif()
+set(MKL_SDL           mkl_rt)
+if(MKL_ARCH STREQUAL "ia32")
+  set(MKL_BLAS95      mkl_blas95)
+  set(MKL_LAPACK95    mkl_lapack95)
+else()
+  set(MKL_BLAS95      mkl_blas95_${MKL_INTERFACE})
+  set(MKL_LAPACK95    mkl_lapack95_${MKL_INTERFACE})
+endif()
+# BLACS
+set(MKL_BLACS mkl_blacs_${MKL_MPI}_${MKL_INTERFACE})
+if(UNIX AND NOT APPLE AND MKL_MPI MATCHES "mpich")
+  # MPICH is compatible with INTELMPI Wrappers on Linux
+  set(MKL_BLACS mkl_blacs_intelmpi_${MKL_INTERFACE})
+endif()
+if(WIN32)
+  if(MKL_MPI STREQUAL "msmpi")
+    if("Fortran" IN_LIST CURR_LANGS)
+      list(APPEND MKL_SUPP_LINK "msmpifec.lib")
+    endif()
+    # MSMPI and MSHPC are supported with the same BLACS library
+    set(MKL_BLACS mkl_blacs_msmpi_${MKL_INTERFACE})
+    if(NOT MKL_LINK STREQUAL "static")
+      set(MKL_BLACS mkl_blacs_${MKL_INTERFACE})
+      set(MKL_BLACS_ENV MSMPI)
+    endif()
+  elseif(MKL_MPI STREQUAL "intelmpi" AND NOT MKL_LINK STREQUAL "static")
+    set(MKL_BLACS mkl_blacs_${MKL_INTERFACE})
+    set(MKL_BLACS_ENV INTELMPI)
+  endif()
+endif()
+# CDFT & SCALAPACK
+set(MKL_CDFT      mkl_cdft_core)
+set(MKL_SCALAPACK mkl_scalapack_${MKL_INTERFACE})
+
+
+if (UNIX)
+  if(NOT APPLE)
+    if(MKL_LINK STREQUAL "static")
+      set(START_GROUP "-Wl,--start-group")
+      set(END_GROUP "-Wl,--end-group")
+      if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD)
+        set(EXPORT_DYNAMIC "-Wl,-export-dynamic")
+      endif()
+    elseif(MKL_LINK STREQUAL "dynamic")
+      set(MKL_RPATH "-Wl,-rpath=$<TARGET_FILE_DIR:MKL::${MKL_CORE}>")
+      if((GNU_Fortran_COMPILER OR PGI_COMPILER) AND "Fortran" IN_LIST CURR_LANGS)
+        set(NO_AS_NEEDED -Wl,--no-as-needed)
+      endif()
+    else()
+      set(MKL_RPATH "-Wl,-rpath=$<TARGET_FILE_DIR:MKL::${MKL_SDL}>")
+    endif()
+  endif()
+endif()
+
+# Create a list of requested libraries, based on input options (MKL_LIBRARIES)
+# Create full link-line in MKL_LINK_LINE
+list(APPEND MKL_LINK_LINE $<IF:$<BOOL:${ENABLE_OMP_OFFLOAD}>,${MKL_OFFLOAD_LOPT},>
+    $<IF:$<BOOL:${DPCPP_COMPILER}>,${MKL_DPCPP_LOPT},> ${EXPORT_DYNAMIC} ${NO_AS_NEEDED} ${MKL_RPATH})
+if(ENABLE_BLAS95)
+  list(APPEND MKL_LIBRARIES ${MKL_BLAS95})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_BLAS95})
+endif()
+if(ENABLE_LAPACK95)
+  list(APPEND MKL_LIBRARIES ${MKL_LAPACK95})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_LAPACK95})
+endif()
+if(ENABLE_SCALAPACK)
+  list(APPEND MKL_LIBRARIES ${MKL_SCALAPACK})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_SCALAPACK})
+endif()
+if(DPCPP_COMPILER OR (ENABLE_OMP_OFFLOAD AND NOT MKL_LINK STREQUAL "sdl"))
+  list(APPEND MKL_LIBRARIES ${MKL_SYCL})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_SYCL})
+endif()
+list(APPEND MKL_LINK_LINE ${START_GROUP})
+if(ENABLE_CDFT)
+  list(APPEND MKL_LIBRARIES ${MKL_CDFT})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_CDFT})
+endif()
+if(MKL_LINK STREQUAL "sdl")
+  list(APPEND MKL_LIBRARIES ${MKL_SDL})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_SDL})
+else()
+  list(APPEND MKL_LIBRARIES ${MKL_IFACE_LIB} ${MKL_THREAD} ${MKL_CORE})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_IFACE_LIB} MKL::${MKL_THREAD} MKL::${MKL_CORE})
+endif()
+if(USE_MPI)
+  list(APPEND MKL_LIBRARIES ${MKL_BLACS})
+  list(APPEND MKL_LINK_LINE MKL::${MKL_BLACS})
+endif()
+list(APPEND MKL_LINK_LINE ${END_GROUP})
+
+# Find all requested libraries
+foreach(lib ${MKL_LIBRARIES})
+  unset(${lib}_file CACHE)
+  if(MKL_LINK STREQUAL "static" AND NOT ${lib} STREQUAL ${MKL_SDL})
+    find_library(${lib}_file ${LIB_PREFIX}${lib}${LIB_EXT}
+                  PATHS ${MKL_ROOT}
+                  PATH_SUFFIXES "lib" "lib/${MKL_ARCH}")
+    add_library(MKL::${lib} STATIC IMPORTED)
+  else()
+    find_library(${lib}_file NAMES ${LIB_PREFIX}${lib}${DLL_EXT} ${lib}
+                  PATHS ${MKL_ROOT}
+                  PATH_SUFFIXES "lib" "lib/${MKL_ARCH}")
+    add_library(MKL::${lib} SHARED IMPORTED)
+  endif()
+  find_package_handle_standard_args(MKL REQUIRED_VARS ${lib}_file)
+  # CMP0111, implemented in CMake 3.20+ requires a shared library target on Windows
+  # to be defined with IMPLIB and LOCATION property.
+  # It also requires a static library target to be defined with LOCATION property.
+  # Setting the policy to OLD usage, using cmake_policy() does not work as of 3.20.0, hence the if-else below.
+  if(WIN32 AND NOT MKL_LINK STREQUAL "static")
+    set_target_properties(MKL::${lib} PROPERTIES IMPORTED_IMPLIB "${${lib}_file}")
+    # Find corresponding DLL
+    set(MKL_DLL_GLOB ${lib}.*.dll)
+    file(GLOB MKL_DLL_FILE "${MKL_ROOT}/redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
+        "${MKL_ROOT}/../redist/${MKL_ARCH}/${MKL_DLL_GLOB}"
+        "${MKL_ROOT}/../redist/${MKL_ARCH}/mkl/${MKL_DLL_GLOB}"
+        "${MKL_ROOT}/bin/${MKL_DLL_GLOB}"
+        "${MKL_ROOT}/lib")
+    if(NOT ${lib} STREQUAL ${MKL_IFACE_LIB} AND NOT ${lib} STREQUAL ${MKL_BLAS95} AND NOT ${lib} STREQUAL ${MKL_LAPACK95})  # Windows IFACE libs are static only
+      list(LENGTH MKL_DLL_FILE MKL_DLL_FILE_LEN)
+      if(MKL_DLL_FILE_LEN)
+        # in case multiple versions of the same dll are found, select the highest version
+        list(SORT MKL_DLL_FILE)
+        list(REVERSE MKL_DLL_FILE)
+        list(GET MKL_DLL_FILE 0 MKL_DLL_FILE)
+
+        mkl_message(STATUS "Found DLL: ${MKL_DLL_FILE}")
+        set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${MKL_DLL_FILE}")
+      else()
+        mkl_message(FATAL_ERROR "${MKL_DLL_GLOB} not found. MKL_ROOT was '${MKL_ROOT}'. MKL_DLL_FILE is '${MKL_DLL_FILE}'")
+      endif()
+    endif()
+  else()
+    set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${${lib}_file}")
+  endif()
+  list(APPEND MKL_IMPORTED_TARGETS MKL::${lib})
+endforeach()
+
+# Threading selection
+if(MKL_THREADING)
+  if(MKL_THREADING STREQUAL "tbb_thread")
+    find_package(TBB REQUIRED CONFIG COMPONENTS tbb)
+    set(MKL_THREAD_LIB $<TARGET_LINKER_FILE:TBB::tbb>)
+    set(MKL_SDL_THREAD_ENV "TBB")
+    get_property(TBB_LIB TARGET TBB::tbb PROPERTY IMPORTED_LOCATION_RELEASE)
+    get_filename_component(TBB_LIB_DIR ${TBB_LIB} DIRECTORY)
+    if(UNIX)
+      if(CMAKE_SKIP_BUILD_RPATH)
+        set(TBB_LINK "-L${TBB_LIB_DIR} -ltbb")
+      else()
+        set(TBB_LINK "-Wl,-rpath,${TBB_LIB_DIR} -L${TBB_LIB_DIR} -ltbb")
+      endif()
+      list(APPEND MKL_SUPP_LINK ${TBB_LINK})
+      if(APPLE)
+        list(APPEND MKL_SUPP_LINK -lc++)
+      else()
+        list(APPEND MKL_SUPP_LINK -lstdc++)
+      endif()
+    endif()
+    if(WIN32 OR APPLE)
+      set(MKL_ENV_PATH ${TBB_LIB_DIR})
+    endif()
+  elseif(MKL_THREADING MATCHES "_thread")
+    if(MKL_THREADING STREQUAL "pgi_thread")
+      list(APPEND MKL_SUPP_LINK -mp -pgf90libs)
+      set(MKL_SDL_THREAD_ENV "PGI")
+    elseif(MKL_THREADING STREQUAL "gnu_thread")
+      list(APPEND MKL_SUPP_LINK -lgomp)
+      set(MKL_SDL_THREAD_ENV "GNU")
+    else()
+      # intel_thread
+      if(UNIX)
+        set(MKL_OMP_LIB iomp5)
+        set(LIB_EXT ".so")
+        if(APPLE)
+          set(LIB_EXT ".dylib")
+        endif()
+      else()
+        set(MKL_OMP_LIB libiomp5md)
+      endif()
+      set(MKL_SDL_THREAD_ENV "INTEL")
+      set(OMP_LIBNAME ${LIB_PREFIX}${MKL_OMP_LIB}${LIB_EXT})
+
+      find_library(OMP_LIBRARY ${OMP_LIBNAME}
+        HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} ${CMPLR_ROOT}
+        PATH_SUFFIXES "lib" "lib/${MKL_ARCH}"
+               "lib/${MKL_ARCH}_lin" "lib/${MKL_ARCH}_win"
+               "linux/compiler/lib/${MKL_ARCH}"
+               "linux/compiler/lib/${MKL_ARCH}_lin"
+               "windows/compiler/lib/${MKL_ARCH}"
+               "windows/compiler/lib/${MKL_ARCH}_win"
+               "../compiler/lib/${MKL_ARCH}_lin" "../compiler/lib/${MKL_ARCH}_win"
+               "../compiler/lib/${MKL_ARCH}" "../compiler/lib"
+               "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}"
+               "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}_lin"
+               "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}"
+               "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}_win"
+               "../../compiler/latest/mac/compiler/lib")
+      if(WIN32)
+        set(OMP_DLLNAME ${LIB_PREFIX}${MKL_OMP_LIB}.dll)
+        find_path(OMP_DLL_DIR ${OMP_DLLNAME}
+          HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} ${CMPLR_ROOT}
+          PATH_SUFFIXES "redist/${MKL_ARCH}"
+               "redist/${MKL_ARCH}_win" "redist/${MKL_ARCH}_win/compiler"
+               "../redist/${MKL_ARCH}/compiler" "../compiler/lib"
+               "../../compiler/latest/windows/redist/${MKL_ARCH}_win"
+               "../../compiler/latest/windows/redist/${MKL_ARCH}_win/compiler"
+               "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win"
+               "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win/compiler")
+        find_package_handle_standard_args(MKL REQUIRED_VARS OMP_DLL_DIR)
+        set(MKL_ENV_PATH "${OMP_DLL_DIR}")
+      endif()
+
+      if(WIN32 AND SKIP_LIBPATH)
+        # Only for Intel OpenMP Offload
+        set(OMP_LINK "libiomp5md.lib")
+      else()
+        set(OMP_LINK "${OMP_LIBRARY}")
+        if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+          # Disable PGI OpenMP runtime for correct work of Intel OpenMP runtime
+          list(APPEND MKL_SUPP_LINK -nomp)
+        endif()
+      endif()
+      find_package_handle_standard_args(MKL REQUIRED_VARS OMP_LIBRARY OMP_LINK)
+      set(MKL_THREAD_LIB ${OMP_LINK})
+    endif()
+  else()
+    # Sequential threading
+    set(MKL_SDL_THREAD_ENV "SEQUENTIAL")
+  endif()
+endif() # MKL_THREADING
+
+if (UNIX)
+  list(APPEND MKL_SUPP_LINK -lm -ldl -lpthread)
+endif()
+
+if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD)
+  if(WIN32)
+    # Detect sycl library version
+    if(NOT DEFINED SYCL_LIB_VER_CACHE)
+      set(SYCL_LIB_VER "")
+      find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${LIB_EXT}
+        HINTS $ENV{LIB} $ENV{CMPLR_ROOT}
+        PATH_SUFFIXES "windows/lib")
+      if(NOT SYCL_LIB_DIR)
+        foreach(ver RANGE 6 99)
+          find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${ver}${LIB_EXT}
+            HINTS $ENV{LIB} $ENV{CMPLR_ROOT}
+            PATH_SUFFIXES "windows/lib")
+          if(SYCL_LIB_DIR)
+            set(SYCL_LIB_VER ${ver})
+            break()
+          endif()
+        endforeach()
+      endif()
+      set(SYCL_LIB_VER_CACHE ${SYCL_LIB_VER} CACHE STRING "")
+    endif()
+
+    if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo")
+      list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}d${LINK_SUFFIX})
+    else()
+      list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}${LINK_SUFFIX})
+    endif()
+  else()
+    list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${LINK_SUFFIX})
+  endif()
+  list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}OpenCL${LINK_SUFFIX})
+endif()
+
+# Setup link types based on input options
+set(LINK_TYPES "")
+
+if(DPCPP_COMPILER)
+  add_library(MKL::MKL_DPCPP INTERFACE IMPORTED GLOBAL)
+  target_compile_options(MKL::MKL_DPCPP INTERFACE ${MKL_DPCPP_COPT})
+  target_link_libraries(MKL::MKL_DPCPP INTERFACE ${MKL_LINK_LINE} ${MKL_THREAD_LIB} ${MKL_SUPP_LINK})
+  list(APPEND LINK_TYPES MKL::MKL_DPCPP)
+endif()
+# Single target for all C, Fortran link-lines
+add_library(MKL::MKL INTERFACE IMPORTED GLOBAL)
+target_compile_options(MKL::MKL INTERFACE
+    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,C>:${MKL_C_COPT}>
+    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,Fortran>:${MKL_F_COPT}>
+    $<$<STREQUAL:$<TARGET_PROPERTY:LINKER_LANGUAGE>,CXX>:${MKL_CXX_COPT}>
+    $<IF:$<BOOL:${ENABLE_OMP_OFFLOAD}>,${MKL_OFFLOAD_COPT},>)
+target_link_libraries(MKL::MKL INTERFACE ${MKL_LINK_LINE} ${MKL_THREAD_LIB} ${MKL_SUPP_LINK})
+list(APPEND LINK_TYPES MKL::MKL)
+
+foreach(link ${LINK_TYPES})
+  # Set properties on all INTERFACE targets
+  target_include_directories(${link} BEFORE INTERFACE "${MKL_INCLUDE}")
+  list(APPEND MKL_IMPORTED_TARGETS ${link})
+endforeach(link) # LINK_TYPES
+
+if(MKL_LINK STREQUAL "sdl")
+  list(APPEND MKL_ENV "MKL_INTERFACE_LAYER=${MKL_SDL_IFACE_ENV}" "MKL_THREADING_LAYER=${MKL_SDL_THREAD_ENV}")
+endif()
+if(WIN32 AND NOT MKL_LINK STREQUAL "static")
+  list(APPEND MKL_ENV "MKL_BLACS_MPI=${MKL_BLACS_ENV}")
+endif()
+
+# Add MKL dynamic libraries if RPATH is not defined on Unix
+if(UNIX AND CMAKE_SKIP_BUILD_RPATH)
+  if(MKL_LINK STREQUAL "sdl")
+    set(MKL_LIB_DIR $<TARGET_FILE_DIR:MKL::${MKL_SDL}>)
+  else()
+    set(MKL_LIB_DIR $<TARGET_FILE_DIR:MKL::${MKL_CORE}>)
+  endif()
+  if(APPLE)
+    list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{DYLD_LIBRARY_PATH}")
+  else()
+    list(APPEND MKL_ENV "LD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{LD_LIBRARY_PATH}")
+  endif()
+endif()
+
+# Add MKL dynamic libraries to PATH on Windows
+if(WIN32 AND NOT MKL_LINK STREQUAL "static")
+  get_filename_component(MKL_DLL_DIR ${MKL_DLL_FILE} DIRECTORY)
+  set(MKL_ENV_PATH "${MKL_DLL_DIR}\;${MKL_ENV_PATH}")
+endif()
+
+if(MKL_ENV_PATH)
+  list(APPEND MKL_ENV "PATH=${MKL_ENV_PATH}\;${OLD_PATH}")
+  if(APPLE)
+    list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_ENV_PATH}\:${OLD_PATH}")
+  endif()
+endif()
+
+unset(MKL_DLL_FILE)
+
+endif() # MKL_LIBRARIES
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake
new file mode 100644
index 00000000..5363c1d3
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake
@@ -0,0 +1,193 @@
+# Copyright (c) 2017-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# It defines the following variables:
+#     TBB_<component>_FOUND
+#     TBB_IMPORTED_TARGETS
+#
+# TBBConfigVersion.cmake defines TBB_VERSION
+#
+# Initialize to default values
+if (NOT TBB_IMPORTED_TARGETS)
+    set(TBB_IMPORTED_TARGETS "")
+endif()
+
+if (NOT TBB_FIND_COMPONENTS)
+    set(TBB_FIND_COMPONENTS "tbb;tbbmalloc;tbbmalloc_proxy")
+    foreach (_tbb_component ${TBB_FIND_COMPONENTS})
+        set(TBB_FIND_REQUIRED_${_tbb_component} 1)
+    endforeach()
+endif()
+
+get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_DIR}" REALPATH)
+get_filename_component(_tbb_root "${_tbb_root}/../../.." ABSOLUTE)
+
+set(TBB_INTERFACE_VERSION )
+
+set(_tbb_bin_version 12)
+set(_tbbmalloc_bin_version 2)
+set(_tbbmalloc_proxy_bin_version 2)
+set(_tbbbind_bin_version 3)
+
+# Add components with internal dependencies: tbbmalloc_proxy -> tbbmalloc
+list(FIND TBB_FIND_COMPONENTS tbbmalloc_proxy _tbbmalloc_proxy_ix)
+if (NOT _tbbmalloc_proxy_ix EQUAL -1)
+    list(APPEND TBB_FIND_COMPONENTS tbbmalloc)
+    list(REMOVE_DUPLICATES TBB_FIND_COMPONENTS)
+    set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy})
+endif()
+unset(_tbbmalloc_proxy_ix)
+
+if (CMAKE_SIZEOF_VOID_P STREQUAL "8")
+    set(_tbb_subdir intel64/gcc4.8)
+else ()
+    set(_tbb_subdir ia32/gcc4.8)
+endif()
+
+if (UNIX)
+  set(_tbb_lib_ext ".so")
+  set(_tbb_lib_prefix "lib")
+  set(_tbb_lib_dir_conda "lib")
+  set(_bin_version "")
+elseif (WIN32)
+  set(_bin_version "")
+  set(_tbb_lib_prefix "")
+  set(_tbb_lib_ext ".dll")
+  set(_tbb_impllib_ext ".lib")
+  set(_tbb_lib_dir_conda "bin")
+  set(_tbb_impllib_dir_conda "lib")
+else()
+    message(FATAL_ERROR "Unsupported platform. Only Unix and Windows are supported.")
+endif()
+
+foreach (_tbb_component ${TBB_FIND_COMPONENTS})
+    set(TBB_${_tbb_component}_FOUND 0)
+
+if(WIN32)
+    unset(_bin_version)
+    if (_tbb_component STREQUAL tbb)
+        set(_bin_version ${_tbb_bin_version})
+    endif()
+endif()
+
+    if(UNIX)
+       find_library(_tbb_release_lib
+                    NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_lib_ext}
+                    PATHS ${_tbb_root}
+                    HINTS ENV TBB_ROOT_HINT
+                    PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}")
+
+    else()
+       find_file(_tbb_release_lib
+                 NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_lib_ext}
+                 PATHS ${_tbb_root}
+                 HINTS ENV TBB_ROOT_HINT
+                 PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}")
+
+       if (EXISTS "${_tbb_release_lib}")
+          find_library(_tbb_release_impllib
+                       NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_impllib_ext}
+                       PATHS ${_tbb_root}
+                       HINTS ENV TBB_ROOT_HINT
+                       PATH_SUFFIXES "${_tbb_impllib_dir_conda}" "lib/${_tbb_subdir}")
+       endif()
+    endif()
+
+    if (NOT TBB_FIND_RELEASE_ONLY)
+        find_library(_tbb_debug_lib
+                     NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}_debug.${_tbb_lib_ext}
+                     PATHS ${_tbb_root}
+                     HINTS ENV TBB_ROOT_HINT
+                     PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}")
+        if(WIN32  AND EXISTS "${_tbb_debug_lib}")
+           find_library(_tbb_debug_impllib
+                        NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}_debug.${_tbb_impllib_ext}
+                        PATHS ${_tbb_root}
+                        HINTS ENV TBB_ROOT_HINT
+                        PATH_SUFFIXES "${_tbb_impllib_dir_conda}" "lib/${_tbb_subdir}")
+        endif()
+    endif()
+
+    if (EXISTS "${_tbb_release_lib}" OR EXISTS "${_tbb_debug_lib}")
+        if (NOT TARGET TBB::${_tbb_component})
+            add_library(TBB::${_tbb_component} SHARED IMPORTED)
+
+	    find_path(_tbb_include_dir
+	      oneapi/tbb.h
+	      PATHS ${_tbb_root}
+	      PATH_SUFFIXES include
+	      HINTS ENV TBB_ROOT_HINT
+	      )
+
+if(WIN32)
+            set_target_properties(
+                TBB::${_tbb_component} PROPERTIES
+                INTERFACE_INCLUDE_DIRECTORIES "${_tbb_include_dir}"
+                INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1"
+                )
+else()
+            set_target_properties(
+                TBB::${_tbb_component} PROPERTIES
+                INTERFACE_INCLUDE_DIRECTORIES "${_tbb_include_dir}"
+                )
+endif()
+            unset(_tbb_current_realpath)
+            unset(_tbb_include_dir)
+
+            if (EXISTS "${_tbb_release_lib}")
+if(WIN32)
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}"
+                                      IMPORTED_IMPLIB_RELEASE "${_tbb_release_impllib}")
+else()
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}")
+endif()
+                set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+            endif()
+
+            if (EXISTS "${_tbb_debug_lib}")
+if(WIN32)
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}"
+                                      IMPORTED_IMPLIB_DEBUG "${_tbb_debug_impllib}"
+                )
+else()
+                set_target_properties(TBB::${_tbb_component} PROPERTIES
+                                      IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}")
+endif()
+                set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG)
+            endif()
+
+            # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc
+            if (_tbb_component STREQUAL tbbmalloc_proxy)
+                set_target_properties(TBB::tbbmalloc_proxy PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbbmalloc)
+            endif()
+        endif()
+        list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component})
+        set(TBB_${_tbb_component}_FOUND 1)
+    elseif (TBB_FIND_REQUIRED AND TBB_FIND_REQUIRED_${_tbb_component})
+        message(STATUS "Missed required oneTBB component: ${_tbb_component}")
+        if (TBB_FIND_RELEASE_ONLY)
+            message(STATUS "  ${_tbb_release_lib} must exist.")
+        else()
+            message(STATUS "  one or both of:\n   ${_tbb_release_lib}\n    ${_tbb_debug_lib}\n   files must exist.")
+        endif()
+        set(TBB_FOUND FALSE)
+    endif()
+endforeach()
+list(REMOVE_DUPLICATES TBB_IMPORTED_TARGETS)
+unset(_tbb_release_lib)
+unset(_tbb_debug_lib)
+unset(_tbb_root)
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
new file mode 100644
index 00000000..3bc49f39
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp
@@ -0,0 +1,349 @@
+//==- impl.cpp - Python native extension of deformable convolution   ===//
+//
+// Copyright 2022 Intel Corp.
+//
+// SPDX - License - Identifier : Apache 2.0
+///
+/// \file
+/// The files implements a SYCL-based Python native extension for the
+/// deformable convolution benchmark.
+
+#include "CL/sycl.hpp"
+#include "cmath"
+#include "dpctl4pybind11.hpp"
+#include "oneapi/mkl.hpp"
+#include "utils.hpp"
+
+using namespace sycl;
+namespace py = pybind11;
+
+template <class DataType>
+__attribute__((always_inline)) DataType bilinear(const DataType *input,
+                                                 int height,
+                                                 int width,
+                                                 float offset_y,
+                                                 float offset_x)
+{
+    auto start_x = int(std::floor(offset_x));
+    auto start_x_weight = 1 - (offset_x - start_x);
+    auto start_y = int(std::floor(offset_y));
+    auto start_y_weight = 1 - (offset_y - start_y);
+
+    DataType result = 0;
+    if (offset_x >= width || offset_y >= height || offset_x <= -1 ||
+        offset_y <= -1)
+        return result;
+
+    if (start_x >= 0 && start_y >= 0) {
+        auto w0 = start_x_weight * start_y_weight;
+        auto v0 = *get_ptr_2d(input, height, width, start_y, start_x);
+
+        result += w0 * v0;
+    }
+
+    if (start_x + 1 < width && start_y >= 0) {
+        auto w1 = (1 - start_x_weight) * start_y_weight;
+        auto v1 = *get_ptr_2d(input, height, width, start_y, start_x + 1);
+
+        result += w1 * v1;
+    }
+
+    if (start_x >= 0 && start_y + 1 < height) {
+        auto w2 = start_x_weight * (1 - start_y_weight);
+        auto v2 = *get_ptr_2d(input, height, width, start_y + 1, start_x);
+
+        result += w2 * v2;
+    }
+
+    if (start_x + 1 < width && start_y + 1 < height) {
+        auto w3 = (1 - start_x_weight) * (1 - start_y_weight);
+        auto v3 = *get_ptr_2d(input, height, width, start_y + 1, start_x + 1);
+
+        result += w3 * v3;
+    }
+
+    return result / 2;
+}
+
+template <class DataType> class deform;
+
+template <class DataType>
+inline auto deform_input(cl::sycl::queue &queue,
+                         const DataType *input,
+                         const Shape3D in_shape,
+                         DataType *output,
+                         const Shape5D out_shape,
+                         const float *offset,
+                         int stride_y,
+                         int stride_x,
+                         int pad_y,
+                         int pad_x,
+                         int dilation_y,
+                         int dilation_x)
+{
+    auto in_channels = in_shape[CHW::C];
+    auto in_height = in_shape[CHW::H];
+    auto in_width = in_shape[CHW::W];
+
+    auto k_height = out_shape[CKHW::KH];
+    auto k_width = out_shape[CKHW::KW];
+    auto out_height = out_shape[CKHW::H];
+    auto out_width = out_shape[CKHW::W];
+
+    assert(out_shape[CKHW::C] == in_channels);
+
+    auto wsize =
+        sycl::range<3>(in_channels * k_height * k_width, out_height, out_width);
+    return queue.parallel_for<deform<DataType>>(wsize, [=](sycl::id<3> idx) {
+        auto ckhkw = static_cast<int>(idx[0]);
+        auto h = static_cast<int>(idx[1]);
+        auto w = static_cast<int>(idx[2]);
+
+        auto c = ckhkw / (k_height * k_width);
+        auto khkw = ckhkw % (k_height * k_width);
+
+        auto kh = khkw / k_width;
+        auto kw = khkw % k_width;
+
+        auto k_h_m = (k_height - 1) / 2;
+        auto k_w_m = (k_width - 1) / 2;
+
+        auto _output = get_ptr_5d(output, in_channels, k_height, k_width,
+                                  out_height, out_width, c, kh, kw, h, 0);
+
+        auto offset_y = *get_ptr_5d(offset, k_height, k_width, 2, out_height,
+                                    out_width, kh, kw, 1, h, w) +
+                        h * stride_y + (kh - k_h_m) * dilation_y -
+                        (pad_y - k_h_m);
+        auto offset_x = *get_ptr_5d(offset, k_height, k_width, 2, out_height,
+                                    out_width, kh, kw, 0, h, w) +
+                        w * stride_x + (kw - k_w_m) * dilation_x -
+                        (pad_x - k_w_m);
+
+        auto _input =
+            get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0);
+
+        _output[w] = bilinear(_input, in_height, in_width, offset_y, offset_x);
+    });
+}
+
+template <class DataType> class fill_output;
+
+template <class DataType>
+auto output_fill_with_bias(cl::sycl::queue &queue,
+                           DataType *output,
+                           const Shape3D out_shape,
+                           const DataType *bias)
+{
+    auto out_c = out_shape[CHW::C];
+    auto out_h = out_shape[CHW::H];
+    auto out_w = out_shape[CHW::W];
+
+    return queue.parallel_for<fill_output<DataType>>(
+        sycl::range<3>(out_c, out_h, out_w), [=](sycl::id<3> idx) {
+            auto c = static_cast<int>(idx[0]);
+            auto h = static_cast<int>(idx[1]);
+            auto w = static_cast<int>(idx[2]);
+
+            auto out_ptr = get_ptr_3d(output, out_c, out_h, out_w, c, h, w);
+            *out_ptr = bias[c];
+        });
+}
+
+template <class DataType>
+void deformable_convolution_b1_impl(cl::sycl::queue &queue,
+                                    const DataType *input,
+                                    const Shape3D in_shape,
+                                    DataType *output,
+                                    const Shape3D out_shape,
+                                    DataType *tmp,
+                                    const float *offset,
+                                    const DataType *weights,
+                                    const Shape4D weights_shape,
+                                    const DataType *bias,
+                                    int stride_y,
+                                    int stride_x,
+                                    int pad_y,
+                                    int pad_x,
+                                    int dilation_y,
+                                    int dilation_x,
+                                    int groups,
+                                    int deformable_groups)
+{
+    using oneapi::mkl::transpose;
+    using oneapi::mkl::blas::row_major::gemm;
+
+    auto in_c = in_shape[CHW::C];
+    auto in_h = in_shape[CHW::H];
+    auto in_w = in_shape[CHW::W];
+
+    auto out_c = out_shape[CHW::C];
+    auto out_h = out_shape[CHW::H];
+    auto out_w = out_shape[CHW::W];
+
+    assert(out_c == weights_shape[OIHW::OC]);
+    assert(in_c == weights_shape[OIHW::IC]);
+    auto ker_h = weights_shape[OIHW::H];
+    auto ker_w = weights_shape[OIHW::W];
+
+    auto edeform = deform_input(
+        queue, input, in_shape, tmp, {in_c, ker_h, ker_w, out_h, out_w}, offset,
+        stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x);
+
+    auto efill = output_fill_with_bias(queue, output, out_shape, bias);
+    auto egemm = gemm(queue, transpose::N, transpose::N, /*transpose a, b*/
+                      out_c, out_h * out_w, in_c * ker_h * ker_w, /*m, n, k*/
+                      1,                                          /*alpha*/
+                      weights, in_c * ker_h * ker_w,              /*a, lda*/
+                      tmp, out_h * out_w,                         /*b, ldb*/
+                      1,                                          /*beta*/
+                      output, out_h * out_w,                      /*c, ldc*/
+                      {edeform, efill} /*events*/);
+    egemm.wait();
+}
+
+template <class DataType>
+void deformable_convolution_impl(cl::sycl::queue &queue,
+                                 const DataType *input,
+                                 const Shape4D in_shape,
+                                 DataType *output,
+                                 const Shape4D out_shape,
+                                 DataType *tmp,
+                                 const float *offset,
+                                 const DataType *weights,
+                                 const Shape4D weights_shape,
+                                 const DataType *bias,
+                                 int stride_y,
+                                 int stride_x,
+                                 int pad_y,
+                                 int pad_x,
+                                 int dilation_y,
+                                 int dilation_x,
+                                 int groups,
+                                 int deformable_groups)
+{
+    auto in_b = in_shape[NCHW::N];
+    auto in_c = in_shape[NCHW::C];
+    auto in_h = in_shape[NCHW::H];
+    auto in_w = in_shape[NCHW::W];
+
+    assert(in_b == out_shape[NCHW::N]);
+    auto out_c = out_shape[NCHW::C];
+    auto out_h = out_shape[NCHW::H];
+    auto out_w = out_shape[NCHW::W];
+
+    assert(out_c == weights_shape[OIHW::OC]);
+    assert(in_c == weights_shape[OIHW::IC]);
+
+    for (auto b = 0; b < in_b; ++b) {
+        auto input_ptr = get_ptr_4d(input, in_b, in_c, in_h, in_w, b, 0, 0, 0);
+        auto output_ptr =
+            get_ptr_4d(output, in_b, out_c, out_h, out_w, b, 0, 0, 0);
+        deformable_convolution_b1_impl(
+            queue, input_ptr, {in_c, in_h, in_w}, output_ptr,
+            {out_c, out_h, out_w}, tmp, offset, weights, weights_shape, bias,
+            stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups,
+            deformable_groups);
+    }
+}
+
+template <typename... Args> bool ensure_compatibility(const Args &...args)
+{
+    std::vector<dpctl::tensor::usm_ndarray> arrays = {args...};
+
+    auto arr = arrays.at(0);
+
+    for (auto &arr : arrays) {
+        if (!(arr.get_flags() & (USM_ARRAY_C_CONTIGUOUS))) {
+            std::cerr << "All arrays need to be C contiguous.\n";
+            return false;
+        }
+    }
+    return true;
+}
+
+void deformable_convolution(dpctl::tensor::usm_ndarray input,
+                            dpctl::tensor::usm_ndarray output,
+                            dpctl::tensor::usm_ndarray offset,
+                            dpctl::tensor::usm_ndarray weights,
+                            dpctl::tensor::usm_ndarray bias,
+                            dpctl::tensor::usm_ndarray tmp,
+                            py::list stride_hw,
+                            py::list pad_hw,
+                            py::list dilation_hw,
+                            int groups,
+                            int deformable_groups)
+{
+    auto queue = input.get_queue();
+
+    if (!ensure_compatibility(input, output, offset, weights, bias, tmp))
+        throw std::runtime_error("Input arrays are not acceptable.");
+
+    if (input.get_typenum() != output.get_typenum() or
+        input.get_typenum() != offset.get_typenum() or
+        input.get_typenum() != weights.get_typenum() or
+        input.get_typenum() != bias.get_typenum() or
+        input.get_typenum() != tmp.get_typenum())
+    {
+        throw std::runtime_error("All arrays must have the same precision");
+    }
+
+    int batch = input.get_shape(0);
+
+    int in_channels = input.get_shape(1);
+    int in_height = input.get_shape(2);
+    int in_width = input.get_shape(3);
+
+    int out_channels = output.get_shape(1);
+    int out_height = output.get_shape(2);
+    int out_width = output.get_shape(3);
+
+    int kernel_height = weights.get_shape(2);
+    int kernel_width = weights.get_shape(3);
+
+    auto stride_y = stride_hw[0].cast<int>();
+    auto stride_x = stride_hw[1].cast<int>();
+
+    auto pad_y = pad_hw[0].cast<int>();
+    auto pad_x = pad_hw[1].cast<int>();
+
+    auto dilation_y = pad_hw[0].cast<int>();
+    auto dilation_x = pad_hw[1].cast<int>();
+
+    auto input_shape = Shape4D({batch, in_channels, in_height, in_width});
+    auto output_shape = Shape4D({batch, out_channels, out_height, out_width});
+    auto weights_shape =
+        Shape4D({out_channels, in_channels, kernel_height, kernel_width});
+
+#define dispatch_dc(typ)                                                       \
+    deformable_convolution_impl<typ>(                                          \
+        queue, input.get_data<typ>(), input_shape, output.get_data<typ>(),     \
+        output_shape, tmp.get_data<typ>(), offset.get_data<float>(),           \
+        weights.get_data<typ>(), weights_shape, bias.get_data<typ>(),          \
+        stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups,      \
+        deformable_groups)
+
+    if (input.get_typenum() == UAR_FLOAT) {
+        dispatch_dc(float);
+    }
+    else if (input.get_typenum() == UAR_DOUBLE) {
+        dispatch_dc(double);
+    }
+    else {
+        throw std::runtime_error("Unsupported type");
+    }
+
+#undef dispatch_dc
+}
+
+PYBIND11_MODULE(_deformable_convolution_sycl, m)
+{
+    import_dpctl();
+
+    m.def("deformable_convolution", &deformable_convolution,
+          "Defromable convolution", py::arg("input"), py::arg("output"),
+          py::arg("offset"), py::arg("weights"), py::arg("bias"),
+          py::arg("tmp"), py::arg("stride_hw"), py::arg("pad_hw"),
+          py::arg("dilation_hw"), py::arg("groups"),
+          py::arg("deformable_groups"));
+}
diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
new file mode 100644
index 00000000..603c4144
--- /dev/null
+++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp
@@ -0,0 +1,88 @@
+//
+// Copyright 2022 Intel Corp.
+//
+// SPDX - License - Identifier : Apache 2.0
+///
+
+#include <array>
+
+template <int Dims> class TensorShape : public std::array<int, Dims>
+{
+public:
+    template <class DimType> const int &operator[](DimType dim) const
+    {
+        return std::array<int, Dims>::operator[](static_cast<int>(dim));
+    }
+
+    template <class DimType> int &operator[](DimType dim)
+    {
+        return std::array<int, Dims>::operator[](static_cast<int>(dim));
+    }
+};
+
+enum class CHW : int
+{
+    C,
+    H,
+    W,
+};
+
+enum class NCHW : int
+{
+    N,
+    C,
+    H,
+    W,
+};
+
+enum class OIHW : int
+{
+    OC,
+    IC,
+    H,
+    W,
+};
+
+enum class HWCK : int
+{
+    H,
+    W,
+    C,
+    KH,
+    KW
+};
+
+enum class CKHW : int
+{
+    C,
+    KH,
+    KW,
+    H,
+    W,
+};
+
+using Shape1D = TensorShape<1>;
+using Shape2D = TensorShape<2>;
+using Shape3D = TensorShape<3>;
+using Shape4D = TensorShape<4>;
+using Shape5D = TensorShape<5>;
+using DType = float;
+
+#define get_ptr_1d(data_ptr, max_dim_0, dim_0) (data_ptr + (dim_0))
+#define get_ptr_2d(data_ptr, max_dim_0, max_dim_1, dim_0, dim_1)               \
+    (get_ptr_1d(data_ptr + (dim_0) * (max_dim_1), max_dim_1, dim_1))
+#define get_ptr_3d(data_ptr, max_dim_0, max_dim_1, max_dim_2, dim_0, dim_1,    \
+                   dim_2)                                                      \
+    (get_ptr_2d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2), max_dim_1,     \
+                max_dim_2, dim_1, dim_2))
+#define get_ptr_4d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3,       \
+                   dim_0, dim_1, dim_2, dim_3)                                 \
+    (get_ptr_3d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2) * (max_dim_3),  \
+                max_dim_1, max_dim_2, max_dim_3, dim_1, dim_2, dim_3))
+
+#define get_ptr_5d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3,       \
+                   max_dim_4, dim_0, dim_1, dim_2, dim_3, dim_4)               \
+    (get_ptr_4d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2) * (max_dim_3) * \
+                               (max_dim_4),                                    \
+                max_dim_1, max_dim_2, max_dim_3, max_dim_4, dim_1, dim_2,      \
+                dim_3, dim_4))
diff --git a/dpbench/configs/bench_info/deformable_convolution.toml b/dpbench/configs/bench_info/deformable_convolution.toml
new file mode 100644
index 00000000..93c2e50a
--- /dev/null
+++ b/dpbench/configs/bench_info/deformable_convolution.toml
@@ -0,0 +1,99 @@
+# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+[benchmark]
+reference_implementation_postfix = "sycl"
+name = "Deformable convolution"
+short_name = "deformable-convolution"
+relative_path = "deformable_convolution"
+module_name = "deformable_convolution"
+func_name = "deformable_convolution"
+kind = "microbenchmark"
+domain = "Deep learning"
+input_args = [
+    "input",
+    "output",
+    "offset",
+    "weights",
+    "bias",
+    "tmp",
+    "stride_hw",
+    "dilation_hw",
+    "pad_hw",
+    "groups",
+    "deformable_groups"
+]
+array_args = [
+    "input",
+    "output",
+    "offset",
+    "weights",
+    "bias",
+    "tmp"
+]
+output_args = [
+    "output"
+]
+
+[benchmark.parameters.S]
+batch = 1
+in_chw = [16, 32, 32]
+out_chw = [16, 32, 32]
+kernel_hw = [3, 3]
+stride_hw = [1, 1]
+dilation_hw = [1, 1]
+pad_hw = [1, 1]
+groups = 1
+deformable_groups = 1
+seed =  7777777
+
+[benchmark.parameters.M]
+batch = 1
+in_chw = [64, 64, 64]
+out_chw = [64, 64, 64]
+kernel_hw = [3, 3]
+stride_hw = [1, 1]
+dilation_hw = [1, 1]
+pad_hw = [1, 1]
+groups = 1
+deformable_groups = 1
+seed =  7777777
+
+[benchmark.parameters.L]
+batch = 2
+in_chw = [64, 128, 128]
+out_chw = [128, 128, 128]
+kernel_hw = [3, 3]
+stride_hw = [1, 1]
+dilation_hw = [1, 1]
+pad_hw = [1, 1]
+groups = 1
+deformable_groups = 1
+seed =  7777777
+
+[benchmark.init]
+func_name = "initialize"
+types_dict_name="types_dict"
+precision="single"
+input_args =  [
+    "batch",
+    "in_chw",
+    "out_chw",
+    "kernel_hw",
+    "stride_hw",
+    "dilation_hw",
+    "pad_hw",
+    "groups",
+    "deformable_groups",
+    "seed",
+    "types_dict",
+]
+output_args = [
+    "input",
+    "output",
+    "offset",
+    "weights",
+    "bias",
+    "tmp"
+]
diff --git a/dpbench/infrastructure/benchmark_runner.py b/dpbench/infrastructure/benchmark_runner.py
index 1fe53357..9c95a43e 100644
--- a/dpbench/infrastructure/benchmark_runner.py
+++ b/dpbench/infrastructure/benchmark_runner.py
@@ -314,10 +314,9 @@ def run_benchmark_in_sub_process(
 
             return results
 
-        _, conn = self.get_process(rc.framework)
-
         brc = BaseRunConfig.from_instance(rc)
 
+        _, conn = self.get_process(rc.framework)
         brc.ref_framework: cfg.Framework = [
             f
             for f in cfg.GLOBAL.frameworks
@@ -358,7 +357,14 @@ def run_benchmark_and_save(
         Args:
             rc: runtime configuration.
         """
-        results = self.run_benchmark_in_sub_process(rc)
+        try:
+            results = self.run_benchmark_in_sub_process(rc)
+        except Exception as e:
+            # self.kill_process(rc.framework)
+
+            results = BenchmarkResults(0, rc.implementation, rc.preset)
+            results.error_state = ErrorCodes.FAILED_EXECUTION
+            results.error_msg = f"Unexpected failure. {str(e)}"
 
         if rc.conn:
             framework = build_framework(rc.framework)
diff --git a/environments/conda-linux-sycl.yml b/environments/conda-linux-sycl.yml
index 8d34ee73..7fe49532 100644
--- a/environments/conda-linux-sycl.yml
+++ b/environments/conda-linux-sycl.yml
@@ -34,3 +34,5 @@ dependencies:
   - libgcc-ng
   - libstdcxx-ng
   - libgomp
+  - tbb-devel >= 2021.6.0
+  - mkl-devel-dpcpp>=2023.0.0
diff --git a/environments/conda-win-sycl.yml b/environments/conda-win-sycl.yml
index e4ac46e5..0c5d3032 100644
--- a/environments/conda-win-sycl.yml
+++ b/environments/conda-win-sycl.yml
@@ -30,3 +30,5 @@ dependencies:
   # https://github.com/scikit-build/scikit-build/issues/981
   - setuptools>=42,<64
   - pybind11
+  - tbb-devel >= 2021.6.0
+  - mkl-devel-dpcpp>=2023.0.0