From a595aeaa065f3106d3e743e11889cef2241b90da Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 30 Nov 2022 23:18:31 +0100 Subject: [PATCH 01/20] Initial implementation of Deformable Convolution workload --- dpbench/benchmarks/CMakeLists.txt | 1 + .../deformable_convolution/CMakeLists.txt | 1 + .../deformable_convolution/__init__.py | 37 +++ .../deformable_convolution_initialize.py | 32 ++ .../deformable_convolution_numba_npr.py | 81 +++++ .../deformable_convolution_numpy.py | 81 +++++ .../CMakeLists.txt | 20 ++ .../__init__.py | 7 + .../deformable_convolution_sycl/impl.cpp | 297 ++++++++++++++++++ .../deformable_convolution_sycl/utils.hpp | 94 ++++++ .../bench_info/deformable_convolution.json | 135 ++++++++ 11 files changed, 786 insertions(+) create mode 100644 dpbench/benchmarks/deformable_convolution/CMakeLists.txt create mode 100644 dpbench/benchmarks/deformable_convolution/__init__.py create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp create mode 100644 dpbench/configs/bench_info/deformable_convolution.json diff --git a/dpbench/benchmarks/CMakeLists.txt b/dpbench/benchmarks/CMakeLists.txt index 6869220b..8032000b 100644 --- a/dpbench/benchmarks/CMakeLists.txt +++ b/dpbench/benchmarks/CMakeLists.txt @@ -9,6 +9,7 @@ add_subdirectory(rambo) add_subdirectory(kmeans) add_subdirectory(knn) add_subdirectory(gpairs) +add_subdirectory(deformable_convolution) add_subdirectory(dbscan) # generate dpcpp version into config diff --git a/dpbench/benchmarks/deformable_convolution/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt new file mode 100644 index 00000000..d06d5c73 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(deformable_convolution_sycl_native_ext) diff --git a/dpbench/benchmarks/deformable_convolution/__init__.py b/dpbench/benchmarks/deformable_convolution/__init__.py new file mode 100644 index 00000000..2dedbb33 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache 2.0 + +from .deformable_convolution_initialize import initialize +from .deformable_convolution_numba_npr import deformable_convolution as deformable_convolution_numba_npr +from .deformable_convolution_numpy import deformable_convolution as deformable_convolution_numpy +from .deformable_convolution_sycl_native_ext import deformable_convolution_sycl + +__all__ = [ + "initialize", + "deformable_convolution_numba_npr", + "deformable_convolution_numba_numpy", + "deformable_convolution_sycl", +] + +"""l2-norm calculation of n vectors + +Input +--------- +npoints: int + number of vectors +dims: int + dimension of single vector +seed: int + random seed to generate random number + +Output +------- +d: array + l2 norm of each vector + +Method +------ + ||Vj||2=sqrt(sum(Xj[i]*Xj[i])) + here i is 0->dims, j is 0->npoints +""" diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py new file mode 100644 index 00000000..3a52ef0c --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py @@ -0,0 +1,32 @@ +# Copyright 2022 Intel Corp. +# +# SPDX-License-Identifier: Apache-2.0 + + +def initialize(batch, + in_channels, in_height, in_width, + out_channels, out_height, out_width, + kernel_height, kernel_width, + stride_y, stride_x, + dilation_y, dilation_x, + pad_y, pad_x, + groups, + deformable_groups, + dtype, + seed): + import numpy as np + import numpy.random as default_rng + default_rng.seed(seed) + + return ( + default_rng.random((batch, in_channels, in_height, in_width)).astype(dtype), + # np.ones((batch, in_channels, in_height, in_width)).astype(dtype), + np.zeros((batch, out_channels, out_height, out_width)).astype(dtype), + # np.zeros((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype), + 2*default_rng.random((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype) - 1, + # default_rng.random((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype), + np.ones((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype), + default_rng.random(out_channels).astype(dtype), + # np.ones((out_channels,)).astype(dtype), + np.zeros((in_channels, kernel_height, kernel_width, out_height, out_width)).astype(dtype) + ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py new file mode 100644 index 00000000..6e891c3f --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py @@ -0,0 +1,81 @@ +# Copyright 2022 Intel Corp. +# +# SPDX-License-Identifier: Apache-2.0 + +import math +import numpy as np +from numba import njit, prange + +@njit(parallel=True, inline='always', fastmath=True) +def bilinear(input, offset_y, offset_x): + height, width = input.shape + start_x = int(math.floor(offset_x)) + start_x_weight = 1 - (offset_x - start_x) + start_y = int(math.floor(offset_y)) + start_y_weight = 1 - (offset_y - start_y) + + output = 0 + if offset_x >= width or offset_y >= height or offset_x <= -1 or offset_y <= -1: + return output + + if start_x >= 0 and start_y >= 0: + w = start_x_weight*start_y_weight + output += w*input[start_y, start_x] + + if start_x + 1 < width and start_y >= 0: + w = (1 - start_x_weight)*start_y_weight + output += w*input[start_y, start_x + 1] + + if start_x >=0 and start_y + 1 < height: + w = start_x_weight*(1 - start_y_weight) + output += w*input[start_y + 1, start_x] + + if start_x + 1 < width and start_y + 1 < height: + w = (1 - start_x_weight)*(1 - start_y_weight) + output += w*input[start_y + 1, start_x + 1] + + return output + +@njit(parallel=True, fastmath=True) +def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups): + k_height, k_width, _, out_height, out_width = offset.shape + channels, _, _ = input.shape + + k_h_m = (k_height - 1)//2 + k_w_m = (k_width - 1)//2 + + for ckhkw in prange(channels*k_height*k_width): + for h in prange(out_height): + for w in prange(out_width): + c = ckhkw//(k_height*k_width) + khkw = ckhkw%(k_height*k_width) + kh = khkw//k_width + kw = khkw%k_width + + offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m) + offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m) + + output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) + +@njit(parallel=True, fastmath=True) +def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups): + out_channels, height, width = output.shape + _, in_channels, k_height, k_width = weights.shape + + deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups) + + tmp = tmp.reshape((in_channels*k_height*k_width, height*width)) + + _weights = weights.reshape((out_channels, in_channels*k_height*k_width)) + _output = output.reshape((out_channels, height*width)) + np.dot(_weights, tmp, _output) + + _bias = bias.reshape((out_channels, 1)) + _output[:] = _output + _bias + +@njit(parallel=True) +def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups): + batch, _, _, _ = input.shape + for b in range(batch): + deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups) + diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py new file mode 100644 index 00000000..59897585 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py @@ -0,0 +1,81 @@ +# Copyright 2022 Intel Corp. +# +# SPDX-License-Identifier: Apache-2.0 + +import math +import numpy as np +from numba import njit, prange + +@njit(parallel=True, inline='always', fastmath=True) +def bilinear(input, offset_y, offset_x): + height, width = input.shape + start_x = int(math.floor(offset_x)) + start_x_weight = 1 - (offset_x - start_x) + start_y = int(math.floor(offset_y)) + start_y_weight = 1 - (offset_y - start_y) + + output = 0 + if start_x >= width or start_y >= height or start_x < -1 or start_y < -1: + return output + + if start_x >= 0 and start_y >= 0: + w = start_x_weight*start_y_weight + output += w*input[start_y, start_x] + + if start_x + 1 < width and start_y >= 0: + w = (1 - start_x_weight)*start_y_weight + output += w*input[start_y, start_x + 1] + + if start_x >=0 and start_y + 1 < height: + w = start_x_weight*(1 - start_y_weight) + output += w*input[start_y + 1, start_x] + + if start_x + 1 < width and start_y + 1 < height: + w = (1 - start_x_weight)*(1 - start_y_weight) + output += w*input[start_y + 1, start_x + 1] + + return output + +@njit(parallel=True, fastmath=True) +def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups): + k_height, k_width, _, out_height, out_width = offset.shape + channels, _, _ = input.shape + + k_h_m = (k_height - 1)//2 + k_w_m = (k_width - 1)//2 + + for ckhkw in prange(channels*k_height*k_width): + for h in prange(out_height): + for w in prange(out_width): + c = ckhkw//(k_height*k_width) + khkw = ckhkw%(k_height*k_width) + kh = khkw//k_width + kw = khkw%k_width + + offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m) + offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m) + + output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) + +# @njit(parallel=True, fastmath=True) +def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups): + out_channels, height, width = output.shape + _, in_channels, k_height, k_width = weights.shape + + deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups) + + tmp = tmp.reshape((in_channels*k_height*k_width, height*width)) + + _weights = weights.reshape((out_channels, in_channels*k_height*k_width)) + _output = output.reshape((out_channels, height*width)) + np.dot(_weights, tmp, _output) + + _bias = bias.reshape((out_channels, 1)) + _output[:] = _output + _bias + +# @njit(parallel=True) +def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups): + batch, _, _, _ = input.shape + for b in range(batch): + deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups) + diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt new file mode 100644 index 00000000..e874e715 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -0,0 +1,20 @@ +set(py_module_name _deformable_convolution_sycl) +pybind11_add_module(${py_module_name} + MODULE + deformable_convolution_sycl/impl.cpp +) +find_package(MKL CONFIG REQUIRED) + +# target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $) +target_compile_options(${py_module_name} PUBLIC $) +target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $) +target_link_libraries(${py_module_name} PUBLIC $ ${MKL_SYCL}) + +message(STATUS "${MKL_IMPORTED_TARGETS}") +message(STATUS "${INTERFACE_COMPILE_OPTIONS}") +message(STATUS "${INTERFACE_INCLUDE_DIRECTORIES}") + +file(RELATIVE_PATH py_module_dest ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) +install(TARGETS ${py_module_name} + DESTINATION ${py_module_dest}/deformable_convolution_sycl +) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py new file mode 100644 index 00000000..cd4c9bbd --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py @@ -0,0 +1,7 @@ +# Copyright 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache 2.0 + +from .deformable_convolution_sycl._deformable_convolution_sycl import deformable_convolution as deformable_convolution_sycl + +__all__ = ["deformable_convolution_sycl"] diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp new file mode 100644 index 00000000..cc6ecac7 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -0,0 +1,297 @@ +//==- impl.cpp - Python native extension of deformable convolution ===// +// +// Copyright 2022 Intel Corp. +// +// SPDX - License - Identifier : Apache 2.0 +/// +/// \file +/// The files implements a SYCL-based Python native extension for the +/// deformable convolution benchmark. + +#include "utils.hpp" +#include "CL/sycl.hpp" +#include "dpctl4pybind11.hpp" +#include "oneapi/mkl.hpp" +#include "cmath" + +using namespace sycl; +namespace py = pybind11; + +template +__attribute__((always_inline)) DataType bilinear(const DataType* input, + int height, + int width, + float offset_y, + float offset_x) +{ + auto start_x = int(std::floor(offset_x)); + auto start_x_weight = 1 - (offset_x - start_x); + auto start_y = int(std::floor(offset_y)); + auto start_y_weight = 1 - (offset_y - start_y); + + DataType result = 0; + if (offset_x >= width || offset_y >= height || offset_x <= -1 || offset_y <= -1) + return result; + + if (start_x >= 0 && start_y >= 0) + { + auto w0 = start_x_weight*start_y_weight; + auto v0 = *get_ptr_2d(input, height, width, start_y, start_x); + + result += w0*v0; + } + + if (start_x + 1 < width && start_y >= 0) + { + auto w1 = (1 - start_x_weight)*start_y_weight; + auto v1 = *get_ptr_2d(input, height, width, start_y, start_x + 1); + + result += w1*v1; + } + + if (start_x >=0 && start_y + 1 < height) + { + auto w2 = start_x_weight*(1 - start_y_weight); + auto v2 = *get_ptr_2d(input, height, width, start_y + 1, start_x); + + result += w2*v2; + } + + if (start_x + 1 < width && start_y + 1 < height) + { + auto w3 = (1 - start_x_weight)*(1 - start_y_weight); + auto v3 = *get_ptr_2d(input, height, width, start_y + 1, start_x + 1); + + result += w3*v3; + } + + return result; +} + +class deform; + +template +inline auto deform_input(cl::sycl::queue& queue, + const DataType* input, const Shape3D in_shape, + DataType* output, const Shape5D out_shape, + const DataType* offset, + int stride_y, int stride_x, + int pad_y, int pad_x, + int dilation_y, int dilation_x) +{ + auto in_channels = in_shape[CHW::C]; + auto in_height = in_shape[CHW::H]; + auto in_width = in_shape[CHW::W]; + + auto k_height = out_shape[CKHW::KH]; + auto k_width = out_shape[CKHW::KW]; + auto out_height = out_shape[CKHW::H]; + auto out_width = out_shape[CKHW::W]; + + assert(out_shape[CKHW::C] == in_channels); + + auto wsize = sycl::range<3>(in_channels*k_height*k_width, out_height, out_width); + return queue.parallel_for(wsize, [=](sycl::id<3> idx) + { + auto ckhkw = static_cast(idx[0]); + auto h = static_cast(idx[1]); + auto w = static_cast(idx[2]); + + auto c = ckhkw/(k_height*k_width); + auto khkw = ckhkw%(k_height*k_width); + + auto kh = khkw/k_width; + auto kw = khkw%k_width; + + auto k_h_m = (k_height - 1)/2; + auto k_w_m = (k_width - 1)/2; + + auto _output = get_ptr_5d(output, + in_channels, k_height, k_width, out_height, out_width, + c, kh, kw, h, 0); + + auto offset_y = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 1, h, w) + h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m); + auto offset_x = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 0, h, w) + w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m); + // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m); + // auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m); + + auto _input = get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0); + + _output[w] = bilinear(_input, in_height, in_width, offset_y, offset_x); + // _output[w] = offset_y; + }); +} + +class fill_output; + +template +auto output_fill_with_bias(cl::sycl::queue& queue, DataType* output, const Shape3D out_shape, DataType* bias) +{ + auto out_c = out_shape[CHW::C]; + auto out_h = out_shape[CHW::H]; + auto out_w = out_shape[CHW::W]; + + return queue.parallel_for(sycl::range<3>(out_c, out_h, out_w),[=](sycl::id<3> idx) + { + auto c = static_cast(idx[0]); + auto h = static_cast(idx[1]); + auto w = static_cast(idx[2]); + + auto out_ptr = get_ptr_3d(output, out_c, out_h, out_w, c, h, w); + *out_ptr = bias[c]; + }); +} + +template +void deformable_convolution_b1_impl(cl::sycl::queue& queue, + const DataType* input, + const Shape3D in_shape, + DataType* output, + const Shape3D out_shape, + DataType* tmp, + const float* offset, + DataType* weights, + const Shape4D weights_shape, + DataType* bias, + int stride_y, int stride_x, + int pad_y, int pad_x, + int dilation_y, int dilation_x, + int groups, int deformable_groups) +{ + using oneapi::mkl::blas::row_major::gemm; + using oneapi::mkl::transpose; + + auto in_c = in_shape[CHW::C]; + auto in_h = in_shape[CHW::H]; + auto in_w = in_shape[CHW::W]; + + auto out_c = out_shape[CHW::C]; + auto out_h = out_shape[CHW::H]; + auto out_w = out_shape[CHW::W]; + + assert(out_c == weights_shape[OIHW::OC]); + assert(in_c == weights_shape[OIHW::IC]); + auto ker_h = weights_shape[OIHW::H]; + auto ker_w = weights_shape[OIHW::W]; + + auto edeform = deform_input(queue, + input, in_shape, + tmp, {in_c, ker_h, ker_w, out_h, out_w}, + offset, + stride_y, stride_x, + pad_y, pad_x, + dilation_y, dilation_x); + + auto efill = output_fill_with_bias(queue, output, out_shape, bias); + auto egemm = gemm(queue, + transpose::N, transpose::N, /*transpose a, b*/ + out_c, out_h*out_w, in_c*ker_h*ker_w, /*m, n, k*/ + 1, /*alpha*/ + weights, in_c*ker_h*ker_w, /*a, lda*/ + tmp, out_h*out_w, /*b, ldb*/ + 1, /*beta*/ + output, out_h*out_w, /*c, ldc*/ + {edeform, efill} /*events*/); + egemm.wait(); +} + +template +void deformable_convolution_impl(cl::sycl::queue& queue, + const DataType* input, + const Shape4D in_shape, + DataType* output, + const Shape4D out_shape, + DataType* tmp, + const float* offset, + DataType* weights, + const Shape4D weights_shape, + DataType* bias, + int stride_y, int stride_x, + int pad_y, int pad_x, + int dilation_y, int dilation_x, + int groups, int deformable_groups) +{ + auto in_b = in_shape[NCHW::N]; + auto in_c = in_shape[NCHW::C]; + auto in_h = in_shape[NCHW::H]; + auto in_w = in_shape[NCHW::W]; + + assert(in_b == out_shape[NCHW::N]); + auto out_c = out_shape[NCHW::C]; + auto out_h = out_shape[NCHW::H]; + auto out_w = out_shape[NCHW::W]; + + assert(out_c == weights_shape[OIHW::OC]); + assert(in_c == weights_shape[OIHW::IC]); + + for (auto b = 0; b < in_b; ++b) + { + auto input_ptr = get_ptr_4d(input, in_b, in_c, in_h, in_w, b, 0, 0, 0); + auto output_ptr = get_ptr_4d(output, in_b, out_c, out_h, out_w, b, 0, 0, 0); + deformable_convolution_b1_impl(queue, + input_ptr, {in_c, in_h, in_w}, + output_ptr, {out_c, out_h, out_w}, + tmp, + offset, + weights, weights_shape, + bias, + stride_y, stride_x, + pad_y, pad_x, + dilation_y, dilation_x, + groups, deformable_groups); + + } +} + +void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm_ndarray output, dpctl::tensor::usm_ndarray offset, + dpctl::tensor::usm_ndarray weights, dpctl::tensor::usm_ndarray bias, dpctl::tensor::usm_ndarray tmp, + int stride_y, int stride_x, int pad_y, int pad_x, int dilation_y, int dilation_x, int groups, int deformable_groups) +{ + auto queue = input.get_queue(); + + if (input.get_typenum() != UAR_FLOAT) { + throw std::runtime_error("Expected a single precision FP array."); + } + + int batch = input.get_shape(0); + + int in_channels = input.get_shape(1); + int in_height = input.get_shape(2); + int in_width = input.get_shape(3); + + int out_channels = output.get_shape(1); + int out_height = output.get_shape(2); + int out_width = output.get_shape(3); + + int kernel_height = weights.get_shape(2); + int kernel_width = weights.get_shape(3); + + auto input_shape = Shape4D({batch, in_channels, in_height, in_width}); + auto output_shape = Shape4D({batch, out_channels, out_height, out_width}); + auto weights_shape = Shape4D({out_channels, in_channels, kernel_height, kernel_width}); + + deformable_convolution_impl(queue, + input.get_data(), input_shape, + output.get_data(), output_shape, + tmp.get_data(), + offset.get_data(), + weights.get_data(), weights_shape, + bias.get_data(), + stride_y, stride_x, + pad_y, pad_x, + dilation_y, dilation_x, + groups, deformable_groups); +} + +PYBIND11_MODULE(_deformable_convolution_sycl, m) +{ + import_dpctl(); + + m.def("deformable_convolution", &deformable_convolution, + "Defromable convolution", + py::arg("input"), py::arg("output"), py::arg("offset"), + py::arg("weights"), py::arg("bias"), py::arg("tmp"), + py::arg("stride_y"), py::arg("stride_x"), py::arg("pad_y"), py::arg("pad_x"), + py::arg("dilation_y"), py::arg("dilation_x"), + py::arg("groups"), py::arg("deformable_groups")); +} diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp new file mode 100644 index 00000000..447a0571 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp @@ -0,0 +1,94 @@ +// +// Copyright 2022 Intel Corp. +// +// SPDX - License - Identifier : Apache 2.0 +/// + +#include + +template +class TensorShape: public std::array +{ +public: + template + const int& operator[](DimType dim) const + { + return std::array::operator[](static_cast(dim)); + } + + template + int& operator[](DimType dim) + { + return std::array::operator[](static_cast(dim)); + } + +}; + +enum class CHW: int +{ + C, + H, + W, +}; + +enum class NCHW: int +{ + N, + C, + H, + W, +}; + +enum class OIHW: int +{ + OC, + IC, + H, + W, +}; + +enum class HWCK: int +{ + H, + W, + C, + KH, + KW +}; + +enum class CKHW: int +{ + C, + KH, + KW, + H, + W, +}; + +using Shape1D = TensorShape<1>; +using Shape2D = TensorShape<2>; +using Shape3D = TensorShape<3>; +using Shape4D = TensorShape<4>; +using Shape5D = TensorShape<5>; +using DType = float; + +#define get_ptr_1d(data_ptr, max_dim_0, dim_0) (data_ptr + (dim_0)) +#define get_ptr_2d(data_ptr, max_dim_0, max_dim_1, dim_0, dim_1) \ +(\ + get_ptr_1d(data_ptr + (dim_0)*(max_dim_1), max_dim_1, dim_1) \ +) +#define get_ptr_3d(data_ptr, max_dim_0, max_dim_1, max_dim_2, dim_0, dim_1, dim_2) \ +(\ + get_ptr_2d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2), max_dim_1, max_dim_2, dim_1, dim_2) \ +) +#define get_ptr_4d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, dim_0, dim_1, dim_2, dim_3) \ +(\ + get_ptr_3d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3), max_dim_1, max_dim_2, max_dim_3, dim_1, dim_2, dim_3) \ +) + +#define get_ptr_5d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, max_dim_4, dim_0, dim_1, dim_2, dim_3, dim_4) \ +(\ + get_ptr_4d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3)*(max_dim_4), \ + max_dim_1, max_dim_2, max_dim_3, max_dim_4, \ + dim_1, dim_2, dim_3, dim_4) \ +) diff --git a/dpbench/configs/bench_info/deformable_convolution.json b/dpbench/configs/bench_info/deformable_convolution.json new file mode 100644 index 00000000..e18e7771 --- /dev/null +++ b/dpbench/configs/bench_info/deformable_convolution.json @@ -0,0 +1,135 @@ +{ + "benchmark": { + "name": "Deformable convolution", + "short_name": "deformable-convolution", + "relative_path": "deformable_convolution", + "module_name": "deformable_convolution", + "func_name": "deformable_convolution", + "kind": "microbenchmark", + "domain": "Deep learning", + "parameters": { + "S": { + "batch":1, + "in_channels":16, + "in_height":32, + "in_width":32, + "out_channels":16, + "out_height":32, + "out_width":32, + "kernel_height":3, + "kernel_width":3, + "stride_y":1, + "stride_x":1, + "dilation_y":1, + "dilation_x":1, + "pad_y":1, + "pad_x":1, + "groups":1, + "deformable_groups":1, + "dtype":"float32", + "seed": 7777777 + }, + "M": { + "batch":1, + "in_channels":64, + "in_height":64, + "in_width":64, + "out_channels":64, + "out_height":64, + "out_width":64, + "kernel_height":3, + "kernel_width":3, + "stride_y":1, + "stride_x":1, + "dilation_y":1, + "dilation_x":1, + "pad_y":1, + "pad_x":1, + "groups":1, + "deformable_groups":1, + "dtype":"float32", + "seed": 7777777 + }, + "L": { + "batch":2, + "in_channels":64, + "in_height":128, + "in_width":128, + "out_channels":128, + "out_height":128, + "out_width":128, + "kernel_height":3, + "kernel_width":3, + "stride_y":1, + "stride_x":1, + "dilation_y":1, + "dilation_x":1, + "pad_y":1, + "pad_x":1, + "groups":1, + "deformable_groups":1, + "dtype":"float32", + "seed": 7777777 + } + }, + "init": { + "func_name": "initialize", + "input_args": [ + "batch", + "in_channels", + "in_height", + "in_width", + "out_channels", + "out_height", + "out_width", + "kernel_height", + "kernel_width", + "stride_y", + "stride_x", + "dilation_y", + "dilation_x", + "pad_y", + "pad_x", + "groups", + "deformable_groups", + "dtype", + "seed" + ], + "output_args": [ + "input", + "output", + "offset", + "weights", + "bias", + "tmp" + ] + }, + "input_args": [ + "input", + "output", + "offset", + "weights", + "bias", + "tmp", + "stride_y", + "stride_x", + "dilation_y", + "dilation_x", + "pad_y", + "pad_x", + "groups", + "deformable_groups" + ], + "array_args": [ + "input", + "output", + "offset", + "weights", + "bias", + "tmp" + ], + "output_args": [ + "output" + ] + } +} From 29639449262a0a949654dc003444784af1eb031c Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Mon, 10 Apr 2023 23:49:37 +0200 Subject: [PATCH 02/20] Fix pre-commit --- .../deformable_convolution/__init__.py | 8 +- .../deformable_convolution_initialize.py | 51 ++- .../deformable_convolution_numba_npr.py | 120 ++++++-- .../deformable_convolution_numpy.py | 113 +++++-- .../__init__.py | 4 +- .../deformable_convolution_sycl/impl.cpp | 291 +++++++++--------- .../deformable_convolution_sycl/utils.hpp | 54 ++-- .../bench_info/deformable_convolution.json | 102 +++--- 8 files changed, 447 insertions(+), 296 deletions(-) diff --git a/dpbench/benchmarks/deformable_convolution/__init__.py b/dpbench/benchmarks/deformable_convolution/__init__.py index 2dedbb33..a5db8b1d 100644 --- a/dpbench/benchmarks/deformable_convolution/__init__.py +++ b/dpbench/benchmarks/deformable_convolution/__init__.py @@ -3,8 +3,12 @@ # SPDX-License-Identifier: Apache 2.0 from .deformable_convolution_initialize import initialize -from .deformable_convolution_numba_npr import deformable_convolution as deformable_convolution_numba_npr -from .deformable_convolution_numpy import deformable_convolution as deformable_convolution_numpy +from .deformable_convolution_numba_npr import ( + deformable_convolution as deformable_convolution_numba_npr, +) +from .deformable_convolution_numpy import ( + deformable_convolution as deformable_convolution_numpy, +) from .deformable_convolution_sycl_native_ext import deformable_convolution_sycl __all__ = [ diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py index 3a52ef0c..896a2224 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py @@ -3,30 +3,51 @@ # SPDX-License-Identifier: Apache-2.0 -def initialize(batch, - in_channels, in_height, in_width, - out_channels, out_height, out_width, - kernel_height, kernel_width, - stride_y, stride_x, - dilation_y, dilation_x, - pad_y, pad_x, - groups, - deformable_groups, - dtype, - seed): +def initialize( + batch, + in_channels, + in_height, + in_width, + out_channels, + out_height, + out_width, + kernel_height, + kernel_width, + stride_y, + stride_x, + dilation_y, + dilation_x, + pad_y, + pad_x, + groups, + deformable_groups, + dtype, + seed, +): import numpy as np import numpy.random as default_rng + default_rng.seed(seed) return ( - default_rng.random((batch, in_channels, in_height, in_width)).astype(dtype), + default_rng.random((batch, in_channels, in_height, in_width)).astype( + dtype + ), # np.ones((batch, in_channels, in_height, in_width)).astype(dtype), np.zeros((batch, out_channels, out_height, out_width)).astype(dtype), # np.zeros((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype), - 2*default_rng.random((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype) - 1, + 2 + * default_rng.random( + (kernel_height, kernel_width, 2, out_height, out_width) + ).astype(dtype) + - 1, # default_rng.random((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype), - np.ones((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype), + np.ones( + (out_channels, in_channels, kernel_height, kernel_width) + ).astype(dtype), default_rng.random(out_channels).astype(dtype), # np.ones((out_channels,)).astype(dtype), - np.zeros((in_channels, kernel_height, kernel_width, out_height, out_width)).astype(dtype) + np.zeros( + (in_channels, kernel_height, kernel_width, out_height, out_width) + ).astype(dtype), ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py index 6e891c3f..dea7236e 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py @@ -3,10 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 import math + import numpy as np from numba import njit, prange -@njit(parallel=True, inline='always', fastmath=True) + +@njit(parallel=True, inline="always", fastmath=True) def bilinear(input, offset_y, offset_x): height, width = input.shape start_x = int(math.floor(offset_x)) @@ -15,67 +17,125 @@ def bilinear(input, offset_y, offset_x): start_y_weight = 1 - (offset_y - start_y) output = 0 - if offset_x >= width or offset_y >= height or offset_x <= -1 or offset_y <= -1: + if ( + offset_x >= width + or offset_y >= height + or offset_x <= -1 + or offset_y <= -1 + ): return output if start_x >= 0 and start_y >= 0: - w = start_x_weight*start_y_weight - output += w*input[start_y, start_x] + w = start_x_weight * start_y_weight + output += w * input[start_y, start_x] if start_x + 1 < width and start_y >= 0: - w = (1 - start_x_weight)*start_y_weight - output += w*input[start_y, start_x + 1] + w = (1 - start_x_weight) * start_y_weight + output += w * input[start_y, start_x + 1] - if start_x >=0 and start_y + 1 < height: - w = start_x_weight*(1 - start_y_weight) - output += w*input[start_y + 1, start_x] + if start_x >= 0 and start_y + 1 < height: + w = start_x_weight * (1 - start_y_weight) + output += w * input[start_y + 1, start_x] if start_x + 1 < width and start_y + 1 < height: - w = (1 - start_x_weight)*(1 - start_y_weight) - output += w*input[start_y + 1, start_x + 1] + w = (1 - start_x_weight) * (1 - start_y_weight) + output += w * input[start_y + 1, start_x + 1] return output + @njit(parallel=True, fastmath=True) -def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups): +def deform( + input, offset, output, stride, pad, dilation, groups, deformable_groups +): k_height, k_width, _, out_height, out_width = offset.shape channels, _, _ = input.shape - k_h_m = (k_height - 1)//2 - k_w_m = (k_width - 1)//2 + k_h_m = (k_height - 1) // 2 + k_w_m = (k_width - 1) // 2 - for ckhkw in prange(channels*k_height*k_width): + for ckhkw in prange(channels * k_height * k_width): for h in prange(out_height): for w in prange(out_width): - c = ckhkw//(k_height*k_width) - khkw = ckhkw%(k_height*k_width) - kh = khkw//k_width - kw = khkw%k_width - - offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m) - offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m) + c = ckhkw // (k_height * k_width) + khkw = ckhkw % (k_height * k_width) + kh = khkw // k_width + kw = khkw % k_width + + offset_y = ( + offset[kh, kw, 1, h, w] + + h * stride[0] + + (kh - k_h_m) * dilation[0] + - (pad[0] - k_h_m) + ) + offset_x = ( + offset[kh, kw, 0, h, w] + + w * stride[1] + + (kw - k_w_m) * dilation[1] + - (pad[1] - k_w_m) + ) output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) + @njit(parallel=True, fastmath=True) -def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups): +def deformable_convolution_b1( + input, + output, + offset, + weights, + bias, + tmp, + stride, + pad, + dilation, + groups, + deformable_groups, +): out_channels, height, width = output.shape _, in_channels, k_height, k_width = weights.shape - + deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups) - tmp = tmp.reshape((in_channels*k_height*k_width, height*width)) + tmp = tmp.reshape((in_channels * k_height * k_width, height * width)) - _weights = weights.reshape((out_channels, in_channels*k_height*k_width)) - _output = output.reshape((out_channels, height*width)) + _weights = weights.reshape((out_channels, in_channels * k_height * k_width)) + _output = output.reshape((out_channels, height * width)) np.dot(_weights, tmp, _output) _bias = bias.reshape((out_channels, 1)) _output[:] = _output + _bias + @njit(parallel=True) -def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups): +def deformable_convolution( + input, + output, + offset, + weights, + bias, + tmp, + stride_y, + stride_x, + pad_y, + pad_x, + dilation_y, + dilation_x, + groups, + deformable_groups, +): batch, _, _, _ = input.shape for b in range(batch): - deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups) - + deformable_convolution_b1( + input[b], + output[b], + offset, + weights, + bias, + tmp, + (stride_y, stride_x), + (pad_y, pad_x), + (dilation_y, dilation_x), + groups, + deformable_groups, + ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py index 59897585..d0ec3d5f 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py @@ -3,10 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 import math + import numpy as np from numba import njit, prange -@njit(parallel=True, inline='always', fastmath=True) + +@njit(parallel=True, inline="always", fastmath=True) def bilinear(input, offset_y, offset_x): height, width = input.shape start_x = int(math.floor(offset_x)) @@ -19,63 +21,116 @@ def bilinear(input, offset_y, offset_x): return output if start_x >= 0 and start_y >= 0: - w = start_x_weight*start_y_weight - output += w*input[start_y, start_x] + w = start_x_weight * start_y_weight + output += w * input[start_y, start_x] if start_x + 1 < width and start_y >= 0: - w = (1 - start_x_weight)*start_y_weight - output += w*input[start_y, start_x + 1] + w = (1 - start_x_weight) * start_y_weight + output += w * input[start_y, start_x + 1] - if start_x >=0 and start_y + 1 < height: - w = start_x_weight*(1 - start_y_weight) - output += w*input[start_y + 1, start_x] + if start_x >= 0 and start_y + 1 < height: + w = start_x_weight * (1 - start_y_weight) + output += w * input[start_y + 1, start_x] if start_x + 1 < width and start_y + 1 < height: - w = (1 - start_x_weight)*(1 - start_y_weight) - output += w*input[start_y + 1, start_x + 1] + w = (1 - start_x_weight) * (1 - start_y_weight) + output += w * input[start_y + 1, start_x + 1] return output + @njit(parallel=True, fastmath=True) -def deform(input, offset, output, stride, pad, dilation, groups, deformable_groups): +def deform( + input, offset, output, stride, pad, dilation, groups, deformable_groups +): k_height, k_width, _, out_height, out_width = offset.shape channels, _, _ = input.shape - k_h_m = (k_height - 1)//2 - k_w_m = (k_width - 1)//2 + k_h_m = (k_height - 1) // 2 + k_w_m = (k_width - 1) // 2 - for ckhkw in prange(channels*k_height*k_width): + for ckhkw in prange(channels * k_height * k_width): for h in prange(out_height): for w in prange(out_width): - c = ckhkw//(k_height*k_width) - khkw = ckhkw%(k_height*k_width) - kh = khkw//k_width - kw = khkw%k_width - - offset_y = offset[kh, kw, 1, h, w] + h*stride[0] + (kh - k_h_m)*dilation[0] - (pad[0] - k_h_m) - offset_x = offset[kh, kw, 0, h, w] + w*stride[1] + (kw - k_w_m)*dilation[1] - (pad[1] - k_w_m) + c = ckhkw // (k_height * k_width) + khkw = ckhkw % (k_height * k_width) + kh = khkw // k_width + kw = khkw % k_width + + offset_y = ( + offset[kh, kw, 1, h, w] + + h * stride[0] + + (kh - k_h_m) * dilation[0] + - (pad[0] - k_h_m) + ) + offset_x = ( + offset[kh, kw, 0, h, w] + + w * stride[1] + + (kw - k_w_m) * dilation[1] + - (pad[1] - k_w_m) + ) output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) + # @njit(parallel=True, fastmath=True) -def deformable_convolution_b1(input, output, offset, weights, bias, tmp, stride, pad, dilation, groups, deformable_groups): +def deformable_convolution_b1( + input, + output, + offset, + weights, + bias, + tmp, + stride, + pad, + dilation, + groups, + deformable_groups, +): out_channels, height, width = output.shape _, in_channels, k_height, k_width = weights.shape - + deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups) - tmp = tmp.reshape((in_channels*k_height*k_width, height*width)) + tmp = tmp.reshape((in_channels * k_height * k_width, height * width)) - _weights = weights.reshape((out_channels, in_channels*k_height*k_width)) - _output = output.reshape((out_channels, height*width)) + _weights = weights.reshape((out_channels, in_channels * k_height * k_width)) + _output = output.reshape((out_channels, height * width)) np.dot(_weights, tmp, _output) _bias = bias.reshape((out_channels, 1)) _output[:] = _output + _bias + # @njit(parallel=True) -def deformable_convolution(input, output, offset, weights, bias, tmp, stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, deformable_groups): +def deformable_convolution( + input, + output, + offset, + weights, + bias, + tmp, + stride_y, + stride_x, + pad_y, + pad_x, + dilation_y, + dilation_x, + groups, + deformable_groups, +): batch, _, _, _ = input.shape for b in range(batch): - deformable_convolution_b1(input[b], output[b], offset, weights, bias, tmp, (stride_y, stride_x), (pad_y, pad_x), (dilation_y, dilation_x), groups, deformable_groups) - + deformable_convolution_b1( + input[b], + output[b], + offset, + weights, + bias, + tmp, + (stride_y, stride_x), + (pad_y, pad_x), + (dilation_y, dilation_x), + groups, + deformable_groups, + ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py index cd4c9bbd..01a4191d 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/__init__.py @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache 2.0 -from .deformable_convolution_sycl._deformable_convolution_sycl import deformable_convolution as deformable_convolution_sycl +from .deformable_convolution_sycl._deformable_convolution_sycl import ( + deformable_convolution as deformable_convolution_sycl, +) __all__ = ["deformable_convolution_sycl"] diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp index cc6ecac7..005155bd 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -8,17 +8,17 @@ /// The files implements a SYCL-based Python native extension for the /// deformable convolution benchmark. -#include "utils.hpp" #include "CL/sycl.hpp" +#include "cmath" #include "dpctl4pybind11.hpp" #include "oneapi/mkl.hpp" -#include "cmath" +#include "utils.hpp" using namespace sycl; namespace py = pybind11; -template -__attribute__((always_inline)) DataType bilinear(const DataType* input, +template +__attribute__((always_inline)) DataType bilinear(const DataType *input, int height, int width, float offset_y, @@ -30,39 +30,36 @@ __attribute__((always_inline)) DataType bilinear(const DataType* input, auto start_y_weight = 1 - (offset_y - start_y); DataType result = 0; - if (offset_x >= width || offset_y >= height || offset_x <= -1 || offset_y <= -1) + if (offset_x >= width || offset_y >= height || offset_x <= -1 || + offset_y <= -1) return result; - if (start_x >= 0 && start_y >= 0) - { - auto w0 = start_x_weight*start_y_weight; + if (start_x >= 0 && start_y >= 0) { + auto w0 = start_x_weight * start_y_weight; auto v0 = *get_ptr_2d(input, height, width, start_y, start_x); - result += w0*v0; + result += w0 * v0; } - if (start_x + 1 < width && start_y >= 0) - { - auto w1 = (1 - start_x_weight)*start_y_weight; + if (start_x + 1 < width && start_y >= 0) { + auto w1 = (1 - start_x_weight) * start_y_weight; auto v1 = *get_ptr_2d(input, height, width, start_y, start_x + 1); - result += w1*v1; + result += w1 * v1; } - if (start_x >=0 && start_y + 1 < height) - { - auto w2 = start_x_weight*(1 - start_y_weight); + if (start_x >= 0 && start_y + 1 < height) { + auto w2 = start_x_weight * (1 - start_y_weight); auto v2 = *get_ptr_2d(input, height, width, start_y + 1, start_x); - result += w2*v2; + result += w2 * v2; } - if (start_x + 1 < width && start_y + 1 < height) - { - auto w3 = (1 - start_x_weight)*(1 - start_y_weight); + if (start_x + 1 < width && start_y + 1 < height) { + auto w3 = (1 - start_x_weight) * (1 - start_y_weight); auto v3 = *get_ptr_2d(input, height, width, start_y + 1, start_x + 1); - result += w3*v3; + result += w3 * v3; } return result; @@ -70,52 +67,64 @@ __attribute__((always_inline)) DataType bilinear(const DataType* input, class deform; -template -inline auto deform_input(cl::sycl::queue& queue, - const DataType* input, const Shape3D in_shape, - DataType* output, const Shape5D out_shape, - const DataType* offset, - int stride_y, int stride_x, - int pad_y, int pad_x, - int dilation_y, int dilation_x) +template +inline auto deform_input(cl::sycl::queue &queue, + const DataType *input, + const Shape3D in_shape, + DataType *output, + const Shape5D out_shape, + const DataType *offset, + int stride_y, + int stride_x, + int pad_y, + int pad_x, + int dilation_y, + int dilation_x) { auto in_channels = in_shape[CHW::C]; - auto in_height = in_shape[CHW::H]; - auto in_width = in_shape[CHW::W]; + auto in_height = in_shape[CHW::H]; + auto in_width = in_shape[CHW::W]; - auto k_height = out_shape[CKHW::KH]; - auto k_width = out_shape[CKHW::KW]; + auto k_height = out_shape[CKHW::KH]; + auto k_width = out_shape[CKHW::KW]; auto out_height = out_shape[CKHW::H]; - auto out_width = out_shape[CKHW::W]; + auto out_width = out_shape[CKHW::W]; assert(out_shape[CKHW::C] == in_channels); - auto wsize = sycl::range<3>(in_channels*k_height*k_width, out_height, out_width); - return queue.parallel_for(wsize, [=](sycl::id<3> idx) - { + auto wsize = + sycl::range<3>(in_channels * k_height * k_width, out_height, out_width); + return queue.parallel_for(wsize, [=](sycl::id<3> idx) { auto ckhkw = static_cast(idx[0]); auto h = static_cast(idx[1]); auto w = static_cast(idx[2]); - auto c = ckhkw/(k_height*k_width); - auto khkw = ckhkw%(k_height*k_width); + auto c = ckhkw / (k_height * k_width); + auto khkw = ckhkw % (k_height * k_width); - auto kh = khkw/k_width; - auto kw = khkw%k_width; + auto kh = khkw / k_width; + auto kw = khkw % k_width; - auto k_h_m = (k_height - 1)/2; - auto k_w_m = (k_width - 1)/2; + auto k_h_m = (k_height - 1) / 2; + auto k_w_m = (k_width - 1) / 2; - auto _output = get_ptr_5d(output, - in_channels, k_height, k_width, out_height, out_width, - c, kh, kw, h, 0); + auto _output = get_ptr_5d(output, in_channels, k_height, k_width, + out_height, out_width, c, kh, kw, h, 0); - auto offset_y = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 1, h, w) + h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m); - auto offset_x = *get_ptr_5d(offset, k_height, k_width, 2, out_height, out_width, kh, kw, 0, h, w) + w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m); - // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - k_h_m); - // auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - k_w_m); + auto offset_y = *get_ptr_5d(offset, k_height, k_width, 2, out_height, + out_width, kh, kw, 1, h, w) + + h * stride_y + (kh - k_h_m) * dilation_y - + (pad_y - k_h_m); + auto offset_x = *get_ptr_5d(offset, k_height, k_width, 2, out_height, + out_width, kh, kw, 0, h, w) + + w * stride_x + (kw - k_w_m) * dilation_x - + (pad_x - k_w_m); + // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - + // k_h_m); auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x + // - k_w_m); - auto _input = get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0); + auto _input = + get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0); _output[w] = bilinear(_input, in_height, in_width, offset_y, offset_x); // _output[w] = offset_y; @@ -124,42 +133,49 @@ inline auto deform_input(cl::sycl::queue& queue, class fill_output; -template -auto output_fill_with_bias(cl::sycl::queue& queue, DataType* output, const Shape3D out_shape, DataType* bias) +template +auto output_fill_with_bias(cl::sycl::queue &queue, + DataType *output, + const Shape3D out_shape, + DataType *bias) { auto out_c = out_shape[CHW::C]; auto out_h = out_shape[CHW::H]; auto out_w = out_shape[CHW::W]; - return queue.parallel_for(sycl::range<3>(out_c, out_h, out_w),[=](sycl::id<3> idx) - { - auto c = static_cast(idx[0]); - auto h = static_cast(idx[1]); - auto w = static_cast(idx[2]); + return queue.parallel_for( + sycl::range<3>(out_c, out_h, out_w), [=](sycl::id<3> idx) { + auto c = static_cast(idx[0]); + auto h = static_cast(idx[1]); + auto w = static_cast(idx[2]); - auto out_ptr = get_ptr_3d(output, out_c, out_h, out_w, c, h, w); - *out_ptr = bias[c]; - }); + auto out_ptr = get_ptr_3d(output, out_c, out_h, out_w, c, h, w); + *out_ptr = bias[c]; + }); } -template -void deformable_convolution_b1_impl(cl::sycl::queue& queue, - const DataType* input, +template +void deformable_convolution_b1_impl(cl::sycl::queue &queue, + const DataType *input, const Shape3D in_shape, - DataType* output, + DataType *output, const Shape3D out_shape, - DataType* tmp, - const float* offset, - DataType* weights, + DataType *tmp, + const float *offset, + DataType *weights, const Shape4D weights_shape, - DataType* bias, - int stride_y, int stride_x, - int pad_y, int pad_x, - int dilation_y, int dilation_x, - int groups, int deformable_groups) + DataType *bias, + int stride_y, + int stride_x, + int pad_y, + int pad_x, + int dilation_y, + int dilation_x, + int groups, + int deformable_groups) { - using oneapi::mkl::blas::row_major::gemm; using oneapi::mkl::transpose; + using oneapi::mkl::blas::row_major::gemm; auto in_c = in_shape[CHW::C]; auto in_h = in_shape[CHW::H]; @@ -174,42 +190,41 @@ void deformable_convolution_b1_impl(cl::sycl::queue& queue, auto ker_h = weights_shape[OIHW::H]; auto ker_w = weights_shape[OIHW::W]; - auto edeform = deform_input(queue, - input, in_shape, - tmp, {in_c, ker_h, ker_w, out_h, out_w}, - offset, - stride_y, stride_x, - pad_y, pad_x, - dilation_y, dilation_x); + auto edeform = deform_input( + queue, input, in_shape, tmp, {in_c, ker_h, ker_w, out_h, out_w}, offset, + stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x); auto efill = output_fill_with_bias(queue, output, out_shape, bias); - auto egemm = gemm(queue, - transpose::N, transpose::N, /*transpose a, b*/ - out_c, out_h*out_w, in_c*ker_h*ker_w, /*m, n, k*/ - 1, /*alpha*/ - weights, in_c*ker_h*ker_w, /*a, lda*/ - tmp, out_h*out_w, /*b, ldb*/ - 1, /*beta*/ - output, out_h*out_w, /*c, ldc*/ + auto egemm = gemm(queue, transpose::N, transpose::N, /*transpose a, b*/ + out_c, out_h * out_w, in_c * ker_h * ker_w, /*m, n, k*/ + 1, /*alpha*/ + weights, in_c * ker_h * ker_w, /*a, lda*/ + tmp, out_h * out_w, /*b, ldb*/ + 1, /*beta*/ + output, out_h * out_w, /*c, ldc*/ {edeform, efill} /*events*/); egemm.wait(); } -template -void deformable_convolution_impl(cl::sycl::queue& queue, - const DataType* input, +template +void deformable_convolution_impl(cl::sycl::queue &queue, + const DataType *input, const Shape4D in_shape, - DataType* output, + DataType *output, const Shape4D out_shape, - DataType* tmp, - const float* offset, - DataType* weights, + DataType *tmp, + const float *offset, + DataType *weights, const Shape4D weights_shape, - DataType* bias, - int stride_y, int stride_x, - int pad_y, int pad_x, - int dilation_y, int dilation_x, - int groups, int deformable_groups) + DataType *bias, + int stride_y, + int stride_x, + int pad_y, + int pad_x, + int dilation_y, + int dilation_x, + int groups, + int deformable_groups) { auto in_b = in_shape[NCHW::N]; auto in_c = in_shape[NCHW::C]; @@ -224,28 +239,32 @@ void deformable_convolution_impl(cl::sycl::queue& queue, assert(out_c == weights_shape[OIHW::OC]); assert(in_c == weights_shape[OIHW::IC]); - for (auto b = 0; b < in_b; ++b) - { + for (auto b = 0; b < in_b; ++b) { auto input_ptr = get_ptr_4d(input, in_b, in_c, in_h, in_w, b, 0, 0, 0); - auto output_ptr = get_ptr_4d(output, in_b, out_c, out_h, out_w, b, 0, 0, 0); - deformable_convolution_b1_impl(queue, - input_ptr, {in_c, in_h, in_w}, - output_ptr, {out_c, out_h, out_w}, - tmp, - offset, - weights, weights_shape, - bias, - stride_y, stride_x, - pad_y, pad_x, - dilation_y, dilation_x, - groups, deformable_groups); - + auto output_ptr = + get_ptr_4d(output, in_b, out_c, out_h, out_w, b, 0, 0, 0); + deformable_convolution_b1_impl( + queue, input_ptr, {in_c, in_h, in_w}, output_ptr, + {out_c, out_h, out_w}, tmp, offset, weights, weights_shape, bias, + stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, + deformable_groups); } } -void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm_ndarray output, dpctl::tensor::usm_ndarray offset, - dpctl::tensor::usm_ndarray weights, dpctl::tensor::usm_ndarray bias, dpctl::tensor::usm_ndarray tmp, - int stride_y, int stride_x, int pad_y, int pad_x, int dilation_y, int dilation_x, int groups, int deformable_groups) +void deformable_convolution(dpctl::tensor::usm_ndarray input, + dpctl::tensor::usm_ndarray output, + dpctl::tensor::usm_ndarray offset, + dpctl::tensor::usm_ndarray weights, + dpctl::tensor::usm_ndarray bias, + dpctl::tensor::usm_ndarray tmp, + int stride_y, + int stride_x, + int pad_y, + int pad_x, + int dilation_y, + int dilation_x, + int groups, + int deformable_groups) { auto queue = input.get_queue(); @@ -268,19 +287,15 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm auto input_shape = Shape4D({batch, in_channels, in_height, in_width}); auto output_shape = Shape4D({batch, out_channels, out_height, out_width}); - auto weights_shape = Shape4D({out_channels, in_channels, kernel_height, kernel_width}); - - deformable_convolution_impl(queue, - input.get_data(), input_shape, - output.get_data(), output_shape, - tmp.get_data(), - offset.get_data(), - weights.get_data(), weights_shape, - bias.get_data(), - stride_y, stride_x, - pad_y, pad_x, - dilation_y, dilation_x, - groups, deformable_groups); + auto weights_shape = + Shape4D({out_channels, in_channels, kernel_height, kernel_width}); + + deformable_convolution_impl( + queue, input.get_data(), input_shape, output.get_data(), + output_shape, tmp.get_data(), offset.get_data(), + weights.get_data(), weights_shape, bias.get_data(), + stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, + deformable_groups); } PYBIND11_MODULE(_deformable_convolution_sycl, m) @@ -288,10 +303,10 @@ PYBIND11_MODULE(_deformable_convolution_sycl, m) import_dpctl(); m.def("deformable_convolution", &deformable_convolution, - "Defromable convolution", - py::arg("input"), py::arg("output"), py::arg("offset"), - py::arg("weights"), py::arg("bias"), py::arg("tmp"), - py::arg("stride_y"), py::arg("stride_x"), py::arg("pad_y"), py::arg("pad_x"), - py::arg("dilation_y"), py::arg("dilation_x"), - py::arg("groups"), py::arg("deformable_groups")); + "Defromable convolution", py::arg("input"), py::arg("output"), + py::arg("offset"), py::arg("weights"), py::arg("bias"), + py::arg("tmp"), py::arg("stride_y"), py::arg("stride_x"), + py::arg("pad_y"), py::arg("pad_x"), py::arg("dilation_y"), + py::arg("dilation_x"), py::arg("groups"), + py::arg("deformable_groups")); } diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp index 447a0571..603c4144 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/utils.hpp @@ -6,32 +6,28 @@ #include -template -class TensorShape: public std::array +template class TensorShape : public std::array { public: - template - const int& operator[](DimType dim) const + template const int &operator[](DimType dim) const { return std::array::operator[](static_cast(dim)); } - template - int& operator[](DimType dim) + template int &operator[](DimType dim) { return std::array::operator[](static_cast(dim)); } - }; -enum class CHW: int +enum class CHW : int { C, H, W, }; -enum class NCHW: int +enum class NCHW : int { N, C, @@ -39,7 +35,7 @@ enum class NCHW: int W, }; -enum class OIHW: int +enum class OIHW : int { OC, IC, @@ -47,7 +43,7 @@ enum class OIHW: int W, }; -enum class HWCK: int +enum class HWCK : int { H, W, @@ -56,7 +52,7 @@ enum class HWCK: int KW }; -enum class CKHW: int +enum class CKHW : int { C, KH, @@ -73,22 +69,20 @@ using Shape5D = TensorShape<5>; using DType = float; #define get_ptr_1d(data_ptr, max_dim_0, dim_0) (data_ptr + (dim_0)) -#define get_ptr_2d(data_ptr, max_dim_0, max_dim_1, dim_0, dim_1) \ -(\ - get_ptr_1d(data_ptr + (dim_0)*(max_dim_1), max_dim_1, dim_1) \ -) -#define get_ptr_3d(data_ptr, max_dim_0, max_dim_1, max_dim_2, dim_0, dim_1, dim_2) \ -(\ - get_ptr_2d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2), max_dim_1, max_dim_2, dim_1, dim_2) \ -) -#define get_ptr_4d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, dim_0, dim_1, dim_2, dim_3) \ -(\ - get_ptr_3d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3), max_dim_1, max_dim_2, max_dim_3, dim_1, dim_2, dim_3) \ -) +#define get_ptr_2d(data_ptr, max_dim_0, max_dim_1, dim_0, dim_1) \ + (get_ptr_1d(data_ptr + (dim_0) * (max_dim_1), max_dim_1, dim_1)) +#define get_ptr_3d(data_ptr, max_dim_0, max_dim_1, max_dim_2, dim_0, dim_1, \ + dim_2) \ + (get_ptr_2d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2), max_dim_1, \ + max_dim_2, dim_1, dim_2)) +#define get_ptr_4d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, \ + dim_0, dim_1, dim_2, dim_3) \ + (get_ptr_3d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2) * (max_dim_3), \ + max_dim_1, max_dim_2, max_dim_3, dim_1, dim_2, dim_3)) -#define get_ptr_5d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, max_dim_4, dim_0, dim_1, dim_2, dim_3, dim_4) \ -(\ - get_ptr_4d(data_ptr + (dim_0)*(max_dim_1)*(max_dim_2)*(max_dim_3)*(max_dim_4), \ - max_dim_1, max_dim_2, max_dim_3, max_dim_4, \ - dim_1, dim_2, dim_3, dim_4) \ -) +#define get_ptr_5d(data_ptr, max_dim_0, max_dim_1, max_dim_2, max_dim_3, \ + max_dim_4, dim_0, dim_1, dim_2, dim_3, dim_4) \ + (get_ptr_4d(data_ptr + (dim_0) * (max_dim_1) * (max_dim_2) * (max_dim_3) * \ + (max_dim_4), \ + max_dim_1, max_dim_2, max_dim_3, max_dim_4, dim_1, dim_2, \ + dim_3, dim_4)) diff --git a/dpbench/configs/bench_info/deformable_convolution.json b/dpbench/configs/bench_info/deformable_convolution.json index e18e7771..8fabbb57 100644 --- a/dpbench/configs/bench_info/deformable_convolution.json +++ b/dpbench/configs/bench_info/deformable_convolution.json @@ -9,20 +9,20 @@ "domain": "Deep learning", "parameters": { "S": { - "batch":1, - "in_channels":16, - "in_height":32, - "in_width":32, - "out_channels":16, - "out_height":32, - "out_width":32, - "kernel_height":3, - "kernel_width":3, - "stride_y":1, + "batch":1, + "in_channels":16, + "in_height":32, + "in_width":32, + "out_channels":16, + "out_height":32, + "out_width":32, + "kernel_height":3, + "kernel_width":3, + "stride_y":1, "stride_x":1, - "dilation_y":1, + "dilation_y":1, "dilation_x":1, - "pad_y":1, + "pad_y":1, "pad_x":1, "groups":1, "deformable_groups":1, @@ -30,20 +30,20 @@ "seed": 7777777 }, "M": { - "batch":1, - "in_channels":64, - "in_height":64, - "in_width":64, - "out_channels":64, - "out_height":64, - "out_width":64, - "kernel_height":3, - "kernel_width":3, - "stride_y":1, + "batch":1, + "in_channels":64, + "in_height":64, + "in_width":64, + "out_channels":64, + "out_height":64, + "out_width":64, + "kernel_height":3, + "kernel_width":3, + "stride_y":1, "stride_x":1, - "dilation_y":1, + "dilation_y":1, "dilation_x":1, - "pad_y":1, + "pad_y":1, "pad_x":1, "groups":1, "deformable_groups":1, @@ -51,20 +51,20 @@ "seed": 7777777 }, "L": { - "batch":2, - "in_channels":64, - "in_height":128, - "in_width":128, - "out_channels":128, - "out_height":128, - "out_width":128, - "kernel_height":3, - "kernel_width":3, - "stride_y":1, + "batch":2, + "in_channels":64, + "in_height":128, + "in_width":128, + "out_channels":128, + "out_height":128, + "out_width":128, + "kernel_height":3, + "kernel_width":3, + "stride_y":1, "stride_x":1, - "dilation_y":1, + "dilation_y":1, "dilation_x":1, - "pad_y":1, + "pad_y":1, "pad_x":1, "groups":1, "deformable_groups":1, @@ -75,20 +75,20 @@ "init": { "func_name": "initialize", "input_args": [ - "batch", - "in_channels", - "in_height", - "in_width", - "out_channels", - "out_height", - "out_width", - "kernel_height", - "kernel_width", - "stride_y", + "batch", + "in_channels", + "in_height", + "in_width", + "out_channels", + "out_height", + "out_width", + "kernel_height", + "kernel_width", + "stride_y", "stride_x", - "dilation_y", + "dilation_y", "dilation_x", - "pad_y", + "pad_y", "pad_x", "groups", "deformable_groups", @@ -111,11 +111,11 @@ "weights", "bias", "tmp", - "stride_y", + "stride_y", "stride_x", - "dilation_y", + "dilation_y", "dilation_x", - "pad_y", + "pad_y", "pad_x", "groups", "deformable_groups" From 809047daf6040b00fa492ba1441e280f18da9af7 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Mon, 29 May 2023 13:25:41 +0300 Subject: [PATCH 03/20] refactor --- .../deformable_convolution_initialize.py | 4 - .../deformable_convolution_numba_npr.py | 141 ------------------ .../deformable_convolution_numpy.py | 136 ----------------- .../CMakeLists.txt | 7 +- .../deformable_convolution_sycl/impl.cpp | 14 +- .../bench_info/deformable_convolution.json | 135 ----------------- 6 files changed, 7 insertions(+), 430 deletions(-) delete mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py delete mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py delete mode 100644 dpbench/configs/bench_info/deformable_convolution.json diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py index 896a2224..205f57be 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py @@ -33,20 +33,16 @@ def initialize( default_rng.random((batch, in_channels, in_height, in_width)).astype( dtype ), - # np.ones((batch, in_channels, in_height, in_width)).astype(dtype), np.zeros((batch, out_channels, out_height, out_width)).astype(dtype), - # np.zeros((kernel_height, kernel_width, 2, out_height, out_width)).astype(dtype), 2 * default_rng.random( (kernel_height, kernel_width, 2, out_height, out_width) ).astype(dtype) - 1, - # default_rng.random((out_channels, in_channels, kernel_height, kernel_width)).astype(dtype), np.ones( (out_channels, in_channels, kernel_height, kernel_width) ).astype(dtype), default_rng.random(out_channels).astype(dtype), - # np.ones((out_channels,)).astype(dtype), np.zeros( (in_channels, kernel_height, kernel_width, out_height, out_width) ).astype(dtype), diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py deleted file mode 100644 index dea7236e..00000000 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_npr.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2022 Intel Corp. -# -# SPDX-License-Identifier: Apache-2.0 - -import math - -import numpy as np -from numba import njit, prange - - -@njit(parallel=True, inline="always", fastmath=True) -def bilinear(input, offset_y, offset_x): - height, width = input.shape - start_x = int(math.floor(offset_x)) - start_x_weight = 1 - (offset_x - start_x) - start_y = int(math.floor(offset_y)) - start_y_weight = 1 - (offset_y - start_y) - - output = 0 - if ( - offset_x >= width - or offset_y >= height - or offset_x <= -1 - or offset_y <= -1 - ): - return output - - if start_x >= 0 and start_y >= 0: - w = start_x_weight * start_y_weight - output += w * input[start_y, start_x] - - if start_x + 1 < width and start_y >= 0: - w = (1 - start_x_weight) * start_y_weight - output += w * input[start_y, start_x + 1] - - if start_x >= 0 and start_y + 1 < height: - w = start_x_weight * (1 - start_y_weight) - output += w * input[start_y + 1, start_x] - - if start_x + 1 < width and start_y + 1 < height: - w = (1 - start_x_weight) * (1 - start_y_weight) - output += w * input[start_y + 1, start_x + 1] - - return output - - -@njit(parallel=True, fastmath=True) -def deform( - input, offset, output, stride, pad, dilation, groups, deformable_groups -): - k_height, k_width, _, out_height, out_width = offset.shape - channels, _, _ = input.shape - - k_h_m = (k_height - 1) // 2 - k_w_m = (k_width - 1) // 2 - - for ckhkw in prange(channels * k_height * k_width): - for h in prange(out_height): - for w in prange(out_width): - c = ckhkw // (k_height * k_width) - khkw = ckhkw % (k_height * k_width) - kh = khkw // k_width - kw = khkw % k_width - - offset_y = ( - offset[kh, kw, 1, h, w] - + h * stride[0] - + (kh - k_h_m) * dilation[0] - - (pad[0] - k_h_m) - ) - offset_x = ( - offset[kh, kw, 0, h, w] - + w * stride[1] - + (kw - k_w_m) * dilation[1] - - (pad[1] - k_w_m) - ) - - output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) - - -@njit(parallel=True, fastmath=True) -def deformable_convolution_b1( - input, - output, - offset, - weights, - bias, - tmp, - stride, - pad, - dilation, - groups, - deformable_groups, -): - out_channels, height, width = output.shape - _, in_channels, k_height, k_width = weights.shape - - deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups) - - tmp = tmp.reshape((in_channels * k_height * k_width, height * width)) - - _weights = weights.reshape((out_channels, in_channels * k_height * k_width)) - _output = output.reshape((out_channels, height * width)) - np.dot(_weights, tmp, _output) - - _bias = bias.reshape((out_channels, 1)) - _output[:] = _output + _bias - - -@njit(parallel=True) -def deformable_convolution( - input, - output, - offset, - weights, - bias, - tmp, - stride_y, - stride_x, - pad_y, - pad_x, - dilation_y, - dilation_x, - groups, - deformable_groups, -): - batch, _, _, _ = input.shape - for b in range(batch): - deformable_convolution_b1( - input[b], - output[b], - offset, - weights, - bias, - tmp, - (stride_y, stride_x), - (pad_y, pad_x), - (dilation_y, dilation_x), - groups, - deformable_groups, - ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py deleted file mode 100644 index d0ec3d5f..00000000 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numpy.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright 2022 Intel Corp. -# -# SPDX-License-Identifier: Apache-2.0 - -import math - -import numpy as np -from numba import njit, prange - - -@njit(parallel=True, inline="always", fastmath=True) -def bilinear(input, offset_y, offset_x): - height, width = input.shape - start_x = int(math.floor(offset_x)) - start_x_weight = 1 - (offset_x - start_x) - start_y = int(math.floor(offset_y)) - start_y_weight = 1 - (offset_y - start_y) - - output = 0 - if start_x >= width or start_y >= height or start_x < -1 or start_y < -1: - return output - - if start_x >= 0 and start_y >= 0: - w = start_x_weight * start_y_weight - output += w * input[start_y, start_x] - - if start_x + 1 < width and start_y >= 0: - w = (1 - start_x_weight) * start_y_weight - output += w * input[start_y, start_x + 1] - - if start_x >= 0 and start_y + 1 < height: - w = start_x_weight * (1 - start_y_weight) - output += w * input[start_y + 1, start_x] - - if start_x + 1 < width and start_y + 1 < height: - w = (1 - start_x_weight) * (1 - start_y_weight) - output += w * input[start_y + 1, start_x + 1] - - return output - - -@njit(parallel=True, fastmath=True) -def deform( - input, offset, output, stride, pad, dilation, groups, deformable_groups -): - k_height, k_width, _, out_height, out_width = offset.shape - channels, _, _ = input.shape - - k_h_m = (k_height - 1) // 2 - k_w_m = (k_width - 1) // 2 - - for ckhkw in prange(channels * k_height * k_width): - for h in prange(out_height): - for w in prange(out_width): - c = ckhkw // (k_height * k_width) - khkw = ckhkw % (k_height * k_width) - kh = khkw // k_width - kw = khkw % k_width - - offset_y = ( - offset[kh, kw, 1, h, w] - + h * stride[0] - + (kh - k_h_m) * dilation[0] - - (pad[0] - k_h_m) - ) - offset_x = ( - offset[kh, kw, 0, h, w] - + w * stride[1] - + (kw - k_w_m) * dilation[1] - - (pad[1] - k_w_m) - ) - - output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) - - -# @njit(parallel=True, fastmath=True) -def deformable_convolution_b1( - input, - output, - offset, - weights, - bias, - tmp, - stride, - pad, - dilation, - groups, - deformable_groups, -): - out_channels, height, width = output.shape - _, in_channels, k_height, k_width = weights.shape - - deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups) - - tmp = tmp.reshape((in_channels * k_height * k_width, height * width)) - - _weights = weights.reshape((out_channels, in_channels * k_height * k_width)) - _output = output.reshape((out_channels, height * width)) - np.dot(_weights, tmp, _output) - - _bias = bias.reshape((out_channels, 1)) - _output[:] = _output + _bias - - -# @njit(parallel=True) -def deformable_convolution( - input, - output, - offset, - weights, - bias, - tmp, - stride_y, - stride_x, - pad_y, - pad_x, - dilation_y, - dilation_x, - groups, - deformable_groups, -): - batch, _, _, _ = input.shape - for b in range(batch): - deformable_convolution_b1( - input[b], - output[b], - offset, - weights, - bias, - tmp, - (stride_y, stride_x), - (pad_y, pad_x), - (dilation_y, dilation_x), - groups, - deformable_groups, - ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index e874e715..d4284907 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -1,3 +1,5 @@ +cmake_minimum_required(VERSION 3.23) + set(py_module_name _deformable_convolution_sycl) pybind11_add_module(${py_module_name} MODULE @@ -5,15 +7,10 @@ pybind11_add_module(${py_module_name} ) find_package(MKL CONFIG REQUIRED) -# target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $) target_compile_options(${py_module_name} PUBLIC $) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $) target_link_libraries(${py_module_name} PUBLIC $ ${MKL_SYCL}) -message(STATUS "${MKL_IMPORTED_TARGETS}") -message(STATUS "${INTERFACE_COMPILE_OPTIONS}") -message(STATUS "${INTERFACE_INCLUDE_DIRECTORIES}") - file(RELATIVE_PATH py_module_dest ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) install(TARGETS ${py_module_name} DESTINATION ${py_module_dest}/deformable_convolution_sycl diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp index 005155bd..eedd9799 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -119,15 +119,11 @@ inline auto deform_input(cl::sycl::queue &queue, out_width, kh, kw, 0, h, w) + w * stride_x + (kw - k_w_m) * dilation_x - (pad_x - k_w_m); - // auto offset_y = h*stride_y + (kh - k_h_m)*dilation_y - (pad_y - - // k_h_m); auto offset_x = w*stride_x + (kw - k_w_m)*dilation_x - (pad_x - // - k_w_m); auto _input = get_ptr_3d(input, in_channels, in_height, in_width, c, 0, 0); _output[w] = bilinear(_input, in_height, in_width, offset_y, offset_x); - // _output[w] = offset_y; }); } @@ -197,11 +193,11 @@ void deformable_convolution_b1_impl(cl::sycl::queue &queue, auto efill = output_fill_with_bias(queue, output, out_shape, bias); auto egemm = gemm(queue, transpose::N, transpose::N, /*transpose a, b*/ out_c, out_h * out_w, in_c * ker_h * ker_w, /*m, n, k*/ - 1, /*alpha*/ - weights, in_c * ker_h * ker_w, /*a, lda*/ - tmp, out_h * out_w, /*b, ldb*/ - 1, /*beta*/ - output, out_h * out_w, /*c, ldc*/ + 1, /*alpha*/ + weights, in_c * ker_h * ker_w, /*a, lda*/ + tmp, out_h * out_w, /*b, ldb*/ + 1, /*beta*/ + output, out_h * out_w, /*c, ldc*/ {edeform, efill} /*events*/); egemm.wait(); } diff --git a/dpbench/configs/bench_info/deformable_convolution.json b/dpbench/configs/bench_info/deformable_convolution.json deleted file mode 100644 index 8fabbb57..00000000 --- a/dpbench/configs/bench_info/deformable_convolution.json +++ /dev/null @@ -1,135 +0,0 @@ -{ - "benchmark": { - "name": "Deformable convolution", - "short_name": "deformable-convolution", - "relative_path": "deformable_convolution", - "module_name": "deformable_convolution", - "func_name": "deformable_convolution", - "kind": "microbenchmark", - "domain": "Deep learning", - "parameters": { - "S": { - "batch":1, - "in_channels":16, - "in_height":32, - "in_width":32, - "out_channels":16, - "out_height":32, - "out_width":32, - "kernel_height":3, - "kernel_width":3, - "stride_y":1, - "stride_x":1, - "dilation_y":1, - "dilation_x":1, - "pad_y":1, - "pad_x":1, - "groups":1, - "deformable_groups":1, - "dtype":"float32", - "seed": 7777777 - }, - "M": { - "batch":1, - "in_channels":64, - "in_height":64, - "in_width":64, - "out_channels":64, - "out_height":64, - "out_width":64, - "kernel_height":3, - "kernel_width":3, - "stride_y":1, - "stride_x":1, - "dilation_y":1, - "dilation_x":1, - "pad_y":1, - "pad_x":1, - "groups":1, - "deformable_groups":1, - "dtype":"float32", - "seed": 7777777 - }, - "L": { - "batch":2, - "in_channels":64, - "in_height":128, - "in_width":128, - "out_channels":128, - "out_height":128, - "out_width":128, - "kernel_height":3, - "kernel_width":3, - "stride_y":1, - "stride_x":1, - "dilation_y":1, - "dilation_x":1, - "pad_y":1, - "pad_x":1, - "groups":1, - "deformable_groups":1, - "dtype":"float32", - "seed": 7777777 - } - }, - "init": { - "func_name": "initialize", - "input_args": [ - "batch", - "in_channels", - "in_height", - "in_width", - "out_channels", - "out_height", - "out_width", - "kernel_height", - "kernel_width", - "stride_y", - "stride_x", - "dilation_y", - "dilation_x", - "pad_y", - "pad_x", - "groups", - "deformable_groups", - "dtype", - "seed" - ], - "output_args": [ - "input", - "output", - "offset", - "weights", - "bias", - "tmp" - ] - }, - "input_args": [ - "input", - "output", - "offset", - "weights", - "bias", - "tmp", - "stride_y", - "stride_x", - "dilation_y", - "dilation_x", - "pad_y", - "pad_x", - "groups", - "deformable_groups" - ], - "array_args": [ - "input", - "output", - "offset", - "weights", - "bias", - "tmp" - ], - "output_args": [ - "output" - ] - } -} From 7d39e0ef2e76b5a6a8f01bca724708f880acf6c9 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Thu, 1 Jun 2023 11:01:10 +0200 Subject: [PATCH 04/20] Small fixes --- .../deformable_convolution/__init__.py | 37 +---- .../deformable_convolution_numba_mlir_p.py | 142 ++++++++++++++++++ dpbench/config/reader.py | 2 +- 3 files changed, 144 insertions(+), 37 deletions(-) create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py diff --git a/dpbench/benchmarks/deformable_convolution/__init__.py b/dpbench/benchmarks/deformable_convolution/__init__.py index a5db8b1d..a94a9e6b 100644 --- a/dpbench/benchmarks/deformable_convolution/__init__.py +++ b/dpbench/benchmarks/deformable_convolution/__init__.py @@ -2,40 +2,5 @@ # # SPDX-License-Identifier: Apache 2.0 -from .deformable_convolution_initialize import initialize -from .deformable_convolution_numba_npr import ( - deformable_convolution as deformable_convolution_numba_npr, -) -from .deformable_convolution_numpy import ( - deformable_convolution as deformable_convolution_numpy, -) -from .deformable_convolution_sycl_native_ext import deformable_convolution_sycl - -__all__ = [ - "initialize", - "deformable_convolution_numba_npr", - "deformable_convolution_numba_numpy", - "deformable_convolution_sycl", -] - -"""l2-norm calculation of n vectors - -Input ---------- -npoints: int - number of vectors -dims: int - dimension of single vector -seed: int - random seed to generate random number - -Output -------- -d: array - l2 norm of each vector - -Method ------- - ||Vj||2=sqrt(sum(Xj[i]*Xj[i])) - here i is 0->dims, j is 0->npoints +"""Deformable convolution """ diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py new file mode 100644 index 00000000..76d354d0 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py @@ -0,0 +1,142 @@ +# Copyright 2022 Intel Corp. +# +# SPDX-License-Identifier: Apache-2.0 + +import math + +import numpy as np +from numba import prange +from numba_mlir import njit + + +@njit(parallel=True, inline="always", fastmath=True) +def bilinear(input, offset_y, offset_x): + height, width = input.shape + start_x = int(math.floor(offset_x)) + start_x_weight = 1 - (offset_x - start_x) + start_y = int(math.floor(offset_y)) + start_y_weight = 1 - (offset_y - start_y) + + output = 0 + if ( + offset_x >= width + or offset_y >= height + or offset_x <= -1 + or offset_y <= -1 + ): + return output + + if start_x >= 0 and start_y >= 0: + w = start_x_weight * start_y_weight + output += w * input[start_y, start_x] + + if start_x + 1 < width and start_y >= 0: + w = (1 - start_x_weight) * start_y_weight + output += w * input[start_y, start_x + 1] + + if start_x >= 0 and start_y + 1 < height: + w = start_x_weight * (1 - start_y_weight) + output += w * input[start_y + 1, start_x] + + if start_x + 1 < width and start_y + 1 < height: + w = (1 - start_x_weight) * (1 - start_y_weight) + output += w * input[start_y + 1, start_x + 1] + + return output + + +@njit(parallel=True, fastmath=True) +def deform( + input, offset, output, stride, pad, dilation, groups, deformable_groups +): + k_height, k_width, _, out_height, out_width = offset.shape + channels, _, _ = input.shape + + k_h_m = (k_height - 1) // 2 + k_w_m = (k_width - 1) // 2 + + for ckhkw in prange(channels * k_height * k_width): + for h in prange(out_height): + for w in prange(out_width): + c = ckhkw // (k_height * k_width) + khkw = ckhkw % (k_height * k_width) + kh = khkw // k_width + kw = khkw % k_width + + offset_y = ( + offset[kh, kw, 1, h, w] + + h * stride[0] + + (kh - k_h_m) * dilation[0] + - (pad[0] - k_h_m) + ) + offset_x = ( + offset[kh, kw, 0, h, w] + + w * stride[1] + + (kw - k_w_m) * dilation[1] + - (pad[1] - k_w_m) + ) + + output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) + + +@njit(parallel=True, fastmath=True) +def deformable_convolution_b1( + input, + output, + offset, + weights, + bias, + tmp, + stride, + pad, + dilation, + groups, + deformable_groups, +): + out_channels, height, width = output.shape + _, in_channels, k_height, k_width = weights.shape + + deform(input, offset, tmp, stride, pad, dilation, groups, deformable_groups) + + tmp = tmp.reshape((in_channels * k_height * k_width, height * width)) + + _weights = weights.reshape((out_channels, in_channels * k_height * k_width)) + _output = output.reshape((out_channels, height * width)) + np.dot(_weights, tmp, _output) + + _bias = bias.reshape((out_channels, 1)) + _output[:] = _output + _bias + + +@njit(parallel=True) +def deformable_convolution( + input, + output, + offset, + weights, + bias, + tmp, + stride_y, + stride_x, + pad_y, + pad_x, + dilation_y, + dilation_x, + groups, + deformable_groups, +): + batch, _, _, _ = input.shape + for b in range(batch): + deformable_convolution_b1( + input[b], + output[b], + offset, + weights, + bias, + tmp, + (stride_y, stride_x), + (pad_y, pad_x), + (dilation_y, dilation_x), + groups, + deformable_groups, + ) diff --git a/dpbench/config/reader.py b/dpbench/config/reader.py index bc549653..58b5edf8 100644 --- a/dpbench/config/reader.py +++ b/dpbench/config/reader.py @@ -19,7 +19,7 @@ from .implementation_postfix import Implementation from .module import Module -_REFERENCE_IMPLEMENTATIONS = {"numpy", "python"} +_REFERENCE_IMPLEMENTATIONS = {"numpy", "python", "sycl"} def read_configs( # noqa: C901: TODO: move modules into config From 0de01b1fd3ec3811a6d987815ea02f73556c0671 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Fri, 9 Jun 2023 20:35:44 +0200 Subject: [PATCH 05/20] Add config & pre-commit --- .../deformable_convolution/CMakeLists.txt | 4 + .../CMakeLists.txt | 4 + .../deformable_convolution_sycl/impl.cpp | 10 +- .../bench_info/deformable_convolution.toml | 136 ++++++++++++++++++ 4 files changed, 149 insertions(+), 5 deletions(-) create mode 100644 dpbench/configs/bench_info/deformable_convolution.toml diff --git a/dpbench/benchmarks/deformable_convolution/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt index d06d5c73..820b345a 100644 --- a/dpbench/benchmarks/deformable_convolution/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/CMakeLists.txt @@ -1 +1,5 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + add_subdirectory(deformable_convolution_sycl_native_ext) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index d4284907..86cd122f 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + cmake_minimum_required(VERSION 3.23) set(py_module_name _deformable_convolution_sycl) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp index eedd9799..211c0b4b 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -193,11 +193,11 @@ void deformable_convolution_b1_impl(cl::sycl::queue &queue, auto efill = output_fill_with_bias(queue, output, out_shape, bias); auto egemm = gemm(queue, transpose::N, transpose::N, /*transpose a, b*/ out_c, out_h * out_w, in_c * ker_h * ker_w, /*m, n, k*/ - 1, /*alpha*/ - weights, in_c * ker_h * ker_w, /*a, lda*/ - tmp, out_h * out_w, /*b, ldb*/ - 1, /*beta*/ - output, out_h * out_w, /*c, ldc*/ + 1, /*alpha*/ + weights, in_c * ker_h * ker_w, /*a, lda*/ + tmp, out_h * out_w, /*b, ldb*/ + 1, /*beta*/ + output, out_h * out_w, /*c, ldc*/ {edeform, efill} /*events*/); egemm.wait(); } diff --git a/dpbench/configs/bench_info/deformable_convolution.toml b/dpbench/configs/bench_info/deformable_convolution.toml new file mode 100644 index 00000000..fd339b1a --- /dev/null +++ b/dpbench/configs/bench_info/deformable_convolution.toml @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +[benchmark] +name = "Deformable convolution" +short_name = "deformable-convolution" +relative_path = "deformable_convolution" +module_name = "deformable_convolution" +func_name = "deformable_convolution" +kind = "microbenchmark" +domain = "Deep learning" +input_args = [ + "input", + "output", + "offset", + "weights", + "bias", + "tmp", + "stride_y", + "stride_x", + "dilation_y", + "dilation_x", + "pad_y", + "pad_x", + "groups", + "deformable_groups" +] +array_args = [ + "input", + "output", + "offset", + "weights", + "bias", + "tmp" +] +output_args = [ + "output" +] + +[benchmark.parameters.S] +batch = 1 +in_channels = 16 +in_height = 32 +in_width = 32 +out_channels = 16 +out_height = 32 +out_width = 32 +kernel_height = 3 +kernel_width = 3 +stride_y = 1 +stride_x = 1 +dilation_y = 1 +dilation_x = 1 +pad_y = 1 +pad_x = 1 +groups = 1 +deformable_groups = 1 +dtype = "float32" +seed = 7777777 + +[benchmark.parameters.M] +batch = 1 +in_channels = 64 +in_height = 64 +in_width = 64 +out_channels = 64 +out_height = 64 +out_width = 64 +kernel_height = 3 +kernel_width = 3 +stride_y = 1 +stride_x = 1 +dilation_y = 1 +dilation_x = 1 +pad_y = 1 +pad_x = 1 +groups = 1 +deformable_groups = 1 +dtype = "float32" +seed = 7777777 + +[benchmark.parameters.L] +batch = 2 +in_channels = 64 +in_height = 128 +in_width = 128 +out_channels = 128 +out_height = 128 +out_width = 128 +kernel_height = 3 +kernel_width = 3 +stride_y = 1 +stride_x = 1 +dilation_y = 1 +dilation_x = 1 +pad_y = 1 +pad_x = 1 +groups = 1 +deformable_groups = 1 +dtype = "float32" +seed = 7777777 + +[benchmark.init] +func_name = "initialize" +types_dict_name="types_dict" +precision="single" +input_args = [ + "batch", + "in_channels", + "in_height", + "in_width", + "out_channels", + "out_height", + "out_width", + "kernel_height", + "kernel_width", + "stride_y", + "stride_x", + "dilation_y", + "dilation_x", + "pad_y", + "pad_x", + "groups", + "deformable_groups", + "dtype", + "seed" +] +output_args = [ + "input", + "output", + "offset", + "weights", + "bias", + "tmp" +] From 0e1b858ca274a07883bf0debb1b23958b9097750 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Mon, 19 Jun 2023 23:48:00 +0200 Subject: [PATCH 06/20] Fix mlir_p version & result --- .../deformable_convolution_numba_mlir_p.py | 11 +++++------ .../deformable_convolution_sycl/impl.cpp | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py index 76d354d0..55e7246b 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py @@ -9,7 +9,7 @@ from numba_mlir import njit -@njit(parallel=True, inline="always", fastmath=True) +@njit(parallel=True, inline="always", fastmath=True, gpu_fp64_truncate="auto") def bilinear(input, offset_y, offset_x): height, width = input.shape start_x = int(math.floor(offset_x)) @@ -42,10 +42,10 @@ def bilinear(input, offset_y, offset_x): w = (1 - start_x_weight) * (1 - start_y_weight) output += w * input[start_y + 1, start_x + 1] - return output + return output/2 -@njit(parallel=True, fastmath=True) +@njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto") def deform( input, offset, output, stride, pad, dilation, groups, deformable_groups ): @@ -54,7 +54,6 @@ def deform( k_h_m = (k_height - 1) // 2 k_w_m = (k_width - 1) // 2 - for ckhkw in prange(channels * k_height * k_width): for h in prange(out_height): for w in prange(out_width): @@ -79,7 +78,7 @@ def deform( output[c, kh, kw, h, w] = bilinear(input[c], offset_y, offset_x) -@njit(parallel=True, fastmath=True) +@njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto") def deformable_convolution_b1( input, output, @@ -108,7 +107,7 @@ def deformable_convolution_b1( _output[:] = _output + _bias -@njit(parallel=True) +@njit(parallel=True, gpu_fp64_truncate="auto") def deformable_convolution( input, output, diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp index 211c0b4b..ad4e9ef2 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -62,7 +62,7 @@ __attribute__((always_inline)) DataType bilinear(const DataType *input, result += w3 * v3; } - return result; + return result/2; } class deform; From ff457c058c5815b665e0165b82178e79ac18309f Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sat, 24 Jun 2023 18:14:56 +0200 Subject: [PATCH 07/20] Add TBBConfig.cmake --- .../deformable_convolution_numba_mlir_p.py | 2 +- .../CMakeLists.txt | 1 + .../cmake/TBBConfig.cmake | 193 ++++++++++++++++++ .../deformable_convolution_sycl/impl.cpp | 2 +- 4 files changed, 196 insertions(+), 2 deletions(-) create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py index 55e7246b..b99ff53b 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py @@ -42,7 +42,7 @@ def bilinear(input, offset_y, offset_x): w = (1 - start_x_weight) * (1 - start_y_weight) output += w * input[start_y + 1, start_x + 1] - return output/2 + return output / 2 @njit(parallel=True, fastmath=True, gpu_fp64_truncate="auto") diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index 86cd122f..f7efe986 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -9,6 +9,7 @@ pybind11_add_module(${py_module_name} MODULE deformable_convolution_sycl/impl.cpp ) +find_package(TBB REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH) find_package(MKL CONFIG REQUIRED) target_compile_options(${py_module_name} PUBLIC $) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake new file mode 100644 index 00000000..5363c1d3 --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/TBBConfig.cmake @@ -0,0 +1,193 @@ +# Copyright (c) 2017-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# It defines the following variables: +# TBB__FOUND +# TBB_IMPORTED_TARGETS +# +# TBBConfigVersion.cmake defines TBB_VERSION +# +# Initialize to default values +if (NOT TBB_IMPORTED_TARGETS) + set(TBB_IMPORTED_TARGETS "") +endif() + +if (NOT TBB_FIND_COMPONENTS) + set(TBB_FIND_COMPONENTS "tbb;tbbmalloc;tbbmalloc_proxy") + foreach (_tbb_component ${TBB_FIND_COMPONENTS}) + set(TBB_FIND_REQUIRED_${_tbb_component} 1) + endforeach() +endif() + +get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_DIR}" REALPATH) +get_filename_component(_tbb_root "${_tbb_root}/../../.." ABSOLUTE) + +set(TBB_INTERFACE_VERSION ) + +set(_tbb_bin_version 12) +set(_tbbmalloc_bin_version 2) +set(_tbbmalloc_proxy_bin_version 2) +set(_tbbbind_bin_version 3) + +# Add components with internal dependencies: tbbmalloc_proxy -> tbbmalloc +list(FIND TBB_FIND_COMPONENTS tbbmalloc_proxy _tbbmalloc_proxy_ix) +if (NOT _tbbmalloc_proxy_ix EQUAL -1) + list(APPEND TBB_FIND_COMPONENTS tbbmalloc) + list(REMOVE_DUPLICATES TBB_FIND_COMPONENTS) + set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy}) +endif() +unset(_tbbmalloc_proxy_ix) + +if (CMAKE_SIZEOF_VOID_P STREQUAL "8") + set(_tbb_subdir intel64/gcc4.8) +else () + set(_tbb_subdir ia32/gcc4.8) +endif() + +if (UNIX) + set(_tbb_lib_ext ".so") + set(_tbb_lib_prefix "lib") + set(_tbb_lib_dir_conda "lib") + set(_bin_version "") +elseif (WIN32) + set(_bin_version "") + set(_tbb_lib_prefix "") + set(_tbb_lib_ext ".dll") + set(_tbb_impllib_ext ".lib") + set(_tbb_lib_dir_conda "bin") + set(_tbb_impllib_dir_conda "lib") +else() + message(FATAL_ERROR "Unsupported platform. Only Unix and Windows are supported.") +endif() + +foreach (_tbb_component ${TBB_FIND_COMPONENTS}) + set(TBB_${_tbb_component}_FOUND 0) + +if(WIN32) + unset(_bin_version) + if (_tbb_component STREQUAL tbb) + set(_bin_version ${_tbb_bin_version}) + endif() +endif() + + if(UNIX) + find_library(_tbb_release_lib + NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_lib_ext} + PATHS ${_tbb_root} + HINTS ENV TBB_ROOT_HINT + PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}") + + else() + find_file(_tbb_release_lib + NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_lib_ext} + PATHS ${_tbb_root} + HINTS ENV TBB_ROOT_HINT + PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}") + + if (EXISTS "${_tbb_release_lib}") + find_library(_tbb_release_impllib + NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}${_tbb_impllib_ext} + PATHS ${_tbb_root} + HINTS ENV TBB_ROOT_HINT + PATH_SUFFIXES "${_tbb_impllib_dir_conda}" "lib/${_tbb_subdir}") + endif() + endif() + + if (NOT TBB_FIND_RELEASE_ONLY) + find_library(_tbb_debug_lib + NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}_debug.${_tbb_lib_ext} + PATHS ${_tbb_root} + HINTS ENV TBB_ROOT_HINT + PATH_SUFFIXES "${_tbb_lib_dir_conda}" "lib/${_tbb_subdir}") + if(WIN32 AND EXISTS "${_tbb_debug_lib}") + find_library(_tbb_debug_impllib + NAMES ${_tbb_lib_prefix}${_tbb_component}${_bin_version}_debug.${_tbb_impllib_ext} + PATHS ${_tbb_root} + HINTS ENV TBB_ROOT_HINT + PATH_SUFFIXES "${_tbb_impllib_dir_conda}" "lib/${_tbb_subdir}") + endif() + endif() + + if (EXISTS "${_tbb_release_lib}" OR EXISTS "${_tbb_debug_lib}") + if (NOT TARGET TBB::${_tbb_component}) + add_library(TBB::${_tbb_component} SHARED IMPORTED) + + find_path(_tbb_include_dir + oneapi/tbb.h + PATHS ${_tbb_root} + PATH_SUFFIXES include + HINTS ENV TBB_ROOT_HINT + ) + +if(WIN32) + set_target_properties( + TBB::${_tbb_component} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_tbb_include_dir}" + INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1" + ) +else() + set_target_properties( + TBB::${_tbb_component} PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_tbb_include_dir}" + ) +endif() + unset(_tbb_current_realpath) + unset(_tbb_include_dir) + + if (EXISTS "${_tbb_release_lib}") +if(WIN32) + set_target_properties(TBB::${_tbb_component} PROPERTIES + IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}" + IMPORTED_IMPLIB_RELEASE "${_tbb_release_impllib}") +else() + set_target_properties(TBB::${_tbb_component} PROPERTIES + IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}") +endif() + set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) + endif() + + if (EXISTS "${_tbb_debug_lib}") +if(WIN32) + set_target_properties(TBB::${_tbb_component} PROPERTIES + IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}" + IMPORTED_IMPLIB_DEBUG "${_tbb_debug_impllib}" + ) +else() + set_target_properties(TBB::${_tbb_component} PROPERTIES + IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}") +endif() + set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) + endif() + + # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc + if (_tbb_component STREQUAL tbbmalloc_proxy) + set_target_properties(TBB::tbbmalloc_proxy PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbbmalloc) + endif() + endif() + list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component}) + set(TBB_${_tbb_component}_FOUND 1) + elseif (TBB_FIND_REQUIRED AND TBB_FIND_REQUIRED_${_tbb_component}) + message(STATUS "Missed required oneTBB component: ${_tbb_component}") + if (TBB_FIND_RELEASE_ONLY) + message(STATUS " ${_tbb_release_lib} must exist.") + else() + message(STATUS " one or both of:\n ${_tbb_release_lib}\n ${_tbb_debug_lib}\n files must exist.") + endif() + set(TBB_FOUND FALSE) + endif() +endforeach() +list(REMOVE_DUPLICATES TBB_IMPORTED_TARGETS) +unset(_tbb_release_lib) +unset(_tbb_debug_lib) +unset(_tbb_root) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp index ad4e9ef2..12ac11de 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -62,7 +62,7 @@ __attribute__((always_inline)) DataType bilinear(const DataType *input, result += w3 * v3; } - return result/2; + return result / 2; } class deform; From 95fa4a90234b4b319e184d0908d832f1b21ba978 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sat, 24 Jun 2023 18:26:09 +0200 Subject: [PATCH 08/20] Fix TBB discovery --- .../deformable_convolution_sycl_native_ext/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index f7efe986..701a07ab 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -9,7 +9,9 @@ pybind11_add_module(${py_module_name} MODULE deformable_convolution_sycl/impl.cpp ) -find_package(TBB REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH) + +message(${CMAKE_CURRENT_SOURCE_DIR}) +find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) find_package(MKL CONFIG REQUIRED) target_compile_options(${py_module_name} PUBLIC $) From 00a0944dccc589793aef92273c4d70735e258051 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sun, 25 Jun 2023 00:03:31 +0200 Subject: [PATCH 09/20] Add tbb and mkl to the environment --- dpbench/config/reader.py | 2 +- dpbench/configs/bench_info/deformable_convolution.toml | 1 + environments/conda-linux-sycl.yml | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dpbench/config/reader.py b/dpbench/config/reader.py index 58b5edf8..bc549653 100644 --- a/dpbench/config/reader.py +++ b/dpbench/config/reader.py @@ -19,7 +19,7 @@ from .implementation_postfix import Implementation from .module import Module -_REFERENCE_IMPLEMENTATIONS = {"numpy", "python", "sycl"} +_REFERENCE_IMPLEMENTATIONS = {"numpy", "python"} def read_configs( # noqa: C901: TODO: move modules into config diff --git a/dpbench/configs/bench_info/deformable_convolution.toml b/dpbench/configs/bench_info/deformable_convolution.toml index fd339b1a..4c22313c 100644 --- a/dpbench/configs/bench_info/deformable_convolution.toml +++ b/dpbench/configs/bench_info/deformable_convolution.toml @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 [benchmark] +reference_implementation_postfix = "sycl" name = "Deformable convolution" short_name = "deformable-convolution" relative_path = "deformable_convolution" diff --git a/environments/conda-linux-sycl.yml b/environments/conda-linux-sycl.yml index 8d34ee73..b77a2348 100644 --- a/environments/conda-linux-sycl.yml +++ b/environments/conda-linux-sycl.yml @@ -34,3 +34,5 @@ dependencies: - libgcc-ng - libstdcxx-ng - libgomp + - tbb-devel >= 2021.6.0 + - mkl-devel-dpcpp>=2023.0.0 \ No newline at end of file From 46d90dc3a532a6fc8e731406150a5e3e08872094 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sun, 25 Jun 2023 00:36:57 +0200 Subject: [PATCH 10/20] pre-commit --- environments/conda-linux-sycl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/conda-linux-sycl.yml b/environments/conda-linux-sycl.yml index b77a2348..7fe49532 100644 --- a/environments/conda-linux-sycl.yml +++ b/environments/conda-linux-sycl.yml @@ -35,4 +35,4 @@ dependencies: - libstdcxx-ng - libgomp - tbb-devel >= 2021.6.0 - - mkl-devel-dpcpp>=2023.0.0 \ No newline at end of file + - mkl-devel-dpcpp>=2023.0.0 From 5badbc47da700112237c643b597791dfc13561b6 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sun, 25 Jun 2023 01:09:52 +0200 Subject: [PATCH 11/20] Add MKLConfig.cmake --- .../CMakeLists.txt | 2 +- .../cmake/MKLConfig.cmake | 851 ++++++++++++++++++ 2 files changed, 852 insertions(+), 1 deletion(-) create mode 100644 dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index 701a07ab..bf233286 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -12,7 +12,7 @@ pybind11_add_module(${py_module_name} message(${CMAKE_CURRENT_SOURCE_DIR}) find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) -find_package(MKL CONFIG REQUIRED) +find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) target_compile_options(${py_module_name} PUBLIC $) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS} $) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake new file mode 100644 index 00000000..07290e8e --- /dev/null +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake @@ -0,0 +1,851 @@ +#=============================================================================== +# Copyright 2021-2022 Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, and +# your use of them is governed by the express license under which they were +# provided to you (License). Unless the License provides otherwise, you may not +# use, modify, copy, publish, distribute, disclose or transmit this software or +# the related documents without Intel's prior written permission. +# +# This software and the related documents are provided as is, with no express +# or implied warranties, other than those that are expressly stated in the +# License. +#=============================================================================== + +#=================================================================== +# CMake Config file for Intel(R) oneAPI Math Kernel Library (oneMKL) +#=================================================================== + +#=============================================================================== +# Input parameters +#================= +#------------- +# Main options +#------------- +# MKL_ROOT: oneMKL root directory (May be required for non-standard install locations. Optional otherwise.) +# Default: use location from MKLROOT environment variable or /../../../ if MKLROOT is not defined +# MKL_ARCH +# Values: ia32 intel64 +# Default: intel64 +# MKL_LINK +# Values: static, dynamic, sdl +# Default: dynamic +# Exceptions:- DPC++ doesn't support sdl +# MKL_THREADING +# Values: sequential, +# intel_thread (Intel OpenMP), +# gnu_thread (GNU OpenMP), +# pgi_thread (PGI OpenMP), +# tbb_thread +# Default: intel_thread +# Exceptions:- DPC++ defaults to tbb, PGI compiler on Windows defaults to pgi_thread +# MKL_INTERFACE (for MKL_ARCH=intel64 only) +# Values: lp64, ilp64 +# GNU or INTEL interface will be selected based on Compiler. +# Default: ilp64 +# MKL_MPI +# Values: intelmpi, mpich, openmpi, msmpi, mshpc +# Default: intelmpi +#----------------------------------- +# Special options (OFF by default) +#----------------------------------- +# ENABLE_BLAS95: Enables BLAS Fortran95 API +# ENABLE_LAPACK95: Enables LAPACK Fortran95 API +# ENABLE_BLACS: Enables cluster BLAS library +# ENABLE_CDFT: Enables cluster DFT library +# ENABLE_CPARDISO: Enables cluster PARDISO functionality +# ENABLE_SCALAPACK: Enables cluster LAPACK library +# ENABLE_OMP_OFFLOAD: Enables OpenMP Offload functionality +# +#================== +# Output parameters +#================== +# MKL_ROOT +# oneMKL root directory. +# MKL_INCLUDE +# Use of target_include_directories() is recommended. +# INTERFACE_INCLUDE_DIRECTORIES property is set on mkl_core and mkl_rt libraries. +# Alternatively, this variable can be used directly (not recommended as per Modern CMake) +# MKL_ENV +# Provides all environment variables based on input parameters. +# Currently useful for mkl_rt linking and BLACS on Windows. +# Must be set as an ENVIRONMENT property. +# Example: +# add_test(NAME mytest COMMAND myexe) +# if(MKL_ENV) +# set_tests_properties(mytest PROPERTIES ENVIRONMENT "${MKL_ENV}") +# endif() +# +# MKL:: +# IMPORTED targets to link MKL libraries individually or when using a custom link-line. +# mkl_core and mkl_rt have INTERFACE_* properties set to them. +# Please refer to Intel(R) oneMKL Link Line Advisor for help with linking. +# +# Below INTERFACE targets provide full link-lines for direct use. +# Example: +# target_link_options( PUBLIC $) +# +# MKL::MKL +# Link line for C and Fortran API +# MKL::MKL_DPCPP +# Link line for DPC++ API +# +# Note: For Device API, library linking is not required. +# Compile options can be added from the INTERFACE_COMPILE_OPTIONS property on MKL::MKL_DPCPP +# Include directories can be added from the INTERFACE_INCLUDE_DIRECTORIES property on MKL::MKL_DPCPP +# +# Note: Output parameters' and targets' availability can change +# based on Input parameters and application project languages. +#=============================================================================== + +function(mkl_message MSG_MODE MSG_TEXT) + if(MSG_MODE STREQUAL "FATAL_ERROR") + message(${MSG_MODE} ${MSG_TEXT}) + else() + if(NOT MKL_FIND_QUIETLY) + message(${MSG_MODE} ${MSG_TEXT}) + endif() + endif() +endfunction() + +if(${CMAKE_VERSION} VERSION_LESS "3.13") + mkl_message(FATAL_ERROR "The minimum supported CMake version is 3.13. You are running version ${CMAKE_VERSION}") +endif() + +include_guard() +include(FindPackageHandleStandardArgs) + +if(NOT MKL_LIBRARIES) + +# Set CMake policies for well-defined behavior across CMake versions +cmake_policy(SET CMP0011 NEW) +cmake_policy(SET CMP0057 NEW) + +# Project Languages +get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES) +list(APPEND MKL_LANGS C CXX Fortran) +foreach(lang ${languages}) + if(${lang} IN_LIST MKL_LANGS) + list(APPEND CURR_LANGS ${lang}) + endif() +endforeach() +list(REMOVE_DUPLICATES CURR_LANGS) + +option(ENABLE_BLAS95 "Enables BLAS Fortran95 API" OFF) +option(ENABLE_LAPACK95 "Enables LAPACK Fortran95 API" OFF) +option(ENABLE_BLACS "Enables cluster BLAS library" OFF) +option(ENABLE_CDFT "Enables cluster DFT library" OFF) +option(ENABLE_CPARDISO "Enables cluster PARDISO functionality" OFF) +option(ENABLE_SCALAPACK "Enables cluster LAPACK library" OFF) +option(ENABLE_OMP_OFFLOAD "Enables OpenMP Offload functionality" OFF) + +# Use MPI if any of these are enabled +if(ENABLE_BLACS OR ENABLE_CDFT OR ENABLE_SCALAPACK OR ENABLE_CPARDISO) + set(USE_MPI ON) +endif() + +# Check Parameters +function(define_param TARGET_PARAM DEFAULT_PARAM SUPPORTED_LIST) + if(NOT DEFINED ${TARGET_PARAM} AND NOT DEFINED ${DEFAULT_PARAM}) + mkl_message(STATUS "${TARGET_PARAM}: Undefined") + elseif(NOT DEFINED ${TARGET_PARAM} AND DEFINED ${DEFAULT_PARAM}) + set(${TARGET_PARAM} "${${DEFAULT_PARAM}}" CACHE STRING "Choose ${TARGET_PARAM} options are: ${${SUPPORTED_LIST}}") + foreach(opt ${${DEFAULT_PARAM}}) + set(STR_LIST "${STR_LIST} ${opt}") + endforeach() + mkl_message(STATUS "${TARGET_PARAM}: None, set to `${STR_LIST}` by default") + elseif(${SUPPORTED_LIST}) + set(ITEM_FOUND 1) + foreach(opt ${${TARGET_PARAM}}) + if(NOT ${opt} IN_LIST ${SUPPORTED_LIST}) + set(ITEM_FOUND 0) + endif() + endforeach() + if(ITEM_FOUND EQUAL 0) + foreach(opt ${${SUPPORTED_LIST}}) + set(STR_LIST "${STR_LIST} ${opt}") + endforeach() + mkl_message(FATAL_ERROR "Invalid ${TARGET_PARAM} `${${TARGET_PARAM}}`, options are: ${STR_LIST}") + else() + mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}") + endif() + else() + mkl_message(STATUS "${TARGET_PARAM}: ${${TARGET_PARAM}}") + endif() +endfunction() + +#================ +# Compiler checks +#================ + +if(CMAKE_C_COMPILER) + get_filename_component(C_COMPILER_NAME ${CMAKE_C_COMPILER} NAME) +endif() +if(CMAKE_CXX_COMPILER) + get_filename_component(CXX_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) +endif() +if(CMAKE_Fortran_COMPILER) + get_filename_component(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER} NAME) +endif() + +# Determine Compiler Family +if(CXX_COMPILER_NAME STREQUAL "dpcpp" OR CXX_COMPILER_NAME STREQUAL "dpcpp.exe" + OR CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe") + set(DPCPP_COMPILER ON) +endif() +if(C_COMPILER_NAME MATCHES "^clang") + set(CLANG_COMPILER ON) +endif() +if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") + set(PGI_COMPILER ON) +elseif(CMAKE_C_COMPILER_ID STREQUAL "Intel" OR CMAKE_Fortran_COMPILER_ID STREQUAL "Intel" + OR CMAKE_C_COMPILER_ID STREQUAL "IntelLLVM" OR CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM") + set(INTEL_COMPILER ON) +else() + if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + set(GNU_C_COMPILER ON) + endif() + if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + set(GNU_Fortran_COMPILER ON) + endif() +endif() + +if(USE_MPI AND (C_COMPILER_NAME MATCHES "^mpi" OR Fortran_COMPILER_NAME MATCHES "^mpi")) + set(USE_MPI_SCRIPT ON) +endif() + +#================ + +#================ +# System-specific +#================ + +# Extensions +if(UNIX) + set(LIB_PREFIX "lib") + set(LIB_EXT ".a") + set(DLL_EXT ".so") + if(APPLE) + set(DLL_EXT ".dylib") + endif() + set(LINK_PREFIX "-l") + set(LINK_SUFFIX "") +else() + set(LIB_PREFIX "") + set(LIB_EXT ".lib") + set(DLL_EXT "_dll.lib") + set(LINK_PREFIX "") + set(LINK_SUFFIX ".lib") +endif() + +# Set target system architecture +set(DEFAULT_MKL_ARCH intel64) +if(DPCPP_COMPILER OR PGI_COMPILER OR ENABLE_OMP_OFFLOAD OR USE_MPI) + set(MKL_ARCH_LIST intel64) +else() + set(MKL_ARCH_LIST ia32 intel64) +endif() +define_param(MKL_ARCH DEFAULT_MKL_ARCH MKL_ARCH_LIST) + +#================ + +#========== +# Setup MKL +#========== + +# Set MKL_ROOT directory +if(NOT DEFINED MKL_ROOT) + if(DEFINED ENV{MKLROOT}) + set(MKL_ROOT $ENV{MKLROOT}) + else() + get_filename_component(MKL_CMAKE_PATH "${CMAKE_CURRENT_LIST_DIR}" REALPATH) + get_filename_component(MKL_ROOT "${MKL_CMAKE_PATH}/../../../" ABSOLUTE) + mkl_message(STATUS "MKL_ROOT ${MKL_ROOT}") + endif() +endif() +string(REPLACE "\\" "/" MKL_ROOT ${MKL_ROOT}) + +# Define MKL_LINK +set(DEFAULT_MKL_LINK dynamic) +if(DPCPP_COMPILER OR USE_MPI) + set(MKL_LINK_LIST static dynamic) +else() + set(MKL_LINK_LIST static dynamic sdl) +endif() +define_param(MKL_LINK DEFAULT_MKL_LINK MKL_LINK_LIST) + +# Define MKL_INTERFACE +if(MKL_ARCH STREQUAL "intel64") + set(IFACE_TYPE intel) + if(GNU_Fortran_COMPILER) + set(IFACE_TYPE gf) + endif() + if(DPCPP_COMPILER) + if(MKL_INTERFACE) + set(MKL_INTERFACE_FULL intel_${MKL_INTERFACE}) + endif() + set(DEFAULT_MKL_INTERFACE intel_ilp64) + set(MKL_INTERFACE_LIST intel_ilp64) + else() + if(MKL_INTERFACE) + set(MKL_INTERFACE_FULL ${IFACE_TYPE}_${MKL_INTERFACE}) + endif() + set(DEFAULT_MKL_INTERFACE ${IFACE_TYPE}_ilp64) + set(MKL_INTERFACE_LIST ${IFACE_TYPE}_ilp64 ${IFACE_TYPE}_lp64) + endif() + define_param(MKL_INTERFACE_FULL DEFAULT_MKL_INTERFACE MKL_INTERFACE_LIST) +else() + if(WIN32) + set(MKL_INTERFACE_FULL intel_c) + elseif(NOT APPLE) + if(GNU_Fortran_COMPILER) + set(MKL_INTERFACE_FULL gf) + else() + set(MKL_INTERFACE_FULL intel) + endif() + else() + mkl_message(FATAL_ERROR "OSX does not support MKL_ARCH ia32.") + endif() +endif() +if(MKL_INTERFACE_FULL MATCHES "ilp64") + set(MKL_INTERFACE "ilp64") +else() + set(MKL_INTERFACE "lp64") +endif() + +# Define MKL headers +find_path(MKL_H mkl.h + HINTS ${MKL_ROOT} + PATH_SUFFIXES include) +list(APPEND MKL_INCLUDE ${MKL_H}) + +# Add pre-built F95 Interface Modules +if(INTEL_COMPILER AND (ENABLE_BLAS95 OR ENABLE_LAPACK95)) + if(MKL_ARCH STREQUAL "intel64") + list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/${MKL_ARCH}/${MKL_INTERFACE}") + else() + list(APPEND MKL_INCLUDE "${MKL_ROOT}/include/${MKL_ARCH}") + endif() +endif() + +# Define MKL_THREADING +# All APIs support sequential threading +set(MKL_THREADING_LIST "sequential" "intel_thread" "tbb_thread") +set(DEFAULT_MKL_THREADING intel_thread) +# DPC++ API supports TBB threading, but not OpenMP threading +if(DPCPP_COMPILER) + set(DEFAULT_MKL_THREADING tbb_thread) + list(REMOVE_ITEM MKL_THREADING_LIST intel_thread) +# C, Fortran API +elseif(PGI_COMPILER) + # PGI compiler supports PGI OpenMP threading, additionally + list(APPEND MKL_THREADING_LIST pgi_thread) + # PGI compiler does not support TBB threading + list(REMOVE_ITEM MKL_THREADING_LIST tbb_thread) + if(WIN32) + # PGI 19.10 and 20.1 on Windows, do not support Intel OpenMP threading + list(REMOVE_ITEM MKL_THREADING_LIST intel_thread) + set(DEFAULT_MKL_THREADING pgi_thread) + endif() +elseif(GNU_C_COMPILER OR GNU_Fortran_COMPILER OR CLANG_COMPILER) + list(APPEND MKL_THREADING_LIST gnu_thread) +else() + # Intel and Microsoft compilers + # Nothing to do, only for completeness +endif() +define_param(MKL_THREADING DEFAULT_MKL_THREADING MKL_THREADING_LIST) + +# Define MKL_MPI +set(DEFAULT_MKL_MPI intelmpi) +if(UNIX) + if(APPLE) + # Override defaults for OSX + set(DEFAULT_MKL_MPI mpich) + set(MKL_MPI_LIST mpich) + else() + set(MKL_MPI_LIST intelmpi openmpi mpich mpich2) + endif() +else() + # Windows + set(MKL_MPI_LIST intelmpi mshpc msmpi) +endif() +define_param(MKL_MPI DEFAULT_MKL_MPI MKL_MPI_LIST) +# MSMPI is now called MSHPC. MSMPI option exists for backward compatibility. +if(MKL_MPI STREQUAL "mshpc") + set(MKL_MPI msmpi) +endif() +find_package_handle_standard_args(MKL REQUIRED_VARS MKL_MPI) + +# Checkpoint - Verify if required options are defined +find_package_handle_standard_args(MKL REQUIRED_VARS MKL_ROOT MKL_ARCH MKL_INCLUDE MKL_LINK MKL_THREADING MKL_INTERFACE_FULL) + +# Provides a list of IMPORTED targets for the project +if(NOT DEFINED MKL_IMPORTED_TARGETS) + set(MKL_IMPORTED_TARGETS "") +endif() + +# Clear temporary variables +set(MKL_C_COPT "") +set(MKL_F_COPT "") +set(MKL_SDL_COPT "") +set(MKL_CXX_COPT "") +set(MKL_DPCPP_COPT "") +set(MKL_DPCPP_LOPT "") +set(MKL_OFFLOAD_COPT "") +set(MKL_OFFLOAD_LOPT "") + +set(MKL_SUPP_LINK "") # Other link options. Usually at the end of the link-line. +set(MKL_LINK_LINE) # For MPI only +set(MKL_ENV_PATH "") # Temporary variable to work with PATH +set(MKL_ENV "") # Exported environment variables + +# Modify PATH variable to make it CMake-friendly +set(OLD_PATH $ENV{PATH}) +string(REPLACE ";" "\;" OLD_PATH "${OLD_PATH}") + +# Compiler options +if(GNU_C_COMPILER OR GNU_Fortran_COMPILER) + if(MKL_ARCH STREQUAL "ia32") + list(APPEND MKL_C_COPT -m32) + list(APPEND MKL_F_COPT -m32) + else() + list(APPEND MKL_C_COPT -m64) + list(APPEND MKL_F_COPT -m64) + endif() +endif() + +# Additonal compiler & linker options +if(CXX_COMPILER_NAME STREQUAL "icpx" OR CXX_COMPILER_NAME STREQUAL "icx.exe") + list(APPEND MKL_DPCPP_COPT "-fsycl") + list(APPEND MKL_DPCPP_LOPT "-fsycl") +endif() +if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD) + if(MKL_LINK STREQUAL "static") + list(APPEND MKL_DPCPP_LOPT "-fsycl-device-code-split=per_kernel") + list(APPEND MKL_OFFLOAD_LOPT "-fsycl-device-code-split=per_kernel") + endif() +endif() + +# For OpenMP Offload +if(ENABLE_OMP_OFFLOAD) + if(WIN32) + if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1") + if("Fortran" IN_LIST CURR_LANGS) + list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -DONEMKL_USE_OPENMP_VERSION=202011) + else() + list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64 -Qopenmp-version:51 -DONEMKL_USE_OPENMP_VERSION=202011) + endif() + else() + list(APPEND MKL_OFFLOAD_COPT -Qiopenmp -Qopenmp-targets:spir64) + endif() + # -MD and -MDd are manually added here because offload functionality uses DPC++ runtime. + if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo") + list(APPEND MKL_OFFLOAD_COPT -MDd) + else() + list(APPEND MKL_OFFLOAD_COPT -MD) + endif() + list(APPEND MKL_OFFLOAD_LOPT -Qiopenmp -Qopenmp-targets:spir64 -fsycl) + set(SKIP_LIBPATH ON) + else() + if(OPENMP_VERSION VERSION_GREATER_EQUAL "5.1") + if("Fortran" IN_LIST CURR_LANGS) + list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -DONEMKL_USE_OPENMP_VERSION=202011) + else() + list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64 -fopenmp-version=51 -DONEMKL_USE_OPENMP_VERSION=202011) + endif() + else () + list(APPEND MKL_OFFLOAD_COPT -fiopenmp -fopenmp-targets=spir64) + endif() + list(APPEND MKL_OFFLOAD_LOPT -fiopenmp -fopenmp-targets=spir64 -fsycl) + if(APPLE) + list(APPEND MKL_SUPP_LINK -lc++) + else() + list(APPEND MKL_SUPP_LINK -lstdc++) + endif() + endif() +endif() + +# For selected Interface +if(MKL_INTERFACE_FULL) + if(MKL_ARCH STREQUAL "ia32") + if(GNU_Fortran_COMPILER) + set(MKL_SDL_IFACE_ENV "GNU") + endif() + else() + if(GNU_Fortran_COMPILER) + set(MKL_SDL_IFACE_ENV "GNU,${MKL_INTERFACE}") + else() + set(MKL_SDL_IFACE_ENV "${MKL_INTERFACE}") + endif() + if(MKL_INTERFACE STREQUAL "ilp64") + if("Fortran" IN_LIST CURR_LANGS) + if(INTEL_COMPILER) + if(WIN32) + list(APPEND MKL_F_COPT "-4I8") + else() + list(APPEND MKL_F_COPT "-i8") + endif() + elseif(GNU_Fortran_COMPILER) + list(APPEND MKL_F_COPT "-fdefault-integer-8") + elseif(PGI_COMPILER) + list(APPEND MKL_F_COPT "-i8") + endif() + endif() + list(INSERT MKL_C_COPT 0 "-DMKL_ILP64") + list(INSERT MKL_SDL_COPT 0 "-DMKL_ILP64") + list(INSERT MKL_CXX_COPT 0 "-DMKL_ILP64") + list(INSERT MKL_OFFLOAD_COPT 0 "-DMKL_ILP64") + else() + # lp64 + endif() + endif() + if(MKL_SDL_IFACE_ENV) + string(TOUPPER ${MKL_SDL_IFACE_ENV} MKL_SDL_IFACE_ENV) + endif() +endif() # MKL_INTERFACE_FULL + +# All MKL Libraries +if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo") + set(MKL_SYCL mkl_sycld) +else() + set(MKL_SYCL mkl_sycl) +endif() +set(MKL_IFACE_LIB mkl_${MKL_INTERFACE_FULL}) +set(MKL_CORE mkl_core) +if(WIN32 AND CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo" AND MKL_THREADING STREQUAL "tbb_thread") + set(MKL_THREAD mkl_tbb_threadd) +else() + set(MKL_THREAD mkl_${MKL_THREADING}) +endif() +set(MKL_SDL mkl_rt) +if(MKL_ARCH STREQUAL "ia32") + set(MKL_BLAS95 mkl_blas95) + set(MKL_LAPACK95 mkl_lapack95) +else() + set(MKL_BLAS95 mkl_blas95_${MKL_INTERFACE}) + set(MKL_LAPACK95 mkl_lapack95_${MKL_INTERFACE}) +endif() +# BLACS +set(MKL_BLACS mkl_blacs_${MKL_MPI}_${MKL_INTERFACE}) +if(UNIX AND NOT APPLE AND MKL_MPI MATCHES "mpich") + # MPICH is compatible with INTELMPI Wrappers on Linux + set(MKL_BLACS mkl_blacs_intelmpi_${MKL_INTERFACE}) +endif() +if(WIN32) + if(MKL_MPI STREQUAL "msmpi") + if("Fortran" IN_LIST CURR_LANGS) + list(APPEND MKL_SUPP_LINK "msmpifec.lib") + endif() + # MSMPI and MSHPC are supported with the same BLACS library + set(MKL_BLACS mkl_blacs_msmpi_${MKL_INTERFACE}) + if(NOT MKL_LINK STREQUAL "static") + set(MKL_BLACS mkl_blacs_${MKL_INTERFACE}) + set(MKL_BLACS_ENV MSMPI) + endif() + elseif(MKL_MPI STREQUAL "intelmpi" AND NOT MKL_LINK STREQUAL "static") + set(MKL_BLACS mkl_blacs_${MKL_INTERFACE}) + set(MKL_BLACS_ENV INTELMPI) + endif() +endif() +# CDFT & SCALAPACK +set(MKL_CDFT mkl_cdft_core) +set(MKL_SCALAPACK mkl_scalapack_${MKL_INTERFACE}) + + +if (UNIX) + if(NOT APPLE) + if(MKL_LINK STREQUAL "static") + set(START_GROUP "-Wl,--start-group") + set(END_GROUP "-Wl,--end-group") + if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD) + set(EXPORT_DYNAMIC "-Wl,-export-dynamic") + endif() + elseif(MKL_LINK STREQUAL "dynamic") + set(MKL_RPATH "-Wl,-rpath=$") + if((GNU_Fortran_COMPILER OR PGI_COMPILER) AND "Fortran" IN_LIST CURR_LANGS) + set(NO_AS_NEEDED -Wl,--no-as-needed) + endif() + else() + set(MKL_RPATH "-Wl,-rpath=$") + endif() + endif() +endif() + +# Create a list of requested libraries, based on input options (MKL_LIBRARIES) +# Create full link-line in MKL_LINK_LINE +list(APPEND MKL_LINK_LINE $,${MKL_OFFLOAD_LOPT},> + $,${MKL_DPCPP_LOPT},> ${EXPORT_DYNAMIC} ${NO_AS_NEEDED} ${MKL_RPATH}) +if(ENABLE_BLAS95) + list(APPEND MKL_LIBRARIES ${MKL_BLAS95}) + list(APPEND MKL_LINK_LINE MKL::${MKL_BLAS95}) +endif() +if(ENABLE_LAPACK95) + list(APPEND MKL_LIBRARIES ${MKL_LAPACK95}) + list(APPEND MKL_LINK_LINE MKL::${MKL_LAPACK95}) +endif() +if(ENABLE_SCALAPACK) + list(APPEND MKL_LIBRARIES ${MKL_SCALAPACK}) + list(APPEND MKL_LINK_LINE MKL::${MKL_SCALAPACK}) +endif() +if(DPCPP_COMPILER OR (ENABLE_OMP_OFFLOAD AND NOT MKL_LINK STREQUAL "sdl")) + list(APPEND MKL_LIBRARIES ${MKL_SYCL}) + list(APPEND MKL_LINK_LINE MKL::${MKL_SYCL}) +endif() +list(APPEND MKL_LINK_LINE ${START_GROUP}) +if(ENABLE_CDFT) + list(APPEND MKL_LIBRARIES ${MKL_CDFT}) + list(APPEND MKL_LINK_LINE MKL::${MKL_CDFT}) +endif() +if(MKL_LINK STREQUAL "sdl") + list(APPEND MKL_LIBRARIES ${MKL_SDL}) + list(APPEND MKL_LINK_LINE MKL::${MKL_SDL}) +else() + list(APPEND MKL_LIBRARIES ${MKL_IFACE_LIB} ${MKL_THREAD} ${MKL_CORE}) + list(APPEND MKL_LINK_LINE MKL::${MKL_IFACE_LIB} MKL::${MKL_THREAD} MKL::${MKL_CORE}) +endif() +if(USE_MPI) + list(APPEND MKL_LIBRARIES ${MKL_BLACS}) + list(APPEND MKL_LINK_LINE MKL::${MKL_BLACS}) +endif() +list(APPEND MKL_LINK_LINE ${END_GROUP}) + +# Find all requested libraries +foreach(lib ${MKL_LIBRARIES}) + unset(${lib}_file CACHE) + if(MKL_LINK STREQUAL "static" AND NOT ${lib} STREQUAL ${MKL_SDL}) + find_library(${lib}_file ${LIB_PREFIX}${lib}${LIB_EXT} + PATHS ${MKL_ROOT} + PATH_SUFFIXES "lib" "lib/${MKL_ARCH}") + add_library(MKL::${lib} STATIC IMPORTED) + else() + find_library(${lib}_file NAMES ${LIB_PREFIX}${lib}${DLL_EXT} ${lib} + PATHS ${MKL_ROOT} + PATH_SUFFIXES "lib" "lib/${MKL_ARCH}") + add_library(MKL::${lib} SHARED IMPORTED) + endif() + find_package_handle_standard_args(MKL REQUIRED_VARS ${lib}_file) + # CMP0111, implemented in CMake 3.20+ requires a shared library target on Windows + # to be defined with IMPLIB and LOCATION property. + # It also requires a static library target to be defined with LOCATION property. + # Setting the policy to OLD usage, using cmake_policy() does not work as of 3.20.0, hence the if-else below. + if(WIN32 AND NOT MKL_LINK STREQUAL "static") + set_target_properties(MKL::${lib} PROPERTIES IMPORTED_IMPLIB "${${lib}_file}") + # Find corresponding DLL + set(MKL_DLL_GLOB ${lib}.*.dll) + file(GLOB MKL_DLL_FILE "${MKL_ROOT}/redist/${MKL_ARCH}/${MKL_DLL_GLOB}" + "${MKL_ROOT}/../redist/${MKL_ARCH}/${MKL_DLL_GLOB}" + "${MKL_ROOT}/../redist/${MKL_ARCH}/mkl/${MKL_DLL_GLOB}" + "${MKL_ROOT}/bin/${MKL_DLL_GLOB}") + if(NOT ${lib} STREQUAL ${MKL_IFACE_LIB} AND NOT ${lib} STREQUAL ${MKL_BLAS95} AND NOT ${lib} STREQUAL ${MKL_LAPACK95}) # Windows IFACE libs are static only + list(LENGTH MKL_DLL_FILE MKL_DLL_FILE_LEN) + if(MKL_DLL_FILE_LEN) + # in case multiple versions of the same dll are found, select the highest version + list(SORT MKL_DLL_FILE) + list(REVERSE MKL_DLL_FILE) + list(GET MKL_DLL_FILE 0 MKL_DLL_FILE) + + mkl_message(STATUS "Found DLL: ${MKL_DLL_FILE}") + set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${MKL_DLL_FILE}") + else() + mkl_message(FATAL_ERROR "${MKL_DLL_GLOB} not found. MKL_ROOT was '${MKL_ROOT}'. MKL_DLL_FILE is '${MKL_DLL_FILE}'") + endif() + endif() + else() + set_target_properties(MKL::${lib} PROPERTIES IMPORTED_LOCATION "${${lib}_file}") + endif() + list(APPEND MKL_IMPORTED_TARGETS MKL::${lib}) +endforeach() + +# Threading selection +if(MKL_THREADING) + if(MKL_THREADING STREQUAL "tbb_thread") + find_package(TBB REQUIRED CONFIG COMPONENTS tbb) + set(MKL_THREAD_LIB $) + set(MKL_SDL_THREAD_ENV "TBB") + get_property(TBB_LIB TARGET TBB::tbb PROPERTY IMPORTED_LOCATION_RELEASE) + get_filename_component(TBB_LIB_DIR ${TBB_LIB} DIRECTORY) + if(UNIX) + if(CMAKE_SKIP_BUILD_RPATH) + set(TBB_LINK "-L${TBB_LIB_DIR} -ltbb") + else() + set(TBB_LINK "-Wl,-rpath,${TBB_LIB_DIR} -L${TBB_LIB_DIR} -ltbb") + endif() + list(APPEND MKL_SUPP_LINK ${TBB_LINK}) + if(APPLE) + list(APPEND MKL_SUPP_LINK -lc++) + else() + list(APPEND MKL_SUPP_LINK -lstdc++) + endif() + endif() + if(WIN32 OR APPLE) + set(MKL_ENV_PATH ${TBB_LIB_DIR}) + endif() + elseif(MKL_THREADING MATCHES "_thread") + if(MKL_THREADING STREQUAL "pgi_thread") + list(APPEND MKL_SUPP_LINK -mp -pgf90libs) + set(MKL_SDL_THREAD_ENV "PGI") + elseif(MKL_THREADING STREQUAL "gnu_thread") + list(APPEND MKL_SUPP_LINK -lgomp) + set(MKL_SDL_THREAD_ENV "GNU") + else() + # intel_thread + if(UNIX) + set(MKL_OMP_LIB iomp5) + set(LIB_EXT ".so") + if(APPLE) + set(LIB_EXT ".dylib") + endif() + else() + set(MKL_OMP_LIB libiomp5md) + endif() + set(MKL_SDL_THREAD_ENV "INTEL") + set(OMP_LIBNAME ${LIB_PREFIX}${MKL_OMP_LIB}${LIB_EXT}) + + find_library(OMP_LIBRARY ${OMP_LIBNAME} + HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} ${CMPLR_ROOT} + PATH_SUFFIXES "lib" "lib/${MKL_ARCH}" + "lib/${MKL_ARCH}_lin" "lib/${MKL_ARCH}_win" + "linux/compiler/lib/${MKL_ARCH}" + "linux/compiler/lib/${MKL_ARCH}_lin" + "windows/compiler/lib/${MKL_ARCH}" + "windows/compiler/lib/${MKL_ARCH}_win" + "../compiler/lib/${MKL_ARCH}_lin" "../compiler/lib/${MKL_ARCH}_win" + "../compiler/lib/${MKL_ARCH}" "../compiler/lib" + "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}" + "../../compiler/latest/linux/compiler/lib/${MKL_ARCH}_lin" + "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}" + "../../compiler/latest/windows/compiler/lib/${MKL_ARCH}_win" + "../../compiler/latest/mac/compiler/lib") + if(WIN32) + set(OMP_DLLNAME ${LIB_PREFIX}${MKL_OMP_LIB}.dll) + find_path(OMP_DLL_DIR ${OMP_DLLNAME} + HINTS $ENV{LIB} $ENV{LIBRARY_PATH} $ENV{MKLROOT} ${MKL_ROOT} ${CMPLR_ROOT} + PATH_SUFFIXES "redist/${MKL_ARCH}" + "redist/${MKL_ARCH}_win" "redist/${MKL_ARCH}_win/compiler" + "../redist/${MKL_ARCH}/compiler" "../compiler/lib" + "../../compiler/latest/windows/redist/${MKL_ARCH}_win" + "../../compiler/latest/windows/redist/${MKL_ARCH}_win/compiler" + "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win" + "../../compiler/latest/windows/compiler/redist/${MKL_ARCH}_win/compiler") + find_package_handle_standard_args(MKL REQUIRED_VARS OMP_DLL_DIR) + set(MKL_ENV_PATH "${OMP_DLL_DIR}") + endif() + + if(WIN32 AND SKIP_LIBPATH) + # Only for Intel OpenMP Offload + set(OMP_LINK "libiomp5md.lib") + else() + set(OMP_LINK "${OMP_LIBRARY}") + if(CMAKE_C_COMPILER_ID STREQUAL "PGI" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") + # Disable PGI OpenMP runtime for correct work of Intel OpenMP runtime + list(APPEND MKL_SUPP_LINK -nomp) + endif() + endif() + find_package_handle_standard_args(MKL REQUIRED_VARS OMP_LIBRARY OMP_LINK) + set(MKL_THREAD_LIB ${OMP_LINK}) + endif() + else() + # Sequential threading + set(MKL_SDL_THREAD_ENV "SEQUENTIAL") + endif() +endif() # MKL_THREADING + +if (UNIX) + list(APPEND MKL_SUPP_LINK -lm -ldl -lpthread) +endif() + +if(DPCPP_COMPILER OR ENABLE_OMP_OFFLOAD) + if(WIN32) + # Detect sycl library version + if(NOT DEFINED SYCL_LIB_VER_CACHE) + set(SYCL_LIB_VER "") + find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${LIB_EXT} + HINTS $ENV{LIB} $ENV{CMPLR_ROOT} + PATH_SUFFIXES "windows/lib") + if(NOT SYCL_LIB_DIR) + foreach(ver RANGE 6 99) + find_library(SYCL_LIB_DIR ${LIB_PREFIX}sycl${ver}${LIB_EXT} + HINTS $ENV{LIB} $ENV{CMPLR_ROOT} + PATH_SUFFIXES "windows/lib") + if(SYCL_LIB_DIR) + set(SYCL_LIB_VER ${ver}) + break() + endif() + endforeach() + endif() + set(SYCL_LIB_VER_CACHE ${SYCL_LIB_VER} CACHE STRING "") + endif() + + if(CMAKE_BUILD_TYPE MATCHES "Debug|DebInfo") + list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}d${LINK_SUFFIX}) + else() + list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${SYCL_LIB_VER_CACHE}${LINK_SUFFIX}) + endif() + else() + list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}sycl${LINK_SUFFIX}) + endif() + list(APPEND MKL_SUPP_LINK ${LINK_PREFIX}OpenCL${LINK_SUFFIX}) +endif() + +# Setup link types based on input options +set(LINK_TYPES "") + +if(DPCPP_COMPILER) + add_library(MKL::MKL_DPCPP INTERFACE IMPORTED GLOBAL) + target_compile_options(MKL::MKL_DPCPP INTERFACE ${MKL_DPCPP_COPT}) + target_link_libraries(MKL::MKL_DPCPP INTERFACE ${MKL_LINK_LINE} ${MKL_THREAD_LIB} ${MKL_SUPP_LINK}) + list(APPEND LINK_TYPES MKL::MKL_DPCPP) +endif() +# Single target for all C, Fortran link-lines +add_library(MKL::MKL INTERFACE IMPORTED GLOBAL) +target_compile_options(MKL::MKL INTERFACE + $<$,C>:${MKL_C_COPT}> + $<$,Fortran>:${MKL_F_COPT}> + $<$,CXX>:${MKL_CXX_COPT}> + $,${MKL_OFFLOAD_COPT},>) +target_link_libraries(MKL::MKL INTERFACE ${MKL_LINK_LINE} ${MKL_THREAD_LIB} ${MKL_SUPP_LINK}) +list(APPEND LINK_TYPES MKL::MKL) + +foreach(link ${LINK_TYPES}) + # Set properties on all INTERFACE targets + target_include_directories(${link} BEFORE INTERFACE "${MKL_INCLUDE}") + list(APPEND MKL_IMPORTED_TARGETS ${link}) +endforeach(link) # LINK_TYPES + +if(MKL_LINK STREQUAL "sdl") + list(APPEND MKL_ENV "MKL_INTERFACE_LAYER=${MKL_SDL_IFACE_ENV}" "MKL_THREADING_LAYER=${MKL_SDL_THREAD_ENV}") +endif() +if(WIN32 AND NOT MKL_LINK STREQUAL "static") + list(APPEND MKL_ENV "MKL_BLACS_MPI=${MKL_BLACS_ENV}") +endif() + +# Add MKL dynamic libraries if RPATH is not defined on Unix +if(UNIX AND CMAKE_SKIP_BUILD_RPATH) + if(MKL_LINK STREQUAL "sdl") + set(MKL_LIB_DIR $) + else() + set(MKL_LIB_DIR $) + endif() + if(APPLE) + list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{DYLD_LIBRARY_PATH}") + else() + list(APPEND MKL_ENV "LD_LIBRARY_PATH=${MKL_LIB_DIR}\;$ENV{LD_LIBRARY_PATH}") + endif() +endif() + +# Add MKL dynamic libraries to PATH on Windows +if(WIN32 AND NOT MKL_LINK STREQUAL "static") + get_filename_component(MKL_DLL_DIR ${MKL_DLL_FILE} DIRECTORY) + set(MKL_ENV_PATH "${MKL_DLL_DIR}\;${MKL_ENV_PATH}") +endif() + +if(MKL_ENV_PATH) + list(APPEND MKL_ENV "PATH=${MKL_ENV_PATH}\;${OLD_PATH}") + if(APPLE) + list(APPEND MKL_ENV "DYLD_LIBRARY_PATH=${MKL_ENV_PATH}\:${OLD_PATH}") + endif() +endif() + +unset(MKL_DLL_FILE) + +endif() # MKL_LIBRARIES From a1a88a5327fbf76f914e188a8503c2bf47940fd4 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sun, 25 Jun 2023 13:53:36 +0200 Subject: [PATCH 12/20] Fix mkl discovery --- .../deformable_convolution_sycl_native_ext/CMakeLists.txt | 6 +++++- .../cmake/MKLConfig.cmake | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index bf233286..b93e00fb 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -10,8 +10,12 @@ pybind11_add_module(${py_module_name} deformable_convolution_sycl/impl.cpp ) -message(${CMAKE_CURRENT_SOURCE_DIR}) find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) + +if (DEFINED ENV{CONDA_PREFIX}) + set(MKL_ROOT $ENV{CONDA_PREFIX}) +endif() + find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) target_compile_options(${py_module_name} PUBLIC $) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake index 07290e8e..e83661c4 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/cmake/MKLConfig.cmake @@ -635,7 +635,8 @@ foreach(lib ${MKL_LIBRARIES}) file(GLOB MKL_DLL_FILE "${MKL_ROOT}/redist/${MKL_ARCH}/${MKL_DLL_GLOB}" "${MKL_ROOT}/../redist/${MKL_ARCH}/${MKL_DLL_GLOB}" "${MKL_ROOT}/../redist/${MKL_ARCH}/mkl/${MKL_DLL_GLOB}" - "${MKL_ROOT}/bin/${MKL_DLL_GLOB}") + "${MKL_ROOT}/bin/${MKL_DLL_GLOB}" + "${MKL_ROOT}/lib") if(NOT ${lib} STREQUAL ${MKL_IFACE_LIB} AND NOT ${lib} STREQUAL ${MKL_BLAS95} AND NOT ${lib} STREQUAL ${MKL_LAPACK95}) # Windows IFACE libs are static only list(LENGTH MKL_DLL_FILE MKL_DLL_FILE_LEN) if(MKL_DLL_FILE_LEN) From bce5efaad54a0803106e3c6a270375242efac852 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sun, 25 Jun 2023 14:08:31 +0200 Subject: [PATCH 13/20] Add tbb and mkl-devel to win conda env --- environments/conda-win-sycl.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environments/conda-win-sycl.yml b/environments/conda-win-sycl.yml index e4ac46e5..0c5d3032 100644 --- a/environments/conda-win-sycl.yml +++ b/environments/conda-win-sycl.yml @@ -30,3 +30,5 @@ dependencies: # https://github.com/scikit-build/scikit-build/issues/981 - setuptools>=42,<64 - pybind11 + - tbb-devel >= 2021.6.0 + - mkl-devel-dpcpp>=2023.0.0 From 2642393a2be3ce7a6c172fe76e2aba8bb4b72fdf Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sun, 25 Jun 2023 14:45:50 +0200 Subject: [PATCH 14/20] Test cmake changes --- .../deformable_convolution_sycl_native_ext/CMakeLists.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index b93e00fb..b3ffd9fe 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -10,12 +10,8 @@ pybind11_add_module(${py_module_name} deformable_convolution_sycl/impl.cpp ) -find_package(TBB REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) - -if (DEFINED ENV{CONDA_PREFIX}) - set(MKL_ROOT $ENV{CONDA_PREFIX}) -endif() - +find_package(TBB CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) +find_package(IntelSYCL CONFIG REQUIRED) find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) target_compile_options(${py_module_name} PUBLIC $) From a85dcf4ef7e9c6def9eb8e7a91c82553e850e33c Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Sun, 25 Jun 2023 18:16:32 +0200 Subject: [PATCH 15/20] Revert some changes --- .../deformable_convolution_sycl_native_ext/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt index b3ffd9fe..e5911eb6 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/CMakeLists.txt @@ -12,6 +12,10 @@ pybind11_add_module(${py_module_name} find_package(TBB CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) find_package(IntelSYCL CONFIG REQUIRED) + +if (DEFINED ENV{CONDA_PREFIX}) + set(MKL_ROOT $ENV{CONDA_PREFIX}) +endif() find_package(MKL CONFIG REQUIRED PATHS ${CMAKE_CURRENT_SOURCE_DIR}/cmake NO_DEFAULT_PATH) target_compile_options(${py_module_name} PUBLIC $) From 4e2b42b329854260dd0ff19e3461c5c73f2c757d Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Mon, 26 Jun 2023 01:53:46 +0200 Subject: [PATCH 16/20] Fix hang --- dpbench/infrastructure/benchmark_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dpbench/infrastructure/benchmark_runner.py b/dpbench/infrastructure/benchmark_runner.py index 1fe53357..a00a4221 100644 --- a/dpbench/infrastructure/benchmark_runner.py +++ b/dpbench/infrastructure/benchmark_runner.py @@ -314,8 +314,6 @@ def run_benchmark_in_sub_process( return results - _, conn = self.get_process(rc.framework) - brc = BaseRunConfig.from_instance(rc) brc.ref_framework: cfg.Framework = [ @@ -325,6 +323,7 @@ def run_benchmark_in_sub_process( in {p.postfix for p in f.postfixes} ][0] + _, conn = self.get_process(rc.framework) conn.send(brc) if conn.poll(rc.timeout if rc.timeout else self._default_timeout): From 909d1032d8e6468c46d6894994cce0a967c15e73 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Mon, 26 Jun 2023 03:17:06 +0200 Subject: [PATCH 17/20] Fix hang --- dpbench/infrastructure/benchmark_runner.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dpbench/infrastructure/benchmark_runner.py b/dpbench/infrastructure/benchmark_runner.py index a00a4221..24715a0b 100644 --- a/dpbench/infrastructure/benchmark_runner.py +++ b/dpbench/infrastructure/benchmark_runner.py @@ -316,6 +316,7 @@ def run_benchmark_in_sub_process( brc = BaseRunConfig.from_instance(rc) + _, conn = self.get_process(rc.framework) brc.ref_framework: cfg.Framework = [ f for f in cfg.GLOBAL.frameworks @@ -323,7 +324,6 @@ def run_benchmark_in_sub_process( in {p.postfix for p in f.postfixes} ][0] - _, conn = self.get_process(rc.framework) conn.send(brc) if conn.poll(rc.timeout if rc.timeout else self._default_timeout): @@ -357,7 +357,14 @@ def run_benchmark_and_save( Args: rc: runtime configuration. """ - results = self.run_benchmark_in_sub_process(rc) + try: + results = self.run_benchmark_in_sub_process(rc) + except Exception as e: + # self.kill_process(rc.framework) + + results = BenchmarkResults(0, rc.implementation, rc.preset) + results.error_state = ErrorCodes.FAILED_EXECUTION + results.error_msg = f"Unexpected failure. {str(e)}" if rc.conn: framework = build_framework(rc.framework) @@ -377,6 +384,7 @@ def run_benchmark_and_save( ) + def _set_input_args( bench: Benchmark, framework: Framework, np_input_data: dict ): From e7face29011a23f18776e20bb67209a274fce0b9 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Mon, 26 Jun 2023 03:18:53 +0200 Subject: [PATCH 18/20] pre-commit --- dpbench/infrastructure/benchmark_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dpbench/infrastructure/benchmark_runner.py b/dpbench/infrastructure/benchmark_runner.py index 24715a0b..9c95a43e 100644 --- a/dpbench/infrastructure/benchmark_runner.py +++ b/dpbench/infrastructure/benchmark_runner.py @@ -384,7 +384,6 @@ def run_benchmark_and_save( ) - def _set_input_args( bench: Benchmark, framework: Framework, np_input_data: dict ): From c5bddb32572f7d6a63e6937b8e69d414ef37e20f Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 28 Jun 2023 00:51:25 +0200 Subject: [PATCH 19/20] Some cleanup --- .../deformable_convolution_initialize.py | 66 +++++++------ .../deformable_convolution_numba_mlir_p.py | 45 +++++++-- .../deformable_convolution_sycl/impl.cpp | 97 +++++++++++++------ .../bench_info/deformable_convolution.toml | 96 ++++++------------ 4 files changed, 169 insertions(+), 135 deletions(-) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py index 205f57be..9903e9d8 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_initialize.py @@ -5,45 +5,49 @@ def initialize( batch, - in_channels, - in_height, - in_width, - out_channels, - out_height, - out_width, - kernel_height, - kernel_width, - stride_y, - stride_x, - dilation_y, - dilation_x, - pad_y, - pad_x, + in_chw, + out_chw, + kernel_hw, + stride_hw, + dilation_hw, + pad_hw, groups, deformable_groups, - dtype, seed, + types_dict, ): import numpy as np import numpy.random as default_rng + dtype: np.dtype = types_dict["float"] + default_rng.seed(seed) + input_size = [batch] + in_chw # nchw + output_size = [batch] + out_chw # nchw + offset_size = kernel_hw + [2, out_chw[1], out_chw[2]] # kh, kw, 2, oh, ow + weights_size = [out_chw[0], in_chw[0]] + kernel_hw # oc, ic, kh, kw + bias_size = out_chw[0] # oc + tmp_size = [ + in_chw[0], + kernel_hw[0], + kernel_hw[1], + out_chw[1], + out_chw[2], + ] # ic, kh, kw, oh, ow + + input = default_rng.random(input_size).astype(dtype) + output = np.empty(output_size, dtype=dtype) + offset = 2 * default_rng.random(offset_size).astype("float32") - 1 + weights = default_rng.random(weights_size).astype(dtype) + bias = default_rng.random(bias_size).astype(dtype) + tmp = np.empty(tmp_size, dtype=dtype) + return ( - default_rng.random((batch, in_channels, in_height, in_width)).astype( - dtype - ), - np.zeros((batch, out_channels, out_height, out_width)).astype(dtype), - 2 - * default_rng.random( - (kernel_height, kernel_width, 2, out_height, out_width) - ).astype(dtype) - - 1, - np.ones( - (out_channels, in_channels, kernel_height, kernel_width) - ).astype(dtype), - default_rng.random(out_channels).astype(dtype), - np.zeros( - (in_channels, kernel_height, kernel_width, out_height, out_width) - ).astype(dtype), + input, + output, + offset, + weights, + bias, + tmp, ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py index b99ff53b..77cbc5a9 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_numba_mlir_p.py @@ -108,19 +108,16 @@ def deformable_convolution_b1( @njit(parallel=True, gpu_fp64_truncate="auto") -def deformable_convolution( +def jdeformable_convolution( input, output, offset, weights, bias, tmp, - stride_y, - stride_x, - pad_y, - pad_x, - dilation_y, - dilation_x, + stride, + pad, + dilation, groups, deformable_groups, ): @@ -133,9 +130,37 @@ def deformable_convolution( weights, bias, tmp, - (stride_y, stride_x), - (pad_y, pad_x), - (dilation_y, dilation_x), + stride, + pad, + dilation, groups, deformable_groups, ) + + +def deformable_convolution( + input, + output, + offset, + weights, + bias, + tmp, + stride_hw, + pad_hw, + dilation_hw, + groups, + deformable_groups, +): + jdeformable_convolution( + input, + output, + offset, + weights, + bias, + tmp, + tuple(stride_hw), + tuple(pad_hw), + tuple(dilation_hw), + groups, + deformable_groups, + ) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp index 12ac11de..9cff7a98 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -14,6 +14,8 @@ #include "oneapi/mkl.hpp" #include "utils.hpp" +#include + using namespace sycl; namespace py = pybind11; @@ -65,7 +67,7 @@ __attribute__((always_inline)) DataType bilinear(const DataType *input, return result / 2; } -class deform; +template class deform; template inline auto deform_input(cl::sycl::queue &queue, @@ -73,7 +75,7 @@ inline auto deform_input(cl::sycl::queue &queue, const Shape3D in_shape, DataType *output, const Shape5D out_shape, - const DataType *offset, + const float *offset, int stride_y, int stride_x, int pad_y, @@ -94,7 +96,7 @@ inline auto deform_input(cl::sycl::queue &queue, auto wsize = sycl::range<3>(in_channels * k_height * k_width, out_height, out_width); - return queue.parallel_for(wsize, [=](sycl::id<3> idx) { + return queue.parallel_for>(wsize, [=](sycl::id<3> idx) { auto ckhkw = static_cast(idx[0]); auto h = static_cast(idx[1]); auto w = static_cast(idx[2]); @@ -127,19 +129,19 @@ inline auto deform_input(cl::sycl::queue &queue, }); } -class fill_output; +template class fill_output; template auto output_fill_with_bias(cl::sycl::queue &queue, DataType *output, const Shape3D out_shape, - DataType *bias) + const DataType *bias) { auto out_c = out_shape[CHW::C]; auto out_h = out_shape[CHW::H]; auto out_w = out_shape[CHW::W]; - return queue.parallel_for( + return queue.parallel_for>( sycl::range<3>(out_c, out_h, out_w), [=](sycl::id<3> idx) { auto c = static_cast(idx[0]); auto h = static_cast(idx[1]); @@ -158,9 +160,9 @@ void deformable_convolution_b1_impl(cl::sycl::queue &queue, const Shape3D out_shape, DataType *tmp, const float *offset, - DataType *weights, + const DataType *weights, const Shape4D weights_shape, - DataType *bias, + const DataType *bias, int stride_y, int stride_x, int pad_y, @@ -210,9 +212,9 @@ void deformable_convolution_impl(cl::sycl::queue &queue, const Shape4D out_shape, DataType *tmp, const float *offset, - DataType *weights, + const DataType *weights, const Shape4D weights_shape, - DataType *bias, + const DataType *bias, int stride_y, int stride_x, int pad_y, @@ -247,25 +249,45 @@ void deformable_convolution_impl(cl::sycl::queue &queue, } } +template bool ensure_compatibility(const Args &...args) +{ + std::vector arrays = {args...}; + + auto arr = arrays.at(0); + + for (auto &arr : arrays) { + if (!(arr.get_flags() & (USM_ARRAY_C_CONTIGUOUS))) { + std::cerr << "All arrays need to be C contiguous.\n"; + return false; + } + } + return true; +} + void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm_ndarray output, dpctl::tensor::usm_ndarray offset, dpctl::tensor::usm_ndarray weights, dpctl::tensor::usm_ndarray bias, dpctl::tensor::usm_ndarray tmp, - int stride_y, - int stride_x, - int pad_y, - int pad_x, - int dilation_y, - int dilation_x, + std::vector stride_hw, + std::vector pad_hw, + std::vector dilation_hw, int groups, int deformable_groups) { auto queue = input.get_queue(); - if (input.get_typenum() != UAR_FLOAT) { - throw std::runtime_error("Expected a single precision FP array."); + if (!ensure_compatibility(input, output, offset, weights, bias, tmp)) + throw std::runtime_error("Input arrays are not acceptable."); + + if (input.get_typenum() != output.get_typenum() or + input.get_typenum() != offset.get_typenum() or + input.get_typenum() != weights.get_typenum() or + input.get_typenum() != bias.get_typenum() or + input.get_typenum() != tmp.get_typenum()) + { + throw std::runtime_error("All arrays must have the same precision"); } int batch = input.get_shape(0); @@ -281,17 +303,39 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input, int kernel_height = weights.get_shape(2); int kernel_width = weights.get_shape(3); + auto stride_y = stride_hw[0]; + auto stride_x = stride_hw[1]; + + auto pad_y = pad_hw[0]; + auto pad_x = pad_hw[1]; + + auto dilation_y = pad_hw[0]; + auto dilation_x = pad_hw[1]; + auto input_shape = Shape4D({batch, in_channels, in_height, in_width}); auto output_shape = Shape4D({batch, out_channels, out_height, out_width}); auto weights_shape = Shape4D({out_channels, in_channels, kernel_height, kernel_width}); - deformable_convolution_impl( - queue, input.get_data(), input_shape, output.get_data(), - output_shape, tmp.get_data(), offset.get_data(), - weights.get_data(), weights_shape, bias.get_data(), - stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, - deformable_groups); +#define dispatch_dc(typ) \ + deformable_convolution_impl( \ + queue, input.get_data(), input_shape, output.get_data(), \ + output_shape, tmp.get_data(), offset.get_data(), \ + weights.get_data(), weights_shape, bias.get_data(), \ + stride_y, stride_x, pad_y, pad_x, dilation_y, dilation_x, groups, \ + deformable_groups) + + if (input.get_typenum() == UAR_FLOAT) { + dispatch_dc(float); + } + else if (input.get_typenum() == UAR_DOUBLE) { + dispatch_dc(double); + } + else { + throw std::runtime_error("Unsupported type"); + } + +#undef dispatch_dc } PYBIND11_MODULE(_deformable_convolution_sycl, m) @@ -301,8 +345,7 @@ PYBIND11_MODULE(_deformable_convolution_sycl, m) m.def("deformable_convolution", &deformable_convolution, "Defromable convolution", py::arg("input"), py::arg("output"), py::arg("offset"), py::arg("weights"), py::arg("bias"), - py::arg("tmp"), py::arg("stride_y"), py::arg("stride_x"), - py::arg("pad_y"), py::arg("pad_x"), py::arg("dilation_y"), - py::arg("dilation_x"), py::arg("groups"), + py::arg("tmp"), py::arg("stride_hw"), py::arg("pad_hw"), + py::arg("dilation_hw"), py::arg("groups"), py::arg("deformable_groups")); } diff --git a/dpbench/configs/bench_info/deformable_convolution.toml b/dpbench/configs/bench_info/deformable_convolution.toml index 4c22313c..93c2e50a 100644 --- a/dpbench/configs/bench_info/deformable_convolution.toml +++ b/dpbench/configs/bench_info/deformable_convolution.toml @@ -18,12 +18,9 @@ input_args = [ "weights", "bias", "tmp", - "stride_y", - "stride_x", - "dilation_y", - "dilation_x", - "pad_y", - "pad_x", + "stride_hw", + "dilation_hw", + "pad_hw", "groups", "deformable_groups" ] @@ -41,65 +38,38 @@ output_args = [ [benchmark.parameters.S] batch = 1 -in_channels = 16 -in_height = 32 -in_width = 32 -out_channels = 16 -out_height = 32 -out_width = 32 -kernel_height = 3 -kernel_width = 3 -stride_y = 1 -stride_x = 1 -dilation_y = 1 -dilation_x = 1 -pad_y = 1 -pad_x = 1 +in_chw = [16, 32, 32] +out_chw = [16, 32, 32] +kernel_hw = [3, 3] +stride_hw = [1, 1] +dilation_hw = [1, 1] +pad_hw = [1, 1] groups = 1 deformable_groups = 1 -dtype = "float32" seed = 7777777 [benchmark.parameters.M] batch = 1 -in_channels = 64 -in_height = 64 -in_width = 64 -out_channels = 64 -out_height = 64 -out_width = 64 -kernel_height = 3 -kernel_width = 3 -stride_y = 1 -stride_x = 1 -dilation_y = 1 -dilation_x = 1 -pad_y = 1 -pad_x = 1 +in_chw = [64, 64, 64] +out_chw = [64, 64, 64] +kernel_hw = [3, 3] +stride_hw = [1, 1] +dilation_hw = [1, 1] +pad_hw = [1, 1] groups = 1 deformable_groups = 1 -dtype = "float32" seed = 7777777 [benchmark.parameters.L] batch = 2 -in_channels = 64 -in_height = 128 -in_width = 128 -out_channels = 128 -out_height = 128 -out_width = 128 -kernel_height = 3 -kernel_width = 3 -stride_y = 1 -stride_x = 1 -dilation_y = 1 -dilation_x = 1 -pad_y = 1 -pad_x = 1 +in_chw = [64, 128, 128] +out_chw = [128, 128, 128] +kernel_hw = [3, 3] +stride_hw = [1, 1] +dilation_hw = [1, 1] +pad_hw = [1, 1] groups = 1 deformable_groups = 1 -dtype = "float32" seed = 7777777 [benchmark.init] @@ -108,24 +78,16 @@ types_dict_name="types_dict" precision="single" input_args = [ "batch", - "in_channels", - "in_height", - "in_width", - "out_channels", - "out_height", - "out_width", - "kernel_height", - "kernel_width", - "stride_y", - "stride_x", - "dilation_y", - "dilation_x", - "pad_y", - "pad_x", + "in_chw", + "out_chw", + "kernel_hw", + "stride_hw", + "dilation_hw", + "pad_hw", "groups", "deformable_groups", - "dtype", - "seed" + "seed", + "types_dict", ] output_args = [ "input", From 0c4f1f27a686b79a7b0994da2599e8c655588177 Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Wed, 28 Jun 2023 01:39:56 +0200 Subject: [PATCH 20/20] Remove auto convert list to vector --- .../deformable_convolution_sycl/impl.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp index 9cff7a98..3bc49f39 100644 --- a/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp +++ b/dpbench/benchmarks/deformable_convolution/deformable_convolution_sycl_native_ext/deformable_convolution_sycl/impl.cpp @@ -14,8 +14,6 @@ #include "oneapi/mkl.hpp" #include "utils.hpp" -#include - using namespace sycl; namespace py = pybind11; @@ -270,9 +268,9 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input, dpctl::tensor::usm_ndarray weights, dpctl::tensor::usm_ndarray bias, dpctl::tensor::usm_ndarray tmp, - std::vector stride_hw, - std::vector pad_hw, - std::vector dilation_hw, + py::list stride_hw, + py::list pad_hw, + py::list dilation_hw, int groups, int deformable_groups) { @@ -303,14 +301,14 @@ void deformable_convolution(dpctl::tensor::usm_ndarray input, int kernel_height = weights.get_shape(2); int kernel_width = weights.get_shape(3); - auto stride_y = stride_hw[0]; - auto stride_x = stride_hw[1]; + auto stride_y = stride_hw[0].cast(); + auto stride_x = stride_hw[1].cast(); - auto pad_y = pad_hw[0]; - auto pad_x = pad_hw[1]; + auto pad_y = pad_hw[0].cast(); + auto pad_x = pad_hw[1].cast(); - auto dilation_y = pad_hw[0]; - auto dilation_x = pad_hw[1]; + auto dilation_y = pad_hw[0].cast(); + auto dilation_x = pad_hw[1].cast(); auto input_shape = Shape4D({batch, in_channels, in_height, in_width}); auto output_shape = Shape4D({batch, out_channels, out_height, out_width});