import dpctl
import numba_dpex as dppy
import numpy as np
import dpnp
import dpctl.tensor as dpt
@dppy.kernel
def data_parallel_sum(a, b, c):
    """Add ``a`` and ``b`` element-wise and store the result in ``c``.

    Each work item processes the single element selected by its global id
    along dimension 0.
    """
    gid = dppy.get_global_id(0)
    total = a[gid] + b[gid]
    c[gid] = total
def driver(a, b, c, global_size):
    """Launch ``data_parallel_sum`` over ``global_size`` work items.

    The shape and strides of each operand are printed around the launch so
    the layout of every array is visible in the script's output.
    """
    # Report the layout of both inputs before running the kernel.
    for label, operand in (("A : ", a), ("B : ", b)):
        print(label, (operand.shape, operand.strides))

    launch_range = dppy.Range(global_size)
    data_parallel_sum[launch_range](a, b, c)

    layout_of_c = (c.shape, c.strides)
    print("C ", layout_of_c)
    # Result check, left disabled as in the original reproducer:
    # assert np.allclose(dpt.asnumpy(c), dpt.asnumpy(a) + dpt.asnumpy(b))
# Problem size: the kernel is launched with one work item per input element.
N = global_size = 10
print("N", N)

# Two C-contiguous input vectors of random values.
a = dpnp.random.random(N)
b = dpnp.random.random(N)

# ``out`` is a C-contiguous buffer twice as long as the inputs; ``c`` is a
# strided slice of it that selects every other element.
out = dpnp.zeros((2 * N,), dtype=a.dtype)
c = out[::2]

driver(a, b, c, global_size)

# Show both the strided result and the full backing buffer.
print(c)
print(out)
Executing this script produces the following output:
N 10
Using device ...
Name Intel(R) UHD Graphics [0x9bca]
Driver version 1.1.20678
Vendor Intel(R) Corporation
Profile FULL_PROFILE
Filter string level_zero:gpu:0
A : ((10,), (1,))
B : ((10,), (1,))
C ((10,), (2,))
Done...
[0.83166494 1.27176889 0.61416447 0.57984313 1.04820722 0.
0. 0. 0. 0. ]
[0.83166494 0.50883122 1.27176889 1.08111028 0.61416447 0.77052362
0.57984313 1.55592601 1.04820722 1.12209132 0. 0.
0. 0. 0. 0. 0. 0.
0. 0. ]
The expected result is:
[0.83166494 0.50883122 1.27176889 1.08111028 0.61416447 0.77052362
0.57984313 1.55592601 1.04820722 1.12209132]
[0.83166494 0. 0.50883122 0. 1.27176889 0. 1.08111028 0.
0.61416447 0. 0.77052362 0. 0.57984313 0. 1.55592601 0.
1.04820722 0. 1.12209132 0. ]
UPDATE: The reproducer has been updated to work with the latest numba-dpex.
Executing this script outputs
The expected result should have been
UPDATE : Updated the reproducer to work with latest numba-dpex.