From 5f4982768322831e3628217fffa7ccf7b7413720 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 8 Aug 2022 12:07:59 -0500
Subject: [PATCH 1/4] linspace_affine should not use double precision type in
 kernels if HW does not support it

---
 dpctl/tensor/libtensor/source/tensor_py.cpp | 28 ++++++++++++++-------
 1 file changed, 19 insertions(+), 9 deletions(-)
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 4ef68a30a0..b3df82f54b 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -43,7 +43,7 @@ template <typename srcT, typename dstT> class copy_cast_from_host_kernel;
 template <typename srcT, typename dstT, int nd> class copy_cast_spec_kernel;
 template <typename Ty> class copy_for_reshape_generic_kernel;
 template <typename Ty> class linear_sequence_step_kernel;
-template <typename Ty> class linear_sequence_affine_kernel;
+template <typename Ty, typename wTy> class linear_sequence_affine_kernel;
 
 static dpctl::tensor::detail::usm_ndarray_types array_types;
 
@@ -1526,7 +1526,7 @@ typedef sycl::event (*lin_space_affine_fn_ptr_t)(
 static lin_space_affine_fn_ptr_t
     lin_space_affine_dispatch_vector[_ns::num_types];
 
-template <typename Ty> class LinearSequenceAffineFunctor
+template <typename Ty, typename wTy> class LinearSequenceAffineFunctor
 {
 private:
     Ty *p = nullptr;
@@ -1544,8 +1544,8 @@ template <typename Ty> class LinearSequenceAffineFunctor
     void operator()(sycl::id<1> wiid) const
     {
         auto i = wiid.get(0);
-        double wc = double(i) / n;
-        double w = double(n - i) / n;
+        wTy wc = wTy(i) / n;
+        wTy w = wTy(n - i) / n;
         if constexpr (is_complex<Ty>::value) {
             auto _w = static_cast<typename Ty::value_type>(w);
             auto _wc = static_cast<typename Ty::value_type>(wc);
@@ -1578,13 +1578,23 @@ sycl::event lin_space_affine_impl(sycl::queue exec_q,
         throw;
     }
 
+    bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64);
     sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
-        cgh.parallel_for<linear_sequence_affine_kernel<Ty>>(
-            sycl::range<1>{nelems},
-            LinearSequenceAffineFunctor<Ty>(array_data, start_v, end_v,
-                                            (include_endpoint) ? nelems - 1
-                                                               : nelems));
+        if (device_supports_doubles) {
+            cgh.parallel_for<linear_sequence_affine_kernel<Ty, double>>(
+                sycl::range<1>{nelems},
+                LinearSequenceAffineFunctor<Ty, double>(
+                    array_data, start_v, end_v,
+                    (include_endpoint) ? nelems - 1 : nelems));
+        }
+        else {
+            cgh.parallel_for<linear_sequence_affine_kernel<Ty, float>>(
+                sycl::range<1>{nelems},
+                LinearSequenceAffineFunctor<Ty, float>(
+                    array_data, start_v, end_v,
+                    (include_endpoint) ? nelems - 1 : nelems));
+        }
     });
 
     return lin_space_affine_event;

From 290230aa478d7da2ea4b87f08faf756b8e4fa34f Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 8 Aug 2022 12:08:26 -0500
Subject: [PATCH 2/4] test_linspace should not try double precision if HW does
 not support it

---
 dpctl/tests/test_usm_ndarray_ctor.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
index 9b4759ae73..5d52694ca4 100644
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ b/dpctl/tests/test_usm_ndarray_ctor.py
@@ -1080,6 +1080,8 @@ def test_linspace(dt):
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Default queue could not be created")
+    if dt in ["f8", "c16"] and not q.sycl_device.has_aspect_fp64:
+        pytest.skip("Device does not support double precision")
     X = dpt.linspace(0, 1, num=2, dtype=dt, sycl_queue=q)
     assert np.allclose(dpt.asnumpy(X), np.linspace(0, 1, num=2, dtype=dt))
 

From f6db04e05d64343d0ce23cf37879f715957fba8f Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 8 Aug 2022 12:30:31 -0500
Subject: [PATCH 3/4] Fix for copy of double precision NumPy array to device
 w/o HW support for DP

Routine to copy NumPy array to USM array casts double to single
precision on host if the SYCL device does not support double precision
---
 dpctl/tensor/_copy_utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
index ec2c63d604..dd6f068596 100644
--- a/dpctl/tensor/_copy_utils.py
+++ b/dpctl/tensor/_copy_utils.py
@@ -81,8 +81,15 @@ def _copy_from_numpy_into(dst, np_ary):
     if not isinstance(dst, dpt.usm_ndarray):
         raise TypeError("Expected usm_ndarray, got {}".format(type(dst)))
     src_ary = np.broadcast_to(np_ary, dst.shape)
+    copy_q = dst.sycl_queue
+    if copy_q.sycl_device.has_aspect_fp64 is False:
+        src_ary_dt_c = src_ary.dtype.char
+        if src_ary_dt_c == "d":
+            src_ary = src_ary.astype(np.float32)
+        elif src_ary_dt_c == "D":
+            src_ary = src_ary.astype(np.complex64)
     ti._copy_numpy_ndarray_into_usm_ndarray(
-        src=src_ary, dst=dst, sycl_queue=dst.sycl_queue
+        src=src_ary, dst=dst, sycl_queue=copy_q
     )
 
 

From e363969df9b9005267363caa6ab6e65782410585 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 8 Aug 2022 12:48:07 -0500
Subject: [PATCH 4/4] Tests should not use double precision arrays where it is
 not essential for testing

---
 dpctl/tests/test_usm_ndarray_manipulation.py | 42 ++++++++++----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py
index 0dd4ccc9d7..038ac007c8 100644
--- a/dpctl/tests/test_usm_ndarray_manipulation.py
+++ b/dpctl/tests/test_usm_ndarray_manipulation.py
@@ -152,7 +152,7 @@ def test_expand_dims_tuple(axes):
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
 
-    Xnp = np.empty((3, 3, 3))
+    Xnp = np.empty((3, 3, 3), dtype="u1")
     X = dpt.asarray(Xnp, sycl_queue=q)
     Y = dpt.expand_dims(X, axes)
     Ynp = np.expand_dims(Xnp, axes)
@@ -234,7 +234,7 @@ def test_squeeze_without_axes(shapes):
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
 
-    Xnp = np.empty(shapes)
+    Xnp = np.empty(shapes, dtype="u1")
     X = dpt.asarray(Xnp, sycl_queue=q)
     Y = dpt.squeeze(X)
     Ynp = Xnp.squeeze()
@@ -248,7 +248,7 @@ def test_squeeze_axes_arg(axes):
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
 
-    Xnp = np.array([[[1], [2], [3]]])
+    Xnp = np.array([[[1], [2], [3]]], dtype="u1")
     X = dpt.asarray(Xnp, sycl_queue=q)
     Y = dpt.squeeze(X, axes)
     Ynp = Xnp.squeeze(axes)
@@ -262,7 +262,7 @@ def test_squeeze_axes_arg_error(axes):
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
 
-    Xnp = np.array([[[1], [2], [3]]])
+    Xnp = np.array([[[1], [2], [3]]], dtype="u1")
     X = dpt.asarray(Xnp, sycl_queue=q)
     pytest.raises(ValueError, dpt.squeeze, X, axes)
 
@@ -270,21 +270,21 @@ def test_squeeze_axes_arg_error(axes):
 @pytest.mark.parametrize(
     "data",
     [
-        [np.array(0), (0,)],
-        [np.array(0), (1,)],
-        [np.array(0), (3,)],
-        [np.ones(1), (1,)],
-        [np.ones(1), (2,)],
-        [np.ones(1), (1, 2, 3)],
-        [np.arange(3), (3,)],
-        [np.arange(3), (1, 3)],
-        [np.arange(3), (2, 3)],
-        [np.ones(0), 0],
-        [np.ones(1), 1],
-        [np.ones(1), 2],
-        [np.ones(1), (0,)],
-        [np.ones((1, 2)), (0, 2)],
-        [np.ones((2, 1)), (2, 0)],
+        [np.array(0, dtype="u1"), (0,)],
+        [np.array(0, dtype="u1"), (1,)],
+        [np.array(0, dtype="u1"), (3,)],
+        [np.ones(1, dtype="u1"), (1,)],
+        [np.ones(1, dtype="u1"), (2,)],
+        [np.ones(1, dtype="u1"), (1, 2, 3)],
+        [np.arange(3, dtype="u1"), (3,)],
+        [np.arange(3, dtype="u1"), (1, 3)],
+        [np.arange(3, dtype="u1"), (2, 3)],
+        [np.ones(0, dtype="u1"), 0],
+        [np.ones(1, dtype="u1"), 1],
+        [np.ones(1, dtype="u1"), 2],
+        [np.ones(1, dtype="u1"), (0,)],
+        [np.ones((1, 2), dtype="u1"), (0, 2)],
+        [np.ones((2, 1), dtype="u1"), (2, 0)],
     ],
 )
 def test_broadcast_to_succeeds(data):
@@ -323,7 +323,7 @@ def test_broadcast_to_raises(data):
         pytest.skip("Queue could not be created")
 
     orig_shape, target_shape = data
-    Xnp = np.zeros(orig_shape)
+    Xnp = np.zeros(orig_shape, dtype="i1")
     X = dpt.asarray(Xnp, sycl_queue=q)
     pytest.raises(ValueError, dpt.broadcast_to, X, target_shape)
 
@@ -333,7 +333,7 @@ def assert_broadcast_correct(input_shapes):
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
-    np_arrays = [np.zeros(s) for s in input_shapes]
+    np_arrays = [np.zeros(s, dtype="i1") for s in input_shapes]
     out_np_arrays = np.broadcast_arrays(*np_arrays)
     usm_arrays = [dpt.asarray(Xnp, sycl_queue=q) for Xnp in np_arrays]
     out_usm_arrays = dpt.broadcast_arrays(*usm_arrays)