From 7dd904d9ab71b7fef650569f8ff60c1d495cb0ce Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 15:31:43 +0300 Subject: [PATCH 01/24] Add support for dpctl.dparray. --- numba_dppy/dparray.py | 354 +++++++++++++++++++++++++++++++ numba_dppy/dppy_rt.c | 89 ++++++++ numba_dppy/tests/test_dparray.py | 228 ++++++++++++++++++++ setup.py | 12 ++ 4 files changed, 683 insertions(+) create mode 100644 numba_dppy/dparray.py create mode 100644 numba_dppy/dppy_rt.c create mode 100644 numba_dppy/tests/test_dparray.py diff --git a/numba_dppy/dparray.py b/numba_dppy/dparray.py new file mode 100644 index 0000000000..654230fc81 --- /dev/null +++ b/numba_dppy/dparray.py @@ -0,0 +1,354 @@ +# This class creates a type in Numba. +class DPArrayType(types.Array): + def __init__( + self, + dtype, + ndim, + layout, + readonly=False, + name=None, + aligned=True, + addrspace=None, + ): + # This name defines how this type will be shown in Numba's type dumps. + name = "DPArray:ndarray(%s, %sd, %s)" % (dtype, ndim, layout) + super(DPArrayType, self).__init__( + dtype, + ndim, + layout, + py_type=ndarray, + readonly=readonly, + name=name, + addrspace=addrspace, + ) + + # Tell Numba typing how to combine DPArrayType with other ndarray types. + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + if method == "__call__": + for inp in inputs: + if not isinstance(inp, (DPArrayType, types.Array, types.Number)): + return None + + return DPArrayType + else: + return None + + +# This tells Numba how to create a DPArrayType when a dparray is passed +# into a njit function. +@typeof_impl.register(ndarray) +def typeof_ta_ndarray(val, c): + try: + dtype = numpy_support.from_dtype(val.dtype) + except NotImplementedError: + raise ValueError("Unsupported array dtype: %s" % (val.dtype,)) + layout = numpy_support.map_layout(val) + readonly = not val.flags.writeable + return DPArrayType(dtype, val.ndim, layout, readonly=readonly) + + +# This tells Numba to use the default Numpy ndarray data layout for +# object of type DPArray. +register_model(DPArrayType)(numba.core.datamodel.models.ArrayModel) + +# This tells Numba how to convert from its native representation +# of a DPArray in a njit function back to a Python DPArray. +@box(DPArrayType) +def box_array(typ, val, c): + nativearycls = c.context.make_array(typ) + nativeary = nativearycls(c.context, c.builder, value=val) + if c.context.enable_nrt: + np_dtype = numpy_support.as_dtype(typ.dtype) + dtypeptr = c.env_manager.read_const(c.env_manager.add_const(np_dtype)) + # Steals NRT ref + newary = c.pyapi.nrt_adapt_ndarray_to_python(typ, val, dtypeptr) + return newary + else: + parent = nativeary.parent + c.pyapi.incref(parent) + return parent + + +# This tells Numba to use this function when it needs to allocate a +# DPArray in a njit function. +@allocator(DPArrayType) +def allocator_DPArray(context, builder, size, align): + context.nrt._require_nrt() + + mod = builder.module + u32 = ir.IntType(32) + + # Get the Numba external allocator for USM memory. + ext_allocator_fnty = ir.FunctionType(cgutils.voidptr_t, []) + ext_allocator_fn = mod.get_or_insert_function( + ext_allocator_fnty, name="dparray_get_ext_allocator" + ) + ext_allocator = builder.call(ext_allocator_fn, []) + # Get the Numba function to allocate an aligned array with an external allocator. 
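    # A sketch of the NRT helper assumed to be bound just below; the actual
    # declaration lives in Numba's runtime sources (nrt_external.h / nrt.c),
    # so the exact parameter types may differ slightly:
    #   NRT_MemInfo *NRT_MemInfo_alloc_safe_aligned_external(
    #       size_t size, unsigned int align, NRT_ExternalAllocator *allocator);
    # which is why fnty below is built as voidptr_t(intp, u32, voidptr_t).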
+ fnty = ir.FunctionType(cgutils.voidptr_t, [cgutils.intp_t, u32, cgutils.voidptr_t]) + fn = mod.get_or_insert_function( + fnty, name="NRT_MemInfo_alloc_safe_aligned_external" + ) + fn.return_value.add_attribute("noalias") + if isinstance(align, builtins.int): + align = context.get_constant(types.uint32, align) + else: + assert align.type == u32, "align must be a uint32" + return builder.call(fn, [size, align, ext_allocator]) + + +registered = False + + +def numba_register(): + global registered + if not registered: + registered = True + numba_register_typing() + numba_register_lower_builtin() + + +# Copy a function registered as a lowerer in Numba but change the +# "np" import in Numba to point to dparray instead of NumPy. +def copy_func_for_dparray(f, dparray_mod): + import copy as cc + + # Make a copy so our change below doesn't affect anything else. + gglobals = cc.copy(f.__globals__) + # Make the "np"'s in the code use dparray instead of Numba's default NumPy. + gglobals["np"] = dparray_mod + # Create a new function using the original code but the new globals. + g = ftype(f.__code__, gglobals, None, f.__defaults__, f.__closure__) + # Some other tricks to make sure the function copy works. + g = functools.update_wrapper(g, f) + g.__kwdefaults__ = f.__kwdefaults__ + return g + + +def types_replace_array(x): + return tuple([z if z != types.Array else DPArrayType for z in x]) + + +def numba_register_lower_builtin(): + todo = [] + todo_builtin = [] + todo_getattr = [] + + # For all Numpy identifiers that have been registered for typing in Numba... + # this registry contains functions, getattrs, setattrs, casts and constants...need to do them all? FIX FIX FIX + for ig in lower_registry.functions: + impl, func, types = ig + # If it is a Numpy function... + if isinstance(func, ftype): + if func.__module__ == np.__name__: + # If we have overloaded that function in the dparray module (always True right now)... + if func.__name__ in functions_list: + todo.append(ig) + if isinstance(func, bftype): + if func.__module__ == np.__name__: + # If we have overloaded that function in the dparray module (always True right now)... + if func.__name__ in functions_list: + todo.append(ig) + + for lg in lower_registry.getattrs: + func, attr, types = lg + types_with_dparray = types_replace_array(types) + if DPArrayType in types_with_dparray: + dprint( + "lower_getattr:", func, type(func), attr, type(attr), types, type(types) + ) + todo_getattr.append((func, attr, types_with_dparray)) + + for lg in todo_getattr: + lower_registry.getattrs.append(lg) + + cur_mod = importlib.import_module(__name__) + for impl, func, types in todo + todo_builtin: + dparray_func = eval(func.__name__) + dprint( + "need to re-register lowerer for dparray", impl, func, types, dparray_func + ) + new_impl = copy_func_for_dparray(impl, cur_mod) + lower_registry.functions.append((new_impl, dparray_func, types)) + + +def argspec_to_string(argspec): + first_default_arg = len(argspec.args) - len(argspec.defaults) + non_def = argspec.args[:first_default_arg] + arg_zip = list(zip(argspec.args[first_default_arg:], argspec.defaults)) + combined = [a + "=" + str(b) for a, b in arg_zip] + return ",".join(non_def + combined) + + +def numba_register_typing(): + todo = [] + todo_classes = [] + todo_getattr = [] + + # For all Numpy identifiers that have been registered for typing in Numba... + for ig in typing_registry.globals: + val, typ = ig + # If it is a Numpy function... 
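        # ftype and bftype are assumed to be aliases for Python's
        # types.FunctionType and types.BuiltinFunctionType, imported near the
        # top of this module (the import lines are not shown in this hunk).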
+ if isinstance(val, (ftype, bftype)): + # If we have overloaded that function in the dparray module (always True right now)... + if val.__name__ in functions_list: + todo.append(ig) + if isinstance(val, type): + todo_classes.append(ig) + + for tgetattr in templates_registry.attributes: + if tgetattr.key == types.Array: + todo_getattr.append(tgetattr) + + for val, typ in todo: + assert len(typ.templates) == 1 + # template is the typing class to invoke generic() upon. + template = typ.templates[0] + dpval = eval(val.__name__) + dprint("need to re-register for dparray", val, typ, typ.typing_key) + """ + if debug: + print("--------------------------------------------------------------") + print("need to re-register for dparray", val, typ, typ.typing_key) + print("val:", val, type(val), "dir val", dir(val)) + print("typ:", typ, type(typ), "dir typ", dir(typ)) + print("typing key:", typ.typing_key) + print("name:", typ.name) + print("key:", typ.key) + print("templates:", typ.templates) + print("template:", template, type(template)) + print("dpval:", dpval, type(dpval)) + print("--------------------------------------------------------------") + """ + + class_name = "DparrayTemplate_" + val.__name__ + + @classmethod + def set_key_original(cls, key, original): + cls.key = key + cls.original = original + + def generic_impl(self): + original_typer = self.__class__.original.generic(self.__class__.original) + ot_argspec = inspect.getfullargspec(original_typer) + # print("ot_argspec:", ot_argspec) + astr = argspec_to_string(ot_argspec) + # print("astr:", astr) + + typer_func = """def typer({}): + original_res = original_typer({}) + #print("original_res:", original_res) + if isinstance(original_res, types.Array): + return DPArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) + + return original_res""".format( + astr, ",".join(ot_argspec.args) + ) + + # print("typer_func:", typer_func) + + try: + gs = globals() + ls = locals() + gs["original_typer"] = ls["original_typer"] + exec(typer_func, globals(), locals()) + except NameError as ne: + print("NameError in exec:", ne) + sys.exit(0) + except: + print("exec failed!", sys.exc_info()[0]) + sys.exit(0) + + try: + exec_res = eval("typer") + except NameError as ne: + print("NameError in eval:", ne) + sys.exit(0) + except: + print("eval failed!", sys.exc_info()[0]) + sys.exit(0) + + # print("exec_res:", exec_res) + return exec_res + + new_dparray_template = type( + class_name, + (template,), + {"set_class_vars": set_key_original, "generic": generic_impl}, + ) + + new_dparray_template.set_class_vars(dpval, template) + + assert callable(dpval) + type_handler = types.Function(new_dparray_template) + typing_registry.register_global(dpval, type_handler) + + # Handle dparray attribute typing. 
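    # The loop below clones every attribute-resolution template whose key is
    # types.Array and re-registers the clone for DPArrayType, wrapping its
    # resolve_* handlers so that any attribute result the original template
    # would type as a plain types.Array (e.g. x.T) is re-typed as DPArrayType
    # and thus stays a USM-shared array inside njit functions.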
+ for tgetattr in todo_getattr: + class_name = tgetattr.__name__ + "_dparray" + dprint("tgetattr:", tgetattr, type(tgetattr), class_name) + + @classmethod + def set_key(cls, key): + cls.key = key + + def getattr_impl(self, attr): + if attr.startswith("resolve_"): + # print("getattr_impl starts with resolve_:", self, type(self), attr) + def wrapper(*args, **kwargs): + attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) + if isinstance(attr_res, types.Array): + return DPArrayType( + dtype=attr_res.dtype, + ndim=attr_res.ndim, + layout=attr_res.layout, + ) + + return wrapper + else: + return tgetattr.__getattribute__(self, attr) + + new_dparray_template = type( + class_name, + (tgetattr,), + {"set_class_vars": set_key, "__getattribute__": getattr_impl}, + ) + + new_dparray_template.set_class_vars(DPArrayType) + templates_registry.register_attr(new_dparray_template) + + +def from_ndarray(x): + return copy(x) + + +def as_ndarray(x): + return np.copy(x) + + +@typing_registry.register_global(as_ndarray) +class DparrayAsNdarray(CallableTemplate): + def generic(self): + def typer(arg): + return types.Array(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) + + return typer + + +@typing_registry.register_global(from_ndarray) +class DparrayFromNdarray(CallableTemplate): + def generic(self): + def typer(arg): + return DPArrayType(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) + + return typer + + +@lower_registry.lower(as_ndarray, DPArrayType) +def dparray_conversion_as(context, builder, sig, args): + return _array_copy(context, builder, sig, args) + + +@lower_registry.lower(from_ndarray, types.Array) +def dparray_conversion_from(context, builder, sig, args): + return _array_copy(context, builder, sig, args) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c new file mode 100644 index 0000000000..75c05ff585 --- /dev/null +++ b/numba_dppy/dppy_rt.c @@ -0,0 +1,89 @@ +#include "../_pymodule.h" +#include "../core/runtime/nrt_external.h" +#include "assert.h" +#include +#include + +NRT_ExternalAllocator dparray_allocator; + +void dparray_memsys_init(void) { + void *(*get_queue)(void); + char *lib_name = "libDPPLSyclInterface.so"; + char *malloc_name = "DPPLmalloc_shared"; + char *free_name = "DPPLfree_with_queue"; + char *get_queue_name = "DPPLQueueMgr_GetCurrentQueue"; + + void *sycldl = dlopen(lib_name, RTLD_NOW); + assert(sycldl != NULL); + dparray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + if (dparray_allocator.malloc == NULL) { + printf("Did not find %s in %s\n", malloc_name, lib_name); + exit(-1); + } + dparray_allocator.realloc = NULL; + dparray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); + if (dparray_allocator.free == NULL) { + printf("Did not find %s in %s\n", free_name, lib_name); + exit(-1); + } + get_queue = (void *(*))dlsym(sycldl, get_queue_name); + if (get_queue == NULL) { + printf("Did not find %s in %s\n", get_queue_name, lib_name); + exit(-1); + } + dparray_allocator.opaque_data = get_queue(); +// printf("dparray_memsys_init: %p %p %p\n", dparray_allocator.malloc, dparray_allocator.free, dparray_allocator.opaque_data); +} + +void * dparray_get_ext_allocator(void) { + printf("dparray_get_ext_allocator %p\n", &dparray_allocator); + return (void*)&dparray_allocator; +} + +static PyObject * +get_external_allocator(PyObject *self, PyObject *args) { + return PyLong_FromVoidPtr(dparray_get_ext_allocator()); +} + +static PyMethodDef ext_methods[] = { +#define declmethod_noargs(func) { #func , ( PyCFunction )func , 
METH_NOARGS, NULL } + declmethod_noargs(get_external_allocator), + {NULL}, +#undef declmethod_noargs +}; + +static PyObject * +build_c_helpers_dict(void) +{ + PyObject *dct = PyDict_New(); + if (dct == NULL) + goto error; + +#define _declpointer(name, value) do { \ + PyObject *o = PyLong_FromVoidPtr(value); \ + if (o == NULL) goto error; \ + if (PyDict_SetItemString(dct, name, o)) { \ + Py_DECREF(o); \ + goto error; \ + } \ + Py_DECREF(o); \ +} while (0) + + _declpointer("dparray_get_ext_allocator", &dparray_get_ext_allocator); + +#undef _declpointer + return dct; +error: + Py_XDECREF(dct); + return NULL; +} + +MOD_INIT(_dppl_rt) { + PyObject *m; + MOD_DEF(m, "numba.dppl._dppl_rt", "No docs", ext_methods) + if (m == NULL) + return MOD_ERROR_VAL; + dparray_memsys_init(); + PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); + return MOD_SUCCESS_VAL(m); +} diff --git a/numba_dppy/tests/test_dparray.py b/numba_dppy/tests/test_dparray.py new file mode 100644 index 0000000000..24dbea43c4 --- /dev/null +++ b/numba_dppy/tests/test_dparray.py @@ -0,0 +1,228 @@ +from __future__ import print_function, division, absolute_import + +import numba +import numba.dppl.dparray as dparray +import numpy +import sys + + +def p1(a): + return a * 2.0 + 13 + + +f1 = numba.njit(p1) + + +@numba.njit() +def f2(a): + return a + + +@numba.njit() +def f3(a, b): # a is dparray, b is numpy + return a * dparray.asarray(b) + + +@numba.njit() +def f4(): + return dparray.ones(10) + + +def p5(a, b): # a is dparray, b is numpy + return a * b + + +f5 = numba.njit(p5) + + +@numba.njit() +def f6(a): + return a + 13 + + +@numba.njit() +def f7(a): # a is dparray + # implicit conversion of a to numpy.ndarray + b = numpy.ones(10) + c = a * b + d = a.argsort() # with no implicit conversion this fails + + +@numba.njit +def f8(a): + return dparray.as_ndarray(a) + + +@numba.njit +def f9(a): + return dparray.from_ndarray(a) + + +@numba.njit +def f10(): + return dparray.empty((10, 10)) + + +@numba.njit +def f11(x): + return x.shape + + +@numba.njit +def f12(x): + return x.T + + +# -------------------------------------------------------------------------------- + +print("------------------- Testing Python Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = p1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing Numba Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = f1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing dparray ones") +sys.stdout.flush() +a = dparray.ones(10) +print("a:", a, type(a)) +assert isinstance(a, dparray.ndarray) +assert dparray.has_array_interface(a) + +print("------------------- Testing dparray.dparray.as_ndarray") +sys.stdout.flush() +nd1 = a.as_ndarray() +print("nd1:", nd1, type(nd1)) +assert type(nd1) == numpy.ndarray + +print("------------------- Testing dparray.as_ndarray") +sys.stdout.flush() +nd2 = dparray.as_ndarray(a) +print("nd2:", nd2, type(nd2)) +assert type(nd2) == numpy.ndarray + +print("------------------- Testing dparray.from_ndarray") +sys.stdout.flush() +dp1 = dparray.from_ndarray(nd2) +print("dp1:", dp1, type(dp1)) +assert isinstance(dp1, dparray.ndarray) +assert dparray.has_array_interface(dp1) + +print("------------------- Testing dparray multiplication") +sys.stdout.flush() +c = a * 5 +print("c", c, type(c)) +assert isinstance(c, dparray.ndarray) +assert dparray.has_array_interface(c) + +print("------------------- Testing Python dparray") +sys.stdout.flush() +b = p1(c) 
+print("b:", b, type(b)) +assert isinstance(b, dparray.ndarray) +assert dparray.has_array_interface(b) +del b + +print("------------------- Testing Python mixing dparray and numpy.ndarray") +sys.stdout.flush() +h = p5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, dparray.ndarray) +assert dparray.has_array_interface(h) +del h + +print("------------------- Testing Numba dparray 2") +sys.stdout.flush() +d = f2(a) +print("d:", d, type(d)) +assert isinstance(d, dparray.ndarray) +assert dparray.has_array_interface(d) +del d + +print("------------------- Testing Numba dparray") +sys.stdout.flush() +b = f1(c) +print("b:", b, type(b)) +assert isinstance(b, dparray.ndarray) +assert dparray.has_array_interface(b) +del b + +""" +print("------------------- Testing Numba dparray constructor from numpy.ndarray") +sys.stdout.flush() +e = f3(a, z1) +print("e:", e, type(e)) +assert(isinstance(e, dparray.ndarray)) +""" + +print("------------------- Testing Numba mixing dparray and constant") +sys.stdout.flush() +g = f6(a) +print("g:", g, type(g)) +assert isinstance(g, dparray.ndarray) +assert dparray.has_array_interface(g) +del g + +print("------------------- Testing Numba mixing dparray and numpy.ndarray") +sys.stdout.flush() +h = f5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, dparray.ndarray) +assert dparray.has_array_interface(h) +del h + +print("------------------- Testing Numba dparray functions") +sys.stdout.flush() +f = f4() +print("f:", f, type(f)) +assert isinstance(f, dparray.ndarray) +assert dparray.has_array_interface(f) +del f + +print("------------------- Testing Numba dparray.as_ndarray") +sys.stdout.flush() +nd3 = f8(a) +print("nd3:", nd3, type(nd3)) +assert type(nd3) == numpy.ndarray + +print("------------------- Testing Numba dparray.from_ndarray") +sys.stdout.flush() +dp2 = f9(nd3) +print("dp2:", dp2, type(dp2)) +assert isinstance(dp2, dparray.ndarray) +assert dparray.has_array_interface(dp2) +del nd3 +del dp2 + +print("------------------- Testing Numba dparray.empty") +sys.stdout.flush() +dp3 = f10() +print("dp3:", dp3, type(dp3)) +assert isinstance(dp3, dparray.ndarray) +assert dparray.has_array_interface(dp3) + +print("------------------- Testing Numba dparray.shape") +sys.stdout.flush() +s1 = f11(dp3) +print("s1:", s1, type(s1)) + +print("------------------- Testing Numba dparray.T") +sys.stdout.flush() +dp4 = f12(dp3) +print("dp4:", dp4, type(dp4)) +assert isinstance(dp4, dparray.ndarray) +assert dparray.has_array_interface(dp4) +del dp3 +del dp4 + +# ------------------------------- +del a + +print("SUCCESS") diff --git a/setup.py b/setup.py index 13f3d782d9..857eca49b1 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,13 @@ def get_ext_modules(): ext_modules = [] + ext_dppy = Extension( + name="numba_dppy._dppy_rt", + sources=["numba_dppy/dppl_rt.c"], + depends=["numba/core/runtime/nrt_external.h", "numba/core/runtime/nrt.h"], + ) + ext_modules += [ext_modules] + dpnp_present = False try: import dpnp @@ -65,6 +72,11 @@ def get_ext_modules(): "Topic :: Software Development :: Compilers", ], cmdclass=versioneer.get_cmdclass(), + entry_points={ + "numba_extensions": [ + "init = numba_dppy.dparray:numba_register", + ]}, + ) ) setup(**metadata) From 26d5e2e82fc1019aa40ea4ac948e7dc629e27573 Mon Sep 17 00:00:00 2001 From: "Todd A. 
Anderson" Date: Wed, 9 Dec 2020 15:41:16 -0600 Subject: [PATCH 02/24] Fix build issues for dppy_rt.c --- numba_dppy/dppy_rt.c | 4 ++-- setup.py | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index 75c05ff585..140c3fcaf5 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -1,5 +1,5 @@ -#include "../_pymodule.h" -#include "../core/runtime/nrt_external.h" +#include "_pymodule.h" +#include "core/runtime/nrt_external.h" #include "assert.h" #include #include diff --git a/setup.py b/setup.py index 857eca49b1..83c7153456 100644 --- a/setup.py +++ b/setup.py @@ -10,10 +10,11 @@ def get_ext_modules(): ext_dppy = Extension( name="numba_dppy._dppy_rt", - sources=["numba_dppy/dppl_rt.c"], - depends=["numba/core/runtime/nrt_external.h", "numba/core/runtime/nrt.h"], + sources=["numba_dppy/dppy_rt.c"], + include_dirs=["../numba/numba"], # Need to get rid of relative paths. + depends=["../numba/numba/core/runtime/nrt_external.h", "../numba/numba/core/runtime/nrt.h", "../numba/numba/_pymodule.h"], ) - ext_modules += [ext_modules] + ext_modules += [ext_dppy] dpnp_present = False try: @@ -45,7 +46,6 @@ def get_ext_modules(): build_requires = ["cython"] install_requires = [ "numba", - "cffi", "dpctl", ] @@ -75,8 +75,7 @@ def get_ext_modules(): entry_points={ "numba_extensions": [ "init = numba_dppy.dparray:numba_register", - ]}, - ) + ]}, ) setup(**metadata) From a320b501974e112af5d2d28ffc97d0d3fc1b53ae Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Wed, 9 Dec 2020 16:08:38 -0600 Subject: [PATCH 03/24] Name changes from dparray things to usmarray. --- numba_dppy/dppy_rt.c | 41 ++-- .../{dparray.py => numpy_usm_shared.py} | 94 ++++---- numba_dppy/tests/test_dparray.py | 228 ------------------ numba_dppy/tests/test_usmarray.py | 228 ++++++++++++++++++ setup.py | 2 +- 5 files changed, 296 insertions(+), 297 deletions(-) rename numba_dppy/{dparray.py => numpy_usm_shared.py} (76%) delete mode 100644 numba_dppy/tests/test_dparray.py create mode 100644 numba_dppy/tests/test_usmarray.py diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index 140c3fcaf5..dd892055bf 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -4,25 +4,25 @@ #include #include -NRT_ExternalAllocator dparray_allocator; +NRT_ExternalAllocator usmarray_allocator; -void dparray_memsys_init(void) { +void usmarray_memsys_init(void) { void *(*get_queue)(void); - char *lib_name = "libDPPLSyclInterface.so"; - char *malloc_name = "DPPLmalloc_shared"; - char *free_name = "DPPLfree_with_queue"; - char *get_queue_name = "DPPLQueueMgr_GetCurrentQueue"; + char *lib_name = "libDPCTLSyclInterface.so"; + char *malloc_name = "DPCTLmalloc_shared"; + char *free_name = "DPCTLfree_with_queue"; + char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; void *sycldl = dlopen(lib_name, RTLD_NOW); assert(sycldl != NULL); - dparray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); - if (dparray_allocator.malloc == NULL) { + usmarray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + if (usmarray_allocator.malloc == NULL) { printf("Did not find %s in %s\n", malloc_name, lib_name); exit(-1); } - dparray_allocator.realloc = NULL; - dparray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); - if (dparray_allocator.free == NULL) { + usmarray_allocator.realloc = NULL; + usmarray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); + if (usmarray_allocator.free == NULL) { printf("Did not find 
%s in %s\n", free_name, lib_name); exit(-1); } @@ -31,18 +31,17 @@ void dparray_memsys_init(void) { printf("Did not find %s in %s\n", get_queue_name, lib_name); exit(-1); } - dparray_allocator.opaque_data = get_queue(); -// printf("dparray_memsys_init: %p %p %p\n", dparray_allocator.malloc, dparray_allocator.free, dparray_allocator.opaque_data); + usmarray_allocator.opaque_data = get_queue(); } -void * dparray_get_ext_allocator(void) { - printf("dparray_get_ext_allocator %p\n", &dparray_allocator); - return (void*)&dparray_allocator; +void * usmarray_get_ext_allocator(void) { + printf("usmarray_get_ext_allocator %p\n", &usmarray_allocator); + return (void*)&usmarray_allocator; } static PyObject * get_external_allocator(PyObject *self, PyObject *args) { - return PyLong_FromVoidPtr(dparray_get_ext_allocator()); + return PyLong_FromVoidPtr(usmarray_get_ext_allocator()); } static PyMethodDef ext_methods[] = { @@ -69,7 +68,7 @@ build_c_helpers_dict(void) Py_DECREF(o); \ } while (0) - _declpointer("dparray_get_ext_allocator", &dparray_get_ext_allocator); + _declpointer("usmarray_get_ext_allocator", &usmarray_get_ext_allocator); #undef _declpointer return dct; @@ -78,12 +77,12 @@ build_c_helpers_dict(void) return NULL; } -MOD_INIT(_dppl_rt) { +MOD_INIT(_dppy_rt) { PyObject *m; - MOD_DEF(m, "numba.dppl._dppl_rt", "No docs", ext_methods) + MOD_DEF(m, "numba_dppy._dppy_rt", "No docs", ext_methods) if (m == NULL) return MOD_ERROR_VAL; - dparray_memsys_init(); + usmarray_memsys_init(); PyModule_AddObject(m, "c_helpers", build_c_helpers_dict()); return MOD_SUCCESS_VAL(m); } diff --git a/numba_dppy/dparray.py b/numba_dppy/numpy_usm_shared.py similarity index 76% rename from numba_dppy/dparray.py rename to numba_dppy/numpy_usm_shared.py index 654230fc81..0d190b1317 100644 --- a/numba_dppy/dparray.py +++ b/numba_dppy/numpy_usm_shared.py @@ -1,5 +1,5 @@ # This class creates a type in Numba. -class DPArrayType(types.Array): +class UsmSharedArrayType(types.Array): def __init__( self, dtype, @@ -11,8 +11,8 @@ def __init__( addrspace=None, ): # This name defines how this type will be shown in Numba's type dumps. - name = "DPArray:ndarray(%s, %sd, %s)" % (dtype, ndim, layout) - super(DPArrayType, self).__init__( + name = "UsmArray:ndarray(%s, %sd, %s)" % (dtype, ndim, layout) + super(UsmSharedArrayType, self).__init__( dtype, ndim, layout, @@ -22,19 +22,19 @@ def __init__( addrspace=addrspace, ) - # Tell Numba typing how to combine DPArrayType with other ndarray types. + # Tell Numba typing how to combine UsmSharedArrayType with other ndarray types. def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method == "__call__": for inp in inputs: - if not isinstance(inp, (DPArrayType, types.Array, types.Number)): + if not isinstance(inp, (UsmSharedArrayType, types.Array, types.Number)): return None - return DPArrayType + return UsmSharedArrayType else: return None -# This tells Numba how to create a DPArrayType when a dparray is passed +# This tells Numba how to create a UsmSharedArrayType when a usmarray is passed # into a njit function. 
@typeof_impl.register(ndarray) def typeof_ta_ndarray(val, c): @@ -44,16 +44,16 @@ def typeof_ta_ndarray(val, c): raise ValueError("Unsupported array dtype: %s" % (val.dtype,)) layout = numpy_support.map_layout(val) readonly = not val.flags.writeable - return DPArrayType(dtype, val.ndim, layout, readonly=readonly) + return UsmSharedArrayType(dtype, val.ndim, layout, readonly=readonly) # This tells Numba to use the default Numpy ndarray data layout for -# object of type DPArray. -register_model(DPArrayType)(numba.core.datamodel.models.ArrayModel) +# object of type UsmArray. +register_model(UsmSharedArrayType)(numba.core.datamodel.models.ArrayModel) # This tells Numba how to convert from its native representation -# of a DPArray in a njit function back to a Python DPArray. -@box(DPArrayType) +# of a UsmArray in a njit function back to a Python UsmArray. +@box(UsmSharedArrayType) def box_array(typ, val, c): nativearycls = c.context.make_array(typ) nativeary = nativearycls(c.context, c.builder, value=val) @@ -70,9 +70,9 @@ def box_array(typ, val, c): # This tells Numba to use this function when it needs to allocate a -# DPArray in a njit function. -@allocator(DPArrayType) -def allocator_DPArray(context, builder, size, align): +# UsmArray in a njit function. +@allocator(UsmSharedArrayType) +def allocator_UsmArray(context, builder, size, align): context.nrt._require_nrt() mod = builder.module @@ -81,7 +81,7 @@ def allocator_DPArray(context, builder, size, align): # Get the Numba external allocator for USM memory. ext_allocator_fnty = ir.FunctionType(cgutils.voidptr_t, []) ext_allocator_fn = mod.get_or_insert_function( - ext_allocator_fnty, name="dparray_get_ext_allocator" + ext_allocator_fnty, name="usmarray_get_ext_allocator" ) ext_allocator = builder.call(ext_allocator_fn, []) # Get the Numba function to allocate an aligned array with an external allocator. @@ -109,14 +109,14 @@ def numba_register(): # Copy a function registered as a lowerer in Numba but change the -# "np" import in Numba to point to dparray instead of NumPy. -def copy_func_for_dparray(f, dparray_mod): +# "np" import in Numba to point to usmarray instead of NumPy. +def copy_func_for_usmarray(f, usmarray_mod): import copy as cc # Make a copy so our change below doesn't affect anything else. gglobals = cc.copy(f.__globals__) - # Make the "np"'s in the code use dparray instead of Numba's default NumPy. - gglobals["np"] = dparray_mod + # Make the "np"'s in the code use usmarray instead of Numba's default NumPy. + gglobals["np"] = usmarray_mod # Create a new function using the original code but the new globals. g = ftype(f.__code__, gglobals, None, f.__defaults__, f.__closure__) # Some other tricks to make sure the function copy works. @@ -126,7 +126,7 @@ def copy_func_for_dparray(f, dparray_mod): def types_replace_array(x): - return tuple([z if z != types.Array else DPArrayType for z in x]) + return tuple([z if z != types.Array else UsmSharedArrayType for z in x]) def numba_register_lower_builtin(): @@ -141,35 +141,35 @@ def numba_register_lower_builtin(): # If it is a Numpy function... if isinstance(func, ftype): if func.__module__ == np.__name__: - # If we have overloaded that function in the dparray module (always True right now)... + # If we have overloaded that function in the usmarray module (always True right now)... if func.__name__ in functions_list: todo.append(ig) if isinstance(func, bftype): if func.__module__ == np.__name__: - # If we have overloaded that function in the dparray module (always True right now)... 
+ # If we have overloaded that function in the usmarray module (always True right now)... if func.__name__ in functions_list: todo.append(ig) for lg in lower_registry.getattrs: func, attr, types = lg - types_with_dparray = types_replace_array(types) - if DPArrayType in types_with_dparray: + types_with_usmarray = types_replace_array(types) + if UsmSharedArrayType in types_with_usmarray: dprint( "lower_getattr:", func, type(func), attr, type(attr), types, type(types) ) - todo_getattr.append((func, attr, types_with_dparray)) + todo_getattr.append((func, attr, types_with_usmarray)) for lg in todo_getattr: lower_registry.getattrs.append(lg) cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: - dparray_func = eval(func.__name__) + usmarray_func = eval(func.__name__) dprint( - "need to re-register lowerer for dparray", impl, func, types, dparray_func + "need to re-register lowerer for usmarray", impl, func, types, usmarray_func ) - new_impl = copy_func_for_dparray(impl, cur_mod) - lower_registry.functions.append((new_impl, dparray_func, types)) + new_impl = copy_func_for_usmarray(impl, cur_mod) + lower_registry.functions.append((new_impl, usmarray_func, types)) def argspec_to_string(argspec): @@ -190,7 +190,7 @@ def numba_register_typing(): val, typ = ig # If it is a Numpy function... if isinstance(val, (ftype, bftype)): - # If we have overloaded that function in the dparray module (always True right now)... + # If we have overloaded that function in the usmarray module (always True right now)... if val.__name__ in functions_list: todo.append(ig) if isinstance(val, type): @@ -205,11 +205,11 @@ def numba_register_typing(): # template is the typing class to invoke generic() upon. template = typ.templates[0] dpval = eval(val.__name__) - dprint("need to re-register for dparray", val, typ, typ.typing_key) + dprint("need to re-register for usmarray", val, typ, typ.typing_key) """ if debug: print("--------------------------------------------------------------") - print("need to re-register for dparray", val, typ, typ.typing_key) + print("need to re-register for usmarray", val, typ, typ.typing_key) print("val:", val, type(val), "dir val", dir(val)) print("typ:", typ, type(typ), "dir typ", dir(typ)) print("typing key:", typ.typing_key) @@ -239,7 +239,7 @@ def generic_impl(self): original_res = original_typer({}) #print("original_res:", original_res) if isinstance(original_res, types.Array): - return DPArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) + return UsmSharedArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) return original_res""".format( astr, ",".join(ot_argspec.args) @@ -271,21 +271,21 @@ def generic_impl(self): # print("exec_res:", exec_res) return exec_res - new_dparray_template = type( + new_usmarray_template = type( class_name, (template,), {"set_class_vars": set_key_original, "generic": generic_impl}, ) - new_dparray_template.set_class_vars(dpval, template) + new_usmarray_template.set_class_vars(dpval, template) assert callable(dpval) - type_handler = types.Function(new_dparray_template) + type_handler = types.Function(new_usmarray_template) typing_registry.register_global(dpval, type_handler) - # Handle dparray attribute typing. + # Handle usmarray attribute typing. 
for tgetattr in todo_getattr: - class_name = tgetattr.__name__ + "_dparray" + class_name = tgetattr.__name__ + "_usmarray" dprint("tgetattr:", tgetattr, type(tgetattr), class_name) @classmethod @@ -298,7 +298,7 @@ def getattr_impl(self, attr): def wrapper(*args, **kwargs): attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) if isinstance(attr_res, types.Array): - return DPArrayType( + return UsmSharedArrayType( dtype=attr_res.dtype, ndim=attr_res.ndim, layout=attr_res.layout, @@ -308,14 +308,14 @@ def wrapper(*args, **kwargs): else: return tgetattr.__getattribute__(self, attr) - new_dparray_template = type( + new_usmarray_template = type( class_name, (tgetattr,), {"set_class_vars": set_key, "__getattribute__": getattr_impl}, ) - new_dparray_template.set_class_vars(DPArrayType) - templates_registry.register_attr(new_dparray_template) + new_usmarray_template.set_class_vars(UsmSharedArrayType) + templates_registry.register_attr(new_usmarray_template) def from_ndarray(x): @@ -339,16 +339,16 @@ def typer(arg): class DparrayFromNdarray(CallableTemplate): def generic(self): def typer(arg): - return DPArrayType(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) + return UsmSharedArrayType(dtype=arg.dtype, ndim=arg.ndim, layout=arg.layout) return typer -@lower_registry.lower(as_ndarray, DPArrayType) -def dparray_conversion_as(context, builder, sig, args): +@lower_registry.lower(as_ndarray, UsmSharedArrayType) +def usmarray_conversion_as(context, builder, sig, args): return _array_copy(context, builder, sig, args) @lower_registry.lower(from_ndarray, types.Array) -def dparray_conversion_from(context, builder, sig, args): +def usmarray_conversion_from(context, builder, sig, args): return _array_copy(context, builder, sig, args) diff --git a/numba_dppy/tests/test_dparray.py b/numba_dppy/tests/test_dparray.py deleted file mode 100644 index 24dbea43c4..0000000000 --- a/numba_dppy/tests/test_dparray.py +++ /dev/null @@ -1,228 +0,0 @@ -from __future__ import print_function, division, absolute_import - -import numba -import numba.dppl.dparray as dparray -import numpy -import sys - - -def p1(a): - return a * 2.0 + 13 - - -f1 = numba.njit(p1) - - -@numba.njit() -def f2(a): - return a - - -@numba.njit() -def f3(a, b): # a is dparray, b is numpy - return a * dparray.asarray(b) - - -@numba.njit() -def f4(): - return dparray.ones(10) - - -def p5(a, b): # a is dparray, b is numpy - return a * b - - -f5 = numba.njit(p5) - - -@numba.njit() -def f6(a): - return a + 13 - - -@numba.njit() -def f7(a): # a is dparray - # implicit conversion of a to numpy.ndarray - b = numpy.ones(10) - c = a * b - d = a.argsort() # with no implicit conversion this fails - - -@numba.njit -def f8(a): - return dparray.as_ndarray(a) - - -@numba.njit -def f9(a): - return dparray.from_ndarray(a) - - -@numba.njit -def f10(): - return dparray.empty((10, 10)) - - -@numba.njit -def f11(x): - return x.shape - - -@numba.njit -def f12(x): - return x.T - - -# -------------------------------------------------------------------------------- - -print("------------------- Testing Python Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = p1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing Numba Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = f1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing dparray ones") -sys.stdout.flush() -a = dparray.ones(10) -print("a:", a, type(a)) -assert isinstance(a, dparray.ndarray) -assert 
dparray.has_array_interface(a) - -print("------------------- Testing dparray.dparray.as_ndarray") -sys.stdout.flush() -nd1 = a.as_ndarray() -print("nd1:", nd1, type(nd1)) -assert type(nd1) == numpy.ndarray - -print("------------------- Testing dparray.as_ndarray") -sys.stdout.flush() -nd2 = dparray.as_ndarray(a) -print("nd2:", nd2, type(nd2)) -assert type(nd2) == numpy.ndarray - -print("------------------- Testing dparray.from_ndarray") -sys.stdout.flush() -dp1 = dparray.from_ndarray(nd2) -print("dp1:", dp1, type(dp1)) -assert isinstance(dp1, dparray.ndarray) -assert dparray.has_array_interface(dp1) - -print("------------------- Testing dparray multiplication") -sys.stdout.flush() -c = a * 5 -print("c", c, type(c)) -assert isinstance(c, dparray.ndarray) -assert dparray.has_array_interface(c) - -print("------------------- Testing Python dparray") -sys.stdout.flush() -b = p1(c) -print("b:", b, type(b)) -assert isinstance(b, dparray.ndarray) -assert dparray.has_array_interface(b) -del b - -print("------------------- Testing Python mixing dparray and numpy.ndarray") -sys.stdout.flush() -h = p5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, dparray.ndarray) -assert dparray.has_array_interface(h) -del h - -print("------------------- Testing Numba dparray 2") -sys.stdout.flush() -d = f2(a) -print("d:", d, type(d)) -assert isinstance(d, dparray.ndarray) -assert dparray.has_array_interface(d) -del d - -print("------------------- Testing Numba dparray") -sys.stdout.flush() -b = f1(c) -print("b:", b, type(b)) -assert isinstance(b, dparray.ndarray) -assert dparray.has_array_interface(b) -del b - -""" -print("------------------- Testing Numba dparray constructor from numpy.ndarray") -sys.stdout.flush() -e = f3(a, z1) -print("e:", e, type(e)) -assert(isinstance(e, dparray.ndarray)) -""" - -print("------------------- Testing Numba mixing dparray and constant") -sys.stdout.flush() -g = f6(a) -print("g:", g, type(g)) -assert isinstance(g, dparray.ndarray) -assert dparray.has_array_interface(g) -del g - -print("------------------- Testing Numba mixing dparray and numpy.ndarray") -sys.stdout.flush() -h = f5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, dparray.ndarray) -assert dparray.has_array_interface(h) -del h - -print("------------------- Testing Numba dparray functions") -sys.stdout.flush() -f = f4() -print("f:", f, type(f)) -assert isinstance(f, dparray.ndarray) -assert dparray.has_array_interface(f) -del f - -print("------------------- Testing Numba dparray.as_ndarray") -sys.stdout.flush() -nd3 = f8(a) -print("nd3:", nd3, type(nd3)) -assert type(nd3) == numpy.ndarray - -print("------------------- Testing Numba dparray.from_ndarray") -sys.stdout.flush() -dp2 = f9(nd3) -print("dp2:", dp2, type(dp2)) -assert isinstance(dp2, dparray.ndarray) -assert dparray.has_array_interface(dp2) -del nd3 -del dp2 - -print("------------------- Testing Numba dparray.empty") -sys.stdout.flush() -dp3 = f10() -print("dp3:", dp3, type(dp3)) -assert isinstance(dp3, dparray.ndarray) -assert dparray.has_array_interface(dp3) - -print("------------------- Testing Numba dparray.shape") -sys.stdout.flush() -s1 = f11(dp3) -print("s1:", s1, type(s1)) - -print("------------------- Testing Numba dparray.T") -sys.stdout.flush() -dp4 = f12(dp3) -print("dp4:", dp4, type(dp4)) -assert isinstance(dp4, dparray.ndarray) -assert dparray.has_array_interface(dp4) -del dp3 -del dp4 - -# ------------------------------- -del a - -print("SUCCESS") diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py new 
file mode 100644 index 0000000000..fe1be71c9e --- /dev/null +++ b/numba_dppy/tests/test_usmarray.py @@ -0,0 +1,228 @@ +from __future__ import print_function, division, absolute_import + +import numba +import dpctl.dptensor.numpy_usm_shared as usmarray +import numpy +import sys + + +def p1(a): + return a * 2.0 + 13 + + +f1 = numba.njit(p1) + + +@numba.njit() +def f2(a): + return a + + +@numba.njit() +def f3(a, b): # a is usmarray, b is numpy + return a * usmarray.asarray(b) + + +@numba.njit() +def f4(): + return usmarray.ones(10) + + +def p5(a, b): # a is usmarray, b is numpy + return a * b + + +f5 = numba.njit(p5) + + +@numba.njit() +def f6(a): + return a + 13 + + +@numba.njit() +def f7(a): # a is usmarray + # implicit conversion of a to numpy.ndarray + b = numpy.ones(10) + c = a * b + d = a.argsort() # with no implicit conversion this fails + + +@numba.njit +def f8(a): + return usmarray.as_ndarray(a) + + +@numba.njit +def f9(a): + return usmarray.from_ndarray(a) + + +@numba.njit +def f10(): + return usmarray.empty((10, 10)) + + +@numba.njit +def f11(x): + return x.shape + + +@numba.njit +def f12(x): + return x.T + + +# -------------------------------------------------------------------------------- + +print("------------------- Testing Python Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = p1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing Numba Numpy") +sys.stdout.flush() +z1 = numpy.ones(10) +z2 = f1(z1) +print("z2:", z2, type(z2)) +assert type(z2) == numpy.ndarray + +print("------------------- Testing usmarray ones") +sys.stdout.flush() +a = usmarray.ones(10) +print("a:", a, type(a)) +assert isinstance(a, usmarray.ndarray) +assert usmarray.has_array_interface(a) + +print("------------------- Testing usmarray.usmarray.as_ndarray") +sys.stdout.flush() +nd1 = a.as_ndarray() +print("nd1:", nd1, type(nd1)) +assert type(nd1) == numpy.ndarray + +print("------------------- Testing usmarray.as_ndarray") +sys.stdout.flush() +nd2 = usmarray.as_ndarray(a) +print("nd2:", nd2, type(nd2)) +assert type(nd2) == numpy.ndarray + +print("------------------- Testing usmarray.from_ndarray") +sys.stdout.flush() +dp1 = usmarray.from_ndarray(nd2) +print("dp1:", dp1, type(dp1)) +assert isinstance(dp1, usmarray.ndarray) +assert usmarray.has_array_interface(dp1) + +print("------------------- Testing usmarray multiplication") +sys.stdout.flush() +c = a * 5 +print("c", c, type(c)) +assert isinstance(c, usmarray.ndarray) +assert usmarray.has_array_interface(c) + +print("------------------- Testing Python usmarray") +sys.stdout.flush() +b = p1(c) +print("b:", b, type(b)) +assert isinstance(b, usmarray.ndarray) +assert usmarray.has_array_interface(b) +del b + +print("------------------- Testing Python mixing usmarray and numpy.ndarray") +sys.stdout.flush() +h = p5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, usmarray.ndarray) +assert usmarray.has_array_interface(h) +del h + +print("------------------- Testing Numba usmarray 2") +sys.stdout.flush() +d = f2(a) +print("d:", d, type(d)) +assert isinstance(d, usmarray.ndarray) +assert usmarray.has_array_interface(d) +del d + +print("------------------- Testing Numba usmarray") +sys.stdout.flush() +b = f1(c) +print("b:", b, type(b)) +assert isinstance(b, usmarray.ndarray) +assert usmarray.has_array_interface(b) +del b + +""" +print("------------------- Testing Numba usmarray constructor from numpy.ndarray") +sys.stdout.flush() +e = f3(a, z1) +print("e:", e, type(e)) +assert(isinstance(e, 
usmarray.ndarray)) +""" + +print("------------------- Testing Numba mixing usmarray and constant") +sys.stdout.flush() +g = f6(a) +print("g:", g, type(g)) +assert isinstance(g, usmarray.ndarray) +assert usmarray.has_array_interface(g) +del g + +print("------------------- Testing Numba mixing usmarray and numpy.ndarray") +sys.stdout.flush() +h = f5(a, z1) +print("h:", h, type(h)) +assert isinstance(h, usmarray.ndarray) +assert usmarray.has_array_interface(h) +del h + +print("------------------- Testing Numba usmarray functions") +sys.stdout.flush() +f = f4() +print("f:", f, type(f)) +assert isinstance(f, usmarray.ndarray) +assert usmarray.has_array_interface(f) +del f + +print("------------------- Testing Numba usmarray.as_ndarray") +sys.stdout.flush() +nd3 = f8(a) +print("nd3:", nd3, type(nd3)) +assert type(nd3) == numpy.ndarray + +print("------------------- Testing Numba usmarray.from_ndarray") +sys.stdout.flush() +dp2 = f9(nd3) +print("dp2:", dp2, type(dp2)) +assert isinstance(dp2, usmarray.ndarray) +assert usmarray.has_array_interface(dp2) +del nd3 +del dp2 + +print("------------------- Testing Numba usmarray.empty") +sys.stdout.flush() +dp3 = f10() +print("dp3:", dp3, type(dp3)) +assert isinstance(dp3, usmarray.ndarray) +assert usmarray.has_array_interface(dp3) + +print("------------------- Testing Numba usmarray.shape") +sys.stdout.flush() +s1 = f11(dp3) +print("s1:", s1, type(s1)) + +print("------------------- Testing Numba usmarray.T") +sys.stdout.flush() +dp4 = f12(dp3) +print("dp4:", dp4, type(dp4)) +assert isinstance(dp4, usmarray.ndarray) +assert usmarray.has_array_interface(dp4) +del dp3 +del dp4 + +# ------------------------------- +del a + +print("SUCCESS") diff --git a/setup.py b/setup.py index 83c7153456..b870c50a8f 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def get_ext_modules(): cmdclass=versioneer.get_cmdclass(), entry_points={ "numba_extensions": [ - "init = numba_dppy.dparray:numba_register", + "init = numba_dppy.usmarray:numba_register", ]}, ) From f9de97a5099b44b4f9033966232b6d38fc437ea6 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 15:39:40 +0300 Subject: [PATCH 04/24] Delete old backup file (#45) Co-authored-by: etotmeni --- .../parfor_loop_invariant_hoisting.py.bkp | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 numba_dppy/parfor_loop_invariant_hoisting.py.bkp diff --git a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp b/numba_dppy/parfor_loop_invariant_hoisting.py.bkp deleted file mode 100644 index fb37a1c97b..0000000000 --- a/numba_dppy/parfor_loop_invariant_hoisting.py.bkp +++ /dev/null @@ -1,213 +0,0 @@ -from __future__ import print_function, division, absolute_import - -def add_to_def_once_sets(a_def, def_once, def_more): - '''If the variable is already defined more than once, do nothing. - Else if defined exactly once previously then transition this - variable to the defined more than once set (remove it from - def_once set and add to def_more set). - Else this must be the first time we've seen this variable defined - so add to def_once set. - ''' - if a_def in def_more: - pass - elif a_def in def_once: - def_more.add(a_def) - def_once.remove(a_def) - else: - def_once.add(a_def) - -def compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Effect changes to the set of variables defined once or more than once - for a single block. 
- block - the block to process - def_once - set of variable names known to be defined exactly once - def_more - set of variable names known to be defined more than once - getattr_taken - dict mapping variable name to tuple of object and attribute taken - module_assigns - dict mapping variable name to the Global that they came from - ''' - # The only "defs" occur in assignments, so find such instructions. - assignments = block.find_insts(ir.Assign) - # For each assignment... - for one_assign in assignments: - # Get the LHS/target of the assignment. - a_def = one_assign.target.name - # Add variable to def sets. - add_to_def_once_sets(a_def, def_once, def_more) - - rhs = one_assign.value - if isinstance(rhs, ir.Global): - # Remember assignments of the form "a = Global(...)" - # Is this a module? - if isinstance(rhs.value, pytypes.ModuleType): - module_assigns[a_def] = rhs.value.__name__ - if isinstance(rhs, ir.Expr) and rhs.op == 'getattr' and rhs.value.name in def_once: - # Remember assignments of the form "a = b.c" - getattr_taken[a_def] = (rhs.value.name, rhs.attr) - if isinstance(rhs, ir.Expr) and rhs.op == 'call' and rhs.func.name in getattr_taken: - # If "a" is being called then lookup the getattr definition of "a" - # as above, getting the module variable "b" (base_obj) - # and the attribute "c" (base_attr). - base_obj, base_attr = getattr_taken[rhs.func.name] - if base_obj in module_assigns: - # If we know the definition of the module variable then get the module - # name from module_assigns. - base_mod_name = module_assigns[base_obj] - if not is_const_call(base_mod_name, base_attr): - # Calling a method on an object could modify the object and is thus - # like a def of that object. We call is_const_call to see if this module/attribute - # combination is known to not modify the module state. If we don't know that - # the combination is safe then we have to assume there could be a modification to - # the module and thus add the module variable as defined more than once. - add_to_def_once_sets(base_obj, def_once, def_more) - else: - # Assume the worst and say that base_obj could be modified by the call. - add_to_def_once_sets(base_obj, def_once, def_more) - if isinstance(rhs, ir.Expr) and rhs.op == 'call': - # If a mutable object is passed to a function, then it may be changed and - # therefore can't be hoisted. - # For each argument to the function... - for argvar in rhs.args: - # Get the argument's type. - if isinstance(argvar, ir.Var): - argvar = argvar.name - avtype = typemap[argvar] - # If that type doesn't have a mutable attribute or it does and it's set to - # not mutable then this usage is safe for hoisting. - if getattr(avtype, 'mutable', False): - # Here we have a mutable variable passed to a function so add this variable - # to the def lists. - add_to_def_once_sets(argvar, def_once, def_more) - -def compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns): - '''Compute the set of variables defined exactly once in the given set of blocks - and use the given sets for storing which variables are defined once, more than - once and which have had a getattr call on them. - ''' - # For each block... - for label, block in loop_body.items(): - # Scan this block and effect changes to def_once, def_more, and getattr_taken - # based on the instructions in that block. - compute_def_once_block(block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Have to recursively process parfors manually here. 
- for inst in block.body: - if isinstance(inst, parfor.Parfor): - # Recursively compute for the parfor's init block. - compute_def_once_block(inst.init_block, def_once, def_more, getattr_taken, typemap, module_assigns) - # Recursively compute for the parfor's loop body. - compute_def_once_internal(inst.loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - -def compute_def_once(loop_body, typemap): - '''Compute the set of variables defined exactly once in the given set of blocks. - ''' - def_once = set() # set to hold variables defined exactly once - def_more = set() # set to hold variables defined more than once - getattr_taken = {} - module_assigns = {} - compute_def_once_internal(loop_body, def_once, def_more, getattr_taken, typemap, module_assigns) - return def_once - -def find_vars(var, varset): - assert isinstance(var, ir.Var) - varset.add(var.name) - return var - -def _hoist_internal(inst, dep_on_param, call_table, hoisted, not_hoisted, - typemap, stored_arrays): - if inst.target.name in stored_arrays: - not_hoisted.append((inst, "stored array")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because the created array is stored.") - return False - - uses = set() - visit_vars_inner(inst.value, find_vars, uses) - diff = uses.difference(dep_on_param) - if config.DEBUG_ARRAY_OPT >= 1: - print("_hoist_internal:", inst, "uses:", uses, "diff:", diff) - if len(diff) == 0 and is_pure(inst.value, None, call_table): - if config.DEBUG_ARRAY_OPT >= 1: - print("Will hoist instruction", inst, typemap[inst.target.name]) - hoisted.append(inst) - if not isinstance(typemap[inst.target.name], types.npytypes.Array): - dep_on_param += [inst.target.name] - return True - else: - if len(diff) > 0: - not_hoisted.append((inst, "dependency")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because of a dependency.") - else: - not_hoisted.append((inst, "not pure")) - if config.DEBUG_ARRAY_OPT >= 1: - print("Instruction", inst, " could not be hoisted because it isn't pure.") - return False - -def find_setitems_block(setitems, itemsset, block, typemap): - for inst in block.body: - if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): - setitems.add(inst.target.name) - # If we store a non-mutable object into an array then that is safe to hoist. - # If the stored object is mutable and you hoist then multiple entries in the - # outer array could reference the same object and changing one index would then - # change other indices. - if getattr(typemap[inst.value.name], "mutable", False): - itemsset.add(inst.value.name) - elif isinstance(inst, parfor.Parfor): - find_setitems_block(setitems, itemsset, inst.init_block, typemap) - find_setitems_body(setitems, itemsset, inst.loop_body, typemap) - -def find_setitems_body(setitems, itemsset, loop_body, typemap): - """ - Find the arrays that are written into (goes into setitems) and the - mutable objects (mostly arrays) that are written into other arrays - (goes into itemsset). - """ - for label, block in loop_body.items(): - find_setitems_block(setitems, itemsset, block, typemap) - -def hoist(parfor_params, loop_body, typemap, wrapped_blocks): - dep_on_param = copy.copy(parfor_params) - hoisted = [] - not_hoisted = [] - - # Compute the set of variable defined exactly once in the loop body. 
- def_once = compute_def_once(loop_body, typemap) - (call_table, reverse_call_table) = get_call_table(wrapped_blocks) - - setitems = set() - itemsset = set() - find_setitems_body(setitems, itemsset, loop_body, typemap) - dep_on_param = list(set(dep_on_param).difference(setitems)) - if config.DEBUG_ARRAY_OPT >= 1: - print("hoist - def_once:", def_once, "setitems:", - setitems, "itemsset:", itemsset, "dep_on_param:", - dep_on_param, "parfor_params:", parfor_params) - - for label, block in loop_body.items(): - new_block = [] - for inst in block.body: - if isinstance(inst, ir.Assign) and inst.target.name in def_once: - if _hoist_internal(inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instruction to the block since it is - # hoisted - continue - elif isinstance(inst, parfor.Parfor): - new_init_block = [] - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor") - inst.dump() - for ib_inst in inst.init_block.body: - if (isinstance(ib_inst, ir.Assign) and - ib_inst.target.name in def_once): - if _hoist_internal(ib_inst, dep_on_param, call_table, - hoisted, not_hoisted, typemap, itemsset): - # don't add this instuction to the block since it is hoisted - continue - new_init_block.append(ib_inst) - inst.init_block.body = new_init_block - - new_block.append(inst) - block.body = new_block - return hoisted, not_hoisted - From b8c11f77285ad77db688854cc039f74a29a8d0c3 Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Fri, 4 Dec 2020 15:41:15 +0300 Subject: [PATCH 05/24] Del dppl dir in tests (#43) * Del dppl dir in tests * Del unused var Co-authored-by: etotmeni --- numba_dppy/tests/__init__.py | 3 +-- numba_dppy/tests/dppl/__init__.py | 6 ------ numba_dppy/tests/{dppl => }/test_arg_accessor.py | 0 numba_dppy/tests/{dppl => }/test_arg_types.py | 0 numba_dppy/tests/{dppl => }/test_atomic_op.py | 0 numba_dppy/tests/{dppl => }/test_barrier.py | 0 numba_dppy/tests/{dppl => }/test_black_scholes.py | 0 numba_dppy/tests/{dppl => }/test_caching.py | 0 numba_dppy/tests/{dppl => }/test_device_array_args.py | 0 numba_dppy/tests/{dppl => }/test_dpctl_api.py | 0 numba_dppy/tests/{dppl => }/test_dpnp_functions.py | 0 numba_dppy/tests/{dppl => }/test_dppl_fallback.py | 0 numba_dppy/tests/{dppl => }/test_dppl_func.py | 0 numba_dppy/tests/{dppl => }/test_math_functions.py | 0 .../tests/{dppl => }/test_numpy_bit_twiddling_functions.py | 0 .../tests/{dppl => }/test_numpy_comparison_functions.py | 0 .../tests/{dppl => }/test_numpy_floating_functions.py | 0 numba_dppy/tests/{dppl => }/test_numpy_math_functions.py | 0 .../tests/{dppl => }/test_numpy_trigonomteric_functions.py | 0 numba_dppy/tests/{dppl => }/test_parfor_lower_message.py | 0 numba_dppy/tests/{dppl => }/test_prange.py | 0 numba_dppy/tests/{dppl => }/test_print.py | 0 numba_dppy/tests/{dppl => }/test_sum_reduction.py | 0 numba_dppy/tests/{dppl => }/test_vectorize.py | 0 numba_dppy/tests/{dppl => }/test_with_context.py | 0 25 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 numba_dppy/tests/dppl/__init__.py rename numba_dppy/tests/{dppl => }/test_arg_accessor.py (100%) rename numba_dppy/tests/{dppl => }/test_arg_types.py (100%) rename numba_dppy/tests/{dppl => }/test_atomic_op.py (100%) rename numba_dppy/tests/{dppl => }/test_barrier.py (100%) rename numba_dppy/tests/{dppl => }/test_black_scholes.py (100%) rename numba_dppy/tests/{dppl => }/test_caching.py (100%) rename numba_dppy/tests/{dppl => }/test_device_array_args.py (100%) rename numba_dppy/tests/{dppl => }/test_dpctl_api.py (100%) rename 
numba_dppy/tests/{dppl => }/test_dpnp_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_dppl_fallback.py (100%) rename numba_dppy/tests/{dppl => }/test_dppl_func.py (100%) rename numba_dppy/tests/{dppl => }/test_math_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_bit_twiddling_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_comparison_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_floating_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_math_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_numpy_trigonomteric_functions.py (100%) rename numba_dppy/tests/{dppl => }/test_parfor_lower_message.py (100%) rename numba_dppy/tests/{dppl => }/test_prange.py (100%) rename numba_dppy/tests/{dppl => }/test_print.py (100%) rename numba_dppy/tests/{dppl => }/test_sum_reduction.py (100%) rename numba_dppy/tests/{dppl => }/test_vectorize.py (100%) rename numba_dppy/tests/{dppl => }/test_with_context.py (100%) diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index d29208fb91..5a2199f149 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -8,10 +8,9 @@ def load_tests(loader, tests, pattern): suite = SerialSuite() - this_dir = dirname(__file__) if dppy_config.dppy_present: - suite.addTests(load_testsuite(loader, join(this_dir, 'dppl'))) + suite.addTests(load_testsuite(loader, dirname(__file__))) else: print("skipped DPPL tests") diff --git a/numba_dppy/tests/dppl/__init__.py b/numba_dppy/tests/dppl/__init__.py deleted file mode 100644 index cff5a36cc2..0000000000 --- a/numba_dppy/tests/dppl/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from numba.testing import SerialSuite -from numba.testing import load_testsuite -import os - -def load_tests(loader, tests, pattern): - return SerialSuite(load_testsuite(loader, os.path.dirname(__file__))) diff --git a/numba_dppy/tests/dppl/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py similarity index 100% rename from numba_dppy/tests/dppl/test_arg_accessor.py rename to numba_dppy/tests/test_arg_accessor.py diff --git a/numba_dppy/tests/dppl/test_arg_types.py b/numba_dppy/tests/test_arg_types.py similarity index 100% rename from numba_dppy/tests/dppl/test_arg_types.py rename to numba_dppy/tests/test_arg_types.py diff --git a/numba_dppy/tests/dppl/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py similarity index 100% rename from numba_dppy/tests/dppl/test_atomic_op.py rename to numba_dppy/tests/test_atomic_op.py diff --git a/numba_dppy/tests/dppl/test_barrier.py b/numba_dppy/tests/test_barrier.py similarity index 100% rename from numba_dppy/tests/dppl/test_barrier.py rename to numba_dppy/tests/test_barrier.py diff --git a/numba_dppy/tests/dppl/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py similarity index 100% rename from numba_dppy/tests/dppl/test_black_scholes.py rename to numba_dppy/tests/test_black_scholes.py diff --git a/numba_dppy/tests/dppl/test_caching.py b/numba_dppy/tests/test_caching.py similarity index 100% rename from numba_dppy/tests/dppl/test_caching.py rename to numba_dppy/tests/test_caching.py diff --git a/numba_dppy/tests/dppl/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py similarity index 100% rename from numba_dppy/tests/dppl/test_device_array_args.py rename to numba_dppy/tests/test_device_array_args.py diff --git a/numba_dppy/tests/dppl/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py similarity index 100% rename from 
numba_dppy/tests/dppl/test_dpctl_api.py rename to numba_dppy/tests/test_dpctl_api.py diff --git a/numba_dppy/tests/dppl/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_dpnp_functions.py rename to numba_dppy/tests/test_dpnp_functions.py diff --git a/numba_dppy/tests/dppl/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py similarity index 100% rename from numba_dppy/tests/dppl/test_dppl_fallback.py rename to numba_dppy/tests/test_dppl_fallback.py diff --git a/numba_dppy/tests/dppl/test_dppl_func.py b/numba_dppy/tests/test_dppl_func.py similarity index 100% rename from numba_dppy/tests/dppl/test_dppl_func.py rename to numba_dppy/tests/test_dppl_func.py diff --git a/numba_dppy/tests/dppl/test_math_functions.py b/numba_dppy/tests/test_math_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_math_functions.py rename to numba_dppy/tests/test_math_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_bit_twiddling_functions.py rename to numba_dppy/tests/test_numpy_bit_twiddling_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_comparison_functions.py rename to numba_dppy/tests/test_numpy_comparison_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_floating_functions.py rename to numba_dppy/tests/test_numpy_floating_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_math_functions.py rename to numba_dppy/tests/test_numpy_math_functions.py diff --git a/numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py similarity index 100% rename from numba_dppy/tests/dppl/test_numpy_trigonomteric_functions.py rename to numba_dppy/tests/test_numpy_trigonomteric_functions.py diff --git a/numba_dppy/tests/dppl/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py similarity index 100% rename from numba_dppy/tests/dppl/test_parfor_lower_message.py rename to numba_dppy/tests/test_parfor_lower_message.py diff --git a/numba_dppy/tests/dppl/test_prange.py b/numba_dppy/tests/test_prange.py similarity index 100% rename from numba_dppy/tests/dppl/test_prange.py rename to numba_dppy/tests/test_prange.py diff --git a/numba_dppy/tests/dppl/test_print.py b/numba_dppy/tests/test_print.py similarity index 100% rename from numba_dppy/tests/dppl/test_print.py rename to numba_dppy/tests/test_print.py diff --git a/numba_dppy/tests/dppl/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py similarity index 100% rename from numba_dppy/tests/dppl/test_sum_reduction.py rename to numba_dppy/tests/test_sum_reduction.py diff --git a/numba_dppy/tests/dppl/test_vectorize.py b/numba_dppy/tests/test_vectorize.py similarity index 100% rename from numba_dppy/tests/dppl/test_vectorize.py rename to numba_dppy/tests/test_vectorize.py diff --git a/numba_dppy/tests/dppl/test_with_context.py b/numba_dppy/tests/test_with_context.py similarity index 100% 
rename from numba_dppy/tests/dppl/test_with_context.py rename to numba_dppy/tests/test_with_context.py From 16fb9b8967a5814aeba2e5fb9459465abd668bf4 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 09:04:32 -0600 Subject: [PATCH 06/24] Revert "numba-dppy requires cffi" This reverts commit 776bf2228e2aef77ea9767ce2ae90ff204482230. --- conda-recipe/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 4967295c05..d8f6c1ecbb 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -23,7 +23,6 @@ requirements: run: - python - numba >=0.51 - - cffi - dpctl - spirv-tools - llvm-spirv From 56e969a0904c112b4a55cd1a75f76bb18a58f261 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Fri, 4 Dec 2020 09:06:12 -0600 Subject: [PATCH 07/24] Remove use of cffi --- numba_dppy/dppl_lowerer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/numba_dppy/dppl_lowerer.py b/numba_dppy/dppl_lowerer.py index 51fb072551..a317c990a6 100644 --- a/numba_dppy/dppl_lowerer.py +++ b/numba_dppy/dppl_lowerer.py @@ -979,14 +979,13 @@ def relatively_deep_copy(obj, memo): from numba.core.compiler import CompileResult from numba.np.ufunc.dufunc import DUFunc from ctypes import _CFuncPtr - from cffi.api import FFI from types import ModuleType from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, Dispatcher, ModuleType, Signature, DPPLFunctionTemplate, CompileResult, - DUFunc, _CFuncPtr, FFI, + DUFunc, _CFuncPtr, type, str, bool, type(None))): return obj From 0a56e08cd9d01c33be5f2b90de6bcece106d480d Mon Sep 17 00:00:00 2001 From: Elena Totmenina Date: Mon, 7 Dec 2020 22:44:44 +0300 Subject: [PATCH 08/24] Rename dppl to dppy (#42) Co-authored-by: etotmeni Co-authored-by: Diptorup Deb --- .gitignore | 27 +++ HowTo.rst | 6 +- numba_dppy/CHANGE_LOG | 4 +- numba_dppy/__init__.py | 52 +++--- numba_dppy/compiler.py | 103 +++++------ numba_dppy/decorators.py | 14 +- numba_dppy/descriptor.py | 20 +-- numba_dppy/dispatcher.py | 22 +-- ...n_call_gen.py => dppy_host_fn_call_gen.py} | 14 +- .../{dppl_lowerer.py => dppy_lowerer.py} | 52 +++--- ...spatcher.py => dppy_offload_dispatcher.py} | 12 +- ...ppl_passbuilder.py => dppy_passbuilder.py} | 34 ++-- numba_dppy/{dppl_passes.py => dppy_passes.py} | 30 ++-- .../examples/{dppl_func.py => dppy_func.py} | 10 +- ...l_with_context.py => dppy_with_context.py} | 2 +- numba_dppy/examples/matmul.py | 12 +- numba_dppy/examples/pairwise_distance.py | 6 +- numba_dppy/examples/sum-hybrid.py | 10 +- numba_dppy/examples/sum.py | 8 +- numba_dppy/examples/sum2D.py | 10 +- numba_dppy/examples/sum_ndarray.py | 6 +- numba_dppy/examples/sum_reduction.py | 8 +- numba_dppy/examples/sum_reduction_ocl.py | 16 +- .../examples/sum_reduction_recursive_ocl.py | 16 +- .../experimental_numpy_lowering_overload.py | 12 +- numba_dppy/initialize.py | 8 +- numba_dppy/ocl/atomics/atomic_ops.cl | 56 +++--- numba_dppy/ocl/ocldecl.py | 44 ++--- numba_dppy/ocl/oclimpl.py | 22 +-- numba_dppy/ocl/stubs.py | 6 +- numba_dppy/printimpl.py | 4 +- numba_dppy/target.py | 26 +-- numba_dppy/target_dispatcher.py | 12 +- numba_dppy/testing.py | 12 +- numba_dppy/tests/__init__.py | 6 +- numba_dppy/tests/test_arg_accessor.py | 18 +- numba_dppy/tests/test_arg_types.py | 26 +-- numba_dppy/tests/test_atomic_op.py | 168 +++++++++--------- numba_dppy/tests/test_barrier.py | 34 ++-- 
numba_dppy/tests/test_black_scholes.py | 14 +- numba_dppy/tests/test_caching.py | 14 +- numba_dppy/tests/test_device_array_args.py | 16 +- numba_dppy/tests/test_dpctl_api.py | 4 +- numba_dppy/tests/test_dpnp_functions.py | 6 +- numba_dppy/tests/test_dppl_fallback.py | 26 +-- numba_dppy/tests/test_dppl_func.py | 32 ++-- numba_dppy/tests/test_math_functions.py | 76 ++++---- .../test_numpy_bit_twiddling_functions.py | 6 +- .../tests/test_numpy_comparison_functions.py | 6 +- .../tests/test_numpy_floating_functions.py | 6 +- numba_dppy/tests/test_numpy_math_functions.py | 6 +- .../test_numpy_trigonomteric_functions.py | 6 +- numba_dppy/tests/test_parfor_lower_message.py | 8 +- numba_dppy/tests/test_prange.py | 14 +- numba_dppy/tests/test_print.py | 16 +- numba_dppy/tests/test_sum_reduction.py | 12 +- numba_dppy/tests/test_vectorize.py | 6 +- numba_dppy/tests/test_with_context.py | 16 +- 58 files changed, 634 insertions(+), 604 deletions(-) create mode 100644 .gitignore rename numba_dppy/{dppl_host_fn_call_gen.py => dppy_host_fn_call_gen.py} (98%) rename numba_dppy/{dppl_lowerer.py => dppy_lowerer.py} (97%) rename numba_dppy/{dppl_offload_dispatcher.py => dppy_offload_dispatcher.py} (73%) rename numba_dppy/{dppl_passbuilder.py => dppy_passbuilder.py} (82%) rename numba_dppy/{dppl_passes.py => dppy_passes.py} (95%) rename numba_dppy/examples/{dppl_func.py => dppy_func.py} (81%) rename numba_dppy/examples/{dppl_with_context.py => dppy_with_context.py} (94%) diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..340ae2678b --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +*.pyc +*.o +*.so +*.dylib +*.pyd +*.pdb +*.egg-info +*.sw[po] +*.out +*.ll +.coverage +.nfs* +tags +MANIFEST + +build/ +docs/_build/ +docs/gh-pages/ +dist/ +htmlcov/ +.idea/ +.vscode/ +.mypy_cache/ +.ipynb_checkpoints/ +__pycache__/ + +docs/source/developer/autogen* diff --git a/HowTo.rst b/HowTo.rst index 03927c0ea7..7689bc52bf 100644 --- a/HowTo.rst +++ b/HowTo.rst @@ -7,7 +7,7 @@ are listed below with the help of sample code snippets. In this release we have the implementation of the OAK approach described in MS138 in section 4.3.2. The new decorator is described below. -To access the features driver module have to be imported from numba_dppy.dppl_driver +To access the features driver module have to be imported from numba_dppy.dppy_driver New Decorator ============= @@ -61,7 +61,7 @@ Primitive types are passed by value to the kernel, currently supported are int, Math Kernels ============ -This release has support for math kernels. See numba_dppy/tests/dppl/test_math_functions.py +This release has support for math kernels. See numba_dppy/tests/dppy/test_math_functions.py for more details. @@ -170,6 +170,6 @@ Testing All examples can be found in numba_dppy/examples/ -All tests can be found in numba_dppy/tests/dppl and can be triggered by the following command: +All tests can be found in numba_dppy/tests/dppy and can be triggered by the following command: ``python -m numba.runtests numba_dppy.tests`` diff --git a/numba_dppy/CHANGE_LOG b/numba_dppy/CHANGE_LOG index e3cb06522c..2a1fcdee40 100644 --- a/numba_dppy/CHANGE_LOG +++ b/numba_dppy/CHANGE_LOG @@ -1,7 +1,7 @@ -NUMBA Version 0.48.0 + DPPL Version 0.3.0 (June 29, 2020) +NUMBA Version 0.48.0 + DPPY Version 0.3.0 (June 29, 2020) -------------------------------------------------------- This release includes: -* Caching of dppl.kernels which will improve performance. +* Caching of dppy.kernels which will improve performance. 
* Addition of support for Intel Advisor which will help in profiling applications. diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index 6eff949d16..ac4e898889 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -4,9 +4,9 @@ Extensions to Numba for Intel GPUs introduce two new features into Numba: - a. A new backend that has a new decorator called @dppl.kernel that + a. A new backend that has a new decorator called @dppy.kernel that exposes an explicit kernel programming interface similar to the - existing Numba GPU code-generation backends. The @dppl.kernel + existing Numba GPU code-generation backends. The @dppy.kernel decorator currently implements a subset of OpenCL’s API through Numba’s intrinsic functions. @@ -20,48 +20,48 @@ Explicit Kernel Prgoramming with new Docorators: -@dppl.kernel +@dppy.kernel - The @dppl.kernel decorator can be used with or without extra arguments. + The @dppy.kernel decorator can be used with or without extra arguments. Optionally, users can pass the signature of the arguments to the decorator. When a signature is provided to the DK decorator the version of the OpenCL kernel generated gets specialized for that type signature. --------------------------------------------------------------------------- - @dppl.kernel + @dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] --------------------------------------------------------------------------- To invoke the above function users will need to provide a global size (OpenCL) which is the size of a (same as b and c) and a - local size (dppl.DEFAULT_LOCAL_SIZE if user don't want to specify). + local size (dppy.DEFAULT_LOCAL_SIZE if user don't want to specify). Example shown below: --------------------------------------------------------------------------- - data_parallel_sum[len(a), dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC) + data_parallel_sum[len(a), dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC) --------------------------------------------------------------------------- -@dppl.func +@dppy.func - The @dppl.func decorator is the other decorator provided in the explicit + The @dppy.func decorator is the other decorator provided in the explicit kernel programming model. This decorator allows users to write “device” functions that can be invoked from inside DK functions but cannot be invoked from the host. The decorator also supports type specialization as with the - DK decorator. Functions decorated with @dppl.func will also be JIT compiled - and inlined into the OpenCL Program containing the @dppl.kernel function - calling it. A @dppl.func will not be launched as an OpenCL kernel. + DK decorator. Functions decorated with @dppy.func will also be JIT compiled + and inlined into the OpenCL Program containing the @dppy.kernel function + calling it. A @dppy.func will not be launched as an OpenCL kernel. --------------------------------------------------------------------------- - @dppl.func + @dppy.func def bar(a): return a*a - @dppl.kernel + @dppy.kernel def foo(in, out): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) out[i] = bar(in[i]) --------------------------------------------------------------------------- @@ -71,13 +71,13 @@ def foo(in, out): The following table has the list of intrinsic functions that can be directly used inside a DK function. All the functions are equivalent to the similarly named OpenCL function. 
Wherever there is an implementation difference - between the Numba-PyDPPL version and the OpenCL version, the difference is + between the Numba-DPPY version and the OpenCL version, the difference is explained in table. Note that these functions cannot be used anywhere else outside of a DK function in a Numba application. Readers are referred to the OpenCL API specs to review the functionality of each function. +----------------------+----------------------------+----------------------+ - | Numba-DPPL intrinsic | Equivalent OpenCL function | Notes | + | Numba-DPPY intrinsic | Equivalent OpenCL function | Notes | +----------------------+----------------------------+----------------------+ | get_global_id | get_global_id | | +----------------------+----------------------------+----------------------+ @@ -121,7 +121,7 @@ def foo(in, out): |print |print(varargs) |The print function is a | | | |subset of the OpenCL | | | |printf function. The | - | | |Numba-DPPL version of | + | | |Numba-DPPY version of | | | |print supports only int, | | | |string, and float | | | |arguments. | @@ -160,16 +160,16 @@ def foo(in, out): -Complete Example using @dppl.kernel: +Complete Example using @dppy.kernel: --------------------------------------------------------------------------- import numpy as np - import numba_dppy, numba_dppy as dppl + import numba_dppy, numba_dppy as dppy import dpctl - @dppl.kernel + @dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def driver(device_env, a, b, c, global_size): @@ -181,7 +181,7 @@ def driver(device_env, a, b, c, global_size): print("before : ", dA._ndarray) print("before : ", dB._ndarray) print("before : ", dC._ndarray) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](dA, dB, dC) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](dA, dB, dC) device_env.copy_array_from_device(dC) print("after : ", dC._ndarray) @@ -509,11 +509,11 @@ def main(): if dppy_present: from .device_init import * else: - raise ImportError("Importing dppl failed") + raise ImportError("Importing numba-dppy failed") def test(*args, **kwargs): if not dppy_present and not is_available(): - dppl_error() + dppy_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index 736cd96a26..c8a329738a 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -2,7 +2,7 @@ import copy from collections import namedtuple -from .dppl_passbuilder import DPPLPassBuilder +from .dppy_passbuilder import DPPYPassBuilder from numba.core.typing.templates import ConcreteTemplate from numba.core import types, compiler, ir from numba.core.typing.templates import AbstractTemplate @@ -12,6 +12,7 @@ import dpctl import dpctl.memory as dpctl_mem +import dpctl.program as dpctl_prog import numpy as np from . import spirv_generator @@ -19,10 +20,10 @@ import os from numba.core.compiler import DefaultPassBuilder, CompilerBase -DEBUG=os.environ.get('NUMBA_DPPL_DEBUG', None) -_NUMBA_DPPL_READ_ONLY = "read_only" -_NUMBA_DPPL_WRITE_ONLY = "write_only" -_NUMBA_DPPL_READ_WRITE = "read_write" +DEBUG=os.environ.get('NUMBA_DPPY_DEBUG', None) +_NUMBA_DPPY_READ_ONLY = "read_only" +_NUMBA_DPPY_WRITE_ONLY = "write_only" +_NUMBA_DPPY_READ_WRITE = "read_write" def _raise_no_device_found_error(): error_message = ("No OpenCL device specified. 
" @@ -30,7 +31,7 @@ def _raise_no_device_found_error(): raise ValueError(error_message) def _raise_invalid_kernel_enqueue_args(): - error_message = ("Incorrect number of arguments for enquing dppl.kernel. " + error_message = ("Incorrect number of arguments for enquing dppy.kernel. " "Usage: device_env, global size, local size. " "The local size argument is optional.") raise ValueError(error_message) @@ -51,15 +52,15 @@ def get_ordered_arg_access_types(pyfunc, access_types): return ordered_arg_access_types -class DPPLCompiler(CompilerBase): - """ DPPL Compiler """ +class DPPYCompiler(CompilerBase): + """ DPPY Compiler """ def define_pipelines(self): # this maintains the objmode fallback behaviour pms = [] if not self.state.flags.force_pyobject: - #print("Numba-DPPL [INFO]: Using Numba-DPPL pipeline") - pms.append(DPPLPassBuilder.define_nopython_pipeline(self.state)) + #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") + pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state)) if self.state.status.can_fallback or self.state.flags.force_pyobject: pms.append( DefaultPassBuilder.define_objectmode_pipeline(self.state) @@ -71,12 +72,12 @@ def define_pipelines(self): return pms -def compile_with_dppl(pyfunc, return_type, args, debug): +def compile_with_dppy(pyfunc, return_type, args, debug): # First compilation will trigger the initialization of the OpenCL backend. - from .descriptor import dppl_target + from .descriptor import dppy_target - typingctx = dppl_target.typing_context - targetctx = dppl_target.target_context + typingctx = dppy_target.typing_context + targetctx = dppy_target.target_context # TODO handle debug flag flags = compiler.Flags() # Do not compile (generate native code), just lower (to LLVM) @@ -93,7 +94,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) elif isinstance(pyfunc, ir.FunctionIR): cres = compiler.compile_ir(typingctx=typingctx, targetctx=targetctx, @@ -102,7 +103,7 @@ def compile_with_dppl(pyfunc, return_type, args, debug): return_type=return_type, flags=flags, locals={}, - pipeline_class=DPPLCompiler) + pipeline_class=DPPYCompiler) else: assert(0) # Linking depending libraries @@ -120,7 +121,7 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # This will be get_current_queue sycl_queue = dpctl.get_current_queue() - cres = compile_with_dppl(pyfunc, None, args, debug=debug) + cres = compile_with_dppy(pyfunc, None, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) # The kernel objet should have a reference to the target context it is compiled for. @@ -128,7 +129,7 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # depending on the target context. For example, we want to link our kernel object # with implementation containing atomic operations only when atomic operations # are being used in the kernel. 
- oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -146,7 +147,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, if isinstance(a, types.npytypes.Array): print("addrspace:", a.addrspace) - cres = compile_with_dppl(func_ir, None, args_with_addrspaces, + cres = compile_with_dppy(func_ir, None, args_with_addrspaces, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) @@ -159,7 +160,7 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) #kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) - oclkern = DPPLKernel(context=cres.target_context, + oclkern = DPPYKernel(context=cres.target_context, sycl_queue=sycl_queue, llvm_module=kernel.module, name=kernel.name, @@ -168,44 +169,44 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, return oclkern -def compile_dppl_func(pyfunc, return_type, args, debug=False): - cres = compile_with_dppl(pyfunc, return_type, args, debug=debug) +def compile_dppy_func(pyfunc, return_type, args, debug=False): + cres = compile_with_dppy(pyfunc, return_type, args, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) - devfn = DPPLFunction(cres) + devfn = DPPYFunction(cres) - class dppl_function_template(ConcreteTemplate): + class dppy_function_template(ConcreteTemplate): key = devfn cases = [cres.signature] - cres.typing_context.insert_user_function(devfn, dppl_function_template) + cres.typing_context.insert_user_function(devfn, dppy_function_template) libs = [cres.library] cres.target_context.insert_user_function(devfn, cres.fndesc, libs) return devfn -# Compile dppl function template -def compile_dppl_func_template(pyfunc): - """Compile a DPPLFunctionTemplate +# Compile dppy function template +def compile_dppy_func_template(pyfunc): + """Compile a DPPYFunctionTemplate """ - from .descriptor import dppl_target + from .descriptor import dppy_target - dft = DPPLFunctionTemplate(pyfunc) + dft = DPPYFunctionTemplate(pyfunc) - class dppl_function_template(AbstractTemplate): + class dppy_function_template(AbstractTemplate): key = dft def generic(self, args, kws): assert not kws return dft.compile(args) - typingctx = dppl_target.typing_context - typingctx.insert_user_function(dft, dppl_function_template) + typingctx = dppy_target.typing_context + typingctx.insert_user_function(dft, dppy_function_template) return dft -class DPPLFunctionTemplate(object): - """Unmaterialized dppl function +class DPPYFunctionTemplate(object): + """Unmaterialized dppy function """ def __init__(self, pyfunc, debug=False): self.py_func = pyfunc @@ -220,7 +221,7 @@ def compile(self, args): this object. 
""" if args not in self._compileinfos: - cres = compile_with_dppl(self.py_func, None, args, debug=self.debug) + cres = compile_with_dppy(self.py_func, None, args, debug=self.debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) cres.target_context.mark_ocl_device(func) first_definition = not self._compileinfos @@ -240,7 +241,7 @@ def compile(self, args): return cres.signature -class DPPLFunction(object): +class DPPYFunction(object): def __init__(self, cres): self.cres = cres @@ -282,7 +283,7 @@ def _ensure_valid_work_group_size(val, work_item_grid): return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic -class DPPLKernelBase(object): +class DPPYKernelBase(object): """Define interface for configurable kernels """ @@ -293,9 +294,9 @@ def __init__(self): # list of supported access types, stored in dict for fast lookup self.valid_access_types = { - _NUMBA_DPPL_READ_ONLY: _NUMBA_DPPL_READ_ONLY, - _NUMBA_DPPL_WRITE_ONLY: _NUMBA_DPPL_WRITE_ONLY, - _NUMBA_DPPL_READ_WRITE: _NUMBA_DPPL_READ_WRITE} + _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, + _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, + _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE} def copy(self): return copy.copy(self) @@ -331,14 +332,14 @@ def __getitem__(self, args): return self.configure(sycl_queue, gs, ls) -class DPPLKernel(DPPLKernelBase): +class DPPYKernel(DPPYKernelBase): """ A OCL kernel object """ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, ordered_arg_access_types=None): - super(DPPLKernel, self).__init__() + super(DPPYKernel, self).__init__() self._llvm_module = llvm_module self.assembly = self.binary = llvm_module.__str__() self.entry_name = name @@ -355,7 +356,7 @@ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, self.spirv_bc = spirv_generator.llvm_to_spirv(self.context, self.binary) # create a program - self.program = dpctl.create_program_from_spirv(self.sycl_queue, self.spirv_bc) + self.program = dpctl_prog.create_program_from_spirv(self.sycl_queue, self.spirv_bc) # create a kernel self.kernel = self.program.get_sycl_kernel(self.entry_name) @@ -385,7 +386,7 @@ def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type): """ if (device_arr and (access_type not in self.valid_access_types or access_type in self.valid_access_types and - self.valid_access_types[access_type] != _NUMBA_DPPL_READ_ONLY)): + self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY)): # we get the date back to host if have created a # device_array or if access_type of this device_array # is not of type read_only and read_write @@ -431,8 +432,8 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, usm_ndarr = np.ndarray(val.shape, buffer=usm_buf, dtype=val.dtype) if (default_behavior or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_ONLY or - self.valid_access_types[access_type] == _NUMBA_DPPL_READ_WRITE): + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY or + self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE): np.copyto(usm_ndarr, val) device_arrs[-1] = (usm_buf, usm_ndarr, val) @@ -486,18 +487,18 @@ def check_for_invalid_access_type(self, access_type): return False -class JitDPPLKernel(DPPLKernelBase): +class JitDPPYKernel(DPPYKernelBase): def __init__(self, func, access_types): - super(JitDPPLKernel, self).__init__() + super(JitDPPYKernel, self).__init__() self.py_func = func self.definitions = {} self.access_types = access_types - from .descriptor import dppl_target + from 
.descriptor import dppy_target - self.typingctx = dppl_target.typing_context + self.typingctx = dppy_target.typing_context def __call__(self, *args, **kwargs): assert not kwargs, "Keyword Arguments are not supported" diff --git a/numba_dppy/decorators.py b/numba_dppy/decorators.py index a8b6bbba36..641d924134 100644 --- a/numba_dppy/decorators.py +++ b/numba_dppy/decorators.py @@ -1,11 +1,11 @@ from __future__ import print_function, absolute_import, division from numba.core import sigutils, types -from .compiler import (compile_kernel, JitDPPLKernel, compile_dppl_func_template, - compile_dppl_func, get_ordered_arg_access_types) +from .compiler import (compile_kernel, JitDPPYKernel, compile_dppy_func_template, + compile_dppy_func, get_ordered_arg_access_types) def kernel(signature=None, access_types=None, debug=False): - """JIT compile a python function conforming using the DPPL backend. + """JIT compile a python function conforming using the DPPY backend. A kernel is equvalent to an OpenCL kernel function, and has the same restrictions as definined by SPIR_KERNEL calling convention. @@ -22,14 +22,14 @@ def kernel(signature=None, access_types=None, debug=False): def autojit(debug=False, access_types=None): def _kernel_autojit(pyfunc): ordered_arg_access_types = get_ordered_arg_access_types(pyfunc, access_types) - return JitDPPLKernel(pyfunc, ordered_arg_access_types) + return JitDPPYKernel(pyfunc, ordered_arg_access_types) return _kernel_autojit def _kernel_jit(signature, debug, access_types): argtypes, restype = sigutils.normalize_signature(signature) if restype is not None and restype != types.void: - msg = ("DPPL kernel must have void return type but got {restype}") + msg = ("DPPY kernel must have void return type but got {restype}") raise TypeError(msg.format(restype=restype)) def _wrapped(pyfunc): @@ -54,9 +54,9 @@ def _func_jit(signature): argtypes, restype = sigutils.normalize_signature(signature) def _wrapped(pyfunc): - return compile_dppl_func(pyfunc, restype, argtypes) + return compile_dppy_func(pyfunc, restype, argtypes) return _wrapped def _func_autojit(pyfunc): - return compile_dppl_func_template(pyfunc) + return compile_dppy_func_template(pyfunc) diff --git a/numba_dppy/descriptor.py b/numba_dppy/descriptor.py index c0a24868c2..c8e6a58ec7 100644 --- a/numba_dppy/descriptor.py +++ b/numba_dppy/descriptor.py @@ -3,41 +3,41 @@ from numba.core.options import TargetOptions from numba.core import dispatcher, utils, typing -from .target import DPPLTargetContext, DPPLTypingContext +from .target import DPPYTargetContext, DPPYTypingContext from numba.core.cpu import CPUTargetOptions -class DPPLTarget(TargetDescriptor): +class DPPYTarget(TargetDescriptor): options = CPUTargetOptions - #typingctx = DPPLTypingContext() - #targetctx = DPPLTargetContext(typingctx) + #typingctx = DPPYTypingContext() + #targetctx = DPPYTargetContext(typingctx) @utils.cached_property def _toplevel_target_context(self): # Lazily-initialized top-level target context, for all threads - return DPPLTargetContext(self.typing_context) + return DPPYTargetContext(self.typing_context) @utils.cached_property def _toplevel_typing_context(self): # Lazily-initialized top-level typing context, for all threads - return DPPLTypingContext() + return DPPYTypingContext() @property def target_context(self): """ - The target context for DPPL targets. + The target context for DPPY targets. """ return self._toplevel_target_context @property def typing_context(self): """ - The typing context for DPPL targets. 
+ The typing context for DPPY targets. """ return self._toplevel_typing_context -# The global DPPL target -dppl_target = DPPLTarget() +# The global DPPY target +dppy_target = DPPYTarget() diff --git a/numba_dppy/dispatcher.py b/numba_dppy/dispatcher.py index a4c32ec7ec..d00a597875 100644 --- a/numba_dppy/dispatcher.py +++ b/numba_dppy/dispatcher.py @@ -4,17 +4,17 @@ #from numba.targets.descriptors import TargetDescriptor #from numba.targets.options import TargetOptions -#import numba_dppy, numba_dppy as dppl +#import numba_dppy, numba_dppy as dppy from numba_dppy import kernel, autojit -from .descriptor import dppl_target +from .descriptor import dppy_target #from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, # GUFuncCallSteps) from .. import dispatcher, utils, typing -from .compiler import DPPLCompiler +from .compiler import DPPYCompiler -class DPPLDispatcher(dispatcher.Dispatcher): - targetdescr = dppl_target +class DPPYDispatcher(dispatcher.Dispatcher): + targetdescr = dppy_target def __init__(self, py_func, locals={}, targetoptions={}): @@ -58,7 +58,7 @@ def __getitem__(self, *args): def __getattr__(self, key): return getattr(self.compiled, key) -class DPPLUFuncDispatcher(object): +class DPPYUFuncDispatcher(object): """ Invoke the OpenCL ufunc specialization for the given inputs. """ @@ -86,7 +86,7 @@ def __call__(self, *args, **kws): depending on the input arguments. Type must match the input arguments. """ - return DPPLUFuncMechanism.call(self.functions, args, kws) + return DPPYUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ @@ -142,7 +142,7 @@ def __reduce(self, mem, gpu_mems, stream): return left -class _DPPLGUFuncCallSteps(GUFuncCallSteps): +class _DPPYGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ '_stream', ] @@ -167,10 +167,10 @@ def launch_kernel(self, kernel, nelem, args): kernel.forall(nelem, queue=self._stream)(*args) -class DPPLGenerializedUFunc(GenerializedUFunc): +class DPPYGenerializedUFunc(GenerializedUFunc): @property def _call_steps(self): - return _DPPLGUFuncCallSteps + return _DPPYGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): return devicearray.DeviceNDArray(shape=shape, @@ -188,7 +188,7 @@ def _broadcast_add_axis(self, ary, newshape): gpu_data=ary.gpu_data) -class DPPLUFuncMechanism(UFuncMechanism): +class DPPYUFuncMechanism(UFuncMechanism): """ Provide OpenCL specialization """ diff --git a/numba_dppy/dppl_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py similarity index 98% rename from numba_dppy/dppl_host_fn_call_gen.py rename to numba_dppy/dppy_host_fn_call_gen.py index 10a4820906..7d1c9bcea4 100644 --- a/numba_dppy/dppl_host_fn_call_gen.py +++ b/numba_dppy/dppy_host_fn_call_gen.py @@ -9,7 +9,7 @@ from numba.core.ir_utils import legalize_names -class DPPLHostFunctionCallsGenerator(object): +class DPPYHostFunctionCallsGenerator(object): def __init__(self, lowerer, cres, num_inputs): self.lowerer = lowerer self.context = self.lowerer.context @@ -70,31 +70,31 @@ def _init_llvm_types_and_constants(self): def _declare_functions(self): get_queue_fnty = lc.Type.function(self.void_ptr_t, ()) self.get_queue = self.builder.module.get_or_insert_function(get_queue_fnty, - name="DPPLQueueMgr_GetCurrentQueue") + name="DPCTLQueueMgr_GetCurrentQueue") submit_range_fnty = lc.Type.function(self.void_ptr_t, [self.void_ptr_t, self.void_ptr_t, self.void_ptr_ptr_t, self.int32_ptr_t, self.intp_t, self.intp_ptr_t, self.intp_t, 
self.void_ptr_t, self.intp_t]) self.submit_range = self.builder.module.get_or_insert_function(submit_range_fnty, - name="DPPLQueue_SubmitRange") + name="DPCTLQueue_SubmitRange") queue_memcpy_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t]) self.queue_memcpy = self.builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPPLQueue_Memcpy") + name="DPCTLQueue_Memcpy") queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) self.queue_wait = self.builder.module.get_or_insert_function(queue_wait_fnty, - name="DPPLQueue_Wait") + name="DPCTLQueue_Wait") usm_shared_fnty = lc.Type.function(self.void_ptr_t, [self.intp_t, self.void_ptr_t]) self.usm_shared = self.builder.module.get_or_insert_function(usm_shared_fnty, - name="DPPLmalloc_shared") + name="DPCTLmalloc_shared") usm_free_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t]) self.usm_free = self.builder.module.get_or_insert_function(usm_free_fnty, - name="DPPLfree_with_queue") + name="DPCTLfree_with_queue") def allocate_kenrel_arg_array(self, num_kernel_args): self.sycl_queue_val = cgutils.alloca_once(self.builder, self.void_ptr_t) diff --git a/numba_dppy/dppl_lowerer.py b/numba_dppy/dppy_lowerer.py similarity index 97% rename from numba_dppy/dppl_lowerer.py rename to numba_dppy/dppy_lowerer.py index a317c990a6..1561a6d85e 100644 --- a/numba_dppy/dppl_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -12,7 +12,7 @@ from numba.core import (compiler, ir, types, sigutils, lowering, funcdesc, config) from numba.parfors import parfor -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba.core.ir_utils import (add_offset_to_labels, replace_var_names, remove_dels, @@ -38,9 +38,9 @@ from numba.core.errors import NumbaParallelSafetyWarning, NumbaPerformanceWarning from .dufunc_inliner import dufunc_inliner -from . import dppl_host_fn_call_gen as dppl_call_gen +from . import dppy_host_fn_call_gen as dppy_call_gen import dpctl -from numba_dppy.target import DPPLTargetContext +from numba_dppy.target import DPPYTargetContext def _print_block(block): @@ -72,7 +72,7 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): for eachdim in range(global_id_dim): gufunc_txt += (" " + legal_loop_indices[eachdim] + " = " - + "dppl.get_global_id(" + str(eachdim) + ")\n") + + "dppy.get_global_id(" + str(eachdim) + ")\n") for eachdim in range(global_id_dim, for_loop_dim): @@ -444,7 +444,7 @@ def print_arg_with_addrspaces(args): print("gufunc_txt = ", type(gufunc_txt), "\n", gufunc_txt) sys.stdout.flush() # Force gufunc outline into existence. - globls = {"np": np, "numba": numba, "dppl": dppl} + globls = {"np": np, "numba": numba, "dppy": dppy} locls = {} exec(gufunc_txt, globls, locls) gufunc_func = locls[gufunc_name] @@ -740,7 +740,7 @@ def _lower_parfor_gufunc(lowerer, parfor): parfor.races, typemap) - generate_dppl_host_wrapper( + generate_dppy_host_wrapper( lowerer, func, gu_signature, @@ -828,10 +828,10 @@ def bump_alpha(c, class_map): return (gu_sin, gu_sout) -# Keep all the dppl kernels and programs created alive indefinitely. +# Keep all the dppy kernels and programs created alive indefinitely. 
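# (Presumably so the SYCL program and kernel objects are not garbage collected
# while previously enqueued work might still reference them.)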
keep_alive_kernels = [] -def generate_dppl_host_wrapper(lowerer, +def generate_dppy_host_wrapper(lowerer, cres, gu_signature, outer_sig, @@ -852,7 +852,7 @@ def generate_dppl_host_wrapper(lowerer, num_dim = len(loop_ranges) if config.DEBUG_ARRAY_OPT: - print("generate_dppl_host_wrapper") + print("generate_dppy_host_wrapper") print("args = ", expr_args) print("outer_sig = ", outer_sig.args, outer_sig.return_type, outer_sig.recvr, outer_sig.pysig) @@ -868,8 +868,8 @@ def generate_dppl_host_wrapper(lowerer, # print("cres.fndesc", cres.fndesc, type(cres.fndesc)) - # get dppl_cpu_portion_lowerer object - dppl_cpu_lowerer = dppl_call_gen.DPPLHostFunctionCallsGenerator( + # get dppy_cpu_portion_lowerer object + dppy_cpu_lowerer = dppy_call_gen.DPPYHostFunctionCallsGenerator( lowerer, cres, num_inputs) # Compute number of args ------------------------------------------------ @@ -886,7 +886,7 @@ def generate_dppl_host_wrapper(lowerer, # now that we know the total number of kernel args, lets allocate # a kernel_arg array - dppl_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) + dppy_cpu_lowerer.allocate_kenrel_arg_array(num_expanded_args) ninouts = len(expr_args) @@ -931,7 +931,7 @@ def val_type_or_none(context, lowerer, x): "\n\tval_type:", val_type, type(val_type), "\n\tindex:", index) - dppl_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, + dppy_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays) # ----------------------------------------------------------------------- @@ -951,7 +951,7 @@ def load_range(v): step = load_range(step) loop_ranges[i] = (start, stop, step) - dppl_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) + dppy_cpu_lowerer.enqueue_kernel_and_read_back(loop_ranges) from numba.core.lowering import Lower @@ -975,7 +975,7 @@ def relatively_deep_copy(obj, memo): from numba.core.types.functions import Function, Dispatcher from numba.core.bytecode import FunctionIdentity from numba.core.typing.templates import Signature - from numba_dppy.compiler import DPPLFunctionTemplate + from numba_dppy.compiler import DPPYFunctionTemplate from numba.core.compiler import CompileResult from numba.np.ufunc.dufunc import DUFunc from ctypes import _CFuncPtr @@ -983,9 +983,9 @@ def relatively_deep_copy(obj, memo): from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. 
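    # These are shared handles (dispatchers, modules, Numba types, compile results,
    # ctypes/ufunc wrappers) that either must keep their identity or cannot be
    # deep-copied safely, so they are returned unchanged.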
- if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, Dispatcher, ModuleType, - Signature, DPPLFunctionTemplate, CompileResult, - DUFunc, _CFuncPtr, + if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, + Dispatcher, ModuleType, Signature, + DPPYFunctionTemplate, CompileResult, DUFunc, _CFuncPtr, type, str, bool, type(None))): return obj @@ -1132,7 +1132,7 @@ def get_slots_members(obj): return cpy -class DPPLLower(Lower): +class DPPYLower(Lower): def __init__(self, context, library, fndesc, func_ir, metadata=None): Lower.__init__(self, context, library, fndesc, func_ir, metadata) memo = {} @@ -1141,7 +1141,7 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None): func_ir_cpu = relatively_deep_copy(func_ir, memo) - cpu_context = context.cpu_context if isinstance(context, DPPLTargetContext) else context + cpu_context = context.cpu_context if isinstance(context, DPPYTargetContext) else context self.gpu_lower = Lower(context, library, fndesc, func_ir, metadata) self.cpu_lower = Lower(cpu_context, library, fndesc_cpu, func_ir_cpu, metadata) @@ -1151,11 +1151,11 @@ def lower(self): # 1. Start lowering of parent function # 2. Try to lower parfor on GPU # 2.a. enter lower_parfor_rollback and prepare function to lower on GPU - insert get_global_id. - # 2.a.a. starting lower parfor body - enter this point (DPPLLower.lower()) second time. + # 2.a.a. starting lower parfor body - enter this point (DPPYLower.lower()) second time. # 2.a.b. If lowering on GPU failed - try on CPU. # 2.a.d. Since get_global_id is NOT supported with CPU context - fail and throw exception # 2.b. in lower_parfor_rollback catch exception and restore parfor body and other to its initial state - # 2.c. in lower_parfor_rollback throw expeption to catch it here (DPPLLower.lower()) + # 2.c. in lower_parfor_rollback throw expeption to catch it here (DPPYLower.lower()) # 3. Catch exception and start parfor lowering with CPU context. # WARNING: this approach only works in case no device specific modifications were added to @@ -1169,7 +1169,7 @@ def lower(self): lowering.lower_extensions[parfor.Parfor].pop() except Exception as e: if numba_dppy.compiler.DEBUG: - print("Failed to lower parfor on DPPL-device. Due to:\n", e) + print("Failed to lower parfor on DPPY-device. 
Due to:\n", e) lowering.lower_extensions[parfor.Parfor].pop() if (lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel): self.cpu_lower.lower() @@ -1195,13 +1195,13 @@ def lower_parfor_rollback(lowerer, parfor): try: _lower_parfor_gufunc(lowerer, parfor) if numba_dppy.compiler.DEBUG: - msg = "Parfor lowered on DPPL-device" + msg = "Parfor lowered on DPPY-device" print(msg, parfor.loc) except Exception as e: - msg = "Failed to lower parfor on DPPL-device.\nTo see details set environment variable NUMBA_DPPL_DEBUG=1" + msg = "Failed to lower parfor on DPPY-device.\nTo see details set environment variable NUMBA_DPPY_DEBUG=1" warnings.warn(NumbaPerformanceWarning(msg, parfor.loc)) raise e -def dppl_lower_array_expr(lowerer, expr): +def dppy_lower_array_expr(lowerer, expr): raise NotImplementedError(expr) diff --git a/numba_dppy/dppl_offload_dispatcher.py b/numba_dppy/dppy_offload_dispatcher.py similarity index 73% rename from numba_dppy/dppl_offload_dispatcher.py rename to numba_dppy/dppy_offload_dispatcher.py index db841bef06..0c5fe10f5e 100644 --- a/numba_dppy/dppl_offload_dispatcher.py +++ b/numba_dppy/dppy_offload_dispatcher.py @@ -3,21 +3,21 @@ import numba_dppy.config as dppy_config -class DpplOffloadDispatcher(dispatcher.Dispatcher): +class DppyOffloadDispatcher(dispatcher.Dispatcher): targetdescr = cpu_target def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler): if dppy_config.dppy_present: - from numba_dppy.compiler import DPPLCompiler + from numba_dppy.compiler import DPPYCompiler targetoptions['parallel'] = True dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPLCompiler) + targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPYCompiler) else: print("---------------------------------------------------------------------") - print("WARNING : DPPL pipeline ignored. Ensure OpenCL drivers are installed.") + print("WARNING : DPPY pipeline ignored. 
Ensure OpenCL drivers are installed.") print("---------------------------------------------------------------------") dispatcher.Dispatcher.__init__(self, py_func, locals=locals, targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=pipeline_class) -dispatcher_registry['__dppl_offload_gpu__'] = DpplOffloadDispatcher -dispatcher_registry['__dppl_offload_cpu__'] = DpplOffloadDispatcher +dispatcher_registry['__dppy_offload_gpu__'] = DppyOffloadDispatcher +dispatcher_registry['__dppy_offload_cpu__'] = DppyOffloadDispatcher diff --git a/numba_dppy/dppl_passbuilder.py b/numba_dppy/dppy_passbuilder.py similarity index 82% rename from numba_dppy/dppl_passbuilder.py rename to numba_dppy/dppy_passbuilder.py index 0ddaea6d0b..0a32a099cf 100644 --- a/numba_dppy/dppl_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -17,19 +17,19 @@ DumpParforDiagnostics, IRLegalization, InlineOverloads, PreLowerStripPhis) -from .dppl_passes import ( - DPPLConstantSizeStaticLocalMemoryPass, - DPPLPreParforPass, - DPPLParforPass, +from .dppy_passes import ( + DPPYConstantSizeStaticLocalMemoryPass, + DPPYPreParforPass, + DPPYParforPass, SpirvFriendlyLowering, - DPPLAddNumpyOverloadPass, - DPPLAddNumpyRemoveOverloadPass, - DPPLNoPythonBackend + DPPYAddNumpyOverloadPass, + DPPYAddNumpyRemoveOverloadPass, + DPPYNoPythonBackend ) -class DPPLPassBuilder(object): +class DPPYPassBuilder(object): """ - This is the DPPL pass builder to run Intel GPU/CPU specific + This is the DPPY pass builder to run Intel GPU/CPU specific code-generation and optimization passes. This pass builder does not offer objectmode and interpreted passes. """ @@ -46,12 +46,12 @@ def default_numba_nopython_pipeline(state, pm): # this pass adds required logic to overload default implementation of # Numpy functions - pm.add_pass(DPPLAddNumpyOverloadPass, "dppl add typing template for Numpy functions") + pm.add_pass(DPPYAddNumpyOverloadPass, "dppy add typing template for Numpy functions") # Add pass to ensure when users are allocating static # constant memory the size is a constant and can not # come from a closure variable - pm.add_pass(DPPLConstantSizeStaticLocalMemoryPass, "dppl constant size for static local memory") + pm.add_pass(DPPYConstantSizeStaticLocalMemoryPass, "dppy constant size for static local memory") # pre typing if not state.flags.no_rewrites: @@ -90,24 +90,24 @@ def default_numba_nopython_pipeline(state, pm): @staticmethod - def define_nopython_pipeline(state, name='dppl_nopython'): + def define_nopython_pipeline(state, name='dppy_nopython'): """Returns an nopython mode pipeline based PassManager """ pm = PassManager(name) - DPPLPassBuilder.default_numba_nopython_pipeline(state, pm) + DPPYPassBuilder.default_numba_nopython_pipeline(state, pm) # Intel GPU/CPU specific optimizations - pm.add_pass(DPPLPreParforPass, "Preprocessing for parfors") + pm.add_pass(DPPYPreParforPass, "Preprocessing for parfors") if not state.flags.no_rewrites: pm.add_pass(NopythonRewrites, "nopython rewrites") - pm.add_pass(DPPLParforPass, "convert to parfors") + pm.add_pass(DPPYParforPass, "convert to parfors") # legalise pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") # lower pm.add_pass(SpirvFriendlyLowering, "SPIRV-friendly lowering pass") - pm.add_pass(DPPLNoPythonBackend, "nopython mode backend") - pm.add_pass(DPPLAddNumpyRemoveOverloadPass, "dppl remove typing template for Numpy functions") + pm.add_pass(DPPYNoPythonBackend, "nopython mode backend") + pm.add_pass(DPPYAddNumpyRemoveOverloadPass, "dppy remove typing template 
for Numpy functions") pm.finalize() return pm diff --git a/numba_dppy/dppl_passes.py b/numba_dppy/dppy_passes.py similarity index 95% rename from numba_dppy/dppl_passes.py rename to numba_dppy/dppy_passes.py index f9e2633c3c..0bb2eadb48 100644 --- a/numba_dppy/dppl_passes.py +++ b/numba_dppy/dppy_passes.py @@ -24,7 +24,7 @@ from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass -from .dppl_lowerer import DPPLLower +from .dppy_lowerer import DPPYLower from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map from numba.parfors.parfor import ParforPass as _parfor_ParforPass @@ -40,8 +40,8 @@ def dpnp_available(): @register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyOverloadPass(FunctionPass): - _name = "dppl_add_numpy_overload_pass" +class DPPYAddNumpyOverloadPass(FunctionPass): + _name = "dppy_add_numpy_overload_pass" def __init__(self): FunctionPass.__init__(self) @@ -122,8 +122,8 @@ def generic(self, args, kws): return True @register_pass(mutates_CFG=False, analysis_only=True) -class DPPLAddNumpyRemoveOverloadPass(FunctionPass): - _name = "dppl_remove_numpy_overload_pass" +class DPPYAddNumpyRemoveOverloadPass(FunctionPass): + _name = "dppy_remove_numpy_overload_pass" def __init__(self): FunctionPass.__init__(self) @@ -143,9 +143,9 @@ def run_pass(self, state): return True @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLConstantSizeStaticLocalMemoryPass(FunctionPass): +class DPPYConstantSizeStaticLocalMemoryPass(FunctionPass): - _name = "dppl_constant_size_static_local_memory_pass" + _name = "dppy_constant_size_static_local_memory_pass" def __init__(self): FunctionPass.__init__(self) @@ -218,9 +218,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLPreParforPass(FunctionPass): +class DPPYPreParforPass(FunctionPass): - _name = "dppl_pre_parfor_pass" + _name = "dppy_pre_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -262,9 +262,9 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLParforPass(FunctionPass): +class DPPYParforPass(FunctionPass): - _name = "dppl_parfor_pass" + _name = "dppy_parfor_pass" def __init__(self): FunctionPass.__init__(self) @@ -343,9 +343,9 @@ def run_pass(self, state): targetctx = state.targetctx # This should not happen here, after we have the notion of context in Numba - # we should have specialized dispatcher for dppl context and that dispatcher + # we should have specialized dispatcher for dppy context and that dispatcher # should be a cpu dispatcher that will overload the lowering functions for - # linalg for dppl.cpu_dispatcher and the dppl.gpu_dipatcher should be the + # linalg for dppy.cpu_dispatcher and the dppy.gpu_dipatcher should be the # current target context we have to launch kernels. 
# This is broken as this essentially adds the new lowering in a list which # means it does not get replaced with the new lowering_buitins @@ -373,7 +373,7 @@ def run_pass(self, state): noalias=flags.noalias) with targetctx.push_code_library(library): - lower = DPPLLower(targetctx, library, fndesc, interp, + lower = DPPYLower(targetctx, library, fndesc, interp, metadata=metadata) lower.lower() if not flags.no_cpython_wrapper: @@ -400,7 +400,7 @@ def run_pass(self, state): @register_pass(mutates_CFG=True, analysis_only=False) -class DPPLNoPythonBackend(FunctionPass): +class DPPYNoPythonBackend(FunctionPass): _name = "nopython_backend" diff --git a/numba_dppy/examples/dppl_func.py b/numba_dppy/examples/dppy_func.py similarity index 81% rename from numba_dppy/examples/dppl_func.py rename to numba_dppy/examples/dppy_func.py index ec86681457..353ba48995 100644 --- a/numba_dppy/examples/dppl_func.py +++ b/numba_dppy/examples/dppy_func.py @@ -1,26 +1,26 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl -@dppl.func +@dppy.func def g(a): return a + 1 -@dppl.kernel +@dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) def driver(a, b, N): print(b) print("--------") - f[N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) print(b) diff --git a/numba_dppy/examples/dppl_with_context.py b/numba_dppy/examples/dppy_with_context.py similarity index 94% rename from numba_dppy/examples/dppl_with_context.py rename to numba_dppy/examples/dppy_with_context.py index c830e81ec6..6df025f5ca 100644 --- a/numba_dppy/examples/dppl_with_context.py +++ b/numba_dppy/examples/dppy_with_context.py @@ -1,6 +1,6 @@ import numpy as np from numba import njit, prange -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl @njit diff --git a/numba_dppy/examples/matmul.py b/numba_dppy/examples/matmul.py index 35bef5be8a..b97ac49ca1 100644 --- a/numba_dppy/examples/matmul.py +++ b/numba_dppy/examples/matmul.py @@ -4,14 +4,14 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel -def dppl_gemm(a, b, c): - i = dppl.get_global_id(0) - j = dppl.get_global_id(1) +@dppy.kernel +def dppy_gemm(a, b, c): + i = dppy.get_global_id(0) + j = dppy.get_global_id(1) if i >= c.shape[0] or j >= c.shape[1]: return c[i,j] = 0 @@ -30,7 +30,7 @@ def dppl_gemm(a, b, c): def driver(a, b, c): # Invoke the kernel - dppl_gemm[griddim,blockdim](a, b, c) + dppy_gemm[griddim,blockdim](a, b, c) def main(): diff --git a/numba_dppy/examples/pairwise_distance.py b/numba_dppy/examples/pairwise_distance.py index cc5c232c92..b72c41ba9c 100644 --- a/numba_dppy/examples/pairwise_distance.py +++ b/numba_dppy/examples/pairwise_distance.py @@ -6,7 +6,7 @@ import argparse import timeit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl import dpctl._memory as dpctl_mem @@ -28,9 +28,9 @@ D = np.empty((args.n, args.n)) -@dppl.kernel +@dppy.kernel def pairwise_distance(X, D, xshape0, xshape1): - idx = dppl.get_global_id(0) + idx = dppy.get_global_id(0) #for i in range(xshape0): for j in range(X.shape[0]): diff --git a/numba_dppy/examples/sum-hybrid.py b/numba_dppy/examples/sum-hybrid.py index 418976f53a..e66c51ae2c 100644 --- a/numba_dppy/examples/sum-hybrid.py +++ b/numba_dppy/examples/sum-hybrid.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, 
numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -27,7 +27,7 @@ def main(): c = np.ones_like(a) print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C: ", c) else: print("CPU device not found") @@ -40,7 +40,7 @@ def main(): c = np.ones_like(a) print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C: ", c) else: print("GPU device not found") diff --git a/numba_dppy/examples/sum.py b/numba_dppy/examples/sum.py index f97b8243cb..fdc1623fa7 100644 --- a/numba_dppy/examples/sum.py +++ b/numba_dppy/examples/sum.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -18,7 +18,7 @@ def driver(a, b, c, global_size): print("before : ", a) print("before : ", b) print("before : ", c) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after : ", c) diff --git a/numba_dppy/examples/sum2D.py b/numba_dppy/examples/sum2D.py index 00be613d2b..90959c8bdf 100644 --- a/numba_dppy/examples/sum2D.py +++ b/numba_dppy/examples/sum2D.py @@ -4,21 +4,21 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) - j = dppl.get_global_id(1) + i = dppy.get_global_id(0) + j = dppy.get_global_id(1) c[i,j] = a[i,j] + b[i,j] def driver(a, b, c, global_size): print("before A: ", a) print("before B: ", b) - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) print("after C : ", c) diff --git a/numba_dppy/examples/sum_ndarray.py b/numba_dppy/examples/sum_ndarray.py index 6486be0275..2aea8e080a 100644 --- a/numba_dppy/examples/sum_ndarray.py +++ b/numba_dppy/examples/sum_ndarray.py @@ -4,13 +4,13 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] diff --git a/numba_dppy/examples/sum_reduction.py b/numba_dppy/examples/sum_reduction.py index 3e00f95631..367fa37952 100644 --- a/numba_dppy/examples/sum_reduction.py +++ b/numba_dppy/examples/sum_reduction.py @@ -4,13 +4,13 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl -@dppl.kernel +@dppy.kernel def reduction_kernel(A, R, stride): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) # sum two element R[i] = A[i] + A[i+stride] # store the sum to be used in nex iteration @@ -34,7 +34,7 @@ def test_sum_reduction(): while (total > 1): # call kernel global_size = total // 2 - 
reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) total = total // 2 else: diff --git a/numba_dppy/examples/sum_reduction_ocl.py b/numba_dppy/examples/sum_reduction_ocl.py index e2605a7bbc..8d8e0411aa 100644 --- a/numba_dppy/examples/sum_reduction_ocl.py +++ b/numba_dppy/examples/sum_reduction_ocl.py @@ -1,20 +1,20 @@ import sys import numpy as np from numba import int32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl def sum_reduction_device_plus_host(): - @dppl.kernel + @dppy.kernel def sum_reduction_kernel(inp, partial_sums): - local_id = dppl.get_local_id(0) - global_id = dppl.get_global_id(0) - group_size = dppl.get_local_size(0) - group_id = dppl.get_group_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) + group_size = dppy.get_local_size(0) + group_id = dppy.get_group_id(0) - local_sums = dppl.local.static_alloc(64, int32) + local_sums = dppy.local.static_alloc(64, int32) # Copy from global to local memory local_sums[local_id] = inp[global_id] @@ -23,7 +23,7 @@ def sum_reduction_kernel(inp, partial_sums): stride = group_size // 2 while (stride > 0): # Waiting for each 2x2 addition into given workgroup - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride if (local_id < stride): diff --git a/numba_dppy/examples/sum_reduction_recursive_ocl.py b/numba_dppy/examples/sum_reduction_recursive_ocl.py index 11f5023a3b..c5dd6daa47 100644 --- a/numba_dppy/examples/sum_reduction_recursive_ocl.py +++ b/numba_dppy/examples/sum_reduction_recursive_ocl.py @@ -1,7 +1,7 @@ import sys import numpy as np from numba import int32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import math import dpctl @@ -11,15 +11,15 @@ def recursive_reduction(size, group_size, Dinp, Dpartial_sums): - @dppl.kernel + @dppy.kernel def sum_reduction_kernel(inp, input_size, partial_sums): - local_id = dppl.get_local_id(0) - global_id = dppl.get_global_id(0) - group_size = dppl.get_local_size(0) - group_id = dppl.get_group_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) + group_size = dppy.get_local_size(0) + group_id = dppy.get_group_id(0) - local_sums = dppl.local.static_alloc(64, int32) + local_sums = dppy.local.static_alloc(64, int32) local_sums[local_id] = 0 @@ -30,7 +30,7 @@ def sum_reduction_kernel(inp, input_size, stride = group_size // 2 while (stride > 0): # Waiting for each 2x2 addition into given workgroup - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride if (local_id < stride): diff --git a/numba_dppy/experimental_numpy_lowering_overload.py b/numba_dppy/experimental_numpy_lowering_overload.py index 2123e6667d..dd1e2a1eb6 100644 --- a/numba_dppy/experimental_numpy_lowering_overload.py +++ b/numba_dppy/experimental_numpy_lowering_overload.py @@ -77,7 +77,7 @@ def get_sycl_queue(context, builder): void_ptr_t = context.get_value_type(types.voidptr) get_queue_fnty = lc.Type.function(void_ptr_t, ()) get_queue = builder.module.get_or_insert_function(get_queue_fnty, - name="DPPLQueueMgr_GetCurrentQueue") + name="DPCTLQueueMgr_GetCurrentQueue") sycl_queue_val = cgutils.alloca_once(builder, void_ptr_t) builder.store(builder.call(get_queue, []), sycl_queue_val) @@ -87,7 +87,7 @@ def allocate_usm(context, 
builder, size, sycl_queue): void_ptr_t = context.get_value_type(types.voidptr) usm_shared_fnty = lc.Type.function(void_ptr_t, [ll_intp_t, void_ptr_t]) usm_shared = builder.module.get_or_insert_function(usm_shared_fnty, - name="DPPLmalloc_shared") + name="DPCTLmalloc_shared") buffer_ptr = cgutils.alloca_once(builder, void_ptr_t) args = [size, builder.load(sycl_queue)] @@ -100,7 +100,7 @@ def copy_usm(context, builder, src, dst, size, sycl_queue): queue_memcpy_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t, void_ptr_t, ll_intp_t]) queue_memcpy = builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPPLQueue_Memcpy") + name="DPCTLQueue_Memcpy") args = [builder.load(sycl_queue), builder.bitcast(dst, void_ptr_t), builder.bitcast(src, void_ptr_t), @@ -113,7 +113,7 @@ def free_usm(context, builder, usm_buf, sycl_queue): usm_free_fnty = lc.Type.function(ir.VoidType(), [void_ptr_t, void_ptr_t]) usm_free = builder.module.get_or_insert_function(usm_free_fnty, - name="DPPLfree_with_queue") + name="DPCTLfree_with_queue") builder.call(usm_free, [usm_buf, builder.load(sycl_queue)]) @@ -350,7 +350,7 @@ def make_res(a, b): @lower_builtin(np.dot, types.Array, types.Array) -def dot_dppl(context, builder, sig, args): +def dot_dppy(context, builder, sig, args): """ np.dot(a, b) a @ b @@ -374,7 +374,7 @@ def dot_dppl(context, builder, sig, args): @lower_builtin("np.matmul", types.Array, types.Array) -def matmul_dppl(context, builder, sig, args): +def matmul_dppy(context, builder, sig, args): """ np.matmul(matrix, matrix) """ diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py index c8ba56220a..745e8031eb 100644 --- a/numba_dppy/initialize.py +++ b/numba_dppy/initialize.py @@ -5,8 +5,8 @@ def init_jit(): - from numba_dppy.dispatcher import DPPLDispatcher - return DPPLDispatcher + from numba_dppy.dispatcher import DPPYDispatcher + return DPPYDispatcher def initialize_all(): from numba.core.registry import dispatcher_registry @@ -17,9 +17,9 @@ def initialize_all(): import platform as plt platform = plt.system() if platform == 'Windows': - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface.dll')) + paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface.dll')) else: - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPPLSyclInterface*')) + paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*')) if len(paths) == 1: ll.load_library_permanently(find_library(paths[0])) diff --git a/numba_dppy/ocl/atomics/atomic_ops.cl b/numba_dppy/ocl/atomics/atomic_ops.cl index ad581716de..56228d8bf5 100644 --- a/numba_dppy/ocl/atomics/atomic_ops.cl +++ b/numba_dppy/ocl/atomics/atomic_ops.cl @@ -5,7 +5,7 @@ #ifdef cl_khr_int64_base_atomics #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable - long numba_dppl_atomic_add_i64_local(volatile __generic long *p, long val) { + long numba_dppy_atomic_add_i64_local(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -15,7 +15,7 @@ return found; } - long numba_dppl_atomic_add_i64_global(volatile __generic long *p, long val) { + long numba_dppy_atomic_add_i64_global(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -25,7 +25,7 @@ return found; } - long numba_dppl_atomic_sub_i64_local(volatile __generic long *p, long val) { + long numba_dppy_atomic_sub_i64_local(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -35,7 +35,7 @@ return found; } - long 
numba_dppl_atomic_sub_i64_global(volatile __generic long *p, long val) { + long numba_dppy_atomic_sub_i64_global(volatile __generic long *p, long val) { long found = *p; long expected; do { @@ -48,7 +48,7 @@ #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64: enable - double numba_dppl_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) { + double numba_dppy_atomic_cmpxchg_f64_local(volatile __generic double *p, double cmp, double val) { union { ulong u64; double f64; @@ -60,7 +60,7 @@ return old_union.f64; } - double numba_dppl_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) { + double numba_dppy_atomic_cmpxchg_f64_global(volatile __generic double *p, double cmp, double val) { union { ulong u64; double f64; @@ -72,50 +72,50 @@ return old_union.f64; } - double numba_dppl_atomic_add_f64_local(volatile __generic double *p, double val) { + double numba_dppy_atomic_add_f64_local(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected + val); } while (found != expected); return found; } - double numba_dppl_atomic_add_f64_global(volatile __generic double *p, double val) { + double numba_dppy_atomic_add_f64_global(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected + val); } while (found != expected); return found; } - double numba_dppl_atomic_sub_f64_local(volatile __generic double *p, double val) { + double numba_dppy_atomic_sub_f64_local(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_local(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f64_local(p, expected, expected - val); } while (found != expected); return found; } - double numba_dppl_atomic_sub_f64_global(volatile __generic double *p, double val) { + double numba_dppy_atomic_sub_f64_global(volatile __generic double *p, double val) { double found = *p; double expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f64_global(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f64_global(p, expected, expected - val); } while (found != expected); return found; } #endif #endif -float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) { +float numba_dppy_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp, float val) { union { unsigned int u32; float f32; @@ -127,7 +127,7 @@ float numba_dppl_atomic_cmpxchg_f32_local(volatile __generic float *p, float cmp return old_union.f32; } -float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cmp, float val) { +float numba_dppy_atomic_cmpxchg_f32_global(volatile __generic float *p, float cmp, float val) { union { unsigned int u32; float f32; @@ -139,47 +139,47 @@ float numba_dppl_atomic_cmpxchg_f32_global(volatile __generic float *p, float cm return old_union.f32; } -float numba_dppl_atomic_add_f32_local(volatile __generic float *p, float val) { +float numba_dppy_atomic_add_f32_local(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected + 
val); + found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected + val); } while (found != expected); return found; } -float numba_dppl_atomic_add_f32_global(volatile __generic float *p, float val) { +float numba_dppy_atomic_add_f32_global(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected + val); + found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected + val); } while (found != expected); return found; } -float numba_dppl_atomic_sub_f32_local(volatile __generic float *p, float val) { +float numba_dppy_atomic_sub_f32_local(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_local(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f32_local(p, expected, expected - val); } while (found != expected); return found; } -float numba_dppl_atomic_sub_f32_global(volatile __generic float *p, float val) { +float numba_dppy_atomic_sub_f32_global(volatile __generic float *p, float val) { float found = *p; float expected; do { expected = found; - found = numba_dppl_atomic_cmpxchg_f32_global(p, expected, expected - val); + found = numba_dppy_atomic_cmpxchg_f32_global(p, expected, expected - val); } while (found != expected); return found; } -int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) { +int numba_dppy_atomic_add_i32_local(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -189,7 +189,7 @@ int numba_dppl_atomic_add_i32_local(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) { +int numba_dppy_atomic_add_i32_global(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -199,7 +199,7 @@ int numba_dppl_atomic_add_i32_global(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) { +int numba_dppy_atomic_sub_i32_local(volatile __generic int *p, int val) { int found = *p; int expected; do { @@ -209,7 +209,7 @@ int numba_dppl_atomic_sub_i32_local(volatile __generic int *p, int val) { return found; } -int numba_dppl_atomic_sub_i32_global(volatile __generic int *p, int val) { +int numba_dppy_atomic_sub_i32_global(volatile __generic int *p, int val) { int found = *p; int expected; do { diff --git a/numba_dppy/ocl/ocldecl.py b/numba_dppy/ocl/ocldecl.py index 1af90a6884..adf14a1815 100644 --- a/numba_dppy/ocl/ocldecl.py +++ b/numba_dppy/ocl/ocldecl.py @@ -4,7 +4,7 @@ from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, AbstractTemplate, MacroTemplate, signature, Registry) -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy registry = Registry() intrinsic = registry.register @@ -15,71 +15,71 @@ @intrinsic class Ocl_get_global_id(ConcreteTemplate): - key = dppl.get_global_id + key = dppy.get_global_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_local_id(ConcreteTemplate): - key = dppl.get_local_id + key = dppy.get_local_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_group_id(ConcreteTemplate): - key = dppl.get_group_id + key = dppy.get_group_id cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_num_groups(ConcreteTemplate): - key = dppl.get_num_groups + key = dppy.get_num_groups cases = [signature(types.intp, types.uint32)] @intrinsic class 
Ocl_get_work_dim(ConcreteTemplate): - key = dppl.get_work_dim + key = dppy.get_work_dim cases = [signature(types.uint32)] @intrinsic class Ocl_get_global_size(ConcreteTemplate): - key = dppl.get_global_size + key = dppy.get_global_size cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_get_local_size(ConcreteTemplate): - key = dppl.get_local_size + key = dppy.get_local_size cases = [signature(types.intp, types.uint32)] @intrinsic class Ocl_barrier(ConcreteTemplate): - key = dppl.barrier + key = dppy.barrier cases = [signature(types.void, types.uint32), signature(types.void)] @intrinsic class Ocl_mem_fence(ConcreteTemplate): - key = dppl.mem_fence + key = dppy.mem_fence cases = [signature(types.void, types.uint32)] @intrinsic class Ocl_sub_group_barrier(ConcreteTemplate): - key = dppl.sub_group_barrier + key = dppy.sub_group_barrier cases = [signature(types.void)] -# dppl.atomic submodule ------------------------------------------------------- +# dppy.atomic submodule ------------------------------------------------------- @intrinsic class Ocl_atomic_add(AbstractTemplate): - key = dppl.atomic.add + key = dppy.atomic.add def generic(self, args, kws): assert not kws @@ -92,7 +92,7 @@ def generic(self, args, kws): @intrinsic class Ocl_atomic_sub(AbstractTemplate): - key = dppl.atomic.sub + key = dppy.atomic.sub def generic(self, args, kws): assert not kws @@ -106,7 +106,7 @@ def generic(self, args, kws): @intrinsic_attr class OclAtomicTemplate(AttributeTemplate): - key = types.Module(dppl.atomic) + key = types.Module(dppy.atomic) def resolve_add(self, mod): return types.Function(Ocl_atomic_add) @@ -115,15 +115,15 @@ def resolve_sub(self, mod): return types.Function(Ocl_atomic_sub) -# dppl.local submodule ------------------------------------------------------- +# dppy.local submodule ------------------------------------------------------- class Ocl_local_alloc(MacroTemplate): - key = dppl.local.static_alloc + key = dppy.local.static_alloc @intrinsic_attr class OclLocalTemplate(AttributeTemplate): - key = types.Module(dppl.local) + key = types.Module(dppy.local) def resolve_static_alloc(self, mod): return types.Macro(Ocl_local_alloc) @@ -133,7 +133,7 @@ def resolve_static_alloc(self, mod): @intrinsic_attr class OclModuleTemplate(AttributeTemplate): - key = types.Module(dppl) + key = types.Module(dppy) def resolve_get_global_id(self, mod): return types.Function(Ocl_get_global_id) @@ -166,11 +166,11 @@ def resolve_sub_group_barrier(self, mod): return types.Function(Ocl_sub_group_barrier) def resolve_atomic(self, mod): - return types.Module(dppl.atomic) + return types.Module(dppy.atomic) def resolve_local(self, mod): - return types.Module(dppl.local) + return types.Module(dppy.local) # intrinsic -#intrinsic_global(dppl, types.Module(dppl)) +#intrinsic_global(dppy, types.Module(dppy)) diff --git a/numba_dppy/ocl/oclimpl.py b/numba_dppy/ocl/oclimpl.py index b92dca7bae..26f8482799 100644 --- a/numba_dppy/ocl/oclimpl.py +++ b/numba_dppy/ocl/oclimpl.py @@ -169,9 +169,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.IntType(32) ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_i32" + name = "numba_dppy_atomic_add_i32" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_i32" + name = "numba_dppy_atomic_sub_i32" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -182,9 +182,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.IntType(64) ll_p = ll_val.as_pointer() if fn_type == 
"add": - name = "numba_dppl_atomic_add_i64" + name = "numba_dppy_atomic_add_i64" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_i64" + name = "numba_dppy_atomic_sub_i64" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -195,9 +195,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.FloatType() ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_f32" + name = "numba_dppy_atomic_add_f32" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_f32" + name = "numba_dppy_atomic_sub_f32" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -208,9 +208,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, ll_val = ir.DoubleType() ll_p = ll_val.as_pointer() if fn_type == "add": - name = "numba_dppl_atomic_add_f64" + name = "numba_dppy_atomic_add_f64" elif fn_type == "sub": - name = "numba_dppl_atomic_sub_f64" + name = "numba_dppy_atomic_sub_f64" else: raise TypeError("Operation type is not supported %s" % (fn_type)) @@ -331,11 +331,11 @@ def atomic_sub_tuple(context, builder, sig, args): raise ImportError("Atomic support is not present, can not perform atomic_add") -@lower('dppl.lmem.alloc', types.UniTuple, types.Any) -def dppl_lmem_alloc_array(context, builder, sig, args): +@lower('dppy.lmem.alloc', types.UniTuple, types.Any) +def dppy_lmem_alloc_array(context, builder, sig, args): shape, dtype = args return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name='_dppl_lmem', + symbol_name='_dppy_lmem', addrspace=target.SPIR_LOCAL_ADDRSPACE) diff --git a/numba_dppy/ocl/stubs.py b/numba_dppy/ocl/stubs.py index 2ec95fa9c8..190b685955 100644 --- a/numba_dppy/ocl/stubs.py +++ b/numba_dppy/ocl/stubs.py @@ -83,9 +83,9 @@ def sub_group_barrier(): class Stub(object): """A stub object to represent special objects which is meaningless - outside the context of DPPL compilation context. + outside the context of DPPY compilation context. 
""" - _description_ = '' + _description_ = '' __slots__ = () # don't allocate __dict__ def __new__(cls): @@ -100,7 +100,7 @@ def __repr__(self): def local_alloc(shape, dtype): shape = _legalize_shape(shape) ndim = len(shape) - fname = "dppl.lmem.alloc" + fname = "dppy.lmem.alloc" restype = types.Array(dtype, ndim, 'C', addrspace=SPIR_LOCAL_ADDRSPACE) sig = typing.signature(restype, types.UniTuple(types.intp, ndim), types.Any) return ir.Intrinsic(fname, sig, args=(shape, dtype)) diff --git a/numba_dppy/printimpl.py b/numba_dppy/printimpl.py index 74319b1bdd..e5c9d4f793 100644 --- a/numba_dppy/printimpl.py +++ b/numba_dppy/printimpl.py @@ -79,8 +79,8 @@ def print_varargs(context, builder, sig, args): va_arg.extend(values) va_arg = tuple(va_arg) - dppl_print = declare_print(builder.module) + dppy_print = declare_print(builder.module) - builder.call(dppl_print, va_arg) + builder.call(dppy_print, va_arg) return context.get_dummy_value() diff --git a/numba_dppy/target.py b/numba_dppy/target.py index aac4efcd4b..6444a6e601 100644 --- a/numba_dppy/target.py +++ b/numba_dppy/target.py @@ -24,7 +24,7 @@ # Typing -class DPPLTypingContext(typing.BaseContext): +class DPPYTypingContext(typing.BaseContext): def load_additional_registries(self): # Declarations for OpenCL API functions and OpenCL Math functions from .ocl import ocldecl, mathdecl @@ -91,7 +91,7 @@ def _replace_numpy_ufunc_with_opencl_supported_functions(): ufunc_db[ufunc][sig] = lower_ocl_impl[(name, sig_mapper[sig])] -class DPPLTargetContext(BaseContext): +class DPPYTargetContext(BaseContext): implement_powi_as_math_call = True generic_addrspace = SPIR_GENERIC_ADDRSPACE @@ -153,7 +153,7 @@ def load_additional_registries(self): @cached_property def call_conv(self): - return DPPLCallConv(self) + return DPPYCallConv(self) def codegen(self): return self._internal_codegen @@ -169,7 +169,7 @@ def repl(m): qualified = name + '.' + '.'.join(str(a) for a in argtypes) mangled = VALID_CHARS.sub(repl, qualified) - return 'dppl_py_devfn_' + mangled + return 'dppy_py_devfn_' + mangled def prepare_ocl_kernel(self, func, argtypes): module = func.module @@ -208,8 +208,8 @@ def sub_gen_with_global(lty): llargtys = changed = () wrapperfnty = lc.Type.function(lc.Type.void(), llargtys) - wrapper_module = self.create_module("dppl.kernel.wrapper") - wrappername = 'dpplPy_{name}'.format(name=func.name) + wrapper_module = self.create_module("dppy.kernel.wrapper") + wrappername = 'dppyPy_{name}'.format(name=func.name) argtys = list(arginfo.argument_types) fnty = lc.Type.function(lc.Type.int(), @@ -239,7 +239,7 @@ def sub_gen_with_global(lty): argtypes, callargs) builder.ret_void() - set_dppl_kernel(wrapper) + set_dppy_kernel(wrapper) #print(str(wrapper_module)) # Link @@ -255,9 +255,9 @@ def declare_function(self, module, fndesc): fnty = self.call_conv.get_function_type(fndesc.restype, fndesc.argtypes) fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name) fn.attributes.add('alwaysinline') - ret = super(DPPLTargetContext, self).declare_function(module, fndesc) + ret = super(DPPYTargetContext, self).declare_function(module, fndesc) # XXX: Refactor fndesc instead of this special case - if fndesc.llvm_func_name.startswith('dppl_py_devfn'): + if fndesc.llvm_func_name.startswith('dppy_py_devfn'): ret.calling_convention = CC_SPIR_FUNC return ret @@ -305,7 +305,7 @@ def addrspacecast(self, builder, src, addrspace): return builder.addrspacecast(src, ptras) -def set_dppl_kernel(fn): +def set_dppy_kernel(fn): """ Ensure `fn` is usable as a SPIR kernel. 
- Fix calling convention @@ -332,11 +332,11 @@ def set_dppl_kernel(fn): make_constant = lambda x: lc.Constant.int(lc.Type.int(), x) spir_version_constant = [make_constant(x) for x in SPIR_VERSION] - spir_version = mod.get_or_insert_named_metadata("dppl.spir.version") + spir_version = mod.get_or_insert_named_metadata("dppy.spir.version") if not spir_version.operands: spir_version.add(lc.MetaData.get(mod, spir_version_constant)) - ocl_version = mod.get_or_insert_named_metadata("dppl.ocl.version") + ocl_version = mod.get_or_insert_named_metadata("dppy.ocl.version") if not ocl_version.operands: ocl_version.add(lc.MetaData.get(mod, spir_version_constant)) @@ -414,7 +414,7 @@ def gen_arg_base_type(fn): return lc.MetaData.get(mod, [name] + consts) -class DPPLCallConv(MinimalCallConv): +class DPPYCallConv(MinimalCallConv): def call_function(self, builder, callee, resty, argtys, args, env=None): """ Call the Numba-compiled *callee*. diff --git a/numba_dppy/target_dispatcher.py b/numba_dppy/target_dispatcher.py index 40b9d589d9..dde38eb75b 100644 --- a/numba_dppy/target_dispatcher.py +++ b/numba_dppy/target_dispatcher.py @@ -8,9 +8,9 @@ class TargetDispatcher(serialize.ReduceMixin, metaclass=dispatcher.DispatcherMeta): __numba__ = 'py_func' - target_offload_gpu = '__dppl_offload_gpu__' - target_offload_cpu = '__dppl_offload_cpu__' - target_dppl = 'dppy' + target_offload_gpu = '__dppy_offload_gpu__' + target_offload_cpu = '__dppy_offload_cpu__' + target_dppy = 'dppy' def __init__(self, py_func, wrapper, target, parallel_options, compiled=None): @@ -53,7 +53,7 @@ def get_compiled(self, target=None): return self.__compiled[disp] def __is_with_context_target(self, target): - return target is None or target == TargetDispatcher.target_dppl + return target is None or target == TargetDispatcher.target_dppy def get_current_disp(self): target = self.__target @@ -66,7 +66,7 @@ def get_current_disp(self): if parallel is False or (isinstance(parallel, dict) and parallel.get('offload') is False): raise UnsupportedError(f"Can't use 'with' context with parallel option '{parallel}'") - from numba_dppy import dppl_offload_dispatcher + from numba_dppy import dppy_offload_dispatcher if target is None: if dpctl.get_current_device_type() == dpctl.device_type.gpu: @@ -75,7 +75,7 @@ def get_current_disp(self): return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] else: if dpctl.is_in_device_context(): - raise UnsupportedError('Unknown dppl device type') + raise UnsupportedError('Unknown dppy device type') if offload: if dpctl.has_gpu_queues(): return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index 8da0b7b91e..e309b7f0c9 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -11,7 +11,7 @@ redirect_c_stdout, ) -class DPPLTestCase(SerialMixin, unittest.TestCase): +class DPPYTestCase(SerialMixin, unittest.TestCase): def setUp(self): #init() #TODO @@ -21,7 +21,7 @@ def tearDown(self): #TODO pass -class DPPLTextCapture(object): +class DPPYTextCapture(object): def __init__(self, stream): self._stream = stream @@ -36,16 +36,16 @@ def getvalue(self): return self._stream.getvalue() @contextlib.contextmanager -def captured_dppl_stdout(): +def captured_dppy_stdout(): """ - Return a minimal stream-like object capturing the text output of dppl + Return a minimal stream-like object capturing the text output of dppy """ # Prevent accidentally capturing previously output text sys.stdout.flush() - import numba_dppy, numba_dppy 
as dppl + import numba_dppy, numba_dppy as dppy with redirect_c_stdout() as stream: - yield DPPLTextCapture(stream) + yield DPPYTextCapture(stream) def _id(obj): diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index 5a2199f149..939c95c567 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -2,9 +2,11 @@ from numba.testing import load_testsuite from os.path import dirname, join - +import numba_dppy import numba_dppy.config as dppy_config +# from numba_dppy.tests.dppy import * + def load_tests(loader, tests, pattern): suite = SerialSuite() @@ -12,6 +14,6 @@ def load_tests(loader, tests, pattern): if dppy_config.dppy_present: suite.addTests(load_testsuite(loader, dirname(__file__))) else: - print("skipped DPPL tests") + print("skipped DPPY tests") return suite diff --git a/numba_dppy/tests/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py index ecc5d839bb..3de2d31770 100644 --- a/numba_dppy/tests/test_arg_accessor.py +++ b/numba_dppy/tests/test_arg_accessor.py @@ -2,25 +2,25 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) def sum_with_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -@dppl.kernel +@dppy.kernel def sum_without_accessor(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] def call_kernel(global_size, local_size, A, B, C, func): - func[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, C) + func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) global_size = 10 @@ -33,7 +33,7 @@ def call_kernel(global_size, local_size, @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArgAccessorCPU(DPPLTestCase): +class TestDPPYArgAccessorCPU(DPPYTestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -50,7 +50,7 @@ def test_arg_without_accessor(self): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArgAccessorGPU(DPPLTestCase): +class TestDPPYArgAccessorGPU(DPPYTestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: diff --git a/numba_dppy/tests/test_arg_types.py b/numba_dppy/tests/test_arg_types.py index fc2eae105d..7b06ef11f8 100644 --- a/numba_dppy/tests/test_arg_types.py +++ b/numba_dppy/tests/test_arg_types.py @@ -2,19 +2,19 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel +@dppy.kernel def mul_kernel(A, B, test): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) B[i] = A[i] * test def call_mul_device_kernel(global_size, A, B, test): - mul_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, B, test) + mul_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, test) global_size = 10 @@ -24,7 +24,7 @@ def call_mul_device_kernel(global_size, A, B, test): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLArrayArgCPU(DPPLTestCase): +class 
TestDPPYArrayArgCPU(DPPYTestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:cpu") as cpu_queue: @@ -42,7 +42,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -52,14 +52,14 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:cpu") as cpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLArrayArgGPU(DPPLTestCase): +class TestDPPYArrayArgGPU(DPPYTestCase): def test_integer_arg(self): x = np.int32(2) with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -77,7 +77,7 @@ def test_float_arg(self): self.assertTrue(np.all(A * x == B)) def test_bool_arg(self): - @dppl.kernel + @dppy.kernel def check_bool_kernel(A, test): if test: A[0] = 111 @@ -87,9 +87,9 @@ def check_bool_kernel(A, test): A = np.array([0], dtype='float64') with dpctl.device_context("opencl:gpu") as gpu_queue: - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, True) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) self.assertTrue(A[0] == 111) - check_bool_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, False) + check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py index 9825c707d1..9d8e88def1 100644 --- a/numba_dppy/tests/test_atomic_op.py +++ b/numba_dppy/tests/test_atomic_op.py @@ -3,106 +3,106 @@ import numpy as np import numba -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl def atomic_add_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.add(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_sub_int32(ary): - tid = dppl.get_local_id(0) - lm = dppl.local.static_alloc(32, numba.uint32) + tid = dppy.get_local_id(0) + lm = dppy.local.static_alloc(32, numba.uint32) lm[tid] = 0 - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) bin = ary[tid] % 32 - dppl.atomic.sub(lm, bin, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, bin, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tid] = lm[tid] def atomic_add_float32(ary): - lm = dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float32(ary): - lm = 
dppl.local.static_alloc(1, numba.float32) + lm = dppy.local.static_alloc(1, numba.float32) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_int64(ary): - lm = dppl.local.static_alloc(1, numba.int64) + lm = dppy.local.static_alloc(1, numba.int64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_sub_float64(ary): - lm = dppl.local.static_alloc(1, numba.float64) + lm = dppy.local.static_alloc(1, numba.float64) lm[0] = ary[0] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.sub(lm, 0, 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.sub(lm, 0, 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[0] = lm[0] def atomic_add2(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, ty), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, ty), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] def atomic_add3(ary): - tx = dppl.get_local_id(0) - ty = dppl.get_local_id(1) - lm = dppl.local.static_alloc((4, 8), numba.uint32) + tx = dppy.get_local_id(0) + ty = dppy.get_local_id(1) + lm = dppy.local.static_alloc((4, 8), numba.uint32) lm[tx, ty] = ary[tx, ty] - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) - dppl.atomic.add(lm, (tx, numba.uint64(ty)), 1) - dppl.barrier(dppl.CLK_GLOBAL_MEM_FENCE) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) + dppy.atomic.add(lm, (tx, numba.uint64(ty)), 1) + dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE) ary[tx, ty] = lm[tx, ty] @@ -118,18 +118,18 @@ def call_fn_for_datatypes(fn, result, input, global_size): # continue #if dtype == np.int64 and not device_env.device_support_int64_atomics(): # continue - fn[global_size, dppl.DEFAULT_LOCAL_SIZE](a) + fn[global_size, dppy.DEFAULT_LOCAL_SIZE](a) assert(a[0] == result) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') @unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present') -class TestAtomicOp(DPPLTestCase): +class TestAtomicOp(DPPYTestCase): def test_atomic_add_global(self): - @dppl.kernel + @dppy.kernel def 
atomic_add(B): - dppl.atomic.add(B, 0, 1) + dppy.atomic.add(B, 0, 1) N = 100 B = np.array([0]) @@ -138,9 +138,9 @@ def atomic_add(B): def test_atomic_sub_global(self): - @dppl.kernel + @dppy.kernel def atomic_sub(B): - dppl.atomic.sub(B, 0, 1) + dppy.atomic.sub(B, 0, 1) N = 100 B = np.array([100]) @@ -152,10 +152,10 @@ def test_atomic_add_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_add = dppl.kernel('void(uint32[:])')(atomic_add_int32) - dppl_atomic_add = dppl.kernel(atomic_add_int32) + #dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) + dppy_atomic_add = dppy.kernel(atomic_add_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -168,10 +168,10 @@ def test_atomic_sub_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppl_atomic_sub = dppl.kernel('void(uint32[:])')(atomic_sub_int32) - dppl_atomic_sub = dppl.kernel(atomic_sub_int32) + #dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) + dppy_atomic_sub = dppy.kernel(atomic_sub_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) gold = np.zeros(32, dtype=np.uint32) for i in range(orig.size): @@ -183,10 +183,10 @@ def test_atomic_sub_local_int32(self): def test_atomic_add_local_float32(self): ary = np.array([0], dtype=np.float32) - #dppl_atomic_add = dppl.kernel('void(float32[:])')(atomic_add_float32) - dppl_atomic_add = dppl.kernel(atomic_add_float32) + #dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) + dppy_atomic_add = dppy.kernel(atomic_add_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) @@ -194,11 +194,11 @@ def test_atomic_add_local_float32(self): def test_atomic_sub_local_float32(self): ary = np.array([32], dtype=np.float32) - #dppl_atomic_sub = dppl.kernel('void(float32[:])')(atomic_sub_float32) - dppl_atomic_sub = dppl.kernel(atomic_sub_float32) + #dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) + dppy_atomic_sub = dppy.kernel(atomic_sub_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_sub[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) @@ -206,12 +206,12 @@ def test_atomic_sub_local_float32(self): def test_atomic_add_local_int64(self): ary = np.array([0], dtype=np.int64) - #dppl_atomic_add = dppl.kernel('void(int64[:])')(atomic_add_int64) - dppl_atomic_add = dppl.kernel(atomic_add_int64) + #dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) + dppy_atomic_add = dppy.kernel(atomic_add_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_int64_atomics(): - dppl_atomic_add[32, dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -220,12 +220,12 @@ def test_atomic_add_local_int64(self): def test_atomic_sub_local_int64(self): ary = np.array([32], dtype=np.int64) - #fn = dppl.kernel('void(int64[:])')(atomic_sub_int64) - fn = 
dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_int64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -234,12 +234,12 @@ def test_atomic_sub_local_int64(self): def test_atomic_add_local_float64(self): ary = np.array([0], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_add_float64) - fn = dppl.kernel(atomic_add_float64) + #fn = dppy.kernel('void(float64[:])')(atomic_add_float64) + fn = dppy.kernel(atomic_add_float64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) #else: # return @@ -248,12 +248,12 @@ def test_atomic_add_local_float64(self): def test_atomic_sub_local_float64(self): ary = np.array([32], dtype=np.double) - #fn = dppl.kernel('void(float64[:])')(atomic_sub_int64) - fn = dppl.kernel(atomic_sub_int64) + #fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) + fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions #if device_env.device_support_float64_atomics(): - fn[32, dppl.DEFAULT_LOCAL_SIZE](ary) + fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) #else: # return @@ -262,20 +262,20 @@ def test_atomic_sub_local_float64(self): def test_atomic_add2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add2 = dppl.kernel('void(uint32[:,:])')(atomic_add2) - dppl_atomic_add2 = dppl.kernel(atomic_add2) + #dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) + dppy_atomic_add2 = dppy.kernel(atomic_add2) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add2[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add2[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppl_atomic_add3 = dppl.kernel('void(uint32[:,:])')(atomic_add3) - dppl_atomic_add3 = dppl.kernel(atomic_add3) + #dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) + dppy_atomic_add3 = dppy.kernel(atomic_add3) with dpctl.device_context("opencl:gpu") as gpu_queue: - dppl_atomic_add3[(4, 8), dppl.DEFAULT_LOCAL_SIZE](ary) + dppy_atomic_add3[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) diff --git a/numba_dppy/tests/test_barrier.py b/numba_dppy/tests/test_barrier.py index aeff16dd40..3657672240 100644 --- a/numba_dppy/tests/test_barrier.py +++ b/numba_dppy/tests/test_barrier.py @@ -3,21 +3,21 @@ import numpy as np from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase from numba import float32 -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') class TestBarrier(unittest.TestCase): def test_proper_lowering(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = 
dppy.get_global_id(0) d = A[i] - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2 N = 256 @@ -31,13 +31,13 @@ def twice(A): np.testing.assert_allclose(orig * 2, arr) def test_no_arg_barrier_support(self): - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def twice(A): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) d = A[i] # no argument defaults to global mem fence - dppl.barrier() + dppy.barrier() A[i] = d * 2 N = 256 @@ -45,7 +45,7 @@ def twice(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - twice[N, dppl.DEFAULT_LOCAL_SIZE](arr) + twice[N, dppy.DEFAULT_LOCAL_SIZE](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr) @@ -54,16 +54,16 @@ def twice(A): def test_local_memory(self): blocksize = 10 - #@dppl.kernel("void(float32[::1])") - @dppl.kernel + #@dppy.kernel("void(float32[::1])") + @dppy.kernel def reverse_array(A): - lm = dppl.local.static_alloc(shape=10, dtype=float32) - i = dppl.get_global_id(0) + lm = dppy.local.static_alloc(shape=10, dtype=float32) + i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group - dppl.barrier(dppl.CLK_LOCAL_MEM_FENCE) # local mem fence + dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] @@ -71,7 +71,7 @@ def reverse_array(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - reverse_array[blocksize, dppl.DEFAULT_LOCAL_SIZE](arr) + reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr) expected = orig[::-1] + orig np.testing.assert_allclose(expected, arr) diff --git a/numba_dppy/tests/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py index 3d9581bb54..312536d33a 100644 --- a/numba_dppy/tests/test_black_scholes.py +++ b/numba_dppy/tests/test_black_scholes.py @@ -4,9 +4,9 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @@ -49,7 +49,7 @@ def randfloat(rand_var, low, high): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLBlackScholes(DPPLTestCase): +class TestDPPYBlackScholes(DPPYTestCase): def test_black_scholes(self): OPT_N = 400 iterations = 2 @@ -70,9 +70,9 @@ def test_black_scholes(self): optionStrike, optionYears, RISKFREE, VOLATILITY) - @dppl.kernel - def black_scholes_dppl(callResult, putResult, S, X, T, R, V): - i = dppl.get_global_id(0) + @dppy.kernel + def black_scholes_dppy(callResult, putResult, S, X, T, R, V): + i = dppy.get_global_id(0) if i >= S.shape[0]: return sqrtT = math.sqrt(T[i]) @@ -103,7 +103,7 @@ def black_scholes_dppl(callResult, putResult, S, X, T, R, V): with dpctl.device_context("opencl:gpu") as gpu_queue: time1 = time.time() for i in range(iterations): - black_scholes_dppl[blockdim, griddim]( + black_scholes_dppy[blockdim, griddim]( callResultNumbapro, putResultNumbapro, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY) diff --git a/numba_dppy/tests/test_caching.py b/numba_dppy/tests/test_caching.py index 6a6a7967a5..ae693190a3 100644 --- a/numba_dppy/tests/test_caching.py +++ b/numba_dppy/tests/test_caching.py @@ -3,18 +3,18 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import 
dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] -class TestCaching(DPPLTestCase): +class TestCaching(DPPYTestCase): def test_caching_kernel(self): global_size = 10 N = global_size @@ -25,11 +25,11 @@ def test_caching_kernel(self): with dpctl.device_context("opencl:gpu") as gpu_queue: - func = dppl.kernel(data_parallel_sum) - caching_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + func = dppy.kernel(data_parallel_sum) + caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) for i in range(10): - cached_kernel = func[global_size, dppl.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) self.assertIs(caching_kernel, cached_kernel) diff --git a/numba_dppy/tests/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py index 024e3723a9..b38eac12fe 100644 --- a/numba_dppy/tests/test_device_array_args.py +++ b/numba_dppy/tests/test_device_array_args.py @@ -4,14 +4,14 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -@dppl.kernel +@dppy.kernel def data_parallel_sum(a, b, c): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -24,23 +24,23 @@ def data_parallel_sum(a, b, c): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLDeviceArrayArgsGPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsGPU(DPPYTestCase): def test_device_array_args_cpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:cpu") as cpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLDeviceArrayArgsCPU(DPPLTestCase): +class TestDPPYDeviceArrayArgsCPU(DPPYTestCase): def test_device_array_args_gpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:gpu") as gpu_queue: - data_parallel_sum[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b, c) + data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d)) diff --git a/numba_dppy/tests/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py index bb72a35cf2..dcbb95e163 100644 --- a/numba_dppy/tests/test_dpctl_api.py +++ b/numba_dppy/tests/test_dpctl_api.py @@ -3,12 +3,12 @@ import numpy as np from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPCTLAPI(DPPLTestCase): +class TestDPCTLAPI(DPPYTestCase): def test_dpctl_api(self): with dpctl.device_context("opencl:gpu") as gpu_queue: dpctl.dump() diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index bbffb30c3f..b0837f5ba6 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -5,9 +5,9 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing 
import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): @@ -76,7 +76,7 @@ def ensure_dpnp(): @unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') -class Testdpnp_functions(DPPLTestCase): +class Testdpnp_functions(DPPYTestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_dppl_fallback.py b/numba_dppy/tests/test_dppl_fallback.py index adb7ae868b..8519f4fb14 100644 --- a/numba_dppy/tests/test_dppl_fallback.py +++ b/numba_dppy/tests/test_dppl_fallback.py @@ -3,9 +3,9 @@ import numpy as np import numba -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase from numba.tests.support import captured_stderr import dpctl import sys @@ -13,8 +13,8 @@ @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLFallback(DPPLTestCase): - def test_dppl_fallback_inner_call(self): +class TestDPPYFallback(DPPYTestCase): + def test_dppy_fallback_inner_call(self): @numba.jit def fill_value(i): return i @@ -29,27 +29,27 @@ def inner_call_fallback(): return a with captured_stderr() as msg: - dppl = numba.njit(parallel={'offload':True})(inner_call_fallback) - dppl_result = dppl() + dppy = numba.njit(parallel={'offload':True})(inner_call_fallback) + dppy_result = dppy() ref_result = inner_call_fallback() - np.testing.assert_array_equal(dppl_result, ref_result) - self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue()) + np.testing.assert_array_equal(dppy_result, ref_result) + self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue()) - def test_dppl_fallback_reductions(self): + def test_dppy_fallback_reductions(self): def reduction(a): return np.amax(a) a = np.ones(10) with captured_stderr() as msg: - dppl = numba.njit(parallel={'offload':True})(reduction) - dppl_result = dppl(a) + dppy = numba.njit(parallel={'offload':True})(reduction) + dppy_result = dppy(a) ref_result = reduction(a) - np.testing.assert_array_equal(dppl_result, ref_result) - self.assertTrue('Failed to lower parfor on DPPL-device' in msg.getvalue()) + np.testing.assert_array_equal(dppy_result, ref_result) + self.assertTrue('Failed to lower parfor on DPPY-device' in msg.getvalue()) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_dppl_func.py b/numba_dppy/tests/test_dppl_func.py index 0f64046082..c58908554e 100644 --- a/numba_dppy/tests/test_dppl_func.py +++ b/numba_dppy/tests/test_dppl_func.py @@ -2,59 +2,59 @@ import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLFunc(DPPLTestCase): +class TestDPPYFunc(DPPYTestCase): N = 257 - def test_dppl_func_device_array(self): - @dppl.func + def test_dppy_func_device_array(self): + @dppy.func def g(a): return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) a = np.ones(self.N) b = np.ones(self.N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) 
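The bracketed launch form used just above is common to every example touched by this rename: kernel[global_size, local_size](args), where dppy.DEFAULT_LOCAL_SIZE lets the runtime pick the work-group size and the enclosing dpctl.device_context selects the queue the kernel is submitted to. A self-contained sketch of the same pattern, assuming the opencl:gpu queue used in these tests is available:

    import numpy as np
    import dpctl
    import numba_dppy as dppy

    @dppy.kernel
    def add_one(src, dst):
        i = dppy.get_global_id(0)   # one work-item per array element
        dst[i] = src[i] + 1

    src = np.arange(16, dtype=np.float64)
    dst = np.zeros_like(src)
    with dpctl.device_context("opencl:gpu"):
        add_one[src.size, dppy.DEFAULT_LOCAL_SIZE](src, dst)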
self.assertTrue(np.all(b == 2)) - def test_dppl_func_ndarray(self): - @dppl.func + def test_dppy_func_ndarray(self): + @dppy.func def g(a): return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) - @dppl.kernel + @dppy.kernel def h(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) + 1 a = np.ones(self.N) b = np.ones(self.N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 2)) - h[self.N, dppl.DEFAULT_LOCAL_SIZE](a, b) + h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 3)) diff --git a/numba_dppy/tests/test_math_functions.py b/numba_dppy/tests/test_math_functions.py index 977fe85fef..f83fdd30ee 100644 --- a/numba_dppy/tests/test_math_functions.py +++ b/numba_dppy/tests/test_math_functions.py @@ -4,45 +4,45 @@ import sys import numpy as np -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy import dpctl from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import math -@dppl.kernel -def dppl_fabs(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_fabs(a,b): + i = dppy.get_global_id(0) b[i] = math.fabs(a[i]) -@dppl.kernel -def dppl_exp(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_exp(a,b): + i = dppy.get_global_id(0) b[i] = math.exp(a[i]) -@dppl.kernel -def dppl_log(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_log(a,b): + i = dppy.get_global_id(0) b[i] = math.log(a[i]) -@dppl.kernel -def dppl_sqrt(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_sqrt(a,b): + i = dppy.get_global_id(0) b[i] = math.sqrt(a[i]) -@dppl.kernel -def dppl_sin(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_sin(a,b): + i = dppy.get_global_id(0) b[i] = math.sin(a[i]) -@dppl.kernel -def dppl_cos(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_cos(a,b): + i = dppy.get_global_id(0) b[i] = math.cos(a[i]) -@dppl.kernel -def dppl_tan(a,b): - i = dppl.get_global_id(0) +@dppy.kernel +def dppy_tan(a,b): + i = dppy.get_global_id(0) b[i] = math.tan(a[i]) global_size = 10 @@ -53,7 +53,7 @@ def dppl_tan(a,b): def driver(a, jitfunc): b = np.ones_like(a) # Device buffers - jitfunc[global_size, dppl.DEFAULT_LOCAL_SIZE](a, b) + jitfunc[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b) return b @@ -73,67 +73,67 @@ def test_driver(input_arr, device_ty, jitfunc): @unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') -class TestDPPLMathFunctionsCPU(DPPLTestCase): +class TestDPPYMathFunctionsCPU(DPPYTestCase): def test_fabs_cpu(self): - b_actual = test_driver(a, "CPU", dppl_fabs) + b_actual = test_driver(a, "CPU", dppy_fabs) b_expected = np.fabs(a) self.assertTrue(np.all(b_actual == b_expected)) def test_sin_cpu(self): - b_actual = test_driver(a, "CPU", dppl_sin) + b_actual = test_driver(a, "CPU", dppy_sin) b_expected = np.sin(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_cos_cpu(self): - b_actual = test_driver(a, "CPU", dppl_cos) + b_actual = test_driver(a, "CPU", dppy_cos) b_expected = np.cos(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_exp_cpu(self): - b_actual = test_driver(a, "CPU", dppl_exp) + b_actual = test_driver(a, "CPU", dppy_exp) b_expected = np.exp(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_sqrt_cpu(self): - b_actual = test_driver(a, "CPU", dppl_sqrt) + b_actual = 
test_driver(a, "CPU", dppy_sqrt) b_expected = np.sqrt(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_log_cpu(self): - b_actual = test_driver(a, "CPU", dppl_log) + b_actual = test_driver(a, "CPU", dppy_log) b_expected = np.log(a) self.assertTrue(np.allclose(b_actual,b_expected)) @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLMathFunctionsGPU(DPPLTestCase): +class TestDPPYMathFunctionsGPU(DPPYTestCase): def test_fabs_gpu(self): - b_actual = test_driver(a, "GPU", dppl_fabs) + b_actual = test_driver(a, "GPU", dppy_fabs) b_expected = np.fabs(a) self.assertTrue(np.all(b_actual == b_expected)) def test_sin_gpu(self): - b_actual = test_driver(a, "GPU", dppl_sin) + b_actual = test_driver(a, "GPU", dppy_sin) b_expected = np.sin(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_cos_gpu(self): - b_actual = test_driver(a, "GPU", dppl_cos) + b_actual = test_driver(a, "GPU", dppy_cos) b_expected = np.cos(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_exp_gpu(self): - b_actual = test_driver(a, "GPU", dppl_exp) + b_actual = test_driver(a, "GPU", dppy_exp) b_expected = np.exp(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_sqrt_gpu(self): - b_actual = test_driver(a, "GPU", dppl_sqrt) + b_actual = test_driver(a, "GPU", dppy_sqrt) b_expected = np.sqrt(a) self.assertTrue(np.allclose(b_actual,b_expected)) def test_log_gpu(self): - b_actual = test_driver(a, "GPU", dppl_log) + b_actual = test_driver(a, "GPU", dppy_log) b_expected = np.log(a) self.assertTrue(np.allclose(b_actual,b_expected)) diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py index 5e3cd9ba24..de6b7bc963 100644 --- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py +++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py @@ -5,12 +5,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_bit_twiddling_functions(DPPLTestCase): +class TestNumpy_bit_twiddling_functions(DPPYTestCase): def test_bitwise_and(self): @njit(parallel={'offload':True}) def f(a, b): diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py index 0bd7dcbb69..5daf1fc813 100644 --- a/numba_dppy/tests/test_numpy_comparison_functions.py +++ b/numba_dppy/tests/test_numpy_comparison_functions.py @@ -5,11 +5,11 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_comparison_functions(DPPLTestCase): +class TestNumpy_comparison_functions(DPPYTestCase): a = np.array([4,5,6]) b = np.array([2,6,6]) def test_greater(self): diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py index 62b76b1ade..c05c10498d 100644 --- a/numba_dppy/tests/test_numpy_floating_functions.py +++ b/numba_dppy/tests/test_numpy_floating_functions.py @@ -4,12 +4,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import 
DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_floating_functions(DPPLTestCase): +class TestNumpy_floating_functions(DPPYTestCase): def test_isfinite(self): @njit(parallel={'offload':True}) def f(a): diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py index ddbb568ede..155b352c7e 100644 --- a/numba_dppy/tests/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -5,12 +5,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_math_functions(DPPLTestCase): +class TestNumpy_math_functions(DPPYTestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py index 8f61f941c9..7ce18b870a 100644 --- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -5,12 +5,12 @@ import sys import numpy as np from numba import njit -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase -class TestNumpy_math_functions(DPPLTestCase): +class TestNumpy_math_functions(DPPYTestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py index fe8c85d356..591fd2cb0e 100644 --- a/numba_dppy/tests/test_parfor_lower_message.py +++ b/numba_dppy/tests/test_parfor_lower_message.py @@ -1,8 +1,8 @@ import numpy as np import numba from numba import njit, prange -import numba_dppy, numba_dppy as dppl -from numba_dppy.testing import unittest, DPPLTestCase +import numba_dppy, numba_dppy as dppy +from numba_dppy.testing import unittest, DPPYTestCase from numba.tests.support import captured_stdout import dpctl @@ -19,7 +19,7 @@ def prange_example(): @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") -class TestParforMessage(DPPLTestCase): +class TestParforMessage(DPPYTestCase): def test_parfor_message(self): with dpctl.device_context("opencl:gpu") as gpu_queue: numba_dppy.compiler.DEBUG = 1 @@ -29,7 +29,7 @@ def test_parfor_message(self): jitted() numba_dppy.compiler.DEBUG = 0 - self.assertTrue("Parfor lowered on DPPL-device" in got.getvalue()) + self.assertTrue("Parfor lowered on DPPY-device" in got.getvalue()) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py index 317c2cbb2f..f4c13c4b1f 100644 --- a/numba_dppy/tests/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -6,13 +6,13 @@ import numpy as np import numba from numba import njit, prange -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest, expectedFailureIf -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase from numba.tests.support import captured_stdout -class TestPrange(DPPLTestCase): +class TestPrange(DPPYTestCase): def test_one_prange(self): @njit(parallel={'offload':True}) def f(a, b): @@ 
-118,8 +118,8 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) @@ -146,8 +146,8 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPL-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPL-device'), 0, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Parfor lowered on DPPY-device'), 2, stdout.getvalue()) + self.assertEqual(stdout.getvalue().count('Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) np.testing.assert_equal(res, jitted_res) diff --git a/numba_dppy/tests/test_print.py b/numba_dppy/tests/test_print.py index ca1e47978a..0bc4a7cc2b 100644 --- a/numba_dppy/tests/test_print.py +++ b/numba_dppy/tests/test_print.py @@ -5,24 +5,24 @@ import sys import numpy as np from numba import njit, prange -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestPrint(DPPLTestCase): - def test_print_dppl_kernel(self): - @dppl.func +class TestPrint(DPPYTestCase): + def test_print_dppy_kernel(self): + @dppy.func def g(a): print("value of a:", a) return a + 1 - @dppl.kernel + @dppy.kernel def f(a, b): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) b[i] = g(a[i]) print("value of b at:", i, "is", b[i]) @@ -32,7 +32,7 @@ def f(a, b): b = np.ones(N) with dpctl.device_context("opencl:gpu") as gpu_queue: - f[N, dppl.DEFAULT_LOCAL_SIZE](a, b) + f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) if __name__ == '__main__': diff --git a/numba_dppy/tests/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py index 3095497a66..8ec7b3d5a9 100644 --- a/numba_dppy/tests/test_sum_reduction.py +++ b/numba_dppy/tests/test_sum_reduction.py @@ -4,14 +4,14 @@ import math import time -import numba_dppy, numba_dppy as dppl +import numba_dppy, numba_dppy as dppy from numba_dppy.testing import unittest -from numba_dppy.testing import DPPLTestCase +from numba_dppy.testing import DPPYTestCase import dpctl -@dppl.kernel +@dppy.kernel def reduction_kernel(A, R, stride): - i = dppl.get_global_id(0) + i = dppy.get_global_id(0) # sum two element R[i] = A[i] + A[i+stride] # store the sum to be used in nex iteration @@ -19,7 +19,7 @@ def reduction_kernel(A, R, stride): @unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -class TestDPPLSumReduction(DPPLTestCase): +class TestDPPYSumReduction(DPPYTestCase): def test_sum_reduction(self): # This test will only work for even case N = 1024 @@ -36,7 +36,7 @@ def test_sum_reduction(self): while (total > 1): # call kernel global_size = total // 2 - reduction_kernel[global_size, dppl.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) total = total // 2 result = A_copy.sum() diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py 
index 12dc7b5ed3..04891ca296 100644
--- a/numba_dppy/tests/test_vectorize.py
+++ b/numba_dppy/tests/test_vectorize.py
@@ -5,12 +5,12 @@
 import sys
 import numpy as np
 from numba import njit, vectorize
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba_dppy.testing import unittest
-from numba_dppy.testing import DPPLTestCase
+from numba_dppy.testing import DPPYTestCase
 
 
-class TestVectorize(DPPLTestCase):
+class TestVectorize(DPPYTestCase):
     def test_vectorize(self):
 
         @vectorize(nopython=True)
diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py
index 0749ff3e89..e025a77784 100644
--- a/numba_dppy/tests/test_with_context.py
+++ b/numba_dppy/tests/test_with_context.py
@@ -2,18 +2,18 @@
 import numba
 import numpy as np
 from numba import njit
-import numba_dppy, numba_dppy as dppl
+import numba_dppy, numba_dppy as dppy
 from numba.core import errors
 from numba.tests.support import captured_stdout
-from numba_dppy.testing import DPPLTestCase, unittest, expectedFailureIf
+from numba_dppy.testing import DPPYTestCase, unittest, expectedFailureIf
 import dpctl
 
-class TestWithDPPLContext(DPPLTestCase):
+class TestWithDPPYContext(DPPYTestCase):
     @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available")
     @expectedFailureIf(sys.platform.startswith('win'))
-    def test_with_dppl_context_gpu(self):
+    def test_with_dppy_context_gpu(self):
 
         @njit
         def nested_func(a, b):
@@ -36,11 +36,11 @@ def func(b):
         func(expected)
 
         np.testing.assert_array_equal(expected, got_gpu)
-        self.assertTrue('Parfor lowered on DPPL-device' in got_gpu_message.getvalue())
+        self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue())
 
     @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available")
     @unittest.expectedFailure
-    def test_with_dppl_context_cpu(self):
+    def test_with_dppy_context_cpu(self):
 
         @njit
         def nested_func(a, b):
@@ -63,11 +63,11 @@ def func(b):
         func(expected)
 
         np.testing.assert_array_equal(expected, got_cpu)
-        self.assertTrue('Parfor lowered on DPPL-device' in got_cpu_message.getvalue())
+        self.assertTrue('Parfor lowered on DPPY-device' in got_cpu_message.getvalue())
 
     @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available")
-    def test_with_dppl_context_target(self):
+    def test_with_dppy_context_target(self):
 
         @njit(target='cpu')
         def nested_func_target(a, b):

From 023fef9518c72318084eea1ee7ae8fa79b4522d3 Mon Sep 17 00:00:00 2001
From: Reazul Hoque
Date: Wed, 9 Dec 2020 00:46:00 -0600
Subject: [PATCH 09/24] Pass to rewrite Numpy function names to be able to overload them for Numba-dppy pipeline (#52)

* Sum example
* Moved from infer_type, lower_builtin to overload
* Added two level module name functions
* Remove cython generated file
* Module name fix for moving to new extension
* Incomplete linalg.eig implementation
* Updated all dppl to dppy and moved rewrite_numpy_function_pass to its own file
* Import module at correct locations
* Added comments
* Added test and updated comments
* Revert unneeded changes
* Update Eigen implementation
* Remove eig implementation
* Add checking equivalent IR

Co-authored-by: reazul.hoque
---
 numba_dppy/device_init.py | 13 +-
 numba_dppy/dpctl_functions.py | 30 +++++
 numba_dppy/dpnp_glue/__init__.py | 0
 numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx | 44 +++++-
 numba_dppy/dpnp_glue/dpnpdecl.py | 10 ++
 numba_dppy/dpnp_glue/dpnpimpl.py | 89 +++++++++++++
 numba_dppy/dpnp_glue/stubs.py | 9 ++
 numba_dppy/dppy_passbuilder.py | 7 +
 numba_dppy/dppy_passes.py | 3 +-
numba_dppy/rename_numpy_functions_pass.py | 125 ++++++++++++++++++ .../tests/test_rename_numpy_function_pass.py | 67 ++++++++++ 11 files changed, 389 insertions(+), 8 deletions(-) create mode 100644 numba_dppy/dpctl_functions.py create mode 100644 numba_dppy/dpnp_glue/__init__.py create mode 100644 numba_dppy/dpnp_glue/dpnpdecl.py create mode 100644 numba_dppy/dpnp_glue/dpnpimpl.py create mode 100644 numba_dppy/dpnp_glue/stubs.py create mode 100644 numba_dppy/rename_numpy_functions_pass.py create mode 100644 numba_dppy/tests/test_rename_numpy_function_pass.py diff --git a/numba_dppy/device_init.py b/numba_dppy/device_init.py index c4506014a8..efec55ba83 100644 --- a/numba_dppy/device_init.py +++ b/numba_dppy/device_init.py @@ -18,6 +18,14 @@ CLK_GLOBAL_MEM_FENCE, ) +""" +We are importing dpnp stub module to make Numba recognize the +module when we rename Numpy functions. +""" +from .dpnp_glue.stubs import ( + dpnp +) + DEFAULT_LOCAL_SIZE = [] from . import initialize @@ -35,9 +43,4 @@ def is_available(): return dpctl.has_gpu_queues() -#def ocl_error(): -# """Returns None or an exception if the OpenCL driver fails to initialize. -# """ -# return driver.driver.initialization_error - initialize.initialize_all() diff --git a/numba_dppy/dpctl_functions.py b/numba_dppy/dpctl_functions.py new file mode 100644 index 0000000000..67bc358185 --- /dev/null +++ b/numba_dppy/dpctl_functions.py @@ -0,0 +1,30 @@ +from numba import types +from numba.core.typing import signature + + +class _DPCTL_FUNCTIONS: + @classmethod + def dpctl_get_current_queue(cls): + ret_type = types.voidptr + sig = signature(ret_type) + return types.ExternalFunction("DPCTLQueueMgr_GetCurrentQueue", sig) + + @classmethod + def dpctl_malloc_shared(cls): + ret_type = types.voidptr + sig = signature(ret_type, types.int64, types.voidptr) + return types.ExternalFunction("DPCTLmalloc_shared", sig) + + @classmethod + def dpctl_queue_memcpy(cls): + ret_type = types.void + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.int64 + ) + return types.ExternalFunction("DPCTLQueue_Memcpy", sig) + + @classmethod + def dpctl_free_with_queue(cls): + ret_type = types.void + sig = signature(ret_type, types.voidptr, types.voidptr) + return types.ExternalFunction("DPCTLfree_with_queue", sig) diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx index 8eba8bf74c..a63d4fdafa 100644 --- a/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx +++ b/numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx @@ -8,6 +8,7 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName": # need this cdef enum DPNPFuncName "DPNPFuncName": DPNP_FN_ABSOLUTE DPNP_FN_ADD + DPNP_FN_ARANGE DPNP_FN_ARCCOS DPNP_FN_ARCCOSH DPNP_FN_ARCSIN @@ -18,40 +19,77 @@ cdef extern from "backend_iface_fptr.hpp" namespace "DPNPFuncName": # need this DPNP_FN_ARGMAX DPNP_FN_ARGMIN DPNP_FN_ARGSORT + DPNP_FN_BITWISE_AND + DPNP_FN_BITWISE_OR + DPNP_FN_BITWISE_XOR DPNP_FN_CBRT DPNP_FN_CEIL + DPNP_FN_CHOLESKY + DPNP_FN_COPYSIGN + DPNP_FN_CORRELATE DPNP_FN_COS DPNP_FN_COSH DPNP_FN_COV DPNP_FN_DEGREES + DPNP_FN_DET DPNP_FN_DIVIDE DPNP_FN_DOT DPNP_FN_EIG + DPNP_FN_EIGVALS DPNP_FN_EXP DPNP_FN_EXP2 DPNP_FN_EXPM1 DPNP_FN_FABS + DPNP_FN_FFT_FFT DPNP_FN_FLOOR + DPNP_FN_FLOOR_DIVIDE DPNP_FN_FMOD - DPNP_FN_GAUSSIAN DPNP_FN_HYPOT + DPNP_FN_INVERT + DPNP_FN_LEFT_SHIFT DPNP_FN_LOG DPNP_FN_LOG10 DPNP_FN_LOG1P 
DPNP_FN_LOG2 DPNP_FN_MATMUL + DPNP_FN_MATRIX_RANK DPNP_FN_MAX DPNP_FN_MAXIMUM DPNP_FN_MEAN DPNP_FN_MEDIAN DPNP_FN_MIN DPNP_FN_MINIMUM + DPNP_FN_MODF DPNP_FN_MULTIPLY DPNP_FN_POWER DPNP_FN_PROD - DPNP_FN_UNIFORM DPNP_FN_RADIANS + DPNP_FN_REMAINDER DPNP_FN_RECIP + DPNP_FN_RIGHT_SHIFT + DPNP_FN_RNG_BETA + DPNP_FN_RNG_BINOMIAL + DPNP_FN_RNG_CHISQUARE + DPNP_FN_RNG_EXPONENTIAL + DPNP_FN_RNG_GAMMA + DPNP_FN_RNG_GAUSSIAN + DPNP_FN_RNG_GEOMETRIC + DPNP_FN_RNG_GUMBEL + DPNP_FN_RNG_HYPERGEOMETRIC + DPNP_FN_RNG_LAPLACE + DPNP_FN_RNG_LOGNORMAL + DPNP_FN_RNG_MULTINOMIAL + DPNP_FN_RNG_MULTIVARIATE_NORMAL + DPNP_FN_RNG_NEGATIVE_BINOMIAL + DPNP_FN_RNG_NORMAL + DPNP_FN_RNG_POISSON + DPNP_FN_RNG_RAYLEIGH + DPNP_FN_RNG_STANDARD_CAUCHY + DPNP_FN_RNG_STANDARD_EXPONENTIAL + DPNP_FN_RNG_STANDARD_GAMMA + DPNP_FN_RNG_STANDARD_NORMAL + DPNP_FN_RNG_UNIFORM + DPNP_FN_RNG_WEIBULL DPNP_FN_SIGN DPNP_FN_SIN DPNP_FN_SINH @@ -109,6 +147,8 @@ cdef DPNPFuncName get_DPNPFuncName_from_str(name): return DPNPFuncName.DPNP_FN_ARGSORT elif name == "dpnp_cov": return DPNPFuncName.DPNP_FN_COV + elif name == "dpnp_eig": + return DPNPFuncName.DPNP_FN_EIG else: return DPNPFuncName.DPNP_FN_DOT diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py new file mode 100644 index 0000000000..e77739eeda --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnpdecl.py @@ -0,0 +1,10 @@ +from numba.core.typing.templates import (AttributeTemplate, infer_getattr) +import numba_dppy +from numba import types + +@infer_getattr +class DppyDpnpTemplate(AttributeTemplate): + key = types.Module(numba_dppy) + + def resolve_dpnp(self, mod): + return types.Module(numba_dppy.dpnp) diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py new file mode 100644 index 0000000000..d6e53c4b99 --- /dev/null +++ b/numba_dppy/dpnp_glue/dpnpimpl.py @@ -0,0 +1,89 @@ +from numba.core.imputils import lower_builtin +import numba_dppy.experimental_numpy_lowering_overload as dpnp_lowering +from numba import types +from numba.core.typing import signature +from numba.core.extending import overload, register_jitable +from . import stubs +import numpy as np +from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS + + +def get_dpnp_fptr(fn_name, type_names): + from . 
import dpnp_fptr_interface as dpnp_glue + + f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) + return f_ptr + + +@register_jitable +def _check_finite_matrix(a): + for v in np.nditer(a): + if not np.isfinite(v.item()): + raise np.linalg.LinAlgError("Array must not contain infs or NaNs.") + + +@register_jitable +def _dummy_liveness_func(a): + """pass a list of variables to be preserved through dead code elimination""" + return a[0] + + +class RetrieveDpnpFnPtr(types.ExternalFunctionPointer): + def __init__(self, fn_name, type_names, sig, get_pointer): + self.fn_name = fn_name + self.type_names = type_names + super(RetrieveDpnpFnPtr, self).__init__(sig, get_pointer) + + +class _DPNP_EXTENSION: + def __init__(self, name): + dpnp_lowering.ensure_dpnp(name) + + @classmethod + def dpnp_sum(cls, fn_name, type_names): + ret_type = types.void + sig = signature(ret_type, types.voidptr, types.voidptr, types.int64) + f_ptr = get_dpnp_fptr(fn_name, type_names) + + def get_pointer(obj): + return f_ptr + + return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + + +@overload(stubs.dpnp.sum) +def dpnp_sum_impl(a): + dpnp_extension = _DPNP_EXTENSION("sum") + dpctl_functions = _DPCTL_FUNCTIONS() + + dpnp_sum = dpnp_extension.dpnp_sum("dpnp_sum", [a.dtype.name, "NONE"]) + + get_sycl_queue = dpctl_functions.dpctl_get_current_queue() + allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() + copy_usm = dpctl_functions.dpctl_queue_memcpy() + free_usm = dpctl_functions.dpctl_free_with_queue() + + def dpnp_sum_impl(a): + if a.size == 0: + raise ValueError("Passed Empty array") + + sycl_queue = get_sycl_queue() + a_usm = allocate_usm_shared(a.size * a.itemsize, sycl_queue) + copy_usm(sycl_queue, a_usm, a.ctypes, a.size * a.itemsize) + + out_usm = allocate_usm_shared(a.itemsize, sycl_queue) + + dpnp_sum(a_usm, out_usm, a.size) + + out = np.empty(1, dtype=a.dtype) + copy_usm(sycl_queue, out.ctypes, out_usm, out.size * out.itemsize) + + free_usm(a_usm, sycl_queue) + free_usm(out_usm, sycl_queue) + + + _dummy_liveness_func([out.size]) + + return out[0] + + return dpnp_sum_impl diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py new file mode 100644 index 0000000000..d51cd28ead --- /dev/null +++ b/numba_dppy/dpnp_glue/stubs.py @@ -0,0 +1,9 @@ +from numba_dppy.ocl.stubs import Stub + +class dpnp(Stub): + """dpnp namespace + """ + _description_ = '' + + class sum(Stub): + pass diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py index 0a32a099cf..b3c632a85a 100644 --- a/numba_dppy/dppy_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -27,6 +27,8 @@ DPPYNoPythonBackend ) +from .rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions + class DPPYPassBuilder(object): """ This is the DPPY pass builder to run Intel GPU/CPU specific @@ -44,6 +46,11 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(IRProcessing, "processing IR") pm.add_pass(WithLifting, "Handle with contexts") + # this pass rewrites name of NumPy functions we intend to overload + pm.add_pass(DPPYRewriteOverloadedFunctions, + "Rewrite name of Numpy functions to overload already overloaded function", + ) + # this pass adds required logic to overload default implementation of # Numpy functions pm.add_pass(DPPYAddNumpyOverloadPass, "dppy add typing template for Numpy functions") diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py index 0bb2eadb48..c73f5a7736 100644 --- a/numba_dppy/dppy_passes.py +++ b/numba_dppy/dppy_passes.py @@ -3,6 +3,7 @@ import 
warnings import numpy as np +import numba from numba.core import ir import weakref from collections import namedtuple, deque @@ -49,7 +50,7 @@ def __init__(self): def run_pass(self, state): if dpnp_available(): typingctx = state.typingctx - from numba.core.typing.templates import builtin_registry as reg, infer_global + from numba.core.typing.templates import (builtin_registry as reg, infer_global) from numba.core.typing.templates import (AbstractTemplate, CallableTemplate, signature) from numba.core.typing.npydecl import MatMulTyperMixin diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py new file mode 100644 index 0000000000..a0c4b89b3e --- /dev/null +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -0,0 +1,125 @@ +from numba.core import ir +from numba.core.compiler_machinery import FunctionPass, register_pass +from numba.core.ir_utils import ( + find_topo_order, + mk_unique_var, + remove_dead, + simplify_CFG, +) +import numba_dppy + +rewrite_function_name_map = {"sum": (["np"], "sum")} + + +class RewriteNumPyOverloadedFunctions(object): + def __init__(self, state, rewrite_function_name_map=rewrite_function_name_map): + self.state = state + self.function_name_map = rewrite_function_name_map + + def run(self): + """ + This function rewrites the name of NumPy functions that exist in self.function_name_map + e.g np.sum(a) would produce the following: + + np.sum() --> numba_dppy.dpnp.sum() + + --------------------------------------------------------------------------------------- + Numba IR Before Rewrite: + --------------------------------------------------------------------------------------- + + $2load_global.0 = global(np: ) ['$2load_global.0'] + $4load_method.1 = getattr(value=$2load_global.0, attr=sum) ['$2load_global.0', '$4load_method.1'] + $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)], + kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a'] + + --------------------------------------------------------------------------------------- + Numba IR After Rewrite: + --------------------------------------------------------------------------------------- + + $dppy_replaced_var.0 = global(numba_dppy: ) ['$dppy_replaced_var.0'] + $dpnp_var.1 = getattr(value=$dppy_replaced_var.0, attr=dpnp) ['$dpnp_var.1', '$dppy_replaced_var.0'] + $4load_method.1 = getattr(value=$dpnp_var.1, attr=sum) ['$4load_method.1', '$dpnp_var.1'] + $8call_method.3 = call $4load_method.1(a, func=$4load_method.1, args=[Var(a, test_rewrite.py:7)], + kws=(), vararg=None) ['$4load_method.1', '$8call_method.3', 'a'] + + --------------------------------------------------------------------------------------- + """ + func_ir = self.state.func_ir + blocks = func_ir.blocks + topo_order = find_topo_order(blocks) + + for label in topo_order: + block = blocks[label] + saved_arr_arg = {} + new_body = [] + for stmt in block.body: + if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr): + lhs = stmt.target.name + rhs = stmt.value + # replace np.FOO with name from self.function_name_map["FOO"] + # e.g. 
np.sum will be replaced with numba_dppy.dpnp.sum + if rhs.op == "getattr" and rhs.attr in self.function_name_map: + module_node = block.find_variable_assignment( + rhs.value.name + ).value + if ( + isinstance(module_node, ir.Global) + and module_node.name in self.function_name_map[rhs.attr][0] + ) or ( + isinstance(module_node, ir.Expr) + and module_node.attr in self.function_name_map[rhs.attr][0] + ): + rhs = stmt.value + rhs.attr = self.function_name_map[rhs.attr][1] + + global_module = rhs.value + saved_arr_arg[lhs] = global_module + + scope = global_module.scope + loc = global_module.loc + + g_dppy_var = ir.Var( + scope, mk_unique_var("$2load_global"), loc + ) + # We are trying to rename np.function_name/np.linalg.function_name with + # numba_dppy.dpnp.function_name. + # Hence, we need to have a global variable representing module numba_dppy. + # Next, we add attribute dpnp to global module numba_dppy to + # represent numba_dppy.dpnp. + g_dppy = ir.Global("numba_dppy", numba_dppy, loc) + g_dppy_assign = ir.Assign(g_dppy, g_dppy_var, loc) + + dpnp_var = ir.Var(scope, mk_unique_var("$4load_attr"), loc) + getattr_dpnp = ir.Expr.getattr(g_dppy_var, "dpnp", loc) + dpnp_assign = ir.Assign(getattr_dpnp, dpnp_var, loc) + + rhs.value = dpnp_var + new_body.append(g_dppy_assign) + new_body.append(dpnp_assign) + func_ir._definitions[dpnp_var.name] = [getattr_dpnp] + func_ir._definitions[g_dppy_var.name] = [g_dppy] + + new_body.append(stmt) + block.body = new_body + + +@register_pass(mutates_CFG=True, analysis_only=False) +class DPPYRewriteOverloadedFunctions(FunctionPass): + _name = "dppy_rewrite_overloaded_functions_pass" + + def __init__(self): + FunctionPass.__init__(self) + import numba_dppy.dpnp_glue.dpnpdecl + import numba_dppy.dpnp_glue.dpnpimpl + + def run_pass(self, state): + rewrite_function_name_pass = RewriteNumPyOverloadedFunctions( + state, rewrite_function_name_map + ) + + rewrite_function_name_pass.run() + + remove_dead(state.func_ir.blocks, state.func_ir.arg_names, state.func_ir) + state.func_ir.blocks = simplify_CFG(state.func_ir.blocks) + + return True diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py new file mode 100644 index 0000000000..b06a03b5e0 --- /dev/null +++ b/numba_dppy/tests/test_rename_numpy_function_pass.py @@ -0,0 +1,67 @@ +#! 
/usr/bin/env python
+
+import unittest
+import numpy as np
+
+import numba
+from numba import njit, prange
+import numba_dppy, numba_dppy as dppy
+
+
+from numba.core import compiler
+from numba_dppy.rename_numpy_functions_pass import DPPYRewriteOverloadedFunctions
+
+
+class MyPipeline(object):
+    def __init__(self, test_ir):
+        self.state = compiler.StateDict()
+        self.state.func_ir = test_ir
+
+
+def check_equivalent(expected_ir, got_ir):
+    expected_block_body = expected_ir.blocks[0].body
+    got_block_body = got_ir.blocks[0].body
+
+    if len(expected_block_body) != len(got_block_body):
+        return False
+
+    for i in range(len(expected_block_body)):
+        expected_stmt = expected_block_body[i]
+        got_stmt = got_block_body[i]
+        if type(expected_stmt) != type(got_stmt):
+            return False
+        else:
+            if isinstance(expected_stmt, numba.core.ir.Assign):
+                if isinstance(expected_stmt.value, numba.core.ir.Global):
+                    if (expected_stmt.value.name != got_stmt.value.name and
+                        expected_stmt.value.name != "numba_dppy"):
+                        return False
+                elif isinstance(expected_stmt.value, numba.core.ir.Expr):
+                    # should get "dpnp" and "sum" as attr
+                    if expected_stmt.value.op == "getattr":
+                        if expected_stmt.value.attr != got_stmt.value.attr:
+                            return False
+    return True
+
+
+class TestRenameNumpyFunctionsPass(unittest.TestCase):
+    def test_rename(self):
+        def expected(a):
+            return numba_dppy.dpnp.sum(a)
+
+        def got(a):
+            return np.sum(a)
+
+        expected_ir = compiler.run_frontend(expected)
+        got_ir = compiler.run_frontend(got)
+
+        pipeline = MyPipeline(got_ir)
+
+        rewrite_numpy_functions_pass = DPPYRewriteOverloadedFunctions()
+        rewrite_numpy_functions_pass.run_pass(pipeline.state)
+
+        self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir))
+
+
+if __name__ == "__main__":
+    unittest.main()

From 059a5a3aa4e8a2b81ebcca34ba929a8f6320868f Mon Sep 17 00:00:00 2001
From: "Todd A. Anderson"
Date: Wed, 9 Dec 2020 18:36:52 -0600
Subject: [PATCH 10/24] Store allocation queue on a per-object basis.

---
 numba_dppy/dppy_rt.c | 53 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 6 deletions(-)

diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c
index dd892055bf..610c45018e 100644
--- a/numba_dppy/dppy_rt.c
+++ b/numba_dppy/dppy_rt.c
@@ -5,33 +5,74 @@
 #include 
 
 NRT_ExternalAllocator usmarray_allocator;
+NRT_external_malloc_func internal_allocator = NULL;
+NRT_external_free_func internal_free = NULL;
+void *(*get_queue_internal)(void) = NULL;
+void (*free_queue_internal)(void*) = NULL;
+
+void * save_queue_allocator(size_t size, void *opaque) {
+    // Allocate a pointer-size more space than needed.
+    int new_size = size + sizeof(void*);
+    // Get the current queue.
+    void *cur_queue = get_queue_internal(); // this makes a copy
+    // Use that queue to allocate.
+    void *data = internal_allocator(new_size, cur_queue);
+    // Set first pointer-sized data in allocated space to be the current queue.
+    *(void**)data = cur_queue;
+    // Return the pointer after this queue in memory.
+    return (char*)data + sizeof(void*);
+}
+
+void save_queue_deallocator(void *data, void *opaque) {
+    // Compute original allocation location by subtracting the length
+    // of the queue pointer from the data location that Numba thinks
+    // starts the object.
+    void *orig_data = (char*)data - sizeof(void*);
+    // Get the queue from the original data by dereferencing the first qword.
+    void *obj_queue = *(void**)orig_data;
+    // Free the space using the correct queue.
+    internal_free(orig_data, obj_queue);
+    // Free the queue itself.
+ free_queue_internal(obj_queue); +} void usmarray_memsys_init(void) { - void *(*get_queue)(void); char *lib_name = "libDPCTLSyclInterface.so"; char *malloc_name = "DPCTLmalloc_shared"; char *free_name = "DPCTLfree_with_queue"; char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue"; + char *free_queue_name = "DPCTLQueue_Delete"; void *sycldl = dlopen(lib_name, RTLD_NOW); assert(sycldl != NULL); - usmarray_allocator.malloc = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + internal_allocator = (NRT_external_malloc_func)dlsym(sycldl, malloc_name); + usmarray_allocator.malloc = save_queue_allocator; if (usmarray_allocator.malloc == NULL) { printf("Did not find %s in %s\n", malloc_name, lib_name); exit(-1); } + usmarray_allocator.realloc = NULL; - usmarray_allocator.free = (NRT_external_free_func)dlsym(sycldl, free_name); + + internal_free = (NRT_external_free_func)dlsym(sycldl, free_name); + usmarray_allocator.free = save_queue_deallocator; if (usmarray_allocator.free == NULL) { printf("Did not find %s in %s\n", free_name, lib_name); exit(-1); } - get_queue = (void *(*))dlsym(sycldl, get_queue_name); - if (get_queue == NULL) { + + get_queue_internal = (void *(*)(void))dlsym(sycldl, get_queue_name); + if (get_queue_internal == NULL) { printf("Did not find %s in %s\n", get_queue_name, lib_name); exit(-1); } - usmarray_allocator.opaque_data = get_queue(); + usmarray_allocator.opaque_data = NULL; + + free_queue_internal = (void (*)(void*))dlsym(sycldl, free_queue_name); + if (free_queue_internal == NULL) { + printf("Did not find %s in %s\n", free_queue_name, lib_name); + exit(-1); + } } void * usmarray_get_ext_allocator(void) { From 4e99a557dae35b91c5cfce6775ce9b63ed97a6ca Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Thu, 10 Dec 2020 11:34:14 -0600 Subject: [PATCH 11/24] Add imports to usmarray module and fixed setup.py extension initialization --- numba_dppy/numpy_usm_shared.py | 53 ++++++++++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 0d190b1317..16b7f4fee8 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -1,3 +1,56 @@ +import numpy as np +from inspect import getmembers, isfunction, isclass, isbuiltin +from numbers import Number +import numba +from types import FunctionType as ftype, BuiltinFunctionType as bftype +from numba import types +from numba.extending import typeof_impl, register_model, type_callable, lower_builtin +from numba.np import numpy_support +from numba.core.pythonapi import box, allocator +from llvmlite import ir +import llvmlite.llvmpy.core as lc +import llvmlite.binding as llb +from numba.core import types, cgutils, config +import builtins +import sys +from ctypes.util import find_library +from numba.core.typing.templates import builtin_registry as templates_registry +from numba.core.typing.npydecl import registry as typing_registry +from numba.core.imputils import builtin_registry as lower_registry +import importlib +import functools +import inspect +from numba.core.typing.templates import CallableTemplate +from numba.np.arrayobj import _array_copy + +from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list + + +debug = config.DEBUG + +def dprint(*args): + if debug: + print(*args) + sys.stdout.flush() + +# # This code makes it so that Numba can contain calls into the DPPLSyclInterface library. 
+# sycl_mem_lib = find_library('DPCTLSyclInterface') +# dprint("sycl_mem_lib:", sycl_mem_lib) +# # Load the symbols from the DPPL Sycl library. +# llb.load_library_permanently(sycl_mem_lib) + +import dpctl +from dpctl.memory import MemoryUSMShared +import numba_dppy._dppy_rt + +# functions_list = [o[0] for o in getmembers(np) if isfunction(o[1]) or isbuiltin(o[1])] +# class_list = [o for o in getmembers(np) if isclass(o[1])] + +# Register the helper function in dppl_rt so that we can insert calls to them via llvmlite. +for py_name, c_address in numba_dppy._dppy_rt.c_helpers.items(): + llb.add_symbol(py_name, c_address) + + # This class creates a type in Numba. class UsmSharedArrayType(types.Array): def __init__( diff --git a/setup.py b/setup.py index b870c50a8f..11db126686 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def get_ext_modules(): cmdclass=versioneer.get_cmdclass(), entry_points={ "numba_extensions": [ - "init = numba_dppy.usmarray:numba_register", + "init = numba_dppy.numpy_usm_shared:numba_register", ]}, ) From 595f94b586cec629c9cd46bea143c8f61e721948 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Thu, 10 Dec 2020 20:28:25 -0600 Subject: [PATCH 12/24] Register is_usm_callback with dpctl to say whether a given Python object is a USM MemInfo. --- numba_dppy/numpy_usm_shared.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 16b7f4fee8..8649826a53 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -23,6 +23,7 @@ from numba.core.typing.templates import CallableTemplate from numba.np.arrayobj import _array_copy +import dpctl.dptensor.numpy_usm_shared as numpy_usm_shared from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list @@ -152,11 +153,25 @@ def allocator_UsmArray(context, builder, size, align): registered = False +def is_usm_callback(obj): + if isinstance(obj, numba.core.runtime._nrt_python._MemInfo): + mobj = obj + while isinstance(mobj, numba.core.runtime._nrt_python._MemInfo): + ea = mobj.external_allocator + d = mobj.data + dppl_rt_allocator = numba_dppy._dppy_rt.get_external_allocator() + if ea == dppl_rt_allocator: + return True + mobj = mobj.parent + if isinstance(mobj, ndarray): + mobj = mobj.base + return False def numba_register(): global registered if not registered: registered = True + ndarray.add_external_usm_checker(is_usm_callback) numba_register_typing() numba_register_lower_builtin() @@ -217,7 +232,11 @@ def numba_register_lower_builtin(): cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: - usmarray_func = eval(func.__name__) + try: + usmarray_func = eval("numpy_usm_shared."+func.__name__) + except: + dprint("failed to eval", func.__name__) + continue dprint( "need to re-register lowerer for usmarray", impl, func, types, usmarray_func ) @@ -257,7 +276,11 @@ def numba_register_typing(): assert len(typ.templates) == 1 # template is the typing class to invoke generic() upon. template = typ.templates[0] - dpval = eval(val.__name__) + try: + dpval = eval("numpy_usm_shared."+val.__name__) + except: + dprint("failed to eval", val.__name__) + continue dprint("need to re-register for usmarray", val, typ, typ.typing_key) """ if debug: From c87a94b4bfb59ffe82eabd24d589aea536c7fd67 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 12:24:28 -0600 Subject: [PATCH 13/24] Remove printf. 
--- numba_dppy/dppy_rt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c index 610c45018e..d637064989 100644 --- a/numba_dppy/dppy_rt.c +++ b/numba_dppy/dppy_rt.c @@ -76,7 +76,6 @@ void usmarray_memsys_init(void) { } void * usmarray_get_ext_allocator(void) { - printf("usmarray_get_ext_allocator %p\n", &usmarray_allocator); return (void*)&usmarray_allocator; } From 12148936265938c31344cfaf83344cced3234d00 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 12:30:19 -0600 Subject: [PATCH 14/24] There were some spots where there was a silent assumption that the class and the Numba integration were in the same file. I changed those to explicitly refer to the usmarray module in dpctl. --- numba_dppy/numpy_usm_shared.py | 50 ++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 8649826a53..9c8c7855ba 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -23,8 +23,8 @@ from numba.core.typing.templates import CallableTemplate from numba.np.arrayobj import _array_copy -import dpctl.dptensor.numpy_usm_shared as numpy_usm_shared -from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list +import dpctl.dptensor.numpy_usm_shared as nus +from dpctl.dptensor.numpy_usm_shared import ndarray, functions_list, class_list debug = config.DEBUG @@ -233,7 +233,7 @@ def numba_register_lower_builtin(): cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: try: - usmarray_func = eval("numpy_usm_shared."+func.__name__) + usmarray_func = eval("dpctl.dptensor.numpy_usm_shared." + func.__name__) except: dprint("failed to eval", func.__name__) continue @@ -260,28 +260,44 @@ def numba_register_typing(): # For all Numpy identifiers that have been registered for typing in Numba... for ig in typing_registry.globals: val, typ = ig + dprint("Numpy registered:", val, type(val), typ, type(typ)) # If it is a Numpy function... if isinstance(val, (ftype, bftype)): # If we have overloaded that function in the usmarray module (always True right now)... if val.__name__ in functions_list: todo.append(ig) if isinstance(val, type): - todo_classes.append(ig) + if isinstance(typ, numba.core.types.functions.Function): + todo.append(ig) + elif isinstance(typ, numba.core.types.functions.NumberClass): + pass + #todo_classes.append(ig) for tgetattr in templates_registry.attributes: if tgetattr.key == types.Array: todo_getattr.append(tgetattr) + for val, typ in todo_classes: + dprint("todo_classes:", val, typ, type(typ)) + + try: + dptype = eval("dpctl.dptensor.numpy_usm_shared." + val.__name__) + except: + dprint("failed to eval", val.__name__) + continue + + typing_registry.register_global(dptype, numba.core.types.NumberClass(typ.instance_type)) + for val, typ in todo: assert len(typ.templates) == 1 # template is the typing class to invoke generic() upon. template = typ.templates[0] + dprint("need to re-register for usmarray", val, typ, typ.typing_key) try: - dpval = eval("numpy_usm_shared."+val.__name__) + dpval = eval("dpctl.dptensor.numpy_usm_shared." 
+ val.__name__) except: dprint("failed to eval", val.__name__) continue - dprint("need to re-register for usmarray", val, typ, typ.typing_key) """ if debug: print("--------------------------------------------------------------") @@ -307,9 +323,7 @@ def set_key_original(cls, key, original): def generic_impl(self): original_typer = self.__class__.original.generic(self.__class__.original) ot_argspec = inspect.getfullargspec(original_typer) - # print("ot_argspec:", ot_argspec) astr = argspec_to_string(ot_argspec) - # print("astr:", astr) typer_func = """def typer({}): original_res = original_typer({}) @@ -321,8 +335,6 @@ def generic_impl(self): astr, ",".join(ot_argspec.args) ) - # print("typer_func:", typer_func) - try: gs = globals() ls = locals() @@ -344,7 +356,6 @@ def generic_impl(self): print("eval failed!", sys.exc_info()[0]) sys.exit(0) - # print("exec_res:", exec_res) return exec_res new_usmarray_template = type( @@ -370,7 +381,6 @@ def set_key(cls, key): def getattr_impl(self, attr): if attr.startswith("resolve_"): - # print("getattr_impl starts with resolve_:", self, type(self), attr) def wrapper(*args, **kwargs): attr_res = tgetattr.__getattribute__(self, attr)(*args, **kwargs) if isinstance(attr_res, types.Array): @@ -394,15 +404,7 @@ def wrapper(*args, **kwargs): templates_registry.register_attr(new_usmarray_template) -def from_ndarray(x): - return copy(x) - - -def as_ndarray(x): - return np.copy(x) - - -@typing_registry.register_global(as_ndarray) +@typing_registry.register_global(nus.as_ndarray) class DparrayAsNdarray(CallableTemplate): def generic(self): def typer(arg): @@ -411,7 +413,7 @@ def typer(arg): return typer -@typing_registry.register_global(from_ndarray) +@typing_registry.register_global(nus.from_ndarray) class DparrayFromNdarray(CallableTemplate): def generic(self): def typer(arg): @@ -420,11 +422,11 @@ def typer(arg): return typer -@lower_registry.lower(as_ndarray, UsmSharedArrayType) +@lower_registry.lower(nus.as_ndarray, UsmSharedArrayType) def usmarray_conversion_as(context, builder, sig, args): return _array_copy(context, builder, sig, args) -@lower_registry.lower(from_ndarray, types.Array) +@lower_registry.lower(nus.from_ndarray, types.Array) def usmarray_conversion_from(context, builder, sig, args): return _array_copy(context, builder, sig, args) From a2b2bb75794179ba18173af43a85e229f5fc4248 Mon Sep 17 00:00:00 2001 From: "Todd A. Anderson" Date: Fri, 11 Dec 2020 14:45:54 -0600 Subject: [PATCH 15/24] Found another spot where the current module was being used rather than numpy_usm_shared in dpctl.dptensor. This fixes the ndindex issue. --- numba_dppy/numpy_usm_shared.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/numba_dppy/numpy_usm_shared.py b/numba_dppy/numpy_usm_shared.py index 9c8c7855ba..0f058bc778 100644 --- a/numba_dppy/numpy_usm_shared.py +++ b/numba_dppy/numpy_usm_shared.py @@ -230,7 +230,6 @@ def numba_register_lower_builtin(): for lg in todo_getattr: lower_registry.getattrs.append(lg) - cur_mod = importlib.import_module(__name__) for impl, func, types in todo + todo_builtin: try: usmarray_func = eval("dpctl.dptensor.numpy_usm_shared." 
+ func.__name__) @@ -240,7 +239,7 @@ def numba_register_lower_builtin(): dprint( "need to re-register lowerer for usmarray", impl, func, types, usmarray_func ) - new_impl = copy_func_for_usmarray(impl, cur_mod) + new_impl = copy_func_for_usmarray(impl, nus) lower_registry.functions.append((new_impl, usmarray_func, types)) @@ -327,7 +326,6 @@ def generic_impl(self): typer_func = """def typer({}): original_res = original_typer({}) - #print("original_res:", original_res) if isinstance(original_res, types.Array): return UsmSharedArrayType(dtype=original_res.dtype, ndim=original_res.ndim, layout=original_res.layout) From 0769a9724209b90028fc1cc916097d48429f6162 Mon Sep 17 00:00:00 2001 From: etotmeni Date: Tue, 15 Dec 2020 06:47:38 -0600 Subject: [PATCH 16/24] Fix numba path --- setup.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 11db126686..f5882cceb4 100644 --- a/setup.py +++ b/setup.py @@ -3,16 +3,26 @@ from Cython.Build import cythonize import versioneer +import sys + + +def find_numba(): + sys_packages = sys.path + for pcg in sys_packages: + if pcg.find("/numba-0") != -1: + numba_dir = pcg + return numba_dir def get_ext_modules(): ext_modules = [] + numba_dir = find_numba() ext_dppy = Extension( name="numba_dppy._dppy_rt", sources=["numba_dppy/dppy_rt.c"], - include_dirs=["../numba/numba"], # Need to get rid of relative paths. - depends=["../numba/numba/core/runtime/nrt_external.h", "../numba/numba/core/runtime/nrt.h", "../numba/numba/_pymodule.h"], + include_dirs=[numba_dir + "/numba"], + depends=[numba_dir + "/numba/core/runtime/nrt_external.h", numba_dir + "/numba/core/runtime/nrt.h", numba_dir + "/numba/_pymodule.h"], ) ext_modules += [ext_dppy] From eb53c26bc7ec73fa8275de15cf97cdda73d461ce Mon Sep 17 00:00:00 2001 From: etotmeni Date: Tue, 15 Dec 2020 06:51:35 -0600 Subject: [PATCH 17/24] fix --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f5882cceb4..dc950e37d6 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ def find_numba(): + numba_dir = "" sys_packages = sys.path for pcg in sys_packages: if pcg.find("/numba-0") != -1: From f4333ddbcecadd29193219ac73aea4402ad74a1b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 15 Dec 2020 16:31:00 +0300 Subject: [PATCH 18/24] Convert tests for USM array to unittest (#118) * Split tests for usmarray in separate unittest test cases * Remove prints and use unittest assertions * Move functions to tests * Give names to functions * Add expectedFailure for failed tests * Clean code --- numba_dppy/tests/test_usmarray.py | 349 ++++++++++++++---------------- 1 file changed, 162 insertions(+), 187 deletions(-) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index fe1be71c9e..6ee21ab3da 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -1,228 +1,203 @@ -from __future__ import print_function, division, absolute_import - import numba -import dpctl.dptensor.numpy_usm_shared as usmarray import numpy -import sys - - -def p1(a): - return a * 2.0 + 13 +import unittest +import dpctl.dptensor.numpy_usm_shared as usmarray -f1 = numba.njit(p1) +from numba_dppy.testing import DPPYTestCase @numba.njit() -def f2(a): - return a +def numba_mul_add(a): + return a * 2.0 + 13 @numba.njit() -def f3(a, b): # a is usmarray, b is numpy - return a * usmarray.asarray(b) +def numba_add_const(a): + return a + 13 @numba.njit() -def f4(): - return usmarray.ones(10) - - -def p5(a, b): # a is 
usmarray, b is numpy +def numba_mul(a, b): # a is usmarray, b is numpy return a * b -f5 = numba.njit(p5) - - @numba.njit() -def f6(a): - return a + 13 +def numba_mul_usmarray_asarray(a, b): # a is usmarray, b is numpy + return a * usmarray.asarray(b) -@numba.njit() -def f7(a): # a is usmarray - # implicit conversion of a to numpy.ndarray - b = numpy.ones(10) - c = a * b - d = a.argsort() # with no implicit conversion this fails +# @numba.njit() +# def f7(a): # a is usmarray +# # implicit conversion of a to numpy.ndarray +# b = numpy.ones(10) +# c = a * b +# d = a.argsort() # with no implicit conversion this fails @numba.njit -def f8(a): +def numba_usmarray_as_ndarray(a): return usmarray.as_ndarray(a) @numba.njit -def f9(a): +def numba_usmarray_from_ndarray(a): return usmarray.from_ndarray(a) +@numba.njit() +def numba_usmarray_ones(): + return usmarray.ones(10) + + @numba.njit -def f10(): +def numba_usmarray_empty(): return usmarray.empty((10, 10)) +@numba.njit() +def numba_identity(a): + return a + + @numba.njit -def f11(x): +def numba_shape(x): return x.shape @numba.njit -def f12(x): +def numba_T(x): return x.T -# -------------------------------------------------------------------------------- - -print("------------------- Testing Python Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = p1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing Numba Numpy") -sys.stdout.flush() -z1 = numpy.ones(10) -z2 = f1(z1) -print("z2:", z2, type(z2)) -assert type(z2) == numpy.ndarray - -print("------------------- Testing usmarray ones") -sys.stdout.flush() -a = usmarray.ones(10) -print("a:", a, type(a)) -assert isinstance(a, usmarray.ndarray) -assert usmarray.has_array_interface(a) - -print("------------------- Testing usmarray.usmarray.as_ndarray") -sys.stdout.flush() -nd1 = a.as_ndarray() -print("nd1:", nd1, type(nd1)) -assert type(nd1) == numpy.ndarray - -print("------------------- Testing usmarray.as_ndarray") -sys.stdout.flush() -nd2 = usmarray.as_ndarray(a) -print("nd2:", nd2, type(nd2)) -assert type(nd2) == numpy.ndarray - -print("------------------- Testing usmarray.from_ndarray") -sys.stdout.flush() -dp1 = usmarray.from_ndarray(nd2) -print("dp1:", dp1, type(dp1)) -assert isinstance(dp1, usmarray.ndarray) -assert usmarray.has_array_interface(dp1) - -print("------------------- Testing usmarray multiplication") -sys.stdout.flush() -c = a * 5 -print("c", c, type(c)) -assert isinstance(c, usmarray.ndarray) -assert usmarray.has_array_interface(c) - -print("------------------- Testing Python usmarray") -sys.stdout.flush() -b = p1(c) -print("b:", b, type(b)) -assert isinstance(b, usmarray.ndarray) -assert usmarray.has_array_interface(b) -del b - -print("------------------- Testing Python mixing usmarray and numpy.ndarray") -sys.stdout.flush() -h = p5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, usmarray.ndarray) -assert usmarray.has_array_interface(h) -del h - -print("------------------- Testing Numba usmarray 2") -sys.stdout.flush() -d = f2(a) -print("d:", d, type(d)) -assert isinstance(d, usmarray.ndarray) -assert usmarray.has_array_interface(d) -del d - -print("------------------- Testing Numba usmarray") -sys.stdout.flush() -b = f1(c) -print("b:", b, type(b)) -assert isinstance(b, usmarray.ndarray) -assert usmarray.has_array_interface(b) -del b - -""" -print("------------------- Testing Numba usmarray constructor from numpy.ndarray") -sys.stdout.flush() -e = f3(a, z1) -print("e:", e, type(e)) -assert(isinstance(e, usmarray.ndarray)) 
-""" - -print("------------------- Testing Numba mixing usmarray and constant") -sys.stdout.flush() -g = f6(a) -print("g:", g, type(g)) -assert isinstance(g, usmarray.ndarray) -assert usmarray.has_array_interface(g) -del g - -print("------------------- Testing Numba mixing usmarray and numpy.ndarray") -sys.stdout.flush() -h = f5(a, z1) -print("h:", h, type(h)) -assert isinstance(h, usmarray.ndarray) -assert usmarray.has_array_interface(h) -del h - -print("------------------- Testing Numba usmarray functions") -sys.stdout.flush() -f = f4() -print("f:", f, type(f)) -assert isinstance(f, usmarray.ndarray) -assert usmarray.has_array_interface(f) -del f - -print("------------------- Testing Numba usmarray.as_ndarray") -sys.stdout.flush() -nd3 = f8(a) -print("nd3:", nd3, type(nd3)) -assert type(nd3) == numpy.ndarray - -print("------------------- Testing Numba usmarray.from_ndarray") -sys.stdout.flush() -dp2 = f9(nd3) -print("dp2:", dp2, type(dp2)) -assert isinstance(dp2, usmarray.ndarray) -assert usmarray.has_array_interface(dp2) -del nd3 -del dp2 - -print("------------------- Testing Numba usmarray.empty") -sys.stdout.flush() -dp3 = f10() -print("dp3:", dp3, type(dp3)) -assert isinstance(dp3, usmarray.ndarray) -assert usmarray.has_array_interface(dp3) - -print("------------------- Testing Numba usmarray.shape") -sys.stdout.flush() -s1 = f11(dp3) -print("s1:", s1, type(s1)) - -print("------------------- Testing Numba usmarray.T") -sys.stdout.flush() -dp4 = f12(dp3) -print("dp4:", dp4, type(dp4)) -assert isinstance(dp4, usmarray.ndarray) -assert usmarray.has_array_interface(dp4) -del dp3 -del dp4 - -# ------------------------------- -del a - -print("SUCCESS") +class TestUsmArray(DPPYTestCase): + def ndarray(self): + """Create NumPy array""" + return numpy.ones(10) + + def usmarray(self): + """Create dpCtl USM array""" + return usmarray.ones(10) + + def test_python_numpy(self): + """Testing Python Numpy""" + z2 = numba_mul_add.py_func(self.ndarray()) + self.assertEqual(type(z2), numpy.ndarray, z2) + + def test_numba_numpy(self): + """Testing Numba Numpy""" + z2 = numba_mul_add(self.ndarray()) + self.assertEqual(type(z2), numpy.ndarray, z2) + + def test_usmarray_ones(self): + """Testing usmarray ones""" + a = usmarray.ones(10) + self.assertIsInstance(a, usmarray.ndarray, type(a)) + self.assertTrue(usmarray.has_array_interface(a)) + + def test_usmarray_usmarray_as_ndarray(self): + """Testing usmarray.usmarray.as_ndarray""" + nd1 = self.usmarray().as_ndarray() + self.assertEqual(type(nd1), numpy.ndarray, nd1) + + def test_usmarray_as_ndarray(self): + """Testing usmarray.as_ndarray""" + nd2 = usmarray.as_ndarray(self.usmarray()) + self.assertEqual(type(nd2), numpy.ndarray, nd2) + + def test_usmarray_from_ndarray(self): + """Testing usmarray.from_ndarray""" + nd2 = usmarray.as_ndarray(self.usmarray()) + dp1 = usmarray.from_ndarray(nd2) + self.assertIsInstance(dp1, usmarray.ndarray, type(dp1)) + self.assertTrue(usmarray.has_array_interface(dp1)) + + def test_usmarray_multiplication(self): + """Testing usmarray multiplication""" + c = self.usmarray() * 5 + self.assertIsInstance(c, usmarray.ndarray, type(c)) + self.assertTrue(usmarray.has_array_interface(c)) + + def test_python_usmarray_mul_add(self): + """Testing Python usmarray""" + c = self.usmarray() * 5 + b = numba_mul_add.py_func(c) + self.assertIsInstance(b, usmarray.ndarray, type(b)) + self.assertTrue(usmarray.has_array_interface(b)) + + @unittest.expectedFailure + def test_numba_usmarray_mul_add(self): + """Testing Numba usmarray""" + # fails if 
run tests in bunch + c = self.usmarray() * 5 + b = numba_mul_add(c) + self.assertIsInstance(b, usmarray.ndarray, type(b)) + self.assertTrue(usmarray.has_array_interface(b)) + + def test_python_mixing_usmarray_and_numpy_ndarray(self): + """Testing Python mixing usmarray and numpy.ndarray""" + h = numba_mul.py_func(self.usmarray(), self.ndarray()) + self.assertIsInstance(h, usmarray.ndarray, type(h)) + self.assertTrue(usmarray.has_array_interface(h)) + + def test_numba_usmarray_2(self): + """Testing Numba usmarray 2""" + + d = numba_identity(self.usmarray()) + self.assertIsInstance(d, usmarray.ndarray, type(d)) + self.assertTrue(usmarray.has_array_interface(d)) + + @unittest.expectedFailure + def test_numba_usmarray_constructor_from_numpy_ndarray(self): + """Testing Numba usmarray constructor from numpy.ndarray""" + e = numba_mul_usmarray_asarray(self.usmarray(), self.ndarray()) + self.assertIsInstance(e, usmarray.ndarray, type(e)) + + def test_numba_mixing_usmarray_and_constant(self): + """Testing Numba mixing usmarray and constant""" + g = numba_add_const(self.usmarray()) + self.assertIsInstance(g, usmarray.ndarray, type(g)) + self.assertTrue(usmarray.has_array_interface(g)) + + def test_numba_mixing_usmarray_and_numpy_ndarray(self): + """Testing Numba mixing usmarray and numpy.ndarray""" + h = numba_mul(self.usmarray(), self.ndarray()) + self.assertIsInstance(h, usmarray.ndarray, type(h)) + self.assertTrue(usmarray.has_array_interface(h)) + + def test_numba_usmarray_functions(self): + """Testing Numba usmarray functions""" + f = numba_usmarray_ones() + self.assertIsInstance(f, usmarray.ndarray, type(f)) + self.assertTrue(usmarray.has_array_interface(f)) + + def test_numba_usmarray_as_ndarray(self): + """Testing Numba usmarray.as_ndarray""" + nd3 = numba_usmarray_as_ndarray(self.usmarray()) + self.assertEqual(type(nd3), numpy.ndarray, nd3) + + def test_numba_usmarray_from_ndarray(self): + """Testing Numba usmarray.from_ndarray""" + nd3 = numba_usmarray_as_ndarray(self.usmarray()) + dp2 = numba_usmarray_from_ndarray(nd3) + self.assertIsInstance(dp2, usmarray.ndarray, type(dp2)) + self.assertTrue(usmarray.has_array_interface(dp2)) + + def test_numba_usmarray_empty(self): + """Testing Numba usmarray.empty""" + dp3 = numba_usmarray_empty() + self.assertIsInstance(dp3, usmarray.ndarray, type(dp3)) + self.assertTrue(usmarray.has_array_interface(dp3)) + + def test_numba_usmarray_shape(self): + """Testing Numba usmarray.shape""" + s1 = numba_shape(numba_usmarray_empty()) + self.assertIsInstance(s1, tuple, type(s1)) + self.assertEqual(s1, (10, 10)) + + @unittest.expectedFailure + def test_numba_usmarray_T(self): + """Testing Numba usmarray.T""" + dp4 = numba_T(numba_usmarray_empty()) + self.assertIsInstance(dp4, usmarray.ndarray, type(dp4)) + self.assertTrue(usmarray.has_array_interface(dp4)) From 313727bb318a4971e2eb8019d139d89dd25a26aa Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 15 Dec 2020 07:56:50 -0600 Subject: [PATCH 19/24] Small code fixes --- numba_dppy/tests/test_usmarray.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py index 6ee21ab3da..a6d428f80e 100644 --- a/numba_dppy/tests/test_usmarray.py +++ b/numba_dppy/tests/test_usmarray.py @@ -142,7 +142,6 @@ def test_python_mixing_usmarray_and_numpy_ndarray(self): def test_numba_usmarray_2(self): """Testing Numba usmarray 2""" - d = numba_identity(self.usmarray()) self.assertIsInstance(d, usmarray.ndarray, type(d)) 
         self.assertTrue(usmarray.has_array_interface(d))

From ff877fc42bb016b56247e44b2f6f44c4db1559dd Mon Sep 17 00:00:00 2001
From: etotmeni
Date: Tue, 15 Dec 2020 09:08:52 -0600
Subject: [PATCH 20/24] use include_path to find numba

---
 setup.py | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/setup.py b/setup.py
index dc950e37d6..d833951745 100644
--- a/setup.py
+++ b/setup.py
@@ -1,23 +1,15 @@
 import os
 from setuptools import Extension, find_packages, setup
 from Cython.Build import cythonize
+from numba.core.extending import include_path
 import versioneer
 import sys
 
 
-def find_numba():
-    numba_dir = ""
-    sys_packages = sys.path
-    for pcg in sys_packages:
-        if pcg.find("/numba-0") != -1:
-            numba_dir = pcg
-    return numba_dir
-
-
 def get_ext_modules():
     ext_modules = []
 
-    numba_dir = find_numba()
+    numba_dir = include_path()
 
     ext_dppy = Extension(
         name="numba_dppy._dppy_rt",

From f5b1989cf3f47e75122837fc05b1b8cfeb8fbd79 Mon Sep 17 00:00:00 2001
From: etotmeni
Date: Wed, 16 Dec 2020 15:53:32 +0300
Subject: [PATCH 21/24] Added loader lib for win

---
 numba_dppy/dppy_rt.c | 117 +++++++++++++++++++++++++++++--------------
 1 file changed, 80 insertions(+), 37 deletions(-)

diff --git a/numba_dppy/dppy_rt.c b/numba_dppy/dppy_rt.c
index d637064989..83fd6949b2 100644
--- a/numba_dppy/dppy_rt.c
+++ b/numba_dppy/dppy_rt.c
@@ -1,8 +1,12 @@
 #include "_pymodule.h"
 #include "core/runtime/nrt_external.h"
 #include "assert.h"
-#include <dlfcn.h>
 #include <stdio.h>
+#if !defined _WIN32
+    #include <dlfcn.h>
+#else
+    #include <windows.h>
+#endif
 
 NRT_ExternalAllocator usmarray_allocator;
 NRT_external_malloc_func internal_allocator = NULL;
@@ -37,42 +41,81 @@ void save_queue_deallocator(void *data, void *opaque) {
 }
 
 void usmarray_memsys_init(void) {
-    char *lib_name = "libDPCTLSyclInterface.so";
-    char *malloc_name = "DPCTLmalloc_shared";
-    char *free_name = "DPCTLfree_with_queue";
-    char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue";
-    char *free_queue_name = "DPCTLQueue_Delete";
-
-    void *sycldl = dlopen(lib_name, RTLD_NOW);
-    assert(sycldl != NULL);
-    internal_allocator = (NRT_external_malloc_func)dlsym(sycldl, malloc_name);
-    usmarray_allocator.malloc = save_queue_allocator;
-    if (usmarray_allocator.malloc == NULL) {
-        printf("Did not find %s in %s\n", malloc_name, lib_name);
-        exit(-1);
-    }
-
-    usmarray_allocator.realloc = NULL;
-
-    internal_free = (NRT_external_free_func)dlsym(sycldl, free_name);
-    usmarray_allocator.free = save_queue_deallocator;
-    if (usmarray_allocator.free == NULL) {
-        printf("Did not find %s in %s\n", free_name, lib_name);
-        exit(-1);
-    }
-
-    get_queue_internal = (void *(*)(void))dlsym(sycldl, get_queue_name);
-    if (get_queue_internal == NULL) {
-        printf("Did not find %s in %s\n", get_queue_name, lib_name);
-        exit(-1);
-    }
-    usmarray_allocator.opaque_data = NULL;
-
-    free_queue_internal = (void (*)(void*))dlsym(sycldl, free_queue_name);
-    if (free_queue_internal == NULL) {
-        printf("Did not find %s in %s\n", free_queue_name, lib_name);
-        exit(-1);
-    }
+    #if !defined _WIN32
+    char *lib_name = "libDPCTLSyclInterface.so";
+    char *malloc_name = "DPCTLmalloc_shared";
+    char *free_name = "DPCTLfree_with_queue";
+    char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue";
+    char *free_queue_name = "DPCTLQueue_Delete";
+
+    void *sycldl = dlopen(lib_name, RTLD_NOW);
+    assert(sycldl != NULL);
+    internal_allocator = (NRT_external_malloc_func)dlsym(sycldl, malloc_name);
+    usmarray_allocator.malloc = save_queue_allocator;
+    if (usmarray_allocator.malloc == NULL) {
+        printf("Did not find %s in %s\n", malloc_name, 
lib_name);
+        exit(-1);
+    }
+
+    usmarray_allocator.realloc = NULL;
+
+    internal_free = (NRT_external_free_func)dlsym(sycldl, free_name);
+    usmarray_allocator.free = save_queue_deallocator;
+    if (usmarray_allocator.free == NULL) {
+        printf("Did not find %s in %s\n", free_name, lib_name);
+        exit(-1);
+    }
+
+    get_queue_internal = (void *(*)(void))dlsym(sycldl, get_queue_name);
+    if (get_queue_internal == NULL) {
+        printf("Did not find %s in %s\n", get_queue_name, lib_name);
+        exit(-1);
+    }
+    usmarray_allocator.opaque_data = NULL;
+
+    free_queue_internal = (void (*)(void*))dlsym(sycldl, free_queue_name);
+    if (free_queue_internal == NULL) {
+        printf("Did not find %s in %s\n", free_queue_name, lib_name);
+        exit(-1);
+    }
+    #else
+    char *lib_name = "libDPCTLSyclInterface.dll";
+    char *malloc_name = "DPCTLmalloc_shared";
+    char *free_name = "DPCTLfree_with_queue";
+    char *get_queue_name = "DPCTLQueueMgr_GetCurrentQueue";
+    char *free_queue_name = "DPCTLQueue_Delete";
+
+    HMODULE sycldl = LoadLibrary(lib_name);
+    assert(sycldl != NULL);
+    internal_allocator = (NRT_external_malloc_func)GetProcAddress(sycldl, malloc_name);
+    usmarray_allocator.malloc = save_queue_allocator;
+    if (usmarray_allocator.malloc == NULL) {
+        printf("Did not find %s in %s\n", malloc_name, lib_name);
+        exit(-1);
+    }
+
+    usmarray_allocator.realloc = NULL;
+
+    internal_free = (NRT_external_free_func)GetProcAddress(sycldl, free_name);
+    usmarray_allocator.free = save_queue_deallocator;
+    if (usmarray_allocator.free == NULL) {
+        printf("Did not find %s in %s\n", free_name, lib_name);
+        exit(-1);
+    }
+
+    get_queue_internal = (void *(*)(void))GetProcAddress(sycldl, get_queue_name);
+    if (get_queue_internal == NULL) {
+        printf("Did not find %s in %s\n", get_queue_name, lib_name);
+        exit(-1);
+    }
+    usmarray_allocator.opaque_data = NULL;
+
+    free_queue_internal = (void (*)(void*))GetProcAddress(sycldl, free_queue_name);
+    if (free_queue_internal == NULL) {
+        printf("Did not find %s in %s\n", free_queue_name, lib_name);
+        exit(-1);
+    }
+    #endif
 }
 
 void * usmarray_get_ext_allocator(void) {

From 5d8dd3985c3a0e79f646f65d5d91b6a217c915c8 Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Wed, 16 Dec 2020 07:47:15 -0600
Subject: [PATCH 22/24] Use unittest.TestCase as base class for TestUsmArray

---
 numba_dppy/tests/test_usmarray.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/numba_dppy/tests/test_usmarray.py b/numba_dppy/tests/test_usmarray.py
index a6d428f80e..abf1a78ec6 100644
--- a/numba_dppy/tests/test_usmarray.py
+++ b/numba_dppy/tests/test_usmarray.py
@@ -4,8 +4,6 @@
 
 import dpctl.dptensor.numpy_usm_shared as usmarray
 
-from numba_dppy.testing import DPPYTestCase
-
 
 @numba.njit()
 def numba_mul_add(a):
@@ -70,7 +68,7 @@ def numba_T(x):
     return x.T
 
 
-class TestUsmArray(DPPYTestCase):
+class TestUsmArray(unittest.TestCase):
     def ndarray(self):
         """Create NumPy array"""
         return numpy.ones(10)

From 0e71fa7b8de25ae1aefc8c3f5d670b8bdca2d087 Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko
Date: Wed, 16 Dec 2020 16:17:35 -0600
Subject: [PATCH 23/24] one test in debug

---
 conda-recipe/run_test.bat | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/conda-recipe/run_test.bat b/conda-recipe/run_test.bat
index 031bc6e69a..fd9bf19494 100644
--- a/conda-recipe/run_test.bat
+++ b/conda-recipe/run_test.bat
@@ -3,7 +3,9 @@ call "%ONEAPI_ROOT%\compiler\latest\env\vars.bat"
 
 @echo on
 
-python -m numba.runtests -b -v -m -- numba_dppy.tests
+set NUMBA_DEBUG=1
+
+python -m numba.runtests -b -v -m -- 
numba_dppy.tests.test_usmarray.TestUsmArray.test_numba_usmarray_as_ndarray IF %ERRORLEVEL% NEQ 0 exit /B 1 exit /B 0 From b00afc651c3dbb42dd80e8b95dd3f493a4eeb194 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Wed, 16 Dec 2020 16:53:29 -0600 Subject: [PATCH 24/24] fix --- conda-recipe/run_test.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda-recipe/run_test.sh b/conda-recipe/run_test.sh index 8a30af0c51..06d516c5f8 100644 --- a/conda-recipe/run_test.sh +++ b/conda-recipe/run_test.sh @@ -8,6 +8,7 @@ source ${ONEAPI_ROOT}/tbb/latest/env/vars.sh set -x -python -m numba.runtests -b -v -m -- numba_dppy.tests +export NUMBA_DEBUG=1 +python -m numba.runtests -b -v -m -- numba_dppy.tests.test_usmarray.TestUsmArray.test_numba_usmarray_as_ndarray exit 0
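
The test suite added in these patches exercises one core pattern: allocate a USM-shared array with dpctl, pass it through an @numba.njit function, and check that a usmarray.ndarray with a working array interface comes back. A standalone sketch of that round trip follows. It is illustrative only: the bodies of the njit helpers used by TestUsmArray (numba_identity, numba_add_const, numba_mul_add, ...) are not shown in the diffs, so the identity function below and the assumption that importing numba_dppy triggers the usmarray registration are placeholders, not code from the patches.

    import numba
    import dpctl.dptensor.numpy_usm_shared as usmarray

    import numba_dppy  # assumed to register usmarray typing/lowering with Numba


    @numba.njit()
    def identity(x):
        # assumed body; the tests' numba_identity helper is analogous
        return x


    a = usmarray.ones(10)   # USM-shared allocation, as in TestUsmArray.usmarray()
    b = identity(a)         # round-trips through nopython mode
    assert isinstance(b, usmarray.ndarray)
    assert usmarray.has_array_interface(b)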
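
For reference, the dynamic symbol lookup that usmarray_memsys_init performs in dppy_rt.c can be sanity-checked from Python with ctypes before running the tests. The sketch below is not part of the patches; it assumes the dpctl runtime library is on the loader search path and simply reuses the library and symbol names that the C code resolves with dlsym/GetProcAddress (including the "lib" prefix on the Windows DLL name, carried over from the patch as-is).

    import ctypes
    import sys

    # Library and symbol names are taken from the patch.
    lib_name = ("libDPCTLSyclInterface.dll" if sys.platform.startswith("win")
                else "libDPCTLSyclInterface.so")
    lib = ctypes.CDLL(lib_name)

    for sym in ("DPCTLmalloc_shared", "DPCTLfree_with_queue",
                "DPCTLQueueMgr_GetCurrentQueue", "DPCTLQueue_Delete"):
        getattr(lib, sym)  # raises AttributeError if the symbol is missing

    print("all dpctl USM entry points resolved")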