From 9669b73b17f51fe4e3eb67bfc5acaf6c7ce3008e Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko <sergey.pokhodenko@intel.com>
Date: Mon, 21 Dec 2020 06:28:35 -0600
Subject: [PATCH 1/5] Use the black badge in project's README.md for indicating
 the code style

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 9d1969fe3f..3641849417 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+
 # numba-dppy
 
 ## Numba + dpPy + dpCtl + dpNP = numba-dppy

From 5d3b81ae2bcaad841a8b8b70471d4aed8c0c215f Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko <sergey.pokhodenko@intel.com>
Date: Mon, 21 Dec 2020 06:32:31 -0600
Subject: [PATCH 2/5] Add GitHub Actions for black for check code style

---
 .github/workflows/black.yml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 .github/workflows/black.yml

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
new file mode 100644
index 0000000000..1101c3a4a0
--- /dev/null
+++ b/.github/workflows/black.yml
@@ -0,0 +1,28 @@
+# This is a workflow to format Python code with black formatter
+
+name: black
+
+# Controls when the action will run. Triggers the workflow on push or pull request
+# events but only for the master branch
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "black"
+  black:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@v2
+      # Set up a Python environment for use in actions
+      - uses: actions/setup-python@v2
+
+      # Run black code formatter
+      - uses: psf/black@stable

From 7885f76c30eaa3b4ae6df535ac5ff0036d5e8745 Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko <sergey.pokhodenko@intel.com>
Date: Mon, 21 Dec 2020 06:33:50 -0600
Subject: [PATCH 3/5] Ignore versioneer.py because it is 3rd party file (see
 pyproject.toml)

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..2b233b61c9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,2 @@
+[tool.black]
+exclude = 'versioneer.py'

From 7fa195017a799619199d1fe2f81150eb8f01735f Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko <sergey.pokhodenko@intel.com>
Date: Mon, 21 Dec 2020 06:35:25 -0600
Subject: [PATCH 4/5] Update CONTRIBUTING.md for all developers to use black in
 everyday work

---
 CONTRIBUTING.md | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 CONTRIBUTING.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000..c3076bd5e4
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,10 @@
+# Python code style
+
+## black
+
+We use [black](https://black.readthedocs.io/en/stable/) code formatter.
+
+- Revision: `20.8b1` or branch `stable`.
+- See configuration in `pyproject.toml`.
+
+Run before each commit: `black .`

From a8031b4a6cb292a308ae69a3d40ebe37d1eb9c00 Mon Sep 17 00:00:00 2001
From: Sergey Pokhodenko <sergey.pokhodenko@intel.com>
Date: Mon, 21 Dec 2020 06:36:25 -0600
Subject: [PATCH 5/5] Format all Python code with black (just black .)

---
 numba_dppy/__init__.py                        |   6 +-
 numba_dppy/_version.py                        | 154 +++--
 numba_dppy/codegen.py                         |  19 +-
 numba_dppy/compiler.py                        | 271 +++++----
 numba_dppy/decorators.py                      |  14 +-
 numba_dppy/descriptor.py                      |   5 +-
 numba_dppy/device_init.py                     |   4 +-
 numba_dppy/dispatcher.py                      |  63 ++-
 numba_dppy/dpnp_glue/__init__.py              |   2 +-
 numba_dppy/dpnp_glue/dpnp_linalgimpl.py       |  38 +-
 .../dpnp_glue/dpnp_sort_search_countimpl.py   |  10 +-
 numba_dppy/dpnp_glue/dpnp_statisticsimpl.py   |  74 ++-
 .../dpnp_glue/dpnp_transcendentalsimpl.py     |   4 +-
 numba_dppy/dpnp_glue/dpnpdecl.py              |   6 +-
 numba_dppy/dpnp_glue/dpnpimpl.py              |  14 +-
 numba_dppy/dpnp_glue/stubs.py                 |   7 +-
 numba_dppy/dppy_host_fn_call_gen.py           | 387 ++++++++-----
 numba_dppy/dppy_lowerer.py                    | 524 ++++++++++--------
 numba_dppy/dppy_offload_dispatcher.py         |  49 +-
 numba_dppy/dppy_parfor_diagnostics.py         |  59 +-
 numba_dppy/dppy_passbuilder.py                |  97 ++--
 numba_dppy/dppy_passes.py                     | 161 +++---
 numba_dppy/dufunc_inliner.py                  |  93 +++-
 numba_dppy/examples/blacksholes_njit.py       |  26 +-
 numba_dppy/examples/dppy_func.py              |   2 +-
 numba_dppy/examples/dppy_with_context.py      |   7 +-
 numba_dppy/examples/matmul.py                 |  14 +-
 numba_dppy/examples/pa_examples/test1.py      |   2 +-
 numba_dppy/examples/pairwise_distance.py      |  24 +-
 numba_dppy/examples/sum-hybrid.py             |   6 +-
 numba_dppy/examples/sum.py                    |   2 +-
 numba_dppy/examples/sum2D.py                  |  12 +-
 numba_dppy/examples/sum_ndarray.py            |   6 +-
 numba_dppy/examples/sum_reduction.py          |  17 +-
 numba_dppy/examples/sum_reduction_ocl.py      |  15 +-
 .../examples/sum_reduction_recursive_ocl.py   |  48 +-
 numba_dppy/initialize.py                      |  18 +-
 numba_dppy/ocl/atomics/__init__.py            |   9 +-
 numba_dppy/ocl/mathdecl.py                    |  11 +-
 numba_dppy/ocl/mathimpl.py                    | 145 ++---
 numba_dppy/ocl/ocldecl.py                     |  26 +-
 numba_dppy/ocl/oclimpl.py                     | 207 ++++---
 numba_dppy/ocl/stubs.py                       |  31 +-
 numba_dppy/printimpl.py                       |   5 +-
 numba_dppy/rename_numpy_functions_pass.py     |  50 +-
 numba_dppy/spirv_generator.py                 |  80 +--
 numba_dppy/target.py                          | 188 ++++---
 numba_dppy/target_dispatcher.py               |  46 +-
 numba_dppy/testing.py                         |   2 +
 numba_dppy/tests/__init__.py                  |   1 +
 numba_dppy/tests/skip_tests.py                |   1 +
 numba_dppy/tests/test_arg_accessor.py         |  29 +-
 numba_dppy/tests/test_arg_types.py            |  12 +-
 numba_dppy/tests/test_atomic_op.py            |  62 +--
 numba_dppy/tests/test_barrier.py              |  13 +-
 numba_dppy/tests/test_black_scholes.py        |  70 ++-
 numba_dppy/tests/test_caching.py              |  11 +-
 .../tests/test_controllable_fallback.py       |  16 +-
 numba_dppy/tests/test_device_array_args.py    |   7 +-
 numba_dppy/tests/test_dpctl_api.py            |   4 +-
 numba_dppy/tests/test_dpnp_functions.py       | 161 ++++--
 numba_dppy/tests/test_dppy_fallback.py        |  10 +-
 numba_dppy/tests/test_dppy_func.py            |   5 +-
 numba_dppy/tests/test_math_functions.py       |  49 +-
 .../test_numpy_bit_twiddling_functions.py     |   4 +-
 .../tests/test_numpy_comparison_functions.py  |   4 +-
 .../tests/test_numpy_floating_functions.py    |  10 +-
 numba_dppy/tests/test_numpy_math_functions.py |  11 +-
 .../test_numpy_trigonomteric_functions.py     |   4 +-
 numba_dppy/tests/test_offload_diagnostics.py  |   4 +-
 numba_dppy/tests/test_parfor_lower_message.py |   4 +-
 numba_dppy/tests/test_prange.py               |  38 +-
 numba_dppy/tests/test_print.py                |   4 +-
 .../tests/test_rename_numpy_function_pass.py  |  18 +-
 numba_dppy/tests/test_sum_reduction.py        |  17 +-
 numba_dppy/tests/test_vectorize.py            |   7 +-
 numba_dppy/tests/test_with_context.py         |  16 +-
 setup.py                                      |  16 +-
 78 files changed, 2198 insertions(+), 1470 deletions(-)

diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py
index ac4e898889..7bc3ea504a 100644
--- a/numba_dppy/__init__.py
+++ b/numba_dppy/__init__.py
@@ -506,17 +506,21 @@ def main():
 import numba.testing
 
 from .config import dppy_present
+
 if dppy_present:
     from .device_init import *
 else:
     raise ImportError("Importing numba-dppy failed")
 
+
 def test(*args, **kwargs):
     if not dppy_present and not is_available():
         dppy_error()
 
     return numba.testing.test("numba_dppy.tests", *args, **kwargs)
 
+
 from ._version import get_versions
-__version__ = get_versions()['version']
+
+__version__ = get_versions()["version"]
 del get_versions
diff --git a/numba_dppy/_version.py b/numba_dppy/_version.py
index 165dbf4d17..dc6811b00f 100644
--- a/numba_dppy/_version.py
+++ b/numba_dppy/_version.py
@@ -1,4 +1,3 @@
-
 # This file helps to compute a version number in source trees obtained from
 # git-archive tarball (such as those provided by githubs download-from-tag
 # feature). Distribution tarballs (built by setup.py sdist) and build
@@ -58,17 +57,18 @@ class NotThisMethod(Exception):
 
 def register_vcs_handler(vcs, method):  # decorator
     """Create decorator to mark a method as the handler of a VCS."""
+
     def decorate(f):
         """Store f in HANDLERS[vcs][method]."""
         if vcs not in HANDLERS:
             HANDLERS[vcs] = {}
         HANDLERS[vcs][method] = f
         return f
+
     return decorate
 
 
-def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
-                env=None):
+def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
     """Call the given command(s)."""
     assert isinstance(commands, list)
     p = None
@@ -76,10 +76,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
         try:
             dispcmd = str([c] + args)
             # remember shell=False, so use git.cmd on windows, not just git
-            p = subprocess.Popen([c] + args, cwd=cwd, env=env,
-                                 stdout=subprocess.PIPE,
-                                 stderr=(subprocess.PIPE if hide_stderr
-                                         else None))
+            p = subprocess.Popen(
+                [c] + args,
+                cwd=cwd,
+                env=env,
+                stdout=subprocess.PIPE,
+                stderr=(subprocess.PIPE if hide_stderr else None),
+            )
             break
         except EnvironmentError:
             e = sys.exc_info()[1]
@@ -114,16 +117,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose):
     for i in range(3):
         dirname = os.path.basename(root)
         if dirname.startswith(parentdir_prefix):
-            return {"version": dirname[len(parentdir_prefix):],
-                    "full-revisionid": None,
-                    "dirty": False, "error": None, "date": None}
+            return {
+                "version": dirname[len(parentdir_prefix) :],
+                "full-revisionid": None,
+                "dirty": False,
+                "error": None,
+                "date": None,
+            }
         else:
             rootdirs.append(root)
             root = os.path.dirname(root)  # up a level
 
     if verbose:
-        print("Tried directories %s but none started with prefix %s" %
-              (str(rootdirs), parentdir_prefix))
+        print(
+            "Tried directories %s but none started with prefix %s"
+            % (str(rootdirs), parentdir_prefix)
+        )
     raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
 
 
@@ -183,7 +192,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
     # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
     # just "foo-1.0". If we see a "tag: " prefix, prefer those.
     TAG = "tag: "
-    tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)])
+    tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)])
     if not tags:
         # Either we're using git < 1.8.3, or there really are no tags. We use
         # a heuristic: assume all version tags have a digit. The old git %d
@@ -192,7 +201,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
         # between branches and tags. By ignoring refnames without digits, we
         # filter out many common branch names like "release" and
         # "stabilization", as well as "HEAD" and "master".
-        tags = set([r for r in refs if re.search(r'\d', r)])
+        tags = set([r for r in refs if re.search(r"\d", r)])
         if verbose:
             print("discarding '%s', no digits" % ",".join(refs - tags))
     if verbose:
@@ -200,19 +209,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose):
     for ref in sorted(tags):
         # sorting will prefer e.g. "2.0" over "2.0rc1"
         if ref.startswith(tag_prefix):
-            r = ref[len(tag_prefix):]
+            r = ref[len(tag_prefix) :]
             if verbose:
                 print("picking %s" % r)
-            return {"version": r,
-                    "full-revisionid": keywords["full"].strip(),
-                    "dirty": False, "error": None,
-                    "date": date}
+            return {
+                "version": r,
+                "full-revisionid": keywords["full"].strip(),
+                "dirty": False,
+                "error": None,
+                "date": date,
+            }
     # no suitable tags, so version is "0+unknown", but full hex is still there
     if verbose:
         print("no suitable tags, using unknown + full revision id")
-    return {"version": "0+unknown",
-            "full-revisionid": keywords["full"].strip(),
-            "dirty": False, "error": "no suitable tags", "date": None}
+    return {
+        "version": "0+unknown",
+        "full-revisionid": keywords["full"].strip(),
+        "dirty": False,
+        "error": "no suitable tags",
+        "date": None,
+    }
 
 
 @register_vcs_handler("git", "pieces_from_vcs")
@@ -227,8 +243,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     if sys.platform == "win32":
         GITS = ["git.cmd", "git.exe"]
 
-    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root,
-                          hide_stderr=True)
+    out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
     if rc != 0:
         if verbose:
             print("Directory %s not under git control" % root)
@@ -236,10 +251,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
 
     # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
     # if there isn't one, this yields HEX[-dirty] (no NUM)
-    describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty",
-                                          "--always", "--long",
-                                          "--match", "%s*" % tag_prefix],
-                                   cwd=root)
+    describe_out, rc = run_command(
+        GITS,
+        [
+            "describe",
+            "--tags",
+            "--dirty",
+            "--always",
+            "--long",
+            "--match",
+            "%s*" % tag_prefix,
+        ],
+        cwd=root,
+    )
     # --long was added in git-1.5.5
     if describe_out is None:
         raise NotThisMethod("'git describe' failed")
@@ -262,17 +286,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     dirty = git_describe.endswith("-dirty")
     pieces["dirty"] = dirty
     if dirty:
-        git_describe = git_describe[:git_describe.rindex("-dirty")]
+        git_describe = git_describe[: git_describe.rindex("-dirty")]
 
     # now we have TAG-NUM-gHEX or HEX
 
     if "-" in git_describe:
         # TAG-NUM-gHEX
-        mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
+        mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
         if not mo:
             # unparseable. Maybe git-describe is misbehaving?
-            pieces["error"] = ("unable to parse git-describe output: '%s'"
-                               % describe_out)
+            pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
             return pieces
 
         # tag
@@ -281,10 +304,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
             if verbose:
                 fmt = "tag '%s' doesn't start with prefix '%s'"
                 print(fmt % (full_tag, tag_prefix))
-            pieces["error"] = ("tag '%s' doesn't start with prefix '%s'"
-                               % (full_tag, tag_prefix))
+            pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
+                full_tag,
+                tag_prefix,
+            )
             return pieces
-        pieces["closest-tag"] = full_tag[len(tag_prefix):]
+        pieces["closest-tag"] = full_tag[len(tag_prefix) :]
 
         # distance: number of commits since tag
         pieces["distance"] = int(mo.group(2))
@@ -295,13 +320,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
     else:
         # HEX: no tags
         pieces["closest-tag"] = None
-        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"],
-                                    cwd=root)
+        count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
         pieces["distance"] = int(count_out)  # total number of commits
 
     # commit date: see ISO-8601 comment in git_versions_from_keywords()
-    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"],
-                       cwd=root)[0].strip()
+    date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[
+        0
+    ].strip()
     # Use only the last line.  Previous lines may contain GPG signature
     # information.
     date = date.splitlines()[-1]
@@ -335,8 +360,7 @@ def render_pep440(pieces):
                 rendered += ".dirty"
     else:
         # exception #1
-        rendered = "0+untagged.%d.g%s" % (pieces["distance"],
-                                          pieces["short"])
+        rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
         if pieces["dirty"]:
             rendered += ".dirty"
     return rendered
@@ -450,11 +474,13 @@ def render_git_describe_long(pieces):
 def render(pieces, style):
     """Render the given version pieces into the requested style."""
     if pieces["error"]:
-        return {"version": "unknown",
-                "full-revisionid": pieces.get("long"),
-                "dirty": None,
-                "error": pieces["error"],
-                "date": None}
+        return {
+            "version": "unknown",
+            "full-revisionid": pieces.get("long"),
+            "dirty": None,
+            "error": pieces["error"],
+            "date": None,
+        }
 
     if not style or style == "default":
         style = "pep440"  # the default
@@ -474,9 +500,13 @@ def render(pieces, style):
     else:
         raise ValueError("unknown style '%s'" % style)
 
-    return {"version": rendered, "full-revisionid": pieces["long"],
-            "dirty": pieces["dirty"], "error": None,
-            "date": pieces.get("date")}
+    return {
+        "version": rendered,
+        "full-revisionid": pieces["long"],
+        "dirty": pieces["dirty"],
+        "error": None,
+        "date": pieces.get("date"),
+    }
 
 
 def get_versions():
@@ -490,8 +520,7 @@ def get_versions():
     verbose = cfg.verbose
 
     try:
-        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
-                                          verbose)
+        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose)
     except NotThisMethod:
         pass
 
@@ -500,13 +529,16 @@ def get_versions():
         # versionfile_source is the relative path from the top of the source
         # tree (where the .git directory might live) to this file. Invert
         # this to find the root from __file__.
-        for i in cfg.versionfile_source.split('/'):
+        for i in cfg.versionfile_source.split("/"):
             root = os.path.dirname(root)
     except NameError:
-        return {"version": "0+unknown", "full-revisionid": None,
-                "dirty": None,
-                "error": "unable to find root of source tree",
-                "date": None}
+        return {
+            "version": "0+unknown",
+            "full-revisionid": None,
+            "dirty": None,
+            "error": "unable to find root of source tree",
+            "date": None,
+        }
 
     try:
         pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
@@ -520,6 +552,10 @@ def get_versions():
     except NotThisMethod:
         pass
 
-    return {"version": "0+unknown", "full-revisionid": None,
-            "dirty": None,
-            "error": "unable to compute version", "date": None}
+    return {
+        "version": "0+unknown",
+        "full-revisionid": None,
+        "dirty": None,
+        "error": "unable to compute version",
+        "date": None,
+    }
diff --git a/numba_dppy/codegen.py b/numba_dppy/codegen.py
index 4e278d7ebc..ef78551a2f 100644
--- a/numba_dppy/codegen.py
+++ b/numba_dppy/codegen.py
@@ -5,14 +5,17 @@
 from numba.core import utils
 
 
-SPIR_TRIPLE = {32: ' spir-unknown-unknown',
-               64: 'spir64-unknown-unknown'}
+SPIR_TRIPLE = {32: " spir-unknown-unknown", 64: "spir64-unknown-unknown"}
 
 SPIR_DATA_LAYOUT = {
-    32 : ('e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:'
-          '256-v512:512-v1024:1024'),
-    64 : ('e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-'
-          'v512:512-v1024:1024')
+    32: (
+        "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:"
+        "256-v512:512-v1024:1024"
+    ),
+    64: (
+        "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-"
+        "v512:512-v1024:1024"
+    ),
 }
 
 
@@ -36,8 +39,8 @@ def _optimize_final_module(self):
     def _finalize_specific(self):
         # Fix global naming
         for gv in self._final_module.global_variables:
-            if '.' in gv.name:
-                gv.name = gv.name.replace('.', '_')
+            if "." in gv.name:
+                gv.name = gv.name.replace(".", "_")
 
     def get_asm_str(self):
         # Return nothing: we can only dump assembler code when it is later
diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py
index 37b9e25e9f..31e9d57278 100644
--- a/numba_dppy/compiler.py
+++ b/numba_dppy/compiler.py
@@ -22,22 +22,26 @@
 from numba_dppy.dppy_parfor_diagnostics import ExtendedParforDiagnostics
 
 
-DEBUG = os.environ.get('NUMBA_DPPY_DEBUG', None)
-_NUMBA_DPPY_READ_ONLY  = "read_only"
+DEBUG = os.environ.get("NUMBA_DPPY_DEBUG", None)
+_NUMBA_DPPY_READ_ONLY = "read_only"
 _NUMBA_DPPY_WRITE_ONLY = "write_only"
 _NUMBA_DPPY_READ_WRITE = "read_write"
 
 
 def _raise_no_device_found_error():
-    error_message = ("No OpenCL device specified. "
-                     "Usage : jit_fn[device, globalsize, localsize](...)")
+    error_message = (
+        "No OpenCL device specified. "
+        "Usage : jit_fn[device, globalsize, localsize](...)"
+    )
     raise ValueError(error_message)
 
 
 def _raise_invalid_kernel_enqueue_args():
-    error_message = ("Incorrect number of arguments for enquing dppy.kernel. "
-                     "Usage: device_env, global size, local size. "
-                     "The local size argument is optional.")
+    error_message = (
+        "Incorrect number of arguments for enquing dppy.kernel. "
+        "Usage: device_env, global size, local size. "
+        "The local size argument is optional."
+    )
     raise ValueError(error_message)
 
 
@@ -63,18 +67,14 @@ def define_pipelines(self):
         # this maintains the objmode fallback behaviour
         pms = []
         self.state.parfor_diagnostics = ExtendedParforDiagnostics()
-        self.state.metadata['parfor_diagnostics'] = self.state.parfor_diagnostics
+        self.state.metadata["parfor_diagnostics"] = self.state.parfor_diagnostics
         if not self.state.flags.force_pyobject:
-            #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline")
+            # print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline")
             pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state))
         if self.state.status.can_fallback or self.state.flags.force_pyobject:
-            pms.append(
-                DefaultPassBuilder.define_objectmode_pipeline(self.state)
-            )
+            pms.append(DefaultPassBuilder.define_objectmode_pipeline(self.state))
         if self.state.status.can_giveup:
-            pms.append(
-                DefaultPassBuilder.define_interpreted_pipeline(self.state)
-            )
+            pms.append(DefaultPassBuilder.define_interpreted_pipeline(self.state))
         return pms
 
 
@@ -88,32 +88,36 @@ def compile_with_dppy(pyfunc, return_type, args, debug):
     flags = compiler.Flags()
     # Do not compile (generate native code), just lower (to LLVM)
     if debug:
-        flags.set('debuginfo')
-    flags.set('no_compile')
-    flags.set('no_cpython_wrapper')
-    flags.unset('nrt')
+        flags.set("debuginfo")
+    flags.set("no_compile")
+    flags.set("no_cpython_wrapper")
+    flags.unset("nrt")
 
     # Run compilation pipeline
     if isinstance(pyfunc, FunctionType):
-        cres = compiler.compile_extra(typingctx=typingctx,
-                                      targetctx=targetctx,
-                                      func=pyfunc,
-                                      args=args,
-                                      return_type=return_type,
-                                      flags=flags,
-                                      locals={},
-                                      pipeline_class=DPPYCompiler)
+        cres = compiler.compile_extra(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=DPPYCompiler,
+        )
     elif isinstance(pyfunc, ir.FunctionIR):
-        cres = compiler.compile_ir(typingctx=typingctx,
-                                   targetctx=targetctx,
-                                   func_ir=pyfunc,
-                                   args=args,
-                                   return_type=return_type,
-                                   flags=flags,
-                                   locals={},
-                                   pipeline_class=DPPYCompiler)
+        cres = compiler.compile_ir(
+            typingctx=typingctx,
+            targetctx=targetctx,
+            func_ir=pyfunc,
+            args=args,
+            return_type=return_type,
+            flags=flags,
+            locals={},
+            pipeline_class=DPPYCompiler,
+        )
     else:
-        assert(0)
+        assert 0
     # Linking depending libraries
     # targetctx.link_dependencies(cres.llvm_module, cres.target_context.linking)
     library = cres.library
@@ -138,17 +142,18 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False):
     # depending on the target context. For example, we want to link our kernel object
     # with implementation containing atomic operations only when atomic operations
     # are being used in the kernel.
-    oclkern = DPPYKernel(context=cres.target_context,
-                         sycl_queue=sycl_queue,
-                         llvm_module=kernel.module,
-                         name=kernel.name,
-                         argtypes=cres.signature.args,
-                         ordered_arg_access_types=access_types)
+    oclkern = DPPYKernel(
+        context=cres.target_context,
+        sycl_queue=sycl_queue,
+        llvm_module=kernel.module,
+        name=kernel.name,
+        argtypes=cres.signature.args,
+        ordered_arg_access_types=access_types,
+    )
     return oclkern
 
 
-def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces,
-                          debug=False):
+def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, debug=False):
     if DEBUG:
         print("compile_kernel_parfor", args)
         for a in args:
@@ -156,25 +161,26 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces,
             if isinstance(a, types.npytypes.Array):
                 print("addrspace:", a.addrspace)
 
-    cres = compile_with_dppy(func_ir, None, args_with_addrspaces,
-                             debug=debug)
+    cres = compile_with_dppy(func_ir, None, args_with_addrspaces, debug=debug)
     func = cres.library.get_function(cres.fndesc.llvm_func_name)
 
     if DEBUG:
         print("compile_kernel_parfor signature", cres.signature.args)
         for a in cres.signature.args:
             print(a, type(a))
-#            if isinstance(a, types.npytypes.Array):
-#                print("addrspace:", a.addrspace)
+    #            if isinstance(a, types.npytypes.Array):
+    #                print("addrspace:", a.addrspace)
 
     kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args)
-    #kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces)
-    oclkern = DPPYKernel(context=cres.target_context,
-                         sycl_queue=sycl_queue,
-                         llvm_module=kernel.module,
-                         name=kernel.name,
-                         argtypes=args_with_addrspaces)
-                         #argtypes=cres.signature.args)
+    # kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces)
+    oclkern = DPPYKernel(
+        context=cres.target_context,
+        sycl_queue=sycl_queue,
+        llvm_module=kernel.module,
+        name=kernel.name,
+        argtypes=args_with_addrspaces,
+    )
+    # argtypes=cres.signature.args)
     return oclkern
 
 
@@ -196,8 +202,7 @@ class dppy_function_template(ConcreteTemplate):
 
 # Compile dppy function template
 def compile_dppy_func_template(pyfunc):
-    """Compile a DPPYFunctionTemplate
-    """
+    """Compile a DPPYFunctionTemplate"""
     from .descriptor import dppy_target
 
     dft = DPPYFunctionTemplate(pyfunc)
@@ -215,8 +220,8 @@ def generic(self, args, kws):
 
 
 class DPPYFunctionTemplate(object):
-    """Unmaterialized dppy function
-    """
+    """Unmaterialized dppy function"""
+
     def __init__(self, pyfunc, debug=False):
         self.py_func = pyfunc
         self.debug = debug
@@ -239,8 +244,7 @@ def compile(self, args):
 
             if first_definition:
                 # First definition
-                cres.target_context.insert_user_function(self, cres.fndesc,
-                                                         libs)
+                cres.target_context.insert_user_function(self, cres.fndesc, libs)
             else:
                 cres.target_context.add_user_function(self, cres.fndesc, libs)
 
@@ -258,61 +262,65 @@ def __init__(self, cres):
 def _ensure_valid_work_item_grid(val, sycl_queue):
 
     if not isinstance(val, (tuple, list, int)):
-        error_message = ("Cannot create work item dimension from "
-                         "provided argument")
+        error_message = "Cannot create work item dimension from " "provided argument"
         raise ValueError(error_message)
 
     if isinstance(val, int):
         val = [val]
 
     # TODO: we need some way to check the max dimensions
-    '''
+    """
     if len(val) > device_env.get_max_work_item_dims():
         error_message = ("Unsupported number of work item dimensions ")
         raise ValueError(error_message)
-    '''
+    """
+
+    return list(
+        val[::-1]
+    )  # reversing due to sycl and opencl interop kernel range mismatch semantic
 
-    return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic
 
 def _ensure_valid_work_group_size(val, work_item_grid):
 
     if not isinstance(val, (tuple, list, int)):
-        error_message = ("Cannot create work item dimension from "
-                         "provided argument")
+        error_message = "Cannot create work item dimension from " "provided argument"
         raise ValueError(error_message)
 
     if isinstance(val, int):
         val = [val]
 
     if len(val) != len(work_item_grid):
-        error_message = ("Unsupported number of work item dimensions, " +
-                         "dimensions of global and local work items has to be the same ")
+        error_message = (
+            "Unsupported number of work item dimensions, "
+            + "dimensions of global and local work items has to be the same "
+        )
         raise ValueError(error_message)
 
-    return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic
+    return list(
+        val[::-1]
+    )  # reversing due to sycl and opencl interop kernel range mismatch semantic
 
 
 class DPPYKernelBase(object):
-    """Define interface for configurable kernels
-    """
+    """Define interface for configurable kernels"""
 
     def __init__(self):
         self.global_size = []
-        self.local_size  = []
-        self.sycl_queue  = None
+        self.local_size = []
+        self.sycl_queue = None
 
         # list of supported access types, stored in dict for fast lookup
         self.valid_access_types = {
-                _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY,
-                _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY,
-                _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE}
+            _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY,
+            _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY,
+            _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE,
+        }
 
     def copy(self):
         return copy.copy(self)
 
     def configure(self, sycl_queue, global_size, local_size=None):
-        """Configure the OpenCL kernel. The local_size can be None
-        """
+        """Configure the OpenCL kernel. The local_size can be None"""
         clone = self.copy()
         clone.global_size = global_size
         clone.local_size = local_size
@@ -346,8 +354,15 @@ class DPPYKernel(DPPYKernelBase):
     A OCL kernel object
     """
 
-    def __init__(self, context, sycl_queue, llvm_module, name, argtypes,
-                 ordered_arg_access_types=None):
+    def __init__(
+        self,
+        context,
+        sycl_queue,
+        llvm_module,
+        name,
+        argtypes,
+        ordered_arg_access_types=None,
+    ):
         super(DPPYKernel, self).__init__()
         self._llvm_module = llvm_module
         self.assembly = self.binary = llvm_module.__str__()
@@ -365,7 +380,9 @@ def __init__(self, context, sycl_queue, llvm_module, name, argtypes,
         self.spirv_bc = spirv_generator.llvm_to_spirv(self.context, self.binary)
 
         # create a program
-        self.program = dpctl_prog.create_program_from_spirv(self.sycl_queue, self.spirv_bc)
+        self.program = dpctl_prog.create_program_from_spirv(
+            self.sycl_queue, self.spirv_bc
+        )
         #  create a kernel
         self.kernel = self.program.get_sycl_kernel(self.entry_name)
 
@@ -376,33 +393,47 @@ def __call__(self, *args):
         retr = []  # hold functors for writeback
         kernelargs = []
         internal_device_arrs = []
-        for ty, val, access_type in zip(self.argument_types, args,
-                                        self.ordered_arg_access_types):
-            self._unpack_argument(ty, val, self.sycl_queue, retr,
-                    kernelargs, internal_device_arrs, access_type)
+        for ty, val, access_type in zip(
+            self.argument_types, args, self.ordered_arg_access_types
+        ):
+            self._unpack_argument(
+                ty,
+                val,
+                self.sycl_queue,
+                retr,
+                kernelargs,
+                internal_device_arrs,
+                access_type,
+            )
 
-        self.sycl_queue.submit(self.kernel, kernelargs, self.global_size, self.local_size)
+        self.sycl_queue.submit(
+            self.kernel, kernelargs, self.global_size, self.local_size
+        )
         self.sycl_queue.wait()
 
-        for ty, val, i_dev_arr, access_type in zip(self.argument_types, args,
-                internal_device_arrs, self.ordered_arg_access_types):
-            self._pack_argument(ty, val, self.sycl_queue, i_dev_arr,
-                                access_type)
+        for ty, val, i_dev_arr, access_type in zip(
+            self.argument_types,
+            args,
+            internal_device_arrs,
+            self.ordered_arg_access_types,
+        ):
+            self._pack_argument(ty, val, self.sycl_queue, i_dev_arr, access_type)
 
     def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type):
         """
         Copy device data back to host
         """
-        if (device_arr and (access_type not in self.valid_access_types or
-            access_type in self.valid_access_types and
-            self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY)):
+        if device_arr and (
+            access_type not in self.valid_access_types
+            or access_type in self.valid_access_types
+            and self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY
+        ):
             # we get the date back to host if have created a
             # device_array or if access_type of this device_array
             # is not of type read_only and read_write
             usm_buf, usm_ndarr, orig_ndarray = device_arr
             np.copyto(orig_ndarray, usm_ndarr)
 
-
     def _unpack_device_array_argument(self, val, kernelargs):
         # this function only takes ndarrays created using USM allocated buffer
         void_ptr_arg = True
@@ -422,9 +453,9 @@ def _unpack_device_array_argument(self, val, kernelargs):
         for ax in range(val.ndim):
             kernelargs.append(ctypes.c_longlong(val.strides[ax]))
 
-
-    def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs,
-                         device_arrs, access_type):
+    def _unpack_argument(
+        self, ty, val, sycl_queue, retr, kernelargs, device_arrs, access_type
+    ):
         """
         Convert arguments to ctypes and append to kernelargs
         """
@@ -440,9 +471,11 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs,
                 usm_buf = dpctl_mem.MemoryUSMShared(val.size * val.dtype.itemsize)
                 usm_ndarr = np.ndarray(val.shape, buffer=usm_buf, dtype=val.dtype)
 
-                if (default_behavior or
-                    self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY or
-                    self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE):
+                if (
+                    default_behavior
+                    or self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY
+                    or self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE
+                ):
                     np.copyto(usm_ndarr, val)
 
                 device_arrs[-1] = (usm_buf, usm_ndarr, val)
@@ -470,13 +503,13 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs,
             cval = ctypes.c_uint8(int(val))
             kernelargs.append(cval)
         elif ty == types.complex64:
-            #kernelargs.append(ctypes.c_float(val.real))
-            #kernelargs.append(ctypes.c_float(val.imag))
+            # kernelargs.append(ctypes.c_float(val.real))
+            # kernelargs.append(ctypes.c_float(val.imag))
             raise NotImplementedError(ty, val)
 
         elif ty == types.complex128:
-            #kernelargs.append(ctypes.c_double(val.real))
-            #kernelargs.append(ctypes.c_double(val.imag))
+            # kernelargs.append(ctypes.c_double(val.real))
+            # kernelargs.append(ctypes.c_double(val.imag))
             raise NotImplementedError(ty, val)
 
         else:
@@ -484,13 +517,16 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs,
 
     def check_for_invalid_access_type(self, access_type):
         if access_type not in self.valid_access_types:
-            msg = ("[!] %s is not a valid access type. "
-                  "Supported access types are [" % (access_type))
+            msg = (
+                "[!] %s is not a valid access type. "
+                "Supported access types are [" % (access_type)
+            )
             for key in self.valid_access_types:
                 msg += " %s |" % (key)
 
             msg = msg[:-1] + "]"
-            if access_type != None: print(msg)
+            if access_type != None:
+                print(msg)
             return True
         else:
             return False
@@ -518,28 +554,27 @@ def __call__(self, *args, **kwargs):
                 _raise_no_device_found_error()
 
         kernel = self.specialize(*args)
-        cfg = kernel.configure(self.sycl_queue, self.global_size,
-                               self.local_size)
+        cfg = kernel.configure(self.sycl_queue, self.global_size, self.local_size)
         cfg(*args)
 
     def specialize(self, *args):
-        argtypes = tuple([self.typingctx.resolve_argument_type(a)
-                          for a in args])
+        argtypes = tuple([self.typingctx.resolve_argument_type(a) for a in args])
         q = None
         kernel = None
         # we were previously using the _env_ptr of the device_env, the sycl_queue
         # should be sufficient to cache the compiled kernel for now, but we should
         # use the device type to cache such kernels
-        #key_definitions = (self.sycl_queue, argtypes)
-        key_definitions = (argtypes)
+        # key_definitions = (self.sycl_queue, argtypes)
+        key_definitions = argtypes
         result = self.definitions.get(key_definitions)
         if result:
             q, kernel = result
 
         if q and self.sycl_queue.equals(q):
-                return kernel
+            return kernel
         else:
-            kernel = compile_kernel(self.sycl_queue, self.py_func, argtypes,
-                                    self.access_types)
+            kernel = compile_kernel(
+                self.sycl_queue, self.py_func, argtypes, self.access_types
+            )
             self.definitions[key_definitions] = (self.sycl_queue, kernel)
         return kernel
diff --git a/numba_dppy/decorators.py b/numba_dppy/decorators.py
index 641d924134..7576e20aae 100644
--- a/numba_dppy/decorators.py
+++ b/numba_dppy/decorators.py
@@ -1,7 +1,12 @@
 from __future__ import print_function, absolute_import, division
 from numba.core import sigutils, types
-from .compiler import (compile_kernel, JitDPPYKernel, compile_dppy_func_template,
-                       compile_dppy_func, get_ordered_arg_access_types)
+from .compiler import (
+    compile_kernel,
+    JitDPPYKernel,
+    compile_dppy_func_template,
+    compile_dppy_func,
+    get_ordered_arg_access_types,
+)
 
 
 def kernel(signature=None, access_types=None, debug=False):
@@ -23,13 +28,14 @@ def autojit(debug=False, access_types=None):
     def _kernel_autojit(pyfunc):
         ordered_arg_access_types = get_ordered_arg_access_types(pyfunc, access_types)
         return JitDPPYKernel(pyfunc, ordered_arg_access_types)
+
     return _kernel_autojit
 
 
 def _kernel_jit(signature, debug, access_types):
     argtypes, restype = sigutils.normalize_signature(signature)
     if restype is not None and restype != types.void:
-        msg = ("DPPY kernel must have void return type but got {restype}")
+        msg = "DPPY kernel must have void return type but got {restype}"
         raise TypeError(msg.format(restype=restype))
 
     def _wrapped(pyfunc):
@@ -39,7 +45,6 @@ def _wrapped(pyfunc):
     return _wrapped
 
 
-
 def func(signature=None):
     if signature is None:
         return _func_autojit
@@ -58,5 +63,6 @@ def _wrapped(pyfunc):
 
     return _wrapped
 
+
 def _func_autojit(pyfunc):
     return compile_dppy_func_template(pyfunc)
diff --git a/numba_dppy/descriptor.py b/numba_dppy/descriptor.py
index c8e6a58ec7..40e1ef0628 100644
--- a/numba_dppy/descriptor.py
+++ b/numba_dppy/descriptor.py
@@ -10,8 +10,8 @@
 
 class DPPYTarget(TargetDescriptor):
     options = CPUTargetOptions
-    #typingctx = DPPYTypingContext()
-    #targetctx = DPPYTargetContext(typingctx)
+    # typingctx = DPPYTypingContext()
+    # targetctx = DPPYTargetContext(typingctx)
 
     @utils.cached_property
     def _toplevel_target_context(self):
@@ -38,6 +38,5 @@ def typing_context(self):
         return self._toplevel_typing_context
 
 
-
 # The global DPPY target
 dppy_target = DPPYTarget()
diff --git a/numba_dppy/device_init.py b/numba_dppy/device_init.py
index efec55ba83..57c0d9c6e7 100644
--- a/numba_dppy/device_init.py
+++ b/numba_dppy/device_init.py
@@ -22,9 +22,7 @@
 We are importing dpnp stub module to make Numba recognize the
 module when we rename Numpy functions.
 """
-from .dpnp_glue.stubs import (
-    dpnp
-)
+from .dpnp_glue.stubs import dpnp
 
 DEFAULT_LOCAL_SIZE = []
 
diff --git a/numba_dppy/dispatcher.py b/numba_dppy/dispatcher.py
index d00a597875..e4a9ddecd3 100644
--- a/numba_dppy/dispatcher.py
+++ b/numba_dppy/dispatcher.py
@@ -2,21 +2,22 @@
 
 import numpy as np
 
-#from numba.targets.descriptors import TargetDescriptor
-#from numba.targets.options import TargetOptions
-#import numba_dppy, numba_dppy as dppy
+# from numba.targets.descriptors import TargetDescriptor
+# from numba.targets.options import TargetOptions
+# import numba_dppy, numba_dppy as dppy
 from numba_dppy import kernel, autojit
 from .descriptor import dppy_target
-#from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
- #                                       GUFuncCallSteps)
+
+# from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc,
+#                                       GUFuncCallSteps)
 
 from .. import dispatcher, utils, typing
 from .compiler import DPPYCompiler
 
+
 class DPPYDispatcher(dispatcher.Dispatcher):
     targetdescr = dppy_target
 
-
     def __init__(self, py_func, locals={}, targetoptions={}):
         assert not locals
         self.py_func = py_func
@@ -44,8 +45,7 @@ def __call__(self, *args, **kws):
         return self.compiled(*args, **kws)
 
     def disable_compile(self, val=True):
-        """Disable the compilation of new signatures at call time.
-        """
+        """Disable the compilation of new signatures at call time."""
         # Do nothing
         pass
 
@@ -58,6 +58,7 @@ def __getitem__(self, *args):
     def __getattr__(self, key):
         return getattr(self.compiled, key)
 
+
 class DPPYUFuncDispatcher(object):
     """
     Invoke the OpenCL ufunc specialization for the given inputs.
@@ -89,8 +90,7 @@ def __call__(self, *args, **kws):
         return DPPYUFuncMechanism.call(self.functions, args, kws)
 
     def reduce(self, arg, stream=0):
-        assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \
-                                                         "ufunc"
+        assert len(list(self.functions.keys())[0]) == 2, "must be a binary " "ufunc"
         assert arg.ndim == 1, "must use 1d array"
 
         n = arg.shape[0]
@@ -112,7 +112,12 @@ def reduce(self, arg, stream=0):
             # reduce by recursively spliting and operating
             out = self.__reduce(mem, gpu_mems, stream)
             # store the resultong scalar in a [1,] buffer
-            buf = np.empty([out.size,], dtype=out.dtype)
+            buf = np.empty(
+                [
+                    out.size,
+                ],
+                dtype=out.dtype,
+            )
             # copy the result back to host
             out.copy_to_host(buf, stream=stream)
 
@@ -144,7 +149,7 @@ def __reduce(self, mem, gpu_mems, stream):
 
 class _DPPYGUFuncCallSteps(GUFuncCallSteps):
     __slots__ = [
-        '_stream',
+        "_stream",
     ]
 
     def is_device_array(self, obj):
@@ -161,7 +166,7 @@ def device_array(self, shape, dtype):
         return ocl.device_array(shape=shape, dtype=dtype, stream=self._stream)
 
     def prepare_inputs(self):
-        self._stream = self.kwargs.get('stream', 0)
+        self._stream = self.kwargs.get("stream", 0)
 
     def launch_kernel(self, kernel, nelem, args):
         kernel.forall(nelem, queue=self._stream)(*args)
@@ -173,27 +178,26 @@ def _call_steps(self):
         return _DPPYGUFuncCallSteps
 
     def _broadcast_scalar_input(self, ary, shape):
-        return devicearray.DeviceNDArray(shape=shape,
-                                         strides=(0,),
-                                         dtype=ary.dtype,
-                                         gpu_data=ary.gpu_data)
+        return devicearray.DeviceNDArray(
+            shape=shape, strides=(0,), dtype=ary.dtype, gpu_data=ary.gpu_data
+        )
 
     def _broadcast_add_axis(self, ary, newshape):
         newax = len(newshape) - len(ary.shape)
         # Add 0 strides for missing dimension
         newstrides = (0,) * newax + ary.strides
-        return devicearray.DeviceNDArray(shape=newshape,
-                                         strides=newstrides,
-                                         dtype=ary.dtype,
-                                         gpu_data=ary.gpu_data)
+        return devicearray.DeviceNDArray(
+            shape=newshape, strides=newstrides, dtype=ary.dtype, gpu_data=ary.gpu_data
+        )
 
 
 class DPPYUFuncMechanism(UFuncMechanism):
     """
     Provide OpenCL specialization
     """
+
     DEFAULT_STREAM = 0
-    ARRAY_ORDER = 'A'
+    ARRAY_ORDER = "A"
 
     def launch(self, func, count, stream, args):
         func.forall(count, queue=stream)(*args)
@@ -211,9 +215,11 @@ def device_array(self, shape, dtype, stream):
         return ocl.device_array(shape=shape, dtype=dtype, stream=stream)
 
     def broadcast_device(self, ary, shape):
-        ax_differs = [ax for ax in range(len(shape))
-                      if ax >= ary.ndim
-                      or ary.shape[ax] != shape[ax]]
+        ax_differs = [
+            ax
+            for ax in range(len(shape))
+            if ax >= ary.ndim or ary.shape[ax] != shape[ax]
+        ]
 
         missingdim = len(shape) - len(ary.shape)
         strides = [0] * missingdim + list(ary.strides)
@@ -221,7 +227,6 @@ def broadcast_device(self, ary, shape):
         for ax in ax_differs:
             strides[ax] = 0
 
-        return devicearray.DeviceNDArray(shape=shape,
-                                         strides=strides,
-                                         dtype=ary.dtype,
-                                         gpu_data=ary.gpu_data)
+        return devicearray.DeviceNDArray(
+            shape=shape, strides=strides, dtype=ary.dtype, gpu_data=ary.gpu_data
+        )
diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py
index 17d6b5ad6a..9e76480680 100644
--- a/numba_dppy/dpnp_glue/__init__.py
+++ b/numba_dppy/dpnp_glue/__init__.py
@@ -1,6 +1,6 @@
 def ensure_dpnp(name):
     try:
-       # import dpnp
+        # import dpnp
         from . import dpnp_fptr_interface as dpnp_glue
     except ImportError:
         raise ImportError("dpNP is needed to call np.%s" % name)
diff --git a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py
index 9146299b05..4b47382c57 100644
--- a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py
+++ b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py
@@ -7,6 +7,7 @@
 import numpy as np
 from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS
 
+
 @overload(stubs.dpnp.eig)
 def dpnp_eig_impl(a):
     name = "eig"
@@ -22,10 +23,8 @@ def dpnp_eig_impl(a):
     void dpnp_eig_c(const void* array_in, void* result1, void* result2, size_t size)
 
     """
-    sig = signature(
-        ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp
-    )
-    dpnp_eig = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp)
+    dpnp_eig = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -92,8 +91,14 @@ def dpnp_dot_impl(a, b):
 
     """
     sig = signature(
-        ret_type, types.voidptr, types.voidptr, types.voidptr,
-                  types.intp, types.intp, types.intp)
+        ret_type,
+        types.voidptr,
+        types.voidptr,
+        types.voidptr,
+        types.intp,
+        types.intp,
+        types.intp,
+    )
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -137,6 +142,7 @@ def dpnp_dot_impl(a, b):
     ndims = [a.ndim, b.ndim]
     if ndims == [2, 2]:
         dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig)
+
         def dot_2_mm(a, b):
             sycl_queue = get_sycl_queue()
 
@@ -170,11 +176,12 @@ def dot_2_mm(a, b):
         return dot_2_mm
     elif ndims == [2, 1]:
         dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig)
+
         def dot_2_mv(a, b):
             sycl_queue = get_sycl_queue()
 
             m, k = a.shape
-            _n,  = b.shape
+            (_n,) = b.shape
             n = 1
 
             if _n != k:
@@ -186,7 +193,7 @@ def dot_2_mv(a, b):
             b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue)
             copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize)
 
-            out = np.empty((m, ), dtype=res_dtype)
+            out = np.empty((m,), dtype=res_dtype)
             out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue)
 
             dpnp_func(a_usm, b_usm, out_usm, m, n, k)
@@ -204,10 +211,11 @@ def dot_2_mv(a, b):
         return dot_2_mv
     elif ndims == [1, 2]:
         dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig)
+
         def dot_2_vm(a, b):
             sycl_queue = get_sycl_queue()
 
-            m, = a.shape
+            (m,) = a.shape
             k, n = b.shape
 
             if m != k:
@@ -219,7 +227,7 @@ def dot_2_vm(a, b):
             b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue)
             copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize)
 
-            out = np.empty((n, ), dtype=res_dtype)
+            out = np.empty((n,), dtype=res_dtype)
             out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue)
 
             dpnp_func(a_usm, b_usm, out_usm, m, n, k)
@@ -236,14 +244,16 @@ def dot_2_vm(a, b):
 
         return dot_2_vm
     elif ndims == [1, 1]:
-        sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr,
-                                  types.intp)
+        sig = signature(
+            ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp
+        )
         dpnp_func = dpnp_ext.dpnp_func("dpnp_dot", [a.dtype.name, "NONE"], sig)
+
         def dot_2_vv(a, b):
             sycl_queue = get_sycl_queue()
 
-            m, = a.shape
-            n, = b.shape
+            (m,) = a.shape
+            (n,) = b.shape
 
             if m != n:
                 raise ValueError("Incompatible array sizes for np.dot(a, b)")
diff --git a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py
index 8ec200059b..073b83e900 100644
--- a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py
+++ b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py
@@ -22,7 +22,9 @@ def dpnp_argmax_impl(a):
     void custom_argmax_c(void* array1_in, void* result1, size_t size)
     """
     sig = signature(ret_type, types.voidptr, types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig)
+    dpnp_func = dpnp_ext.dpnp_func(
+        "dpnp_" + name, [a.dtype.name, np.dtype(np.int64).name], sig
+    )
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -72,7 +74,9 @@ def dpnp_argmin_impl(a):
     void custom_argmin_c(void* array1_in, void* result1, size_t size)
     """
     sig = signature(ret_type, types.voidptr, types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig)
+    dpnp_func = dpnp_ext.dpnp_func(
+        "dpnp_" + name, [a.dtype.name, np.dtype(np.int64).name], sig
+    )
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -122,7 +126,7 @@ def dpnp_argsort_impl(a):
     void custom_argmin_c(void* array1_in, void* result1, size_t size)
     """
     sig = signature(ret_type, types.voidptr, types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
diff --git a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py
index cae9507902..a3883dc860 100644
--- a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py
+++ b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py
@@ -28,10 +28,16 @@ def dpnp_amax_impl(a):
     if the compiler allows there should not be any mismatch in the size of
     the container to hold different types of pointer.
     """
-    sig = signature(ret_type, types.voidptr, types.voidptr,
-                              types.voidptr, types.intp,
-                              types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    sig = signature(
+        ret_type,
+        types.voidptr,
+        types.voidptr,
+        types.voidptr,
+        types.intp,
+        types.voidptr,
+        types.intp,
+    )
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -85,10 +91,16 @@ def dpnp_amin_impl(a):
     if the compiler allows there should not be any mismatch in the size of
     the container to hold different types of pointer.
     """
-    sig = signature(ret_type, types.voidptr, types.voidptr,
-                              types.voidptr, types.intp,
-                              types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    sig = signature(
+        ret_type,
+        types.voidptr,
+        types.voidptr,
+        types.voidptr,
+        types.intp,
+        types.voidptr,
+        types.intp,
+    )
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -141,10 +153,16 @@ def dpnp_mean_impl(a):
     if the compiler allows there should not be any mismatch in the size of
     the container to hold different types of pointer.
     """
-    sig = signature(ret_type, types.voidptr, types.voidptr,
-                              types.voidptr, types.intp,
-                              types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    sig = signature(
+        ret_type,
+        types.voidptr,
+        types.voidptr,
+        types.voidptr,
+        types.intp,
+        types.voidptr,
+        types.intp,
+    )
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -200,10 +218,16 @@ def dpnp_median_impl(a):
     if the compiler allows there should not be any mismatch in the size of
     the container to hold different types of pointer.
     """
-    sig = signature(ret_type, types.voidptr, types.voidptr,
-                              types.voidptr, types.intp,
-                              types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    sig = signature(
+        ret_type,
+        types.voidptr,
+        types.voidptr,
+        types.voidptr,
+        types.intp,
+        types.voidptr,
+        types.intp,
+    )
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -254,9 +278,8 @@ def dpnp_cov_impl(a):
     Function declaration:
     void custom_cov_c(void* array1_in, void* result1, size_t nrows, size_t ncols)
     """
-    sig = signature(ret_type, types.voidptr, types.voidptr,
-                              types.intp, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    sig = signature(ret_type, types.voidptr, types.voidptr, types.intp, types.intp)
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -268,7 +291,6 @@ def dpnp_cov_impl(a):
     if a.dtype == types.float64:
         copy_input_to_double = False
 
-
     def dpnp_impl(a):
         if a.size == 0:
             raise ValueError("Passed Empty array")
@@ -280,9 +302,15 @@ def dpnp_impl(a):
             a_copy_in_double = a.astype(np.float64)
         else:
             a_copy_in_double = a
-        a_usm = allocate_usm_shared(a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue)
-        copy_usm(sycl_queue, a_usm, a_copy_in_double.ctypes,
-                 a_copy_in_double.size * a_copy_in_double.itemsize)
+        a_usm = allocate_usm_shared(
+            a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue
+        )
+        copy_usm(
+            sycl_queue,
+            a_usm,
+            a_copy_in_double.ctypes,
+            a_copy_in_double.size * a_copy_in_double.itemsize,
+        )
 
         if a.ndim == 2:
             rows = a.shape[0]
diff --git a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py
index f7ba425206..af79ad2c1c 100644
--- a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py
+++ b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py
@@ -23,7 +23,7 @@ def dpnp_sum_impl(a):
 
     """
     sig = signature(ret_type, types.voidptr, types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
@@ -70,7 +70,7 @@ def dpnp_prod_impl(a):
     void custom_prod_c(void* array1_in, void* result1, size_t size)
     """
     sig = signature(ret_type, types.voidptr, types.voidptr, types.intp)
-    dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig)
+    dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig)
 
     get_sycl_queue = dpctl_functions.dpctl_get_current_queue()
     allocate_usm_shared = dpctl_functions.dpctl_malloc_shared()
diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py
index ce1f7d3583..373018c8db 100644
--- a/numba_dppy/dpnp_glue/dpnpdecl.py
+++ b/numba_dppy/dpnp_glue/dpnpdecl.py
@@ -1,8 +1,9 @@
-from numba.core.typing.templates import (AttributeTemplate, infer_getattr)
+from numba.core.typing.templates import AttributeTemplate, infer_getattr
 import numba_dppy
 from numba import types
 from numba.core.types.misc import RawPointer
 
+
 @infer_getattr
 class DppyDpnpTemplate(AttributeTemplate):
     key = types.Module(numba_dppy)
@@ -10,11 +11,14 @@ class DppyDpnpTemplate(AttributeTemplate):
     def resolve_dpnp(self, mod):
         return types.Module(numba_dppy.dpnp)
 
+
 """
 This adds a shapeptr attribute to Numba type representing np.ndarray.
 This allows us to get the raw pointer to the structure where the shape
 of an ndarray is stored from an overloaded implementation
 """
+
+
 @infer_getattr
 class ArrayAttribute(AttributeTemplate):
     key = types.Array
diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py
index fa429f923f..a3dc5ce195 100644
--- a/numba_dppy/dpnp_glue/dpnpimpl.py
+++ b/numba_dppy/dpnp_glue/dpnpimpl.py
@@ -8,23 +8,27 @@
 
 ll_void_p = ir.IntType(8).as_pointer()
 
+
 def get_dpnp_fptr(fn_name, type_names):
     from . import dpnp_fptr_interface as dpnp_glue
 
     f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names)
     return f_ptr
 
+
 @register_jitable
 def _check_finite_matrix(a):
     for v in np.nditer(a):
         if not np.isfinite(v.item()):
             raise np.linalg.LinAlgError("Array must not contain infs or NaNs.")
 
+
 @register_jitable
 def _dummy_liveness_func(a):
     """pass a list of variables to be preserved through dead code elimination"""
     return a[0]
 
+
 def dpnp_func(fn_name, type_names, sig):
     f_ptr = get_dpnp_fptr(fn_name, type_names)
 
@@ -33,15 +37,19 @@ def get_pointer(obj):
 
     return types.ExternalFunctionPointer(sig, get_pointer=get_pointer)
 
+
 """
 This function retrieves the pointer to the structure where the shape
 of an ndarray is stored. We cast it to void * to make it easier to
 pass around.
 """
+
+
 @lower_getattr(types.Array, "shapeptr")
 def array_shape(context, builder, typ, value):
-    shape_ptr = builder.gep(value.operands[0],
-                            [context.get_constant(types.int32, 0),
-                             context.get_constant(types.int32, 5)])
+    shape_ptr = builder.gep(
+        value.operands[0],
+        [context.get_constant(types.int32, 0), context.get_constant(types.int32, 5)],
+    )
 
     return builder.bitcast(shape_ptr, ll_void_p)
diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py
index 2fdd6ecbe3..fa7e06ea48 100644
--- a/numba_dppy/dpnp_glue/stubs.py
+++ b/numba_dppy/dpnp_glue/stubs.py
@@ -1,9 +1,10 @@
 from numba_dppy.ocl.stubs import Stub
 
+
 class dpnp(Stub):
-    """dpnp namespace
-    """
-    _description_ = '<dpnp>'
+    """dpnp namespace"""
+
+    _description_ = "<dpnp>"
 
     class sum(Stub):
         pass
diff --git a/numba_dppy/dppy_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py
index 2808ddf90d..585d461127 100644
--- a/numba_dppy/dppy_host_fn_call_gen.py
+++ b/numba_dppy/dppy_host_fn_call_gen.py
@@ -9,6 +9,7 @@
 
 from numba.core.ir_utils import legalize_names
 
+
 class DPPYHostFunctionCallsGenerator(object):
     def __init__(self, lowerer, cres, num_inputs):
         self.lowerer = lowerer
@@ -27,8 +28,8 @@ def __init__(self, lowerer, cres, num_inputs):
         self.null_ptr = self._create_null_ptr()
 
         self.total_kernel_args = 0
-        self.cur_arg           = 0
-        self.num_inputs        = num_inputs
+        self.cur_arg = 0
+        self.num_inputs = num_inputs
 
         # list of buffer that needs to comeback to host
         self.write_buffs = []
@@ -36,65 +37,91 @@ def __init__(self, lowerer, cres, num_inputs):
         # list of buffer that does not need to comeback to host
         self.read_only_buffs = []
 
-
     def _create_null_ptr(self):
-        null_ptr = cgutils.alloca_once(self.builder, self.void_ptr_t,
-                size=self.context.get_constant(types.uintp, 1), name="null_ptr")
+        null_ptr = cgutils.alloca_once(
+            self.builder,
+            self.void_ptr_t,
+            size=self.context.get_constant(types.uintp, 1),
+            name="null_ptr",
+        )
         self.builder.store(
             self.builder.inttoptr(
-                self.context.get_constant(types.uintp, 0), self.void_ptr_t),
-                null_ptr)
+                self.context.get_constant(types.uintp, 0), self.void_ptr_t
+            ),
+            null_ptr,
+        )
         return null_ptr
 
-
     def _init_llvm_types_and_constants(self):
-        self.byte_t          = lc.Type.int(8)
-        self.byte_ptr_t      = lc.Type.pointer(self.byte_t)
-        self.byte_ptr_ptr_t  = lc.Type.pointer(self.byte_ptr_t)
-        self.intp_t          = self.context.get_value_type(types.intp)
-        self.int64_t         = self.context.get_value_type(types.int64)
-        self.int32_t         = self.context.get_value_type(types.int32)
-        self.int32_ptr_t     = lc.Type.pointer(self.int32_t)
-        self.uintp_t         = self.context.get_value_type(types.uintp)
-        self.intp_ptr_t      = lc.Type.pointer(self.intp_t)
-        self.uintp_ptr_t     = lc.Type.pointer(self.uintp_t)
-        self.zero            = self.context.get_constant(types.uintp, 0)
-        self.one             = self.context.get_constant(types.uintp, 1)
-        self.one_type        = self.one.type
-        self.sizeof_intp     = self.context.get_abi_sizeof(self.intp_t)
-        self.void_ptr_t      = self.context.get_value_type(types.voidptr)
-        self.void_ptr_ptr_t  = lc.Type.pointer(self.void_ptr_t)
+        self.byte_t = lc.Type.int(8)
+        self.byte_ptr_t = lc.Type.pointer(self.byte_t)
+        self.byte_ptr_ptr_t = lc.Type.pointer(self.byte_ptr_t)
+        self.intp_t = self.context.get_value_type(types.intp)
+        self.int64_t = self.context.get_value_type(types.int64)
+        self.int32_t = self.context.get_value_type(types.int32)
+        self.int32_ptr_t = lc.Type.pointer(self.int32_t)
+        self.uintp_t = self.context.get_value_type(types.uintp)
+        self.intp_ptr_t = lc.Type.pointer(self.intp_t)
+        self.uintp_ptr_t = lc.Type.pointer(self.uintp_t)
+        self.zero = self.context.get_constant(types.uintp, 0)
+        self.one = self.context.get_constant(types.uintp, 1)
+        self.one_type = self.one.type
+        self.sizeof_intp = self.context.get_abi_sizeof(self.intp_t)
+        self.void_ptr_t = self.context.get_value_type(types.voidptr)
+        self.void_ptr_ptr_t = lc.Type.pointer(self.void_ptr_t)
         self.sizeof_void_ptr = self.context.get_abi_sizeof(self.intp_t)
         self.sycl_queue_val = None
 
     def _declare_functions(self):
         get_queue_fnty = lc.Type.function(self.void_ptr_t, ())
-        self.get_queue = self.builder.module.get_or_insert_function(get_queue_fnty,
-                                                                name="DPCTLQueueMgr_GetCurrentQueue")
-
-        submit_range_fnty = lc.Type.function(self.void_ptr_t,
-                [self.void_ptr_t, self.void_ptr_t, self.void_ptr_ptr_t,
-                    self.int32_ptr_t, self.intp_t, self.intp_ptr_t,
-                    self.intp_t, self.void_ptr_t, self.intp_t])
-        self.submit_range = self.builder.module.get_or_insert_function(submit_range_fnty,
-                                                                name="DPCTLQueue_SubmitRange")
-
-
-        queue_memcpy_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t])
-        self.queue_memcpy = self.builder.module.get_or_insert_function(queue_memcpy_fnty,
-                                                                name="DPCTLQueue_Memcpy")
-
-        queue_wait_fnty =  lc.Type.function(lir.VoidType(), [self.void_ptr_t])
-        self.queue_wait = self.builder.module.get_or_insert_function(queue_wait_fnty,
-                                                                name="DPCTLQueue_Wait")
-
-        usm_shared_fnty = lc.Type.function(self.void_ptr_t, [self.intp_t, self.void_ptr_t])
-        self.usm_shared = self.builder.module.get_or_insert_function(usm_shared_fnty,
-                                                                name="DPCTLmalloc_shared")
-
-        usm_free_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t])
-        self.usm_free = self.builder.module.get_or_insert_function(usm_free_fnty,
-                                                                   name="DPCTLfree_with_queue")
+        self.get_queue = self.builder.module.get_or_insert_function(
+            get_queue_fnty, name="DPCTLQueueMgr_GetCurrentQueue"
+        )
+
+        submit_range_fnty = lc.Type.function(
+            self.void_ptr_t,
+            [
+                self.void_ptr_t,
+                self.void_ptr_t,
+                self.void_ptr_ptr_t,
+                self.int32_ptr_t,
+                self.intp_t,
+                self.intp_ptr_t,
+                self.intp_t,
+                self.void_ptr_t,
+                self.intp_t,
+            ],
+        )
+        self.submit_range = self.builder.module.get_or_insert_function(
+            submit_range_fnty, name="DPCTLQueue_SubmitRange"
+        )
+
+        queue_memcpy_fnty = lc.Type.function(
+            lir.VoidType(),
+            [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t],
+        )
+        self.queue_memcpy = self.builder.module.get_or_insert_function(
+            queue_memcpy_fnty, name="DPCTLQueue_Memcpy"
+        )
+
+        queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t])
+        self.queue_wait = self.builder.module.get_or_insert_function(
+            queue_wait_fnty, name="DPCTLQueue_Wait"
+        )
+
+        usm_shared_fnty = lc.Type.function(
+            self.void_ptr_t, [self.intp_t, self.void_ptr_t]
+        )
+        self.usm_shared = self.builder.module.get_or_insert_function(
+            usm_shared_fnty, name="DPCTLmalloc_shared"
+        )
+
+        usm_free_fnty = lc.Type.function(
+            lir.VoidType(), [self.void_ptr_t, self.void_ptr_t]
+        )
+        self.usm_free = self.builder.module.get_or_insert_function(
+            usm_free_fnty, name="DPCTLfree_with_queue"
+        )
 
     def allocate_kenrel_arg_array(self, num_kernel_args):
         self.sycl_queue_val = cgutils.alloca_once(self.builder, self.void_ptr_t)
@@ -104,17 +131,21 @@ def allocate_kenrel_arg_array(self, num_kernel_args):
 
         # we need a kernel arg array to enqueue
         self.kernel_arg_array = cgutils.alloca_once(
-            self.builder, self.void_ptr_t, size=self.context.get_constant(
-                types.uintp, num_kernel_args), name="kernel_arg_array")
+            self.builder,
+            self.void_ptr_t,
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+            name="kernel_arg_array",
+        )
 
         self.kernel_arg_ty_array = cgutils.alloca_once(
-            self.builder, self.int32_t, size=self.context.get_constant(
-                types.uintp, num_kernel_args), name="kernel_arg_ty_array")
-
+            self.builder,
+            self.int32_t,
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+            name="kernel_arg_ty_array",
+        )
 
     def resolve_and_return_dpctl_type(self, ty):
-        """This function looks up the dpctl defined enum values from DPCTLKernelArgType.
-        """
+        """This function looks up the dpctl defined enum values from DPCTLKernelArgType."""
 
         val = None
         if ty == types.int32 or isinstance(ty, types.scalars.IntegerLiteral):
@@ -136,20 +167,26 @@ def resolve_and_return_dpctl_type(self, ty):
         else:
             raise NotImplementedError
 
-        assert(val != None)
+        assert val != None
 
         return val
 
-
     def form_kernel_arg_and_arg_ty(self, val, ty):
-        kernel_arg_dst = self.builder.gep(self.kernel_arg_array, [self.context.get_constant(types.int32, self.cur_arg)])
-        kernel_arg_ty_dst = self.builder.gep(self.kernel_arg_ty_array, [self.context.get_constant(types.int32, self.cur_arg)])
+        kernel_arg_dst = self.builder.gep(
+            self.kernel_arg_array,
+            [self.context.get_constant(types.int32, self.cur_arg)],
+        )
+        kernel_arg_ty_dst = self.builder.gep(
+            self.kernel_arg_ty_array,
+            [self.context.get_constant(types.int32, self.cur_arg)],
+        )
         self.cur_arg += 1
         self.builder.store(val, kernel_arg_dst)
         self.builder.store(ty, kernel_arg_ty_dst)
 
-
-    def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays):
+    def process_kernel_arg(
+        self, var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays
+    ):
         if isinstance(arg_type, types.npytypes.Array):
             if llvm_arg is None:
                 raise NotImplementedError(arg_type, var)
@@ -157,50 +194,83 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m
             storage = cgutils.alloca_once(self.builder, self.int64_t)
             self.builder.store(self.context.get_constant(types.int64, 0), storage)
             ty = self.resolve_and_return_dpctl_type(types.int64)
-            self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty)
+            self.form_kernel_arg_and_arg_ty(
+                self.builder.bitcast(storage, self.void_ptr_t), ty
+            )
 
             storage = cgutils.alloca_once(self.builder, self.int64_t)
             self.builder.store(self.context.get_constant(types.int64, 0), storage)
             ty = self.resolve_and_return_dpctl_type(types.int64)
-            self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty)
-
+            self.form_kernel_arg_and_arg_ty(
+                self.builder.bitcast(storage, self.void_ptr_t), ty
+            )
 
             # Handle array size
-            array_size_member = self.builder.gep(llvm_arg,
-                    [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 2)])
-
-            ty =  self.resolve_and_return_dpctl_type(types.int64)
-            self.form_kernel_arg_and_arg_ty(self.builder.bitcast(array_size_member, self.void_ptr_t), ty)
+            array_size_member = self.builder.gep(
+                llvm_arg,
+                [
+                    self.context.get_constant(types.int32, 0),
+                    self.context.get_constant(types.int32, 2),
+                ],
+            )
 
+            ty = self.resolve_and_return_dpctl_type(types.int64)
+            self.form_kernel_arg_and_arg_ty(
+                self.builder.bitcast(array_size_member, self.void_ptr_t), ty
+            )
 
             # Handle itemsize
-            item_size_member = self.builder.gep(llvm_arg,
-                    [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 3)])
-
-            ty =  self.resolve_and_return_dpctl_type(types.int64)
-            self.form_kernel_arg_and_arg_ty(self.builder.bitcast(item_size_member, self.void_ptr_t), ty)
+            item_size_member = self.builder.gep(
+                llvm_arg,
+                [
+                    self.context.get_constant(types.int32, 0),
+                    self.context.get_constant(types.int32, 3),
+                ],
+            )
 
+            ty = self.resolve_and_return_dpctl_type(types.int64)
+            self.form_kernel_arg_and_arg_ty(
+                self.builder.bitcast(item_size_member, self.void_ptr_t), ty
+            )
 
             # Calculate total buffer size
-            total_size = cgutils.alloca_once(self.builder, self.intp_t,
-                    size=self.one, name="total_size" + str(self.cur_arg))
-            self.builder.store(self.builder.sext(self.builder.mul(self.builder.load(array_size_member),
-                               self.builder.load(item_size_member)), self.intp_t), total_size)
+            total_size = cgutils.alloca_once(
+                self.builder,
+                self.intp_t,
+                size=self.one,
+                name="total_size" + str(self.cur_arg),
+            )
+            self.builder.store(
+                self.builder.sext(
+                    self.builder.mul(
+                        self.builder.load(array_size_member),
+                        self.builder.load(item_size_member),
+                    ),
+                    self.intp_t,
+                ),
+                total_size,
+            )
 
             # Handle data
-            data_member = self.builder.gep(llvm_arg,
-                    [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 4)])
+            data_member = self.builder.gep(
+                llvm_arg,
+                [
+                    self.context.get_constant(types.int32, 0),
+                    self.context.get_constant(types.int32, 4),
+                ],
+            )
 
             buffer_name = "buffer_ptr" + str(self.cur_arg)
-            buffer_ptr = cgutils.alloca_once(self.builder, self.void_ptr_t,
-                                             name=buffer_name)
+            buffer_ptr = cgutils.alloca_once(
+                self.builder, self.void_ptr_t, name=buffer_name
+            )
 
-
-            args = [self.builder.load(total_size),
-                    self.builder.load(self.sycl_queue_val)]
+            args = [
+                self.builder.load(total_size),
+                self.builder.load(self.sycl_queue_val),
+            ]
             self.builder.store(self.builder.call(self.usm_shared, args), buffer_ptr)
 
-
             # names are replaces usig legalize names, we have to do the same for them to match
             legal_names = legalize_names([var])
 
@@ -211,46 +281,70 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m
 
             # We really need to detect when an array needs to be copied over
             if index < self.num_inputs:
-                args = [self.builder.load(self.sycl_queue_val),
-                        self.builder.load(buffer_ptr),
-                        self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t),
-                        self.builder.load(total_size)]
+                args = [
+                    self.builder.load(self.sycl_queue_val),
+                    self.builder.load(buffer_ptr),
+                    self.builder.bitcast(
+                        self.builder.load(data_member), self.void_ptr_t
+                    ),
+                    self.builder.load(total_size),
+                ]
                 self.builder.call(self.queue_memcpy, args)
 
-
-            ty =  self.resolve_and_return_dpctl_type(types.voidptr)
+            ty = self.resolve_and_return_dpctl_type(types.voidptr)
             self.form_kernel_arg_and_arg_ty(self.builder.load(buffer_ptr), ty)
 
             # Handle shape
-            shape_member = self.builder.gep(llvm_arg,
-                    [self.context.get_constant(types.int32, 0),
-                     self.context.get_constant(types.int32, 5)])
+            shape_member = self.builder.gep(
+                llvm_arg,
+                [
+                    self.context.get_constant(types.int32, 0),
+                    self.context.get_constant(types.int32, 5),
+                ],
+            )
 
             for this_dim in range(arg_type.ndim):
-                shape_entry = self.builder.gep(shape_member,
-                                [self.context.get_constant(types.int32, 0),
-                                 self.context.get_constant(types.int32, this_dim)])
-
-                ty =  self.resolve_and_return_dpctl_type(types.int64)
-                self.form_kernel_arg_and_arg_ty(self.builder.bitcast(shape_entry, self.void_ptr_t), ty)
-
+                shape_entry = self.builder.gep(
+                    shape_member,
+                    [
+                        self.context.get_constant(types.int32, 0),
+                        self.context.get_constant(types.int32, this_dim),
+                    ],
+                )
+
+                ty = self.resolve_and_return_dpctl_type(types.int64)
+                self.form_kernel_arg_and_arg_ty(
+                    self.builder.bitcast(shape_entry, self.void_ptr_t), ty
+                )
 
             # Handle strides
-            stride_member = self.builder.gep(llvm_arg,
-                    [self.context.get_constant(types.int32, 0),
-                     self.context.get_constant(types.int32, 6)])
+            stride_member = self.builder.gep(
+                llvm_arg,
+                [
+                    self.context.get_constant(types.int32, 0),
+                    self.context.get_constant(types.int32, 6),
+                ],
+            )
 
             for this_stride in range(arg_type.ndim):
-                stride_entry = self.builder.gep(stride_member,
-                                [self.context.get_constant(types.int32, 0),
-                                 self.context.get_constant(types.int32, this_stride)])
-
-                ty =  self.resolve_and_return_dpctl_type(types.int64)
-                self.form_kernel_arg_and_arg_ty(self.builder.bitcast(stride_entry, self.void_ptr_t), ty)
+                stride_entry = self.builder.gep(
+                    stride_member,
+                    [
+                        self.context.get_constant(types.int32, 0),
+                        self.context.get_constant(types.int32, this_stride),
+                    ],
+                )
+
+                ty = self.resolve_and_return_dpctl_type(types.int64)
+                self.form_kernel_arg_and_arg_ty(
+                    self.builder.bitcast(stride_entry, self.void_ptr_t), ty
+                )
 
         else:
-            ty =  self.resolve_and_return_dpctl_type(arg_type)
-            self.form_kernel_arg_and_arg_ty(self.builder.bitcast(llvm_arg, self.void_ptr_t), ty)
+            ty = self.resolve_and_return_dpctl_type(arg_type)
+            self.form_kernel_arg_and_arg_ty(
+                self.builder.bitcast(llvm_arg, self.void_ptr_t), ty
+            )
 
     def enqueue_kernel_and_read_back(self, loop_ranges):
         # the assumption is loop_ranges will always be less than or equal to 3 dimensions
@@ -258,8 +352,11 @@ def enqueue_kernel_and_read_back(self, loop_ranges):
 
         # form the global range
         global_range = cgutils.alloca_once(
-                        self.builder, self.uintp_t,
-                        size=self.context.get_constant(types.uintp, num_dim), name="global_range")
+            self.builder,
+            self.uintp_t,
+            size=self.context.get_constant(types.uintp, num_dim),
+            name="global_range",
+        )
 
         for i in range(num_dim):
             start, stop, step = loop_ranges[i]
@@ -267,20 +364,28 @@ def enqueue_kernel_and_read_back(self, loop_ranges):
                 stop = self.builder.sext(stop, self.one_type)
 
             # we reverse the global range to account for how sycl and opencl range differs
-            self.builder.store(stop,
-                               self.builder.gep(global_range, [self.context.get_constant(types.uintp, (num_dim-1)-i)]))
-
-
-        args = [self.builder.inttoptr(self.context.get_constant(types.uintp, self.kernel_addr), self.void_ptr_t),
-                self.builder.load(self.sycl_queue_val),
-                self.kernel_arg_array,
-                self.kernel_arg_ty_array,
-                self.context.get_constant(types.uintp, self.total_kernel_args),
-                self.builder.bitcast(global_range, self.intp_ptr_t),
-                self.context.get_constant(types.uintp, num_dim),
-                self.builder.bitcast(self.null_ptr,  self.void_ptr_t),
-                self.context.get_constant(types.uintp, 0)
-                ]
+            self.builder.store(
+                stop,
+                self.builder.gep(
+                    global_range,
+                    [self.context.get_constant(types.uintp, (num_dim - 1) - i)],
+                ),
+            )
+
+        args = [
+            self.builder.inttoptr(
+                self.context.get_constant(types.uintp, self.kernel_addr),
+                self.void_ptr_t,
+            ),
+            self.builder.load(self.sycl_queue_val),
+            self.kernel_arg_array,
+            self.kernel_arg_ty_array,
+            self.context.get_constant(types.uintp, self.total_kernel_args),
+            self.builder.bitcast(global_range, self.intp_ptr_t),
+            self.context.get_constant(types.uintp, num_dim),
+            self.builder.bitcast(self.null_ptr, self.void_ptr_t),
+            self.context.get_constant(types.uintp, 0),
+        ]
         self.builder.call(self.submit_range, args)
 
         self.builder.call(self.queue_wait, [self.builder.load(self.sycl_queue_val)])
@@ -288,14 +393,22 @@ def enqueue_kernel_and_read_back(self, loop_ranges):
         # read buffers back to host
         for write_buff in self.write_buffs:
             buffer_ptr, total_size, data_member = write_buff
-            args = [self.builder.load(self.sycl_queue_val),
-                    self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t),
-                    self.builder.load(buffer_ptr),
-                    self.builder.load(total_size)]
+            args = [
+                self.builder.load(self.sycl_queue_val),
+                self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t),
+                self.builder.load(buffer_ptr),
+                self.builder.load(total_size),
+            ]
             self.builder.call(self.queue_memcpy, args)
 
-            self.builder.call(self.usm_free, [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)])
+            self.builder.call(
+                self.usm_free,
+                [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)],
+            )
 
         for read_buff in self.read_only_buffs:
             buffer_ptr, total_size, data_member = read_buff
-            self.builder.call(self.usm_free, [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)])
+            self.builder.call(
+                self.usm_free,
+                [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)],
+            )
diff --git a/numba_dppy/dppy_lowerer.py b/numba_dppy/dppy_lowerer.py
index 3040362592..420414ec2f 100644
--- a/numba_dppy/dppy_lowerer.py
+++ b/numba_dppy/dppy_lowerer.py
@@ -9,28 +9,29 @@
 import numpy as np
 
 import numba
-from numba.core import (compiler, ir, types, sigutils, lowering,
-                funcdesc, config)
+from numba.core import compiler, ir, types, sigutils, lowering, funcdesc, config
 from numba.parfors import parfor
 import numba_dppy, numba_dppy as dppy
-from numba.core.ir_utils import (add_offset_to_labels,
-                            replace_var_names,
-                            remove_dels,
-                            legalize_names,
-                            mk_unique_var,
-                            rename_labels,
-                            get_name_var_table,
-                            visit_vars_inner,
-                            guard,
-                            find_callname,
-                            remove_dead,
-                            get_call_table,
-                            is_pure,
-                            build_definitions,
-                            get_np_ufunc_typ,
-                            get_unused_var_name,
-                            find_potential_aliases,
-                            is_const_call)
+from numba.core.ir_utils import (
+    add_offset_to_labels,
+    replace_var_names,
+    remove_dels,
+    legalize_names,
+    mk_unique_var,
+    rename_labels,
+    get_name_var_table,
+    visit_vars_inner,
+    guard,
+    find_callname,
+    remove_dead,
+    get_call_table,
+    is_pure,
+    build_definitions,
+    get_np_ufunc_typ,
+    get_unused_var_name,
+    find_potential_aliases,
+    is_const_call,
+)
 
 from numba.core.typing import signature
 
@@ -47,9 +48,9 @@ def _print_block(block):
     for i, inst in enumerate(block.body):
         print("    ", i, inst)
 
+
 def _print_body(body_dict):
-    '''Pretty-print a set of IR blocks.
-    '''
+    """Pretty-print a set of IR blocks."""
     for label, block in body_dict.items():
         print("label: ", label)
         _print_block(block)
@@ -61,9 +62,9 @@ def _print_body(body_dict):
 # through OpenCL and generate for loops for the remaining
 # dimensions
 def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict):
-    gufunc_txt    = ""
+    gufunc_txt = ""
     global_id_dim = 0
-    for_loop_dim  = parfor_dim
+    for_loop_dim = parfor_dim
 
     if parfor_dim > 3:
         global_id_dim = 3
@@ -71,9 +72,14 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict):
         global_id_dim = parfor_dim
 
     for eachdim in range(global_id_dim):
-        gufunc_txt += ("    " + legal_loop_indices[eachdim] + " = "
-                       + "dppy.get_global_id(" + str(eachdim) + ")\n")
-
+        gufunc_txt += (
+            "    "
+            + legal_loop_indices[eachdim]
+            + " = "
+            + "dppy.get_global_id("
+            + str(eachdim)
+            + ")\n"
+        )
 
     for eachdim in range(global_id_dim, for_loop_dim):
         for indent in range(1 + (eachdim - global_id_dim)):
@@ -82,11 +88,15 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict):
         start, stop, step = loop_ranges[eachdim]
         start = param_dict.get(str(start), start)
         stop = param_dict.get(str(stop), stop)
-        gufunc_txt += ("for " +
-                   legal_loop_indices[eachdim] +
-                   " in range(" + str(start) +
-                   ", " + str(stop) +
-                   " + 1):\n")
+        gufunc_txt += (
+            "for "
+            + legal_loop_indices[eachdim]
+            + " in range("
+            + str(start)
+            + ", "
+            + str(stop)
+            + " + 1):\n"
+        )
 
     for eachdim in range(global_id_dim, for_loop_dim):
         for indent in range(1 + (eachdim - global_id_dim)):
@@ -114,17 +124,18 @@ def _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap):
                 strconsttyp = types.StringLiteral(strval)
 
                 lhs = ir.Var(scope, mk_unique_var("str_const"), loc)
-                assign_lhs = ir.Assign(value=ir.Const(value=strval, loc=loc),
-                                       target=lhs, loc=loc)
+                assign_lhs = ir.Assign(
+                    value=ir.Const(value=strval, loc=loc), target=lhs, loc=loc
+                )
                 typemap[lhs.name] = strconsttyp
                 new_block.append(assign_lhs)
 
                 # Make print node
-                print_node = ir.Print(args=[lhs, inst.target], vararg=None,
-                                      loc=loc)
+                print_node = ir.Print(args=[lhs, inst.target], vararg=None, loc=loc)
                 new_block.append(print_node)
-                sig = numba.typing.signature(types.none, typemap[lhs.name],
-                                             typemap[inst.target.name])
+                sig = numba.typing.signature(
+                    types.none, typemap[lhs.name], typemap[inst.target.name]
+                )
                 lowerer.fndesc.calltypes[print_node] = sig
         loop_body[label] = new_block
 
@@ -134,33 +145,36 @@ def replace_var_with_array_in_block(vars, block, typemap, calltypes):
     for inst in block.body:
         if isinstance(inst, ir.Assign) and inst.target.name in vars:
             const_node = ir.Const(0, inst.loc)
-            const_var = ir.Var(inst.target.scope, mk_unique_var("$const_ind_0"),
-                               inst.loc)
+            const_var = ir.Var(
+                inst.target.scope, mk_unique_var("$const_ind_0"), inst.loc
+            )
             typemap[const_var.name] = types.uintp
             const_assign = ir.Assign(const_node, const_var, inst.loc)
             new_block.append(const_assign)
 
-            setitem_node = ir.SetItem(inst.target, const_var, inst.value,
-                                      inst.loc)
+            setitem_node = ir.SetItem(inst.target, const_var, inst.value, inst.loc)
             calltypes[setitem_node] = signature(
-                types.none, types.npytypes.Array(typemap[inst.target.name], 1,
-                                                 "C"), types.intp,
-                                                 typemap[inst.target.name])
+                types.none,
+                types.npytypes.Array(typemap[inst.target.name], 1, "C"),
+                types.intp,
+                typemap[inst.target.name],
+            )
             new_block.append(setitem_node)
             continue
         elif isinstance(inst, parfor.Parfor):
-            replace_var_with_array_internal(vars, {0: inst.init_block},
-                                            typemap, calltypes)
-            replace_var_with_array_internal(vars, inst.loop_body,
-                                            typemap, calltypes)
+            replace_var_with_array_internal(
+                vars, {0: inst.init_block}, typemap, calltypes
+            )
+            replace_var_with_array_internal(vars, inst.loop_body, typemap, calltypes)
 
         new_block.append(inst)
     return new_block
 
+
 def replace_var_with_array_internal(vars, loop_body, typemap, calltypes):
     for label, block in loop_body.items():
-        block.body = replace_var_with_array_in_block(vars, block, typemap,
-                                                     calltypes)
+        block.body = replace_var_with_array_in_block(vars, block, typemap, calltypes)
+
 
 def replace_var_with_array(vars, loop_body, typemap, calltypes):
     replace_var_with_array_internal(vars, loop_body, typemap, calltypes)
@@ -178,17 +192,18 @@ def wrap_loop_body(loop_body):
     blocks[last_label].body.append(ir.Jump(first_label, loc))
     return blocks
 
+
 def unwrap_loop_body(loop_body):
     last_label = max(loop_body.keys())
     loop_body[last_label].body = loop_body[last_label].body[:-1]
 
 
 def legalize_names_with_typemap(names, typemap):
-    """ We use ir_utils.legalize_names to replace internal IR variable names
-        containing illegal characters (e.g. period) with a legal character
-        (underscore) so as to create legal variable names.
-        The original variable names are in the typemap so we also
-        need to add the legalized name to the typemap as well.
+    """We use ir_utils.legalize_names to replace internal IR variable names
+    containing illegal characters (e.g. period) with a legal character
+    (underscore) so as to create legal variable names.
+    The original variable names are in the typemap so we also
+    need to add the legalized name to the typemap as well.
     """
     outdict = legalize_names(names)
     # For each pair in the dict of legalized names...
@@ -206,6 +221,7 @@ def to_scalar_from_0d(x):
             return x.dtype
     return x
 
+
 def find_setitems_block(setitems, block, typemap):
     for inst in block.body:
         if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem):
@@ -214,15 +230,17 @@ def find_setitems_block(setitems, block, typemap):
             find_setitems_block(setitems, inst.init_block, typemap)
             find_setitems_body(setitems, inst.loop_body, typemap)
 
+
 def find_setitems_body(setitems, loop_body, typemap):
     """
-      Find the arrays that are written into (goes into setitems)
+    Find the arrays that are written into (goes into setitems)
     """
     for label, block in loop_body.items():
         find_setitems_block(setitems, block, typemap)
 
+
 def _create_gufunc_for_regular_parfor():
-    #TODO
+    # TODO
     pass
 
 
@@ -231,18 +249,19 @@ def _create_gufunc_for_reduction_parfor():
 
 
 def _create_gufunc_for_parfor_body(
-        lowerer,
-        parfor,
-        typemap,
-        typingctx,
-        targetctx,
-        flags,
-        loop_ranges,
-        locals,
-        has_aliases,
-        index_var_typ,
-        races):
-    '''
+    lowerer,
+    parfor,
+    typemap,
+    typingctx,
+    targetctx,
+    flags,
+    loop_ranges,
+    locals,
+    has_aliases,
+    index_var_typ,
+    races,
+):
+    """
     Takes a parfor and creates a gufunc function for its body. There
     are two parts to this function:
 
@@ -257,7 +276,7 @@ def _create_gufunc_for_parfor_body(
     IR retrieved with run_frontend. The IR is scanned for the sentinel
     assignment where that basic block is split and the IR for the parfor
     body inserted.
-    '''
+    """
 
     loc = parfor.init_block.loc
 
@@ -288,25 +307,22 @@ def _create_gufunc_for_parfor_body(
     typemap = lowerer.fndesc.typemap
 
     parfor_redvars, parfor_reddict = numba.parfors.parfor.get_parfor_reductions(
-                                        lowerer.func_ir,
-                                        parfor,
-                                        parfor_params,
-                                        lowerer.fndesc.calltypes)
+        lowerer.func_ir, parfor, parfor_params, lowerer.fndesc.calltypes
+    )
     has_reduction = False if len(parfor_redvars) == 0 else True
 
     if has_reduction:
         _create_gufunc_for_reduction_parfor()
 
     # Compute just the parfor inputs as a set difference.
-    parfor_inputs = sorted(
-        list(
-            set(parfor_params) -
-            set(parfor_outputs)))
+    parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs)))
 
     for race in races:
-        msg = ("Variable %s used in parallel loop may be written "
-               "to simultaneously by multiple workers and may result "
-               "in non-deterministic or unintended results." % race)
+        msg = (
+            "Variable %s used in parallel loop may be written "
+            "to simultaneously by multiple workers and may result "
+            "in non-deterministic or unintended results." % race
+        )
         warnings.warn(NumbaParallelSafetyWarning(msg, loc))
     replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes)
 
@@ -321,15 +337,13 @@ def _create_gufunc_for_parfor_body(
     def addrspace_from(params, def_addr):
         addrspaces = []
         for p in params:
-            if isinstance(to_scalar_from_0d(typemap[p]),
-                          types.npytypes.Array):
+            if isinstance(to_scalar_from_0d(typemap[p]), types.npytypes.Array):
                 addrspaces.append(def_addr)
             else:
                 addrspaces.append(None)
         return addrspaces
 
-    addrspaces = addrspace_from(parfor_params,
-                                numba_dppy.target.SPIR_GLOBAL_ADDRSPACE)
+    addrspaces = addrspace_from(parfor_params, numba_dppy.target.SPIR_GLOBAL_ADDRSPACE)
 
     if config.DEBUG_ARRAY_OPT >= 1:
         print("parfor_params = ", parfor_params, type(parfor_params))
@@ -351,8 +365,7 @@ def addrspace_from(params, def_addr):
 
     if config.DEBUG_ARRAY_OPT >= 1:
         print("ind_dict = ", sorted(ind_dict.items()), type(ind_dict))
-        print("legal_loop_indices = ",legal_loop_indices,
-              type(legal_loop_indices))
+        print("legal_loop_indices = ", legal_loop_indices, type(legal_loop_indices))
 
         for pd in parfor_params:
             print("pd = ", pd)
@@ -365,14 +378,15 @@ def addrspace_from(params, def_addr):
 
     # Calculate types of args passed to gufunc.
     func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)]
-    assert(len(param_types_addrspaces) == len(addrspaces))
+    assert len(param_types_addrspaces) == len(addrspaces)
     for i in range(len(param_types_addrspaces)):
         if addrspaces[i] is not None:
-            #print("before:", id(param_types_addrspaces[i]))
-            assert(isinstance(param_types_addrspaces[i], types.npytypes.Array))
-            param_types_addrspaces[i] = (param_types_addrspaces[i]
-                                        .copy(addrspace=addrspaces[i]))
-            #print("setting param type", i, param_types[i], id(param_types[i]),
+            # print("before:", id(param_types_addrspaces[i]))
+            assert isinstance(param_types_addrspaces[i], types.npytypes.Array)
+            param_types_addrspaces[i] = param_types_addrspaces[i].copy(
+                addrspace=addrspaces[i]
+            )
+            # print("setting param type", i, param_types[i], id(param_types[i]),
             #      "to addrspace", param_types_addrspaces[i].addrspace)
 
     def print_arg_with_addrspaces(args):
@@ -396,10 +410,12 @@ def print_arg_with_addrspaces(args):
     parfor_params = []
     ascontig = False
     for pindex in range(len(parfor_params_orig)):
-        if (ascontig and
-            pindex < len(parfor_inputs) and
-            isinstance(param_types[pindex], types.npytypes.Array)):
-            parfor_params.append(parfor_params_orig[pindex]+"param")
+        if (
+            ascontig
+            and pindex < len(parfor_inputs)
+            and isinstance(param_types[pindex], types.npytypes.Array)
+        ):
+            parfor_params.append(parfor_params_orig[pindex] + "param")
         else:
             parfor_params.append(parfor_params_orig[pindex])
 
@@ -409,11 +425,7 @@ def print_arg_with_addrspaces(args):
     sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table)
 
     if config.DEBUG_ARRAY_OPT >= 1:
-        print(
-            "legal parfor_params = ",
-            parfor_params,
-            type(parfor_params))
-
+        print("legal parfor_params = ", parfor_params, type(parfor_params))
 
     # Determine the unique names of the scheduling and gufunc functions.
     gufunc_name = "__numba_parfor_gufunc_%s" % (parfor.id)
@@ -428,9 +440,9 @@ def print_arg_with_addrspaces(args):
     gufunc_txt += "def " + gufunc_name
     gufunc_txt += "(" + (", ".join(parfor_params)) + "):\n"
 
-
-    gufunc_txt += _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges,
-                                 param_dict)
+    gufunc_txt += _schedule_loop(
+        parfor_dim, legal_loop_indices, loop_ranges, param_dict
+    )
 
     # Add the sentinel assignment so that we can find the loop body position
     # in the IR.
@@ -463,8 +475,7 @@ def print_arg_with_addrspaces(args):
     # rename all variables in gufunc_ir afresh
     var_table = get_name_var_table(gufunc_ir.blocks)
     new_var_dict = {}
-    reserved_names = [sentinel_name] + \
-        list(param_dict.values()) + legal_loop_indices
+    reserved_names = [sentinel_name] + list(param_dict.values()) + legal_loop_indices
     for name, var in var_table.items():
         if not (name in reserved_names):
             new_var_dict[name] = mk_unique_var(name)
@@ -481,10 +492,8 @@ def print_arg_with_addrspaces(args):
 
     if config.DEBUG_ARRAY_OPT:
         print(
-            "gufunc_param_types = ",
-            type(gufunc_param_types),
-            "\n",
-            gufunc_param_types)
+            "gufunc_param_types = ", type(gufunc_param_types), "\n", gufunc_param_types
+        )
 
     gufunc_stub_last_label = max(gufunc_ir.blocks.keys()) + 1
 
@@ -503,7 +512,7 @@ def print_arg_with_addrspaces(args):
         _print_body(loop_body)
 
     wrapped_blocks = wrap_loop_body(loop_body)
-    #hoisted, not_hoisted = hoist(parfor_params, loop_body,
+    # hoisted, not_hoisted = hoist(parfor_params, loop_body,
     #                             typemap, wrapped_blocks)
     setitems = set()
     find_setitems_body(setitems, loop_body, typemap)
@@ -516,11 +525,12 @@ def print_arg_with_addrspaces(args):
     unwrap_loop_body(loop_body)
 
     # store hoisted into diagnostics
-    diagnostics = lowerer.metadata['parfor_diagnostics']
-    diagnostics.hoist_info[parfor.id] = {'hoisted': hoisted,
-                                         'not_hoisted': not_hoisted}
+    diagnostics = lowerer.metadata["parfor_diagnostics"]
+    diagnostics.hoist_info[parfor.id] = {"hoisted": hoisted, "not_hoisted": not_hoisted}
 
-    lowerer.metadata['parfor_diagnostics'].extra_info[str(parfor.id)] = str(dpctl.get_current_queue().get_sycl_device().get_device_name())
+    lowerer.metadata["parfor_diagnostics"].extra_info[str(parfor.id)] = str(
+        dpctl.get_current_queue().get_sycl_device().get_device_name()
+    )
 
     if config.DEBUG_ARRAY_OPT:
         print("After hoisting")
@@ -529,8 +539,7 @@ def print_arg_with_addrspaces(args):
     # Search all the block in the gufunc outline for the sentinel assignment.
     for label, block in gufunc_ir.blocks.items():
         for i, inst in enumerate(block.body):
-            if (isinstance(inst, ir.Assign) and
-                inst.target.name == sentinel_name):
+            if isinstance(inst, ir.Assign) and inst.target.name == sentinel_name:
                 # We found the sentinel assignment.
                 loc = inst.loc
                 scope = block.scope
@@ -541,7 +550,7 @@ def print_arg_with_addrspaces(args):
                 prev_block.body = block.body[:i]
 
                 # The current block is used for statements after the sentinel.
-                block.body = block.body[i + 1:]
+                block.body = block.body[i + 1 :]
                 # But the current block gets a new label.
                 body_first_label = min(loop_body.keys())
 
@@ -557,8 +566,7 @@ def print_arg_with_addrspaces(args):
                 gufunc_ir.blocks[label] = prev_block
                 # Add a jump from the last parfor body block to the block
                 # containing statements after the sentinel.
-                gufunc_ir.blocks[body_last_label].append(
-                    ir.Jump(new_label, loc))
+                gufunc_ir.blocks[body_last_label].append(ir.Jump(new_label, loc))
                 break
         else:
             continue
@@ -598,22 +606,21 @@ def print_arg_with_addrspaces(args):
         sys.stdout.flush()
 
     if config.DEBUG_ARRAY_OPT:
-        print('before DUFunc inlining'.center(80, '-'))
+        print("before DUFunc inlining".center(80, "-"))
         gufunc_ir.dump()
 
     # Inlining all DUFuncs
-    dufunc_inliner(gufunc_ir, lowerer.fndesc.calltypes, typemap,
-                   lowerer.context.typing_context)
+    dufunc_inliner(
+        gufunc_ir, lowerer.fndesc.calltypes, typemap, lowerer.context.typing_context
+    )
 
     if config.DEBUG_ARRAY_OPT:
-        print('after DUFunc inline'.center(80, '-'))
+        print("after DUFunc inline".center(80, "-"))
         gufunc_ir.dump()
 
     kernel_func = numba_dppy.compiler.compile_kernel_parfor(
-        dpctl.get_current_queue(),
-        gufunc_ir,
-        gufunc_param_types,
-        param_types_addrspaces)
+        dpctl.get_current_queue(), gufunc_ir, gufunc_param_types, param_types_addrspaces
+    )
 
     flags.noalias = old_alias
 
@@ -672,8 +679,9 @@ def _lower_parfor_gufunc(lowerer, parfor):
 
     alias_map = {}
     arg_aliases = {}
-    numba.parfors.parfor.find_potential_aliases_parfor(parfor, parfor.params, typemap,
-                                        lowerer.func_ir, alias_map, arg_aliases)
+    numba.parfors.parfor.find_potential_aliases_parfor(
+        parfor, parfor.params, typemap, lowerer.func_ir, alias_map, arg_aliases
+    )
     if config.DEBUG_ARRAY_OPT:
         print("alias_map", alias_map)
         print("arg_aliases", arg_aliases)
@@ -684,12 +692,12 @@ def _lower_parfor_gufunc(lowerer, parfor):
     assert parfor.params != None
 
     parfor_output_arrays = numba.parfors.parfor.get_parfor_outputs(
-        parfor, parfor.params)
-
+        parfor, parfor.params
+    )
 
     # compile parfor body as a separate function to be used with GUFuncWrapper
     flags = copy.copy(parfor.flags)
-    flags.set('error_model', 'numpy')
+    flags.set("error_model", "numpy")
 
     # Can't get here unless flags.set('auto_parallel', ParallelOptions(True))
     index_var_typ = typemap[parfor.loop_nests[0].index_variable.name]
@@ -702,8 +710,13 @@ def _lower_parfor_gufunc(lowerer, parfor):
     loop_ranges = [(l.start, l.stop, l.step) for l in parfor.loop_nests]
 
     try:
-        func, func_args, func_sig, func_arg_types, modified_arrays =(
-        _create_gufunc_for_parfor_body(
+        (
+            func,
+            func_args,
+            func_sig,
+            func_arg_types,
+            modified_arrays,
+        ) = _create_gufunc_for_parfor_body(
             lowerer,
             parfor,
             typemap,
@@ -714,7 +727,8 @@ def _lower_parfor_gufunc(lowerer, parfor):
             {},
             bool(alias_map),
             index_var_typ,
-            parfor.races))
+            parfor.races,
+        )
     finally:
         numba.parfors.parfor.sequential_parfor_lowering = False
 
@@ -735,12 +749,8 @@ def _lower_parfor_gufunc(lowerer, parfor):
         print("loop_ranges = ", loop_ranges)
 
     gu_signature = _create_shape_signature(
-        parfor.get_shape_classes,
-        num_inputs,
-        func_args,
-        func_sig,
-        parfor.races,
-        typemap)
+        parfor.get_shape_classes, num_inputs, func_args, func_sig, parfor.races, typemap
+    )
 
     generate_dppy_host_wrapper(
         lowerer,
@@ -754,7 +764,8 @@ def _lower_parfor_gufunc(lowerer, parfor):
         parfor.init_block,
         index_var_typ,
         parfor.races,
-        modified_arrays)
+        modified_arrays,
+    )
 
     if config.DEBUG_ARRAY_OPT:
         sys.stdout.flush()
@@ -765,50 +776,52 @@ def _lower_parfor_gufunc(lowerer, parfor):
 
 
 def _create_shape_signature(
-        get_shape_classes,
-        num_inputs,
-        #num_reductions,
-        args,
-        func_sig,
-        races,
-        typemap):
-    '''Create shape signature for GUFunc
-    '''
+    get_shape_classes,
+    num_inputs,
+    # num_reductions,
+    args,
+    func_sig,
+    races,
+    typemap,
+):
+    """Create shape signature for GUFunc"""
     if config.DEBUG_ARRAY_OPT:
         print("_create_shape_signature", num_inputs, args)
         arg_start_print = 0
         for i in args[arg_start_print:]:
             print("argument", i, type(i), get_shape_classes(i, typemap=typemap))
 
-    #num_inouts = len(args) - num_reductions
+    # num_inouts = len(args) - num_reductions
     num_inouts = len(args)
     # maximum class number for array shapes
-    classes = [get_shape_classes(var, typemap=typemap)
-               if var not in races else (-1,) for var in args[1:]]
+    classes = [
+        get_shape_classes(var, typemap=typemap) if var not in races else (-1,)
+        for var in args[1:]
+    ]
     class_set = set()
     for _class in classes:
         if _class:
             for i in _class:
                 class_set.add(i)
     max_class = max(class_set) + 1 if class_set else 0
-    classes.insert(0, (max_class,)) # force set the class of 'sched' argument
+    classes.insert(0, (max_class,))  # force set the class of 'sched' argument
     class_set.add(max_class)
     class_map = {}
     # TODO: use prefix + class number instead of single char
-    alphabet = ord('a')
+    alphabet = ord("a")
     for n in class_set:
-       if n >= 0:
-           class_map[n] = chr(alphabet)
-           alphabet += 1
+        if n >= 0:
+            class_map[n] = chr(alphabet)
+            alphabet += 1
 
-    alpha_dict = {'latest_alpha' : alphabet}
+    alpha_dict = {"latest_alpha": alphabet}
 
     def bump_alpha(c, class_map):
         if c >= 0:
             return class_map[c]
         else:
-            alpha_dict['latest_alpha'] += 1
-            return chr(alpha_dict['latest_alpha'])
+            alpha_dict["latest_alpha"] += 1
+            return chr(alpha_dict["latest_alpha"])
 
     gu_sin = []
     gu_sout = []
@@ -833,21 +846,24 @@ def bump_alpha(c, class_map):
 # Keep all the dppy kernels and programs created alive indefinitely.
 keep_alive_kernels = []
 
-def generate_dppy_host_wrapper(lowerer,
-                               cres,
-                               gu_signature,
-                               outer_sig,
-                               expr_args,
-                               num_inputs,
-                               expr_arg_types,
-                               loop_ranges,
-                               init_block,
-                               index_var_typ,
-                               races,
-                               modified_arrays):
-    '''
+
+def generate_dppy_host_wrapper(
+    lowerer,
+    cres,
+    gu_signature,
+    outer_sig,
+    expr_args,
+    num_inputs,
+    expr_arg_types,
+    loop_ranges,
+    init_block,
+    index_var_typ,
+    races,
+    modified_arrays,
+):
+    """
     Adds the call to the gufunc function from the main function.
-    '''
+    """
     context = lowerer.context
     builder = lowerer.builder
     sin, sout = gu_signature
@@ -856,8 +872,13 @@ def generate_dppy_host_wrapper(lowerer,
     if config.DEBUG_ARRAY_OPT:
         print("generate_dppy_host_wrapper")
         print("args = ", expr_args)
-        print("outer_sig = ", outer_sig.args, outer_sig.return_type,
-              outer_sig.recvr, outer_sig.pysig)
+        print(
+            "outer_sig = ",
+            outer_sig.args,
+            outer_sig.return_type,
+            outer_sig.recvr,
+            outer_sig.pysig,
+        )
         print("loop_ranges = ", loop_ranges)
         print("expr_args", expr_args)
         print("expr_arg_types", expr_arg_types)
@@ -866,13 +887,13 @@ def generate_dppy_host_wrapper(lowerer,
         print("sout", sout)
         print("cres", cres, type(cres))
         print("modified_arrays", modified_arrays)
-#        print("cres.library", cres.library, type(cres.library))
-#        print("cres.fndesc", cres.fndesc, type(cres.fndesc))
-
+    #        print("cres.library", cres.library, type(cres.library))
+    #        print("cres.fndesc", cres.fndesc, type(cres.fndesc))
 
     # get dppy_cpu_portion_lowerer object
     dppy_cpu_lowerer = dppy_call_gen.DPPYHostFunctionCallsGenerator(
-                           lowerer, cres, num_inputs)
+        lowerer, cres, num_inputs
+    )
 
     # Compute number of args ------------------------------------------------
     num_expanded_args = 0
@@ -911,8 +932,7 @@ def val_type_or_none(context, lowerer, x):
             return None
 
     all_llvm_args = [getvar_or_none(lowerer, x) for x in expr_args[:ninouts]]
-    all_val_types = ([val_type_or_none(context, lowerer, x)
-                     for x in expr_args[:ninouts]])
+    all_val_types = [val_type_or_none(context, lowerer, x) for x in expr_args[:ninouts]]
     all_args = [loadvar_or_none(lowerer, x) for x in expr_args[:ninouts]]
 
     keep_alive_kernels.append(cres)
@@ -922,19 +942,37 @@ def val_type_or_none(context, lowerer, x):
     # the enqueue function. Put each part of each argument into
     # kernel_arg_array.
     for var, llvm_arg, arg_type, gu_sig, val_type, index in zip(
-        expr_args, all_llvm_args, expr_arg_types, sin + sout, all_val_types,
-        range(len(expr_args))):
+        expr_args,
+        all_llvm_args,
+        expr_arg_types,
+        sin + sout,
+        all_val_types,
+        range(len(expr_args)),
+    ):
 
         if config.DEBUG_ARRAY_OPT:
-            print("var:", var, type(var),
-                  "\n\tllvm_arg:", llvm_arg, type(llvm_arg),
-                  "\n\targ_type:", arg_type, type(arg_type),
-                  "\n\tgu_sig:", gu_sig,
-                  "\n\tval_type:", val_type, type(val_type),
-                  "\n\tindex:", index)
-
-        dppy_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig,
-                                            val_type, index, modified_arrays)
+            print(
+                "var:",
+                var,
+                type(var),
+                "\n\tllvm_arg:",
+                llvm_arg,
+                type(llvm_arg),
+                "\n\targ_type:",
+                arg_type,
+                type(arg_type),
+                "\n\tgu_sig:",
+                gu_sig,
+                "\n\tval_type:",
+                val_type,
+                type(val_type),
+                "\n\tindex:",
+                index,
+            )
+
+        dppy_cpu_lowerer.process_kernel_arg(
+            var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays
+        )
     # -----------------------------------------------------------------------
 
     # loadvars for loop_ranges
@@ -949,7 +987,7 @@ def load_range(v):
         start, stop, step = loop_ranges[i]
         start = load_range(start)
         stop = load_range(stop)
-        assert(step == 1)  # We do not support loop steps other than 1
+        assert step == 1  # We do not support loop steps other than 1
         step = load_range(step)
         loop_ranges[i] = (start, stop, step)
 
@@ -985,10 +1023,26 @@ def relatively_deep_copy(obj, memo):
     from numba.core.types.abstract import Type
 
     # objects which shouldn't or can't be copied and it's ok not to copy it.
-    if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type,
-                        Dispatcher, ModuleType, Signature,
-                        DPPYFunctionTemplate, CompileResult, DUFunc, _CFuncPtr,
-                        type, str, bool, type(None))):
+    if isinstance(
+        obj,
+        (
+            FunctionIdentity,
+            _DispatcherBase,
+            Function,
+            Type,
+            Dispatcher,
+            ModuleType,
+            Signature,
+            DPPYFunctionTemplate,
+            CompileResult,
+            DUFunc,
+            _CFuncPtr,
+            type,
+            str,
+            bool,
+            type(None),
+        ),
+    ):
         return obj
 
     from numba.core.ir import Global, FreeVar
@@ -997,15 +1051,24 @@ def relatively_deep_copy(obj, memo):
     from numba.core.funcdesc import FunctionDescriptor
 
     if isinstance(obj, FunctionDescriptor):
-        cpy = FunctionDescriptor(native=obj.native, modname=obj.modname, qualname=obj.qualname,
-                                 unique_name=obj.unique_name, doc=obj.doc,
-                                 typemap=relatively_deep_copy(obj.typemap, memo),
-                                 restype=obj.restype,
-                                 calltypes=relatively_deep_copy(obj.calltypes, memo),
-                                 args=obj.args, kws=obj.kws, mangler=None,
-                                 argtypes=relatively_deep_copy(obj.argtypes, memo),
-                                 inline=obj.inline, noalias=obj.noalias, env_name=obj.env_name,
-                                 global_dict=obj.global_dict)
+        cpy = FunctionDescriptor(
+            native=obj.native,
+            modname=obj.modname,
+            qualname=obj.qualname,
+            unique_name=obj.unique_name,
+            doc=obj.doc,
+            typemap=relatively_deep_copy(obj.typemap, memo),
+            restype=obj.restype,
+            calltypes=relatively_deep_copy(obj.calltypes, memo),
+            args=obj.args,
+            kws=obj.kws,
+            mangler=None,
+            argtypes=relatively_deep_copy(obj.argtypes, memo),
+            inline=obj.inline,
+            noalias=obj.noalias,
+            env_name=obj.env_name,
+            global_dict=obj.global_dict,
+        )
         # mangler parameter is not saved in FunctionDescriptor, but used to generated name.
         # So pass None as mangler parameter and then copy mangled_name by hands
         cpy.mangled_name = obj.mangled_name
@@ -1025,13 +1088,15 @@ def relatively_deep_copy(obj, memo):
         # This means that copy of IR actually has a side effect on it.
         pp = PostProcessor(obj)
         pp.run()
-        cpy = FunctionIR(blocks=relatively_deep_copy(obj.blocks, memo),
-                         is_generator=relatively_deep_copy(obj.is_generator, memo),
-                         func_id=relatively_deep_copy(obj.func_id, memo),
-                         loc=obj.loc,
-                         definitions=relatively_deep_copy(obj._definitions, memo),
-                         arg_count=obj.arg_count,
-                         arg_names=relatively_deep_copy(obj.arg_names, memo))
+        cpy = FunctionIR(
+            blocks=relatively_deep_copy(obj.blocks, memo),
+            is_generator=relatively_deep_copy(obj.is_generator, memo),
+            func_id=relatively_deep_copy(obj.func_id, memo),
+            loc=obj.loc,
+            definitions=relatively_deep_copy(obj._definitions, memo),
+            arg_count=obj.arg_count,
+            arg_names=relatively_deep_copy(obj.arg_names, memo),
+        )
         pp = PostProcessor(cpy)
         pp.run()
 
@@ -1142,8 +1207,9 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None):
         fndesc_cpu = relatively_deep_copy(fndesc, memo)
         func_ir_cpu = relatively_deep_copy(func_ir, memo)
 
-
-        cpu_context = context.cpu_context if isinstance(context, DPPYTargetContext) else context
+        cpu_context = (
+            context.cpu_context if isinstance(context, DPPYTargetContext) else context
+        )
         self.gpu_lower = Lower(context, library, fndesc, func_ir, metadata)
         self.cpu_lower = Lower(cpu_context, library, fndesc_cpu, func_ir_cpu, metadata)
 
@@ -1168,17 +1234,23 @@ def lower(self):
             lowering.lower_extensions[parfor.Parfor].append(lower_parfor_rollback)
             self.gpu_lower.lower()
             # if lower dont crash, and parfor_diagnostics is empty then it is kernel
-            if not self.gpu_lower.metadata['parfor_diagnostics'].extra_info:
-                str_name = str(dpctl.get_current_queue().get_sycl_device().get_device_name())
-                self.gpu_lower.metadata['parfor_diagnostics'].extra_info["kernel"] = str_name
+            if not self.gpu_lower.metadata["parfor_diagnostics"].extra_info:
+                str_name = str(
+                    dpctl.get_current_queue().get_sycl_device().get_device_name()
+                )
+                self.gpu_lower.metadata["parfor_diagnostics"].extra_info[
+                    "kernel"
+                ] = str_name
             self.base_lower = self.gpu_lower
             lowering.lower_extensions[parfor.Parfor].pop()
         except Exception as e:
             if numba_dppy.compiler.DEBUG:
                 print("Failed to lower parfor on DPPY-device. Due to:\n", e)
             lowering.lower_extensions[parfor.Parfor].pop()
-            if ((lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel) and
-                numba_dppy.config.FALLBACK_ON_CPU == 1):
+            if (
+                lowering.lower_extensions[parfor.Parfor][-1]
+                == numba.parfors.parfor_lowering._lower_parfor_parallel
+            ) and numba_dppy.config.FALLBACK_ON_CPU == 1:
                 self.cpu_lower.lower()
                 self.base_lower = self.cpu_lower
             else:
diff --git a/numba_dppy/dppy_offload_dispatcher.py b/numba_dppy/dppy_offload_dispatcher.py
index 0c5fe10f5e..58ca6d9729 100644
--- a/numba_dppy/dppy_offload_dispatcher.py
+++ b/numba_dppy/dppy_offload_dispatcher.py
@@ -6,18 +6,45 @@
 class DppyOffloadDispatcher(dispatcher.Dispatcher):
     targetdescr = cpu_target
 
-    def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler):
+    def __init__(
+        self,
+        py_func,
+        locals={},
+        targetoptions={},
+        impl_kind="direct",
+        pipeline_class=compiler.Compiler,
+    ):
         if dppy_config.dppy_present:
             from numba_dppy.compiler import DPPYCompiler
-            targetoptions['parallel'] = True
-            dispatcher.Dispatcher.__init__(self, py_func, locals=locals,
-                    targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPYCompiler)
+
+            targetoptions["parallel"] = True
+            dispatcher.Dispatcher.__init__(
+                self,
+                py_func,
+                locals=locals,
+                targetoptions=targetoptions,
+                impl_kind=impl_kind,
+                pipeline_class=DPPYCompiler,
+            )
         else:
-            print("---------------------------------------------------------------------")
-            print("WARNING : DPPY pipeline ignored. Ensure OpenCL drivers are installed.")
-            print("---------------------------------------------------------------------")
-            dispatcher.Dispatcher.__init__(self, py_func, locals=locals,
-                targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=pipeline_class)
+            print(
+                "---------------------------------------------------------------------"
+            )
+            print(
+                "WARNING : DPPY pipeline ignored. Ensure OpenCL drivers are installed."
+            )
+            print(
+                "---------------------------------------------------------------------"
+            )
+            dispatcher.Dispatcher.__init__(
+                self,
+                py_func,
+                locals=locals,
+                targetoptions=targetoptions,
+                impl_kind=impl_kind,
+                pipeline_class=pipeline_class,
+            )
+
 
-dispatcher_registry['__dppy_offload_gpu__'] = DppyOffloadDispatcher
-dispatcher_registry['__dppy_offload_cpu__'] = DppyOffloadDispatcher
+dispatcher_registry["__dppy_offload_gpu__"] = DppyOffloadDispatcher
+dispatcher_registry["__dppy_offload_cpu__"] = DppyOffloadDispatcher
diff --git a/numba_dppy/dppy_parfor_diagnostics.py b/numba_dppy/dppy_parfor_diagnostics.py
index 50e19a1cb1..51b3747cac 100644
--- a/numba_dppy/dppy_parfor_diagnostics.py
+++ b/numba_dppy/dppy_parfor_diagnostics.py
@@ -14,14 +14,14 @@ def dump(self, level=1):
         if self.extra_info:
             parfors_simple = self.get_parfors_simple(False)
             all_lines = self.get_all_lines(parfors_simple)
-            print(' Auto-offloading '.center(_termwidth,'-'))
+            print(" Auto-offloading ".center(_termwidth, "-"))
             self.print_auto_offloading(all_lines)
-            if 'kernel' in self.extra_info.keys():
-                print_wrapped("Device - '%s'" % self.extra_info['kernel'])
-            print(_termwidth * '-')
+            if "kernel" in self.extra_info.keys():
+                print_wrapped("Device - '%s'" % self.extra_info["kernel"])
+            print(_termwidth * "-")
 
     def print_auto_offloading(self, lines):
-        sword = '+--'
+        sword = "+--"
         fac = len(sword)
 
         summary = dict()
@@ -30,26 +30,26 @@ def print_auto_offloading(self, lines):
         def print_nest(fadj_, nadj_, theroot, reported, region_id):
             def print_g(fadj_, nadj_, nroot, depth):
                 for k in nadj_[nroot]:
-                    msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial')
+                    msg = fac * depth * " " + "%s%s %s" % (sword, k, "(serial")
                     if nadj_[k] == []:
                         fused = []
                         if fadj_[k] != [] and k not in reported:
                             fused = sorted(self.reachable_nodes(fadj_, k))
                             msg += ", fused with loop(s): "
-                            msg += ', '.join([str(x) for x in fused])
-                        msg += ')'
+                            msg += ", ".join([str(x) for x in fused])
+                        msg += ")"
                         reported.append(k)
                         print_wrapped(msg)
-                        summary[region_id]['fused'] += len(fused)
+                        summary[region_id]["fused"] += len(fused)
                     else:
-                        print_wrapped(msg + ')')
+                        print_wrapped(msg + ")")
                         print_g(fadj_, nadj_, k, depth + 1)
-                    summary[region_id]['serialized'] += 1
+                    summary[region_id]["serialized"] += 1
 
             if nadj_[theroot] != []:
                 print_wrapped("Parallel region %s:" % region_id)
-                print_wrapped('%s%s %s' % (sword, theroot, '(parallel)'))
-                summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0}
+                print_wrapped("%s%s %s" % (sword, theroot, "(parallel)"))
+                summary[region_id] = {"root": theroot, "fused": 0, "serialized": 0}
                 print_g(fadj_, nadj_, theroot, 1)
                 print("\n")
                 region_id = region_id + 1
@@ -57,15 +57,15 @@ def print_g(fadj_, nadj_, nroot, depth):
 
         def print_fuse(ty, pf_id, adj, depth, region_id):
             print_wrapped("Parallel region %s:" % region_id)
-            msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel')
+            msg = fac * depth * " " + "%s%s %s" % (sword, pf_id, "(parallel")
             fused = []
             if adj[pf_id] != []:
                 fused = sorted(self.reachable_nodes(adj, pf_id))
                 msg += ", fused with loop(s): "
-                msg += ', '.join([str(x) for x in fused])
+                msg += ", ".join([str(x) for x in fused])
 
-            summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0}
-            msg += ')'
+            summary[region_id] = {"root": pf_id, "fused": len(fused), "serialized": 0}
+            msg += ")"
             print_wrapped(msg)
             extra_info = self.extra_info.get(str(region_id))
             if extra_info:
@@ -78,10 +78,10 @@ def print_fuse(ty, pf_id, adj, depth, region_id):
         reported = []
         for line, info in sorted(lines.items()):
             opt_ty, pf_id, adj = info
-            if opt_ty == 'fuse':
+            if opt_ty == "fuse":
                 if pf_id not in reported:
-                    region_id = print_fuse('f', pf_id, adj, 0, region_id)
-            elif opt_ty == 'nest':
+                    region_id = print_fuse("f", pf_id, adj, 0, region_id)
+            elif opt_ty == "nest":
                 region_id = print_nest(fadj, nadj, pf_id, reported, region_id)
             else:
                 assert 0
@@ -89,18 +89,19 @@ def print_fuse(ty, pf_id, adj, depth, region_id):
         # print the summary of the fuse/serialize rewrite
         if summary:
             for k, v in sorted(summary.items()):
-                msg = ('\n \nParallel region %s (loop #%s) had %s '
-                    'loop(s) fused')
-                root = v['root']
-                fused = v['fused']
-                serialized = v['serialized']
+                msg = "\n \nParallel region %s (loop #%s) had %s " "loop(s) fused"
+                root = v["root"]
+                fused = v["fused"]
+                serialized = v["serialized"]
                 if serialized != 0:
-                    msg += (' and %s loop(s) '
-                    'serialized as part of the larger '
-                    'parallel loop (#%s).')
+                    msg += (
+                        " and %s loop(s) "
+                        "serialized as part of the larger "
+                        "parallel loop (#%s)."
+                    )
                     print_wrapped(msg % (k, root, fused, serialized, root))
                 else:
-                    msg += '.'
+                    msg += "."
                     print_wrapped(msg % (k, root, fused))
         else:
             print_wrapped("Parallel structure is already optimal.")
diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py
index 994351d509..c6b2534a62 100644
--- a/numba_dppy/dppy_passbuilder.py
+++ b/numba_dppy/dppy_passbuilder.py
@@ -2,32 +2,51 @@
 
 from numba.core.compiler_machinery import PassManager
 
-from numba.core.untyped_passes import (ExtractByteCode, TranslateByteCode, FixupArgs,
-                                  IRProcessing, DeadBranchPrune,
-                                  RewriteSemanticConstants, InlineClosureLikes,
-                                  GenericRewrites, WithLifting,
-                                  InlineInlinables, FindLiterallyCalls,
-                                  MakeFunctionToJitFunction,
-                                  CanonicalizeLoopExit, CanonicalizeLoopEntry,
-                                  ReconstructSSA,
-                                  LiteralUnroll)
-
-from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes,
-                                NopythonRewrites, PreParforPass, ParforPass,
-                                DumpParforDiagnostics, IRLegalization,
-                                InlineOverloads, PreLowerStripPhis)
+from numba.core.untyped_passes import (
+    ExtractByteCode,
+    TranslateByteCode,
+    FixupArgs,
+    IRProcessing,
+    DeadBranchPrune,
+    RewriteSemanticConstants,
+    InlineClosureLikes,
+    GenericRewrites,
+    WithLifting,
+    InlineInlinables,
+    FindLiterallyCalls,
+    MakeFunctionToJitFunction,
+    CanonicalizeLoopExit,
+    CanonicalizeLoopEntry,
+    ReconstructSSA,
+    LiteralUnroll,
+)
+
+from numba.core.typed_passes import (
+    NopythonTypeInference,
+    AnnotateTypes,
+    NopythonRewrites,
+    PreParforPass,
+    ParforPass,
+    DumpParforDiagnostics,
+    IRLegalization,
+    InlineOverloads,
+    PreLowerStripPhis,
+)
 
 from .dppy_passes import (
-        DPPYConstantSizeStaticLocalMemoryPass,
-        DPPYPreParforPass,
-        DPPYParforPass,
-        SpirvFriendlyLowering,
-        DPPYNoPythonBackend,
-        DPPYDumpParforDiagnostics
-        )
+    DPPYConstantSizeStaticLocalMemoryPass,
+    DPPYPreParforPass,
+    DPPYParforPass,
+    SpirvFriendlyLowering,
+    DPPYNoPythonBackend,
+    DPPYDumpParforDiagnostics,
+)
+
+from .rename_numpy_functions_pass import (
+    DPPYRewriteOverloadedNumPyFunctions,
+    DPPYRewriteNdarrayFunctions,
+)
 
-from .rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions,
-                                          DPPYRewriteNdarrayFunctions)
 
 class DPPYPassBuilder(object):
     """
@@ -38,8 +57,7 @@ class DPPYPassBuilder(object):
 
     @staticmethod
     def default_numba_nopython_pipeline(state, pm):
-        """Adds the default set of NUMBA passes to the pass manager
-        """
+        """Adds the default set of NUMBA passes to the pass manager"""
         if state.func_ir is None:
             pm.add_pass(TranslateByteCode, "analyzing bytecode")
             pm.add_pass(FixupArgs, "fix up args")
@@ -47,14 +65,18 @@ def default_numba_nopython_pipeline(state, pm):
         pm.add_pass(WithLifting, "Handle with contexts")
 
         # this pass rewrites name of NumPy functions we intend to overload
-        pm.add_pass(DPPYRewriteOverloadedNumPyFunctions,
-                "Rewrite name of Numpy functions to overload already overloaded function",
+        pm.add_pass(
+            DPPYRewriteOverloadedNumPyFunctions,
+            "Rewrite name of Numpy functions to overload already overloaded function",
         )
 
         # Add pass to ensure when users are allocating static
         # constant memory the size is a constant and can not
         # come from a closure variable
-        pm.add_pass(DPPYConstantSizeStaticLocalMemoryPass, "dppy constant size for static local memory")
+        pm.add_pass(
+            DPPYConstantSizeStaticLocalMemoryPass,
+            "dppy constant size for static local memory",
+        )
 
         # pre typing
         if not state.flags.no_rewrites:
@@ -62,11 +84,11 @@ def default_numba_nopython_pipeline(state, pm):
             pm.add_pass(DeadBranchPrune, "dead branch pruning")
             pm.add_pass(GenericRewrites, "nopython rewrites")
 
-        pm.add_pass(InlineClosureLikes,
-                    "inline calls to locally defined closures")
+        pm.add_pass(InlineClosureLikes, "inline calls to locally defined closures")
         # convert any remaining closures into functions
-        pm.add_pass(MakeFunctionToJitFunction,
-                    "convert make_function into JIT functions")
+        pm.add_pass(
+            MakeFunctionToJitFunction, "convert make_function into JIT functions"
+        )
         # inline functions that have been determined as inlinable and rerun
         # branch pruning, this needs to be run after closures are inlined as
         # the IR repr of a closure masks call sites if an inlinable is called
@@ -84,8 +106,9 @@ def default_numba_nopython_pipeline(state, pm):
         pm.add_pass(NopythonTypeInference, "nopython frontend")
         pm.add_pass(AnnotateTypes, "annotate types")
 
-        pm.add_pass(DPPYRewriteNdarrayFunctions,
-                "Rewrite ndarray functions to dppy supported functions",
+        pm.add_pass(
+            DPPYRewriteNdarrayFunctions,
+            "Rewrite ndarray functions to dppy supported functions",
         )
 
         # strip phis
@@ -94,11 +117,9 @@ def default_numba_nopython_pipeline(state, pm):
         # optimisation
         pm.add_pass(InlineOverloads, "inline overloaded functions")
 
-
     @staticmethod
-    def define_nopython_pipeline(state, name='dppy_nopython'):
-        """Returns an nopython mode pipeline based PassManager
-        """
+    def define_nopython_pipeline(state, name="dppy_nopython"):
+        """Returns an nopython mode pipeline based PassManager"""
         pm = PassManager(name)
         DPPYPassBuilder.default_numba_nopython_pipeline(state, pm)
 
diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py
index be9423230b..858420a098 100644
--- a/numba_dppy/dppy_passes.py
+++ b/numba_dppy/dppy_passes.py
@@ -16,19 +16,31 @@
     utils,
     typing,
     types,
-    )
+)
 
 from numba.core.ir_utils import remove_dels
 
-from numba.core.errors import (LoweringError, new_error_context, TypingError,
-                     LiteralTypingError)
+from numba.core.errors import (
+    LoweringError,
+    new_error_context,
+    TypingError,
+    LiteralTypingError,
+)
 
-from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass, AnalysisPass
+from numba.core.compiler_machinery import (
+    FunctionPass,
+    LoweringPass,
+    register_pass,
+    AnalysisPass,
+)
 
 from .dppy_lowerer import DPPYLower
 from numba_dppy import config as dppy_config
 
-from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map
+from numba.parfors.parfor import (
+    PreParforPass as _parfor_PreParforPass,
+    replace_functions_map,
+)
 from numba.parfors.parfor import ParforPass as _parfor_ParforPass
 from numba.parfors.parfor import Parfor
 
@@ -52,7 +64,11 @@ def run_pass(self, state):
         _DEBUG = False
 
         if _DEBUG:
-            print('Checks if size of OpenCL local address space alloca is a compile-time constant.'.center(80, '-'))
+            print(
+                "Checks if size of OpenCL local address space alloca is a compile-time constant.".center(
+                    80, "-"
+                )
+            )
             print(func_ir.dump())
 
         work_list = list(func_ir.blocks.items())
@@ -62,9 +78,14 @@ def run_pass(self, state):
                 if isinstance(instr, ir.Assign):
                     expr = instr.value
                     if isinstance(expr, ir.Expr):
-                        if expr.op == 'call':
-                            call_node = block.find_variable_assignment(expr.func.name).value
-                            if isinstance(call_node, ir.Expr) and call_node.attr == "static_alloc":
+                        if expr.op == "call":
+                            call_node = block.find_variable_assignment(
+                                expr.func.name
+                            ).value
+                            if (
+                                isinstance(call_node, ir.Expr)
+                                and call_node.attr == "static_alloc"
+                            ):
                                 arg = None
                                 # at first look in keyword arguments to get the shape, which has to be
                                 # constant
@@ -82,24 +103,28 @@ def run_pass(self, state):
                                 if isinstance(arg_type, ir.Expr):
                                     # we have a tuple
                                     for item in arg_type.items:
-                                        if not isinstance(func_ir.get_definition(item.name), ir.Const):
+                                        if not isinstance(
+                                            func_ir.get_definition(item.name), ir.Const
+                                        ):
                                             error = True
                                             break
 
                                 else:
-                                    if not isinstance(func_ir.get_definition(arg.name), ir.Const):
+                                    if not isinstance(
+                                        func_ir.get_definition(arg.name), ir.Const
+                                    ):
                                         error = True
                                         break
 
                                 if error:
-                                    warnings.warn_explicit("The size of the Local memory has to be constant",
-                                                           errors.NumbaError,
-                                                           state.func_id.filename,
-                                                           state.func_id.firstlineno)
+                                    warnings.warn_explicit(
+                                        "The size of the Local memory has to be constant",
+                                        errors.NumbaError,
+                                        state.func_id.filename,
+                                        state.func_id.firstlineno,
+                                    )
                                     raise
 
-
-
         if config.DEBUG or config.DUMP_IR:
             name = state.func_ir.func_id.func_qualname
             print(("IR DUMP: %s" % name).center(80, "-"))
@@ -124,22 +149,23 @@ def run_pass(self, state):
         # Ensure we have an IR and type information.
         assert state.func_ir
         functions_map = replace_functions_map.copy()
-        functions_map.pop(('dot', 'numpy'), None)
-        functions_map.pop(('sum', 'numpy'), None)
-        functions_map.pop(('prod', 'numpy'), None)
-        functions_map.pop(('argmax', 'numpy'), None)
-        functions_map.pop(('max', 'numpy'), None)
-        functions_map.pop(('argmin', 'numpy'), None)
-        functions_map.pop(('min', 'numpy'), None)
-        functions_map.pop(('mean', 'numpy'), None)
+        functions_map.pop(("dot", "numpy"), None)
+        functions_map.pop(("sum", "numpy"), None)
+        functions_map.pop(("prod", "numpy"), None)
+        functions_map.pop(("argmax", "numpy"), None)
+        functions_map.pop(("max", "numpy"), None)
+        functions_map.pop(("argmin", "numpy"), None)
+        functions_map.pop(("min", "numpy"), None)
+        functions_map.pop(("mean", "numpy"), None)
 
         preparfor_pass = _parfor_PreParforPass(
             state.func_ir,
             state.type_annotation.typemap,
-            state.type_annotation.calltypes, state.typingctx,
+            state.type_annotation.calltypes,
+            state.typingctx,
             state.flags.auto_parallel,
             state.parfor_diagnostics.replaced_fns,
-            replace_functions_map=functions_map
+            replace_functions_map=functions_map,
         )
 
         preparfor_pass.run()
@@ -166,14 +192,16 @@ def run_pass(self, state):
         """
         # Ensure we have an IR and type information.
         assert state.func_ir
-        parfor_pass = _parfor_ParforPass(state.func_ir,
-                                         state.type_annotation.typemap,
-                                         state.type_annotation.calltypes,
-                                         state.return_type,
-                                         state.typingctx,
-                                         state.flags.auto_parallel,
-                                         state.flags,
-                                         state.parfor_diagnostics)
+        parfor_pass = _parfor_ParforPass(
+            state.func_ir,
+            state.type_annotation.typemap,
+            state.type_annotation.calltypes,
+            state.return_type,
+            state.typingctx,
+            state.flags.auto_parallel,
+            state.flags,
+            state.parfor_diagnostics,
+        )
 
         parfor_pass.run()
 
@@ -203,14 +231,17 @@ def fallback_context(state, msg):
                 e = e.with_traceback(None)
             # this emits a warning containing the error message body in the
             # case of fallback from npm to objmode
-            loop_lift = '' if state.flags.enable_looplift else 'OUT'
-            msg_rewrite = ("\nCompilation is falling back to object mode "
-                           "WITH%s looplifting enabled because %s"
-                           % (loop_lift, msg))
-            warnings.warn_explicit('%s due to: %s' % (msg_rewrite, e),
-                                   errors.NumbaWarning,
-                                   state.func_id.filename,
-                                   state.func_id.firstlineno)
+            loop_lift = "" if state.flags.enable_looplift else "OUT"
+            msg_rewrite = (
+                "\nCompilation is falling back to object mode "
+                "WITH%s looplifting enabled because %s" % (loop_lift, msg)
+            )
+            warnings.warn_explicit(
+                "%s due to: %s" % (msg_rewrite, e),
+                errors.NumbaWarning,
+                state.func_id.filename,
+                state.func_id.firstlineno,
+            )
             raise
 
 
@@ -232,27 +263,31 @@ def run_pass(self, state):
 
         targetctx = state.targetctx
 
-        library   = state.library
-        interp    = state.func_ir  # why is it called this?!
-        typemap   = state.typemap
-        restype   = state.return_type
+        library = state.library
+        interp = state.func_ir  # why is it called this?!
+        typemap = state.typemap
+        restype = state.return_type
         calltypes = state.calltypes
-        flags     = state.flags
-        metadata  = state.metadata
+        flags = state.flags
+        metadata = state.metadata
 
-        msg = ("Function %s failed at nopython "
-               "mode lowering" % (state.func_id.func_name,))
+        msg = "Function %s failed at nopython " "mode lowering" % (
+            state.func_id.func_name,
+        )
         with fallback_context(state, msg):
             # Lowering
-            fndesc = \
-                funcdesc.PythonFunctionDescriptor.from_specialized_function(
-                    interp, typemap, restype, calltypes,
-                    mangler=targetctx.mangler, inline=flags.forceinline,
-                    noalias=flags.noalias)
+            fndesc = funcdesc.PythonFunctionDescriptor.from_specialized_function(
+                interp,
+                typemap,
+                restype,
+                calltypes,
+                mangler=targetctx.mangler,
+                inline=flags.forceinline,
+                noalias=flags.noalias,
+            )
 
             with targetctx.push_code_library(library):
-                lower = DPPYLower(targetctx, library, fndesc, interp,
-                                       metadata=metadata)
+                lower = DPPYLower(targetctx, library, fndesc, interp, metadata=metadata)
                 lower.lower()
                 if not flags.no_cpython_wrapper:
                     lower.create_cpython_wrapper(flags.release_gil)
@@ -262,17 +297,16 @@ def run_pass(self, state):
                 del lower
 
             from numba.core.compiler import _LowerResult  # TODO: move this
+
             if flags.no_compile:
-                state['cr'] = _LowerResult(fndesc, call_helper,
-                                           cfunc=None, env=env)
+                state["cr"] = _LowerResult(fndesc, call_helper, cfunc=None, env=env)
             else:
                 # Prepare for execution
                 cfunc = targetctx.get_executable(library, fndesc, env)
                 # Insert native function for use by other jitted-functions.
                 # We also register its library to allow for inlining.
                 targetctx.insert_user_function(cfunc, fndesc, [library])
-                state['cr'] = _LowerResult(fndesc, call_helper,
-                                           cfunc=cfunc, env=env)
+                state["cr"] = _LowerResult(fndesc, call_helper, cfunc=cfunc, env=env)
 
         return True
 
@@ -290,10 +324,11 @@ def run_pass(self, state):
         Back-end: Generate LLVM IR from Numba IR, compile to machine code
         """
 
-        lowered = state['cr']
+        lowered = state["cr"]
         signature = typing.signature(state.return_type, *state.args)
 
         from numba.core.compiler import compile_result
+
         state.cr = compile_result(
             typing_context=state.typingctx,
             target_context=state.targetctx,
diff --git a/numba_dppy/dufunc_inliner.py b/numba_dppy/dufunc_inliner.py
index f42a9b1855..ca14cab531 100644
--- a/numba_dppy/dufunc_inliner.py
+++ b/numba_dppy/dufunc_inliner.py
@@ -4,25 +4,41 @@
 from numba.core.ir_utils import dead_code_elimination, simplify_CFG
 
 
-def _run_inliner(func_ir, sig, template, arg_typs, expr, i, py_func, block,
-                 work_list, typemap, calltypes, typingctx):
-    from numba.core.inline_closurecall import (inline_closure_call,
-                                          callee_ir_validator)
+def _run_inliner(
+    func_ir,
+    sig,
+    template,
+    arg_typs,
+    expr,
+    i,
+    py_func,
+    block,
+    work_list,
+    typemap,
+    calltypes,
+    typingctx,
+):
+    from numba.core.inline_closurecall import inline_closure_call, callee_ir_validator
 
     # pass is typed so use the callee globals
-    inline_closure_call(func_ir, py_func.__globals__,
-                        block, i, py_func, typingctx=typingctx,
-                        arg_typs=arg_typs,
-                        typemap=typemap,
-                        calltypes=calltypes,
-                        work_list=work_list,
-                        replace_freevars=False,
-                        callee_validator=callee_ir_validator)
+    inline_closure_call(
+        func_ir,
+        py_func.__globals__,
+        block,
+        i,
+        py_func,
+        typingctx=typingctx,
+        arg_typs=arg_typs,
+        typemap=typemap,
+        calltypes=calltypes,
+        work_list=work_list,
+        replace_freevars=False,
+        callee_validator=callee_ir_validator,
+    )
     return True
 
 
-def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes,
-            typingctx):
+def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, typingctx):
     # try and get a definition for the call, this isn't always possible as
     # it might be a eval(str)/part generated awaiting update etc. (parfors)
     to_inline = None
@@ -32,7 +48,7 @@ def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes,
         return False
 
     # do not handle closure inlining here, another pass deals with that.
-    if getattr(to_inline, 'op', False) == 'make_function':
+    if getattr(to_inline, "op", False) == "make_function":
         return False
 
     # check this is a known and typed function
@@ -40,33 +56,43 @@ def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes,
         func_ty = typemap[expr.func.name]
     except KeyError:
         return False
-    if not hasattr(func_ty, 'get_call_type'):
+    if not hasattr(func_ty, "get_call_type"):
         return False
 
     sig = calltypes[expr]
     is_method = False
 
-    templates = getattr(func_ty, 'templates', None)
+    templates = getattr(func_ty, "templates", None)
     arg_typs = sig.args
 
     if templates is None:
         return False
 
-    assert(len(templates) == 1)
+    assert len(templates) == 1
 
     # at this point we know we maybe want to inline something and there's
     # definitely something that could be inlined.
     return _run_inliner(
-        func_ir, sig, templates[0], arg_typs, expr, i, py_func, block,
-        work_list, typemap, calltypes, typingctx
+        func_ir,
+        sig,
+        templates[0],
+        arg_typs,
+        expr,
+        i,
+        py_func,
+        block,
+        work_list,
+        typemap,
+        calltypes,
+        typingctx,
     )
 
 
 def _is_dufunc_callsite(expr, block):
-    if expr.op == 'call':
+    if expr.op == "call":
         call_node = block.find_variable_assignment(expr.func.name).value
         # due to circular import we can not import DUFunc, TODO: Fix it
-        if(call_node.value.__class__.__name__ == "DUFunc"):
+        if call_node.value.__class__.__name__ == "DUFunc":
             return call_node
     return None
 
@@ -76,7 +102,7 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx):
     modified = False
 
     if _DEBUG:
-        print('GUFunc before inlining DUFunc'.center(80, '-'))
+        print("GUFunc before inlining DUFunc".center(80, "-"))
         print(func_ir.dump())
 
     work_list = list(func_ir.blocks.items())
@@ -92,17 +118,26 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx):
                     call_node = _is_dufunc_callsite(expr, block)
                     if call_node:
                         py_func = call_node.value._dispatcher.py_func
-                        workfn = _inline(func_ir, work_list, block, i, expr,
-                                         py_func, typemap, calltypes, typingctx)
+                        workfn = _inline(
+                            func_ir,
+                            work_list,
+                            block,
+                            i,
+                            expr,
+                            py_func,
+                            typemap,
+                            calltypes,
+                            typingctx,
+                        )
                         if workfn:
                             modified = True
                             break  # because block structure changed
                     else:
                         continue
     if _DEBUG:
-        print('GUFunc after inlining DUFunc'.center(80, '-'))
+        print("GUFunc after inlining DUFunc".center(80, "-"))
         print(func_ir.dump())
-        print(''.center(80, '-'))
+        print("".center(80, "-"))
 
     if modified:
         # clean up leftover load instructions. This step is needed or else
@@ -113,8 +148,8 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx):
         func_ir.blocks = simplify_CFG(func_ir.blocks)
 
     if _DEBUG:
-        print('GUFunc after inlining DUFunc, DCE, SimplyCFG'.center(80, '-'))
+        print("GUFunc after inlining DUFunc, DCE, SimplyCFG".center(80, "-"))
         print(func_ir.dump())
-        print(''.center(80, '-'))
+        print("".center(80, "-"))
 
     return True
diff --git a/numba_dppy/examples/blacksholes_njit.py b/numba_dppy/examples/blacksholes_njit.py
index 3654a90c66..0786fed9bc 100644
--- a/numba_dppy/examples/blacksholes_njit.py
+++ b/numba_dppy/examples/blacksholes_njit.py
@@ -9,12 +9,14 @@
 import argparse
 import time
 
+
 @numba.vectorize(nopython=True)
 def cndf2(inp):
-    out = 0.5 + 0.5 * math.erf((math.sqrt(2.0)/2.0) * inp)
+    out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp)
     return out
 
-@numba.njit(parallel={'offload':True}, fastmath=True)
+
+@numba.njit(parallel={"offload": True}, fastmath=True)
 def blackscholes(sptprice, strike, rate, volatility, timev):
     logterm = np.log(sptprice / strike)
     powterm = 0.5 * volatility * volatility
@@ -23,29 +25,30 @@ def blackscholes(sptprice, strike, rate, volatility, timev):
     d2 = d1 - den
     NofXd1 = cndf2(d1)
     NofXd2 = cndf2(d2)
-    futureValue = strike * np.exp(- rate * timev)
+    futureValue = strike * np.exp(-rate * timev)
     c1 = futureValue * NofXd2
     call = sptprice * NofXd1 - c1
-    put  = call - futureValue + sptprice
+    put = call - futureValue + sptprice
     return put
 
 
 def run(iterations):
-    sptprice   = np.full((iterations,), 42.0)
+    sptprice = np.full((iterations,), 42.0)
     initStrike = 40 + (np.arange(iterations) + 1.0) / iterations
-    rate       = np.full((iterations,), 0.5)
+    rate = np.full((iterations,), 0.5)
     volatility = np.full((iterations,), 0.2)
-    timev      = np.full((iterations,), 0.5)
+    timev = np.full((iterations,), 0.5)
 
     t1 = time.time()
     put = blackscholes(sptprice, initStrike, rate, volatility, timev)
-    t = time.time()-t1
+    t = time.time() - t1
     print("checksum: ", sum(put))
     print("SELFTIMED ", t)
 
+
 def main():
-    parser = argparse.ArgumentParser(description='Black-Scholes')
-    parser.add_argument('--options', dest='options', type=int, default=10000000)
+    parser = argparse.ArgumentParser(description="Black-Scholes")
+    parser.add_argument("--options", dest="options", type=int, default=10000000)
     args = parser.parse_args()
     options = args.options
 
@@ -53,5 +56,6 @@ def main():
     print("options = ", options)
     run(options)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/dppy_func.py b/numba_dppy/examples/dppy_func.py
index 353ba48995..9230bf64fb 100644
--- a/numba_dppy/examples/dppy_func.py
+++ b/numba_dppy/examples/dppy_func.py
@@ -39,5 +39,5 @@ def main():
         print("No device found")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/dppy_with_context.py b/numba_dppy/examples/dppy_with_context.py
index 6df025f5ca..0520582a2a 100644
--- a/numba_dppy/examples/dppy_with_context.py
+++ b/numba_dppy/examples/dppy_with_context.py
@@ -3,6 +3,7 @@
 import numba_dppy, numba_dppy as dppy
 import dpctl
 
+
 @njit
 def add_two_arrays(b, c):
     a = np.empty_like(b)
@@ -20,14 +21,14 @@ def main():
     if dpctl.has_gpu_queues():
         with dpctl.device_context("opencl:gpu"):
             gpu_result = add_two_arrays(b, c)
-        print('GPU device found. Result on GPU:', gpu_result)
+        print("GPU device found. Result on GPU:", gpu_result)
     elif dpctl.has_cpu_queues():
         with dpctl.device_context("opencl:cpu"):
             cpu_result = add_two_arrays(b, c)
-        print('CPU device found. Result on CPU:', cpu_result)
+        print("CPU device found. Result on CPU:", cpu_result)
     else:
         print("No device found")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/matmul.py b/numba_dppy/examples/matmul.py
index b97ac49ca1..184e65235e 100644
--- a/numba_dppy/examples/matmul.py
+++ b/numba_dppy/examples/matmul.py
@@ -14,7 +14,7 @@ def dppy_gemm(a, b, c):
     j = dppy.get_global_id(1)
     if i >= c.shape[0] or j >= c.shape[1]:
         return
-    c[i,j] = 0
+    c[i, j] = 0
     for k in range(c.shape[0]):
         c[i, j] += a[i, k] * b[k, j]
 
@@ -22,7 +22,7 @@ def dppy_gemm(a, b, c):
 # Array dimesnions
 X = 1024
 Y = 16
-global_size = X,X
+global_size = X, X
 
 griddim = X, X
 blockdim = Y, Y
@@ -30,13 +30,13 @@ def dppy_gemm(a, b, c):
 
 def driver(a, b, c):
     # Invoke the kernel
-    dppy_gemm[griddim,blockdim](a, b, c)
+    dppy_gemm[griddim, blockdim](a, b, c)
 
 
 def main():
-    a = np.arange(X*X, dtype=np.float32).reshape(X,X)
-    b = np.array(np.random.random(X*X), dtype=np.float32).reshape(X,X)
-    c = np.ones_like(a).reshape(X,X)
+    a = np.arange(X * X, dtype=np.float32).reshape(X, X)
+    b = np.array(np.random.random(X * X), dtype=np.float32).reshape(X, X)
+    c = np.ones_like(a).reshape(X, X)
 
     if dpctl.has_gpu_queues():
         with dpctl.device_context("opencl:gpu") as gpu_queue:
@@ -58,5 +58,5 @@ def main():
     print("Done...")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/pa_examples/test1.py b/numba_dppy/examples/pa_examples/test1.py
index 01209b3309..ffe715549a 100644
--- a/numba_dppy/examples/pa_examples/test1.py
+++ b/numba_dppy/examples/pa_examples/test1.py
@@ -31,5 +31,5 @@ def main():
             break
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/pairwise_distance.py b/numba_dppy/examples/pairwise_distance.py
index b72c41ba9c..0a0705ee81 100644
--- a/numba_dppy/examples/pairwise_distance.py
+++ b/numba_dppy/examples/pairwise_distance.py
@@ -1,7 +1,7 @@
 from time import time
 import numba
 from numba import int32, float32
-from math import ceil,sqrt
+from math import ceil, sqrt
 import numpy as np
 import argparse
 import timeit
@@ -10,12 +10,12 @@
 import dpctl
 import dpctl._memory as dpctl_mem
 
-parser = argparse.ArgumentParser(description='Program to compute pairwise distance')
+parser = argparse.ArgumentParser(description="Program to compute pairwise distance")
 
-parser.add_argument('-n', type=int, default=10, help='Number of points')
-parser.add_argument('-d', type=int, default=3, help='Dimensions')
-parser.add_argument('-r', type=int, default=1, help='repeat')
-parser.add_argument('-l', type=int, default=1, help='local_work_size')
+parser.add_argument("-n", type=int, default=10, help="Number of points")
+parser.add_argument("-d", type=int, default=3, help="Dimensions")
+parser.add_argument("-r", type=int, default=1, help="repeat")
+parser.add_argument("-l", type=int, default=1, help="local_work_size")
 
 args = parser.parse_args()
 
@@ -32,7 +32,7 @@
 def pairwise_distance(X, D, xshape0, xshape1):
     idx = dppy.get_global_id(0)
 
-    #for i in range(xshape0):
+    # for i in range(xshape0):
     for j in range(X.shape[0]):
         d = 0.0
         for k in range(X.shape[1]):
@@ -42,7 +42,7 @@ def pairwise_distance(X, D, xshape0, xshape1):
 
 
 def driver():
-    #measure running time
+    # measure running time
     times = list()
 
     xbuf = dpctl_mem.MemoryUSMShared(X.size * X.dtype.itemsize)
@@ -55,7 +55,9 @@ def driver():
 
     for repeat in range(args.r):
         start = time()
-        pairwise_distance[global_size, local_size](x_ndarray, d_ndarray, X.shape[0], X.shape[1])
+        pairwise_distance[global_size, local_size](
+            x_ndarray, d_ndarray, X.shape[0], X.shape[1]
+        )
         end = time()
 
         total_time = end - start
@@ -80,9 +82,9 @@ def main():
         print("No device found")
         exit()
 
-    times =  np.asarray(times, dtype=np.float32)
+    times = np.asarray(times, dtype=np.float32)
     print("Average time of %d runs is = %fs" % (args.r, times.mean()))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/sum-hybrid.py b/numba_dppy/examples/sum-hybrid.py
index e66c51ae2c..38cb708fa3 100644
--- a/numba_dppy/examples/sum-hybrid.py
+++ b/numba_dppy/examples/sum-hybrid.py
@@ -14,8 +14,8 @@ def data_parallel_sum(a, b, c):
     c[i] = a[i] + b[i]
 
 
-N = 50*32
-global_size = N,
+N = 50 * 32
+global_size = (N,)
 
 
 def main():
@@ -49,5 +49,5 @@ def main():
     print("Done...")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/sum.py b/numba_dppy/examples/sum.py
index fdc1623fa7..1f2a0f6c31 100644
--- a/numba_dppy/examples/sum.py
+++ b/numba_dppy/examples/sum.py
@@ -44,5 +44,5 @@ def main():
     print("Done...")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/sum2D.py b/numba_dppy/examples/sum2D.py
index 90959c8bdf..4e279cfc01 100644
--- a/numba_dppy/examples/sum2D.py
+++ b/numba_dppy/examples/sum2D.py
@@ -12,7 +12,7 @@
 def data_parallel_sum(a, b, c):
     i = dppy.get_global_id(0)
     j = dppy.get_global_id(1)
-    c[i,j] = a[i,j] + b[i,j]
+    c[i, j] = a[i, j] + b[i, j]
 
 
 def driver(a, b, c, global_size):
@@ -26,11 +26,11 @@ def main():
     # Array dimesnions
     X = 8
     Y = 8
-    global_size = X,Y
+    global_size = X, Y
 
-    a = np.arange(X*Y, dtype=np.float32).reshape(X,Y)
-    b = np.array(np.random.random(X*Y), dtype=np.float32).reshape(X,Y)
-    c = np.ones_like(a).reshape(X,Y)
+    a = np.arange(X * Y, dtype=np.float32).reshape(X, Y)
+    b = np.array(np.random.random(X * Y), dtype=np.float32).reshape(X, Y)
+    c = np.ones_like(a).reshape(X, Y)
 
     if dpctl.has_gpu_queues():
         with dpctl.device_context("opencl:gpu") as gpu_queue:
@@ -45,5 +45,5 @@ def main():
     print("Done...")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/sum_ndarray.py b/numba_dppy/examples/sum_ndarray.py
index 2aea8e080a..fedf722000 100644
--- a/numba_dppy/examples/sum_ndarray.py
+++ b/numba_dppy/examples/sum_ndarray.py
@@ -8,7 +8,9 @@
 import dpctl
 
 
-@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []})
+@dppy.kernel(
+    access_types={"read_only": ["a", "b"], "write_only": ["c"], "read_write": []}
+)
 def data_parallel_sum(a, b, c):
     i = dppy.get_global_id(0)
     c[i] = a[i] + b[i]
@@ -45,5 +47,5 @@ def main():
     print("Done...")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/numba_dppy/examples/sum_reduction.py b/numba_dppy/examples/sum_reduction.py
index 367fa37952..c4d7a171e4 100644
--- a/numba_dppy/examples/sum_reduction.py
+++ b/numba_dppy/examples/sum_reduction.py
@@ -12,7 +12,7 @@
 def reduction_kernel(A, R, stride):
     i = dppy.get_global_id(0)
     # sum two element
-    R[i] = A[i] + A[i+stride]
+    R[i] = A[i] + A[i + stride]
     # store the sum to be used in nex iteration
     A[i] = R[i]
 
@@ -20,21 +20,23 @@ def reduction_kernel(A, R, stride):
 def test_sum_reduction():
     # This test will only work for size = power of two
     N = 2048
-    assert(N%2 == 0)
+    assert N % 2 == 0
 
     A = np.array(np.random.random(N), dtype=np.float32)
     A_copy = A.copy()
     # at max we will require half the size of A to store sum
-    R = np.array(np.random.random(math.ceil(N/2)), dtype=np.float32)
+    R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32)
 
     if dpctl.has_gpu_queues():
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             total = N
 
-            while (total > 1):
+            while total > 1:
                 # call kernel
                 global_size = total // 2
-                reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size)
+                reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](
+                    A, R, global_size
+                )
                 total = total // 2
 
     else:
@@ -43,7 +45,8 @@ def test_sum_reduction():
 
     result = A_copy.sum()
     max_abs_err = result - R[0]
-    assert(max_abs_err < 1e-2)
+    assert max_abs_err < 1e-2
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     test_sum_reduction()
diff --git a/numba_dppy/examples/sum_reduction_ocl.py b/numba_dppy/examples/sum_reduction_ocl.py
index 8d8e0411aa..c3903cc8f2 100644
--- a/numba_dppy/examples/sum_reduction_ocl.py
+++ b/numba_dppy/examples/sum_reduction_ocl.py
@@ -6,13 +6,14 @@
 
 import dpctl
 
+
 def sum_reduction_device_plus_host():
     @dppy.kernel
     def sum_reduction_kernel(inp, partial_sums):
-        local_id   = dppy.get_local_id(0)
-        global_id  = dppy.get_global_id(0)
+        local_id = dppy.get_local_id(0)
+        global_id = dppy.get_global_id(0)
         group_size = dppy.get_local_size(0)
-        group_id   = dppy.get_group_id(0)
+        group_id = dppy.get_group_id(0)
 
         local_sums = dppy.local.static_alloc(64, int32)
 
@@ -21,12 +22,12 @@ def sum_reduction_kernel(inp, partial_sums):
 
         # Loop for computing local_sums : divide workgroup into 2 parts
         stride = group_size // 2
-        while (stride > 0):
+        while stride > 0:
             # Waiting for each 2x2 addition into given workgroup
             dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
 
             # Add elements 2 by 2 between local_id and local_id + stride
-            if (local_id < stride):
+            if local_id < stride:
                 local_sums[local_id] += local_sums[local_id + stride]
 
             stride >>= 1
@@ -55,9 +56,9 @@ def sum_reduction_kernel(inp, partial_sums):
     for i in range(nb_work_groups):
         final_sum += partial_sums[i]
 
-    assert(final_sum == global_size)
+    assert final_sum == global_size
     print("Expected:", global_size, "--- GOT:", final_sum)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sum_reduction_device_plus_host()
diff --git a/numba_dppy/examples/sum_reduction_recursive_ocl.py b/numba_dppy/examples/sum_reduction_recursive_ocl.py
index c5dd6daa47..eb06cb2054 100644
--- a/numba_dppy/examples/sum_reduction_recursive_ocl.py
+++ b/numba_dppy/examples/sum_reduction_recursive_ocl.py
@@ -8,16 +8,13 @@
 import dpctl._memory as dpctl_mem
 
 
-def recursive_reduction(size, group_size,
-                        Dinp, Dpartial_sums):
-
+def recursive_reduction(size, group_size, Dinp, Dpartial_sums):
     @dppy.kernel
-    def sum_reduction_kernel(inp, input_size,
-                             partial_sums):
-        local_id   = dppy.get_local_id(0)
-        global_id  = dppy.get_global_id(0)
+    def sum_reduction_kernel(inp, input_size, partial_sums):
+        local_id = dppy.get_local_id(0)
+        global_id = dppy.get_global_id(0)
         group_size = dppy.get_local_size(0)
-        group_id   = dppy.get_group_id(0)
+        group_id = dppy.get_group_id(0)
 
         local_sums = dppy.local.static_alloc(64, int32)
 
@@ -28,12 +25,12 @@ def sum_reduction_kernel(inp, input_size,
 
         # Loop for computing local_sums : divide workgroup into 2 parts
         stride = group_size // 2
-        while (stride > 0):
+        while stride > 0:
             # Waiting for each 2x2 addition into given workgroup
             dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
 
             # Add elements 2 by 2 between local_id and local_id + stride
-            if (local_id < stride):
+            if local_id < stride:
                 local_sums[local_id] += local_sums[local_id + stride]
 
             stride >>= 1
@@ -41,27 +38,27 @@ def sum_reduction_kernel(inp, input_size,
         if local_id == 0:
             partial_sums[group_id] = local_sums[0]
 
-
     result = 0
     nb_work_groups = 0
     passed_size = size
 
-    if (size <= group_size):
+    if size <= group_size:
         nb_work_groups = 1
     else:
-        nb_work_groups = size // group_size;
-        if (size % group_size != 0):
+        nb_work_groups = size // group_size
+        if size % group_size != 0:
             nb_work_groups += 1
             passed_size = nb_work_groups * group_size
 
     sum_reduction_kernel[passed_size, group_size](Dinp, size, Dpartial_sums)
 
     if nb_work_groups <= group_size:
-        sum_reduction_kernel[group_size, group_size](Dpartial_sums, nb_work_groups, Dinp)
+        sum_reduction_kernel[group_size, group_size](
+            Dpartial_sums, nb_work_groups, Dinp
+        )
         result = Dinp[0]
     else:
-        result = recursive_reduction(nb_work_groups, group_size,
-                                     Dpartial_sums, Dinp)
+        result = recursive_reduction(nb_work_groups, group_size, Dpartial_sums, Dinp)
 
     return result
 
@@ -76,27 +73,30 @@ def sum_reduction_recursive():
     inp = np.ones(global_size).astype(np.int32)
     partial_sums = np.zeros(nb_work_groups).astype(np.int32)
 
-
     if dpctl.has_gpu_queues():
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             inp_buf = dpctl_mem.MemoryUSMShared(inp.size * inp.dtype.itemsize)
             inp_ndarray = np.ndarray(inp.shape, buffer=inp_buf, dtype=inp.dtype)
             np.copyto(inp_ndarray, inp)
 
-            partial_sums_buf = dpctl_mem.MemoryUSMShared(partial_sums.size * partial_sums.dtype.itemsize)
-            partial_sums_ndarray = np.ndarray(partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype)
+            partial_sums_buf = dpctl_mem.MemoryUSMShared(
+                partial_sums.size * partial_sums.dtype.itemsize
+            )
+            partial_sums_ndarray = np.ndarray(
+                partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype
+            )
             np.copyto(partial_sums_ndarray, partial_sums)
 
             print("Running recursive reduction")
-            result = recursive_reduction(global_size, work_group_size,
-                                         inp_ndarray, partial_sums_ndarray)
+            result = recursive_reduction(
+                global_size, work_group_size, inp_ndarray, partial_sums_ndarray
+            )
     else:
         print("No device found")
         exit()
 
-
     print("Expected:", global_size, "--- GOT:", result)
-    assert(result == global_size)
+    assert result == global_size
 
 
 sum_reduction_recursive()
diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py
index 2a2c70f796..3894335051 100644
--- a/numba_dppy/initialize.py
+++ b/numba_dppy/initialize.py
@@ -6,24 +6,32 @@
 
 def init_jit():
     from numba_dppy.dispatcher import DPPYDispatcher
+
     return DPPYDispatcher
 
+
 def initialize_all():
     from numba.core.registry import dispatcher_registry
-    dispatcher_registry.ondemand['dppy'] = init_jit
+
+    dispatcher_registry.ondemand["dppy"] = init_jit
 
     import dpctl
     import glob
     import platform as plt
+
     platform = plt.system()
-    if platform == 'Windows':
-        paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface.dll'))
+    if platform == "Windows":
+        paths = glob.glob(
+            os.path.join(os.path.dirname(dpctl.__file__), "*DPCTLSyclInterface.dll")
+        )
     else:
-        paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*'))
+        paths = glob.glob(
+            os.path.join(os.path.dirname(dpctl.__file__), "*DPCTLSyclInterface*")
+        )
 
     if len(paths) == 1:
         ll.load_library_permanently(paths[0])
     else:
         raise ImportError
 
-    ll.load_library_permanently(find_library('OpenCL'))
+    ll.load_library_permanently(find_library("OpenCL"))
diff --git a/numba_dppy/ocl/atomics/__init__.py b/numba_dppy/ocl/atomics/__init__.py
index aa6fdf5dfd..d3ac430da0 100644
--- a/numba_dppy/ocl/atomics/__init__.py
+++ b/numba_dppy/ocl/atomics/__init__.py
@@ -1,22 +1,25 @@
 import os
 import os.path
 
+
 def atomic_support_present():
-    if os.path.isfile(os.path.join(os.path.dirname(__file__), 'atomic_ops.spir')):
+    if os.path.isfile(os.path.join(os.path.dirname(__file__), "atomic_ops.spir")):
         return True
     else:
         return False
 
+
 def get_atomic_spirv_path():
     if atomic_support_present():
-        return os.path.join(os.path.dirname(__file__), 'atomic_ops.spir')
+        return os.path.join(os.path.dirname(__file__), "atomic_ops.spir")
     else:
         return None
 
+
 def read_atomic_spirv_file():
     path = get_atomic_spirv_path()
     if path:
-        with open(path, 'rb') as fin:
+        with open(path, "rb") as fin:
             spirv = fin.read()
         return spirv
     else:
diff --git a/numba_dppy/ocl/mathdecl.py b/numba_dppy/ocl/mathdecl.py
index 442e269cb4..7ce5e15d95 100644
--- a/numba_dppy/ocl/mathdecl.py
+++ b/numba_dppy/ocl/mathdecl.py
@@ -1,8 +1,12 @@
 from __future__ import print_function, absolute_import, division
 import math
 from numba.core import types, utils
-from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
-                                        signature, Registry)
+from numba.core.typing.templates import (
+    AttributeTemplate,
+    ConcreteTemplate,
+    signature,
+    Registry,
+)
 
 registry = Registry()
 builtin_attr = registry.register_attr
@@ -253,12 +257,15 @@ class Math_degrees(Math_unary):
 class Math_erf(Math_unary):
     key = math.erf
 
+
 class Math_erfc(Math_unary):
     key = math.erfc
 
+
 class Math_gamma(Math_unary):
     key = math.gamma
 
+
 class Math_lgamma(Math_unary):
     key = math.lgamma
 
diff --git a/numba_dppy/ocl/mathimpl.py b/numba_dppy/ocl/mathimpl.py
index 86c8195046..bb2f9cb7eb 100644
--- a/numba_dppy/ocl/mathimpl.py
+++ b/numba_dppy/ocl/mathimpl.py
@@ -25,58 +25,50 @@
 _binary_d_dl = types.float64(types.float64, types.int64)
 
 sig_mapper = {
-        'f->f' : _unary_f_f,
-        'd->d' : _unary_d_d,
-        'ff->f': _binary_f_ff,
-        'dd->d': _binary_d_dd,
-        'fi->f': _binary_f_fi,
-        'fl->f': _binary_f_fl,
-        'di->d': _binary_d_di,
-        'dl->d': _binary_d_dl,
-        }
+    "f->f": _unary_f_f,
+    "d->d": _unary_d_d,
+    "ff->f": _binary_f_ff,
+    "dd->d": _binary_d_dd,
+    "fi->f": _binary_f_fi,
+    "fl->f": _binary_f_fl,
+    "di->d": _binary_d_di,
+    "dl->d": _binary_d_dl,
+}
 
 function_descriptors = {
-    'isnan': (_unary_b_f, _unary_b_d),
-    'isinf': (_unary_b_f, _unary_b_d),
-
-    'ceil': (_unary_f_f, _unary_d_d),
-    'floor': (_unary_f_f, _unary_d_d),
-    'trunc': (_unary_f_f, _unary_d_d),
-
-    'fabs': (_unary_f_f, _unary_d_d),
-
-    'sqrt': (_unary_f_f, _unary_d_d),
-    'exp': (_unary_f_f, _unary_d_d),
-    'expm1': (_unary_f_f, _unary_d_d),
-    'log': (_unary_f_f, _unary_d_d),
-    'log10': (_unary_f_f, _unary_d_d),
-    'log1p': (_unary_f_f, _unary_d_d),
-
-    'sin': (_unary_f_f, _unary_d_d),
-    'cos': (_unary_f_f, _unary_d_d),
-    'tan': (_unary_f_f, _unary_d_d),
-    'asin': (_unary_f_f, _unary_d_d),
-    'acos': (_unary_f_f, _unary_d_d),
-    'atan': (_unary_f_f, _unary_d_d),
-    'sinh': (_unary_f_f, _unary_d_d),
-    'cosh': (_unary_f_f, _unary_d_d),
-    'tanh': (_unary_f_f, _unary_d_d),
-    'asinh': (_unary_f_f, _unary_d_d),
-    'acosh': (_unary_f_f, _unary_d_d),
-    'atanh': (_unary_f_f, _unary_d_d),
-
-    'copysign': (_binary_f_ff, _binary_d_dd),
-    'atan2': (_binary_f_ff, _binary_d_dd),
-    'pow': (_binary_f_ff, _binary_d_dd),
-    'fmod': (_binary_f_ff, _binary_d_dd),
-
-    'erf': (_unary_f_f, _unary_d_d),
-    'erfc': (_unary_f_f, _unary_d_d),
-    'gamma': (_unary_f_f, _unary_d_d),
-    'lgamma': (_unary_f_f, _unary_d_d),
-
-    'ldexp': (_binary_f_fi, _binary_f_fl, _binary_d_di, _binary_d_dl),
-
+    "isnan": (_unary_b_f, _unary_b_d),
+    "isinf": (_unary_b_f, _unary_b_d),
+    "ceil": (_unary_f_f, _unary_d_d),
+    "floor": (_unary_f_f, _unary_d_d),
+    "trunc": (_unary_f_f, _unary_d_d),
+    "fabs": (_unary_f_f, _unary_d_d),
+    "sqrt": (_unary_f_f, _unary_d_d),
+    "exp": (_unary_f_f, _unary_d_d),
+    "expm1": (_unary_f_f, _unary_d_d),
+    "log": (_unary_f_f, _unary_d_d),
+    "log10": (_unary_f_f, _unary_d_d),
+    "log1p": (_unary_f_f, _unary_d_d),
+    "sin": (_unary_f_f, _unary_d_d),
+    "cos": (_unary_f_f, _unary_d_d),
+    "tan": (_unary_f_f, _unary_d_d),
+    "asin": (_unary_f_f, _unary_d_d),
+    "acos": (_unary_f_f, _unary_d_d),
+    "atan": (_unary_f_f, _unary_d_d),
+    "sinh": (_unary_f_f, _unary_d_d),
+    "cosh": (_unary_f_f, _unary_d_d),
+    "tanh": (_unary_f_f, _unary_d_d),
+    "asinh": (_unary_f_f, _unary_d_d),
+    "acosh": (_unary_f_f, _unary_d_d),
+    "atanh": (_unary_f_f, _unary_d_d),
+    "copysign": (_binary_f_ff, _binary_d_dd),
+    "atan2": (_binary_f_ff, _binary_d_dd),
+    "pow": (_binary_f_ff, _binary_d_dd),
+    "fmod": (_binary_f_ff, _binary_d_dd),
+    "erf": (_unary_f_f, _unary_d_d),
+    "erfc": (_unary_f_f, _unary_d_d),
+    "gamma": (_unary_f_f, _unary_d_d),
+    "lgamma": (_unary_f_f, _unary_d_d),
+    "ldexp": (_binary_f_fi, _binary_f_fl, _binary_d_di, _binary_d_dl),
     # unsupported functions listed in the math module documentation:
     # frexp, ldexp, trunc, modf, factorial, fsum
 }
@@ -84,17 +76,16 @@
 
 # some functions may be named differently by the underlying math
 # library as oposed to the Python name.
-_lib_counterpart = {
-    'gamma': 'tgamma'
-}
+_lib_counterpart = {"gamma": "tgamma"}
 
 
 def _mk_fn_decl(name, decl_sig):
     sym = _lib_counterpart.get(name, name)
 
     def core(context, builder, sig, args):
-        fn = _declare_function(context, builder, sym, decl_sig, decl_sig.args,
-                               mangler=mangle)
+        fn = _declare_function(
+            context, builder, sym, decl_sig, decl_sig.args, mangler=mangle
+        )
         res = builder.call(fn, args)
         return context.cast(builder, res, decl_sig.return_type, sig.return_type)
 
@@ -102,16 +93,46 @@ def core(context, builder, sig, args):
     return core
 
 
-_supported = ['sin', 'cos', 'tan', 'asin', 'acos', 'atan', 'atan2', 'sinh',
-              'cosh', 'tanh', 'asinh', 'acosh', 'atanh', 'isnan', 'isinf',
-              'ceil', 'floor', 'fabs', 'sqrt', 'exp', 'expm1', 'log',
-              'log10', 'log1p', 'copysign', 'pow', 'fmod', 'erf', 'erfc',
-              'gamma', 'lgamma', 'ldexp', 'trunc'
-              ]
+_supported = [
+    "sin",
+    "cos",
+    "tan",
+    "asin",
+    "acos",
+    "atan",
+    "atan2",
+    "sinh",
+    "cosh",
+    "tanh",
+    "asinh",
+    "acosh",
+    "atanh",
+    "isnan",
+    "isinf",
+    "ceil",
+    "floor",
+    "fabs",
+    "sqrt",
+    "exp",
+    "expm1",
+    "log",
+    "log10",
+    "log1p",
+    "copysign",
+    "pow",
+    "fmod",
+    "erf",
+    "erfc",
+    "gamma",
+    "lgamma",
+    "ldexp",
+    "trunc",
+]
 
 
 lower_ocl_impl = dict()
 
+
 def function_name_to_supported_decl(name, sig):
     try:
         # only symbols present in the math module
@@ -120,7 +141,7 @@ def function_name_to_supported_decl(name, sig):
         return None
 
     fn = _mk_fn_decl(name, sig)
-    #lower(key, *sig.args)(fn)
+    # lower(key, *sig.args)(fn)
     lower_ocl_impl[(name, sig)] = lower(key, *sig.args)(fn)
 
 
diff --git a/numba_dppy/ocl/ocldecl.py b/numba_dppy/ocl/ocldecl.py
index adf14a1815..a5aa6545e5 100644
--- a/numba_dppy/ocl/ocldecl.py
+++ b/numba_dppy/ocl/ocldecl.py
@@ -1,17 +1,23 @@
 from __future__ import print_function, division, absolute_import
 from numba import types
 from numba.core.typing.npydecl import register_number_classes
-from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate,
-                                        AbstractTemplate, MacroTemplate,
-                                        signature, Registry)
+from numba.core.typing.templates import (
+    AttributeTemplate,
+    ConcreteTemplate,
+    AbstractTemplate,
+    MacroTemplate,
+    signature,
+    Registry,
+)
 import numba_dppy, numba_dppy as dppy
 
 registry = Registry()
 intrinsic = registry.register
 intrinsic_attr = registry.register_attr
-#intrinsic_global = registry.register_global
+# intrinsic_global = registry.register_global
+
+# register_number_classes(intrinsic_global)
 
-#register_number_classes(intrinsic_global)
 
 @intrinsic
 class Ocl_get_global_id(ConcreteTemplate):
@@ -58,8 +64,7 @@ class Ocl_get_local_size(ConcreteTemplate):
 @intrinsic
 class Ocl_barrier(ConcreteTemplate):
     key = dppy.barrier
-    cases = [signature(types.void, types.uint32),
-             signature(types.void)]
+    cases = [signature(types.void, types.uint32), signature(types.void)]
 
 
 @intrinsic
@@ -77,6 +82,7 @@ class Ocl_sub_group_barrier(ConcreteTemplate):
 
 # dppy.atomic submodule -------------------------------------------------------
 
+
 @intrinsic
 class Ocl_atomic_add(AbstractTemplate):
     key = dppy.atomic.add
@@ -90,6 +96,7 @@ def generic(self, args, kws):
         elif ary.ndim > 1:
             return signature(ary.dtype, ary, idx, ary.dtype)
 
+
 @intrinsic
 class Ocl_atomic_sub(AbstractTemplate):
     key = dppy.atomic.sub
@@ -117,6 +124,7 @@ def resolve_sub(self, mod):
 
 # dppy.local submodule -------------------------------------------------------
 
+
 class Ocl_local_alloc(MacroTemplate):
     key = dppy.local.static_alloc
 
@@ -131,6 +139,7 @@ def resolve_static_alloc(self, mod):
 
 # OpenCL module --------------------------------------------------------------
 
+
 @intrinsic_attr
 class OclModuleTemplate(AttributeTemplate):
     key = types.Module(dppy)
@@ -171,6 +180,7 @@ def resolve_atomic(self, mod):
     def resolve_local(self, mod):
         return types.Module(dppy.local)
 
+
 # intrinsic
 
-#intrinsic_global(dppy, types.Module(dppy))
+# intrinsic_global(dppy, types.Module(dppy))
diff --git a/numba_dppy/ocl/oclimpl.py b/numba_dppy/ocl/oclimpl.py
index 26f8482799..893a37c6a8 100644
--- a/numba_dppy/ocl/oclimpl.py
+++ b/numba_dppy/ocl/oclimpl.py
@@ -24,8 +24,7 @@
 # -----------------------------------------------------------------------------
 
 
-def _declare_function(context, builder, name, sig, cargs,
-                      mangler=mangle_c):
+def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c):
     """Insert declaration for a opencl builtin function.
     Uses the Itanium mangler.
 
@@ -60,11 +59,13 @@ def _declare_function(context, builder, name, sig, cargs,
     fn.calling_convention = target.CC_SPIR_FUNC
     return fn
 
+
 @lower(stubs.get_global_id, types.uint32)
 def get_global_id_impl(context, builder, sig, args):
     [dim] = args
-    get_global_id = _declare_function(context, builder, 'get_global_id', sig,
-                                      ['unsigned int'])
+    get_global_id = _declare_function(
+        context, builder, "get_global_id", sig, ["unsigned int"]
+    )
     res = builder.call(get_global_id, [dim])
     return context.cast(builder, res, types.uintp, types.intp)
 
@@ -72,8 +73,9 @@ def get_global_id_impl(context, builder, sig, args):
 @lower(stubs.get_local_id, types.uint32)
 def get_local_id_impl(context, builder, sig, args):
     [dim] = args
-    get_local_id = _declare_function(context, builder, 'get_local_id', sig,
-                                     ['unsigned int'])
+    get_local_id = _declare_function(
+        context, builder, "get_local_id", sig, ["unsigned int"]
+    )
     res = builder.call(get_local_id, [dim])
     return context.cast(builder, res, types.uintp, types.intp)
 
@@ -81,8 +83,9 @@ def get_local_id_impl(context, builder, sig, args):
 @lower(stubs.get_group_id, types.uint32)
 def get_group_id_impl(context, builder, sig, args):
     [dim] = args
-    get_group_id = _declare_function(context, builder, 'get_group_id', sig,
-                                     ['unsigned int'])
+    get_group_id = _declare_function(
+        context, builder, "get_group_id", sig, ["unsigned int"]
+    )
     res = builder.call(get_group_id, [dim])
     return context.cast(builder, res, types.uintp, types.intp)
 
@@ -90,16 +93,16 @@ def get_group_id_impl(context, builder, sig, args):
 @lower(stubs.get_num_groups, types.uint32)
 def get_num_groups_impl(context, builder, sig, args):
     [dim] = args
-    get_num_groups = _declare_function(context, builder, 'get_num_groups', sig,
-                                       ['unsigned int'])
+    get_num_groups = _declare_function(
+        context, builder, "get_num_groups", sig, ["unsigned int"]
+    )
     res = builder.call(get_num_groups, [dim])
     return context.cast(builder, res, types.uintp, types.intp)
 
 
 @lower(stubs.get_work_dim)
 def get_work_dim_impl(context, builder, sig, args):
-    get_work_dim = _declare_function(context, builder, 'get_work_dim', sig,
-                                     ["void"])
+    get_work_dim = _declare_function(context, builder, "get_work_dim", sig, ["void"])
     res = builder.call(get_work_dim, [])
     return res
 
@@ -107,8 +110,9 @@ def get_work_dim_impl(context, builder, sig, args):
 @lower(stubs.get_global_size, types.uint32)
 def get_global_size_impl(context, builder, sig, args):
     [dim] = args
-    get_global_size = _declare_function(context, builder, 'get_global_size',
-                                        sig, ['unsigned int'])
+    get_global_size = _declare_function(
+        context, builder, "get_global_size", sig, ["unsigned int"]
+    )
     res = builder.call(get_global_size, [dim])
     return context.cast(builder, res, types.uintp, types.intp)
 
@@ -116,8 +120,9 @@ def get_global_size_impl(context, builder, sig, args):
 @lower(stubs.get_local_size, types.uint32)
 def get_local_size_impl(context, builder, sig, args):
     [dim] = args
-    get_local_size = _declare_function(context, builder, 'get_local_size',
-                                       sig, ['unsigned int'])
+    get_local_size = _declare_function(
+        context, builder, "get_local_size", sig, ["unsigned int"]
+    )
     res = builder.call(get_local_size, [dim])
     return context.cast(builder, res, types.uintp, types.intp)
 
@@ -125,17 +130,16 @@ def get_local_size_impl(context, builder, sig, args):
 @lower(stubs.barrier, types.uint32)
 def barrier_one_arg_impl(context, builder, sig, args):
     [flags] = args
-    barrier = _declare_function(context, builder, 'barrier', sig,
-                                ['unsigned int'])
+    barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"])
     builder.call(barrier, [flags])
     return _void_value
 
+
 @lower(stubs.barrier)
 def barrier_no_arg_impl(context, builder, sig, args):
     assert not args
     sig = types.void(types.uint32)
-    barrier = _declare_function(context, builder, 'barrier', sig,
-                                ['unsigned int'])
+    barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"])
     flags = context.get_constant(types.uint32, stubs.CLK_GLOBAL_MEM_FENCE)
     builder.call(barrier, [flags])
     return _void_value
@@ -144,8 +148,7 @@ def barrier_no_arg_impl(context, builder, sig, args):
 @lower(stubs.mem_fence, types.uint32)
 def mem_fence_impl(context, builder, sig, args):
     [flags] = args
-    mem_fence = _declare_function(context, builder, 'mem_fence', sig,
-                                ['unsigned int'])
+    mem_fence = _declare_function(context, builder, "mem_fence", sig, ["unsigned int"])
     builder.call(mem_fence, [flags])
     return _void_value
 
@@ -154,15 +157,15 @@ def mem_fence_impl(context, builder, sig, args):
 def sub_group_barrier_impl(context, builder, sig, args):
     assert not args
     sig = types.void(types.uint32)
-    barrier = _declare_function(context, builder, 'barrier', sig,
-                                ['unsigned int'])
+    barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"])
     flags = context.get_constant(types.uint32, stubs.CLK_LOCAL_MEM_FENCE)
     builder.call(barrier, [flags])
     return _void_value
 
 
-def insert_and_call_atomic_fn(context, builder, sig, fn_type,
-                              dtype, ptr, val, addrspace):
+def insert_and_call_atomic_fn(
+    context, builder, sig, fn_type, dtype, ptr, val, addrspace
+):
     ll_p = None
     name = ""
     if dtype.name == "int32" or dtype.name == "uint32":
@@ -173,11 +176,10 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
         elif fn_type == "sub":
             name = "numba_dppy_atomic_sub_i32"
         else:
-            raise TypeError("Operation type is not supported %s" %
-                             (fn_type))
+            raise TypeError("Operation type is not supported %s" % (fn_type))
     elif dtype.name == "int64" or dtype.name == "uint64":
         # dpctl needs to expose same functions()
-        #if device_env.device_support_int64_atomics():
+        # if device_env.device_support_int64_atomics():
         if True:
             ll_val = ir.IntType(64)
             ll_p = ll_val.as_pointer()
@@ -186,9 +188,8 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
             elif fn_type == "sub":
                 name = "numba_dppy_atomic_sub_i64"
             else:
-                raise TypeError("Operation type is not supported %s" %
-                                 (fn_type))
-        #else:
+                raise TypeError("Operation type is not supported %s" % (fn_type))
+        # else:
         #    raise TypeError("Current device does not support atomic " +
         #                     "operations on 64-bit Integer")
     elif dtype.name == "float32":
@@ -199,10 +200,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
         elif fn_type == "sub":
             name = "numba_dppy_atomic_sub_f32"
         else:
-            raise TypeError("Operation type is not supported %s" %
-                             (fn_type))
+            raise TypeError("Operation type is not supported %s" % (fn_type))
     elif dtype.name == "float64":
-        #if device_env.device_support_float64_atomics():
+        # if device_env.device_support_float64_atomics():
         # dpctl needs to expose same functions()
         if True:
             ll_val = ir.DoubleType()
@@ -212,22 +212,20 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
             elif fn_type == "sub":
                 name = "numba_dppy_atomic_sub_f64"
             else:
-                raise TypeError("Operation type is not supported %s" %
-                                 (fn_type))
-        #else:
+                raise TypeError("Operation type is not supported %s" % (fn_type))
+        # else:
         #    raise TypeError("Current device does not support atomic " +
         #                    "operations on 64-bit Float")
     else:
-        raise TypeError("Atomic operation is not supported for type %s" %
-                        (dtype.name))
+        raise TypeError("Atomic operation is not supported for type %s" % (dtype.name))
 
     if addrspace == target.SPIR_LOCAL_ADDRSPACE:
         name = name + "_local"
     else:
         name = name + "_global"
 
-    assert(ll_p != None)
-    assert(name != "")
+    assert ll_p != None
+    assert name != ""
     ll_p.addrspace = target.SPIR_GENERIC_ADDRSPACE
 
     mod = builder.module
@@ -242,19 +240,17 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type,
     fn = mod.get_or_insert_function(fnty, name)
     fn.calling_convention = target.CC_SPIR_FUNC
 
-    generic_ptr = context.addrspacecast(builder, ptr,
-                                    target.SPIR_GENERIC_ADDRSPACE)
+    generic_ptr = context.addrspacecast(builder, ptr, target.SPIR_GENERIC_ADDRSPACE)
 
     return builder.call(fn, [generic_ptr, val])
 
 
 @lower(stubs.atomic.add, types.Array, types.intp, types.Any)
-@lower(stubs.atomic.add, types.Array,
-           types.UniTuple, types.Any)
-@lower(stubs.atomic.add, types.Array, types.Tuple,
-           types.Any)
+@lower(stubs.atomic.add, types.Array, types.UniTuple, types.Any)
+@lower(stubs.atomic.add, types.Array, types.Tuple, types.Any)
 def atomic_add_tuple(context, builder, sig, args):
     from .atomics import atomic_support_present
+
     if atomic_support_present():
         context.link_binaries[target.LINK_ATOMIC] = True
         aryty, indty, valty = sig.args
@@ -266,36 +262,53 @@ def atomic_add_tuple(context, builder, sig, args):
             indty = [indty]
         else:
             indices = cgutils.unpack_tuple(builder, inds, count=len(indty))
-            indices = [context.cast(builder, i, t, types.intp)
-                       for t, i in zip(indty, indices)]
+            indices = [
+                context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices)
+            ]
 
         if dtype != valty:
             raise TypeError("expecting %s but got %s" % (dtype, valty))
 
         if aryty.ndim != len(indty):
-            raise TypeError("indexing %d-D array with %d-D index" %
-                            (aryty.ndim, len(indty)))
+            raise TypeError(
+                "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty))
+            )
 
         lary = context.make_array(aryty)(context, builder, ary)
         ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices)
 
         if aryty.addrspace == target.SPIR_LOCAL_ADDRSPACE:
-            return insert_and_call_atomic_fn(context, builder, sig, "add", dtype,
-                    ptr, val, target.SPIR_LOCAL_ADDRSPACE)
+            return insert_and_call_atomic_fn(
+                context,
+                builder,
+                sig,
+                "add",
+                dtype,
+                ptr,
+                val,
+                target.SPIR_LOCAL_ADDRSPACE,
+            )
         else:
-            return insert_and_call_atomic_fn(context, builder, sig, "add", dtype,
-                    ptr, val, target.SPIR_GLOBAL_ADDRSPACE)
+            return insert_and_call_atomic_fn(
+                context,
+                builder,
+                sig,
+                "add",
+                dtype,
+                ptr,
+                val,
+                target.SPIR_GLOBAL_ADDRSPACE,
+            )
     else:
         raise ImportError("Atomic support is not present, can not perform atomic_add")
 
 
 @lower(stubs.atomic.sub, types.Array, types.intp, types.Any)
-@lower(stubs.atomic.sub, types.Array,
-           types.UniTuple, types.Any)
-@lower(stubs.atomic.sub, types.Array, types.Tuple,
-           types.Any)
+@lower(stubs.atomic.sub, types.Array, types.UniTuple, types.Any)
+@lower(stubs.atomic.sub, types.Array, types.Tuple, types.Any)
 def atomic_sub_tuple(context, builder, sig, args):
     from .atomics import atomic_support_present
+
     if atomic_support_present():
         context.link_binaries[target.LINK_ATOMIC] = True
         aryty, indty, valty = sig.args
@@ -307,36 +320,58 @@ def atomic_sub_tuple(context, builder, sig, args):
             indty = [indty]
         else:
             indices = cgutils.unpack_tuple(builder, inds, count=len(indty))
-            indices = [context.cast(builder, i, t, types.intp)
-                       for t, i in zip(indty, indices)]
+            indices = [
+                context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices)
+            ]
 
         if dtype != valty:
             raise TypeError("expecting %s but got %s" % (dtype, valty))
 
         if aryty.ndim != len(indty):
-            raise TypeError("indexing %d-D array with %d-D index" %
-                            (aryty.ndim, len(indty)))
+            raise TypeError(
+                "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty))
+            )
 
         lary = context.make_array(aryty)(context, builder, ary)
         ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices)
 
-
         if aryty.addrspace == target.SPIR_LOCAL_ADDRSPACE:
-            return insert_and_call_atomic_fn(context, builder, sig, "sub", dtype,
-                    ptr, val, target.SPIR_LOCAL_ADDRSPACE)
+            return insert_and_call_atomic_fn(
+                context,
+                builder,
+                sig,
+                "sub",
+                dtype,
+                ptr,
+                val,
+                target.SPIR_LOCAL_ADDRSPACE,
+            )
         else:
-            return insert_and_call_atomic_fn(context, builder, sig, "sub", dtype,
-                    ptr, val, target.SPIR_GLOBAL_ADDRSPACE)
+            return insert_and_call_atomic_fn(
+                context,
+                builder,
+                sig,
+                "sub",
+                dtype,
+                ptr,
+                val,
+                target.SPIR_GLOBAL_ADDRSPACE,
+            )
     else:
         raise ImportError("Atomic support is not present, can not perform atomic_add")
 
 
-@lower('dppy.lmem.alloc', types.UniTuple, types.Any)
+@lower("dppy.lmem.alloc", types.UniTuple, types.Any)
 def dppy_lmem_alloc_array(context, builder, sig, args):
     shape, dtype = args
-    return _generic_array(context, builder, shape=shape, dtype=dtype,
-                          symbol_name='_dppy_lmem',
-                          addrspace=target.SPIR_LOCAL_ADDRSPACE)
+    return _generic_array(
+        context,
+        builder,
+        shape=shape,
+        dtype=dtype,
+        symbol_name="_dppy_lmem",
+        addrspace=target.SPIR_LOCAL_ADDRSPACE,
+    )
 
 
 def _generic_array(context, builder, shape, dtype, symbol_name, addrspace):
@@ -374,10 +409,18 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace):
     return _make_array(context, builder, gvmem, dtype, shape, addrspace=addrspace)
 
 
-def _make_array(context, builder, dataptr, dtype, shape, layout='C', addrspace=target.SPIR_GENERIC_ADDRSPACE):
+def _make_array(
+    context,
+    builder,
+    dataptr,
+    dtype,
+    shape,
+    layout="C",
+    addrspace=target.SPIR_GENERIC_ADDRSPACE,
+):
     ndim = len(shape)
     # Create array object
-    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C', addrspace=addrspace)
+    aryty = types.Array(dtype=dtype, ndim=ndim, layout="C", addrspace=addrspace)
     ary = context.make_array(aryty)(context, builder)
 
     targetdata = _get_target_data(context)
@@ -392,12 +435,14 @@ def _make_array(context, builder, dataptr, dtype, shape, layout='C', addrspace=t
     kshape = [context.get_constant(types.intp, s) for s in shape]
     kstrides = [context.get_constant(types.intp, s) for s in strides]
 
-    context.populate_array(ary,
-                           data=builder.bitcast(dataptr, ary.data.type),
-                           shape=cgutils.pack_array(builder, kshape),
-                           strides=cgutils.pack_array(builder, kstrides),
-                           itemsize=context.get_constant(types.intp, itemsize),
-                           meminfo=None)
+    context.populate_array(
+        ary,
+        data=builder.bitcast(dataptr, ary.data.type),
+        shape=cgutils.pack_array(builder, kshape),
+        strides=cgutils.pack_array(builder, kstrides),
+        itemsize=context.get_constant(types.intp, itemsize),
+        meminfo=None,
+    )
 
     return ary._getvalue()
 
diff --git a/numba_dppy/ocl/stubs.py b/numba_dppy/ocl/stubs.py
index 190b685955..bd4e03c21a 100644
--- a/numba_dppy/ocl/stubs.py
+++ b/numba_dppy/ocl/stubs.py
@@ -7,7 +7,7 @@
 _stub_error = NotImplementedError("This is a stub.")
 
 # mem fence
-CLK_LOCAL_MEM_FENCE  = 0x1
+CLK_LOCAL_MEM_FENCE = 0x1
 CLK_GLOBAL_MEM_FENCE = 0x2
 
 
@@ -85,7 +85,8 @@ class Stub(object):
     """A stub object to represent special objects which is meaningless
     outside the context of DPPY compilation context.
     """
-    _description_ = '<dppy special value>'
+
+    _description_ = "<dppy special value>"
     __slots__ = ()  # don't allocate __dict__
 
     def __new__(cls):
@@ -94,25 +95,28 @@ def __new__(cls):
     def __repr__(self):
         return self._description_
 
-#-------------------------------------------------------------------------------
+
+# -------------------------------------------------------------------------------
 # local memory
 
+
 def local_alloc(shape, dtype):
     shape = _legalize_shape(shape)
     ndim = len(shape)
     fname = "dppy.lmem.alloc"
-    restype = types.Array(dtype, ndim, 'C', addrspace=SPIR_LOCAL_ADDRSPACE)
+    restype = types.Array(dtype, ndim, "C", addrspace=SPIR_LOCAL_ADDRSPACE)
     sig = typing.signature(restype, types.UniTuple(types.intp, ndim), types.Any)
     return ir.Intrinsic(fname, sig, args=(shape, dtype))
 
 
 class local(Stub):
-    """local namespace
-    """
-    _description_ = '<local>'
+    """local namespace"""
 
-    static_alloc = Macro('local.static_alloc', local_alloc, callable=True,
-                        argnames=['shape', 'dtype'])
+    _description_ = "<local>"
+
+    static_alloc = Macro(
+        "local.static_alloc", local_alloc, callable=True, argnames=["shape", "dtype"]
+    )
 
 
 def _legalize_shape(shape):
@@ -124,13 +128,14 @@ def _legalize_shape(shape):
         raise TypeError("invalid type for shape; got {0}".format(type(shape)))
 
 
-#-------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------
 # atomic
 
+
 class atomic(Stub):
-    """atomic namespace
-    """
-    _description_ = '<atomic>'
+    """atomic namespace"""
+
+    _description_ = "<atomic>"
 
     class add(Stub):
         """add(ary, idx, val)
diff --git a/numba_dppy/printimpl.py b/numba_dppy/printimpl.py
index e5c9d4f793..983e1f267c 100644
--- a/numba_dppy/printimpl.py
+++ b/numba_dppy/printimpl.py
@@ -27,8 +27,7 @@ def print_item(ty, context, builder, val):
     A (format string, [list of arguments]) is returned that will allow
     forming the final printf()-like call.
     """
-    raise NotImplementedError("printing unimplemented for values of type %s"
-                              % (ty,))
+    raise NotImplementedError("printing unimplemented for values of type %s" % (ty,))
 
 
 @print_item.register(types.Integer)
@@ -44,11 +43,13 @@ def int_print_impl(ty, context, builder, val):
     lld = context.cast(builder, val, ty, dsttype)
     return rawfmt, [lld]
 
+
 @print_item.register(types.Float)
 def real_print_impl(ty, context, builder, val):
     lld = context.cast(builder, val, ty, types.float64)
     return "%f", [lld]
 
+
 @print_item.register(types.StringLiteral)
 def const_print_impl(ty, context, builder, sigval):
     pyval = ty.literal_value
diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py
index c1d58ce036..b55c122d93 100644
--- a/numba_dppy/rename_numpy_functions_pass.py
+++ b/numba_dppy/rename_numpy_functions_pass.py
@@ -9,21 +9,23 @@
 import numba_dppy
 from numba.core import types
 
-rewrite_function_name_map = {"sum": (["np"], "sum"),
-                             "eig": (["linalg"], "eig"),
-                             "prod": (["np"], "prod"),
-                             "max": (["np"], "max"),
-                             "amax": (["np"], "amax"),
-                             "min": (["np"], "min"),
-                             "amin": (["np"], "amin"),
-                             "mean": (["np"], "mean"),
-                             "median": (["np"], "median"),
-                             "argmax": (["np"], "argmax"),
-                             "argmin": (["np"], "argmin"),
-                             "argsort": (["np"], "argsort"),
-                             "cov": (["np"], "cov"),
-                             "dot": (["np"], "dot"),
-                             "matmul": (["np"], "matmul")}
+rewrite_function_name_map = {
+    "sum": (["np"], "sum"),
+    "eig": (["linalg"], "eig"),
+    "prod": (["np"], "prod"),
+    "max": (["np"], "max"),
+    "amax": (["np"], "amax"),
+    "min": (["np"], "min"),
+    "amin": (["np"], "amin"),
+    "mean": (["np"], "mean"),
+    "median": (["np"], "median"),
+    "argmax": (["np"], "argmax"),
+    "argmin": (["np"], "argmin"),
+    "argsort": (["np"], "argsort"),
+    "cov": (["np"], "cov"),
+    "dot": (["np"], "dot"),
+    "matmul": (["np"], "matmul"),
+}
 
 
 class RewriteNumPyOverloadedFunctions(object):
@@ -147,6 +149,7 @@ def run_pass(self, state):
 
 def get_dpnp_func_typ(func):
     from numba.core.typing.templates import builtin_registry
+
     for (k, v) in builtin_registry.globals:
         if k == func:
             return v
@@ -178,9 +181,13 @@ def run(self):
                     lhs = stmt.target.name
                     rhs = stmt.value
                     # replace A.func with np.func, and save A in saved_arr_arg
-                    if (rhs.op == 'getattr' and rhs.attr in self.function_name_map
-                            and isinstance(
-                                self.typemap[rhs.value.name], types.npytypes.Array)):
+                    if (
+                        rhs.op == "getattr"
+                        and rhs.attr in self.function_name_map
+                        and isinstance(
+                            self.typemap[rhs.value.name], types.npytypes.Array
+                        )
+                    ):
                         rhs = stmt.value
                         arr = rhs.value
                         saved_arr_arg[lhs] = arr
@@ -211,17 +218,18 @@ def run(self):
                         self.typemap.pop(lhs)
                         self.typemap[lhs] = func_typ
 
-                    if rhs.op == 'call' and rhs.func.name in saved_arr_arg:
+                    if rhs.op == "call" and rhs.func.name in saved_arr_arg:
                         # add array as first arg
                         arr = saved_arr_arg[rhs.func.name]
                         # update call type signature to include array arg
                         old_sig = self.calltypes.pop(rhs)
                         # argsort requires kws for typing so sig.args can't be used
                         # reusing sig.args since some types become Const in sig
-                        argtyps = old_sig.args[:len(rhs.args)]
+                        argtyps = old_sig.args[: len(rhs.args)]
                         kwtyps = {name: self.typemap[v.name] for name, v in rhs.kws}
                         self.calltypes[rhs] = self.typemap[rhs.func.name].get_call_type(
-                            typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps)
+                            typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps
+                        )
                         rhs.args = [arr] + rhs.args
 
                 new_body.append(stmt)
diff --git a/numba_dppy/spirv_generator.py b/numba_dppy/spirv_generator.py
index 5bac98e014..3d31596c5a 100644
--- a/numba_dppy/spirv_generator.py
+++ b/numba_dppy/spirv_generator.py
@@ -17,60 +17,69 @@ def _raise_bad_env_path(msg, path, extra=None):
         error_message += extra
     raise ValueError(error_message)
 
+
 _real_check_call = check_call
 
+
 def check_call(*args, **kwargs):
-    #print("check_call:", *args, **kwargs)
+    # print("check_call:", *args, **kwargs)
     return _real_check_call(*args, **kwargs)
 
-class CmdLine(object):
 
+class CmdLine(object):
     def disassemble(self, ipath, opath):
-        check_call([
-            "spirv-dis",
-            # "--no-indent",
-            # "--no-header",
-            # "--raw-id",
-            # "--offsets",
-            "-o",
-            opath,
-            ipath])
+        check_call(
+            [
+                "spirv-dis",
+                # "--no-indent",
+                # "--no-header",
+                # "--raw-id",
+                # "--offsets",
+                "-o",
+                opath,
+                ipath,
+            ]
+        )
 
     def validate(self, ipath):
-        check_call(["spirv-val",ipath])
+        check_call(["spirv-val", ipath])
 
     def optimize(self, ipath, opath):
-        check_call([
-            "spirv-opt",
-            # "--strip-debug",
-            # "--freeze-spec-const",
-            # "--eliminate-dead-const",
-            # "--fold-spec-const-op-composite",
-            # "--set-spec-const-default-value '<spec id>:<default value> ...'",
-            # "--unify-const",
-            # "--inline-entry-points-exhaustive",
-            # "--flatten-decorations",
-            # "--compact-ids",
-            "-o",
-            opath,
-            ipath])
+        check_call(
+            [
+                "spirv-opt",
+                # "--strip-debug",
+                # "--freeze-spec-const",
+                # "--eliminate-dead-const",
+                # "--fold-spec-const-op-composite",
+                # "--set-spec-const-default-value '<spec id>:<default value> ...'",
+                # "--unify-const",
+                # "--inline-entry-points-exhaustive",
+                # "--flatten-decorations",
+                # "--compact-ids",
+                "-o",
+                opath,
+                ipath,
+            ]
+        )
 
     def generate(self, ipath, opath):
         # DRD : Temporary hack to get SPIR-V code generation to work.
         # The opt step is needed for:
         #     a) generate a bitcode file from the text IR file
         #     b) hoist all allocas to the enty block of the module
-        check_call(["opt","-O1","-o",ipath+'.bc',ipath])
-        check_call(["llvm-spirv","-o",opath,ipath+'.bc'])
+        check_call(["opt", "-O1", "-o", ipath + ".bc", ipath])
+        check_call(["llvm-spirv", "-o", opath, ipath + ".bc"])
         if dppy_config.SAVE_IR_FILES == 0:
-            os.unlink(ipath + '.bc')
+            os.unlink(ipath + ".bc")
 
     def link(self, opath, binaries):
-        params = ["spirv-link","--allow-partial-linkage","-o", opath]
+        params = ["spirv-link", "--allow-partial-linkage", "-o", opath]
         params.extend(binaries)
 
         check_call(params)
 
+
 class Module(object):
     def __init__(self, context):
         """
@@ -93,14 +102,13 @@ def __del__(self):
         if dppy_config.SAVE_IR_FILES == 0:
             os.rmdir(self._tmpdir)
 
-    def _create_temp_file(self, name, mode='wb'):
+    def _create_temp_file(self, name, mode="wb"):
         path = self._track_temp_file(name)
         fobj = open(path, mode=mode)
         return fobj, path
 
     def _track_temp_file(self, name):
-        path = os.path.join(self._tmpdir,
-                            "{0}-{1}".format(len(self._tempfiles), name))
+        path = os.path.join(self._tmpdir, "{0}-{1}".format(len(self._tempfiles), name))
         self._tempfiles.append(path)
         return path
 
@@ -130,6 +138,7 @@ def finalize(self):
             del self.context.link_binaries[key]
             if key == LINK_ATOMIC:
                 from .ocl.atomics import get_atomic_spirv_path
+
                 binary_paths.append(get_atomic_spirv_path())
 
         if len(binary_paths) > 1:
@@ -152,13 +161,13 @@ def finalize(self):
                     # Disassemble optimized SPIR-V code
                     dis_path = self._track_temp_file("disassembled-spirv")
                     self._cmd.disassemble(ipath=opt_path, opath=dis_path)
-                    with open(dis_path, 'rb') as fin_opt:
+                    with open(dis_path, "rb") as fin_opt:
                         print("ASSEMBLY".center(80, "-"))
                         print(fin_opt.read())
                         print("".center(80, "="))
 
         # Read and return final SPIR-V (not optimized!)
-        with open(spirv_path, 'rb') as fin:
+        with open(spirv_path, "rb") as fin:
             spirv = fin.read()
 
         self._finalized = True
@@ -168,6 +177,7 @@ def finalize(self):
 
 # Public llvm_to_spirv function ###############################################
 
+
 def llvm_to_spirv(context, bitcode):
     mod = Module(context)
     mod.load_llvm(bitcode)
diff --git a/numba_dppy/target.py b/numba_dppy/target.py
index 147b229e77..87103a5045 100644
--- a/numba_dppy/target.py
+++ b/numba_dppy/target.py
@@ -39,15 +39,15 @@ def load_additional_registries(self):
 # -----------------------------------------------------------------------------
 # Implementation
 
-VALID_CHARS = re.compile(r'[^a-z0-9]', re.I)
+VALID_CHARS = re.compile(r"[^a-z0-9]", re.I)
 
 
 # Address spaces
-SPIR_PRIVATE_ADDRSPACE  = 0
-SPIR_GLOBAL_ADDRSPACE   = 1
+SPIR_PRIVATE_ADDRSPACE = 0
+SPIR_GLOBAL_ADDRSPACE = 1
 SPIR_CONSTANT_ADDRSPACE = 2
-SPIR_LOCAL_ADDRSPACE    = 3
-SPIR_GENERIC_ADDRSPACE  = 4
+SPIR_LOCAL_ADDRSPACE = 3
+SPIR_GENERIC_ADDRSPACE = 4
 
 SPIR_VERSION = (2, 0)
 
@@ -57,9 +57,13 @@ def load_additional_registries(self):
 
 class GenericPointerModel(datamodel.PrimitiveModel):
     def __init__(self, dmm, fe_type):
-        #print("GenericPointerModel:", dmm, fe_type, fe_type.addrspace)
-        adrsp = fe_type.addrspace if fe_type.addrspace is not None else SPIR_GENERIC_ADDRSPACE
-        #adrsp = SPIR_GENERIC_ADDRSPACE
+        # print("GenericPointerModel:", dmm, fe_type, fe_type.addrspace)
+        adrsp = (
+            fe_type.addrspace
+            if fe_type.addrspace is not None
+            else SPIR_GENERIC_ADDRSPACE
+        )
+        # adrsp = SPIR_GENERIC_ADDRSPACE
         be_type = dmm.lookup(fe_type.dtype).get_data_type().as_pointer(adrsp)
         super(GenericPointerModel, self).__init__(dmm, fe_type, be_type)
 
@@ -72,18 +76,37 @@ def _init_data_model_manager():
 
 spirv_data_model_manager = _init_data_model_manager()
 
+
 def _replace_numpy_ufunc_with_opencl_supported_functions():
     from numba.np.ufunc_db import _ufunc_db as ufunc_db
     from numba_dppy.ocl.mathimpl import lower_ocl_impl, sig_mapper
 
-    ufuncs = [("fabs", np.fabs), ("exp", np.exp), ("log", np.log),
-              ("log10", np.log10), ("expm1", np.expm1), ("log1p", np.log1p),
-              ("sqrt", np.sqrt), ("sin", np.sin), ("cos", np.cos),
-              ("tan", np.tan), ("asin", np.arcsin), ("acos", np.arccos),
-              ("atan", np.arctan), ("atan2", np.arctan2), ("sinh", np.sinh),
-              ("cosh", np.cosh), ("tanh", np.tanh), ("asinh", np.arcsinh),
-              ("acosh", np.arccosh), ("atanh", np.arctanh), ("ldexp", np.ldexp),
-              ("floor", np.floor), ("ceil", np.ceil), ("trunc", np.trunc)]
+    ufuncs = [
+        ("fabs", np.fabs),
+        ("exp", np.exp),
+        ("log", np.log),
+        ("log10", np.log10),
+        ("expm1", np.expm1),
+        ("log1p", np.log1p),
+        ("sqrt", np.sqrt),
+        ("sin", np.sin),
+        ("cos", np.cos),
+        ("tan", np.tan),
+        ("asin", np.arcsin),
+        ("acos", np.arccos),
+        ("atan", np.arctan),
+        ("atan2", np.arctan2),
+        ("sinh", np.sinh),
+        ("cosh", np.cosh),
+        ("tanh", np.tanh),
+        ("asinh", np.arcsinh),
+        ("acosh", np.arccosh),
+        ("atanh", np.arctanh),
+        ("ldexp", np.ldexp),
+        ("floor", np.floor),
+        ("ceil", np.ceil),
+        ("trunc", np.trunc),
+    ]
 
     for name, ufunc in ufuncs:
         for sig in ufunc_db[ufunc].keys():
@@ -97,16 +120,19 @@ class DPPYTargetContext(BaseContext):
 
     def init(self):
         self._internal_codegen = codegen.JITSPIRVCodegen("numba_dppy.jit")
-        self._target_data = (ll.create_target_data(codegen
-                                .SPIR_DATA_LAYOUT[utils.MACHINE_BITS]))
+        self._target_data = ll.create_target_data(
+            codegen.SPIR_DATA_LAYOUT[utils.MACHINE_BITS]
+        )
         # Override data model manager to SPIR model
         self.data_model_manager = spirv_data_model_manager
         self.link_binaries = dict()
 
         from numba.np.ufunc_db import _lazy_init_db
         import copy
+
         _lazy_init_db()
         from numba.np.ufunc_db import _ufunc_db as ufunc_db
+
         self.ufunc_db = copy.deepcopy(ufunc_db)
 
         from numba.core.cpu import CPUContext
@@ -114,26 +140,41 @@ def init(self):
 
         self.cpu_context = cpu_target.target_context
 
-
-
     def replace_numpy_ufunc_with_opencl_supported_functions(self):
         from numba_dppy.ocl.mathimpl import lower_ocl_impl, sig_mapper
 
-        ufuncs = [("fabs", np.fabs), ("exp", np.exp), ("log", np.log),
-                  ("log10", np.log10), ("expm1", np.expm1), ("log1p", np.log1p),
-                  ("sqrt", np.sqrt), ("sin", np.sin), ("cos", np.cos),
-                  ("tan", np.tan), ("asin", np.arcsin), ("acos", np.arccos),
-                  ("atan", np.arctan), ("atan2", np.arctan2), ("sinh", np.sinh),
-                  ("cosh", np.cosh), ("tanh", np.tanh), ("asinh", np.arcsinh),
-                  ("acosh", np.arccosh), ("atanh", np.arctanh), ("ldexp", np.ldexp),
-                  ("floor", np.floor), ("ceil", np.ceil), ("trunc", np.trunc)]
+        ufuncs = [
+            ("fabs", np.fabs),
+            ("exp", np.exp),
+            ("log", np.log),
+            ("log10", np.log10),
+            ("expm1", np.expm1),
+            ("log1p", np.log1p),
+            ("sqrt", np.sqrt),
+            ("sin", np.sin),
+            ("cos", np.cos),
+            ("tan", np.tan),
+            ("asin", np.arcsin),
+            ("acos", np.arccos),
+            ("atan", np.arctan),
+            ("atan2", np.arctan2),
+            ("sinh", np.sinh),
+            ("cosh", np.cosh),
+            ("tanh", np.tanh),
+            ("asinh", np.arcsinh),
+            ("acosh", np.arccosh),
+            ("atanh", np.arctanh),
+            ("ldexp", np.ldexp),
+            ("floor", np.floor),
+            ("ceil", np.ceil),
+            ("trunc", np.trunc),
+        ]
 
         for name, ufunc in ufuncs:
             for sig in self.ufunc_db[ufunc].keys():
                 if sig in sig_mapper and (name, sig_mapper[sig]) in lower_ocl_impl:
                     self.ufunc_db[ufunc][sig] = lower_ocl_impl[(name, sig_mapper[sig])]
 
-
     def load_additional_registries(self):
         from .ocl import oclimpl, mathimpl
         from numba.np import npyimpl
@@ -150,7 +191,6 @@ def load_additional_registries(self):
         """
         self.replace_numpy_ufunc_with_opencl_supported_functions()
 
-
     @cached_property
     def call_conv(self):
         return DPPYCallConv(self)
@@ -167,13 +207,13 @@ def repl(m):
             ch = m.group(0)
             return "_%X_" % ord(ch)
 
-        qualified = name + '.' + '.'.join(str(a) for a in argtypes)
+        qualified = name + "." + ".".join(str(a) for a in argtypes)
         mangled = VALID_CHARS.sub(repl, qualified)
-        return 'dppy_py_devfn_' + mangled
+        return "dppy_py_devfn_" + mangled
 
     def prepare_ocl_kernel(self, func, argtypes):
         module = func.module
-        func.linkage = 'linkonce_odr'
+        func.linkage = "linkonce_odr"
 
         module.data_layout = codegen.SPIR_DATA_LAYOUT[self.address_size]
         wrapper = self.generate_kernel_wrapper(func, argtypes)
@@ -184,7 +224,7 @@ def mark_ocl_device(self, func):
         # Adapt to SPIR
         # module = func.module
         func.calling_convention = CC_SPIR_FUNC
-        func.linkage = 'linkonce_odr'
+        func.linkage = "linkonce_odr"
         return func
 
     def generate_kernel_wrapper(self, func, argtypes):
@@ -196,32 +236,33 @@ def sub_gen_with_global(lty):
                 if lty.addrspace == SPIR_LOCAL_ADDRSPACE:
                     return lty, None
                 # DRD : Cast all pointer types to global address space.
-                if  lty.addrspace != SPIR_GLOBAL_ADDRSPACE: # jcaraban
-                    return (lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE),
-                            lty.addrspace)
+                if lty.addrspace != SPIR_GLOBAL_ADDRSPACE:  # jcaraban
+                    return (
+                        lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE),
+                        lty.addrspace,
+                    )
             return lty, None
 
         if len(arginfo.argument_types) > 0:
-            llargtys, changed = zip(*map(sub_gen_with_global,
-                                         arginfo.argument_types))
+            llargtys, changed = zip(*map(sub_gen_with_global, arginfo.argument_types))
         else:
             llargtys = changed = ()
         wrapperfnty = lc.Type.function(lc.Type.void(), llargtys)
 
         wrapper_module = self.create_module("dppy.kernel.wrapper")
-        wrappername = 'dppyPy_{name}'.format(name=func.name)
+        wrappername = "dppyPy_{name}".format(name=func.name)
 
         argtys = list(arginfo.argument_types)
-        fnty = lc.Type.function(lc.Type.int(),
-                                [self.call_conv.get_return_type(
-                                    types.pyobject)] + argtys)
+        fnty = lc.Type.function(
+            lc.Type.int(), [self.call_conv.get_return_type(types.pyobject)] + argtys
+        )
 
         func = wrapper_module.add_function(fnty, name=func.name)
         func.calling_convention = CC_SPIR_FUNC
 
         wrapper = wrapper_module.add_function(wrapperfnty, name=wrappername)
 
-        builder = lc.Builder(wrapper.append_basic_block(''))
+        builder = lc.Builder(wrapper.append_basic_block(""))
 
         # Adjust address space of each kernel argument
         fixed_args = []
@@ -235,20 +276,21 @@ def sub_gen_with_global(lty):
         callargs = arginfo.from_arguments(builder, fixed_args)
 
         # XXX handle error status
-        status, _ = self.call_conv.call_function(builder, func, types.void,
-                                                 argtypes, callargs)
+        status, _ = self.call_conv.call_function(
+            builder, func, types.void, argtypes, callargs
+        )
         builder.ret_void()
 
         set_dppy_kernel(wrapper)
 
-        #print(str(wrapper_module))
+        # print(str(wrapper_module))
         # Link
         module.link_in(ll.parse_assembly(str(wrapper_module)))
         # To enable inlining which is essential because addrspacecast 1->0 is
         # illegal.  Inlining will optimize the addrspacecast out.
-        func.linkage = 'internal'
+        func.linkage = "internal"
         wrapper = module.get_function(wrapper.name)
-        module.get_function(func.name).linkage = 'internal'
+        module.get_function(func.name).linkage = "internal"
         return wrapper
 
     def declare_function(self, module, fndesc):
@@ -256,12 +298,12 @@ def declare_function(self, module, fndesc):
         fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name)
 
         if not self.enable_debuginfo:
-            fn.attributes.add('alwaysinline')
+            fn.attributes.add("alwaysinline")
 
         ret = super(DPPYTargetContext, self).declare_function(module, fndesc)
 
         # XXX: Refactor fndesc instead of this special case
-        if fndesc.llvm_func_name.startswith('dppy_py_devfn'):
+        if fndesc.llvm_func_name.startswith("dppy_py_devfn"):
             ret.calling_convention = CC_SPIR_FUNC
         return ret
 
@@ -274,32 +316,29 @@ def make_constant_array(self, builder, typ, ary):
         # return a._getvalue()
         raise NotImplementedError
 
-
     def insert_const_string(self, mod, string):
         """
         This returns a a pointer in the spir generic addrspace.
         """
         text = lc.Constant.stringz(string)
 
-        name = '$'.join(["__conststring__",
-                         self.mangler(string, ["str"])])
+        name = "$".join(["__conststring__", self.mangler(string, ["str"])])
 
         # Try to reuse existing global
         try:
             gv = mod.get_global(name)
         except KeyError as e:
             # Not defined yet
-            gv = mod.add_global_variable(text.type, name=name,
-                                         addrspace=SPIR_GENERIC_ADDRSPACE)
-            gv.linkage = 'internal'
+            gv = mod.add_global_variable(
+                text.type, name=name, addrspace=SPIR_GENERIC_ADDRSPACE
+            )
+            gv.linkage = "internal"
             gv.global_constant = True
             gv.initializer = text
 
         # Cast to a i8* pointer
         charty = gv.type.pointee.element
-        return lc.Constant.bitcast(gv,
-                               charty.as_pointer(SPIR_GENERIC_ADDRSPACE))
-
+        return lc.Constant.bitcast(gv, charty.as_pointer(SPIR_GENERIC_ADDRSPACE))
 
     def addrspacecast(self, builder, src, addrspace):
         """
@@ -325,12 +364,19 @@ def set_dppy_kernel(fn):
 
     # Mark kernels
     ocl_kernels = mod.get_or_insert_named_metadata("opencl.kernels")
-    ocl_kernels.add(lc.MetaData.get(mod, [fn,
-                                          gen_arg_addrspace_md(fn),
-                                          gen_arg_access_qual_md(fn),
-                                          gen_arg_type(fn),
-                                          gen_arg_type_qual(fn),
-                                          gen_arg_base_type(fn)]))
+    ocl_kernels.add(
+        lc.MetaData.get(
+            mod,
+            [
+                fn,
+                gen_arg_addrspace_md(fn),
+                gen_arg_access_qual_md(fn),
+                gen_arg_type(fn),
+                gen_arg_type_qual(fn),
+                gen_arg_base_type(fn),
+            ],
+        )
+    )
 
     # SPIR version 2.0
     make_constant = lambda x: lc.Constant.int(lc.Type.int(), x)
@@ -346,14 +392,16 @@ def set_dppy_kernel(fn):
 
     # Other metadata
     empty_md = lc.MetaData.get(mod, ())
-    others = ["opencl.used.extensions",
-              "opencl.used.optional.core.features",
-              "opencl.compiler.options"]
+    others = [
+        "opencl.used.extensions",
+        "opencl.used.optional.core.features",
+        "opencl.compiler.options",
+    ]
 
     for name in others:
         nmd = mod.get_or_insert_named_metadata(name)
         if not nmd.operands:
-             nmd.add(empty_md)
+            nmd.add(empty_md)
 
 
 def gen_arg_addrspace_md(fn):
diff --git a/numba_dppy/target_dispatcher.py b/numba_dppy/target_dispatcher.py
index dde38eb75b..4a9dc16fc5 100644
--- a/numba_dppy/target_dispatcher.py
+++ b/numba_dppy/target_dispatcher.py
@@ -6,11 +6,11 @@
 
 
 class TargetDispatcher(serialize.ReduceMixin, metaclass=dispatcher.DispatcherMeta):
-    __numba__ = 'py_func'
+    __numba__ = "py_func"
 
-    target_offload_gpu = '__dppy_offload_gpu__'
-    target_offload_cpu = '__dppy_offload_cpu__'
-    target_dppy = 'dppy'
+    target_offload_gpu = "__dppy_offload_gpu__"
+    target_offload_cpu = "__dppy_offload_cpu__"
+    target_dppy = "dppy"
 
     def __init__(self, py_func, wrapper, target, parallel_options, compiled=None):
 
@@ -58,32 +58,46 @@ def __is_with_context_target(self, target):
     def get_current_disp(self):
         target = self.__target
         parallel = self.__parallel
-        offload = isinstance(parallel, dict) and parallel.get('offload') is True
+        offload = isinstance(parallel, dict) and parallel.get("offload") is True
 
-        if (dpctl.is_in_device_context() or offload):
+        if dpctl.is_in_device_context() or offload:
             if not self.__is_with_context_target(target):
-                raise UnsupportedError(f"Can't use 'with' context with explicitly specified target '{target}'")
-            if parallel is False or (isinstance(parallel, dict) and parallel.get('offload') is False):
-                raise UnsupportedError(f"Can't use 'with' context with parallel option '{parallel}'")
+                raise UnsupportedError(
+                    f"Can't use 'with' context with explicitly specified target '{target}'"
+                )
+            if parallel is False or (
+                isinstance(parallel, dict) and parallel.get("offload") is False
+            ):
+                raise UnsupportedError(
+                    f"Can't use 'with' context with parallel option '{parallel}'"
+                )
 
             from numba_dppy import dppy_offload_dispatcher
 
             if target is None:
                 if dpctl.get_current_device_type() == dpctl.device_type.gpu:
-                    return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu]
+                    return registry.dispatcher_registry[
+                        TargetDispatcher.target_offload_gpu
+                    ]
                 elif dpctl.get_current_device_type() == dpctl.device_type.cpu:
-                    return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu]
+                    return registry.dispatcher_registry[
+                        TargetDispatcher.target_offload_cpu
+                    ]
                 else:
                     if dpctl.is_in_device_context():
-                        raise UnsupportedError('Unknown dppy device type')
+                        raise UnsupportedError("Unknown dppy device type")
                     if offload:
                         if dpctl.has_gpu_queues():
-                            return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu]
+                            return registry.dispatcher_registry[
+                                TargetDispatcher.target_offload_gpu
+                            ]
                         elif dpctl.has_cpu_queues():
-                            return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu]
+                            return registry.dispatcher_registry[
+                                TargetDispatcher.target_offload_cpu
+                            ]
 
         if target is None:
-            target = 'cpu'
+            target = "cpu"
 
         return registry.dispatcher_registry[target]
 
@@ -93,5 +107,5 @@ def _reduce_states(self):
             wrapper=self.__wrapper,
             target=self.__target,
             parallel=self.__parallel,
-            compiled=self.__compiled
+            compiled=self.__compiled,
         )
diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py
index e6ff1e3ab3..7f6e539b38 100644
--- a/numba_dppy/testing.py
+++ b/numba_dppy/testing.py
@@ -18,6 +18,7 @@ def captured_dppy_stdout():
     sys.stdout.flush()
 
     import numba_dppy, numba_dppy as dppy
+
     with redirect_c_stdout() as stream:
         yield DPPYTextCapture(stream)
 
@@ -38,6 +39,7 @@ def expectedFailureIf(condition):
 def ensure_dpnp():
     try:
         from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue
+
         return True
     except:
         return False
diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py
index 939c95c567..09ff707488 100644
--- a/numba_dppy/tests/__init__.py
+++ b/numba_dppy/tests/__init__.py
@@ -7,6 +7,7 @@
 
 # from numba_dppy.tests.dppy import *
 
+
 def load_tests(loader, tests, pattern):
 
     suite = SerialSuite()
diff --git a/numba_dppy/tests/skip_tests.py b/numba_dppy/tests/skip_tests.py
index fa18d36181..6765b95cfe 100644
--- a/numba_dppy/tests/skip_tests.py
+++ b/numba_dppy/tests/skip_tests.py
@@ -1,5 +1,6 @@
 import dpctl
 
+
 def is_gen12(device_type):
     with dpctl.device_context(device_type):
         q = dpctl.get_current_queue()
diff --git a/numba_dppy/tests/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py
index 494f269c59..3a2f3d7f05 100644
--- a/numba_dppy/tests/test_arg_accessor.py
+++ b/numba_dppy/tests/test_arg_accessor.py
@@ -5,19 +5,22 @@
 import dpctl
 
 
-@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []})
+@dppy.kernel(
+    access_types={"read_only": ["a", "b"], "write_only": ["c"], "read_write": []}
+)
 def sum_with_accessor(a, b, c):
     i = dppy.get_global_id(0)
     c[i] = a[i] + b[i]
 
+
 @dppy.kernel
 def sum_without_accessor(a, b, c):
     i = dppy.get_global_id(0)
     c[i] = a[i] + b[i]
 
-def call_kernel(global_size, local_size,
-                A, B, C, func):
-        func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C)
+
+def call_kernel(global_size, local_size, A, B, C, func):
+    func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C)
 
 
 global_size = 10
@@ -29,39 +32,35 @@ def call_kernel(global_size, local_size,
 D = A + B
 
 
-@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system')
+@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system")
 class TestDPPYArgAccessorCPU(unittest.TestCase):
     def test_arg_with_accessor(self):
         C = np.ones_like(A)
         with dpctl.device_context("opencl:cpu") as cpu_queue:
-            call_kernel(global_size, local_size,
-                        A, B, C, sum_with_accessor)
+            call_kernel(global_size, local_size, A, B, C, sum_with_accessor)
         self.assertTrue(np.all(D == C))
 
     def test_arg_without_accessor(self):
         C = np.ones_like(A)
         with dpctl.device_context("opencl:cpu") as cpu_queue:
-            call_kernel(global_size, local_size,
-                        A, B, C, sum_without_accessor)
+            call_kernel(global_size, local_size, A, B, C, sum_without_accessor)
         self.assertTrue(np.all(D == C))
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYArgAccessorGPU(unittest.TestCase):
     def test_arg_with_accessor(self):
         C = np.ones_like(A)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
-            call_kernel(global_size, local_size,
-                        A, B, C, sum_with_accessor)
+            call_kernel(global_size, local_size, A, B, C, sum_with_accessor)
         self.assertTrue(np.all(D == C))
 
     def test_arg_without_accessor(self):
         C = np.ones_like(A)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
-            call_kernel(global_size, local_size,
-                        A, B, C, sum_without_accessor)
+            call_kernel(global_size, local_size, A, B, C, sum_without_accessor)
         self.assertTrue(np.all(D == C))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_arg_types.py b/numba_dppy/tests/test_arg_types.py
index ed55e12e16..7e21748a2c 100644
--- a/numba_dppy/tests/test_arg_types.py
+++ b/numba_dppy/tests/test_arg_types.py
@@ -10,6 +10,7 @@ def mul_kernel(A, B, test):
     i = dppy.get_global_id(0)
     B[i] = A[i] * test
 
+
 def call_mul_device_kernel(global_size, A, B, test):
     mul_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, test)
 
@@ -20,7 +21,7 @@ def call_mul_device_kernel(global_size, A, B, test):
 B = np.array(np.random.random(N), dtype=np.float32)
 
 
-@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system')
+@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system")
 class TestDPPYArrayArgCPU(unittest.TestCase):
     def test_integer_arg(self):
         x = np.int32(2)
@@ -46,7 +47,7 @@ def check_bool_kernel(A, test):
             else:
                 A[0] = 222
 
-        A = np.array([0], dtype='float64')
+        A = np.array([0], dtype="float64")
 
         with dpctl.device_context("opencl:cpu") as cpu_queue:
             check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True)
@@ -55,7 +56,7 @@ def check_bool_kernel(A, test):
             self.assertTrue(A[0] == 222)
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYArrayArgGPU(unittest.TestCase):
     def test_integer_arg(self):
         x = np.int32(2)
@@ -81,7 +82,7 @@ def check_bool_kernel(A, test):
             else:
                 A[0] = 222
 
-        A = np.array([0], dtype='float64')
+        A = np.array([0], dtype="float64")
 
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True)
@@ -89,5 +90,6 @@ def check_bool_kernel(A, test):
             check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False)
             self.assertTrue(A[0] == 222)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py
index 27a810ba08..ccfcae978c 100644
--- a/numba_dppy/tests/test_atomic_op.py
+++ b/numba_dppy/tests/test_atomic_op.py
@@ -5,6 +5,7 @@
 import unittest
 import dpctl
 
+
 def atomic_add_int32(ary):
     tid = dppy.get_local_id(0)
     lm = dppy.local.static_alloc(32, numba.uint32)
@@ -111,17 +112,19 @@ def call_fn_for_datatypes(fn, result, input, global_size):
 
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             # TODO: dpctl needs to expose this functions
-            #if dtype == np.double and not device_env.device_support_float64_atomics():
+            # if dtype == np.double and not device_env.device_support_float64_atomics():
             #    continue
-            #if dtype == np.int64 and not device_env.device_support_int64_atomics():
+            # if dtype == np.int64 and not device_env.device_support_int64_atomics():
             #    continue
             fn[global_size, dppy.DEFAULT_LOCAL_SIZE](a)
 
-        assert(a[0] == result)
+        assert a[0] == result
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
-@unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
+@unittest.skipUnless(
+    numba_dppy.ocl.atomic_support_present(), "test only when atomic support is present"
+)
 class TestAtomicOp(unittest.TestCase):
     def test_atomic_add_global(self):
         @dppy.kernel
@@ -133,7 +136,6 @@ def atomic_add(B):
 
         call_fn_for_datatypes(atomic_add, N, B, N)
 
-
     def test_atomic_sub_global(self):
         @dppy.kernel
         def atomic_sub(B):
@@ -144,12 +146,11 @@ def atomic_sub(B):
 
         call_fn_for_datatypes(atomic_sub, 0, B, N)
 
-
     def test_atomic_add_local_int32(self):
         ary = np.random.randint(0, 32, size=32).astype(np.uint32)
         orig = ary.copy()
 
-        #dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32)
+        # dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32)
         dppy_atomic_add = dppy.kernel(atomic_add_int32)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary)
@@ -160,12 +161,11 @@ def test_atomic_add_local_int32(self):
 
         self.assertTrue(np.all(ary == gold))
 
-
     def test_atomic_sub_local_int32(self):
         ary = np.random.randint(0, 32, size=32).astype(np.uint32)
         orig = ary.copy()
 
-        #dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32)
+        # dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32)
         dppy_atomic_sub = dppy.kernel(atomic_sub_int32)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary)
@@ -176,22 +176,20 @@ def test_atomic_sub_local_int32(self):
 
         self.assertTrue(np.all(ary == gold))
 
-
     def test_atomic_add_local_float32(self):
         ary = np.array([0], dtype=np.float32)
 
-        #dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32)
+        # dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32)
         dppy_atomic_add = dppy.kernel(atomic_add_float32)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary)
 
         self.assertTrue(ary[0] == 32)
 
-
     def test_atomic_sub_local_float32(self):
         ary = np.array([32], dtype=np.float32)
 
-        #dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32)
+        # dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32)
         dppy_atomic_sub = dppy.kernel(atomic_sub_float32)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
 
@@ -199,77 +197,71 @@ def test_atomic_sub_local_float32(self):
 
         self.assertTrue(ary[0] == 0)
 
-
     def test_atomic_add_local_int64(self):
         ary = np.array([0], dtype=np.int64)
 
-        #dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64)
+        # dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64)
         dppy_atomic_add = dppy.kernel(atomic_add_int64)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             # TODO: dpctl needs to expose this functions
-            #if device_env.device_support_int64_atomics():
+            # if device_env.device_support_int64_atomics():
             dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary)
             self.assertTrue(ary[0] == 32)
-            #else:
+            # else:
             #    return
 
-
     def test_atomic_sub_local_int64(self):
         ary = np.array([32], dtype=np.int64)
 
-        #fn = dppy.kernel('void(int64[:])')(atomic_sub_int64)
+        # fn = dppy.kernel('void(int64[:])')(atomic_sub_int64)
         fn = dppy.kernel(atomic_sub_int64)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             # TODO: dpctl needs to expose this functions
-            #if device_env.device_support_int64_atomics():
+            # if device_env.device_support_int64_atomics():
             fn[32, dppy.DEFAULT_LOCAL_SIZE](ary)
             self.assertTrue(ary[0] == 0)
-            #else:
+            # else:
             #    return
 
-
     def test_atomic_add_local_float64(self):
         ary = np.array([0], dtype=np.double)
 
-        #fn = dppy.kernel('void(float64[:])')(atomic_add_float64)
+        # fn = dppy.kernel('void(float64[:])')(atomic_add_float64)
         fn = dppy.kernel(atomic_add_float64)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             # TODO: dpctl needs to expose this functions
-            #if device_env.device_support_float64_atomics():
+            # if device_env.device_support_float64_atomics():
             fn[32, dppy.DEFAULT_LOCAL_SIZE](ary)
             self.assertTrue(ary[0] == 32)
-            #else:
+            # else:
             #    return
 
-
     def test_atomic_sub_local_float64(self):
         ary = np.array([32], dtype=np.double)
 
-        #fn = dppy.kernel('void(float64[:])')(atomic_sub_int64)
+        # fn = dppy.kernel('void(float64[:])')(atomic_sub_int64)
         fn = dppy.kernel(atomic_sub_int64)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             # TODO: dpctl needs to expose this functions
-            #if device_env.device_support_float64_atomics():
+            # if device_env.device_support_float64_atomics():
             fn[32, dppy.DEFAULT_LOCAL_SIZE](ary)
             self.assertTrue(ary[0] == 0)
-            #else:
+            # else:
             #    return
 
-
     def test_atomic_add2(self):
         ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
         orig = ary.copy()
-        #dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2)
+        # dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2)
         dppy_atomic_add2 = dppy.kernel(atomic_add2)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             dppy_atomic_add2[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary)
         self.assertTrue(np.all(ary == orig + 1))
 
-
     def test_atomic_add3(self):
         ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8)
         orig = ary.copy()
-        #dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3)
+        # dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3)
         dppy_atomic_add3 = dppy.kernel(atomic_add3)
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             dppy_atomic_add3[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary)
@@ -277,5 +269,5 @@ def test_atomic_add3(self):
         self.assertTrue(np.all(ary == orig + 1))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_barrier.py b/numba_dppy/tests/test_barrier.py
index 7cedc18f13..6f30e06f2b 100644
--- a/numba_dppy/tests/test_barrier.py
+++ b/numba_dppy/tests/test_barrier.py
@@ -6,10 +6,10 @@
 import dpctl
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestBarrier(unittest.TestCase):
     def test_proper_lowering(self):
-        #@dppy.kernel("void(float32[::1])")
+        # @dppy.kernel("void(float32[::1])")
         @dppy.kernel
         def twice(A):
             i = dppy.get_global_id(0)
@@ -22,13 +22,13 @@ def twice(A):
         orig = arr.copy()
 
         with dpctl.device_context("opencl:gpu") as gpu_queue:
-            twice[N, N//2](arr)
+            twice[N, N // 2](arr)
 
         # The computation is correct?
         np.testing.assert_allclose(orig * 2, arr)
 
     def test_no_arg_barrier_support(self):
-        #@dppy.kernel("void(float32[::1])")
+        # @dppy.kernel("void(float32[::1])")
         @dppy.kernel
         def twice(A):
             i = dppy.get_global_id(0)
@@ -47,11 +47,10 @@ def twice(A):
         # The computation is correct?
         np.testing.assert_allclose(orig * 2, arr)
 
-
     def test_local_memory(self):
         blocksize = 10
 
-        #@dppy.kernel("void(float32[::1])")
+        # @dppy.kernel("void(float32[::1])")
         @dppy.kernel
         def reverse_array(A):
             lm = dppy.local.static_alloc(shape=10, dtype=float32)
@@ -74,5 +73,5 @@ def reverse_array(A):
         np.testing.assert_allclose(expected, arr)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py
index 7baecbeda5..fca2fe69f3 100644
--- a/numba_dppy/tests/test_black_scholes.py
+++ b/numba_dppy/tests/test_black_scholes.py
@@ -20,12 +20,17 @@
 
 def cnd(d):
     K = 1.0 / (1.0 + 0.2316419 * np.abs(d))
-    ret_val = (RSQRT2PI * np.exp(-0.5 * d * d) *
-               (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
+    ret_val = (
+        RSQRT2PI
+        * np.exp(-0.5 * d * d)
+        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
+    )
     return np.where(d > 0, 1.0 - ret_val, ret_val)
 
-def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears,
-                  Riskfree, Volatility):
+
+def black_scholes(
+    callResult, putResult, stockPrice, optionStrike, optionYears, Riskfree, Volatility
+):
     S = stockPrice
     X = optionStrike
     T = optionYears
@@ -37,15 +42,16 @@ def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears,
     cndd1 = cnd(d1)
     cndd2 = cnd(d2)
 
-    expRT = np.exp(- R * T)
-    callResult[:] = (S * cndd1 - X * expRT * cndd2)
-    putResult[:] = (X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1))
+    expRT = np.exp(-R * T)
+    callResult[:] = S * cndd1 - X * expRT * cndd2
+    putResult[:] = X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1)
+
 
 def randfloat(rand_var, low, high):
     return (1.0 - rand_var) * low + rand_var * high
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYBlackScholes(unittest.TestCase):
     def test_black_scholes(self):
         OPT_N = 400
@@ -63,9 +69,15 @@ def test_black_scholes(self):
 
         # numpy
         for i in range(iterations):
-            black_scholes(callResultNumpy, putResultNumpy, stockPrice,
-                          optionStrike, optionYears, RISKFREE, VOLATILITY)
-
+            black_scholes(
+                callResultNumpy,
+                putResultNumpy,
+                stockPrice,
+                optionStrike,
+                optionYears,
+                RISKFREE,
+                VOLATILITY,
+            )
 
         @dppy.kernel
         def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
@@ -77,20 +89,26 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
             d2 = d1 - V * sqrtT
 
             K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1))
-            cndd1 = (RSQRT2PI * math.exp(-0.5 * d1 * d1) *
-                    (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
+            cndd1 = (
+                RSQRT2PI
+                * math.exp(-0.5 * d1 * d1)
+                * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
+            )
             if d1 > 0:
                 cndd1 = 1.0 - cndd1
 
             K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2))
-            cndd2 = (RSQRT2PI * math.exp(-0.5 * d2 * d2) *
-                    (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))))
+            cndd2 = (
+                RSQRT2PI
+                * math.exp(-0.5 * d2 * d2)
+                * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
+            )
             if d2 > 0:
                 cndd2 = 1.0 - cndd2
 
-            expRT = math.exp((-1. * R) * T[i])
-            callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2)
-            putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1))
+            expRT = math.exp((-1.0 * R) * T[i])
+            callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2
+            putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)
 
         # numbapro
         time0 = time.time()
@@ -101,11 +119,16 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
             time1 = time.time()
             for i in range(iterations):
                 black_scholes_dppy[blockdim, griddim](
-                    callResultNumbapro, putResultNumbapro, stockPrice, optionStrike,
-                    optionYears, RISKFREE, VOLATILITY)
+                    callResultNumbapro,
+                    putResultNumbapro,
+                    stockPrice,
+                    optionStrike,
+                    optionYears,
+                    RISKFREE,
+                    VOLATILITY,
+                )
 
-
-        dt = (time1 - time0)
+        dt = time1 - time0
 
         delta = np.abs(callResultNumpy - callResultNumbapro)
         L1norm = delta.sum() / np.abs(callResultNumpy).sum()
@@ -114,5 +137,6 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V):
         self.assertTrue(L1norm < 1e-13)
         self.assertTrue(max_abs_err < 1e-13)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_caching.py b/numba_dppy/tests/test_caching.py
index 268401ce98..561c1dda87 100644
--- a/numba_dppy/tests/test_caching.py
+++ b/numba_dppy/tests/test_caching.py
@@ -19,15 +19,18 @@ def test_caching_kernel(self):
         b = np.array(np.random.random(N), dtype=np.float32)
         c = np.ones_like(a)
 
-
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             func = dppy.kernel(data_parallel_sum)
-            caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c)
+            caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
+                a, b, c
+            )
 
             for i in range(10):
-                cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c)
+                cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
+                    a, b, c
+                )
                 self.assertIs(caching_kernel, cached_kernel)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_controllable_fallback.py b/numba_dppy/tests/test_controllable_fallback.py
index 357f0b5e20..6a722d2d72 100644
--- a/numba_dppy/tests/test_controllable_fallback.py
+++ b/numba_dppy/tests/test_controllable_fallback.py
@@ -7,7 +7,7 @@
 import dpctl
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYFallback(unittest.TestCase):
     def test_dppy_fallback_true(self):
         @numba.jit
@@ -33,7 +33,9 @@ def inner_call_fallback():
         numba_dppy.compiler.DEBUG = 0
 
         np.testing.assert_array_equal(dppy_fallback_true, ref_result)
-        self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue())
+        self.assertTrue(
+            "Failed to lower parfor on DPPY-device" in msg_fallback_true.getvalue()
+        )
 
     @unittest.expectedFailure
     def test_dppy_fallback_false(self):
@@ -52,7 +54,7 @@ def inner_call_fallback():
 
         try:
             numba_dppy.compiler.DEBUG = 1
-            numba_dppy.config.FALLBACK_ON_CPU  = 0
+            numba_dppy.config.FALLBACK_ON_CPU = 0
             with captured_stderr() as msg_fallback_true:
                 with dpctl.device_context("opencl:gpu") as gpu_queue:
                     dppy = numba.njit(parallel=True)(inner_call_fallback)
@@ -60,12 +62,14 @@ def inner_call_fallback():
 
         finally:
             ref_result = inner_call_fallback()
-            numba_dppy.config.FALLBACK_ON_CPU  = 1
+            numba_dppy.config.FALLBACK_ON_CPU = 1
             numba_dppy.compiler.DEBUG = 0
 
             not np.testing.assert_array_equal(dppy_fallback_false, ref_result)
-            not self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue())
+            not self.assertTrue(
+                "Failed to lower parfor on DPPY-device" in msg_fallback_true.getvalue()
+            )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py
index eb47cd28bc..06a49dd5ed 100644
--- a/numba_dppy/tests/test_device_array_args.py
+++ b/numba_dppy/tests/test_device_array_args.py
@@ -5,6 +5,7 @@
 import dpctl
 import unittest
 
+
 @dppy.kernel
 def data_parallel_sum(a, b, c):
     i = dppy.get_global_id(0)
@@ -19,7 +20,7 @@ def data_parallel_sum(a, b, c):
 d = a + b
 
 
-@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system')
+@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system")
 class TestDPPYDeviceArrayArgsGPU(unittest.TestCase):
     def test_device_array_args_cpu(self):
         c = np.ones_like(a)
@@ -30,7 +31,7 @@ def test_device_array_args_cpu(self):
             self.assertTrue(np.all(c == d))
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYDeviceArrayArgsCPU(unittest.TestCase):
     def test_device_array_args_gpu(self):
         c = np.ones_like(a)
@@ -41,5 +42,5 @@ def test_device_array_args_gpu(self):
         self.assertTrue(np.all(c == d))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py
index 59ddd16f65..29d31bbb2b 100644
--- a/numba_dppy/tests/test_dpctl_api.py
+++ b/numba_dppy/tests/test_dpctl_api.py
@@ -2,7 +2,7 @@
 import dpctl
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPCTLAPI(unittest.TestCase):
     def test_dpctl_api(self):
         with dpctl.device_context("opencl:gpu") as gpu_queue:
@@ -16,5 +16,5 @@ def test_dpctl_api(self):
             dpctl.is_in_device_context()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py
index 166937c275..75f22d1274 100644
--- a/numba_dppy/tests/test_dpnp_functions.py
+++ b/numba_dppy/tests/test_dpnp_functions.py
@@ -13,12 +13,16 @@
 
 import dpctl
 
-def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None):
+
+def test_for_different_datatypes(
+    fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None
+):
     if arg_count == 1:
         for ty in tys:
             if matrix and matrix[0]:
-                a = np.array(np.random.random(
-                    dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1])
+                a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape(
+                    dims[0], dims[1]
+                )
             else:
                 a = np.array(np.random.random(dims[0]), dtype=ty)
 
@@ -36,13 +40,15 @@ def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False
     elif arg_count == 2:
         for ty in tys:
             if matrix and matrix[0]:
-                a = np.array(np.random.random(
-                    dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1])
+                a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape(
+                    dims[0], dims[1]
+                )
             else:
                 a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty)
             if matrix and matrix[1]:
-                b = np.array(np.random.random(
-                    dims[2] * dims[3]), dtype=ty).reshape(dims[2], dims[3])
+                b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty).reshape(
+                    dims[2], dims[3]
+                )
             else:
                 b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty)
 
@@ -100,9 +106,10 @@ def vvsort(val, vec, size):
             vec[k, imax] = temp
 
 
-@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available')
+@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available")
 class Testdpnp_linalg_functions(unittest.TestCase):
     tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double]
+
     def test_eig(self):
         @njit
         def f(a):
@@ -111,7 +118,11 @@ def f(a):
         size = 3
         for ty in self.tys:
             a = np.arange(size * size, dtype=ty).reshape((size, size))
-            symm_a = np.tril(a) + np.tril(a, -1).T + np.diag(np.full((size,), size * size, dtype=ty))
+            symm_a = (
+                np.tril(a)
+                + np.tril(a, -1).T
+                + np.diag(np.full((size,), size * size, dtype=ty))
+            )
 
             with dpctl.device_context("opencl:gpu"):
                 got_val, got_vec = f(symm_a)
@@ -122,8 +133,7 @@ def f(a):
             vvsort(got_val, got_vec, size)
             vvsort(np_val, np_vec, size)
 
-
-	    # NP change sign of vectors
+            # NP change sign of vectors
             for i in range(np_vec.shape[1]):
                 if np_vec[0, i] * got_vec[0, i] < 0:
                     np_vec[:, i] = -np_vec[:, i]
@@ -132,9 +142,10 @@ def f(a):
             self.assertTrue(np.allclose(got_vec, np_vec))
 
 
-@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available')
+@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available")
 class Testdpnp_ndarray_functions(unittest.TestCase):
     tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double]
+
     def test_ndarray_sum(self):
         @njit
         def f(a):
@@ -225,7 +236,6 @@ def f(a):
 
             self.assertTrue(expected == got)
 
-
     def test_ndarray_argmin(self):
         @njit
         def f(a):
@@ -257,7 +267,9 @@ def f(a):
             self.assertTrue(np.array_equal(expected, got))
 
 
-@unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available')
+@unittest.skipUnless(
+    ensure_dpnp() and dpctl.has_gpu_queues(), "test only when dpNP and GPU is available"
+)
 class Testdpnp_functions(unittest.TestCase):
     N = 10
 
@@ -271,8 +283,7 @@ def f(a):
             c = np.sum(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.sum, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.sum, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.sum, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.sum, [10, 2, 3], self.tys))
 
@@ -282,8 +293,7 @@ def f(a):
             c = np.prod(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.prod, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.prod, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.prod, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.prod, [10, 2, 3], self.tys))
 
@@ -293,11 +303,9 @@ def f(a):
             c = np.argmax(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.argmax, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.argmax, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2], self.tys))
-        self.assertTrue(test_for_dimensions(
-            f, np.argmax, [10, 2, 3], self.tys))
+        self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2, 3], self.tys))
 
     def test_max(self):
         @njit
@@ -305,8 +313,7 @@ def f(a):
             c = np.max(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.max, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.max, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.max, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.max, [10, 2, 3], self.tys))
 
@@ -316,23 +323,19 @@ def f(a):
             c = np.amax(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.amax, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.amax, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.amax, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.amax, [10, 2, 3], self.tys))
 
-
     def test_argmin(self):
         @njit
         def f(a):
             c = np.argmin(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.argmin, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.argmin, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2], self.tys))
-        self.assertTrue(test_for_dimensions(
-            f, np.argmin, [10, 2, 3], self.tys))
+        self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2, 3], self.tys))
 
     def test_min(self):
         @njit
@@ -340,8 +343,7 @@ def f(a):
             c = np.min(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.min, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys))
 
@@ -351,8 +353,7 @@ def f(a):
             c = np.amin(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.min, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys))
 
@@ -362,8 +363,9 @@ def f(a):
             c = np.argsort(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.argmin, [10], 1, self.tys, np_all=True))
+        self.assertTrue(
+            test_for_different_datatypes(f, np.argmin, [10], 1, self.tys, np_all=True)
+        )
 
     def test_median(self):
         @njit
@@ -371,11 +373,9 @@ def f(a):
             c = np.median(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.median, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.median, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.median, [10, 2], self.tys))
-        self.assertTrue(test_for_dimensions(
-            f, np.median, [10, 2, 3], self.tys))
+        self.assertTrue(test_for_dimensions(f, np.median, [10, 2, 3], self.tys))
 
     def test_mean(self):
         @njit
@@ -383,8 +383,7 @@ def f(a):
             c = np.mean(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.mean, [10], 1, self.tys))
+        self.assertTrue(test_for_different_datatypes(f, np.mean, [10], 1, self.tys))
         self.assertTrue(test_for_dimensions(f, np.mean, [10, 2], self.tys))
         self.assertTrue(test_for_dimensions(f, np.mean, [10, 2, 3], self.tys))
 
@@ -394,8 +393,17 @@ def f(a, b):
             c = np.matmul(a, b)
             return c
 
-        self.assertTrue(test_for_different_datatypes(f, np.matmul, [10, 5, 5, 10], 2, [
-                        np.float, np.double], np_all=True, matrix=[True, True]))
+        self.assertTrue(
+            test_for_different_datatypes(
+                f,
+                np.matmul,
+                [10, 5, 5, 10],
+                2,
+                [np.float, np.double],
+                np_all=True,
+                matrix=[True, True],
+            )
+        )
 
     def test_dot(self):
         @njit
@@ -403,14 +411,44 @@ def f(a, b):
             c = np.dot(a, b)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double]))
-        self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 2], 2, [
-                        np.float, np.double], matrix=[False, True], np_all=True))
-        self.assertTrue(test_for_different_datatypes(f, np.dot, [2, 10, 10, 1], 2, [
-                        np.float, np.double], matrix=[True, False], np_all=True))
-        self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 2, 2, 10], 2, [
-                        np.float, np.double], matrix=[True, True], np_all=True))
+        self.assertTrue(
+            test_for_different_datatypes(
+                f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double]
+            )
+        )
+        self.assertTrue(
+            test_for_different_datatypes(
+                f,
+                np.dot,
+                [10, 1, 10, 2],
+                2,
+                [np.float, np.double],
+                matrix=[False, True],
+                np_all=True,
+            )
+        )
+        self.assertTrue(
+            test_for_different_datatypes(
+                f,
+                np.dot,
+                [2, 10, 10, 1],
+                2,
+                [np.float, np.double],
+                matrix=[True, False],
+                np_all=True,
+            )
+        )
+        self.assertTrue(
+            test_for_different_datatypes(
+                f,
+                np.dot,
+                [10, 2, 2, 10],
+                2,
+                [np.float, np.double],
+                matrix=[True, True],
+                np_all=True,
+            )
+        )
 
     def test_cov(self):
         @njit
@@ -418,23 +456,26 @@ def f(a):
             c = np.cov(a)
             return c
 
-        self.assertTrue(test_for_different_datatypes(
-            f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True))
+        self.assertTrue(
+            test_for_different_datatypes(
+                f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True
+            )
+        )
 
     def test_dpnp_interacting_with_parfor(self):
         @njit
         def f(a, b):
             c = np.sum(a)
             e = np.add(b, a)
-            #d = a + 1
+            # d = a + 1
             return 0
 
         result = f(self.a, self.b)
-        #np_result = np.add((self.a + np.sum(self.a)), self.b)
+        # np_result = np.add((self.a + np.sum(self.a)), self.b)
 
-        #max_abs_err = result.sum() - np_result.sum()
-        #self.assertTrue(max_abs_err < 1e-4)
+        # max_abs_err = result.sum() - np_result.sum()
+        # self.assertTrue(max_abs_err < 1e-4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_dppy_fallback.py b/numba_dppy/tests/test_dppy_fallback.py
index dd05bbdc84..3ebad7aed9 100644
--- a/numba_dppy/tests/test_dppy_fallback.py
+++ b/numba_dppy/tests/test_dppy_fallback.py
@@ -6,7 +6,7 @@
 import dpctl
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYFallback(unittest.TestCase):
     def test_dppy_fallback_inner_call(self):
         @numba.jit
@@ -29,8 +29,7 @@ def inner_call_fallback():
         ref_result = inner_call_fallback()
 
         np.testing.assert_array_equal(dppy_result, ref_result)
-        self.assertTrue(
-            'Failed to lower parfor on DPPY-device' in msg.getvalue())
+        self.assertTrue("Failed to lower parfor on DPPY-device" in msg.getvalue())
 
     def test_dppy_fallback_reductions(self):
         def reduction(a):
@@ -47,9 +46,8 @@ def reduction(a):
         ref_result = reduction(a)
 
         np.testing.assert_array_equal(dppy_result, ref_result)
-        self.assertTrue(
-            'Failed to lower parfor on DPPY-device' in msg.getvalue())
+        self.assertTrue("Failed to lower parfor on DPPY-device" in msg.getvalue())
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_dppy_func.py b/numba_dppy/tests/test_dppy_func.py
index 729030e153..69ff82b38a 100644
--- a/numba_dppy/tests/test_dppy_func.py
+++ b/numba_dppy/tests/test_dppy_func.py
@@ -5,7 +5,7 @@
 import dpctl
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYFunc(unittest.TestCase):
     N = 257
 
@@ -25,7 +25,6 @@ def f(a, b):
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)
 
-
         self.assertTrue(np.all(b == 2))
 
     def test_dppy_func_ndarray(self):
@@ -56,5 +55,5 @@ def h(a, b):
             self.assertTrue(np.all(b == 3))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_math_functions.py b/numba_dppy/tests/test_math_functions.py
index 6336c63759..e09202c6a3 100644
--- a/numba_dppy/tests/test_math_functions.py
+++ b/numba_dppy/tests/test_math_functions.py
@@ -5,46 +5,55 @@
 import unittest
 import math
 
+
 @dppy.kernel
-def dppy_fabs(a,b):
+def dppy_fabs(a, b):
     i = dppy.get_global_id(0)
     b[i] = math.fabs(a[i])
 
+
 @dppy.kernel
-def dppy_exp(a,b):
+def dppy_exp(a, b):
     i = dppy.get_global_id(0)
     b[i] = math.exp(a[i])
 
+
 @dppy.kernel
-def dppy_log(a,b):
+def dppy_log(a, b):
     i = dppy.get_global_id(0)
     b[i] = math.log(a[i])
 
+
 @dppy.kernel
-def dppy_sqrt(a,b):
+def dppy_sqrt(a, b):
     i = dppy.get_global_id(0)
     b[i] = math.sqrt(a[i])
 
+
 @dppy.kernel
-def dppy_sin(a,b):
+def dppy_sin(a, b):
     i = dppy.get_global_id(0)
     b[i] = math.sin(a[i])
 
+
 @dppy.kernel
-def dppy_cos(a,b):
+def dppy_cos(a, b):
     i = dppy.get_global_id(0)
     b[i] = math.cos(a[i])
 
+
 @dppy.kernel
-def dppy_tan(a,b):
+def dppy_tan(a, b):
     i = dppy.get_global_id(0)
     b[i] = math.tan(a[i])
 
+
 global_size = 10
 N = global_size
 
 a = np.array(np.random.random(N), dtype=np.float32)
 
+
 def driver(a, jitfunc):
     b = np.ones_like(a)
     # Device buffers
@@ -67,7 +76,7 @@ def test_driver(input_arr, device_ty, jitfunc):
     return out_actual
 
 
-@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system')
+@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system")
 class TestDPPYMathFunctionsCPU(unittest.TestCase):
     def test_fabs_cpu(self):
         b_actual = test_driver(a, "CPU", dppy_fabs)
@@ -77,30 +86,30 @@ def test_fabs_cpu(self):
     def test_sin_cpu(self):
         b_actual = test_driver(a, "CPU", dppy_sin)
         b_expected = np.sin(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_cos_cpu(self):
         b_actual = test_driver(a, "CPU", dppy_cos)
         b_expected = np.cos(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_exp_cpu(self):
         b_actual = test_driver(a, "CPU", dppy_exp)
         b_expected = np.exp(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_sqrt_cpu(self):
         b_actual = test_driver(a, "CPU", dppy_sqrt)
         b_expected = np.sqrt(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_log_cpu(self):
         b_actual = test_driver(a, "CPU", dppy_log)
         b_expected = np.log(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYMathFunctionsGPU(unittest.TestCase):
     def test_fabs_gpu(self):
         b_actual = test_driver(a, "GPU", dppy_fabs)
@@ -110,28 +119,28 @@ def test_fabs_gpu(self):
     def test_sin_gpu(self):
         b_actual = test_driver(a, "GPU", dppy_sin)
         b_expected = np.sin(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_cos_gpu(self):
         b_actual = test_driver(a, "GPU", dppy_cos)
         b_expected = np.cos(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_exp_gpu(self):
         b_actual = test_driver(a, "GPU", dppy_exp)
         b_expected = np.exp(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_sqrt_gpu(self):
         b_actual = test_driver(a, "GPU", dppy_sqrt)
         b_expected = np.sqrt(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
     def test_log_gpu(self):
         b_actual = test_driver(a, "GPU", dppy_log)
         b_expected = np.log(a)
-        self.assertTrue(np.allclose(b_actual,b_expected))
+        self.assertTrue(np.allclose(b_actual, b_expected))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
index 21a8fc8444..c6ffd9433f 100644
--- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
+++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py
@@ -5,7 +5,7 @@
 import unittest
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestNumpy_bit_twiddling_functions(unittest.TestCase):
     def test_bitwise_and(self):
         @njit
@@ -111,5 +111,5 @@ def f(a, b):
         self.assertTrue(np.all(c == d))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py
index 9d56e94374..b4f47d96b6 100644
--- a/numba_dppy/tests/test_numpy_comparison_functions.py
+++ b/numba_dppy/tests/test_numpy_comparison_functions.py
@@ -5,7 +5,7 @@
 import unittest
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestNumpy_comparison_functions(unittest.TestCase):
     a = np.array([4, 5, 6])
     b = np.array([2, 6, 6])
@@ -202,5 +202,5 @@ def f(a, b):
         np.testing.assert_equal(c, d)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py
index 8df7e2b5d4..6e746c4ed7 100644
--- a/numba_dppy/tests/test_numpy_floating_functions.py
+++ b/numba_dppy/tests/test_numpy_floating_functions.py
@@ -4,7 +4,7 @@
 import unittest
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestNumpy_floating_functions(unittest.TestCase):
     def test_isfinite(self):
         @njit
@@ -12,7 +12,7 @@ def f(a):
             c = np.isfinite(a)
             return c
 
-        test_arr = [np.log(-1.), 1., np.log(0)]
+        test_arr = [np.log(-1.0), 1.0, np.log(0)]
         input_arr = np.asarray(test_arr, dtype=np.float32)
 
         with dpctl.device_context("opencl:gpu"):
@@ -27,7 +27,7 @@ def f(a):
             c = np.isinf(a)
             return c
 
-        test_arr = [np.log(-1.), 1., np.log(0)]
+        test_arr = [np.log(-1.0), 1.0, np.log(0)]
         input_arr = np.asarray(test_arr, dtype=np.float32)
 
         with dpctl.device_context("opencl:gpu"):
@@ -42,7 +42,7 @@ def f(a):
             c = np.isnan(a)
             return c
 
-        test_arr = [np.log(-1.), 1., np.log(0)]
+        test_arr = [np.log(-1.0), 1.0, np.log(0)]
         input_arr = np.asarray(test_arr, dtype=np.float32)
 
         with dpctl.device_context("opencl:gpu"):
@@ -94,5 +94,5 @@ def f(a):
         self.assertTrue(np.all(c == d))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py
index ef5dc235b8..4a701495f9 100644
--- a/numba_dppy/tests/test_numpy_math_functions.py
+++ b/numba_dppy/tests/test_numpy_math_functions.py
@@ -5,7 +5,8 @@
 import unittest
 from . import skip_tests
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestNumpy_math_functions(unittest.TestCase):
     N = 10
     a = np.array(np.random.random(N), dtype=np.float32)
@@ -138,7 +139,7 @@ def f(a, b):
         with dpctl.device_context("opencl:gpu"):
             c = f(input_arr, divisor)
 
-        self.assertTrue(np.all(c == 1.))
+        self.assertTrue(np.all(c == 1.0))
 
     def test_abs(self):
         @njit
@@ -191,7 +192,7 @@ def f(a):
         with dpctl.device_context("opencl:gpu"):
             c = f(input_arr)
 
-        self.assertTrue(np.all(c == -1.))
+        self.assertTrue(np.all(c == -1.0))
 
     def test_conj(self):
         @njit
@@ -322,7 +323,7 @@ def f(a):
         with dpctl.device_context("opencl:gpu"):
             c = f(input_arr)
 
-        self.assertTrue(np.all(c == 1/input_arr))
+        self.assertTrue(np.all(c == 1 / input_arr))
 
     def test_conjugate(self):
         @njit
@@ -339,5 +340,5 @@ def f(a):
         self.assertTrue(np.all(c == d))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py
index 812f3d060c..a67862032b 100644
--- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py
+++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py
@@ -6,7 +6,7 @@
 from . import skip_tests
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestNumpy_math_functions(unittest.TestCase):
 
     N = 10
@@ -238,5 +238,5 @@ def f(a):
         self.assertTrue(max_abs_err < 1e-5)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_offload_diagnostics.py b/numba_dppy/tests/test_offload_diagnostics.py
index 6b41252fc6..9faebed088 100644
--- a/numba_dppy/tests/test_offload_diagnostics.py
+++ b/numba_dppy/tests/test_offload_diagnostics.py
@@ -16,7 +16,7 @@ def prange_func():
             a = np.ones((n), dtype=np.float64)
             b = np.ones((n), dtype=np.float64)
             c = np.ones((n), dtype=np.float64)
-            for i in prange(n//2):
+            for i in prange(n // 2):
                 a[i] = b[i] + c[i]
 
             return a
@@ -56,5 +56,5 @@ def parallel_sum(a, b, c):
             self.assertTrue("Device -" in got.getvalue())
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py
index 17f1456bb6..0e578ce154 100644
--- a/numba_dppy/tests/test_parfor_lower_message.py
+++ b/numba_dppy/tests/test_parfor_lower_message.py
@@ -13,7 +13,7 @@ def prange_example():
     a = np.ones((n), dtype=np.float64)
     b = np.ones((n), dtype=np.float64)
     c = np.ones((n), dtype=np.float64)
-    for i in prange(n//2):
+    for i in prange(n // 2):
         a[i] = b[i] + c[i]
 
     return a
@@ -33,5 +33,5 @@ def test_parfor_message(self):
             self.assertTrue("Parfor lowered on DPPY-device" in got.getvalue())
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py
index eda9ccebbc..8f5305198a 100644
--- a/numba_dppy/tests/test_prange.py
+++ b/numba_dppy/tests/test_prange.py
@@ -96,14 +96,14 @@ def f(a, b):
 
         self.assertTrue(np.all(b == 12))
 
-    @unittest.skip('numba-dppy issue 110')
+    @unittest.skip("numba-dppy issue 110")
     def test_two_consequent_prange(self):
         def prange_example():
             n = 10
             a = np.ones((n), dtype=np.float64)
             b = np.ones((n), dtype=np.float64)
             c = np.ones((n), dtype=np.float64)
-            for i in prange(n//2):
+            for i in prange(n // 2):
                 a[i] = b[i] + c[i]
 
             return a
@@ -120,20 +120,26 @@ def prange_example():
 
         numba_dppy.compiler.DEBUG = old_debug
 
-        self.assertEqual(stdout.getvalue().count(
-            'Parfor lowered on DPPY-device'), 2, stdout.getvalue())
-        self.assertEqual(stdout.getvalue().count(
-            'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue())
+        self.assertEqual(
+            stdout.getvalue().count("Parfor lowered on DPPY-device"),
+            2,
+            stdout.getvalue(),
+        )
+        self.assertEqual(
+            stdout.getvalue().count("Failed to lower parfor on DPPY-device"),
+            0,
+            stdout.getvalue(),
+        )
         np.testing.assert_equal(res, jitted_res)
 
-    @unittest.skip('NRT required but not enabled')
+    @unittest.skip("NRT required but not enabled")
     def test_2d_arrays(self):
         def prange_example():
             n = 10
             a = np.ones((n, n), dtype=np.float64)
             b = np.ones((n, n), dtype=np.float64)
             c = np.ones((n, n), dtype=np.float64)
-            for i in prange(n//2):
+            for i in prange(n // 2):
                 a[i] = b[i] + c[i]
 
             return a
@@ -150,12 +156,18 @@ def prange_example():
 
         numba_dppy.compiler.DEBUG = old_debug
 
-        self.assertEqual(stdout.getvalue().count(
-            'Parfor lowered on DPPY-device'), 2, stdout.getvalue())
-        self.assertEqual(stdout.getvalue().count(
-            'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue())
+        self.assertEqual(
+            stdout.getvalue().count("Parfor lowered on DPPY-device"),
+            2,
+            stdout.getvalue(),
+        )
+        self.assertEqual(
+            stdout.getvalue().count("Failed to lower parfor on DPPY-device"),
+            0,
+            stdout.getvalue(),
+        )
         np.testing.assert_equal(res, jitted_res)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_print.py b/numba_dppy/tests/test_print.py
index af19658048..8beca0a83f 100644
--- a/numba_dppy/tests/test_print.py
+++ b/numba_dppy/tests/test_print.py
@@ -7,7 +7,7 @@
 import dpctl
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestPrint(unittest.TestCase):
     def test_print_dppy_kernel(self):
         @dppy.func
@@ -30,5 +30,5 @@ def f(a, b):
             f[N, dppy.DEFAULT_LOCAL_SIZE](a, b)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py
index cfeff09b8d..7ef237fcd0 100644
--- a/numba_dppy/tests/test_rename_numpy_function_pass.py
+++ b/numba_dppy/tests/test_rename_numpy_function_pass.py
@@ -7,10 +7,12 @@
 from numba_dppy.testing import ensure_dpnp
 
 
-from numba.core import (compiler, typing, cpu)
-from numba_dppy.rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions,
-        DPPYRewriteNdarrayFunctions)
-from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes)
+from numba.core import compiler, typing, cpu
+from numba_dppy.rename_numpy_functions_pass import (
+    DPPYRewriteOverloadedNumPyFunctions,
+    DPPYRewriteNdarrayFunctions,
+)
+from numba.core.typed_passes import NopythonTypeInference, AnnotateTypes
 
 
 class MyPipeline(object):
@@ -46,8 +48,10 @@ def check_equivalent(expected_ir, got_ir):
         else:
             if isinstance(expected_stmt, numba.core.ir.Assign):
                 if isinstance(expected_stmt.value, numba.core.ir.Global):
-                    if (expected_stmt.value.name != got_stmt.value.name and
-                        expected_stmt.value.name != "numba_dppy"):
+                    if (
+                        expected_stmt.value.name != got_stmt.value.name
+                        and expected_stmt.value.name != "numba_dppy"
+                    ):
                         return False
                 elif isinstance(expected_stmt.value, numba.core.ir.Expr):
                     # should get "dpnp" and "sum" as attr
@@ -76,7 +80,7 @@ def got(a):
         self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir))
 
 
-@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available')
+@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available")
 class TestRenameNdarrayFunctionsPass(unittest.TestCase):
     def test_rename_ndarray(self):
         def expected(a):
diff --git a/numba_dppy/tests/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py
index 37ca38a12a..c2001e13a6 100644
--- a/numba_dppy/tests/test_sum_reduction.py
+++ b/numba_dppy/tests/test_sum_reduction.py
@@ -4,34 +4,37 @@
 import unittest
 import dpctl
 
+
 @dppy.kernel
 def reduction_kernel(A, R, stride):
     i = dppy.get_global_id(0)
     # sum two element
-    R[i] = A[i] + A[i+stride]
+    R[i] = A[i] + A[i + stride]
     # store the sum to be used in nex iteration
     A[i] = R[i]
 
 
-@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system')
+@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestDPPYSumReduction(unittest.TestCase):
     def test_sum_reduction(self):
         # This test will only work for even case
         N = 1024
-        self.assertTrue(N%2 == 0)
+        self.assertTrue(N % 2 == 0)
 
         A = np.array(np.random.random(N), dtype=np.float32)
         A_copy = A.copy()
         # at max we will require half the size of A to store sum
-        R = np.array(np.random.random(math.ceil(N/2)), dtype=np.float32)
+        R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32)
 
         with dpctl.device_context("opencl:gpu") as gpu_queue:
             total = N
 
-            while (total > 1):
+            while total > 1:
                 # call kernel
                 global_size = total // 2
-                reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size)
+                reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](
+                    A, R, global_size
+                )
                 total = total // 2
 
             result = A_copy.sum()
@@ -39,5 +42,5 @@ def test_sum_reduction(self):
             self.assertTrue(max_abs_err < 1e-4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py
index 5b3a41629c..6f93c232f6 100644
--- a/numba_dppy/tests/test_vectorize.py
+++ b/numba_dppy/tests/test_vectorize.py
@@ -8,20 +8,19 @@
 @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system")
 class TestVectorize(unittest.TestCase):
     def test_vectorize(self):
-
         @vectorize(nopython=True)
         def axy(a, x, y):
             return a * x + y
 
         @njit
         def f(a0, a1):
-            return np.cos(axy(a0, np.sin(a1) - 1., 1.))
+            return np.cos(axy(a0, np.sin(a1) - 1.0, 1.0))
 
         def f_np(a0, a1):
             sin_res = np.sin(a1)
             res = []
             for i in range(len(a0)):
-                res.append(axy(a0[i], sin_res[i] - 1., 1.))
+                res.append(axy(a0[i], sin_res[i] - 1.0, 1.0))
             return np.cos(np.array(res))
 
         A = np.random.random(10)
@@ -36,5 +35,5 @@ def f_np(a0, a1):
         self.assertTrue(max_abs_err < 1e-5)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py
index 693c155ab2..58f14952d9 100644
--- a/numba_dppy/tests/test_with_context.py
+++ b/numba_dppy/tests/test_with_context.py
@@ -8,10 +8,8 @@
 
 
 class TestWithDPPYContext(unittest.TestCase):
-
     @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available")
     def test_with_dppy_context_gpu(self):
-
         @njit
         def nested_func(a, b):
             np.sin(a, b)
@@ -33,11 +31,10 @@ def func(b):
         func(expected)
 
         np.testing.assert_array_equal(expected, got_gpu)
-        self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue())
+        self.assertTrue("Parfor lowered on DPPY-device" in got_gpu_message.getvalue())
 
     @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available")
     def test_with_dppy_context_cpu(self):
-
         @njit
         def nested_func(a, b):
             np.sin(a, b)
@@ -59,17 +56,15 @@ def func(b):
         func(expected)
 
         np.testing.assert_array_equal(expected, got_cpu)
-        self.assertTrue('Parfor lowered on DPPY-device' in got_cpu_message.getvalue())
-
+        self.assertTrue("Parfor lowered on DPPY-device" in got_cpu_message.getvalue())
 
     @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available")
     def test_with_dppy_context_target(self):
-
-        @njit(target='cpu')
+        @njit(target="cpu")
         def nested_func_target(a, b):
             np.sin(a, b)
 
-        @njit(target='gpu')
+        @njit(target="gpu")
         def func_target(b):
             a = np.ones((64), dtype=np.float64)
             nested_func_target(a, b)
@@ -84,7 +79,6 @@ def func_no_parallel(b):
             a = np.ones((64), dtype=np.float64)
             return a
 
-
         a = np.ones((64), dtype=np.float64)
         b = np.ones((64), dtype=np.float64)
 
@@ -112,5 +106,5 @@ def func_no_parallel(b):
         self.assertTrue(msg_2 in str(raises_4.exception))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/setup.py b/setup.py
index 37ad0bfc68..8c892f6fd2 100644
--- a/setup.py
+++ b/setup.py
@@ -19,13 +19,15 @@ def get_ext_modules():
     if dpnp_present:
         dpnp_lib_path = []
         dpnp_lib_path += [os.path.dirname(dpnp.__file__)]
-        ext_dpnp_glue = Extension(name='numba_dppy.dpnp_glue.dpnp_fptr_interface',
-                                  sources=['numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx'],
-                                  include_dirs=[dpnp.get_include()],
-                                  libraries=['dpnp_backend_c'],
-                                  library_dirs=dpnp_lib_path,
-                                  runtime_library_dirs=dpnp_lib_path,
-                                  language="c++")
+        ext_dpnp_glue = Extension(
+            name="numba_dppy.dpnp_glue.dpnp_fptr_interface",
+            sources=["numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx"],
+            include_dirs=[dpnp.get_include()],
+            libraries=["dpnp_backend_c"],
+            library_dirs=dpnp_lib_path,
+            runtime_library_dirs=dpnp_lib_path,
+            language="c++",
+        )
         ext_modules += [ext_dpnp_glue]
 
     if dpnp_present: