From 9669b73b17f51fe4e3eb67bfc5acaf6c7ce3008e Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 21 Dec 2020 06:28:35 -0600 Subject: [PATCH 1/5] Use the black badge in project's README.md for indicating the code style --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9d1969fe3f..3641849417 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) + # numba-dppy ## Numba + dpPy + dpCtl + dpNP = numba-dppy From 5d3b81ae2bcaad841a8b8b70471d4aed8c0c215f Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 21 Dec 2020 06:32:31 -0600 Subject: [PATCH 2/5] Add GitHub Actions for black for check code style --- .github/workflows/black.yml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 .github/workflows/black.yml diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000000..1101c3a4a0 --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,28 @@ +# This is a workflow to format Python code with black formatter + +name: black + +# Controls when the action will run. Triggers the workflow on push or pull request +# events but only for the master branch +on: + push: + branches: [main] + pull_request: + branches: [main] + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # This workflow contains a single job called "black" + black: + # The type of runner that the job will run on + runs-on: ubuntu-latest + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + # Set up a Python environment for use in actions + - uses: actions/setup-python@v2 + + # Run black code formatter + - uses: psf/black@stable From 7885f76c30eaa3b4ae6df535ac5ff0036d5e8745 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 21 Dec 2020 06:33:50 -0600 Subject: [PATCH 3/5] Ignore versioneer.py because it is 3rd party file (see pyproject.toml) --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..2b233b61c9 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[tool.black] +exclude = 'versioneer.py' From 7fa195017a799619199d1fe2f81150eb8f01735f Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 21 Dec 2020 06:35:25 -0600 Subject: [PATCH 4/5] Update CONTRIBUTING.md for all developers to use black in everyday work --- CONTRIBUTING.md | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..c3076bd5e4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,10 @@ +# Python code style + +## black + +We use [black](https://black.readthedocs.io/en/stable/) code formatter. + +- Revision: `20.8b1` or branch `stable`. +- See configuration in `pyproject.toml`. + +Run before each commit: `black .` From a8031b4a6cb292a308ae69a3d40ebe37d1eb9c00 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 21 Dec 2020 06:36:25 -0600 Subject: [PATCH 5/5] Format all Python code with black (just black .) --- numba_dppy/__init__.py | 6 +- numba_dppy/_version.py | 154 +++-- numba_dppy/codegen.py | 19 +- numba_dppy/compiler.py | 271 +++++---- numba_dppy/decorators.py | 14 +- numba_dppy/descriptor.py | 5 +- numba_dppy/device_init.py | 4 +- numba_dppy/dispatcher.py | 63 ++- numba_dppy/dpnp_glue/__init__.py | 2 +- numba_dppy/dpnp_glue/dpnp_linalgimpl.py | 38 +- .../dpnp_glue/dpnp_sort_search_countimpl.py | 10 +- numba_dppy/dpnp_glue/dpnp_statisticsimpl.py | 74 ++- .../dpnp_glue/dpnp_transcendentalsimpl.py | 4 +- numba_dppy/dpnp_glue/dpnpdecl.py | 6 +- numba_dppy/dpnp_glue/dpnpimpl.py | 14 +- numba_dppy/dpnp_glue/stubs.py | 7 +- numba_dppy/dppy_host_fn_call_gen.py | 387 ++++++++----- numba_dppy/dppy_lowerer.py | 524 ++++++++++-------- numba_dppy/dppy_offload_dispatcher.py | 49 +- numba_dppy/dppy_parfor_diagnostics.py | 59 +- numba_dppy/dppy_passbuilder.py | 97 ++-- numba_dppy/dppy_passes.py | 161 +++--- numba_dppy/dufunc_inliner.py | 93 +++- numba_dppy/examples/blacksholes_njit.py | 26 +- numba_dppy/examples/dppy_func.py | 2 +- numba_dppy/examples/dppy_with_context.py | 7 +- numba_dppy/examples/matmul.py | 14 +- numba_dppy/examples/pa_examples/test1.py | 2 +- numba_dppy/examples/pairwise_distance.py | 24 +- numba_dppy/examples/sum-hybrid.py | 6 +- numba_dppy/examples/sum.py | 2 +- numba_dppy/examples/sum2D.py | 12 +- numba_dppy/examples/sum_ndarray.py | 6 +- numba_dppy/examples/sum_reduction.py | 17 +- numba_dppy/examples/sum_reduction_ocl.py | 15 +- .../examples/sum_reduction_recursive_ocl.py | 48 +- numba_dppy/initialize.py | 18 +- numba_dppy/ocl/atomics/__init__.py | 9 +- numba_dppy/ocl/mathdecl.py | 11 +- numba_dppy/ocl/mathimpl.py | 145 ++--- numba_dppy/ocl/ocldecl.py | 26 +- numba_dppy/ocl/oclimpl.py | 207 ++++--- numba_dppy/ocl/stubs.py | 31 +- numba_dppy/printimpl.py | 5 +- numba_dppy/rename_numpy_functions_pass.py | 50 +- numba_dppy/spirv_generator.py | 80 +-- numba_dppy/target.py | 188 ++++--- numba_dppy/target_dispatcher.py | 46 +- numba_dppy/testing.py | 2 + numba_dppy/tests/__init__.py | 1 + numba_dppy/tests/skip_tests.py | 1 + numba_dppy/tests/test_arg_accessor.py | 29 +- numba_dppy/tests/test_arg_types.py | 12 +- numba_dppy/tests/test_atomic_op.py | 62 +-- numba_dppy/tests/test_barrier.py | 13 +- numba_dppy/tests/test_black_scholes.py | 70 ++- numba_dppy/tests/test_caching.py | 11 +- .../tests/test_controllable_fallback.py | 16 +- numba_dppy/tests/test_device_array_args.py | 7 +- numba_dppy/tests/test_dpctl_api.py | 4 +- numba_dppy/tests/test_dpnp_functions.py | 161 ++++-- numba_dppy/tests/test_dppy_fallback.py | 10 +- numba_dppy/tests/test_dppy_func.py | 5 +- numba_dppy/tests/test_math_functions.py | 49 +- .../test_numpy_bit_twiddling_functions.py | 4 +- .../tests/test_numpy_comparison_functions.py | 4 +- .../tests/test_numpy_floating_functions.py | 10 +- numba_dppy/tests/test_numpy_math_functions.py | 11 +- .../test_numpy_trigonomteric_functions.py | 4 +- numba_dppy/tests/test_offload_diagnostics.py | 4 +- numba_dppy/tests/test_parfor_lower_message.py | 4 +- numba_dppy/tests/test_prange.py | 38 +- numba_dppy/tests/test_print.py | 4 +- .../tests/test_rename_numpy_function_pass.py | 18 +- numba_dppy/tests/test_sum_reduction.py | 17 +- numba_dppy/tests/test_vectorize.py | 7 +- numba_dppy/tests/test_with_context.py | 16 +- setup.py | 16 +- 78 files changed, 2198 insertions(+), 1470 deletions(-) diff --git a/numba_dppy/__init__.py b/numba_dppy/__init__.py index ac4e898889..7bc3ea504a 100644 --- a/numba_dppy/__init__.py +++ b/numba_dppy/__init__.py @@ -506,17 +506,21 @@ def main(): import numba.testing from .config import dppy_present + if dppy_present: from .device_init import * else: raise ImportError("Importing numba-dppy failed") + def test(*args, **kwargs): if not dppy_present and not is_available(): dppy_error() return numba.testing.test("numba_dppy.tests", *args, **kwargs) + from ._version import get_versions -__version__ = get_versions()['version'] + +__version__ = get_versions()["version"] del get_versions diff --git a/numba_dppy/_version.py b/numba_dppy/_version.py index 165dbf4d17..dc6811b00f 100644 --- a/numba_dppy/_version.py +++ b/numba_dppy/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build @@ -58,17 +57,18 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs, method): # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) p = None @@ -76,10 +76,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, try: dispcmd = str([c] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) + p = subprocess.Popen( + [c] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + ) break except EnvironmentError: e = sys.exc_info()[1] @@ -114,16 +117,22 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): for i in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } else: rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -183,7 +192,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) + tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -192,7 +201,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) + tags = set([r for r in refs if re.search(r"\d", r)]) if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -200,19 +209,26 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") @@ -227,8 +243,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) + out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -236,10 +251,19 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) + describe_out, rc = run_command( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s*" % tag_prefix, + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -262,17 +286,16 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag @@ -281,10 +304,12 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -295,13 +320,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) + count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) pieces["distance"] = int(count_out) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() + date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -335,8 +360,7 @@ def render_pep440(pieces): rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -450,11 +474,13 @@ def render_git_describe_long(pieces): def render(pieces, style): """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -474,9 +500,13 @@ def render(pieces, style): else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions(): @@ -490,8 +520,7 @@ def get_versions(): verbose = cfg.verbose try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass @@ -500,13 +529,16 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. - for i in cfg.versionfile_source.split('/'): + for i in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -520,6 +552,10 @@ def get_versions(): except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/numba_dppy/codegen.py b/numba_dppy/codegen.py index 4e278d7ebc..ef78551a2f 100644 --- a/numba_dppy/codegen.py +++ b/numba_dppy/codegen.py @@ -5,14 +5,17 @@ from numba.core import utils -SPIR_TRIPLE = {32: ' spir-unknown-unknown', - 64: 'spir64-unknown-unknown'} +SPIR_TRIPLE = {32: " spir-unknown-unknown", 64: "spir64-unknown-unknown"} SPIR_DATA_LAYOUT = { - 32 : ('e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:' - '256-v512:512-v1024:1024'), - 64 : ('e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-' - 'v512:512-v1024:1024') + 32: ( + "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:" + "256-v512:512-v1024:1024" + ), + 64: ( + "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-" + "v512:512-v1024:1024" + ), } @@ -36,8 +39,8 @@ def _optimize_final_module(self): def _finalize_specific(self): # Fix global naming for gv in self._final_module.global_variables: - if '.' in gv.name: - gv.name = gv.name.replace('.', '_') + if "." in gv.name: + gv.name = gv.name.replace(".", "_") def get_asm_str(self): # Return nothing: we can only dump assembler code when it is later diff --git a/numba_dppy/compiler.py b/numba_dppy/compiler.py index 37b9e25e9f..31e9d57278 100644 --- a/numba_dppy/compiler.py +++ b/numba_dppy/compiler.py @@ -22,22 +22,26 @@ from numba_dppy.dppy_parfor_diagnostics import ExtendedParforDiagnostics -DEBUG = os.environ.get('NUMBA_DPPY_DEBUG', None) -_NUMBA_DPPY_READ_ONLY = "read_only" +DEBUG = os.environ.get("NUMBA_DPPY_DEBUG", None) +_NUMBA_DPPY_READ_ONLY = "read_only" _NUMBA_DPPY_WRITE_ONLY = "write_only" _NUMBA_DPPY_READ_WRITE = "read_write" def _raise_no_device_found_error(): - error_message = ("No OpenCL device specified. " - "Usage : jit_fn[device, globalsize, localsize](...)") + error_message = ( + "No OpenCL device specified. " + "Usage : jit_fn[device, globalsize, localsize](...)" + ) raise ValueError(error_message) def _raise_invalid_kernel_enqueue_args(): - error_message = ("Incorrect number of arguments for enquing dppy.kernel. " - "Usage: device_env, global size, local size. " - "The local size argument is optional.") + error_message = ( + "Incorrect number of arguments for enquing dppy.kernel. " + "Usage: device_env, global size, local size. " + "The local size argument is optional." + ) raise ValueError(error_message) @@ -63,18 +67,14 @@ def define_pipelines(self): # this maintains the objmode fallback behaviour pms = [] self.state.parfor_diagnostics = ExtendedParforDiagnostics() - self.state.metadata['parfor_diagnostics'] = self.state.parfor_diagnostics + self.state.metadata["parfor_diagnostics"] = self.state.parfor_diagnostics if not self.state.flags.force_pyobject: - #print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") + # print("Numba-DPPY [INFO]: Using Numba-DPPY pipeline") pms.append(DPPYPassBuilder.define_nopython_pipeline(self.state)) if self.state.status.can_fallback or self.state.flags.force_pyobject: - pms.append( - DefaultPassBuilder.define_objectmode_pipeline(self.state) - ) + pms.append(DefaultPassBuilder.define_objectmode_pipeline(self.state)) if self.state.status.can_giveup: - pms.append( - DefaultPassBuilder.define_interpreted_pipeline(self.state) - ) + pms.append(DefaultPassBuilder.define_interpreted_pipeline(self.state)) return pms @@ -88,32 +88,36 @@ def compile_with_dppy(pyfunc, return_type, args, debug): flags = compiler.Flags() # Do not compile (generate native code), just lower (to LLVM) if debug: - flags.set('debuginfo') - flags.set('no_compile') - flags.set('no_cpython_wrapper') - flags.unset('nrt') + flags.set("debuginfo") + flags.set("no_compile") + flags.set("no_cpython_wrapper") + flags.unset("nrt") # Run compilation pipeline if isinstance(pyfunc, FunctionType): - cres = compiler.compile_extra(typingctx=typingctx, - targetctx=targetctx, - func=pyfunc, - args=args, - return_type=return_type, - flags=flags, - locals={}, - pipeline_class=DPPYCompiler) + cres = compiler.compile_extra( + typingctx=typingctx, + targetctx=targetctx, + func=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=DPPYCompiler, + ) elif isinstance(pyfunc, ir.FunctionIR): - cres = compiler.compile_ir(typingctx=typingctx, - targetctx=targetctx, - func_ir=pyfunc, - args=args, - return_type=return_type, - flags=flags, - locals={}, - pipeline_class=DPPYCompiler) + cres = compiler.compile_ir( + typingctx=typingctx, + targetctx=targetctx, + func_ir=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=DPPYCompiler, + ) else: - assert(0) + assert 0 # Linking depending libraries # targetctx.link_dependencies(cres.llvm_module, cres.target_context.linking) library = cres.library @@ -138,17 +142,18 @@ def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=False): # depending on the target context. For example, we want to link our kernel object # with implementation containing atomic operations only when atomic operations # are being used in the kernel. - oclkern = DPPYKernel(context=cres.target_context, - sycl_queue=sycl_queue, - llvm_module=kernel.module, - name=kernel.name, - argtypes=cres.signature.args, - ordered_arg_access_types=access_types) + oclkern = DPPYKernel( + context=cres.target_context, + sycl_queue=sycl_queue, + llvm_module=kernel.module, + name=kernel.name, + argtypes=cres.signature.args, + ordered_arg_access_types=access_types, + ) return oclkern -def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, - debug=False): +def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, debug=False): if DEBUG: print("compile_kernel_parfor", args) for a in args: @@ -156,25 +161,26 @@ def compile_kernel_parfor(sycl_queue, func_ir, args, args_with_addrspaces, if isinstance(a, types.npytypes.Array): print("addrspace:", a.addrspace) - cres = compile_with_dppy(func_ir, None, args_with_addrspaces, - debug=debug) + cres = compile_with_dppy(func_ir, None, args_with_addrspaces, debug=debug) func = cres.library.get_function(cres.fndesc.llvm_func_name) if DEBUG: print("compile_kernel_parfor signature", cres.signature.args) for a in cres.signature.args: print(a, type(a)) -# if isinstance(a, types.npytypes.Array): -# print("addrspace:", a.addrspace) + # if isinstance(a, types.npytypes.Array): + # print("addrspace:", a.addrspace) kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) - #kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) - oclkern = DPPYKernel(context=cres.target_context, - sycl_queue=sycl_queue, - llvm_module=kernel.module, - name=kernel.name, - argtypes=args_with_addrspaces) - #argtypes=cres.signature.args) + # kernel = cres.target_context.prepare_ocl_kernel(func, args_with_addrspaces) + oclkern = DPPYKernel( + context=cres.target_context, + sycl_queue=sycl_queue, + llvm_module=kernel.module, + name=kernel.name, + argtypes=args_with_addrspaces, + ) + # argtypes=cres.signature.args) return oclkern @@ -196,8 +202,7 @@ class dppy_function_template(ConcreteTemplate): # Compile dppy function template def compile_dppy_func_template(pyfunc): - """Compile a DPPYFunctionTemplate - """ + """Compile a DPPYFunctionTemplate""" from .descriptor import dppy_target dft = DPPYFunctionTemplate(pyfunc) @@ -215,8 +220,8 @@ def generic(self, args, kws): class DPPYFunctionTemplate(object): - """Unmaterialized dppy function - """ + """Unmaterialized dppy function""" + def __init__(self, pyfunc, debug=False): self.py_func = pyfunc self.debug = debug @@ -239,8 +244,7 @@ def compile(self, args): if first_definition: # First definition - cres.target_context.insert_user_function(self, cres.fndesc, - libs) + cres.target_context.insert_user_function(self, cres.fndesc, libs) else: cres.target_context.add_user_function(self, cres.fndesc, libs) @@ -258,61 +262,65 @@ def __init__(self, cres): def _ensure_valid_work_item_grid(val, sycl_queue): if not isinstance(val, (tuple, list, int)): - error_message = ("Cannot create work item dimension from " - "provided argument") + error_message = "Cannot create work item dimension from " "provided argument" raise ValueError(error_message) if isinstance(val, int): val = [val] # TODO: we need some way to check the max dimensions - ''' + """ if len(val) > device_env.get_max_work_item_dims(): error_message = ("Unsupported number of work item dimensions ") raise ValueError(error_message) - ''' + """ + + return list( + val[::-1] + ) # reversing due to sycl and opencl interop kernel range mismatch semantic - return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic def _ensure_valid_work_group_size(val, work_item_grid): if not isinstance(val, (tuple, list, int)): - error_message = ("Cannot create work item dimension from " - "provided argument") + error_message = "Cannot create work item dimension from " "provided argument" raise ValueError(error_message) if isinstance(val, int): val = [val] if len(val) != len(work_item_grid): - error_message = ("Unsupported number of work item dimensions, " + - "dimensions of global and local work items has to be the same ") + error_message = ( + "Unsupported number of work item dimensions, " + + "dimensions of global and local work items has to be the same " + ) raise ValueError(error_message) - return list(val[::-1]) # reversing due to sycl and opencl interop kernel range mismatch semantic + return list( + val[::-1] + ) # reversing due to sycl and opencl interop kernel range mismatch semantic class DPPYKernelBase(object): - """Define interface for configurable kernels - """ + """Define interface for configurable kernels""" def __init__(self): self.global_size = [] - self.local_size = [] - self.sycl_queue = None + self.local_size = [] + self.sycl_queue = None # list of supported access types, stored in dict for fast lookup self.valid_access_types = { - _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, - _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, - _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE} + _NUMBA_DPPY_READ_ONLY: _NUMBA_DPPY_READ_ONLY, + _NUMBA_DPPY_WRITE_ONLY: _NUMBA_DPPY_WRITE_ONLY, + _NUMBA_DPPY_READ_WRITE: _NUMBA_DPPY_READ_WRITE, + } def copy(self): return copy.copy(self) def configure(self, sycl_queue, global_size, local_size=None): - """Configure the OpenCL kernel. The local_size can be None - """ + """Configure the OpenCL kernel. The local_size can be None""" clone = self.copy() clone.global_size = global_size clone.local_size = local_size @@ -346,8 +354,15 @@ class DPPYKernel(DPPYKernelBase): A OCL kernel object """ - def __init__(self, context, sycl_queue, llvm_module, name, argtypes, - ordered_arg_access_types=None): + def __init__( + self, + context, + sycl_queue, + llvm_module, + name, + argtypes, + ordered_arg_access_types=None, + ): super(DPPYKernel, self).__init__() self._llvm_module = llvm_module self.assembly = self.binary = llvm_module.__str__() @@ -365,7 +380,9 @@ def __init__(self, context, sycl_queue, llvm_module, name, argtypes, self.spirv_bc = spirv_generator.llvm_to_spirv(self.context, self.binary) # create a program - self.program = dpctl_prog.create_program_from_spirv(self.sycl_queue, self.spirv_bc) + self.program = dpctl_prog.create_program_from_spirv( + self.sycl_queue, self.spirv_bc + ) # create a kernel self.kernel = self.program.get_sycl_kernel(self.entry_name) @@ -376,33 +393,47 @@ def __call__(self, *args): retr = [] # hold functors for writeback kernelargs = [] internal_device_arrs = [] - for ty, val, access_type in zip(self.argument_types, args, - self.ordered_arg_access_types): - self._unpack_argument(ty, val, self.sycl_queue, retr, - kernelargs, internal_device_arrs, access_type) + for ty, val, access_type in zip( + self.argument_types, args, self.ordered_arg_access_types + ): + self._unpack_argument( + ty, + val, + self.sycl_queue, + retr, + kernelargs, + internal_device_arrs, + access_type, + ) - self.sycl_queue.submit(self.kernel, kernelargs, self.global_size, self.local_size) + self.sycl_queue.submit( + self.kernel, kernelargs, self.global_size, self.local_size + ) self.sycl_queue.wait() - for ty, val, i_dev_arr, access_type in zip(self.argument_types, args, - internal_device_arrs, self.ordered_arg_access_types): - self._pack_argument(ty, val, self.sycl_queue, i_dev_arr, - access_type) + for ty, val, i_dev_arr, access_type in zip( + self.argument_types, + args, + internal_device_arrs, + self.ordered_arg_access_types, + ): + self._pack_argument(ty, val, self.sycl_queue, i_dev_arr, access_type) def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type): """ Copy device data back to host """ - if (device_arr and (access_type not in self.valid_access_types or - access_type in self.valid_access_types and - self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY)): + if device_arr and ( + access_type not in self.valid_access_types + or access_type in self.valid_access_types + and self.valid_access_types[access_type] != _NUMBA_DPPY_READ_ONLY + ): # we get the date back to host if have created a # device_array or if access_type of this device_array # is not of type read_only and read_write usm_buf, usm_ndarr, orig_ndarray = device_arr np.copyto(orig_ndarray, usm_ndarr) - def _unpack_device_array_argument(self, val, kernelargs): # this function only takes ndarrays created using USM allocated buffer void_ptr_arg = True @@ -422,9 +453,9 @@ def _unpack_device_array_argument(self, val, kernelargs): for ax in range(val.ndim): kernelargs.append(ctypes.c_longlong(val.strides[ax])) - - def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, - device_arrs, access_type): + def _unpack_argument( + self, ty, val, sycl_queue, retr, kernelargs, device_arrs, access_type + ): """ Convert arguments to ctypes and append to kernelargs """ @@ -440,9 +471,11 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, usm_buf = dpctl_mem.MemoryUSMShared(val.size * val.dtype.itemsize) usm_ndarr = np.ndarray(val.shape, buffer=usm_buf, dtype=val.dtype) - if (default_behavior or - self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY or - self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE): + if ( + default_behavior + or self.valid_access_types[access_type] == _NUMBA_DPPY_READ_ONLY + or self.valid_access_types[access_type] == _NUMBA_DPPY_READ_WRITE + ): np.copyto(usm_ndarr, val) device_arrs[-1] = (usm_buf, usm_ndarr, val) @@ -470,13 +503,13 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, cval = ctypes.c_uint8(int(val)) kernelargs.append(cval) elif ty == types.complex64: - #kernelargs.append(ctypes.c_float(val.real)) - #kernelargs.append(ctypes.c_float(val.imag)) + # kernelargs.append(ctypes.c_float(val.real)) + # kernelargs.append(ctypes.c_float(val.imag)) raise NotImplementedError(ty, val) elif ty == types.complex128: - #kernelargs.append(ctypes.c_double(val.real)) - #kernelargs.append(ctypes.c_double(val.imag)) + # kernelargs.append(ctypes.c_double(val.real)) + # kernelargs.append(ctypes.c_double(val.imag)) raise NotImplementedError(ty, val) else: @@ -484,13 +517,16 @@ def _unpack_argument(self, ty, val, sycl_queue, retr, kernelargs, def check_for_invalid_access_type(self, access_type): if access_type not in self.valid_access_types: - msg = ("[!] %s is not a valid access type. " - "Supported access types are [" % (access_type)) + msg = ( + "[!] %s is not a valid access type. " + "Supported access types are [" % (access_type) + ) for key in self.valid_access_types: msg += " %s |" % (key) msg = msg[:-1] + "]" - if access_type != None: print(msg) + if access_type != None: + print(msg) return True else: return False @@ -518,28 +554,27 @@ def __call__(self, *args, **kwargs): _raise_no_device_found_error() kernel = self.specialize(*args) - cfg = kernel.configure(self.sycl_queue, self.global_size, - self.local_size) + cfg = kernel.configure(self.sycl_queue, self.global_size, self.local_size) cfg(*args) def specialize(self, *args): - argtypes = tuple([self.typingctx.resolve_argument_type(a) - for a in args]) + argtypes = tuple([self.typingctx.resolve_argument_type(a) for a in args]) q = None kernel = None # we were previously using the _env_ptr of the device_env, the sycl_queue # should be sufficient to cache the compiled kernel for now, but we should # use the device type to cache such kernels - #key_definitions = (self.sycl_queue, argtypes) - key_definitions = (argtypes) + # key_definitions = (self.sycl_queue, argtypes) + key_definitions = argtypes result = self.definitions.get(key_definitions) if result: q, kernel = result if q and self.sycl_queue.equals(q): - return kernel + return kernel else: - kernel = compile_kernel(self.sycl_queue, self.py_func, argtypes, - self.access_types) + kernel = compile_kernel( + self.sycl_queue, self.py_func, argtypes, self.access_types + ) self.definitions[key_definitions] = (self.sycl_queue, kernel) return kernel diff --git a/numba_dppy/decorators.py b/numba_dppy/decorators.py index 641d924134..7576e20aae 100644 --- a/numba_dppy/decorators.py +++ b/numba_dppy/decorators.py @@ -1,7 +1,12 @@ from __future__ import print_function, absolute_import, division from numba.core import sigutils, types -from .compiler import (compile_kernel, JitDPPYKernel, compile_dppy_func_template, - compile_dppy_func, get_ordered_arg_access_types) +from .compiler import ( + compile_kernel, + JitDPPYKernel, + compile_dppy_func_template, + compile_dppy_func, + get_ordered_arg_access_types, +) def kernel(signature=None, access_types=None, debug=False): @@ -23,13 +28,14 @@ def autojit(debug=False, access_types=None): def _kernel_autojit(pyfunc): ordered_arg_access_types = get_ordered_arg_access_types(pyfunc, access_types) return JitDPPYKernel(pyfunc, ordered_arg_access_types) + return _kernel_autojit def _kernel_jit(signature, debug, access_types): argtypes, restype = sigutils.normalize_signature(signature) if restype is not None and restype != types.void: - msg = ("DPPY kernel must have void return type but got {restype}") + msg = "DPPY kernel must have void return type but got {restype}" raise TypeError(msg.format(restype=restype)) def _wrapped(pyfunc): @@ -39,7 +45,6 @@ def _wrapped(pyfunc): return _wrapped - def func(signature=None): if signature is None: return _func_autojit @@ -58,5 +63,6 @@ def _wrapped(pyfunc): return _wrapped + def _func_autojit(pyfunc): return compile_dppy_func_template(pyfunc) diff --git a/numba_dppy/descriptor.py b/numba_dppy/descriptor.py index c8e6a58ec7..40e1ef0628 100644 --- a/numba_dppy/descriptor.py +++ b/numba_dppy/descriptor.py @@ -10,8 +10,8 @@ class DPPYTarget(TargetDescriptor): options = CPUTargetOptions - #typingctx = DPPYTypingContext() - #targetctx = DPPYTargetContext(typingctx) + # typingctx = DPPYTypingContext() + # targetctx = DPPYTargetContext(typingctx) @utils.cached_property def _toplevel_target_context(self): @@ -38,6 +38,5 @@ def typing_context(self): return self._toplevel_typing_context - # The global DPPY target dppy_target = DPPYTarget() diff --git a/numba_dppy/device_init.py b/numba_dppy/device_init.py index efec55ba83..57c0d9c6e7 100644 --- a/numba_dppy/device_init.py +++ b/numba_dppy/device_init.py @@ -22,9 +22,7 @@ We are importing dpnp stub module to make Numba recognize the module when we rename Numpy functions. """ -from .dpnp_glue.stubs import ( - dpnp -) +from .dpnp_glue.stubs import dpnp DEFAULT_LOCAL_SIZE = [] diff --git a/numba_dppy/dispatcher.py b/numba_dppy/dispatcher.py index d00a597875..e4a9ddecd3 100644 --- a/numba_dppy/dispatcher.py +++ b/numba_dppy/dispatcher.py @@ -2,21 +2,22 @@ import numpy as np -#from numba.targets.descriptors import TargetDescriptor -#from numba.targets.options import TargetOptions -#import numba_dppy, numba_dppy as dppy +# from numba.targets.descriptors import TargetDescriptor +# from numba.targets.options import TargetOptions +# import numba_dppy, numba_dppy as dppy from numba_dppy import kernel, autojit from .descriptor import dppy_target -#from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, - # GUFuncCallSteps) + +# from numba.npyufunc.deviceufunc import (UFuncMechanism, GenerializedUFunc, +# GUFuncCallSteps) from .. import dispatcher, utils, typing from .compiler import DPPYCompiler + class DPPYDispatcher(dispatcher.Dispatcher): targetdescr = dppy_target - def __init__(self, py_func, locals={}, targetoptions={}): assert not locals self.py_func = py_func @@ -44,8 +45,7 @@ def __call__(self, *args, **kws): return self.compiled(*args, **kws) def disable_compile(self, val=True): - """Disable the compilation of new signatures at call time. - """ + """Disable the compilation of new signatures at call time.""" # Do nothing pass @@ -58,6 +58,7 @@ def __getitem__(self, *args): def __getattr__(self, key): return getattr(self.compiled, key) + class DPPYUFuncDispatcher(object): """ Invoke the OpenCL ufunc specialization for the given inputs. @@ -89,8 +90,7 @@ def __call__(self, *args, **kws): return DPPYUFuncMechanism.call(self.functions, args, kws) def reduce(self, arg, stream=0): - assert len(list(self.functions.keys())[0]) == 2, "must be a binary " \ - "ufunc" + assert len(list(self.functions.keys())[0]) == 2, "must be a binary " "ufunc" assert arg.ndim == 1, "must use 1d array" n = arg.shape[0] @@ -112,7 +112,12 @@ def reduce(self, arg, stream=0): # reduce by recursively spliting and operating out = self.__reduce(mem, gpu_mems, stream) # store the resultong scalar in a [1,] buffer - buf = np.empty([out.size,], dtype=out.dtype) + buf = np.empty( + [ + out.size, + ], + dtype=out.dtype, + ) # copy the result back to host out.copy_to_host(buf, stream=stream) @@ -144,7 +149,7 @@ def __reduce(self, mem, gpu_mems, stream): class _DPPYGUFuncCallSteps(GUFuncCallSteps): __slots__ = [ - '_stream', + "_stream", ] def is_device_array(self, obj): @@ -161,7 +166,7 @@ def device_array(self, shape, dtype): return ocl.device_array(shape=shape, dtype=dtype, stream=self._stream) def prepare_inputs(self): - self._stream = self.kwargs.get('stream', 0) + self._stream = self.kwargs.get("stream", 0) def launch_kernel(self, kernel, nelem, args): kernel.forall(nelem, queue=self._stream)(*args) @@ -173,27 +178,26 @@ def _call_steps(self): return _DPPYGUFuncCallSteps def _broadcast_scalar_input(self, ary, shape): - return devicearray.DeviceNDArray(shape=shape, - strides=(0,), - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return devicearray.DeviceNDArray( + shape=shape, strides=(0,), dtype=ary.dtype, gpu_data=ary.gpu_data + ) def _broadcast_add_axis(self, ary, newshape): newax = len(newshape) - len(ary.shape) # Add 0 strides for missing dimension newstrides = (0,) * newax + ary.strides - return devicearray.DeviceNDArray(shape=newshape, - strides=newstrides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return devicearray.DeviceNDArray( + shape=newshape, strides=newstrides, dtype=ary.dtype, gpu_data=ary.gpu_data + ) class DPPYUFuncMechanism(UFuncMechanism): """ Provide OpenCL specialization """ + DEFAULT_STREAM = 0 - ARRAY_ORDER = 'A' + ARRAY_ORDER = "A" def launch(self, func, count, stream, args): func.forall(count, queue=stream)(*args) @@ -211,9 +215,11 @@ def device_array(self, shape, dtype, stream): return ocl.device_array(shape=shape, dtype=dtype, stream=stream) def broadcast_device(self, ary, shape): - ax_differs = [ax for ax in range(len(shape)) - if ax >= ary.ndim - or ary.shape[ax] != shape[ax]] + ax_differs = [ + ax + for ax in range(len(shape)) + if ax >= ary.ndim or ary.shape[ax] != shape[ax] + ] missingdim = len(shape) - len(ary.shape) strides = [0] * missingdim + list(ary.strides) @@ -221,7 +227,6 @@ def broadcast_device(self, ary, shape): for ax in ax_differs: strides[ax] = 0 - return devicearray.DeviceNDArray(shape=shape, - strides=strides, - dtype=ary.dtype, - gpu_data=ary.gpu_data) + return devicearray.DeviceNDArray( + shape=shape, strides=strides, dtype=ary.dtype, gpu_data=ary.gpu_data + ) diff --git a/numba_dppy/dpnp_glue/__init__.py b/numba_dppy/dpnp_glue/__init__.py index 17d6b5ad6a..9e76480680 100644 --- a/numba_dppy/dpnp_glue/__init__.py +++ b/numba_dppy/dpnp_glue/__init__.py @@ -1,6 +1,6 @@ def ensure_dpnp(name): try: - # import dpnp + # import dpnp from . import dpnp_fptr_interface as dpnp_glue except ImportError: raise ImportError("dpNP is needed to call np.%s" % name) diff --git a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py index 9146299b05..4b47382c57 100644 --- a/numba_dppy/dpnp_glue/dpnp_linalgimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_linalgimpl.py @@ -7,6 +7,7 @@ import numpy as np from numba_dppy.dpctl_functions import _DPCTL_FUNCTIONS + @overload(stubs.dpnp.eig) def dpnp_eig_impl(a): name = "eig" @@ -22,10 +23,8 @@ def dpnp_eig_impl(a): void dpnp_eig_c(const void* array_in, void* result1, void* result2, size_t size) """ - sig = signature( - ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp - ) - dpnp_eig = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp) + dpnp_eig = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -92,8 +91,14 @@ def dpnp_dot_impl(a, b): """ sig = signature( - ret_type, types.voidptr, types.voidptr, types.voidptr, - types.intp, types.intp, types.intp) + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.intp, + types.intp, + ) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -137,6 +142,7 @@ def dpnp_dot_impl(a, b): ndims = [a.ndim, b.ndim] if ndims == [2, 2]: dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mm(a, b): sycl_queue = get_sycl_queue() @@ -170,11 +176,12 @@ def dot_2_mm(a, b): return dot_2_mm elif ndims == [2, 1]: dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_mv(a, b): sycl_queue = get_sycl_queue() m, k = a.shape - _n, = b.shape + (_n,) = b.shape n = 1 if _n != k: @@ -186,7 +193,7 @@ def dot_2_mv(a, b): b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) - out = np.empty((m, ), dtype=res_dtype) + out = np.empty((m,), dtype=res_dtype) out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) dpnp_func(a_usm, b_usm, out_usm, m, n, k) @@ -204,10 +211,11 @@ def dot_2_mv(a, b): return dot_2_mv elif ndims == [1, 2]: dpnp_func = dpnp_ext.dpnp_func("dpnp_matmul", [a.dtype.name, "NONE"], sig) + def dot_2_vm(a, b): sycl_queue = get_sycl_queue() - m, = a.shape + (m,) = a.shape k, n = b.shape if m != k: @@ -219,7 +227,7 @@ def dot_2_vm(a, b): b_usm = allocate_usm_shared(b.size * b.itemsize, sycl_queue) copy_usm(sycl_queue, b_usm, b.ctypes, b.size * b.itemsize) - out = np.empty((n, ), dtype=res_dtype) + out = np.empty((n,), dtype=res_dtype) out_usm = allocate_usm_shared(out.size * out.itemsize, sycl_queue) dpnp_func(a_usm, b_usm, out_usm, m, n, k) @@ -236,14 +244,16 @@ def dot_2_vm(a, b): return dot_2_vm elif ndims == [1, 1]: - sig = signature(ret_type, types.voidptr, types.voidptr, types.voidptr, - types.intp) + sig = signature( + ret_type, types.voidptr, types.voidptr, types.voidptr, types.intp + ) dpnp_func = dpnp_ext.dpnp_func("dpnp_dot", [a.dtype.name, "NONE"], sig) + def dot_2_vv(a, b): sycl_queue = get_sycl_queue() - m, = a.shape - n, = b.shape + (m,) = a.shape + (n,) = b.shape if m != n: raise ValueError("Incompatible array sizes for np.dot(a, b)") diff --git a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py index 8ec200059b..073b83e900 100644 --- a/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_sort_search_countimpl.py @@ -22,7 +22,9 @@ def dpnp_argmax_impl(a): void custom_argmax_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + dpnp_func = dpnp_ext.dpnp_func( + "dpnp_" + name, [a.dtype.name, np.dtype(np.int64).name], sig + ) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -72,7 +74,9 @@ def dpnp_argmin_impl(a): void custom_argmin_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, np.dtype(np.int64).name], sig) + dpnp_func = dpnp_ext.dpnp_func( + "dpnp_" + name, [a.dtype.name, np.dtype(np.int64).name], sig + ) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -122,7 +126,7 @@ def dpnp_argsort_impl(a): void custom_argmin_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() diff --git a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py index cae9507902..a3883dc860 100644 --- a/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_statisticsimpl.py @@ -28,10 +28,16 @@ def dpnp_amax_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -85,10 +91,16 @@ def dpnp_amin_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -141,10 +153,16 @@ def dpnp_mean_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -200,10 +218,16 @@ def dpnp_median_impl(a): if the compiler allows there should not be any mismatch in the size of the container to hold different types of pointer. """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.voidptr, types.intp, - types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature( + ret_type, + types.voidptr, + types.voidptr, + types.voidptr, + types.intp, + types.voidptr, + types.intp, + ) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -254,9 +278,8 @@ def dpnp_cov_impl(a): Function declaration: void custom_cov_c(void* array1_in, void* result1, size_t nrows, size_t ncols) """ - sig = signature(ret_type, types.voidptr, types.voidptr, - types.intp, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + sig = signature(ret_type, types.voidptr, types.voidptr, types.intp, types.intp) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -268,7 +291,6 @@ def dpnp_cov_impl(a): if a.dtype == types.float64: copy_input_to_double = False - def dpnp_impl(a): if a.size == 0: raise ValueError("Passed Empty array") @@ -280,9 +302,15 @@ def dpnp_impl(a): a_copy_in_double = a.astype(np.float64) else: a_copy_in_double = a - a_usm = allocate_usm_shared(a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue) - copy_usm(sycl_queue, a_usm, a_copy_in_double.ctypes, - a_copy_in_double.size * a_copy_in_double.itemsize) + a_usm = allocate_usm_shared( + a_copy_in_double.size * a_copy_in_double.itemsize, sycl_queue + ) + copy_usm( + sycl_queue, + a_usm, + a_copy_in_double.ctypes, + a_copy_in_double.size * a_copy_in_double.itemsize, + ) if a.ndim == 2: rows = a.shape[0] diff --git a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py index f7ba425206..af79ad2c1c 100644 --- a/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py +++ b/numba_dppy/dpnp_glue/dpnp_transcendentalsimpl.py @@ -23,7 +23,7 @@ def dpnp_sum_impl(a): """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() @@ -70,7 +70,7 @@ def dpnp_prod_impl(a): void custom_prod_c(void* array1_in, void* result1, size_t size) """ sig = signature(ret_type, types.voidptr, types.voidptr, types.intp) - dpnp_func = dpnp_ext.dpnp_func("dpnp_"+name, [a.dtype.name, "NONE"], sig) + dpnp_func = dpnp_ext.dpnp_func("dpnp_" + name, [a.dtype.name, "NONE"], sig) get_sycl_queue = dpctl_functions.dpctl_get_current_queue() allocate_usm_shared = dpctl_functions.dpctl_malloc_shared() diff --git a/numba_dppy/dpnp_glue/dpnpdecl.py b/numba_dppy/dpnp_glue/dpnpdecl.py index ce1f7d3583..373018c8db 100644 --- a/numba_dppy/dpnp_glue/dpnpdecl.py +++ b/numba_dppy/dpnp_glue/dpnpdecl.py @@ -1,8 +1,9 @@ -from numba.core.typing.templates import (AttributeTemplate, infer_getattr) +from numba.core.typing.templates import AttributeTemplate, infer_getattr import numba_dppy from numba import types from numba.core.types.misc import RawPointer + @infer_getattr class DppyDpnpTemplate(AttributeTemplate): key = types.Module(numba_dppy) @@ -10,11 +11,14 @@ class DppyDpnpTemplate(AttributeTemplate): def resolve_dpnp(self, mod): return types.Module(numba_dppy.dpnp) + """ This adds a shapeptr attribute to Numba type representing np.ndarray. This allows us to get the raw pointer to the structure where the shape of an ndarray is stored from an overloaded implementation """ + + @infer_getattr class ArrayAttribute(AttributeTemplate): key = types.Array diff --git a/numba_dppy/dpnp_glue/dpnpimpl.py b/numba_dppy/dpnp_glue/dpnpimpl.py index fa429f923f..a3dc5ce195 100644 --- a/numba_dppy/dpnp_glue/dpnpimpl.py +++ b/numba_dppy/dpnp_glue/dpnpimpl.py @@ -8,23 +8,27 @@ ll_void_p = ir.IntType(8).as_pointer() + def get_dpnp_fptr(fn_name, type_names): from . import dpnp_fptr_interface as dpnp_glue f_ptr = dpnp_glue.get_dpnp_fn_ptr(fn_name, type_names) return f_ptr + @register_jitable def _check_finite_matrix(a): for v in np.nditer(a): if not np.isfinite(v.item()): raise np.linalg.LinAlgError("Array must not contain infs or NaNs.") + @register_jitable def _dummy_liveness_func(a): """pass a list of variables to be preserved through dead code elimination""" return a[0] + def dpnp_func(fn_name, type_names, sig): f_ptr = get_dpnp_fptr(fn_name, type_names) @@ -33,15 +37,19 @@ def get_pointer(obj): return types.ExternalFunctionPointer(sig, get_pointer=get_pointer) + """ This function retrieves the pointer to the structure where the shape of an ndarray is stored. We cast it to void * to make it easier to pass around. """ + + @lower_getattr(types.Array, "shapeptr") def array_shape(context, builder, typ, value): - shape_ptr = builder.gep(value.operands[0], - [context.get_constant(types.int32, 0), - context.get_constant(types.int32, 5)]) + shape_ptr = builder.gep( + value.operands[0], + [context.get_constant(types.int32, 0), context.get_constant(types.int32, 5)], + ) return builder.bitcast(shape_ptr, ll_void_p) diff --git a/numba_dppy/dpnp_glue/stubs.py b/numba_dppy/dpnp_glue/stubs.py index 2fdd6ecbe3..fa7e06ea48 100644 --- a/numba_dppy/dpnp_glue/stubs.py +++ b/numba_dppy/dpnp_glue/stubs.py @@ -1,9 +1,10 @@ from numba_dppy.ocl.stubs import Stub + class dpnp(Stub): - """dpnp namespace - """ - _description_ = '' + """dpnp namespace""" + + _description_ = "" class sum(Stub): pass diff --git a/numba_dppy/dppy_host_fn_call_gen.py b/numba_dppy/dppy_host_fn_call_gen.py index 2808ddf90d..585d461127 100644 --- a/numba_dppy/dppy_host_fn_call_gen.py +++ b/numba_dppy/dppy_host_fn_call_gen.py @@ -9,6 +9,7 @@ from numba.core.ir_utils import legalize_names + class DPPYHostFunctionCallsGenerator(object): def __init__(self, lowerer, cres, num_inputs): self.lowerer = lowerer @@ -27,8 +28,8 @@ def __init__(self, lowerer, cres, num_inputs): self.null_ptr = self._create_null_ptr() self.total_kernel_args = 0 - self.cur_arg = 0 - self.num_inputs = num_inputs + self.cur_arg = 0 + self.num_inputs = num_inputs # list of buffer that needs to comeback to host self.write_buffs = [] @@ -36,65 +37,91 @@ def __init__(self, lowerer, cres, num_inputs): # list of buffer that does not need to comeback to host self.read_only_buffs = [] - def _create_null_ptr(self): - null_ptr = cgutils.alloca_once(self.builder, self.void_ptr_t, - size=self.context.get_constant(types.uintp, 1), name="null_ptr") + null_ptr = cgutils.alloca_once( + self.builder, + self.void_ptr_t, + size=self.context.get_constant(types.uintp, 1), + name="null_ptr", + ) self.builder.store( self.builder.inttoptr( - self.context.get_constant(types.uintp, 0), self.void_ptr_t), - null_ptr) + self.context.get_constant(types.uintp, 0), self.void_ptr_t + ), + null_ptr, + ) return null_ptr - def _init_llvm_types_and_constants(self): - self.byte_t = lc.Type.int(8) - self.byte_ptr_t = lc.Type.pointer(self.byte_t) - self.byte_ptr_ptr_t = lc.Type.pointer(self.byte_ptr_t) - self.intp_t = self.context.get_value_type(types.intp) - self.int64_t = self.context.get_value_type(types.int64) - self.int32_t = self.context.get_value_type(types.int32) - self.int32_ptr_t = lc.Type.pointer(self.int32_t) - self.uintp_t = self.context.get_value_type(types.uintp) - self.intp_ptr_t = lc.Type.pointer(self.intp_t) - self.uintp_ptr_t = lc.Type.pointer(self.uintp_t) - self.zero = self.context.get_constant(types.uintp, 0) - self.one = self.context.get_constant(types.uintp, 1) - self.one_type = self.one.type - self.sizeof_intp = self.context.get_abi_sizeof(self.intp_t) - self.void_ptr_t = self.context.get_value_type(types.voidptr) - self.void_ptr_ptr_t = lc.Type.pointer(self.void_ptr_t) + self.byte_t = lc.Type.int(8) + self.byte_ptr_t = lc.Type.pointer(self.byte_t) + self.byte_ptr_ptr_t = lc.Type.pointer(self.byte_ptr_t) + self.intp_t = self.context.get_value_type(types.intp) + self.int64_t = self.context.get_value_type(types.int64) + self.int32_t = self.context.get_value_type(types.int32) + self.int32_ptr_t = lc.Type.pointer(self.int32_t) + self.uintp_t = self.context.get_value_type(types.uintp) + self.intp_ptr_t = lc.Type.pointer(self.intp_t) + self.uintp_ptr_t = lc.Type.pointer(self.uintp_t) + self.zero = self.context.get_constant(types.uintp, 0) + self.one = self.context.get_constant(types.uintp, 1) + self.one_type = self.one.type + self.sizeof_intp = self.context.get_abi_sizeof(self.intp_t) + self.void_ptr_t = self.context.get_value_type(types.voidptr) + self.void_ptr_ptr_t = lc.Type.pointer(self.void_ptr_t) self.sizeof_void_ptr = self.context.get_abi_sizeof(self.intp_t) self.sycl_queue_val = None def _declare_functions(self): get_queue_fnty = lc.Type.function(self.void_ptr_t, ()) - self.get_queue = self.builder.module.get_or_insert_function(get_queue_fnty, - name="DPCTLQueueMgr_GetCurrentQueue") - - submit_range_fnty = lc.Type.function(self.void_ptr_t, - [self.void_ptr_t, self.void_ptr_t, self.void_ptr_ptr_t, - self.int32_ptr_t, self.intp_t, self.intp_ptr_t, - self.intp_t, self.void_ptr_t, self.intp_t]) - self.submit_range = self.builder.module.get_or_insert_function(submit_range_fnty, - name="DPCTLQueue_SubmitRange") - - - queue_memcpy_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t]) - self.queue_memcpy = self.builder.module.get_or_insert_function(queue_memcpy_fnty, - name="DPCTLQueue_Memcpy") - - queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) - self.queue_wait = self.builder.module.get_or_insert_function(queue_wait_fnty, - name="DPCTLQueue_Wait") - - usm_shared_fnty = lc.Type.function(self.void_ptr_t, [self.intp_t, self.void_ptr_t]) - self.usm_shared = self.builder.module.get_or_insert_function(usm_shared_fnty, - name="DPCTLmalloc_shared") - - usm_free_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t, self.void_ptr_t]) - self.usm_free = self.builder.module.get_or_insert_function(usm_free_fnty, - name="DPCTLfree_with_queue") + self.get_queue = self.builder.module.get_or_insert_function( + get_queue_fnty, name="DPCTLQueueMgr_GetCurrentQueue" + ) + + submit_range_fnty = lc.Type.function( + self.void_ptr_t, + [ + self.void_ptr_t, + self.void_ptr_t, + self.void_ptr_ptr_t, + self.int32_ptr_t, + self.intp_t, + self.intp_ptr_t, + self.intp_t, + self.void_ptr_t, + self.intp_t, + ], + ) + self.submit_range = self.builder.module.get_or_insert_function( + submit_range_fnty, name="DPCTLQueue_SubmitRange" + ) + + queue_memcpy_fnty = lc.Type.function( + lir.VoidType(), + [self.void_ptr_t, self.void_ptr_t, self.void_ptr_t, self.intp_t], + ) + self.queue_memcpy = self.builder.module.get_or_insert_function( + queue_memcpy_fnty, name="DPCTLQueue_Memcpy" + ) + + queue_wait_fnty = lc.Type.function(lir.VoidType(), [self.void_ptr_t]) + self.queue_wait = self.builder.module.get_or_insert_function( + queue_wait_fnty, name="DPCTLQueue_Wait" + ) + + usm_shared_fnty = lc.Type.function( + self.void_ptr_t, [self.intp_t, self.void_ptr_t] + ) + self.usm_shared = self.builder.module.get_or_insert_function( + usm_shared_fnty, name="DPCTLmalloc_shared" + ) + + usm_free_fnty = lc.Type.function( + lir.VoidType(), [self.void_ptr_t, self.void_ptr_t] + ) + self.usm_free = self.builder.module.get_or_insert_function( + usm_free_fnty, name="DPCTLfree_with_queue" + ) def allocate_kenrel_arg_array(self, num_kernel_args): self.sycl_queue_val = cgutils.alloca_once(self.builder, self.void_ptr_t) @@ -104,17 +131,21 @@ def allocate_kenrel_arg_array(self, num_kernel_args): # we need a kernel arg array to enqueue self.kernel_arg_array = cgutils.alloca_once( - self.builder, self.void_ptr_t, size=self.context.get_constant( - types.uintp, num_kernel_args), name="kernel_arg_array") + self.builder, + self.void_ptr_t, + size=self.context.get_constant(types.uintp, num_kernel_args), + name="kernel_arg_array", + ) self.kernel_arg_ty_array = cgutils.alloca_once( - self.builder, self.int32_t, size=self.context.get_constant( - types.uintp, num_kernel_args), name="kernel_arg_ty_array") - + self.builder, + self.int32_t, + size=self.context.get_constant(types.uintp, num_kernel_args), + name="kernel_arg_ty_array", + ) def resolve_and_return_dpctl_type(self, ty): - """This function looks up the dpctl defined enum values from DPCTLKernelArgType. - """ + """This function looks up the dpctl defined enum values from DPCTLKernelArgType.""" val = None if ty == types.int32 or isinstance(ty, types.scalars.IntegerLiteral): @@ -136,20 +167,26 @@ def resolve_and_return_dpctl_type(self, ty): else: raise NotImplementedError - assert(val != None) + assert val != None return val - def form_kernel_arg_and_arg_ty(self, val, ty): - kernel_arg_dst = self.builder.gep(self.kernel_arg_array, [self.context.get_constant(types.int32, self.cur_arg)]) - kernel_arg_ty_dst = self.builder.gep(self.kernel_arg_ty_array, [self.context.get_constant(types.int32, self.cur_arg)]) + kernel_arg_dst = self.builder.gep( + self.kernel_arg_array, + [self.context.get_constant(types.int32, self.cur_arg)], + ) + kernel_arg_ty_dst = self.builder.gep( + self.kernel_arg_ty_array, + [self.context.get_constant(types.int32, self.cur_arg)], + ) self.cur_arg += 1 self.builder.store(val, kernel_arg_dst) self.builder.store(ty, kernel_arg_ty_dst) - - def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays): + def process_kernel_arg( + self, var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays + ): if isinstance(arg_type, types.npytypes.Array): if llvm_arg is None: raise NotImplementedError(arg_type, var) @@ -157,50 +194,83 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m storage = cgutils.alloca_once(self.builder, self.int64_t) self.builder.store(self.context.get_constant(types.int64, 0), storage) ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(storage, self.void_ptr_t), ty + ) storage = cgutils.alloca_once(self.builder, self.int64_t) self.builder.store(self.context.get_constant(types.int64, 0), storage) ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(storage, self.void_ptr_t), ty) - + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(storage, self.void_ptr_t), ty + ) # Handle array size - array_size_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 2)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(array_size_member, self.void_ptr_t), ty) + array_size_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 2), + ], + ) + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(array_size_member, self.void_ptr_t), ty + ) # Handle itemsize - item_size_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 3)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(item_size_member, self.void_ptr_t), ty) + item_size_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 3), + ], + ) + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(item_size_member, self.void_ptr_t), ty + ) # Calculate total buffer size - total_size = cgutils.alloca_once(self.builder, self.intp_t, - size=self.one, name="total_size" + str(self.cur_arg)) - self.builder.store(self.builder.sext(self.builder.mul(self.builder.load(array_size_member), - self.builder.load(item_size_member)), self.intp_t), total_size) + total_size = cgutils.alloca_once( + self.builder, + self.intp_t, + size=self.one, + name="total_size" + str(self.cur_arg), + ) + self.builder.store( + self.builder.sext( + self.builder.mul( + self.builder.load(array_size_member), + self.builder.load(item_size_member), + ), + self.intp_t, + ), + total_size, + ) # Handle data - data_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), self.context.get_constant(types.int32, 4)]) + data_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 4), + ], + ) buffer_name = "buffer_ptr" + str(self.cur_arg) - buffer_ptr = cgutils.alloca_once(self.builder, self.void_ptr_t, - name=buffer_name) + buffer_ptr = cgutils.alloca_once( + self.builder, self.void_ptr_t, name=buffer_name + ) - - args = [self.builder.load(total_size), - self.builder.load(self.sycl_queue_val)] + args = [ + self.builder.load(total_size), + self.builder.load(self.sycl_queue_val), + ] self.builder.store(self.builder.call(self.usm_shared, args), buffer_ptr) - # names are replaces usig legalize names, we have to do the same for them to match legal_names = legalize_names([var]) @@ -211,46 +281,70 @@ def process_kernel_arg(self, var, llvm_arg, arg_type, gu_sig, val_type, index, m # We really need to detect when an array needs to be copied over if index < self.num_inputs: - args = [self.builder.load(self.sycl_queue_val), - self.builder.load(buffer_ptr), - self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t), - self.builder.load(total_size)] + args = [ + self.builder.load(self.sycl_queue_val), + self.builder.load(buffer_ptr), + self.builder.bitcast( + self.builder.load(data_member), self.void_ptr_t + ), + self.builder.load(total_size), + ] self.builder.call(self.queue_memcpy, args) - - ty = self.resolve_and_return_dpctl_type(types.voidptr) + ty = self.resolve_and_return_dpctl_type(types.voidptr) self.form_kernel_arg_and_arg_ty(self.builder.load(buffer_ptr), ty) # Handle shape - shape_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, 5)]) + shape_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 5), + ], + ) for this_dim in range(arg_type.ndim): - shape_entry = self.builder.gep(shape_member, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, this_dim)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(shape_entry, self.void_ptr_t), ty) - + shape_entry = self.builder.gep( + shape_member, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, this_dim), + ], + ) + + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(shape_entry, self.void_ptr_t), ty + ) # Handle strides - stride_member = self.builder.gep(llvm_arg, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, 6)]) + stride_member = self.builder.gep( + llvm_arg, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, 6), + ], + ) for this_stride in range(arg_type.ndim): - stride_entry = self.builder.gep(stride_member, - [self.context.get_constant(types.int32, 0), - self.context.get_constant(types.int32, this_stride)]) - - ty = self.resolve_and_return_dpctl_type(types.int64) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(stride_entry, self.void_ptr_t), ty) + stride_entry = self.builder.gep( + stride_member, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, this_stride), + ], + ) + + ty = self.resolve_and_return_dpctl_type(types.int64) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(stride_entry, self.void_ptr_t), ty + ) else: - ty = self.resolve_and_return_dpctl_type(arg_type) - self.form_kernel_arg_and_arg_ty(self.builder.bitcast(llvm_arg, self.void_ptr_t), ty) + ty = self.resolve_and_return_dpctl_type(arg_type) + self.form_kernel_arg_and_arg_ty( + self.builder.bitcast(llvm_arg, self.void_ptr_t), ty + ) def enqueue_kernel_and_read_back(self, loop_ranges): # the assumption is loop_ranges will always be less than or equal to 3 dimensions @@ -258,8 +352,11 @@ def enqueue_kernel_and_read_back(self, loop_ranges): # form the global range global_range = cgutils.alloca_once( - self.builder, self.uintp_t, - size=self.context.get_constant(types.uintp, num_dim), name="global_range") + self.builder, + self.uintp_t, + size=self.context.get_constant(types.uintp, num_dim), + name="global_range", + ) for i in range(num_dim): start, stop, step = loop_ranges[i] @@ -267,20 +364,28 @@ def enqueue_kernel_and_read_back(self, loop_ranges): stop = self.builder.sext(stop, self.one_type) # we reverse the global range to account for how sycl and opencl range differs - self.builder.store(stop, - self.builder.gep(global_range, [self.context.get_constant(types.uintp, (num_dim-1)-i)])) - - - args = [self.builder.inttoptr(self.context.get_constant(types.uintp, self.kernel_addr), self.void_ptr_t), - self.builder.load(self.sycl_queue_val), - self.kernel_arg_array, - self.kernel_arg_ty_array, - self.context.get_constant(types.uintp, self.total_kernel_args), - self.builder.bitcast(global_range, self.intp_ptr_t), - self.context.get_constant(types.uintp, num_dim), - self.builder.bitcast(self.null_ptr, self.void_ptr_t), - self.context.get_constant(types.uintp, 0) - ] + self.builder.store( + stop, + self.builder.gep( + global_range, + [self.context.get_constant(types.uintp, (num_dim - 1) - i)], + ), + ) + + args = [ + self.builder.inttoptr( + self.context.get_constant(types.uintp, self.kernel_addr), + self.void_ptr_t, + ), + self.builder.load(self.sycl_queue_val), + self.kernel_arg_array, + self.kernel_arg_ty_array, + self.context.get_constant(types.uintp, self.total_kernel_args), + self.builder.bitcast(global_range, self.intp_ptr_t), + self.context.get_constant(types.uintp, num_dim), + self.builder.bitcast(self.null_ptr, self.void_ptr_t), + self.context.get_constant(types.uintp, 0), + ] self.builder.call(self.submit_range, args) self.builder.call(self.queue_wait, [self.builder.load(self.sycl_queue_val)]) @@ -288,14 +393,22 @@ def enqueue_kernel_and_read_back(self, loop_ranges): # read buffers back to host for write_buff in self.write_buffs: buffer_ptr, total_size, data_member = write_buff - args = [self.builder.load(self.sycl_queue_val), - self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t), - self.builder.load(buffer_ptr), - self.builder.load(total_size)] + args = [ + self.builder.load(self.sycl_queue_val), + self.builder.bitcast(self.builder.load(data_member), self.void_ptr_t), + self.builder.load(buffer_ptr), + self.builder.load(total_size), + ] self.builder.call(self.queue_memcpy, args) - self.builder.call(self.usm_free, [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)]) + self.builder.call( + self.usm_free, + [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)], + ) for read_buff in self.read_only_buffs: buffer_ptr, total_size, data_member = read_buff - self.builder.call(self.usm_free, [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)]) + self.builder.call( + self.usm_free, + [self.builder.load(buffer_ptr), self.builder.load(self.sycl_queue_val)], + ) diff --git a/numba_dppy/dppy_lowerer.py b/numba_dppy/dppy_lowerer.py index 3040362592..420414ec2f 100644 --- a/numba_dppy/dppy_lowerer.py +++ b/numba_dppy/dppy_lowerer.py @@ -9,28 +9,29 @@ import numpy as np import numba -from numba.core import (compiler, ir, types, sigutils, lowering, - funcdesc, config) +from numba.core import compiler, ir, types, sigutils, lowering, funcdesc, config from numba.parfors import parfor import numba_dppy, numba_dppy as dppy -from numba.core.ir_utils import (add_offset_to_labels, - replace_var_names, - remove_dels, - legalize_names, - mk_unique_var, - rename_labels, - get_name_var_table, - visit_vars_inner, - guard, - find_callname, - remove_dead, - get_call_table, - is_pure, - build_definitions, - get_np_ufunc_typ, - get_unused_var_name, - find_potential_aliases, - is_const_call) +from numba.core.ir_utils import ( + add_offset_to_labels, + replace_var_names, + remove_dels, + legalize_names, + mk_unique_var, + rename_labels, + get_name_var_table, + visit_vars_inner, + guard, + find_callname, + remove_dead, + get_call_table, + is_pure, + build_definitions, + get_np_ufunc_typ, + get_unused_var_name, + find_potential_aliases, + is_const_call, +) from numba.core.typing import signature @@ -47,9 +48,9 @@ def _print_block(block): for i, inst in enumerate(block.body): print(" ", i, inst) + def _print_body(body_dict): - '''Pretty-print a set of IR blocks. - ''' + """Pretty-print a set of IR blocks.""" for label, block in body_dict.items(): print("label: ", label) _print_block(block) @@ -61,9 +62,9 @@ def _print_body(body_dict): # through OpenCL and generate for loops for the remaining # dimensions def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): - gufunc_txt = "" + gufunc_txt = "" global_id_dim = 0 - for_loop_dim = parfor_dim + for_loop_dim = parfor_dim if parfor_dim > 3: global_id_dim = 3 @@ -71,9 +72,14 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): global_id_dim = parfor_dim for eachdim in range(global_id_dim): - gufunc_txt += (" " + legal_loop_indices[eachdim] + " = " - + "dppy.get_global_id(" + str(eachdim) + ")\n") - + gufunc_txt += ( + " " + + legal_loop_indices[eachdim] + + " = " + + "dppy.get_global_id(" + + str(eachdim) + + ")\n" + ) for eachdim in range(global_id_dim, for_loop_dim): for indent in range(1 + (eachdim - global_id_dim)): @@ -82,11 +88,15 @@ def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): start, stop, step = loop_ranges[eachdim] start = param_dict.get(str(start), start) stop = param_dict.get(str(stop), stop) - gufunc_txt += ("for " + - legal_loop_indices[eachdim] + - " in range(" + str(start) + - ", " + str(stop) + - " + 1):\n") + gufunc_txt += ( + "for " + + legal_loop_indices[eachdim] + + " in range(" + + str(start) + + ", " + + str(stop) + + " + 1):\n" + ) for eachdim in range(global_id_dim, for_loop_dim): for indent in range(1 + (eachdim - global_id_dim)): @@ -114,17 +124,18 @@ def _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap): strconsttyp = types.StringLiteral(strval) lhs = ir.Var(scope, mk_unique_var("str_const"), loc) - assign_lhs = ir.Assign(value=ir.Const(value=strval, loc=loc), - target=lhs, loc=loc) + assign_lhs = ir.Assign( + value=ir.Const(value=strval, loc=loc), target=lhs, loc=loc + ) typemap[lhs.name] = strconsttyp new_block.append(assign_lhs) # Make print node - print_node = ir.Print(args=[lhs, inst.target], vararg=None, - loc=loc) + print_node = ir.Print(args=[lhs, inst.target], vararg=None, loc=loc) new_block.append(print_node) - sig = numba.typing.signature(types.none, typemap[lhs.name], - typemap[inst.target.name]) + sig = numba.typing.signature( + types.none, typemap[lhs.name], typemap[inst.target.name] + ) lowerer.fndesc.calltypes[print_node] = sig loop_body[label] = new_block @@ -134,33 +145,36 @@ def replace_var_with_array_in_block(vars, block, typemap, calltypes): for inst in block.body: if isinstance(inst, ir.Assign) and inst.target.name in vars: const_node = ir.Const(0, inst.loc) - const_var = ir.Var(inst.target.scope, mk_unique_var("$const_ind_0"), - inst.loc) + const_var = ir.Var( + inst.target.scope, mk_unique_var("$const_ind_0"), inst.loc + ) typemap[const_var.name] = types.uintp const_assign = ir.Assign(const_node, const_var, inst.loc) new_block.append(const_assign) - setitem_node = ir.SetItem(inst.target, const_var, inst.value, - inst.loc) + setitem_node = ir.SetItem(inst.target, const_var, inst.value, inst.loc) calltypes[setitem_node] = signature( - types.none, types.npytypes.Array(typemap[inst.target.name], 1, - "C"), types.intp, - typemap[inst.target.name]) + types.none, + types.npytypes.Array(typemap[inst.target.name], 1, "C"), + types.intp, + typemap[inst.target.name], + ) new_block.append(setitem_node) continue elif isinstance(inst, parfor.Parfor): - replace_var_with_array_internal(vars, {0: inst.init_block}, - typemap, calltypes) - replace_var_with_array_internal(vars, inst.loop_body, - typemap, calltypes) + replace_var_with_array_internal( + vars, {0: inst.init_block}, typemap, calltypes + ) + replace_var_with_array_internal(vars, inst.loop_body, typemap, calltypes) new_block.append(inst) return new_block + def replace_var_with_array_internal(vars, loop_body, typemap, calltypes): for label, block in loop_body.items(): - block.body = replace_var_with_array_in_block(vars, block, typemap, - calltypes) + block.body = replace_var_with_array_in_block(vars, block, typemap, calltypes) + def replace_var_with_array(vars, loop_body, typemap, calltypes): replace_var_with_array_internal(vars, loop_body, typemap, calltypes) @@ -178,17 +192,18 @@ def wrap_loop_body(loop_body): blocks[last_label].body.append(ir.Jump(first_label, loc)) return blocks + def unwrap_loop_body(loop_body): last_label = max(loop_body.keys()) loop_body[last_label].body = loop_body[last_label].body[:-1] def legalize_names_with_typemap(names, typemap): - """ We use ir_utils.legalize_names to replace internal IR variable names - containing illegal characters (e.g. period) with a legal character - (underscore) so as to create legal variable names. - The original variable names are in the typemap so we also - need to add the legalized name to the typemap as well. + """We use ir_utils.legalize_names to replace internal IR variable names + containing illegal characters (e.g. period) with a legal character + (underscore) so as to create legal variable names. + The original variable names are in the typemap so we also + need to add the legalized name to the typemap as well. """ outdict = legalize_names(names) # For each pair in the dict of legalized names... @@ -206,6 +221,7 @@ def to_scalar_from_0d(x): return x.dtype return x + def find_setitems_block(setitems, block, typemap): for inst in block.body: if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): @@ -214,15 +230,17 @@ def find_setitems_block(setitems, block, typemap): find_setitems_block(setitems, inst.init_block, typemap) find_setitems_body(setitems, inst.loop_body, typemap) + def find_setitems_body(setitems, loop_body, typemap): """ - Find the arrays that are written into (goes into setitems) + Find the arrays that are written into (goes into setitems) """ for label, block in loop_body.items(): find_setitems_block(setitems, block, typemap) + def _create_gufunc_for_regular_parfor(): - #TODO + # TODO pass @@ -231,18 +249,19 @@ def _create_gufunc_for_reduction_parfor(): def _create_gufunc_for_parfor_body( - lowerer, - parfor, - typemap, - typingctx, - targetctx, - flags, - loop_ranges, - locals, - has_aliases, - index_var_typ, - races): - ''' + lowerer, + parfor, + typemap, + typingctx, + targetctx, + flags, + loop_ranges, + locals, + has_aliases, + index_var_typ, + races, +): + """ Takes a parfor and creates a gufunc function for its body. There are two parts to this function: @@ -257,7 +276,7 @@ def _create_gufunc_for_parfor_body( IR retrieved with run_frontend. The IR is scanned for the sentinel assignment where that basic block is split and the IR for the parfor body inserted. - ''' + """ loc = parfor.init_block.loc @@ -288,25 +307,22 @@ def _create_gufunc_for_parfor_body( typemap = lowerer.fndesc.typemap parfor_redvars, parfor_reddict = numba.parfors.parfor.get_parfor_reductions( - lowerer.func_ir, - parfor, - parfor_params, - lowerer.fndesc.calltypes) + lowerer.func_ir, parfor, parfor_params, lowerer.fndesc.calltypes + ) has_reduction = False if len(parfor_redvars) == 0 else True if has_reduction: _create_gufunc_for_reduction_parfor() # Compute just the parfor inputs as a set difference. - parfor_inputs = sorted( - list( - set(parfor_params) - - set(parfor_outputs))) + parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs))) for race in races: - msg = ("Variable %s used in parallel loop may be written " - "to simultaneously by multiple workers and may result " - "in non-deterministic or unintended results." % race) + msg = ( + "Variable %s used in parallel loop may be written " + "to simultaneously by multiple workers and may result " + "in non-deterministic or unintended results." % race + ) warnings.warn(NumbaParallelSafetyWarning(msg, loc)) replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes) @@ -321,15 +337,13 @@ def _create_gufunc_for_parfor_body( def addrspace_from(params, def_addr): addrspaces = [] for p in params: - if isinstance(to_scalar_from_0d(typemap[p]), - types.npytypes.Array): + if isinstance(to_scalar_from_0d(typemap[p]), types.npytypes.Array): addrspaces.append(def_addr) else: addrspaces.append(None) return addrspaces - addrspaces = addrspace_from(parfor_params, - numba_dppy.target.SPIR_GLOBAL_ADDRSPACE) + addrspaces = addrspace_from(parfor_params, numba_dppy.target.SPIR_GLOBAL_ADDRSPACE) if config.DEBUG_ARRAY_OPT >= 1: print("parfor_params = ", parfor_params, type(parfor_params)) @@ -351,8 +365,7 @@ def addrspace_from(params, def_addr): if config.DEBUG_ARRAY_OPT >= 1: print("ind_dict = ", sorted(ind_dict.items()), type(ind_dict)) - print("legal_loop_indices = ",legal_loop_indices, - type(legal_loop_indices)) + print("legal_loop_indices = ", legal_loop_indices, type(legal_loop_indices)) for pd in parfor_params: print("pd = ", pd) @@ -365,14 +378,15 @@ def addrspace_from(params, def_addr): # Calculate types of args passed to gufunc. func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] - assert(len(param_types_addrspaces) == len(addrspaces)) + assert len(param_types_addrspaces) == len(addrspaces) for i in range(len(param_types_addrspaces)): if addrspaces[i] is not None: - #print("before:", id(param_types_addrspaces[i])) - assert(isinstance(param_types_addrspaces[i], types.npytypes.Array)) - param_types_addrspaces[i] = (param_types_addrspaces[i] - .copy(addrspace=addrspaces[i])) - #print("setting param type", i, param_types[i], id(param_types[i]), + # print("before:", id(param_types_addrspaces[i])) + assert isinstance(param_types_addrspaces[i], types.npytypes.Array) + param_types_addrspaces[i] = param_types_addrspaces[i].copy( + addrspace=addrspaces[i] + ) + # print("setting param type", i, param_types[i], id(param_types[i]), # "to addrspace", param_types_addrspaces[i].addrspace) def print_arg_with_addrspaces(args): @@ -396,10 +410,12 @@ def print_arg_with_addrspaces(args): parfor_params = [] ascontig = False for pindex in range(len(parfor_params_orig)): - if (ascontig and - pindex < len(parfor_inputs) and - isinstance(param_types[pindex], types.npytypes.Array)): - parfor_params.append(parfor_params_orig[pindex]+"param") + if ( + ascontig + and pindex < len(parfor_inputs) + and isinstance(param_types[pindex], types.npytypes.Array) + ): + parfor_params.append(parfor_params_orig[pindex] + "param") else: parfor_params.append(parfor_params_orig[pindex]) @@ -409,11 +425,7 @@ def print_arg_with_addrspaces(args): sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) if config.DEBUG_ARRAY_OPT >= 1: - print( - "legal parfor_params = ", - parfor_params, - type(parfor_params)) - + print("legal parfor_params = ", parfor_params, type(parfor_params)) # Determine the unique names of the scheduling and gufunc functions. gufunc_name = "__numba_parfor_gufunc_%s" % (parfor.id) @@ -428,9 +440,9 @@ def print_arg_with_addrspaces(args): gufunc_txt += "def " + gufunc_name gufunc_txt += "(" + (", ".join(parfor_params)) + "):\n" - - gufunc_txt += _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, - param_dict) + gufunc_txt += _schedule_loop( + parfor_dim, legal_loop_indices, loop_ranges, param_dict + ) # Add the sentinel assignment so that we can find the loop body position # in the IR. @@ -463,8 +475,7 @@ def print_arg_with_addrspaces(args): # rename all variables in gufunc_ir afresh var_table = get_name_var_table(gufunc_ir.blocks) new_var_dict = {} - reserved_names = [sentinel_name] + \ - list(param_dict.values()) + legal_loop_indices + reserved_names = [sentinel_name] + list(param_dict.values()) + legal_loop_indices for name, var in var_table.items(): if not (name in reserved_names): new_var_dict[name] = mk_unique_var(name) @@ -481,10 +492,8 @@ def print_arg_with_addrspaces(args): if config.DEBUG_ARRAY_OPT: print( - "gufunc_param_types = ", - type(gufunc_param_types), - "\n", - gufunc_param_types) + "gufunc_param_types = ", type(gufunc_param_types), "\n", gufunc_param_types + ) gufunc_stub_last_label = max(gufunc_ir.blocks.keys()) + 1 @@ -503,7 +512,7 @@ def print_arg_with_addrspaces(args): _print_body(loop_body) wrapped_blocks = wrap_loop_body(loop_body) - #hoisted, not_hoisted = hoist(parfor_params, loop_body, + # hoisted, not_hoisted = hoist(parfor_params, loop_body, # typemap, wrapped_blocks) setitems = set() find_setitems_body(setitems, loop_body, typemap) @@ -516,11 +525,12 @@ def print_arg_with_addrspaces(args): unwrap_loop_body(loop_body) # store hoisted into diagnostics - diagnostics = lowerer.metadata['parfor_diagnostics'] - diagnostics.hoist_info[parfor.id] = {'hoisted': hoisted, - 'not_hoisted': not_hoisted} + diagnostics = lowerer.metadata["parfor_diagnostics"] + diagnostics.hoist_info[parfor.id] = {"hoisted": hoisted, "not_hoisted": not_hoisted} - lowerer.metadata['parfor_diagnostics'].extra_info[str(parfor.id)] = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) + lowerer.metadata["parfor_diagnostics"].extra_info[str(parfor.id)] = str( + dpctl.get_current_queue().get_sycl_device().get_device_name() + ) if config.DEBUG_ARRAY_OPT: print("After hoisting") @@ -529,8 +539,7 @@ def print_arg_with_addrspaces(args): # Search all the block in the gufunc outline for the sentinel assignment. for label, block in gufunc_ir.blocks.items(): for i, inst in enumerate(block.body): - if (isinstance(inst, ir.Assign) and - inst.target.name == sentinel_name): + if isinstance(inst, ir.Assign) and inst.target.name == sentinel_name: # We found the sentinel assignment. loc = inst.loc scope = block.scope @@ -541,7 +550,7 @@ def print_arg_with_addrspaces(args): prev_block.body = block.body[:i] # The current block is used for statements after the sentinel. - block.body = block.body[i + 1:] + block.body = block.body[i + 1 :] # But the current block gets a new label. body_first_label = min(loop_body.keys()) @@ -557,8 +566,7 @@ def print_arg_with_addrspaces(args): gufunc_ir.blocks[label] = prev_block # Add a jump from the last parfor body block to the block # containing statements after the sentinel. - gufunc_ir.blocks[body_last_label].append( - ir.Jump(new_label, loc)) + gufunc_ir.blocks[body_last_label].append(ir.Jump(new_label, loc)) break else: continue @@ -598,22 +606,21 @@ def print_arg_with_addrspaces(args): sys.stdout.flush() if config.DEBUG_ARRAY_OPT: - print('before DUFunc inlining'.center(80, '-')) + print("before DUFunc inlining".center(80, "-")) gufunc_ir.dump() # Inlining all DUFuncs - dufunc_inliner(gufunc_ir, lowerer.fndesc.calltypes, typemap, - lowerer.context.typing_context) + dufunc_inliner( + gufunc_ir, lowerer.fndesc.calltypes, typemap, lowerer.context.typing_context + ) if config.DEBUG_ARRAY_OPT: - print('after DUFunc inline'.center(80, '-')) + print("after DUFunc inline".center(80, "-")) gufunc_ir.dump() kernel_func = numba_dppy.compiler.compile_kernel_parfor( - dpctl.get_current_queue(), - gufunc_ir, - gufunc_param_types, - param_types_addrspaces) + dpctl.get_current_queue(), gufunc_ir, gufunc_param_types, param_types_addrspaces + ) flags.noalias = old_alias @@ -672,8 +679,9 @@ def _lower_parfor_gufunc(lowerer, parfor): alias_map = {} arg_aliases = {} - numba.parfors.parfor.find_potential_aliases_parfor(parfor, parfor.params, typemap, - lowerer.func_ir, alias_map, arg_aliases) + numba.parfors.parfor.find_potential_aliases_parfor( + parfor, parfor.params, typemap, lowerer.func_ir, alias_map, arg_aliases + ) if config.DEBUG_ARRAY_OPT: print("alias_map", alias_map) print("arg_aliases", arg_aliases) @@ -684,12 +692,12 @@ def _lower_parfor_gufunc(lowerer, parfor): assert parfor.params != None parfor_output_arrays = numba.parfors.parfor.get_parfor_outputs( - parfor, parfor.params) - + parfor, parfor.params + ) # compile parfor body as a separate function to be used with GUFuncWrapper flags = copy.copy(parfor.flags) - flags.set('error_model', 'numpy') + flags.set("error_model", "numpy") # Can't get here unless flags.set('auto_parallel', ParallelOptions(True)) index_var_typ = typemap[parfor.loop_nests[0].index_variable.name] @@ -702,8 +710,13 @@ def _lower_parfor_gufunc(lowerer, parfor): loop_ranges = [(l.start, l.stop, l.step) for l in parfor.loop_nests] try: - func, func_args, func_sig, func_arg_types, modified_arrays =( - _create_gufunc_for_parfor_body( + ( + func, + func_args, + func_sig, + func_arg_types, + modified_arrays, + ) = _create_gufunc_for_parfor_body( lowerer, parfor, typemap, @@ -714,7 +727,8 @@ def _lower_parfor_gufunc(lowerer, parfor): {}, bool(alias_map), index_var_typ, - parfor.races)) + parfor.races, + ) finally: numba.parfors.parfor.sequential_parfor_lowering = False @@ -735,12 +749,8 @@ def _lower_parfor_gufunc(lowerer, parfor): print("loop_ranges = ", loop_ranges) gu_signature = _create_shape_signature( - parfor.get_shape_classes, - num_inputs, - func_args, - func_sig, - parfor.races, - typemap) + parfor.get_shape_classes, num_inputs, func_args, func_sig, parfor.races, typemap + ) generate_dppy_host_wrapper( lowerer, @@ -754,7 +764,8 @@ def _lower_parfor_gufunc(lowerer, parfor): parfor.init_block, index_var_typ, parfor.races, - modified_arrays) + modified_arrays, + ) if config.DEBUG_ARRAY_OPT: sys.stdout.flush() @@ -765,50 +776,52 @@ def _lower_parfor_gufunc(lowerer, parfor): def _create_shape_signature( - get_shape_classes, - num_inputs, - #num_reductions, - args, - func_sig, - races, - typemap): - '''Create shape signature for GUFunc - ''' + get_shape_classes, + num_inputs, + # num_reductions, + args, + func_sig, + races, + typemap, +): + """Create shape signature for GUFunc""" if config.DEBUG_ARRAY_OPT: print("_create_shape_signature", num_inputs, args) arg_start_print = 0 for i in args[arg_start_print:]: print("argument", i, type(i), get_shape_classes(i, typemap=typemap)) - #num_inouts = len(args) - num_reductions + # num_inouts = len(args) - num_reductions num_inouts = len(args) # maximum class number for array shapes - classes = [get_shape_classes(var, typemap=typemap) - if var not in races else (-1,) for var in args[1:]] + classes = [ + get_shape_classes(var, typemap=typemap) if var not in races else (-1,) + for var in args[1:] + ] class_set = set() for _class in classes: if _class: for i in _class: class_set.add(i) max_class = max(class_set) + 1 if class_set else 0 - classes.insert(0, (max_class,)) # force set the class of 'sched' argument + classes.insert(0, (max_class,)) # force set the class of 'sched' argument class_set.add(max_class) class_map = {} # TODO: use prefix + class number instead of single char - alphabet = ord('a') + alphabet = ord("a") for n in class_set: - if n >= 0: - class_map[n] = chr(alphabet) - alphabet += 1 + if n >= 0: + class_map[n] = chr(alphabet) + alphabet += 1 - alpha_dict = {'latest_alpha' : alphabet} + alpha_dict = {"latest_alpha": alphabet} def bump_alpha(c, class_map): if c >= 0: return class_map[c] else: - alpha_dict['latest_alpha'] += 1 - return chr(alpha_dict['latest_alpha']) + alpha_dict["latest_alpha"] += 1 + return chr(alpha_dict["latest_alpha"]) gu_sin = [] gu_sout = [] @@ -833,21 +846,24 @@ def bump_alpha(c, class_map): # Keep all the dppy kernels and programs created alive indefinitely. keep_alive_kernels = [] -def generate_dppy_host_wrapper(lowerer, - cres, - gu_signature, - outer_sig, - expr_args, - num_inputs, - expr_arg_types, - loop_ranges, - init_block, - index_var_typ, - races, - modified_arrays): - ''' + +def generate_dppy_host_wrapper( + lowerer, + cres, + gu_signature, + outer_sig, + expr_args, + num_inputs, + expr_arg_types, + loop_ranges, + init_block, + index_var_typ, + races, + modified_arrays, +): + """ Adds the call to the gufunc function from the main function. - ''' + """ context = lowerer.context builder = lowerer.builder sin, sout = gu_signature @@ -856,8 +872,13 @@ def generate_dppy_host_wrapper(lowerer, if config.DEBUG_ARRAY_OPT: print("generate_dppy_host_wrapper") print("args = ", expr_args) - print("outer_sig = ", outer_sig.args, outer_sig.return_type, - outer_sig.recvr, outer_sig.pysig) + print( + "outer_sig = ", + outer_sig.args, + outer_sig.return_type, + outer_sig.recvr, + outer_sig.pysig, + ) print("loop_ranges = ", loop_ranges) print("expr_args", expr_args) print("expr_arg_types", expr_arg_types) @@ -866,13 +887,13 @@ def generate_dppy_host_wrapper(lowerer, print("sout", sout) print("cres", cres, type(cres)) print("modified_arrays", modified_arrays) -# print("cres.library", cres.library, type(cres.library)) -# print("cres.fndesc", cres.fndesc, type(cres.fndesc)) - + # print("cres.library", cres.library, type(cres.library)) + # print("cres.fndesc", cres.fndesc, type(cres.fndesc)) # get dppy_cpu_portion_lowerer object dppy_cpu_lowerer = dppy_call_gen.DPPYHostFunctionCallsGenerator( - lowerer, cres, num_inputs) + lowerer, cres, num_inputs + ) # Compute number of args ------------------------------------------------ num_expanded_args = 0 @@ -911,8 +932,7 @@ def val_type_or_none(context, lowerer, x): return None all_llvm_args = [getvar_or_none(lowerer, x) for x in expr_args[:ninouts]] - all_val_types = ([val_type_or_none(context, lowerer, x) - for x in expr_args[:ninouts]]) + all_val_types = [val_type_or_none(context, lowerer, x) for x in expr_args[:ninouts]] all_args = [loadvar_or_none(lowerer, x) for x in expr_args[:ninouts]] keep_alive_kernels.append(cres) @@ -922,19 +942,37 @@ def val_type_or_none(context, lowerer, x): # the enqueue function. Put each part of each argument into # kernel_arg_array. for var, llvm_arg, arg_type, gu_sig, val_type, index in zip( - expr_args, all_llvm_args, expr_arg_types, sin + sout, all_val_types, - range(len(expr_args))): + expr_args, + all_llvm_args, + expr_arg_types, + sin + sout, + all_val_types, + range(len(expr_args)), + ): if config.DEBUG_ARRAY_OPT: - print("var:", var, type(var), - "\n\tllvm_arg:", llvm_arg, type(llvm_arg), - "\n\targ_type:", arg_type, type(arg_type), - "\n\tgu_sig:", gu_sig, - "\n\tval_type:", val_type, type(val_type), - "\n\tindex:", index) - - dppy_cpu_lowerer.process_kernel_arg(var, llvm_arg, arg_type, gu_sig, - val_type, index, modified_arrays) + print( + "var:", + var, + type(var), + "\n\tllvm_arg:", + llvm_arg, + type(llvm_arg), + "\n\targ_type:", + arg_type, + type(arg_type), + "\n\tgu_sig:", + gu_sig, + "\n\tval_type:", + val_type, + type(val_type), + "\n\tindex:", + index, + ) + + dppy_cpu_lowerer.process_kernel_arg( + var, llvm_arg, arg_type, gu_sig, val_type, index, modified_arrays + ) # ----------------------------------------------------------------------- # loadvars for loop_ranges @@ -949,7 +987,7 @@ def load_range(v): start, stop, step = loop_ranges[i] start = load_range(start) stop = load_range(stop) - assert(step == 1) # We do not support loop steps other than 1 + assert step == 1 # We do not support loop steps other than 1 step = load_range(step) loop_ranges[i] = (start, stop, step) @@ -985,10 +1023,26 @@ def relatively_deep_copy(obj, memo): from numba.core.types.abstract import Type # objects which shouldn't or can't be copied and it's ok not to copy it. - if isinstance(obj, (FunctionIdentity, _DispatcherBase, Function, Type, - Dispatcher, ModuleType, Signature, - DPPYFunctionTemplate, CompileResult, DUFunc, _CFuncPtr, - type, str, bool, type(None))): + if isinstance( + obj, + ( + FunctionIdentity, + _DispatcherBase, + Function, + Type, + Dispatcher, + ModuleType, + Signature, + DPPYFunctionTemplate, + CompileResult, + DUFunc, + _CFuncPtr, + type, + str, + bool, + type(None), + ), + ): return obj from numba.core.ir import Global, FreeVar @@ -997,15 +1051,24 @@ def relatively_deep_copy(obj, memo): from numba.core.funcdesc import FunctionDescriptor if isinstance(obj, FunctionDescriptor): - cpy = FunctionDescriptor(native=obj.native, modname=obj.modname, qualname=obj.qualname, - unique_name=obj.unique_name, doc=obj.doc, - typemap=relatively_deep_copy(obj.typemap, memo), - restype=obj.restype, - calltypes=relatively_deep_copy(obj.calltypes, memo), - args=obj.args, kws=obj.kws, mangler=None, - argtypes=relatively_deep_copy(obj.argtypes, memo), - inline=obj.inline, noalias=obj.noalias, env_name=obj.env_name, - global_dict=obj.global_dict) + cpy = FunctionDescriptor( + native=obj.native, + modname=obj.modname, + qualname=obj.qualname, + unique_name=obj.unique_name, + doc=obj.doc, + typemap=relatively_deep_copy(obj.typemap, memo), + restype=obj.restype, + calltypes=relatively_deep_copy(obj.calltypes, memo), + args=obj.args, + kws=obj.kws, + mangler=None, + argtypes=relatively_deep_copy(obj.argtypes, memo), + inline=obj.inline, + noalias=obj.noalias, + env_name=obj.env_name, + global_dict=obj.global_dict, + ) # mangler parameter is not saved in FunctionDescriptor, but used to generated name. # So pass None as mangler parameter and then copy mangled_name by hands cpy.mangled_name = obj.mangled_name @@ -1025,13 +1088,15 @@ def relatively_deep_copy(obj, memo): # This means that copy of IR actually has a side effect on it. pp = PostProcessor(obj) pp.run() - cpy = FunctionIR(blocks=relatively_deep_copy(obj.blocks, memo), - is_generator=relatively_deep_copy(obj.is_generator, memo), - func_id=relatively_deep_copy(obj.func_id, memo), - loc=obj.loc, - definitions=relatively_deep_copy(obj._definitions, memo), - arg_count=obj.arg_count, - arg_names=relatively_deep_copy(obj.arg_names, memo)) + cpy = FunctionIR( + blocks=relatively_deep_copy(obj.blocks, memo), + is_generator=relatively_deep_copy(obj.is_generator, memo), + func_id=relatively_deep_copy(obj.func_id, memo), + loc=obj.loc, + definitions=relatively_deep_copy(obj._definitions, memo), + arg_count=obj.arg_count, + arg_names=relatively_deep_copy(obj.arg_names, memo), + ) pp = PostProcessor(cpy) pp.run() @@ -1142,8 +1207,9 @@ def __init__(self, context, library, fndesc, func_ir, metadata=None): fndesc_cpu = relatively_deep_copy(fndesc, memo) func_ir_cpu = relatively_deep_copy(func_ir, memo) - - cpu_context = context.cpu_context if isinstance(context, DPPYTargetContext) else context + cpu_context = ( + context.cpu_context if isinstance(context, DPPYTargetContext) else context + ) self.gpu_lower = Lower(context, library, fndesc, func_ir, metadata) self.cpu_lower = Lower(cpu_context, library, fndesc_cpu, func_ir_cpu, metadata) @@ -1168,17 +1234,23 @@ def lower(self): lowering.lower_extensions[parfor.Parfor].append(lower_parfor_rollback) self.gpu_lower.lower() # if lower dont crash, and parfor_diagnostics is empty then it is kernel - if not self.gpu_lower.metadata['parfor_diagnostics'].extra_info: - str_name = str(dpctl.get_current_queue().get_sycl_device().get_device_name()) - self.gpu_lower.metadata['parfor_diagnostics'].extra_info["kernel"] = str_name + if not self.gpu_lower.metadata["parfor_diagnostics"].extra_info: + str_name = str( + dpctl.get_current_queue().get_sycl_device().get_device_name() + ) + self.gpu_lower.metadata["parfor_diagnostics"].extra_info[ + "kernel" + ] = str_name self.base_lower = self.gpu_lower lowering.lower_extensions[parfor.Parfor].pop() except Exception as e: if numba_dppy.compiler.DEBUG: print("Failed to lower parfor on DPPY-device. Due to:\n", e) lowering.lower_extensions[parfor.Parfor].pop() - if ((lowering.lower_extensions[parfor.Parfor][-1] == numba.parfors.parfor_lowering._lower_parfor_parallel) and - numba_dppy.config.FALLBACK_ON_CPU == 1): + if ( + lowering.lower_extensions[parfor.Parfor][-1] + == numba.parfors.parfor_lowering._lower_parfor_parallel + ) and numba_dppy.config.FALLBACK_ON_CPU == 1: self.cpu_lower.lower() self.base_lower = self.cpu_lower else: diff --git a/numba_dppy/dppy_offload_dispatcher.py b/numba_dppy/dppy_offload_dispatcher.py index 0c5fe10f5e..58ca6d9729 100644 --- a/numba_dppy/dppy_offload_dispatcher.py +++ b/numba_dppy/dppy_offload_dispatcher.py @@ -6,18 +6,45 @@ class DppyOffloadDispatcher(dispatcher.Dispatcher): targetdescr = cpu_target - def __init__(self, py_func, locals={}, targetoptions={}, impl_kind='direct', pipeline_class=compiler.Compiler): + def __init__( + self, + py_func, + locals={}, + targetoptions={}, + impl_kind="direct", + pipeline_class=compiler.Compiler, + ): if dppy_config.dppy_present: from numba_dppy.compiler import DPPYCompiler - targetoptions['parallel'] = True - dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=DPPYCompiler) + + targetoptions["parallel"] = True + dispatcher.Dispatcher.__init__( + self, + py_func, + locals=locals, + targetoptions=targetoptions, + impl_kind=impl_kind, + pipeline_class=DPPYCompiler, + ) else: - print("---------------------------------------------------------------------") - print("WARNING : DPPY pipeline ignored. Ensure OpenCL drivers are installed.") - print("---------------------------------------------------------------------") - dispatcher.Dispatcher.__init__(self, py_func, locals=locals, - targetoptions=targetoptions, impl_kind=impl_kind, pipeline_class=pipeline_class) + print( + "---------------------------------------------------------------------" + ) + print( + "WARNING : DPPY pipeline ignored. Ensure OpenCL drivers are installed." + ) + print( + "---------------------------------------------------------------------" + ) + dispatcher.Dispatcher.__init__( + self, + py_func, + locals=locals, + targetoptions=targetoptions, + impl_kind=impl_kind, + pipeline_class=pipeline_class, + ) + -dispatcher_registry['__dppy_offload_gpu__'] = DppyOffloadDispatcher -dispatcher_registry['__dppy_offload_cpu__'] = DppyOffloadDispatcher +dispatcher_registry["__dppy_offload_gpu__"] = DppyOffloadDispatcher +dispatcher_registry["__dppy_offload_cpu__"] = DppyOffloadDispatcher diff --git a/numba_dppy/dppy_parfor_diagnostics.py b/numba_dppy/dppy_parfor_diagnostics.py index 50e19a1cb1..51b3747cac 100644 --- a/numba_dppy/dppy_parfor_diagnostics.py +++ b/numba_dppy/dppy_parfor_diagnostics.py @@ -14,14 +14,14 @@ def dump(self, level=1): if self.extra_info: parfors_simple = self.get_parfors_simple(False) all_lines = self.get_all_lines(parfors_simple) - print(' Auto-offloading '.center(_termwidth,'-')) + print(" Auto-offloading ".center(_termwidth, "-")) self.print_auto_offloading(all_lines) - if 'kernel' in self.extra_info.keys(): - print_wrapped("Device - '%s'" % self.extra_info['kernel']) - print(_termwidth * '-') + if "kernel" in self.extra_info.keys(): + print_wrapped("Device - '%s'" % self.extra_info["kernel"]) + print(_termwidth * "-") def print_auto_offloading(self, lines): - sword = '+--' + sword = "+--" fac = len(sword) summary = dict() @@ -30,26 +30,26 @@ def print_auto_offloading(self, lines): def print_nest(fadj_, nadj_, theroot, reported, region_id): def print_g(fadj_, nadj_, nroot, depth): for k in nadj_[nroot]: - msg = fac * depth * ' ' + '%s%s %s' % (sword, k, '(serial') + msg = fac * depth * " " + "%s%s %s" % (sword, k, "(serial") if nadj_[k] == []: fused = [] if fadj_[k] != [] and k not in reported: fused = sorted(self.reachable_nodes(fadj_, k)) msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) - msg += ')' + msg += ", ".join([str(x) for x in fused]) + msg += ")" reported.append(k) print_wrapped(msg) - summary[region_id]['fused'] += len(fused) + summary[region_id]["fused"] += len(fused) else: - print_wrapped(msg + ')') + print_wrapped(msg + ")") print_g(fadj_, nadj_, k, depth + 1) - summary[region_id]['serialized'] += 1 + summary[region_id]["serialized"] += 1 if nadj_[theroot] != []: print_wrapped("Parallel region %s:" % region_id) - print_wrapped('%s%s %s' % (sword, theroot, '(parallel)')) - summary[region_id] = {'root': theroot, 'fused': 0, 'serialized': 0} + print_wrapped("%s%s %s" % (sword, theroot, "(parallel)")) + summary[region_id] = {"root": theroot, "fused": 0, "serialized": 0} print_g(fadj_, nadj_, theroot, 1) print("\n") region_id = region_id + 1 @@ -57,15 +57,15 @@ def print_g(fadj_, nadj_, nroot, depth): def print_fuse(ty, pf_id, adj, depth, region_id): print_wrapped("Parallel region %s:" % region_id) - msg = fac * depth * ' ' + '%s%s %s' % (sword, pf_id, '(parallel') + msg = fac * depth * " " + "%s%s %s" % (sword, pf_id, "(parallel") fused = [] if adj[pf_id] != []: fused = sorted(self.reachable_nodes(adj, pf_id)) msg += ", fused with loop(s): " - msg += ', '.join([str(x) for x in fused]) + msg += ", ".join([str(x) for x in fused]) - summary[region_id] = {'root': pf_id, 'fused': len(fused), 'serialized': 0} - msg += ')' + summary[region_id] = {"root": pf_id, "fused": len(fused), "serialized": 0} + msg += ")" print_wrapped(msg) extra_info = self.extra_info.get(str(region_id)) if extra_info: @@ -78,10 +78,10 @@ def print_fuse(ty, pf_id, adj, depth, region_id): reported = [] for line, info in sorted(lines.items()): opt_ty, pf_id, adj = info - if opt_ty == 'fuse': + if opt_ty == "fuse": if pf_id not in reported: - region_id = print_fuse('f', pf_id, adj, 0, region_id) - elif opt_ty == 'nest': + region_id = print_fuse("f", pf_id, adj, 0, region_id) + elif opt_ty == "nest": region_id = print_nest(fadj, nadj, pf_id, reported, region_id) else: assert 0 @@ -89,18 +89,19 @@ def print_fuse(ty, pf_id, adj, depth, region_id): # print the summary of the fuse/serialize rewrite if summary: for k, v in sorted(summary.items()): - msg = ('\n \nParallel region %s (loop #%s) had %s ' - 'loop(s) fused') - root = v['root'] - fused = v['fused'] - serialized = v['serialized'] + msg = "\n \nParallel region %s (loop #%s) had %s " "loop(s) fused" + root = v["root"] + fused = v["fused"] + serialized = v["serialized"] if serialized != 0: - msg += (' and %s loop(s) ' - 'serialized as part of the larger ' - 'parallel loop (#%s).') + msg += ( + " and %s loop(s) " + "serialized as part of the larger " + "parallel loop (#%s)." + ) print_wrapped(msg % (k, root, fused, serialized, root)) else: - msg += '.' + msg += "." print_wrapped(msg % (k, root, fused)) else: print_wrapped("Parallel structure is already optimal.") diff --git a/numba_dppy/dppy_passbuilder.py b/numba_dppy/dppy_passbuilder.py index 994351d509..c6b2534a62 100644 --- a/numba_dppy/dppy_passbuilder.py +++ b/numba_dppy/dppy_passbuilder.py @@ -2,32 +2,51 @@ from numba.core.compiler_machinery import PassManager -from numba.core.untyped_passes import (ExtractByteCode, TranslateByteCode, FixupArgs, - IRProcessing, DeadBranchPrune, - RewriteSemanticConstants, InlineClosureLikes, - GenericRewrites, WithLifting, - InlineInlinables, FindLiterallyCalls, - MakeFunctionToJitFunction, - CanonicalizeLoopExit, CanonicalizeLoopEntry, - ReconstructSSA, - LiteralUnroll) - -from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes, - NopythonRewrites, PreParforPass, ParforPass, - DumpParforDiagnostics, IRLegalization, - InlineOverloads, PreLowerStripPhis) +from numba.core.untyped_passes import ( + ExtractByteCode, + TranslateByteCode, + FixupArgs, + IRProcessing, + DeadBranchPrune, + RewriteSemanticConstants, + InlineClosureLikes, + GenericRewrites, + WithLifting, + InlineInlinables, + FindLiterallyCalls, + MakeFunctionToJitFunction, + CanonicalizeLoopExit, + CanonicalizeLoopEntry, + ReconstructSSA, + LiteralUnroll, +) + +from numba.core.typed_passes import ( + NopythonTypeInference, + AnnotateTypes, + NopythonRewrites, + PreParforPass, + ParforPass, + DumpParforDiagnostics, + IRLegalization, + InlineOverloads, + PreLowerStripPhis, +) from .dppy_passes import ( - DPPYConstantSizeStaticLocalMemoryPass, - DPPYPreParforPass, - DPPYParforPass, - SpirvFriendlyLowering, - DPPYNoPythonBackend, - DPPYDumpParforDiagnostics - ) + DPPYConstantSizeStaticLocalMemoryPass, + DPPYPreParforPass, + DPPYParforPass, + SpirvFriendlyLowering, + DPPYNoPythonBackend, + DPPYDumpParforDiagnostics, +) + +from .rename_numpy_functions_pass import ( + DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions, +) -from .rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, - DPPYRewriteNdarrayFunctions) class DPPYPassBuilder(object): """ @@ -38,8 +57,7 @@ class DPPYPassBuilder(object): @staticmethod def default_numba_nopython_pipeline(state, pm): - """Adds the default set of NUMBA passes to the pass manager - """ + """Adds the default set of NUMBA passes to the pass manager""" if state.func_ir is None: pm.add_pass(TranslateByteCode, "analyzing bytecode") pm.add_pass(FixupArgs, "fix up args") @@ -47,14 +65,18 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(WithLifting, "Handle with contexts") # this pass rewrites name of NumPy functions we intend to overload - pm.add_pass(DPPYRewriteOverloadedNumPyFunctions, - "Rewrite name of Numpy functions to overload already overloaded function", + pm.add_pass( + DPPYRewriteOverloadedNumPyFunctions, + "Rewrite name of Numpy functions to overload already overloaded function", ) # Add pass to ensure when users are allocating static # constant memory the size is a constant and can not # come from a closure variable - pm.add_pass(DPPYConstantSizeStaticLocalMemoryPass, "dppy constant size for static local memory") + pm.add_pass( + DPPYConstantSizeStaticLocalMemoryPass, + "dppy constant size for static local memory", + ) # pre typing if not state.flags.no_rewrites: @@ -62,11 +84,11 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(DeadBranchPrune, "dead branch pruning") pm.add_pass(GenericRewrites, "nopython rewrites") - pm.add_pass(InlineClosureLikes, - "inline calls to locally defined closures") + pm.add_pass(InlineClosureLikes, "inline calls to locally defined closures") # convert any remaining closures into functions - pm.add_pass(MakeFunctionToJitFunction, - "convert make_function into JIT functions") + pm.add_pass( + MakeFunctionToJitFunction, "convert make_function into JIT functions" + ) # inline functions that have been determined as inlinable and rerun # branch pruning, this needs to be run after closures are inlined as # the IR repr of a closure masks call sites if an inlinable is called @@ -84,8 +106,9 @@ def default_numba_nopython_pipeline(state, pm): pm.add_pass(NopythonTypeInference, "nopython frontend") pm.add_pass(AnnotateTypes, "annotate types") - pm.add_pass(DPPYRewriteNdarrayFunctions, - "Rewrite ndarray functions to dppy supported functions", + pm.add_pass( + DPPYRewriteNdarrayFunctions, + "Rewrite ndarray functions to dppy supported functions", ) # strip phis @@ -94,11 +117,9 @@ def default_numba_nopython_pipeline(state, pm): # optimisation pm.add_pass(InlineOverloads, "inline overloaded functions") - @staticmethod - def define_nopython_pipeline(state, name='dppy_nopython'): - """Returns an nopython mode pipeline based PassManager - """ + def define_nopython_pipeline(state, name="dppy_nopython"): + """Returns an nopython mode pipeline based PassManager""" pm = PassManager(name) DPPYPassBuilder.default_numba_nopython_pipeline(state, pm) diff --git a/numba_dppy/dppy_passes.py b/numba_dppy/dppy_passes.py index be9423230b..858420a098 100644 --- a/numba_dppy/dppy_passes.py +++ b/numba_dppy/dppy_passes.py @@ -16,19 +16,31 @@ utils, typing, types, - ) +) from numba.core.ir_utils import remove_dels -from numba.core.errors import (LoweringError, new_error_context, TypingError, - LiteralTypingError) +from numba.core.errors import ( + LoweringError, + new_error_context, + TypingError, + LiteralTypingError, +) -from numba.core.compiler_machinery import FunctionPass, LoweringPass, register_pass, AnalysisPass +from numba.core.compiler_machinery import ( + FunctionPass, + LoweringPass, + register_pass, + AnalysisPass, +) from .dppy_lowerer import DPPYLower from numba_dppy import config as dppy_config -from numba.parfors.parfor import PreParforPass as _parfor_PreParforPass, replace_functions_map +from numba.parfors.parfor import ( + PreParforPass as _parfor_PreParforPass, + replace_functions_map, +) from numba.parfors.parfor import ParforPass as _parfor_ParforPass from numba.parfors.parfor import Parfor @@ -52,7 +64,11 @@ def run_pass(self, state): _DEBUG = False if _DEBUG: - print('Checks if size of OpenCL local address space alloca is a compile-time constant.'.center(80, '-')) + print( + "Checks if size of OpenCL local address space alloca is a compile-time constant.".center( + 80, "-" + ) + ) print(func_ir.dump()) work_list = list(func_ir.blocks.items()) @@ -62,9 +78,14 @@ def run_pass(self, state): if isinstance(instr, ir.Assign): expr = instr.value if isinstance(expr, ir.Expr): - if expr.op == 'call': - call_node = block.find_variable_assignment(expr.func.name).value - if isinstance(call_node, ir.Expr) and call_node.attr == "static_alloc": + if expr.op == "call": + call_node = block.find_variable_assignment( + expr.func.name + ).value + if ( + isinstance(call_node, ir.Expr) + and call_node.attr == "static_alloc" + ): arg = None # at first look in keyword arguments to get the shape, which has to be # constant @@ -82,24 +103,28 @@ def run_pass(self, state): if isinstance(arg_type, ir.Expr): # we have a tuple for item in arg_type.items: - if not isinstance(func_ir.get_definition(item.name), ir.Const): + if not isinstance( + func_ir.get_definition(item.name), ir.Const + ): error = True break else: - if not isinstance(func_ir.get_definition(arg.name), ir.Const): + if not isinstance( + func_ir.get_definition(arg.name), ir.Const + ): error = True break if error: - warnings.warn_explicit("The size of the Local memory has to be constant", - errors.NumbaError, - state.func_id.filename, - state.func_id.firstlineno) + warnings.warn_explicit( + "The size of the Local memory has to be constant", + errors.NumbaError, + state.func_id.filename, + state.func_id.firstlineno, + ) raise - - if config.DEBUG or config.DUMP_IR: name = state.func_ir.func_id.func_qualname print(("IR DUMP: %s" % name).center(80, "-")) @@ -124,22 +149,23 @@ def run_pass(self, state): # Ensure we have an IR and type information. assert state.func_ir functions_map = replace_functions_map.copy() - functions_map.pop(('dot', 'numpy'), None) - functions_map.pop(('sum', 'numpy'), None) - functions_map.pop(('prod', 'numpy'), None) - functions_map.pop(('argmax', 'numpy'), None) - functions_map.pop(('max', 'numpy'), None) - functions_map.pop(('argmin', 'numpy'), None) - functions_map.pop(('min', 'numpy'), None) - functions_map.pop(('mean', 'numpy'), None) + functions_map.pop(("dot", "numpy"), None) + functions_map.pop(("sum", "numpy"), None) + functions_map.pop(("prod", "numpy"), None) + functions_map.pop(("argmax", "numpy"), None) + functions_map.pop(("max", "numpy"), None) + functions_map.pop(("argmin", "numpy"), None) + functions_map.pop(("min", "numpy"), None) + functions_map.pop(("mean", "numpy"), None) preparfor_pass = _parfor_PreParforPass( state.func_ir, state.type_annotation.typemap, - state.type_annotation.calltypes, state.typingctx, + state.type_annotation.calltypes, + state.typingctx, state.flags.auto_parallel, state.parfor_diagnostics.replaced_fns, - replace_functions_map=functions_map + replace_functions_map=functions_map, ) preparfor_pass.run() @@ -166,14 +192,16 @@ def run_pass(self, state): """ # Ensure we have an IR and type information. assert state.func_ir - parfor_pass = _parfor_ParforPass(state.func_ir, - state.type_annotation.typemap, - state.type_annotation.calltypes, - state.return_type, - state.typingctx, - state.flags.auto_parallel, - state.flags, - state.parfor_diagnostics) + parfor_pass = _parfor_ParforPass( + state.func_ir, + state.type_annotation.typemap, + state.type_annotation.calltypes, + state.return_type, + state.typingctx, + state.flags.auto_parallel, + state.flags, + state.parfor_diagnostics, + ) parfor_pass.run() @@ -203,14 +231,17 @@ def fallback_context(state, msg): e = e.with_traceback(None) # this emits a warning containing the error message body in the # case of fallback from npm to objmode - loop_lift = '' if state.flags.enable_looplift else 'OUT' - msg_rewrite = ("\nCompilation is falling back to object mode " - "WITH%s looplifting enabled because %s" - % (loop_lift, msg)) - warnings.warn_explicit('%s due to: %s' % (msg_rewrite, e), - errors.NumbaWarning, - state.func_id.filename, - state.func_id.firstlineno) + loop_lift = "" if state.flags.enable_looplift else "OUT" + msg_rewrite = ( + "\nCompilation is falling back to object mode " + "WITH%s looplifting enabled because %s" % (loop_lift, msg) + ) + warnings.warn_explicit( + "%s due to: %s" % (msg_rewrite, e), + errors.NumbaWarning, + state.func_id.filename, + state.func_id.firstlineno, + ) raise @@ -232,27 +263,31 @@ def run_pass(self, state): targetctx = state.targetctx - library = state.library - interp = state.func_ir # why is it called this?! - typemap = state.typemap - restype = state.return_type + library = state.library + interp = state.func_ir # why is it called this?! + typemap = state.typemap + restype = state.return_type calltypes = state.calltypes - flags = state.flags - metadata = state.metadata + flags = state.flags + metadata = state.metadata - msg = ("Function %s failed at nopython " - "mode lowering" % (state.func_id.func_name,)) + msg = "Function %s failed at nopython " "mode lowering" % ( + state.func_id.func_name, + ) with fallback_context(state, msg): # Lowering - fndesc = \ - funcdesc.PythonFunctionDescriptor.from_specialized_function( - interp, typemap, restype, calltypes, - mangler=targetctx.mangler, inline=flags.forceinline, - noalias=flags.noalias) + fndesc = funcdesc.PythonFunctionDescriptor.from_specialized_function( + interp, + typemap, + restype, + calltypes, + mangler=targetctx.mangler, + inline=flags.forceinline, + noalias=flags.noalias, + ) with targetctx.push_code_library(library): - lower = DPPYLower(targetctx, library, fndesc, interp, - metadata=metadata) + lower = DPPYLower(targetctx, library, fndesc, interp, metadata=metadata) lower.lower() if not flags.no_cpython_wrapper: lower.create_cpython_wrapper(flags.release_gil) @@ -262,17 +297,16 @@ def run_pass(self, state): del lower from numba.core.compiler import _LowerResult # TODO: move this + if flags.no_compile: - state['cr'] = _LowerResult(fndesc, call_helper, - cfunc=None, env=env) + state["cr"] = _LowerResult(fndesc, call_helper, cfunc=None, env=env) else: # Prepare for execution cfunc = targetctx.get_executable(library, fndesc, env) # Insert native function for use by other jitted-functions. # We also register its library to allow for inlining. targetctx.insert_user_function(cfunc, fndesc, [library]) - state['cr'] = _LowerResult(fndesc, call_helper, - cfunc=cfunc, env=env) + state["cr"] = _LowerResult(fndesc, call_helper, cfunc=cfunc, env=env) return True @@ -290,10 +324,11 @@ def run_pass(self, state): Back-end: Generate LLVM IR from Numba IR, compile to machine code """ - lowered = state['cr'] + lowered = state["cr"] signature = typing.signature(state.return_type, *state.args) from numba.core.compiler import compile_result + state.cr = compile_result( typing_context=state.typingctx, target_context=state.targetctx, diff --git a/numba_dppy/dufunc_inliner.py b/numba_dppy/dufunc_inliner.py index f42a9b1855..ca14cab531 100644 --- a/numba_dppy/dufunc_inliner.py +++ b/numba_dppy/dufunc_inliner.py @@ -4,25 +4,41 @@ from numba.core.ir_utils import dead_code_elimination, simplify_CFG -def _run_inliner(func_ir, sig, template, arg_typs, expr, i, py_func, block, - work_list, typemap, calltypes, typingctx): - from numba.core.inline_closurecall import (inline_closure_call, - callee_ir_validator) +def _run_inliner( + func_ir, + sig, + template, + arg_typs, + expr, + i, + py_func, + block, + work_list, + typemap, + calltypes, + typingctx, +): + from numba.core.inline_closurecall import inline_closure_call, callee_ir_validator # pass is typed so use the callee globals - inline_closure_call(func_ir, py_func.__globals__, - block, i, py_func, typingctx=typingctx, - arg_typs=arg_typs, - typemap=typemap, - calltypes=calltypes, - work_list=work_list, - replace_freevars=False, - callee_validator=callee_ir_validator) + inline_closure_call( + func_ir, + py_func.__globals__, + block, + i, + py_func, + typingctx=typingctx, + arg_typs=arg_typs, + typemap=typemap, + calltypes=calltypes, + work_list=work_list, + replace_freevars=False, + callee_validator=callee_ir_validator, + ) return True -def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, - typingctx): +def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, typingctx): # try and get a definition for the call, this isn't always possible as # it might be a eval(str)/part generated awaiting update etc. (parfors) to_inline = None @@ -32,7 +48,7 @@ def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, return False # do not handle closure inlining here, another pass deals with that. - if getattr(to_inline, 'op', False) == 'make_function': + if getattr(to_inline, "op", False) == "make_function": return False # check this is a known and typed function @@ -40,33 +56,43 @@ def _inline(func_ir, work_list, block, i, expr, py_func, typemap, calltypes, func_ty = typemap[expr.func.name] except KeyError: return False - if not hasattr(func_ty, 'get_call_type'): + if not hasattr(func_ty, "get_call_type"): return False sig = calltypes[expr] is_method = False - templates = getattr(func_ty, 'templates', None) + templates = getattr(func_ty, "templates", None) arg_typs = sig.args if templates is None: return False - assert(len(templates) == 1) + assert len(templates) == 1 # at this point we know we maybe want to inline something and there's # definitely something that could be inlined. return _run_inliner( - func_ir, sig, templates[0], arg_typs, expr, i, py_func, block, - work_list, typemap, calltypes, typingctx + func_ir, + sig, + templates[0], + arg_typs, + expr, + i, + py_func, + block, + work_list, + typemap, + calltypes, + typingctx, ) def _is_dufunc_callsite(expr, block): - if expr.op == 'call': + if expr.op == "call": call_node = block.find_variable_assignment(expr.func.name).value # due to circular import we can not import DUFunc, TODO: Fix it - if(call_node.value.__class__.__name__ == "DUFunc"): + if call_node.value.__class__.__name__ == "DUFunc": return call_node return None @@ -76,7 +102,7 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx): modified = False if _DEBUG: - print('GUFunc before inlining DUFunc'.center(80, '-')) + print("GUFunc before inlining DUFunc".center(80, "-")) print(func_ir.dump()) work_list = list(func_ir.blocks.items()) @@ -92,17 +118,26 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx): call_node = _is_dufunc_callsite(expr, block) if call_node: py_func = call_node.value._dispatcher.py_func - workfn = _inline(func_ir, work_list, block, i, expr, - py_func, typemap, calltypes, typingctx) + workfn = _inline( + func_ir, + work_list, + block, + i, + expr, + py_func, + typemap, + calltypes, + typingctx, + ) if workfn: modified = True break # because block structure changed else: continue if _DEBUG: - print('GUFunc after inlining DUFunc'.center(80, '-')) + print("GUFunc after inlining DUFunc".center(80, "-")) print(func_ir.dump()) - print(''.center(80, '-')) + print("".center(80, "-")) if modified: # clean up leftover load instructions. This step is needed or else @@ -113,8 +148,8 @@ def dufunc_inliner(func_ir, calltypes, typemap, typingctx): func_ir.blocks = simplify_CFG(func_ir.blocks) if _DEBUG: - print('GUFunc after inlining DUFunc, DCE, SimplyCFG'.center(80, '-')) + print("GUFunc after inlining DUFunc, DCE, SimplyCFG".center(80, "-")) print(func_ir.dump()) - print(''.center(80, '-')) + print("".center(80, "-")) return True diff --git a/numba_dppy/examples/blacksholes_njit.py b/numba_dppy/examples/blacksholes_njit.py index 3654a90c66..0786fed9bc 100644 --- a/numba_dppy/examples/blacksholes_njit.py +++ b/numba_dppy/examples/blacksholes_njit.py @@ -9,12 +9,14 @@ import argparse import time + @numba.vectorize(nopython=True) def cndf2(inp): - out = 0.5 + 0.5 * math.erf((math.sqrt(2.0)/2.0) * inp) + out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) return out -@numba.njit(parallel={'offload':True}, fastmath=True) + +@numba.njit(parallel={"offload": True}, fastmath=True) def blackscholes(sptprice, strike, rate, volatility, timev): logterm = np.log(sptprice / strike) powterm = 0.5 * volatility * volatility @@ -23,29 +25,30 @@ def blackscholes(sptprice, strike, rate, volatility, timev): d2 = d1 - den NofXd1 = cndf2(d1) NofXd2 = cndf2(d2) - futureValue = strike * np.exp(- rate * timev) + futureValue = strike * np.exp(-rate * timev) c1 = futureValue * NofXd2 call = sptprice * NofXd1 - c1 - put = call - futureValue + sptprice + put = call - futureValue + sptprice return put def run(iterations): - sptprice = np.full((iterations,), 42.0) + sptprice = np.full((iterations,), 42.0) initStrike = 40 + (np.arange(iterations) + 1.0) / iterations - rate = np.full((iterations,), 0.5) + rate = np.full((iterations,), 0.5) volatility = np.full((iterations,), 0.2) - timev = np.full((iterations,), 0.5) + timev = np.full((iterations,), 0.5) t1 = time.time() put = blackscholes(sptprice, initStrike, rate, volatility, timev) - t = time.time()-t1 + t = time.time() - t1 print("checksum: ", sum(put)) print("SELFTIMED ", t) + def main(): - parser = argparse.ArgumentParser(description='Black-Scholes') - parser.add_argument('--options', dest='options', type=int, default=10000000) + parser = argparse.ArgumentParser(description="Black-Scholes") + parser.add_argument("--options", dest="options", type=int, default=10000000) args = parser.parse_args() options = args.options @@ -53,5 +56,6 @@ def main(): print("options = ", options) run(options) -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/dppy_func.py b/numba_dppy/examples/dppy_func.py index 353ba48995..9230bf64fb 100644 --- a/numba_dppy/examples/dppy_func.py +++ b/numba_dppy/examples/dppy_func.py @@ -39,5 +39,5 @@ def main(): print("No device found") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/dppy_with_context.py b/numba_dppy/examples/dppy_with_context.py index 6df025f5ca..0520582a2a 100644 --- a/numba_dppy/examples/dppy_with_context.py +++ b/numba_dppy/examples/dppy_with_context.py @@ -3,6 +3,7 @@ import numba_dppy, numba_dppy as dppy import dpctl + @njit def add_two_arrays(b, c): a = np.empty_like(b) @@ -20,14 +21,14 @@ def main(): if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu"): gpu_result = add_two_arrays(b, c) - print('GPU device found. Result on GPU:', gpu_result) + print("GPU device found. Result on GPU:", gpu_result) elif dpctl.has_cpu_queues(): with dpctl.device_context("opencl:cpu"): cpu_result = add_two_arrays(b, c) - print('CPU device found. Result on CPU:', cpu_result) + print("CPU device found. Result on CPU:", cpu_result) else: print("No device found") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/matmul.py b/numba_dppy/examples/matmul.py index b97ac49ca1..184e65235e 100644 --- a/numba_dppy/examples/matmul.py +++ b/numba_dppy/examples/matmul.py @@ -14,7 +14,7 @@ def dppy_gemm(a, b, c): j = dppy.get_global_id(1) if i >= c.shape[0] or j >= c.shape[1]: return - c[i,j] = 0 + c[i, j] = 0 for k in range(c.shape[0]): c[i, j] += a[i, k] * b[k, j] @@ -22,7 +22,7 @@ def dppy_gemm(a, b, c): # Array dimesnions X = 1024 Y = 16 -global_size = X,X +global_size = X, X griddim = X, X blockdim = Y, Y @@ -30,13 +30,13 @@ def dppy_gemm(a, b, c): def driver(a, b, c): # Invoke the kernel - dppy_gemm[griddim,blockdim](a, b, c) + dppy_gemm[griddim, blockdim](a, b, c) def main(): - a = np.arange(X*X, dtype=np.float32).reshape(X,X) - b = np.array(np.random.random(X*X), dtype=np.float32).reshape(X,X) - c = np.ones_like(a).reshape(X,X) + a = np.arange(X * X, dtype=np.float32).reshape(X, X) + b = np.array(np.random.random(X * X), dtype=np.float32).reshape(X, X) + c = np.ones_like(a).reshape(X, X) if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -58,5 +58,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/pa_examples/test1.py b/numba_dppy/examples/pa_examples/test1.py index 01209b3309..ffe715549a 100644 --- a/numba_dppy/examples/pa_examples/test1.py +++ b/numba_dppy/examples/pa_examples/test1.py @@ -31,5 +31,5 @@ def main(): break -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/pairwise_distance.py b/numba_dppy/examples/pairwise_distance.py index b72c41ba9c..0a0705ee81 100644 --- a/numba_dppy/examples/pairwise_distance.py +++ b/numba_dppy/examples/pairwise_distance.py @@ -1,7 +1,7 @@ from time import time import numba from numba import int32, float32 -from math import ceil,sqrt +from math import ceil, sqrt import numpy as np import argparse import timeit @@ -10,12 +10,12 @@ import dpctl import dpctl._memory as dpctl_mem -parser = argparse.ArgumentParser(description='Program to compute pairwise distance') +parser = argparse.ArgumentParser(description="Program to compute pairwise distance") -parser.add_argument('-n', type=int, default=10, help='Number of points') -parser.add_argument('-d', type=int, default=3, help='Dimensions') -parser.add_argument('-r', type=int, default=1, help='repeat') -parser.add_argument('-l', type=int, default=1, help='local_work_size') +parser.add_argument("-n", type=int, default=10, help="Number of points") +parser.add_argument("-d", type=int, default=3, help="Dimensions") +parser.add_argument("-r", type=int, default=1, help="repeat") +parser.add_argument("-l", type=int, default=1, help="local_work_size") args = parser.parse_args() @@ -32,7 +32,7 @@ def pairwise_distance(X, D, xshape0, xshape1): idx = dppy.get_global_id(0) - #for i in range(xshape0): + # for i in range(xshape0): for j in range(X.shape[0]): d = 0.0 for k in range(X.shape[1]): @@ -42,7 +42,7 @@ def pairwise_distance(X, D, xshape0, xshape1): def driver(): - #measure running time + # measure running time times = list() xbuf = dpctl_mem.MemoryUSMShared(X.size * X.dtype.itemsize) @@ -55,7 +55,9 @@ def driver(): for repeat in range(args.r): start = time() - pairwise_distance[global_size, local_size](x_ndarray, d_ndarray, X.shape[0], X.shape[1]) + pairwise_distance[global_size, local_size]( + x_ndarray, d_ndarray, X.shape[0], X.shape[1] + ) end = time() total_time = end - start @@ -80,9 +82,9 @@ def main(): print("No device found") exit() - times = np.asarray(times, dtype=np.float32) + times = np.asarray(times, dtype=np.float32) print("Average time of %d runs is = %fs" % (args.r, times.mean())) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum-hybrid.py b/numba_dppy/examples/sum-hybrid.py index e66c51ae2c..38cb708fa3 100644 --- a/numba_dppy/examples/sum-hybrid.py +++ b/numba_dppy/examples/sum-hybrid.py @@ -14,8 +14,8 @@ def data_parallel_sum(a, b, c): c[i] = a[i] + b[i] -N = 50*32 -global_size = N, +N = 50 * 32 +global_size = (N,) def main(): @@ -49,5 +49,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum.py b/numba_dppy/examples/sum.py index fdc1623fa7..1f2a0f6c31 100644 --- a/numba_dppy/examples/sum.py +++ b/numba_dppy/examples/sum.py @@ -44,5 +44,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum2D.py b/numba_dppy/examples/sum2D.py index 90959c8bdf..4e279cfc01 100644 --- a/numba_dppy/examples/sum2D.py +++ b/numba_dppy/examples/sum2D.py @@ -12,7 +12,7 @@ def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) j = dppy.get_global_id(1) - c[i,j] = a[i,j] + b[i,j] + c[i, j] = a[i, j] + b[i, j] def driver(a, b, c, global_size): @@ -26,11 +26,11 @@ def main(): # Array dimesnions X = 8 Y = 8 - global_size = X,Y + global_size = X, Y - a = np.arange(X*Y, dtype=np.float32).reshape(X,Y) - b = np.array(np.random.random(X*Y), dtype=np.float32).reshape(X,Y) - c = np.ones_like(a).reshape(X,Y) + a = np.arange(X * Y, dtype=np.float32).reshape(X, Y) + b = np.array(np.random.random(X * Y), dtype=np.float32).reshape(X, Y) + c = np.ones_like(a).reshape(X, Y) if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -45,5 +45,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum_ndarray.py b/numba_dppy/examples/sum_ndarray.py index 2aea8e080a..fedf722000 100644 --- a/numba_dppy/examples/sum_ndarray.py +++ b/numba_dppy/examples/sum_ndarray.py @@ -8,7 +8,9 @@ import dpctl -@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel( + access_types={"read_only": ["a", "b"], "write_only": ["c"], "read_write": []} +) def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] @@ -45,5 +47,5 @@ def main(): print("Done...") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/numba_dppy/examples/sum_reduction.py b/numba_dppy/examples/sum_reduction.py index 367fa37952..c4d7a171e4 100644 --- a/numba_dppy/examples/sum_reduction.py +++ b/numba_dppy/examples/sum_reduction.py @@ -12,7 +12,7 @@ def reduction_kernel(A, R, stride): i = dppy.get_global_id(0) # sum two element - R[i] = A[i] + A[i+stride] + R[i] = A[i] + A[i + stride] # store the sum to be used in nex iteration A[i] = R[i] @@ -20,21 +20,23 @@ def reduction_kernel(A, R, stride): def test_sum_reduction(): # This test will only work for size = power of two N = 2048 - assert(N%2 == 0) + assert N % 2 == 0 A = np.array(np.random.random(N), dtype=np.float32) A_copy = A.copy() # at max we will require half the size of A to store sum - R = np.array(np.random.random(math.ceil(N/2)), dtype=np.float32) + R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32) if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: total = N - while (total > 1): + while total > 1: # call kernel global_size = total // 2 - reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE]( + A, R, global_size + ) total = total // 2 else: @@ -43,7 +45,8 @@ def test_sum_reduction(): result = A_copy.sum() max_abs_err = result - R[0] - assert(max_abs_err < 1e-2) + assert max_abs_err < 1e-2 -if __name__ == '__main__': + +if __name__ == "__main__": test_sum_reduction() diff --git a/numba_dppy/examples/sum_reduction_ocl.py b/numba_dppy/examples/sum_reduction_ocl.py index 8d8e0411aa..c3903cc8f2 100644 --- a/numba_dppy/examples/sum_reduction_ocl.py +++ b/numba_dppy/examples/sum_reduction_ocl.py @@ -6,13 +6,14 @@ import dpctl + def sum_reduction_device_plus_host(): @dppy.kernel def sum_reduction_kernel(inp, partial_sums): - local_id = dppy.get_local_id(0) - global_id = dppy.get_global_id(0) + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) group_size = dppy.get_local_size(0) - group_id = dppy.get_group_id(0) + group_id = dppy.get_group_id(0) local_sums = dppy.local.static_alloc(64, int32) @@ -21,12 +22,12 @@ def sum_reduction_kernel(inp, partial_sums): # Loop for computing local_sums : divide workgroup into 2 parts stride = group_size // 2 - while (stride > 0): + while stride > 0: # Waiting for each 2x2 addition into given workgroup dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride - if (local_id < stride): + if local_id < stride: local_sums[local_id] += local_sums[local_id + stride] stride >>= 1 @@ -55,9 +56,9 @@ def sum_reduction_kernel(inp, partial_sums): for i in range(nb_work_groups): final_sum += partial_sums[i] - assert(final_sum == global_size) + assert final_sum == global_size print("Expected:", global_size, "--- GOT:", final_sum) -if __name__ == '__main__': +if __name__ == "__main__": sum_reduction_device_plus_host() diff --git a/numba_dppy/examples/sum_reduction_recursive_ocl.py b/numba_dppy/examples/sum_reduction_recursive_ocl.py index c5dd6daa47..eb06cb2054 100644 --- a/numba_dppy/examples/sum_reduction_recursive_ocl.py +++ b/numba_dppy/examples/sum_reduction_recursive_ocl.py @@ -8,16 +8,13 @@ import dpctl._memory as dpctl_mem -def recursive_reduction(size, group_size, - Dinp, Dpartial_sums): - +def recursive_reduction(size, group_size, Dinp, Dpartial_sums): @dppy.kernel - def sum_reduction_kernel(inp, input_size, - partial_sums): - local_id = dppy.get_local_id(0) - global_id = dppy.get_global_id(0) + def sum_reduction_kernel(inp, input_size, partial_sums): + local_id = dppy.get_local_id(0) + global_id = dppy.get_global_id(0) group_size = dppy.get_local_size(0) - group_id = dppy.get_group_id(0) + group_id = dppy.get_group_id(0) local_sums = dppy.local.static_alloc(64, int32) @@ -28,12 +25,12 @@ def sum_reduction_kernel(inp, input_size, # Loop for computing local_sums : divide workgroup into 2 parts stride = group_size // 2 - while (stride > 0): + while stride > 0: # Waiting for each 2x2 addition into given workgroup dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # Add elements 2 by 2 between local_id and local_id + stride - if (local_id < stride): + if local_id < stride: local_sums[local_id] += local_sums[local_id + stride] stride >>= 1 @@ -41,27 +38,27 @@ def sum_reduction_kernel(inp, input_size, if local_id == 0: partial_sums[group_id] = local_sums[0] - result = 0 nb_work_groups = 0 passed_size = size - if (size <= group_size): + if size <= group_size: nb_work_groups = 1 else: - nb_work_groups = size // group_size; - if (size % group_size != 0): + nb_work_groups = size // group_size + if size % group_size != 0: nb_work_groups += 1 passed_size = nb_work_groups * group_size sum_reduction_kernel[passed_size, group_size](Dinp, size, Dpartial_sums) if nb_work_groups <= group_size: - sum_reduction_kernel[group_size, group_size](Dpartial_sums, nb_work_groups, Dinp) + sum_reduction_kernel[group_size, group_size]( + Dpartial_sums, nb_work_groups, Dinp + ) result = Dinp[0] else: - result = recursive_reduction(nb_work_groups, group_size, - Dpartial_sums, Dinp) + result = recursive_reduction(nb_work_groups, group_size, Dpartial_sums, Dinp) return result @@ -76,27 +73,30 @@ def sum_reduction_recursive(): inp = np.ones(global_size).astype(np.int32) partial_sums = np.zeros(nb_work_groups).astype(np.int32) - if dpctl.has_gpu_queues(): with dpctl.device_context("opencl:gpu") as gpu_queue: inp_buf = dpctl_mem.MemoryUSMShared(inp.size * inp.dtype.itemsize) inp_ndarray = np.ndarray(inp.shape, buffer=inp_buf, dtype=inp.dtype) np.copyto(inp_ndarray, inp) - partial_sums_buf = dpctl_mem.MemoryUSMShared(partial_sums.size * partial_sums.dtype.itemsize) - partial_sums_ndarray = np.ndarray(partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype) + partial_sums_buf = dpctl_mem.MemoryUSMShared( + partial_sums.size * partial_sums.dtype.itemsize + ) + partial_sums_ndarray = np.ndarray( + partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype + ) np.copyto(partial_sums_ndarray, partial_sums) print("Running recursive reduction") - result = recursive_reduction(global_size, work_group_size, - inp_ndarray, partial_sums_ndarray) + result = recursive_reduction( + global_size, work_group_size, inp_ndarray, partial_sums_ndarray + ) else: print("No device found") exit() - print("Expected:", global_size, "--- GOT:", result) - assert(result == global_size) + assert result == global_size sum_reduction_recursive() diff --git a/numba_dppy/initialize.py b/numba_dppy/initialize.py index 2a2c70f796..3894335051 100644 --- a/numba_dppy/initialize.py +++ b/numba_dppy/initialize.py @@ -6,24 +6,32 @@ def init_jit(): from numba_dppy.dispatcher import DPPYDispatcher + return DPPYDispatcher + def initialize_all(): from numba.core.registry import dispatcher_registry - dispatcher_registry.ondemand['dppy'] = init_jit + + dispatcher_registry.ondemand["dppy"] = init_jit import dpctl import glob import platform as plt + platform = plt.system() - if platform == 'Windows': - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface.dll')) + if platform == "Windows": + paths = glob.glob( + os.path.join(os.path.dirname(dpctl.__file__), "*DPCTLSyclInterface.dll") + ) else: - paths = glob.glob(os.path.join(os.path.dirname(dpctl.__file__), '*DPCTLSyclInterface*')) + paths = glob.glob( + os.path.join(os.path.dirname(dpctl.__file__), "*DPCTLSyclInterface*") + ) if len(paths) == 1: ll.load_library_permanently(paths[0]) else: raise ImportError - ll.load_library_permanently(find_library('OpenCL')) + ll.load_library_permanently(find_library("OpenCL")) diff --git a/numba_dppy/ocl/atomics/__init__.py b/numba_dppy/ocl/atomics/__init__.py index aa6fdf5dfd..d3ac430da0 100644 --- a/numba_dppy/ocl/atomics/__init__.py +++ b/numba_dppy/ocl/atomics/__init__.py @@ -1,22 +1,25 @@ import os import os.path + def atomic_support_present(): - if os.path.isfile(os.path.join(os.path.dirname(__file__), 'atomic_ops.spir')): + if os.path.isfile(os.path.join(os.path.dirname(__file__), "atomic_ops.spir")): return True else: return False + def get_atomic_spirv_path(): if atomic_support_present(): - return os.path.join(os.path.dirname(__file__), 'atomic_ops.spir') + return os.path.join(os.path.dirname(__file__), "atomic_ops.spir") else: return None + def read_atomic_spirv_file(): path = get_atomic_spirv_path() if path: - with open(path, 'rb') as fin: + with open(path, "rb") as fin: spirv = fin.read() return spirv else: diff --git a/numba_dppy/ocl/mathdecl.py b/numba_dppy/ocl/mathdecl.py index 442e269cb4..7ce5e15d95 100644 --- a/numba_dppy/ocl/mathdecl.py +++ b/numba_dppy/ocl/mathdecl.py @@ -1,8 +1,12 @@ from __future__ import print_function, absolute_import, division import math from numba.core import types, utils -from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, - signature, Registry) +from numba.core.typing.templates import ( + AttributeTemplate, + ConcreteTemplate, + signature, + Registry, +) registry = Registry() builtin_attr = registry.register_attr @@ -253,12 +257,15 @@ class Math_degrees(Math_unary): class Math_erf(Math_unary): key = math.erf + class Math_erfc(Math_unary): key = math.erfc + class Math_gamma(Math_unary): key = math.gamma + class Math_lgamma(Math_unary): key = math.lgamma diff --git a/numba_dppy/ocl/mathimpl.py b/numba_dppy/ocl/mathimpl.py index 86c8195046..bb2f9cb7eb 100644 --- a/numba_dppy/ocl/mathimpl.py +++ b/numba_dppy/ocl/mathimpl.py @@ -25,58 +25,50 @@ _binary_d_dl = types.float64(types.float64, types.int64) sig_mapper = { - 'f->f' : _unary_f_f, - 'd->d' : _unary_d_d, - 'ff->f': _binary_f_ff, - 'dd->d': _binary_d_dd, - 'fi->f': _binary_f_fi, - 'fl->f': _binary_f_fl, - 'di->d': _binary_d_di, - 'dl->d': _binary_d_dl, - } + "f->f": _unary_f_f, + "d->d": _unary_d_d, + "ff->f": _binary_f_ff, + "dd->d": _binary_d_dd, + "fi->f": _binary_f_fi, + "fl->f": _binary_f_fl, + "di->d": _binary_d_di, + "dl->d": _binary_d_dl, +} function_descriptors = { - 'isnan': (_unary_b_f, _unary_b_d), - 'isinf': (_unary_b_f, _unary_b_d), - - 'ceil': (_unary_f_f, _unary_d_d), - 'floor': (_unary_f_f, _unary_d_d), - 'trunc': (_unary_f_f, _unary_d_d), - - 'fabs': (_unary_f_f, _unary_d_d), - - 'sqrt': (_unary_f_f, _unary_d_d), - 'exp': (_unary_f_f, _unary_d_d), - 'expm1': (_unary_f_f, _unary_d_d), - 'log': (_unary_f_f, _unary_d_d), - 'log10': (_unary_f_f, _unary_d_d), - 'log1p': (_unary_f_f, _unary_d_d), - - 'sin': (_unary_f_f, _unary_d_d), - 'cos': (_unary_f_f, _unary_d_d), - 'tan': (_unary_f_f, _unary_d_d), - 'asin': (_unary_f_f, _unary_d_d), - 'acos': (_unary_f_f, _unary_d_d), - 'atan': (_unary_f_f, _unary_d_d), - 'sinh': (_unary_f_f, _unary_d_d), - 'cosh': (_unary_f_f, _unary_d_d), - 'tanh': (_unary_f_f, _unary_d_d), - 'asinh': (_unary_f_f, _unary_d_d), - 'acosh': (_unary_f_f, _unary_d_d), - 'atanh': (_unary_f_f, _unary_d_d), - - 'copysign': (_binary_f_ff, _binary_d_dd), - 'atan2': (_binary_f_ff, _binary_d_dd), - 'pow': (_binary_f_ff, _binary_d_dd), - 'fmod': (_binary_f_ff, _binary_d_dd), - - 'erf': (_unary_f_f, _unary_d_d), - 'erfc': (_unary_f_f, _unary_d_d), - 'gamma': (_unary_f_f, _unary_d_d), - 'lgamma': (_unary_f_f, _unary_d_d), - - 'ldexp': (_binary_f_fi, _binary_f_fl, _binary_d_di, _binary_d_dl), - + "isnan": (_unary_b_f, _unary_b_d), + "isinf": (_unary_b_f, _unary_b_d), + "ceil": (_unary_f_f, _unary_d_d), + "floor": (_unary_f_f, _unary_d_d), + "trunc": (_unary_f_f, _unary_d_d), + "fabs": (_unary_f_f, _unary_d_d), + "sqrt": (_unary_f_f, _unary_d_d), + "exp": (_unary_f_f, _unary_d_d), + "expm1": (_unary_f_f, _unary_d_d), + "log": (_unary_f_f, _unary_d_d), + "log10": (_unary_f_f, _unary_d_d), + "log1p": (_unary_f_f, _unary_d_d), + "sin": (_unary_f_f, _unary_d_d), + "cos": (_unary_f_f, _unary_d_d), + "tan": (_unary_f_f, _unary_d_d), + "asin": (_unary_f_f, _unary_d_d), + "acos": (_unary_f_f, _unary_d_d), + "atan": (_unary_f_f, _unary_d_d), + "sinh": (_unary_f_f, _unary_d_d), + "cosh": (_unary_f_f, _unary_d_d), + "tanh": (_unary_f_f, _unary_d_d), + "asinh": (_unary_f_f, _unary_d_d), + "acosh": (_unary_f_f, _unary_d_d), + "atanh": (_unary_f_f, _unary_d_d), + "copysign": (_binary_f_ff, _binary_d_dd), + "atan2": (_binary_f_ff, _binary_d_dd), + "pow": (_binary_f_ff, _binary_d_dd), + "fmod": (_binary_f_ff, _binary_d_dd), + "erf": (_unary_f_f, _unary_d_d), + "erfc": (_unary_f_f, _unary_d_d), + "gamma": (_unary_f_f, _unary_d_d), + "lgamma": (_unary_f_f, _unary_d_d), + "ldexp": (_binary_f_fi, _binary_f_fl, _binary_d_di, _binary_d_dl), # unsupported functions listed in the math module documentation: # frexp, ldexp, trunc, modf, factorial, fsum } @@ -84,17 +76,16 @@ # some functions may be named differently by the underlying math # library as oposed to the Python name. -_lib_counterpart = { - 'gamma': 'tgamma' -} +_lib_counterpart = {"gamma": "tgamma"} def _mk_fn_decl(name, decl_sig): sym = _lib_counterpart.get(name, name) def core(context, builder, sig, args): - fn = _declare_function(context, builder, sym, decl_sig, decl_sig.args, - mangler=mangle) + fn = _declare_function( + context, builder, sym, decl_sig, decl_sig.args, mangler=mangle + ) res = builder.call(fn, args) return context.cast(builder, res, decl_sig.return_type, sig.return_type) @@ -102,16 +93,46 @@ def core(context, builder, sig, args): return core -_supported = ['sin', 'cos', 'tan', 'asin', 'acos', 'atan', 'atan2', 'sinh', - 'cosh', 'tanh', 'asinh', 'acosh', 'atanh', 'isnan', 'isinf', - 'ceil', 'floor', 'fabs', 'sqrt', 'exp', 'expm1', 'log', - 'log10', 'log1p', 'copysign', 'pow', 'fmod', 'erf', 'erfc', - 'gamma', 'lgamma', 'ldexp', 'trunc' - ] +_supported = [ + "sin", + "cos", + "tan", + "asin", + "acos", + "atan", + "atan2", + "sinh", + "cosh", + "tanh", + "asinh", + "acosh", + "atanh", + "isnan", + "isinf", + "ceil", + "floor", + "fabs", + "sqrt", + "exp", + "expm1", + "log", + "log10", + "log1p", + "copysign", + "pow", + "fmod", + "erf", + "erfc", + "gamma", + "lgamma", + "ldexp", + "trunc", +] lower_ocl_impl = dict() + def function_name_to_supported_decl(name, sig): try: # only symbols present in the math module @@ -120,7 +141,7 @@ def function_name_to_supported_decl(name, sig): return None fn = _mk_fn_decl(name, sig) - #lower(key, *sig.args)(fn) + # lower(key, *sig.args)(fn) lower_ocl_impl[(name, sig)] = lower(key, *sig.args)(fn) diff --git a/numba_dppy/ocl/ocldecl.py b/numba_dppy/ocl/ocldecl.py index adf14a1815..a5aa6545e5 100644 --- a/numba_dppy/ocl/ocldecl.py +++ b/numba_dppy/ocl/ocldecl.py @@ -1,17 +1,23 @@ from __future__ import print_function, division, absolute_import from numba import types from numba.core.typing.npydecl import register_number_classes -from numba.core.typing.templates import (AttributeTemplate, ConcreteTemplate, - AbstractTemplate, MacroTemplate, - signature, Registry) +from numba.core.typing.templates import ( + AttributeTemplate, + ConcreteTemplate, + AbstractTemplate, + MacroTemplate, + signature, + Registry, +) import numba_dppy, numba_dppy as dppy registry = Registry() intrinsic = registry.register intrinsic_attr = registry.register_attr -#intrinsic_global = registry.register_global +# intrinsic_global = registry.register_global + +# register_number_classes(intrinsic_global) -#register_number_classes(intrinsic_global) @intrinsic class Ocl_get_global_id(ConcreteTemplate): @@ -58,8 +64,7 @@ class Ocl_get_local_size(ConcreteTemplate): @intrinsic class Ocl_barrier(ConcreteTemplate): key = dppy.barrier - cases = [signature(types.void, types.uint32), - signature(types.void)] + cases = [signature(types.void, types.uint32), signature(types.void)] @intrinsic @@ -77,6 +82,7 @@ class Ocl_sub_group_barrier(ConcreteTemplate): # dppy.atomic submodule ------------------------------------------------------- + @intrinsic class Ocl_atomic_add(AbstractTemplate): key = dppy.atomic.add @@ -90,6 +96,7 @@ def generic(self, args, kws): elif ary.ndim > 1: return signature(ary.dtype, ary, idx, ary.dtype) + @intrinsic class Ocl_atomic_sub(AbstractTemplate): key = dppy.atomic.sub @@ -117,6 +124,7 @@ def resolve_sub(self, mod): # dppy.local submodule ------------------------------------------------------- + class Ocl_local_alloc(MacroTemplate): key = dppy.local.static_alloc @@ -131,6 +139,7 @@ def resolve_static_alloc(self, mod): # OpenCL module -------------------------------------------------------------- + @intrinsic_attr class OclModuleTemplate(AttributeTemplate): key = types.Module(dppy) @@ -171,6 +180,7 @@ def resolve_atomic(self, mod): def resolve_local(self, mod): return types.Module(dppy.local) + # intrinsic -#intrinsic_global(dppy, types.Module(dppy)) +# intrinsic_global(dppy, types.Module(dppy)) diff --git a/numba_dppy/ocl/oclimpl.py b/numba_dppy/ocl/oclimpl.py index 26f8482799..893a37c6a8 100644 --- a/numba_dppy/ocl/oclimpl.py +++ b/numba_dppy/ocl/oclimpl.py @@ -24,8 +24,7 @@ # ----------------------------------------------------------------------------- -def _declare_function(context, builder, name, sig, cargs, - mangler=mangle_c): +def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c): """Insert declaration for a opencl builtin function. Uses the Itanium mangler. @@ -60,11 +59,13 @@ def _declare_function(context, builder, name, sig, cargs, fn.calling_convention = target.CC_SPIR_FUNC return fn + @lower(stubs.get_global_id, types.uint32) def get_global_id_impl(context, builder, sig, args): [dim] = args - get_global_id = _declare_function(context, builder, 'get_global_id', sig, - ['unsigned int']) + get_global_id = _declare_function( + context, builder, "get_global_id", sig, ["unsigned int"] + ) res = builder.call(get_global_id, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -72,8 +73,9 @@ def get_global_id_impl(context, builder, sig, args): @lower(stubs.get_local_id, types.uint32) def get_local_id_impl(context, builder, sig, args): [dim] = args - get_local_id = _declare_function(context, builder, 'get_local_id', sig, - ['unsigned int']) + get_local_id = _declare_function( + context, builder, "get_local_id", sig, ["unsigned int"] + ) res = builder.call(get_local_id, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -81,8 +83,9 @@ def get_local_id_impl(context, builder, sig, args): @lower(stubs.get_group_id, types.uint32) def get_group_id_impl(context, builder, sig, args): [dim] = args - get_group_id = _declare_function(context, builder, 'get_group_id', sig, - ['unsigned int']) + get_group_id = _declare_function( + context, builder, "get_group_id", sig, ["unsigned int"] + ) res = builder.call(get_group_id, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -90,16 +93,16 @@ def get_group_id_impl(context, builder, sig, args): @lower(stubs.get_num_groups, types.uint32) def get_num_groups_impl(context, builder, sig, args): [dim] = args - get_num_groups = _declare_function(context, builder, 'get_num_groups', sig, - ['unsigned int']) + get_num_groups = _declare_function( + context, builder, "get_num_groups", sig, ["unsigned int"] + ) res = builder.call(get_num_groups, [dim]) return context.cast(builder, res, types.uintp, types.intp) @lower(stubs.get_work_dim) def get_work_dim_impl(context, builder, sig, args): - get_work_dim = _declare_function(context, builder, 'get_work_dim', sig, - ["void"]) + get_work_dim = _declare_function(context, builder, "get_work_dim", sig, ["void"]) res = builder.call(get_work_dim, []) return res @@ -107,8 +110,9 @@ def get_work_dim_impl(context, builder, sig, args): @lower(stubs.get_global_size, types.uint32) def get_global_size_impl(context, builder, sig, args): [dim] = args - get_global_size = _declare_function(context, builder, 'get_global_size', - sig, ['unsigned int']) + get_global_size = _declare_function( + context, builder, "get_global_size", sig, ["unsigned int"] + ) res = builder.call(get_global_size, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -116,8 +120,9 @@ def get_global_size_impl(context, builder, sig, args): @lower(stubs.get_local_size, types.uint32) def get_local_size_impl(context, builder, sig, args): [dim] = args - get_local_size = _declare_function(context, builder, 'get_local_size', - sig, ['unsigned int']) + get_local_size = _declare_function( + context, builder, "get_local_size", sig, ["unsigned int"] + ) res = builder.call(get_local_size, [dim]) return context.cast(builder, res, types.uintp, types.intp) @@ -125,17 +130,16 @@ def get_local_size_impl(context, builder, sig, args): @lower(stubs.barrier, types.uint32) def barrier_one_arg_impl(context, builder, sig, args): [flags] = args - barrier = _declare_function(context, builder, 'barrier', sig, - ['unsigned int']) + barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"]) builder.call(barrier, [flags]) return _void_value + @lower(stubs.barrier) def barrier_no_arg_impl(context, builder, sig, args): assert not args sig = types.void(types.uint32) - barrier = _declare_function(context, builder, 'barrier', sig, - ['unsigned int']) + barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"]) flags = context.get_constant(types.uint32, stubs.CLK_GLOBAL_MEM_FENCE) builder.call(barrier, [flags]) return _void_value @@ -144,8 +148,7 @@ def barrier_no_arg_impl(context, builder, sig, args): @lower(stubs.mem_fence, types.uint32) def mem_fence_impl(context, builder, sig, args): [flags] = args - mem_fence = _declare_function(context, builder, 'mem_fence', sig, - ['unsigned int']) + mem_fence = _declare_function(context, builder, "mem_fence", sig, ["unsigned int"]) builder.call(mem_fence, [flags]) return _void_value @@ -154,15 +157,15 @@ def mem_fence_impl(context, builder, sig, args): def sub_group_barrier_impl(context, builder, sig, args): assert not args sig = types.void(types.uint32) - barrier = _declare_function(context, builder, 'barrier', sig, - ['unsigned int']) + barrier = _declare_function(context, builder, "barrier", sig, ["unsigned int"]) flags = context.get_constant(types.uint32, stubs.CLK_LOCAL_MEM_FENCE) builder.call(barrier, [flags]) return _void_value -def insert_and_call_atomic_fn(context, builder, sig, fn_type, - dtype, ptr, val, addrspace): +def insert_and_call_atomic_fn( + context, builder, sig, fn_type, dtype, ptr, val, addrspace +): ll_p = None name = "" if dtype.name == "int32" or dtype.name == "uint32": @@ -173,11 +176,10 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_i32" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) + raise TypeError("Operation type is not supported %s" % (fn_type)) elif dtype.name == "int64" or dtype.name == "uint64": # dpctl needs to expose same functions() - #if device_env.device_support_int64_atomics(): + # if device_env.device_support_int64_atomics(): if True: ll_val = ir.IntType(64) ll_p = ll_val.as_pointer() @@ -186,9 +188,8 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_i64" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) - #else: + raise TypeError("Operation type is not supported %s" % (fn_type)) + # else: # raise TypeError("Current device does not support atomic " + # "operations on 64-bit Integer") elif dtype.name == "float32": @@ -199,10 +200,9 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_f32" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) + raise TypeError("Operation type is not supported %s" % (fn_type)) elif dtype.name == "float64": - #if device_env.device_support_float64_atomics(): + # if device_env.device_support_float64_atomics(): # dpctl needs to expose same functions() if True: ll_val = ir.DoubleType() @@ -212,22 +212,20 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, elif fn_type == "sub": name = "numba_dppy_atomic_sub_f64" else: - raise TypeError("Operation type is not supported %s" % - (fn_type)) - #else: + raise TypeError("Operation type is not supported %s" % (fn_type)) + # else: # raise TypeError("Current device does not support atomic " + # "operations on 64-bit Float") else: - raise TypeError("Atomic operation is not supported for type %s" % - (dtype.name)) + raise TypeError("Atomic operation is not supported for type %s" % (dtype.name)) if addrspace == target.SPIR_LOCAL_ADDRSPACE: name = name + "_local" else: name = name + "_global" - assert(ll_p != None) - assert(name != "") + assert ll_p != None + assert name != "" ll_p.addrspace = target.SPIR_GENERIC_ADDRSPACE mod = builder.module @@ -242,19 +240,17 @@ def insert_and_call_atomic_fn(context, builder, sig, fn_type, fn = mod.get_or_insert_function(fnty, name) fn.calling_convention = target.CC_SPIR_FUNC - generic_ptr = context.addrspacecast(builder, ptr, - target.SPIR_GENERIC_ADDRSPACE) + generic_ptr = context.addrspacecast(builder, ptr, target.SPIR_GENERIC_ADDRSPACE) return builder.call(fn, [generic_ptr, val]) @lower(stubs.atomic.add, types.Array, types.intp, types.Any) -@lower(stubs.atomic.add, types.Array, - types.UniTuple, types.Any) -@lower(stubs.atomic.add, types.Array, types.Tuple, - types.Any) +@lower(stubs.atomic.add, types.Array, types.UniTuple, types.Any) +@lower(stubs.atomic.add, types.Array, types.Tuple, types.Any) def atomic_add_tuple(context, builder, sig, args): from .atomics import atomic_support_present + if atomic_support_present(): context.link_binaries[target.LINK_ATOMIC] = True aryty, indty, valty = sig.args @@ -266,36 +262,53 @@ def atomic_add_tuple(context, builder, sig, args): indty = [indty] else: indices = cgutils.unpack_tuple(builder, inds, count=len(indty)) - indices = [context.cast(builder, i, t, types.intp) - for t, i in zip(indty, indices)] + indices = [ + context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices) + ] if dtype != valty: raise TypeError("expecting %s but got %s" % (dtype, valty)) if aryty.ndim != len(indty): - raise TypeError("indexing %d-D array with %d-D index" % - (aryty.ndim, len(indty))) + raise TypeError( + "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty)) + ) lary = context.make_array(aryty)(context, builder, ary) ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices) if aryty.addrspace == target.SPIR_LOCAL_ADDRSPACE: - return insert_and_call_atomic_fn(context, builder, sig, "add", dtype, - ptr, val, target.SPIR_LOCAL_ADDRSPACE) + return insert_and_call_atomic_fn( + context, + builder, + sig, + "add", + dtype, + ptr, + val, + target.SPIR_LOCAL_ADDRSPACE, + ) else: - return insert_and_call_atomic_fn(context, builder, sig, "add", dtype, - ptr, val, target.SPIR_GLOBAL_ADDRSPACE) + return insert_and_call_atomic_fn( + context, + builder, + sig, + "add", + dtype, + ptr, + val, + target.SPIR_GLOBAL_ADDRSPACE, + ) else: raise ImportError("Atomic support is not present, can not perform atomic_add") @lower(stubs.atomic.sub, types.Array, types.intp, types.Any) -@lower(stubs.atomic.sub, types.Array, - types.UniTuple, types.Any) -@lower(stubs.atomic.sub, types.Array, types.Tuple, - types.Any) +@lower(stubs.atomic.sub, types.Array, types.UniTuple, types.Any) +@lower(stubs.atomic.sub, types.Array, types.Tuple, types.Any) def atomic_sub_tuple(context, builder, sig, args): from .atomics import atomic_support_present + if atomic_support_present(): context.link_binaries[target.LINK_ATOMIC] = True aryty, indty, valty = sig.args @@ -307,36 +320,58 @@ def atomic_sub_tuple(context, builder, sig, args): indty = [indty] else: indices = cgutils.unpack_tuple(builder, inds, count=len(indty)) - indices = [context.cast(builder, i, t, types.intp) - for t, i in zip(indty, indices)] + indices = [ + context.cast(builder, i, t, types.intp) for t, i in zip(indty, indices) + ] if dtype != valty: raise TypeError("expecting %s but got %s" % (dtype, valty)) if aryty.ndim != len(indty): - raise TypeError("indexing %d-D array with %d-D index" % - (aryty.ndim, len(indty))) + raise TypeError( + "indexing %d-D array with %d-D index" % (aryty.ndim, len(indty)) + ) lary = context.make_array(aryty)(context, builder, ary) ptr = cgutils.get_item_pointer(context, builder, aryty, lary, indices) - if aryty.addrspace == target.SPIR_LOCAL_ADDRSPACE: - return insert_and_call_atomic_fn(context, builder, sig, "sub", dtype, - ptr, val, target.SPIR_LOCAL_ADDRSPACE) + return insert_and_call_atomic_fn( + context, + builder, + sig, + "sub", + dtype, + ptr, + val, + target.SPIR_LOCAL_ADDRSPACE, + ) else: - return insert_and_call_atomic_fn(context, builder, sig, "sub", dtype, - ptr, val, target.SPIR_GLOBAL_ADDRSPACE) + return insert_and_call_atomic_fn( + context, + builder, + sig, + "sub", + dtype, + ptr, + val, + target.SPIR_GLOBAL_ADDRSPACE, + ) else: raise ImportError("Atomic support is not present, can not perform atomic_add") -@lower('dppy.lmem.alloc', types.UniTuple, types.Any) +@lower("dppy.lmem.alloc", types.UniTuple, types.Any) def dppy_lmem_alloc_array(context, builder, sig, args): shape, dtype = args - return _generic_array(context, builder, shape=shape, dtype=dtype, - symbol_name='_dppy_lmem', - addrspace=target.SPIR_LOCAL_ADDRSPACE) + return _generic_array( + context, + builder, + shape=shape, + dtype=dtype, + symbol_name="_dppy_lmem", + addrspace=target.SPIR_LOCAL_ADDRSPACE, + ) def _generic_array(context, builder, shape, dtype, symbol_name, addrspace): @@ -374,10 +409,18 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace): return _make_array(context, builder, gvmem, dtype, shape, addrspace=addrspace) -def _make_array(context, builder, dataptr, dtype, shape, layout='C', addrspace=target.SPIR_GENERIC_ADDRSPACE): +def _make_array( + context, + builder, + dataptr, + dtype, + shape, + layout="C", + addrspace=target.SPIR_GENERIC_ADDRSPACE, +): ndim = len(shape) # Create array object - aryty = types.Array(dtype=dtype, ndim=ndim, layout='C', addrspace=addrspace) + aryty = types.Array(dtype=dtype, ndim=ndim, layout="C", addrspace=addrspace) ary = context.make_array(aryty)(context, builder) targetdata = _get_target_data(context) @@ -392,12 +435,14 @@ def _make_array(context, builder, dataptr, dtype, shape, layout='C', addrspace=t kshape = [context.get_constant(types.intp, s) for s in shape] kstrides = [context.get_constant(types.intp, s) for s in strides] - context.populate_array(ary, - data=builder.bitcast(dataptr, ary.data.type), - shape=cgutils.pack_array(builder, kshape), - strides=cgutils.pack_array(builder, kstrides), - itemsize=context.get_constant(types.intp, itemsize), - meminfo=None) + context.populate_array( + ary, + data=builder.bitcast(dataptr, ary.data.type), + shape=cgutils.pack_array(builder, kshape), + strides=cgutils.pack_array(builder, kstrides), + itemsize=context.get_constant(types.intp, itemsize), + meminfo=None, + ) return ary._getvalue() diff --git a/numba_dppy/ocl/stubs.py b/numba_dppy/ocl/stubs.py index 190b685955..bd4e03c21a 100644 --- a/numba_dppy/ocl/stubs.py +++ b/numba_dppy/ocl/stubs.py @@ -7,7 +7,7 @@ _stub_error = NotImplementedError("This is a stub.") # mem fence -CLK_LOCAL_MEM_FENCE = 0x1 +CLK_LOCAL_MEM_FENCE = 0x1 CLK_GLOBAL_MEM_FENCE = 0x2 @@ -85,7 +85,8 @@ class Stub(object): """A stub object to represent special objects which is meaningless outside the context of DPPY compilation context. """ - _description_ = '' + + _description_ = "" __slots__ = () # don't allocate __dict__ def __new__(cls): @@ -94,25 +95,28 @@ def __new__(cls): def __repr__(self): return self._description_ -#------------------------------------------------------------------------------- + +# ------------------------------------------------------------------------------- # local memory + def local_alloc(shape, dtype): shape = _legalize_shape(shape) ndim = len(shape) fname = "dppy.lmem.alloc" - restype = types.Array(dtype, ndim, 'C', addrspace=SPIR_LOCAL_ADDRSPACE) + restype = types.Array(dtype, ndim, "C", addrspace=SPIR_LOCAL_ADDRSPACE) sig = typing.signature(restype, types.UniTuple(types.intp, ndim), types.Any) return ir.Intrinsic(fname, sig, args=(shape, dtype)) class local(Stub): - """local namespace - """ - _description_ = '' + """local namespace""" - static_alloc = Macro('local.static_alloc', local_alloc, callable=True, - argnames=['shape', 'dtype']) + _description_ = "" + + static_alloc = Macro( + "local.static_alloc", local_alloc, callable=True, argnames=["shape", "dtype"] + ) def _legalize_shape(shape): @@ -124,13 +128,14 @@ def _legalize_shape(shape): raise TypeError("invalid type for shape; got {0}".format(type(shape))) -#------------------------------------------------------------------------------- +# ------------------------------------------------------------------------------- # atomic + class atomic(Stub): - """atomic namespace - """ - _description_ = '' + """atomic namespace""" + + _description_ = "" class add(Stub): """add(ary, idx, val) diff --git a/numba_dppy/printimpl.py b/numba_dppy/printimpl.py index e5c9d4f793..983e1f267c 100644 --- a/numba_dppy/printimpl.py +++ b/numba_dppy/printimpl.py @@ -27,8 +27,7 @@ def print_item(ty, context, builder, val): A (format string, [list of arguments]) is returned that will allow forming the final printf()-like call. """ - raise NotImplementedError("printing unimplemented for values of type %s" - % (ty,)) + raise NotImplementedError("printing unimplemented for values of type %s" % (ty,)) @print_item.register(types.Integer) @@ -44,11 +43,13 @@ def int_print_impl(ty, context, builder, val): lld = context.cast(builder, val, ty, dsttype) return rawfmt, [lld] + @print_item.register(types.Float) def real_print_impl(ty, context, builder, val): lld = context.cast(builder, val, ty, types.float64) return "%f", [lld] + @print_item.register(types.StringLiteral) def const_print_impl(ty, context, builder, sigval): pyval = ty.literal_value diff --git a/numba_dppy/rename_numpy_functions_pass.py b/numba_dppy/rename_numpy_functions_pass.py index c1d58ce036..b55c122d93 100644 --- a/numba_dppy/rename_numpy_functions_pass.py +++ b/numba_dppy/rename_numpy_functions_pass.py @@ -9,21 +9,23 @@ import numba_dppy from numba.core import types -rewrite_function_name_map = {"sum": (["np"], "sum"), - "eig": (["linalg"], "eig"), - "prod": (["np"], "prod"), - "max": (["np"], "max"), - "amax": (["np"], "amax"), - "min": (["np"], "min"), - "amin": (["np"], "amin"), - "mean": (["np"], "mean"), - "median": (["np"], "median"), - "argmax": (["np"], "argmax"), - "argmin": (["np"], "argmin"), - "argsort": (["np"], "argsort"), - "cov": (["np"], "cov"), - "dot": (["np"], "dot"), - "matmul": (["np"], "matmul")} +rewrite_function_name_map = { + "sum": (["np"], "sum"), + "eig": (["linalg"], "eig"), + "prod": (["np"], "prod"), + "max": (["np"], "max"), + "amax": (["np"], "amax"), + "min": (["np"], "min"), + "amin": (["np"], "amin"), + "mean": (["np"], "mean"), + "median": (["np"], "median"), + "argmax": (["np"], "argmax"), + "argmin": (["np"], "argmin"), + "argsort": (["np"], "argsort"), + "cov": (["np"], "cov"), + "dot": (["np"], "dot"), + "matmul": (["np"], "matmul"), +} class RewriteNumPyOverloadedFunctions(object): @@ -147,6 +149,7 @@ def run_pass(self, state): def get_dpnp_func_typ(func): from numba.core.typing.templates import builtin_registry + for (k, v) in builtin_registry.globals: if k == func: return v @@ -178,9 +181,13 @@ def run(self): lhs = stmt.target.name rhs = stmt.value # replace A.func with np.func, and save A in saved_arr_arg - if (rhs.op == 'getattr' and rhs.attr in self.function_name_map - and isinstance( - self.typemap[rhs.value.name], types.npytypes.Array)): + if ( + rhs.op == "getattr" + and rhs.attr in self.function_name_map + and isinstance( + self.typemap[rhs.value.name], types.npytypes.Array + ) + ): rhs = stmt.value arr = rhs.value saved_arr_arg[lhs] = arr @@ -211,17 +218,18 @@ def run(self): self.typemap.pop(lhs) self.typemap[lhs] = func_typ - if rhs.op == 'call' and rhs.func.name in saved_arr_arg: + if rhs.op == "call" and rhs.func.name in saved_arr_arg: # add array as first arg arr = saved_arr_arg[rhs.func.name] # update call type signature to include array arg old_sig = self.calltypes.pop(rhs) # argsort requires kws for typing so sig.args can't be used # reusing sig.args since some types become Const in sig - argtyps = old_sig.args[:len(rhs.args)] + argtyps = old_sig.args[: len(rhs.args)] kwtyps = {name: self.typemap[v.name] for name, v in rhs.kws} self.calltypes[rhs] = self.typemap[rhs.func.name].get_call_type( - typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps) + typingctx, [self.typemap[arr.name]] + list(argtyps), kwtyps + ) rhs.args = [arr] + rhs.args new_body.append(stmt) diff --git a/numba_dppy/spirv_generator.py b/numba_dppy/spirv_generator.py index 5bac98e014..3d31596c5a 100644 --- a/numba_dppy/spirv_generator.py +++ b/numba_dppy/spirv_generator.py @@ -17,60 +17,69 @@ def _raise_bad_env_path(msg, path, extra=None): error_message += extra raise ValueError(error_message) + _real_check_call = check_call + def check_call(*args, **kwargs): - #print("check_call:", *args, **kwargs) + # print("check_call:", *args, **kwargs) return _real_check_call(*args, **kwargs) -class CmdLine(object): +class CmdLine(object): def disassemble(self, ipath, opath): - check_call([ - "spirv-dis", - # "--no-indent", - # "--no-header", - # "--raw-id", - # "--offsets", - "-o", - opath, - ipath]) + check_call( + [ + "spirv-dis", + # "--no-indent", + # "--no-header", + # "--raw-id", + # "--offsets", + "-o", + opath, + ipath, + ] + ) def validate(self, ipath): - check_call(["spirv-val",ipath]) + check_call(["spirv-val", ipath]) def optimize(self, ipath, opath): - check_call([ - "spirv-opt", - # "--strip-debug", - # "--freeze-spec-const", - # "--eliminate-dead-const", - # "--fold-spec-const-op-composite", - # "--set-spec-const-default-value ': ...'", - # "--unify-const", - # "--inline-entry-points-exhaustive", - # "--flatten-decorations", - # "--compact-ids", - "-o", - opath, - ipath]) + check_call( + [ + "spirv-opt", + # "--strip-debug", + # "--freeze-spec-const", + # "--eliminate-dead-const", + # "--fold-spec-const-op-composite", + # "--set-spec-const-default-value ': ...'", + # "--unify-const", + # "--inline-entry-points-exhaustive", + # "--flatten-decorations", + # "--compact-ids", + "-o", + opath, + ipath, + ] + ) def generate(self, ipath, opath): # DRD : Temporary hack to get SPIR-V code generation to work. # The opt step is needed for: # a) generate a bitcode file from the text IR file # b) hoist all allocas to the enty block of the module - check_call(["opt","-O1","-o",ipath+'.bc',ipath]) - check_call(["llvm-spirv","-o",opath,ipath+'.bc']) + check_call(["opt", "-O1", "-o", ipath + ".bc", ipath]) + check_call(["llvm-spirv", "-o", opath, ipath + ".bc"]) if dppy_config.SAVE_IR_FILES == 0: - os.unlink(ipath + '.bc') + os.unlink(ipath + ".bc") def link(self, opath, binaries): - params = ["spirv-link","--allow-partial-linkage","-o", opath] + params = ["spirv-link", "--allow-partial-linkage", "-o", opath] params.extend(binaries) check_call(params) + class Module(object): def __init__(self, context): """ @@ -93,14 +102,13 @@ def __del__(self): if dppy_config.SAVE_IR_FILES == 0: os.rmdir(self._tmpdir) - def _create_temp_file(self, name, mode='wb'): + def _create_temp_file(self, name, mode="wb"): path = self._track_temp_file(name) fobj = open(path, mode=mode) return fobj, path def _track_temp_file(self, name): - path = os.path.join(self._tmpdir, - "{0}-{1}".format(len(self._tempfiles), name)) + path = os.path.join(self._tmpdir, "{0}-{1}".format(len(self._tempfiles), name)) self._tempfiles.append(path) return path @@ -130,6 +138,7 @@ def finalize(self): del self.context.link_binaries[key] if key == LINK_ATOMIC: from .ocl.atomics import get_atomic_spirv_path + binary_paths.append(get_atomic_spirv_path()) if len(binary_paths) > 1: @@ -152,13 +161,13 @@ def finalize(self): # Disassemble optimized SPIR-V code dis_path = self._track_temp_file("disassembled-spirv") self._cmd.disassemble(ipath=opt_path, opath=dis_path) - with open(dis_path, 'rb') as fin_opt: + with open(dis_path, "rb") as fin_opt: print("ASSEMBLY".center(80, "-")) print(fin_opt.read()) print("".center(80, "=")) # Read and return final SPIR-V (not optimized!) - with open(spirv_path, 'rb') as fin: + with open(spirv_path, "rb") as fin: spirv = fin.read() self._finalized = True @@ -168,6 +177,7 @@ def finalize(self): # Public llvm_to_spirv function ############################################### + def llvm_to_spirv(context, bitcode): mod = Module(context) mod.load_llvm(bitcode) diff --git a/numba_dppy/target.py b/numba_dppy/target.py index 147b229e77..87103a5045 100644 --- a/numba_dppy/target.py +++ b/numba_dppy/target.py @@ -39,15 +39,15 @@ def load_additional_registries(self): # ----------------------------------------------------------------------------- # Implementation -VALID_CHARS = re.compile(r'[^a-z0-9]', re.I) +VALID_CHARS = re.compile(r"[^a-z0-9]", re.I) # Address spaces -SPIR_PRIVATE_ADDRSPACE = 0 -SPIR_GLOBAL_ADDRSPACE = 1 +SPIR_PRIVATE_ADDRSPACE = 0 +SPIR_GLOBAL_ADDRSPACE = 1 SPIR_CONSTANT_ADDRSPACE = 2 -SPIR_LOCAL_ADDRSPACE = 3 -SPIR_GENERIC_ADDRSPACE = 4 +SPIR_LOCAL_ADDRSPACE = 3 +SPIR_GENERIC_ADDRSPACE = 4 SPIR_VERSION = (2, 0) @@ -57,9 +57,13 @@ def load_additional_registries(self): class GenericPointerModel(datamodel.PrimitiveModel): def __init__(self, dmm, fe_type): - #print("GenericPointerModel:", dmm, fe_type, fe_type.addrspace) - adrsp = fe_type.addrspace if fe_type.addrspace is not None else SPIR_GENERIC_ADDRSPACE - #adrsp = SPIR_GENERIC_ADDRSPACE + # print("GenericPointerModel:", dmm, fe_type, fe_type.addrspace) + adrsp = ( + fe_type.addrspace + if fe_type.addrspace is not None + else SPIR_GENERIC_ADDRSPACE + ) + # adrsp = SPIR_GENERIC_ADDRSPACE be_type = dmm.lookup(fe_type.dtype).get_data_type().as_pointer(adrsp) super(GenericPointerModel, self).__init__(dmm, fe_type, be_type) @@ -72,18 +76,37 @@ def _init_data_model_manager(): spirv_data_model_manager = _init_data_model_manager() + def _replace_numpy_ufunc_with_opencl_supported_functions(): from numba.np.ufunc_db import _ufunc_db as ufunc_db from numba_dppy.ocl.mathimpl import lower_ocl_impl, sig_mapper - ufuncs = [("fabs", np.fabs), ("exp", np.exp), ("log", np.log), - ("log10", np.log10), ("expm1", np.expm1), ("log1p", np.log1p), - ("sqrt", np.sqrt), ("sin", np.sin), ("cos", np.cos), - ("tan", np.tan), ("asin", np.arcsin), ("acos", np.arccos), - ("atan", np.arctan), ("atan2", np.arctan2), ("sinh", np.sinh), - ("cosh", np.cosh), ("tanh", np.tanh), ("asinh", np.arcsinh), - ("acosh", np.arccosh), ("atanh", np.arctanh), ("ldexp", np.ldexp), - ("floor", np.floor), ("ceil", np.ceil), ("trunc", np.trunc)] + ufuncs = [ + ("fabs", np.fabs), + ("exp", np.exp), + ("log", np.log), + ("log10", np.log10), + ("expm1", np.expm1), + ("log1p", np.log1p), + ("sqrt", np.sqrt), + ("sin", np.sin), + ("cos", np.cos), + ("tan", np.tan), + ("asin", np.arcsin), + ("acos", np.arccos), + ("atan", np.arctan), + ("atan2", np.arctan2), + ("sinh", np.sinh), + ("cosh", np.cosh), + ("tanh", np.tanh), + ("asinh", np.arcsinh), + ("acosh", np.arccosh), + ("atanh", np.arctanh), + ("ldexp", np.ldexp), + ("floor", np.floor), + ("ceil", np.ceil), + ("trunc", np.trunc), + ] for name, ufunc in ufuncs: for sig in ufunc_db[ufunc].keys(): @@ -97,16 +120,19 @@ class DPPYTargetContext(BaseContext): def init(self): self._internal_codegen = codegen.JITSPIRVCodegen("numba_dppy.jit") - self._target_data = (ll.create_target_data(codegen - .SPIR_DATA_LAYOUT[utils.MACHINE_BITS])) + self._target_data = ll.create_target_data( + codegen.SPIR_DATA_LAYOUT[utils.MACHINE_BITS] + ) # Override data model manager to SPIR model self.data_model_manager = spirv_data_model_manager self.link_binaries = dict() from numba.np.ufunc_db import _lazy_init_db import copy + _lazy_init_db() from numba.np.ufunc_db import _ufunc_db as ufunc_db + self.ufunc_db = copy.deepcopy(ufunc_db) from numba.core.cpu import CPUContext @@ -114,26 +140,41 @@ def init(self): self.cpu_context = cpu_target.target_context - - def replace_numpy_ufunc_with_opencl_supported_functions(self): from numba_dppy.ocl.mathimpl import lower_ocl_impl, sig_mapper - ufuncs = [("fabs", np.fabs), ("exp", np.exp), ("log", np.log), - ("log10", np.log10), ("expm1", np.expm1), ("log1p", np.log1p), - ("sqrt", np.sqrt), ("sin", np.sin), ("cos", np.cos), - ("tan", np.tan), ("asin", np.arcsin), ("acos", np.arccos), - ("atan", np.arctan), ("atan2", np.arctan2), ("sinh", np.sinh), - ("cosh", np.cosh), ("tanh", np.tanh), ("asinh", np.arcsinh), - ("acosh", np.arccosh), ("atanh", np.arctanh), ("ldexp", np.ldexp), - ("floor", np.floor), ("ceil", np.ceil), ("trunc", np.trunc)] + ufuncs = [ + ("fabs", np.fabs), + ("exp", np.exp), + ("log", np.log), + ("log10", np.log10), + ("expm1", np.expm1), + ("log1p", np.log1p), + ("sqrt", np.sqrt), + ("sin", np.sin), + ("cos", np.cos), + ("tan", np.tan), + ("asin", np.arcsin), + ("acos", np.arccos), + ("atan", np.arctan), + ("atan2", np.arctan2), + ("sinh", np.sinh), + ("cosh", np.cosh), + ("tanh", np.tanh), + ("asinh", np.arcsinh), + ("acosh", np.arccosh), + ("atanh", np.arctanh), + ("ldexp", np.ldexp), + ("floor", np.floor), + ("ceil", np.ceil), + ("trunc", np.trunc), + ] for name, ufunc in ufuncs: for sig in self.ufunc_db[ufunc].keys(): if sig in sig_mapper and (name, sig_mapper[sig]) in lower_ocl_impl: self.ufunc_db[ufunc][sig] = lower_ocl_impl[(name, sig_mapper[sig])] - def load_additional_registries(self): from .ocl import oclimpl, mathimpl from numba.np import npyimpl @@ -150,7 +191,6 @@ def load_additional_registries(self): """ self.replace_numpy_ufunc_with_opencl_supported_functions() - @cached_property def call_conv(self): return DPPYCallConv(self) @@ -167,13 +207,13 @@ def repl(m): ch = m.group(0) return "_%X_" % ord(ch) - qualified = name + '.' + '.'.join(str(a) for a in argtypes) + qualified = name + "." + ".".join(str(a) for a in argtypes) mangled = VALID_CHARS.sub(repl, qualified) - return 'dppy_py_devfn_' + mangled + return "dppy_py_devfn_" + mangled def prepare_ocl_kernel(self, func, argtypes): module = func.module - func.linkage = 'linkonce_odr' + func.linkage = "linkonce_odr" module.data_layout = codegen.SPIR_DATA_LAYOUT[self.address_size] wrapper = self.generate_kernel_wrapper(func, argtypes) @@ -184,7 +224,7 @@ def mark_ocl_device(self, func): # Adapt to SPIR # module = func.module func.calling_convention = CC_SPIR_FUNC - func.linkage = 'linkonce_odr' + func.linkage = "linkonce_odr" return func def generate_kernel_wrapper(self, func, argtypes): @@ -196,32 +236,33 @@ def sub_gen_with_global(lty): if lty.addrspace == SPIR_LOCAL_ADDRSPACE: return lty, None # DRD : Cast all pointer types to global address space. - if lty.addrspace != SPIR_GLOBAL_ADDRSPACE: # jcaraban - return (lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE), - lty.addrspace) + if lty.addrspace != SPIR_GLOBAL_ADDRSPACE: # jcaraban + return ( + lty.pointee.as_pointer(SPIR_GLOBAL_ADDRSPACE), + lty.addrspace, + ) return lty, None if len(arginfo.argument_types) > 0: - llargtys, changed = zip(*map(sub_gen_with_global, - arginfo.argument_types)) + llargtys, changed = zip(*map(sub_gen_with_global, arginfo.argument_types)) else: llargtys = changed = () wrapperfnty = lc.Type.function(lc.Type.void(), llargtys) wrapper_module = self.create_module("dppy.kernel.wrapper") - wrappername = 'dppyPy_{name}'.format(name=func.name) + wrappername = "dppyPy_{name}".format(name=func.name) argtys = list(arginfo.argument_types) - fnty = lc.Type.function(lc.Type.int(), - [self.call_conv.get_return_type( - types.pyobject)] + argtys) + fnty = lc.Type.function( + lc.Type.int(), [self.call_conv.get_return_type(types.pyobject)] + argtys + ) func = wrapper_module.add_function(fnty, name=func.name) func.calling_convention = CC_SPIR_FUNC wrapper = wrapper_module.add_function(wrapperfnty, name=wrappername) - builder = lc.Builder(wrapper.append_basic_block('')) + builder = lc.Builder(wrapper.append_basic_block("")) # Adjust address space of each kernel argument fixed_args = [] @@ -235,20 +276,21 @@ def sub_gen_with_global(lty): callargs = arginfo.from_arguments(builder, fixed_args) # XXX handle error status - status, _ = self.call_conv.call_function(builder, func, types.void, - argtypes, callargs) + status, _ = self.call_conv.call_function( + builder, func, types.void, argtypes, callargs + ) builder.ret_void() set_dppy_kernel(wrapper) - #print(str(wrapper_module)) + # print(str(wrapper_module)) # Link module.link_in(ll.parse_assembly(str(wrapper_module))) # To enable inlining which is essential because addrspacecast 1->0 is # illegal. Inlining will optimize the addrspacecast out. - func.linkage = 'internal' + func.linkage = "internal" wrapper = module.get_function(wrapper.name) - module.get_function(func.name).linkage = 'internal' + module.get_function(func.name).linkage = "internal" return wrapper def declare_function(self, module, fndesc): @@ -256,12 +298,12 @@ def declare_function(self, module, fndesc): fn = module.get_or_insert_function(fnty, name=fndesc.mangled_name) if not self.enable_debuginfo: - fn.attributes.add('alwaysinline') + fn.attributes.add("alwaysinline") ret = super(DPPYTargetContext, self).declare_function(module, fndesc) # XXX: Refactor fndesc instead of this special case - if fndesc.llvm_func_name.startswith('dppy_py_devfn'): + if fndesc.llvm_func_name.startswith("dppy_py_devfn"): ret.calling_convention = CC_SPIR_FUNC return ret @@ -274,32 +316,29 @@ def make_constant_array(self, builder, typ, ary): # return a._getvalue() raise NotImplementedError - def insert_const_string(self, mod, string): """ This returns a a pointer in the spir generic addrspace. """ text = lc.Constant.stringz(string) - name = '$'.join(["__conststring__", - self.mangler(string, ["str"])]) + name = "$".join(["__conststring__", self.mangler(string, ["str"])]) # Try to reuse existing global try: gv = mod.get_global(name) except KeyError as e: # Not defined yet - gv = mod.add_global_variable(text.type, name=name, - addrspace=SPIR_GENERIC_ADDRSPACE) - gv.linkage = 'internal' + gv = mod.add_global_variable( + text.type, name=name, addrspace=SPIR_GENERIC_ADDRSPACE + ) + gv.linkage = "internal" gv.global_constant = True gv.initializer = text # Cast to a i8* pointer charty = gv.type.pointee.element - return lc.Constant.bitcast(gv, - charty.as_pointer(SPIR_GENERIC_ADDRSPACE)) - + return lc.Constant.bitcast(gv, charty.as_pointer(SPIR_GENERIC_ADDRSPACE)) def addrspacecast(self, builder, src, addrspace): """ @@ -325,12 +364,19 @@ def set_dppy_kernel(fn): # Mark kernels ocl_kernels = mod.get_or_insert_named_metadata("opencl.kernels") - ocl_kernels.add(lc.MetaData.get(mod, [fn, - gen_arg_addrspace_md(fn), - gen_arg_access_qual_md(fn), - gen_arg_type(fn), - gen_arg_type_qual(fn), - gen_arg_base_type(fn)])) + ocl_kernels.add( + lc.MetaData.get( + mod, + [ + fn, + gen_arg_addrspace_md(fn), + gen_arg_access_qual_md(fn), + gen_arg_type(fn), + gen_arg_type_qual(fn), + gen_arg_base_type(fn), + ], + ) + ) # SPIR version 2.0 make_constant = lambda x: lc.Constant.int(lc.Type.int(), x) @@ -346,14 +392,16 @@ def set_dppy_kernel(fn): # Other metadata empty_md = lc.MetaData.get(mod, ()) - others = ["opencl.used.extensions", - "opencl.used.optional.core.features", - "opencl.compiler.options"] + others = [ + "opencl.used.extensions", + "opencl.used.optional.core.features", + "opencl.compiler.options", + ] for name in others: nmd = mod.get_or_insert_named_metadata(name) if not nmd.operands: - nmd.add(empty_md) + nmd.add(empty_md) def gen_arg_addrspace_md(fn): diff --git a/numba_dppy/target_dispatcher.py b/numba_dppy/target_dispatcher.py index dde38eb75b..4a9dc16fc5 100644 --- a/numba_dppy/target_dispatcher.py +++ b/numba_dppy/target_dispatcher.py @@ -6,11 +6,11 @@ class TargetDispatcher(serialize.ReduceMixin, metaclass=dispatcher.DispatcherMeta): - __numba__ = 'py_func' + __numba__ = "py_func" - target_offload_gpu = '__dppy_offload_gpu__' - target_offload_cpu = '__dppy_offload_cpu__' - target_dppy = 'dppy' + target_offload_gpu = "__dppy_offload_gpu__" + target_offload_cpu = "__dppy_offload_cpu__" + target_dppy = "dppy" def __init__(self, py_func, wrapper, target, parallel_options, compiled=None): @@ -58,32 +58,46 @@ def __is_with_context_target(self, target): def get_current_disp(self): target = self.__target parallel = self.__parallel - offload = isinstance(parallel, dict) and parallel.get('offload') is True + offload = isinstance(parallel, dict) and parallel.get("offload") is True - if (dpctl.is_in_device_context() or offload): + if dpctl.is_in_device_context() or offload: if not self.__is_with_context_target(target): - raise UnsupportedError(f"Can't use 'with' context with explicitly specified target '{target}'") - if parallel is False or (isinstance(parallel, dict) and parallel.get('offload') is False): - raise UnsupportedError(f"Can't use 'with' context with parallel option '{parallel}'") + raise UnsupportedError( + f"Can't use 'with' context with explicitly specified target '{target}'" + ) + if parallel is False or ( + isinstance(parallel, dict) and parallel.get("offload") is False + ): + raise UnsupportedError( + f"Can't use 'with' context with parallel option '{parallel}'" + ) from numba_dppy import dppy_offload_dispatcher if target is None: if dpctl.get_current_device_type() == dpctl.device_type.gpu: - return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_gpu + ] elif dpctl.get_current_device_type() == dpctl.device_type.cpu: - return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_cpu + ] else: if dpctl.is_in_device_context(): - raise UnsupportedError('Unknown dppy device type') + raise UnsupportedError("Unknown dppy device type") if offload: if dpctl.has_gpu_queues(): - return registry.dispatcher_registry[TargetDispatcher.target_offload_gpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_gpu + ] elif dpctl.has_cpu_queues(): - return registry.dispatcher_registry[TargetDispatcher.target_offload_cpu] + return registry.dispatcher_registry[ + TargetDispatcher.target_offload_cpu + ] if target is None: - target = 'cpu' + target = "cpu" return registry.dispatcher_registry[target] @@ -93,5 +107,5 @@ def _reduce_states(self): wrapper=self.__wrapper, target=self.__target, parallel=self.__parallel, - compiled=self.__compiled + compiled=self.__compiled, ) diff --git a/numba_dppy/testing.py b/numba_dppy/testing.py index e6ff1e3ab3..7f6e539b38 100644 --- a/numba_dppy/testing.py +++ b/numba_dppy/testing.py @@ -18,6 +18,7 @@ def captured_dppy_stdout(): sys.stdout.flush() import numba_dppy, numba_dppy as dppy + with redirect_c_stdout() as stream: yield DPPYTextCapture(stream) @@ -38,6 +39,7 @@ def expectedFailureIf(condition): def ensure_dpnp(): try: from numba_dppy.dpnp_glue import dpnp_fptr_interface as dpnp_glue + return True except: return False diff --git a/numba_dppy/tests/__init__.py b/numba_dppy/tests/__init__.py index 939c95c567..09ff707488 100644 --- a/numba_dppy/tests/__init__.py +++ b/numba_dppy/tests/__init__.py @@ -7,6 +7,7 @@ # from numba_dppy.tests.dppy import * + def load_tests(loader, tests, pattern): suite = SerialSuite() diff --git a/numba_dppy/tests/skip_tests.py b/numba_dppy/tests/skip_tests.py index fa18d36181..6765b95cfe 100644 --- a/numba_dppy/tests/skip_tests.py +++ b/numba_dppy/tests/skip_tests.py @@ -1,5 +1,6 @@ import dpctl + def is_gen12(device_type): with dpctl.device_context(device_type): q = dpctl.get_current_queue() diff --git a/numba_dppy/tests/test_arg_accessor.py b/numba_dppy/tests/test_arg_accessor.py index 494f269c59..3a2f3d7f05 100644 --- a/numba_dppy/tests/test_arg_accessor.py +++ b/numba_dppy/tests/test_arg_accessor.py @@ -5,19 +5,22 @@ import dpctl -@dppy.kernel(access_types={"read_only": ['a', 'b'], "write_only": ['c'], "read_write": []}) +@dppy.kernel( + access_types={"read_only": ["a", "b"], "write_only": ["c"], "read_write": []} +) def sum_with_accessor(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] + @dppy.kernel def sum_without_accessor(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] -def call_kernel(global_size, local_size, - A, B, C, func): - func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) + +def call_kernel(global_size, local_size, A, B, C, func): + func[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, C) global_size = 10 @@ -29,39 +32,35 @@ def call_kernel(global_size, local_size, D = A + B -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYArgAccessorCPU(unittest.TestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_with_accessor) + call_kernel(global_size, local_size, A, B, C, sum_with_accessor) self.assertTrue(np.all(D == C)) def test_arg_without_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:cpu") as cpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_without_accessor) + call_kernel(global_size, local_size, A, B, C, sum_without_accessor) self.assertTrue(np.all(D == C)) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYArgAccessorGPU(unittest.TestCase): def test_arg_with_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_with_accessor) + call_kernel(global_size, local_size, A, B, C, sum_with_accessor) self.assertTrue(np.all(D == C)) def test_arg_without_accessor(self): C = np.ones_like(A) with dpctl.device_context("opencl:gpu") as gpu_queue: - call_kernel(global_size, local_size, - A, B, C, sum_without_accessor) + call_kernel(global_size, local_size, A, B, C, sum_without_accessor) self.assertTrue(np.all(D == C)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_arg_types.py b/numba_dppy/tests/test_arg_types.py index ed55e12e16..7e21748a2c 100644 --- a/numba_dppy/tests/test_arg_types.py +++ b/numba_dppy/tests/test_arg_types.py @@ -10,6 +10,7 @@ def mul_kernel(A, B, test): i = dppy.get_global_id(0) B[i] = A[i] * test + def call_mul_device_kernel(global_size, A, B, test): mul_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, B, test) @@ -20,7 +21,7 @@ def call_mul_device_kernel(global_size, A, B, test): B = np.array(np.random.random(N), dtype=np.float32) -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYArrayArgCPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) @@ -46,7 +47,7 @@ def check_bool_kernel(A, test): else: A[0] = 222 - A = np.array([0], dtype='float64') + A = np.array([0], dtype="float64") with dpctl.device_context("opencl:cpu") as cpu_queue: check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) @@ -55,7 +56,7 @@ def check_bool_kernel(A, test): self.assertTrue(A[0] == 222) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYArrayArgGPU(unittest.TestCase): def test_integer_arg(self): x = np.int32(2) @@ -81,7 +82,7 @@ def check_bool_kernel(A, test): else: A[0] = 222 - A = np.array([0], dtype='float64') + A = np.array([0], dtype="float64") with dpctl.device_context("opencl:gpu") as gpu_queue: check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, True) @@ -89,5 +90,6 @@ def check_bool_kernel(A, test): check_bool_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, False) self.assertTrue(A[0] == 222) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_atomic_op.py b/numba_dppy/tests/test_atomic_op.py index 27a810ba08..ccfcae978c 100644 --- a/numba_dppy/tests/test_atomic_op.py +++ b/numba_dppy/tests/test_atomic_op.py @@ -5,6 +5,7 @@ import unittest import dpctl + def atomic_add_int32(ary): tid = dppy.get_local_id(0) lm = dppy.local.static_alloc(32, numba.uint32) @@ -111,17 +112,19 @@ def call_fn_for_datatypes(fn, result, input, global_size): with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if dtype == np.double and not device_env.device_support_float64_atomics(): + # if dtype == np.double and not device_env.device_support_float64_atomics(): # continue - #if dtype == np.int64 and not device_env.device_support_int64_atomics(): + # if dtype == np.int64 and not device_env.device_support_int64_atomics(): # continue fn[global_size, dppy.DEFAULT_LOCAL_SIZE](a) - assert(a[0] == result) + assert a[0] == result -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') -@unittest.skipUnless(numba_dppy.ocl.atomic_support_present(), 'test only when atomic support is present') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") +@unittest.skipUnless( + numba_dppy.ocl.atomic_support_present(), "test only when atomic support is present" +) class TestAtomicOp(unittest.TestCase): def test_atomic_add_global(self): @dppy.kernel @@ -133,7 +136,6 @@ def atomic_add(B): call_fn_for_datatypes(atomic_add, N, B, N) - def test_atomic_sub_global(self): @dppy.kernel def atomic_sub(B): @@ -144,12 +146,11 @@ def atomic_sub(B): call_fn_for_datatypes(atomic_sub, 0, B, N) - def test_atomic_add_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) + # dppy_atomic_add = dppy.kernel('void(uint32[:])')(atomic_add_int32) dppy_atomic_add = dppy.kernel(atomic_add_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) @@ -160,12 +161,11 @@ def test_atomic_add_local_int32(self): self.assertTrue(np.all(ary == gold)) - def test_atomic_sub_local_int32(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32) orig = ary.copy() - #dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) + # dppy_atomic_sub = dppy.kernel('void(uint32[:])')(atomic_sub_int32) dppy_atomic_sub = dppy.kernel(atomic_sub_int32) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_sub[32, dppy.DEFAULT_LOCAL_SIZE](ary) @@ -176,22 +176,20 @@ def test_atomic_sub_local_int32(self): self.assertTrue(np.all(ary == gold)) - def test_atomic_add_local_float32(self): ary = np.array([0], dtype=np.float32) - #dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) + # dppy_atomic_add = dppy.kernel('void(float32[:])')(atomic_add_float32) dppy_atomic_add = dppy.kernel(atomic_add_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) - def test_atomic_sub_local_float32(self): ary = np.array([32], dtype=np.float32) - #dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) + # dppy_atomic_sub = dppy.kernel('void(float32[:])')(atomic_sub_float32) dppy_atomic_sub = dppy.kernel(atomic_sub_float32) with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -199,77 +197,71 @@ def test_atomic_sub_local_float32(self): self.assertTrue(ary[0] == 0) - def test_atomic_add_local_int64(self): ary = np.array([0], dtype=np.int64) - #dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) + # dppy_atomic_add = dppy.kernel('void(int64[:])')(atomic_add_int64) dppy_atomic_add = dppy.kernel(atomic_add_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if device_env.device_support_int64_atomics(): + # if device_env.device_support_int64_atomics(): dppy_atomic_add[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) - #else: + # else: # return - def test_atomic_sub_local_int64(self): ary = np.array([32], dtype=np.int64) - #fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) + # fn = dppy.kernel('void(int64[:])')(atomic_sub_int64) fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if device_env.device_support_int64_atomics(): + # if device_env.device_support_int64_atomics(): fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) - #else: + # else: # return - def test_atomic_add_local_float64(self): ary = np.array([0], dtype=np.double) - #fn = dppy.kernel('void(float64[:])')(atomic_add_float64) + # fn = dppy.kernel('void(float64[:])')(atomic_add_float64) fn = dppy.kernel(atomic_add_float64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if device_env.device_support_float64_atomics(): + # if device_env.device_support_float64_atomics(): fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 32) - #else: + # else: # return - def test_atomic_sub_local_float64(self): ary = np.array([32], dtype=np.double) - #fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) + # fn = dppy.kernel('void(float64[:])')(atomic_sub_int64) fn = dppy.kernel(atomic_sub_int64) with dpctl.device_context("opencl:gpu") as gpu_queue: # TODO: dpctl needs to expose this functions - #if device_env.device_support_float64_atomics(): + # if device_env.device_support_float64_atomics(): fn[32, dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(ary[0] == 0) - #else: + # else: # return - def test_atomic_add2(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) + # dppy_atomic_add2 = dppy.kernel('void(uint32[:,:])')(atomic_add2) dppy_atomic_add2 = dppy.kernel(atomic_add2) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add2[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) self.assertTrue(np.all(ary == orig + 1)) - def test_atomic_add3(self): ary = np.random.randint(0, 32, size=32).astype(np.uint32).reshape(4, 8) orig = ary.copy() - #dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) + # dppy_atomic_add3 = dppy.kernel('void(uint32[:,:])')(atomic_add3) dppy_atomic_add3 = dppy.kernel(atomic_add3) with dpctl.device_context("opencl:gpu") as gpu_queue: dppy_atomic_add3[(4, 8), dppy.DEFAULT_LOCAL_SIZE](ary) @@ -277,5 +269,5 @@ def test_atomic_add3(self): self.assertTrue(np.all(ary == orig + 1)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_barrier.py b/numba_dppy/tests/test_barrier.py index 7cedc18f13..6f30e06f2b 100644 --- a/numba_dppy/tests/test_barrier.py +++ b/numba_dppy/tests/test_barrier.py @@ -6,10 +6,10 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestBarrier(unittest.TestCase): def test_proper_lowering(self): - #@dppy.kernel("void(float32[::1])") + # @dppy.kernel("void(float32[::1])") @dppy.kernel def twice(A): i = dppy.get_global_id(0) @@ -22,13 +22,13 @@ def twice(A): orig = arr.copy() with dpctl.device_context("opencl:gpu") as gpu_queue: - twice[N, N//2](arr) + twice[N, N // 2](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr) def test_no_arg_barrier_support(self): - #@dppy.kernel("void(float32[::1])") + # @dppy.kernel("void(float32[::1])") @dppy.kernel def twice(A): i = dppy.get_global_id(0) @@ -47,11 +47,10 @@ def twice(A): # The computation is correct? np.testing.assert_allclose(orig * 2, arr) - def test_local_memory(self): blocksize = 10 - #@dppy.kernel("void(float32[::1])") + # @dppy.kernel("void(float32[::1])") @dppy.kernel def reverse_array(A): lm = dppy.local.static_alloc(shape=10, dtype=float32) @@ -74,5 +73,5 @@ def reverse_array(A): np.testing.assert_allclose(expected, arr) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_black_scholes.py b/numba_dppy/tests/test_black_scholes.py index 7baecbeda5..fca2fe69f3 100644 --- a/numba_dppy/tests/test_black_scholes.py +++ b/numba_dppy/tests/test_black_scholes.py @@ -20,12 +20,17 @@ def cnd(d): K = 1.0 / (1.0 + 0.2316419 * np.abs(d)) - ret_val = (RSQRT2PI * np.exp(-0.5 * d * d) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + ret_val = ( + RSQRT2PI + * np.exp(-0.5 * d * d) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) return np.where(d > 0, 1.0 - ret_val, ret_val) -def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, - Riskfree, Volatility): + +def black_scholes( + callResult, putResult, stockPrice, optionStrike, optionYears, Riskfree, Volatility +): S = stockPrice X = optionStrike T = optionYears @@ -37,15 +42,16 @@ def black_scholes(callResult, putResult, stockPrice, optionStrike, optionYears, cndd1 = cnd(d1) cndd2 = cnd(d2) - expRT = np.exp(- R * T) - callResult[:] = (S * cndd1 - X * expRT * cndd2) - putResult[:] = (X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1)) + expRT = np.exp(-R * T) + callResult[:] = S * cndd1 - X * expRT * cndd2 + putResult[:] = X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1) + def randfloat(rand_var, low, high): return (1.0 - rand_var) * low + rand_var * high -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYBlackScholes(unittest.TestCase): def test_black_scholes(self): OPT_N = 400 @@ -63,9 +69,15 @@ def test_black_scholes(self): # numpy for i in range(iterations): - black_scholes(callResultNumpy, putResultNumpy, stockPrice, - optionStrike, optionYears, RISKFREE, VOLATILITY) - + black_scholes( + callResultNumpy, + putResultNumpy, + stockPrice, + optionStrike, + optionYears, + RISKFREE, + VOLATILITY, + ) @dppy.kernel def black_scholes_dppy(callResult, putResult, S, X, T, R, V): @@ -77,20 +89,26 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V): d2 = d1 - V * sqrtT K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1)) - cndd1 = (RSQRT2PI * math.exp(-0.5 * d1 * d1) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + cndd1 = ( + RSQRT2PI + * math.exp(-0.5 * d1 * d1) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) if d1 > 0: cndd1 = 1.0 - cndd1 K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2)) - cndd2 = (RSQRT2PI * math.exp(-0.5 * d2 * d2) * - (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))) + cndd2 = ( + RSQRT2PI + * math.exp(-0.5 * d2 * d2) + * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) + ) if d2 > 0: cndd2 = 1.0 - cndd2 - expRT = math.exp((-1. * R) * T[i]) - callResult[i] = (S[i] * cndd1 - X[i] * expRT * cndd2) - putResult[i] = (X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)) + expRT = math.exp((-1.0 * R) * T[i]) + callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2 + putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1) # numbapro time0 = time.time() @@ -101,11 +119,16 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V): time1 = time.time() for i in range(iterations): black_scholes_dppy[blockdim, griddim]( - callResultNumbapro, putResultNumbapro, stockPrice, optionStrike, - optionYears, RISKFREE, VOLATILITY) + callResultNumbapro, + putResultNumbapro, + stockPrice, + optionStrike, + optionYears, + RISKFREE, + VOLATILITY, + ) - - dt = (time1 - time0) + dt = time1 - time0 delta = np.abs(callResultNumpy - callResultNumbapro) L1norm = delta.sum() / np.abs(callResultNumpy).sum() @@ -114,5 +137,6 @@ def black_scholes_dppy(callResult, putResult, S, X, T, R, V): self.assertTrue(L1norm < 1e-13) self.assertTrue(max_abs_err < 1e-13) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_caching.py b/numba_dppy/tests/test_caching.py index 268401ce98..561c1dda87 100644 --- a/numba_dppy/tests/test_caching.py +++ b/numba_dppy/tests/test_caching.py @@ -19,15 +19,18 @@ def test_caching_kernel(self): b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) - with dpctl.device_context("opencl:gpu") as gpu_queue: func = dppy.kernel(data_parallel_sum) - caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + caching_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( + a, b, c + ) for i in range(10): - cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(a, b, c) + cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( + a, b, c + ) self.assertIs(caching_kernel, cached_kernel) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_controllable_fallback.py b/numba_dppy/tests/test_controllable_fallback.py index 357f0b5e20..6a722d2d72 100644 --- a/numba_dppy/tests/test_controllable_fallback.py +++ b/numba_dppy/tests/test_controllable_fallback.py @@ -7,7 +7,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYFallback(unittest.TestCase): def test_dppy_fallback_true(self): @numba.jit @@ -33,7 +33,9 @@ def inner_call_fallback(): numba_dppy.compiler.DEBUG = 0 np.testing.assert_array_equal(dppy_fallback_true, ref_result) - self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue()) + self.assertTrue( + "Failed to lower parfor on DPPY-device" in msg_fallback_true.getvalue() + ) @unittest.expectedFailure def test_dppy_fallback_false(self): @@ -52,7 +54,7 @@ def inner_call_fallback(): try: numba_dppy.compiler.DEBUG = 1 - numba_dppy.config.FALLBACK_ON_CPU = 0 + numba_dppy.config.FALLBACK_ON_CPU = 0 with captured_stderr() as msg_fallback_true: with dpctl.device_context("opencl:gpu") as gpu_queue: dppy = numba.njit(parallel=True)(inner_call_fallback) @@ -60,12 +62,14 @@ def inner_call_fallback(): finally: ref_result = inner_call_fallback() - numba_dppy.config.FALLBACK_ON_CPU = 1 + numba_dppy.config.FALLBACK_ON_CPU = 1 numba_dppy.compiler.DEBUG = 0 not np.testing.assert_array_equal(dppy_fallback_false, ref_result) - not self.assertTrue('Failed to lower parfor on DPPY-device' in msg_fallback_true.getvalue()) + not self.assertTrue( + "Failed to lower parfor on DPPY-device" in msg_fallback_true.getvalue() + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_device_array_args.py b/numba_dppy/tests/test_device_array_args.py index eb47cd28bc..06a49dd5ed 100644 --- a/numba_dppy/tests/test_device_array_args.py +++ b/numba_dppy/tests/test_device_array_args.py @@ -5,6 +5,7 @@ import dpctl import unittest + @dppy.kernel def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) @@ -19,7 +20,7 @@ def data_parallel_sum(a, b, c): d = a + b -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYDeviceArrayArgsGPU(unittest.TestCase): def test_device_array_args_cpu(self): c = np.ones_like(a) @@ -30,7 +31,7 @@ def test_device_array_args_cpu(self): self.assertTrue(np.all(c == d)) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYDeviceArrayArgsCPU(unittest.TestCase): def test_device_array_args_gpu(self): c = np.ones_like(a) @@ -41,5 +42,5 @@ def test_device_array_args_gpu(self): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dpctl_api.py b/numba_dppy/tests/test_dpctl_api.py index 59ddd16f65..29d31bbb2b 100644 --- a/numba_dppy/tests/test_dpctl_api.py +++ b/numba_dppy/tests/test_dpctl_api.py @@ -2,7 +2,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPCTLAPI(unittest.TestCase): def test_dpctl_api(self): with dpctl.device_context("opencl:gpu") as gpu_queue: @@ -16,5 +16,5 @@ def test_dpctl_api(self): dpctl.is_in_device_context() -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dpnp_functions.py b/numba_dppy/tests/test_dpnp_functions.py index 166937c275..75f22d1274 100644 --- a/numba_dppy/tests/test_dpnp_functions.py +++ b/numba_dppy/tests/test_dpnp_functions.py @@ -13,12 +13,16 @@ import dpctl -def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None): + +def test_for_different_datatypes( + fn, test_fn, dims, arg_count, tys, np_all=False, matrix=None +): if arg_count == 1: for ty in tys: if matrix and matrix[0]: - a = np.array(np.random.random( - dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) + a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape( + dims[0], dims[1] + ) else: a = np.array(np.random.random(dims[0]), dtype=ty) @@ -36,13 +40,15 @@ def test_for_different_datatypes(fn, test_fn, dims, arg_count, tys, np_all=False elif arg_count == 2: for ty in tys: if matrix and matrix[0]: - a = np.array(np.random.random( - dims[0] * dims[1]), dtype=ty).reshape(dims[0], dims[1]) + a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty).reshape( + dims[0], dims[1] + ) else: a = np.array(np.random.random(dims[0] * dims[1]), dtype=ty) if matrix and matrix[1]: - b = np.array(np.random.random( - dims[2] * dims[3]), dtype=ty).reshape(dims[2], dims[3]) + b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty).reshape( + dims[2], dims[3] + ) else: b = np.array(np.random.random(dims[2] * dims[3]), dtype=ty) @@ -100,9 +106,10 @@ def vvsort(val, vec, size): vec[k, imax] = temp -@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available") class Testdpnp_linalg_functions(unittest.TestCase): tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_eig(self): @njit def f(a): @@ -111,7 +118,11 @@ def f(a): size = 3 for ty in self.tys: a = np.arange(size * size, dtype=ty).reshape((size, size)) - symm_a = np.tril(a) + np.tril(a, -1).T + np.diag(np.full((size,), size * size, dtype=ty)) + symm_a = ( + np.tril(a) + + np.tril(a, -1).T + + np.diag(np.full((size,), size * size, dtype=ty)) + ) with dpctl.device_context("opencl:gpu"): got_val, got_vec = f(symm_a) @@ -122,8 +133,7 @@ def f(a): vvsort(got_val, got_vec, size) vvsort(np_val, np_vec, size) - - # NP change sign of vectors + # NP change sign of vectors for i in range(np_vec.shape[1]): if np_vec[0, i] * got_vec[0, i] < 0: np_vec[:, i] = -np_vec[:, i] @@ -132,9 +142,10 @@ def f(a): self.assertTrue(np.allclose(got_vec, np_vec)) -@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available") class Testdpnp_ndarray_functions(unittest.TestCase): tys = [np.int32, np.uint32, np.int64, np.uint64, np.float, np.double] + def test_ndarray_sum(self): @njit def f(a): @@ -225,7 +236,6 @@ def f(a): self.assertTrue(expected == got) - def test_ndarray_argmin(self): @njit def f(a): @@ -257,7 +267,9 @@ def f(a): self.assertTrue(np.array_equal(expected, got)) -@unittest.skipUnless(ensure_dpnp() and dpctl.has_gpu_queues(), 'test only when dpNP and GPU is available') +@unittest.skipUnless( + ensure_dpnp() and dpctl.has_gpu_queues(), "test only when dpNP and GPU is available" +) class Testdpnp_functions(unittest.TestCase): N = 10 @@ -271,8 +283,7 @@ def f(a): c = np.sum(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.sum, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.sum, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.sum, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.sum, [10, 2, 3], self.tys)) @@ -282,8 +293,7 @@ def f(a): c = np.prod(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.prod, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.prod, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.prod, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.prod, [10, 2, 3], self.tys)) @@ -293,11 +303,9 @@ def f(a): c = np.argmax(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.argmax, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.argmax, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions( - f, np.argmax, [10, 2, 3], self.tys)) + self.assertTrue(test_for_dimensions(f, np.argmax, [10, 2, 3], self.tys)) def test_max(self): @njit @@ -305,8 +313,7 @@ def f(a): c = np.max(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.max, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.max, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.max, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.max, [10, 2, 3], self.tys)) @@ -316,23 +323,19 @@ def f(a): c = np.amax(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.amax, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.amax, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.amax, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.amax, [10, 2, 3], self.tys)) - def test_argmin(self): @njit def f(a): c = np.argmin(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.argmin, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.argmin, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions( - f, np.argmin, [10, 2, 3], self.tys)) + self.assertTrue(test_for_dimensions(f, np.argmin, [10, 2, 3], self.tys)) def test_min(self): @njit @@ -340,8 +343,7 @@ def f(a): c = np.min(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.min, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) @@ -351,8 +353,7 @@ def f(a): c = np.amin(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.min, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.min, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.min, [10, 2, 3], self.tys)) @@ -362,8 +363,9 @@ def f(a): c = np.argsort(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.argmin, [10], 1, self.tys, np_all=True)) + self.assertTrue( + test_for_different_datatypes(f, np.argmin, [10], 1, self.tys, np_all=True) + ) def test_median(self): @njit @@ -371,11 +373,9 @@ def f(a): c = np.median(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.median, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.median, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.median, [10, 2], self.tys)) - self.assertTrue(test_for_dimensions( - f, np.median, [10, 2, 3], self.tys)) + self.assertTrue(test_for_dimensions(f, np.median, [10, 2, 3], self.tys)) def test_mean(self): @njit @@ -383,8 +383,7 @@ def f(a): c = np.mean(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.mean, [10], 1, self.tys)) + self.assertTrue(test_for_different_datatypes(f, np.mean, [10], 1, self.tys)) self.assertTrue(test_for_dimensions(f, np.mean, [10, 2], self.tys)) self.assertTrue(test_for_dimensions(f, np.mean, [10, 2, 3], self.tys)) @@ -394,8 +393,17 @@ def f(a, b): c = np.matmul(a, b) return c - self.assertTrue(test_for_different_datatypes(f, np.matmul, [10, 5, 5, 10], 2, [ - np.float, np.double], np_all=True, matrix=[True, True])) + self.assertTrue( + test_for_different_datatypes( + f, + np.matmul, + [10, 5, 5, 10], + 2, + [np.float, np.double], + np_all=True, + matrix=[True, True], + ) + ) def test_dot(self): @njit @@ -403,14 +411,44 @@ def f(a, b): c = np.dot(a, b) return c - self.assertTrue(test_for_different_datatypes( - f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double])) - self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 1, 10, 2], 2, [ - np.float, np.double], matrix=[False, True], np_all=True)) - self.assertTrue(test_for_different_datatypes(f, np.dot, [2, 10, 10, 1], 2, [ - np.float, np.double], matrix=[True, False], np_all=True)) - self.assertTrue(test_for_different_datatypes(f, np.dot, [10, 2, 2, 10], 2, [ - np.float, np.double], matrix=[True, True], np_all=True)) + self.assertTrue( + test_for_different_datatypes( + f, np.dot, [10, 1, 10, 1], 2, [np.float, np.double] + ) + ) + self.assertTrue( + test_for_different_datatypes( + f, + np.dot, + [10, 1, 10, 2], + 2, + [np.float, np.double], + matrix=[False, True], + np_all=True, + ) + ) + self.assertTrue( + test_for_different_datatypes( + f, + np.dot, + [2, 10, 10, 1], + 2, + [np.float, np.double], + matrix=[True, False], + np_all=True, + ) + ) + self.assertTrue( + test_for_different_datatypes( + f, + np.dot, + [10, 2, 2, 10], + 2, + [np.float, np.double], + matrix=[True, True], + np_all=True, + ) + ) def test_cov(self): @njit @@ -418,23 +456,26 @@ def f(a): c = np.cov(a) return c - self.assertTrue(test_for_different_datatypes( - f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True)) + self.assertTrue( + test_for_different_datatypes( + f, np.cov, [10, 7], 1, self.tys, matrix=[True], np_all=True + ) + ) def test_dpnp_interacting_with_parfor(self): @njit def f(a, b): c = np.sum(a) e = np.add(b, a) - #d = a + 1 + # d = a + 1 return 0 result = f(self.a, self.b) - #np_result = np.add((self.a + np.sum(self.a)), self.b) + # np_result = np.add((self.a + np.sum(self.a)), self.b) - #max_abs_err = result.sum() - np_result.sum() - #self.assertTrue(max_abs_err < 1e-4) + # max_abs_err = result.sum() - np_result.sum() + # self.assertTrue(max_abs_err < 1e-4) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dppy_fallback.py b/numba_dppy/tests/test_dppy_fallback.py index dd05bbdc84..3ebad7aed9 100644 --- a/numba_dppy/tests/test_dppy_fallback.py +++ b/numba_dppy/tests/test_dppy_fallback.py @@ -6,7 +6,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYFallback(unittest.TestCase): def test_dppy_fallback_inner_call(self): @numba.jit @@ -29,8 +29,7 @@ def inner_call_fallback(): ref_result = inner_call_fallback() np.testing.assert_array_equal(dppy_result, ref_result) - self.assertTrue( - 'Failed to lower parfor on DPPY-device' in msg.getvalue()) + self.assertTrue("Failed to lower parfor on DPPY-device" in msg.getvalue()) def test_dppy_fallback_reductions(self): def reduction(a): @@ -47,9 +46,8 @@ def reduction(a): ref_result = reduction(a) np.testing.assert_array_equal(dppy_result, ref_result) - self.assertTrue( - 'Failed to lower parfor on DPPY-device' in msg.getvalue()) + self.assertTrue("Failed to lower parfor on DPPY-device" in msg.getvalue()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_dppy_func.py b/numba_dppy/tests/test_dppy_func.py index 729030e153..69ff82b38a 100644 --- a/numba_dppy/tests/test_dppy_func.py +++ b/numba_dppy/tests/test_dppy_func.py @@ -5,7 +5,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYFunc(unittest.TestCase): N = 257 @@ -25,7 +25,6 @@ def f(a, b): with dpctl.device_context("opencl:gpu") as gpu_queue: f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) - self.assertTrue(np.all(b == 2)) def test_dppy_func_ndarray(self): @@ -56,5 +55,5 @@ def h(a, b): self.assertTrue(np.all(b == 3)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_math_functions.py b/numba_dppy/tests/test_math_functions.py index 6336c63759..e09202c6a3 100644 --- a/numba_dppy/tests/test_math_functions.py +++ b/numba_dppy/tests/test_math_functions.py @@ -5,46 +5,55 @@ import unittest import math + @dppy.kernel -def dppy_fabs(a,b): +def dppy_fabs(a, b): i = dppy.get_global_id(0) b[i] = math.fabs(a[i]) + @dppy.kernel -def dppy_exp(a,b): +def dppy_exp(a, b): i = dppy.get_global_id(0) b[i] = math.exp(a[i]) + @dppy.kernel -def dppy_log(a,b): +def dppy_log(a, b): i = dppy.get_global_id(0) b[i] = math.log(a[i]) + @dppy.kernel -def dppy_sqrt(a,b): +def dppy_sqrt(a, b): i = dppy.get_global_id(0) b[i] = math.sqrt(a[i]) + @dppy.kernel -def dppy_sin(a,b): +def dppy_sin(a, b): i = dppy.get_global_id(0) b[i] = math.sin(a[i]) + @dppy.kernel -def dppy_cos(a,b): +def dppy_cos(a, b): i = dppy.get_global_id(0) b[i] = math.cos(a[i]) + @dppy.kernel -def dppy_tan(a,b): +def dppy_tan(a, b): i = dppy.get_global_id(0) b[i] = math.tan(a[i]) + global_size = 10 N = global_size a = np.array(np.random.random(N), dtype=np.float32) + def driver(a, jitfunc): b = np.ones_like(a) # Device buffers @@ -67,7 +76,7 @@ def test_driver(input_arr, device_ty, jitfunc): return out_actual -@unittest.skipUnless(dpctl.has_cpu_queues(), 'test only on CPU system') +@unittest.skipUnless(dpctl.has_cpu_queues(), "test only on CPU system") class TestDPPYMathFunctionsCPU(unittest.TestCase): def test_fabs_cpu(self): b_actual = test_driver(a, "CPU", dppy_fabs) @@ -77,30 +86,30 @@ def test_fabs_cpu(self): def test_sin_cpu(self): b_actual = test_driver(a, "CPU", dppy_sin) b_expected = np.sin(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_cos_cpu(self): b_actual = test_driver(a, "CPU", dppy_cos) b_expected = np.cos(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_exp_cpu(self): b_actual = test_driver(a, "CPU", dppy_exp) b_expected = np.exp(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_sqrt_cpu(self): b_actual = test_driver(a, "CPU", dppy_sqrt) b_expected = np.sqrt(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_log_cpu(self): b_actual = test_driver(a, "CPU", dppy_log) b_expected = np.log(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYMathFunctionsGPU(unittest.TestCase): def test_fabs_gpu(self): b_actual = test_driver(a, "GPU", dppy_fabs) @@ -110,28 +119,28 @@ def test_fabs_gpu(self): def test_sin_gpu(self): b_actual = test_driver(a, "GPU", dppy_sin) b_expected = np.sin(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_cos_gpu(self): b_actual = test_driver(a, "GPU", dppy_cos) b_expected = np.cos(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_exp_gpu(self): b_actual = test_driver(a, "GPU", dppy_exp) b_expected = np.exp(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_sqrt_gpu(self): b_actual = test_driver(a, "GPU", dppy_sqrt) b_expected = np.sqrt(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) def test_log_gpu(self): b_actual = test_driver(a, "GPU", dppy_log) b_expected = np.log(a) - self.assertTrue(np.allclose(b_actual,b_expected)) + self.assertTrue(np.allclose(b_actual, b_expected)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py index 21a8fc8444..c6ffd9433f 100644 --- a/numba_dppy/tests/test_numpy_bit_twiddling_functions.py +++ b/numba_dppy/tests/test_numpy_bit_twiddling_functions.py @@ -5,7 +5,7 @@ import unittest -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_bit_twiddling_functions(unittest.TestCase): def test_bitwise_and(self): @njit @@ -111,5 +111,5 @@ def f(a, b): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_comparison_functions.py b/numba_dppy/tests/test_numpy_comparison_functions.py index 9d56e94374..b4f47d96b6 100644 --- a/numba_dppy/tests/test_numpy_comparison_functions.py +++ b/numba_dppy/tests/test_numpy_comparison_functions.py @@ -5,7 +5,7 @@ import unittest -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_comparison_functions(unittest.TestCase): a = np.array([4, 5, 6]) b = np.array([2, 6, 6]) @@ -202,5 +202,5 @@ def f(a, b): np.testing.assert_equal(c, d) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_floating_functions.py b/numba_dppy/tests/test_numpy_floating_functions.py index 8df7e2b5d4..6e746c4ed7 100644 --- a/numba_dppy/tests/test_numpy_floating_functions.py +++ b/numba_dppy/tests/test_numpy_floating_functions.py @@ -4,7 +4,7 @@ import unittest -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_floating_functions(unittest.TestCase): def test_isfinite(self): @njit @@ -12,7 +12,7 @@ def f(a): c = np.isfinite(a) return c - test_arr = [np.log(-1.), 1., np.log(0)] + test_arr = [np.log(-1.0), 1.0, np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) with dpctl.device_context("opencl:gpu"): @@ -27,7 +27,7 @@ def f(a): c = np.isinf(a) return c - test_arr = [np.log(-1.), 1., np.log(0)] + test_arr = [np.log(-1.0), 1.0, np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) with dpctl.device_context("opencl:gpu"): @@ -42,7 +42,7 @@ def f(a): c = np.isnan(a) return c - test_arr = [np.log(-1.), 1., np.log(0)] + test_arr = [np.log(-1.0), 1.0, np.log(0)] input_arr = np.asarray(test_arr, dtype=np.float32) with dpctl.device_context("opencl:gpu"): @@ -94,5 +94,5 @@ def f(a): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_math_functions.py b/numba_dppy/tests/test_numpy_math_functions.py index ef5dc235b8..4a701495f9 100644 --- a/numba_dppy/tests/test_numpy_math_functions.py +++ b/numba_dppy/tests/test_numpy_math_functions.py @@ -5,7 +5,8 @@ import unittest from . import skip_tests -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') + +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_math_functions(unittest.TestCase): N = 10 a = np.array(np.random.random(N), dtype=np.float32) @@ -138,7 +139,7 @@ def f(a, b): with dpctl.device_context("opencl:gpu"): c = f(input_arr, divisor) - self.assertTrue(np.all(c == 1.)) + self.assertTrue(np.all(c == 1.0)) def test_abs(self): @njit @@ -191,7 +192,7 @@ def f(a): with dpctl.device_context("opencl:gpu"): c = f(input_arr) - self.assertTrue(np.all(c == -1.)) + self.assertTrue(np.all(c == -1.0)) def test_conj(self): @njit @@ -322,7 +323,7 @@ def f(a): with dpctl.device_context("opencl:gpu"): c = f(input_arr) - self.assertTrue(np.all(c == 1/input_arr)) + self.assertTrue(np.all(c == 1 / input_arr)) def test_conjugate(self): @njit @@ -339,5 +340,5 @@ def f(a): self.assertTrue(np.all(c == d)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_numpy_trigonomteric_functions.py b/numba_dppy/tests/test_numpy_trigonomteric_functions.py index 812f3d060c..a67862032b 100644 --- a/numba_dppy/tests/test_numpy_trigonomteric_functions.py +++ b/numba_dppy/tests/test_numpy_trigonomteric_functions.py @@ -6,7 +6,7 @@ from . import skip_tests -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestNumpy_math_functions(unittest.TestCase): N = 10 @@ -238,5 +238,5 @@ def f(a): self.assertTrue(max_abs_err < 1e-5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_offload_diagnostics.py b/numba_dppy/tests/test_offload_diagnostics.py index 6b41252fc6..9faebed088 100644 --- a/numba_dppy/tests/test_offload_diagnostics.py +++ b/numba_dppy/tests/test_offload_diagnostics.py @@ -16,7 +16,7 @@ def prange_func(): a = np.ones((n), dtype=np.float64) b = np.ones((n), dtype=np.float64) c = np.ones((n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ -56,5 +56,5 @@ def parallel_sum(a, b, c): self.assertTrue("Device -" in got.getvalue()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_parfor_lower_message.py b/numba_dppy/tests/test_parfor_lower_message.py index 17f1456bb6..0e578ce154 100644 --- a/numba_dppy/tests/test_parfor_lower_message.py +++ b/numba_dppy/tests/test_parfor_lower_message.py @@ -13,7 +13,7 @@ def prange_example(): a = np.ones((n), dtype=np.float64) b = np.ones((n), dtype=np.float64) c = np.ones((n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ -33,5 +33,5 @@ def test_parfor_message(self): self.assertTrue("Parfor lowered on DPPY-device" in got.getvalue()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_prange.py b/numba_dppy/tests/test_prange.py index eda9ccebbc..8f5305198a 100644 --- a/numba_dppy/tests/test_prange.py +++ b/numba_dppy/tests/test_prange.py @@ -96,14 +96,14 @@ def f(a, b): self.assertTrue(np.all(b == 12)) - @unittest.skip('numba-dppy issue 110') + @unittest.skip("numba-dppy issue 110") def test_two_consequent_prange(self): def prange_example(): n = 10 a = np.ones((n), dtype=np.float64) b = np.ones((n), dtype=np.float64) c = np.ones((n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ -120,20 +120,26 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count( - 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count( - 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) + self.assertEqual( + stdout.getvalue().count("Parfor lowered on DPPY-device"), + 2, + stdout.getvalue(), + ) + self.assertEqual( + stdout.getvalue().count("Failed to lower parfor on DPPY-device"), + 0, + stdout.getvalue(), + ) np.testing.assert_equal(res, jitted_res) - @unittest.skip('NRT required but not enabled') + @unittest.skip("NRT required but not enabled") def test_2d_arrays(self): def prange_example(): n = 10 a = np.ones((n, n), dtype=np.float64) b = np.ones((n, n), dtype=np.float64) c = np.ones((n, n), dtype=np.float64) - for i in prange(n//2): + for i in prange(n // 2): a[i] = b[i] + c[i] return a @@ -150,12 +156,18 @@ def prange_example(): numba_dppy.compiler.DEBUG = old_debug - self.assertEqual(stdout.getvalue().count( - 'Parfor lowered on DPPY-device'), 2, stdout.getvalue()) - self.assertEqual(stdout.getvalue().count( - 'Failed to lower parfor on DPPY-device'), 0, stdout.getvalue()) + self.assertEqual( + stdout.getvalue().count("Parfor lowered on DPPY-device"), + 2, + stdout.getvalue(), + ) + self.assertEqual( + stdout.getvalue().count("Failed to lower parfor on DPPY-device"), + 0, + stdout.getvalue(), + ) np.testing.assert_equal(res, jitted_res) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_print.py b/numba_dppy/tests/test_print.py index af19658048..8beca0a83f 100644 --- a/numba_dppy/tests/test_print.py +++ b/numba_dppy/tests/test_print.py @@ -7,7 +7,7 @@ import dpctl -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestPrint(unittest.TestCase): def test_print_dppy_kernel(self): @dppy.func @@ -30,5 +30,5 @@ def f(a, b): f[N, dppy.DEFAULT_LOCAL_SIZE](a, b) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_rename_numpy_function_pass.py b/numba_dppy/tests/test_rename_numpy_function_pass.py index cfeff09b8d..7ef237fcd0 100644 --- a/numba_dppy/tests/test_rename_numpy_function_pass.py +++ b/numba_dppy/tests/test_rename_numpy_function_pass.py @@ -7,10 +7,12 @@ from numba_dppy.testing import ensure_dpnp -from numba.core import (compiler, typing, cpu) -from numba_dppy.rename_numpy_functions_pass import (DPPYRewriteOverloadedNumPyFunctions, - DPPYRewriteNdarrayFunctions) -from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes) +from numba.core import compiler, typing, cpu +from numba_dppy.rename_numpy_functions_pass import ( + DPPYRewriteOverloadedNumPyFunctions, + DPPYRewriteNdarrayFunctions, +) +from numba.core.typed_passes import NopythonTypeInference, AnnotateTypes class MyPipeline(object): @@ -46,8 +48,10 @@ def check_equivalent(expected_ir, got_ir): else: if isinstance(expected_stmt, numba.core.ir.Assign): if isinstance(expected_stmt.value, numba.core.ir.Global): - if (expected_stmt.value.name != got_stmt.value.name and - expected_stmt.value.name != "numba_dppy"): + if ( + expected_stmt.value.name != got_stmt.value.name + and expected_stmt.value.name != "numba_dppy" + ): return False elif isinstance(expected_stmt.value, numba.core.ir.Expr): # should get "dpnp" and "sum" as attr @@ -76,7 +80,7 @@ def got(a): self.assertTrue(check_equivalent(expected_ir, pipeline.state.func_ir)) -@unittest.skipUnless(ensure_dpnp(), 'test only when dpNP is available') +@unittest.skipUnless(ensure_dpnp(), "test only when dpNP is available") class TestRenameNdarrayFunctionsPass(unittest.TestCase): def test_rename_ndarray(self): def expected(a): diff --git a/numba_dppy/tests/test_sum_reduction.py b/numba_dppy/tests/test_sum_reduction.py index 37ca38a12a..c2001e13a6 100644 --- a/numba_dppy/tests/test_sum_reduction.py +++ b/numba_dppy/tests/test_sum_reduction.py @@ -4,34 +4,37 @@ import unittest import dpctl + @dppy.kernel def reduction_kernel(A, R, stride): i = dppy.get_global_id(0) # sum two element - R[i] = A[i] + A[i+stride] + R[i] = A[i] + A[i + stride] # store the sum to be used in nex iteration A[i] = R[i] -@unittest.skipUnless(dpctl.has_gpu_queues(), 'test only on GPU system') +@unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestDPPYSumReduction(unittest.TestCase): def test_sum_reduction(self): # This test will only work for even case N = 1024 - self.assertTrue(N%2 == 0) + self.assertTrue(N % 2 == 0) A = np.array(np.random.random(N), dtype=np.float32) A_copy = A.copy() # at max we will require half the size of A to store sum - R = np.array(np.random.random(math.ceil(N/2)), dtype=np.float32) + R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32) with dpctl.device_context("opencl:gpu") as gpu_queue: total = N - while (total > 1): + while total > 1: # call kernel global_size = total // 2 - reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](A, R, global_size) + reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE]( + A, R, global_size + ) total = total // 2 result = A_copy.sum() @@ -39,5 +42,5 @@ def test_sum_reduction(self): self.assertTrue(max_abs_err < 1e-4) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_vectorize.py b/numba_dppy/tests/test_vectorize.py index 5b3a41629c..6f93c232f6 100644 --- a/numba_dppy/tests/test_vectorize.py +++ b/numba_dppy/tests/test_vectorize.py @@ -8,20 +8,19 @@ @unittest.skipUnless(dpctl.has_gpu_queues(), "test only on GPU system") class TestVectorize(unittest.TestCase): def test_vectorize(self): - @vectorize(nopython=True) def axy(a, x, y): return a * x + y @njit def f(a0, a1): - return np.cos(axy(a0, np.sin(a1) - 1., 1.)) + return np.cos(axy(a0, np.sin(a1) - 1.0, 1.0)) def f_np(a0, a1): sin_res = np.sin(a1) res = [] for i in range(len(a0)): - res.append(axy(a0[i], sin_res[i] - 1., 1.)) + res.append(axy(a0[i], sin_res[i] - 1.0, 1.0)) return np.cos(np.array(res)) A = np.random.random(10) @@ -36,5 +35,5 @@ def f_np(a0, a1): self.assertTrue(max_abs_err < 1e-5) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/numba_dppy/tests/test_with_context.py b/numba_dppy/tests/test_with_context.py index 693c155ab2..58f14952d9 100644 --- a/numba_dppy/tests/test_with_context.py +++ b/numba_dppy/tests/test_with_context.py @@ -8,10 +8,8 @@ class TestWithDPPYContext(unittest.TestCase): - @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") def test_with_dppy_context_gpu(self): - @njit def nested_func(a, b): np.sin(a, b) @@ -33,11 +31,10 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_gpu) - self.assertTrue('Parfor lowered on DPPY-device' in got_gpu_message.getvalue()) + self.assertTrue("Parfor lowered on DPPY-device" in got_gpu_message.getvalue()) @unittest.skipIf(not dpctl.has_cpu_queues(), "No CPU platforms available") def test_with_dppy_context_cpu(self): - @njit def nested_func(a, b): np.sin(a, b) @@ -59,17 +56,15 @@ def func(b): func(expected) np.testing.assert_array_equal(expected, got_cpu) - self.assertTrue('Parfor lowered on DPPY-device' in got_cpu_message.getvalue()) - + self.assertTrue("Parfor lowered on DPPY-device" in got_cpu_message.getvalue()) @unittest.skipIf(not dpctl.has_gpu_queues(), "No GPU platforms available") def test_with_dppy_context_target(self): - - @njit(target='cpu') + @njit(target="cpu") def nested_func_target(a, b): np.sin(a, b) - @njit(target='gpu') + @njit(target="gpu") def func_target(b): a = np.ones((64), dtype=np.float64) nested_func_target(a, b) @@ -84,7 +79,6 @@ def func_no_parallel(b): a = np.ones((64), dtype=np.float64) return a - a = np.ones((64), dtype=np.float64) b = np.ones((64), dtype=np.float64) @@ -112,5 +106,5 @@ def func_no_parallel(b): self.assertTrue(msg_2 in str(raises_4.exception)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/setup.py b/setup.py index 37ad0bfc68..8c892f6fd2 100644 --- a/setup.py +++ b/setup.py @@ -19,13 +19,15 @@ def get_ext_modules(): if dpnp_present: dpnp_lib_path = [] dpnp_lib_path += [os.path.dirname(dpnp.__file__)] - ext_dpnp_glue = Extension(name='numba_dppy.dpnp_glue.dpnp_fptr_interface', - sources=['numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx'], - include_dirs=[dpnp.get_include()], - libraries=['dpnp_backend_c'], - library_dirs=dpnp_lib_path, - runtime_library_dirs=dpnp_lib_path, - language="c++") + ext_dpnp_glue = Extension( + name="numba_dppy.dpnp_glue.dpnp_fptr_interface", + sources=["numba_dppy/dpnp_glue/dpnp_fptr_interface.pyx"], + include_dirs=[dpnp.get_include()], + libraries=["dpnp_backend_c"], + library_dirs=dpnp_lib_path, + runtime_library_dirs=dpnp_lib_path, + language="c++", + ) ext_modules += [ext_dpnp_glue] if dpnp_present: