From 4f3f7e5e77a08e9e03cb9026d1699109520c6ccb Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 9 Nov 2021 04:22:03 +0300 Subject: [PATCH 1/5] Adds zip and dict builtins overloads to support easy literal dict ctor Motivation: there's no easy way to create Numba LiteralStrKeyDict objects for const dicts with many elements. This adds a special overload for dict builtin that creates LiteralStrKeyDict from tuple of pairs ('col_name', col_data). --- sdc/functions/tuple_utils.py | 67 +++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py index 17dffa200..25dafa93d 100644 --- a/sdc/functions/tuple_utils.py +++ b/sdc/functions/tuple_utils.py @@ -25,10 +25,14 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +from textwrap import dedent + from numba import types -from numba.extending import (intrinsic, ) +from numba.extending import intrinsic from numba.core.typing.templates import (signature, ) +from sdc.utilities.utils import sdc_overload + @intrinsic def sdc_tuple_map(typingctx, func, data, *args): @@ -205,3 +209,64 @@ def codegen(context, builder, sig, args): return context.make_tuple(builder, ret_type, [first_tup, second_tup]) return ret_type(data_type), codegen + + +@sdc_overload(zip) +def zip_tuples_spec_ovld(x, y): + + if not (isinstance(x, types.BaseAnonymousTuple) and isinstance(y, types.BaseAnonymousTuple)): + return None + + res_size = min(len(x), len(y)) + func_impl_name = 'zip_tuples_spec_impl' + tup_elements = ', '.join([f"(x[{i}], y[{i}])" for i in range(res_size)]) + func_text = dedent(f""" + def {func_impl_name}(x, y): + return ({tup_elements}{',' if res_size else ''}) + """) + use_globals, use_locals = {}, {} + exec(func_text, use_globals, use_locals) + return use_locals[func_impl_name] + + # FIXME_Numba#6533: alternatively we could have used 
sdc_tuple_map_elementwise + # to avoid another use of exec, but due to @intrinsic-s not supporting + # prefer_literal option below implementation looses literaly of args! + # from sdc.functions.tuple_utils import sdc_tuple_map_elementwise + # def zip_tuples_spec_impl(x, y): + # return sdc_tuple_map_elementwise( + # lambda a, b: (a, b), + # x, + # y + # ) + # + # return zip_tuples_spec_impl + + +@sdc_overload(dict) +def dict_from_tuples_ovld(x): + + accepted_tuple_types = (types.Tuple, types.UniTuple) + if not isinstance(x, accepted_tuple_types): + return None + + def check_tuple_element(ty): + return (isinstance(ty, accepted_tuple_types) + and len(ty) == 2 + and isinstance(ty[0], types.StringLiteral)) + + # below checks that elements are tuples with size 2 and first element is literal string + if not (len(x) != 0 and all(map(check_tuple_element, x))): + assert False, f"Creating LiteralStrKeyDict not supported from pairs of: {x}" + + # numba type-infers {'A': [1, 2, 3]} i.e. const dict of size 1 not as LiteralStrKeyDict + # but as non literal dict! TO-DO: add special branch here and call literal dict ctor directly + func_impl_name = 'dict_from_tuples_impl' + dict_elements = ', '.join([f"x[{i}][0]:x[{i}][1]" for i in range(len(x))]) + func_text = dedent(f""" + def {func_impl_name}(x): + res = {{{dict_elements}}} + return res + """) + use_globals, use_locals = {}, {} + exec(func_text, use_globals, use_locals) + return use_locals[func_impl_name] From 7d1c0859b6827bf2b39be1f6fd33bf3472c8cde6 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Mon, 15 Nov 2021 16:13:48 +0300 Subject: [PATCH 2/5] Replacing zip overload builtin with internal sdc_tuple_zip function Details: zip builtin is already overloaded in Numba and has priority over user-defined overloads, hence in cases when we want to zip two single-element tuples, e.g. 
zip(('A', ), (1, )), the builtin function will match and type inference will unliteral all tuples, producing iter objects (that are always homogeneous in Numba). That is, literality of objects will be lost. Using sdc_tuple_zip explicitly avoids this problem. --- sdc/functions/tuple_utils.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py index 25dafa93d..4e1243235 100644 --- a/sdc/functions/tuple_utils.py +++ b/sdc/functions/tuple_utils.py @@ -211,14 +211,20 @@ def codegen(context, builder, sig, args): return ret_type(data_type), codegen -@sdc_overload(zip) -def zip_tuples_spec_ovld(x, y): +def sdc_tuple_zip(x, y): + pass + + +@sdc_overload(sdc_tuple_zip) +def sdc_tuple_zip_ovld(x, y): + """ This function combines tuple of pairs from two input tuples x and y, preserving + literality of elements in them. """ if not (isinstance(x, types.BaseAnonymousTuple) and isinstance(y, types.BaseAnonymousTuple)): return None res_size = min(len(x), len(y)) - func_impl_name = 'zip_tuples_spec_impl' + func_impl_name = 'sdc_tuple_zip_impl' tup_elements = ', '.join([f"(x[{i}], y[{i}])" for i in range(res_size)]) func_text = dedent(f""" def {func_impl_name}(x, y): @@ -232,14 +238,14 @@ def {func_impl_name}(x, y): # to avoid another use of exec, but due to @intrinsic-s not supporting # prefer_literal option below implementation looses literaly of args! 
# from sdc.functions.tuple_utils import sdc_tuple_map_elementwise - # def zip_tuples_spec_impl(x, y): + # def sdc_tuple_zip_impl(x, y): # return sdc_tuple_map_elementwise( # lambda a, b: (a, b), # x, # y # ) # - # return zip_tuples_spec_impl + # return sdc_tuple_zip_impl @sdc_overload(dict) From 624225637bdcbf06209e28b81c26ef2293bb76e9 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 16 Nov 2021 18:32:52 +0300 Subject: [PATCH 3/5] Fixing issue with literal dict ctor with single element --- sdc/functions/tuple_utils.py | 44 +++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py index 4e1243235..8af94ffd2 100644 --- a/sdc/functions/tuple_utils.py +++ b/sdc/functions/tuple_utils.py @@ -30,6 +30,7 @@ from numba import types from numba.extending import intrinsic from numba.core.typing.templates import (signature, ) +from numba.typed.dictobject import build_map from sdc.utilities.utils import sdc_overload @@ -248,11 +249,37 @@ def {func_impl_name}(x, y): # return sdc_tuple_zip_impl +@intrinsic +def literal_dict_ctor(typingctx, items): + + tup_size = len(items) + key_order = {p[0].literal_value: i for i, p in enumerate(items)} + ret_type = types.LiteralStrKeyDict(dict(items), key_order) + + def codegen(context, builder, sig, args): + items_val = args[0] + + # extract elements from the input tuple, incref and add pairs of + # extracted variables into a list, required by build_map + repacked_items = [] + for i in range(tup_size): + elem = builder.extract_value(items_val, i) + elem_first = builder.extract_value(elem, 0) + elem_second = builder.extract_value(elem, 1) + repacked_items.append((elem_first, elem_second)) + if context.enable_nrt: + context.nrt.incref(builder, items[i], elem) + d = build_map(context, builder, ret_type, items, repacked_items) + return d + + return ret_type(items), codegen + + @sdc_overload(dict) def dict_from_tuples_ovld(x): 
accepted_tuple_types = (types.Tuple, types.UniTuple) - if not isinstance(x, accepted_tuple_types): + if not isinstance(x, types.BaseAnonymousTuple): return None def check_tuple_element(ty): @@ -264,15 +291,6 @@ def check_tuple_element(ty): if not (len(x) != 0 and all(map(check_tuple_element, x))): assert False, f"Creating LiteralStrKeyDict not supported from pairs of: {x}" - # numba type-infers {'A': [1, 2, 3]} i.e. const dict of size 1 not as LiteralStrKeyDict - # but as non literal dict! TO-DO: add special branch here and call literal dict ctor directly - func_impl_name = 'dict_from_tuples_impl' - dict_elements = ', '.join([f"x[{i}][0]:x[{i}][1]" for i in range(len(x))]) - func_text = dedent(f""" - def {func_impl_name}(x): - res = {{{dict_elements}}} - return res - """) - use_globals, use_locals = {}, {} - exec(func_text, use_globals, use_locals) - return use_locals[func_impl_name] + def dict_from_tuples_impl(x): + return literal_dict_ctor(x) + return dict_from_tuples_impl From 6f21fb4c075ee01b29238a4ebd2ca919cc2cc411 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Wed, 17 Nov 2021 18:23:27 +0300 Subject: [PATCH 4/5] Fixing refcnt issue and adding tests --- sdc/functions/tuple_utils.py | 5 +--- sdc/tests/test_basic.py | 49 +++++++++++++++++++++++++++++++++++- sdc/tests/test_utils.py | 7 ++++++ 3 files changed, 56 insertions(+), 5 deletions(-) diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py index 8af94ffd2..aa41db68f 100644 --- a/sdc/functions/tuple_utils.py +++ b/sdc/functions/tuple_utils.py @@ -259,16 +259,13 @@ def literal_dict_ctor(typingctx, items): def codegen(context, builder, sig, args): items_val = args[0] - # extract elements from the input tuple, incref and add pairs of - # extracted variables into a list, required by build_map + # extract elements from the input tuple and repack into a list of variables required by build_map repacked_items = [] for i in range(tup_size): elem = builder.extract_value(items_val, i) 
elem_first = builder.extract_value(elem, 0) elem_second = builder.extract_value(elem, 1) repacked_items.append((elem_first, elem_second)) - if context.enable_nrt: - context.nrt.incref(builder, items[i], elem) d = build_map(context, builder, ret_type, items, repacked_items) return d diff --git a/sdc/tests/test_basic.py b/sdc/tests/test_basic.py index 413905dc9..0809de2ea 100644 --- a/sdc/tests/test_basic.py +++ b/sdc/tests/test_basic.py @@ -30,7 +30,10 @@ import pandas as pd import random import unittest +from itertools import product + from numba import types +from numba.tests.support import MemoryLeakMixin import sdc from sdc.tests.test_base import TestCase @@ -43,7 +46,8 @@ dist_IR_contains, get_rank, get_start_end, - skip_numba_jit) + skip_numba_jit, + assert_nbtype_for_varname) def get_np_state_ptr(): @@ -540,5 +544,48 @@ def test_rhs(arr_len): np.testing.assert_allclose(A, B) +class TestPython(MemoryLeakMixin, TestCase): + + def test_literal_dict_ctor(self): + """ Verifies that dict builtin creates LiteralStrKeyDict from tuple + of pairs ('col_name_i', col_data_i), where col_name_i is literal string """ + + def test_impl_1(): + items = (('A', np.arange(11)), ) + res = dict(items) + return len(res) + + def test_impl_2(): + items = (('A', np.arange(5)), ('B', np.ones(11)), ) + res = dict(items) + return len(res) + + local_vars = locals() + list_tested_fns = [local_vars[k] for k in local_vars.keys() if k.startswith('test_impl')] + + for test_impl in list_tested_fns: + with self.subTest(tested_func_name=test_impl.__name__): + sdc_func = self.jit(test_impl) + self.assertEqual(sdc_func(), test_impl()) + assert_nbtype_for_varname(self, sdc_func, 'res', types.LiteralStrKeyDict) + + def test_dict_zip_rewrite(self): + """ Verifies that a compination of dict(zip()) creates LiteralStrKeyDict when + zip is applied to tuples of literal column names and columns data """ + + from sdc.functions.tuple_utils import sdc_tuple_zip + dict_keys = ('A', 'B') + dict_values = 
(np.ones(5), np.array([1, 2, 3])) + + def test_impl(): + res = dict(sdc_tuple_zip(dict_keys, dict_values)) + return len(res) + + sdc_func = self.jit(test_impl) + expected = len(dict(zip(dict_keys, dict_values))) + self.assertEqual(sdc_func(), expected) + assert_nbtype_for_varname(self, sdc_func, 'res', types.LiteralStrKeyDict) + + if __name__ == "__main__": unittest.main() diff --git a/sdc/tests/test_utils.py b/sdc/tests/test_utils.py index 110c7424b..571da7d1a 100644 --- a/sdc/tests/test_utils.py +++ b/sdc/tests/test_utils.py @@ -272,3 +272,10 @@ def _make_func_from_text(func_text, func_name='test_impl', global_vars={}): exec(func_text, global_vars, loc_vars) test_impl = loc_vars[func_name] return test_impl + + +def assert_nbtype_for_varname(self, disp, var, expected_type, fn_sig=None): + fn_sig = fn_sig or disp.nopython_signatures[0] + cres = disp.get_compile_result(fn_sig) + fn_typemap = cres.type_annotation.typemap + self.assertIsInstance(fn_typemap[var], expected_type) From 7b828c131632862e5d09a2f4357e595b93ef291b Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Wed, 17 Nov 2021 20:44:18 +0300 Subject: [PATCH 5/5] Adding rewrite for dict(zip()) calls --- sdc/__init__.py | 1 + sdc/rewrites/dict_zip_tuples.py | 78 +++++++++++++++++++++++++++++++++ sdc/tests/test_basic.py | 3 +- 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 sdc/rewrites/dict_zip_tuples.py diff --git a/sdc/__init__.py b/sdc/__init__.py index e73c51682..0c3235441 100644 --- a/sdc/__init__.py +++ b/sdc/__init__.py @@ -70,6 +70,7 @@ import sdc.rewrites.dataframe_constructor import sdc.rewrites.read_csv_consts +import sdc.rewrites.dict_zip_tuples import sdc.rewrites.dataframe_getitem_attribute import sdc.datatypes.hpat_pandas_functions import sdc.datatypes.hpat_pandas_dataframe_functions diff --git a/sdc/rewrites/dict_zip_tuples.py b/sdc/rewrites/dict_zip_tuples.py new file mode 100644 index 000000000..96e38f5ca --- /dev/null +++ b/sdc/rewrites/dict_zip_tuples.py @@ -0,0 
+1,78 @@ +# ***************************************************************************** +# Copyright (c) 2019-2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba.core.rewrites import register_rewrite, Rewrite +from numba.core.ir_utils import guard, get_definition +from numba import errors +from numba.core import ir + +from sdc.rewrites.ir_utils import find_operations, import_function +from sdc.functions.tuple_utils import sdc_tuple_zip + + +@register_rewrite('before-inference') +class RewriteDictZip(Rewrite): + """ + Searches for calls like dict(zip(arg1, arg2)) and replaces zip with sdc_zip. 
+ """ + + def match(self, func_ir, block, typemap, calltypes): + + self._block = block + self._func_ir = func_ir + self._calls_to_rewrite = set() + + # Find all assignments with a RHS expr being a call to dict, and where arg + # is a call to zip and store these ir.Expr for further modification + for inst in find_operations(block=block, op_name='call'): + expr = inst.value + try: + callee = func_ir.infer_constant(expr.func) + except errors.ConstantInferenceError: + continue + + if (callee is dict and len(expr.args) == 1): + dict_arg_expr = guard(get_definition, func_ir, expr.args[0]) + if (getattr(dict_arg_expr, 'op', None) == 'call'): + called_func = guard(get_definition, func_ir, dict_arg_expr.func) + if (called_func.value is zip and len(dict_arg_expr.args) == 2): + self._calls_to_rewrite.add(dict_arg_expr) + + return len(self._calls_to_rewrite) > 0 + + def apply(self): + """ + Replace call to zip in matched expressions with call to sdc_zip. + """ + new_block = self._block.copy() + new_block.clear() + zip_spec_stmt = import_function(sdc_tuple_zip, new_block, self._func_ir) + for inst in self._block.body: + if isinstance(inst, ir.Assign) and inst.value in self._calls_to_rewrite: + expr = inst.value + expr.func = zip_spec_stmt.target # injects the new function + new_block.append(inst) + return new_block diff --git a/sdc/tests/test_basic.py b/sdc/tests/test_basic.py index 0809de2ea..21cd245f3 100644 --- a/sdc/tests/test_basic.py +++ b/sdc/tests/test_basic.py @@ -573,12 +573,11 @@ def test_dict_zip_rewrite(self): """ Verifies that a compination of dict(zip()) creates LiteralStrKeyDict when zip is applied to tuples of literal column names and columns data """ - from sdc.functions.tuple_utils import sdc_tuple_zip dict_keys = ('A', 'B') dict_values = (np.ones(5), np.array([1, 2, 3])) def test_impl(): - res = dict(sdc_tuple_zip(dict_keys, dict_values)) + res = dict(zip(dict_keys, dict_values)) return len(res) sdc_func = self.jit(test_impl)