From 5e4c42022c4a7469c51f3ed1aeb6c486122b26c5 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Sat, 12 Jun 2021 19:14:12 +0300 Subject: [PATCH 1/7] Initial version of pandas MultiIndex class --- sdc/__init__.py | 1 + sdc/datatypes/indexes/__init__.py | 1 + sdc/datatypes/indexes/multi_index_type.py | 147 ++ sdc/extensions/indexes/indexes_generic.py | 37 +- sdc/extensions/indexes/multi_index_ext.py | 1574 +++++++++++++++++++++ sdc/hiframes/api.py | 21 +- sdc/tests/indexes/__init__.py | 1 + sdc/tests/indexes/index_datagens.py | 90 +- sdc/tests/indexes/test_multi_index.py | 692 +++++++++ sdc/tests/test_utils.py | 13 + sdc/utilities/sdc_typing_utils.py | 4 + test_create_multiindex.py | 52 + 12 files changed, 2626 insertions(+), 7 deletions(-) create mode 100644 sdc/datatypes/indexes/multi_index_type.py create mode 100644 sdc/extensions/indexes/multi_index_ext.py create mode 100644 sdc/tests/indexes/test_multi_index.py create mode 100644 test_create_multiindex.py diff --git a/sdc/__init__.py b/sdc/__init__.py index 76c29ae97..e73c51682 100644 --- a/sdc/__init__.py +++ b/sdc/__init__.py @@ -49,6 +49,7 @@ import sdc.extensions.indexes.range_index_ext import sdc.extensions.indexes.int64_index_ext +import sdc.extensions.indexes.multi_index_ext import sdc.extensions.sdc_hashmap_ext diff --git a/sdc/datatypes/indexes/__init__.py b/sdc/datatypes/indexes/__init__.py index 52d144708..cae399e6d 100644 --- a/sdc/datatypes/indexes/__init__.py +++ b/sdc/datatypes/indexes/__init__.py @@ -30,3 +30,4 @@ from .positional_index_type import PositionalIndexType from .empty_index_type import EmptyIndexType from .int64_index_type import Int64IndexType +from .multi_index_type import MultiIndexType \ No newline at end of file diff --git a/sdc/datatypes/indexes/multi_index_type.py b/sdc/datatypes/indexes/multi_index_type.py new file mode 100644 index 000000000..ddb54bfb4 --- /dev/null +++ b/sdc/datatypes/indexes/multi_index_type.py @@ -0,0 +1,147 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper, + typeof_impl, +) +from numba.core.typing.typeof import _typeof_type as numba_typeof_type + + +class MultiIndexIteratorType(types.SimpleIteratorType): + def __init__(self, iterable): + self.parent = iterable + yield_type = iterable.dtype + name = "iter[{}->{}],{}".format( + iterable, yield_type, iterable.name + ) + super(MultiIndexIteratorType, self).__init__(name, yield_type) + + +@register_model(MultiIndexIteratorType) +class MultiIndexIterModel(models.StructModel): + def __init__(self, dmm, fe_type): + members = [ + ('parent', fe_type.parent), # reference to the index object + ('state', types.CPointer(types.int64)), # iterator state (i.e. counter) + ] + super(MultiIndexIterModel, self).__init__(dmm, fe_type, members) + + +class MultiIndexType(types.IterableType): + + def __init__(self, levels, codes, is_named=False): + self.levels = levels + self.codes = codes + self.is_named = is_named + super(MultiIndexType, self).__init__( + name='MultiIndexType({}, {}, {})'.format(levels, codes, is_named)) + + @property + def iterator_type(self): + return MultiIndexIteratorType(self).iterator_type + + @property + def dtype(self): + nlevels = len(self.levels) + levels_types = [self.levels.dtype] * nlevels if isinstance(self.levels, types.UniTuple) else self.levels + return types.Tuple.from_types([level.dtype for level in levels_types]) + + @property + def nlevels(self): + return len(self.levels) + + @property + def levels_types(self): + if isinstance(self.levels, types.UniTuple): + return [self.levels.dtype] * self.levels.count + + return self.levels + + @property + def codes_types(self): + if isinstance(self.codes, types.UniTuple): + return [self.codes.dtype] * self.codes.count + + return self.codes + + +@register_model(MultiIndexType) +class MultiIndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + levels_type = fe_type.levels + codes_type = fe_type.codes + name_type = types.unicode_type if fe_type.is_named else types.none # TO-DO: change to types.Optional + members = [ + ('levels', levels_type), + ('codes', codes_type), + ('name', name_type), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(MultiIndexType, 'levels', '_levels') +make_attribute_wrapper(MultiIndexType, 'codes', '_codes') +make_attribute_wrapper(MultiIndexType, 'name', '_name') + + +#### FIXME: move below to one common place: + +# FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef +# we have to use our new SdcTypeRef to type objects created from types.Type +# (i.e. ConcurrentDict meta-type). This should be removed once it's fixed. +class SdcTypeRef(types.Dummy): + """Reference to a type. + Used when a type is passed as a value. + """ + def __init__(self, instance_type): + self.instance_type = instance_type + super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) + + +@register_model(SdcTypeRef) +class SdcTypeRefModel(models.OpaqueModel): + def __init__(self, dmm, fe_type): + + models.OpaqueModel.__init__(self, dmm, fe_type) + + +import pandas as pd +@typeof_impl.register(type) +def mynew_typeof_type(val, c): + """ This function is a workaround for """ + + # print("DEBUG: val=", val) + if not issubclass(val, pd.MultiIndex): + # if not issubclass(val, MultiIndex): + return numba_typeof_type(val, c) + else: + return SdcTypeRef(MultiIndexType) diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py index 3462067cc..0d1a8710f 100644 --- a/sdc/extensions/indexes/indexes_generic.py +++ b/sdc/extensions/indexes/indexes_generic.py @@ -96,7 +96,10 @@ def sdc_indexes_operator_eq_ovld(self, other): # TO-DO: this is for numeric indexes only now, extend to string-index when it's added use_self_values = isinstance(self, sdc_pandas_index_types) and not isinstance(self, types.Array) use_other_values = isinstance(other, sdc_pandas_index_types) and not isinstance(other, types.Array) - one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) + + ## prev. version: one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) + # FIXME: check that one_operand_is_scalar is fixed and works in tests now + one_operand_is_scalar = self is other.dtype or other is self.dtype def sdc_indexes_operator_eq_impl(self, other): @@ -217,8 +220,8 @@ def pd_fix_indexes_join_overload(joined, indexer1, indexer2): """ Wraps pandas index.join() into new function that returns indexers as arrays and not optional(array) """ # This function is simply a workaround for problem with parfor lowering - # broken by indexers typed as types.Optional(Array) - FIXME_Numba#XXXX: remove it - # in all places whne parfor issue is fixed + # broken by indexers typed as types.Optional(Array) - FIXME_Numba#6686: remove it + # in all places when parfor issue is fixed def pd_fix_indexes_join_impl(joined, indexer1, indexer2): if indexer1 is not None: _indexer1 = _nonoptional(indexer1) @@ -282,3 +285,31 @@ def sdc_np_array_overload(A): if isinstance(A, Int64IndexType): return lambda A: A._data + + +def sdc_indexes_take(self, target): + pass + + +@sdc_overload(sdc_indexes_take) +def pd_fix_indexes_take_overload(self, indexes): + """ Simply workaround for not having take method as unique indexes due to + the fact that StringArrayType is one of the index types """ + + check = isinstance(self, sdc_pandas_index_types) + print("DEBUG: sdc_indexes_take typing:", self, check) + if not isinstance(self, sdc_pandas_index_types): + return None + + index_api_supported = not isinstance(self, sdc_old_index_types) + + def pd_fix_indexes_take_impl(self, indexes): + + if index_api_supported == True: # noqa + res = self.take(indexes) + else: + res = numpy_like.take(self, indexes) + + return res + + return pd_fix_indexes_take_impl diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py new file mode 100644 index 000000000..b40666d09 --- /dev/null +++ b/sdc/extensions/indexes/multi_index_ext.py @@ -0,0 +1,1574 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import operator +import pandas as pd + +from numba import types, prange +from numba.core import cgutils +from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, type_callable) +from numba.core.errors import TypingError +from numba.core.typing.templates import signature, AttributeTemplate, AbstractTemplate, infer_getattr +from numba.core.imputils import impl_ret_untracked, call_getiter, impl_ret_borrowed +from numba.core.imputils import (impl_ret_new_ref, impl_ret_borrowed, iternext_impl, RefType) +from numba.core.boxing import box_array, unbox_array +from numba.core.boxing import box_array, unbox_array, box_tuple + +import llvmlite.llvmpy.core as lc + +from sdc.datatypes.indexes import * +from sdc.utilities.sdc_typing_utils import SDCLimitation +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral +from sdc.utilities.sdc_typing_utils import ( + TypeChecker, + check_signed_integer, + _check_dtype_param_type, + sdc_pandas_index_types, + check_types_comparable, + ) +from sdc.functions import numpy_like +from sdc.hiframes.api import fix_df_array, fix_df_index +from sdc.hiframes.boxing import _infer_index_type, _unbox_index_data +from sdc.extensions.indexes.indexes_generic import * +from sdc.datatypes.common_functions import hpat_arrays_append + +from sdc.datatypes.indexes.multi_index_type import MultiIndexIteratorType +from numba.core.extending import register_jitable +from numba import literal_unroll +from numba.typed import Dict, List +from sdc.str_arr_type import StringArrayType +from sdc.extensions.indexes.positional_index_ext import init_positional_index +from sdc.extensions.indexes.empty_index_ext import init_empty_index + +from sdc.datatypes.indexes.multi_index_type import SdcTypeRef +from sdc.hiframes.boxing import _infer_index_type + + +### FIXME: clean-up imports + + +@intrinsic +def init_multi_index(typingctx, levels, codes): + + print("DEBUG: init_multi_index typing:\n", + f"\tlevels={levels}\n", + f"\tcodes={codes}\n") + if not (isinstance(levels, (types.Tuple, types.UniTuple)) and + isinstance(codes, (types.Tuple, types.UniTuple))): + assert False, "init_multi_index types " + return None + + def is_valid_level_type(typ): + return isinstance(typ, sdc_pandas_index_types) + + def is_valid_code_type(typ): + return (isinstance(typ, types.Array) and isinstance(typ.dtype, types.Integer)) + + if not all(map(is_valid_level_type, levels)): + return None + + if not all(map(is_valid_code_type, codes)): + return None + + def codegen(context, builder, sig, args): + levels_val, codes_val = args + # create series struct and store values + multi_index = cgutils.create_struct_proxy( + sig.return_type)(context, builder) + + multi_index.levels = levels_val + multi_index.codes = codes_val + multi_index.name = context.get_dummy_value() + + if context.enable_nrt: + context.nrt.incref(builder, sig.args[0], levels_val) + context.nrt.incref(builder, sig.args[1], codes_val) + + return multi_index._getvalue() + + ret_typ = MultiIndexType(levels, codes, is_named=False) # pandas ctor always creates unnamed indexes + sig = signature(ret_typ, levels, codes) + return sig, codegen + + +@sdc_overload(len) +def pd_multi_index_len_overload(self): + if not isinstance(self, MultiIndexType): + return None + + def pd_multi_index_len_impl(self): + return len(self._codes[0]) + + return pd_multi_index_len_impl + + +@intrinsic +def _multi_index_getitem_impl(typingctx, self, idx): + if not isinstance(self, MultiIndexType): + return None + + nlevels = self.nlevels + levels_types = self.levels_types + codes_types = self.codes_types + ret_type = types.Tuple.from_types([index.dtype for index in levels_types]) + + def codegen(context, builder, sig, args): + self_val, idx_val = args + self_ctinfo = context.make_helper(builder, self, self_val) + + res_elements = [] + for level_index in range(nlevels): + level = builder.extract_value(self_ctinfo.levels, level_index) + code = builder.extract_value(self_ctinfo.codes, level_index) + element = context.compile_internal( + builder, + lambda index, code, i: index[code[i]], + signature(levels_types[level_index].dtype, levels_types[level_index], codes_types[level_index], idx), + [level, code, idx_val] + ) + res_elements.append(element) + + return context.make_tuple(builder, ret_type, res_elements) + + return ret_type(self, idx), codegen + + +@sdc_overload(operator.getitem) +def pd_multi_index_getitem_overload(self, idx): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Operator getitem().' + ty_checker = TypeChecker(_func_name) + print("DEBUG: pd_multi_index_getitem_overload typing") + + if not (isinstance(idx, (types.Integer, types.SliceType)) + or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))): + ty_checker.raise_exc(idx, 'integer, slice, integer array or list', 'idx') + + if isinstance(idx, types.Integer): + def pd_multi_index_getitem_idx_scalar_impl(self, idx): + index_len = len(self) + print("DEBUG: pd_multi_index_getitem_impl: index_len=", index_len, "idx=", idx) + # FIXME_Numba#5801: Numba type unification rules make this float + idx = types.int64((index_len + idx) if idx < 0 else idx) + if (idx < 0 or idx >= index_len): + raise IndexError("MultiIndex.getitem: index is out of bounds") + + return _multi_index_getitem_impl(self, idx) + + return pd_multi_index_getitem_idx_scalar_impl + + # FIXME: check why Int64Index uses numpy_array but not numpy_like in this case? + elif isinstance(idx, types.SliceType): + def pd_multi_index_getitem_idx_slice_impl(self, idx): + + new_levels = self._levels + new_codes = sdc_tuple_map( + lambda arr_codes, taken_idxs: arr_codes[taken_idxs], + self._codes, + idx + ) + return pd.MultiIndex(new_levels, new_codes) + + return pd_multi_index_getitem_idx_slice_impl + + elif isinstance(idx, types.Array) and isinstance(idx.dtype, types.Boolean): + def pd_multi_index_getitem_idx_bool_array_impl(self, idx): + + new_levels = self._levels + new_codes = sdc_tuple_map( + lambda arr_codes, taken_idxs: numpy_like.getitem_by_mask(arr_codes, taken_idxs), + self._codes, + idx + ) + return pd.MultiIndex(new_levels, new_codes) + + return pd_multi_index_getitem_idx_bool_array_impl + + elif isinstance(idx, types.Array) and isinstance(idx.dtype, types.Integer): + def pd_multi_index_getitem_as_take_impl(self, idx): + return self.take(idx) + + return pd_multi_index_getitem_as_take_impl + + + + +@sdc_overload_attribute(MultiIndexType, 'values') +def pd_multi_index_values_overload(self): + if not isinstance(self, MultiIndexType): + return None + + # FIXME: we return a list for now, as there's no arrays of tuples in numba, nor other + # sequence container that is boxed to dtype=object numpy array. TO-DO: replace with other type? + def pd_multi_index_values_impl(self): + res = [] + for i in range(len(self)): + res.append(self[i]) + return res + + return pd_multi_index_values_impl + + +@sdc_overload_attribute(MultiIndexType, 'dtype') +def pd_multi_index_dtype_overload(self): + if not isinstance(self, MultiIndexType): + return None + + mindex_dtype = self.dtype + + def pd_multi_index_dtype_impl(self): + return mindex_dtype + + return pd_multi_index_dtype_impl + + +@sdc_overload_attribute(MultiIndexType, 'levels') +def pd_multi_index_levels_overload(self): + if not isinstance(self, MultiIndexType): + return None + + def pd_multi_index_levels_impl(self): + return self._levels + + return pd_multi_index_levels_impl + + +@sdc_overload_attribute(MultiIndexType, 'codes') +def codespd_multi_index_levels_overload(self): + if not isinstance(self, MultiIndexType): + return None + + def pd_multi_index_codes_impl(self): + return self._codes + + return pd_multi_index_codes_impl + + +@typeof_impl.register(pd.MultiIndex) +def typeof_multi_index(val, c): + print(f"DEBUG: typeof_impl: val={val}") + levels = tuple(_infer_index_type(x) for x in val.levels) + print(f"DEBUG: typeof_impl: levels={levels}") + codes = tuple(numba.typeof(x) for x in val.codes) # note this produces readonly array(int8, 1d, C) + is_named = val.name is not None + + return MultiIndexType(types.Tuple.from_types(levels), + types.Tuple.from_types(codes), + is_named=is_named) + + +@box(MultiIndexType) +def box_multi_index(typ, val, c): + + print("DEBUG: typ.levels=", typ.levels) + mod_name = c.context.insert_const_string(c.builder.module, "pandas") + pd_class_obj = c.pyapi.import_module_noblock(mod_name) + + multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder, val) + + py_levels = box_tuple(typ.levels, multi_index.levels, c) + py_codes = box_tuple(typ.codes, multi_index.codes, c) + + # dtype and copy params are not stored so use default values + dtype = c.pyapi.make_none() + copy = c.pyapi.bool_from_bool( + c.context.get_constant(types.bool_, False) + ) + sortorder = c.pyapi.make_none() + + if typ.is_named: + name = c.pyapi.from_native_value(types.unicode_type, multi_index.name) + else: + name = c.pyapi.make_none() + + # build MultiIndex names from names of boxed levels (if python level has name attribute) + # TO-DO: refactor this to use native indexes names when all index have it (e.g. StringIndexType) + nlevels = len(typ.levels) + py_nlevels = c.pyapi.tuple_size(py_levels) + py_names = c.pyapi.list_new(py_nlevels) + for i in range(nlevels): + level_type = typ.levels[i] + if isinstance(level_type, sdc_old_index_types): + py_level_name = c.pyapi.make_none() + else: + py_level_obj = c.pyapi.tuple_getitem(py_levels, i) + py_level_name = c.pyapi.object_getattr_string(py_level_obj, 'name') + c.pyapi.list_setitem(py_names, c.context.get_constant(types.intp, i), py_level_name) + # FIXME: check decref is needed for pe_level_obj? + + res = c.pyapi.call_method(pd_class_obj, "MultiIndex", + (py_levels, py_codes, sortorder, py_names, dtype, copy, name)) + + c.pyapi.decref(py_levels) + c.pyapi.decref(py_codes) + c.pyapi.decref(sortorder) + c.pyapi.decref(py_names) + c.pyapi.decref(dtype) + c.pyapi.decref(copy) + c.pyapi.decref(name) + c.pyapi.decref(pd_class_obj) + return res + + +@unbox(MultiIndexType) +def unbox_int64_index(typ, val, c): + + nlevels = len(typ.levels) + levels_types = typ.levels_types + codes_types = typ.codes_types + multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + + py_levels_data = c.pyapi.object_getattr_string(val, "levels") + native_levels_data = [] + for i in range(nlevels): + idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i)) + level_data = c.pyapi.object_getitem(py_levels_data, idx) + native_levels_data.append( + _unbox_index_data(levels_types[i], level_data, c).value + ) + c.pyapi.decref(level_data) + c.pyapi.decref(py_levels_data) + multi_index.levels = c.context.make_tuple(c.builder, typ.levels, native_levels_data) + + py_codes_data = c.pyapi.object_getattr_string(val, "codes") + native_codes_data = [] + for i in range(nlevels): + idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i)) + code_data = c.pyapi.object_getitem(py_codes_data, idx) + native_codes_data.append( + unbox_array(codes_types[i], code_data, c).value + ) + c.pyapi.decref(code_data) + c.pyapi.decref(py_codes_data) + multi_index.codes = c.context.make_tuple(c.builder, typ.codes, native_codes_data) + + if typ.is_named: + name_obj = c.pyapi.object_getattr_string(val, "name") + multi_index.name = numba.cpython.unicode.unbox_unicode_str( + types.unicode_type, name_obj, c).value + c.pyapi.decref(name_obj) + + is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) + return NativeValue(multi_index._getvalue(), is_error=is_error) + + +@sdc_overload_method(MultiIndexType, 'take') +def pd_multi_index_take_overload(self, indexes): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Method take().' + ty_checker = TypeChecker(_func_name) + + valid_indexes_types = (types.Array, types.List, types.ListType) + sdc_pandas_index_types + if not (isinstance(indexes, valid_indexes_types) + and isinstance(indexes.dtype, (types.Integer, types.ListType))): + ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes') + + def pd_multi_index_take_impl(self, indexes): + new_levels = self._levels + new_codes = sdc_tuple_map( + lambda idx, taken_idxs: sdc_indexes_take(idx, taken_idxs), + self._codes, + indexes) + return pd.MultiIndex(new_levels, new_codes) + + return pd_multi_index_take_impl + + +@sdc_overload_attribute(MultiIndexType, 'nlevels') +def pd_multi_index_nlevels_overload(self): + if not isinstance(self, MultiIndexType): + return None + + nlevels_value = len(self.levels) + + def pd_multi_index_nlevels_impl(self): + return nlevels_value + + return pd_multi_index_nlevels_impl + + +@sdc_overload_attribute(MultiIndexType, 'name') +def pd_multi_index_name_overload(self): + if not isinstance(self, MultiIndexType): + return None + + is_named_index = self.is_named + + def pd_multi_index_name_impl(self): + if is_named_index == True: # noqa + return self._name + else: + return None + + return pd_multi_index_name_impl + + +@sdc_overload_attribute(MultiIndexType, 'names') +def pd_multi_index_names_overload(self): + if not isinstance(self, MultiIndexType): + return None + + def pd_multi_index_names_impl(self): + levels_names = sdc_tuple_map( + lambda x: x.name, + self._levels + ) + + # this exploits undesired side-effect of literal_unroll - type-unification + # of resulting list dtype that will be types.Optional(types.unicode_type) + # as using typed.List of Optional values currently fails to compile + res = [] + for i in literal_unroll(levels_names): + res.append(i) + return res + + return pd_multi_index_names_impl + + +# FIXME: move to a different file? +def cat_array_equal(A, codes_A, B, codes_B): + pass + + +@sdc_overload(cat_array_equal) +def sdc_cat_array_equal_overload(A, codes_A, B, codes_B): + + def sdc_cat_array_equal_impl(A, codes_A, B, codes_B): + if len(codes_A) != len(codes_B): + return False + + # FIXME_Numba#5157: change to simple A == B when issue is resolved + eq_res_size = len(codes_A) + eq_res = np.empty(eq_res_size, dtype=types.bool_) + for i in numba.prange(eq_res_size): + eq_res[i] = A[codes_A[i]] == B[codes_B[i]] + return np.all(eq_res) + + return sdc_cat_array_equal_impl + + +@intrinsic +def _multi_index_binop_helper(typingctx, self, other): + """ This function gets two multi_index objects each represented as + Tuple(levels) and Tuple(codes) and repacks these into Tuple of following + elements (self_level_0, self_codes_0, other_level_0, other_codes_0), etc + """ + + nlevels = len(self.levels) + if not len(self.levels) == len(other.levels): + assert True, "Cannot flatten MultiIndex of different nlevels" + + elements_types = zip(self.levels, self.codes, other.levels, other.codes) + ret_type = types.Tuple([types.Tuple.from_types(x) for x in elements_types]) + + def codegen(context, builder, sig, args): + self_val, other_val = args + + self_ctinfo = cgutils.create_struct_proxy(self)( + context, builder, value=self_val) + self_levels = self_ctinfo.levels + self_codes = self_ctinfo.codes + + other_ctinfo = cgutils.create_struct_proxy(other)( + context, builder, value=other_val) + other_levels = other_ctinfo.levels + other_codes = other_ctinfo.codes + + ret_tuples = [] + for i in range(nlevels): + self_level_i = builder.extract_value(self_levels, i) + self_codes_i = builder.extract_value(self_codes, i) + other_level_i = builder.extract_value(other_levels, i) + other_codes_i = builder.extract_value(other_codes, i) + + ret_tuples.append( + context.make_tuple(builder, + ret_type[i], + [self_level_i, self_codes_i, other_level_i, other_codes_i]) + ) + + if context.enable_nrt: + context.nrt.incref(builder, ret_type[i][0], self_level_i) + context.nrt.incref(builder, ret_type[i][1], self_codes_i) + context.nrt.incref(builder, ret_type[i][2], other_level_i) + context.nrt.incref(builder, ret_type[i][3], other_codes_i) + + res = context.make_tuple(builder, ret_type, ret_tuples) + return res + + return ret_type(self, other), codegen + + +@sdc_overload_method(MultiIndexType, 'equals') +def pd_multi_index_equals_overload(self, other): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Method equals().' + # FIXME: add proper type-checks +# if not isinstance(other, MultiIndexType): +# raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}") + + def pd_multi_index_equals_impl(self, other): + + if self.nlevels != other.nlevels: + return False + + self_and_other_data = _multi_index_binop_helper(self, other) + tup_levels_cmp_res = sdc_tuple_map( + lambda x: cat_array_equal(*x), + self_and_other_data, + ) + + # np.all is not supported for Tuples and below compiles a bit faster + # than 'np.all(np.array(list(tup_levels_cmp_res)))' + for cmp_res in tup_levels_cmp_res: + if not cmp_res: + return False + return True + + return pd_multi_index_equals_impl + + +# FIXME: move to another file? +def _build_index_map(self): + pass + + +@sdc_overload(_build_index_map) +def _build_index_map_ovld(self): + + indexer_dtype = self.dtype + indexer_value_type = types.ListType(types.int64) + + def _build_index_map(self): + indexer_map = Dict.empty(indexer_dtype, indexer_value_type) + for i in range(len(self)): + val = self[i] + index_list = indexer_map.get(val, None) + if index_list is None: + indexer_map[val] = List.empty_list(types.int64) + indexer_map[val].append(i) + else: + index_list.append(i) + + return indexer_map + + return _build_index_map + + +@sdc_overload(operator.contains) +def pd_multi_index_contains_overload(self, label): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Method contains().' + # FIXME: add proper type-checks +# if not isinstance(other, MultiIndexType): +# raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}") + + def pd_multi_index_contains_impl(self, label): + + # build indexer_map (should already been built in index ctor?) + indexer_map = _build_index_map(self) + res = label in indexer_map + return res + + return pd_multi_index_contains_impl + + +@sdc_overload(operator.eq) +def pd_multi_index_eq_overload(self, other): + + _func_name = 'Operator eq.' + + self_is_multi_index = isinstance(self, MultiIndexType) + other_is_multi_index = isinstance(other, MultiIndexType) + both_are_multi_indexes = self_is_multi_index and other_is_multi_index + if not (both_are_multi_indexes and check_types_comparable(self, other) + or (self_is_multi_index and other is getattr(self, 'dtype', types.none)) + or (self is getattr(other, 'dtype', types.none) and other_is_multi_index)): + raise TypingError('{} Not allowed for non comparable types. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_multi_index_eq_impl(self, other): + + if both_are_multi_indexes == True: # noqa + self_size = len(self) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if self.nlevels != other.nlevels: + res = np.zeros(self_size, dtype=types.bool_) + else: + res = np.empty(self_size, dtype=types.bool_) + for i in prange(self_size): + res[i] = self[i] == other[i] + + elif self_is_multi_index == True: # noqa + self_size = len(self) + res = np.empty(self_size, dtype=types.bool_) + for i in prange(self_size): + res[i] = self[i] == other + + else: + other_size = len(other) + res = np.empty(other_size, dtype=types.bool_) + for i in prange(other_size): + res[i] = self == other[i] + + return list(res) # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed + + return pd_multi_index_eq_impl + + +@sdc_overload_method(MultiIndexType, 'ravel') +def pd_multi_index_ravel_overload(self, order='C'): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Method ravel().' + + if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): + raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order)) + + def pd_multi_index_ravel_impl(self, order='C'): + # np.ravel argument order is not supported in Numba + if order != 'C': + raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") + + return self.values + + return pd_multi_index_ravel_impl + + +@sdc_overload(operator.ne) +def pd_multi_index_ne_overload(self, other): + + _func_name = 'Operator ne.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + self_is_multi_index = isinstance(self, MultiIndexType) + other_is_multi_index = isinstance(other, MultiIndexType) + + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types + if not (self_is_multi_index and other_is_multi_index + or (self_is_multi_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_multi_index)): + return None + + def pd_multi_index_ne_impl(self, other): + + eq_res = np.asarray(self == other) # FIXME_Numba#5157: remove np.asarray and return as list + return list(~eq_res) + + return pd_multi_index_ne_impl + + +@lower_builtin(operator.is_, MultiIndexType, MultiIndexType) +def pd_multi_index_is_overload(context, builder, sig, args): + + ty_lhs, ty_rhs = sig.args + if ty_lhs != ty_rhs: + return cgutils.false_bit + + lhs, rhs = args + lhs_ptr = builder.ptrtoint(lhs.operands[0], cgutils.intp_t) + rhs_ptr = builder.ptrtoint(rhs.operands[0], cgutils.intp_t) + return builder.icmp_signed('==', lhs_ptr, rhs_ptr) + + +@lower_builtin('getiter', MultiIndexType) +def impl_conc_dict_getiter(context, builder, sig, args): + index_type, = sig.args + index_val, = args + + it = context.make_helper(builder, index_type.iterator_type) + it.parent = index_val + zero = context.get_constant(types.intp, 0) + it.state = cgutils.alloca_once_value(builder, zero) + + res = it._getvalue() + return impl_ret_borrowed(context, builder, index_type.iterator_type, res) + + +@lower_builtin('iternext', MultiIndexIteratorType) +@iternext_impl(RefType.BORROWED) +def impl_iterator_iternext(context, builder, sig, args, result): + iter_type, = sig.args + iter_val, = args + + index_type = iter_type.parent + it = context.make_helper(builder, iter_type, iter_val) + + nitems = context.compile_internal( + builder, + lambda index: len(index), + signature(types.int64, index_type), + [it.parent] + ) + + index = builder.load(it.state) + is_valid = builder.icmp(lc.ICMP_SLT, index, nitems) + result.set_valid(is_valid) + + with builder.if_then(is_valid): + element = context.compile_internal( + builder, + lambda index, i: index[i], + signature(index_type.dtype, index_type, types.int64), + [it.parent, index] + ) + result.yield_(element) + nindex = cgutils.increment_index(builder, index) + builder.store(nindex, it.state) + + +@sdc_overload_method(MultiIndexType, 'reindex') +def pd_multi_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Method reindex().' + if not isinstance(target, sdc_pandas_index_types): + raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}") + + if not check_types_comparable(self, target): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, target={}'.format(_func_name, self, target)) + + # TO-DO: check why compilation time is more than 10 seconds + def pd_multi_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): + return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance) + + return pd_multi_index_reindex_impl + + + +@register_jitable +def _appender_build_map(index1, index2): + res = {} + for i, val in enumerate(index1): + if val not in res: + res[val] = i + + k, count = i, len(res) + while k < i + len(index2): + val = index2[k - i] + if val not in res: + res[val] = count + count += 1 + k += 1 + + return res + + +def _multi_index_append_level(A, codes_A, B, codes_B): + pass + + +@sdc_overload(_multi_index_append_level) +def _multi_index_append_overload(A, codes_A, B, codes_B): + + def _multi_index_append_impl(A, codes_A, B, codes_B): + + appender_map = _appender_build_map(A, B) + res_size = len(codes_A) + len(codes_B) + res_level = fix_df_index( + list(appender_map.keys()) + ) + + res_codes = np.empty(res_size, dtype=np.int64) + A_size = len(codes_A) + for i in prange(res_size): + if i < A_size: + res_codes[i] = codes_A[i] + else: + res_codes[i] = appender_map[B[codes_B[i - A_size]]] + + return (res_level, res_codes) + + return _multi_index_append_impl + + +@intrinsic +def sdc_tuple_unzip(typingctx, data_type): + """ This function gets tuple of pairs and repacks them into two tuples, holding + first and seconds elements, i.e. ((a, b), (c, d), (e, f)) -> ((a, c, e), (b, d, f)). """ + + _func_name = 'sdc_tuple_unzip' + _given_args_str = f'Given: data_type={data_type}' + assert isinstance(data_type, (types.Tuple, types.UniTuple)), \ + f"{_func_name} expects tuple as argument. {_given_args_str}" + + data_len = len(data_type) + assert data_len > 0, f"{_func_name}: empty tuple not allowed. {_given_args_str}" + + for x in data_type: + assert isinstance(x, (types.Tuple, types.UniTuple)) and len(x) == len(data_type[0]), \ + f"{_func_name}: non-supported tuple elements types. {_given_args_str}" + + ty_firsts, ty_seconds = map(lambda x: types.Tuple.from_types(x), + zip(*data_type)) + ret_type = types.Tuple([ty_firsts, ty_seconds]) + +# print(f"DEBUG: sdc_multi_index_repack typing: data_type={data_type}") +# print(f"DEBUG: sdc_multi_index_repack typing: ty_levels={ty_levels}") +# print(f"DEBUG: sdc_multi_index_repack typing: ty_codes={ty_codes}") +# print(f"DEBUG: sdc_multi_index_repack typing: ret_type={ret_type}") + + def codegen(context, builder, sig, args): + data_val, = args + + all_firsts = [] + all_seconds = [] + for i in range(data_len): + tup_element_i = builder.extract_value(data_val, i) + first_i = builder.extract_value(tup_element_i, 0) + second_i = builder.extract_value(tup_element_i, 1) + + all_firsts.append(first_i) + all_seconds.append(second_i) + + ### FIXME: building inserting arrays into new tuple and returning it + ### doesn't automatically increfs? Why below is needed? + if context.enable_nrt: + context.nrt.incref(builder, ty_firsts[i], first_i) + context.nrt.incref(builder, ty_seconds[i], second_i) + + first_tup = context.make_tuple(builder, ty_firsts, all_firsts) + second_tup = context.make_tuple(builder, ty_seconds, all_seconds) + return context.make_tuple(builder, ret_type, [first_tup, second_tup]) + + return ret_type(data_type), codegen + + +@sdc_overload_method(MultiIndexType, 'append') +def pd_multi_index_append_overload(self, other): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Method append().' + ty_checker = TypeChecker(_func_name) + + if not (isinstance(other, MultiIndexType)): + ty_checker.raise_exc(other, 'pandas MultiIndex', 'other') + + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_multi_index_append_impl(self, other): + + self_and_other_data = _multi_index_binop_helper(self, other) + tup_append_level_res = sdc_tuple_map( + lambda x: _multi_index_append_level(*x), + self_and_other_data + ) + + new_levels, new_codes = sdc_tuple_unzip(tup_append_level_res) + return pd.MultiIndex( + levels=new_levels, + codes=new_codes + ) + + return pd_multi_index_append_impl + + + +### FIXME: main question is not should we implement names at all +### but how to implement it? Pandas MultiIndex _name can be different +### than _names (list of level's names), e.g. when it's assigned +### but when created in ctor name argument is specifically checked to +### be index names, so we should probably stick to this behavior: +### ctor arg SHOULD BE list of unicodes! that needs to reset names of +### indexes that we get during construction (after fix_df_index) +def _sdc_multi_index_ctor_typer(typing_ctx, *args): + print("DEBUG: typer for SdcTypeRef: ", args) + + _func_name = '_sdc_multi_index_ctor_typer' + # this types subsequent call to sdc_pandas_multi_index_ctor function with signature: + # args = (levels, codes, sortorder=None, names=None, dtype=None, copy=False, name=None) + + assert len(args) >= 2, f"{_func_name}: Expecting 2 or more positional args, given: {args}" + + levels, codes = args[:2] + if not (isinstance(levels, (types.Tuple, types.UniTuple)) + and isinstance(codes, (types.Tuple, types.UniTuple))): + raise TypingError(f"{_func_name}: levels and codes args must be tuples, given: levels={levels}, codes={codes}") + + nlevels = len(levels) + ty_codes = types.Tuple.from_types( + [typing_ctx._resolve_user_function_type( + fix_df_array, (typ,), {}).return_type for typ in codes] + ) + + if len(args) >= 2 and not (isinstance(args[2], (types.NoneType, types.Omitted)) or args[2] is None): + assert False, f"{_func_name}: argument sortorder is not supported, given: {args[2]}" + if len(args) >= 3 and not (isinstance(args[3], (types.NoneType, types.Omitted)) or args[3] is None): + assert False, f"{_func_name}: argument names is not supported, given: {args[3]}" + if len(args) >= 4 and not (isinstance(args[4], (types.NoneType, types.Omitted)) or args[4] is None): + assert False, f"{_func_name}: argument dtype is not supported, given: {args[4]}" + if len(args) >= 5 and not (isinstance(args[5], (types.Boolean, types.Omitted)) or args[5] is False): + assert False, f"{_func_name}: argument copy is not supported, given: {args[5]}" + + # if ctor args provide list of levels names via name argument + # update type information for elements in ty_levels + name = args[6] if len(args) >= 6 and not args[6] is None else types.none + if not isinstance(name, (types.NoneType, types.Omitted)): + assert (isinstance(name, types.Tuple) + and all(map(lambda x: isinstance(x, (types.StringLiteral, types.UnicodeType, types.NoneType)), name)) + or isinstance(name, types.UniTuple) + and isinstance(name.dtype, (types.UnicodeType, types.NoneType))), \ + f"{_func_name}: argument name must be tuple of strings, given: {args[6]}" + assert len(name) == nlevels, \ + f"{_func_name}: Length of names must match number of levels in MultiIndex, given: {args[6]}" + + ty_levels = types.Tuple.from_types( + [typing_ctx._resolve_user_function_type( + _multi_index_create_level, (t1, t2), {}).return_type for t1, t2 in zip(levels, name)] + ) + else: + ty_levels = types.Tuple.from_types( + [typing_ctx._resolve_user_function_type( + _multi_index_create_level, (typ, types.none), {}).return_type for typ in levels] + ) + + return MultiIndexType(ty_levels, ty_codes, is_named=False) + + +### FIXME: this should not be generic SdcTypeRef, but +### specific class for MultiIndexTypeRef +@type_callable(SdcTypeRef) +def typing_sdctyperef(context): + print("DEBUG: enter typing_sdctyperef") + typing_ctx = context + + def typer(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + return _sdc_multi_index_ctor_typer(typing_ctx, levels, codes, sortorder, + names, dtype, copy, name) + + return typer + + +def sdc_indexes_rename(index, name): + pass + + +@sdc_overload(sdc_indexes_rename) +def sdc_index_rename_ovld(index, name): + + if not isinstance(index, sdc_pandas_index_types): + return None + + if isinstance(index, sdc_old_index_types): + def sdc_indexes_rename_stub(index, name): + # cannot rename string or float indexes, TO-DO: StringIndexType + return index + return sdc_indexes_rename_stub + + if isinstance(index, PositionalIndexType): + def sdc_indexes_rename_impl(index, name): + return init_positional_index(len(index), name) + return sdc_indexes_rename_impl + + elif isinstance(index, RangeIndexType): + def sdc_indexes_rename_impl(index, name): + return pd.RangeIndex(index.start, index.stop, index.step, name=name) + return sdc_indexes_rename_impl + + elif isinstance(index, Int64IndexType): + def sdc_indexes_rename_impl(index, name): + return pd.Int64Index(index, name=name) + return sdc_indexes_rename_impl + + +def sdc_indexes_get_name(index): + pass + + +@sdc_overload(sdc_indexes_get_name) +def sdc_indexes_get_name_ovld(index): + + if (isinstance(index, sdc_pandas_index_types) + and not isinstance(index, sdc_old_index_types)): + def sdc_indexes_get_name_impl(index): + return index.name + return sdc_indexes_get_name_impl + + def sdc_indexes_get_name_stub(index): + # cannot rename string or float indexes, TO-DO: StringIndexType + return None + return sdc_indexes_get_name_stub + + +### FIXME: this is a workaround for not having index.set_names +def _multi_index_create_level(index_data, name): + pass + + +@sdc_overload(_multi_index_create_level) +def _multi_index_create_level_ovld(index_data, name): + + print(f"DEBUG: _multi_index_create_level_ovld: index={index_data}, name={name}") + + def _multi_index_create_level_impl(index_data, name): + index = fix_df_index(index_data) + return sdc_indexes_rename(index, name) + return _multi_index_create_level_impl + + +def _multi_index_create_levels_and_codes(level_data, codes_data, name): + pass + + +@sdc_overload(_multi_index_create_levels_and_codes) +def _multi_index_create_levels_and_codes_ovld(level_data, codes_data, name): + + print(f"DEBUG: _multi_index_create_levels_and_codes_ovld: index={level_data}, codes_data={codes_data}, name={name}") + + def _multi_index_create_levels_and_codes_impl(level_data, codes_data, name): + level_data_fixed = fix_df_index(level_data) + level = sdc_indexes_rename(level_data_fixed, name) + codes = fix_df_array(codes_data) + + # to avoid additional overload make data verification checks inplace + # these checks repeat those in MultiIndex::_verify_integrity + if len(codes) and np.max(codes) >= len(level): + raise ValueError( + "On one of the levels code max >= length of level. " + "NOTE: this index is in an inconsistent state" + ) + if len(codes) and np.min(codes) < -1: + raise ValueError( + "On one of the levels code value < -1") + + # TO-DO: support is_unique for all indexes and use it here + indexer_map = _build_index_map(level) + if len(level) != len(indexer_map): + raise ValueError("Level values must be unique") + + return (level, codes) + + return _multi_index_create_levels_and_codes_impl + + +### FIXME: add comment explaining why it's needed +@infer_getattr +class SdcTypeRefAttribute(AttributeTemplate): + key = SdcTypeRef + + def resolve___call__(self, instance): + return type(instance) + + +def sdc_pandas_multi_index_ctor(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + pass + + +@sdc_overload(sdc_pandas_multi_index_ctor) +def pd_multi_index_overload(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + + _func_name = 'pd.MultiIndex().' + ty_checker = TypeChecker(_func_name) + + # FIXME: add other checks (e.g. for levels and codes) + accepted_index_names = (types.NoneType, types.StringLiteral, types.UnicodeType) + is_name_none = name is None or isinstance(name, (types.NoneType, types.Omitted)) + if not (isinstance(name, (types.Tuple, types.UniTuple)) + and all(map(lambda x: isinstance(x, accepted_index_names), name)) + or is_name_none): + ty_checker.raise_exc(name, 'tuple of strings/nones or none', 'name') + print("DEBUG: sdc_pandas_multi_index_ctor typing:", levels, codes) + + def pd_multi_index_ctor_impl(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + + if len(levels) != len(codes): + raise ValueError("Length of levels and codes must be the same.") + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + + # if name is None then all level names are reset + if is_name_none == True: + _names = sdc_tuple_map( + lambda x: None, + levels, + ) + else: + _names = name + + levels_and_codes_pairs = sdc_tuple_map_elementwise( + _multi_index_create_levels_and_codes, + levels, + codes, + _names + ) + + _levels, _codes = sdc_tuple_unzip(levels_and_codes_pairs) + return init_multi_index(_levels, _codes) + + return pd_multi_index_ctor_impl + + +@lower_builtin(SdcTypeRef, types.VarArg(types.Any)) +def sdctyperef_call_impl(context, builder, sig, args): + + # FIXME: this hardcodes template number and selected dispatcher, refactor? + call_sig = context.typing_context._resolve_user_function_type( + sdc_pandas_multi_index_ctor, + sig.args, + {} + ) + fnty = context.typing_context._lookup_global(sdc_pandas_multi_index_ctor) + disp = fnty.templates[0](context.typing_context)._get_impl(call_sig.args, {}) + cres = disp[0].get_compile_result(call_sig) + + res = context.call_internal( + builder, + cres.fndesc, + sig, + args + ) + + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@register_jitable +def next_codes_info(level_info, cumprod_list): + _, codes = level_info + cumprod_list.append(cumprod_list[-1] * len(codes)) + return codes, cumprod_list[-1] + + +@register_jitable +def next_codes_array(stats, res_size): + codes_pattern, factor = stats + span_i = res_size // factor # tiles whole array + repeat_i = res_size // (len(codes_pattern) * span_i) # repeats each element + return np.array(list(np.repeat(codes_pattern, span_i)) * repeat_i) + + +### FIXME: can we re-use this in from_tuples? +def factorize_level(level): + pass + + +@sdc_overload(factorize_level) +def factorize_level_ovld(level): + + level_dtype = level.dtype + + def factorize_level_impl(level): + unique_labels = List.empty_list(level_dtype) + res_size = len(level) + codes = np.empty(res_size, types.int64) + if not res_size: + return unique_labels, codes + + indexer_map = Dict.empty(level_dtype, types.int64) + for i in range(res_size): + val = level[i] + _code = indexer_map.get(val, -1) + if _code == -1: + new_code = len(unique_labels) + indexer_map[val] = new_code + unique_labels.append(val) + else: + new_code = _code + + codes[i] = new_code + + return unique_labels, codes + + return factorize_level_impl + + +def _make_level_unique(index): + pass + + +@sdc_overload(_make_level_unique) +def _make_level_unique_ovld(index): + + def _make_level_unique_impl(index): + indexer_map = _build_index_map(index) + return list(indexer_map.keys()) + + return _make_level_unique_impl + + +@sdc_overload_method(SdcTypeRef, 'from_product', prefer_literal=False) +def multi_index_type_from_product_ovld(cls, iterables, sortorder=None, names=None): + if cls.instance_type is not MultiIndexType: + return + + # FIXME: add proper typ checks + print("DEBUG: SdcTypeRef::from_product:", cls, iterables) + _func_name = f'Method MultiIndexType::from_product()' +# ty_checker = TypeChecker(_func_name) +# +# valid_keys_types = (types.Sequence, types.Array, StringArrayType) +# if not isinstance(keys, valid_keys_types): +# ty_checker.raise_exc(keys, f'array or sequence', 'keys') + + def multi_index_type_from_product_impl(cls, iterables, sortorder=None, names=None): + + # TO-DO: support indexes.unique() method and use it here + levels_factorized = sdc_tuple_map( + factorize_level, + iterables + ) + + levels_names = sdc_tuple_map( + sdc_indexes_get_name, + iterables + ) +# print("DEBUG: levels_factorized=", levels_factorized) + + index_levels = sdc_tuple_map( + lambda x: fix_df_index(list(x[0])), + levels_factorized + ) +# print("DEBUG: index_levels=", levels_factorized) + + temp_cumprod_sizes = [1, ] + codes_info = sdc_tuple_map( + next_codes_info, + levels_factorized, + temp_cumprod_sizes + ) +# print("DEBUG: codes_info=", codes_info) + + res_index_size = temp_cumprod_sizes[-1] + index_codes = sdc_tuple_map( + next_codes_array, + codes_info, + res_index_size + ) +# print("DEBUG: index_codes=", index_codes) + + res = sdc_pandas_multi_index_ctor( + index_levels, + index_codes, + name=levels_names + ) + + return res + + return multi_index_type_from_product_impl + + +def _make_level_dict(index): + pass + + +@sdc_overload(_make_level_dict) +def _make_level_dict_ovld(index): + + index_type = index + + def _make_level_dict_impl(index): + return Dict.empty(index_type, types.int64) + + return _make_level_dict_impl + + + +def _update_levels_and_codes(val, level, codes, indexer): + pass + + +@sdc_overload(_update_levels_and_codes) +def _update_levels_and_codes_ovld(val, level, codes, indexer): + + def _update_levels_and_codes_impl(val, level, codes, indexer): + current_index = indexer[-1] + + if val in level: + code = len(level) + else: + code = level.index(val) + level.append(val) + codes[current_index] = code + + indexer[-1] = current_index + 1 + + return _update_levels_and_codes_impl + + +def _multi_index_get_new_code(level, val): + + _code = level.get(val, -1) + if _code == -1: + res = len(level) + level[val] = res + else: + res = _code + + return types.int64(res) + + +def _multi_index_set_new_code(codes, new_code, i): + codes[i] = new_code + + +@intrinsic +def _multi_index_append_value(typingctx, val, levels, codes, idx): + + nlevels = len(val) + if not (nlevels == len(levels) and nlevels == len(codes)): + assert True, f"Cannot append MultiIndex value to existing codes/levels.\n" \ + f"Given: val={val}, levels={levels}, codes={codes}" + + def codegen(context, builder, sig, args): + index_val, levels_val, codes_val, idx_val = args + + for i in range(nlevels): + label = builder.extract_value(index_val, i) + level_i = builder.extract_value(levels_val, i) + codes_i = builder.extract_value(codes_val, i) + + new_code = context.compile_internal( + builder, + _multi_index_get_new_code, + signature(types.int64, levels[i], val[i]), + [level_i, label] + ) + context.compile_internal( + builder, + _multi_index_set_new_code, + signature(types.none, codes[i], types.int64, idx), + [codes_i, new_code, idx_val] + ) + + return types.none(val, levels, codes, idx), codegen + + +@sdc_overload_method(SdcTypeRef, 'from_tuples', prefer_literal=False) +def multi_index_type_from_tuples_ovld(cls, iterables): + if cls.instance_type is not MultiIndexType: + return + + # FIXME: add proper typ checks + print("DEBUG: SdcTypeRef::from_tuples:", cls, iterables) + _func_name = f'Method MultiIndexType::from_tuples()' + ty_checker = TypeChecker(_func_name) + + if not (isinstance(iterables, (types.List, types.ListType)) + and isinstance(iterables.dtype, (types.Tuple, types.UniTuple))): + ty_checker.raise_exc(iterables, f'list of tuples', 'iterables') + + mindex_dtype = iterables.dtype + nlevels = len(mindex_dtype) + range_tup = tuple(np.arange(nlevels)) + + def multi_index_type_from_tuples_impl(cls, iterables): + + ### what we need is a tuple of dicts (for each level): mapping level label into position + ### it was first seen, but also updating codes arrays as per the index that + ### was received from the dict + + index_size = len(iterables) + if not index_size: + raise TypeError("Cannot infer number of levels from empty list") + + example_value = iterables[0] + levels_dicts = sdc_tuple_map( + _make_level_dict, + example_value + ) + index_codes = sdc_tuple_map( + lambda _, size: np.empty(size, dtype=types.int64), + example_value, + index_size + ) + + for i in range(index_size): + val = iterables[i] + _multi_index_append_value(val, levels_dicts, index_codes, i) + + index_levels = sdc_tuple_map( + lambda x: list(x.keys()), + levels_dicts + ) + + res = pd.MultiIndex( + levels=index_levels, + codes=index_codes, + ) + return res + + return multi_index_type_from_tuples_impl + + +@intrinsic +def sdc_tuple_map(typingctx, func, data, *args): + + print("DEBUG: func=", func) + if not isinstance(func, (types.Dispatcher, types.Function)): + assert False, f"sdc_tuple_map's arg 'func' is expected to be " \ + f"numba compiled function or a dispatcher, given: {func}" + + if not isinstance(data, (types.Tuple, types.UniTuple)): + assert False, f"sdc_tuple_map's arg 'data' is expected to be a tuple, given: {data}" + + nargs = len(args) + tuple_len = len(data) + + func_arg_types = [(typ, ) + args for typ in data] + ret_tuple_types = [] + for i in range(tuple_len): + res_sig = func.get_call_type(typingctx, func_arg_types[i], {}) + ret_tuple_types.append(res_sig.return_type) + ret_type = types.Tuple(ret_tuple_types) + ret_sig = ret_type(func, data, types.StarArgTuple.from_types(args)) + print("DEBUG: func_arg_types=", func_arg_types) + print("DEBUG: ret_type=", ret_type) + print("DEBUG: ret_sig=", ret_sig) + + ### FIXME: this works with single overload for decorated function only + ### but this isn't necessary, just need to find out corresponding template + if isinstance(func, types.Function): + assert len(func.templates) == 1, "Function template has multiple overloads" + + def codegen(context, builder, sig, args): + + tup_val = args[1] # main tuple which elements are mapped + other_val = [] + for i in range(0, nargs): + other_val.append( + builder.extract_value(args[2], i) + ) + + mapped_values = [] + for i in range(tuple_len): + tup_elem = builder.extract_value(tup_val, i) + input_args = [tup_elem] + other_val + call_sig = signature(ret_tuple_types[i], *func_arg_types[i]) + + if isinstance(func, types.Dispatcher): + py_func = func.dispatcher.py_func + else: + # for function overloads get pyfunc from compiled impl + target_disp = func.templates[0](context.typing_context) + py_func = target_disp._get_impl(call_sig.args, {})[0].py_func + + mapped_values.append( + context.compile_internal(builder, + py_func, + call_sig, + input_args) + ) + res = context.make_tuple(builder, ret_type, mapped_values) + return res + + return ret_sig, codegen + + +@intrinsic +def sdc_tuple_map_elementwise(typingctx, func, lhs, rhs, *args): + + print("DEBUG: func=", func) + if not isinstance(func, (types.Dispatcher, types.Function)): + assert False, f"sdc_tuple_map_elementwise's arg 'func' is expected to be " \ + f"numba compiled function or a dispatcher, given: {func}" + + if not (isinstance(lhs, (types.Tuple, types.UniTuple)) + and isinstance(rhs, (types.Tuple, types.UniTuple))): + assert False, f"sdc_tuple_map_elementwise's args are expected to be " \ + f"tuples, given: lhs={lhs}, rhs={rhs}" + + assert len(lhs) == len(rhs), f"lhs and rhs tuples have different sizes: lhs={lhs}, rhs={rhs}" + + nargs = len(args) + tuple_len = len(lhs) + + func_arg_types = [x for x in zip(lhs, rhs, *args)] + ret_tuple_types = [] + for i in range(tuple_len): + res_sig = func.get_call_type(typingctx, func_arg_types[i], {}) + ret_tuple_types.append(res_sig.return_type) + ret_type = types.Tuple(ret_tuple_types) + ret_sig = ret_type(func, lhs, rhs, types.StarArgTuple.from_types(args)) + print("DEBUG: func_arg_types=", func_arg_types) + print("DEBUG: ret_type=", ret_type) + print("DEBUG: ret_sig=", ret_sig) + + if isinstance(func, types.Function): + assert len(func.templates) == 1, "Function template has multiple overloads" + + def codegen(context, builder, sig, args): + lhs_val = args[1] + rhs_val = args[2] + other_vals = [] + for i in range(0, nargs): + other_vals.append( + builder.extract_value(args[3], i) + ) + + mapped_values = [] + for i in range(tuple_len): + lhs_elem = builder.extract_value(lhs_val, i) + rhs_elem = builder.extract_value(rhs_val, i) + other_elems = [] + for other_tup in other_vals: + other_elems.append( + builder.extract_value(other_tup, i) + ) + + input_args = [lhs_elem, rhs_elem] + other_elems + call_sig = signature(ret_tuple_types[i], *func_arg_types[i]) + + if isinstance(func, types.Dispatcher): + py_func = func.dispatcher.py_func + else: + # for function overloads get pyfunc from compiled impl + target_disp = func.templates[0](context.typing_context) + py_func = target_disp._get_impl(call_sig.args, {})[0].py_func + + mapped_values.append( + context.compile_internal(builder, + py_func, + call_sig, + input_args) + ) + res = context.make_tuple(builder, ret_type, mapped_values) + return res + + return ret_sig, codegen diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index c06203ecd..919b75080 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -44,7 +44,10 @@ if_series_to_array_type) from numba.core.errors import TypingError from sdc.datatypes.categorical.types import Categorical -from sdc.utilities.sdc_typing_utils import sdc_pandas_df_column_types +from sdc.utilities.sdc_typing_utils import ( + sdc_pandas_df_column_types, + sdc_pandas_index_types, + sdc_old_index_types, ) def isna(arr, i): @@ -146,6 +149,10 @@ def fix_df_array(column): @overload(fix_df_array) def fix_df_array_overload(column): + if not isinstance(column, (types.List, types.ListType, types.Array, StringArrayType)): + return None + + print("DEBUG: fix_df_array_overload column=", column) if (isinstance(column, types.List)): dtype = column.dtype if isinstance(dtype, (types.Number, types.Boolean)): @@ -165,7 +172,11 @@ def fix_df_array_list_str_impl(column): # pragma: no cover return lambda column: np.array(column) if isinstance(column, (types.Array, StringArrayType, Categorical)): - return lambda column: column + def fix_df_array_array_impl(column): + print("DEBUG: calling fix_df_array, column=", column) + return column + return fix_df_array_array_impl + # return lambda column: column def fix_df_index(index, coldata=None): @@ -175,6 +186,7 @@ def fix_df_index(index, coldata=None): @overload(fix_df_index) def fix_df_index_overload(index, coldata=None): + print("DEBUG: fix_df_index_overload index=", index) # FIXME: import here due to circular import between indexes, numpy_like, and api from sdc.extensions.indexes.empty_index_ext import init_empty_index from sdc.extensions.indexes.positional_index_ext import init_positional_index @@ -192,8 +204,11 @@ def fix_df_index_impl(index, coldata=None): return fix_df_index_impl - elif isinstance(index, (RangeIndexType, Int64IndexType, EmptyIndexType, PositionalIndexType)): + # elif isinstance(index, (RangeIndexType, Int64IndexType, EmptyIndexType, PositionalIndexType)): + elif (isinstance(index, sdc_pandas_index_types) + and not isinstance(index, sdc_old_index_types)): ## MAJOR bug fix in a separate PR def fix_df_index_impl(index, coldata=None): + print("DEBUG: calling this fix_df_index, index=", index) return index # currently only signed integer indexes are represented with own type diff --git a/sdc/tests/indexes/__init__.py b/sdc/tests/indexes/__init__.py index c0adc55e5..3c472d8ac 100644 --- a/sdc/tests/indexes/__init__.py +++ b/sdc/tests/indexes/__init__.py @@ -28,4 +28,5 @@ from sdc.tests.indexes.test_range_index import TestRangeIndex from sdc.tests.indexes.test_positional_index import TestPositionalIndex from sdc.tests.indexes.test_int64_index import TestInt64Index +from sdc.tests.indexes.test_multi_index import TestMultiIndex from sdc.tests.indexes.test_indexes import TestIndexes diff --git a/sdc/tests/indexes/index_datagens.py b/sdc/tests/indexes/index_datagens.py index 244fa52f8..268c9e211 100644 --- a/sdc/tests/indexes/index_datagens.py +++ b/sdc/tests/indexes/index_datagens.py @@ -126,5 +126,93 @@ def get_sample_index(size, sdc_index_type): return pd.RangeIndex(-1, size - 1, 1) if sdc_index_type is Int64IndexType: return pd.Int64Index(np.arange(size)) + if sdc_index_type is MultiIndexType: + levels = [['a', 'b', 'c'], np.arange(size // 2 + 1)] + return pd.MultiIndex.from_product(levels)[:size] - assert False, f"Refusing to create index of non-specific index type: {sdc_index_type}" + assert False, f"Index generation failed: index type not-recognized: {sdc_index_type}" + + +def _get_multi_index_base_index(exceeded_size, nlevels=2, dtypes=None): + """ Produces multi-index with certain nlevels/dtypes, pre-defined values and size >= exceeded_size """ + + str_labels = ['a', 'b', 'c', 'd', 'e'] + sample_labels = { + 'str': str_labels, + 'int': np.arange(exceeded_size // len(str_labels) + 1), + } + + if dtypes is None: + dtypes = ['str', 'int'] + + # first expand, then cut as needed + if len(dtypes) < nlevels: + dtypes = dtypes * (nlevels // len(dtypes) + 1) + if len(dtypes) > nlevels: + dtypes = dtypes[:nlevels] + + all_levels = [sample_labels[ty] for ty in dtypes] + base_index = pd.MultiIndex.from_tuples( + list(product(*all_levels)) + ) + return base_index + + +def _generate_multi_indexes_fixed(size, nlevels=2, dtypes=None, base_index=None): + """ This is used to generate fixed-size multi-indexes of needed nlevels and dtypes + with generated indexes having certain set of values. """ + + size_range = np.arange(size) + base_index = base_index or _get_multi_index_base_index(size) + base_index_range = np.arange(len(base_index)) + + yield base_index[:size] # unique values from first size values of base_index + yield base_index.take(np.random.choice(size_range, size)) # same values, random order, with duplicates + yield base_index.take(np.random.choice(size_range, size, replace=False)) # same values, unique, random order + yield base_index.take(np.random.choice(base_index_range, size)) # random order, with values not in base_index + + +def _generate_multi_index_levels_unique(n=10, k=5): + yield [gen_strlist(n, nchars=2), np.arange(k)] + yield [gen_strlist(n, nchars=2), gen_strlist(2*n, nchars=2), np.arange(k)] + yield [['a', 'b', 'c'], [1, 2, 3], ['d', 'e']] + yield [np.array([100, 200, 300]), np.arange(k)] + yield [pd.Int64Index([100, 200, 300]), pd.RangeIndex(k)] + + # this is to check named levels creation and name/names arguments + yield [pd.Int64Index([100, 200, 300], name="first"), pd.RangeIndex(k, name="second")] + yield [pd.Int64Index([100, 200, 300], name="first"), pd.RangeIndex(k)] + yield [pd.Int64Index([100, 200, 300], name="first"), ] + + +def _generate_multi_index_levels_with_duplicates(n=10, k=5): + yield [['a', 'b', 'c', 'a', 'b'], ] + yield [np.arange(k), ['a', 'b', 'c', 'a', 'b']] + + +def _generate_multi_index_levels(n=10, k=5): + """ This is useful for generating all set of levels specific dtypes, names, etc. """ + return chain( + _generate_multi_index_levels_unique(n, k), + _generate_multi_index_levels_with_duplicates(n, k), + ) + + +def get_codes_from_levels(size, levels, replace=True): + res_codes = [] + for x in levels: + res_codes.append( + np.random.choice(np.arange(len(x)), size, replace) + ) + return res_codes + + +def _generate_multi_indexes(): + n = 100 + gen_levels = _generate_multi_index_levels + gen_unique_levels = _generate_multi_index_levels_unique + return chain( + map(lambda x: pd.MultiIndex.from_product(x), gen_levels()), + map(lambda x: pd.MultiIndex(x, get_codes_from_levels(n, x)), gen_unique_levels()), + _generate_multi_indexes_fixed(n), + ) diff --git a/sdc/tests/indexes/test_multi_index.py b/sdc/tests/indexes/test_multi_index.py new file mode 100644 index 000000000..5159e00db --- /dev/null +++ b/sdc/tests/indexes/test_multi_index.py @@ -0,0 +1,692 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd +import unittest +from itertools import (combinations_with_replacement, product, combinations, ) + +from numba.core import types +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_multi_indexes_fixed, + _generate_multi_index_levels_unique, + _generate_multi_index_levels, + _generate_multi_indexes, + _get_multi_index_base_index, + get_sample_index, + get_codes_from_levels, + ) +from sdc.tests.test_base import TestCase +from sdc.datatypes.indexes import * +from sdc.tests.test_utils import skip_numba_jit, assert_pandas_exception + + +class TestMultiIndex(TestCase): + + def test_multi_index_type_inferred(self): + for index, name in product(_generate_multi_indexes(), + test_global_index_names): + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, MultiIndexType) + + index.name = name + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, MultiIndexType) + + def test_multi_index_create_and_box(self): + def test_impl(levels, codes): + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + for data in _generate_multi_index_levels_unique(): + # creating pd.MultiIndex is only supported with levels and codes as tuples + levels = tuple(data) + codes = tuple(get_codes_from_levels(n, levels)) + with self.subTest(levels=levels, codes=codes): + result = sdc_func(levels, codes) + result_ref = test_impl(levels, codes) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_create_invalid_inputs(self): + def test_impl(levels, codes): + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + level_and_codes = [ + (['a', 'b', 'c'], [3, 0, 1, 2, 2]), # code 3 is out of bounds + (['a', 'b', 'c'], [1, 0, 1, -2, 2]), # code -2 is out of bounds + (['a', 'b', 'c', 'a', 'b'], [1, 0, 1, 2, 2]) # duplicate labels in level + ] + exc_strs = [ + "On one of the levels code max >= length of level.", + "On one of the levels code value < -1", + "Level values must be unique", + ] + + for i, level_codes_pair in enumerate(level_and_codes): + levels, codes = (level_codes_pair[0], ), (level_codes_pair[1], ) + test_msg = f"Inconsistent codes: levels={levels}, codes={codes}" + sdc_exc_str = exc_strs[i] + assert_pandas_exception(self, test_msg, sdc_exc_str, test_impl, sdc_func, (levels, codes)) + + def test_multi_index_create_from_tuples(self): + def test_impl(): + codes_max = 5 + levels = ( + ['a', 'b', 'c', 'd', 'e'], + np.arange(codes_max) + ) + codes = ( + np.arange(0, codes_max), + np.arange(codes_max, 0, -1) - 1, + ) + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + pd.testing.assert_index_equal(result, result_ref) + + @skip_numba_jit("MultiIndexType ctor supports levels and codes as tuples only") + def test_multi_index_create_from_lists(self): + def test_impl(): + codes_max = 5 + levels = [ + ['a', 'b', 'c', 'd', 'e'], + np.arange(codes_max), + ] + codes = [ + np.arange(0, codes_max), + np.arange(codes_max, 0, -1) - 1, + ] + + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_create_param_names(self): + + # using keyword arguments in typeref ctor, is not supported due to limitation of __call__ overload, + # TO-DO: refactor this after @overload is supported for typerefs (see FIXME_Numba#XXXX): + def test_impl(levels, codes, names): + # return pd.MultiIndex(levels, codes, name=names) + return pd.MultiIndex(levels, codes, None, None, None, False, names) + sdc_func = self.jit(test_impl) + + n = 11 + max_codes = 5 + all_levels = [ + [5, 2, 1, 4, 3], + np.arange(max_codes), + pd.RangeIndex(max_codes), + pd.RangeIndex(max_codes, name='abc'), + pd.Int64Index([5, 2, 1, 4, 3]), + pd.Int64Index([5, 2, 1, 4, 3], name='bce'), + ] + for data, names in product( + combinations(all_levels, 2), + combinations_with_replacement(test_global_index_names, 2) + ): + + # all parameters are supported as tuples only in pd.MultiIndex ctor + levels = tuple(data) + codes = tuple(get_codes_from_levels(n, levels)) + _names = tuple(names) + with self.subTest(levels=levels, codes=codes, names=_names): + result = sdc_func(levels, codes, _names) + result_ref = test_impl(levels, codes, _names) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_unbox_and_box(self): + def test_impl(index): + return index + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for index in _generate_multi_indexes(): + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_attribute_dtype(self): + from numba.typed import List + + # index dtype cannot be returned (boxed), thus it only checks it can be used + def test_impl(index): + return List.empty_list(index.dtype) + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + result = sdc_func(index) + expected = types.Tuple.from_types([types.unicode_type, types.intp]) + self.assertEqual(result._dtype, expected) + + def test_multi_index_attribute_name(self): + def test_impl(index): + return index.name + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + for name in test_global_index_names: + index.name = name + with self.subTest(name=name): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + @skip_numba_jit("StringArrayType as index has no name. TO-DO: StringIndexType") + def test_multi_index_attribute_names(self): + def test_impl(index): + return index.names + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for index in _generate_multi_indexes(): + for names in combinations_with_replacement( + test_global_index_names, + index.nlevels): + index.names = names + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_multi_index_attribute_nlevels(self): + def test_impl(index): + return index.nlevels + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for index in _generate_multi_indexes(): + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_multi_index_len(self): + def test_impl(index): + return len(index) + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for index in _generate_multi_indexes(): + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_multi_index_attribute_values(self): + def test_impl(index): + return index.values + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for index in _generate_multi_indexes(): + with self.subTest(index_data=index): + result = sdc_func(index) + result_ref = test_impl(index) + # SDC MultiIndex.values return list but not numpy array + self.assertEqual(result, list(result_ref)) + + def test_multi_index_attribute_levels(self): + def test_impl(index): + return index.levels + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for index in _generate_multi_indexes(): + with self.subTest(index_data=index): + result = sdc_func(index) + result_ref = test_impl(index) + # SDC MultiIndex.levels return tuple of levels not list + error_msg = f"Indexes'levels are different:\nresult={result},\nresult_ref{result_ref}" + self.assertEqual(len(result), len(result_ref), error_msg) + self.assertTrue(map( + lambda x, y: pd.testing.assert_index_equal(x, y), + zip(result, result_ref)), + error_msg + ) + + def test_multi_index_attribute_codes(self): + def test_impl(index): + return index.codes + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for index in _generate_multi_indexes(): + with self.subTest(index_data=index): + result = sdc_func(index) + result_ref = test_impl(index) + # SDC MultiIndex.levels return tuple of levels not list + error_msg = f"Indexes'levels are different:\nresult={result},\nresult_ref{result_ref}" + self.assertEqual(len(result), len(result_ref), error_msg) + self.assertTrue(map( + lambda x, y: np.testing.assert_array_equal(x, y), + zip(result, result_ref)), + error_msg + ) + + def test_multi_index_contains(self): + def test_impl(index, value): + return value in index + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + values_to_test = [('a', 1), ('a', 4), ('e', 1), ('x', 5)] + for value in values_to_test: + with self.subTest(value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + np.testing.assert_array_equal(result, result_ref) + + def test_multi_index_getitem_scalar(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + idxs_to_test = [0, n // 2, n - 1, -1] + for idx in idxs_to_test: + with self.subTest(idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + self.assertEqual(result, result_ref) + + def test_multi_index_getitem_scalar_idx_bounds(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + idxs_to_test = [-(n + 1), n] + for idx in idxs_to_test: + with self.subTest(idx=idx): + with self.assertRaises(Exception) as context: + test_impl(index, idx) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(index, idx) + sdc_exception = context.exception + self.assertIsInstance(sdc_exception, type(pandas_exception)) + self.assertIn("out of bounds", str(sdc_exception)) + + def test_multi_index_getitem_slice(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 17 + index = get_sample_index(n, MultiIndexType) + slices_params = combinations_with_replacement( + [None, 0, -1, n // 2, n, n - 3, n + 3, -(n + 3)], + 2 + ) + + for slice_start, slice_stop in slices_params: + for slice_step in [1, -1, 2]: + idx = slice(slice_start, slice_stop, slice_step) + with self.subTest(idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_iterator_1(self): + def test_impl(index): + res = [] + for i, label in enumerate(index): + res.append((i, label)) + return res + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_multi_index_iterator_2(self): + def test_impl(index): + res = [] + for label in index: + str_part, _ = label + if str_part == 'a': + res.append(label) + return res + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + @skip_numba_jit("Requires np.array of complex dtypes (tuples) support in Numba") + def test_multi_index_nparray(self): + def test_impl(index): + return np.array(index) + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_multi_index_operator_eq_index(self): + def test_impl(index1, index2): + return index1 == index2 + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + indexes_to_test = list(_generate_multi_indexes_fixed(n)) + for index1, index2 in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_multi_index_operator_eq_scalar(self): + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + A = get_sample_index(n, MultiIndexType) + scalars_to_test = [('a', 1), ('a', 4), ('e', 1), ('x', 5)] + for B in scalars_to_test: + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + @skip_numba_jit("Requires np.array of complex dtypes (tuples) support in Numba") + def test_multi_index_operator_eq_nparray(self): + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + for A, B in product( + _generate_multi_indexes_fixed(n), + map(lambda x: np.array(x), _generate_multi_indexes_fixed(n)) + ): + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_multi_index_operator_ne_index(self): + def test_impl(index1, index2): + return index1 != index2 + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + indexes_to_test = list(_generate_multi_indexes_fixed(n)) + for index1, index2 in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_multi_index_operator_is_nounbox(self): + def test_impl_1(): + index1 = pd.MultiIndex( + levels=(['a', 'b', 'c'], [1, 2, 3]), + codes=([0, 1, 0, 1, 2], [0, 0, 1, 1, 2]) + ) + index2 = index1 + return index1 is index2 + sdc_func_1 = self.jit(test_impl_1) + + def test_impl_2(): + index1 = pd.MultiIndex( + levels=(['a', 'b', 'c'], [1, 2, 3]), + codes=([0, 1, 0, 1, 2], [0, 0, 1, 1, 2]) + ) + index2 = pd.MultiIndex( + levels=(['a', 'b', 'c'], [1, 2, 3]), + codes=([0, 1, 0, 1, 2], [0, 0, 1, 1, 2]) + ) + return index1 is index2 + sdc_func_2 = self.jit(test_impl_2) + + # positive testcase + with self.subTest(subtest="same indexes"): + result = sdc_func_1() + result_ref = test_impl_1() + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + result = sdc_func_2() + result_ref = test_impl_2() + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_multi_index_getitem_by_mask(self): + def test_impl(index, mask): + return index[mask] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + mask = np.random.choice([True, False], n) + for index in _generate_multi_indexes_fixed(n): + result = sdc_func(index, mask) + result_ref = test_impl(index, mask) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n, k = 11, 7 + np.random.seed(0) + idx = np.random.choice(np.arange(n), k) + for index in _generate_multi_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_reindex_equal_indexes(self): + + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) + + n = 10 + index1 = get_sample_index(n, MultiIndexType) + index2 = index1.copy(deep=True) + + result = sdc_func(index1, index2) + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_multi_index_reindex(self): + + def test_impl(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + base_index = _get_multi_index_base_index(n) + index1 = base_index[:n] + size_range = np.arange(len(index1)) + reindex_by = list(map( + lambda x: base_index.take(x), + [ + size_range, # same index as index1 + np.random.choice(size_range, n), # random values from index1 with duplicates + np.random.choice(size_range, n, replace=False), # random unique values from index1 + np.random.choice(np.arange(len(base_index)), n), # random values from larger set + size_range[:n // 2], # shorter index + np.random.choice(size_range, 2*n), # longer index + ] + )) + + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_multi_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + indexes_to_test = list(_generate_multi_indexes_fixed(n)) + for index1, index2 in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + + def test_multi_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + result = sdc_func(index) + result_ref = test_impl(index) + # SDC MultiIndex.values return list but not numpy array + np.testing.assert_array_equal(result, list(result_ref)) + + def test_multi_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_multi_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) + + index = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) + other = pd.MultiIndex.from_tuples( + [('a', 3), ('c', 1), ('c', 3), ('b', 2), ('b', 3)]) + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) + + @skip_numba_jit("MultiIndexType.join is not implemented yet") + def test_multi_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + indexes_to_test = list(_generate_multi_indexes_fixed(n)) + for index, other in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) + + def test_multi_index_from_product(self): + def test_impl(levels): + return pd.MultiIndex.from_product(levels) + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for data in _generate_multi_index_levels(): + # creating pd.MultiIndex is only supported with levels and codes as tuples + levels = tuple(data) + with self.subTest(levels=levels): + result = sdc_func(levels) + result_ref = test_impl(levels) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_from_tuples(self): + def test_impl(data): + return pd.MultiIndex.from_tuples(data) + sdc_func = self.jit(test_impl) + + n = 100 + np.random.seed(0) + for index in _generate_multi_indexes_fixed(n): + data = list(index.values) + with self.subTest(data=data): + result = sdc_func(data) + result_ref = test_impl(data) + pd.testing.assert_index_equal(result, result_ref) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/test_utils.py b/sdc/tests/test_utils.py index 719682097..110c7424b 100644 --- a/sdc/tests/test_utils.py +++ b/sdc/tests/test_utils.py @@ -254,6 +254,19 @@ def assert_raises_ty_checker(self, err_details, func, *args, **kwargs): self.assertRaisesRegex(TypingError, regex_str, func, *args, **kwargs) +def assert_pandas_exception(self, test_msg, sdc_exc_str, test_impl, sdc_func, args): + with self.subTest(test_msg): + with self.assertRaises(Exception) as context: + test_impl(*args) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(*args) + sdc_exception = context.exception + self.assertIsInstance(sdc_exception, type(pandas_exception)) + self.assertIn(sdc_exc_str, str(sdc_exception)) + + def _make_func_from_text(func_text, func_name='test_impl', global_vars={}): loc_vars = {} exec(func_text, global_vars, loc_vars) diff --git a/sdc/utilities/sdc_typing_utils.py b/sdc/utilities/sdc_typing_utils.py index 3c5c4219e..6d8b6810f 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -49,6 +49,7 @@ PositionalIndexType, RangeIndexType, Int64IndexType, + MultiIndexType, ) + sdc_old_index_types sdc_indexes_range_like = ( @@ -189,6 +190,9 @@ def check_types_comparable(ty_left, ty_right): return isinstance(ty_right, types.UnicodeType) if isinstance(ty_left, types.Boolean): return isinstance(ty_right, types.Boolean) + if isinstance(ty_left, (types.Tuple, types.UniTuple)): + # FIXME: just for now to unblock compilation + return ty_left == ty_right return False diff --git a/test_create_multiindex.py b/test_create_multiindex.py new file mode 100644 index 000000000..b13dddd9c --- /dev/null +++ b/test_create_multiindex.py @@ -0,0 +1,52 @@ +import pandas as pd +import numpy as np + +### + + + +### Let's define use-cases first, what we actually need from multi-index +### is having it as DF columns! that is we need to support indexes +### that arise from groupby.agg method. + +def test_impl_1(df): + A = df.groupby('A').agg({'A': ['count', 'min', 'max'], + 'B': ['std', 'mean']}) + return A + +df = pd.DataFrame({ + 'A': [2, 1, 1, 1, 2, 2, 1], + 'B': [-8, 2, 3, 1, 5, 6, 7] +}) + +# print("df:", df) +# res = test_impl_1(df) +# print("res:", res) + + + +def test_impl_2(df): + A = df.groupby('A').agg([lambda x: x.max() - x.min(), lambda x: x.max() + x.min()]) + return A + +df = pd.DataFrame({ + 'A': [2, 1, 1, 1, 2, 2, 1], + 'B': [-8, 2, 3, 1, 5, 6, 7], + 'C': [-81, 21, 31, 11, 51, 61, 71] +}) + +# print("df:", df) +# res = test_impl_2(df) +# print("res:", res) + + +def test_impl_3(): + res = pd.MultiIndex( + levels=[np.array([1, 2]), np.array([3, 4])], + #levels=[["zero", "one"], ["x", "y"]], + codes=[[1, 1, 0, 0], [1, 0, 1, 0]] + ) + return res + +res = test_impl_3() +print("res:", res) From da6899f4ed3b7a7ebd86a16e2e5ad76d2786ca68 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Sun, 13 Jun 2021 00:18:32 +0300 Subject: [PATCH 2/7] Fixing tests --- sdc/datatypes/indexes/multi_index_type.py | 19 ++----- sdc/datatypes/sdc_typeref.py | 66 +++++++++++++++++++++++ sdc/extensions/indexes/indexes_generic.py | 5 +- sdc/extensions/indexes/multi_index_ext.py | 9 ++-- sdc/extensions/sdc_hashmap_ext.py | 2 +- sdc/extensions/sdc_hashmap_type.py | 43 +-------------- sdc/hiframes/api.py | 5 +- sdc/tests/test_compile_time.py | 2 +- sdc/utilities/sdc_typing_utils.py | 1 + 9 files changed, 83 insertions(+), 69 deletions(-) create mode 100644 sdc/datatypes/sdc_typeref.py diff --git a/sdc/datatypes/indexes/multi_index_type.py b/sdc/datatypes/indexes/multi_index_type.py index ddb54bfb4..ddf3f9b5e 100644 --- a/sdc/datatypes/indexes/multi_index_type.py +++ b/sdc/datatypes/indexes/multi_index_type.py @@ -125,23 +125,10 @@ class SdcTypeRef(types.Dummy): def __init__(self, instance_type): self.instance_type = instance_type super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) - - + + @register_model(SdcTypeRef) class SdcTypeRefModel(models.OpaqueModel): def __init__(self, dmm, fe_type): - - models.OpaqueModel.__init__(self, dmm, fe_type) - -import pandas as pd -@typeof_impl.register(type) -def mynew_typeof_type(val, c): - """ This function is a workaround for """ - - # print("DEBUG: val=", val) - if not issubclass(val, pd.MultiIndex): - # if not issubclass(val, MultiIndex): - return numba_typeof_type(val, c) - else: - return SdcTypeRef(MultiIndexType) + models.OpaqueModel.__init__(self, dmm, fe_type) diff --git a/sdc/datatypes/sdc_typeref.py b/sdc/datatypes/sdc_typeref.py new file mode 100644 index 000000000..589105b89 --- /dev/null +++ b/sdc/datatypes/sdc_typeref.py @@ -0,0 +1,66 @@ +# ***************************************************************************** +# Copyright (c) 2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd + +from numba.core import types +from numba.extending import (models, register_model, typeof_impl, ) +from numba.core.typing.typeof import _typeof_type as numba_typeof_type + +from sdc.extensions.sdc_hashmap_type import ConcurrentDict, ConcurrentDictType +from sdc.datatypes.indexes import MultiIndexType + + +# FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef +# we have to use our new SdcTypeRef to type objects created from types.Type +# (i.e. ConcurrentDict meta-type). This should be removed once it's fixed. +class SdcTypeRef(types.Dummy): + """Reference to a type. + + Used when a type is passed as a value. + """ + def __init__(self, instance_type): + self.instance_type = instance_type + super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) + + +@register_model(SdcTypeRef) +class SdcTypeRefModel(models.OpaqueModel): + def __init__(self, dmm, fe_type): + + models.OpaqueModel.__init__(self, dmm, fe_type) + + +@typeof_impl.register(type) +def mynew_typeof_type(val, c): + """ This function is a workaround for """ + + if issubclass(val, ConcurrentDict): + return SdcTypeRef(ConcurrentDictType) + elif issubclass(val, pd.MultiIndex): + return SdcTypeRef(MultiIndexType) + else: + return numba_typeof_type(val, c) diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py index 0d1a8710f..6a0b696a8 100644 --- a/sdc/extensions/indexes/indexes_generic.py +++ b/sdc/extensions/indexes/indexes_generic.py @@ -97,9 +97,8 @@ def sdc_indexes_operator_eq_ovld(self, other): use_self_values = isinstance(self, sdc_pandas_index_types) and not isinstance(self, types.Array) use_other_values = isinstance(other, sdc_pandas_index_types) and not isinstance(other, types.Array) - ## prev. version: one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) - # FIXME: check that one_operand_is_scalar is fixed and works in tests now - one_operand_is_scalar = self is other.dtype or other is self.dtype + one_operand_is_scalar = (isinstance(other, sdc_pandas_index_types) and self is other.dtype + or isinstance(self, sdc_pandas_index_types) and other is self.dtype) def sdc_indexes_operator_eq_impl(self, other): diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py index b40666d09..ddf1319f2 100644 --- a/sdc/extensions/indexes/multi_index_ext.py +++ b/sdc/extensions/indexes/multi_index_ext.py @@ -37,7 +37,6 @@ from numba.core.typing.templates import signature, AttributeTemplate, AbstractTemplate, infer_getattr from numba.core.imputils import impl_ret_untracked, call_getiter, impl_ret_borrowed from numba.core.imputils import (impl_ret_new_ref, impl_ret_borrowed, iternext_impl, RefType) -from numba.core.boxing import box_array, unbox_array from numba.core.boxing import box_array, unbox_array, box_tuple import llvmlite.llvmpy.core as lc @@ -55,8 +54,8 @@ from sdc.functions import numpy_like from sdc.hiframes.api import fix_df_array, fix_df_index from sdc.hiframes.boxing import _infer_index_type, _unbox_index_data -from sdc.extensions.indexes.indexes_generic import * from sdc.datatypes.common_functions import hpat_arrays_append +from sdc.extensions.indexes.indexes_generic import * from sdc.datatypes.indexes.multi_index_type import MultiIndexIteratorType from numba.core.extending import register_jitable @@ -64,11 +63,11 @@ from numba.typed import Dict, List from sdc.str_arr_type import StringArrayType from sdc.extensions.indexes.positional_index_ext import init_positional_index -from sdc.extensions.indexes.empty_index_ext import init_empty_index -from sdc.datatypes.indexes.multi_index_type import SdcTypeRef -from sdc.hiframes.boxing import _infer_index_type +from sdc.datatypes.sdc_typeref import SdcTypeRef + +from sdc.extensions.sdc_hashmap_type import * ### FIXME: clean-up imports diff --git a/sdc/extensions/sdc_hashmap_ext.py b/sdc/extensions/sdc_hashmap_ext.py index 5fea972d8..54b8edaa3 100644 --- a/sdc/extensions/sdc_hashmap_ext.py +++ b/sdc/extensions/sdc_hashmap_ext.py @@ -59,7 +59,7 @@ ConcDictItemsIterableType, ConcDictValuesIterableType) from numba.extending import register_jitable -from sdc.extensions.sdc_hashmap_type import SdcTypeRef +from sdc.datatypes.sdc_typeref import SdcTypeRef from sdc.utilities.sdc_typing_utils import TypingError, TypeChecker, check_types_comparable from itertools import product diff --git a/sdc/extensions/sdc_hashmap_type.py b/sdc/extensions/sdc_hashmap_type.py index b54c49b56..2c598ea6f 100644 --- a/sdc/extensions/sdc_hashmap_type.py +++ b/sdc/extensions/sdc_hashmap_type.py @@ -24,20 +24,11 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -from numba.core.typing.templates import ( - infer_global, AbstractTemplate, signature, - ) -from numba.extending import type_callable, lower_builtin from numba import types -from numba.extending import (models, register_model, make_attribute_wrapper, overload_method) -from sdc.str_ext import string_type +from numba.core.types import IterableType, SimpleIterableType, SimpleIteratorType +from numba.extending import (models, register_model, make_attribute_wrapper, ) from collections.abc import MutableMapping -from numba.core.types import Dummy, IterableType, SimpleIterableType, SimpleIteratorType - -from numba.extending import typeof_impl -from numba.typed import Dict -from numba.core.typing.typeof import _typeof_type as numba_typeof_type class ConcDictIteratorType(SimpleIteratorType): @@ -161,33 +152,3 @@ def _numba_type_(self): if self._dict_type is None: raise TypeError("invalid operation on untyped dictionary") return self._dict_type - - -# FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef -# we have to use our new SdcTypeRef to type objects created from types.Type -# (i.e. ConcurrentDict meta-type). This should be removed once it's fixed. -class SdcTypeRef(Dummy): - """Reference to a type. - - Used when a type is passed as a value. - """ - def __init__(self, instance_type): - self.instance_type = instance_type - super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) - - -@register_model(SdcTypeRef) -class SdcTypeRefModel(models.OpaqueModel): - def __init__(self, dmm, fe_type): - - models.OpaqueModel.__init__(self, dmm, fe_type) - - -@typeof_impl.register(type) -def mynew_typeof_type(val, c): - """ This function is a workaround for """ - - if not issubclass(val, ConcurrentDict): - return numba_typeof_type(val, c) - else: - return SdcTypeRef(ConcurrentDictType) diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index 919b75080..8752774a4 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -149,8 +149,9 @@ def fix_df_array(column): @overload(fix_df_array) def fix_df_array_overload(column): - if not isinstance(column, (types.List, types.ListType, types.Array, StringArrayType)): - return None + # FIXME: do we need some restriction on column types here? +# if not isinstance(column, (types.List, types.ListType, types.Array, StringArrayType)): +# return None print("DEBUG: fix_df_array_overload column=", column) if (isinstance(column, types.List)): diff --git a/sdc/tests/test_compile_time.py b/sdc/tests/test_compile_time.py index 03b5fd46a..845b16b42 100644 --- a/sdc/tests/test_compile_time.py +++ b/sdc/tests/test_compile_time.py @@ -69,7 +69,7 @@ def test_impl(S1, S2): test_impl(S1, S2) entry_format = fr'{line_function}{line_pipeline}{line_time}\n' - log_format = fr'^{line_heading}({entry_format})+{line_ending}$' + log_format = fr'{line_heading}({entry_format})+{line_ending}$' self.assertRegex(buffer.getvalue(), log_format) def test_log_format_detailed(self): diff --git a/sdc/utilities/sdc_typing_utils.py b/sdc/utilities/sdc_typing_utils.py index 6d8b6810f..31dc54035 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -70,6 +70,7 @@ Categorical, ) + class TypeChecker: """ Validate object type and raise TypingError if the type is invalid, e.g.: From 7f1351abfc6a3cc8ccb54e2940955992724f3500 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Sun, 13 Jun 2021 03:36:02 +0300 Subject: [PATCH 3/7] Removing debug traces and reorganizing code --- sdc/extensions/indexes/indexes_generic.py | 54 +- sdc/extensions/indexes/multi_index_ext.py | 918 ++++++++-------------- sdc/functions/tuple_utils.py | 239 ++++++ sdc/hiframes/api.py | 13 +- 4 files changed, 602 insertions(+), 622 deletions(-) create mode 100644 sdc/functions/tuple_utils.py diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py index 6a0b696a8..40f198b7c 100644 --- a/sdc/extensions/indexes/indexes_generic.py +++ b/sdc/extensions/indexes/indexes_generic.py @@ -296,7 +296,6 @@ def pd_fix_indexes_take_overload(self, indexes): the fact that StringArrayType is one of the index types """ check = isinstance(self, sdc_pandas_index_types) - print("DEBUG: sdc_indexes_take typing:", self, check) if not isinstance(self, sdc_pandas_index_types): return None @@ -312,3 +311,56 @@ def pd_fix_indexes_take_impl(self, indexes): return res return pd_fix_indexes_take_impl + + +def sdc_indexes_rename(index, name): + pass + + +@sdc_overload(sdc_indexes_rename) +def sdc_index_rename_ovld(index, name): + + if not isinstance(index, sdc_pandas_index_types): + return None + + if isinstance(index, sdc_old_index_types): + def sdc_indexes_rename_stub(index, name): + # cannot rename string or float indexes, TO-DO: StringIndexType + return index + return sdc_indexes_rename_stub + + if isinstance(index, PositionalIndexType): + from sdc.extensions.indexes.positional_index_ext import init_positional_index + + def sdc_indexes_rename_impl(index, name): + return init_positional_index(len(index), name) + return sdc_indexes_rename_impl + + elif isinstance(index, RangeIndexType): + def sdc_indexes_rename_impl(index, name): + return pd.RangeIndex(index.start, index.stop, index.step, name=name) + return sdc_indexes_rename_impl + + elif isinstance(index, Int64IndexType): + def sdc_indexes_rename_impl(index, name): + return pd.Int64Index(index, name=name) + return sdc_indexes_rename_impl + + +def sdc_indexes_get_name(index): + pass + + +@sdc_overload(sdc_indexes_get_name) +def sdc_indexes_get_name_ovld(index): + + if (isinstance(index, sdc_pandas_index_types) + and not isinstance(index, sdc_old_index_types)): + def sdc_indexes_get_name_impl(index): + return index.name + return sdc_indexes_get_name_impl + + def sdc_indexes_get_name_stub(index): + # cannot rename string or float indexes, TO-DO: StringIndexType + return None + return sdc_indexes_get_name_stub diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py index ddf1319f2..7fb497d48 100644 --- a/sdc/extensions/indexes/multi_index_ext.py +++ b/sdc/extensions/indexes/multi_index_ext.py @@ -34,10 +34,9 @@ from numba.core import cgutils from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, type_callable) from numba.core.errors import TypingError -from numba.core.typing.templates import signature, AttributeTemplate, AbstractTemplate, infer_getattr -from numba.core.imputils import impl_ret_untracked, call_getiter, impl_ret_borrowed -from numba.core.imputils import (impl_ret_new_ref, impl_ret_borrowed, iternext_impl, RefType) -from numba.core.boxing import box_array, unbox_array, box_tuple +from numba.core.typing.templates import signature, AttributeTemplate, infer_getattr +from numba.core.imputils import (impl_ret_borrowed, iternext_impl, RefType) +from numba.core.boxing import unbox_array, box_tuple import llvmlite.llvmpy.core as lc @@ -46,41 +45,133 @@ from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral from sdc.utilities.sdc_typing_utils import ( TypeChecker, - check_signed_integer, - _check_dtype_param_type, sdc_pandas_index_types, + sdc_pandas_df_column_types, check_types_comparable, ) from sdc.functions import numpy_like +from sdc.functions.tuple_utils import sdc_tuple_map, sdc_tuple_map_elementwise, sdc_tuple_unzip from sdc.hiframes.api import fix_df_array, fix_df_index from sdc.hiframes.boxing import _infer_index_type, _unbox_index_data -from sdc.datatypes.common_functions import hpat_arrays_append from sdc.extensions.indexes.indexes_generic import * from sdc.datatypes.indexes.multi_index_type import MultiIndexIteratorType from numba.core.extending import register_jitable from numba import literal_unroll from numba.typed import Dict, List -from sdc.str_arr_type import StringArrayType -from sdc.extensions.indexes.positional_index_ext import init_positional_index - from sdc.datatypes.sdc_typeref import SdcTypeRef -from sdc.extensions.sdc_hashmap_type import * +@typeof_impl.register(pd.MultiIndex) +def typeof_multi_index(val, c): + levels = tuple(_infer_index_type(x) for x in val.levels) + codes = tuple(numba.typeof(x) for x in val.codes) # note this produces readonly array(int8, 1d, C) + is_named = val.name is not None + + return MultiIndexType(types.Tuple.from_types(levels), + types.Tuple.from_types(codes), + is_named=is_named) + + +@box(MultiIndexType) +def box_multi_index(typ, val, c): + + mod_name = c.context.insert_const_string(c.builder.module, "pandas") + pd_class_obj = c.pyapi.import_module_noblock(mod_name) + + multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder, val) + + py_levels = box_tuple(typ.levels, multi_index.levels, c) + py_codes = box_tuple(typ.codes, multi_index.codes, c) + + # dtype and copy params are not stored so use default values + dtype = c.pyapi.make_none() + copy = c.pyapi.bool_from_bool( + c.context.get_constant(types.bool_, False) + ) + sortorder = c.pyapi.make_none() + + if typ.is_named: + name = c.pyapi.from_native_value(types.unicode_type, multi_index.name) + else: + name = c.pyapi.make_none() + + # build MultiIndex names from names of boxed levels (if python level has name attribute) + # TO-DO: refactor this to use native indexes names when all index have it (e.g. StringIndexType) + nlevels = len(typ.levels) + py_nlevels = c.pyapi.tuple_size(py_levels) + py_names = c.pyapi.list_new(py_nlevels) + for i in range(nlevels): + level_type = typ.levels[i] + if isinstance(level_type, sdc_old_index_types): + py_level_name = c.pyapi.make_none() + else: + py_level_obj = c.pyapi.tuple_getitem(py_levels, i) + py_level_name = c.pyapi.object_getattr_string(py_level_obj, 'name') + c.pyapi.list_setitem(py_names, c.context.get_constant(types.intp, i), py_level_name) + # FIXME: check decref is needed for pe_level_obj? + + res = c.pyapi.call_method(pd_class_obj, "MultiIndex", + (py_levels, py_codes, sortorder, py_names, dtype, copy, name)) + + c.pyapi.decref(py_levels) + c.pyapi.decref(py_codes) + c.pyapi.decref(sortorder) + c.pyapi.decref(py_names) + c.pyapi.decref(dtype) + c.pyapi.decref(copy) + c.pyapi.decref(name) + c.pyapi.decref(pd_class_obj) + return res + -### FIXME: clean-up imports +@unbox(MultiIndexType) +def unbox_int64_index(typ, val, c): + + nlevels = len(typ.levels) + levels_types = typ.levels_types + codes_types = typ.codes_types + multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + + py_levels_data = c.pyapi.object_getattr_string(val, "levels") + native_levels_data = [] + for i in range(nlevels): + idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i)) + level_data = c.pyapi.object_getitem(py_levels_data, idx) + native_levels_data.append( + _unbox_index_data(levels_types[i], level_data, c).value + ) + c.pyapi.decref(level_data) + c.pyapi.decref(py_levels_data) + multi_index.levels = c.context.make_tuple(c.builder, typ.levels, native_levels_data) + + py_codes_data = c.pyapi.object_getattr_string(val, "codes") + native_codes_data = [] + for i in range(nlevels): + idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i)) + code_data = c.pyapi.object_getitem(py_codes_data, idx) + native_codes_data.append( + unbox_array(codes_types[i], code_data, c).value + ) + c.pyapi.decref(code_data) + c.pyapi.decref(py_codes_data) + multi_index.codes = c.context.make_tuple(c.builder, typ.codes, native_codes_data) + + if typ.is_named: + name_obj = c.pyapi.object_getattr_string(val, "name") + multi_index.name = numba.cpython.unicode.unbox_unicode_str( + types.unicode_type, name_obj, c).value + c.pyapi.decref(name_obj) + + is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) + return NativeValue(multi_index._getvalue(), is_error=is_error) @intrinsic def init_multi_index(typingctx, levels, codes): - print("DEBUG: init_multi_index typing:\n", - f"\tlevels={levels}\n", - f"\tcodes={codes}\n") if not (isinstance(levels, (types.Tuple, types.UniTuple)) and isinstance(codes, (types.Tuple, types.UniTuple))): - assert False, "init_multi_index types " return None def is_valid_level_type(typ): @@ -116,6 +207,165 @@ def codegen(context, builder, sig, args): return sig, codegen +def _sdc_multi_index_ctor_typer(typing_ctx, *args): + + _func_name = '_sdc_multi_index_ctor_typer' + # this types subsequent call to sdc_pandas_multi_index_ctor function with signature: + # args = (levels, codes, sortorder=None, names=None, dtype=None, copy=False, name=None) + + assert len(args) >= 2, f"{_func_name}: Expecting 2 or more positional args, given: {args}" + + levels, codes = args[:2] + if not (isinstance(levels, (types.Tuple, types.UniTuple)) + and isinstance(codes, (types.Tuple, types.UniTuple))): + raise TypingError(f"{_func_name}: levels and codes args must be tuples, given: levels={levels}, codes={codes}") + + nlevels = len(levels) + ty_codes = types.Tuple.from_types( + [typing_ctx._resolve_user_function_type( + fix_df_array, (typ,), {}).return_type for typ in codes] + ) + + if len(args) >= 2 and not (isinstance(args[2], (types.NoneType, types.Omitted)) or args[2] is None): + assert False, f"{_func_name}: argument sortorder is not supported, given: {args[2]}" + if len(args) >= 3 and not (isinstance(args[3], (types.NoneType, types.Omitted)) or args[3] is None): + assert False, f"{_func_name}: argument names is not supported, given: {args[3]}" + if len(args) >= 4 and not (isinstance(args[4], (types.NoneType, types.Omitted)) or args[4] is None): + assert False, f"{_func_name}: argument dtype is not supported, given: {args[4]}" + if len(args) >= 5 and not (isinstance(args[5], (types.Boolean, types.Omitted)) or args[5] is False): + assert False, f"{_func_name}: argument copy is not supported, given: {args[5]}" + + # if ctor args provide list of levels names via name argument + # update type information for elements in ty_levels + name = args[6] if len(args) >= 6 and not args[6] is None else types.none + if not isinstance(name, (types.NoneType, types.Omitted)): + assert (isinstance(name, types.Tuple) + and all(map(lambda x: isinstance(x, (types.StringLiteral, types.UnicodeType, types.NoneType)), name)) + or isinstance(name, types.UniTuple) + and isinstance(name.dtype, (types.UnicodeType, types.NoneType))), \ + f"{_func_name}: argument name must be tuple of strings, given: {args[6]}" + assert len(name) == nlevels, \ + f"{_func_name}: Length of names must match number of levels in MultiIndex, given: {args[6]}" + + ty_levels = types.Tuple.from_types( + [typing_ctx._resolve_user_function_type( + _multi_index_create_level, (t1, t2), {}).return_type for t1, t2 in zip(levels, name)] + ) + else: + ty_levels = types.Tuple.from_types( + [typing_ctx._resolve_user_function_type( + _multi_index_create_level, (typ, types.none), {}).return_type for typ in levels] + ) + + return MultiIndexType(ty_levels, ty_codes, is_named=False) + + +### FIXME: this should not be generic SdcTypeRef, but specific type, such as MultiIndexTypeRef +@type_callable(SdcTypeRef) +def typing_sdctyperef(context): + typing_ctx = context + + def typer(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + return _sdc_multi_index_ctor_typer(typing_ctx, levels, codes, sortorder, + names, dtype, copy, name) + + return typer + + +### FIXME: add comment explaining why it's needed +@infer_getattr +class SdcTypeRefAttribute(AttributeTemplate): + key = SdcTypeRef + + def resolve___call__(self, instance): + return type(instance) + + +def sdc_pandas_multi_index_ctor(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + pass + + +@sdc_overload(sdc_pandas_multi_index_ctor) +def pd_multi_index_overload(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + + _func_name = 'pd.MultiIndex().' + ty_checker = TypeChecker(_func_name) + + if not (isinstance(sortorder, (types.Omitted, types.NoneType)) or sortorder is None): + raise TypingError('{} Unsupported parameters. Given sortorder: {}'.format(_func_name, sortorder)) + + if not (isinstance(names, (types.Omitted, types.NoneType)) or names is None): + raise TypingError('{} Unsupported parameters. Given names: {}'.format(_func_name, names)) + + if not (isinstance(dtype, (types.Omitted, types.NoneType)) or dtype is None): + raise TypingError('{} Unsupported parameters. Given dtype: {}'.format(_func_name, dtype)) + + if not (isinstance(copy, (types.Omitted, types.Boolean, types.BooleanLiteral)) or copy is False): + raise TypingError('{} Unsupported parameters. Given copy: {}'.format(_func_name, copy)) + + accepted_index_names = (types.NoneType, types.StringLiteral, types.UnicodeType) + is_name_none = name is None or isinstance(name, (types.NoneType, types.Omitted)) + if not (isinstance(name, (types.Tuple, types.UniTuple)) + and all(map(lambda x: isinstance(x, accepted_index_names), name)) + or is_name_none): + ty_checker.raise_exc(name, 'tuple of strings/nones or none', 'name') + + def pd_multi_index_ctor_impl(levels, codes, sortorder=None, names=None, + dtype=None, copy=False, name=None): + + if len(levels) != len(codes): + raise ValueError("Length of levels and codes must be the same.") + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + + # if name is None then all level names are reset + if is_name_none == True: + _names = sdc_tuple_map( + lambda x: None, + levels, + ) + else: + _names = name + + levels_and_codes_pairs = sdc_tuple_map_elementwise( + _multi_index_create_levels_and_codes, + levels, + codes, + _names + ) + + _levels, _codes = sdc_tuple_unzip(levels_and_codes_pairs) + return init_multi_index(_levels, _codes) + + return pd_multi_index_ctor_impl + + +@lower_builtin(SdcTypeRef, types.VarArg(types.Any)) +def sdctyperef_call_impl(context, builder, sig, args): + + # FIXME: this hardcodes template number and selected dispatcher, refactor? + call_sig = context.typing_context._resolve_user_function_type( + sdc_pandas_multi_index_ctor, + sig.args, + {} + ) + fnty = context.typing_context._lookup_global(sdc_pandas_multi_index_ctor) + disp = fnty.templates[0](context.typing_context)._get_impl(call_sig.args, {}) + cres = disp[0].get_compile_result(call_sig) + + res = context.call_internal( + builder, + cres.fndesc, + sig, + args + ) + + return impl_ret_borrowed(context, builder, sig.return_type, res) + + @sdc_overload(len) def pd_multi_index_len_overload(self): if not isinstance(self, MultiIndexType): @@ -165,7 +415,6 @@ def pd_multi_index_getitem_overload(self, idx): _func_name = 'Operator getitem().' ty_checker = TypeChecker(_func_name) - print("DEBUG: pd_multi_index_getitem_overload typing") if not (isinstance(idx, (types.Integer, types.SliceType)) or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))): @@ -174,7 +423,6 @@ def pd_multi_index_getitem_overload(self, idx): if isinstance(idx, types.Integer): def pd_multi_index_getitem_idx_scalar_impl(self, idx): index_len = len(self) - print("DEBUG: pd_multi_index_getitem_impl: index_len=", index_len, "idx=", idx) # FIXME_Numba#5801: Numba type unification rules make this float idx = types.int64((index_len + idx) if idx < 0 else idx) if (idx < 0 or idx >= index_len): @@ -184,7 +432,6 @@ def pd_multi_index_getitem_idx_scalar_impl(self, idx): return pd_multi_index_getitem_idx_scalar_impl - # FIXME: check why Int64Index uses numpy_array but not numpy_like in this case? elif isinstance(idx, types.SliceType): def pd_multi_index_getitem_idx_slice_impl(self, idx): @@ -218,8 +465,6 @@ def pd_multi_index_getitem_as_take_impl(self, idx): return pd_multi_index_getitem_as_take_impl - - @sdc_overload_attribute(MultiIndexType, 'values') def pd_multi_index_values_overload(self): if not isinstance(self, MultiIndexType): @@ -261,122 +506,14 @@ def pd_multi_index_levels_impl(self): @sdc_overload_attribute(MultiIndexType, 'codes') -def codespd_multi_index_levels_overload(self): - if not isinstance(self, MultiIndexType): - return None - - def pd_multi_index_codes_impl(self): - return self._codes - - return pd_multi_index_codes_impl - - -@typeof_impl.register(pd.MultiIndex) -def typeof_multi_index(val, c): - print(f"DEBUG: typeof_impl: val={val}") - levels = tuple(_infer_index_type(x) for x in val.levels) - print(f"DEBUG: typeof_impl: levels={levels}") - codes = tuple(numba.typeof(x) for x in val.codes) # note this produces readonly array(int8, 1d, C) - is_named = val.name is not None - - return MultiIndexType(types.Tuple.from_types(levels), - types.Tuple.from_types(codes), - is_named=is_named) - - -@box(MultiIndexType) -def box_multi_index(typ, val, c): - - print("DEBUG: typ.levels=", typ.levels) - mod_name = c.context.insert_const_string(c.builder.module, "pandas") - pd_class_obj = c.pyapi.import_module_noblock(mod_name) - - multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder, val) - - py_levels = box_tuple(typ.levels, multi_index.levels, c) - py_codes = box_tuple(typ.codes, multi_index.codes, c) - - # dtype and copy params are not stored so use default values - dtype = c.pyapi.make_none() - copy = c.pyapi.bool_from_bool( - c.context.get_constant(types.bool_, False) - ) - sortorder = c.pyapi.make_none() - - if typ.is_named: - name = c.pyapi.from_native_value(types.unicode_type, multi_index.name) - else: - name = c.pyapi.make_none() - - # build MultiIndex names from names of boxed levels (if python level has name attribute) - # TO-DO: refactor this to use native indexes names when all index have it (e.g. StringIndexType) - nlevels = len(typ.levels) - py_nlevels = c.pyapi.tuple_size(py_levels) - py_names = c.pyapi.list_new(py_nlevels) - for i in range(nlevels): - level_type = typ.levels[i] - if isinstance(level_type, sdc_old_index_types): - py_level_name = c.pyapi.make_none() - else: - py_level_obj = c.pyapi.tuple_getitem(py_levels, i) - py_level_name = c.pyapi.object_getattr_string(py_level_obj, 'name') - c.pyapi.list_setitem(py_names, c.context.get_constant(types.intp, i), py_level_name) - # FIXME: check decref is needed for pe_level_obj? - - res = c.pyapi.call_method(pd_class_obj, "MultiIndex", - (py_levels, py_codes, sortorder, py_names, dtype, copy, name)) - - c.pyapi.decref(py_levels) - c.pyapi.decref(py_codes) - c.pyapi.decref(sortorder) - c.pyapi.decref(py_names) - c.pyapi.decref(dtype) - c.pyapi.decref(copy) - c.pyapi.decref(name) - c.pyapi.decref(pd_class_obj) - return res - - -@unbox(MultiIndexType) -def unbox_int64_index(typ, val, c): - - nlevels = len(typ.levels) - levels_types = typ.levels_types - codes_types = typ.codes_types - multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder) - - py_levels_data = c.pyapi.object_getattr_string(val, "levels") - native_levels_data = [] - for i in range(nlevels): - idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i)) - level_data = c.pyapi.object_getitem(py_levels_data, idx) - native_levels_data.append( - _unbox_index_data(levels_types[i], level_data, c).value - ) - c.pyapi.decref(level_data) - c.pyapi.decref(py_levels_data) - multi_index.levels = c.context.make_tuple(c.builder, typ.levels, native_levels_data) - - py_codes_data = c.pyapi.object_getattr_string(val, "codes") - native_codes_data = [] - for i in range(nlevels): - idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i)) - code_data = c.pyapi.object_getitem(py_codes_data, idx) - native_codes_data.append( - unbox_array(codes_types[i], code_data, c).value - ) - c.pyapi.decref(code_data) - c.pyapi.decref(py_codes_data) - multi_index.codes = c.context.make_tuple(c.builder, typ.codes, native_codes_data) - - if typ.is_named: - name_obj = c.pyapi.object_getattr_string(val, "name") - multi_index.name = numba.cpython.unicode.unbox_unicode_str( - types.unicode_type, name_obj, c).value - c.pyapi.decref(name_obj) +def codespd_multi_index_levels_overload(self): + if not isinstance(self, MultiIndexType): + return None - is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) - return NativeValue(multi_index._getvalue(), is_error=is_error) + def pd_multi_index_codes_impl(self): + return self._codes + + return pd_multi_index_codes_impl @sdc_overload_method(MultiIndexType, 'take') @@ -534,9 +671,10 @@ def pd_multi_index_equals_overload(self, other): return None _func_name = 'Method equals().' - # FIXME: add proper type-checks -# if not isinstance(other, MultiIndexType): -# raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}") + ty_checker = TypeChecker(_func_name) + + if not (isinstance(other, MultiIndexType) and self.dtype is other.dtype): + ty_checker.raise_exc(other, 'pandas MultiIndex', 'other') def pd_multi_index_equals_impl(self, other): @@ -592,9 +730,10 @@ def pd_multi_index_contains_overload(self, label): return None _func_name = 'Method contains().' - # FIXME: add proper type-checks -# if not isinstance(other, MultiIndexType): -# raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}") + ty_checker = TypeChecker(_func_name) + + if not (isinstance(label, (types.Tuple, types.UniTuple)) and self.dtype is label): + ty_checker.raise_exc(label, 'tuple ', 'val') def pd_multi_index_contains_impl(self, label): @@ -675,18 +814,15 @@ def pd_multi_index_ravel_impl(self, order='C'): def pd_multi_index_ne_overload(self, other): _func_name = 'Operator ne.' - if not check_types_comparable(self, other): - raise TypingError('{} Not allowed for non comparable indexes. \ - Given: self={}, other={}'.format(_func_name, self, other)) self_is_multi_index = isinstance(self, MultiIndexType) other_is_multi_index = isinstance(other, MultiIndexType) - - possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types - if not (self_is_multi_index and other_is_multi_index - or (self_is_multi_index and isinstance(other, possible_arg_types)) - or (isinstance(self, possible_arg_types) and other_is_multi_index)): - return None + both_are_multi_indexes = self_is_multi_index and other_is_multi_index + if not (both_are_multi_indexes and check_types_comparable(self, other) + or (self_is_multi_index and other is getattr(self, 'dtype', types.none)) + or (self is getattr(other, 'dtype', types.none) and other_is_multi_index)): + raise TypingError('{} Not allowed for non comparable types. \ + Given: self={}, other={}'.format(_func_name, self, other)) def pd_multi_index_ne_impl(self, other): @@ -775,7 +911,7 @@ def pd_multi_index_reindex_impl(self, target, method=None, level=None, limit=Non return pd_multi_index_reindex_impl - +# FIXME: move to indexes_generic or build into index model? @register_jitable def _appender_build_map(index1, index2): res = {} @@ -822,58 +958,6 @@ def _multi_index_append_impl(A, codes_A, B, codes_B): return _multi_index_append_impl -@intrinsic -def sdc_tuple_unzip(typingctx, data_type): - """ This function gets tuple of pairs and repacks them into two tuples, holding - first and seconds elements, i.e. ((a, b), (c, d), (e, f)) -> ((a, c, e), (b, d, f)). """ - - _func_name = 'sdc_tuple_unzip' - _given_args_str = f'Given: data_type={data_type}' - assert isinstance(data_type, (types.Tuple, types.UniTuple)), \ - f"{_func_name} expects tuple as argument. {_given_args_str}" - - data_len = len(data_type) - assert data_len > 0, f"{_func_name}: empty tuple not allowed. {_given_args_str}" - - for x in data_type: - assert isinstance(x, (types.Tuple, types.UniTuple)) and len(x) == len(data_type[0]), \ - f"{_func_name}: non-supported tuple elements types. {_given_args_str}" - - ty_firsts, ty_seconds = map(lambda x: types.Tuple.from_types(x), - zip(*data_type)) - ret_type = types.Tuple([ty_firsts, ty_seconds]) - -# print(f"DEBUG: sdc_multi_index_repack typing: data_type={data_type}") -# print(f"DEBUG: sdc_multi_index_repack typing: ty_levels={ty_levels}") -# print(f"DEBUG: sdc_multi_index_repack typing: ty_codes={ty_codes}") -# print(f"DEBUG: sdc_multi_index_repack typing: ret_type={ret_type}") - - def codegen(context, builder, sig, args): - data_val, = args - - all_firsts = [] - all_seconds = [] - for i in range(data_len): - tup_element_i = builder.extract_value(data_val, i) - first_i = builder.extract_value(tup_element_i, 0) - second_i = builder.extract_value(tup_element_i, 1) - - all_firsts.append(first_i) - all_seconds.append(second_i) - - ### FIXME: building inserting arrays into new tuple and returning it - ### doesn't automatically increfs? Why below is needed? - if context.enable_nrt: - context.nrt.incref(builder, ty_firsts[i], first_i) - context.nrt.incref(builder, ty_seconds[i], second_i) - - first_tup = context.make_tuple(builder, ty_firsts, all_firsts) - second_tup = context.make_tuple(builder, ty_seconds, all_seconds) - return context.make_tuple(builder, ret_type, [first_tup, second_tup]) - - return ret_type(data_type), codegen - - @sdc_overload_method(MultiIndexType, 'append') def pd_multi_index_append_overload(self, other): if not isinstance(self, MultiIndexType): @@ -906,135 +990,6 @@ def pd_multi_index_append_impl(self, other): return pd_multi_index_append_impl - -### FIXME: main question is not should we implement names at all -### but how to implement it? Pandas MultiIndex _name can be different -### than _names (list of level's names), e.g. when it's assigned -### but when created in ctor name argument is specifically checked to -### be index names, so we should probably stick to this behavior: -### ctor arg SHOULD BE list of unicodes! that needs to reset names of -### indexes that we get during construction (after fix_df_index) -def _sdc_multi_index_ctor_typer(typing_ctx, *args): - print("DEBUG: typer for SdcTypeRef: ", args) - - _func_name = '_sdc_multi_index_ctor_typer' - # this types subsequent call to sdc_pandas_multi_index_ctor function with signature: - # args = (levels, codes, sortorder=None, names=None, dtype=None, copy=False, name=None) - - assert len(args) >= 2, f"{_func_name}: Expecting 2 or more positional args, given: {args}" - - levels, codes = args[:2] - if not (isinstance(levels, (types.Tuple, types.UniTuple)) - and isinstance(codes, (types.Tuple, types.UniTuple))): - raise TypingError(f"{_func_name}: levels and codes args must be tuples, given: levels={levels}, codes={codes}") - - nlevels = len(levels) - ty_codes = types.Tuple.from_types( - [typing_ctx._resolve_user_function_type( - fix_df_array, (typ,), {}).return_type for typ in codes] - ) - - if len(args) >= 2 and not (isinstance(args[2], (types.NoneType, types.Omitted)) or args[2] is None): - assert False, f"{_func_name}: argument sortorder is not supported, given: {args[2]}" - if len(args) >= 3 and not (isinstance(args[3], (types.NoneType, types.Omitted)) or args[3] is None): - assert False, f"{_func_name}: argument names is not supported, given: {args[3]}" - if len(args) >= 4 and not (isinstance(args[4], (types.NoneType, types.Omitted)) or args[4] is None): - assert False, f"{_func_name}: argument dtype is not supported, given: {args[4]}" - if len(args) >= 5 and not (isinstance(args[5], (types.Boolean, types.Omitted)) or args[5] is False): - assert False, f"{_func_name}: argument copy is not supported, given: {args[5]}" - - # if ctor args provide list of levels names via name argument - # update type information for elements in ty_levels - name = args[6] if len(args) >= 6 and not args[6] is None else types.none - if not isinstance(name, (types.NoneType, types.Omitted)): - assert (isinstance(name, types.Tuple) - and all(map(lambda x: isinstance(x, (types.StringLiteral, types.UnicodeType, types.NoneType)), name)) - or isinstance(name, types.UniTuple) - and isinstance(name.dtype, (types.UnicodeType, types.NoneType))), \ - f"{_func_name}: argument name must be tuple of strings, given: {args[6]}" - assert len(name) == nlevels, \ - f"{_func_name}: Length of names must match number of levels in MultiIndex, given: {args[6]}" - - ty_levels = types.Tuple.from_types( - [typing_ctx._resolve_user_function_type( - _multi_index_create_level, (t1, t2), {}).return_type for t1, t2 in zip(levels, name)] - ) - else: - ty_levels = types.Tuple.from_types( - [typing_ctx._resolve_user_function_type( - _multi_index_create_level, (typ, types.none), {}).return_type for typ in levels] - ) - - return MultiIndexType(ty_levels, ty_codes, is_named=False) - - -### FIXME: this should not be generic SdcTypeRef, but -### specific class for MultiIndexTypeRef -@type_callable(SdcTypeRef) -def typing_sdctyperef(context): - print("DEBUG: enter typing_sdctyperef") - typing_ctx = context - - def typer(levels, codes, sortorder=None, names=None, - dtype=None, copy=False, name=None): - return _sdc_multi_index_ctor_typer(typing_ctx, levels, codes, sortorder, - names, dtype, copy, name) - - return typer - - -def sdc_indexes_rename(index, name): - pass - - -@sdc_overload(sdc_indexes_rename) -def sdc_index_rename_ovld(index, name): - - if not isinstance(index, sdc_pandas_index_types): - return None - - if isinstance(index, sdc_old_index_types): - def sdc_indexes_rename_stub(index, name): - # cannot rename string or float indexes, TO-DO: StringIndexType - return index - return sdc_indexes_rename_stub - - if isinstance(index, PositionalIndexType): - def sdc_indexes_rename_impl(index, name): - return init_positional_index(len(index), name) - return sdc_indexes_rename_impl - - elif isinstance(index, RangeIndexType): - def sdc_indexes_rename_impl(index, name): - return pd.RangeIndex(index.start, index.stop, index.step, name=name) - return sdc_indexes_rename_impl - - elif isinstance(index, Int64IndexType): - def sdc_indexes_rename_impl(index, name): - return pd.Int64Index(index, name=name) - return sdc_indexes_rename_impl - - -def sdc_indexes_get_name(index): - pass - - -@sdc_overload(sdc_indexes_get_name) -def sdc_indexes_get_name_ovld(index): - - if (isinstance(index, sdc_pandas_index_types) - and not isinstance(index, sdc_old_index_types)): - def sdc_indexes_get_name_impl(index): - return index.name - return sdc_indexes_get_name_impl - - def sdc_indexes_get_name_stub(index): - # cannot rename string or float indexes, TO-DO: StringIndexType - return None - return sdc_indexes_get_name_stub - - -### FIXME: this is a workaround for not having index.set_names def _multi_index_create_level(index_data, name): pass @@ -1042,8 +997,6 @@ def _multi_index_create_level(index_data, name): @sdc_overload(_multi_index_create_level) def _multi_index_create_level_ovld(index_data, name): - print(f"DEBUG: _multi_index_create_level_ovld: index={index_data}, name={name}") - def _multi_index_create_level_impl(index_data, name): index = fix_df_index(index_data) return sdc_indexes_rename(index, name) @@ -1057,8 +1010,6 @@ def _multi_index_create_levels_and_codes(level_data, codes_data, name): @sdc_overload(_multi_index_create_levels_and_codes) def _multi_index_create_levels_and_codes_ovld(level_data, codes_data, name): - print(f"DEBUG: _multi_index_create_levels_and_codes_ovld: index={level_data}, codes_data={codes_data}, name={name}") - def _multi_index_create_levels_and_codes_impl(level_data, codes_data, name): level_data_fixed = fix_df_index(level_data) level = sdc_indexes_rename(level_data_fixed, name) @@ -1085,89 +1036,6 @@ def _multi_index_create_levels_and_codes_impl(level_data, codes_data, name): return _multi_index_create_levels_and_codes_impl -### FIXME: add comment explaining why it's needed -@infer_getattr -class SdcTypeRefAttribute(AttributeTemplate): - key = SdcTypeRef - - def resolve___call__(self, instance): - return type(instance) - - -def sdc_pandas_multi_index_ctor(levels, codes, sortorder=None, names=None, - dtype=None, copy=False, name=None): - pass - - -@sdc_overload(sdc_pandas_multi_index_ctor) -def pd_multi_index_overload(levels, codes, sortorder=None, names=None, - dtype=None, copy=False, name=None): - - _func_name = 'pd.MultiIndex().' - ty_checker = TypeChecker(_func_name) - - # FIXME: add other checks (e.g. for levels and codes) - accepted_index_names = (types.NoneType, types.StringLiteral, types.UnicodeType) - is_name_none = name is None or isinstance(name, (types.NoneType, types.Omitted)) - if not (isinstance(name, (types.Tuple, types.UniTuple)) - and all(map(lambda x: isinstance(x, accepted_index_names), name)) - or is_name_none): - ty_checker.raise_exc(name, 'tuple of strings/nones or none', 'name') - print("DEBUG: sdc_pandas_multi_index_ctor typing:", levels, codes) - - def pd_multi_index_ctor_impl(levels, codes, sortorder=None, names=None, - dtype=None, copy=False, name=None): - - if len(levels) != len(codes): - raise ValueError("Length of levels and codes must be the same.") - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") - - # if name is None then all level names are reset - if is_name_none == True: - _names = sdc_tuple_map( - lambda x: None, - levels, - ) - else: - _names = name - - levels_and_codes_pairs = sdc_tuple_map_elementwise( - _multi_index_create_levels_and_codes, - levels, - codes, - _names - ) - - _levels, _codes = sdc_tuple_unzip(levels_and_codes_pairs) - return init_multi_index(_levels, _codes) - - return pd_multi_index_ctor_impl - - -@lower_builtin(SdcTypeRef, types.VarArg(types.Any)) -def sdctyperef_call_impl(context, builder, sig, args): - - # FIXME: this hardcodes template number and selected dispatcher, refactor? - call_sig = context.typing_context._resolve_user_function_type( - sdc_pandas_multi_index_ctor, - sig.args, - {} - ) - fnty = context.typing_context._lookup_global(sdc_pandas_multi_index_ctor) - disp = fnty.templates[0](context.typing_context)._get_impl(call_sig.args, {}) - cres = disp[0].get_compile_result(call_sig) - - res = context.call_internal( - builder, - cres.fndesc, - sig, - args - ) - - return impl_ret_borrowed(context, builder, sig.return_type, res) - - @register_jitable def next_codes_info(level_info, cumprod_list): _, codes = level_info @@ -1233,20 +1101,26 @@ def _make_level_unique_impl(index): @sdc_overload_method(SdcTypeRef, 'from_product', prefer_literal=False) -def multi_index_type_from_product_ovld(cls, iterables, sortorder=None, names=None): +def pd_multi_index_from_product_overload(cls, iterables, sortorder=None, names=None): if cls.instance_type is not MultiIndexType: return - # FIXME: add proper typ checks - print("DEBUG: SdcTypeRef::from_product:", cls, iterables) - _func_name = f'Method MultiIndexType::from_product()' -# ty_checker = TypeChecker(_func_name) -# -# valid_keys_types = (types.Sequence, types.Array, StringArrayType) -# if not isinstance(keys, valid_keys_types): -# ty_checker.raise_exc(keys, f'array or sequence', 'keys') + _func_name = f'Method from_product()' + valid_levels_data_types = sdc_pandas_index_types + sdc_pandas_df_column_types + (types.List, types.ListType) + ty_checker = TypeChecker(_func_name) + if not (isinstance(iterables, (types.List, types.ListType, types.UniTuple)) + and isinstance(iterables.dtype, valid_levels_data_types) + or isinstance(iterables, types.Tuple) + and all(map(lambda x: isinstance(x, valid_levels_data_types), iterables))): + ty_checker.raise_exc(iterables, 'list or tuple of tuples ', 'iterables') + + if not (isinstance(sortorder, (types.Omitted, types.NoneType)) or sortorder is None): + raise TypingError('{} Unsupported parameters. Given sortorder: {}'.format(_func_name, sortorder)) - def multi_index_type_from_product_impl(cls, iterables, sortorder=None, names=None): + if not (isinstance(names, (types.Omitted, types.NoneType)) or names is None): + raise TypingError('{} Unsupported parameters. Given names: {}'.format(_func_name, names)) + + def pd_multi_index_from_product_impl(cls, iterables, sortorder=None, names=None): # TO-DO: support indexes.unique() method and use it here levels_factorized = sdc_tuple_map( @@ -1258,13 +1132,11 @@ def multi_index_type_from_product_impl(cls, iterables, sortorder=None, names=Non sdc_indexes_get_name, iterables ) -# print("DEBUG: levels_factorized=", levels_factorized) index_levels = sdc_tuple_map( lambda x: fix_df_index(list(x[0])), levels_factorized ) -# print("DEBUG: index_levels=", levels_factorized) temp_cumprod_sizes = [1, ] codes_info = sdc_tuple_map( @@ -1272,7 +1144,6 @@ def multi_index_type_from_product_impl(cls, iterables, sortorder=None, names=Non levels_factorized, temp_cumprod_sizes ) -# print("DEBUG: codes_info=", codes_info) res_index_size = temp_cumprod_sizes[-1] index_codes = sdc_tuple_map( @@ -1280,7 +1151,6 @@ def multi_index_type_from_product_impl(cls, iterables, sortorder=None, names=Non codes_info, res_index_size ) -# print("DEBUG: index_codes=", index_codes) res = sdc_pandas_multi_index_ctor( index_levels, @@ -1290,7 +1160,7 @@ def multi_index_type_from_product_impl(cls, iterables, sortorder=None, names=Non return res - return multi_index_type_from_product_impl + return pd_multi_index_from_product_impl def _make_level_dict(index): @@ -1308,29 +1178,6 @@ def _make_level_dict_impl(index): return _make_level_dict_impl - -def _update_levels_and_codes(val, level, codes, indexer): - pass - - -@sdc_overload(_update_levels_and_codes) -def _update_levels_and_codes_ovld(val, level, codes, indexer): - - def _update_levels_and_codes_impl(val, level, codes, indexer): - current_index = indexer[-1] - - if val in level: - code = len(level) - else: - code = level.index(val) - level.append(val) - codes[current_index] = code - - indexer[-1] = current_index + 1 - - return _update_levels_and_codes_impl - - def _multi_index_get_new_code(level, val): _code = level.get(val, -1) @@ -1380,28 +1227,18 @@ def codegen(context, builder, sig, args): @sdc_overload_method(SdcTypeRef, 'from_tuples', prefer_literal=False) -def multi_index_type_from_tuples_ovld(cls, iterables): +def pd_multi_index_from_tuples_overload(cls, iterables): if cls.instance_type is not MultiIndexType: return - # FIXME: add proper typ checks - print("DEBUG: SdcTypeRef::from_tuples:", cls, iterables) - _func_name = f'Method MultiIndexType::from_tuples()' + _func_name = f'Method from_tuples()' ty_checker = TypeChecker(_func_name) if not (isinstance(iterables, (types.List, types.ListType)) and isinstance(iterables.dtype, (types.Tuple, types.UniTuple))): ty_checker.raise_exc(iterables, f'list of tuples', 'iterables') - mindex_dtype = iterables.dtype - nlevels = len(mindex_dtype) - range_tup = tuple(np.arange(nlevels)) - - def multi_index_type_from_tuples_impl(cls, iterables): - - ### what we need is a tuple of dicts (for each level): mapping level label into position - ### it was first seen, but also updating codes arrays as per the index that - ### was received from the dict + def pd_multi_index_type_from_tuples_impl(cls, iterables): index_size = len(iterables) if not index_size: @@ -1433,141 +1270,4 @@ def multi_index_type_from_tuples_impl(cls, iterables): ) return res - return multi_index_type_from_tuples_impl - - -@intrinsic -def sdc_tuple_map(typingctx, func, data, *args): - - print("DEBUG: func=", func) - if not isinstance(func, (types.Dispatcher, types.Function)): - assert False, f"sdc_tuple_map's arg 'func' is expected to be " \ - f"numba compiled function or a dispatcher, given: {func}" - - if not isinstance(data, (types.Tuple, types.UniTuple)): - assert False, f"sdc_tuple_map's arg 'data' is expected to be a tuple, given: {data}" - - nargs = len(args) - tuple_len = len(data) - - func_arg_types = [(typ, ) + args for typ in data] - ret_tuple_types = [] - for i in range(tuple_len): - res_sig = func.get_call_type(typingctx, func_arg_types[i], {}) - ret_tuple_types.append(res_sig.return_type) - ret_type = types.Tuple(ret_tuple_types) - ret_sig = ret_type(func, data, types.StarArgTuple.from_types(args)) - print("DEBUG: func_arg_types=", func_arg_types) - print("DEBUG: ret_type=", ret_type) - print("DEBUG: ret_sig=", ret_sig) - - ### FIXME: this works with single overload for decorated function only - ### but this isn't necessary, just need to find out corresponding template - if isinstance(func, types.Function): - assert len(func.templates) == 1, "Function template has multiple overloads" - - def codegen(context, builder, sig, args): - - tup_val = args[1] # main tuple which elements are mapped - other_val = [] - for i in range(0, nargs): - other_val.append( - builder.extract_value(args[2], i) - ) - - mapped_values = [] - for i in range(tuple_len): - tup_elem = builder.extract_value(tup_val, i) - input_args = [tup_elem] + other_val - call_sig = signature(ret_tuple_types[i], *func_arg_types[i]) - - if isinstance(func, types.Dispatcher): - py_func = func.dispatcher.py_func - else: - # for function overloads get pyfunc from compiled impl - target_disp = func.templates[0](context.typing_context) - py_func = target_disp._get_impl(call_sig.args, {})[0].py_func - - mapped_values.append( - context.compile_internal(builder, - py_func, - call_sig, - input_args) - ) - res = context.make_tuple(builder, ret_type, mapped_values) - return res - - return ret_sig, codegen - - -@intrinsic -def sdc_tuple_map_elementwise(typingctx, func, lhs, rhs, *args): - - print("DEBUG: func=", func) - if not isinstance(func, (types.Dispatcher, types.Function)): - assert False, f"sdc_tuple_map_elementwise's arg 'func' is expected to be " \ - f"numba compiled function or a dispatcher, given: {func}" - - if not (isinstance(lhs, (types.Tuple, types.UniTuple)) - and isinstance(rhs, (types.Tuple, types.UniTuple))): - assert False, f"sdc_tuple_map_elementwise's args are expected to be " \ - f"tuples, given: lhs={lhs}, rhs={rhs}" - - assert len(lhs) == len(rhs), f"lhs and rhs tuples have different sizes: lhs={lhs}, rhs={rhs}" - - nargs = len(args) - tuple_len = len(lhs) - - func_arg_types = [x for x in zip(lhs, rhs, *args)] - ret_tuple_types = [] - for i in range(tuple_len): - res_sig = func.get_call_type(typingctx, func_arg_types[i], {}) - ret_tuple_types.append(res_sig.return_type) - ret_type = types.Tuple(ret_tuple_types) - ret_sig = ret_type(func, lhs, rhs, types.StarArgTuple.from_types(args)) - print("DEBUG: func_arg_types=", func_arg_types) - print("DEBUG: ret_type=", ret_type) - print("DEBUG: ret_sig=", ret_sig) - - if isinstance(func, types.Function): - assert len(func.templates) == 1, "Function template has multiple overloads" - - def codegen(context, builder, sig, args): - lhs_val = args[1] - rhs_val = args[2] - other_vals = [] - for i in range(0, nargs): - other_vals.append( - builder.extract_value(args[3], i) - ) - - mapped_values = [] - for i in range(tuple_len): - lhs_elem = builder.extract_value(lhs_val, i) - rhs_elem = builder.extract_value(rhs_val, i) - other_elems = [] - for other_tup in other_vals: - other_elems.append( - builder.extract_value(other_tup, i) - ) - - input_args = [lhs_elem, rhs_elem] + other_elems - call_sig = signature(ret_tuple_types[i], *func_arg_types[i]) - - if isinstance(func, types.Dispatcher): - py_func = func.dispatcher.py_func - else: - # for function overloads get pyfunc from compiled impl - target_disp = func.templates[0](context.typing_context) - py_func = target_disp._get_impl(call_sig.args, {})[0].py_func - - mapped_values.append( - context.compile_internal(builder, - py_func, - call_sig, - input_args) - ) - res = context.make_tuple(builder, ret_type, mapped_values) - return res - - return ret_sig, codegen + return pd_multi_index_type_from_tuples_impl diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py new file mode 100644 index 000000000..17245a871 --- /dev/null +++ b/sdc/functions/tuple_utils.py @@ -0,0 +1,239 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import operator +import pandas as pd + +from numba import types, prange +from numba.core import cgutils +from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, type_callable) +from numba.core.errors import TypingError +from numba.core.typing.templates import signature, AttributeTemplate, AbstractTemplate, infer_getattr +from numba.core.imputils import impl_ret_untracked, call_getiter, impl_ret_borrowed +from numba.core.imputils import (impl_ret_new_ref, impl_ret_borrowed, iternext_impl, RefType) +from numba.core.boxing import box_array, unbox_array, box_tuple + +import llvmlite.llvmpy.core as lc + +from sdc.datatypes.indexes import * +from sdc.utilities.sdc_typing_utils import SDCLimitation +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral +from sdc.utilities.sdc_typing_utils import ( + TypeChecker, + check_signed_integer, + _check_dtype_param_type, + sdc_pandas_index_types, + check_types_comparable, + ) +from sdc.functions import numpy_like +from sdc.hiframes.api import fix_df_array, fix_df_index +from sdc.hiframes.boxing import _infer_index_type, _unbox_index_data +from sdc.datatypes.common_functions import hpat_arrays_append +from sdc.extensions.indexes.indexes_generic import * + +from sdc.datatypes.indexes.multi_index_type import MultiIndexIteratorType +from numba.core.extending import register_jitable +from numba import literal_unroll +from numba.typed import Dict, List +from sdc.str_arr_type import StringArrayType +from sdc.datatypes.sdc_typeref import SdcTypeRef + + +@intrinsic +def sdc_tuple_map(typingctx, func, data, *args): + + if not isinstance(func, (types.Dispatcher, types.Function)): + assert False, f"sdc_tuple_map's arg 'func' is expected to be " \ + f"numba compiled function or a dispatcher, given: {func}" + + if not isinstance(data, (types.Tuple, types.UniTuple)): + assert False, f"sdc_tuple_map's arg 'data' is expected to be a tuple, given: {data}" + + nargs = len(args) + tuple_len = len(data) + + func_arg_types = [(typ, ) + args for typ in data] + ret_tuple_types = [] + for i in range(tuple_len): + res_sig = func.get_call_type(typingctx, func_arg_types[i], {}) + ret_tuple_types.append(res_sig.return_type) + ret_type = types.Tuple(ret_tuple_types) + ret_sig = ret_type(func, data, types.StarArgTuple.from_types(args)) + + ### FIXME: this works with single overload for decorated function only + ### but this isn't necessary, just need to find out corresponding template + if isinstance(func, types.Function): + assert len(func.templates) == 1, "Function template has multiple overloads" + + def codegen(context, builder, sig, args): + + tup_val = args[1] # main tuple which elements are mapped + other_val = [] + for i in range(0, nargs): + other_val.append( + builder.extract_value(args[2], i) + ) + + mapped_values = [] + for i in range(tuple_len): + tup_elem = builder.extract_value(tup_val, i) + input_args = [tup_elem] + other_val + call_sig = signature(ret_tuple_types[i], *func_arg_types[i]) + + if isinstance(func, types.Dispatcher): + py_func = func.dispatcher.py_func + else: + # for function overloads get pyfunc from compiled impl + target_disp = func.templates[0](context.typing_context) + py_func = target_disp._get_impl(call_sig.args, {})[0].py_func + + mapped_values.append( + context.compile_internal(builder, + py_func, + call_sig, + input_args) + ) + res = context.make_tuple(builder, ret_type, mapped_values) + return res + + return ret_sig, codegen + + +@intrinsic +def sdc_tuple_map_elementwise(typingctx, func, lhs, rhs, *args): + + if not isinstance(func, (types.Dispatcher, types.Function)): + assert False, f"sdc_tuple_map_elementwise's arg 'func' is expected to be " \ + f"numba compiled function or a dispatcher, given: {func}" + + if not (isinstance(lhs, (types.Tuple, types.UniTuple)) + and isinstance(rhs, (types.Tuple, types.UniTuple))): + assert False, f"sdc_tuple_map_elementwise's args are expected to be " \ + f"tuples, given: lhs={lhs}, rhs={rhs}" + + assert len(lhs) == len(rhs), f"lhs and rhs tuples have different sizes: lhs={lhs}, rhs={rhs}" + + nargs = len(args) + tuple_len = len(lhs) + + func_arg_types = [x for x in zip(lhs, rhs, *args)] + ret_tuple_types = [] + for i in range(tuple_len): + res_sig = func.get_call_type(typingctx, func_arg_types[i], {}) + ret_tuple_types.append(res_sig.return_type) + ret_type = types.Tuple(ret_tuple_types) + ret_sig = ret_type(func, lhs, rhs, types.StarArgTuple.from_types(args)) + + if isinstance(func, types.Function): + assert len(func.templates) == 1, "Function template has multiple overloads" + + def codegen(context, builder, sig, args): + lhs_val = args[1] + rhs_val = args[2] + other_vals = [] + for i in range(0, nargs): + other_vals.append( + builder.extract_value(args[3], i) + ) + + mapped_values = [] + for i in range(tuple_len): + lhs_elem = builder.extract_value(lhs_val, i) + rhs_elem = builder.extract_value(rhs_val, i) + other_elems = [] + for other_tup in other_vals: + other_elems.append( + builder.extract_value(other_tup, i) + ) + + input_args = [lhs_elem, rhs_elem] + other_elems + call_sig = signature(ret_tuple_types[i], *func_arg_types[i]) + + if isinstance(func, types.Dispatcher): + py_func = func.dispatcher.py_func + else: + # for function overloads get pyfunc from compiled impl + target_disp = func.templates[0](context.typing_context) + py_func = target_disp._get_impl(call_sig.args, {})[0].py_func + + mapped_values.append( + context.compile_internal(builder, + py_func, + call_sig, + input_args) + ) + res = context.make_tuple(builder, ret_type, mapped_values) + return res + + return ret_sig, codegen + + +@intrinsic +def sdc_tuple_unzip(typingctx, data_type): + """ This function gets tuple of pairs and repacks them into two tuples, holding + first and seconds elements, i.e. ((a, b), (c, d), (e, f)) -> ((a, c, e), (b, d, f)). """ + + _func_name = 'sdc_tuple_unzip' + _given_args_str = f'Given: data_type={data_type}' + assert isinstance(data_type, (types.Tuple, types.UniTuple)), \ + f"{_func_name} expects tuple as argument. {_given_args_str}" + + data_len = len(data_type) + assert data_len > 0, f"{_func_name}: empty tuple not allowed. {_given_args_str}" + + for x in data_type: + assert isinstance(x, (types.Tuple, types.UniTuple)) and len(x) == len(data_type[0]), \ + f"{_func_name}: non-supported tuple elements types. {_given_args_str}" + + ty_firsts, ty_seconds = map(lambda x: types.Tuple.from_types(x), + zip(*data_type)) + ret_type = types.Tuple([ty_firsts, ty_seconds]) + + def codegen(context, builder, sig, args): + data_val, = args + + all_firsts = [] + all_seconds = [] + for i in range(data_len): + tup_element_i = builder.extract_value(data_val, i) + first_i = builder.extract_value(tup_element_i, 0) + second_i = builder.extract_value(tup_element_i, 1) + + all_firsts.append(first_i) + all_seconds.append(second_i) + + if context.enable_nrt: + context.nrt.incref(builder, ty_firsts[i], first_i) + context.nrt.incref(builder, ty_seconds[i], second_i) + + first_tup = context.make_tuple(builder, ty_firsts, all_firsts) + second_tup = context.make_tuple(builder, ty_seconds, all_seconds) + return context.make_tuple(builder, ret_type, [first_tup, second_tup]) + + return ret_type(data_type), codegen diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index 8752774a4..7ed5eb1f8 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -149,11 +149,6 @@ def fix_df_array(column): @overload(fix_df_array) def fix_df_array_overload(column): - # FIXME: do we need some restriction on column types here? -# if not isinstance(column, (types.List, types.ListType, types.Array, StringArrayType)): -# return None - - print("DEBUG: fix_df_array_overload column=", column) if (isinstance(column, types.List)): dtype = column.dtype if isinstance(dtype, (types.Number, types.Boolean)): @@ -173,11 +168,7 @@ def fix_df_array_list_str_impl(column): # pragma: no cover return lambda column: np.array(column) if isinstance(column, (types.Array, StringArrayType, Categorical)): - def fix_df_array_array_impl(column): - print("DEBUG: calling fix_df_array, column=", column) - return column - return fix_df_array_array_impl - # return lambda column: column + return lambda column: column def fix_df_index(index, coldata=None): @@ -187,7 +178,6 @@ def fix_df_index(index, coldata=None): @overload(fix_df_index) def fix_df_index_overload(index, coldata=None): - print("DEBUG: fix_df_index_overload index=", index) # FIXME: import here due to circular import between indexes, numpy_like, and api from sdc.extensions.indexes.empty_index_ext import init_empty_index from sdc.extensions.indexes.positional_index_ext import init_positional_index @@ -209,7 +199,6 @@ def fix_df_index_impl(index, coldata=None): elif (isinstance(index, sdc_pandas_index_types) and not isinstance(index, sdc_old_index_types)): ## MAJOR bug fix in a separate PR def fix_df_index_impl(index, coldata=None): - print("DEBUG: calling this fix_df_index, index=", index) return index # currently only signed integer indexes are represented with own type From 6b6dddd78465b303fdd31ba622a55241a19c80b7 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 20 Jul 2021 19:27:04 +0300 Subject: [PATCH 4/7] Fixing PEP, updtaing comments, removing debug scripts --- sdc/datatypes/indexes/__init__.py | 2 +- sdc/datatypes/indexes/multi_index_type.py | 21 --------- sdc/extensions/indexes/multi_index_ext.py | 21 +++++---- sdc/functions/tuple_utils.py | 13 +++--- sdc/hiframes/api.py | 3 +- sdc/tests/indexes/test_multi_index.py | 14 +++--- test_create_multiindex.py | 52 ----------------------- 7 files changed, 28 insertions(+), 98 deletions(-) delete mode 100644 test_create_multiindex.py diff --git a/sdc/datatypes/indexes/__init__.py b/sdc/datatypes/indexes/__init__.py index cae399e6d..121da0020 100644 --- a/sdc/datatypes/indexes/__init__.py +++ b/sdc/datatypes/indexes/__init__.py @@ -30,4 +30,4 @@ from .positional_index_type import PositionalIndexType from .empty_index_type import EmptyIndexType from .int64_index_type import Int64IndexType -from .multi_index_type import MultiIndexType \ No newline at end of file +from .multi_index_type import MultiIndexType diff --git a/sdc/datatypes/indexes/multi_index_type.py b/sdc/datatypes/indexes/multi_index_type.py index ddf3f9b5e..e195c20ba 100644 --- a/sdc/datatypes/indexes/multi_index_type.py +++ b/sdc/datatypes/indexes/multi_index_type.py @@ -111,24 +111,3 @@ def __init__(self, dmm, fe_type): make_attribute_wrapper(MultiIndexType, 'levels', '_levels') make_attribute_wrapper(MultiIndexType, 'codes', '_codes') make_attribute_wrapper(MultiIndexType, 'name', '_name') - - -#### FIXME: move below to one common place: - -# FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef -# we have to use our new SdcTypeRef to type objects created from types.Type -# (i.e. ConcurrentDict meta-type). This should be removed once it's fixed. -class SdcTypeRef(types.Dummy): - """Reference to a type. - Used when a type is passed as a value. - """ - def __init__(self, instance_type): - self.instance_type = instance_type - super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) - - -@register_model(SdcTypeRef) -class SdcTypeRefModel(models.OpaqueModel): - def __init__(self, dmm, fe_type): - - models.OpaqueModel.__init__(self, dmm, fe_type) diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py index 7fb497d48..f5b3a97b3 100644 --- a/sdc/extensions/indexes/multi_index_ext.py +++ b/sdc/extensions/indexes/multi_index_ext.py @@ -236,13 +236,13 @@ def _sdc_multi_index_ctor_typer(typing_ctx, *args): assert False, f"{_func_name}: argument copy is not supported, given: {args[5]}" # if ctor args provide list of levels names via name argument - # update type information for elements in ty_levels + # update type information for elements in ty_levels (so that levels are named indexes) name = args[6] if len(args) >= 6 and not args[6] is None else types.none if not isinstance(name, (types.NoneType, types.Omitted)): assert (isinstance(name, types.Tuple) - and all(map(lambda x: isinstance(x, (types.StringLiteral, types.UnicodeType, types.NoneType)), name)) - or isinstance(name, types.UniTuple) - and isinstance(name.dtype, (types.UnicodeType, types.NoneType))), \ + and all(map(lambda x: isinstance(x, (types.StringLiteral, types.UnicodeType, types.NoneType)), name)) + or isinstance(name, types.UniTuple) + and isinstance(name.dtype, (types.UnicodeType, types.NoneType))), \ f"{_func_name}: argument name must be tuple of strings, given: {args[6]}" assert len(name) == nlevels, \ f"{_func_name}: Length of names must match number of levels in MultiIndex, given: {args[6]}" @@ -260,7 +260,10 @@ def _sdc_multi_index_ctor_typer(typing_ctx, *args): return MultiIndexType(ty_levels, ty_codes, is_named=False) -### FIXME: this should not be generic SdcTypeRef, but specific type, such as MultiIndexTypeRef +# TO-DO: refactor: this allows SdcTypeRef to be callable and makes pd.MultiIndex.from_product +# work, but this typer handles only case when SdcTypeRef.instance_type is MultiIndexType +# but it may be reference to other type as well (e.g. ConcurrentDictType). Need differentiate +# SdcTypeRef-s for different types. @type_callable(SdcTypeRef) def typing_sdctyperef(context): typing_ctx = context @@ -273,7 +276,8 @@ def typer(levels, codes, sortorder=None, names=None, return typer -### FIXME: add comment explaining why it's needed +# FIXME_Numba#7111: low-level api is used as providing SdcTypeRef.__call__ allows numba +# find existing implementation (until above issue is fixed and @overload can be used) @infer_getattr class SdcTypeRefAttribute(AttributeTemplate): key = SdcTypeRef @@ -322,7 +326,7 @@ def pd_multi_index_ctor_impl(levels, codes, sortorder=None, names=None, raise ValueError("Must pass non-zero number of levels/codes") # if name is None then all level names are reset - if is_name_none == True: + if is_name_none == True: # noqa _names = sdc_tuple_map( lambda x: None, levels, @@ -911,7 +915,7 @@ def pd_multi_index_reindex_impl(self, target, method=None, level=None, limit=Non return pd_multi_index_reindex_impl -# FIXME: move to indexes_generic or build into index model? +# TO-DO: seems like this can be removed when indexes have map_positions property @register_jitable def _appender_build_map(index1, index2): res = {} @@ -1051,7 +1055,6 @@ def next_codes_array(stats, res_size): return np.array(list(np.repeat(codes_pattern, span_i)) * repeat_i) -### FIXME: can we re-use this in from_tuples? def factorize_level(level): pass diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py index 17245a871..9f354a593 100644 --- a/sdc/functions/tuple_utils.py +++ b/sdc/functions/tuple_utils.py @@ -86,8 +86,10 @@ def sdc_tuple_map(typingctx, func, data, *args): ret_type = types.Tuple(ret_tuple_types) ret_sig = ret_type(func, data, types.StarArgTuple.from_types(args)) - ### FIXME: this works with single overload for decorated function only - ### but this isn't necessary, just need to find out corresponding template + # codegen below uses first func template to get the dispatcher, so + # for now deny compilation for overloaded func-s that have multiple overloads + # (using the jitted function dispatcher as func will work anyway) + # TO-DO: improve and upstream to Numba if isinstance(func, types.Function): assert len(func.templates) == 1, "Function template has multiple overloads" @@ -109,7 +111,8 @@ def codegen(context, builder, sig, args): if isinstance(func, types.Dispatcher): py_func = func.dispatcher.py_func else: - # for function overloads get pyfunc from compiled impl + # for function overloads get pyfunc from compiled impl (this + # hardcodes the first available template) target_disp = func.templates[0](context.typing_context) py_func = target_disp._get_impl(call_sig.args, {})[0].py_func @@ -209,10 +212,10 @@ def sdc_tuple_unzip(typingctx, data_type): for x in data_type: assert isinstance(x, (types.Tuple, types.UniTuple)) and len(x) == len(data_type[0]), \ - f"{_func_name}: non-supported tuple elements types. {_given_args_str}" + f"{_func_name}: non-supported tuple elements types. {_given_args_str}" ty_firsts, ty_seconds = map(lambda x: types.Tuple.from_types(x), - zip(*data_type)) + zip(*data_type)) ret_type = types.Tuple([ty_firsts, ty_seconds]) def codegen(context, builder, sig, args): diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index 7ed5eb1f8..ccb4dc866 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -195,9 +195,8 @@ def fix_df_index_impl(index, coldata=None): return fix_df_index_impl - # elif isinstance(index, (RangeIndexType, Int64IndexType, EmptyIndexType, PositionalIndexType)): elif (isinstance(index, sdc_pandas_index_types) - and not isinstance(index, sdc_old_index_types)): ## MAJOR bug fix in a separate PR + and not isinstance(index, sdc_old_index_types)): def fix_df_index_impl(index, coldata=None): return index diff --git a/sdc/tests/indexes/test_multi_index.py b/sdc/tests/indexes/test_multi_index.py index 5159e00db..95a47b551 100644 --- a/sdc/tests/indexes/test_multi_index.py +++ b/sdc/tests/indexes/test_multi_index.py @@ -156,10 +156,8 @@ def test_impl(levels, codes, names): pd.Int64Index([5, 2, 1, 4, 3]), pd.Int64Index([5, 2, 1, 4, 3], name='bce'), ] - for data, names in product( - combinations(all_levels, 2), - combinations_with_replacement(test_global_index_names, 2) - ): + for data, names in product(combinations(all_levels, 2), + combinations_with_replacement(test_global_index_names, 2)): # all parameters are supported as tuples only in pd.MultiIndex ctor levels = tuple(data) @@ -562,7 +560,7 @@ def test_impl(index1, index2): n = 11 np.random.seed(0) base_index = _get_multi_index_base_index(n) - index1 = base_index[:n] + index1 = base_index[:n] size_range = np.arange(len(index1)) reindex_by = list(map( lambda x: base_index.take(x), @@ -572,7 +570,7 @@ def test_impl(index1, index2): np.random.choice(size_range, n, replace=False), # random unique values from index1 np.random.choice(np.arange(len(base_index)), n), # random values from larger set size_range[:n // 2], # shorter index - np.random.choice(size_range, 2*n), # longer index + np.random.choice(size_range, 2*n), # longer index ] )) @@ -665,7 +663,7 @@ def test_impl(levels): sdc_func = self.jit(test_impl) np.random.seed(0) - for data in _generate_multi_index_levels(): + for data in list(_generate_multi_index_levels())[:1]: # creating pd.MultiIndex is only supported with levels and codes as tuples levels = tuple(data) with self.subTest(levels=levels): @@ -680,7 +678,7 @@ def test_impl(data): n = 100 np.random.seed(0) - for index in _generate_multi_indexes_fixed(n): + for index in list(_generate_multi_indexes_fixed(n))[:1]: data = list(index.values) with self.subTest(data=data): result = sdc_func(data) diff --git a/test_create_multiindex.py b/test_create_multiindex.py deleted file mode 100644 index b13dddd9c..000000000 --- a/test_create_multiindex.py +++ /dev/null @@ -1,52 +0,0 @@ -import pandas as pd -import numpy as np - -### - - - -### Let's define use-cases first, what we actually need from multi-index -### is having it as DF columns! that is we need to support indexes -### that arise from groupby.agg method. - -def test_impl_1(df): - A = df.groupby('A').agg({'A': ['count', 'min', 'max'], - 'B': ['std', 'mean']}) - return A - -df = pd.DataFrame({ - 'A': [2, 1, 1, 1, 2, 2, 1], - 'B': [-8, 2, 3, 1, 5, 6, 7] -}) - -# print("df:", df) -# res = test_impl_1(df) -# print("res:", res) - - - -def test_impl_2(df): - A = df.groupby('A').agg([lambda x: x.max() - x.min(), lambda x: x.max() + x.min()]) - return A - -df = pd.DataFrame({ - 'A': [2, 1, 1, 1, 2, 2, 1], - 'B': [-8, 2, 3, 1, 5, 6, 7], - 'C': [-81, 21, 31, 11, 51, 61, 71] -}) - -# print("df:", df) -# res = test_impl_2(df) -# print("res:", res) - - -def test_impl_3(): - res = pd.MultiIndex( - levels=[np.array([1, 2]), np.array([3, 4])], - #levels=[["zero", "one"], ["x", "y"]], - codes=[[1, 1, 0, 0], [1, 0, 1, 0]] - ) - return res - -res = test_impl_3() -print("res:", res) From 565aa2c04327a8ac2a8b15147bf5e7d0d02a8e86 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 20 Jul 2021 19:53:48 +0300 Subject: [PATCH 5/7] Make separate classes for SdcTypeRef-s types --- sdc/datatypes/hpat_pandas_series_functions.py | 4 +- sdc/datatypes/sdc_typeref.py | 32 +++++++++------ sdc/extensions/indexes/multi_index_ext.py | 16 ++++---- sdc/extensions/sdc_hashmap_ext.py | 10 ++--- sdc/functions/tuple_utils.py | 41 ++----------------- sdc/sdc_autogenerated.py | 16 ++++---- sdc/sdc_function_templates.py | 2 +- sdc/tests/indexes/test_multi_index.py | 6 +-- 8 files changed, 49 insertions(+), 78 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 348cf2665..24089c5df 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -4574,7 +4574,7 @@ def _series_operator_add_str_impl(self, other): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_nan_mask = numpy.zeros(result_size, dtype=numpy.bool_) @@ -4692,7 +4692,7 @@ def _series_operator_mul_common_impl(self, other): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) str_series_operand = self if self_is_string_series == True else other # noqa str_series_indexer = left_indexer if self_is_string_series == True else right_indexer # noqa diff --git a/sdc/datatypes/sdc_typeref.py b/sdc/datatypes/sdc_typeref.py index 589105b89..fccbee249 100644 --- a/sdc/datatypes/sdc_typeref.py +++ b/sdc/datatypes/sdc_typeref.py @@ -37,21 +37,27 @@ # FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef # we have to use our new SdcTypeRef to type objects created from types.Type # (i.e. ConcurrentDict meta-type). This should be removed once it's fixed. -class SdcTypeRef(types.Dummy): - """Reference to a type. +def sdc_make_new_typeref_class(): + class SdcTypeRef(types.Dummy): + """Reference to a type. - Used when a type is passed as a value. - """ - def __init__(self, instance_type): - self.instance_type = instance_type - super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) + Used when a type is passed as a value. + """ + def __init__(self, instance_type): + self.instance_type = instance_type + super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) + @register_model(SdcTypeRef) + class SdcTypeRefModel(models.OpaqueModel): + def __init__(self, dmm, fe_type): -@register_model(SdcTypeRef) -class SdcTypeRefModel(models.OpaqueModel): - def __init__(self, dmm, fe_type): + models.OpaqueModel.__init__(self, dmm, fe_type) - models.OpaqueModel.__init__(self, dmm, fe_type) + return SdcTypeRef + + +ConcurrentDictTypeRef = sdc_make_new_typeref_class() +MultiIndexTypeRef = sdc_make_new_typeref_class() @typeof_impl.register(type) @@ -59,8 +65,8 @@ def mynew_typeof_type(val, c): """ This function is a workaround for """ if issubclass(val, ConcurrentDict): - return SdcTypeRef(ConcurrentDictType) + return ConcurrentDictTypeRef(ConcurrentDictType) elif issubclass(val, pd.MultiIndex): - return SdcTypeRef(MultiIndexType) + return MultiIndexTypeRef(MultiIndexType) else: return numba_typeof_type(val, c) diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py index f5b3a97b3..d81661975 100644 --- a/sdc/extensions/indexes/multi_index_ext.py +++ b/sdc/extensions/indexes/multi_index_ext.py @@ -59,7 +59,7 @@ from numba.core.extending import register_jitable from numba import literal_unroll from numba.typed import Dict, List -from sdc.datatypes.sdc_typeref import SdcTypeRef +from sdc.datatypes.sdc_typeref import MultiIndexTypeRef @typeof_impl.register(pd.MultiIndex) @@ -264,7 +264,7 @@ def _sdc_multi_index_ctor_typer(typing_ctx, *args): # work, but this typer handles only case when SdcTypeRef.instance_type is MultiIndexType # but it may be reference to other type as well (e.g. ConcurrentDictType). Need differentiate # SdcTypeRef-s for different types. -@type_callable(SdcTypeRef) +@type_callable(MultiIndexTypeRef) def typing_sdctyperef(context): typing_ctx = context @@ -276,11 +276,11 @@ def typer(levels, codes, sortorder=None, names=None, return typer -# FIXME_Numba#7111: low-level api is used as providing SdcTypeRef.__call__ allows numba -# find existing implementation (until above issue is fixed and @overload can be used) +# FIXME_Numba#7111: low-level api is used to implement typing and impl of MultiIndex ctor +# which is a workaround numba issue (once it's fixed @overload can be used instead) @infer_getattr class SdcTypeRefAttribute(AttributeTemplate): - key = SdcTypeRef + key = MultiIndexTypeRef def resolve___call__(self, instance): return type(instance) @@ -347,7 +347,7 @@ def pd_multi_index_ctor_impl(levels, codes, sortorder=None, names=None, return pd_multi_index_ctor_impl -@lower_builtin(SdcTypeRef, types.VarArg(types.Any)) +@lower_builtin(MultiIndexTypeRef, types.VarArg(types.Any)) def sdctyperef_call_impl(context, builder, sig, args): # FIXME: this hardcodes template number and selected dispatcher, refactor? @@ -1103,7 +1103,7 @@ def _make_level_unique_impl(index): return _make_level_unique_impl -@sdc_overload_method(SdcTypeRef, 'from_product', prefer_literal=False) +@sdc_overload_method(MultiIndexTypeRef, 'from_product', prefer_literal=False) def pd_multi_index_from_product_overload(cls, iterables, sortorder=None, names=None): if cls.instance_type is not MultiIndexType: return @@ -1229,7 +1229,7 @@ def codegen(context, builder, sig, args): return types.none(val, levels, codes, idx), codegen -@sdc_overload_method(SdcTypeRef, 'from_tuples', prefer_literal=False) +@sdc_overload_method(MultiIndexTypeRef, 'from_tuples', prefer_literal=False) def pd_multi_index_from_tuples_overload(cls, iterables): if cls.instance_type is not MultiIndexType: return diff --git a/sdc/extensions/sdc_hashmap_ext.py b/sdc/extensions/sdc_hashmap_ext.py index 54b8edaa3..d02840035 100644 --- a/sdc/extensions/sdc_hashmap_ext.py +++ b/sdc/extensions/sdc_hashmap_ext.py @@ -59,7 +59,7 @@ ConcDictItemsIterableType, ConcDictValuesIterableType) from numba.extending import register_jitable -from sdc.datatypes.sdc_typeref import SdcTypeRef +from sdc.datatypes.sdc_typeref import ConcurrentDictTypeRef from sdc.utilities.sdc_typing_utils import TypingError, TypeChecker, check_types_comparable from itertools import product @@ -357,13 +357,13 @@ def codegen(context, builder, sig, args): return dict_type(key, value), codegen -@overload_method(SdcTypeRef, 'empty') +@overload_method(ConcurrentDictTypeRef, 'empty') def concurrent_dict_empty(cls, key_type, value_type): if cls.instance_type is not ConcurrentDictType: return - _func_name = 'Method SdcTypeRef::empty().' + _func_name = 'Method ConcurrentDictTypeRef::empty().' ty_checker = TypeChecker(_func_name) supported_key_types = (types.NumberClass, types.TypeRef) @@ -850,7 +850,7 @@ def codegen(context, builder, sig, args): return dict_type(keys, values), codegen -@overload_method(SdcTypeRef, 'from_arrays') +@overload_method(ConcurrentDictTypeRef, 'from_arrays') def concurrent_dict_from_arrays_ovld(cls, keys, values): if cls.instance_type is not ConcurrentDictType: return @@ -867,7 +867,7 @@ def concurrent_dict_from_arrays_impl(cls, keys, values): return concurrent_dict_from_arrays_impl -@overload_method(SdcTypeRef, 'fromkeys', prefer_literal=False) +@overload_method(ConcurrentDictTypeRef, 'fromkeys', prefer_literal=False) def concurrent_dict_type_fromkeys_ovld(cls, keys, value): if cls.instance_type is not ConcurrentDictType: return diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py index 9f354a593..17dffa200 100644 --- a/sdc/functions/tuple_utils.py +++ b/sdc/functions/tuple_utils.py @@ -25,44 +25,9 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -import numba -import numpy as np -import operator -import pandas as pd - -from numba import types, prange -from numba.core import cgutils -from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, type_callable) -from numba.core.errors import TypingError -from numba.core.typing.templates import signature, AttributeTemplate, AbstractTemplate, infer_getattr -from numba.core.imputils import impl_ret_untracked, call_getiter, impl_ret_borrowed -from numba.core.imputils import (impl_ret_new_ref, impl_ret_borrowed, iternext_impl, RefType) -from numba.core.boxing import box_array, unbox_array, box_tuple - -import llvmlite.llvmpy.core as lc - -from sdc.datatypes.indexes import * -from sdc.utilities.sdc_typing_utils import SDCLimitation -from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral -from sdc.utilities.sdc_typing_utils import ( - TypeChecker, - check_signed_integer, - _check_dtype_param_type, - sdc_pandas_index_types, - check_types_comparable, - ) -from sdc.functions import numpy_like -from sdc.hiframes.api import fix_df_array, fix_df_index -from sdc.hiframes.boxing import _infer_index_type, _unbox_index_data -from sdc.datatypes.common_functions import hpat_arrays_append -from sdc.extensions.indexes.indexes_generic import * - -from sdc.datatypes.indexes.multi_index_type import MultiIndexIteratorType -from numba.core.extending import register_jitable -from numba import literal_unroll -from numba.typed import Dict, List -from sdc.str_arr_type import StringArrayType -from sdc.datatypes.sdc_typeref import SdcTypeRef +from numba import types +from numba.extending import (intrinsic, ) +from numba.core.typing.templates import (signature, ) @intrinsic diff --git a/sdc/sdc_autogenerated.py b/sdc/sdc_autogenerated.py index f701cf5fb..6137aaffb 100644 --- a/sdc/sdc_autogenerated.py +++ b/sdc/sdc_autogenerated.py @@ -89,7 +89,7 @@ def sdc_add_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -229,7 +229,7 @@ def sdc_div_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -369,7 +369,7 @@ def sdc_sub_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -509,7 +509,7 @@ def sdc_mul_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -649,7 +649,7 @@ def sdc_truediv_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -789,7 +789,7 @@ def sdc_floordiv_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -929,7 +929,7 @@ def sdc_mod_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -1069,7 +1069,7 @@ def sdc_pow_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) diff --git a/sdc/sdc_function_templates.py b/sdc/sdc_function_templates.py index 2f58cdeee..5b3355631 100644 --- a/sdc/sdc_function_templates.py +++ b/sdc/sdc_function_templates.py @@ -89,7 +89,7 @@ def sdc_binop_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) diff --git a/sdc/tests/indexes/test_multi_index.py b/sdc/tests/indexes/test_multi_index.py index 95a47b551..f168454fb 100644 --- a/sdc/tests/indexes/test_multi_index.py +++ b/sdc/tests/indexes/test_multi_index.py @@ -140,7 +140,7 @@ def test_impl(): def test_multi_index_create_param_names(self): # using keyword arguments in typeref ctor, is not supported due to limitation of __call__ overload, - # TO-DO: refactor this after @overload is supported for typerefs (see FIXME_Numba#XXXX): + # TO-DO: refactor this after @overload is supported for typerefs (see FIXME_Numba#7111): def test_impl(levels, codes, names): # return pd.MultiIndex(levels, codes, name=names) return pd.MultiIndex(levels, codes, None, None, None, False, names) @@ -663,7 +663,7 @@ def test_impl(levels): sdc_func = self.jit(test_impl) np.random.seed(0) - for data in list(_generate_multi_index_levels())[:1]: + for data in _generate_multi_index_levels(): # creating pd.MultiIndex is only supported with levels and codes as tuples levels = tuple(data) with self.subTest(levels=levels): @@ -678,7 +678,7 @@ def test_impl(data): n = 100 np.random.seed(0) - for index in list(_generate_multi_indexes_fixed(n))[:1]: + for index in _generate_multi_indexes_fixed(n): data = list(index.values) with self.subTest(data=data): result = sdc_func(data) From bfae1ac93a81155f179f6dbe8c8dd64a5bca8326 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Wed, 21 Jul 2021 18:39:56 +0300 Subject: [PATCH 6/7] Move helper functions to new file and a bit of renaming --- sdc/extensions/indexes/indexes_generic.py | 30 +- sdc/extensions/indexes/multi_index_ext.py | 355 ++---------------- sdc/extensions/indexes/multi_index_helpers.py | 317 ++++++++++++++++ 3 files changed, 371 insertions(+), 331 deletions(-) create mode 100644 sdc/extensions/indexes/multi_index_helpers.py diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py index 40f198b7c..02cf77b97 100644 --- a/sdc/extensions/indexes/indexes_generic.py +++ b/sdc/extensions/indexes/indexes_generic.py @@ -30,12 +30,12 @@ import pandas as pd from numba import types -from numba.typed import Dict +from numba.typed import Dict, List from numba.typed.typedobjectutils import _nonoptional from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types, sdc_old_index_types from sdc.datatypes.indexes import * -from sdc.utilities.utils import sdc_overload_method, sdc_overload +from sdc.utilities.utils import sdc_overload from sdc.utilities.sdc_typing_utils import ( find_index_common_dtype, sdc_indexes_wo_values_cache, @@ -364,3 +364,29 @@ def sdc_indexes_get_name_stub(index): # cannot rename string or float indexes, TO-DO: StringIndexType return None return sdc_indexes_get_name_stub + + +def sdc_indexes_build_map_positions(self): + pass + + +@sdc_overload(sdc_indexes_build_map_positions) +def sdc_indexes_build_map_positions_ovld(self): + + indexer_dtype = self.dtype + indexer_value_type = types.ListType(types.int64) + + def sdc_indexes_build_map_positions_impl(self): + indexer_map = Dict.empty(indexer_dtype, indexer_value_type) + for i in range(len(self)): + val = self[i] + index_list = indexer_map.get(val, None) + if index_list is None: + indexer_map[val] = List.empty_list(types.int64) + indexer_map[val].append(i) + else: + index_list.append(i) + + return indexer_map + + return sdc_indexes_build_map_positions_impl diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py index d81661975..64d410c8a 100644 --- a/sdc/extensions/indexes/multi_index_ext.py +++ b/sdc/extensions/indexes/multi_index_ext.py @@ -56,10 +56,21 @@ from sdc.extensions.indexes.indexes_generic import * from sdc.datatypes.indexes.multi_index_type import MultiIndexIteratorType -from numba.core.extending import register_jitable from numba import literal_unroll from numba.typed import Dict, List from sdc.datatypes.sdc_typeref import MultiIndexTypeRef +from sdc.extensions.indexes.multi_index_helpers import ( + _multi_index_binop_helper, + _multi_index_append_level, + _multi_index_create_level, + _multi_index_create_levels_and_codes, + _multi_index_alloc_level_dict, + _multi_index_from_tuples_helper, + cat_array_equal, + next_codes_info, + next_codes_array, + factorize_level, + ) @typeof_impl.register(pd.MultiIndex) @@ -348,9 +359,15 @@ def pd_multi_index_ctor_impl(levels, codes, sortorder=None, names=None, @lower_builtin(MultiIndexTypeRef, types.VarArg(types.Any)) -def sdctyperef_call_impl(context, builder, sig, args): - - # FIXME: this hardcodes template number and selected dispatcher, refactor? +def multi_index_typeref_call_impl(context, builder, sig, args): + + # FIXME_Numba#7111: this uses low-level API as a workaround for numba issue + # TO-DO: remove and use @overload(MultiIndexTypeRef), once issue is fixed + # and now we do the following: + # (1) lookup function type for the actual ctor (sdc_pandas_multi_index_ctor) + # (2) get compiled implementation for provided args (hardcodes 0 as selected overload template, + # i.e. we rely on the fact that sdc_pandas_multi_index_ctor was overloaded only once) + # (3) get the function descriptor from compiled result and emit the call to it call_sig = context.typing_context._resolve_user_function_type( sdc_pandas_multi_index_ctor, sig.args, @@ -595,80 +612,6 @@ def pd_multi_index_names_impl(self): return pd_multi_index_names_impl -# FIXME: move to a different file? -def cat_array_equal(A, codes_A, B, codes_B): - pass - - -@sdc_overload(cat_array_equal) -def sdc_cat_array_equal_overload(A, codes_A, B, codes_B): - - def sdc_cat_array_equal_impl(A, codes_A, B, codes_B): - if len(codes_A) != len(codes_B): - return False - - # FIXME_Numba#5157: change to simple A == B when issue is resolved - eq_res_size = len(codes_A) - eq_res = np.empty(eq_res_size, dtype=types.bool_) - for i in numba.prange(eq_res_size): - eq_res[i] = A[codes_A[i]] == B[codes_B[i]] - return np.all(eq_res) - - return sdc_cat_array_equal_impl - - -@intrinsic -def _multi_index_binop_helper(typingctx, self, other): - """ This function gets two multi_index objects each represented as - Tuple(levels) and Tuple(codes) and repacks these into Tuple of following - elements (self_level_0, self_codes_0, other_level_0, other_codes_0), etc - """ - - nlevels = len(self.levels) - if not len(self.levels) == len(other.levels): - assert True, "Cannot flatten MultiIndex of different nlevels" - - elements_types = zip(self.levels, self.codes, other.levels, other.codes) - ret_type = types.Tuple([types.Tuple.from_types(x) for x in elements_types]) - - def codegen(context, builder, sig, args): - self_val, other_val = args - - self_ctinfo = cgutils.create_struct_proxy(self)( - context, builder, value=self_val) - self_levels = self_ctinfo.levels - self_codes = self_ctinfo.codes - - other_ctinfo = cgutils.create_struct_proxy(other)( - context, builder, value=other_val) - other_levels = other_ctinfo.levels - other_codes = other_ctinfo.codes - - ret_tuples = [] - for i in range(nlevels): - self_level_i = builder.extract_value(self_levels, i) - self_codes_i = builder.extract_value(self_codes, i) - other_level_i = builder.extract_value(other_levels, i) - other_codes_i = builder.extract_value(other_codes, i) - - ret_tuples.append( - context.make_tuple(builder, - ret_type[i], - [self_level_i, self_codes_i, other_level_i, other_codes_i]) - ) - - if context.enable_nrt: - context.nrt.incref(builder, ret_type[i][0], self_level_i) - context.nrt.incref(builder, ret_type[i][1], self_codes_i) - context.nrt.incref(builder, ret_type[i][2], other_level_i) - context.nrt.incref(builder, ret_type[i][3], other_codes_i) - - res = context.make_tuple(builder, ret_type, ret_tuples) - return res - - return ret_type(self, other), codegen - - @sdc_overload_method(MultiIndexType, 'equals') def pd_multi_index_equals_overload(self, other): if not isinstance(self, MultiIndexType): @@ -701,33 +644,6 @@ def pd_multi_index_equals_impl(self, other): return pd_multi_index_equals_impl -# FIXME: move to another file? -def _build_index_map(self): - pass - - -@sdc_overload(_build_index_map) -def _build_index_map_ovld(self): - - indexer_dtype = self.dtype - indexer_value_type = types.ListType(types.int64) - - def _build_index_map(self): - indexer_map = Dict.empty(indexer_dtype, indexer_value_type) - for i in range(len(self)): - val = self[i] - index_list = indexer_map.get(val, None) - if index_list is None: - indexer_map[val] = List.empty_list(types.int64) - indexer_map[val].append(i) - else: - index_list.append(i) - - return indexer_map - - return _build_index_map - - @sdc_overload(operator.contains) def pd_multi_index_contains_overload(self, label): if not isinstance(self, MultiIndexType): @@ -742,7 +658,7 @@ def pd_multi_index_contains_overload(self, label): def pd_multi_index_contains_impl(self, label): # build indexer_map (should already been built in index ctor?) - indexer_map = _build_index_map(self) + indexer_map = sdc_indexes_build_map_positions(self) res = label in indexer_map return res @@ -915,53 +831,6 @@ def pd_multi_index_reindex_impl(self, target, method=None, level=None, limit=Non return pd_multi_index_reindex_impl -# TO-DO: seems like this can be removed when indexes have map_positions property -@register_jitable -def _appender_build_map(index1, index2): - res = {} - for i, val in enumerate(index1): - if val not in res: - res[val] = i - - k, count = i, len(res) - while k < i + len(index2): - val = index2[k - i] - if val not in res: - res[val] = count - count += 1 - k += 1 - - return res - - -def _multi_index_append_level(A, codes_A, B, codes_B): - pass - - -@sdc_overload(_multi_index_append_level) -def _multi_index_append_overload(A, codes_A, B, codes_B): - - def _multi_index_append_impl(A, codes_A, B, codes_B): - - appender_map = _appender_build_map(A, B) - res_size = len(codes_A) + len(codes_B) - res_level = fix_df_index( - list(appender_map.keys()) - ) - - res_codes = np.empty(res_size, dtype=np.int64) - A_size = len(codes_A) - for i in prange(res_size): - if i < A_size: - res_codes[i] = codes_A[i] - else: - res_codes[i] = appender_map[B[codes_B[i - A_size]]] - - return (res_level, res_codes) - - return _multi_index_append_impl - - @sdc_overload_method(MultiIndexType, 'append') def pd_multi_index_append_overload(self, other): if not isinstance(self, MultiIndexType): @@ -994,115 +863,6 @@ def pd_multi_index_append_impl(self, other): return pd_multi_index_append_impl -def _multi_index_create_level(index_data, name): - pass - - -@sdc_overload(_multi_index_create_level) -def _multi_index_create_level_ovld(index_data, name): - - def _multi_index_create_level_impl(index_data, name): - index = fix_df_index(index_data) - return sdc_indexes_rename(index, name) - return _multi_index_create_level_impl - - -def _multi_index_create_levels_and_codes(level_data, codes_data, name): - pass - - -@sdc_overload(_multi_index_create_levels_and_codes) -def _multi_index_create_levels_and_codes_ovld(level_data, codes_data, name): - - def _multi_index_create_levels_and_codes_impl(level_data, codes_data, name): - level_data_fixed = fix_df_index(level_data) - level = sdc_indexes_rename(level_data_fixed, name) - codes = fix_df_array(codes_data) - - # to avoid additional overload make data verification checks inplace - # these checks repeat those in MultiIndex::_verify_integrity - if len(codes) and np.max(codes) >= len(level): - raise ValueError( - "On one of the levels code max >= length of level. " - "NOTE: this index is in an inconsistent state" - ) - if len(codes) and np.min(codes) < -1: - raise ValueError( - "On one of the levels code value < -1") - - # TO-DO: support is_unique for all indexes and use it here - indexer_map = _build_index_map(level) - if len(level) != len(indexer_map): - raise ValueError("Level values must be unique") - - return (level, codes) - - return _multi_index_create_levels_and_codes_impl - - -@register_jitable -def next_codes_info(level_info, cumprod_list): - _, codes = level_info - cumprod_list.append(cumprod_list[-1] * len(codes)) - return codes, cumprod_list[-1] - - -@register_jitable -def next_codes_array(stats, res_size): - codes_pattern, factor = stats - span_i = res_size // factor # tiles whole array - repeat_i = res_size // (len(codes_pattern) * span_i) # repeats each element - return np.array(list(np.repeat(codes_pattern, span_i)) * repeat_i) - - -def factorize_level(level): - pass - - -@sdc_overload(factorize_level) -def factorize_level_ovld(level): - - level_dtype = level.dtype - - def factorize_level_impl(level): - unique_labels = List.empty_list(level_dtype) - res_size = len(level) - codes = np.empty(res_size, types.int64) - if not res_size: - return unique_labels, codes - - indexer_map = Dict.empty(level_dtype, types.int64) - for i in range(res_size): - val = level[i] - _code = indexer_map.get(val, -1) - if _code == -1: - new_code = len(unique_labels) - indexer_map[val] = new_code - unique_labels.append(val) - else: - new_code = _code - - codes[i] = new_code - - return unique_labels, codes - - return factorize_level_impl - - -def _make_level_unique(index): - pass - - -@sdc_overload(_make_level_unique) -def _make_level_unique_ovld(index): - - def _make_level_unique_impl(index): - indexer_map = _build_index_map(index) - return list(indexer_map.keys()) - - return _make_level_unique_impl - - @sdc_overload_method(MultiIndexTypeRef, 'from_product', prefer_literal=False) def pd_multi_index_from_product_overload(cls, iterables, sortorder=None, names=None): if cls.instance_type is not MultiIndexType: @@ -1166,69 +926,6 @@ def pd_multi_index_from_product_impl(cls, iterables, sortorder=None, names=None) return pd_multi_index_from_product_impl -def _make_level_dict(index): - pass - - -@sdc_overload(_make_level_dict) -def _make_level_dict_ovld(index): - - index_type = index - - def _make_level_dict_impl(index): - return Dict.empty(index_type, types.int64) - - return _make_level_dict_impl - - -def _multi_index_get_new_code(level, val): - - _code = level.get(val, -1) - if _code == -1: - res = len(level) - level[val] = res - else: - res = _code - - return types.int64(res) - - -def _multi_index_set_new_code(codes, new_code, i): - codes[i] = new_code - - -@intrinsic -def _multi_index_append_value(typingctx, val, levels, codes, idx): - - nlevels = len(val) - if not (nlevels == len(levels) and nlevels == len(codes)): - assert True, f"Cannot append MultiIndex value to existing codes/levels.\n" \ - f"Given: val={val}, levels={levels}, codes={codes}" - - def codegen(context, builder, sig, args): - index_val, levels_val, codes_val, idx_val = args - - for i in range(nlevels): - label = builder.extract_value(index_val, i) - level_i = builder.extract_value(levels_val, i) - codes_i = builder.extract_value(codes_val, i) - - new_code = context.compile_internal( - builder, - _multi_index_get_new_code, - signature(types.int64, levels[i], val[i]), - [level_i, label] - ) - context.compile_internal( - builder, - _multi_index_set_new_code, - signature(types.none, codes[i], types.int64, idx), - [codes_i, new_code, idx_val] - ) - - return types.none(val, levels, codes, idx), codegen - - @sdc_overload_method(MultiIndexTypeRef, 'from_tuples', prefer_literal=False) def pd_multi_index_from_tuples_overload(cls, iterables): if cls.instance_type is not MultiIndexType: @@ -1247,9 +944,10 @@ def pd_multi_index_type_from_tuples_impl(cls, iterables): if not index_size: raise TypeError("Cannot infer number of levels from empty list") + # use first value to infer types and allocate dicts for result multi index levels example_value = iterables[0] levels_dicts = sdc_tuple_map( - _make_level_dict, + _multi_index_alloc_level_dict, example_value ) index_codes = sdc_tuple_map( @@ -1258,9 +956,8 @@ def pd_multi_index_type_from_tuples_impl(cls, iterables): index_size ) - for i in range(index_size): - val = iterables[i] - _multi_index_append_value(val, levels_dicts, index_codes, i) + for i, val in enumerate(iterables): + _multi_index_from_tuples_helper(val, levels_dicts, index_codes, i) index_levels = sdc_tuple_map( lambda x: list(x.keys()), diff --git a/sdc/extensions/indexes/multi_index_helpers.py b/sdc/extensions/indexes/multi_index_helpers.py new file mode 100644 index 000000000..f335a36d9 --- /dev/null +++ b/sdc/extensions/indexes/multi_index_helpers.py @@ -0,0 +1,317 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np + +from numba import types, prange +from numba.core import cgutils +from numba.core.typing.templates import signature +from numba.core.extending import (intrinsic, register_jitable, ) +from numba.typed import Dict, List + +from sdc.utilities.utils import sdc_overload +from sdc.hiframes.api import fix_df_array, fix_df_index +from sdc.extensions.indexes.indexes_generic import ( + sdc_indexes_rename, + sdc_indexes_build_map_positions, + ) + + +def cat_array_equal(A, codes_A, B, codes_B): + pass + + +@sdc_overload(cat_array_equal) +def sdc_cat_array_equal_overload(A, codes_A, B, codes_B): + + def sdc_cat_array_equal_impl(A, codes_A, B, codes_B): + if len(codes_A) != len(codes_B): + return False + + # FIXME_Numba#5157: change to simple A == B when issue is resolved + eq_res_size = len(codes_A) + eq_res = np.empty(eq_res_size, dtype=types.bool_) + for i in numba.prange(eq_res_size): + eq_res[i] = A[codes_A[i]] == B[codes_B[i]] + return np.all(eq_res) + + return sdc_cat_array_equal_impl + + +@intrinsic +def _multi_index_binop_helper(typingctx, self, other): + """ This function gets two multi_index objects each represented as + Tuple(levels) and Tuple(codes) and repacks these into Tuple of following + elements (self_level_0, self_codes_0, other_level_0, other_codes_0), etc + """ + + nlevels = len(self.levels) + if not len(self.levels) == len(other.levels): + assert True, "Cannot flatten MultiIndex of different nlevels" + + elements_types = zip(self.levels, self.codes, other.levels, other.codes) + ret_type = types.Tuple([types.Tuple.from_types(x) for x in elements_types]) + + def codegen(context, builder, sig, args): + self_val, other_val = args + + self_ctinfo = cgutils.create_struct_proxy(self)( + context, builder, value=self_val) + self_levels = self_ctinfo.levels + self_codes = self_ctinfo.codes + + other_ctinfo = cgutils.create_struct_proxy(other)( + context, builder, value=other_val) + other_levels = other_ctinfo.levels + other_codes = other_ctinfo.codes + + ret_tuples = [] + for i in range(nlevels): + self_level_i = builder.extract_value(self_levels, i) + self_codes_i = builder.extract_value(self_codes, i) + other_level_i = builder.extract_value(other_levels, i) + other_codes_i = builder.extract_value(other_codes, i) + + ret_tuples.append( + context.make_tuple(builder, + ret_type[i], + [self_level_i, self_codes_i, other_level_i, other_codes_i]) + ) + + if context.enable_nrt: + context.nrt.incref(builder, ret_type[i][0], self_level_i) + context.nrt.incref(builder, ret_type[i][1], self_codes_i) + context.nrt.incref(builder, ret_type[i][2], other_level_i) + context.nrt.incref(builder, ret_type[i][3], other_codes_i) + + res = context.make_tuple(builder, ret_type, ret_tuples) + return res + + return ret_type(self, other), codegen + + +# TO-DO: seems like this can be refactored when indexes have cached map_positions property +@register_jitable +def _appender_build_map(index1, index2): + res = {} + for i, val in enumerate(index1): + if val not in res: + res[val] = i + + k, count = i, len(res) + while k < i + len(index2): + val = index2[k - i] + if val not in res: + res[val] = count + count += 1 + k += 1 + + return res + + +def _multi_index_append_level(A, codes_A, B, codes_B): + pass + + +@sdc_overload(_multi_index_append_level) +def _multi_index_append_level_overload(A, codes_A, B, codes_B): + + def _multi_index_append_level_impl(A, codes_A, B, codes_B): + + appender_map = _appender_build_map(A, B) + res_size = len(codes_A) + len(codes_B) + res_level = fix_df_index( + list(appender_map.keys()) + ) + + res_codes = np.empty(res_size, dtype=np.int64) + A_size = len(codes_A) + for i in prange(res_size): + if i < A_size: + res_codes[i] = codes_A[i] + else: + res_codes[i] = appender_map[B[codes_B[i - A_size]]] + + return (res_level, res_codes) + + return _multi_index_append_level_impl + + +def _multi_index_create_level(index_data, name): + pass + + +@sdc_overload(_multi_index_create_level) +def _multi_index_create_level_ovld(index_data, name): + + def _multi_index_create_level_impl(index_data, name): + index = fix_df_index(index_data) + return sdc_indexes_rename(index, name) + return _multi_index_create_level_impl + + +def _multi_index_create_levels_and_codes(level_data, codes_data, name): + pass + + +@sdc_overload(_multi_index_create_levels_and_codes) +def _multi_index_create_levels_and_codes_ovld(level_data, codes_data, name): + + def _multi_index_create_levels_and_codes_impl(level_data, codes_data, name): + level_data_fixed = fix_df_index(level_data) + level = sdc_indexes_rename(level_data_fixed, name) + codes = fix_df_array(codes_data) + + # to avoid additional overload make data verification checks inplace + # these checks repeat those in MultiIndex::_verify_integrity + if len(codes) and np.max(codes) >= len(level): + raise ValueError( + "On one of the levels code max >= length of level. " + "NOTE: this index is in an inconsistent state" + ) + if len(codes) and np.min(codes) < -1: + raise ValueError( + "On one of the levels code value < -1") + + # TO-DO: support is_unique for all indexes and use it here + indexer_map = sdc_indexes_build_map_positions(level) + if len(level) != len(indexer_map): + raise ValueError("Level values must be unique") + + return (level, codes) + + return _multi_index_create_levels_and_codes_impl + + +def factorize_level(level): + pass + + +@sdc_overload(factorize_level) +def factorize_level_ovld(level): + + level_dtype = level.dtype + + def factorize_level_impl(level): + unique_labels = List.empty_list(level_dtype) + res_size = len(level) + codes = np.empty(res_size, types.int64) + if not res_size: + return unique_labels, codes + + indexer_map = Dict.empty(level_dtype, types.int64) + for i in range(res_size): + val = level[i] + _code = indexer_map.get(val, -1) + if _code == -1: + new_code = len(unique_labels) + indexer_map[val] = new_code + unique_labels.append(val) + else: + new_code = _code + + codes[i] = new_code + + return unique_labels, codes + + return factorize_level_impl + + +@register_jitable +def next_codes_info(level_info, cumprod_list): + _, codes = level_info + cumprod_list.append(cumprod_list[-1] * len(codes)) + return codes, cumprod_list[-1] + + +@register_jitable +def next_codes_array(stats, res_size): + codes_pattern, factor = stats + span_i = res_size // factor # tiles whole array + repeat_i = res_size // (len(codes_pattern) * span_i) # repeats each element + return np.array(list(np.repeat(codes_pattern, span_i)) * repeat_i) + + +def _multi_index_alloc_level_dict(index): + pass + + +@sdc_overload(_multi_index_alloc_level_dict) +def _make_level_dict_ovld(index): + + index_type = index + + def _make_level_dict_impl(index): + return Dict.empty(index_type, types.int64) + + return _make_level_dict_impl + + +@intrinsic +def _multi_index_from_tuples_helper(typingctx, val, levels, codes, idx): + + nlevels = len(val) + if not (nlevels == len(levels) and nlevels == len(codes)): + assert True, f"Cannot append MultiIndex value to existing codes/levels.\n" \ + f"Given: val={val}, levels={levels}, codes={codes}" + + def _get_code_for_label(seen_labels, label): + + _code = seen_labels.get(label, -1) + if _code != -1: + return _code + + res = len(seen_labels) + seen_labels[label] = res + return types.int64(res) + + def _set_code_by_position(codes, new_code, i): + codes[i] = new_code + + def codegen(context, builder, sig, args): + index_val, levels_val, codes_val, idx_val = args + + for i in range(nlevels): + label = builder.extract_value(index_val, i) + level_i = builder.extract_value(levels_val, i) + codes_i = builder.extract_value(codes_val, i) + + new_code = context.compile_internal( + builder, + _get_code_for_label, + signature(types.int64, levels[i], val[i]), + [level_i, label] + ) + context.compile_internal( + builder, + _set_code_by_position, + signature(types.none, codes[i], types.int64, idx), + [codes_i, new_code, idx_val] + ) + + return types.none(val, levels, codes, idx), codegen From 7a90f6a23533b115f0331638aee99ccdf61473b9 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 27 Jul 2021 15:36:53 +0300 Subject: [PATCH 7/7] Using infer_global to type classes instead typeof(type) --- sdc/datatypes/indexes/multi_index_type.py | 2 -- sdc/datatypes/sdc_typeref.py | 17 ++++------------- sdc/extensions/indexes/multi_index_ext.py | 2 +- 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/sdc/datatypes/indexes/multi_index_type.py b/sdc/datatypes/indexes/multi_index_type.py index e195c20ba..bc9193325 100644 --- a/sdc/datatypes/indexes/multi_index_type.py +++ b/sdc/datatypes/indexes/multi_index_type.py @@ -30,9 +30,7 @@ models, register_model, make_attribute_wrapper, - typeof_impl, ) -from numba.core.typing.typeof import _typeof_type as numba_typeof_type class MultiIndexIteratorType(types.SimpleIteratorType): diff --git a/sdc/datatypes/sdc_typeref.py b/sdc/datatypes/sdc_typeref.py index fccbee249..ed936e690 100644 --- a/sdc/datatypes/sdc_typeref.py +++ b/sdc/datatypes/sdc_typeref.py @@ -27,8 +27,8 @@ import pandas as pd from numba.core import types -from numba.extending import (models, register_model, typeof_impl, ) -from numba.core.typing.typeof import _typeof_type as numba_typeof_type +from numba.extending import (models, register_model, ) +from numba.core.typing.templates import infer_global from sdc.extensions.sdc_hashmap_type import ConcurrentDict, ConcurrentDictType from sdc.datatypes.indexes import MultiIndexType @@ -59,14 +59,5 @@ def __init__(self, dmm, fe_type): ConcurrentDictTypeRef = sdc_make_new_typeref_class() MultiIndexTypeRef = sdc_make_new_typeref_class() - -@typeof_impl.register(type) -def mynew_typeof_type(val, c): - """ This function is a workaround for """ - - if issubclass(val, ConcurrentDict): - return ConcurrentDictTypeRef(ConcurrentDictType) - elif issubclass(val, pd.MultiIndex): - return MultiIndexTypeRef(MultiIndexType) - else: - return numba_typeof_type(val, c) +infer_global(ConcurrentDict, ConcurrentDictTypeRef(ConcurrentDictType)) +infer_global(pd.MultiIndex, MultiIndexTypeRef(MultiIndexType)) diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py index 64d410c8a..8943b9e6a 100644 --- a/sdc/extensions/indexes/multi_index_ext.py +++ b/sdc/extensions/indexes/multi_index_ext.py @@ -367,7 +367,7 @@ def multi_index_typeref_call_impl(context, builder, sig, args): # (1) lookup function type for the actual ctor (sdc_pandas_multi_index_ctor) # (2) get compiled implementation for provided args (hardcodes 0 as selected overload template, # i.e. we rely on the fact that sdc_pandas_multi_index_ctor was overloaded only once) - # (3) get the function descriptor from compiled result and emit the call to it + # (3) get the function descriptor from compiled result and emit the call to it call_sig = context.typing_context._resolve_user_function_type( sdc_pandas_multi_index_ctor, sig.args,