From ab14b317a994e0e8f9b8a82349edccc95086a1f4 Mon Sep 17 00:00:00 2001
From: Alexey Kozlov
Date: Thu, 29 Jul 2021 14:36:49 +0300
Subject: [PATCH 1/2] Add support for standalone use of pandas MultiIndex
 (#980)

* Initial version of pandas MultiIndex class

* Fixing tests

* Removing debug traces and reorganizing code

* Fixing PEP, updating comments, removing debug scripts

* Make separate SdcTypeRef classes for different types

* Move helper functions to new file and a bit of renaming

* Using infer_global to type classes instead of typeof(type)
---
 sdc/__init__.py                               |   1 +
 sdc/datatypes/hpat_pandas_series_functions.py |   4 +-
 sdc/datatypes/indexes/__init__.py             |   1 +
 sdc/datatypes/indexes/multi_index_type.py     | 111 ++
 sdc/datatypes/sdc_typeref.py                  |  63 ++
 sdc/extensions/indexes/indexes_generic.py     | 118 ++-
 sdc/extensions/indexes/multi_index_ext.py     | 973 ++++++++++++++++++
 sdc/extensions/indexes/multi_index_helpers.py | 317 ++++++
 sdc/extensions/sdc_hashmap_ext.py             |  10 +-
 sdc/extensions/sdc_hashmap_type.py            |  43 +-
 sdc/functions/tuple_utils.py                  | 207 ++++
 sdc/hiframes/api.py                           |   8 +-
 sdc/sdc_autogenerated.py                      |  16 +-
 sdc/sdc_function_templates.py                 |   2 +-
 sdc/tests/indexes/__init__.py                 |   1 +
 sdc/tests/indexes/index_datagens.py           |  90 +-
 sdc/tests/indexes/test_multi_index.py         | 690 +++++++++++++
 sdc/tests/test_compile_time.py                |   2 +-
 sdc/tests/test_utils.py                       |  13 +
 sdc/utilities/sdc_typing_utils.py             |   5 +
 20 files changed, 2609 insertions(+), 66 deletions(-)
 create mode 100644 sdc/datatypes/indexes/multi_index_type.py
 create mode 100644 sdc/datatypes/sdc_typeref.py
 create mode 100644 sdc/extensions/indexes/multi_index_ext.py
 create mode 100644 sdc/extensions/indexes/multi_index_helpers.py
 create mode 100644 sdc/functions/tuple_utils.py
 create mode 100644 sdc/tests/indexes/test_multi_index.py

diff --git a/sdc/__init__.py b/sdc/__init__.py
index 76c29ae97..e73c51682 100644
--- a/sdc/__init__.py
+++ b/sdc/__init__.py
@@ -49,6 +49,7 @@
 
 import sdc.extensions.indexes.range_index_ext
 import sdc.extensions.indexes.int64_index_ext
+import sdc.extensions.indexes.multi_index_ext
 
 import sdc.extensions.sdc_hashmap_ext
 
diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py
index 348cf2665..24089c5df 100644
--- a/sdc/datatypes/hpat_pandas_series_functions.py
+++ b/sdc/datatypes/hpat_pandas_series_functions.py
@@ -4574,7 +4574,7 @@ def _series_operator_add_str_impl(self, other):
         else:
             indexes_join_res = sdc_indexes_join_outer(left_index, right_index)
 
-        # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed
+        # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed
         joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res)
         result_size = len(joined_index)
         result_nan_mask = numpy.zeros(result_size, dtype=numpy.bool_)
@@ -4692,7 +4692,7 @@ def _series_operator_mul_common_impl(self, other):
         else:
             indexes_join_res = sdc_indexes_join_outer(left_index, right_index)
 
-        # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed
+        # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed
         joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res)
         str_series_operand = self if self_is_string_series == True else other  # noqa
         str_series_indexer = left_indexer if self_is_string_series == True else right_indexer  # noqa
diff --git a/sdc/datatypes/indexes/__init__.py b/sdc/datatypes/indexes/__init__.py
index 52d144708..121da0020 100644
--- a/sdc/datatypes/indexes/__init__.py
+++ b/sdc/datatypes/indexes/__init__.py @@ -30,3 +30,4 @@ from .positional_index_type import PositionalIndexType from .empty_index_type import EmptyIndexType from .int64_index_type import Int64IndexType +from .multi_index_type import MultiIndexType diff --git a/sdc/datatypes/indexes/multi_index_type.py b/sdc/datatypes/indexes/multi_index_type.py new file mode 100644 index 000000000..bc9193325 --- /dev/null +++ b/sdc/datatypes/indexes/multi_index_type.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper, +) + + +class MultiIndexIteratorType(types.SimpleIteratorType): + def __init__(self, iterable): + self.parent = iterable + yield_type = iterable.dtype + name = "iter[{}->{}],{}".format( + iterable, yield_type, iterable.name + ) + super(MultiIndexIteratorType, self).__init__(name, yield_type) + + +@register_model(MultiIndexIteratorType) +class MultiIndexIterModel(models.StructModel): + def __init__(self, dmm, fe_type): + members = [ + ('parent', fe_type.parent), # reference to the index object + ('state', types.CPointer(types.int64)), # iterator state (i.e. 
counter) + ] + super(MultiIndexIterModel, self).__init__(dmm, fe_type, members) + + +class MultiIndexType(types.IterableType): + + def __init__(self, levels, codes, is_named=False): + self.levels = levels + self.codes = codes + self.is_named = is_named + super(MultiIndexType, self).__init__( + name='MultiIndexType({}, {}, {})'.format(levels, codes, is_named)) + + @property + def iterator_type(self): + return MultiIndexIteratorType(self).iterator_type + + @property + def dtype(self): + nlevels = len(self.levels) + levels_types = [self.levels.dtype] * nlevels if isinstance(self.levels, types.UniTuple) else self.levels + return types.Tuple.from_types([level.dtype for level in levels_types]) + + @property + def nlevels(self): + return len(self.levels) + + @property + def levels_types(self): + if isinstance(self.levels, types.UniTuple): + return [self.levels.dtype] * self.levels.count + + return self.levels + + @property + def codes_types(self): + if isinstance(self.codes, types.UniTuple): + return [self.codes.dtype] * self.codes.count + + return self.codes + + +@register_model(MultiIndexType) +class MultiIndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + levels_type = fe_type.levels + codes_type = fe_type.codes + name_type = types.unicode_type if fe_type.is_named else types.none # TO-DO: change to types.Optional + members = [ + ('levels', levels_type), + ('codes', codes_type), + ('name', name_type), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(MultiIndexType, 'levels', '_levels') +make_attribute_wrapper(MultiIndexType, 'codes', '_codes') +make_attribute_wrapper(MultiIndexType, 'name', '_name') diff --git a/sdc/datatypes/sdc_typeref.py b/sdc/datatypes/sdc_typeref.py new file mode 100644 index 000000000..ed936e690 --- /dev/null +++ b/sdc/datatypes/sdc_typeref.py @@ -0,0 +1,63 @@ +# ***************************************************************************** +# Copyright (c) 2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import pandas as pd + +from numba.core import types +from numba.extending import (models, register_model, ) +from numba.core.typing.templates import infer_global + +from sdc.extensions.sdc_hashmap_type import ConcurrentDict, ConcurrentDictType +from sdc.datatypes.indexes import MultiIndexType + + +# FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef +# we have to use our new SdcTypeRef to type objects created from types.Type +# (i.e. ConcurrentDict meta-type). This should be removed once it's fixed. +def sdc_make_new_typeref_class(): + class SdcTypeRef(types.Dummy): + """Reference to a type. + + Used when a type is passed as a value. + """ + def __init__(self, instance_type): + self.instance_type = instance_type + super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) + + @register_model(SdcTypeRef) + class SdcTypeRefModel(models.OpaqueModel): + def __init__(self, dmm, fe_type): + + models.OpaqueModel.__init__(self, dmm, fe_type) + + return SdcTypeRef + + +ConcurrentDictTypeRef = sdc_make_new_typeref_class() +MultiIndexTypeRef = sdc_make_new_typeref_class() + +infer_global(ConcurrentDict, ConcurrentDictTypeRef(ConcurrentDictType)) +infer_global(pd.MultiIndex, MultiIndexTypeRef(MultiIndexType)) diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py index 3462067cc..02cf77b97 100644 --- a/sdc/extensions/indexes/indexes_generic.py +++ b/sdc/extensions/indexes/indexes_generic.py @@ -30,12 +30,12 @@ import pandas as pd from numba import types -from numba.typed import Dict +from numba.typed import Dict, List from numba.typed.typedobjectutils import _nonoptional from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types, sdc_old_index_types from sdc.datatypes.indexes import * -from sdc.utilities.utils import sdc_overload_method, sdc_overload +from sdc.utilities.utils import sdc_overload from sdc.utilities.sdc_typing_utils import ( find_index_common_dtype, sdc_indexes_wo_values_cache, @@ -96,7 +96,9 @@ def sdc_indexes_operator_eq_ovld(self, other): # TO-DO: this is for numeric indexes only now, extend to string-index when it's added use_self_values = isinstance(self, sdc_pandas_index_types) and not isinstance(self, types.Array) use_other_values = isinstance(other, sdc_pandas_index_types) and not isinstance(other, types.Array) - one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) + + one_operand_is_scalar = (isinstance(other, sdc_pandas_index_types) and self is other.dtype + or isinstance(self, sdc_pandas_index_types) and other is self.dtype) def sdc_indexes_operator_eq_impl(self, other): @@ -217,8 +219,8 @@ def pd_fix_indexes_join_overload(joined, indexer1, indexer2): """ Wraps pandas index.join() into new function that returns indexers as arrays and not optional(array) """ # This function is simply a workaround for problem with parfor lowering - # broken by indexers typed as types.Optional(Array) - FIXME_Numba#XXXX: remove it - # in all places whne parfor issue is fixed + # broken by indexers typed as types.Optional(Array) - FIXME_Numba#6686: remove it + # in all places when parfor issue is fixed def pd_fix_indexes_join_impl(joined, indexer1, indexer2): if indexer1 is not None: _indexer1 = _nonoptional(indexer1) @@ -282,3 +284,109 @@ def sdc_np_array_overload(A): if isinstance(A, Int64IndexType): return lambda A: A._data + + +def sdc_indexes_take(self, target): + pass + + 
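+# NOTE: the empty stub above follows the usual SDC pattern: the python function
+# body is never executed directly, while the @sdc_overload below supplies the
+# typed implementation used from jitted code.
+# Hypothetical usage sketch (names are illustrative only):
+#     taken = sdc_indexes_take(some_index, np.asarray([0, 2, 1]))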
+@sdc_overload(sdc_indexes_take)
+def pd_fix_indexes_take_overload(self, indexes):
+    """ Simple workaround for index types that lack the take() method, needed
+    because StringArrayType is still one of the supported index types """
+
+    if not isinstance(self, sdc_pandas_index_types):
+        return None
+
+    index_api_supported = not isinstance(self, sdc_old_index_types)
+
+    def pd_fix_indexes_take_impl(self, indexes):
+
+        if index_api_supported == True:  # noqa
+            res = self.take(indexes)
+        else:
+            res = numpy_like.take(self, indexes)
+
+        return res
+
+    return pd_fix_indexes_take_impl
+
+
+def sdc_indexes_rename(index, name):
+    pass
+
+
+@sdc_overload(sdc_indexes_rename)
+def sdc_index_rename_ovld(index, name):
+
+    if not isinstance(index, sdc_pandas_index_types):
+        return None
+
+    if isinstance(index, sdc_old_index_types):
+        def sdc_indexes_rename_stub(index, name):
+            # cannot rename string or float indexes, TO-DO: StringIndexType
+            return index
+        return sdc_indexes_rename_stub
+
+    if isinstance(index, PositionalIndexType):
+        from sdc.extensions.indexes.positional_index_ext import init_positional_index
+
+        def sdc_indexes_rename_impl(index, name):
+            return init_positional_index(len(index), name)
+        return sdc_indexes_rename_impl
+
+    elif isinstance(index, RangeIndexType):
+        def sdc_indexes_rename_impl(index, name):
+            return pd.RangeIndex(index.start, index.stop, index.step, name=name)
+        return sdc_indexes_rename_impl
+
+    elif isinstance(index, Int64IndexType):
+        def sdc_indexes_rename_impl(index, name):
+            return pd.Int64Index(index, name=name)
+        return sdc_indexes_rename_impl
+
+
+def sdc_indexes_get_name(index):
+    pass
+
+
+@sdc_overload(sdc_indexes_get_name)
+def sdc_indexes_get_name_ovld(index):
+
+    if (isinstance(index, sdc_pandas_index_types)
+            and not isinstance(index, sdc_old_index_types)):
+        def sdc_indexes_get_name_impl(index):
+            return index.name
+        return sdc_indexes_get_name_impl
+
+    def sdc_indexes_get_name_stub(index):
+        # old-style indexes carry no name attribute, TO-DO: StringIndexType
+        return None
+    return sdc_indexes_get_name_stub
+
+
+def sdc_indexes_build_map_positions(self):
+    pass
+
+
+@sdc_overload(sdc_indexes_build_map_positions)
+def sdc_indexes_build_map_positions_ovld(self):
+
+    indexer_dtype = self.dtype
+    indexer_value_type = types.ListType(types.int64)
+
+    def sdc_indexes_build_map_positions_impl(self):
+        indexer_map = Dict.empty(indexer_dtype, indexer_value_type)
+        for i in range(len(self)):
+            val = self[i]
+            index_list = indexer_map.get(val, None)
+            if index_list is None:
+                indexer_map[val] = List.empty_list(types.int64)
+                indexer_map[val].append(i)
+            else:
+                index_list.append(i)
+
+        return indexer_map
+
+    return sdc_indexes_build_map_positions_impl
diff --git a/sdc/extensions/indexes/multi_index_ext.py b/sdc/extensions/indexes/multi_index_ext.py
new file mode 100644
index 000000000..8943b9e6a
--- /dev/null
+++ b/sdc/extensions/indexes/multi_index_ext.py
@@ -0,0 +1,973 @@
+# -*- coding: utf-8 -*-
+# *****************************************************************************
+# Copyright (c) 2019-2021, Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import operator +import pandas as pd + +from numba import types, prange +from numba.core import cgutils +from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, type_callable) +from numba.core.errors import TypingError +from numba.core.typing.templates import signature, AttributeTemplate, infer_getattr +from numba.core.imputils import (impl_ret_borrowed, iternext_impl, RefType) +from numba.core.boxing import unbox_array, box_tuple + +import llvmlite.llvmpy.core as lc + +from sdc.datatypes.indexes import * +from sdc.utilities.sdc_typing_utils import SDCLimitation +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral +from sdc.utilities.sdc_typing_utils import ( + TypeChecker, + sdc_pandas_index_types, + sdc_pandas_df_column_types, + check_types_comparable, + ) +from sdc.functions import numpy_like +from sdc.functions.tuple_utils import sdc_tuple_map, sdc_tuple_map_elementwise, sdc_tuple_unzip +from sdc.hiframes.api import fix_df_array, fix_df_index +from sdc.hiframes.boxing import _infer_index_type, _unbox_index_data +from sdc.extensions.indexes.indexes_generic import * + +from sdc.datatypes.indexes.multi_index_type import MultiIndexIteratorType +from numba import literal_unroll +from numba.typed import Dict, List +from sdc.datatypes.sdc_typeref import MultiIndexTypeRef +from sdc.extensions.indexes.multi_index_helpers import ( + _multi_index_binop_helper, + _multi_index_append_level, + _multi_index_create_level, + _multi_index_create_levels_and_codes, + _multi_index_alloc_level_dict, + _multi_index_from_tuples_helper, + cat_array_equal, + next_codes_info, + next_codes_array, + factorize_level, + ) + + +@typeof_impl.register(pd.MultiIndex) +def typeof_multi_index(val, c): + levels = tuple(_infer_index_type(x) for x in val.levels) + codes = tuple(numba.typeof(x) for x in val.codes) # note this produces readonly array(int8, 1d, C) + is_named = val.name is not None + + return MultiIndexType(types.Tuple.from_types(levels), + types.Tuple.from_types(codes), + is_named=is_named) + + +@box(MultiIndexType) +def box_multi_index(typ, val, c): + + mod_name = c.context.insert_const_string(c.builder.module, "pandas") + pd_class_obj = c.pyapi.import_module_noblock(mod_name) + + multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder, val) + + py_levels = box_tuple(typ.levels, multi_index.levels, c) + 
py_codes = box_tuple(typ.codes, multi_index.codes, c)
+
+    # dtype and copy params are not stored so use default values
+    dtype = c.pyapi.make_none()
+    copy = c.pyapi.bool_from_bool(
+        c.context.get_constant(types.bool_, False)
+    )
+    sortorder = c.pyapi.make_none()
+
+    if typ.is_named:
+        name = c.pyapi.from_native_value(types.unicode_type, multi_index.name)
+    else:
+        name = c.pyapi.make_none()
+
+    # build MultiIndex names from names of boxed levels (if python level has name attribute)
+    # TO-DO: refactor this to use native indexes names when all indexes have it (e.g. StringIndexType)
+    nlevels = len(typ.levels)
+    py_nlevels = c.pyapi.tuple_size(py_levels)
+    py_names = c.pyapi.list_new(py_nlevels)
+    for i in range(nlevels):
+        level_type = typ.levels[i]
+        if isinstance(level_type, sdc_old_index_types):
+            py_level_name = c.pyapi.make_none()
+        else:
+            py_level_obj = c.pyapi.tuple_getitem(py_levels, i)
+            py_level_name = c.pyapi.object_getattr_string(py_level_obj, 'name')
+        c.pyapi.list_setitem(py_names, c.context.get_constant(types.intp, i), py_level_name)
+        # FIXME: check decref is needed for py_level_obj?
+
+    res = c.pyapi.call_method(pd_class_obj, "MultiIndex",
+                              (py_levels, py_codes, sortorder, py_names, dtype, copy, name))
+
+    c.pyapi.decref(py_levels)
+    c.pyapi.decref(py_codes)
+    c.pyapi.decref(sortorder)
+    c.pyapi.decref(py_names)
+    c.pyapi.decref(dtype)
+    c.pyapi.decref(copy)
+    c.pyapi.decref(name)
+    c.pyapi.decref(pd_class_obj)
+    return res
+
+
+@unbox(MultiIndexType)
+def unbox_multi_index(typ, val, c):
+
+    nlevels = len(typ.levels)
+    levels_types = typ.levels_types
+    codes_types = typ.codes_types
+    multi_index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
+
+    py_levels_data = c.pyapi.object_getattr_string(val, "levels")
+    native_levels_data = []
+    for i in range(nlevels):
+        idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i))
+        level_data = c.pyapi.object_getitem(py_levels_data, idx)
+        native_levels_data.append(
+            _unbox_index_data(levels_types[i], level_data, c).value
+        )
+        c.pyapi.decref(level_data)
+    c.pyapi.decref(py_levels_data)
+    multi_index.levels = c.context.make_tuple(c.builder, typ.levels, native_levels_data)
+
+    py_codes_data = c.pyapi.object_getattr_string(val, "codes")
+    native_codes_data = []
+    for i in range(nlevels):
+        idx = c.pyapi.long_from_ulonglong(c.context.get_constant(types.int64, i))
+        code_data = c.pyapi.object_getitem(py_codes_data, idx)
+        native_codes_data.append(
+            unbox_array(codes_types[i], code_data, c).value
+        )
+        c.pyapi.decref(code_data)
+    c.pyapi.decref(py_codes_data)
+    multi_index.codes = c.context.make_tuple(c.builder, typ.codes, native_codes_data)
+
+    if typ.is_named:
+        name_obj = c.pyapi.object_getattr_string(val, "name")
+        multi_index.name = numba.cpython.unicode.unbox_unicode_str(
+            types.unicode_type, name_obj, c).value
+        c.pyapi.decref(name_obj)
+
+    is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred())
+    return NativeValue(multi_index._getvalue(), is_error=is_error)
+
+
+@intrinsic
+def init_multi_index(typingctx, levels, codes):
+
+    if not (isinstance(levels, (types.Tuple, types.UniTuple)) and
+            isinstance(codes, (types.Tuple, types.UniTuple))):
+        return None
+
+    def is_valid_level_type(typ):
+        return isinstance(typ, sdc_pandas_index_types)
+
+    def is_valid_code_type(typ):
+        return (isinstance(typ, types.Array) and isinstance(typ.dtype, types.Integer))
+
+    if not all(map(is_valid_level_type, levels)):
+        return None
+
+    if not all(map(is_valid_code_type, codes)):
+        return None
+
+    def codegen(context,
builder, sig, args):
+        levels_val, codes_val = args
+        # create multi_index struct and store values
+        multi_index = cgutils.create_struct_proxy(
+            sig.return_type)(context, builder)
+
+        multi_index.levels = levels_val
+        multi_index.codes = codes_val
+        multi_index.name = context.get_dummy_value()
+
+        if context.enable_nrt:
+            context.nrt.incref(builder, sig.args[0], levels_val)
+            context.nrt.incref(builder, sig.args[1], codes_val)
+
+        return multi_index._getvalue()
+
+    ret_typ = MultiIndexType(levels, codes, is_named=False)  # pandas ctor always creates unnamed indexes
+    sig = signature(ret_typ, levels, codes)
+    return sig, codegen
+
+
+def _sdc_multi_index_ctor_typer(typing_ctx, *args):
+
+    _func_name = '_sdc_multi_index_ctor_typer'
+    # this types subsequent call to sdc_pandas_multi_index_ctor function with signature:
+    # args = (levels, codes, sortorder=None, names=None, dtype=None, copy=False, name=None)
+
+    assert len(args) >= 2, f"{_func_name}: Expecting 2 or more positional args, given: {args}"
+
+    levels, codes = args[:2]
+    if not (isinstance(levels, (types.Tuple, types.UniTuple))
+            and isinstance(codes, (types.Tuple, types.UniTuple))):
+        raise TypingError(f"{_func_name}: levels and codes args must be tuples, given: levels={levels}, codes={codes}")
+
+    nlevels = len(levels)
+    ty_codes = types.Tuple.from_types(
+        [typing_ctx._resolve_user_function_type(
+            fix_df_array, (typ,), {}).return_type for typ in codes]
+    )
+
+    if len(args) >= 3 and not (isinstance(args[2], (types.NoneType, types.Omitted)) or args[2] is None):
+        assert False, f"{_func_name}: argument sortorder is not supported, given: {args[2]}"
+    if len(args) >= 4 and not (isinstance(args[3], (types.NoneType, types.Omitted)) or args[3] is None):
+        assert False, f"{_func_name}: argument names is not supported, given: {args[3]}"
+    if len(args) >= 5 and not (isinstance(args[4], (types.NoneType, types.Omitted)) or args[4] is None):
+        assert False, f"{_func_name}: argument dtype is not supported, given: {args[4]}"
+    if len(args) >= 6 and not (isinstance(args[5], (types.Boolean, types.Omitted)) or args[5] is False):
+        assert False, f"{_func_name}: argument copy is not supported, given: {args[5]}"
+
+    # if ctor args provide list of levels names via name argument
+    # update type information for elements in ty_levels (so that levels are named indexes)
+    name = args[6] if len(args) >= 7 and not args[6] is None else types.none
+    if not isinstance(name, (types.NoneType, types.Omitted)):
+        assert (isinstance(name, types.Tuple)
+                and all(map(lambda x: isinstance(x, (types.StringLiteral, types.UnicodeType, types.NoneType)), name))
+                or isinstance(name, types.UniTuple)
+                and isinstance(name.dtype, (types.UnicodeType, types.NoneType))), \
+            f"{_func_name}: argument name must be tuple of strings, given: {args[6]}"
+        assert len(name) == nlevels, \
+            f"{_func_name}: Length of names must match number of levels in MultiIndex, given: {args[6]}"
+
+        ty_levels = types.Tuple.from_types(
+            [typing_ctx._resolve_user_function_type(
+                _multi_index_create_level, (t1, t2), {}).return_type for t1, t2 in zip(levels, name)]
+        )
+    else:
+        ty_levels = types.Tuple.from_types(
+            [typing_ctx._resolve_user_function_type(
+                _multi_index_create_level, (typ, types.none), {}).return_type for typ in levels]
+        )
+
+    return MultiIndexType(ty_levels, ty_codes, is_named=False)
+
+
+# TO-DO: refactor: this allows SdcTypeRef to be callable and makes pd.MultiIndex.from_product
+# work, but this typer handles only the case when SdcTypeRef.instance_type is MultiIndexType
+# but it may be
reference to other type as well (e.g. ConcurrentDictType). Need to differentiate
+# SdcTypeRef-s for different types.
+@type_callable(MultiIndexTypeRef)
+def typing_sdctyperef(context):
+    typing_ctx = context
+
+    def typer(levels, codes, sortorder=None, names=None,
+              dtype=None, copy=False, name=None):
+        return _sdc_multi_index_ctor_typer(typing_ctx, levels, codes, sortorder,
+                                           names, dtype, copy, name)
+
+    return typer
+
+
+# FIXME_Numba#7111: low-level api is used to implement typing and impl of MultiIndex ctor
+# which is a workaround for a numba issue (once it's fixed @overload can be used instead)
+@infer_getattr
+class SdcTypeRefAttribute(AttributeTemplate):
+    key = MultiIndexTypeRef
+
+    def resolve___call__(self, instance):
+        return type(instance)
+
+
+def sdc_pandas_multi_index_ctor(levels, codes, sortorder=None, names=None,
+                                dtype=None, copy=False, name=None):
+    pass
+
+
+@sdc_overload(sdc_pandas_multi_index_ctor)
+def pd_multi_index_overload(levels, codes, sortorder=None, names=None,
+                            dtype=None, copy=False, name=None):
+
+    _func_name = 'pd.MultiIndex().'
+    ty_checker = TypeChecker(_func_name)
+
+    if not (isinstance(sortorder, (types.Omitted, types.NoneType)) or sortorder is None):
+        raise TypingError('{} Unsupported parameters. Given sortorder: {}'.format(_func_name, sortorder))
+
+    if not (isinstance(names, (types.Omitted, types.NoneType)) or names is None):
+        raise TypingError('{} Unsupported parameters. Given names: {}'.format(_func_name, names))
+
+    if not (isinstance(dtype, (types.Omitted, types.NoneType)) or dtype is None):
+        raise TypingError('{} Unsupported parameters. Given dtype: {}'.format(_func_name, dtype))
+
+    if not (isinstance(copy, (types.Omitted, types.Boolean, types.BooleanLiteral)) or copy is False):
+        raise TypingError('{} Unsupported parameters. Given copy: {}'.format(_func_name, copy))
+
+    accepted_index_names = (types.NoneType, types.StringLiteral, types.UnicodeType)
+    is_name_none = name is None or isinstance(name, (types.NoneType, types.Omitted))
+    if not (isinstance(name, (types.Tuple, types.UniTuple))
+            and all(map(lambda x: isinstance(x, accepted_index_names), name))
+            or is_name_none):
+        ty_checker.raise_exc(name, 'tuple of strings/nones or none', 'name')
+
+    def pd_multi_index_ctor_impl(levels, codes, sortorder=None, names=None,
+                                 dtype=None, copy=False, name=None):
+
+        if len(levels) != len(codes):
+            raise ValueError("Length of levels and codes must be the same.")
+        if len(levels) == 0:
+            raise ValueError("Must pass non-zero number of levels/codes")
+
+        # if name is None then all level names are reset
+        if is_name_none == True:  # noqa
+            _names = sdc_tuple_map(
+                lambda x: None,
+                levels,
+            )
+        else:
+            _names = name
+
+        levels_and_codes_pairs = sdc_tuple_map_elementwise(
+            _multi_index_create_levels_and_codes,
+            levels,
+            codes,
+            _names
+        )
+
+        _levels, _codes = sdc_tuple_unzip(levels_and_codes_pairs)
+        return init_multi_index(_levels, _codes)
+
+    return pd_multi_index_ctor_impl
+
+
+@lower_builtin(MultiIndexTypeRef, types.VarArg(types.Any))
+def multi_index_typeref_call_impl(context, builder, sig, args):
+
+    # FIXME_Numba#7111: this uses low-level API as a workaround for numba issue
+    # TO-DO: remove and use @overload(MultiIndexTypeRef), once issue is fixed
+    # and now we do the following:
+    # (1) lookup function type for the actual ctor (sdc_pandas_multi_index_ctor)
+    # (2) get compiled implementation for provided args (hardcodes 0 as selected overload template,
+    # i.e.
we rely on the fact that sdc_pandas_multi_index_ctor was overloaded only once) + # (3) get the function descriptor from compiled result and emit the call to it + call_sig = context.typing_context._resolve_user_function_type( + sdc_pandas_multi_index_ctor, + sig.args, + {} + ) + fnty = context.typing_context._lookup_global(sdc_pandas_multi_index_ctor) + disp = fnty.templates[0](context.typing_context)._get_impl(call_sig.args, {}) + cres = disp[0].get_compile_result(call_sig) + + res = context.call_internal( + builder, + cres.fndesc, + sig, + args + ) + + return impl_ret_borrowed(context, builder, sig.return_type, res) + + +@sdc_overload(len) +def pd_multi_index_len_overload(self): + if not isinstance(self, MultiIndexType): + return None + + def pd_multi_index_len_impl(self): + return len(self._codes[0]) + + return pd_multi_index_len_impl + + +@intrinsic +def _multi_index_getitem_impl(typingctx, self, idx): + if not isinstance(self, MultiIndexType): + return None + + nlevels = self.nlevels + levels_types = self.levels_types + codes_types = self.codes_types + ret_type = types.Tuple.from_types([index.dtype for index in levels_types]) + + def codegen(context, builder, sig, args): + self_val, idx_val = args + self_ctinfo = context.make_helper(builder, self, self_val) + + res_elements = [] + for level_index in range(nlevels): + level = builder.extract_value(self_ctinfo.levels, level_index) + code = builder.extract_value(self_ctinfo.codes, level_index) + element = context.compile_internal( + builder, + lambda index, code, i: index[code[i]], + signature(levels_types[level_index].dtype, levels_types[level_index], codes_types[level_index], idx), + [level, code, idx_val] + ) + res_elements.append(element) + + return context.make_tuple(builder, ret_type, res_elements) + + return ret_type(self, idx), codegen + + +@sdc_overload(operator.getitem) +def pd_multi_index_getitem_overload(self, idx): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Operator getitem().' 
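+    # supported idx kinds mirror the branches below: a scalar integer yields a single
+    # tuple of per-level values, while slice, boolean mask and integer array each
+    # produce a new MultiIndex (integer arrays are routed through take())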
+    ty_checker = TypeChecker(_func_name)
+
+    if not (isinstance(idx, (types.Integer, types.SliceType))
+            or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))):
+        ty_checker.raise_exc(idx, 'integer, slice, integer array or list', 'idx')
+
+    if isinstance(idx, types.Integer):
+        def pd_multi_index_getitem_idx_scalar_impl(self, idx):
+            index_len = len(self)
+            # FIXME_Numba#5801: Numba type unification rules make this float
+            idx = types.int64((index_len + idx) if idx < 0 else idx)
+            if (idx < 0 or idx >= index_len):
+                raise IndexError("MultiIndex.getitem: index is out of bounds")
+
+            return _multi_index_getitem_impl(self, idx)
+
+        return pd_multi_index_getitem_idx_scalar_impl
+
+    elif isinstance(idx, types.SliceType):
+        def pd_multi_index_getitem_idx_slice_impl(self, idx):
+
+            new_levels = self._levels
+            new_codes = sdc_tuple_map(
+                lambda arr_codes, taken_idxs: arr_codes[taken_idxs],
+                self._codes,
+                idx
+            )
+            return pd.MultiIndex(new_levels, new_codes)
+
+        return pd_multi_index_getitem_idx_slice_impl
+
+    elif isinstance(idx, types.Array) and isinstance(idx.dtype, types.Boolean):
+        def pd_multi_index_getitem_idx_bool_array_impl(self, idx):
+
+            new_levels = self._levels
+            new_codes = sdc_tuple_map(
+                lambda arr_codes, taken_idxs: numpy_like.getitem_by_mask(arr_codes, taken_idxs),
+                self._codes,
+                idx
+            )
+            return pd.MultiIndex(new_levels, new_codes)
+
+        return pd_multi_index_getitem_idx_bool_array_impl
+
+    elif isinstance(idx, types.Array) and isinstance(idx.dtype, types.Integer):
+        def pd_multi_index_getitem_as_take_impl(self, idx):
+            return self.take(idx)
+
+        return pd_multi_index_getitem_as_take_impl
+
+
+@sdc_overload_attribute(MultiIndexType, 'values')
+def pd_multi_index_values_overload(self):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    # FIXME: we return a list for now, as there's no arrays of tuples in numba, nor other
+    # sequence container that is boxed to dtype=object numpy array. TO-DO: replace with other type?
+    def pd_multi_index_values_impl(self):
+        res = []
+        for i in range(len(self)):
+            res.append(self[i])
+        return res
+
+    return pd_multi_index_values_impl
+
+
+@sdc_overload_attribute(MultiIndexType, 'dtype')
+def pd_multi_index_dtype_overload(self):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    mindex_dtype = self.dtype
+
+    def pd_multi_index_dtype_impl(self):
+        return mindex_dtype
+
+    return pd_multi_index_dtype_impl
+
+
+@sdc_overload_attribute(MultiIndexType, 'levels')
+def pd_multi_index_levels_overload(self):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    def pd_multi_index_levels_impl(self):
+        return self._levels
+
+    return pd_multi_index_levels_impl
+
+
+@sdc_overload_attribute(MultiIndexType, 'codes')
+def pd_multi_index_codes_overload(self):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    def pd_multi_index_codes_impl(self):
+        return self._codes
+
+    return pd_multi_index_codes_impl
+
+
+@sdc_overload_method(MultiIndexType, 'take')
+def pd_multi_index_take_overload(self, indexes):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    _func_name = 'Method take().'
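+    # take() reuses the levels tuple as-is and applies positional selection to each
+    # codes array via sdc_indexes_take, so no level data is copied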
+    ty_checker = TypeChecker(_func_name)
+
+    valid_indexes_types = (types.Array, types.List, types.ListType) + sdc_pandas_index_types
+    if not (isinstance(indexes, valid_indexes_types)
+            and isinstance(indexes.dtype, (types.Integer, types.ListType))):
+        ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes')
+
+    def pd_multi_index_take_impl(self, indexes):
+        new_levels = self._levels
+        new_codes = sdc_tuple_map(
+            lambda idx, taken_idxs: sdc_indexes_take(idx, taken_idxs),
+            self._codes,
+            indexes)
+        return pd.MultiIndex(new_levels, new_codes)
+
+    return pd_multi_index_take_impl
+
+
+@sdc_overload_attribute(MultiIndexType, 'nlevels')
+def pd_multi_index_nlevels_overload(self):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    nlevels_value = len(self.levels)
+
+    def pd_multi_index_nlevels_impl(self):
+        return nlevels_value
+
+    return pd_multi_index_nlevels_impl
+
+
+@sdc_overload_attribute(MultiIndexType, 'name')
+def pd_multi_index_name_overload(self):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    is_named_index = self.is_named
+
+    def pd_multi_index_name_impl(self):
+        if is_named_index == True:  # noqa
+            return self._name
+        else:
+            return None
+
+    return pd_multi_index_name_impl
+
+
+@sdc_overload_attribute(MultiIndexType, 'names')
+def pd_multi_index_names_overload(self):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    def pd_multi_index_names_impl(self):
+        levels_names = sdc_tuple_map(
+            lambda x: x.name,
+            self._levels
+        )
+
+        # this exploits an undesired side-effect of literal_unroll - type-unification
+        # of the resulting list dtype to types.Optional(types.unicode_type), since
+        # using a typed.List of Optional values currently fails to compile
+        res = []
+        for i in literal_unroll(levels_names):
+            res.append(i)
+        return res
+
+    return pd_multi_index_names_impl
+
+
+@sdc_overload_method(MultiIndexType, 'equals')
+def pd_multi_index_equals_overload(self, other):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    _func_name = 'Method equals().'
+    ty_checker = TypeChecker(_func_name)
+
+    if not (isinstance(other, MultiIndexType) and self.dtype is other.dtype):
+        ty_checker.raise_exc(other, 'pandas MultiIndex', 'other')
+
+    def pd_multi_index_equals_impl(self, other):
+
+        if self.nlevels != other.nlevels:
+            return False
+
+        self_and_other_data = _multi_index_binop_helper(self, other)
+        tup_levels_cmp_res = sdc_tuple_map(
+            lambda x: cat_array_equal(*x),
+            self_and_other_data,
+        )
+
+        # np.all is not supported for Tuples and below compiles a bit faster
+        # than 'np.all(np.array(list(tup_levels_cmp_res)))'
+        for cmp_res in tup_levels_cmp_res:
+            if not cmp_res:
+                return False
+        return True
+
+    return pd_multi_index_equals_impl
+
+
+@sdc_overload(operator.contains)
+def pd_multi_index_contains_overload(self, label):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    _func_name = 'Method contains().'
+    ty_checker = TypeChecker(_func_name)
+
+    if not (isinstance(label, (types.Tuple, types.UniTuple)) and self.dtype is label):
+        ty_checker.raise_exc(label, 'tuple of level values', 'label')
+
+    def pd_multi_index_contains_impl(self, label):
+
+        # build indexer_map (should already be built in the index ctor?)
+        indexer_map = sdc_indexes_build_map_positions(self)
+        res = label in indexer_map
+        return res
+
+    return pd_multi_index_contains_impl
+
+
+@sdc_overload(operator.eq)
+def pd_multi_index_eq_overload(self, other):
+
+    _func_name = 'Operator eq.'
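+    # eq supports the three call patterns handled below: index == index (elementwise),
+    # index == scalar tuple and scalar tuple == index; the result is a boolean
+    # sequence of the operands' common length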
+ + self_is_multi_index = isinstance(self, MultiIndexType) + other_is_multi_index = isinstance(other, MultiIndexType) + both_are_multi_indexes = self_is_multi_index and other_is_multi_index + if not (both_are_multi_indexes and check_types_comparable(self, other) + or (self_is_multi_index and other is getattr(self, 'dtype', types.none)) + or (self is getattr(other, 'dtype', types.none) and other_is_multi_index)): + raise TypingError('{} Not allowed for non comparable types. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_multi_index_eq_impl(self, other): + + if both_are_multi_indexes == True: # noqa + self_size = len(self) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if self.nlevels != other.nlevels: + res = np.zeros(self_size, dtype=types.bool_) + else: + res = np.empty(self_size, dtype=types.bool_) + for i in prange(self_size): + res[i] = self[i] == other[i] + + elif self_is_multi_index == True: # noqa + self_size = len(self) + res = np.empty(self_size, dtype=types.bool_) + for i in prange(self_size): + res[i] = self[i] == other + + else: + other_size = len(other) + res = np.empty(other_size, dtype=types.bool_) + for i in prange(other_size): + res[i] = self == other[i] + + return list(res) # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed + + return pd_multi_index_eq_impl + + +@sdc_overload_method(MultiIndexType, 'ravel') +def pd_multi_index_ravel_overload(self, order='C'): + if not isinstance(self, MultiIndexType): + return None + + _func_name = 'Method ravel().' + + if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): + raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order)) + + def pd_multi_index_ravel_impl(self, order='C'): + # np.ravel argument order is not supported in Numba + if order != 'C': + raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") + + return self.values + + return pd_multi_index_ravel_impl + + +@sdc_overload(operator.ne) +def pd_multi_index_ne_overload(self, other): + + _func_name = 'Operator ne.' + + self_is_multi_index = isinstance(self, MultiIndexType) + other_is_multi_index = isinstance(other, MultiIndexType) + both_are_multi_indexes = self_is_multi_index and other_is_multi_index + if not (both_are_multi_indexes and check_types_comparable(self, other) + or (self_is_multi_index and other is getattr(self, 'dtype', types.none)) + or (self is getattr(other, 'dtype', types.none) and other_is_multi_index)): + raise TypingError('{} Not allowed for non comparable types. 
\
+        Given: self={}, other={}'.format(_func_name, self, other))
+
+    def pd_multi_index_ne_impl(self, other):
+
+        eq_res = np.asarray(self == other)  # FIXME_Numba#5157: remove np.asarray and return as list
+        return list(~eq_res)
+
+    return pd_multi_index_ne_impl
+
+
+@lower_builtin(operator.is_, MultiIndexType, MultiIndexType)
+def pd_multi_index_is_overload(context, builder, sig, args):
+
+    ty_lhs, ty_rhs = sig.args
+    if ty_lhs != ty_rhs:
+        return cgutils.false_bit
+
+    lhs, rhs = args
+    lhs_ptr = builder.ptrtoint(lhs.operands[0], cgutils.intp_t)
+    rhs_ptr = builder.ptrtoint(rhs.operands[0], cgutils.intp_t)
+    return builder.icmp_signed('==', lhs_ptr, rhs_ptr)
+
+
+@lower_builtin('getiter', MultiIndexType)
+def impl_multi_index_getiter(context, builder, sig, args):
+    index_type, = sig.args
+    index_val, = args
+
+    it = context.make_helper(builder, index_type.iterator_type)
+    it.parent = index_val
+    zero = context.get_constant(types.intp, 0)
+    it.state = cgutils.alloca_once_value(builder, zero)
+
+    res = it._getvalue()
+    return impl_ret_borrowed(context, builder, index_type.iterator_type, res)
+
+
+@lower_builtin('iternext', MultiIndexIteratorType)
+@iternext_impl(RefType.BORROWED)
+def impl_iterator_iternext(context, builder, sig, args, result):
+    iter_type, = sig.args
+    iter_val, = args
+
+    index_type = iter_type.parent
+    it = context.make_helper(builder, iter_type, iter_val)
+
+    nitems = context.compile_internal(
+        builder,
+        lambda index: len(index),
+        signature(types.int64, index_type),
+        [it.parent]
+    )
+
+    index = builder.load(it.state)
+    is_valid = builder.icmp(lc.ICMP_SLT, index, nitems)
+    result.set_valid(is_valid)
+
+    with builder.if_then(is_valid):
+        element = context.compile_internal(
+            builder,
+            lambda index, i: index[i],
+            signature(index_type.dtype, index_type, types.int64),
+            [it.parent, index]
+        )
+        result.yield_(element)
+        nindex = cgutils.increment_index(builder, index)
+        builder.store(nindex, it.state)
+
+
+@sdc_overload_method(MultiIndexType, 'reindex')
+def pd_multi_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    _func_name = 'Method reindex().'
+    if not isinstance(target, sdc_pandas_index_types):
+        raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}")
+
+    if not check_types_comparable(self, target):
+        raise TypingError('{} Not allowed for non comparable indexes. \
+        Given: self={}, target={}'.format(_func_name, self, target))
+
+    # TO-DO: check why compilation time is more than 10 seconds
+    def pd_multi_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None):
+        return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance)
+
+    return pd_multi_index_reindex_impl
+
+
+@sdc_overload_method(MultiIndexType, 'append')
+def pd_multi_index_append_overload(self, other):
+    if not isinstance(self, MultiIndexType):
+        return None
+
+    _func_name = 'Method append().'
+    ty_checker = TypeChecker(_func_name)
+
+    if not (isinstance(other, MultiIndexType)):
+        ty_checker.raise_exc(other, 'pandas MultiIndex', 'other')
+
+    if not check_types_comparable(self, other):
+        raise TypingError('{} Not allowed for non comparable indexes.
\ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_multi_index_append_impl(self, other): + + self_and_other_data = _multi_index_binop_helper(self, other) + tup_append_level_res = sdc_tuple_map( + lambda x: _multi_index_append_level(*x), + self_and_other_data + ) + + new_levels, new_codes = sdc_tuple_unzip(tup_append_level_res) + return pd.MultiIndex( + levels=new_levels, + codes=new_codes + ) + + return pd_multi_index_append_impl + + +@sdc_overload_method(MultiIndexTypeRef, 'from_product', prefer_literal=False) +def pd_multi_index_from_product_overload(cls, iterables, sortorder=None, names=None): + if cls.instance_type is not MultiIndexType: + return + + _func_name = f'Method from_product()' + valid_levels_data_types = sdc_pandas_index_types + sdc_pandas_df_column_types + (types.List, types.ListType) + ty_checker = TypeChecker(_func_name) + if not (isinstance(iterables, (types.List, types.ListType, types.UniTuple)) + and isinstance(iterables.dtype, valid_levels_data_types) + or isinstance(iterables, types.Tuple) + and all(map(lambda x: isinstance(x, valid_levels_data_types), iterables))): + ty_checker.raise_exc(iterables, 'list or tuple of tuples ', 'iterables') + + if not (isinstance(sortorder, (types.Omitted, types.NoneType)) or sortorder is None): + raise TypingError('{} Unsupported parameters. Given sortorder: {}'.format(_func_name, sortorder)) + + if not (isinstance(names, (types.Omitted, types.NoneType)) or names is None): + raise TypingError('{} Unsupported parameters. Given names: {}'.format(_func_name, names)) + + def pd_multi_index_from_product_impl(cls, iterables, sortorder=None, names=None): + + # TO-DO: support indexes.unique() method and use it here + levels_factorized = sdc_tuple_map( + factorize_level, + iterables + ) + + levels_names = sdc_tuple_map( + sdc_indexes_get_name, + iterables + ) + + index_levels = sdc_tuple_map( + lambda x: fix_df_index(list(x[0])), + levels_factorized + ) + + temp_cumprod_sizes = [1, ] + codes_info = sdc_tuple_map( + next_codes_info, + levels_factorized, + temp_cumprod_sizes + ) + + res_index_size = temp_cumprod_sizes[-1] + index_codes = sdc_tuple_map( + next_codes_array, + codes_info, + res_index_size + ) + + res = sdc_pandas_multi_index_ctor( + index_levels, + index_codes, + name=levels_names + ) + + return res + + return pd_multi_index_from_product_impl + + +@sdc_overload_method(MultiIndexTypeRef, 'from_tuples', prefer_literal=False) +def pd_multi_index_from_tuples_overload(cls, iterables): + if cls.instance_type is not MultiIndexType: + return + + _func_name = f'Method from_tuples()' + ty_checker = TypeChecker(_func_name) + + if not (isinstance(iterables, (types.List, types.ListType)) + and isinstance(iterables.dtype, (types.Tuple, types.UniTuple))): + ty_checker.raise_exc(iterables, f'list of tuples', 'iterables') + + def pd_multi_index_type_from_tuples_impl(cls, iterables): + + index_size = len(iterables) + if not index_size: + raise TypeError("Cannot infer number of levels from empty list") + + # use first value to infer types and allocate dicts for result multi index levels + example_value = iterables[0] + levels_dicts = sdc_tuple_map( + _multi_index_alloc_level_dict, + example_value + ) + index_codes = sdc_tuple_map( + lambda _, size: np.empty(size, dtype=types.int64), + example_value, + index_size + ) + + for i, val in enumerate(iterables): + _multi_index_from_tuples_helper(val, levels_dicts, index_codes, i) + + index_levels = sdc_tuple_map( + lambda x: list(x.keys()), + levels_dicts + ) + + res = pd.MultiIndex( 
+ levels=index_levels, + codes=index_codes, + ) + return res + + return pd_multi_index_type_from_tuples_impl diff --git a/sdc/extensions/indexes/multi_index_helpers.py b/sdc/extensions/indexes/multi_index_helpers.py new file mode 100644 index 000000000..f335a36d9 --- /dev/null +++ b/sdc/extensions/indexes/multi_index_helpers.py @@ -0,0 +1,317 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# *****************************************************************************
+
+import numba
+import numpy as np
+
+from numba import types, prange
+from numba.core import cgutils
+from numba.core.typing.templates import signature
+from numba.core.extending import (intrinsic, register_jitable, )
+from numba.typed import Dict, List
+
+from sdc.utilities.utils import sdc_overload
+from sdc.hiframes.api import fix_df_array, fix_df_index
+from sdc.extensions.indexes.indexes_generic import (
+    sdc_indexes_rename,
+    sdc_indexes_build_map_positions,
+    )
+
+
+def cat_array_equal(A, codes_A, B, codes_B):
+    pass
+
+
+@sdc_overload(cat_array_equal)
+def sdc_cat_array_equal_overload(A, codes_A, B, codes_B):
+
+    def sdc_cat_array_equal_impl(A, codes_A, B, codes_B):
+        if len(codes_A) != len(codes_B):
+            return False
+
+        # FIXME_Numba#5157: change to simple A == B when issue is resolved
+        eq_res_size = len(codes_A)
+        eq_res = np.empty(eq_res_size, dtype=types.bool_)
+        for i in numba.prange(eq_res_size):
+            eq_res[i] = A[codes_A[i]] == B[codes_B[i]]
+        return np.all(eq_res)
+
+    return sdc_cat_array_equal_impl
+
+
+@intrinsic
+def _multi_index_binop_helper(typingctx, self, other):
+    """ This function gets two multi_index objects each represented as
+    Tuple(levels) and Tuple(codes) and repacks these into Tuple of following
+    elements (self_level_0, self_codes_0, other_level_0, other_codes_0), etc
+    """
+
+    nlevels = len(self.levels)
+    assert len(self.levels) == len(other.levels), \
+        "Cannot flatten MultiIndex of different nlevels"
+
+    elements_types = zip(self.levels, self.codes, other.levels, other.codes)
+    ret_type = types.Tuple([types.Tuple.from_types(x) for x in elements_types])
+
+    def codegen(context, builder, sig, args):
+        self_val, other_val = args
+
+        self_ctinfo = cgutils.create_struct_proxy(self)(
+            context, builder, value=self_val)
+        self_levels = self_ctinfo.levels
+        self_codes = self_ctinfo.codes
+
+        other_ctinfo = cgutils.create_struct_proxy(other)(
+            context, builder, value=other_val)
+        other_levels = other_ctinfo.levels
+        other_codes = other_ctinfo.codes
+
+        ret_tuples = []
+        for i in range(nlevels):
+            self_level_i = builder.extract_value(self_levels, i)
+            self_codes_i = builder.extract_value(self_codes, i)
+            other_level_i = builder.extract_value(other_levels, i)
+            other_codes_i = builder.extract_value(other_codes, i)
+
+            ret_tuples.append(
+                context.make_tuple(builder,
+                                   ret_type[i],
+                                   [self_level_i, self_codes_i, other_level_i, other_codes_i])
+            )
+
+            if context.enable_nrt:
+                context.nrt.incref(builder, ret_type[i][0], self_level_i)
+                context.nrt.incref(builder, ret_type[i][1], self_codes_i)
+                context.nrt.incref(builder, ret_type[i][2], other_level_i)
+                context.nrt.incref(builder, ret_type[i][3], other_codes_i)
+
+        res = context.make_tuple(builder, ret_type, ret_tuples)
+        return res
+
+    return ret_type(self, other), codegen
+
+
+# TO-DO: seems like this can be refactored when indexes have cached map_positions property
+@register_jitable
+def _appender_build_map(index1, index2):
+    res = {}
+    for i, val in enumerate(index1):
+        if val not in res:
+            res[val] = i
+
+    k, count = i, len(res)
+    while k < i + len(index2):
+        val = index2[k - i]
+        if val not in res:
+            res[val] = count
+            count += 1
+        k += 1
+
+    return res
+
+
+def _multi_index_append_level(A, codes_A, B, codes_B):
+    pass
+
+
+@sdc_overload(_multi_index_append_level)
+def _multi_index_append_level_overload(A, codes_A, B, codes_B):
+
+    def _multi_index_append_level_impl(A, codes_A, B, codes_B):
+
+        appender_map =
_appender_build_map(A, B)
+        res_size = len(codes_A) + len(codes_B)
+        res_level = fix_df_index(
+            list(appender_map.keys())
+        )
+
+        res_codes = np.empty(res_size, dtype=np.int64)
+        A_size = len(codes_A)
+        for i in prange(res_size):
+            if i < A_size:
+                res_codes[i] = codes_A[i]
+            else:
+                res_codes[i] = appender_map[B[codes_B[i - A_size]]]
+
+        return (res_level, res_codes)
+
+    return _multi_index_append_level_impl
+
+
+def _multi_index_create_level(index_data, name):
+    pass
+
+
+@sdc_overload(_multi_index_create_level)
+def _multi_index_create_level_ovld(index_data, name):
+
+    def _multi_index_create_level_impl(index_data, name):
+        index = fix_df_index(index_data)
+        return sdc_indexes_rename(index, name)
+    return _multi_index_create_level_impl
+
+
+def _multi_index_create_levels_and_codes(level_data, codes_data, name):
+    pass
+
+
+@sdc_overload(_multi_index_create_levels_and_codes)
+def _multi_index_create_levels_and_codes_ovld(level_data, codes_data, name):
+
+    def _multi_index_create_levels_and_codes_impl(level_data, codes_data, name):
+        level_data_fixed = fix_df_index(level_data)
+        level = sdc_indexes_rename(level_data_fixed, name)
+        codes = fix_df_array(codes_data)
+
+        # to avoid additional overload make data verification checks inplace
+        # these checks repeat those in MultiIndex::_verify_integrity
+        if len(codes) and np.max(codes) >= len(level):
+            raise ValueError(
+                "On one of the levels code max >= length of level. "
+                "NOTE: this index is in an inconsistent state"
+            )
+        if len(codes) and np.min(codes) < -1:
+            raise ValueError(
+                "On one of the levels code value < -1")
+
+        # TO-DO: support is_unique for all indexes and use it here
+        indexer_map = sdc_indexes_build_map_positions(level)
+        if len(level) != len(indexer_map):
+            raise ValueError("Level values must be unique")
+
+        return (level, codes)
+
+    return _multi_index_create_levels_and_codes_impl
+
+
+def factorize_level(level):
+    pass
+
+
+@sdc_overload(factorize_level)
+def factorize_level_ovld(level):
+
+    level_dtype = level.dtype
+
+    def factorize_level_impl(level):
+        unique_labels = List.empty_list(level_dtype)
+        res_size = len(level)
+        codes = np.empty(res_size, types.int64)
+        if not res_size:
+            return unique_labels, codes
+
+        indexer_map = Dict.empty(level_dtype, types.int64)
+        for i in range(res_size):
+            val = level[i]
+            _code = indexer_map.get(val, -1)
+            if _code == -1:
+                new_code = len(unique_labels)
+                indexer_map[val] = new_code
+                unique_labels.append(val)
+            else:
+                new_code = _code
+
+            codes[i] = new_code
+
+        return unique_labels, codes
+
+    return factorize_level_impl
+
+
+@register_jitable
+def next_codes_info(level_info, cumprod_list):
+    _, codes = level_info
+    cumprod_list.append(cumprod_list[-1] * len(codes))
+    return codes, cumprod_list[-1]
+
+
+@register_jitable
+def next_codes_array(stats, res_size):
+    codes_pattern, factor = stats
+    span_i = res_size // factor  # tiles whole array
+    repeat_i = res_size // (len(codes_pattern) * span_i)  # repeats each element
+    return np.array(list(np.repeat(codes_pattern, span_i)) * repeat_i)
+
+
+def _multi_index_alloc_level_dict(index):
+    pass
+
+
+@sdc_overload(_multi_index_alloc_level_dict)
+def _make_level_dict_ovld(index):
+
+    index_type = index
+
+    def _make_level_dict_impl(index):
+        return Dict.empty(index_type, types.int64)
+
+    return _make_level_dict_impl
+
+
+@intrinsic
+def _multi_index_from_tuples_helper(typingctx, val, levels, codes, idx):
+
+    nlevels = len(val)
+    if not (nlevels == len(levels) and nlevels == len(codes)):
+        assert False, f"Cannot append MultiIndex value
to existing codes/levels.\n" \ + f"Given: val={val}, levels={levels}, codes={codes}" + + def _get_code_for_label(seen_labels, label): + + _code = seen_labels.get(label, -1) + if _code != -1: + return _code + + res = len(seen_labels) + seen_labels[label] = res + return types.int64(res) + + def _set_code_by_position(codes, new_code, i): + codes[i] = new_code + + def codegen(context, builder, sig, args): + index_val, levels_val, codes_val, idx_val = args + + for i in range(nlevels): + label = builder.extract_value(index_val, i) + level_i = builder.extract_value(levels_val, i) + codes_i = builder.extract_value(codes_val, i) + + new_code = context.compile_internal( + builder, + _get_code_for_label, + signature(types.int64, levels[i], val[i]), + [level_i, label] + ) + context.compile_internal( + builder, + _set_code_by_position, + signature(types.none, codes[i], types.int64, idx), + [codes_i, new_code, idx_val] + ) + + return types.none(val, levels, codes, idx), codegen diff --git a/sdc/extensions/sdc_hashmap_ext.py b/sdc/extensions/sdc_hashmap_ext.py index 5fea972d8..d02840035 100644 --- a/sdc/extensions/sdc_hashmap_ext.py +++ b/sdc/extensions/sdc_hashmap_ext.py @@ -59,7 +59,7 @@ ConcDictItemsIterableType, ConcDictValuesIterableType) from numba.extending import register_jitable -from sdc.extensions.sdc_hashmap_type import SdcTypeRef +from sdc.datatypes.sdc_typeref import ConcurrentDictTypeRef from sdc.utilities.sdc_typing_utils import TypingError, TypeChecker, check_types_comparable from itertools import product @@ -357,13 +357,13 @@ def codegen(context, builder, sig, args): return dict_type(key, value), codegen -@overload_method(SdcTypeRef, 'empty') +@overload_method(ConcurrentDictTypeRef, 'empty') def concurrent_dict_empty(cls, key_type, value_type): if cls.instance_type is not ConcurrentDictType: return - _func_name = 'Method SdcTypeRef::empty().' + _func_name = 'Method ConcurrentDictTypeRef::empty().' ty_checker = TypeChecker(_func_name) supported_key_types = (types.NumberClass, types.TypeRef) @@ -850,7 +850,7 @@ def codegen(context, builder, sig, args): return dict_type(keys, values), codegen -@overload_method(SdcTypeRef, 'from_arrays') +@overload_method(ConcurrentDictTypeRef, 'from_arrays') def concurrent_dict_from_arrays_ovld(cls, keys, values): if cls.instance_type is not ConcurrentDictType: return @@ -867,7 +867,7 @@ def concurrent_dict_from_arrays_impl(cls, keys, values): return concurrent_dict_from_arrays_impl -@overload_method(SdcTypeRef, 'fromkeys', prefer_literal=False) +@overload_method(ConcurrentDictTypeRef, 'fromkeys', prefer_literal=False) def concurrent_dict_type_fromkeys_ovld(cls, keys, value): if cls.instance_type is not ConcurrentDictType: return diff --git a/sdc/extensions/sdc_hashmap_type.py b/sdc/extensions/sdc_hashmap_type.py index b54c49b56..2c598ea6f 100644 --- a/sdc/extensions/sdc_hashmap_type.py +++ b/sdc/extensions/sdc_hashmap_type.py @@ -24,20 +24,11 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -from numba.core.typing.templates import ( - infer_global, AbstractTemplate, signature, - ) -from numba.extending import type_callable, lower_builtin from numba import types -from numba.extending import (models, register_model, make_attribute_wrapper, overload_method) -from sdc.str_ext import string_type +from numba.core.types import IterableType, SimpleIterableType, SimpleIteratorType +from numba.extending import (models, register_model, make_attribute_wrapper, ) from collections.abc import MutableMapping -from numba.core.types import Dummy, IterableType, SimpleIterableType, SimpleIteratorType - -from numba.extending import typeof_impl -from numba.typed import Dict -from numba.core.typing.typeof import _typeof_type as numba_typeof_type class ConcDictIteratorType(SimpleIteratorType): @@ -161,33 +152,3 @@ def _numba_type_(self): if self._dict_type is None: raise TypeError("invalid operation on untyped dictionary") return self._dict_type - - -# FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef -# we have to use our new SdcTypeRef to type objects created from types.Type -# (i.e. ConcurrentDict meta-type). This should be removed once it's fixed. -class SdcTypeRef(Dummy): - """Reference to a type. - - Used when a type is passed as a value. - """ - def __init__(self, instance_type): - self.instance_type = instance_type - super(SdcTypeRef, self).__init__('sdc_typeref[{}]'.format(self.instance_type)) - - -@register_model(SdcTypeRef) -class SdcTypeRefModel(models.OpaqueModel): - def __init__(self, dmm, fe_type): - - models.OpaqueModel.__init__(self, dmm, fe_type) - - -@typeof_impl.register(type) -def mynew_typeof_type(val, c): - """ This function is a workaround for """ - - if not issubclass(val, ConcurrentDict): - return numba_typeof_type(val, c) - else: - return SdcTypeRef(ConcurrentDictType) diff --git a/sdc/functions/tuple_utils.py b/sdc/functions/tuple_utils.py new file mode 100644 index 000000000..17dffa200 --- /dev/null +++ b/sdc/functions/tuple_utils.py @@ -0,0 +1,207 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from numba import types
+from numba.extending import (intrinsic, )
+from numba.core.typing.templates import (signature, )
+
+
+@intrinsic
+def sdc_tuple_map(typingctx, func, data, *args):
+    """ Maps func over each element of the data tuple, passing *args to every call """
+
+    if not isinstance(func, (types.Dispatcher, types.Function)):
+        assert False, f"sdc_tuple_map's arg 'func' is expected to be " \
+                      f"numba compiled function or a dispatcher, given: {func}"
+
+    if not isinstance(data, (types.Tuple, types.UniTuple)):
+        assert False, f"sdc_tuple_map's arg 'data' is expected to be a tuple, given: {data}"
+
+    nargs = len(args)
+    tuple_len = len(data)
+
+    func_arg_types = [(typ, ) + args for typ in data]
+    ret_tuple_types = []
+    for i in range(tuple_len):
+        res_sig = func.get_call_type(typingctx, func_arg_types[i], {})
+        ret_tuple_types.append(res_sig.return_type)
+    ret_type = types.Tuple(ret_tuple_types)
+    ret_sig = ret_type(func, data, types.StarArgTuple.from_types(args))
+
+    # codegen below uses the first func template to get the dispatcher, so
+    # for now deny compilation for overloaded func-s that have multiple overloads
+    # (using the jitted function dispatcher as func will work anyway)
+    # TO-DO: improve and upstream to Numba
+    if isinstance(func, types.Function):
+        assert len(func.templates) == 1, "Function template has multiple overloads"
+
+    def codegen(context, builder, sig, args):
+
+        tup_val = args[1]  # main tuple whose elements are mapped
+        other_val = []
+        for i in range(0, nargs):
+            other_val.append(
+                builder.extract_value(args[2], i)
+            )
+
+        mapped_values = []
+        for i in range(tuple_len):
+            tup_elem = builder.extract_value(tup_val, i)
+            input_args = [tup_elem] + other_val
+            call_sig = signature(ret_tuple_types[i], *func_arg_types[i])
+
+            if isinstance(func, types.Dispatcher):
+                py_func = func.dispatcher.py_func
+            else:
+                # for function overloads get pyfunc from compiled impl (this
+                # hardcodes the first available template)
+                target_disp = func.templates[0](context.typing_context)
+                py_func = target_disp._get_impl(call_sig.args, {})[0].py_func
+
+            mapped_values.append(
+                context.compile_internal(builder,
+                                         py_func,
+                                         call_sig,
+                                         input_args)
+            )
+        res = context.make_tuple(builder, ret_type, mapped_values)
+        return res
+
+    return ret_sig, codegen
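+
+# Usage sketch (illustrative comment only, assumes calling from jitted code):
+# sdc_tuple_map applies a jitted function to every element of a heterogeneous
+# tuple and returns the tuple of results, e.g.:
+#
+#     @numba.njit
+#     def square(x):
+#         return x * x
+#
+#     @numba.njit
+#     def use_tuple_map():
+#         return sdc_tuple_map(square, (1, 2.0))   # -> (1, 4.0)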
+
+
+@intrinsic
+def sdc_tuple_map_elementwise(typingctx, func, lhs, rhs, *args):
+    """ Maps func to pairs of respective elements of lhs and rhs tuples, passing *args to every call """
+
+    if not isinstance(func, (types.Dispatcher, types.Function)):
+        assert False, f"sdc_tuple_map_elementwise's arg 'func' is expected to be " \
+                      f"numba compiled function or a dispatcher, given: {func}"
+
+    if not (isinstance(lhs, (types.Tuple, types.UniTuple))
+            and isinstance(rhs, (types.Tuple, types.UniTuple))):
+        assert False, f"sdc_tuple_map_elementwise's args are expected to be " \
+                      f"tuples, given: lhs={lhs}, rhs={rhs}"
+
+    assert len(lhs) == len(rhs), f"lhs and rhs tuples have different sizes: lhs={lhs}, rhs={rhs}"
+
+    nargs = len(args)
+    tuple_len = len(lhs)
+
+    func_arg_types = [x for x in zip(lhs, rhs, *args)]
+    ret_tuple_types = []
+    for i in range(tuple_len):
+        res_sig = func.get_call_type(typingctx, func_arg_types[i], {})
+        ret_tuple_types.append(res_sig.return_type)
+    ret_type = types.Tuple(ret_tuple_types)
+    ret_sig = ret_type(func, lhs, rhs, types.StarArgTuple.from_types(args))
+
+    if isinstance(func, types.Function):
+        assert len(func.templates) == 1, "Function template has multiple overloads"
+
+    def codegen(context, builder, sig, args):
+        lhs_val = args[1]
+        rhs_val = args[2]
+        other_vals = []
+        for i in range(0, nargs):
+            other_vals.append(
+                builder.extract_value(args[3], i)
+            )
+
+        mapped_values = []
+        for i in range(tuple_len):
+            lhs_elem = builder.extract_value(lhs_val, i)
+            rhs_elem = builder.extract_value(rhs_val, i)
+            other_elems = []
+            for other_tup in other_vals:
+                other_elems.append(
+                    builder.extract_value(other_tup, i)
+                )
+
+            input_args = [lhs_elem, rhs_elem] + other_elems
+            call_sig = signature(ret_tuple_types[i], *func_arg_types[i])
+
+            if isinstance(func, types.Dispatcher):
+                py_func = func.dispatcher.py_func
+            else:
+                # for function overloads get pyfunc from compiled impl
+                target_disp = func.templates[0](context.typing_context)
+                py_func = target_disp._get_impl(call_sig.args, {})[0].py_func
+
+            mapped_values.append(
+                context.compile_internal(builder,
+                                         py_func,
+                                         call_sig,
+                                         input_args)
+            )
+        res = context.make_tuple(builder, ret_type, mapped_values)
+        return res
+
+    return ret_sig, codegen
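+
+# Usage sketch (illustrative comment only, assumes calling from jitted code):
+# sdc_tuple_map_elementwise zips two tuples and applies func to the pairs, e.g.:
+#
+#     @numba.njit
+#     def add(x, y):
+#         return x + y
+#
+#     @numba.njit
+#     def use_map_elementwise():
+#         return sdc_tuple_map_elementwise(add, (1, 2.0), (10, 20.0))   # -> (11, 22.0)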
{_given_args_str}" + + ty_firsts, ty_seconds = map(lambda x: types.Tuple.from_types(x), + zip(*data_type)) + ret_type = types.Tuple([ty_firsts, ty_seconds]) + + def codegen(context, builder, sig, args): + data_val, = args + + all_firsts = [] + all_seconds = [] + for i in range(data_len): + tup_element_i = builder.extract_value(data_val, i) + first_i = builder.extract_value(tup_element_i, 0) + second_i = builder.extract_value(tup_element_i, 1) + + all_firsts.append(first_i) + all_seconds.append(second_i) + + if context.enable_nrt: + context.nrt.incref(builder, ty_firsts[i], first_i) + context.nrt.incref(builder, ty_seconds[i], second_i) + + first_tup = context.make_tuple(builder, ty_firsts, all_firsts) + second_tup = context.make_tuple(builder, ty_seconds, all_seconds) + return context.make_tuple(builder, ret_type, [first_tup, second_tup]) + + return ret_type(data_type), codegen diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index c06203ecd..ccb4dc866 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -44,7 +44,10 @@ if_series_to_array_type) from numba.core.errors import TypingError from sdc.datatypes.categorical.types import Categorical -from sdc.utilities.sdc_typing_utils import sdc_pandas_df_column_types +from sdc.utilities.sdc_typing_utils import ( + sdc_pandas_df_column_types, + sdc_pandas_index_types, + sdc_old_index_types, ) def isna(arr, i): @@ -192,7 +195,8 @@ def fix_df_index_impl(index, coldata=None): return fix_df_index_impl - elif isinstance(index, (RangeIndexType, Int64IndexType, EmptyIndexType, PositionalIndexType)): + elif (isinstance(index, sdc_pandas_index_types) + and not isinstance(index, sdc_old_index_types)): def fix_df_index_impl(index, coldata=None): return index diff --git a/sdc/sdc_autogenerated.py b/sdc/sdc_autogenerated.py index f701cf5fb..6137aaffb 100644 --- a/sdc/sdc_autogenerated.py +++ b/sdc/sdc_autogenerated.py @@ -89,7 +89,7 @@ def sdc_add_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -229,7 +229,7 @@ def sdc_div_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -369,7 +369,7 @@ def sdc_sub_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -509,7 +509,7 @@ def sdc_mul_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is 
fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -649,7 +649,7 @@ def sdc_truediv_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -789,7 +789,7 @@ def sdc_floordiv_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -929,7 +929,7 @@ def sdc_mod_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) @@ -1069,7 +1069,7 @@ def sdc_pow_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) diff --git a/sdc/sdc_function_templates.py b/sdc/sdc_function_templates.py index 2f58cdeee..5b3355631 100644 --- a/sdc/sdc_function_templates.py +++ b/sdc/sdc_function_templates.py @@ -89,7 +89,7 @@ def sdc_binop_impl(self, other, fill_value=None): else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + # FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_data = numpy.empty(result_size, dtype=numpy.float64) diff --git a/sdc/tests/indexes/__init__.py b/sdc/tests/indexes/__init__.py index c0adc55e5..3c472d8ac 100644 --- a/sdc/tests/indexes/__init__.py +++ b/sdc/tests/indexes/__init__.py @@ -28,4 +28,5 @@ from sdc.tests.indexes.test_range_index import TestRangeIndex from sdc.tests.indexes.test_positional_index import TestPositionalIndex from sdc.tests.indexes.test_int64_index import TestInt64Index +from sdc.tests.indexes.test_multi_index import TestMultiIndex from sdc.tests.indexes.test_indexes import TestIndexes diff --git a/sdc/tests/indexes/index_datagens.py b/sdc/tests/indexes/index_datagens.py index 244fa52f8..268c9e211 100644 --- a/sdc/tests/indexes/index_datagens.py +++ 
b/sdc/tests/indexes/index_datagens.py
@@ -126,5 +126,93 @@ def get_sample_index(size, sdc_index_type):
         return pd.RangeIndex(-1, size - 1, 1)
     if sdc_index_type is Int64IndexType:
         return pd.Int64Index(np.arange(size))
+    if sdc_index_type is MultiIndexType:
+        levels = [['a', 'b', 'c'], np.arange(size // 2 + 1)]
+        return pd.MultiIndex.from_product(levels)[:size]
 
-    assert False, f"Refusing to create index of non-specific index type: {sdc_index_type}"
+    assert False, f"Index generation failed: index type not recognized: {sdc_index_type}"
+
+
+def _get_multi_index_base_index(exceeded_size, nlevels=2, dtypes=None):
+    """ Produces a multi-index with given nlevels/dtypes, pre-defined values and size >= exceeded_size """
+
+    str_labels = ['a', 'b', 'c', 'd', 'e']
+    sample_labels = {
+        'str': str_labels,
+        'int': np.arange(exceeded_size // len(str_labels) + 1),
+    }
+
+    if dtypes is None:
+        dtypes = ['str', 'int']
+
+    # first expand, then cut as needed
+    if len(dtypes) < nlevels:
+        dtypes = dtypes * (nlevels // len(dtypes) + 1)
+    if len(dtypes) > nlevels:
+        dtypes = dtypes[:nlevels]
+
+    all_levels = [sample_labels[ty] for ty in dtypes]
+    base_index = pd.MultiIndex.from_tuples(
+        list(product(*all_levels))
+    )
+    return base_index
+
+
+def _generate_multi_indexes_fixed(size, nlevels=2, dtypes=None, base_index=None):
+    """ Generates fixed-size multi-indexes of the needed nlevels and dtypes,
+    all built from the values of a common base index. """
+
+    size_range = np.arange(size)
+    # avoid 'base_index or ...' here: the truth value of a pd.MultiIndex is ambiguous
+    if base_index is None:
+        base_index = _get_multi_index_base_index(size)
+    base_index_range = np.arange(len(base_index))
+
+    yield base_index[:size]    # unique values from first size values of base_index
+    yield base_index.take(np.random.choice(size_range, size))    # same values, random order, with duplicates
+    yield base_index.take(np.random.choice(size_range, size, replace=False))   # same values, unique, random order
+    yield base_index.take(np.random.choice(base_index_range, size))   # random values from the whole base_index, possibly beyond its first size values
+
+
+def _generate_multi_index_levels_unique(n=10, k=5):
+    yield [gen_strlist(n, nchars=2), np.arange(k)]
+    yield [gen_strlist(n, nchars=2), gen_strlist(2*n, nchars=2), np.arange(k)]
+    yield [['a', 'b', 'c'], [1, 2, 3], ['d', 'e']]
+    yield [np.array([100, 200, 300]), np.arange(k)]
+    yield [pd.Int64Index([100, 200, 300]), pd.RangeIndex(k)]
+
+    # this is to check named levels creation and name/names arguments
+    yield [pd.Int64Index([100, 200, 300], name="first"), pd.RangeIndex(k, name="second")]
+    yield [pd.Int64Index([100, 200, 300], name="first"), pd.RangeIndex(k)]
+    yield [pd.Int64Index([100, 200, 300], name="first"), ]
+
+
+def _generate_multi_index_levels_with_duplicates(n=10, k=5):
+    yield [['a', 'b', 'c', 'a', 'b'], ]
+    yield [np.arange(k), ['a', 'b', 'c', 'a', 'b']]
+
+
+def _generate_multi_index_levels(n=10, k=5):
+    """ This is useful for generating all sets of levels with specific dtypes, names, etc. 
""" + return chain( + _generate_multi_index_levels_unique(n, k), + _generate_multi_index_levels_with_duplicates(n, k), + ) + + +def get_codes_from_levels(size, levels, replace=True): + res_codes = [] + for x in levels: + res_codes.append( + np.random.choice(np.arange(len(x)), size, replace) + ) + return res_codes + + +def _generate_multi_indexes(): + n = 100 + gen_levels = _generate_multi_index_levels + gen_unique_levels = _generate_multi_index_levels_unique + return chain( + map(lambda x: pd.MultiIndex.from_product(x), gen_levels()), + map(lambda x: pd.MultiIndex(x, get_codes_from_levels(n, x)), gen_unique_levels()), + _generate_multi_indexes_fixed(n), + ) diff --git a/sdc/tests/indexes/test_multi_index.py b/sdc/tests/indexes/test_multi_index.py new file mode 100644 index 000000000..f168454fb --- /dev/null +++ b/sdc/tests/indexes/test_multi_index.py @@ -0,0 +1,690 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2021, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd +import unittest +from itertools import (combinations_with_replacement, product, combinations, ) + +from numba.core import types +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_multi_indexes_fixed, + _generate_multi_index_levels_unique, + _generate_multi_index_levels, + _generate_multi_indexes, + _get_multi_index_base_index, + get_sample_index, + get_codes_from_levels, + ) +from sdc.tests.test_base import TestCase +from sdc.datatypes.indexes import * +from sdc.tests.test_utils import skip_numba_jit, assert_pandas_exception + + +class TestMultiIndex(TestCase): + + def test_multi_index_type_inferred(self): + for index, name in product(_generate_multi_indexes(), + test_global_index_names): + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, MultiIndexType) + + index.name = name + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, MultiIndexType) + + def test_multi_index_create_and_box(self): + def test_impl(levels, codes): + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + for data in _generate_multi_index_levels_unique(): + # creating pd.MultiIndex is only supported with levels and codes as tuples + levels = tuple(data) + codes = tuple(get_codes_from_levels(n, levels)) + with self.subTest(levels=levels, codes=codes): + result = sdc_func(levels, codes) + result_ref = test_impl(levels, codes) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_create_invalid_inputs(self): + def test_impl(levels, codes): + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + level_and_codes = [ + (['a', 'b', 'c'], [3, 0, 1, 2, 2]), # code 3 is out of bounds + (['a', 'b', 'c'], [1, 0, 1, -2, 2]), # code -2 is out of bounds + (['a', 'b', 'c', 'a', 'b'], [1, 0, 1, 2, 2]) # duplicate labels in level + ] + exc_strs = [ + "On one of the levels code max >= length of level.", + "On one of the levels code value < -1", + "Level values must be unique", + ] + + for i, level_codes_pair in enumerate(level_and_codes): + levels, codes = (level_codes_pair[0], ), (level_codes_pair[1], ) + test_msg = f"Inconsistent codes: levels={levels}, codes={codes}" + sdc_exc_str = exc_strs[i] + assert_pandas_exception(self, test_msg, sdc_exc_str, test_impl, sdc_func, (levels, codes)) + + def test_multi_index_create_from_tuples(self): + def test_impl(): + codes_max = 5 + levels = ( + ['a', 'b', 'c', 'd', 'e'], + np.arange(codes_max) + ) + codes = ( + np.arange(0, codes_max), + np.arange(codes_max, 0, -1) - 1, + ) + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + pd.testing.assert_index_equal(result, result_ref) + + @skip_numba_jit("MultiIndexType ctor supports levels and codes as tuples only") + def test_multi_index_create_from_lists(self): + def test_impl(): + codes_max = 5 + levels = [ + ['a', 'b', 'c', 'd', 'e'], + np.arange(codes_max), + ] + codes = [ + np.arange(0, codes_max), + np.arange(codes_max, 0, -1) - 1, + ] + + return pd.MultiIndex(levels, codes) + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_create_param_names(self): + + # using keyword 
arguments in the typeref ctor is not supported due to a limitation of the __call__ overload,
+        # TO-DO: refactor this after @overload is supported for typerefs (see FIXME_Numba#7111):
+        def test_impl(levels, codes, names):
+            # return pd.MultiIndex(levels, codes, name=names)
+            return pd.MultiIndex(levels, codes, None, None, None, False, names)
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        max_codes = 5
+        all_levels = [
+            [5, 2, 1, 4, 3],
+            np.arange(max_codes),
+            pd.RangeIndex(max_codes),
+            pd.RangeIndex(max_codes, name='abc'),
+            pd.Int64Index([5, 2, 1, 4, 3]),
+            pd.Int64Index([5, 2, 1, 4, 3], name='bce'),
+        ]
+        for data, names in product(combinations(all_levels, 2),
+                                   combinations_with_replacement(test_global_index_names, 2)):
+
+            # all parameters are supported as tuples only in pd.MultiIndex ctor
+            levels = tuple(data)
+            codes = tuple(get_codes_from_levels(n, levels))
+            _names = tuple(names)
+            with self.subTest(levels=levels, codes=codes, names=_names):
+                result = sdc_func(levels, codes, _names)
+                result_ref = test_impl(levels, codes, _names)
+                pd.testing.assert_index_equal(result, result_ref)
+
+    def test_multi_index_unbox_and_box(self):
+        def test_impl(index):
+            return index
+        sdc_func = self.jit(test_impl)
+
+        np.random.seed(0)
+        for index in _generate_multi_indexes():
+            with self.subTest(index=index):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                pd.testing.assert_index_equal(result, result_ref)
+
+    def test_multi_index_attribute_dtype(self):
+        from numba.typed import List
+
+        # index dtype cannot be returned (boxed), thus the test only checks that it can be used
+        def test_impl(index):
+            return List.empty_list(index.dtype)
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        result = sdc_func(index)
+        expected = types.Tuple.from_types([types.unicode_type, types.intp])
+        self.assertEqual(result._dtype, expected)
+
+    def test_multi_index_attribute_name(self):
+        def test_impl(index):
+            return index.name
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        for name in test_global_index_names:
+            index.name = name
+            with self.subTest(name=name):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                self.assertEqual(result, result_ref)
+
+    @skip_numba_jit("StringArrayType as index has no name. TO-DO: StringIndexType")
+    def test_multi_index_attribute_names(self):
+        def test_impl(index):
+            return index.names
+        sdc_func = self.jit(test_impl)
+
+        np.random.seed(0)
+        for index in _generate_multi_indexes():
+            for names in combinations_with_replacement(
+                    test_global_index_names,
+                    index.nlevels):
+                index.names = names
+                with self.subTest(index=index):
+                    result = sdc_func(index)
+                    result_ref = test_impl(index)
+                    self.assertEqual(result, result_ref)
+
+    def test_multi_index_attribute_nlevels(self):
+        def test_impl(index):
+            return index.nlevels
+        sdc_func = self.jit(test_impl)
+
+        np.random.seed(0)
+        for index in _generate_multi_indexes():
+            with self.subTest(index=index):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                self.assertEqual(result, result_ref)
+
+    def test_multi_index_len(self):
+        def test_impl(index):
+            return len(index)
+        sdc_func = self.jit(test_impl)
+
+        np.random.seed(0)
+        for index in _generate_multi_indexes():
+            with self.subTest(index=index):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                self.assertEqual(result, result_ref)
+
+    def test_multi_index_attribute_values(self):
+        def test_impl(index):
+            return index.values
+        sdc_func = self.jit(test_impl)
+
+        np.random.seed(0)
+        for index in _generate_multi_indexes():
+            with self.subTest(index_data=index):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                # SDC MultiIndex.values returns a list, not a numpy array
+                self.assertEqual(result, list(result_ref))
+
+    def test_multi_index_attribute_levels(self):
+        def test_impl(index):
+            return index.levels
+        sdc_func = self.jit(test_impl)
+
+        np.random.seed(0)
+        for index in _generate_multi_indexes():
+            with self.subTest(index_data=index):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                # SDC MultiIndex.levels returns a tuple of levels, not a list
+                error_msg = f"Indexes' levels are different:\nresult={result},\nresult_ref={result_ref}"
+                self.assertEqual(len(result), len(result_ref), error_msg)
+                # compare levels pairwise (a lazy map object is always truthy,
+                # so it cannot be used with assertTrue)
+                for left, right in zip(result, result_ref):
+                    pd.testing.assert_index_equal(left, right)
+
+    def test_multi_index_attribute_codes(self):
+        def test_impl(index):
+            return index.codes
+        sdc_func = self.jit(test_impl)
+
+        np.random.seed(0)
+        for index in _generate_multi_indexes():
+            with self.subTest(index_data=index):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                # SDC MultiIndex.codes returns a tuple of arrays, not a list
+                error_msg = f"Indexes' codes are different:\nresult={result},\nresult_ref={result_ref}"
+                self.assertEqual(len(result), len(result_ref), error_msg)
+                # compare codes pairwise (a lazy map object is always truthy,
+                # so it cannot be used with assertTrue)
+                for left, right in zip(result, result_ref):
+                    np.testing.assert_array_equal(left, right)
+
+    def test_multi_index_contains(self):
+        def test_impl(index, value):
+            return value in index
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        values_to_test = [('a', 1), ('a', 4), ('e', 1), ('x', 5)]
+        for value in values_to_test:
+            with self.subTest(value=value):
+                result = sdc_func(index, value)
+                result_ref = test_impl(index, value)
+                np.testing.assert_array_equal(result, result_ref)
+
+    def test_multi_index_getitem_scalar(self):
+        def test_impl(index, idx):
+            return index[idx]
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        idxs_to_test = [0, n // 2, n - 1, -1]
+        for idx in idxs_to_test:
+            with self.subTest(idx=idx):
+                result = sdc_func(index, idx)
+                result_ref = test_impl(index, idx)
+                self.assertEqual(result, result_ref)
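+
+    # Illustrative note: for the sample index above, i.e.
+    # pd.MultiIndex.from_product([['a', 'b', 'c'], np.arange(6)])[:11],
+    # scalar getitem returns label tuples: index[2] == ('a', 2)
+    # and index[-1] == ('b', 4).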
+
+    def test_multi_index_getitem_scalar_idx_bounds(self):
+        def test_impl(index, idx):
+            return index[idx]
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        idxs_to_test = [-(n + 1), n]
+        for idx in idxs_to_test:
+            with self.subTest(idx=idx):
+                with self.assertRaises(Exception) as context:
+                    test_impl(index, idx)
+                pandas_exception = context.exception
+
+                with self.assertRaises(type(pandas_exception)) as context:
+                    sdc_func(index, idx)
+                sdc_exception = context.exception
+                self.assertIsInstance(sdc_exception, type(pandas_exception))
+                self.assertIn("out of bounds", str(sdc_exception))
+
+    def test_multi_index_getitem_slice(self):
+        def test_impl(index, idx):
+            return index[idx]
+        sdc_func = self.jit(test_impl)
+
+        n = 17
+        index = get_sample_index(n, MultiIndexType)
+        slices_params = combinations_with_replacement(
+            [None, 0, -1, n // 2, n, n - 3, n + 3, -(n + 3)],
+            2
+        )
+
+        for slice_start, slice_stop in slices_params:
+            for slice_step in [1, -1, 2]:
+                idx = slice(slice_start, slice_stop, slice_step)
+                with self.subTest(idx=idx):
+                    result = sdc_func(index, idx)
+                    result_ref = test_impl(index, idx)
+                    pd.testing.assert_index_equal(result, result_ref)
+
+    def test_multi_index_iterator_1(self):
+        def test_impl(index):
+            res = []
+            for i, label in enumerate(index):
+                res.append((i, label))
+            return res
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        result = sdc_func(index)
+        result_ref = test_impl(index)
+        self.assertEqual(result, result_ref)
+
+    def test_multi_index_iterator_2(self):
+        def test_impl(index):
+            res = []
+            for label in index:
+                str_part, _ = label
+                if str_part == 'a':
+                    res.append(label)
+            return res
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        result = sdc_func(index)
+        result_ref = test_impl(index)
+        self.assertEqual(result, result_ref)
+
+    @skip_numba_jit("Requires np.array of complex dtypes (tuples) support in Numba")
+    def test_multi_index_nparray(self):
+        def test_impl(index):
+            return np.array(index)
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        index = get_sample_index(n, MultiIndexType)
+        result = sdc_func(index)
+        result_ref = test_impl(index)
+        np.testing.assert_array_equal(result, result_ref)
+
+    def test_multi_index_operator_eq_index(self):
+        def test_impl(index1, index2):
+            return index1 == index2
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        np.random.seed(0)
+        indexes_to_test = list(_generate_multi_indexes_fixed(n))
+        for index1, index2 in combinations_with_replacement(indexes_to_test, 2):
+            with self.subTest(index1=index1, index2=index2):
+                result = np.asarray(sdc_func(index1, index2))  # FIXME_Numba#5157: remove np.asarray
+                result_ref = test_impl(index1, index2)
+                np.testing.assert_array_equal(result, result_ref)
+
+    def test_multi_index_operator_eq_scalar(self):
+        def test_impl(A, B):
+            return A == B
+        sdc_func = self.jit(test_impl)
+
+        n = 11
+        A = get_sample_index(n, MultiIndexType)
+        scalars_to_test = [('a', 1), ('a', 4), ('e', 1), ('x', 5)]
+        for B in scalars_to_test:
+            for swap_operands in (False, True):
+                # swap into local names, otherwise the swapped operands
+                # would leak into the following iterations
+                left, right = (B, A) if swap_operands else (A, B)
+                with self.subTest(left=left, right=right):
+                    result = np.asarray(sdc_func(left, right))  # FIXME_Numba#5157: remove np.asarray
+                    result_ref = test_impl(left, right)
+                    np.testing.assert_array_equal(result, result_ref)
+
+    @skip_numba_jit("Requires np.array of complex dtypes (tuples) support in Numba")
+    def test_multi_index_operator_eq_nparray(self):
+        def test_impl(A, B):
+            return A == B
+        sdc_func = 
self.jit(test_impl) + + n = 11 + for A, B in product( + _generate_multi_indexes_fixed(n), + map(lambda x: np.array(x), _generate_multi_indexes_fixed(n)) + ): + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_multi_index_operator_ne_index(self): + def test_impl(index1, index2): + return index1 != index2 + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + indexes_to_test = list(_generate_multi_indexes_fixed(n)) + for index1, index2 in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_multi_index_operator_is_nounbox(self): + def test_impl_1(): + index1 = pd.MultiIndex( + levels=(['a', 'b', 'c'], [1, 2, 3]), + codes=([0, 1, 0, 1, 2], [0, 0, 1, 1, 2]) + ) + index2 = index1 + return index1 is index2 + sdc_func_1 = self.jit(test_impl_1) + + def test_impl_2(): + index1 = pd.MultiIndex( + levels=(['a', 'b', 'c'], [1, 2, 3]), + codes=([0, 1, 0, 1, 2], [0, 0, 1, 1, 2]) + ) + index2 = pd.MultiIndex( + levels=(['a', 'b', 'c'], [1, 2, 3]), + codes=([0, 1, 0, 1, 2], [0, 0, 1, 1, 2]) + ) + return index1 is index2 + sdc_func_2 = self.jit(test_impl_2) + + # positive testcase + with self.subTest(subtest="same indexes"): + result = sdc_func_1() + result_ref = test_impl_1() + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + result = sdc_func_2() + result_ref = test_impl_2() + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_multi_index_getitem_by_mask(self): + def test_impl(index, mask): + return index[mask] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + mask = np.random.choice([True, False], n) + for index in _generate_multi_indexes_fixed(n): + result = sdc_func(index, mask) + result_ref = test_impl(index, mask) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n, k = 11, 7 + np.random.seed(0) + idx = np.random.choice(np.arange(n), k) + for index in _generate_multi_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_reindex_equal_indexes(self): + + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) + + n = 10 + index1 = get_sample_index(n, MultiIndexType) + index2 = index1.copy(deep=True) + + result = sdc_func(index1, index2) + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_multi_index_reindex(self): + + def test_impl(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + base_index = _get_multi_index_base_index(n) + index1 = base_index[:n] + size_range = np.arange(len(index1)) + reindex_by = list(map( + lambda x: base_index.take(x), + [ + size_range, # same index as index1 + np.random.choice(size_range, n), # 
random values from index1 with duplicates + np.random.choice(size_range, n, replace=False), # random unique values from index1 + np.random.choice(np.arange(len(base_index)), n), # random values from larger set + size_range[:n // 2], # shorter index + np.random.choice(size_range, 2*n), # longer index + ] + )) + + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_multi_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + indexes_to_test = list(_generate_multi_indexes_fixed(n)) + for index1, index2 in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + + def test_multi_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, MultiIndexType) + result = sdc_func(index) + result_ref = test_impl(index) + # SDC MultiIndex.values return list but not numpy array + np.testing.assert_array_equal(result, list(result_ref)) + + def test_multi_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_multi_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) + + index = pd.MultiIndex.from_product([['a', 'b'], [1, 2]]) + other = pd.MultiIndex.from_tuples( + [('a', 3), ('c', 1), ('c', 3), ('b', 2), ('b', 3)]) + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) + + @skip_numba_jit("MultiIndexType.join is not implemented yet") + def test_multi_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + indexes_to_test = list(_generate_multi_indexes_fixed(n)) + for index, other in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) + + def test_multi_index_from_product(self): + def test_impl(levels): + return pd.MultiIndex.from_product(levels) + sdc_func = self.jit(test_impl) + + np.random.seed(0) + for data in _generate_multi_index_levels(): + # creating pd.MultiIndex is only supported with levels and codes as tuples + levels = tuple(data) + with self.subTest(levels=levels): + result = sdc_func(levels) 
+ result_ref = test_impl(levels) + pd.testing.assert_index_equal(result, result_ref) + + def test_multi_index_from_tuples(self): + def test_impl(data): + return pd.MultiIndex.from_tuples(data) + sdc_func = self.jit(test_impl) + + n = 100 + np.random.seed(0) + for index in _generate_multi_indexes_fixed(n): + data = list(index.values) + with self.subTest(data=data): + result = sdc_func(data) + result_ref = test_impl(data) + pd.testing.assert_index_equal(result, result_ref) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/test_compile_time.py b/sdc/tests/test_compile_time.py index 03b5fd46a..845b16b42 100644 --- a/sdc/tests/test_compile_time.py +++ b/sdc/tests/test_compile_time.py @@ -69,7 +69,7 @@ def test_impl(S1, S2): test_impl(S1, S2) entry_format = fr'{line_function}{line_pipeline}{line_time}\n' - log_format = fr'^{line_heading}({entry_format})+{line_ending}$' + log_format = fr'{line_heading}({entry_format})+{line_ending}$' self.assertRegex(buffer.getvalue(), log_format) def test_log_format_detailed(self): diff --git a/sdc/tests/test_utils.py b/sdc/tests/test_utils.py index 719682097..110c7424b 100644 --- a/sdc/tests/test_utils.py +++ b/sdc/tests/test_utils.py @@ -254,6 +254,19 @@ def assert_raises_ty_checker(self, err_details, func, *args, **kwargs): self.assertRaisesRegex(TypingError, regex_str, func, *args, **kwargs) +def assert_pandas_exception(self, test_msg, sdc_exc_str, test_impl, sdc_func, args): + with self.subTest(test_msg): + with self.assertRaises(Exception) as context: + test_impl(*args) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(*args) + sdc_exception = context.exception + self.assertIsInstance(sdc_exception, type(pandas_exception)) + self.assertIn(sdc_exc_str, str(sdc_exception)) + + def _make_func_from_text(func_text, func_name='test_impl', global_vars={}): loc_vars = {} exec(func_text, global_vars, loc_vars) diff --git a/sdc/utilities/sdc_typing_utils.py b/sdc/utilities/sdc_typing_utils.py index 3c5c4219e..31dc54035 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -49,6 +49,7 @@ PositionalIndexType, RangeIndexType, Int64IndexType, + MultiIndexType, ) + sdc_old_index_types sdc_indexes_range_like = ( @@ -69,6 +70,7 @@ Categorical, ) + class TypeChecker: """ Validate object type and raise TypingError if the type is invalid, e.g.: @@ -189,6 +191,9 @@ def check_types_comparable(ty_left, ty_right): return isinstance(ty_right, types.UnicodeType) if isinstance(ty_left, types.Boolean): return isinstance(ty_right, types.Boolean) + if isinstance(ty_left, (types.Tuple, types.UniTuple)): + # FIXME: just for now to unblock compilation + return ty_left == ty_right return False From 8049dbabe614d5e18c2aaa72657dc58d5d7c08b8 Mon Sep 17 00:00:00 2001 From: Alexey Kozlov Date: Wed, 4 Aug 2021 17:46:07 +0300 Subject: [PATCH 2/2] Migrating to pyarrow=4.0.1 (#982) Motivation: keep up with the latest versions of dependencies --- README.rst | 4 ++-- conda-recipe/meta.yaml | 2 +- docs/source/getting_started.rst | 4 ++-- requirements.txt | 2 +- sdc/io/csv_ext.py | 8 +++++--- setup.py | 2 +- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 269dd2afa..9d40ab061 100644 --- a/README.rst +++ b/README.rst @@ -85,7 +85,7 @@ Building on Linux with setuptools export PYVER=<3.6 or 3.7> export NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER 
numpy=$NUMPYVER tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=2.0.0 gcc_linux-64 gxx_linux-64 + conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.53.1 pandas=1.2.0 pyarrow=4.0.1 gcc_linux-64 gxx_linux-64 source activate sdc-env git clone https://github.com/IntelPython/sdc.git cd sdc @@ -123,7 +123,7 @@ Building on Windows with setuptools set PYVER=<3.6 or 3.7> set NUMPYVER=<1.16 or 1.17> - conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=2.0.0 + conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.53.1 pandas=1.2.0 pyarrow=4.0.1 conda activate sdc-env set INCLUDE=%INCLUDE%;%CONDA_PREFIX%\Library\include set LIB=%LIB%;%CONDA_PREFIX%\Library\lib diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 4886a6653..16e552f86 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,6 +1,6 @@ {% set NUMBA_VERSION = "==0.53.1" %} {% set PANDAS_VERSION = "==1.2.0" %} -{% set PYARROW_VERSION = "==2.0.0" %} +{% set PYARROW_VERSION = "==4.0.1" %} package: name: sdc diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 2156c2214..0065b1d02 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -41,14 +41,14 @@ Distribution includes Intel SDC for Python 3.6 and 3.7 for Windows and Linux pla Intel SDC conda package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=2.0.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=4.0.1 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc_env > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels Intel SDC wheel package can be installed using the steps below: :: - > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=2.0.0 pandas=1.2.0 -c anaconda -c conda-forge + > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=4.0.1 pandas=1.2.0 -c anaconda -c conda-forge > conda activate sdc_env > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc diff --git a/requirements.txt b/requirements.txt index 5b123c130..db7518037 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy>=1.16 pandas==1.2.0 -pyarrow==2.0.0 +pyarrow==4.0.1 numba==0.53.1 tbb tbb-devel diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py index e772d8b4c..cb800b9c2 100644 --- a/sdc/io/csv_ext.py +++ b/sdc/io/csv_ext.py @@ -470,9 +470,11 @@ def pandas_read_csv( try: for column in parse_dates: name = f"f{column}" - # TODO: Try to help pyarrow infer date type - set DateType. 
-            #    dtype[name] = pyarrow.from_numpy_dtype(np.datetime64) # string
-            del column_types[name]
+            # starting from pyarrow=3.0.0 strings are parsed to DateType (converted back to
+            # 'object' when using to_pandas), but not to TimestampType (which is used to
+            # represent np.datetime64), see: pyarrow.from_numpy_dtype(np.datetime64('NaT', 's')),
+            # so set the needed type for pyarrow manually
+            column_types[name] = pyarrow.timestamp('s')
         except:
             pass
     parse_options = pyarrow.csv.ParseOptions(
diff --git a/setup.py b/setup.py
index a730e7373..0d494bf2a 100644
--- a/setup.py
+++ b/setup.py
@@ -404,7 +404,7 @@ def run(self):
         install_requires=[
             'numpy>=1.16',
             'pandas==1.2.0',
-            'pyarrow==2.0.0',
+            'pyarrow==4.0.1',
             'numba==0.53.1',
             'tbb'
         ],