From 868a5e0c410c79269c9a7f7ffaf083a2f7c4b82e Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Mon, 7 Dec 2020 17:06:33 +0300 Subject: [PATCH 1/5] Add PositionalIndex and EmptyIndex types and align indexes API Motivation: removing types.None as the representation of the default pd.RangeIndex created when index=None is used in DF and Series ctors, which requires most of the Series and DF implementations to branch based on index types. Instead, all Series and DF functions should use a common index API aligned with the pandas one. Changed in this PR: - types.None index is removed; - Added EmptyIndexType (instead of types.None index) to represent an empty pandas index with dtype='object', i.e. pd.Index([]); - Added PositionalIndexType as a replacement for types.None index for non-empty DFs and Series; - Changed unboxing of RangeIndex objects to either RangeIndexType or PositionalIndexType depending on whether it's a default range or not; - Changed fix_df_index and dataframe_constructor.py to create EmptyIndexType or another index depending on whether the tuple of columns is empty; - Moved implementations for index types from common and numpy_like functions to specific index files; - Updated operators and other Series method implementations to avoid branching on index types and use the index objects API instead; - Reorganized index tests and added tests verifying specific functions (e.g. series reindexing) for all types of indexes. 
--- sdc/__init__.py | 1 + sdc/datatypes/common_functions.py | 316 ++--- .../hpat_pandas_dataframe_functions.py | 385 +++--- .../hpat_pandas_groupby_functions.py | 15 +- sdc/datatypes/hpat_pandas_series_functions.py | 294 ++--- .../hpat_pandas_stringmethods_functions.py | 2 +- sdc/datatypes/indexes/__init__.py | 32 + sdc/datatypes/indexes/empty_index_type.py | 59 + sdc/datatypes/indexes/int64_index_type.py | 65 + .../indexes/positional_index_type.py | 62 + .../{ => indexes}/range_index_type.py | 0 sdc/extensions/indexes/empty_index_ext.py | 135 +++ sdc/extensions/indexes/indexes_generic.py | 282 +++++ sdc/extensions/indexes/int64_index_ext.py | 560 +++++++++ .../indexes/positional_index_ext.py | 474 ++++++++ sdc/extensions/indexes/range_index_ext.py | 249 +++- sdc/functions/numpy_like.py | 203 +++- sdc/hiframes/api.py | 43 +- sdc/hiframes/boxing.py | 69 +- sdc/hiframes/pd_series_ext.py | 3 +- sdc/hiframes/pd_series_type.py | 3 +- sdc/rewrites/dataframe_constructor.py | 40 +- sdc/sdc_autogenerated.py | 1078 ++++++----------- sdc/sdc_function_templates.py | 159 +-- sdc/tests/__init__.py | 2 +- sdc/tests/indexes/__init__.py | 31 + sdc/tests/indexes/index_datagens.py | 130 ++ sdc/tests/indexes/test_empty_index.py | 131 ++ sdc/tests/indexes/test_indexes.py | 375 ++++++ sdc/tests/indexes/test_int64_index.py | 550 +++++++++ sdc/tests/indexes/test_positional_index.py | 575 +++++++++ .../test_range_index.py} | 502 +++----- sdc/tests/test_dataframe.py | 28 +- sdc/tests/test_date.py | 4 +- sdc/tests/test_series.py | 77 +- sdc/tests/test_series_ops.py | 16 + sdc/utilities/sdc_typing_utils.py | 63 +- 37 files changed, 5077 insertions(+), 1936 deletions(-) create mode 100644 sdc/datatypes/indexes/__init__.py create mode 100644 sdc/datatypes/indexes/empty_index_type.py create mode 100644 sdc/datatypes/indexes/int64_index_type.py create mode 100644 sdc/datatypes/indexes/positional_index_type.py rename sdc/datatypes/{ => indexes}/range_index_type.py (100%) create mode 100644 
sdc/extensions/indexes/empty_index_ext.py create mode 100644 sdc/extensions/indexes/indexes_generic.py create mode 100644 sdc/extensions/indexes/int64_index_ext.py create mode 100644 sdc/extensions/indexes/positional_index_ext.py create mode 100644 sdc/tests/indexes/__init__.py create mode 100644 sdc/tests/indexes/index_datagens.py create mode 100644 sdc/tests/indexes/test_empty_index.py create mode 100644 sdc/tests/indexes/test_indexes.py create mode 100644 sdc/tests/indexes/test_int64_index.py create mode 100644 sdc/tests/indexes/test_positional_index.py rename sdc/tests/{test_indexes.py => indexes/test_range_index.py} (60%) diff --git a/sdc/__init__.py b/sdc/__init__.py index 2a514b70a..b752b3c34 100644 --- a/sdc/__init__.py +++ b/sdc/__init__.py @@ -48,6 +48,7 @@ import sdc.datatypes.series.init import sdc.extensions.indexes.range_index_ext +import sdc.extensions.indexes.int64_index_ext from ._version import get_versions diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index bffdc5b30..8dbd6701a 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -41,26 +41,27 @@ from numba.extending import register_jitable from numba.np import numpy_support from numba.typed import Dict +from numba.typed.typedobjectutils import _nonoptional import sdc +from sdc.datatypes.indexes import * from sdc.hiframes.api import isna from sdc.hiframes.pd_series_type import SeriesType from sdc.functions import numpy_like from sdc.str_arr_type import string_array_type, StringArrayType -from sdc.datatypes.range_index_type import RangeIndexType from sdc.str_arr_ext import (num_total_chars, append_string_array_to, str_arr_is_na, pre_alloc_string_array, str_arr_set_na, string_array_type, cp_str_list_to_array, create_str_arr_from_list, get_utf8_size, str_arr_set_na_by_mask) from sdc.utilities.prange_utils import parallel_chunks from sdc.utilities.utils import sdc_overload, sdc_register_jitable -from 
sdc.utilities.sdc_typing_utils import (find_common_dtype_from_numpy_dtypes, - TypeChecker) - - -class SDCLimitation(Exception): - """Exception to be raised in case of SDC limitation""" - pass +from sdc.utilities.sdc_typing_utils import ( + find_common_dtype_from_numpy_dtypes, + TypeChecker, + sdc_pandas_index_types, + sdc_pandas_df_column_types, + sdc_old_index_types, + ) def hpat_arrays_append(A, B): @@ -71,20 +72,31 @@ def hpat_arrays_append(A, B): def hpat_arrays_append_overload(A, B): """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A""" - A_is_range_index = isinstance(A, RangeIndexType) - B_is_range_index = isinstance(B, RangeIndexType) - if isinstance(A, (types.Array, RangeIndexType)): - if isinstance(B, (types.Array, RangeIndexType)): + if not isinstance(A, sdc_pandas_df_column_types): + return None + + # this function should work with arrays, not indexes, but until all indexes support + # common API (e.g. append is not supported for types.Array indexes) it is simplier to support + # indexes here rather than branch depending on index types on call site + # TO-DO: clean-up when Float64Index and StringArrayIndex are supported + # if not (isinstance(B, sdc_pandas_df_column_types) or isinstance(B.dtype, sdc_pandas_df_column_types)): + # return None + valid_num_single_B_dtype = (types.Array, ) + sdc_pandas_index_types + valid_num_seq_B_dtypes = (types.Array, ) + sdc_pandas_index_types + + if isinstance(A, types.Array): + if isinstance(B, valid_num_single_B_dtype): + convert_B = not isinstance(B, types.Array) def _append_single_numeric_impl(A, B): - _A = A.values if A_is_range_index == True else A # noqa - _B = B.values if B_is_range_index == True else B # noqa - return numpy.concatenate((_A, _B,)) + _B = B if convert_B == False else B.values + return numpy.concatenate((A, _B,)) return _append_single_numeric_impl - elif isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, (types.Array, RangeIndexType)): - 
B_dtype_is_range_index = isinstance(B.dtype, RangeIndexType) + + elif (isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, valid_num_seq_B_dtypes)): numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], []) + convert_B = not isinstance(B.dtype, types.Array) # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime def _append_list_numeric_impl(A, B): @@ -92,13 +104,14 @@ def _append_list_numeric_impl(A, B): new_data = numpy.empty(total_length, numba_common_dtype) stop = len(A) - _A = numpy.array(A) if A_is_range_index == True else A # noqa - new_data[:stop] = _A + new_data[:stop] = A for arr in B: - _arr = numpy.array(arr) if B_dtype_is_range_index == True else arr # noqa start = stop - stop = start + len(_arr) - new_data[start:stop] = _arr + stop = start + len(arr) + if convert_B == False: # noqa + new_data[start:stop] = arr + else: + new_data[start:stop] = arr.values return new_data return _append_list_numeric_impl @@ -209,49 +222,14 @@ def _hpat_ensure_array_capacity(new_size, arr): return res -def sdc_join_series_indexes(left, right): +def _sdc_internal_join(left, right): pass -@sdc_overload(sdc_join_series_indexes, jit_options={'parallel': False}) -def sdc_join_series_indexes_overload(left, right): - """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm""" - - # check that both operands are of types used for representing Pandas indexes - if not (isinstance(left, (types.Array, StringArrayType, RangeIndexType)) - and isinstance(right, (types.Array, StringArrayType, RangeIndexType))): - return None - - convert_left = isinstance(left, RangeIndexType) - convert_right = isinstance(right, RangeIndexType) - - def _convert_to_arrays_impl(left, right): - _left = left.values if convert_left == True else left # noqa - _right = right.values if convert_right == True else right # noqa - return sdc_join_series_indexes(_left, _right) - - if isinstance(left, 
RangeIndexType) and isinstance(right, RangeIndexType): - - def sdc_join_range_indexes_impl(left, right): - if (left is right or numpy_like.array_equal(left, right)): - joined = left.values - lidx = numpy.arange(len(joined)) - ridx = lidx - return joined, lidx, ridx - else: - return sdc_join_series_indexes(left.values, right.values) - - return sdc_join_range_indexes_impl - - elif isinstance(left, RangeIndexType) and isinstance(right, types.Array): - return _convert_to_arrays_impl - - elif isinstance(left, types.Array) and isinstance(right, RangeIndexType): - return _convert_to_arrays_impl +@sdc_overload(_sdc_internal_join, jit_options={'parallel': False}) +def _sdc_internal_join_ovld(left, right): - # TODO: remove code duplication below and merge numeric and StringArray impls into one - # needs equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays - elif isinstance(left, types.Array) and isinstance(right, types.Array): + if isinstance(left, types.Array) and isinstance(right, types.Array): numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], []) if isinstance(numba_common_dtype, types.Number): @@ -611,107 +589,6 @@ def _sdc_asarray_impl(data): return None -def _sdc_take(data, indexes): - pass - - -@sdc_overload(_sdc_take) -def _sdc_take_overload(data, indexes): - - if not isinstance(data, (types.Array, StringArrayType, RangeIndexType)): - return None - if not (isinstance(indexes, (types.Array, types.List)) - and isinstance(indexes.dtype, (types.Integer, types.ListType))): - return None - - if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List, RangeIndexType)): - arr_dtype = data.dtype - - def _sdc_take_list_impl(data, indexes): - res_size = 0 - for i in numba.prange(len(indexes)): - res_size += len(indexes[i]) - res_arr = numpy.empty(res_size, dtype=arr_dtype) - for i in numba.prange(len(indexes)): - start = 0 - for l in range(len(indexes[0:i])): - start += len(indexes[l]) - 
current_pos = start - for j in range(len(indexes[i])): - res_arr[current_pos] = data[indexes[i][j]] - current_pos += 1 - return res_arr - - return _sdc_take_list_impl - - elif isinstance(indexes.dtype, types.ListType) and data == string_array_type: - def _sdc_take_list_str_impl(data, indexes): - res_size = 0 - for i in numba.prange(len(indexes)): - res_size += len(indexes[i]) - nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) - num_total_bytes = 0 - for i in numba.prange(len(indexes)): - start = 0 - for l in range(len(indexes[0:i])): - start += len(indexes[l]) - current_pos = start - for j in range(len(indexes[i])): - num_total_bytes += get_utf8_size(data[indexes[i][j]]) - if isna(data, indexes[i][j]): - nan_mask[current_pos] = True - current_pos += 1 - res_arr = pre_alloc_string_array(res_size, num_total_bytes) - for i in numba.prange(len(indexes)): - start = 0 - for l in range(len(indexes[0:i])): - start += len(indexes[l]) - current_pos = start - for j in range(len(indexes[i])): - res_arr[current_pos] = data[indexes[i][j]] - if nan_mask[current_pos]: - str_arr_set_na(res_arr, current_pos) - current_pos += 1 - - return res_arr - - return _sdc_take_list_str_impl - - elif isinstance(data, (types.Array, RangeIndexType)): - arr_dtype = data.dtype - - def _sdc_take_array_impl(data, indexes): - res_size = len(indexes) - res_arr = numpy.empty(res_size, dtype=arr_dtype) - for i in numba.prange(res_size): - res_arr[i] = data[indexes[i]] - return res_arr - - return _sdc_take_array_impl - - elif isinstance(data, StringArrayType): - def _sdc_take_str_arr_impl(data, indexes): - res_size = len(indexes) - nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) - num_total_bytes = 0 - for i in numba.prange(res_size): - num_total_bytes += get_utf8_size(data[indexes[i]]) - if isna(data, indexes[i]): - nan_mask[i] = True - - res_arr = pre_alloc_string_array(res_size, num_total_bytes) - for i in numpy.arange(res_size): - res_arr[i] = data[indexes[i]] - if nan_mask[i]: - 
str_arr_set_na(res_arr, i) - - return res_arr - - return _sdc_take_str_arr_impl - - return None - - def _almost_equal(x, y): """Check if floats are almost equal based on the float epsilon""" pass @@ -735,62 +612,91 @@ def sdc_reindex_series(arr, index, name, by_index): pass +# TO-DO: support Series.reindex() that should replace this function @sdc_overload(sdc_reindex_series) def sdc_reindex_series_overload(arr, index, name, by_index): """ Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """ - range_indexes = isinstance(index, RangeIndexType) and isinstance(by_index, RangeIndexType) + range_indexes = (isinstance(index, (PositionalIndexType, RangeIndexType)) + and isinstance(by_index, (PositionalIndexType, RangeIndexType))) data_dtype, index_dtype = arr.dtype, index.dtype data_is_str_arr = isinstance(arr.dtype, types.UnicodeType) - def sdc_reindex_series_impl(arr, index, name, by_index): - - # no reindexing is needed if indexes are equal - if range_indexes == True: # noqa - equal_indexes = numpy_like.array_equal(index, by_index) - else: - equal_indexes = False - if (index is by_index or equal_indexes): - return pandas.Series(data=arr, index=by_index, name=name) + # use old implementation if old indexes types are used + if (isinstance(index, sdc_old_index_types) or isinstance(by_index, sdc_old_index_types)): - if data_is_str_arr == True: # noqa - _res_data = [''] * len(by_index) - res_data_nan_mask = numpy.zeros(len(by_index), dtype=types.bool_) - else: - _res_data = numpy.empty(len(by_index), dtype=data_dtype) + def sdc_reindex_series_old_impl(arr, index, name, by_index): - # build a dict of self.index values to their positions: - map_index_to_position = Dict.empty( - key_type=index_dtype, - value_type=types.int32 - ) + # no reindexing is needed if indexes are equal, but only check if it's fast + if range_indexes == True: # noqa + equal_indexes = index.equals(by_index) + else: + equal_indexes = False + if (index is 
by_index or equal_indexes): + return pandas.Series(data=arr, index=by_index, name=name) - for i, value in enumerate(index): - if value in map_index_to_position: - raise ValueError("cannot reindex from a duplicate axis") + if data_is_str_arr == True: # noqa + _res_data = [''] * len(by_index) + res_data_nan_mask = numpy.zeros(len(by_index), dtype=types.bool_) else: - map_index_to_position[value] = i - - index_mismatch = 0 - # FIXME: TypingError in parfor step (wrong promotion to float64?) if prange is used - for i in numpy.arange(len(by_index)): - if by_index[i] in map_index_to_position: - pos_in_self = map_index_to_position[by_index[i]] - _res_data[i] = arr[pos_in_self] - if data_is_str_arr == True: # noqa - res_data_nan_mask[i] = isna(arr, i) + _res_data = numpy.empty(len(by_index), dtype=data_dtype) + + # build a dict of self.index values to their positions: + map_index_to_position = Dict.empty( + key_type=index_dtype, + value_type=types.int32 + ) + + for i, value in enumerate(index): + if value in map_index_to_position: + raise ValueError("cannot reindex from a duplicate axis") + else: + map_index_to_position[value] = i + + index_mismatch = 0 + for i in numba.prange(len(by_index)): + val = by_index[i] + if val in map_index_to_position: + pos_in_self = map_index_to_position[val] + _res_data[i] = arr[pos_in_self] + if data_is_str_arr == True: # noqa + res_data_nan_mask[i] = isna(arr, i) + else: + index_mismatch += 1 + if index_mismatch: + msg = "Unalignable boolean Series provided as indexer " + \ + "(index of the boolean Series and of the indexed object do not match)." + raise IndexingError(msg) + + if data_is_str_arr == True: # noqa + res_data = create_str_arr_from_list(_res_data) + str_arr_set_na_by_mask(res_data, res_data_nan_mask) else: - index_mismatch += 1 - if index_mismatch: - msg = "Unalignable boolean Series provided as indexer " + \ - "(index of the boolean Series and of the indexed object do not match)." 
- raise IndexingError(msg) - - if data_is_str_arr == True: # noqa - res_data = create_str_arr_from_list(_res_data) - str_arr_set_na_by_mask(res_data, res_data_nan_mask) + res_data = _res_data + + return pandas.Series(data=res_data, index=by_index, name=name) + + return sdc_reindex_series_old_impl + + def sdc_reindex_series_impl(arr, index, name, by_index): + + _, new_order = index.reindex(by_index) + if new_order is not None: + new_order_as_array = _nonoptional(new_order) + index_mismatch = 0 + for i in numba.prange(len(by_index)): + if new_order_as_array[i] == -1: + index_mismatch += 1 + + if index_mismatch: + # TO-DO: seems it covers only specific series reindex case, generalize? + msg = "Unalignable boolean Series provided as indexer " + \ + "(index of the boolean Series and of the indexed object do not match)." + raise IndexingError(msg) + + res_data = numpy_like.take(arr, new_order_as_array) else: - res_data = _res_data + res_data = arr return pandas.Series(data=res_data, index=by_index, name=name) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 31f3738d9..b955839a3 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -41,15 +41,18 @@ from numba import literally from numba.typed import List, Dict from numba.core.errors import TypingError +from numba.core.registry import cpu_target from pandas.core.indexing import IndexingError +from sdc.datatypes.indexes import * from sdc.hiframes.pd_dataframe_ext import DataFrameType from sdc.hiframes.pd_series_type import SeriesType from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, kwsparams2list, gen_impl_generator, find_common_dtype_from_numpy_dtypes) from sdc.str_arr_ext import StringArrayType -from sdc.datatypes.range_index_type import RangeIndexType + +from sdc.extensions.indexes.empty_index_ext import init_empty_index from 
sdc.hiframes.pd_dataframe_type import DataFrameType from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps @@ -57,7 +60,7 @@ from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType, dataframe_getitem_accessor_init) -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation from sdc.datatypes.hpat_pandas_dataframe_rolling_types import _hpat_pandas_df_rolling_init from sdc.datatypes.hpat_pandas_rolling_types import ( gen_sdc_pandas_rolling_overload_body, sdc_pandas_rolling_docstring_tmpl) @@ -66,7 +69,10 @@ from sdc.utilities.utils import sdc_overload, sdc_overload_method, sdc_overload_attribute from sdc.hiframes.api import isna from sdc.functions.numpy_like import getitem_by_mask, find_idx -from sdc.datatypes.common_functions import _sdc_take, sdc_reindex_series +from sdc.functions.numpy_like import take as nplike_take +from sdc.datatypes.common_functions import (sdc_reindex_series, + fill_array, + fill_str_array,) from sdc.utilities.prange_utils import parallel_chunks @@ -99,22 +105,10 @@ def hpat_pandas_dataframe_index(df): ty_checker = TypeChecker('Attribute index.') ty_checker.check(df, DataFrameType) - if isinstance(df.index, types.NoneType): - empty_df = not df.columns - - def hpat_pandas_df_index_none_impl(df): - if empty_df == True: # noqa - return numpy.arange(0) - else: - return pandas.RangeIndex(len(df)) - - return hpat_pandas_df_index_none_impl - else: - - def hpat_pandas_df_index_impl(df): - return df._index + def hpat_pandas_df_index_impl(df): + return df._index - return hpat_pandas_df_index_impl + return hpat_pandas_df_index_impl @sdc_overload_attribute(DataFrameType, 'columns') @@ -174,7 +168,7 @@ def sdc_pandas_dataframe_values_impl(self): columns_num = len(self.columns) func_lines = [ f'def sdc_pandas_dataframe_values_impl(self):', - f' length = {df_length_expr(self)}', + f' length = len(self._index)', ] for i, col in 
enumerate(self.columns): col_loc = self.column_loc[col] @@ -311,11 +305,11 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integ other_type_id, other_col_id = other_col_loc.type_id, other_col_loc.col_id func_text.append(f'new_col_{idx}_data_other = ' f'other._data[{other_type_id}][{other_col_id}]') - s1 = f'init_series(new_col_{idx}_data_df)' - s2 = f'init_series(new_col_{idx}_data_other)' + s1 = f'pandas.Series(new_col_{idx}_data_df)' + s2 = f'pandas.Series(new_col_{idx}_data_other)' func_text.append(f'new_col_{idx} = {s1}.append({s2})._data') else: - func_text.append(f'new_col_{idx}_data = init_series(new_col_{idx}_data_df)._data') + func_text.append(f'new_col_{idx}_data = pandas.Series(new_col_{idx}_data_df)._data') if col_name in string_type_columns: func_text.append(f'new_col_{idx} = fill_str_array(new_col_{idx}_data, len_df+len_other)') else: @@ -327,7 +321,7 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integ other_col_loc = other.column_loc[col_name] other_type_id, other_col_id = other_col_loc.type_id, other_col_loc.col_id func_text.append(f'new_col_{idx}_data_other = other._data[{other_type_id}][{other_col_id}]') - func_text.append(f'new_col_{idx}_data = init_series(new_col_{idx}_data_other)._data') + func_text.append(f'new_col_{idx}_data = pandas.Series(new_col_{idx}_data_other)._data') if col_name in string_type_columns: func_text.append( f'new_col_{idx}_other = ' @@ -346,17 +340,15 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integ func_text.append(f'raise SDCLimitation("Indexes of dataframes are expected to have comparable ' f'(both Numeric or String) types if parameter ignore_index is set to False.")') else: - func_text += [f'joined_index = hpat_arrays_append(df.index, other.index)\n', + func_text += [f'joined_index = df._index.append(other._index)\n', f'return pandas.DataFrame({{{data}}}, index=joined_index)\n'] func_definition.extend([indent + func_line 
for func_line in func_text]) func_def = '\n'.join(func_definition) global_vars = {'pandas': pandas, - 'init_series': sdc.hiframes.api.init_series, - 'fill_array': sdc.datatypes.common_functions.fill_array, - 'fill_str_array': sdc.datatypes.common_functions.fill_str_array, - 'hpat_arrays_append': sdc.datatypes.common_functions.hpat_arrays_append, + 'fill_array': fill_array, + 'fill_str_array': fill_str_array, 'SDCLimitation': SDCLimitation} return func_def, global_vars @@ -419,9 +411,7 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=False, verify_integrity= if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)): ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index') - none_or_numeric_indexes = ((isinstance(df.index, types.NoneType) or isinstance(df.index, types.Number)) and - (isinstance(other.index, types.NoneType) or isinstance(other.index, types.Number))) - indexes_comparable = check_types_comparable(df.index, other.index) or none_or_numeric_indexes + indexes_comparable = check_types_comparable(df.index, other.index) if isinstance(ignore_index, types.Literal): ignore_index = ignore_index.literal_value @@ -509,7 +499,7 @@ def _df_head_impl(df, n=5): results = [] joined = ', '.join(func_params) func_lines = [f'def _df_{func_name}_impl(df, {joined}):'] - ind = df_index_codegen_head(df) + ind = 'index=df._index[:n]' for i, c in enumerate(df.columns): col_loc = df.column_loc[c] type_id, col_id = col_loc.type_id, col_loc.col_id @@ -541,14 +531,6 @@ def sdc_pandas_dataframe_head_codegen(df, func_name, params, ser_params): return _reduce_impl -def df_index_codegen_head(self): - # TODO: Rewrite when DF constructor will be fixed with index=None - if isinstance(self.index, types.NoneType): - return '' - - return 'index=df._index[:n]' - - @sdc_overload_method(DataFrameType, 'head') def head_overload(df, n=5): """ @@ -1489,6 +1471,7 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None, else: # this works because 
global tuple of strings is captured as Tuple of StringLiterals columns_as_tuple = tuple(columns.initial_value) + def _sdc_pandas_dataframe_drop_wrapper_impl(df, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors="raise"): @@ -1535,23 +1518,6 @@ def sdc_pandas_dataframe_drop_impl(df, args, columns): return sdc_pandas_dataframe_drop_impl(df, args, columns) -def df_length_expr(self): - """Generate expression to get length of DF""" - if self.columns: - return 'len(self._data[0][0])' - - return '0' - - -def df_index_expr(self, length_expr=None): - """Generate expression to get or create index of DF""" - if isinstance(self.index, types.NoneType): - if length_expr is None: - length_expr = df_length_expr(self) - - return 'self._index' - - def df_getitem_slice_idx_main_codelines(self, idx): """Generate main code lines for df.getitem with idx of slice""" @@ -1570,17 +1536,14 @@ def df_getitem_slice_idx_main_codelines(self, idx): all_lists_joined = ', '.join([f'list_{i}' for i in range(n_lists)]) + ', ' res_data = f'({all_lists_joined})' if n_lists > 0 else '()' func_lines += [ - f' if self_index_is_none == True:', - f' old_index = pandas.RangeIndex(len(self))', - f' else:', - f' old_index = self._index', f' res_data = {res_data}', - f' res_index = old_index[idx]', + f' res_index = self._index[idx]', f' return init_dataframe_internal(res_data, res_index, df_type)' ] return func_lines + def df_getitem_tuple_idx_main_codelines(self, literal_idx): """Generate main code lines for df.getitem with idx of tuple""" results = [] @@ -1604,17 +1567,15 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx): def df_getitem_bool_series_idx_main_codelines(self, idx): """Generate main code lines for df.getitem""" - length_expr = df_length_expr(self) # optimization for default indexes in df and idx when index alignment is trivial - if (isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType)): - func_lines = [f' length = 
{length_expr}', - f' self_index = self.index', - f' if length > len(idx):', + if (isinstance(self.index, PositionalIndexType) and isinstance(idx.index, PositionalIndexType)): + func_lines = [f' self_index = self._index', + f' if len(self_index) > len(idx):', f' msg = "Unalignable boolean Series provided as indexer " + \\', f' "(index of the boolean Series and of the indexed object do not match)."', f' raise IndexingError(msg)', - f' # do not trim idx._data to length as getitem_by_mask handles such case', + f' # do not trim idx._data to df length as getitem_by_mask handles such case', f' res_index = getitem_by_mask(self_index, idx._data)', f' # df index is default, same as positions so it can be used in take'] results = [] @@ -1634,11 +1595,11 @@ def df_getitem_bool_series_idx_main_codelines(self, idx): ] else: func_lines = [ - f' length = {length_expr}', - f' self_index = self.index', - f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)', - f' res_index = getitem_by_mask(self_index, reindexed_idx._data)', - f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)' + f' self_index = self._index', + f' idx_reindexed_by_self = sdc_reindex_series(idx._data, idx._index, idx._name, self_index)', + f' final_mask = idx_reindexed_by_self._data', + f' res_index = self_index[final_mask]', + f' selected_pos = getitem_by_mask(numpy.arange(len(self_index)), final_mask)' ] results = [] for i, col in enumerate(self.columns): @@ -1662,12 +1623,14 @@ def df_getitem_bool_series_idx_main_codelines(self, idx): def df_getitem_bool_array_idx_main_codelines(self, idx): """Generate main code lines for df.getitem""" - func_lines = [f' length = {df_length_expr(self)}', + has_positional_index = isinstance(idx, PositionalIndexType) + res_index_expr = 'taken_pos' if has_positional_index else 'self._index.take(taken_pos)' + func_lines = [f' length = len(self._index)', f' if length != len(idx):', f' raise ValueError("Item wrong length.")', - f' 
self_index = self.index', - f' taken_pos = getitem_by_mask(self_index, idx)', - f' res_index = sdc_take(self_index, taken_pos)'] + f' taken_pos = getitem_by_mask(numpy.arange(length), idx)', + f' res_index = {res_index_expr}' + ] results = [] for i, col in enumerate(self.columns): col_loc = self.column_loc[col] @@ -1699,12 +1662,8 @@ def _df_getitem_slice_idx_impl(self, idx): list_0 = self._data[0].copy() for i, item in enumerate(list_0): list_0[i] = item[idx] - if self_index_is_none == True: - old_index = pandas.RangeIndex(len(self)) - else: - old_index = self._index res_data = (list_0, ) - res_index = old_index[idx] + res_index = self._index[idx] return init_dataframe_internal(res_data, res_index, df_type) """ func_lines = ['def _df_getitem_slice_idx_impl(self, idx):'] @@ -1715,16 +1674,20 @@ def _df_getitem_slice_idx_impl(self, idx): func_lines += df_getitem_key_error_codelines() func_text = '\n'.join(func_lines) - # TO-DO: need DefaultIndex to handle self.index[idx] construct inside func - self_index_is_none = isinstance(self.index, types.NoneType) - new_index_type = RangeIndexType(False) if self_index_is_none else self.index + # since we need to know result df type to call init_dataframe_internal + # deduce the resulting df index type + index_getitem_sig = cpu_target.typing_context.resolve_function_type( + operator.getitem, + (self.index, idx), + {} + ) + new_index_type = index_getitem_sig.return_type df_type = DataFrameType(self.data, new_index_type, self.columns, column_loc=self.column_loc) global_vars = {'pandas': pandas, 'numpy': numpy, 'df_type': df_type, - 'init_dataframe_internal': init_dataframe_internal, - 'self_index_is_none': self_index_is_none} + 'init_dataframe_internal': init_dataframe_internal} return func_text, global_vars @@ -1732,13 +1695,13 @@ def _df_getitem_slice_idx_impl(self, idx): def df_getitem_tuple_idx_codegen(self, idx): """ Example of generated implementation with provided index: - def _df_getitem_tuple_idx_impl(self, idx) - 
res_index = self._index - data_1 = self._data[1] - res_data_1 = pandas.Series(data_1, index=res_index, name="B") - data_2 = self._data[2] + def _df_getitem_tuple_idx_impl(self, idx): + res_index = self.index + data_0 = self._data[0][0] + res_data_0 = pandas.Series(data_0, index=res_index, name="A") + data_2 = self._data[0][2] res_data_2 = pandas.Series(data_2, index=res_index, name="C") - return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index) + return pandas.DataFrame({"A": res_data_0, "C": res_data_2}, index=res_index) """ func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):'] literal_idx = {col.literal_value for col in idx} @@ -1760,27 +1723,28 @@ def df_getitem_bool_series_idx_codegen(self, idx): """ Example of generated implementation with provided index: def _df_getitem_bool_series_idx_impl(self, idx): - length = len(self._data[0][0]) - self_index = range(len(self._data[0][0])) - if length > len(idx): + self_index = self._index + if len(self_index) > len(idx): msg = "Unalignable boolean Series provided as indexer " + \ "(index of the boolean Series and of the indexed object do not match)." 
raise IndexingError(msg) - # do not trim idx._data to length as getitem_by_mask handles such case + # do not trim idx._data to df length as getitem_by_mask handles such case res_index = getitem_by_mask(self_index, idx._data) # df index is default, same as positions so it can be used in take data_0 = self._data[0][0] res_data_0 = sdc_take(data_0, res_index) - data_1 = self._data[1][0] + data_1 = self._data[0][1] res_data_1 = sdc_take(data_1, res_index) - return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index) + data_2 = self._data[0][2] + res_data_2 = sdc_take(data_2, res_index) + return pandas.DataFrame({"A": res_data_0, "B": res_data_1, "C": res_data_2}, index=res_index) """ func_lines = ['def _df_getitem_bool_series_idx_impl(self, idx):'] func_lines += df_getitem_bool_series_idx_main_codelines(self, idx) func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, 'numpy': numpy, 'getitem_by_mask': getitem_by_mask, - 'sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'sdc_reindex_series': sdc_reindex_series, 'IndexingError': IndexingError} @@ -1791,24 +1755,25 @@ def df_getitem_bool_array_idx_codegen(self, idx): """ Example of generated implementation with provided index: def _df_getitem_bool_array_idx_impl(self, idx): - length = len(self._data[0][0]) + length = len(self._index) if length != len(idx): raise ValueError("Item wrong length.") - self_index = range(len(self._data[0][0])) - taken_pos = getitem_by_mask(self_index, idx) - res_index = sdc_take(self_index, taken_pos) + taken_pos = getitem_by_mask(numpy.arange(length), idx) + res_index = self._index.take(taken_pos) data_0 = self._data[0][0] res_data_0 = sdc_take(data_0, taken_pos) data_1 = self._data[1][0] res_data_1 = sdc_take(data_1, taken_pos) - return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index) + data_2 = self._data[2][0] + res_data_2 = sdc_take(data_2, taken_pos) + return pandas.DataFrame({"A": res_data_0, "B": res_data_1, "C": res_data_2}, 
index=res_index) """ func_lines = ['def _df_getitem_bool_array_idx_impl(self, idx):'] func_lines += df_getitem_bool_array_idx_main_codelines(self, idx) func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, 'numpy': numpy, 'getitem_by_mask': getitem_by_mask, - 'sdc_take': _sdc_take} + 'sdc_take': nplike_take} return func_text, global_vars @@ -1972,20 +1937,8 @@ def _df_getitem_unicode_idx_impl(self, idx): return gen_df_getitem_slice_idx_impl(self, idx) if isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean): - self_index_is_none = isinstance(self.index, types.NoneType) - idx_index_is_none = isinstance(idx.index, types.NoneType) - - if self_index_is_none and not idx_index_is_none: - if not check_index_is_numeric(idx): - ty_checker.raise_exc(idx.index.dtype, 'number', 'idx.index.dtype') - - if not self_index_is_none and idx_index_is_none: - if not check_index_is_numeric(self): - ty_checker.raise_exc(idx.index.dtype, self.index.dtype, 'idx.index.dtype') - - if not self_index_is_none and not idx_index_is_none: - if not check_types_comparable(self.index, idx.index): - ty_checker.raise_exc(idx.index.dtype, self.index.dtype, 'idx.index.dtype') + if not check_types_comparable(self.index, idx.index): + ty_checker.raise_exc(idx.index.dtype, self.index.dtype, 'idx.index.dtype') return gen_df_getitem_bool_series_idx_impl(self, idx) @@ -2001,10 +1954,10 @@ def df_getitem_tuple_at_codegen(self, row, col): """ Example of generated implementation: def _df_getitem_tuple_at_impl(self, idx): - row, _ = idx - data = self._dataframe._data[1][0] - res_data = pandas.Series(data, index=self._dataframe.index) - return res_data.at[row] + row, _ = idx + data = self._dataframe._data[2][0] + res_data = pandas.Series(data, index=self._dataframe.index) + return res_data.at[row] """ func_lines = ['def _df_getitem_tuple_at_impl(self, idx):', ' row, _ = idx'] @@ -2032,23 +1985,25 @@ def df_getitem_single_label_loc_codegen(self, idx): """ Example of generated 
implementation: def _df_getitem_single_label_loc_impl(self, idx): - idx_list = find_idx(self._dataframe._index, idx) - data_0 = _sdc_take(self._dataframe._data[0][0], idx_list) - res_data_0 = pandas.Series(data_0) - data_1 = _sdc_take(self._dataframe._data[1][0], idx_list) - res_data_1 = pandas.Series(data_1) - if len(idx_list) < 1: - raise KeyError('Index is not in the DataFrame') - new_index = _sdc_take(self._dataframe._index, idx_list) - return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=new_index) - """ - if isinstance(self.index, types.NoneType): + idx_list = find_idx(self._dataframe._index, idx) + data_0 = sdc_take(self._dataframe._data[0][0], idx_list) + res_data_0 = pandas.Series(data_0) + data_1 = sdc_take(self._dataframe._data[1][0], idx_list) + res_data_1 = pandas.Series(data_1) + data_2 = sdc_take(self._dataframe._data[0][1], idx_list) + res_data_2 = pandas.Series(data_2) + if len(idx_list) < 1: + raise KeyError('Index is not in the DataFrame') + new_index = self._dataframe._index.take(idx_list) + return pandas.DataFrame({"A": res_data_0, "B": res_data_1, "C": res_data_2}, index=new_index) + """ + if isinstance(self.index, PositionalIndexType): fill_list = [' idx_list = numpy.array([idx])'] new_index = [' new_index = numpy.array([idx])'] else: fill_list = [' idx_list = find_idx(self._dataframe._index, idx)'] - new_index = [' new_index = _sdc_take(self._dataframe._index, idx_list)'] + new_index = [' new_index = self._dataframe._index.take(idx_list)'] fill_list_text = '\n'.join(fill_list) new_index_text = '\n'.join(new_index) @@ -2060,7 +2015,7 @@ def _df_getitem_single_label_loc_impl(self, idx): type_id, col_id = col_loc.type_id, col_loc.col_id data = f'data_{i}' res_data = f'res_data_{i}' - func_lines += [f' {data} = _sdc_take(self._dataframe._data[{type_id}][{col_id}], idx_list)', + func_lines += [f' {data} = sdc_take(self._dataframe._data[{type_id}][{col_id}], idx_list)', f' {res_data} = pandas.Series({data})'] results.append((c, 
res_data)) @@ -2074,7 +2029,7 @@ def _df_getitem_single_label_loc_impl(self, idx): func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, 'numpy': numpy, 'numba': numba, - '_sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'find_idx': find_idx, 'KeyError': KeyError} @@ -2085,20 +2040,22 @@ def df_getitem_int_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_int_iloc_impl(self, idx): - if -1 < idx < len(self._dataframe.index): - data_0 = pandas.Series(self._dataframe._data[0][0]) - result_0 = data_0.iat[idx] - data_1 = pandas.Series(self._dataframe._data[0][1]) - result_1 = data_1.iat[idx] - return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx)) - raise IndexingError('Index is out of bounds for axis') + if -1 < idx < len(self._dataframe.index): + data_0 = pandas.Series(self._dataframe._data[0][0]) + result_0 = data_0.iat[idx] + data_1 = pandas.Series(self._dataframe._data[0][1]) + result_1 = data_1.iat[idx] + data_2 = pandas.Series(self._dataframe._data[1][0]) + result_2 = data_2.iat[idx] + return pandas.Series(data=[result_0, result_1, result_2], index=['A', 'B', 'C'], name=str(idx)) + raise IndexingError('Index is out of bounds for axis') """ func_lines = ['def _df_getitem_int_iloc_impl(self, idx):', ' if -1 < idx < len(self._dataframe.index):'] results = [] index = [] name = 'self._dataframe._index[idx]' - if isinstance(self.index, types.NoneType): + if isinstance(self.index, PositionalIndexType): name = 'idx' for i, c in enumerate(self.columns): col_loc = self.column_loc[c] @@ -2122,11 +2079,13 @@ def df_getitem_slice_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_slice_iloc_impl(self, idx): - data_0 = pandas.Series(self._dataframe._data[0][0]) - result_0 = data_0.iloc[idx] - data_1 = pandas.Series(self._dataframe._data[1][0]) - result_1 = data_1.iloc[idx] - return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=self._dataframe.index[idx]) + 
data_0 = pandas.Series(self._dataframe._data[0][0]) + result_0 = data_0.iloc[idx] + data_1 = pandas.Series(self._dataframe._data[0][1]) + result_1 = data_1.iloc[idx] + data_2 = pandas.Series(self._dataframe._data[1][0]) + result_2 = data_2.iloc[idx] + return pandas.DataFrame(data={"A": result_0, "B": result_1, "C": result_2}, index=self._dataframe.index[idx]) """ func_lines = ['def _df_getitem_slice_iloc_impl(self, idx):'] results = [] @@ -2150,17 +2109,19 @@ def df_getitem_list_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_list_iloc_impl(self, idx): - check_idx = False - for i in idx: - if -1 < i < len(self._dataframe.index): - check_idx = True - if check_idx == True: - data_0 = pandas.Series(self._dataframe._data[0][0]) - result_0 = data_0.iloc[numpy.array(idx)] - data_1 = pandas.Series(self._dataframe._data[1][0]) - result_1 = data_1.iloc[numpy.array(idx)] - return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=idx) - raise IndexingError('Index is out of bounds for axis') + check_idx = False + for i in idx: + if -1 < i < len(self._dataframe.index): + check_idx = True + if check_idx == True: + data_0 = pandas.Series(self._dataframe._data[0][0]) + result_0 = data_0.iloc[numpy.array(idx)] + data_1 = pandas.Series(self._dataframe._data[0][1]) + result_1 = data_1.iloc[numpy.array(idx)] + data_2 = pandas.Series(self._dataframe._data[1][0]) + result_2 = data_2.iloc[numpy.array(idx)] + return pandas.DataFrame(data={"A": result_0, "B": result_1, "C": result_2}, index=idx) + raise IndexingError('Index is out of bounds for axis') """ func_lines = ['def _df_getitem_list_iloc_impl(self, idx):', ' check_idx = False', @@ -2170,7 +2131,7 @@ def _df_getitem_list_iloc_impl(self, idx): ' if check_idx == True:'] results = [] index = '[self._dataframe._index[i] for i in idx]' - if isinstance(self.index, types.NoneType): + if isinstance(self.index, PositionalIndexType): index = 'idx' for i, c in enumerate(self.columns): col_loc = 
self.column_loc[c] @@ -2193,14 +2154,15 @@ def df_getitem_list_bool_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_list_bool_iloc_impl(self, idx): - if len(self._dataframe.index) == len(idx): - data_0 = self._dataframe._data[0][0] - result_0 = pandas.Series(data_0[numpy.array(idx)]) - data_1 = self._dataframe._data[1][0] - result_1 = pandas.Series(data_1[numpy.array(idx)]) - return pandas.DataFrame(data={"A": result_0, "B": result_1}, - index=self._dataframe.index[numpy.array(idx)]) - raise IndexingError('Item wrong length') + if len(self._dataframe.index) == len(idx): + data_0 = self._dataframe._data[0][0] + result_0 = pandas.Series(data_0[numpy.array(idx)]) + data_1 = self._dataframe._data[0][1] + result_1 = pandas.Series(data_1[numpy.array(idx)]) + data_2 = self._dataframe._data[1][0] + result_2 = pandas.Series(data_2[numpy.array(idx)]) + return pandas.DataFrame(data={"A": result_0, "B": result_1, "C": result_2}, index=self._dataframe.index[numpy.array(idx)]) + raise IndexingError('Item wrong length') """ func_lines = ['def _df_getitem_list_bool_iloc_impl(self, idx):'] results = [] @@ -2257,7 +2219,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx): if accessor == 'at': num_idx = (isinstance(idx[0], types.Number) - and isinstance(self.dataframe.index, (types.Array, types.NoneType, RangeIndexType))) + and isinstance(self.dataframe.index, (PositionalIndexType, RangeIndexType, Int64IndexType))) str_idx = (isinstance(idx[0], (types.UnicodeType, types.StringLiteral)) and isinstance(self.dataframe.index, StringArrayType)) if isinstance(idx, types.Tuple) and isinstance(idx[1], types.StringLiteral): @@ -2317,6 +2279,7 @@ def df_getitem_iat_tuple_impl(self, idx): return gen_df_getitem_iloc_int_impl(self.dataframe, idx) if isinstance(idx, (types.Tuple, types.UniTuple)): + def df_getitem_tuple_iat_impl(self, idx): return self._dataframe.iat[idx] @@ -2603,14 +2566,6 @@ def pct_change_overload(df, periods=1, fill_method='pad', 
limit=None, freq=None) return sdc_pandas_dataframe_apply_columns(df, name, params, ser_par) -def df_index_codegen_isin(df_type, df, data): - if isinstance(df_type.index, types.NoneType): - func_lines = [f' return pandas.DataFrame({{{data}}})'] - else: - func_lines = [f' return pandas.DataFrame({{{data}}}, index={df}._index)'] - return func_lines - - def sdc_pandas_dataframe_isin_dict_codegen(func_name, df_type, values, all_params): """ Example of generated implementation: @@ -2655,7 +2610,7 @@ def _df_isin_impl(df, values): ] result_name.append((result_c, c)) data = ', '.join(f'"{column_name}": {column}' for column, column_name in result_name) - func_lines += df_index_codegen_isin(df_type, df, data) + func_lines.append(f' return pandas.DataFrame({{{data}}}, index={df}._index)') func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, @@ -2718,7 +2673,7 @@ def _df_isin_impl(df, values): f' result = numpy.empty(len(series_{c}._data), numpy.bool_)', f' result_len = len(series_{c}._data)' ] - if isinstance(values.index, types.NoneType) and isinstance(df_type.index, types.NoneType): + if isinstance(values.index, PositionalIndexType) and isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' if i <= len(values._data):', @@ -2729,7 +2684,7 @@ def _df_isin_impl(df, values): f' else:', f' result[i] = False' ] - elif isinstance(values.index, types.NoneType): + elif isinstance(values.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' idx = {df}.index[i]', @@ -2744,7 +2699,7 @@ def _df_isin_impl(df, values): f' result[i] = False', f' break' ] - elif isinstance(df_type.index, types.NoneType): + elif isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' value = series_{c}._data[i]', @@ -2780,7 +2735,7 @@ def _df_isin_impl(df, values): result_name.append((result_c, c)) data = ', '.join(f'"{column_name}": {column}' for column, column_name in 
result_name) - func_lines += df_index_codegen_isin(df_type, df, data) + func_lines.append(f' return pandas.DataFrame({{{data}}}, index={df}._index)') func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, @@ -2851,7 +2806,7 @@ def _df_isin_impl(df, values): f' result = numpy.empty(len(series_{c}._data), numpy.bool_)', f' result_len = len(series_{c}._data)' ] - if isinstance(in_df.index, types.NoneType) and isinstance(df_type.index, types.NoneType): + if isinstance(df.index, PositionalIndexType) and isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' if i <= len(series_{c}_values):', @@ -2861,7 +2816,7 @@ def _df_isin_impl(df, values): f' result[i] = False', f' else:', f' result[i] = False'] - elif isinstance(df_type.index, types.NoneType): + elif isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' value = series_{c}._data[i]', @@ -2876,7 +2831,7 @@ def _df_isin_impl(df, values): f' result[i] = False', f' break', ] - elif isinstance(in_df.index, types.NoneType): + elif isinstance(df.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' idx = {df}.index[i]', @@ -2913,7 +2868,7 @@ def _df_isin_impl(df, values): func_lines += [f' {result_c} = pandas.Series(result)'] result_name.append((result_c, c)) data = ', '.join(f'"{column_name}": {column}' for column, column_name in result_name) - func_lines += df_index_codegen_isin(df_type, df, data) + func_lines.append(f' return pandas.DataFrame({{{data}}}, index={df}._index)') func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, @@ -3150,10 +3105,11 @@ def sdc_pandas_dataframe_groupby_impl(self, by=None, axis=0, level=None, as_inde def df_set_column_index_codelines(self): """Generate code lines with definition of resulting index for DF set_column""" + index_param_expr = 'self._index' if not isinstance(self.index, EmptyIndexType) else 'None' func_lines = [] if self.columns: 
func_lines += [ - f' length = {df_length_expr(self)}', + f' length = len(self._index)', f' if length == 0:', f' raise SDCLimitation("Could not set item for DataFrame with empty columns")', f' elif length != len(value):', @@ -3161,7 +3117,7 @@ def df_set_column_index_codelines(self): ] else: func_lines += [' length = len(value)'] - func_lines += [f' res_index = {df_index_expr(self, length_expr="length")}'] + func_lines += [f' res_index = {index_param_expr}'] return func_lines @@ -3177,13 +3133,14 @@ def df_add_column_codelines(self, key): res_data = f'res_data_{i}' func_lines += [ f' data_{i} = self._data[{type_id}][{col_id}]', - f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")', + # f' {res_data} = pandas.Series(data_{i}, index=series_index, name="{col}")', + f' {res_data} = data_{i}', ] results.append((col, res_data)) res_data = 'new_res_data' literal_key = key.literal_value - func_lines += [f' {res_data} = pandas.Series(value, index=res_index, name="{literal_key}")'] + func_lines += [f' {res_data} = value'] results.append((literal_key, res_data)) data = ', '.join(f'"{col}": {data}' for col, data in results) @@ -3208,12 +3165,12 @@ def df_replace_column_codelines(self, key): res_data = f'res_data_{i}' func_lines += [ - f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")', + f' {res_data} = data_{i}', ] results.append((col, res_data)) data = ', '.join(f'"{col}": {data}' for col, data in results) - func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index)'] + func_lines += [f' return pandas.DataFrame({{{data}}}, index=self._index)'] return func_lines @@ -3221,19 +3178,19 @@ def df_replace_column_codelines(self, key): def df_add_column_codegen(self, key): """ Example of generated implementation: - def _df_add_column_impl(self, key, value): - length = len(self._data[0]) - if length == 0: - raise SDCLimitation("Could not set item for empty DataFrame") - elif length != len(value): - raise ValueError("Length of values 
does not match length of index") - res_index = numpy.arange(length) - data_0 = self._data[0] - res_data_0 = pandas.Series(data_0, index=res_index, name="A") - data_1 = self._data[1] - res_data_1 = pandas.Series(data_1, index=res_index, name="C") - new_res_data = pandas.Series(value, index=res_index, name="B") - return pandas.DataFrame({"A": res_data_0, "C": res_data_1, "B": new_res_data}, index=res_index) + def _df_add_column_impl(self, key, value): + length = len(self._index) + if length == 0: + raise SDCLimitation("Could not set item for DataFrame with empty columns") + elif length != len(value): + raise ValueError("Length of values does not match length of index") + res_index = self._index + data_0 = self._data[0][0] + res_data_0 = data_0 + data_1 = self._data[1][0] + res_data_1 = data_1 + new_res_data = value + return pandas.DataFrame({"A": res_data_0, "C": res_data_1, "B": new_res_data}, index=res_index) """ func_lines = [f'def _df_add_column_impl(self, key, value):'] func_lines += df_add_column_codelines(self, key) @@ -3249,17 +3206,17 @@ def df_replace_column_codegen(self, key): """ Example of generated implementation: def _df_replace_column_impl(self, key, value): - length = len(self._data[0]) + length = len(self._index) if length == 0: raise SDCLimitation("Could not set item for DataFrame with empty columns") elif length != len(value): raise ValueError("Length of values does not match length of index") - res_index = numpy.arange(length) + res_index = self._index data_0 = value - res_data_0 = pandas.Series(data_0, index=res_index, name="A") - data_1 = self._data[1] - res_data_1 = pandas.Series(data_1, index=res_index, name="C") - return pandas.DataFrame({"A": res_data_0, "C": res_data_1}, index=res_index) + res_data_0 = data_0 + data_1 = self._data[1][0] + res_data_1 = data_1 + return pandas.DataFrame({"A": res_data_0, "C": res_data_1}, index=self._index) """ func_lines = [f'def _df_replace_column_impl(self, key, value):'] func_lines += 
df_replace_column_codelines(self, key) diff --git a/sdc/datatypes/hpat_pandas_groupby_functions.py b/sdc/datatypes/hpat_pandas_groupby_functions.py index 83f752e9d..aa83fcc0e 100644 --- a/sdc/datatypes/hpat_pandas_groupby_functions.py +++ b/sdc/datatypes/hpat_pandas_groupby_functions.py @@ -41,12 +41,13 @@ from numba.core.typing import signature from numba import literally -from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray, _sdc_take +from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray from sdc.datatypes.hpat_pandas_groupby_types import DataFrameGroupByType, SeriesGroupByType from sdc.utilities.sdc_typing_utils import TypeChecker, kwsparams2list, sigparams2list from sdc.utilities.utils import (sdc_overload, sdc_overload_method, sdc_register_jitable) from sdc.hiframes.pd_series_type import SeriesType from sdc.str_ext import string_type +from sdc.functions.numpy_like import take as nplike_take performance_limitation = "This function may reveal slower performance than Pandas* on user system.\ @@ -218,7 +219,7 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc, f' column_data_{i} = {df}._data[{type_id}][{col_id}]', f' for j in numpy.arange(res_index_len):', f' idx = argsorted_index[j] if {groupby_param_sort} else j', - f' group_arr_{i} = _sdc_take(column_data_{i}, list({groupby_dict}[group_keys[idx]]))', + f' group_arr_{i} = sdc_take(column_data_{i}, list({groupby_dict}[group_keys[idx]]))', f' group_series_{i} = pandas.Series(group_arr_{i})', f' result_data_{i}[j] = group_series_{i}.{func_name}({extra_impl_params})', ] @@ -226,7 +227,7 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc, data = ', '.join(f'\'{column_names[i]}\': result_data_{i}' for i in range(len(columns))) func_lines.extend(['\n'.join([ f' if {groupby_param_sort}:', - f' res_index = _sdc_take(group_keys, argsorted_index)', + f' res_index = sdc_take(group_keys, argsorted_index)', f' else:', f' 
res_index = group_keys', f' return pandas.DataFrame({{{data}}}, index=res_index)' @@ -236,7 +237,7 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc, global_vars = {'pandas': pandas, 'numpy': numpy, '_sdc_asarray': _sdc_asarray, - '_sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'sdc_arrays_argsort': sdc_arrays_argsort} return func_text, global_vars @@ -262,11 +263,11 @@ def _sdc_pandas_series_groupby_generic_func_codegen(func_name, func_params, defa f' result_data = numpy.empty(res_index_len, dtype=res_dtype)', f' for j in numpy.arange(res_index_len):', f' idx = argsorted_index[j] if {groupby_param_sort} else j', - f' group_arr = _sdc_take({series}._data, list({groupby_dict}[group_keys[idx]]))', + f' group_arr = sdc_take({series}._data, list({groupby_dict}[group_keys[idx]]))', f' group_series = pandas.Series(group_arr)', f' result_data[j] = group_series.{func_name}({extra_impl_params})', f' if {groupby_param_sort}:', - f' res_index = _sdc_take(group_keys, argsorted_index)', + f' res_index = sdc_take(group_keys, argsorted_index)', f' else:', f' res_index = group_keys', f' return pandas.Series(data=result_data, index=res_index, name={series}._name)' @@ -276,7 +277,7 @@ def _sdc_pandas_series_groupby_generic_func_codegen(func_name, func_params, defa global_vars = {'pandas': pandas, 'numpy': numpy, '_sdc_asarray': _sdc_asarray, - '_sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'sdc_arrays_argsort': sdc_arrays_argsort} return func_text, global_vars diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 610a21fc7..0738fe9c3 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -51,9 +51,12 @@ import sdc.datatypes.common_functions as common_functions from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, find_common_dtype_from_numpy_dtypes, has_literal_value, - 
has_python_value) -from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.common_functions import (sdc_join_series_indexes, sdc_arrays_argsort, sdc_reindex_series) + has_python_value, + sdc_old_index_types, + find_index_common_dtype, + ) +from sdc.datatypes.indexes import * +from sdc.datatypes.common_functions import (sdc_arrays_argsort, sdc_reindex_series, _sdc_internal_join) from sdc.datatypes.hpat_pandas_rolling_types import ( gen_sdc_pandas_rolling_overload_body, sdc_pandas_rolling_docstring_tmpl) from sdc.datatypes.hpat_pandas_series_rolling_types import _hpat_pandas_series_rolling_init @@ -71,6 +74,7 @@ from sdc.hiframes.api import isna from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby from sdc.utilities.prange_utils import parallel_chunks +from sdc.extensions.indexes.indexes_generic import sdc_indexes_join_outer, sdc_fix_indexes_join from .pandas_series_functions import apply from .pandas_series_functions import map as _map @@ -145,9 +149,8 @@ def hpat_pandas_series_iat_impl(self, idx): # Note: Loc slice without start is not supported min_int64 = numpy.iinfo('int64').min max_int64 = numpy.iinfo('int64').max - index_is_none = (self.series.index is None or - isinstance(self.series.index, numba.types.misc.NoneType)) - if isinstance(idx, types.SliceType) and not index_is_none: + index_is_positional = isinstance(self.series.index, PositionalIndexType) + if isinstance(idx, types.SliceType) and not index_is_positional: def hpat_pandas_series_loc_slice_impl(self, idx): series = self._series index = series.index @@ -201,7 +204,7 @@ def hpat_pandas_series_loc_slice_impl(self, idx): return hpat_pandas_series_loc_slice_impl - if isinstance(idx, types.SliceType) and index_is_none: + if isinstance(idx, types.SliceType) and index_is_positional: def hpat_pandas_series_loc_slice_noidx_impl(self, idx): max_slice = sys.maxsize start = idx.start @@ -372,16 +375,16 @@ def hpat_pandas_series_getitem(self, idx): return None # Note: 
Getitem return Series - index_is_none = isinstance(self.index, numba.types.misc.NoneType) - index_is_none_or_numeric = index_is_none or (self.index and isinstance(self.index.dtype, types.Number)) - index_is_string = not index_is_none and isinstance(self.index.dtype, (types.UnicodeType, types.StringLiteral)) + index_is_positional = isinstance(self.index, PositionalIndexType) + index_is_numeric = isinstance(self.index.dtype, types.Number) + index_is_string = isinstance(self.index.dtype, types.UnicodeType) if ( - isinstance(idx, types.Number) and index_is_none_or_numeric or + isinstance(idx, types.Number) and index_is_numeric or (isinstance(idx, (types.UnicodeType, types.StringLiteral)) and index_is_string) ): def hpat_pandas_series_getitem_index_impl(self, idx): - index = self.index + index = self._index mask = numpy.empty(len(self._data), numpy.bool_) for i in numba.prange(len(index)): mask[i] = index[i] == idx @@ -403,7 +406,7 @@ def hpat_pandas_series_getitem_idx_slice_impl(self, idx): return hpat_pandas_series_getitem_idx_slice_impl if (isinstance(idx, (types.List, types.Array)) - and isinstance(idx.dtype, (types.Boolean, bool))): + and isinstance(idx.dtype, types.Boolean)): def hpat_pandas_series_getitem_idx_list_impl(self, idx): if len(self) != len(idx): @@ -420,11 +423,9 @@ def hpat_pandas_series_getitem_idx_list_impl(self, idx): # idx is Series and it's index is any, idx.dtype is Boolean if (isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean)): - none_indexes = isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType) - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(idx.index, types.NoneType) or check_index_is_numeric(idx))) - if not (none_or_numeric_indexes - or check_types_comparable(self.index, idx.index)): + positional_indexes = (isinstance(self.index, PositionalIndexType) + and isinstance(idx.index, PositionalIndexType)) + if not 
check_types_comparable(self.index, idx.index): msg = '{} The index of boolean indexer is not comparable to Series index.' + \ ' Given: self.index={}, idx.index={}' raise TypingError(msg.format(_func_name, self.index, idx.index)) @@ -433,7 +434,7 @@ def _series_getitem_idx_bool_indexer_impl(self, idx): # TO-DO: replace sdc_reindex_series with reindex methods and move this logic to impl # for specific index types (needs proper index type instead of types.none as index) - if none_indexes == True: # noqa + if positional_indexes == True: # noqa if len(self) > len(idx): msg = "Unalignable boolean Series provided as indexer " + \ "(index of the boolean Series and of the indexed object do not match)." @@ -453,8 +454,8 @@ def _series_getitem_idx_bool_indexer_impl(self, idx): return _series_getitem_idx_bool_indexer_impl - # idx is Series and it's index is None, idx.dtype is not Boolean - if (isinstance(idx, SeriesType) and index_is_none + # idx is Series and it's index is PositionalIndex, idx.dtype is not Boolean + if (isinstance(idx, SeriesType) and index_is_positional and not isinstance(idx.data.dtype, (types.Boolean, bool))): def hpat_pandas_series_getitem_idx_list_impl(self, idx): res = numpy.copy(self._data[:len(idx._data)]) @@ -466,8 +467,8 @@ def hpat_pandas_series_getitem_idx_list_impl(self, idx): return pandas.Series(data=res, index=index[idx._data], name=self._name) return hpat_pandas_series_getitem_idx_list_impl - # idx is Series and it's index is not None, idx.dtype is not Boolean - if (isinstance(idx, SeriesType) and not isinstance(self.index, types.NoneType) + # idx is Series and it's index is not PositionalIndex, idx.dtype is not Boolean + if (isinstance(idx, SeriesType) and not isinstance(self.index, PositionalIndexType) and not isinstance(idx.data.dtype, (types.Boolean, bool))): def hpat_pandas_series_getitem_idx_series_impl(self, idx): index = self.index @@ -600,8 +601,9 @@ def sdc_pandas_series_setitem(self, idx, value): idx_is_boolean_array = 
isinstance(idx, types.Array) and isinstance(idx.dtype, types.Boolean) idx_is_boolean_series = isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean) idx_and_self_index_comparable = check_types_comparable(self.index, idx) - self_index_is_none = isinstance(self.index, types.NoneType) - assign_along_positions = ((self_index_is_none + self_index_is_positional = isinstance(self.index, PositionalIndexType) + idx_index_is_positional = isinstance(idx, SeriesType) and isinstance(idx.index, PositionalIndexType) + assign_along_positions = ((self_index_is_positional or isinstance(idx, types.SliceType) or not idx_and_self_index_comparable) and not idx_is_boolean_series @@ -613,15 +615,16 @@ def sdc_pandas_series_setitem(self, idx, value): idx_is_numeric_or_boolean_series = (isinstance(idx, SeriesType) and isinstance(idx.dtype, (types.Number, types.Boolean))) assign_via_idx_mask = idx_is_scalar and idx_and_self_index_comparable - assign_via_idx_data = idx_is_numeric_or_boolean_series and not idx_and_self_index_comparable + assign_via_idx_values = (self_index_is_positional and idx_index_is_positional + or idx_is_numeric_or_boolean_series and not idx_and_self_index_comparable) def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value): if assign_via_idx_mask == True: # noqa - # FIXME_Numba#5157: using asarray since eq impl for RangeIndexType returns list + # FIXME_Numba#5157: using asarray since eq impl for index types returns list _idx = numpy.asarray(self._index == idx) - elif assign_via_idx_data == True: # noqa - _idx = idx._data + elif assign_via_idx_values == True: # noqa + _idx = idx.values else: _idx = idx @@ -633,15 +636,11 @@ def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value): if (idx_is_boolean_array or idx_is_boolean_series) and value_is_series: - self_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype - value_index_dtype = types.int64 if isinstance(value.index, types.NoneType) else 
value.index.dtype - if (isinstance(self_index_dtype, types.Number) and isinstance(value_index_dtype, types.Number)): - indexes_common_dtype = find_common_dtype_from_numpy_dtypes([self_index_dtype, value_index_dtype], []) - elif (isinstance(self_index_dtype, types.UnicodeType) and isinstance(value_index_dtype, types.UnicodeType)): - indexes_common_dtype = types.unicode_type - else: + if not check_types_comparable(self.index, value.index): msg = '{} The self and value indexes must be comparable. Given: self.dtype={}, value.dtype={}' - raise TypingError(msg.format(_func_name, self_index_dtype, value_index_dtype)) + raise TypingError(msg.format(_func_name, self.index, value.index)) + + _, indexes_common_dtype = find_index_common_dtype(self.index, value.index) if idx_is_boolean_array: @@ -810,10 +809,9 @@ def sdc_pandas_series_setitem_idx_str_series_align_impl(self, idx, value): if number_of_found != idx_data_size: raise ValueError("Reindexing not possible: idx has index not found in Series") - if value_is_series == True: # noqa - self._data[set_positions] = value._data - else: - self._data[set_positions] = value + set_values = value if value_is_series == False else value._data # noqa + self._data[set_positions] = set_values + return self return sdc_pandas_series_setitem_idx_str_series_align_impl @@ -1652,16 +1650,10 @@ def hpat_pandas_series_index(self): ty_checker = TypeChecker(_func_name) ty_checker.check(self, SeriesType) - if isinstance(self.index, types.NoneType): - def hpat_pandas_series_index_none_impl(self): - return pandas.RangeIndex(len(self._data)) - - return hpat_pandas_series_index_none_impl - else: - def hpat_pandas_series_index_impl(self): - return self._index + def hpat_pandas_series_index_impl(self): + return self._index - return hpat_pandas_series_index_impl + return hpat_pandas_series_index_impl hpat_pandas_series_rolling = sdc_overload_method(SeriesType, 'rolling')( @@ -1905,21 +1897,24 @@ def hpat_pandas_series_astype_no_modify_impl(self, dtype, 
copy=True, errors='rai (isinstance(dtype, types.StringLiteral) and dtype.literal_value == 'str')) # Needs Numba astype impl support converting unicode_type to NumberClass and other types - if (isinstance(self.data, StringArrayType) and not str_check): - if isinstance(dtype, types.functions.NumberClass) and errors == 'raise': - raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype}') - if isinstance(dtype, types.StringLiteral) and errors == 'raise': - try: - literal_value = numpy.dtype(dtype.literal_value) - except: - pass # Will raise the exception later - else: - raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype.literal_value}') + if isinstance(self.data, StringArrayType): + if not str_check: + if isinstance(dtype, types.functions.NumberClass) and errors == 'raise': + raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype}') + if isinstance(dtype, types.StringLiteral) and errors == 'raise': + try: + literal_value = numpy.dtype(dtype.literal_value) + except: + pass # Will raise the exception later + else: + raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype.literal_value}') + else: + return hpat_pandas_series_astype_no_modify_impl data_narr = isinstance(self.data, types.npytypes.Array) dtype_num_liter = isinstance(dtype, (types.functions.NumberClass, types.StringLiteral)) - if data_narr and dtype_num_liter or str_check: + if data_narr and dtype_num_liter: return hpat_pandas_series_astype_numba_impl if errors == 'raise': @@ -2141,6 +2136,7 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr ty_checker = TypeChecker(_func_name) ty_checker.check(self, SeriesType) + other_is_series = isinstance(to_append, SeriesType) if not (isinstance(to_append, SeriesType) or (isinstance(to_append, (types.UniTuple, types.List)) and isinstance(to_append.dtype, SeriesType))): ty_checker.raise_exc(to_append, 'series or 
list/tuple of series', 'to_append') @@ -2160,17 +2156,21 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr or has_python_value(ignore_index, False) or isinstance(ignore_index, types.Omitted)) to_append_is_series = isinstance(to_append, SeriesType) + index_api_supported = not isinstance(self.index, sdc_old_index_types) if ignore_index_is_false: def hpat_pandas_series_append_impl(self, to_append, ignore_index=False, verify_integrity=False): if to_append_is_series == True: # noqa new_data = common_functions.hpat_arrays_append(self._data, to_append._data) - new_index = common_functions.hpat_arrays_append(self.index, to_append.index) + _self_index = self._index.values if index_api_supported == True else self._index + new_index = common_functions.hpat_arrays_append(_self_index, to_append._index) else: data_arrays_to_append = [series._data for series in to_append] - index_arrays_to_append = [series.index for series in to_append] + index_arrays_to_append = [series._index for series in to_append] + new_data = common_functions.hpat_arrays_append(self._data, data_arrays_to_append) - new_index = common_functions.hpat_arrays_append(self.index, index_arrays_to_append) + _self_index = self._index.values if index_api_supported == True else self._index + new_index = common_functions.hpat_arrays_append(_self_index, index_arrays_to_append) return pandas.Series(new_data, new_index) @@ -2228,22 +2228,17 @@ def hpat_pandas_series_copy(self, deep=True): if not isinstance(deep, (types.Omitted, types.Boolean)) and not deep: ty_checker.raise_exc(deep, 'boolean', 'deep') - if isinstance(self.index, types.NoneType): - def hpat_pandas_series_copy_impl(self, deep=True): - if deep: - return pandas.Series(data=numpy_like.copy(self._data), name=self._name) - else: - return pandas.Series(data=self._data, name=self._name) - return hpat_pandas_series_copy_impl - else: - def hpat_pandas_series_copy_impl(self, deep=True): - if deep: - return 
pandas.Series(data=numpy_like.copy(self._data), index=numpy_like.copy(self._index), - name=self._name) - else: - # Shallow copy of index is not supported yet - return pandas.Series(data=self._data, index=numpy_like.copy(self._index), name=self._name) - return hpat_pandas_series_copy_impl + index_api_supported = not isinstance(self.index, sdc_old_index_types) + def hpat_pandas_series_copy_impl(self, deep=True): + new_series_data = numpy_like.copy(self._data) if deep else self._data + + if index_api_supported == False: # noqa + new_series_index = self._index.copy() if deep else self._index + else: + new_series_index = self._index.copy(deep=deep) + return pandas.Series(new_series_data, new_series_index, name=self._name) + + return hpat_pandas_series_copy_impl @sdc_overload_method(SeriesType, 'corr') @@ -2342,16 +2337,10 @@ def hpat_pandas_series_head(self, n=5): if not isinstance(n, (types.Integer, types.Omitted, types.NoneType)) and n != 5: ty_checker.raise_exc(n, 'int', 'n') - if isinstance(self.index, types.NoneType): - def hpat_pandas_series_head_impl(self, n=5): - return pandas.Series(data=self._data[:n], name=self._name) - - return hpat_pandas_series_head_impl - else: - def hpat_pandas_series_head_index_impl(self, n=5): - return pandas.Series(data=self._data[:n], index=self._index[:n], name=self._name) + def hpat_pandas_series_head_index_impl(self, n=5): + return pandas.Series(data=self._data[:n], index=self._index[:n], name=self._name) - return hpat_pandas_series_head_index_impl + return hpat_pandas_series_head_index_impl @sdc_overload_method(SeriesType, 'isnull') @@ -2703,14 +2692,6 @@ def hpat_pandas_series_take(self, indices, axis=0, is_copy=False): if not isinstance(indices, (types.List, types.Array)): ty_checker.raise_exc(indices, 'array-like', 'indices') - if isinstance(self.index, types.NoneType) or self.index is None: - def hpat_pandas_series_take_noindex_impl(self, indices, axis=0, is_copy=False): - local_data = [self._data[i] for i in indices] - - 
return pandas.Series(local_data, indices) - - return hpat_pandas_series_take_noindex_impl - def hpat_pandas_series_take_impl(self, indices, axis=0, is_copy=False): local_data = [self._data[i] for i in indices] local_index = [self._index[i] for i in indices] @@ -2777,7 +2758,7 @@ def hpat_pandas_series_idxmax(self, axis=None, skipna=None): if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'None', 'axis') - none_index = isinstance(self.index, types.NoneType) or self.index is None + positional_index = isinstance(self.index, PositionalIndexType) if isinstance(self.data, StringArrayType): def hpat_pandas_series_idxmax_str_impl(self, axis=None, skipna=None): if skipna is None: @@ -2786,7 +2767,7 @@ def hpat_pandas_series_idxmax_str_impl(self, axis=None, skipna=None): raise ValueError("Method idxmax(). Unsupported parameter 'skipna'=False with str data") result = numpy.argmax(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -2805,7 +2786,7 @@ def hpat_pandas_series_idxmax_impl(self, axis=None, skipna=None): else: result = numpy_like.argmax(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -2994,7 +2975,7 @@ def hpat_pandas_series_rename(self, index=None, copy=True, inplace=False, level= types.StringLiteral, types.Integer)) and level is not None: ty_checker.raise_exc(level, 'Integer or string', 'level') - def hpat_pandas_series_rename_idx_impl(self, index=None, copy=True, inplace=False, level=None): + def hpat_pandas_series_rename_impl(self, index=None, copy=True, inplace=False, level=None): if copy is True: series_data = self._data.copy() series_index = self._index.copy() @@ -3004,17 +2985,7 @@ def hpat_pandas_series_rename_idx_impl(self, index=None, copy=True, inplace=Fals return pandas.Series(data=series_data, index=series_index, name=index) - def 
hpat_pandas_series_rename_noidx_impl(self, index=None, copy=True, inplace=False, level=None): - if copy is True: - series_data = self._data.copy() - else: - series_data = self._data - - return pandas.Series(data=series_data, index=self._index, name=index) - - if isinstance(self.index, types.NoneType): - return hpat_pandas_series_rename_noidx_impl - return hpat_pandas_series_rename_idx_impl + return hpat_pandas_series_rename_impl @sdc_overload_method(SeriesType, 'min') @@ -3314,7 +3285,7 @@ def hpat_pandas_series_idxmin(self, axis=None, skipna=None): if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'None', 'axis') - none_index = isinstance(self.index, types.NoneType) or self.index is None + positional_index = isinstance(self.index, PositionalIndexType) if isinstance(self.data, StringArrayType): def hpat_pandas_series_idxmin_str_impl(self, axis=None, skipna=None): if skipna is None: @@ -3323,7 +3294,7 @@ def hpat_pandas_series_idxmin_str_impl(self, axis=None, skipna=None): raise ValueError("Method idxmin(). Unsupported parameter 'skipna'=False with str data") result = numpy.argmin(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -3342,7 +3313,7 @@ def hpat_pandas_series_idxmin_impl(self, axis=None, skipna=None): else: result = numpy_like.argmin(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -3805,7 +3776,7 @@ def hpat_pandas_series_argsort(self, axis=0, kind='quicksort', order=None): and order is not None: ty_checker.raise_exc(order, 'None', 'order') - if not isinstance(self.index, types.NoneType): + if not isinstance(self.index, PositionalIndexType): def hpat_pandas_series_argsort_idx_impl(self, axis=0, kind='quicksort', order=None): if kind != 'quicksort' and kind != 'mergesort': raise ValueError("Method argsort(). Unsupported parameter. 
Given 'kind' != 'quicksort' or 'mergesort'") @@ -4033,20 +4004,19 @@ def hpat_pandas_series_dropna(self, axis=0, inplace=False): if not (inplace is False or isinstance(inplace, types.Omitted)): ty_checker.raise_exc(inplace, 'bool', 'inplace') - if (isinstance(self.data.dtype, types.Number) - and isinstance(self.index, (types.Number, types.NoneType, RangeIndexType))): + # if both data and index are numeric (i.e. types.Array) dispatch to numpy_like.dropna impl + if (isinstance(self.dtype, types.Number) and isinstance(self.index.dtype, types.Number)): def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False): - index = self.index - return numpy_like.dropna(self._data, index, self._name) + return numpy_like.dropna(self._data, self._index, self._name) return hpat_pandas_series_dropna_impl else: def hpat_pandas_series_dropna_str_impl(self, axis=0, inplace=False): - # generate Series index if needed by using SeriesType.index (i.e. not self._index) + # TO-DO: verify these operations are fused na_data_arr = sdc.hiframes.api.get_nan_mask(self._data) data = self._data[~na_data_arr] - index = self.index[~na_data_arr] + index = self._index[~na_data_arr] return pandas.Series(data, index, self._name) return hpat_pandas_series_dropna_str_impl @@ -4528,9 +4498,7 @@ def sdc_pandas_str_series_operator_add(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -4553,7 +4521,7 @@ def _series_operator_add_scalar_impl(self, other): else: # both operands are string series # TO-DO: None indexes branch is dead code, remove? - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): + if (isinstance(self.index, PositionalIndexType) and isinstance(other.index, PositionalIndexType)): def _series_operator_add_none_indexes_impl(self, other): if (len(self._data) == len(other._data)): @@ -4582,32 +4550,18 @@ def _series_operator_add_none_indexes_impl(self, other): return _series_operator_add_none_indexes_impl else: - left_index_is_range = isinstance(self.index, RangeIndexType) - numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( - [self.index.dtype, other.index.dtype], []) - common_dtype_different = (numba_index_common_dtype != self.index.dtype - or numba_index_common_dtype != other.index.dtype) - - def _series_operator_add_common_impl(self, other): - left_index, right_index = self.index, other.index - - # TO-DO: coversion of RangeIndexType to np.array may happen several times here: - # in array_equal, in astype or left_index.values - need caching of array allocated once + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - # check if indexes are equal and series don't have to be aligned - if (left_index is right_index or numpy_like.array_equal(left_index, right_index)): - result_data = self._data + other._data - - if common_dtype_different == True: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa - - return pandas.Series(result_data, index=result_index) - - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = 
sdc_join_series_indexes(left_index, right_index) + def _series_operator_add_str_impl(self, other): + left_index, right_index = self._index, other._index + if index_api_supported == True: + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_nan_mask = numpy.zeros(result_size, dtype=numpy.bool_) result_data_as_list = [] @@ -4626,7 +4580,7 @@ def _series_operator_add_common_impl(self, other): return pandas.Series(result_data, joined_index) - return _series_operator_add_common_impl + return _series_operator_add_str_impl return None @@ -4659,9 +4613,7 @@ def sdc_pandas_str_series_operator_mul(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -4688,7 +4640,7 @@ def _series_operator_mul_scalar_impl(self, other): self_is_series = isinstance(self, SeriesType) # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): + if (isinstance(self.index, PositionalIndexType) and isinstance(other.index, PositionalIndexType)): def _series_operator_mul_none_indexes_impl(self, other): series_operand = self if self_is_series == True else other # noqa @@ -4716,32 +4668,18 @@ def _series_operator_mul_none_indexes_impl(self, other): return _series_operator_mul_none_indexes_impl else: - left_index_is_range = isinstance(self.index, RangeIndexType) - numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( - [self.index.dtype, other.index.dtype], []) - common_dtype_different = (numba_index_common_dtype != self.index.dtype - or numba_index_common_dtype != other.index.dtype) + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) def _series_operator_mul_common_impl(self, other): - left_index, right_index = self.index, other.index - - # TO-DO: coversion of RangeIndexType to np.array may happen several times here: - # in array_equal, in astype or left_index.values - need caching of array allocated once - - # check if indexes are equal and series don't have to be aligned - if (left_index is right_index or numpy_like.array_equal(left_index, right_index)): - result_data = self._data * other._data - - if common_dtype_different == True: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa - - return pandas.Series(result_data, index=result_index) - - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - 
joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) + left_index, right_index = self._index, other._index + if index_api_supported == True: + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) str_series_operand = self if self_is_string_series == True else other # noqa str_series_indexer = left_indexer if self_is_string_series == True else right_indexer # noqa result_size = len(joined_index) diff --git a/sdc/datatypes/hpat_pandas_stringmethods_functions.py b/sdc/datatypes/hpat_pandas_stringmethods_functions.py index cd8067a0d..04616c275 100644 --- a/sdc/datatypes/hpat_pandas_stringmethods_functions.py +++ b/sdc/datatypes/hpat_pandas_stringmethods_functions.py @@ -89,7 +89,7 @@ def hpat_pandas_stringmethods_upper_impl(self): from sdc.utilities.utils import sdc_overload_method, sdc_register_jitable from sdc.hiframes.api import get_nan_mask from sdc.str_arr_ext import str_arr_set_na_by_mask, create_str_arr_from_list -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation @sdc_overload_method(StringMethodsType, 'center') diff --git a/sdc/datatypes/indexes/__init__.py b/sdc/datatypes/indexes/__init__.py new file mode 100644 index 000000000..52d144708 --- /dev/null +++ b/sdc/datatypes/indexes/__init__.py @@ -0,0 +1,32 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# modules are dependent on each other e.g. 
positional_index_type +# needs range_index_type to be imported, so below order matters +from .range_index_type import RangeIndexType +from .positional_index_type import PositionalIndexType +from .empty_index_type import EmptyIndexType +from .int64_index_type import Int64IndexType diff --git a/sdc/datatypes/indexes/empty_index_type.py b/sdc/datatypes/indexes/empty_index_type.py new file mode 100644 index 000000000..68cebeb32 --- /dev/null +++ b/sdc/datatypes/indexes/empty_index_type.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper +) + + +class EmptyIndexType(types.Type): + + # this index represents special case of pd.Index([]) with dtype='object' + # for overload typing functions assume it has following dtype + dtype = types.pyobject + + def __init__(self, is_named=False): + self.is_named = is_named + super(EmptyIndexType, self).__init__( + name='EmptyIndexType({})'.format(is_named)) + + +@register_model(EmptyIndexType) +class EmptyIndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + name_type = types.unicode_type if fe_type.is_named else types.none + members = [ + ('name', name_type), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(EmptyIndexType, 'name', '_name') diff --git a/sdc/datatypes/indexes/int64_index_type.py b/sdc/datatypes/indexes/int64_index_type.py new file mode 100644 index 000000000..745d394a7 --- /dev/null +++ b/sdc/datatypes/indexes/int64_index_type.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper +) + + +class Int64IndexType(types.IterableType): + dtype = types.int64 + + def __init__(self, data, is_named=False): + self.data = data + self.is_named = is_named + super(Int64IndexType, self).__init__( + name='Int64IndexType({}, {})'.format(data, is_named)) + + @property + def iterator_type(self): + res = self.data.iterator_type + return res + + +@register_model(Int64IndexType) +class Int64IndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + data_type = fe_type.data + name_type = types.unicode_type if fe_type.is_named else types.none + members = [ + ('data', data_type), + ('name', name_type), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(Int64IndexType, 'data', '_data') +make_attribute_wrapper(Int64IndexType, 'name', '_name') diff --git a/sdc/datatypes/indexes/positional_index_type.py b/sdc/datatypes/indexes/positional_index_type.py new file mode 100644 index 000000000..3896be5f9 --- /dev/null +++ 
b/sdc/datatypes/indexes/positional_index_type.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper +) + +from sdc.datatypes.indexes import RangeIndexType + +class PositionalIndexType(types.IterableType): + dtype = types.int64 + + def __init__(self, is_named=False): + self.data = RangeIndexType(is_named) + self.is_named = is_named + super(PositionalIndexType, self).__init__( + name='PositionalIndexType({})'.format(is_named)) + + @property + def iterator_type(self): + res = self.data.iterator_type + return res + + +@register_model(PositionalIndexType) +class PositionalIndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + members = [ + ('data', fe_type.data), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +make_attribute_wrapper(PositionalIndexType, 'data', '_data') diff --git a/sdc/datatypes/range_index_type.py b/sdc/datatypes/indexes/range_index_type.py similarity index 100% rename from sdc/datatypes/range_index_type.py rename to sdc/datatypes/indexes/range_index_type.py diff --git a/sdc/extensions/indexes/empty_index_ext.py b/sdc/extensions/indexes/empty_index_ext.py new file mode 100644 index 000000000..c67ec3c6d --- /dev/null +++ b/sdc/extensions/indexes/empty_index_ext.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# *****************************************************************************

import numba
import numpy as np
import pandas as pd

from numba import types
from numba.core import cgutils
from numba.extending import (NativeValue, intrinsic, box, unbox, )
from numba.core.typing.templates import signature

from sdc.datatypes.indexes import EmptyIndexType
from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types
from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method
from sdc.utilities.sdc_typing_utils import TypeChecker


@intrinsic
def init_empty_index(typingctx, name=None):
    """ Intrinsic building a native EmptyIndexType struct, optionally storing a name.

    The resulting type is EmptyIndexType(is_named), where is_named is True
    whenever a non-none `name` type was given.
    """
    name = types.none if name is None else name
    is_named = False if name is types.none else True

    def codegen(context, builder, sig, args):
        name_val, = args
        # create index struct and store values
        index_struct = cgutils.create_struct_proxy(
            sig.return_type)(context, builder)

        if is_named:
            if isinstance(name, types.StringLiteral):
                # literal names are materialized as unicode constants
                index_struct.name = numba.cpython.unicode.make_string_from_constant(
                    context, builder, types.unicode_type, name.literal_value)
            else:
                index_struct.name = name_val

        if context.enable_nrt and is_named:
            # `name` is the only argument of this intrinsic, hence sig.args[0]
            # (sig.args[1] does not exist and raised IndexError at lowering time)
            context.nrt.incref(builder, sig.args[0], name_val)

        return index_struct._getvalue()

    ret_typ = EmptyIndexType(is_named)
    sig = signature(ret_typ, name)
    return sig, codegen


@box(EmptyIndexType)
def box_empty_index(typ, val, c):
    """ Boxes native EmptyIndexType value into pd.Index built from an empty list. """

    mod_name = c.context.insert_const_string(c.builder.module, "pandas")
    pd_class_obj = c.pyapi.import_module_noblock(mod_name)

    empty_index = cgutils.create_struct_proxy(
        typ)(c.context, c.builder, val)

    data = c.pyapi.list_new(c.context.get_constant(types.int64, 0))

    # pd.Index positional parameters are (data, dtype, copy, name), so dtype and
    # copy are passed with their default values to make name land in its own slot
    dtype = c.pyapi.make_none()
    copy = c.pyapi.bool_from_bool(
        c.context.get_constant(types.bool_, False)
    )

    if typ.is_named:
        name = c.pyapi.from_native_value(types.unicode_type, empty_index.name)
    else:
        name = c.pyapi.make_none()

    res = c.pyapi.call_method(pd_class_obj, "Index", (data, dtype, copy, name))

    c.pyapi.decref(data)
    c.pyapi.decref(dtype)
    c.pyapi.decref(copy)
    c.pyapi.decref(name)
    c.pyapi.decref(pd_class_obj)
    return res


@unbox(EmptyIndexType)
def unbox_empty_index(typ, val, c):
    """ Unboxes a Python index object into native EmptyIndexType (only name is read). """

    index_struct = cgutils.create_struct_proxy(typ)(c.context, c.builder)

    if typ.is_named:
        name_obj = c.pyapi.object_getattr_string(val, "name")
        index_struct.name = numba.cpython.unicode.unbox_unicode_str(
            types.unicode_type, name_obj, c).value
        c.pyapi.decref(name_obj)

    is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred())
    return NativeValue(index_struct._getvalue(), is_error=is_error)


@sdc_overload_method(EmptyIndexType, 'take')
def pd_empty_index_take_overload(self, indexes):
    """ Overload of take() for EmptyIndexType: validates `indexes` type and
    returns a new empty index built with self._name. """
    if not isinstance(self, EmptyIndexType):
        return None

    _func_name = 'Method take().'
    ty_checker = TypeChecker(_func_name)

    valid_indexes_types = (types.Array, types.List) + sdc_pandas_index_types
    if not (isinstance(indexes, valid_indexes_types) and isinstance(indexes.dtype, types.Integer)):
        ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes')

    def pd_empty_index_take_impl(self, indexes):
        return init_empty_index(name=self._name)

    return pd_empty_index_take_impl


@sdc_overload(len)
def pd_empty_index_len_overload(self):
    """ Overload of len() for EmptyIndexType: always 0. """
    if not isinstance(self, EmptyIndexType):
        return None

    def pd_empty_index_len_impl(self):
        return 0

    return pd_empty_index_len_impl


# NOTE: patch boundary and the opening of the next file's license header shared
# the original physical lines with the code above; they are retained as comments.
# diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py
# new file mode 100644
# index 000000000..23a2b66ed
# --- /dev/null
# +++ b/sdc/extensions/indexes/indexes_generic.py
# @@ -0,0 +1,282 @@
# -*- coding: utf-8 -*-
# *****************************************************************************
# Copyright (c) 2019-2020, Intel Corporation All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# *****************************************************************************

import numba
import numpy as np
import pandas as pd

from numba import types
from numba.typed import Dict
from numba.typed.typedobjectutils import _nonoptional

from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types, sdc_old_index_types
from sdc.datatypes.indexes import *
from sdc.utilities.utils import sdc_overload_method, sdc_overload
from sdc.utilities.sdc_typing_utils import (
    find_index_common_dtype,
    sdc_indexes_wo_values_cache,
    )
from sdc.hiframes.api import fix_df_index
from sdc.functions import numpy_like
from sdc.datatypes.common_functions import _sdc_internal_join


def sdc_numeric_indexes_equals(left, right):
    """ Stub, implemented via sdc_numeric_indexes_equals_ovld """
    pass


@sdc_overload(sdc_numeric_indexes_equals)
def sdc_numeric_indexes_equals_ovld(left, right):
    """ Compares two numeric indexes (or arrays) element-wise via numpy_like.array_equal """

    if not (isinstance(left, sdc_pandas_index_types)
            or isinstance(right, sdc_pandas_index_types)):
        return None

    # anything that is not a plain array is converted through its .values
    convert_A = not isinstance(left, types.Array)
    convert_B = not isinstance(right, types.Array)

    def sdc_numeric_indexes_equals_impl(left, right):
        left = left.values if convert_A == True else left  # noqa
        right = right.values if convert_B == True else right  # noqa

        return numpy_like.array_equal(left, right)

    return sdc_numeric_indexes_equals_impl


def sdc_indexes_attribute_dtype(self):
    """ Stub, implemented via sdc_indexes_attribute_dtype_ovld """
    pass


@sdc_overload(sdc_indexes_attribute_dtype)
def sdc_indexes_attribute_dtype_ovld(self):
    """ Returns dtype of the index data, resolved at typing time from self.data.dtype """

    if not isinstance(self, sdc_pandas_index_types):
        return None

    index_dtype = self.data.dtype

    def sdc_indexes_attribute_dtype_impl(self):
        return index_dtype

    return sdc_indexes_attribute_dtype_impl


def sdc_indexes_operator_eq(self, other):
    """ Stub, implemented via sdc_indexes_operator_eq_ovld (takes both operands) """
    pass


@sdc_overload(sdc_indexes_operator_eq)
def sdc_indexes_operator_eq_ovld(self, other):
    """ Element-wise equality of index/array/scalar operands, returned as a list.

    Raises ValueError at run time if neither operand is a scalar and lengths differ.
    """

    # TO-DO: this is for numeric indexes only now, extend to string-index when it's added
    use_self_values = isinstance(self, sdc_pandas_index_types) and not isinstance(self, types.Array)
    use_other_values = isinstance(other, sdc_pandas_index_types) and not isinstance(other, types.Array)
    one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number)

    def sdc_indexes_operator_eq_impl(self, other):

        if one_operand_is_scalar == False:  # noqa
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")

        left = self.values if use_self_values == True else self  # noqa
        right = other.values if use_other_values == True else other  # noqa
        return list(left == right)  # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed

    return sdc_indexes_operator_eq_impl


def sdc_indexes_reindex(self, target):
    """ Stub, implemented via pd_indexes_reindex_overload """
    pass


@sdc_overload(sdc_indexes_reindex)
def pd_indexes_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None):
    """ Returns (target, indexer) where indexer[i] is the position of target[i] in self or -1.

    If self is target or equals it, the indexer is None. Raises ValueError at
    run time when self contains duplicate values.
    """

    index_dtype = self.dtype

    def pd_indexes_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None):
        """ Simplified version of pandas.core.index.base.reindex """

        if (self is target or self.equals(target)):
            return target, None

        # build a dict of 'self' index values to their positions;
        # positions are 64-bit to match the int64 indexer filled below
        map_index_to_position = Dict.empty(
            key_type=index_dtype,
            value_type=types.int64
        )

        # TO-DO: needs concurrent hash map
        for i, value in enumerate(self):
            if value in map_index_to_position:
                raise ValueError("cannot reindex from a duplicate axis")
            else:
                map_index_to_position[value] = i

        res_size = len(target)
        indexer = np.empty(res_size, dtype=np.int64)
        for i in numba.prange(res_size):
            val = target[i]
            if val in map_index_to_position:
                indexer[i] = map_index_to_position[val]
            else:
                indexer[i] = -1

        return target, indexer

    return pd_indexes_index_reindex_impl


def sdc_indexes_join_outer(left, right):
    """ Stub, implemented via pd_indexes_join_overload """
    pass


@sdc_overload(sdc_indexes_join_outer, jit_options={'parallel': False})
def pd_indexes_join_overload(left, right):
    """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""

    # check that both operands are of types used for representing Pandas indexes
    if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types)
            and not isinstance(left, EmptyIndexType)
            and not isinstance(right, EmptyIndexType)):
        return None

    # for index types with dtype=int64 resulting index should be of Int64Index type
    if (isinstance(left, (PositionalIndexType, RangeIndexType, Int64IndexType))
            and isinstance(right, (PositionalIndexType, RangeIndexType, Int64IndexType))):

        def _convert_to_arrays_impl(left, right):

            if (left is right or left.equals(right)):
                return pd.Int64Index(left.values), None, None

            joined_data, indexer1, indexer2 = _sdc_internal_join(left.values, right.values)
            return pd.Int64Index(joined_data), indexer1, indexer2

        return _convert_to_arrays_impl

    # for joining with deprecated types.Array indexes (e.g. representing UInt64Index)
    # resulting index will be of numpy array type. TO-DO: remove once pd.Index overload
    # is supported and all indexes are represented with distinct types
    else:
        convert_left = isinstance(left, (PositionalIndexType, RangeIndexType, Int64IndexType))
        convert_right = isinstance(right, (PositionalIndexType, RangeIndexType, Int64IndexType))
        index_dtypes_match, res_index_dtype = find_index_common_dtype(left, right)

        def pd_indexes_join_array_indexes_impl(left, right):

            _left = left.values if convert_left == True else left  # noqa
            _right = right.values if convert_right == True else right  # noqa
            if (_left is _right
                    or numpy_like.array_equal(_left, _right)):
                if index_dtypes_match == False:  # noqa
                    joined_index = numpy_like.astype(_left, res_index_dtype)
                else:
                    joined_index = _left
                return joined_index, None, None

            return _sdc_internal_join(_left, _right)

        return pd_indexes_join_array_indexes_impl


def sdc_fix_indexes_join(joined, indexer1, indexer2):
    """ Stub, implemented via pd_fix_indexes_join_overload """
    pass


@sdc_overload(sdc_fix_indexes_join)
def pd_fix_indexes_join_overload(joined, indexer1, indexer2):
    """ Wraps pandas index.join() into new function that returns indexers as arrays and not optional(array) """

    # This function is simply a workaround for problem with parfor lowering
    # broken by indexers typed as types.Optional(Array) - FIXME_Numba#XXXX: remove it
    # in all places when parfor issue is fixed
    def pd_fix_indexes_join_impl(joined, indexer1, indexer2):
        if indexer1 is not None:
            _indexer1 = _nonoptional(indexer1)
        else:
            _indexer1 = np.arange(len(joined))

        if indexer2 is not None:
            _indexer2 = _nonoptional(indexer2)
        else:
            # a missing second indexer reuses the first one
            _indexer2 = _indexer1

        return joined, _indexer1, _indexer2

    return pd_fix_indexes_join_impl


def sdc_unify_index_types(left, right):
    """ Stub, implemented via sdc_unify_index_types_overload """
    pass


@sdc_overload(sdc_unify_index_types)
def sdc_unify_index_types_overload(left, right):
    """ For equal indexes of different dtypes produced index of common dtype """

    index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(left, right)
    is_left_index_cached = not isinstance(left, sdc_indexes_wo_values_cache)
    is_left_index_array = isinstance(left, types.Array)
    is_right_index_cached = not isinstance(right, sdc_indexes_wo_values_cache)
    is_right_index_array = isinstance(right, types.Array)

    def sdc_unify_index_types_impl(left, right):
        if index_dtypes_match == True:  # noqa
            return left
        else:
            if is_left_index_cached == True:  # noqa
                index_data = left.values if is_left_index_array == False else left  # noqa
            elif is_right_index_cached == True:  # noqa
                index_data = right.values if is_right_index_array == False else right  # noqa
            else:
                # using numpy_like.astype but not index.astype since latter works differently
                index_data = numpy_like.astype(left, numba_index_common_dtype)

            return fix_df_index(index_data)

    return sdc_unify_index_types_impl


@sdc_overload(np.array)
def sdc_np_array_overload(A):
    """ Overload provides np.array(A) implementations for internal pandas index types """

    if not (isinstance(A, sdc_pandas_index_types)
            and not isinstance(A, sdc_old_index_types)):
        return None

    if isinstance(A, PositionalIndexType):
        return lambda A: np.arange(len(A))

    if isinstance(A, RangeIndexType):
        return lambda A: np.arange(A.start, A.stop, A.step)

    if isinstance(A, Int64IndexType):
        return lambda A: A._data


# NOTE: patch boundary and the opening of the next file's license header shared
# the original physical lines with the code above; they are retained as comments.
# diff --git a/sdc/extensions/indexes/int64_index_ext.py b/sdc/extensions/indexes/int64_index_ext.py
# new file mode 100644
# index 000000000..425b57a1f
# --- /dev/null
# +++ b/sdc/extensions/indexes/int64_index_ext.py
# @@ -0,0 +1,560 @@
# *****************************************************************************
# Copyright (c) 2019-2020, Intel Corporation All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# *****************************************************************************

import numba
import numpy as np
import operator
import pandas as pd

from numba import types, prange
from numba.core import cgutils
from numba.extending import (typeof_impl, NativeValue, intrinsic, box, unbox, lower_builtin, )
from numba.core.errors import TypingError
from numba.core.typing.templates import signature
from numba.core.imputils import impl_ret_untracked, call_getiter
from numba.core.boxing import box_array, unbox_array

from sdc.datatypes.indexes import *
from sdc.utilities.sdc_typing_utils import SDCLimitation
from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral
from sdc.utilities.sdc_typing_utils import (
    TypeChecker,
    check_signed_integer,
    _check_dtype_param_type,
    sdc_pandas_index_types,
    check_types_comparable,
    )
from sdc.functions import numpy_like
from sdc.hiframes.api import fix_df_index
from sdc.extensions.indexes.indexes_generic import *
from sdc.datatypes.common_functions import hpat_arrays_append


@intrinsic
def init_int64_index(typingctx, data, name=None):
    """ Intrinsic building a native Int64IndexType struct from int64 array `data` and optional name. """

    if not (isinstance(data, types.Array) and data.dtype is types.int64):
        return None
    assert data.ndim == 1, "Index data must be 1-dimensional"

    name = types.none if name is None else name
    is_named = False if name is types.none else True

    def codegen(context, builder, sig, args):
        data_val, name_val = args
        # create index struct and store values
        int64_index = cgutils.create_struct_proxy(
            sig.return_type)(context, builder)

        int64_index.data = data_val

        if is_named:
            if isinstance(name, types.StringLiteral):
                int64_index.name = numba.cpython.unicode.make_string_from_constant(
                    context, builder, types.unicode_type, name.literal_value)
            else:
                int64_index.name = name_val

        if context.enable_nrt:
            context.nrt.incref(builder, sig.args[0], data_val)
            if is_named:
                context.nrt.incref(builder, sig.args[1], name_val)

        return int64_index._getvalue()

    ret_typ = Int64IndexType(data, is_named)
    sig = signature(ret_typ, data, name)
    return sig, codegen


@sdc_overload(pd.Int64Index)
def pd_int64_index_overload(data, dtype=None, copy=False, name=None):
    """ Overload of pd.Int64Index ctor for arrays/lists of integers and integer indexes.

    Data of non-int64 dtype is converted with numpy_like.astype; int64 data is
    copied only when `copy` is true. Raises ValueError at run time for a
    `dtype` that is not a signed integer.
    """

    _func_name = 'pd.Int64Index().'
    ty_checker = TypeChecker(_func_name)

    convertible_indexes = (PositionalIndexType, RangeIndexType, Int64IndexType)
    if not (isinstance(data, (types.Array, types.List)) and isinstance(data.dtype, types.Integer)
            or isinstance(data, convertible_indexes)):
        ty_checker.raise_exc(data, 'array/list of integers or integer index', 'data')

    dtype_is_number_class = isinstance(dtype, types.NumberClass)
    dtype_is_numpy_signed_int = (check_signed_integer(dtype)
                                 or dtype_is_number_class and check_signed_integer(dtype.dtype))
    dtype_is_unicode_str = isinstance(dtype, (types.UnicodeType, types.StringLiteral))
    if not _check_dtype_param_type(dtype):
        ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype')

    if not (isinstance(copy, (types.NoneType, types.Omitted, types.Boolean)) or copy is False):
        ty_checker.raise_exc(copy, 'bool', 'copy')

    if not (isinstance(name, (types.NoneType, types.Omitted, types.StringLiteral, types.UnicodeType)) or name is None):
        ty_checker.raise_exc(name, 'string or none', 'name')

    is_data_array = isinstance(data, types.Array)
    is_data_index = isinstance(data, convertible_indexes)
    data_dtype_is_int64 = data.dtype is types.int64

    def pd_int64_index_ctor_impl(data, dtype=None, copy=False, name=None):

        if not (dtype is None
                or dtype_is_numpy_signed_int
                or dtype_is_unicode_str and dtype in ('int8', 'int16', 'int32', 'int64')):
            raise ValueError("Incorrect `dtype` passed: expected signed integer")

        if is_data_array == True:  # noqa
            _data = data
        elif is_data_index == True:  # noqa
            _data = data.values
        else:
            # using fix_df_index to get array since it handles index=None
            _data = fix_df_index(data)._data

        if data_dtype_is_int64 == False:  # noqa
            _data = numpy_like.astype(_data, dtype=types.int64)
        else:
            if copy:
                _data = np.copy(_data)
        return init_int64_index(_data, name)

    return pd_int64_index_ctor_impl


@typeof_impl.register(pd.Int64Index)
def typeof_int64_index(val, c):
    """ Maps pd.Int64Index objects to Int64IndexType (named if val.name is not None). """
    index_data_ty = numba.typeof(val._data)
    is_named = val.name is not None
    return Int64IndexType(index_data_ty, is_named=is_named)


@box(Int64IndexType)
def box_int64_index(typ, val, c):
    """ Boxes native Int64IndexType value by calling pd.Int64Index(data, dtype, copy, name). """

    mod_name = c.context.insert_const_string(c.builder.module, "pandas")
    pd_class_obj = c.pyapi.import_module_noblock(mod_name)

    int64_index = cgutils.create_struct_proxy(typ)(c.context, c.builder, val)
    data = box_array(typ.data, int64_index.data, c)

    # dtype and copy params are not stored so use default values
    dtype = c.pyapi.make_none()
    copy = c.pyapi.bool_from_bool(
        c.context.get_constant(types.bool_, False)
    )

    if typ.is_named:
        name = c.pyapi.from_native_value(types.unicode_type, int64_index.name)
    else:
        name = c.pyapi.make_none()

    res = c.pyapi.call_method(pd_class_obj, "Int64Index", (data, dtype, copy, name))

    c.pyapi.decref(data)
    c.pyapi.decref(dtype)
    c.pyapi.decref(copy)
    c.pyapi.decref(name)
    c.pyapi.decref(pd_class_obj)
    return res


@unbox(Int64IndexType)
def unbox_int64_index(typ, val, c):
    """ Unboxes pd.Int64Index into native struct reading its `_data` and (if named) `name`. """

    # TODO: support index unboxing with reference to parent in Numba?
    int64_index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    index_data = c.pyapi.object_getattr_string(val, "_data")
    int64_index.data = unbox_array(typ.data, index_data, c).value
    c.pyapi.decref(index_data)

    if typ.is_named:
        name_obj = c.pyapi.object_getattr_string(val, "name")
        int64_index.name = numba.cpython.unicode.unbox_unicode_str(
            types.unicode_type, name_obj, c).value
        c.pyapi.decref(name_obj)

    is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred())
    return NativeValue(int64_index._getvalue(), is_error=is_error)


@sdc_overload_attribute(Int64IndexType, 'name')
def pd_int64_index_name_overload(self):
    """ Attribute `name`: stored name for named indexes, None otherwise. """
    if not isinstance(self, Int64IndexType):
        return None

    is_named_index = self.is_named

    def pd_int64_index_name_impl(self):
        if is_named_index == True:  # noqa
            return self._name
        else:
            return None

    return pd_int64_index_name_impl


@sdc_overload_attribute(Int64IndexType, 'dtype')
def pd_int64_index_dtype_overload(self):
    """ Attribute `dtype`, delegated to sdc_indexes_attribute_dtype. """
    if not isinstance(self, Int64IndexType):
        return None

    def pd_int64_index_dtype_impl(self):
        return sdc_indexes_attribute_dtype(self)

    return pd_int64_index_dtype_impl


@sdc_overload_attribute(Int64IndexType, 'values')
def pd_int64_index_values_overload(self):
    """ Attribute `values`: the underlying data array (not a copy). """
    if not isinstance(self, Int64IndexType):
        return None

    def pd_int64_index_values_impl(self):
        return self._data

    return pd_int64_index_values_impl


@sdc_overload(len)
def pd_int64_index_len_overload(self):
    """ Overload of len(): length of the underlying data array. """
    if not isinstance(self, Int64IndexType):
        return None

    def pd_int64_index_len_impl(self):
        return len(self._data)

    return pd_int64_index_len_impl


@sdc_overload(operator.contains)
def pd_int64_index_contains_overload(self, val):
    """ Overload of `val in index` for integer scalars, done by scanning the data array. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Operator contains().'
    ty_checker = TypeChecker(_func_name)

    if not (isinstance(val, types.Integer)):
        ty_checker.raise_exc(val, 'integer scalar', 'val')

    def pd_int64_index_contains_impl(self, val):
        # TO-DO: add operator.contains support for arrays in Numba or numpy_like
        for i in prange(len(self._data)):
            if val == self._data[i]:
                break
        else:
            # loop finished without break, i.e. no element matched
            return False

        return True

    return pd_int64_index_contains_impl


@sdc_overload_method(Int64IndexType, 'copy')
def pd_int64_index_copy_overload(self, name=None, deep=False, dtype=None):
    """ Overload of copy(): new index over the same (or copied, if `deep`) data.

    The name of self is kept when `name` is not given and self is named.
    """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method copy().'
    ty_checker = TypeChecker(_func_name)

    if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None):
        ty_checker.raise_exc(name, 'string or none', 'name')

    if not (isinstance(deep, (types.NoneType, types.Omitted, types.Boolean)) or deep is False):
        ty_checker.raise_exc(deep, 'boolean', 'deep')

    if not _check_dtype_param_type(dtype):
        ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype')

    name_is_none = isinstance(name, (types.NoneType, types.Omitted)) or name is None
    keep_name = name_is_none and self.is_named

    # FIXME: deep=True/False is not handled at all - and has to be supported!
    # Support for other indexes too!
    # FIXME: add tests for all index types on copy_param_deep
    def pd_int64_index_copy_impl(self, name=None, deep=False, dtype=None):

        _name = self._name if keep_name == True else name  # noqa
        new_index_data = self._data if not deep else numpy_like.copy(self._data)
        return init_int64_index(new_index_data, _name)

    return pd_int64_index_copy_impl


@sdc_overload(operator.getitem)
def pd_int64_index_getitem_overload(self, idx):
    """ Overload of index[idx].

    Integer idx returns a scalar (negative values count from the end, IndexError
    when out of bounds); slices and integer/boolean arrays or lists return a
    new Int64Index.
    """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Operator getitem().'
    ty_checker = TypeChecker(_func_name)

    if not (isinstance(idx, (types.Integer, types.SliceType))
            or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))):
        ty_checker.raise_exc(idx, 'integer, slice, integer array or list', 'idx')

    if isinstance(idx, types.Integer):
        def pd_int64_index_getitem_impl(self, idx):
            index_len = len(self._data)
            # FIXME_Numba#5801: Numba type unification rules make this float
            idx = types.int64((index_len + idx) if idx < 0 else idx)
            if (idx < 0 or idx >= index_len):
                raise IndexError("Int64Index.getitem: index is out of bounds")

            return self._data[idx]

        return pd_int64_index_getitem_impl

    else:
        def pd_int64_index_getitem_impl(self, idx):
            index_data = self._data[idx]
            return pd.Int64Index(index_data, name=self._name)

        return pd_int64_index_getitem_impl


@sdc_overload(operator.eq)
def pd_int64_index_eq_overload(self, other):
    """ Overload of `==` when at least one operand is Int64IndexType; delegates
    to sdc_indexes_operator_eq. Raises TypingError for non comparable types. """

    _func_name = 'Operator eq.'
    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. \
        Given: self={}, other={}'.format(_func_name, self, other))

    self_is_int64_index = isinstance(self, Int64IndexType)
    other_is_int64_index = isinstance(other, Int64IndexType)

    possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types
    if not (self_is_int64_index and other_is_int64_index
            or (self_is_int64_index and isinstance(other, possible_arg_types))
            or (isinstance(self, possible_arg_types) and other_is_int64_index)):
        return None

    def pd_int64_index_eq_impl(self, other):
        return sdc_indexes_operator_eq(self, other)

    return pd_int64_index_eq_impl


@sdc_overload(operator.ne)
def pd_int64_index_ne_overload(self, other):
    """ Overload of `!=` when at least one operand is Int64IndexType; computed
    as negation of `==` result. Raises TypingError for non comparable types. """

    _func_name = 'Operator ne.'
    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. \
        Given: self={}, other={}'.format(_func_name, self, other))

    self_is_int64_index = isinstance(self, Int64IndexType)
    other_is_int64_index = isinstance(other, Int64IndexType)

    possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types
    if not (self_is_int64_index and other_is_int64_index
            or (self_is_int64_index and isinstance(other, possible_arg_types))
            or (isinstance(self, possible_arg_types) and other_is_int64_index)):
        return None

    def pd_int64_index_ne_impl(self, other):

        eq_res = np.asarray(self == other)  # FIXME_Numba#5157: remove np.asarray and return as list
        return list(~eq_res)

    return pd_int64_index_ne_impl


@lower_builtin(operator.is_, Int64IndexType, Int64IndexType)
def pd_int64_index_is_overload(context, builder, sig, args):
    """ Lowering of `is` for two Int64Index values: false for different types,
    otherwise compares pointers taken from the first operand of each value. """

    ty_lhs, ty_rhs = sig.args
    if ty_lhs != ty_rhs:
        return cgutils.false_bit

    lhs, rhs = args
    lhs_ptr = builder.ptrtoint(lhs.operands[0], cgutils.intp_t)
    rhs_ptr = builder.ptrtoint(rhs.operands[0], cgutils.intp_t)
    return builder.icmp_signed('==', lhs_ptr, rhs_ptr)


@lower_builtin('getiter', Int64IndexType)
def pd_int64_index_getiter(context, builder, sig, args):
    """ Returns a new iterator object for Int64IndexType by delegating to array __iter__ """
    (value,) = args
    int64_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value)
    res = call_getiter(context, builder, sig.args[0].data, int64_index.data)
    # pass the actual return type of the signature rather than the index class
    return impl_ret_untracked(context, builder, sig.return_type, res)


@sdc_overload_method(Int64IndexType, 'ravel')
def pd_int64_index_ravel_overload(self, order='C'):
    """ Overload of ravel(): returns index values; only order='C' is supported. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method ravel().'

    # np.ravel argument order is not supported in Numba
    if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'):
        raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order))

    def pd_int64_index_ravel_impl(self, order='C'):
        return self.values

    return pd_int64_index_ravel_impl


@sdc_overload_method(Int64IndexType, 'equals')
def pd_int64_index_equals_overload(self, other):
    """ Overload of equals() against another index, delegated to sdc_numeric_indexes_equals. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method equals().'
    if not isinstance(other, sdc_pandas_index_types):
        raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}")

    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. \
        Given: self={}, other={}'.format(_func_name, self, other))

    def pd_int64_index_equals_impl(self, other):
        return sdc_numeric_indexes_equals(self, other)

    return pd_int64_index_equals_impl


@sdc_overload_method(Int64IndexType, 'reindex')
def pd_int64_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None):
    """ Overload of reindex(), delegated to sdc_indexes_reindex with all parameters forwarded. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method reindex().'
    if not isinstance(target, sdc_pandas_index_types):
        raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}")

    if not check_types_comparable(self, target):
        raise TypingError('{} Not allowed for non comparable indexes. \
        Given: self={}, target={}'.format(_func_name, self, target))

    def pd_int64_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None):
        # `limit` is forwarded too, so that no argument is silently dropped
        return sdc_indexes_reindex(self, target=target, method=method, level=level,
                                   limit=limit, tolerance=tolerance)

    return pd_int64_index_reindex_impl


@sdc_overload_method(Int64IndexType, 'take')
def pd_int64_index_take_overload(self, indexes):
    """ Overload of take(): new Int64Index (keeping self._name) of elements at given positions. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method take().'
    ty_checker = TypeChecker(_func_name)

    valid_indexes_types = (types.Array, types.List, types.ListType) + sdc_pandas_index_types
    if not (isinstance(indexes, valid_indexes_types)
            and isinstance(indexes.dtype, (types.Integer, types.ListType))):
        ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes')

    # separate handling when indexes is nested lists produces with parallel impls
    if isinstance(indexes.dtype, types.ListType):
        def pd_int64_index_take_chunked_impl(self, indexes):
            new_index_data = numpy_like.take(self.values, indexes)
            return pd.Int64Index(new_index_data, name=self._name)

        return pd_int64_index_take_chunked_impl

    convert_target = isinstance(indexes, sdc_pandas_index_types) and not isinstance(indexes, types.Array)

    def pd_int64_index_take_impl(self, indexes):
        _indexes = indexes.values if convert_target == True else indexes  # noqa
        new_index_data = numpy_like.take(self._data, _indexes)
        return pd.Int64Index(new_index_data, name=self._name)

    return pd_int64_index_take_impl


@sdc_overload_method(Int64IndexType, 'append')
def pd_int64_index_append_overload(self, other):
    """ Overload of append(): concatenates data of self and other.

    Result is Int64Index when the common dtype is int64, plain array otherwise.
    """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method append().'
    ty_checker = TypeChecker(_func_name)

    if not isinstance(other, sdc_pandas_index_types):
        ty_checker.raise_exc(other, 'pandas index', 'other')

    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. \
        Given: self={}, other={}'.format(_func_name, self, other))

    convert_other = not isinstance(other, types.Array)
    _, res_index_dtype = find_index_common_dtype(self, other)
    return_as_array_index = res_index_dtype is not types.int64

    def pd_int64_index_append_impl(self, other):
        _other = other.values if convert_other == True else other  # noqa
        new_index_data = hpat_arrays_append(self._data, _other)
        # this is only needed while some indexes are represented with arrays
        # TO-DO: support pd.Index() overload with dtype arg to create indexes
        if return_as_array_index == False:  # noqa
            return pd.Int64Index(new_index_data)
        else:
            return new_index_data

    return pd_int64_index_append_impl


@sdc_overload_method(Int64IndexType, 'join')
def pd_int64_index_join_overload(self, other, how, level=None, return_indexers=False, sort=False):
    """ Overload of join(); only how='outer' (given as literal) is supported.

    With return_indexers=True returns the full result of sdc_indexes_join_outer
    (joined index and two indexers), otherwise only the joined index.
    """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method join().'
    ty_checker = TypeChecker(_func_name)

    if not isinstance(other, sdc_pandas_index_types):
        ty_checker.raise_exc(other, 'pandas index', 'other')

    if not isinstance(how, types.StringLiteral):
        ty_checker.raise_exc(how, 'string', 'how')
    if not how.literal_value == 'outer':
        raise SDCLimitation(f"{_func_name} Only supporting 'outer' now. Given 'how': {how.literal_value}")

    if not (isinstance(level, (types.Omitted, types.NoneType)) or level is None):
        ty_checker.raise_exc(level, 'None', 'level')

    if not (isinstance(return_indexers, (types.Omitted, BooleanLiteral)) or return_indexers is False):
        ty_checker.raise_exc(return_indexers, 'boolean', 'return_indexers')

    if not (isinstance(sort, (types.Omitted, types.Boolean)) or sort is False):
        ty_checker.raise_exc(sort, 'boolean', 'sort')

    # resolve compile-time value of return_indexers: only a literal carries
    # literal_value, an omitted argument stores its default in .value and the
    # remaining accepted case is the plain Python default (False)
    if isinstance(return_indexers, BooleanLiteral):
        _return_indexers = return_indexers.literal_value
    elif isinstance(return_indexers, types.Omitted):
        _return_indexers = return_indexers.value
    else:
        _return_indexers = return_indexers

    def pd_int64_index_join_impl(self, other, how, level=None, return_indexers=False, sort=False):

        if _return_indexers == True:  # noqa
            return sdc_indexes_join_outer(self, other)
        else:
            # sdc_indexes_join_outer always returns a 3-tuple, keep the index only
            joined_index, _, _ = sdc_indexes_join_outer(self, other)
            return joined_index

    return pd_int64_index_join_impl


# NOTE: patch boundary and the opening of the next file's license header shared
# the original physical lines with the code above; they are retained as comments.
# diff --git a/sdc/extensions/indexes/positional_index_ext.py b/sdc/extensions/indexes/positional_index_ext.py
# new file mode 100644
# index 000000000..8bd89b442
# --- /dev/null
# +++ b/sdc/extensions/indexes/positional_index_ext.py
# @@ -0,0 +1,474 @@
# -*- coding: utf-8 -*-
# *****************************************************************************
# Copyright (c) 2019-2020, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     Redistributions of source code must retain the above copyright notice,
#     this list of conditions and the following disclaimer.
#
#     Redistributions in binary form must reproduce the above copyright notice,
#     this list of conditions and the following disclaimer in the documentation
#     and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import operator +import pandas as pd + +from numba import types +from numba.core import cgutils +from numba.extending import (NativeValue, intrinsic, box, unbox, lower_builtin, ) +from numba.core.errors import TypingError +from numba.core.typing.templates import signature +from numba.core.imputils import impl_ret_untracked, call_getiter + +from sdc.datatypes.indexes import PositionalIndexType, RangeIndexType +from sdc.datatypes.indexes.range_index_type import RangeIndexDataType +from sdc.utilities.sdc_typing_utils import SDCLimitation +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral +from sdc.extensions.indexes.range_index_ext import box_range_index, unbox_range_index +from sdc.utilities.sdc_typing_utils import ( + TypeChecker, + _check_dtype_param_type, + check_types_comparable, + sdc_pandas_index_types, + ) +from sdc.extensions.indexes.indexes_generic import * + + +@intrinsic +def init_positional_index(typingctx, size, name=None): + name = types.none if name is None else name + is_named = False if name is types.none else True + + ret_typ = PositionalIndexType(is_named) + inner_sig = signature(ret_typ.data, size, name) + def codegen(context, builder, sig, args): + data_val, name_val = args + + # create positional_index struct and store created instance + # of 
RangeIndexType as data member + positional_index = cgutils.create_struct_proxy( + sig.return_type)(context, builder) + positional_index.data = context.compile_internal( + builder, + lambda size, name: pd.RangeIndex(size, name=name), + inner_sig, + [data_val, name_val] + ) + + return positional_index._getvalue() + + sig = signature(ret_typ, size, name) + return sig, codegen + + +@box(PositionalIndexType) +def box_positional_index(typ, val, c): + + positional_index = numba.core.cgutils.create_struct_proxy(typ)(c.context, c.builder, val) + data_range_index = numba.core.cgutils.create_struct_proxy(typ.data)( + c.context, c.builder, positional_index.data) + return box_range_index(typ.data, data_range_index._getvalue(), c) + + +@unbox(PositionalIndexType) +def unbox_positional_index(typ, val, c): + + positional_index = cgutils.create_struct_proxy(typ)(c.context, c.builder) + res = unbox_range_index(typ.data, val, c) + positional_index.data = res.value + is_error = res.is_error + + return NativeValue(positional_index._getvalue(), is_error=is_error) + + +@sdc_overload_attribute(PositionalIndexType, 'start') +def pd_positional_index_start_overload(self): + if not isinstance(self, PositionalIndexType): + return None + + def pd_positional_index_start_impl(self): + _self = self._data + return _self.start + + return pd_positional_index_start_impl + + +@sdc_overload_attribute(PositionalIndexType, 'stop') +def pd_positional_index_stop_overload(self): + if not isinstance(self, PositionalIndexType): + return None + + def pd_positional_index_stop_impl(self): + _self = self._data + return _self.stop + + return pd_positional_index_stop_impl + + +@sdc_overload_attribute(PositionalIndexType, 'step') +def pd_positional_index_step_overload(self): + if not isinstance(self, PositionalIndexType): + return None + + def pd_positional_index_step_impl(self): + _self = self._data + return _self.step + + return pd_positional_index_step_impl + + +@sdc_overload_attribute(PositionalIndexType, 'name') 
+def pd_positional_index_name_overload(self): + if not isinstance(self, PositionalIndexType): + return None + + is_named_index = self.is_named + def pd_positional_index_name_impl(self): + _self = self._data + if is_named_index == True: # noqa + return _self.name + else: + return None + + return pd_positional_index_name_impl + + +@sdc_overload_attribute(PositionalIndexType, 'dtype') +def pd_positional_index_dtype_overload(self): + if not isinstance(self, PositionalIndexType): + return None + + def pd_positional_index_dtype_impl(self): + return sdc_indexes_attribute_dtype(self) + + return pd_positional_index_dtype_impl + +@sdc_overload_attribute(PositionalIndexType, 'values') +def pd_positional_index_values_overload(self): + if not isinstance(self, PositionalIndexType): + return None + + def pd_positional_index_values_impl(self): + # TO-DO: add caching when Numba supports writable attributes? + return np.array(self) + + return pd_positional_index_values_impl + +@sdc_overload(len) +def pd_positional_index_len_overload(self): + if not isinstance(self, PositionalIndexType): + return None + + def pd_positional_index_len_impl(self): + return len(self._data) + + return pd_positional_index_len_impl + + +@sdc_overload(operator.contains) +def pd_range_index_contains_overload(self, val): + if not isinstance(self, PositionalIndexType): + return None + + def pd_range_index_contains_impl(self, val): + _self = self._data + return val in self._data + + return pd_range_index_contains_impl + + +@sdc_overload_method(PositionalIndexType, 'copy') +def pd_positional_index_copy_overload(self, name=None, deep=False, dtype=None): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Method copy().' 
+ ty_checker = TypeChecker(_func_name) + + if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None): + ty_checker.raise_exc(name, 'string or none', 'name') + + if not (isinstance(deep, (types.NoneType, types.Omitted, types.Boolean)) or deep is False): + ty_checker.raise_exc(deep, 'boolean', 'deep') + + if not _check_dtype_param_type(dtype): + ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') + + name_is_none = isinstance(name, (types.NoneType, types.Omitted)) or name is None + keep_name = name_is_none and self.is_named + def pd_positional_index_copy_impl(self, name=None, deep=False, dtype=None): + + _name = self.name if keep_name == True else name # noqa + return init_positional_index(len(self), _name) + + return pd_positional_index_copy_impl + + +@sdc_overload(operator.getitem) +def pd_positional_index_getitem_overload(self, idx): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Operator getitem().' + ty_checker = TypeChecker(_func_name) + + if not (isinstance(idx, (types.Integer, types.SliceType)) + or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))): + ty_checker.raise_exc(idx, 'integer, slice, integer array or list', 'idx') + + def pd_positional_index_getitem_impl(self, idx): + _self = self._data + return _self[idx] + + return pd_positional_index_getitem_impl + + +@sdc_overload(operator.eq) +def pd_positional_index_eq_overload(self, other): + + _func_name = 'Operator eq.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. 
\ + Given: self={}, other={}'.format(_func_name, self, other)) + + self_is_positional_index = isinstance(self, PositionalIndexType) + other_is_positional_index = isinstance(other, PositionalIndexType) + + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types + if not (self_is_positional_index and other_is_positional_index + or (self_is_positional_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_positional_index)): + return None + + def pd_positional_index_eq_impl(self, other): + return sdc_indexes_operator_eq(self, other) + + return pd_positional_index_eq_impl + + +@sdc_overload(operator.ne) +def pd_positional_index_ne_overload(self, other): + + _func_name = 'Operator ne.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + self_is_positional_index = isinstance(self, PositionalIndexType) + other_is_positional_index = isinstance(other, PositionalIndexType) + + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types + if not (self_is_positional_index and other_is_positional_index + or (self_is_positional_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_positional_index)): + return None + + def pd_positional_index_ne_impl(self, other): + + eq_res = np.asarray(self == other) # FIXME_Numba#5157: remove np.asarray and return as list + return list(~eq_res) + + return pd_positional_index_ne_impl + + +@lower_builtin(operator.is_, PositionalIndexType, PositionalIndexType) +def pd_positional_index_is_overload(context, builder, sig, args): + + ty_lhs, ty_rhs = sig.args + if ty_lhs != ty_rhs: + return cgutils.false_bit + + lhs, rhs = args + lhs_ptr = builder.ptrtoint(lhs.operands[0], cgutils.intp_t) + rhs_ptr = builder.ptrtoint(rhs.operands[0], cgutils.intp_t) + return builder.icmp_signed('==', lhs_ptr, 
rhs_ptr) + + +@lower_builtin('getiter', PositionalIndexType) +def pd_positional_index_getiter(context, builder, sig, args): + """ Returns a new iterator object for PositionalIndexType by delegating to range.__iter__ """ + (value,) = args + positional_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value) + range_index = cgutils.create_struct_proxy(sig.args[0].data)(context, builder, positional_index.data) + res = call_getiter(context, builder, RangeIndexDataType, range_index.data) + return impl_ret_untracked(context, builder, PositionalIndexType, res) + + + + + +@sdc_overload_method(PositionalIndexType, 'ravel') +def pd_positional_index_ravel_overload(self, order='C'): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Method ravel().' + # np.ravel argument order is not supported in Numba + if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): + raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order)) + + def pd_positional_index_ravel_impl(self, order='C'): + _self = self._data + return _self.values + + return pd_positional_index_ravel_impl + + +@sdc_overload_method(PositionalIndexType, 'equals') +def pd_positional_index_equals_overload(self, other): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Method equals().' + if not isinstance(other, sdc_pandas_index_types): + raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}") + + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. 
\ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_positional_index_equals_impl(self, other): + + _self = self._data + return _self.equals(other) + + return pd_positional_index_equals_impl + + +@sdc_overload_method(PositionalIndexType, 'reindex') +def pd_positional_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Method reindex().' + if not isinstance(target, sdc_pandas_index_types): + raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}") + + if not check_types_comparable(self, target): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, target={}'.format(_func_name, self, target)) + + + def pd_positional_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): + return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance) + + return pd_positional_index_reindex_impl + + +@sdc_overload_method(PositionalIndexType, 'take') +def pd_positional_index_take_overload(self, indexes): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Method take().' + ty_checker = TypeChecker(_func_name) + + valid_indexes_types = (types.Array, types.List) + sdc_pandas_index_types + if not (isinstance(indexes, valid_indexes_types) and isinstance(indexes.dtype, types.Integer)): + ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes') + + def pd_positional_index_take_impl(self, indexes): + _self = self._data + return _self.take(indexes) + + return pd_positional_index_take_impl + + +@sdc_overload_method(PositionalIndexType, 'append') +def pd_positional_index_append_overload(self, other): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Method append().' 
+ ty_checker = TypeChecker(_func_name) + + if not isinstance(other, sdc_pandas_index_types): + ty_checker.raise_exc(other, 'pandas index', 'other') + + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_positional_index_append_impl(self, other): + _self = self._data + return _self.append(other) + + return pd_positional_index_append_impl + + +@sdc_overload_method(PositionalIndexType, 'join') +def pd_positional_index_join_overload(self, other, how, level=None, return_indexers=False, sort=False): + if not isinstance(self, PositionalIndexType): + return None + + _func_name = 'Method join().' + ty_checker = TypeChecker(_func_name) + + if not isinstance(other, sdc_pandas_index_types): + ty_checker.raise_exc(other, 'pandas index', 'other') + + if not isinstance(how, types.StringLiteral): + ty_checker.raise_exc(how, 'string', 'how') + if not how.literal_value == 'outer': + raise SDCLimitation(f"{_func_name} Only supporting 'outer' now. 
Given 'how': {how.literal_value}") + + if not (isinstance(level, (types.Omitted, types.NoneType)) or level is None): + ty_checker.raise_exc(level, 'None', 'level') + + if not (isinstance(return_indexers, (types.Omitted, BooleanLiteral)) or return_indexers is False): + ty_checker.raise_exc(return_indexers, 'boolean', 'return_indexers') + + if not (isinstance(sort, (types.Omitted, types.Boolean)) or sort is False): + ty_checker.raise_exc(sort, 'boolean', 'sort') + + _return_indexers = getattr(return_indexers, 'literal_value', False) # Omitted/False default has no literal_value + if isinstance(self, PositionalIndexType) and isinstance(other, PositionalIndexType): + + def pd_indexes_join_positional_impl(self, other, how, level=None, return_indexers=False, sort=False): + self_size, other_size = len(self), len(other) + min_size = min(len(self), len(other)) + max_size = max(self_size, other_size) + + joined_index = init_positional_index(max_size) + if _return_indexers == True: # noqa + self_indexer = None if self_size == other_size else np.arange(max_size) + other_indexer = None if self_size == other_size else np.arange(max_size) + if self_size > other_size: + other_indexer[min_size:] = -1 + elif self_size < other_size: + self_indexer[min_size:] = -1 + + result = joined_index, self_indexer, other_indexer + else: + result = joined_index + + return result + + return pd_indexes_join_positional_impl + + else: + + def pd_positional_index_join_common_impl(self, other, how, level=None, return_indexers=False, sort=False): + if _return_indexers == True: + return sdc_indexes_join_outer(self, other) + else: + return sdc_indexes_join_outer(self, other)[0] + + return pd_positional_index_join_common_impl diff --git a/sdc/extensions/indexes/range_index_ext.py b/sdc/extensions/indexes/range_index_ext.py index 7b24e7528..9a1718801 100644 --- a/sdc/extensions/indexes/range_index_ext.py +++ b/sdc/extensions/indexes/range_index_ext.py @@ -33,23 +33,24 @@ from numba import types from numba.core import cgutils from numba.extending import (typeof_impl, 
NativeValue, intrinsic, box, unbox, lower_builtin, ) - +from numba.core.errors import TypingError from numba.core.typing.templates import signature from numba.core.imputils import impl_ret_untracked, call_getiter -from sdc.datatypes.range_index_type import RangeIndexType, RangeIndexDataType -from sdc.datatypes.common_functions import SDCLimitation, _sdc_take -from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method -from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array -from sdc.functions.numpy_like import getitem_by_mask - - -def _check_dtype_param_type(dtype): - """ Returns True is dtype is a valid type for dtype parameter and False otherwise. - Used in RangeIndex ctor and other methods that take dtype parameter. """ - - valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) - return isinstance(dtype, valid_dtype_types) or dtype is None +from sdc.datatypes.indexes import PositionalIndexType, RangeIndexType +from sdc.datatypes.indexes.range_index_type import RangeIndexDataType +from sdc.utilities.sdc_typing_utils import SDCLimitation +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral +from sdc.utilities.sdc_typing_utils import ( + TypeChecker, + check_signed_integer, + sdc_pandas_index_types, + check_types_comparable, + _check_dtype_param_type, + sdc_indexes_range_like, + ) +from sdc.functions import numpy_like +from sdc.extensions.indexes.indexes_generic import * @intrinsic @@ -96,8 +97,9 @@ def pd_range_index_overload(start=None, stop=None, step=None, dtype=None, copy=F if not (isinstance(copy, types.Omitted) or fastpath is None): raise SDCLimitation(f"{_func_name} Unsupported parameter. 
Given 'fastpath': {fastpath}") - dtype_is_np_int64 = dtype is types.NumberClass(types.int64) - dtype_is_np_int32 = dtype is types.NumberClass(types.int32) + dtype_is_number_class = isinstance(dtype, types.NumberClass) + dtype_is_numpy_signed_int = (check_signed_integer(dtype) + or dtype_is_number_class and check_signed_integer(dtype.dtype)) dtype_is_unicode_str = isinstance(dtype, (types.UnicodeType, types.StringLiteral)) if not _check_dtype_param_type(dtype): ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype') @@ -125,10 +127,8 @@ def pd_range_index_ctor_dummy_impl( def pd_range_index_ctor_impl(start=None, stop=None, step=None, dtype=None, copy=False, name=None, fastpath=None): if not (dtype is None - or dtype_is_unicode_str and dtype == 'int64' - or dtype_is_unicode_str and dtype == 'int32' - or dtype_is_np_int64 - or dtype_is_np_int32): + or dtype_is_numpy_signed_int + or dtype_is_unicode_str and dtype in ('int8', 'int16', 'int32', 'int64')): raise ValueError("Incorrect `dtype` passed: expected signed integer") # TODO: add support of int32 type @@ -150,8 +150,19 @@ def pd_range_index_ctor_impl(start=None, stop=None, step=None, dtype=None, copy= @typeof_impl.register(pd.RangeIndex) def typeof_range_index(val, c): + # Note: unboxing pd.RangeIndex creates instance of PositionalIndexType + # if index values are trivial range, but creating pd.RangeIndex() with same + # parameters via ctor will create instance of RangeIndexType. 
+ + # This is needed for specializing of Series and DF methods on combination of + # index types and preserving PositionalIndexType as result index type (when possible), + # since in pandas operations on two range indexes may give: + # either RangeIndex or Int64Index (in common case) is_named = val.name is not None - return RangeIndexType(is_named=is_named) + if not (val.start == 0 and val.stop > 0 and val.step == 1): + return RangeIndexType(is_named=is_named) + else: + return PositionalIndexType(is_named=is_named) @box(RangeIndexType) @@ -272,10 +283,8 @@ def pd_range_index_dtype_overload(self): if not isinstance(self, RangeIndexType): return None - range_index_dtype = self.dtype - def pd_range_index_dtype_impl(self): - return range_index_dtype + return sdc_indexes_attribute_dtype(self) return pd_range_index_dtype_impl @@ -325,7 +334,7 @@ def pd_range_index_copy_overload(self, name=None, deep=False, dtype=None): if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None): ty_checker.raise_exc(name, 'string or none', 'name') - if not (isinstance(deep, (types.Omitted, types.Boolean)) or deep is False): + if not (isinstance(deep, (types.NoneType, types.Omitted, types.Boolean)) or deep is False): ty_checker.raise_exc(deep, 'boolean', 'deep') if not _check_dtype_param_type(dtype): @@ -356,7 +365,8 @@ def pd_range_index_getitem_overload(self, idx): if isinstance(idx, types.Integer): def pd_range_index_getitem_impl(self, idx): range_len = len(self._data) - idx = (range_len + idx) if idx < 0 else idx + # FIXME_Numba#5801: Numba type unification rules make this float + idx = types.int64((range_len + idx) if idx < 0 else idx) if (idx < 0 or idx >= range_len): raise IndexError("RangeIndex.getitem: index is out of bounds") return self.start + self.step * idx @@ -375,17 +385,17 @@ def pd_range_index_getitem_impl(self, idx): return pd_range_index_getitem_impl - # returns np.array which is used to represent pandas Int64Index now if 
isinstance(idx, (types.Array, types.List)): if isinstance(idx.dtype, types.Integer): def pd_range_index_getitem_impl(self, idx): - return _sdc_take(self, idx) + res_as_arr = self.take(idx) + return pd.Int64Index(res_as_arr, name=self._name) return pd_range_index_getitem_impl elif isinstance(idx.dtype, types.Boolean): def pd_range_index_getitem_impl(self, idx): - return getitem_by_mask(self, idx) + return numpy_like.getitem_by_mask(self, idx) return pd_range_index_getitem_impl @@ -393,25 +403,22 @@ def pd_range_index_getitem_impl(self, idx): @sdc_overload(operator.eq) def pd_range_index_eq_overload(self, other): + _func_name = 'Operator eq.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + self_is_range_index = isinstance(self, RangeIndexType) other_is_range_index = isinstance(other, RangeIndexType) + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types if not (self_is_range_index and other_is_range_index - or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number) and other_is_range_index))): + or (self_is_range_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_range_index)): return None - one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) def pd_range_index_eq_impl(self, other): - - if one_operand_is_scalar == False: # noqa - if len(self) != len(other): - raise ValueError("Lengths must match to compare") - - # names do not matter when comparing pd.RangeIndex - left = self.values if self_is_range_index == True else self # noqa - right = other.values if other_is_range_index == True else other # noqa - return list(left == right) # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed + return 
sdc_indexes_operator_eq(self, other) return pd_range_index_eq_impl @@ -419,12 +426,18 @@ def pd_range_index_eq_impl(self, other): @sdc_overload(operator.ne) def pd_range_index_ne_overload(self, other): + _func_name = 'Operator ne.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + self_is_range_index = isinstance(self, RangeIndexType) other_is_range_index = isinstance(other, RangeIndexType) + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types if not (self_is_range_index and other_is_range_index - or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number) and other_is_range_index))): + or (self_is_range_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_range_index)): return None def pd_range_index_ne_impl(self, other): @@ -453,5 +466,153 @@ def pd_range_index_getiter(context, builder, sig, args): """ Returns a new iterator object for RangeIndexType by delegating to range.__iter__ """ (value,) = args range_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value) - res = call_getiter(context, builder, types.range_state64_type, range_index.data) + res = call_getiter(context, builder, RangeIndexDataType, range_index.data) return impl_ret_untracked(context, builder, RangeIndexType, res) + + +@sdc_overload_method(RangeIndexType, 'ravel') +def pd_range_index_ravel_overload(self, order='C'): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method ravel().' + + # np.ravel argument order is not supported in Numba + if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'): + raise TypingError('{} Unsupported parameters. 
Given order: {}'.format(_func_name, order)) + + def pd_range_index_ravel_impl(self, order='C'): + return self.values + + return pd_range_index_ravel_impl + + +@sdc_overload_method(RangeIndexType, 'equals') +def pd_range_index_equals_overload(self, other): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method equals().' + if not isinstance(other, sdc_pandas_index_types): + raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}") + + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + if isinstance(other, sdc_indexes_range_like): + + def pd_range_index_equals_impl(self, other): + + if len(self) != len(other): + return False + if len(self) == 0: + return True + + if len(self) == 1: + return self.start == other.start + + return self.start == other.start and self.step == other.step + else: + + def pd_range_index_equals_impl(self, other): + return sdc_numeric_indexes_equals(self, other) + + return pd_range_index_equals_impl + + +@sdc_overload_method(RangeIndexType, 'reindex') +def pd_range_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method reindex().' + if not isinstance(target, sdc_pandas_index_types): + raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}") + + if not check_types_comparable(self, target): + raise TypingError('{} Not allowed for non comparable indexes. 
\ + Given: self={}, target={}'.format(_func_name, self, target)) + + + def pd_range_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): + return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance) + + return pd_range_index_reindex_impl + + +@sdc_overload_method(RangeIndexType, 'take') +def pd_range_index_take_overload(self, indexes): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method take().' + ty_checker = TypeChecker(_func_name) + + valid_indexes_types = (types.Array, types.List) + sdc_pandas_index_types + if not (isinstance(indexes, valid_indexes_types) and isinstance(indexes.dtype, types.Integer)): + ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes') + + def pd_range_index_take_impl(self, indexes): + _self = pd.Int64Index(self.values, name=self._name) + return _self.take(indexes) + + return pd_range_index_take_impl + + +@sdc_overload_method(RangeIndexType, 'append') +def pd_range_index_append_overload(self, other): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method append().' + ty_checker = TypeChecker(_func_name) + + if not isinstance(other, sdc_pandas_index_types): + ty_checker.raise_exc(other, 'pandas index', 'other') + + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_range_index_append_impl(self, other): + int64_index = pd.Int64Index(self.values, name=self._name) + return int64_index.append(other) + + return pd_range_index_append_impl + + +@sdc_overload_method(RangeIndexType, 'join') +def pd_range_index_join_overload(self, other, how, level=None, return_indexers=False, sort=False): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method join().' 
+ ty_checker = TypeChecker(_func_name) + + if not isinstance(other, sdc_pandas_index_types): + ty_checker.raise_exc(other, 'pandas index', 'other') + + if not isinstance(how, types.StringLiteral): + ty_checker.raise_exc(how, 'string', 'how') + if not how.literal_value == 'outer': + raise SDCLimitation(f"{_func_name} Only supporting 'outer' now. Given 'how': {how.literal_value}") + + if not (isinstance(level, (types.Omitted, types.NoneType)) or level is None): + ty_checker.raise_exc(level, 'None', 'level') + + if not (isinstance(return_indexers, (types.Omitted, BooleanLiteral)) or return_indexers is False): + ty_checker.raise_exc(return_indexers, 'boolean', 'return_indexers') + + if not (isinstance(sort, (types.Omitted, types.Boolean)) or sort is False): + ty_checker.raise_exc(sort, 'boolean', 'sort') + + _return_indexers = getattr(return_indexers, 'literal_value', False) # Omitted/False default has no literal_value + def pd_range_index_join_impl(self, other, how, level=None, return_indexers=False, sort=False): + if _return_indexers == True: + return sdc_indexes_join_outer(self, other) + else: + joined_index = sdc_indexes_join_outer(self, other)[0] # same form as the positional join + return joined_index + + return pd_range_index_join_impl diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index 1e67e8ccd..f7441e476 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -46,17 +46,20 @@ import sdc from sdc.functions.statistics import skew_formula from sdc.hiframes.api import isna -from sdc.datatypes.range_index_type import RangeIndexType + +from sdc.datatypes.indexes import * from sdc.utilities.sdc_typing_utils import TypeChecker, is_default from sdc.utilities.utils import (sdc_overload, sdc_register_jitable, min_dtype_int_val, max_dtype_int_val, min_dtype_float_val, max_dtype_float_val) from sdc.str_arr_ext import (StringArrayType, pre_alloc_string_array, get_utf8_size, string_array_type, create_str_arr_from_list, str_arr_set_na_by_mask, - num_total_chars, str_arr_is_na) + num_total_chars, str_arr_is_na, str_arr_set_na) 
from sdc.utilities.prange_utils import parallel_chunks -from sdc.utilities.sdc_typing_utils import check_types_comparable +from sdc.utilities.sdc_typing_utils import check_types_comparable, SDCLimitation from sdc.functions.sort import parallel_sort, parallel_stable_sort, parallel_argsort, parallel_stable_argsort +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types, sdc_pandas_df_column_types + def astype(self, dtype): pass @@ -120,9 +123,13 @@ def sdc_astype_overload(self, dtype): """ ty_checker = TypeChecker("numpy-like 'astype'") - if not isinstance(self, (types.Array, StringArrayType, RangeIndexType)): + valid_self_types = sdc_pandas_df_column_types + if not (isinstance(self, valid_self_types)): return None + if isinstance(self, StringArrayType): + raise SDCLimitation("numpy_like.astype not implemented for string data") # raise, not return + accepted_dtype_types = (types.functions.NumberClass, types.Function, types.StringLiteral) if not isinstance(dtype, accepted_dtype_types): def impl(self, dtype): @@ -156,7 +163,7 @@ def sdc_astype_number_to_string_impl(self, dtype): return sdc_astype_number_to_string_impl - if (isinstance(self, (types.Array, RangeIndexType)) + if (isinstance(self, types.Array) and isinstance(dtype, (types.StringLiteral, types.functions.NumberClass))): def sdc_astype_number_impl(self, dtype): arr = numpy.empty(len(self), dtype=numpy.dtype(dtype)) @@ -344,7 +351,8 @@ def sdc_copy_overload(self): Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k copy """ - if not isinstance(self, (types.Array, StringArrayType, RangeIndexType)): + valid_self_types = sdc_pandas_df_column_types + if not (isinstance(self, valid_self_types)): return None if isinstance(self, types.Array): @@ -360,12 +368,14 @@ def sdc_copy_array_impl(self): return sdc_copy_array_impl - if isinstance(self, (StringArrayType, RangeIndexType)): + elif isinstance(self, StringArrayType): def sdc_copy_str_arr_impl(self): return self.copy() return sdc_copy_str_arr_impl + else: + return None 
@sdc_overload(notnan) def sdc_notnan_overload(self): @@ -953,7 +963,7 @@ def getitem_by_mask(arr, idx): @sdc_overload(getitem_by_mask) -def getitem_by_mask_overload(arr, idx): +def getitem_by_mask_overload(self, idx): """ Creates a new array from arr by selecting elements indicated by Boolean mask idx. @@ -971,13 +981,22 @@ def getitem_by_mask_overload(arr, idx): """ - if not isinstance(arr, (types.Array, StringArrayType, RangeIndexType)): - return + valid_self_types = (types.Array,) + sdc_pandas_index_types + if not isinstance(self, valid_self_types): + return None - res_dtype = arr.dtype - is_str_arr = arr == string_array_type - def getitem_by_mask_impl(arr, idx): - chunks = parallel_chunks(len(arr)) + # for empty index assume it's returns itself + if isinstance(self, EmptyIndexType): + def getitem_by_mask_empty_index_impl(self, idx): + return self + return getitem_by_mask_empty_index_impl + + res_dtype = self.dtype + is_str_arr = self == string_array_type + is_numeric_index = isinstance(self, (PositionalIndexType, RangeIndexType, Int64IndexType)) + + def getitem_by_mask_impl(self, idx): + chunks = parallel_chunks(len(self)) arr_len = numpy.empty(len(chunks), dtype=numpy.int64) length = 0 @@ -1002,16 +1021,18 @@ def getitem_by_mask_impl(arr, idx): for j in range(chunk.start, chunk.stop): if idx[j]: - value = arr[j] + value = self[j] result_data[current_pos] = value if is_str_arr == True: # noqa - result_nan_mask[current_pos] = isna(arr, j) + result_nan_mask[current_pos] = isna(self, j) current_pos += 1 if is_str_arr == True: # noqa result_data_as_str_arr = create_str_arr_from_list(result_data) str_arr_set_na_by_mask(result_data_as_str_arr, result_nan_mask) return result_data_as_str_arr + elif is_numeric_index == True: # noqa + return pandas.Int64Index(result_data, name=self.name) else: return result_data @@ -1088,8 +1109,8 @@ def array_equal(A, B): def sdc_array_equal_overload(A, B): """ Checks 1D sequences A and B of comparable dtypes are equal """ - if not 
(isinstance(A, (types.Array, StringArrayType, types.NoneType, RangeIndexType)) - or isinstance(B, (types.Array, StringArrayType, types.NoneType, RangeIndexType))): + valid_arg_types = sdc_pandas_df_column_types + if not (isinstance(A, valid_arg_types) or isinstance(B, valid_arg_types)): return None _func_name = "numpy-like 'array_equal'" @@ -1109,27 +1130,16 @@ def sdc_array_equal_str_arr_impl(A, B): return sdc_array_equal_str_arr_impl else: - both_range_indexes = isinstance(A, RangeIndexType) and isinstance(B, RangeIndexType) def sdc_array_equal_impl(A, B): - if both_range_indexes == True: # noqa - if len(A) != len(B): - return False - if len(A) == 0: - return True - if len(A) == 1: - return A.start == B.start - - return A.start == B.start and A.step == B.step - else: - if len(A) != len(B): - return False - # FIXME_Numba#5157: change to simple A == B when issue is resolved - eq_res_size = len(A) - eq_res = numpy.empty(eq_res_size, dtype=types.bool_) - for i in numba.prange(eq_res_size): - eq_res[i] = A[i] == B[i] - return numpy.all(eq_res) + if len(A) != len(B): + return False + # FIXME_Numba#5157: change to simple A == B when issue is resolved + eq_res_size = len(A) + eq_res = numpy.empty(eq_res_size, dtype=types.bool_) + for i in numba.prange(eq_res_size): + eq_res[i] = A[i] == B[i] + return numpy.all(eq_res) return sdc_array_equal_impl @@ -1139,9 +1149,6 @@ def sdc_np_array_overload(A): if isinstance(A, types.Array): return lambda A: A - if isinstance(A, RangeIndexType): - return lambda A: np.arange(A.start, A.stop, A.step) - if isinstance(A, types.containers.Set): # TODO: naive implementation, data from set can probably # be copied to array more efficienty @@ -1263,3 +1270,119 @@ def argsort_impl(a, axis=-1, kind=None, order=None): raise ValueError("Unsupported value of 'kind' parameter") return argsort_impl + + +def take(data, indices): + pass + + + +@sdc_overload(take) +def sdc_take_overload(data, indices): + + valid_data_types = sdc_pandas_df_column_types + 
if not (isinstance(data, valid_data_types)): + return None + + valid_indexes_types = (types.Array, types.List, types.ListType) + sdc_pandas_index_types + valid_indexes_dtypes = (types.Integer, types.ListType) + if not (isinstance(indices, valid_indexes_types) + and isinstance(indices.dtype, valid_indexes_dtypes) + and (isinstance(indices.dtype, types.Integer) + or isinstance(indices.dtype.dtype, types.Integer))): + return None + + data_dtype = data.dtype + if isinstance(indices.dtype, types.ListType): + + if isinstance(data_dtype, types.Number): + + def sdc_take_array_indices_seq_impl(data, indices): + res_size = 0 + for i in numba.prange(len(indices)): + res_size += len(indices[i]) + res_arr = numpy.empty(res_size, dtype=data_dtype) + for i in numba.prange(len(indices)): + start = 0 + for l in range(len(indices[0:i])): + start += len(indices[l]) + current_pos = start + for j in range(len(indices[i])): + res_arr[current_pos] = data[indices[i][j]] + current_pos += 1 + return res_arr + + return sdc_take_array_indices_seq_impl + + elif isinstance(data, StringArrayType): + def sdc_take_str_arr_indices_seq_impl(data, indices): + res_size = 0 + for i in numba.prange(len(indices)): + res_size += len(indices[i]) + nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) + num_total_bytes = 0 + for i in numba.prange(len(indices)): + start = 0 + for l in range(len(indices[0:i])): + start += len(indices[l]) + current_pos = start + for j in range(len(indices[i])): + num_total_bytes += get_utf8_size(data[indices[i][j]]) + if isna(data, indices[i][j]): + nan_mask[current_pos] = True + current_pos += 1 + res_arr = pre_alloc_string_array(res_size, num_total_bytes) + for i in numba.prange(len(indices)): + start = 0 + for l in range(len(indices[0:i])): + start += len(indices[l]) + current_pos = start + for j in range(len(indices[i])): + res_arr[current_pos] = data[indices[i][j]] + if nan_mask[current_pos]: + str_arr_set_na(res_arr, current_pos) + current_pos += 1 + + return res_arr + + 
return sdc_take_str_arr_indices_seq_impl + + else: + return None + + else: + if isinstance(data_dtype, (types.Number, types.Boolean)): + + def sdc_take_array_impl(data, indices): + res_size = len(indices) + res_arr = numpy.empty(res_size, dtype=data_dtype) + for i in numba.prange(res_size): + res_arr[i] = data[indices[i]] + return res_arr + + return sdc_take_array_impl + + elif isinstance(data, StringArrayType): + def sdc_take_str_arr_impl(data, indices): + res_size = len(indices) + nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) + num_total_bytes = 0 + for i in numba.prange(res_size): + num_total_bytes += get_utf8_size(data[indices[i]]) + if isna(data, indices[i]): + nan_mask[i] = True + + res_arr = pre_alloc_string_array(res_size, num_total_bytes) + for i in numpy.arange(res_size): + res_arr[i] = data[indices[i]] + if nan_mask[i]: + str_arr_set_na(res_arr, i) + + return res_arr + + return sdc_take_str_arr_impl + + else: + return None + + return None diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index 77436f49b..c06203ecd 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -24,7 +24,6 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** - import numpy as np import pandas as pd @@ -38,12 +37,14 @@ import sdc from sdc.str_ext import string_type, list_string_array_type from sdc.str_arr_ext import (StringArrayType, string_array_type) -from sdc.datatypes.range_index_type import RangeIndexType + +from sdc.datatypes.indexes import * from sdc.hiframes.pd_series_ext import ( SeriesType, if_series_to_array_type) from numba.core.errors import TypingError from sdc.datatypes.categorical.types import Categorical +from sdc.utilities.sdc_typing_utils import sdc_pandas_df_column_types def isna(arr, i): @@ -160,32 +161,50 @@ def fix_df_array_list_str_impl(column): # pragma: no cover if isinstance(column, SeriesType): return lambda column: column._data - if isinstance(column, RangeIndexType): + if isinstance(column, (RangeIndexType, Int64IndexType)): return lambda column: np.array(column) if isinstance(column, (types.Array, StringArrayType, Categorical)): return lambda column: column -def fix_df_index(index): +def fix_df_index(index, coldata=None): return index @overload(fix_df_index) -def fix_df_index_overload(index): - - # TO-DO: replace types.none index with separate type, e.g. 
DefaultIndex - if (index is None or isinstance(index, types.NoneType)): - def fix_df_index_impl(index): +def fix_df_index_overload(index, coldata=None): + + # FIXME: import here due to circular import between indexes, numpy_like, and api + from sdc.extensions.indexes.empty_index_ext import init_empty_index + from sdc.extensions.indexes.positional_index_ext import init_positional_index + + # index here is param supplied to Series/DF ctors, so it can be None + if index is None or isinstance(index, types.NoneType): + if coldata is None or isinstance(coldata, (types.NoneType, types.Omitted)): + def fix_df_index_impl(index, coldata=None): + return init_empty_index() + elif isinstance(coldata, sdc_pandas_df_column_types): + def fix_df_index_impl(index, coldata=None): + return init_positional_index(len(coldata)) + else: return None - elif isinstance(index, RangeIndexType): - def fix_df_index_impl(index): + return fix_df_index_impl + + elif isinstance(index, (RangeIndexType, Int64IndexType, EmptyIndexType, PositionalIndexType)): + def fix_df_index_impl(index, coldata=None): return index + # currently only signed integer indexes are represented with own type + # TO-DO: support Uint64Index and Float64Indexes + elif isinstance(index.dtype, types.Integer) and index.dtype.signed: + def fix_df_index_impl(index, coldata=None): + index_data = fix_df_array(index) + return pd.Int64Index(index_data) else: # default case, transform index the same as df data - def fix_df_index_impl(index): + def fix_df_index_impl(index, coldata=None): return fix_df_array(index) return fix_df_index_impl diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index 5e6930da9..656b3833c 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -47,16 +47,20 @@ from sdc.datatypes.categorical.boxing import unbox_Categorical, box_Categorical from sdc.hiframes.pd_series_ext import SeriesType from sdc.hiframes.pd_series_type import _get_series_array_type - from sdc.hiframes.pd_dataframe_ext 
import get_structure_maps +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types from .. import hstr_ext import llvmlite.binding as ll from llvmlite import ir as lir from llvmlite.llvmpy.core import Type as LLType -from sdc.datatypes.range_index_type import RangeIndexType + +from sdc.datatypes.indexes import * from sdc.extensions.indexes.range_index_ext import box_range_index, unbox_range_index +from sdc.extensions.indexes.empty_index_ext import box_empty_index, unbox_empty_index +from sdc.extensions.indexes.int64_index_ext import box_int64_index, unbox_int64_index from sdc.str_arr_type import StringArrayType +from sdc.extensions.indexes.positional_index_ext import unbox_positional_index, box_positional_index ll.add_symbol('array_size', hstr_ext.array_size) ll.add_symbol('array_getptr1', hstr_ext.array_getptr1) @@ -192,25 +196,33 @@ def _infer_series_list_dtype(S): def _infer_index_type(index): """ Deduces native Numba type used to represent index Python object """ + + index_is_named = index.name is not None + # more specific types go first (e.g. RangeIndex is subtype of Int64Index) if isinstance(index, pd.RangeIndex): # depending on actual index value unbox to diff types: none-index if it matches # positions or to RangeIndexType in general case - if (index.start == 0 and index.step == 1 and index.name is None): - return types.none + if (index.start == 0 and index.step == 1): + return PositionalIndexType(is_named=index_is_named) else: - if index.name is None: - return RangeIndexType() - else: - return RangeIndexType(is_named=True) + return RangeIndexType(is_named=index_is_named) # for unsupported pandas indexes we explicitly unbox to None if isinstance(index, pd.DatetimeIndex): return types.none + + if isinstance(index, pd.Int64Index): + index_data_type = numba.typeof(index._data) + return Int64IndexType(index_data_type, is_named=index_is_named) + if index.dtype == np.dtype('O'): # TO-DO: should we check that all elements are strings? 
if len(index) > 0 and isinstance(index[0], str): return string_array_type + elif len(index) == 0: + return EmptyIndexType(is_named=index_is_named) else: + assert False, f"Unboxing failed: cannot infer type for index:\n\t{index}" return types.none numba_index_type = numpy_support.from_dtype(index.dtype) @@ -264,11 +276,9 @@ def box_dataframe(typ, val, c): df_obj = pyapi.call_method(class_obj, "DataFrame", (df_dict,)) pyapi.decref(df_dict) - # set df.index if necessary - if typ.index != types.none: - index_obj = _box_index_data(typ.index, dataframe.index, c) - pyapi.object_setattr_string(df_obj, 'index', index_obj) - pyapi.decref(index_obj) + index_obj = _box_index_data(typ.index, dataframe.index, c) + pyapi.object_setattr_string(df_obj, 'index', index_obj) + pyapi.decref(index_obj) for arrays_list_obj in arrays_list_objs.values(): pyapi.decref(arrays_list_obj) @@ -320,12 +330,24 @@ def _unbox_index_data(index_typ, index_obj, c): c: LLVM context object Returns: LLVM instructions to generate native value """ + + if isinstance(index_typ, EmptyIndexType): + return unbox_empty_index(index_typ, index_obj, c) + + if isinstance(index_typ, PositionalIndexType): + return unbox_positional_index(index_typ, index_obj, c) + if isinstance(index_typ, RangeIndexType): return unbox_range_index(index_typ, index_obj, c) + if isinstance(index_typ, Int64IndexType): + return unbox_int64_index(index_typ, index_obj, c) + if index_typ == string_array_type: return unbox_str_series(index_typ, index_obj, c) + # this is still here only because of Float64Index represented as array + # TO-DO: remove when it's added if isinstance(index_typ, types.Array): index_data = c.pyapi.object_getattr_string(index_obj, "_data") res = unbox_array(index_typ, index_data, c) @@ -333,6 +355,7 @@ def _unbox_index_data(index_typ, index_obj, c): return res if isinstance(index_typ, types.NoneType): + assert False, "unboxing to None index!" 
return unbox_none(index_typ, index_obj, c) assert False, f"_unbox_index_data: unexpected index type({index_typ}) while unboxing" @@ -382,11 +405,7 @@ def box_series(typ, val, c): typ)(c.context, c.builder, val) arr = _box_series_data(dtype, typ.data, series.data, c) - - if typ.index is types.none: - index = c.pyapi.make_none() - else: - index = _box_index_data(typ.index, series.index, c) + index = _box_index_data(typ.index, series.index, c) if typ.is_named: name = c.pyapi.from_native_value(string_type, series.name) @@ -437,16 +456,22 @@ def _box_index_data(index_typ, val, c): c: LLVM context object Returns: Python object native value is boxed into """ - assert isinstance(index_typ, (RangeIndexType, StringArrayType, types.Array, types.NoneType)) + assert isinstance(index_typ, sdc_pandas_index_types) - if isinstance(index_typ, RangeIndexType): + if isinstance(index_typ, EmptyIndexType): + index = box_empty_index(index_typ, val, c) + elif isinstance(index_typ, PositionalIndexType): + index = box_positional_index(index_typ, val, c) + elif isinstance(index_typ, RangeIndexType): index = box_range_index(index_typ, val, c) + elif isinstance(index_typ, Int64IndexType): + index = box_int64_index(index_typ, val, c) elif isinstance(index_typ, types.Array): index = box_array(index_typ, val, c) elif isinstance(index_typ, StringArrayType): index = box_str_arr(string_array_type, val, c) - else: # index_typ is types.none - index = c.pyapi.make_none() + else: + assert False, f"_box_index_data called with unknown index type: {index_typ}" return index diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index d48aaf0f1..7bee3bf48 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -48,7 +48,6 @@ from sdc.hiframes.pd_series_type import SeriesType from sdc.datatypes.categorical.pdimpl import is_categoricaldtype from sdc.datatypes.series.pdimpl import _Series_category -from sdc.datatypes.range_index_type import RangeIndexType def 
is_str_series_typ(t): @@ -138,7 +137,7 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): fix_data = sdc.hiframes.api.fix_df_array(data) - fix_index = sdc.hiframes.api.fix_df_index(index) + fix_index = sdc.hiframes.api.fix_df_index(index, fix_data) return sdc.hiframes.api.init_series(fix_data, fix_index, name) return hpat_pandas_series_ctor_impl diff --git a/sdc/hiframes/pd_series_type.py b/sdc/hiframes/pd_series_type.py index 32e004a14..569df88a3 100644 --- a/sdc/hiframes/pd_series_type.py +++ b/sdc/hiframes/pd_series_type.py @@ -38,6 +38,7 @@ from sdc.str_ext import string_type, list_string_array_type from sdc.str_arr_ext import (string_array_type, iternext_str_array, StringArrayType) from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical +from sdc.datatypes.indexes.positional_index_type import PositionalIndexType class SeriesType(types.IterableType): @@ -54,7 +55,7 @@ def __init__(self, dtype, data=None, index=None, is_named=False): if isinstance(dtype, types.Record) else dtype) self.data = data if index is None: - index = types.none + index = PositionalIndexType(False) self.index = index # keep is_named in type to enable boxing self.is_named = is_named diff --git a/sdc/rewrites/dataframe_constructor.py b/sdc/rewrites/dataframe_constructor.py index debf0b73c..473b4efed 100644 --- a/sdc/rewrites/dataframe_constructor.py +++ b/sdc/rewrites/dataframe_constructor.py @@ -48,6 +48,10 @@ from sdc.hiframes.pd_dataframe_ext import get_structure_maps from sdc.hiframes.api import fix_df_array, fix_df_index from sdc.str_ext import string_type +from sdc.extensions.indexes.empty_index_ext import init_empty_index +from sdc.datatypes.indexes.empty_index_type import EmptyIndexType +from sdc.utilities.sdc_typing_utils import TypeChecker +from sdc.str_arr_type import StringArrayType @register_rewrite('before-inference') 
@@ -114,7 +118,9 @@ def apply(self): 'string_type': string_type, 'intrinsic': intrinsic, 'fix_df_array': fix_df_array, - 'fix_df_index': fix_df_index + 'fix_df_index': fix_df_index, + 'init_empty_index': init_empty_index, + 'EmptyIndexType': EmptyIndexType }) setattr(pd_dataframe_ext_module, func_name, init_df) @@ -197,6 +203,7 @@ def {func_name}(typingctx, {params}): """ n_cols = {n_cols} + is_df_empty = {n_cols == 0} input_data_typs = ({', '.join(args_col_data) + suffix}) fnty = typingctx.resolve_value_type(fix_df_array) @@ -209,7 +216,8 @@ def {func_name}(typingctx, {params}): input_index_typ = index fnty = typingctx.resolve_value_type(fix_df_index) - fixed_index_sig = fnty.get_call_type(typingctx, (input_index_typ,), {{}}) + fixed_index_sig = fnty.get_call_type(typingctx, + (input_index_typ, {'data_typs[0]' if n_cols > 0 else ''}), {{}}) index_typ = fixed_index_sig.return_type need_fix_index = index_typ != input_index_typ @@ -244,7 +252,11 @@ def codegen(context, builder, sig, args): builder, types.Tuple(data_list_type), data_lists) if need_fix_index == True: - index = context.compile_internal(builder, lambda a: fix_df_index(a), fixed_index_sig, [index]) + if is_df_empty == True: + first_col_data = context.get_dummy_value() + else: + first_col_data = data_arrs_transformed[0] + index = context.compile_internal(builder, lambda a, d: fix_df_index(a, d), fixed_index_sig, [index, first_col_data]) dataframe.data = data_tup dataframe.index = index @@ -286,21 +298,21 @@ def pd_dataframe_overload(data, index=None, columns=None, dtype=None, copy=False """ ty_checker = TypeChecker('Method DataFrame') - ty_checker.check(self, DataFrameType) - if not isinstance(data, dict): - ty_checker.raise_exc(pat, 'dict', 'data') + if not isinstance(data, (types.DictType, types.LiteralStrKeyDict)): + ty_checker.raise_exc(data, 'dict', 'data') - if not isinstance(index, (types.Ommited, types.Array, StringArray, types.NoneType)) and index is not None: - ty_checker.raise_exc(na, 
'array-like', 'index') + if not (isinstance(index, (types.Omitted, types.ListType, types.List, + types.Array, StringArrayType, types.NoneType) or index is None)): + ty_checker.raise_exc(index, 'array-like', 'index') - if not isinstance(columns, (types.Ommited, types.NoneType)) and columns is not None: - ty_checker.raise_exc(na, 'None', 'columns') + if not (isinstance(columns, (types.Omitted, types.NoneType, types.Tuple, types.UniTuple) or columns is None)): + ty_checker.raise_exc(columns, 'tuple of strings', 'columns') - if not isinstance(dtype, (types.Ommited, types.NoneType)) and dtype is not None: - ty_checker.raise_exc(na, 'None', 'dtype') + if not (isinstance(dtype, (types.Omitted, types.NoneType) or dtype is None)): + ty_checker.raise_exc(dtype, 'None', 'dtype') - if not isinstance(copy, (types.Ommited, types.NoneType)) and columns is not False: - ty_checker.raise_exc(na, 'False', 'copy') + if not (isinstance(copy, (types.Omitted, types.NoneType) or columns is False)): + ty_checker.raise_exc(copy, 'False', 'copy') return None diff --git a/sdc/sdc_autogenerated.py b/sdc/sdc_autogenerated.py index 66567e94c..83dcb220e 100644 --- a/sdc/sdc_autogenerated.py +++ b/sdc/sdc_autogenerated.py @@ -39,15 +39,13 @@ from numba.core.errors import TypingError from numba import types -from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, - find_common_dtype_from_numpy_dtypes, find_index_common_dtype) -from sdc.datatypes.common_functions import (sdc_join_series_indexes, ) +from sdc.utilities.sdc_typing_utils import (TypeChecker, check_types_comparable, sdc_old_index_types,) from sdc.hiframes.api import isna from sdc.hiframes.pd_series_type import SeriesType from sdc.str_arr_ext import (string_array_type, str_arr_is_na) from sdc.utilities.utils import sdc_overload, sdc_overload_method from sdc.functions import numpy_like -from sdc.datatypes.range_index_type import RangeIndexType +from sdc.extensions.indexes.indexes_generic 
import sdc_indexes_join_outer, sdc_fix_indexes_join, sdc_unify_index_types def sdc_add(self, other, fill_value=None): @@ -79,63 +77,36 @@ def sdc_add_impl(self, other, fill_value=None): return sdc_add_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_add_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - - return pandas.Series(result_data) - - return sdc_add_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_add_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_add_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.add(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = 
left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - return 
pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_add_impl + return sdc_add_impl @sdc_overload_method(SeriesType, 'add') @@ -190,9 +161,7 @@ def sdc_pandas_series_add(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -247,63 +216,36 @@ def sdc_div_impl(self, other, fill_value=None): return sdc_div_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_div_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - - return pandas.Series(result_data) - - return sdc_div_impl - else: - left_index_is_range = 
isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_div_impl(self, other, fill_value=None): - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_div_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.div(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = 
numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_div_impl + return sdc_div_impl @sdc_overload_method(SeriesType, 'div') @@ -358,9 +300,7 @@ def sdc_pandas_series_div(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -415,63 +355,36 @@ def sdc_sub_impl(self, other, fill_value=None): return sdc_sub_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_sub_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left - _right - - return pandas.Series(result_data) - return sdc_sub_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_sub_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_sub_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.sub(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = left_index.join(right_index, 'outer', 
return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left - _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left - _right - return pandas.Series(result_data, 
index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_sub_impl + return sdc_sub_impl @sdc_overload_method(SeriesType, 'sub') @@ -526,9 +439,7 @@ def sdc_pandas_series_sub(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -583,63 +494,36 @@ def sdc_mul_impl(self, other, fill_value=None): return sdc_mul_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_mul_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left * _right - - return pandas.Series(result_data) - - return sdc_mul_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, 
types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_mul_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_mul_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.mul(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in 
numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left * _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left * _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_mul_impl + return sdc_mul_impl @sdc_overload_method(SeriesType, 'mul') @@ -694,9 +578,7 @@ def sdc_pandas_series_mul(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -751,63 +633,36 @@ def sdc_truediv_impl(self, other, fill_value=None): return sdc_truediv_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_truediv_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - - return pandas.Series(result_data) - - return sdc_truediv_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_truediv_impl(self, other, fill_value=None): - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_truediv_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.truediv(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = 
left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - return 
pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_truediv_impl + return sdc_truediv_impl @sdc_overload_method(SeriesType, 'truediv') @@ -862,9 +717,7 @@ def sdc_pandas_series_truediv(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -919,63 +772,36 @@ def sdc_floordiv_impl(self, other, fill_value=None): return sdc_floordiv_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_floordiv_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left // _right - - return pandas.Series(result_data) - return sdc_floordiv_impl - else: - 
left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_floordiv_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_floordiv_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.floordiv(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) 
- result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left // _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left // _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_floordiv_impl + return sdc_floordiv_impl @sdc_overload_method(SeriesType, 'floordiv') @@ -1030,9 +856,7 @@ def sdc_pandas_series_floordiv(self, other, level=None, fill_value=None, axis=0) operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1087,63 +911,36 @@ def sdc_mod_impl(self, other, fill_value=None): return sdc_mod_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_mod_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left % _right - - return pandas.Series(result_data) - - return sdc_mod_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_mod_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_mod_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.mod(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = left_index.join(right_index, 'outer', 
return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left % _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left % _right - return pandas.Series(result_data, 
index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_mod_impl + return sdc_mod_impl @sdc_overload_method(SeriesType, 'mod') @@ -1198,9 +995,7 @@ def sdc_pandas_series_mod(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1255,63 +1050,36 @@ def sdc_pow_impl(self, other, fill_value=None): return sdc_pow_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_pow_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left ** _right - - return pandas.Series(result_data) - - return sdc_pow_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, 
types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_pow_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_pow_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.pow(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in 
numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left ** _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left ** _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_pow_impl + return sdc_pow_impl @sdc_overload_method(SeriesType, 'pow') @@ -1366,9 +1134,7 @@ def sdc_pandas_series_pow(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1415,50 +1181,34 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_lt_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): raise ValueError("Can only compare identically-labeled Series objects") - - if fill_value_is_none == True: # noqa - result_data = self._data < other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left < _right - - return pandas.Series(result_data) - - return _series_lt_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_lt_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, 
_right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.lt(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data < other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left < _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_lt_common_impl + return _series_lt_common_impl return None @@ -1521,9 +1271,7 @@ def sdc_pandas_series_lt(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1552,59 +1300,43 @@ def sdc_gt_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_gt_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self > other, index=self._index, name=self._name) - return _series_gt_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_gt_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): raise ValueError("Can only compare identically-labeled Series objects") - - if fill_value_is_none == True: # noqa - result_data = self._data > other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left > _right - - return pandas.Series(result_data) - - return _series_gt_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, 
numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_gt_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.gt(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data > other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left > _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_gt_common_impl + return _series_lt_common_impl return None @@ -1667,9 +1399,7 @@ def sdc_pandas_series_gt(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = 
check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1698,59 +1428,43 @@ def sdc_le_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_le_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self <= other, index=self._index, name=self._name) - return _series_le_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_le_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): raise ValueError("Can only compare identically-labeled Series objects") - - if fill_value_is_none == True: # noqa - result_data = self._data <= other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left <= _right - - return 
pandas.Series(result_data) - - return _series_le_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_le_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.le(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data <= other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left <= _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_le_common_impl + return _series_lt_common_impl return None @@ -1813,9 +1527,7 @@ def sdc_pandas_series_le(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or 
check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1844,59 +1556,43 @@ def sdc_ge_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_ge_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self >= other, index=self._index, name=self._name) - return _series_ge_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_ge_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): raise ValueError("Can only compare identically-labeled Series objects") - - if fill_value_is_none == True: # noqa - result_data = self._data >= other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else 
self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left >= _right - - return pandas.Series(result_data) - - return _series_ge_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_ge_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.ge(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data >= other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left >= _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_ge_common_impl + return _series_lt_common_impl return None @@ -1959,9 +1655,7 @@ def sdc_pandas_series_ge(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - 
none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1990,59 +1684,43 @@ def sdc_ne_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_ne_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self != other, index=self._index, name=self._name) - return _series_ne_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_ne_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): raise ValueError("Can only compare identically-labeled Series objects") - - if fill_value_is_none == True: # noqa - result_data = self._data != other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - 
for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left != _right - - return pandas.Series(result_data) - - return _series_ne_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_ne_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.ne(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data != other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left != _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_ne_common_impl + return _series_lt_common_impl return None @@ -2105,9 +1783,7 @@ def sdc_pandas_series_ne(self, 
other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2136,59 +1812,43 @@ def sdc_eq_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_eq_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self == other, index=self._index, name=self._name) - return _series_eq_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_eq_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): raise ValueError("Can only compare identically-labeled Series objects") - - if fill_value_is_none 
== True: # noqa - result_data = self._data == other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left == _right - - return pandas.Series(result_data) - - return _series_eq_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_eq_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.eq(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data == other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left == _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - 
return _series_eq_common_impl + return _series_lt_common_impl return None @@ -2251,9 +1911,7 @@ def sdc_pandas_series_eq(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2317,9 +1975,7 @@ def sdc_pandas_series_operator_add(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2382,9 +2038,7 @@ def sdc_pandas_series_operator_sub(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2447,9 +2101,7 @@ def sdc_pandas_series_operator_mul(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2512,9 +2164,7 @@ def sdc_pandas_series_operator_truediv(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2577,9 +2227,7 @@ def sdc_pandas_series_operator_floordiv(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2642,9 +2290,7 @@ def sdc_pandas_series_operator_mod(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2707,9 +2353,7 @@ def sdc_pandas_series_operator_pow(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2762,9 +2406,7 @@ def sdc_pandas_series_operator_lt(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2817,9 +2459,7 @@ def sdc_pandas_series_operator_gt(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2872,9 +2512,7 @@ def sdc_pandas_series_operator_le(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2927,9 +2565,7 @@ def sdc_pandas_series_operator_ge(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2982,9 +2618,7 @@ def sdc_pandas_series_operator_ne(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -3037,9 +2671,7 @@ def sdc_pandas_series_operator_eq(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) diff --git a/sdc/sdc_function_templates.py b/sdc/sdc_function_templates.py index 0c6d34356..891f4c9c6 100644 --- a/sdc/sdc_function_templates.py +++ b/sdc/sdc_function_templates.py @@ -40,15 +40,13 @@ from numba.core.errors import TypingError from numba import types -from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, - find_common_dtype_from_numpy_dtypes, find_index_common_dtype) -from sdc.datatypes.common_functions import (sdc_join_series_indexes, ) +from sdc.utilities.sdc_typing_utils import (TypeChecker, check_types_comparable, sdc_old_index_types,) from sdc.hiframes.api import isna from sdc.hiframes.pd_series_type import SeriesType from sdc.str_arr_ext import (string_array_type, str_arr_is_na) from sdc.utilities.utils import sdc_overload, sdc_overload_method from sdc.functions import numpy_like -from sdc.datatypes.range_index_type import RangeIndexType +from sdc.extensions.indexes.indexes_generic import sdc_indexes_join_outer, sdc_fix_indexes_join, sdc_unify_index_types def sdc_binop(self, other, fill_value=None): @@ -79,63 +77,36 @@ def sdc_binop_impl(self, other, fill_value=None): return sdc_binop_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_binop_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - 
_right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - - return pandas.Series(result_data) - - return sdc_binop_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_binop_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) + def sdc_binop_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.binop(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below 
with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_binop_impl + return sdc_binop_impl def sdc_pandas_series_binop(self, other, level=None, fill_value=None, axis=0): @@ -189,9 +160,7 @@ def sdc_pandas_series_binop(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -228,59 +197,43 @@ def sdc_comp_binop_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_comp_binop_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self < other, index=self._index, name=self._name) - return _series_comp_binop_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_comp_binop_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): raise ValueError("Can only compare identically-labeled Series objects") - - if fill_value_is_none == True: # noqa - result_data = self._data < other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left < _right - - return pandas.Series(result_data) - - return _series_comp_binop_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, 
types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_comp_binop_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.comp_binop(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data < other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left < _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_comp_binop_common_impl + return _series_lt_common_impl return None @@ -342,9 +295,7 @@ def sdc_pandas_series_comp_binop(self, other, level=None, fill_value=None, axis= operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or 
none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -407,9 +358,7 @@ def sdc_pandas_series_operator_binop(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -461,9 +410,7 @@ def sdc_pandas_series_operator_comp_binop(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) diff --git a/sdc/tests/__init__.py b/sdc/tests/__init__.py index eaba6a8a9..eeb4014b8 100644 --- a/sdc/tests/__init__.py +++ b/sdc/tests/__init__.py @@ -44,7 +44,7 @@ from sdc.tests.test_io import * from sdc.tests.test_hpat_jit import * -from sdc.tests.test_indexes import * +from sdc.tests.indexes import * from sdc.tests.test_sdc_numpy import * from sdc.tests.test_prange_utils import * diff --git a/sdc/tests/indexes/__init__.py b/sdc/tests/indexes/__init__.py new file mode 100644 index 000000000..c0adc55e5 --- /dev/null +++ b/sdc/tests/indexes/__init__.py @@ -0,0 +1,31 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from sdc.tests.indexes.test_empty_index import TestEmptyIndex +from sdc.tests.indexes.test_range_index import TestRangeIndex +from sdc.tests.indexes.test_positional_index import TestPositionalIndex +from sdc.tests.indexes.test_int64_index import TestInt64Index +from sdc.tests.indexes.test_indexes import TestIndexes diff --git a/sdc/tests/indexes/index_datagens.py b/sdc/tests/indexes/index_datagens.py new file mode 100644 index 000000000..626c86234 --- /dev/null +++ b/sdc/tests/indexes/index_datagens.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pandas as pd +from itertools import (product, combinations_with_replacement, filterfalse, chain) + +from sdc.tests.test_utils import gen_strlist +from sdc.datatypes.indexes import * + +test_global_index_names = [None, 'abc', 'index'] +test_global_range_member_values = [1, 2, 10, -5, 0, None] + + +def _generate_all_range_params(): + + def invalid_params_predicate(range_params): + # if step is zero or all start/stop/step are None range is invalid + return (range_params[-1] == 0 + or all(map(lambda x: x is None, range_params))) + + return filterfalse( + invalid_params_predicate, + combinations_with_replacement(test_global_range_member_values, 3) + ) + + +def _generate_positional_range_params(): + + # for PositionalIndexType represented ranges only + starts, stops, steps = [0,], [1, 2, 10,], [1,] + return product(starts, stops, steps) + + +def _generate_custom_range_params(): + + # for non PositionalIndexType represented range objects + def valid_positional_index_predicate(range_params): + index = pd.RangeIndex(*range_params) + return index.start == 0 and index.stop > 0 and index.step ==
1 + + return filterfalse( + valid_positional_index_predicate, + _generate_all_range_params() + ) + + +def _generate_positional_indexes_fixed(size, start=1, step=3): + yield pd.RangeIndex(size) + yield pd.RangeIndex(size, name='abc') + + +def _generate_custom_range_indexes_fixed(size, start=1, step=3): + yield pd.RangeIndex(stop=step * size, step=step) + yield pd.RangeIndex(stop=2*step*size, step=2*step) + yield pd.RangeIndex(start=start, stop=start + size*step - step//2, step=step) + yield pd.RangeIndex(start=start + step, stop=start + (size + 1)*step, step=step) + + +def _generate_range_indexes_fixed(size, start=1, step=3): + return chain( + _generate_positional_indexes_fixed(size, start, step), + _generate_custom_range_indexes_fixed(size, start, step), + ) + + +def _generate_index_param_values(n): + return chain( + [None], + _generate_range_indexes_fixed(n), + _generate_int64_indexes_fixed(n), + [np.arange(n) / 2], + [np.arange(n, dtype=np.uint64)], + [gen_strlist(n)], + ) + + +def _generate_valid_int64_index_data(): + n = 100 + yield np.arange(n) + yield np.arange(n) % 2 + yield np.ones(n, dtype=np.int16) + yield list(np.arange(n)) + yield pd.RangeIndex(n) + yield pd.Int64Index(np.arange(n)) + yield np.arange(n) * 2 + yield np.arange(2 * n) + + +def _generate_int64_indexes_fixed(size): + yield pd.Int64Index(np.arange(size)) + yield pd.Int64Index(np.arange(size), name='abc') + yield pd.Int64Index([i if i % 2 else 0 for i in range(size)]) + yield pd.Int64Index([i // 2 for i in range(size)]) + yield pd.Int64Index(np.ones(size)) + + +def get_sample_index(size, sdc_index_type): + if sdc_index_type is PositionalIndexType: + return pd.RangeIndex(size) + if sdc_index_type is RangeIndexType: + return pd.RangeIndex(-1, size - 1, 1) + if sdc_index_type is Int64IndexType: + return pd.Int64Index(np.arange(size)) + + raise AssertionError(f"Refusing to create index of non-specific index type: {sdc_index_type}") diff --git a/sdc/tests/indexes/test_empty_index.py
b/sdc/tests/indexes/test_empty_index.py new file mode 100644 index 000000000..1b6baaaec --- /dev/null +++ b/sdc/tests/indexes/test_empty_index.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd +import unittest + +from sdc.tests.test_base import TestCase + + +class TestEmptyIndex(TestCase): + """ Verifies basic support for empty DF and using special EmptyIndexType + for respresnting it's index """ + + def test_create_empty_df(self): + def test_impl(): + df = pd.DataFrame({}, index=None) + return len(df) + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + self.assertEqual(result, result_ref) + + def test_unbox_empty_df(self): + def test_impl(df): + return len(df) + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = test_impl(df) + self.assertEqual(result, result_ref) + + def test_box_empty_df(self): + def test_impl(): + df = pd.DataFrame({}, index=None) + return df + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + pd.testing.assert_frame_equal(result, result_ref) + + def test_empty_df_round_trip(self): + def test_impl(df): + return df + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = test_impl(df) + pd.testing.assert_frame_equal(result, result_ref) + + def test_empty_df_unboxed_get_index_1(self): + def test_impl(df): + return df.index + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = test_impl(df) + pd.testing.assert_index_equal(result, result_ref) + + def test_empty_df_unboxed_get_index_2(self): + + def py_func(df): + return df.index + + @self.jit + def sdc_func(df): + return df._index + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = py_func(df) + pd.testing.assert_index_equal(result, result_ref) + + def test_empty_df_created_get_index_1(self): + def test_impl(): + df = pd.DataFrame({}, index=None) + return df.index + sdc_func = self.jit(test_impl) + 
+ result = sdc_func() + result_ref = test_impl() + pd.testing.assert_index_equal(result, result_ref) + + def test_empty_df_created_get_index_2(self): + + def py_func(): + df = pd.DataFrame({}, index=None) + return df.index + + @self.jit + def sdc_func(): + df = pd.DataFrame({}, index=None) + return df._index + + result = sdc_func() + result_ref = py_func() + pd.testing.assert_index_equal(result, result_ref) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/indexes/test_indexes.py b/sdc/tests/indexes/test_indexes.py new file mode 100644 index 000000000..c23603317 --- /dev/null +++ b/sdc/tests/indexes/test_indexes.py @@ -0,0 +1,375 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pandas as pd +import unittest +from itertools import product + +from sdc.tests.indexes import ( + TestEmptyIndex, + TestPositionalIndex, + TestRangeIndex, + TestInt64Index, + ) +from sdc.tests.indexes.index_datagens import _generate_index_param_values, get_sample_index +from sdc.datatypes.indexes import * + + +class TestIndexes( + TestEmptyIndex, + TestPositionalIndex, + TestRangeIndex, + TestInt64Index + ): + """ This suite combines tests from all concrete index-type suites and also adds + tests for common use-cases that need to be checked for all index-types. """ + + def assert_indexes_equal(self, index1, index2): + # for SDC indexes that are represented with arrays (e.g. 
Uint64Index) + supported_pandas_indexes = (pd.RangeIndex, pd.Int64Index, ) + if (not isinstance(index1, supported_pandas_indexes) + or not isinstance(index2, supported_pandas_indexes)): + index1 = np.asarray(index1) + index2 = np.asarray(index2) + np.testing.assert_array_equal(index1, index2) + else: + pd.testing.assert_index_equal(index1, index2) + + @unittest.skip("TODO: support boxing/unboxing and parent ref for Python ranges in Numba") + def test_indexes_unbox_data_id_check(self): + def test_impl(index): + return index + sdc_func = self.jit(test_impl) + + n = 11 + indexes_to_test = [ + pd.RangeIndex(n, name='abc'), # only this one fails, other pass + pd.Int64Index(np.arange(n), name='abc'), + ] + data_attr_names_map = { + pd.RangeIndex: '_range', + pd.Int64Index: '_data', + } + + for index in indexes_to_test: + with self.subTest(index_type=type(index)): + result = sdc_func(index) + result_ref = test_impl(index) + + data1, data2, data3 = map( + lambda x: getattr(x, data_attr_names_map[type(x)]), + [index, result, result_ref] + ) + self.assertIs(data1, data3) + self.assertIs(data2, data3) + + @unittest.skip("Needs writable native struct type members in Numba") + def test_indexes_named_set_name(self): + def test_impl(index): + index.name = 'def' + return index + sdc_func = self.jit(test_impl) + + n = 11 + indexes_to_test = [ + pd.RangeIndex(n, name='abc'), + pd.Int64Index(np.arange(n), name='abc'), + ] + + for index in indexes_to_test: + with self.subTest(index_type=type(index)): + index1 = index.copy(deep=True) + index2 = index.copy(deep=True) + result = sdc_func(index1) + result_ref = test_impl(index2) + pd.testing.assert_index_equal(result, result_ref) + + @unittest.skip("Needs writable native struct type members and single common type for name") + def test_indexes_unnamed_set_name(self): + def test_impl(index): + index.name = 'def' + return index + sdc_func = self.jit(test_impl) + + n = 11 + indexes_to_test = [ + pd.RangeIndex(n), + 
pd.Int64Index(np.arange(n)), + ] + + for index in indexes_to_test: + with self.subTest(index_type=type(index)): + index1 = index.copy(deep=True) + index2 = index.copy(deep=True) + result = sdc_func(index1) + result_ref = test_impl(index2) + pd.testing.assert_index_equal(result, result_ref) + + @unittest.skip("Need support unboxing pandas indexes with parent ref") + def test_indexes_operator_is_unbox(self): + def test_impl(index1, index2): + return index1 is index2 + sdc_func = self.jit(test_impl) + + indexes_to_test = [ + pd.RangeIndex(1, 21, 3), + pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), + ] + + for index in indexes_to_test: + # positive testcase + with self.subTest(subtest="same indexes"): + index1 = index.copy(deep=True) + index2 = index1 + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + index1 = index.copy(deep=True) + index2 = index.copy(deep=True) + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_indexes_unbox_series_with_index(self): + @self.jit + def test_impl(S): + # TO-DO: this actually includes calling 'index' attribute overload, should really be S._index, + # but this requires separate type (e.g. 
PositionalIndexType) instead of types.none as default index + return S.index + + n = 11 + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(series_index=index): + S = pd.Series(np.ones(n), index=index) + result = test_impl(S) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_create_series_with_index(self): + @self.jit + def test_impl(data, index): + S = pd.Series(data=data, index=index) + return S.index + + n = 11 + series_data = np.ones(n) + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(series_index=index): + result = test_impl(series_data, index) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_box_series_with_index(self): + def test_impl(data, index): + return pd.Series(data=data, index=index) + sdc_func = self.jit(test_impl) + + n = 11 + series_data = np.ones(n) + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + result = sdc_func(series_data, index) + result_ref = test_impl(series_data, index) + pd.testing.assert_series_equal(result, result_ref) + + def test_indexes_get_series_index(self): + def test_impl(S): + return S.index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_index_param_values(n): + with self.subTest(series_index=index): + S = pd.Series(np.ones(n), index=index) + result = sdc_func(S) + result_ref = test_impl(S) + self.assert_indexes_equal(result, result_ref) + + def test_indexes_unbox_df_with_index(self): + @self.jit + def test_impl(df): + # TO-DO: this actually includes calling 'index' attribute overload, should really be df._index, + # but this requires separate type (e.g. 
PositionalIndexType) instead of types.none as default index + return df.index + + n = 11 + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(df_index=index): + df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}, index=index) + result = test_impl(df) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_create_df_with_index(self): + @self.jit + def test_impl(A, B, index): + df = pd.DataFrame({'A': A, 'B': B}, index=index) + return df.index + + n = 11 + A, B = np.ones(n), np.arange(n) + for index in _generate_index_param_values(n): + expected_res = pd.RangeIndex(n) if index is None else index + with self.subTest(df_index=index): + result = test_impl(A, B, index) + self.assert_indexes_equal(result, expected_res) + + def test_indexes_box_df_with_index(self): + def test_impl(A, B, index): + return pd.DataFrame({'A': A, 'B': B}, index=index) + sdc_func = self.jit(test_impl) + + n = 11 + A, B = np.ones(n), np.arange(n, dtype=np.intp) + for index in _generate_index_param_values(n): + with self.subTest(df_index=index): + result = sdc_func(A, B, index) + result_ref = test_impl(A, B, index) + pd.testing.assert_frame_equal(result, result_ref) + + def test_indexes_get_df_index(self): + def test_impl(df): + return df.index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_index_param_values(n): + with self.subTest(df_index=index): + df = pd.DataFrame({'A': np.ones(n)}, index=index) + result = sdc_func(df) + result_ref = test_impl(df) + self.assert_indexes_equal(result, result_ref) + + def test_indexes_support_numpy_like_take_by(self): + """ Verifies numpy_like.take can handle SDC index types as indices """ + + from sdc.functions import numpy_like + def pyfunc(arr, index): + return np.take(arr, index) + + @self.jit + def sdc_func(arr, index): + return numpy_like.take(arr, index) + + n, k = 1000, 200 + np.random.seed(0) + arr = np.arange(n) * 2 + indexes_to_test = 
[ + get_sample_index(k, PositionalIndexType), + get_sample_index(k, RangeIndexType), + get_sample_index(k, Int64IndexType), + ] + for index in indexes_to_test: + with self.subTest(index=index): + result = sdc_func(arr, index) + result_ref = pyfunc(arr, index) + np.testing.assert_array_equal(result, result_ref) + + def test_indexes_support_series_operator_add(self): + def test_impl(data, index1, index2): + S1 = pd.Series(data, index=index1) + S2 = pd.Series(2 * data + 1, index=index2) + return S1 + S2 + sdc_func = self.jit(test_impl) + + n = 11 + series_data = np.arange(n, dtype=np.float64) + index_params_to_test = [ + None, + pd.RangeIndex(0, -n, -1), + pd.Int64Index(np.arange(n) * 2), + ] + + for index1, index2 in product(index_params_to_test, repeat=2): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(series_data, index1, index2) + result_ref = test_impl(series_data, index1, index2) + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) + + def test_indexes_support_series_operator_lt(self): + def test_impl(data, index1, index2): + S1 = pd.Series(data, index=index1) + S2 = pd.Series(2 * data + 1, index=index2) + return S1 < S2 + sdc_func = self.jit(test_impl) + + n = 11 + series_data = np.arange(n, dtype=np.float64) + index_params_to_test = [ + None, + pd.RangeIndex(0, -n, -1), + pd.Int64Index(np.arange(n) * 2), + ] + + for index1 in index_params_to_test: + index2 = index1 + with self.subTest(index1=index1, index2=index2): + result = sdc_func(series_data, index1, index2) + result_ref = test_impl(series_data, index1, index2) + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) + + def test_indexes_support_series_reindexing(self): + from sdc.datatypes.common_functions import sdc_reindex_series + + def pyfunc(data, index, name, by_index): + S = pd.Series(data, index, name=name) + return S.reindex(by_index) + + @self.jit + def sdc_func(data, index, name, by_index): + return sdc_reindex_series(data, index, name, 
by_index) + + n = 17 + np.random.seed(0) + mask = np.random.choice([True, False], n) + name = 'asdf' + + range_index = pd.RangeIndex(n) + int64_index = pd.Int64Index(np.random.choice(range_index.values, n, replace=False)) + indexes_combinations = [ + (range_index, range_index), + (range_index, range_index[::-1]), + (range_index[::-1], range_index), + (range_index, int64_index), + (int64_index, range_index), + ] + + for index1, index2 in indexes_combinations: + with self.subTest(index1=index1, index2=index2): + result = sdc_func(mask, index1, name, index2) + result_ref = pyfunc(mask, index1, name, index2) + pd.testing.assert_series_equal(result, result_ref) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/indexes/test_int64_index.py b/sdc/tests/indexes/test_int64_index.py new file mode 100644 index 000000000..7fa52fd17 --- /dev/null +++ b/sdc/tests/indexes/test_int64_index.py @@ -0,0 +1,550 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd +import unittest +from itertools import (combinations_with_replacement, product, ) + +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_valid_int64_index_data, + _generate_int64_indexes_fixed, + get_sample_index, + ) +from sdc.tests.test_base import TestCase +from sdc.datatypes.indexes import * + + +class TestInt64Index(TestCase): + + def test_int64_index_type_inferred(self): + + for data in _generate_valid_int64_index_data(): + for name in test_global_index_names: + index = pd.Int64Index(data, name=name) + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, Int64IndexType) + + def test_int64_index_create_and_box(self): + def test_impl(data, name): + return pd.Int64Index(data, name=name) + sdc_func = self.jit(test_impl) + + name = 'index' + for data in _generate_valid_int64_index_data(): + with self.subTest(index_data=data): + result = sdc_func(data, name) + result_ref = test_impl(data, name) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_unbox_and_box(self): + def test_impl(index): + return index + sdc_func = self.jit(test_impl) + + n = 11 + for index in _generate_int64_indexes_fixed(n): + with self.subTest(index=index): + result = sdc_func(index) + result_ref = 
test_impl(index) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_create_param_copy_true(self): + def test_impl(arr): + return pd.Int64Index(arr, copy=True) + sdc_func = self.jit(test_impl) + + index_data_to_test = [ + np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64), + list(np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64)), + pd.RangeIndex(11), + pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), + ] + + for index_data in index_data_to_test: + with self.subTest(index_data=index_data): + result = sdc_func(index_data) + result_ref = test_impl(index_data) + pd.testing.assert_index_equal(result, result_ref) + self.assertEqual(result._data is result_ref._data, False) + + def test_int64_index_create_param_copy_default(self): + def test_impl(arr): + return pd.Int64Index(arr) + sdc_func = self.jit(test_impl) + + # only test data that has underlying array that can be referenced + # and ensure it has int64 dtype as otherwise there will always be a copy + index_data_to_test = [ + np.array([1, 2, 3, 5, 6, 3, 4], dtype=np.int64), + pd.Int64Index([1, 2, 3, 5, 6, 3, 4]), + ] + + for index_data in index_data_to_test: + with self.subTest(index_data=index_data): + result = sdc_func(index_data) + result_ref = test_impl(index_data) + pd.testing.assert_index_equal(result, result_ref) + self.assertEqual(result._data is result_ref._data, True) + + def test_int64_index_create_param_dtype(self): + def test_impl(n, dtype): + return pd.Int64Index(np.arange(n), dtype=dtype) + sdc_func = self.jit(test_impl) + + n = 11 + supported_dtypes = [None, np.int64, 'int64', np.int32, 'int32'] + for dtype in supported_dtypes: + with self.subTest(dtype=dtype): + result = sdc_func(n, dtype) + result_ref = test_impl(n, dtype) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_create_param_dtype_invalid(self): + def test_impl(n, dtype): + return pd.Int64Index(np.arange(n), dtype=dtype) + sdc_func = self.jit(test_impl) + + n = 11 + invalid_dtypes = ['float', 'uint'] + 
for dtype in invalid_dtypes: + with self.subTest(dtype=dtype): + with self.assertRaises(Exception) as context: + test_impl(n, dtype) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(n, dtype) + sdc_exception = context.exception + self.assertIn(str(sdc_exception), str(pandas_exception)) + + def test_int64_index_attribute_dtype(self): + def test_impl(index): + return index.dtype + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_attribute_name(self): + def test_impl(index): + return index.name + sdc_func = self.jit(test_impl) + + n = 11 + index_data = np.arange(n) * 2 + for name in test_global_index_names: + with self.subTest(name=name): + index = pd.Int64Index(index_data, name=name) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_len(self): + def test_impl(index): + return len(index) + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2, name='index') + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_attribute_values(self): + def test_impl(index): + return index.values + sdc_func = self.jit(test_impl) + + for data in _generate_valid_int64_index_data(): + index = pd.Int64Index(data) + with self.subTest(index_data=data): + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_contains(self): + def test_impl(index, value): + return value in index + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([1, 11, 2]) + values_to_test = [-5, 15, 1, 11, 5, 6] + for value in values_to_test: + with self.subTest(value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + 
np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_getitem_scalar(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + for data in _generate_valid_int64_index_data(): + index = pd.Int64Index(data) + n = len(index) + values_to_test = [-n, n // 2, n - 1] + for idx in values_to_test: + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + self.assertEqual(result, result_ref) + + def test_int64_index_getitem_scalar_idx_bounds(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2, name='abc') + values_to_test = [-(n + 1), n] + for idx in values_to_test: + with self.subTest(idx=idx): + with self.assertRaises(Exception) as context: + test_impl(index, idx) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(index, idx) + sdc_exception = context.exception + self.assertIsInstance(sdc_exception, type(pandas_exception)) + self.assertIn("out of bounds", str(sdc_exception)) + + def test_int64_index_getitem_slice(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 17 + slices_params = combinations_with_replacement( + [None, 0, -1, n // 2, n, n - 3, n + 3, -(n + 3)], + 2 + ) + + for data in _generate_valid_int64_index_data(): + index = pd.Int64Index(data, name='abc') + for slice_start, slice_stop in slices_params: + for slice_step in [1, -1, 2]: + idx = slice(slice_start, slice_stop, slice_step) + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_iterator_1(self): + def test_impl(index): + res = [] + for i, label in enumerate(index): + res.append((i, label)) + return res + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([5, 3, 
2, 1, 7, 4]) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_iterator_2(self): + def test_impl(index): + res = [] + for label in index: + if not label % 2: + res.append(label) + return res + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([5, 3, 2, 1, 7, 4]) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_int64_index_nparray(self): + def test_impl(index): + return np.array(index) + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, Int64IndexType) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_eq_index(self): + def test_impl(index1, index2): + return index1 == index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_int64_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_eq_scalar(self): + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + A = pd.Int64Index(np.arange(n) * 2) + scalars_to_test = [0, 22, 13, -5, 4.0] + for B in scalars_to_test: + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_eq_nparray(self): + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + for A, B in product( + _generate_int64_indexes_fixed(n), + map(lambda x: np.array(x), _generate_int64_indexes_fixed(n)) + ): + for swap_operands in (False, 
True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_ne_index(self): + def test_impl(index1, index2): + return index1 != index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_int64_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_operator_is_nounbox(self): + def test_impl_1(data): + index1 = pd.Int64Index(data) + index2 = index1 + return index1 is index2 + sdc_func_1 = self.jit(test_impl_1) + + def test_impl_2(data): + index1 = pd.Int64Index(data) + index2 = pd.Int64Index(data) + return index1 is index2 + sdc_func_2 = self.jit(test_impl_2) + + # positive testcase + index_data = [1, 2, 3, 5, 6, 3, 4] + with self.subTest(subtest="same indexes"): + result = sdc_func_1(index_data) + result_ref = test_impl_1(index_data) + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + result = sdc_func_2(index_data) + result_ref = test_impl_2(index_data) + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_int64_index_getitem_by_mask(self): + def test_impl(index, mask): + return index[mask] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + mask = np.random.choice([True, False], n) + for index in _generate_int64_indexes_fixed(n): + result = sdc_func(index, mask) + result_ref = test_impl(index, mask) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = 
self.jit(test_impl) + + n, k = 11, 7 + np.random.seed(0) + idx = np.random.choice(np.arange(n), k) + for index in _generate_int64_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_reindex_equal_indexes(self): + + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) + + n = 10 + np.random.seed(0) + index1 = pd.Int64Index(np.arange(n)) + index2 = pd.Int64Index(np.copy(index1.values)) + + result = sdc_func(index1, index2) + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_int64_index_reindex(self): + + def test_impl(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_impl) + + n = 10 + np.random.seed(0) + index_data = np.arange(n) + index1 = pd.Int64Index(np.random.choice(index_data, n, replace=False)) + reindex_by = [ + pd.RangeIndex(n + 2), + pd.RangeIndex(0, n, 2), + pd.Int64Index(np.random.choice(index_data, n, replace=False)), + pd.Int64Index(np.random.choice([0, 1, 11, 12, 100], n)) + ] + + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_int64_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) + + n = 11 + indexes_to_test = [ + pd.Int64Index(np.arange(n)), + pd.Int64Index(np.arange(n), name='asd'), + pd.Int64Index(np.arange(n) * 2, name='asd'), + pd.Int64Index(np.arange(2 * n)), + ] + for index1, index2 in combinations_with_replacement(indexes_to_test, 2): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + 
self.assertEqual(result, result_ref) + + def test_int64_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.Int64Index(np.arange(n) * 2) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_int64_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_int64_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(n, PositionalIndexType), + get_sample_index(n, RangeIndexType), + get_sample_index(n, Int64IndexType), + ] + for index, other in product( + _generate_int64_indexes_fixed(n), + other_indexes + ): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(2 * n, PositionalIndexType), + get_sample_index(2 * n, RangeIndexType), + get_sample_index(2 * n, Int64IndexType), + ] + for index, other in product( + _generate_int64_indexes_fixed(n), + other_indexes + ): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = 
test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/indexes/test_positional_index.py b/sdc/tests/indexes/test_positional_index.py new file mode 100644 index 000000000..bf180aed9 --- /dev/null +++ b/sdc/tests/indexes/test_positional_index.py @@ -0,0 +1,575 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pandas as pd +import unittest +from itertools import (combinations_with_replacement, product, chain, ) + +import numba +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_positional_range_params, + _generate_positional_indexes_fixed, + get_sample_index, + ) +from sdc.tests.test_base import TestCase +from sdc.extensions.indexes.positional_index_ext import init_positional_index +from sdc.datatypes.indexes import * + + +class TestPositionalIndex(TestCase): + + def test_positional_index_type_inferred(self): + + for params in _generate_positional_range_params(): + start, stop, step = params + for name in test_global_index_names: + index = pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, PositionalIndexType) + + def test_positional_index_create_and_box(self): + @self.jit + def sdc_func(stop, name): + return init_positional_index(stop, name=name) + + for size, name in product([1, 5, 17], test_global_index_names): + with self.subTest(size=size, name=name): + result = sdc_func(size, name) + expected_res = pd.RangeIndex(size, name=name) + pd.testing.assert_index_equal(result, expected_res) + + def test_positional_index_unbox_and_box(self): + def test_impl(index): + return index + sdc_func = 
self.jit(test_impl) + + for params in _generate_positional_range_params(): + start, stop, step = params + for name in test_global_index_names: + index = pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_create_param_name_literal_str(self): + @self.jit + def sdc_func(stop): + return init_positional_index(stop, name='index') + + n = 11 + result = sdc_func(n) + expected_res = pd.RangeIndex(n, name='index') + pd.testing.assert_index_equal(result, expected_res) + + def test_positional_index_attribute_start(self): + def test_impl(index): + return index.start + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_stop(self): + def test_impl(index): + return index.stop + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_step(self): + def test_impl(index): + return index.step + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_dtype(self): + def test_impl(index): + return index.dtype + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(11) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_name(self): + def 
test_impl(index): + return index.name + sdc_func = self.jit(test_impl) + + n = 11 + for name in test_global_index_names: + with self.subTest(name=name): + index = pd.RangeIndex(n, name=name) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_len(self): + def test_impl(index): + return len(index) + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_values(self): + def test_impl(index): + return index.values + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_contains(self): + def test_impl(index, value): + return value in index + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(11) + values_to_test = [-5, 15, 1, 11, 5, 6] + for value in values_to_test: + with self.subTest(value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_copy(self): + def test_impl(index, new_name): + return index.copy(name=new_name) + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + start, stop, step = params + for name, new_name in product(test_global_index_names, repeat=2): + index = pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index, new_name=new_name): + result = sdc_func(index, new_name) + result_ref = test_impl(index, new_name) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_getitem_scalar(self): + def test_impl(index, 
idx): + return index[idx] + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + n = len(index) + if not n: # test only non-empty ranges + continue + values_to_test = [-n, n // 2, n - 1] + for idx in values_to_test: + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + self.assertEqual(result, result_ref) + + def test_positional_index_getitem_scalar_idx_bounds(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.RangeIndex(n, name='abc') + values_to_test = [-(n + 1), n] + for idx in values_to_test: + with self.subTest(idx=idx): + with self.assertRaises(Exception) as context: + test_impl(index, idx) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(index, idx) + sdc_exception = context.exception + self.assertIsInstance(sdc_exception, type(pandas_exception)) + self.assertIn("out of bounds", str(sdc_exception)) + + def test_positional_index_getitem_slice(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + index_len = 17 + slices_params = combinations_with_replacement( + [None, 0, -1, index_len // 2, index_len, index_len - 3, index_len + 3, -(index_len + 3)], + 2, + ) + + index = pd.RangeIndex(0, index_len, 1, name='abc') + for slice_start, slice_stop in slices_params: + for slice_step in [1, -1, 2]: + idx = slice(slice_start, slice_stop, slice_step) + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_iterator_1(self): + def test_impl(index): + res = [] + for i, label in enumerate(index): + res.append((i, label)) + return res + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(0, 21, 1) + result = sdc_func(index) + result_ref = 
test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_iterator_2(self): + def test_impl(index): + res = [] + for label in index: + if not label % 2: + res.append(label) + return res + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(0, 21, 1) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_nparray(self): + def test_impl(index): + return np.array(index) + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, PositionalIndexType) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_eq_index_1(self): + """ Verifies operator.eq implementation for pandas PositionalIndex in a case of equal range sizes """ + def test_impl(index1, index2): + return index1 == index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_positional_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_eq_index_2(self): + """ Verifies operator.eq implementation for pandas PositionalIndex in a case of non equal range sizes """ + def test_impl(index1, index2): + return index1 == index2 + sdc_func = self.jit(test_impl) + + index1 = pd.RangeIndex(11) + index2 = pd.RangeIndex(22) + with self.assertRaises(Exception) as context: + test_impl(index1, index2) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(index1, index2) + sdc_exception = context.exception + self.assertIn(str(sdc_exception), str(pandas_exception)) + + def test_positional_index_operator_eq_scalar(self): + """ Verifies operator.eq implementation for pandas PositionalIndex 
and a scalar value """ + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + A = pd.RangeIndex(n) + scalars_to_test = [ + A.start, + float(A.start), + A.start + 1, + (A.start + A.stop) / 2, + A.stop, + ] + for B in scalars_to_test: + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_eq_nparray(self): + """ Verifies operator.eq implementation for pandas PositionalIndex and a numpy array """ + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + for A, B in product( + _generate_positional_indexes_fixed(n), + map(lambda x: np.array(x), _generate_positional_indexes_fixed(n)) + ): + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_ne_index(self): + """ Verifies operator.ne implementation for pandas PositionalIndex in a case of non equal range sizes """ + def test_impl(index1, index2): + return index1 != index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_positional_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_is_nounbox(self): + def test_impl_1(*args): + index1 = pd.RangeIndex(*args) + index2 = index1 + return index1 is index2 + sdc_func_1 = self.jit(test_impl_1) + + def test_impl_2(*args): + index1 = 
pd.RangeIndex(*args) + index2 = pd.RangeIndex(*args) + return index1 is index2 + sdc_func_2 = self.jit(test_impl_2) + + # positive testcase + params = 1, 21, 3 + with self.subTest(subtest="same indexes"): + result = sdc_func_1(*params) + result_ref = test_impl_1(*params) + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + result = sdc_func_2(*params) + result_ref = test_impl_2(*params) + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_positional_index_getitem_by_mask(self): + def test_impl(index, mask): + return index[mask] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + mask = np.random.choice([True, False], n) + for index in _generate_positional_indexes_fixed(n): + result = sdc_func(index, mask) + result_ref = test_impl(index, mask) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n, k = 11, 7 + np.random.seed(0) + idx = np.random.choice(np.arange(n), k) + for index in _generate_positional_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) + + n = 11 + self_indexes = list(chain( + _generate_positional_indexes_fixed(n), + _generate_positional_indexes_fixed(2 * n) + )) + + all_positional_indexes = list(_generate_positional_indexes_fixed(n)) + other_indexes = chain( + all_positional_indexes, + map(lambda x: pd.Int64Index(x), all_positional_indexes), + ) + + for index1, index2 in product(self_indexes, other_indexes): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + 
self.assertEqual(result, result_ref) + + def test_positional_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.RangeIndex(n) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_reindex_equal_indexes(self): + + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) + + n = 20 + np.random.seed(0) + index1 = pd.RangeIndex(0, n, 1) + index2 = index1.copy(deep=True) + + result = sdc_func(index1, index2) + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_positional_index_reindex(self): + + def test_impl(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_impl) + + n = 20 + np.random.seed(0) + index1 = pd.RangeIndex(0, n, 1) + reindex_by = [ + pd.RangeIndex(n + 2), + pd.RangeIndex(0, n, 2), + pd.Int64Index(np.random.choice(index1.values, n, replace=False)), + pd.Int64Index(np.random.choice([0, 1, 11, 12, 100], n)) + ] + + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_positional_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_positional_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, 
value) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(n, PositionalIndexType), + get_sample_index(n, RangeIndexType), + get_sample_index(n, Int64IndexType), + ] + for index, other in product( + _generate_positional_indexes_fixed(n), + other_indexes + ): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(2 * n, PositionalIndexType), + get_sample_index(2 * n, RangeIndexType), + get_sample_index(2 * n, Int64IndexType), + ] + for index, other in product( + _generate_positional_indexes_fixed(n), + other_indexes + ): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/test_indexes.py b/sdc/tests/indexes/test_range_index.py similarity index 60% rename from sdc/tests/test_indexes.py rename to sdc/tests/indexes/test_range_index.py index b277ac9a1..cbd07e620 100644 --- a/sdc/tests/test_indexes.py +++ b/sdc/tests/indexes/test_range_index.py @@ -25,56 +25,44 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** +import numba import numpy as np import pandas as pd import unittest +from itertools import (combinations_with_replacement, product, chain, ) -from itertools import (combinations_with_replacement, product, filterfalse, chain) - +from numba.core.errors import TypingError +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_custom_range_params, + _generate_range_indexes_fixed, + _generate_custom_range_indexes_fixed, + get_sample_index + ) from sdc.tests.test_base import TestCase from sdc.utilities.sdc_typing_utils import kwsparams2list from sdc.tests.test_series import _make_func_from_text -from numba.core.errors import TypingError - +from sdc.datatypes.indexes import * -test_global_index_names = [None, 'abc', 'index'] -test_global_range_member_values = [1, 2, 10, -5, 0, None] +class TestRangeIndex(TestCase): -def _generate_valid_range_params(): - - def valid_params_predicate(range_params): - # if step is zero or all start/stop/step are None range is invalid - return (range_params[-1] == 0 - or all(map(lambda x: x is None, range_params))) - - return filterfalse( - valid_params_predicate, - combinations_with_replacement(test_global_range_member_values, 3) - ) - - -def _generate_range_indexes_fixed(size, start=1, step=3): - yield pd.RangeIndex(size) - yield pd.RangeIndex(size, name='abc') - yield pd.RangeIndex(stop=step * size, step=step) - yield pd.RangeIndex(stop=2*step*size, step=2*step) - yield pd.RangeIndex(start=start, stop=start + size*step - step//2, step=step) - yield pd.RangeIndex(start=start + step, stop=start + (size + 1)*step, step=step) - - -def _generate_index_param_values(n): - return chain([None], _generate_range_indexes_fixed(n)) - + def test_range_index_type_inferred(self): -class TestRangeIndex(TestCase): + for params in _generate_custom_range_params(): + start, stop, step = params + for name in test_global_index_names: + index = 
pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, RangeIndexType) def test_range_index_create_and_box(self): def test_impl(start, stop, step, name): return pd.RangeIndex(start, stop, step, name=name) sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params for name in test_global_index_names: with self.subTest(start=start, stop=stop, step=step, name=name): @@ -87,7 +75,7 @@ def test_impl(index): return index sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params for name in test_global_index_names: index = pd.RangeIndex(start, stop, step, name=name) @@ -96,18 +84,6 @@ def test_impl(index): result_ref = test_impl(index) pd.testing.assert_index_equal(result, result_ref) - @unittest.skip("TODO: support boxing/unboxing and parent ref for Python ranges in Numba") - def test_range_index_unbox_data_id_check(self): - def test_impl(index): - return index - sdc_func = self.jit(test_impl) - - index = pd.RangeIndex(11, name='abc') - result = sdc_func(index) - result_ref = test_impl(index) - self.assertIs(index._range, result_ref._range) - self.assertIs(result._range, result_ref._range) - @unittest.skip("TODO: add support for integers as floats in ctor") def test_range_index_create_from_floats(self): def test_impl(*args): @@ -119,7 +95,7 @@ def test_impl(*args): result_ref = test_impl(start, stop, step) pd.testing.assert_index_equal(result, result_ref) - def test_range_index_create_invalid1(self): + def test_range_index_create_invalid_1(self): def test_impl(start, stop, step): return pd.RangeIndex(start, stop, step) sdc_func = self.jit(test_impl) @@ -135,7 +111,7 @@ def test_impl(start, stop, step): sdc_exception = context.exception self.assertIn(str(sdc_exception), 
str(pandas_exception)) - def test_range_index_create_invalid2(self): + def test_range_index_create_invalid_2(self): def test_impl(): return pd.RangeIndex(name='index') sdc_func = self.jit(test_impl) @@ -226,7 +202,7 @@ def test_impl(*args): return index.start sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -239,7 +215,7 @@ def test_impl(*args): return index.stop sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -252,7 +228,7 @@ def test_impl(*args): return index.step sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -288,7 +264,7 @@ def test_impl(*args): return len(index) sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -300,7 +276,7 @@ def test_impl(index): return index.values sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): index = pd.RangeIndex(*params) with self.subTest(index=index): result = sdc_func(index) @@ -325,7 +301,7 @@ def test_impl(index, new_name): return index.copy(name=new_name) sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params for name, new_name in product(test_global_index_names, repeat=2): index = pd.RangeIndex(start, 
stop, step, name=name) @@ -339,7 +315,7 @@ def test_impl(index, idx): return index[idx] sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): index = pd.RangeIndex(*params) n = len(index) if not n: # test only non-empty ranges @@ -357,7 +333,7 @@ def test_impl(index, idx): sdc_func = self.jit(test_impl) n = 11 - index = pd.RangeIndex(n, name='abc') + index = pd.RangeIndex(start=0, stop=-n, step=-1, name='abc') values_to_test = [-(n + 1), n] for idx in values_to_test: with self.subTest(idx=idx): @@ -376,166 +352,19 @@ def test_impl(index, idx): return index[idx] sdc_func = self.jit(test_impl) - index_len = 17 - start_values, step_values = [0, 5, -5], [1, 2, 7] + n = 17 slices_params = combinations_with_replacement( - [None, 0, -1, index_len // 2, index_len, index_len - 3, index_len + 3, -(index_len + 3)], + [None, 0, -1, n // 2, n, n - 3, n + 3, -(n + 3)], 2 ) - - for start, step, slice_step in product(start_values, step_values, step_values): - stop = start + index_len + for index in _generate_custom_range_indexes_fixed(n): for slice_start, slice_stop in slices_params: - idx = slice(slice_start, slice_stop, slice_step) - index = pd.RangeIndex(start, stop, step, name='abc') - with self.subTest(index=index, idx=idx): - result = sdc_func(index, idx) - result_ref = test_impl(index, idx) - pd.testing.assert_index_equal(result, result_ref) - - @unittest.skip("Needs writable native struct type members in Numba") - def test_range_index_named_set_name(self): - def test_impl(index): - index.name = 'def' - return index - sdc_func = self.jit(test_impl) - - n = 11 - index1 = pd.RangeIndex(n, name='abc') - index2 = index1.copy(deep=True) - result = sdc_func(index1) - result_ref = test_impl(index2) - pd.testing.assert_index_equal(result, result_ref) - - @unittest.skip("Needs writable native struct type members and single common type for name") - def test_range_index_unnamed_set_name(self): - def 
test_impl(index): - index.name = 'def' - return index - sdc_func = self.jit(test_impl) - - n = 11 - index1 = pd.RangeIndex(n, name='abc') - index2 = index1.copy(deep=True) - result = sdc_func(index1) - result_ref = test_impl(index2) - pd.testing.assert_index_equal(result, result_ref) - - def _test_range_indexes(self, test_impl, indexes, size, apply_func): - for index in indexes: - expected_res = pd.RangeIndex(size) if index is None else index - with self.subTest(series_index=index): - args = apply_func(size, index) - result = test_impl(args) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_unbox_series_with_index(self): - @self.jit - def test_impl(S): - # TO-DO: this actually includes calling 'index' attribute overload, should really be S._index, - # but this requires separate type (e.g. DefaultIndexType) instead of types.none as native index - return S.index - - n = 11 - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(series_index=index): - S = pd.Series(np.ones(n), index=index) - result = test_impl(S) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_create_series_with_index(self): - @self.jit - def test_impl(data, index): - S = pd.Series(data=data, index=index) - return S.index - - n = 11 - series_data = np.ones(n) - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(series_index=index): - result = test_impl(series_data, index) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_box_series_with_index(self): - def test_impl(data, index): - return pd.Series(data=data, index=index) - sdc_func = self.jit(test_impl) - - n = 11 - series_data = np.ones(n) - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - result = sdc_func(series_data, index) - result_ref = test_impl(series_data, index) - 
pd.testing.assert_series_equal(result, result_ref) - - def test_range_index_get_series_index(self): - def test_impl(S): - return S.index - sdc_func = self.jit(test_impl) - - n = 11 - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - S = pd.Series(np.ones(n), index=index) - result = sdc_func(S) - result_ref = test_impl(S) - pd.testing.assert_index_equal(result, result_ref) - - def test_range_index_unbox_df_with_index(self): - @self.jit - def test_impl(df): - return df.index - - n = 11 - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(df_index=index): - df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}, index=index) - result = test_impl(df) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_create_df_with_index(self): - @self.jit - def test_impl(A, B, index): - df = pd.DataFrame({'A': A, 'B': B}, index=index) - return df.index - - n = 11 - A, B = np.ones(n), np.arange(n) - for index in _generate_index_param_values(n): - expected_res = pd.RangeIndex(n) if index is None else index - with self.subTest(df_index=index): - result = test_impl(A, B, index) - pd.testing.assert_index_equal(result, expected_res) - - def test_range_index_box_df_with_index(self): - def test_impl(A, B, index): - return pd.DataFrame({'A': A, 'B': B}, index=index) - sdc_func = self.jit(test_impl) - - n = 11 - A, B = np.ones(n), np.arange(n, dtype=np.intp) - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - result = sdc_func(A, B, index) - result_ref = test_impl(A, B, index) - pd.testing.assert_frame_equal(result, result_ref) - - def test_range_index_get_df_index(self): - def test_impl(df): - return df.index - sdc_func = self.jit(test_impl) - - n = 11 - for index in _generate_index_param_values(n): - with self.subTest(series_index=index): - df = pd.DataFrame({'A': np.ones(n)}, index=index) - result = sdc_func(df) 
- result_ref = test_impl(df) - pd.testing.assert_index_equal(result, result_ref) + for slice_step in [1, -1, 2]: + idx = slice(slice_start, slice_stop, slice_step) + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) def test_range_index_iterator_1(self): def test_impl(index): @@ -569,7 +398,8 @@ def test_impl(index): return np.array(index) sdc_func = self.jit(test_impl) - index = pd.RangeIndex(1, 21, 3) + n = 11 + index = get_sample_index(n, RangeIndexType) result = sdc_func(index) result_ref = test_impl(index) np.testing.assert_array_equal(result, result_ref) @@ -581,7 +411,7 @@ def test_impl(index1, index2): sdc_func = self.jit(test_impl) n = 11 - for index1, index2 in product(_generate_range_indexes_fixed(n), repeat=2): + for index1, index2 in product(_generate_custom_range_indexes_fixed(n), repeat=2): with self.subTest(index1=index1, index2=index2): result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray result_ref = test_impl(index1, index2) @@ -636,8 +466,8 @@ def test_impl(A, B): n = 11 for A, B in product( - _generate_range_indexes_fixed(n), - map(lambda x: np.array(x), _generate_range_indexes_fixed(n)) + _generate_custom_range_indexes_fixed(n), + map(lambda x: np.array(x), _generate_custom_range_indexes_fixed(n)) ): for swap_operands in (False, True): if swap_operands: @@ -654,35 +484,13 @@ def test_impl(index1, index2): sdc_func = self.jit(test_impl) n = 11 - for index1, index2 in product(_generate_range_indexes_fixed(n), repeat=2): + for index1, index2 in product(_generate_custom_range_indexes_fixed(n), repeat=2): with self.subTest(index1=index1, index2=index2): result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray result_ref = test_impl(index1, index2) np.testing.assert_array_equal(result, result_ref) - @unittest.skip("Need support unboxing Python range in Numba with parent ref") - def 
test_range_index_operator_is_1(self): - def test_impl(index1, index2): - return index1 is index2 - sdc_func = self.jit(test_impl) - - # positive testcase - with self.subTest(subtest="same indexes"): - index1 = pd.RangeIndex(1, 21, 3) - index2 = index1 - result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) - self.assertEqual(result, result_ref) - - # negative testcase - with self.subTest(subtest="not same indexes"): - index1 = pd.RangeIndex(1, 21, 3) - index2 = pd.RangeIndex(1, 21, 3) - result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) - self.assertEqual(result, result_ref) - - def test_range_index_operator_is_2(self): + def test_range_index_operator_is_nounbox(self): def test_impl_1(*args): index1 = pd.RangeIndex(*args) index2 = index1 @@ -701,12 +509,14 @@ def test_impl_2(*args): result = sdc_func_1(*params) result_ref = test_impl_1(*params) self.assertEqual(result, result_ref) + self.assertEqual(result, True) # negative testcase with self.subTest(subtest="not same indexes"): result = sdc_func_2(*params) result_ref = test_impl_2(*params) self.assertEqual(result, result_ref) + self.assertEqual(result, False) def test_range_index_getitem_by_mask(self): def test_impl(index, mask): @@ -716,116 +526,158 @@ def test_impl(index, mask): n = 11 np.random.seed(0) mask = np.random.choice([True, False], n) - for index in _generate_range_indexes_fixed(n): + for index in _generate_custom_range_indexes_fixed(n): result = sdc_func(index, mask) result_ref = test_impl(index, mask) - # FIXME: replace with pd.testing.assert_index_equal when Int64Index is supported - np.testing.assert_array_equal(result, result_ref.values) + pd.testing.assert_index_equal(result, result_ref) - def test_range_index_support_reindexing(self): - from sdc.datatypes.common_functions import sdc_reindex_series - - def pyfunc(data, index, name, by_index): - S = pd.Series(data, index, name=name) - return S.reindex(by_index) - - @self.jit - def sdc_func(data, index, 
name, by_index): - return sdc_reindex_series(data, index, name, by_index) + def test_range_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) - n = 100 + n, k = 11, 7 np.random.seed(0) - mask = np.random.choice([True, False], n) - name = 'asdf' - index1 = pd.RangeIndex(n) - index2 = index1[::-1] - result = sdc_func(mask, index1, name, index2) - result_ref = pyfunc(mask, index1, name, index2) - pd.testing.assert_series_equal(result, result_ref) + idx = np.random.choice(np.arange(n), k) + for index in _generate_custom_range_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) - def test_range_index_support_join(self): - from sdc.datatypes.common_functions import sdc_join_series_indexes + def test_range_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) - def pyfunc(index1, index2): - return index1.join(index2, how='outer', return_indexers=True) + n = 11 + index = pd.RangeIndex(n) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) - @self.jit - def sdc_func(index1, index2): - return sdc_join_series_indexes(index1, index2) + def test_range_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) - index1 = pd.RangeIndex(1, 21, 3, name='asv') - index2 = pd.RangeIndex(19, -1, -3, name='df') - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - results_names = ['result index', 'left indexer', 'right indexer'] - for i, name in enumerate(results_names): - result_elem = result[i] - result_ref_elem = result_ref[i].values if not i else result_ref[i] - np.testing.assert_array_equal(result_elem, result_ref_elem, f"Mismatch in {name}") + n = 11 + self_indexes = list(chain( + _generate_custom_range_indexes_fixed(n), + 
_generate_custom_range_indexes_fixed(2 * n) + )) + + all_range_indexes = list(_generate_range_indexes_fixed(n)) + other_indexes = chain( + all_range_indexes, + map(lambda x: pd.Int64Index(x), all_range_indexes), + ) - def test_range_index_support_take(self): - from sdc.datatypes.common_functions import _sdc_take + for index1, index2 in product(self_indexes, other_indexes): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) - def pyfunc(index1, indexes): - return index1.values.take(indexes) + def test_range_index_reindex_equal_indexes(self): - @self.jit - def sdc_func(index1, indexes): - return _sdc_take(index1, indexes) + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) - n, k = 1000, 200 + n = 20 np.random.seed(0) - index = pd.RangeIndex(stop=3 * n, step=3, name='asd') - indexes = np.random.choice(np.arange(n), n)[:k] - result = sdc_func(index, indexes) - result_ref = pyfunc(index, indexes) - np.testing.assert_array_equal(result, result_ref) + index1 = pd.RangeIndex(-1, n, 1) + index2 = index1.copy(deep=True) - def test_range_index_support_astype(self): - from sdc.functions.numpy_like import astype + result = sdc_func(index1, index2) + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) - def pyfunc(index): - return index.values.astype(np.int64) + def test_range_index_reindex(self): - @self.jit - def sdc_func(index): - return astype(index, np.int64) + def test_impl(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_impl) - index = pd.RangeIndex(stop=11, name='asd') - np.testing.assert_array_equal(sdc_func(index), pyfunc(index)) + n = 20 + np.random.seed(0) + index1 = pd.RangeIndex(-1, n, 1) + reindex_by = [ + pd.RangeIndex(0, n + 2, 2), + 
pd.Int64Index(np.random.choice(index1.values, n, replace=False)), + pd.Int64Index(np.random.choice([0, 1, 11, 12, 100], n)) + ] - def test_range_index_support_array_equal(self): - from sdc.functions.numpy_like import array_equal + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) - def pyfunc(index1, index2): - return np.array_equal(index1.values, index2.values) + def test_range_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) - @self.jit - def sdc_func(index1, index2): - return array_equal(index1, index2) + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_range_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + pd.testing.assert_index_equal(result, result_ref) - for params1, params2 in product(_generate_valid_range_params(), repeat=2): - for name1, name2 in product(test_global_index_names, repeat=2): - index1 = pd.RangeIndex(*params1, name=name1) - index2 = pd.RangeIndex(*params2, name=name2) - with self.subTest(index1=index1, index2=index2): - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - self.assertEqual(result, result_ref) + def test_range_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) - def test_range_index_support_copy(self): - from sdc.functions.numpy_like import copy + n = 11 + other_indexes = [ + get_sample_index(n, PositionalIndexType), + get_sample_index(n, RangeIndexType), + 
get_sample_index(n, Int64IndexType), + ] + for index, other in product( + _generate_range_indexes_fixed(n), + other_indexes + ): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) - @self.jit - def sdc_func(index): - return copy(index) + def test_range_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) + sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): - for name in test_global_index_names: - index = pd.RangeIndex(*params, name=name) - with self.subTest(index=index): - result = sdc_func(index) - pd.testing.assert_index_equal(result, index) + n = 11 + other_indexes = [ + get_sample_index(2 * n, PositionalIndexType), + get_sample_index(2 * n, RangeIndexType), + get_sample_index(2 * n, Int64IndexType), + ] + for index, other in product( + _generate_range_indexes_fixed(n), + other_indexes + ): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) if __name__ == "__main__": diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index c245f0694..23ec45639 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -39,7 +39,7 @@ from pandas.core.indexing import IndexingError import sdc -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation from sdc.tests.gen_test_data import ParquetGenerator from sdc.tests.test_base import TestCase from sdc.tests.test_utils import (check_numba_version, @@ -362,6 +362,30 @@ def test_impl(n): 
self.assertEqual(count_parfor_REPs(), 0) self.assertEqual(count_parfor_OneDs(), 1) + @unittest.skip("Works, but compile time needs debug") + def test_column_getitem_repeats(self): + def test_impl(a, b, c): + df = pd.DataFrame({ + 'A': a, + 'B': b, + 'C': c, + }) + + A = df['A'] + B = df['B'] + C = df['C'] + return A[0] + B[0] + C[0] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + a = np.ones(n) + b = np.random.ranf(n) + c = np.random.randint(-100, 100, n) + result = sdc_func(a, b, c) + result_ref = pd.Series(test_impl(a, b, c)) + pd.testing.assert_series_equal(result, result_ref) + @skip_numba_jit def test_column_list_getitem1(self): def test_impl(df): @@ -1840,7 +1864,7 @@ def test_df_drop_one_column(self): """ Verifies df.drop handles string literal as columns param """ def test_impl(): df = pd.DataFrame({ - 'A': [1.0, 2.0, np.nan, 1.0], + 'A': np.array([1.0, 2.0, np.nan, 1.0]), 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0] }) diff --git a/sdc/tests/test_date.py b/sdc/tests/test_date.py index 83671cee4..83d001349 100644 --- a/sdc/tests/test_date.py +++ b/sdc/tests/test_date.py @@ -81,7 +81,9 @@ def test_impl(A): hpat_func = self.jit(test_impl) df = self._gen_str_date_df() A = pd.DatetimeIndex(df['str_date']).to_series() - np.testing.assert_array_equal(hpat_func(A), test_impl(A)) + result = hpat_func(A) + result_ref = test_impl(A) + np.testing.assert_array_equal(result, result_ref) @skip_numba_jit def test_datetime_getitem(self): diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 3c5db9c1f..8314706ed 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -33,7 +33,7 @@ import sdc import string import unittest -from itertools import combinations, combinations_with_replacement, product +from itertools import combinations, combinations_with_replacement, islice, permutations, product import numba from numba import types from numba.core.config import IS_32BITS @@ -62,7 +62,7 @@ gen_frand_array, gen_strlist, 
_make_func_from_text) -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation _cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [ @@ -323,24 +323,7 @@ def test_impl(n): n = 11 pd.testing.assert_series_equal(hpat_func(n), test_impl(n)) - def test_create_series_index1(self): - # create and box an indexed Series - def test_impl(): - A = pd.Series([1, 2, 3], ['A', 'C', 'B']) - return A - hpat_func = self.jit(test_impl) - - pd.testing.assert_series_equal(hpat_func(), test_impl()) - - def test_create_series_index2(self): - def test_impl(): - A = pd.Series([1, 2, 3], index=[2, 1, 0]) - return A - hpat_func = self.jit(test_impl) - - pd.testing.assert_series_equal(hpat_func(), test_impl()) - - def test_create_series_index3(self): + def test_create_series_param_name_literal(self): def test_impl(): A = pd.Series([1, 2, 3], index=['A', 'C', 'B'], name='A') return A @@ -348,7 +331,7 @@ def test_impl(): pd.testing.assert_series_equal(hpat_func(), test_impl()) - def test_create_series_index4(self): + def test_create_series_param_name(self): def test_impl(name): A = pd.Series([1, 2, 3], index=['A', 'C', 'B'], name=name) return A @@ -376,7 +359,7 @@ def test_impl(A): S = pd.Series(['a', 'b', 'c'], name='A') self.assertEqual(hpat_func(S), test_impl(S)) - def test_pass_series_index1(self): + def test_pass_series_all_indexes(self): def test_impl(A): return A hpat_func = self.jit(test_impl) @@ -387,6 +370,7 @@ def test_impl(A): list(np.arange(n)), np.arange(n), pd.RangeIndex(n), + pd.Int64Index(np.arange(n)), gen_strlist(n) ] for index in indexes_to_test: @@ -2206,13 +2190,15 @@ def test_series_value_counts_index(self): def test_impl(S): return S.value_counts() - hpat_func = self.jit(test_impl) + sdc_func = self.jit(test_impl) for data in test_global_input_data_integer64: + index = np.arange(start=1, stop=len(data) + 1) with self.subTest(series_data=data): - index = np.arange(start=1, stop=len(data) + 1) S = 
pd.Series(data, index=index) - pd.testing.assert_series_equal(hpat_func(S).sort_index(), test_impl(S).sort_index()) + result = sdc_func(S) + result_ref = test_impl(S) + pd.testing.assert_series_equal(result.sort_index(), result_ref.sort_index()) def test_series_value_counts_no_unboxing(self): def test_impl(): @@ -4796,7 +4782,36 @@ def test_series_cov_impl(s1, s2, min_periods=None): msg = 'Method cov(). The object min_periods' self.assertIn(msg, str(raises.exception)) - @skip_numba_jit + def test_series_div_special(self): + @self.jit + def test_func(S1, S2): + return S1.div(S2) + # return S1 + S2 + + S1 = pd.Series( + np.arange(12), + index=pd.RangeIndex(start=0, stop=12, step=1) + ) + S2 = pd.Series( + # [1.1, 0.3, np.nan, 1., np.inf, 0., 1.1, np.nan, 2.2, np.inf, 2., 2.], + np.arange(12), + index=pd.RangeIndex(start=0, stop=12, step=1) + ) + + res = test_func(S1, S2) + + def test_series_get_index(self): + @self.jit + def test_func(S1): + return S1._index.values + + S1 = pd.Series( + np.arange(12), + index=pd.RangeIndex(start=0, stop=12, step=1) + ) + + res = test_func(S1) + def test_series_pct_change(self): def test_series_pct_change_impl(S, periods, method): return S.pct_change(periods=periods, fill_method=method, limit=None, freq=None) @@ -4814,7 +4829,13 @@ def test_series_pct_change_impl(S, periods, method): for input_data in test_input_data: S = pd.Series(input_data) for periods in [0, 1, 2, 5, 10, -1, -2, -5]: - for method in [None, 'pad', 'ffill', 'backfill', 'bfill']: + for method in [ + None, + 'pad', + 'ffill', + 'backfill', + 'bfill' + ]: result_ref = test_series_pct_change_impl(S, periods, method) result = hpat_func(S, periods, method) pd.testing.assert_series_equal(result, result_ref) @@ -5009,7 +5030,7 @@ def test_impl(S, idx, value): 'not a Boolean or integer indexer or a Slice. 
Given: self.index={}, idx={}' with self.assertRaises(TypingError) as raises: hpat_func(S, idx, value) - msg = msg_tmpl.format('none', 'unicode_type') + msg = msg_tmpl.format('PositionalIndexType(False)', 'unicode_type') self.assertIn(msg, str(raises.exception)) def test_series_istitle_str(self): diff --git a/sdc/tests/test_series_ops.py b/sdc/tests/test_series_ops.py index cbf9782b2..5dbda0d42 100644 --- a/sdc/tests/test_series_ops.py +++ b/sdc/tests/test_series_ops.py @@ -1157,6 +1157,22 @@ def test_impl(a, b, value): result_ref = test_impl(S1, scalar, fill_value) pd.testing.assert_series_equal(result, result_ref) + @skip_numba_jit("Expected to fail due to type-stability of index operations") + def test_series_operator_add_index_type_check(self): + def test_impl(S1, S2): + return S1 + S2 + hpat_func = self.jit(test_impl) + + n = 11 + series_data = np.arange(n, dtype=np.float64) + index_data = pd.RangeIndex(n, 0, -1) + S1 = pd.Series(series_data, index_data) + S2 = pd.Series(2 * series_data + 1, index_data) + result = hpat_func(S1, S2) + result_ref = test_impl(S1, S2) + pd.testing.assert_series_equal(result, result_ref) + pd.testing.assert_index_equal(result.index, result_ref.index, exact=True) + if __name__ == "__main__": unittest.main() diff --git a/sdc/utilities/sdc_typing_utils.py b/sdc/utilities/sdc_typing_utils.py index 81bc81c31..b1b6a8ebf 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -23,7 +23,6 @@ # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** - """ | This file contains SDC utility functions related to typing compilation phase @@ -39,8 +38,34 @@ from numba.np import numpy_support from sdc.str_arr_type import string_array_type -from sdc.datatypes.range_index_type import RangeIndexType - +from sdc.datatypes.indexes import * +from sdc.str_arr_ext import StringArrayType + + +sdc_old_index_types = (types.Array, StringArrayType, ) +sdc_pandas_index_types = ( + EmptyIndexType, + PositionalIndexType, + RangeIndexType, + Int64IndexType, + ) + sdc_old_index_types + +sdc_indexes_range_like = ( + PositionalIndexType, + RangeIndexType, + ) + +# TO-DO: support caching of data allocated for range indexes at request for .values +sdc_indexes_wo_values_cache = ( + EmptyIndexType, + PositionalIndexType, + RangeIndexType, + ) + +sdc_pandas_df_column_types = ( + types.Array, + StringArrayType, + ) class TypeChecker: """ @@ -91,6 +116,11 @@ def check(self, data, accepted_type, name=''): self.raise_exc(data, accepted_type.__name__, name=name) +class SDCLimitation(Exception): + """Exception to be raised in case of SDC limitation""" + pass + + def kwsparams2list(params): """Convert parameters dict to a list of string of a format 'key=value'""" return ['{}={}'.format(k, v) for k, v in params.items()] @@ -138,7 +168,7 @@ def check_is_numeric_array(type_var): def check_index_is_numeric(ty_series): """Used during typing to check that series has numeric index""" - return check_is_numeric_array(ty_series.index) + return isinstance(ty_series.index.dtype, types.Number) def check_types_comparable(ty_left, ty_right): @@ -182,20 +212,21 @@ def find_common_dtype_from_numpy_dtypes(array_types, scalar_types): return numba_common_dtype -def find_index_common_dtype(self, other): +def find_index_common_dtype(left, right): """Used to find common dtype for indexes of two series and verify if index dtypes are equal""" - self_index_dtype = RangeIndexType.dtype if 
isinstance(self.index, types.NoneType) else self.index.dtype - other_index_dtype = RangeIndexType.dtype if isinstance(other.index, types.NoneType) else other.index.dtype - index_dtypes_match = self_index_dtype == other_index_dtype + left_index_dtype = left.dtype + right_index_dtype = right.dtype + index_dtypes_match = left_index_dtype == right_index_dtype if not index_dtypes_match: numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( - [self_index_dtype, other_index_dtype], []) + [left_index_dtype, right_index_dtype], []) else: - numba_index_common_dtype = self_index_dtype + numba_index_common_dtype = left_index_dtype return index_dtypes_match, numba_index_common_dtype + def gen_impl_generator(codegen, impl_name): """Generate generator of an implementation""" def _df_impl_generator(*args, **kwargs): @@ -208,3 +239,15 @@ def _df_impl_generator(*args, **kwargs): return _impl return _df_impl_generator + + +def check_signed_integer(ty): + return isinstance(ty, types.Integer) and ty.signed + + +def _check_dtype_param_type(dtype): + """ Returns True is dtype is a valid type for dtype parameter and False otherwise. + Used in RangeIndex ctor and other methods that take dtype parameter. 
""" + + valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) + return isinstance(dtype, valid_dtype_types) or dtype is None From 013b18023bffa22d9df6aa9d2d278e4b5a37076e Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 19 Jan 2021 03:09:56 +0300 Subject: [PATCH 2/5] Resolving minor issues and fixing tests --- sdc/extensions/indexes/int64_index_ext.py | 7 ++--- sdc/extensions/indexes/range_index_ext.py | 4 +++ sdc/tests/indexes/test_int64_index.py | 31 +++++++++++++++++++++++ sdc/tests/test_date.py | 7 ++--- 4 files changed, 43 insertions(+), 6 deletions(-) diff --git a/sdc/extensions/indexes/int64_index_ext.py b/sdc/extensions/indexes/int64_index_ext.py index 425b57a1f..d450f7c4c 100644 --- a/sdc/extensions/indexes/int64_index_ext.py +++ b/sdc/extensions/indexes/int64_index_ext.py @@ -292,9 +292,6 @@ def pd_int64_index_copy_overload(self, name=None, deep=False, dtype=None): name_is_none = isinstance(name, (types.NoneType, types.Omitted)) or name is None keep_name = name_is_none and self.is_named - # FIXME: deep=True/False is not handled at all - and has to be supported! - # Support for other indexes too! - # FIXME: add tests for all index types on copy_param_deep def pd_int64_index_copy_impl(self, name=None, deep=False, dtype=None): _name = self._name if keep_name == True else name # noqa @@ -418,6 +415,10 @@ def pd_int64_index_ravel_overload(self, order='C'): raise TypingError('{} Unsupported parameters. 
Given order: {}'.format(_func_name, order)) def pd_int64_index_ravel_impl(self, order='C'): + # np.ravel argument order is not supported in Numba + if order != 'C': + raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") + return self.values return pd_int64_index_ravel_impl diff --git a/sdc/extensions/indexes/range_index_ext.py b/sdc/extensions/indexes/range_index_ext.py index 9a1718801..7bb1b1c45 100644 --- a/sdc/extensions/indexes/range_index_ext.py +++ b/sdc/extensions/indexes/range_index_ext.py @@ -482,6 +482,10 @@ def pd_range_index_ravel_overload(self, order='C'): raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order)) def pd_range_index_ravel_impl(self, order='C'): + # np.ravel argument order is not supported in Numba + if order != 'C': + raise ValueError(f"Unsupported value for argument 'order' (only default 'C' is supported)") + return self.values return pd_range_index_ravel_impl diff --git a/sdc/tests/indexes/test_int64_index.py b/sdc/tests/indexes/test_int64_index.py index 7fa52fd17..2750049f7 100644 --- a/sdc/tests/indexes/test_int64_index.py +++ b/sdc/tests/indexes/test_int64_index.py @@ -206,6 +206,37 @@ def test_impl(index, value): result_ref = test_impl(index, value) np.testing.assert_array_equal(result, result_ref) + def test_int64_index_copy(self): + def test_impl(index, new_name): + return index.copy(name=new_name) + sdc_func = self.jit(test_impl) + + for data in _generate_valid_int64_index_data(): + for name, new_name in product(test_global_index_names, repeat=2): + index = pd.Int64Index(data, name=name) + with self.subTest(index=index, new_name=new_name): + result = sdc_func(index, new_name) + result_ref = test_impl(index, new_name) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_copy_param_deep(self): + def test_impl(index, deep): + return index.copy(deep=deep) + sdc_func = self.jit(test_impl) + + index = pd.Int64Index([1, 11, 2]) + for deep in 
[True, False]: + with self.subTest(deep=deep): + result = sdc_func(index, deep) + result_ref = test_impl(index, deep) + pd.testing.assert_index_equal(result, result_ref) + # pandas uses ndarray views when copies index, so for python + # case check that data arrays share the same memory + self.assertEqual( + result._data is index._data, + result_ref._data.base is index._data + ) + def test_int64_index_getitem_scalar(self): def test_impl(index, idx): return index[idx] diff --git a/sdc/tests/test_date.py b/sdc/tests/test_date.py index 83d001349..99509b74b 100644 --- a/sdc/tests/test_date.py +++ b/sdc/tests/test_date.py @@ -74,16 +74,17 @@ def test_impl(df): df = self._gen_str_date_df() np.testing.assert_array_equal(hpat_func(df), test_impl(df)) + @skip_numba_jit("DatetimeIndex unboxing not supported") def test_datetime_arg(self): def test_impl(A): return A - hpat_func = self.jit(test_impl) + sdc_func = self.jit(test_impl) df = self._gen_str_date_df() A = pd.DatetimeIndex(df['str_date']).to_series() - result = hpat_func(A) + result = sdc_func(A) result_ref = test_impl(A) - np.testing.assert_array_equal(result, result_ref) + pd.testing.assert_series_equal(result, result_ref) @skip_numba_jit def test_datetime_getitem(self): From 2dcdcf86ecddfea7c1d76cafa9c6b3fa4da42d7e Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Wed, 20 Jan 2021 02:34:26 +0300 Subject: [PATCH 3/5] Fixing types.None index in read_csv --- sdc/datatypes/indexes/empty_index_type.py | 4 ++++ sdc/datatypes/indexes/positional_index_type.py | 4 ++++ sdc/io/csv_ext.py | 5 ++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/sdc/datatypes/indexes/empty_index_type.py b/sdc/datatypes/indexes/empty_index_type.py index 68cebeb32..76eda45a2 100644 --- a/sdc/datatypes/indexes/empty_index_type.py +++ b/sdc/datatypes/indexes/empty_index_type.py @@ -56,4 +56,8 @@ def __init__(self, dmm, fe_type): models.StructModel.__init__(self, dmm, fe_type, members) +# FIXME_Numba#3372: add into numba.types to 
allow returning from objmode +types.EmptyIndexType = EmptyIndexType + + make_attribute_wrapper(EmptyIndexType, 'name', '_name') diff --git a/sdc/datatypes/indexes/positional_index_type.py b/sdc/datatypes/indexes/positional_index_type.py index 3896be5f9..f7f245e33 100644 --- a/sdc/datatypes/indexes/positional_index_type.py +++ b/sdc/datatypes/indexes/positional_index_type.py @@ -59,4 +59,8 @@ def __init__(self, dmm, fe_type): models.StructModel.__init__(self, dmm, fe_type, members) +# FIXME_Numba#3372: add into numba.types to allow returning from objmode +types.PositionalIndexType = PositionalIndexType + + make_attribute_wrapper(PositionalIndexType, 'data', '_data') diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py index 78aaafeb6..e772d8b4c 100644 --- a/sdc/io/csv_ext.py +++ b/sdc/io/csv_ext.py @@ -61,6 +61,8 @@ import pyarrow import pyarrow.csv +from sdc.datatypes.indexes.empty_index_type import EmptyIndexType +from sdc.datatypes.indexes.positional_index_type import PositionalIndexType class CsvReader(ir.Stmt): @@ -524,9 +526,10 @@ def _gen_pandas_read_csv_func_text(col_names, col_typs, py_col_dtypes, usecols, return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names column_loc, _, _ = get_structure_maps(col_typs, return_columns) + index_type = PositionalIndexType(False) if col_typs else EmptyIndexType(False) df_type = DataFrameType( tuple(col_typs), - types.none, + index_type, tuple(col_names), column_loc=column_loc ) From c09be09c9128a977b73ed4f09d7ece37907b9ee6 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Mon, 1 Feb 2021 22:21:50 +0300 Subject: [PATCH 4/5] Fixing PEP remarks --- sdc/datatypes/common_functions.py | 2 +- .../hpat_pandas_dataframe_functions.py | 5 ++- sdc/datatypes/hpat_pandas_series_functions.py | 23 +++++----- .../indexes/positional_index_type.py | 1 + sdc/extensions/indexes/empty_index_ext.py | 2 +- sdc/extensions/indexes/indexes_generic.py | 4 +- sdc/extensions/indexes/int64_index_ext.py | 10 +++-- 
.../indexes/positional_index_ext.py | 11 ++--- sdc/extensions/indexes/range_index_ext.py | 4 +- sdc/functions/numpy_like.py | 13 +++--- sdc/rewrites/dataframe_constructor.py | 5 ++- sdc/sdc_autogenerated.py | 42 ++++++++++++------- sdc/sdc_function_templates.py | 6 ++- sdc/tests/indexes/index_datagens.py | 2 +- sdc/tests/indexes/test_indexes.py | 1 + sdc/tests/indexes/test_int64_index.py | 10 ++--- sdc/tests/indexes/test_positional_index.py | 10 ++--- sdc/tests/indexes/test_range_index.py | 10 ++--- sdc/tests/test_series.py | 8 +--- 19 files changed, 93 insertions(+), 76 deletions(-) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index 8dbd6701a..47e435c8a 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -88,7 +88,7 @@ def hpat_arrays_append_overload(A, B): if isinstance(B, valid_num_single_B_dtype): convert_B = not isinstance(B, types.Array) def _append_single_numeric_impl(A, B): - _B = B if convert_B == False else B.values + _B = B if convert_B == False else B.values # noqa return numpy.concatenate((A, _B,)) return _append_single_numeric_impl diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index b955839a3..24d211aa5 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -2161,7 +2161,10 @@ def _df_getitem_list_bool_iloc_impl(self, idx): result_1 = pandas.Series(data_1[numpy.array(idx)]) data_2 = self._dataframe._data[1][0] result_2 = pandas.Series(data_2[numpy.array(idx)]) - return pandas.DataFrame(data={"A": result_0, "B": result_1, "C": result_2}, index=self._dataframe.index[numpy.array(idx)]) + return pandas.DataFrame(data={"A": result_0, + "B": result_1, + "C": result_2}, + index=self._dataframe.index[numpy.array(idx)]) raise IndexingError('Item wrong length') """ func_lines = ['def _df_getitem_list_bool_iloc_impl(self, idx):'] diff --git 
a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 0738fe9c3..6c0b668e4 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -424,7 +424,7 @@ def hpat_pandas_series_getitem_idx_list_impl(self, idx): if (isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean)): positional_indexes = (isinstance(self.index, PositionalIndexType) - and isinstance(idx.index, PositionalIndexType)) + and isinstance(idx.index, PositionalIndexType)) if not check_types_comparable(self.index, idx.index): msg = '{} The index of boolean indexer is not comparable to Series index.' + \ ' Given: self.index={}, idx.index={}' @@ -616,7 +616,7 @@ def sdc_pandas_series_setitem(self, idx, value): and isinstance(idx.dtype, (types.Number, types.Boolean))) assign_via_idx_mask = idx_is_scalar and idx_and_self_index_comparable assign_via_idx_values = (self_index_is_positional and idx_index_is_positional - or idx_is_numeric_or_boolean_series and not idx_and_self_index_comparable) + or idx_is_numeric_or_boolean_series and not idx_and_self_index_comparable) def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value): @@ -1904,10 +1904,10 @@ def hpat_pandas_series_astype_no_modify_impl(self, dtype, copy=True, errors='rai if isinstance(dtype, types.StringLiteral) and errors == 'raise': try: literal_value = numpy.dtype(dtype.literal_value) - except: - pass # Will raise the exception later + except TypeError: + pass # Will raise the exception later else: - raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype.literal_value}') + raise TypingError(f'Needs Numba astype impl support converting unicode_type to {literal_value}') else: return hpat_pandas_series_astype_no_modify_impl @@ -2162,14 +2162,14 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr def hpat_pandas_series_append_impl(self, to_append, ignore_index=False, 
verify_integrity=False): if to_append_is_series == True: # noqa new_data = common_functions.hpat_arrays_append(self._data, to_append._data) - _self_index = self._index.values if index_api_supported == True else self._index + _self_index = self._index.values if index_api_supported == True else self._index # noqa new_index = common_functions.hpat_arrays_append(_self_index, to_append._index) else: data_arrays_to_append = [series._data for series in to_append] index_arrays_to_append = [series._index for series in to_append] new_data = common_functions.hpat_arrays_append(self._data, data_arrays_to_append) - _self_index = self._index.values if index_api_supported == True else self._index + _self_index = self._index.values if index_api_supported == True else self._index # noqa new_index = common_functions.hpat_arrays_append(_self_index, index_arrays_to_append) return pandas.Series(new_data, new_index) @@ -2229,6 +2229,7 @@ def hpat_pandas_series_copy(self, deep=True): ty_checker.raise_exc(deep, 'boolean', 'deep') index_api_supported = not isinstance(self.index, sdc_old_index_types) + def hpat_pandas_series_copy_impl(self, deep=True): new_series_data = numpy_like.copy(self._data) if deep else self._data @@ -4551,11 +4552,11 @@ def _series_operator_add_none_indexes_impl(self, other): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - or isinstance(other.index, sdc_old_index_types)) + or isinstance(other.index, sdc_old_index_types)) def _series_operator_add_str_impl(self, other): left_index, right_index = self._index, other._index - if index_api_supported == True: + if index_api_supported == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -4669,11 +4670,11 @@ def _series_operator_mul_none_indexes_impl(self, other): return _series_operator_mul_none_indexes_impl else: index_api_supported = not (isinstance(self.index, 
sdc_old_index_types) - or isinstance(other.index, sdc_old_index_types)) + or isinstance(other.index, sdc_old_index_types)) def _series_operator_mul_common_impl(self, other): left_index, right_index = self._index, other._index - if index_api_supported == True: + if index_api_supported == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) diff --git a/sdc/datatypes/indexes/positional_index_type.py b/sdc/datatypes/indexes/positional_index_type.py index f7f245e33..2cbf1ae77 100644 --- a/sdc/datatypes/indexes/positional_index_type.py +++ b/sdc/datatypes/indexes/positional_index_type.py @@ -34,6 +34,7 @@ from sdc.datatypes.indexes import RangeIndexType + class PositionalIndexType(types.IterableType): dtype = types.int64 diff --git a/sdc/extensions/indexes/empty_index_ext.py b/sdc/extensions/indexes/empty_index_ext.py index c67ec3c6d..470331de7 100644 --- a/sdc/extensions/indexes/empty_index_ext.py +++ b/sdc/extensions/indexes/empty_index_ext.py @@ -59,7 +59,7 @@ def codegen(context, builder, sig, args): index_struct.name = name_val if context.enable_nrt and is_named: - context.nrt.incref(builder, sig.args[1], name_val) + context.nrt.incref(builder, sig.args[1], name_val) return index_struct._getvalue() diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py index 23a2b66ed..3462067cc 100644 --- a/sdc/extensions/indexes/indexes_generic.py +++ b/sdc/extensions/indexes/indexes_generic.py @@ -119,6 +119,7 @@ def sdc_indexes_reindex(self, target): def pd_indexes_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None): index_dtype = self.dtype + def pd_indexes_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): """ Simplified version of pandas.core.index.base.reindex """ @@ -168,7 +169,7 @@ def pd_indexes_join_overload(left, right): # for index types with 
dtype=int64 resulting index should be of Int64Index type if (isinstance(left, (PositionalIndexType, RangeIndexType, Int64IndexType)) - and isinstance(right, (PositionalIndexType, RangeIndexType, Int64IndexType))): + and isinstance(right, (PositionalIndexType, RangeIndexType, Int64IndexType))): def _convert_to_arrays_impl(left, right): @@ -187,6 +188,7 @@ def _convert_to_arrays_impl(left, right): convert_left = isinstance(left, (PositionalIndexType, RangeIndexType, Int64IndexType)) convert_right = isinstance(right, (PositionalIndexType, RangeIndexType, Int64IndexType)) index_dtypes_match, res_index_dtype = find_index_common_dtype(left, right) + def pd_indexes_join_array_indexes_impl(left, right): _left = left.values if convert_left == True else left # noqa diff --git a/sdc/extensions/indexes/int64_index_ext.py b/sdc/extensions/indexes/int64_index_ext.py index d450f7c4c..fe7c8d89e 100644 --- a/sdc/extensions/indexes/int64_index_ext.py +++ b/sdc/extensions/indexes/int64_index_ext.py @@ -456,7 +456,6 @@ def pd_int64_index_reindex_overload(self, target, method=None, level=None, limit raise TypingError('{} Not allowed for non comparable indexes. 
\ Given: self={}, target={}'.format(_func_name, self, target)) - def pd_int64_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance) @@ -485,8 +484,9 @@ def pd_int64_index_take_chunked_impl(self, indexes): return pd_int64_index_take_chunked_impl convert_target = isinstance(indexes, sdc_pandas_index_types) and not isinstance(indexes, types.Array) + def pd_int64_index_take_impl(self, indexes): - _indexes = indexes.values if convert_target == True else indexes + _indexes = indexes.values if convert_target == True else indexes # noqa new_index_data = numpy_like.take(self._data, _indexes) return pd.Int64Index(new_index_data, name=self._name) @@ -511,12 +511,13 @@ def pd_int64_index_append_overload(self, other): convert_other = not isinstance(other, types.Array) _, res_index_dtype = find_index_common_dtype(self, other) return_as_array_index = res_index_dtype is not types.int64 + def pd_int64_index_append_impl(self, other): _other = other.values if convert_other == True else other # noqa new_index_data = hpat_arrays_append(self._data, _other) # this is only needed while some indexes are represented with arrays # TO-DO: support pd.Index() overload with dtype arg to create indexes - if return_as_array_index == False: + if return_as_array_index == False: # noqa return pd.Int64Index(new_index_data) else: return new_index_data @@ -550,9 +551,10 @@ def pd_int64_index_join_overload(self, other, how, level=None, return_indexers=F ty_checker.raise_exc(sort, 'boolean', 'sort') _return_indexers = return_indexers.literal_value + def pd_int64_index_join_impl(self, other, how, level=None, return_indexers=False, sort=False): - if _return_indexers == True: + if _return_indexers == True: # noqa return sdc_indexes_join_outer(self, other) else: joined_index, = sdc_indexes_join_outer(self, other) diff --git a/sdc/extensions/indexes/positional_index_ext.py 
b/sdc/extensions/indexes/positional_index_ext.py index 8bd89b442..2c512d4fe 100644 --- a/sdc/extensions/indexes/positional_index_ext.py +++ b/sdc/extensions/indexes/positional_index_ext.py @@ -58,6 +58,7 @@ def init_positional_index(typingctx, size, name=None): ret_typ = PositionalIndexType(is_named) inner_sig = signature(ret_typ.data, size, name) + def codegen(context, builder, sig, args): data_val, name_val = args @@ -140,6 +141,7 @@ def pd_positional_index_name_overload(self): return None is_named_index = self.is_named + def pd_positional_index_name_impl(self): _self = self._data if is_named_index == True: # noqa @@ -160,6 +162,7 @@ def pd_positional_index_dtype_impl(self): return pd_positional_index_dtype_impl + @sdc_overload_attribute(PositionalIndexType, 'values') def pd_positional_index_values_overload(self): if not isinstance(self, PositionalIndexType): @@ -171,6 +174,7 @@ def pd_positional_index_values_impl(self): return pd_positional_index_values_impl + @sdc_overload(len) def pd_positional_index_len_overload(self): if not isinstance(self, PositionalIndexType): @@ -213,6 +217,7 @@ def pd_positional_index_copy_overload(self, name=None, deep=False, dtype=None): name_is_none = isinstance(name, (types.NoneType, types.Omitted)) or name is None keep_name = name_is_none and self.is_named + def pd_positional_index_copy_impl(self, name=None, deep=False, dtype=None): _name = self.name if keep_name == True else name # noqa @@ -311,9 +316,6 @@ def pd_positional_index_getiter(context, builder, sig, args): return impl_ret_untracked(context, builder, PositionalIndexType, res) - - - @sdc_overload_method(PositionalIndexType, 'ravel') def pd_positional_index_ravel_overload(self, order='C'): if not isinstance(self, PositionalIndexType): @@ -365,7 +367,6 @@ def pd_positional_index_reindex_overload(self, target, method=None, level=None, raise TypingError('{} Not allowed for non comparable indexes. 
\ Given: self={}, target={}'.format(_func_name, self, target)) - def pd_positional_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance) @@ -466,7 +467,7 @@ def pd_indexes_join_positional_impl(self, other, how, level=None, return_indexer else: def pd_positional_index_join_common_impl(self, other, how, level=None, return_indexers=False, sort=False): - if _return_indexers == True: + if _return_indexers == True: # noqa return sdc_indexes_join_outer(self, other) else: return sdc_indexes_join_outer(self, other)[0] diff --git a/sdc/extensions/indexes/range_index_ext.py b/sdc/extensions/indexes/range_index_ext.py index 7bb1b1c45..0cde89438 100644 --- a/sdc/extensions/indexes/range_index_ext.py +++ b/sdc/extensions/indexes/range_index_ext.py @@ -538,7 +538,6 @@ def pd_range_index_reindex_overload(self, target, method=None, level=None, limit raise TypingError('{} Not allowed for non comparable indexes. 
\ Given: self={}, target={}'.format(_func_name, self, target)) - def pd_range_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance) @@ -612,8 +611,9 @@ def pd_range_index_join_overload(self, other, how, level=None, return_indexers=F ty_checker.raise_exc(sort, 'boolean', 'sort') _return_indexers = return_indexers.literal_value + def pd_range_index_join_impl(self, other, how, level=None, return_indexers=False, sort=False): - if _return_indexers == True: + if _return_indexers == True: # noqa return sdc_indexes_join_outer(self, other) else: joined_index, = sdc_indexes_join_outer(self, other) diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index f7441e476..1978a978f 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -1276,7 +1276,6 @@ def take(data, indices): pass - @sdc_overload(take) def sdc_take_overload(data, indices): @@ -1304,8 +1303,8 @@ def sdc_take_array_indices_seq_impl(data, indices): res_arr = numpy.empty(res_size, dtype=data_dtype) for i in numba.prange(len(indices)): start = 0 - for l in range(len(indices[0:i])): - start += len(indices[l]) + for list_elem in range(len(indices[0:i])): + start += len(indices[list_elem]) current_pos = start for j in range(len(indices[i])): res_arr[current_pos] = data[indices[i][j]] @@ -1323,8 +1322,8 @@ def sdc_take_str_arr_indices_seq_impl(data, indices): num_total_bytes = 0 for i in numba.prange(len(indices)): start = 0 - for l in range(len(indices[0:i])): - start += len(indices[l]) + for list_elem in range(len(indices[0:i])): + start += len(indices[list_elem]) current_pos = start for j in range(len(indices[i])): num_total_bytes += get_utf8_size(data[indices[i][j]]) @@ -1334,8 +1333,8 @@ def sdc_take_str_arr_indices_seq_impl(data, indices): res_arr = pre_alloc_string_array(res_size, num_total_bytes) for i in numba.prange(len(indices)): start = 0 - 
for l in range(len(indices[0:i])): - start += len(indices[l]) + for list_elem in range(len(indices[0:i])): + start += len(indices[list_elem]) current_pos = start for j in range(len(indices[i])): res_arr[current_pos] = data[indices[i][j]] diff --git a/sdc/rewrites/dataframe_constructor.py b/sdc/rewrites/dataframe_constructor.py index 473b4efed..cb3051684 100644 --- a/sdc/rewrites/dataframe_constructor.py +++ b/sdc/rewrites/dataframe_constructor.py @@ -256,7 +256,10 @@ def codegen(context, builder, sig, args): first_col_data = context.get_dummy_value() else: first_col_data = data_arrs_transformed[0] - index = context.compile_internal(builder, lambda a, d: fix_df_index(a, d), fixed_index_sig, [index, first_col_data]) + index = context.compile_internal(builder, + lambda a, d: fix_df_index(a, d), + fixed_index_sig, + [index, first_col_data]) dataframe.data = data_tup dataframe.index = index diff --git a/sdc/sdc_autogenerated.py b/sdc/sdc_autogenerated.py index 83dcb220e..f701cf5fb 100644 --- a/sdc/sdc_autogenerated.py +++ b/sdc/sdc_autogenerated.py @@ -80,10 +80,11 @@ def sdc_add_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_add_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -219,10 +220,11 @@ def sdc_div_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_div_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', 
return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -358,10 +360,11 @@ def sdc_sub_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_sub_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -497,10 +500,11 @@ def sdc_mul_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_mul_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -636,10 +640,11 @@ def sdc_truediv_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_truediv_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -775,10 +780,11 @@ def sdc_floordiv_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_floordiv_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == 
True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -914,10 +920,11 @@ def sdc_mod_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_mod_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -1053,10 +1060,11 @@ def sdc_pow_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_pow_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -1182,7 +1190,8 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - and isinstance(other.index, sdc_old_index_types)) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): left_index, right_index = self.index, other.index @@ -1310,7 +1319,8 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - and isinstance(other.index, sdc_old_index_types)) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): left_index, right_index = self.index, other.index @@ -1438,7 +1448,8 @@ def _series_lt_scalar_impl(self, other, 
fill_value=None): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - and isinstance(other.index, sdc_old_index_types)) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): left_index, right_index = self.index, other.index @@ -1566,7 +1577,8 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - and isinstance(other.index, sdc_old_index_types)) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): left_index, right_index = self.index, other.index @@ -1694,7 +1706,8 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - and isinstance(other.index, sdc_old_index_types)) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): left_index, right_index = self.index, other.index @@ -1822,7 +1835,8 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - and isinstance(other.index, sdc_old_index_types)) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): left_index, right_index = self.index, other.index diff --git a/sdc/sdc_function_templates.py b/sdc/sdc_function_templates.py index 891f4c9c6..2f58cdeee 100644 --- a/sdc/sdc_function_templates.py +++ b/sdc/sdc_function_templates.py @@ -80,10 +80,11 @@ def sdc_binop_impl(self, other, fill_value=None): use_index_methods = not (isinstance(self.index, sdc_old_index_types) or isinstance(other.index, sdc_old_index_types)) + def sdc_binop_impl(self, other, fill_value=None): left_index, right_index = self._index, other._index - if use_index_methods == True: + if use_index_methods == True: # noqa indexes_join_res = 
left_index.join(right_index, 'outer', return_indexers=True) else: indexes_join_res = sdc_indexes_join_outer(left_index, right_index) @@ -207,7 +208,8 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: index_api_supported = not (isinstance(self.index, sdc_old_index_types) - and isinstance(other.index, sdc_old_index_types)) + and isinstance(other.index, sdc_old_index_types)) + def _series_lt_common_impl(self, other, fill_value=None): left_index, right_index = self.index, other.index diff --git a/sdc/tests/indexes/index_datagens.py b/sdc/tests/indexes/index_datagens.py index 626c86234..244fa52f8 100644 --- a/sdc/tests/indexes/index_datagens.py +++ b/sdc/tests/indexes/index_datagens.py @@ -52,7 +52,7 @@ def valid_params_predicate(range_params): def _generate_positional_range_params(): # for PositionalIndexType represented ranges only - starts, stops, steps = [0,], [1, 2, 10,], [1,] + starts, stops, steps = [0, ], [1, 2, 10, ], [1, ] return product(starts, stops, steps) diff --git a/sdc/tests/indexes/test_indexes.py b/sdc/tests/indexes/test_indexes.py index c23603317..b40dd1008 100644 --- a/sdc/tests/indexes/test_indexes.py +++ b/sdc/tests/indexes/test_indexes.py @@ -274,6 +274,7 @@ def test_indexes_support_numpy_like_take_by(self): """ Verifies numpy_like.take can handle SDC index types as indices """ from sdc.functions import numpy_like + def pyfunc(arr, index): return np.take(arr, index) diff --git a/sdc/tests/indexes/test_int64_index.py b/sdc/tests/indexes/test_int64_index.py index 2750049f7..552e01f7b 100644 --- a/sdc/tests/indexes/test_int64_index.py +++ b/sdc/tests/indexes/test_int64_index.py @@ -456,7 +456,7 @@ def test_func(index1, index2): index2 = pd.Int64Index(np.copy(index1.values)) result = sdc_func(index1, index2) - result_ref = test_func(index1, index2) + result_ref = test_func(index1, index2) pd.testing.assert_index_equal(result[0], result_ref[0]) np.testing.assert_array_equal(result[1], result_ref[1]) @@ -480,7 +480,7 @@ def 
test_impl(index1, index2): for index2 in reindex_by: with self.subTest(index2=index2): result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) + result_ref = test_impl(index1, index2) pd.testing.assert_index_equal(result[0], result_ref[0]) np.testing.assert_array_equal(result[1], result_ref[1]) @@ -546,8 +546,7 @@ def test_impl(index, other): ] for index, other in product( _generate_int64_indexes_fixed(n), - other_indexes - ): + other_indexes): with self.subTest(index=index, other=other): result = sdc_func(index, other) result_ref = test_impl(index, other) @@ -566,8 +565,7 @@ def test_impl(index, other): ] for index, other in product( _generate_int64_indexes_fixed(n), - other_indexes - ): + other_indexes): with self.subTest(index=index, other=other): result = sdc_func(index, other) result_ref = test_impl(index, other) diff --git a/sdc/tests/indexes/test_positional_index.py b/sdc/tests/indexes/test_positional_index.py index bf180aed9..1585969c6 100644 --- a/sdc/tests/indexes/test_positional_index.py +++ b/sdc/tests/indexes/test_positional_index.py @@ -480,7 +480,7 @@ def test_func(index1, index2): index2 = index1.copy(deep=True) result = sdc_func(index1, index2) - result_ref = test_func(index1, index2) + result_ref = test_func(index1, index2) pd.testing.assert_index_equal(result[0], result_ref[0]) np.testing.assert_array_equal(result[1], result_ref[1]) @@ -503,7 +503,7 @@ def test_impl(index1, index2): for index2 in reindex_by: with self.subTest(index2=index2): result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) + result_ref = test_impl(index1, index2) pd.testing.assert_index_equal(result[0], result_ref[0]) np.testing.assert_array_equal(result[1], result_ref[1]) @@ -540,8 +540,7 @@ def test_impl(index, other): ] for index, other in product( _generate_positional_indexes_fixed(n), - other_indexes - ): + other_indexes): with self.subTest(index=index, other=other): result = sdc_func(index, other) result_ref = test_impl(index, 
other) @@ -560,8 +559,7 @@ def test_impl(index, other): ] for index, other in product( _generate_positional_indexes_fixed(n), - other_indexes - ): + other_indexes): with self.subTest(index=index, other=other): result = sdc_func(index, other) result_ref = test_impl(index, other) diff --git a/sdc/tests/indexes/test_range_index.py b/sdc/tests/indexes/test_range_index.py index cbd07e620..1bc7a34c8 100644 --- a/sdc/tests/indexes/test_range_index.py +++ b/sdc/tests/indexes/test_range_index.py @@ -590,7 +590,7 @@ def test_func(index1, index2): index2 = index1.copy(deep=True) result = sdc_func(index1, index2) - result_ref = test_func(index1, index2) + result_ref = test_func(index1, index2) pd.testing.assert_index_equal(result[0], result_ref[0]) np.testing.assert_array_equal(result[1], result_ref[1]) @@ -612,7 +612,7 @@ def test_impl(index1, index2): for index2 in reindex_by: with self.subTest(index2=index2): result = sdc_func(index1, index2) - result_ref = test_impl(index1, index2) + result_ref = test_impl(index1, index2) pd.testing.assert_index_equal(result[0], result_ref[0]) np.testing.assert_array_equal(result[1], result_ref[1]) @@ -649,8 +649,7 @@ def test_impl(index, other): ] for index, other in product( _generate_range_indexes_fixed(n), - other_indexes - ): + other_indexes): with self.subTest(index=index, other=other): result = sdc_func(index, other) result_ref = test_impl(index, other) @@ -669,8 +668,7 @@ def test_impl(index, other): ] for index, other in product( _generate_range_indexes_fixed(n), - other_indexes - ): + other_indexes): with self.subTest(index=index, other=other): result = sdc_func(index, other) result_ref = test_impl(index, other) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 8314706ed..d7143990e 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -4829,13 +4829,7 @@ def test_series_pct_change_impl(S, periods, method): for input_data in test_input_data: S = pd.Series(input_data) for periods in [0, 1, 2, 
5, 10, -1, -2, -5]: - for method in [ - None, - 'pad', - 'ffill', - 'backfill', - 'bfill' - ]: + for method in [None, 'pad', 'ffill', 'backfill', 'bfill']: result_ref = test_series_pct_change_impl(S, periods, method) result = hpat_func(S, periods, method) pd.testing.assert_series_equal(result, result_ref) From 7b880dbc30a83a30c8c62085520ab336c359454d Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Tue, 2 Feb 2021 03:12:32 +0300 Subject: [PATCH 5/5] Revert back change in Series.astype breaking float to str conversion --- sdc/datatypes/hpat_pandas_series_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 6c0b668e4..4d52ec542 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -1914,7 +1914,7 @@ def hpat_pandas_series_astype_no_modify_impl(self, dtype, copy=True, errors='rai data_narr = isinstance(self.data, types.npytypes.Array) dtype_num_liter = isinstance(dtype, (types.functions.NumberClass, types.StringLiteral)) - if data_narr and dtype_num_liter: + if data_narr and dtype_num_liter or str_check: return hpat_pandas_series_astype_numba_impl if errors == 'raise':