diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index 8924a99be..fb4c70109 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -41,14 +41,14 @@ from numba.extending import register_jitable from numba.np import numpy_support from numba.typed import Dict +from numba.typed.typedobjectutils import _nonoptional import sdc +from sdc.datatypes.indexes import * from sdc.hiframes.api import isna from sdc.hiframes.pd_series_type import SeriesType from sdc.functions import numpy_like from sdc.str_arr_type import string_array_type, StringArrayType -from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType from sdc.str_arr_ext import (num_total_chars, append_string_array_to, str_arr_is_na, pre_alloc_string_array, str_arr_set_na, string_array_type, cp_str_list_to_array, create_str_arr_from_list, get_utf8_size, @@ -57,13 +57,11 @@ from sdc.utilities.utils import sdc_overload, sdc_register_jitable from sdc.utilities.sdc_typing_utils import ( find_common_dtype_from_numpy_dtypes, - TypeChecker) -from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types - - -class SDCLimitation(Exception): - """Exception to be raised in case of SDC limitation""" - pass + TypeChecker, + sdc_pandas_index_types, + sdc_pandas_df_column_types, + sdc_old_index_types, + ) def hpat_arrays_append(A, B): @@ -74,22 +72,31 @@ def hpat_arrays_append(A, B): def hpat_arrays_append_overload(A, B): """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A""" - use_A_array = isinstance(A, (RangeIndexType, Int64IndexType)) - use_B_array = isinstance(B, (RangeIndexType, Int64IndexType)) - if isinstance(A, (types.Array, RangeIndexType, Int64IndexType)): - if isinstance(B, (types.Array, RangeIndexType, Int64IndexType)): + if not isinstance(A, sdc_pandas_df_column_types): + return None + + # this function should work with arrays, not indexes, but 
until all indexes support + # common API (e.g. append is not supported for types.Array indexes) it is simpler to support + # indexes here rather than branch depending on index types on call site + # TO-DO: clean-up when Float64Index and StringArrayIndex are supported + # if not (isinstance(B, sdc_pandas_df_column_types) or isinstance(B.dtype, sdc_pandas_df_column_types)): + # return None + valid_num_single_B_dtype = (types.Array, ) + sdc_pandas_index_types + valid_num_seq_B_dtypes = (types.Array, ) + sdc_pandas_index_types + + if isinstance(A, types.Array): + if isinstance(B, valid_num_single_B_dtype): + convert_B = not isinstance(B, types.Array) def _append_single_numeric_impl(A, B): - _A = A.values if use_A_array == True else A # noqa - _B = B.values if use_B_array == True else B # noqa - return numpy.concatenate((_A, _B,)) + _B = B if convert_B == False else B.values # noqa + return numpy.concatenate((A, _B,)) return _append_single_numeric_impl - elif (isinstance(B, (types.UniTuple, types.List)) - and isinstance(B.dtype, (types.Array, RangeIndexType, Int64IndexType))): - B_dtype_is_index = isinstance(B.dtype, (RangeIndexType, Int64IndexType)) + elif (isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, valid_num_seq_B_dtypes)): numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], []) + convert_B = not isinstance(B.dtype, types.Array) # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime def _append_list_numeric_impl(A, B): @@ -97,13 +104,14 @@ def _append_list_numeric_impl(A, B): new_data = numpy.empty(total_length, numba_common_dtype) stop = len(A) - _A = numpy.array(A) if use_A_array == True else A # noqa - new_data[:stop] = _A + new_data[:stop] = A for arr in B: - _arr = arr.values if B_dtype_is_index == True else arr # noqa start = stop - stop = start + len(_arr) - new_data[start:stop] = _arr + stop = start + len(arr) + if convert_B == False: # noqa + new_data[start:stop] = arr 
+ else: + new_data[start:stop] = arr.values return new_data return _append_list_numeric_impl @@ -214,49 +222,14 @@ def _hpat_ensure_array_capacity(new_size, arr): return res -def sdc_join_series_indexes(left, right): +def _sdc_internal_join(left, right): pass -@sdc_overload(sdc_join_series_indexes, jit_options={'parallel': False}) -def sdc_join_series_indexes_overload(left, right): - """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm""" - - # check that both operands are of types used for representing Pandas indexes - if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types) - and not isinstance(left, types.NoneType) - and not isinstance(right, types.NoneType)): - return None - - convert_left = isinstance(left, (RangeIndexType, Int64IndexType)) - convert_right = isinstance(right, (RangeIndexType, Int64IndexType)) - - def _convert_to_arrays_impl(left, right): - _left = left.values if convert_left == True else left # noqa - _right = right.values if convert_right == True else right # noqa - return sdc_join_series_indexes(_left, _right) - - if isinstance(left, RangeIndexType) and isinstance(right, RangeIndexType): - - def sdc_join_range_indexes_impl(left, right): - if (left is right or numpy_like.array_equal(left, right)): - joined = left.values - lidx = numpy.arange(len(joined)) - ridx = lidx - return joined, lidx, ridx - else: - return sdc_join_series_indexes(left.values, right.values) - - return sdc_join_range_indexes_impl - - elif (isinstance(left, (RangeIndexType, Int64IndexType, types.Array)) - and isinstance(right, (RangeIndexType, Int64IndexType, types.Array)) - and not (isinstance(left, types.Array) and isinstance(right, types.Array))): - return _convert_to_arrays_impl +@sdc_overload(_sdc_internal_join, jit_options={'parallel': False}) +def _sdc_internal_join_ovld(left, right): - # TODO: remove code duplication below and merge numeric and StringArray impls into one - # needs 
equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays - elif isinstance(left, types.Array) and isinstance(right, types.Array): + if isinstance(left, types.Array) and isinstance(right, types.Array): numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], []) if isinstance(numba_common_dtype, types.Number): @@ -614,110 +587,6 @@ def _sdc_asarray_impl(data): return None -def _sdc_take(data, indexes): - pass - - -@sdc_overload(_sdc_take) -def _sdc_take_overload(data, indexes): - - valid_data_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(data, valid_data_types) and not isinstance(data, types.NoneType)): - return None - - if not (isinstance(indexes, (types.Array, types.List, Int64IndexType)) - and isinstance(indexes.dtype, (types.Integer, types.ListType))): - return None - - if (isinstance(indexes.dtype, types.ListType) - and isinstance(data, (types.Array, types.List, RangeIndexType, Int64IndexType))): - arr_dtype = data.dtype - - def _sdc_take_list_impl(data, indexes): - res_size = 0 - for i in numba.prange(len(indexes)): - res_size += len(indexes[i]) - res_arr = numpy.empty(res_size, dtype=arr_dtype) - for i in numba.prange(len(indexes)): - start = 0 - for l in range(len(indexes[0:i])): - start += len(indexes[l]) - current_pos = start - for j in range(len(indexes[i])): - res_arr[current_pos] = data[indexes[i][j]] - current_pos += 1 - return res_arr - - return _sdc_take_list_impl - - elif isinstance(indexes.dtype, types.ListType) and data == string_array_type: - def _sdc_take_list_str_impl(data, indexes): - res_size = 0 - for i in numba.prange(len(indexes)): - res_size += len(indexes[i]) - nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) - num_total_bytes = 0 - for i in numba.prange(len(indexes)): - start = 0 - for l in range(len(indexes[0:i])): - start += len(indexes[l]) - current_pos = start - for j in range(len(indexes[i])): - num_total_bytes += get_utf8_size(data[indexes[i][j]]) - if 
isna(data, indexes[i][j]): - nan_mask[current_pos] = True - current_pos += 1 - res_arr = pre_alloc_string_array(res_size, num_total_bytes) - for i in numba.prange(len(indexes)): - start = 0 - for l in range(len(indexes[0:i])): - start += len(indexes[l]) - current_pos = start - for j in range(len(indexes[i])): - res_arr[current_pos] = data[indexes[i][j]] - if nan_mask[current_pos]: - str_arr_set_na(res_arr, current_pos) - current_pos += 1 - - return res_arr - - return _sdc_take_list_str_impl - - elif isinstance(data, (types.Array, RangeIndexType, Int64IndexType)): - arr_dtype = data.dtype - - def _sdc_take_array_impl(data, indexes): - res_size = len(indexes) - res_arr = numpy.empty(res_size, dtype=arr_dtype) - for i in numba.prange(res_size): - res_arr[i] = data[indexes[i]] - return res_arr - - return _sdc_take_array_impl - - elif isinstance(data, StringArrayType): - def _sdc_take_str_arr_impl(data, indexes): - res_size = len(indexes) - nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) - num_total_bytes = 0 - for i in numba.prange(res_size): - num_total_bytes += get_utf8_size(data[indexes[i]]) - if isna(data, indexes[i]): - nan_mask[i] = True - - res_arr = pre_alloc_string_array(res_size, num_total_bytes) - for i in numpy.arange(res_size): - res_arr[i] = data[indexes[i]] - if nan_mask[i]: - str_arr_set_na(res_arr, i) - - return res_arr - - return _sdc_take_str_arr_impl - - return None - - def _almost_equal(x, y): """Check if floats are almost equal based on the float epsilon""" pass @@ -741,65 +610,91 @@ def sdc_reindex_series(arr, index, name, by_index): pass +# TO-DO: support Series.reindex() that should replace this function @sdc_overload(sdc_reindex_series) def sdc_reindex_series_overload(arr, index, name, by_index): """ Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """ - range_indexes = isinstance(index, RangeIndexType) and isinstance(by_index, RangeIndexType) - int64_indexes = isinstance(index, 
Int64IndexType) and isinstance(by_index, Int64IndexType) + range_indexes = (isinstance(index, (PositionalIndexType, RangeIndexType)) + and isinstance(by_index, (PositionalIndexType, RangeIndexType))) data_dtype, index_dtype = arr.dtype, index.dtype data_is_str_arr = isinstance(arr.dtype, types.UnicodeType) - def sdc_reindex_series_impl(arr, index, name, by_index): - - # no reindexing is needed if indexes are equal - if range_indexes == True: # noqa - equal_indexes = numpy_like.array_equal(index, by_index) - elif int64_indexes == True: # noqa - equal_indexes = numpy_like.array_equal(index, by_index) - else: - equal_indexes = False - if (index is by_index or equal_indexes): - return pandas.Series(data=arr, index=by_index, name=name) + # use old implementation if old indexes types are used + if (isinstance(index, sdc_old_index_types) or isinstance(by_index, sdc_old_index_types)): - if data_is_str_arr == True: # noqa - _res_data = [''] * len(by_index) - res_data_nan_mask = numpy.zeros(len(by_index), dtype=types.bool_) - else: - _res_data = numpy.empty(len(by_index), dtype=data_dtype) + def sdc_reindex_series_old_impl(arr, index, name, by_index): - # build a dict of self.index values to their positions: - map_index_to_position = Dict.empty( - key_type=index_dtype, - value_type=types.int32 - ) + # no reindexing is needed if indexes are equal, but only check if it's fast + if range_indexes == True: # noqa + equal_indexes = index.equals(by_index) + else: + equal_indexes = False + if (index is by_index or equal_indexes): + return pandas.Series(data=arr, index=by_index, name=name) - for i, value in enumerate(index): - if value in map_index_to_position: - raise ValueError("cannot reindex from a duplicate axis") + if data_is_str_arr == True: # noqa + _res_data = [''] * len(by_index) + res_data_nan_mask = numpy.zeros(len(by_index), dtype=types.bool_) else: - map_index_to_position[value] = i - - index_mismatch = 0 - for i in numba.prange(len(by_index)): - val = by_index[i] - if 
val in map_index_to_position: - pos_in_self = map_index_to_position[val] - _res_data[i] = arr[pos_in_self] - if data_is_str_arr == True: # noqa - res_data_nan_mask[i] = isna(arr, i) + _res_data = numpy.empty(len(by_index), dtype=data_dtype) + + # build a dict of self.index values to their positions: + map_index_to_position = Dict.empty( + key_type=index_dtype, + value_type=types.int32 + ) + + for i, value in enumerate(index): + if value in map_index_to_position: + raise ValueError("cannot reindex from a duplicate axis") + else: + map_index_to_position[value] = i + + index_mismatch = 0 + for i in numba.prange(len(by_index)): + val = by_index[i] + if val in map_index_to_position: + pos_in_self = map_index_to_position[val] + _res_data[i] = arr[pos_in_self] + if data_is_str_arr == True: # noqa + res_data_nan_mask[i] = isna(arr, i) + else: + index_mismatch += 1 + if index_mismatch: + msg = "Unalignable boolean Series provided as indexer " + \ + "(index of the boolean Series and of the indexed object do not match)." + raise IndexingError(msg) + + if data_is_str_arr == True: # noqa + res_data = create_str_arr_from_list(_res_data) + str_arr_set_na_by_mask(res_data, res_data_nan_mask) else: - index_mismatch += 1 - if index_mismatch: - msg = "Unalignable boolean Series provided as indexer " + \ - "(index of the boolean Series and of the indexed object do not match)." 
- raise IndexingError(msg) - - if data_is_str_arr == True: # noqa - res_data = create_str_arr_from_list(_res_data) - str_arr_set_na_by_mask(res_data, res_data_nan_mask) + res_data = _res_data + + return pandas.Series(data=res_data, index=by_index, name=name) + + return sdc_reindex_series_old_impl + + def sdc_reindex_series_impl(arr, index, name, by_index): + + _, new_order = index.reindex(by_index) + if new_order is not None: + new_order_as_array = _nonoptional(new_order) + index_mismatch = 0 + for i in numba.prange(len(by_index)): + if new_order_as_array[i] == -1: + index_mismatch += 1 + + if index_mismatch: + # TO-DO: seems it covers only specific series reindex case, generalize? + msg = "Unalignable boolean Series provided as indexer " + \ + "(index of the boolean Series and of the indexed object do not match)." + raise IndexingError(msg) + + res_data = numpy_like.take(arr, new_order_as_array) else: - res_data = _res_data + res_data = arr return pandas.Series(data=res_data, index=by_index, name=name) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index de7edef66..7e1c84322 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -41,16 +41,18 @@ from numba import literally from numba.typed import List, Dict from numba.core.errors import TypingError +from numba.core.registry import cpu_target from pandas.core.indexing import IndexingError +from sdc.datatypes.indexes import * from sdc.hiframes.pd_dataframe_ext import DataFrameType from sdc.hiframes.pd_series_type import SeriesType from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, kwsparams2list, gen_impl_generator, find_common_dtype_from_numpy_dtypes) from sdc.str_arr_ext import StringArrayType -from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType + +from 
sdc.extensions.indexes.empty_index_ext import init_empty_index from sdc.hiframes.pd_dataframe_type import DataFrameType from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps @@ -58,7 +60,7 @@ from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType, dataframe_getitem_accessor_init) -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation from sdc.datatypes.hpat_pandas_dataframe_rolling_types import _hpat_pandas_df_rolling_init from sdc.datatypes.hpat_pandas_rolling_types import ( gen_sdc_pandas_rolling_overload_body, sdc_pandas_rolling_docstring_tmpl) @@ -67,7 +69,10 @@ from sdc.utilities.utils import sdc_overload, sdc_overload_method, sdc_overload_attribute from sdc.hiframes.api import isna from sdc.functions.numpy_like import getitem_by_mask, find_idx -from sdc.datatypes.common_functions import _sdc_take, sdc_reindex_series +from sdc.functions.numpy_like import take as nplike_take +from sdc.datatypes.common_functions import (sdc_reindex_series, + fill_array, + fill_str_array,) from sdc.utilities.prange_utils import parallel_chunks @@ -100,22 +105,10 @@ def hpat_pandas_dataframe_index(df): ty_checker = TypeChecker('Attribute index.') ty_checker.check(df, DataFrameType) - if isinstance(df.index, types.NoneType): - empty_df = not df.columns - - def hpat_pandas_df_index_none_impl(df): - if empty_df == True: # noqa - return numpy.arange(0) - else: - return pandas.RangeIndex(len(df)) - - return hpat_pandas_df_index_none_impl - else: - - def hpat_pandas_df_index_impl(df): - return df._index + def hpat_pandas_df_index_impl(df): + return df._index - return hpat_pandas_df_index_impl + return hpat_pandas_df_index_impl @sdc_overload_attribute(DataFrameType, 'columns') @@ -175,7 +168,7 @@ def sdc_pandas_dataframe_values_impl(self): columns_num = len(self.columns) func_lines = [ f'def sdc_pandas_dataframe_values_impl(self):', - f' length = 
{df_length_expr(self)}', + f' length = len(self._index)', ] for i, col in enumerate(self.columns): col_loc = self.column_loc[col] @@ -312,11 +305,11 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integ other_type_id, other_col_id = other_col_loc.type_id, other_col_loc.col_id func_text.append(f'new_col_{idx}_data_other = ' f'other._data[{other_type_id}][{other_col_id}]') - s1 = f'init_series(new_col_{idx}_data_df)' - s2 = f'init_series(new_col_{idx}_data_other)' + s1 = f'pandas.Series(new_col_{idx}_data_df)' + s2 = f'pandas.Series(new_col_{idx}_data_other)' func_text.append(f'new_col_{idx} = {s1}.append({s2})._data') else: - func_text.append(f'new_col_{idx}_data = init_series(new_col_{idx}_data_df)._data') + func_text.append(f'new_col_{idx}_data = pandas.Series(new_col_{idx}_data_df)._data') if col_name in string_type_columns: func_text.append(f'new_col_{idx} = fill_str_array(new_col_{idx}_data, len_df+len_other)') else: @@ -328,7 +321,7 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integ other_col_loc = other.column_loc[col_name] other_type_id, other_col_id = other_col_loc.type_id, other_col_loc.col_id func_text.append(f'new_col_{idx}_data_other = other._data[{other_type_id}][{other_col_id}]') - func_text.append(f'new_col_{idx}_data = init_series(new_col_{idx}_data_other)._data') + func_text.append(f'new_col_{idx}_data = pandas.Series(new_col_{idx}_data_other)._data') if col_name in string_type_columns: func_text.append( f'new_col_{idx}_other = ' @@ -347,17 +340,15 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integ func_text.append(f'raise SDCLimitation("Indexes of dataframes are expected to have comparable ' f'(both Numeric or String) types if parameter ignore_index is set to False.")') else: - func_text += [f'joined_index = hpat_arrays_append(df.index, other.index)\n', + func_text += [f'joined_index = df._index.append(other._index)\n', f'return 
pandas.DataFrame({{{data}}}, index=joined_index)\n'] func_definition.extend([indent + func_line for func_line in func_text]) func_def = '\n'.join(func_definition) global_vars = {'pandas': pandas, - 'init_series': sdc.hiframes.api.init_series, - 'fill_array': sdc.datatypes.common_functions.fill_array, - 'fill_str_array': sdc.datatypes.common_functions.fill_str_array, - 'hpat_arrays_append': sdc.datatypes.common_functions.hpat_arrays_append, + 'fill_array': fill_array, + 'fill_str_array': fill_str_array, 'SDCLimitation': SDCLimitation} return func_def, global_vars @@ -420,9 +411,7 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=False, verify_integrity= if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)): ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index') - none_or_numeric_indexes = ((isinstance(df.index, types.NoneType) or isinstance(df.index, types.Number)) and - (isinstance(other.index, types.NoneType) or isinstance(other.index, types.Number))) - indexes_comparable = check_types_comparable(df.index, other.index) or none_or_numeric_indexes + indexes_comparable = check_types_comparable(df.index, other.index) if isinstance(ignore_index, types.Literal): ignore_index = ignore_index.literal_value @@ -510,7 +499,7 @@ def _df_head_impl(df, n=5): results = [] joined = ', '.join(func_params) func_lines = [f'def _df_{func_name}_impl(df, {joined}):'] - ind = df_index_codegen_head(df) + ind = 'index=df._index[:n]' for i, c in enumerate(df.columns): col_loc = df.column_loc[c] type_id, col_id = col_loc.type_id, col_loc.col_id @@ -542,14 +531,6 @@ def sdc_pandas_dataframe_head_codegen(df, func_name, params, ser_params): return _reduce_impl -def df_index_codegen_head(self): - # TODO: Rewrite when DF constructor will be fixed with index=None - if isinstance(self.index, types.NoneType): - return '' - - return 'index=df._index[:n]' - - @sdc_overload_method(DataFrameType, 'head') def head_overload(df, n=5): """ @@ -1490,6 +1471,7 @@ def 
sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None, else: # this works because global tuple of strings is captured as Tuple of StringLiterals columns_as_tuple = tuple(columns.initial_value) + def _sdc_pandas_dataframe_drop_wrapper_impl(df, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors="raise"): @@ -1536,23 +1518,6 @@ def sdc_pandas_dataframe_drop_impl(df, args, columns): return sdc_pandas_dataframe_drop_impl(df, args, columns) -def df_length_expr(self): - """Generate expression to get length of DF""" - if self.columns: - return 'len(self._data[0][0])' - - return '0' - - -def df_index_expr(self, length_expr=None): - """Generate expression to get or create index of DF""" - if isinstance(self.index, types.NoneType): - if length_expr is None: - length_expr = df_length_expr(self) - - return 'self._index' - - def df_getitem_slice_idx_main_codelines(self, idx): """Generate main code lines for df.getitem with idx of slice""" @@ -1571,17 +1536,14 @@ def df_getitem_slice_idx_main_codelines(self, idx): all_lists_joined = ', '.join([f'list_{i}' for i in range(n_lists)]) + ', ' res_data = f'({all_lists_joined})' if n_lists > 0 else '()' func_lines += [ - f' if self_index_is_none == True:', - f' old_index = pandas.RangeIndex(len(self))', - f' else:', - f' old_index = self._index', f' res_data = {res_data}', - f' res_index = old_index[idx]', + f' res_index = self._index[idx]', f' return init_dataframe_internal(res_data, res_index, df_type)' ] return func_lines + def df_getitem_tuple_idx_main_codelines(self, literal_idx): """Generate main code lines for df.getitem with idx of tuple""" results = [] @@ -1605,17 +1567,15 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx): def df_getitem_bool_series_idx_main_codelines(self, idx): """Generate main code lines for df.getitem""" - length_expr = df_length_expr(self) # optimization for default indexes in df and idx when index alignment is trivial - if 
(isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType)): - func_lines = [f' length = {length_expr}', - f' self_index = self.index', - f' if length > len(idx):', + if (isinstance(self.index, PositionalIndexType) and isinstance(idx.index, PositionalIndexType)): + func_lines = [f' self_index = self._index', + f' if len(self_index) > len(idx):', f' msg = "Unalignable boolean Series provided as indexer " + \\', f' "(index of the boolean Series and of the indexed object do not match)."', f' raise IndexingError(msg)', - f' # do not trim idx._data to length as getitem_by_mask handles such case', + f' # do not trim idx._data to df length as getitem_by_mask handles such case', f' res_index = getitem_by_mask(self_index, idx._data)', f' # df index is default, same as positions so it can be used in take'] results = [] @@ -1635,11 +1595,11 @@ def df_getitem_bool_series_idx_main_codelines(self, idx): ] else: func_lines = [ - f' length = {length_expr}', - f' self_index = self.index', - f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)', - f' res_index = getitem_by_mask(self_index, reindexed_idx._data)', - f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)' + f' self_index = self._index', + f' idx_reindexed_by_self = sdc_reindex_series(idx._data, idx._index, idx._name, self_index)', + f' final_mask = idx_reindexed_by_self._data', + f' res_index = self_index[final_mask]', + f' selected_pos = getitem_by_mask(numpy.arange(len(self_index)), final_mask)' ] results = [] for i, col in enumerate(self.columns): @@ -1663,12 +1623,14 @@ def df_getitem_bool_series_idx_main_codelines(self, idx): def df_getitem_bool_array_idx_main_codelines(self, idx): """Generate main code lines for df.getitem""" - func_lines = [f' length = {df_length_expr(self)}', + has_positional_index = isinstance(idx, PositionalIndexType) + res_index_expr = 'taken_pos' if has_positional_index else 'self._index.take(taken_pos)' + func_lines 
= [f' length = len(self._index)', f' if length != len(idx):', f' raise ValueError("Item wrong length.")', - f' self_index = self.index', - f' taken_pos = getitem_by_mask(self_index, idx)', - f' res_index = sdc_take(self_index, taken_pos)'] + f' taken_pos = getitem_by_mask(numpy.arange(length), idx)', + f' res_index = {res_index_expr}' + ] results = [] for i, col in enumerate(self.columns): col_loc = self.column_loc[col] @@ -1700,12 +1662,8 @@ def _df_getitem_slice_idx_impl(self, idx): list_0 = self._data[0].copy() for i, item in enumerate(list_0): list_0[i] = item[idx] - if self_index_is_none == True: - old_index = pandas.RangeIndex(len(self)) - else: - old_index = self._index res_data = (list_0, ) - res_index = old_index[idx] + res_index = self._index[idx] return init_dataframe_internal(res_data, res_index, df_type) """ func_lines = ['def _df_getitem_slice_idx_impl(self, idx):'] @@ -1716,16 +1674,20 @@ def _df_getitem_slice_idx_impl(self, idx): func_lines += df_getitem_key_error_codelines() func_text = '\n'.join(func_lines) - # TO-DO: need DefaultIndex to handle self.index[idx] construct inside func - self_index_is_none = isinstance(self.index, types.NoneType) - new_index_type = RangeIndexType(False) if self_index_is_none else self.index + # since we need to know result df type to call init_dataframe_internal + # deduce the resulting df index type + index_getitem_sig = cpu_target.typing_context.resolve_function_type( + operator.getitem, + (self.index, idx), + {} + ) + new_index_type = index_getitem_sig.return_type df_type = DataFrameType(self.data, new_index_type, self.columns, column_loc=self.column_loc) global_vars = {'pandas': pandas, 'numpy': numpy, 'df_type': df_type, - 'init_dataframe_internal': init_dataframe_internal, - 'self_index_is_none': self_index_is_none} + 'init_dataframe_internal': init_dataframe_internal} return func_text, global_vars @@ -1733,13 +1695,13 @@ def _df_getitem_slice_idx_impl(self, idx): def df_getitem_tuple_idx_codegen(self, idx): 
""" Example of generated implementation with provided index: - def _df_getitem_tuple_idx_impl(self, idx) - res_index = self._index - data_1 = self._data[1] - res_data_1 = pandas.Series(data_1, index=res_index, name="B") - data_2 = self._data[2] + def _df_getitem_tuple_idx_impl(self, idx): + res_index = self.index + data_0 = self._data[0][0] + res_data_0 = pandas.Series(data_0, index=res_index, name="A") + data_2 = self._data[0][2] res_data_2 = pandas.Series(data_2, index=res_index, name="C") - return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index) + return pandas.DataFrame({"A": res_data_0, "C": res_data_2}, index=res_index) """ func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):'] literal_idx = {col.literal_value for col in idx} @@ -1761,27 +1723,28 @@ def df_getitem_bool_series_idx_codegen(self, idx): """ Example of generated implementation with provided index: def _df_getitem_bool_series_idx_impl(self, idx): - length = len(self._data[0][0]) - self_index = range(len(self._data[0][0])) - if length > len(idx): + self_index = self._index + if len(self_index) > len(idx): msg = "Unalignable boolean Series provided as indexer " + \ "(index of the boolean Series and of the indexed object do not match)." 
raise IndexingError(msg) - # do not trim idx._data to length as getitem_by_mask handles such case + # do not trim idx._data to df length as getitem_by_mask handles such case res_index = getitem_by_mask(self_index, idx._data) # df index is default, same as positions so it can be used in take data_0 = self._data[0][0] res_data_0 = sdc_take(data_0, res_index) - data_1 = self._data[1][0] + data_1 = self._data[0][1] res_data_1 = sdc_take(data_1, res_index) - return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index) + data_2 = self._data[0][2] + res_data_2 = sdc_take(data_2, res_index) + return pandas.DataFrame({"A": res_data_0, "B": res_data_1, "C": res_data_2}, index=res_index) """ func_lines = ['def _df_getitem_bool_series_idx_impl(self, idx):'] func_lines += df_getitem_bool_series_idx_main_codelines(self, idx) func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, 'numpy': numpy, 'getitem_by_mask': getitem_by_mask, - 'sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'sdc_reindex_series': sdc_reindex_series, 'IndexingError': IndexingError} @@ -1792,24 +1755,25 @@ def df_getitem_bool_array_idx_codegen(self, idx): """ Example of generated implementation with provided index: def _df_getitem_bool_array_idx_impl(self, idx): - length = len(self._data[0][0]) + length = len(self._index) if length != len(idx): raise ValueError("Item wrong length.") - self_index = range(len(self._data[0][0])) - taken_pos = getitem_by_mask(self_index, idx) - res_index = sdc_take(self_index, taken_pos) + taken_pos = getitem_by_mask(numpy.arange(length), idx) + res_index = self._index.take(taken_pos) data_0 = self._data[0][0] res_data_0 = sdc_take(data_0, taken_pos) data_1 = self._data[1][0] res_data_1 = sdc_take(data_1, taken_pos) - return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index) + data_2 = self._data[2][0] + res_data_2 = sdc_take(data_2, taken_pos) + return pandas.DataFrame({"A": res_data_0, "B": res_data_1, "C": res_data_2}, 
index=res_index) """ func_lines = ['def _df_getitem_bool_array_idx_impl(self, idx):'] func_lines += df_getitem_bool_array_idx_main_codelines(self, idx) func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, 'numpy': numpy, 'getitem_by_mask': getitem_by_mask, - 'sdc_take': _sdc_take} + 'sdc_take': nplike_take} return func_text, global_vars @@ -1973,20 +1937,8 @@ def _df_getitem_unicode_idx_impl(self, idx): return gen_df_getitem_slice_idx_impl(self, idx) if isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean): - self_index_is_none = isinstance(self.index, types.NoneType) - idx_index_is_none = isinstance(idx.index, types.NoneType) - - if self_index_is_none and not idx_index_is_none: - if not check_index_is_numeric(idx): - ty_checker.raise_exc(idx.index.dtype, 'number', 'idx.index.dtype') - - if not self_index_is_none and idx_index_is_none: - if not check_index_is_numeric(self): - ty_checker.raise_exc(idx.index.dtype, self.index.dtype, 'idx.index.dtype') - - if not self_index_is_none and not idx_index_is_none: - if not check_types_comparable(self.index, idx.index): - ty_checker.raise_exc(idx.index.dtype, self.index.dtype, 'idx.index.dtype') + if not check_types_comparable(self.index, idx.index): + ty_checker.raise_exc(idx.index.dtype, self.index.dtype, 'idx.index.dtype') return gen_df_getitem_bool_series_idx_impl(self, idx) @@ -2002,10 +1954,10 @@ def df_getitem_tuple_at_codegen(self, row, col): """ Example of generated implementation: def _df_getitem_tuple_at_impl(self, idx): - row, _ = idx - data = self._dataframe._data[1][0] - res_data = pandas.Series(data, index=self._dataframe.index) - return res_data.at[row] + row, _ = idx + data = self._dataframe._data[2][0] + res_data = pandas.Series(data, index=self._dataframe.index) + return res_data.at[row] """ func_lines = ['def _df_getitem_tuple_at_impl(self, idx):', ' row, _ = idx'] @@ -2033,23 +1985,25 @@ def df_getitem_single_label_loc_codegen(self, idx): """ Example of generated 
implementation: def _df_getitem_single_label_loc_impl(self, idx): - idx_list = find_idx(self._dataframe._index, idx) - data_0 = _sdc_take(self._dataframe._data[0][0], idx_list) - res_data_0 = pandas.Series(data_0) - data_1 = _sdc_take(self._dataframe._data[1][0], idx_list) - res_data_1 = pandas.Series(data_1) - if len(idx_list) < 1: - raise KeyError('Index is not in the DataFrame') - new_index = _sdc_take(self._dataframe._index, idx_list) - return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=new_index) - """ - if isinstance(self.index, types.NoneType): + idx_list = find_idx(self._dataframe._index, idx) + data_0 = sdc_take(self._dataframe._data[0][0], idx_list) + res_data_0 = pandas.Series(data_0) + data_1 = sdc_take(self._dataframe._data[1][0], idx_list) + res_data_1 = pandas.Series(data_1) + data_2 = sdc_take(self._dataframe._data[0][1], idx_list) + res_data_2 = pandas.Series(data_2) + if len(idx_list) < 1: + raise KeyError('Index is not in the DataFrame') + new_index = self._dataframe._index.take(idx_list) + return pandas.DataFrame({"A": res_data_0, "B": res_data_1, "C": res_data_2}, index=new_index) + """ + if isinstance(self.index, PositionalIndexType): fill_list = [' idx_list = numpy.array([idx])'] new_index = [' new_index = numpy.array([idx])'] else: fill_list = [' idx_list = find_idx(self._dataframe._index, idx)'] - new_index = [' new_index = _sdc_take(self._dataframe._index, idx_list)'] + new_index = [' new_index = self._dataframe._index.take(idx_list)'] fill_list_text = '\n'.join(fill_list) new_index_text = '\n'.join(new_index) @@ -2061,7 +2015,7 @@ def _df_getitem_single_label_loc_impl(self, idx): type_id, col_id = col_loc.type_id, col_loc.col_id data = f'data_{i}' res_data = f'res_data_{i}' - func_lines += [f' {data} = _sdc_take(self._dataframe._data[{type_id}][{col_id}], idx_list)', + func_lines += [f' {data} = sdc_take(self._dataframe._data[{type_id}][{col_id}], idx_list)', f' {res_data} = pandas.Series({data})'] results.append((c, 
res_data)) @@ -2075,7 +2029,7 @@ def _df_getitem_single_label_loc_impl(self, idx): func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, 'numpy': numpy, 'numba': numba, - '_sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'find_idx': find_idx, 'KeyError': KeyError} @@ -2086,20 +2040,22 @@ def df_getitem_int_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_int_iloc_impl(self, idx): - if -1 < idx < len(self._dataframe.index): - data_0 = pandas.Series(self._dataframe._data[0][0]) - result_0 = data_0.iat[idx] - data_1 = pandas.Series(self._dataframe._data[0][1]) - result_1 = data_1.iat[idx] - return pandas.Series(data=[result_0, result_1], index=['A', 'B'], name=str(idx)) - raise IndexingError('Index is out of bounds for axis') + if -1 < idx < len(self._dataframe.index): + data_0 = pandas.Series(self._dataframe._data[0][0]) + result_0 = data_0.iat[idx] + data_1 = pandas.Series(self._dataframe._data[0][1]) + result_1 = data_1.iat[idx] + data_2 = pandas.Series(self._dataframe._data[1][0]) + result_2 = data_2.iat[idx] + return pandas.Series(data=[result_0, result_1, result_2], index=['A', 'B', 'C'], name=str(idx)) + raise IndexingError('Index is out of bounds for axis') """ func_lines = ['def _df_getitem_int_iloc_impl(self, idx):', ' if -1 < idx < len(self._dataframe.index):'] results = [] index = [] name = 'self._dataframe._index[idx]' - if isinstance(self.index, types.NoneType): + if isinstance(self.index, PositionalIndexType): name = 'idx' for i, c in enumerate(self.columns): col_loc = self.column_loc[c] @@ -2123,11 +2079,13 @@ def df_getitem_slice_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_slice_iloc_impl(self, idx): - data_0 = pandas.Series(self._dataframe._data[0][0]) - result_0 = data_0.iloc[idx] - data_1 = pandas.Series(self._dataframe._data[1][0]) - result_1 = data_1.iloc[idx] - return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=self._dataframe.index[idx]) + 
data_0 = pandas.Series(self._dataframe._data[0][0]) + result_0 = data_0.iloc[idx] + data_1 = pandas.Series(self._dataframe._data[0][1]) + result_1 = data_1.iloc[idx] + data_2 = pandas.Series(self._dataframe._data[1][0]) + result_2 = data_2.iloc[idx] + return pandas.DataFrame(data={"A": result_0, "B": result_1, "C": result_2}, index=self._dataframe.index[idx]) """ func_lines = ['def _df_getitem_slice_iloc_impl(self, idx):'] results = [] @@ -2151,17 +2109,19 @@ def df_getitem_list_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_list_iloc_impl(self, idx): - check_idx = False - for i in idx: - if -1 < i < len(self._dataframe.index): - check_idx = True - if check_idx == True: - data_0 = pandas.Series(self._dataframe._data[0][0]) - result_0 = data_0.iloc[numpy.array(idx)] - data_1 = pandas.Series(self._dataframe._data[1][0]) - result_1 = data_1.iloc[numpy.array(idx)] - return pandas.DataFrame(data={"A": result_0, "B": result_1}, index=idx) - raise IndexingError('Index is out of bounds for axis') + check_idx = False + for i in idx: + if -1 < i < len(self._dataframe.index): + check_idx = True + if check_idx == True: + data_0 = pandas.Series(self._dataframe._data[0][0]) + result_0 = data_0.iloc[numpy.array(idx)] + data_1 = pandas.Series(self._dataframe._data[0][1]) + result_1 = data_1.iloc[numpy.array(idx)] + data_2 = pandas.Series(self._dataframe._data[1][0]) + result_2 = data_2.iloc[numpy.array(idx)] + return pandas.DataFrame(data={"A": result_0, "B": result_1, "C": result_2}, index=idx) + raise IndexingError('Index is out of bounds for axis') """ func_lines = ['def _df_getitem_list_iloc_impl(self, idx):', ' check_idx = False', @@ -2171,7 +2131,7 @@ def _df_getitem_list_iloc_impl(self, idx): ' if check_idx == True:'] results = [] index = '[self._dataframe._index[i] for i in idx]' - if isinstance(self.index, types.NoneType): + if isinstance(self.index, PositionalIndexType): index = 'idx' for i, c in enumerate(self.columns): col_loc = 
self.column_loc[c] @@ -2194,14 +2154,18 @@ def df_getitem_list_bool_iloc_codegen(self, idx): """ Example of generated implementation: def _df_getitem_list_bool_iloc_impl(self, idx): - if len(self._dataframe.index) == len(idx): - data_0 = self._dataframe._data[0][0] - result_0 = pandas.Series(data_0[numpy.array(idx)]) - data_1 = self._dataframe._data[1][0] - result_1 = pandas.Series(data_1[numpy.array(idx)]) - return pandas.DataFrame(data={"A": result_0, "B": result_1}, - index=self._dataframe.index[numpy.array(idx)]) - raise IndexingError('Item wrong length') + if len(self._dataframe.index) == len(idx): + data_0 = self._dataframe._data[0][0] + result_0 = pandas.Series(data_0[numpy.array(idx)]) + data_1 = self._dataframe._data[0][1] + result_1 = pandas.Series(data_1[numpy.array(idx)]) + data_2 = self._dataframe._data[1][0] + result_2 = pandas.Series(data_2[numpy.array(idx)]) + return pandas.DataFrame(data={"A": result_0, + "B": result_1, + "C": result_2}, + index=self._dataframe.index[numpy.array(idx)]) + raise IndexingError('Item wrong length') """ func_lines = ['def _df_getitem_list_bool_iloc_impl(self, idx):'] results = [] @@ -2258,7 +2222,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx): if accessor == 'at': num_idx = (isinstance(idx[0], types.Number) - and isinstance(self.dataframe.index, (types.NoneType, RangeIndexType, Int64IndexType))) + and isinstance(self.dataframe.index, (PositionalIndexType, RangeIndexType, Int64IndexType))) str_idx = (isinstance(idx[0], (types.UnicodeType, types.StringLiteral)) and isinstance(self.dataframe.index, StringArrayType)) if isinstance(idx, types.Tuple) and isinstance(idx[1], types.StringLiteral): @@ -2318,6 +2282,7 @@ def df_getitem_iat_tuple_impl(self, idx): return gen_df_getitem_iloc_int_impl(self.dataframe, idx) if isinstance(idx, (types.Tuple, types.UniTuple)): + def df_getitem_tuple_iat_impl(self, idx): return self._dataframe.iat[idx] @@ -2604,14 +2569,6 @@ def pct_change_overload(df, periods=1, 
fill_method='pad', limit=None, freq=None) return sdc_pandas_dataframe_apply_columns(df, name, params, ser_par) -def df_index_codegen_isin(df_type, df, data): - if isinstance(df_type.index, types.NoneType): - func_lines = [f' return pandas.DataFrame({{{data}}})'] - else: - func_lines = [f' return pandas.DataFrame({{{data}}}, index={df}._index)'] - return func_lines - - def sdc_pandas_dataframe_isin_dict_codegen(func_name, df_type, values, all_params): """ Example of generated implementation: @@ -2656,7 +2613,7 @@ def _df_isin_impl(df, values): ] result_name.append((result_c, c)) data = ', '.join(f'"{column_name}": {column}' for column, column_name in result_name) - func_lines += df_index_codegen_isin(df_type, df, data) + func_lines.append(f' return pandas.DataFrame({{{data}}}, index={df}._index)') func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, @@ -2719,7 +2676,7 @@ def _df_isin_impl(df, values): f' result = numpy.empty(len(series_{c}._data), numpy.bool_)', f' result_len = len(series_{c}._data)' ] - if isinstance(values.index, types.NoneType) and isinstance(df_type.index, types.NoneType): + if isinstance(values.index, PositionalIndexType) and isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' if i <= len(values._data):', @@ -2730,7 +2687,7 @@ def _df_isin_impl(df, values): f' else:', f' result[i] = False' ] - elif isinstance(values.index, types.NoneType): + elif isinstance(values.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' idx = {df}.index[i]', @@ -2745,7 +2702,7 @@ def _df_isin_impl(df, values): f' result[i] = False', f' break' ] - elif isinstance(df_type.index, types.NoneType): + elif isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' value = series_{c}._data[i]', @@ -2781,7 +2738,7 @@ def _df_isin_impl(df, values): result_name.append((result_c, c)) data = ', '.join(f'"{column_name}": {column}' for column, 
column_name in result_name) - func_lines += df_index_codegen_isin(df_type, df, data) + func_lines.append(f' return pandas.DataFrame({{{data}}}, index={df}._index)') func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, @@ -2852,7 +2809,7 @@ def _df_isin_impl(df, values): f' result = numpy.empty(len(series_{c}._data), numpy.bool_)', f' result_len = len(series_{c}._data)' ] - if isinstance(in_df.index, types.NoneType) and isinstance(df_type.index, types.NoneType): + if isinstance(df.index, PositionalIndexType) and isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' if i <= len(series_{c}_values):', @@ -2862,7 +2819,7 @@ def _df_isin_impl(df, values): f' result[i] = False', f' else:', f' result[i] = False'] - elif isinstance(df_type.index, types.NoneType): + elif isinstance(df_type.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' value = series_{c}._data[i]', @@ -2877,7 +2834,7 @@ def _df_isin_impl(df, values): f' result[i] = False', f' break', ] - elif isinstance(in_df.index, types.NoneType): + elif isinstance(df.index, PositionalIndexType): func_lines += [ f' for i in range(result_len):', f' idx = {df}.index[i]', @@ -2914,7 +2871,7 @@ def _df_isin_impl(df, values): func_lines += [f' {result_c} = pandas.Series(result)'] result_name.append((result_c, c)) data = ', '.join(f'"{column_name}": {column}' for column, column_name in result_name) - func_lines += df_index_codegen_isin(df_type, df, data) + func_lines.append(f' return pandas.DataFrame({{{data}}}, index={df}._index)') func_text = '\n'.join(func_lines) global_vars = {'pandas': pandas, @@ -3151,10 +3108,11 @@ def sdc_pandas_dataframe_groupby_impl(self, by=None, axis=0, level=None, as_inde def df_set_column_index_codelines(self): """Generate code lines with definition of resulting index for DF set_column""" + index_param_expr = 'self._index' if not isinstance(self.index, EmptyIndexType) else 'None' func_lines = [] if 
self.columns: func_lines += [ - f' length = {df_length_expr(self)}', + f' length = len(self._index)', f' if length == 0:', f' raise SDCLimitation("Could not set item for DataFrame with empty columns")', f' elif length != len(value):', @@ -3162,7 +3120,7 @@ def df_set_column_index_codelines(self): ] else: func_lines += [' length = len(value)'] - func_lines += [f' res_index = {df_index_expr(self, length_expr="length")}'] + func_lines += [f' res_index = {index_param_expr}'] return func_lines @@ -3178,13 +3136,13 @@ def df_add_column_codelines(self, key): res_data = f'res_data_{i}' func_lines += [ f' data_{i} = self._data[{type_id}][{col_id}]', - f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")', + f' {res_data} = data_{i}', ] results.append((col, res_data)) res_data = 'new_res_data' literal_key = key.literal_value - func_lines += [f' {res_data} = pandas.Series(value, index=res_index, name="{literal_key}")'] + func_lines += [f' {res_data} = value'] results.append((literal_key, res_data)) data = ', '.join(f'"{col}": {data}' for col, data in results) @@ -3209,12 +3167,12 @@ def df_replace_column_codelines(self, key): res_data = f'res_data_{i}' func_lines += [ - f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")', + f' {res_data} = data_{i}', ] results.append((col, res_data)) data = ', '.join(f'"{col}": {data}' for col, data in results) - func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index)'] + func_lines += [f' return pandas.DataFrame({{{data}}}, index=self._index)'] return func_lines @@ -3222,19 +3180,19 @@ def df_replace_column_codelines(self, key): def df_add_column_codegen(self, key): """ Example of generated implementation: - def _df_add_column_impl(self, key, value): - length = len(self._data[0]) - if length == 0: - raise SDCLimitation("Could not set item for empty DataFrame") - elif length != len(value): - raise ValueError("Length of values does not match length of index") - res_index = 
numpy.arange(length) - data_0 = self._data[0] - res_data_0 = pandas.Series(data_0, index=res_index, name="A") - data_1 = self._data[1] - res_data_1 = pandas.Series(data_1, index=res_index, name="C") - new_res_data = pandas.Series(value, index=res_index, name="B") - return pandas.DataFrame({"A": res_data_0, "C": res_data_1, "B": new_res_data}, index=res_index) + def _df_add_column_impl(self, key, value): + length = len(self._index) + if length == 0: + raise SDCLimitation("Could not set item for DataFrame with empty columns") + elif length != len(value): + raise ValueError("Length of values does not match length of index") + res_index = self._index + data_0 = self._data[0][0] + res_data_0 = data_0 + data_1 = self._data[1][0] + res_data_1 = data_1 + new_res_data = value + return pandas.DataFrame({"A": res_data_0, "C": res_data_1, "B": new_res_data}, index=res_index) """ func_lines = [f'def _df_add_column_impl(self, key, value):'] func_lines += df_add_column_codelines(self, key) @@ -3250,17 +3208,17 @@ def df_replace_column_codegen(self, key): """ Example of generated implementation: def _df_replace_column_impl(self, key, value): - length = len(self._data[0]) + length = len(self._index) if length == 0: raise SDCLimitation("Could not set item for DataFrame with empty columns") elif length != len(value): raise ValueError("Length of values does not match length of index") - res_index = numpy.arange(length) + res_index = self._index data_0 = value - res_data_0 = pandas.Series(data_0, index=res_index, name="A") - data_1 = self._data[1] - res_data_1 = pandas.Series(data_1, index=res_index, name="C") - return pandas.DataFrame({"A": res_data_0, "C": res_data_1}, index=res_index) + res_data_0 = data_0 + data_1 = self._data[1][0] + res_data_1 = data_1 + return pandas.DataFrame({"A": res_data_0, "C": res_data_1}, index=self._index) """ func_lines = [f'def _df_replace_column_impl(self, key, value):'] func_lines += df_replace_column_codelines(self, key) diff --git 
a/sdc/datatypes/hpat_pandas_groupby_functions.py b/sdc/datatypes/hpat_pandas_groupby_functions.py index 83f752e9d..aa83fcc0e 100644 --- a/sdc/datatypes/hpat_pandas_groupby_functions.py +++ b/sdc/datatypes/hpat_pandas_groupby_functions.py @@ -41,12 +41,13 @@ from numba.core.typing import signature from numba import literally -from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray, _sdc_take +from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray from sdc.datatypes.hpat_pandas_groupby_types import DataFrameGroupByType, SeriesGroupByType from sdc.utilities.sdc_typing_utils import TypeChecker, kwsparams2list, sigparams2list from sdc.utilities.utils import (sdc_overload, sdc_overload_method, sdc_register_jitable) from sdc.hiframes.pd_series_type import SeriesType from sdc.str_ext import string_type +from sdc.functions.numpy_like import take as nplike_take performance_limitation = "This function may reveal slower performance than Pandas* on user system.\ @@ -218,7 +219,7 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc, f' column_data_{i} = {df}._data[{type_id}][{col_id}]', f' for j in numpy.arange(res_index_len):', f' idx = argsorted_index[j] if {groupby_param_sort} else j', - f' group_arr_{i} = _sdc_take(column_data_{i}, list({groupby_dict}[group_keys[idx]]))', + f' group_arr_{i} = sdc_take(column_data_{i}, list({groupby_dict}[group_keys[idx]]))', f' group_series_{i} = pandas.Series(group_arr_{i})', f' result_data_{i}[j] = group_series_{i}.{func_name}({extra_impl_params})', ] @@ -226,7 +227,7 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc, data = ', '.join(f'\'{column_names[i]}\': result_data_{i}' for i in range(len(columns))) func_lines.extend(['\n'.join([ f' if {groupby_param_sort}:', - f' res_index = _sdc_take(group_keys, argsorted_index)', + f' res_index = sdc_take(group_keys, argsorted_index)', f' else:', f' res_index = group_keys', f' return 
pandas.DataFrame({{{data}}}, index=res_index)' @@ -236,7 +237,7 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc, global_vars = {'pandas': pandas, 'numpy': numpy, '_sdc_asarray': _sdc_asarray, - '_sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'sdc_arrays_argsort': sdc_arrays_argsort} return func_text, global_vars @@ -262,11 +263,11 @@ def _sdc_pandas_series_groupby_generic_func_codegen(func_name, func_params, defa f' result_data = numpy.empty(res_index_len, dtype=res_dtype)', f' for j in numpy.arange(res_index_len):', f' idx = argsorted_index[j] if {groupby_param_sort} else j', - f' group_arr = _sdc_take({series}._data, list({groupby_dict}[group_keys[idx]]))', + f' group_arr = sdc_take({series}._data, list({groupby_dict}[group_keys[idx]]))', f' group_series = pandas.Series(group_arr)', f' result_data[j] = group_series.{func_name}({extra_impl_params})', f' if {groupby_param_sort}:', - f' res_index = _sdc_take(group_keys, argsorted_index)', + f' res_index = sdc_take(group_keys, argsorted_index)', f' else:', f' res_index = group_keys', f' return pandas.Series(data=result_data, index=res_index, name={series}._name)' @@ -276,7 +277,7 @@ def _sdc_pandas_series_groupby_generic_func_codegen(func_name, func_params, defa global_vars = {'pandas': pandas, 'numpy': numpy, '_sdc_asarray': _sdc_asarray, - '_sdc_take': _sdc_take, + 'sdc_take': nplike_take, 'sdc_arrays_argsort': sdc_arrays_argsort} return func_text, global_vars diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 1c18ba2e6..3594902a8 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -51,10 +51,12 @@ import sdc.datatypes.common_functions as common_functions from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, find_common_dtype_from_numpy_dtypes, has_literal_value, - has_python_value) -from 
sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType -from sdc.datatypes.common_functions import (sdc_join_series_indexes, sdc_arrays_argsort, sdc_reindex_series) + has_python_value, + sdc_old_index_types, + find_index_common_dtype, + ) +from sdc.datatypes.indexes import * +from sdc.datatypes.common_functions import (sdc_arrays_argsort, sdc_reindex_series, _sdc_internal_join) from sdc.datatypes.hpat_pandas_rolling_types import ( gen_sdc_pandas_rolling_overload_body, sdc_pandas_rolling_docstring_tmpl) from sdc.datatypes.hpat_pandas_series_rolling_types import _hpat_pandas_series_rolling_init @@ -72,6 +74,7 @@ from sdc.hiframes.api import isna from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby from sdc.utilities.prange_utils import parallel_chunks +from sdc.extensions.indexes.indexes_generic import sdc_indexes_join_outer, sdc_fix_indexes_join from sdc.set_ext import build_set from .pandas_series_functions import apply @@ -147,9 +150,8 @@ def hpat_pandas_series_iat_impl(self, idx): # Note: Loc slice without start is not supported min_int64 = numpy.iinfo('int64').min max_int64 = numpy.iinfo('int64').max - index_is_none = (self.series.index is None or - isinstance(self.series.index, numba.types.misc.NoneType)) - if isinstance(idx, types.SliceType) and not index_is_none: + index_is_positional = isinstance(self.series.index, PositionalIndexType) + if isinstance(idx, types.SliceType) and not index_is_positional: def hpat_pandas_series_loc_slice_impl(self, idx): series = self._series index = series.index @@ -203,7 +205,7 @@ def hpat_pandas_series_loc_slice_impl(self, idx): return hpat_pandas_series_loc_slice_impl - if isinstance(idx, types.SliceType) and index_is_none: + if isinstance(idx, types.SliceType) and index_is_positional: def hpat_pandas_series_loc_slice_noidx_impl(self, idx): max_slice = sys.maxsize start = idx.start @@ -374,16 +376,16 @@ def hpat_pandas_series_getitem(self, idx): 
return None # Note: Getitem return Series - index_is_none = isinstance(self.index, numba.types.misc.NoneType) - index_is_none_or_numeric = index_is_none or (self.index and isinstance(self.index.dtype, types.Number)) - index_is_string = not index_is_none and isinstance(self.index.dtype, (types.UnicodeType, types.StringLiteral)) + index_is_positional = isinstance(self.index, PositionalIndexType) + index_is_numeric = isinstance(self.index.dtype, types.Number) + index_is_string = isinstance(self.index.dtype, types.UnicodeType) if ( - isinstance(idx, types.Number) and index_is_none_or_numeric or + isinstance(idx, types.Number) and index_is_numeric or (isinstance(idx, (types.UnicodeType, types.StringLiteral)) and index_is_string) ): def hpat_pandas_series_getitem_index_impl(self, idx): - index = self.index + index = self._index mask = numpy.empty(len(self._data), numpy.bool_) for i in numba.prange(len(index)): mask[i] = index[i] == idx @@ -405,7 +407,7 @@ def hpat_pandas_series_getitem_idx_slice_impl(self, idx): return hpat_pandas_series_getitem_idx_slice_impl if (isinstance(idx, (types.List, types.Array)) - and isinstance(idx.dtype, (types.Boolean, bool))): + and isinstance(idx.dtype, types.Boolean)): def hpat_pandas_series_getitem_idx_list_impl(self, idx): if len(self) != len(idx): @@ -422,11 +424,9 @@ def hpat_pandas_series_getitem_idx_list_impl(self, idx): # idx is Series and it's index is any, idx.dtype is Boolean if (isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean)): - none_indexes = isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType) - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(idx.index, types.NoneType) or check_index_is_numeric(idx))) - if not (none_or_numeric_indexes - or check_types_comparable(self.index, idx.index)): + positional_indexes = (isinstance(self.index, PositionalIndexType) + and isinstance(idx.index, PositionalIndexType)) + if 
not check_types_comparable(self.index, idx.index): msg = '{} The index of boolean indexer is not comparable to Series index.' + \ ' Given: self.index={}, idx.index={}' raise TypingError(msg.format(_func_name, self.index, idx.index)) @@ -435,7 +435,7 @@ def _series_getitem_idx_bool_indexer_impl(self, idx): # TO-DO: replace sdc_reindex_series with reindex methods and move this logic to impl # for specific index types (needs proper index type instead of types.none as index) - if none_indexes == True: # noqa + if positional_indexes == True: # noqa if len(self) > len(idx): msg = "Unalignable boolean Series provided as indexer " + \ "(index of the boolean Series and of the indexed object do not match)." @@ -455,8 +455,8 @@ def _series_getitem_idx_bool_indexer_impl(self, idx): return _series_getitem_idx_bool_indexer_impl - # idx is Series and it's index is None, idx.dtype is not Boolean - if (isinstance(idx, SeriesType) and index_is_none + # idx is Series and it's index is PositionalIndex, idx.dtype is not Boolean + if (isinstance(idx, SeriesType) and index_is_positional and not isinstance(idx.data.dtype, (types.Boolean, bool))): def hpat_pandas_series_getitem_idx_list_impl(self, idx): res = numpy.copy(self._data[:len(idx._data)]) @@ -468,8 +468,8 @@ def hpat_pandas_series_getitem_idx_list_impl(self, idx): return pandas.Series(data=res, index=index[idx._data], name=self._name) return hpat_pandas_series_getitem_idx_list_impl - # idx is Series and it's index is not None, idx.dtype is not Boolean - if (isinstance(idx, SeriesType) and not isinstance(self.index, types.NoneType) + # idx is Series and it's index is not PositionalIndex, idx.dtype is not Boolean + if (isinstance(idx, SeriesType) and not isinstance(self.index, PositionalIndexType) and not isinstance(idx.data.dtype, (types.Boolean, bool))): def hpat_pandas_series_getitem_idx_series_impl(self, idx): index = self.index @@ -602,8 +602,9 @@ def sdc_pandas_series_setitem(self, idx, value): idx_is_boolean_array = 
isinstance(idx, types.Array) and isinstance(idx.dtype, types.Boolean) idx_is_boolean_series = isinstance(idx, SeriesType) and isinstance(idx.dtype, types.Boolean) idx_and_self_index_comparable = check_types_comparable(self.index, idx) - self_index_is_none = isinstance(self.index, types.NoneType) - assign_along_positions = ((self_index_is_none + self_index_is_positional = isinstance(self.index, PositionalIndexType) + idx_index_is_positional = isinstance(idx, SeriesType) and isinstance(idx.index, PositionalIndexType) + assign_along_positions = ((self_index_is_positional or isinstance(idx, types.SliceType) or not idx_and_self_index_comparable) and not idx_is_boolean_series @@ -615,15 +616,16 @@ def sdc_pandas_series_setitem(self, idx, value): idx_is_numeric_or_boolean_series = (isinstance(idx, SeriesType) and isinstance(idx.dtype, (types.Number, types.Boolean))) assign_via_idx_mask = idx_is_scalar and idx_and_self_index_comparable - assign_via_idx_data = idx_is_numeric_or_boolean_series and not idx_and_self_index_comparable + assign_via_idx_values = (self_index_is_positional and idx_index_is_positional + or idx_is_numeric_or_boolean_series and not idx_and_self_index_comparable) def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value): if assign_via_idx_mask == True: # noqa # FIXME_Numba#5157: using asarray since eq impl for index types returns list _idx = numpy.asarray(self._index == idx) - elif assign_via_idx_data == True: # noqa - _idx = idx._data + elif assign_via_idx_values == True: # noqa + _idx = idx.values else: _idx = idx @@ -635,15 +637,11 @@ def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value): if (idx_is_boolean_array or idx_is_boolean_series) and value_is_series: - self_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype - value_index_dtype = types.int64 if isinstance(value.index, types.NoneType) else value.index.dtype - if (isinstance(self_index_dtype, types.Number) and 
isinstance(value_index_dtype, types.Number)): - indexes_common_dtype = find_common_dtype_from_numpy_dtypes([self_index_dtype, value_index_dtype], []) - elif (isinstance(self_index_dtype, types.UnicodeType) and isinstance(value_index_dtype, types.UnicodeType)): - indexes_common_dtype = types.unicode_type - else: + if not check_types_comparable(self.index, value.index): msg = '{} The self and value indexes must be comparable. Given: self.dtype={}, value.dtype={}' - raise TypingError(msg.format(_func_name, self_index_dtype, value_index_dtype)) + raise TypingError(msg.format(_func_name, self.index, value.index)) + + _, indexes_common_dtype = find_index_common_dtype(self.index, value.index) if idx_is_boolean_array: @@ -812,10 +810,9 @@ def sdc_pandas_series_setitem_idx_str_series_align_impl(self, idx, value): if number_of_found != idx_data_size: raise KeyError("Reindexing not possible: idx has index not found in Series") - if value_is_series == True: # noqa - self._data[set_positions] = value._data - else: - self._data[set_positions] = value + set_values = value if value_is_series == False else value._data # noqa + self._data[set_positions] = set_values + return self return sdc_pandas_series_setitem_idx_str_series_align_impl @@ -1654,16 +1651,10 @@ def hpat_pandas_series_index(self): ty_checker = TypeChecker(_func_name) ty_checker.check(self, SeriesType) - if isinstance(self.index, types.NoneType): - def hpat_pandas_series_index_none_impl(self): - return pandas.RangeIndex(len(self._data)) - - return hpat_pandas_series_index_none_impl - else: - def hpat_pandas_series_index_impl(self): - return self._index + def hpat_pandas_series_index_impl(self): + return self._index - return hpat_pandas_series_index_impl + return hpat_pandas_series_index_impl hpat_pandas_series_rolling = sdc_overload_method(SeriesType, 'rolling')( @@ -1907,16 +1898,19 @@ def hpat_pandas_series_astype_no_modify_impl(self, dtype, copy=True, errors='rai (isinstance(dtype, types.StringLiteral) and 
dtype.literal_value == 'str')) # Needs Numba astype impl support converting unicode_type to NumberClass and other types - if (isinstance(self.data, StringArrayType) and not str_check): - if isinstance(dtype, types.functions.NumberClass) and errors == 'raise': - raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype}') - if isinstance(dtype, types.StringLiteral) and errors == 'raise': - try: - literal_value = numpy.dtype(dtype.literal_value) - except: - pass # Will raise the exception later - else: - raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype.literal_value}') + if isinstance(self.data, StringArrayType): + if not str_check: + if isinstance(dtype, types.functions.NumberClass) and errors == 'raise': + raise TypingError(f'Needs Numba astype impl support converting unicode_type to {dtype}') + if isinstance(dtype, types.StringLiteral) and errors == 'raise': + try: + literal_value = numpy.dtype(dtype.literal_value) + except TypeError: + pass # Will raise the exception later + else: + raise TypingError(f'Needs Numba astype impl support converting unicode_type to {literal_value}') + else: + return hpat_pandas_series_astype_no_modify_impl data_narr = isinstance(self.data, types.npytypes.Array) dtype_num_liter = isinstance(dtype, (types.functions.NumberClass, types.StringLiteral)) @@ -2143,6 +2137,7 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr ty_checker = TypeChecker(_func_name) ty_checker.check(self, SeriesType) + other_is_series = isinstance(to_append, SeriesType) if not (isinstance(to_append, SeriesType) or (isinstance(to_append, (types.UniTuple, types.List)) and isinstance(to_append.dtype, SeriesType))): ty_checker.raise_exc(to_append, 'series or list/tuple of series', 'to_append') @@ -2162,17 +2157,21 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr or has_python_value(ignore_index, False) or isinstance(ignore_index, 
types.Omitted)) to_append_is_series = isinstance(to_append, SeriesType) + index_api_supported = not isinstance(self.index, sdc_old_index_types) if ignore_index_is_false: def hpat_pandas_series_append_impl(self, to_append, ignore_index=False, verify_integrity=False): if to_append_is_series == True: # noqa new_data = common_functions.hpat_arrays_append(self._data, to_append._data) - new_index = common_functions.hpat_arrays_append(self.index, to_append.index) + _self_index = self._index.values if index_api_supported == True else self._index # noqa + new_index = common_functions.hpat_arrays_append(_self_index, to_append._index) else: data_arrays_to_append = [series._data for series in to_append] - index_arrays_to_append = [series.index for series in to_append] + index_arrays_to_append = [series._index for series in to_append] + new_data = common_functions.hpat_arrays_append(self._data, data_arrays_to_append) - new_index = common_functions.hpat_arrays_append(self.index, index_arrays_to_append) + _self_index = self._index.values if index_api_supported == True else self._index # noqa + new_index = common_functions.hpat_arrays_append(_self_index, index_arrays_to_append) return pandas.Series(new_data, new_index) @@ -2230,22 +2229,18 @@ def hpat_pandas_series_copy(self, deep=True): if not isinstance(deep, (types.Omitted, types.Boolean)) and not deep: ty_checker.raise_exc(deep, 'boolean', 'deep') - if isinstance(self.index, types.NoneType): - def hpat_pandas_series_copy_impl(self, deep=True): - if deep: - return pandas.Series(data=numpy_like.copy(self._data), name=self._name) - else: - return pandas.Series(data=self._data, name=self._name) - return hpat_pandas_series_copy_impl - else: - def hpat_pandas_series_copy_impl(self, deep=True): - if deep: - return pandas.Series(data=numpy_like.copy(self._data), index=numpy_like.copy(self._index), - name=self._name) - else: - # Shallow copy of index is not supported yet - return pandas.Series(data=self._data, 
index=numpy_like.copy(self._index), name=self._name) - return hpat_pandas_series_copy_impl + index_api_supported = not isinstance(self.index, sdc_old_index_types) + + def hpat_pandas_series_copy_impl(self, deep=True): + new_series_data = numpy_like.copy(self._data) if deep else self._data + + if index_api_supported == False: # noqa + new_series_index = self._index.copy() if deep else self._index + else: + new_series_index = self._index.copy(deep=deep) + return pandas.Series(new_series_data, new_series_index, name=self._name) + + return hpat_pandas_series_copy_impl @sdc_overload_method(SeriesType, 'corr') @@ -2344,16 +2339,10 @@ def hpat_pandas_series_head(self, n=5): if not isinstance(n, (types.Integer, types.Omitted, types.NoneType)) and n != 5: ty_checker.raise_exc(n, 'int', 'n') - if isinstance(self.index, types.NoneType): - def hpat_pandas_series_head_impl(self, n=5): - return pandas.Series(data=self._data[:n], name=self._name) - - return hpat_pandas_series_head_impl - else: - def hpat_pandas_series_head_index_impl(self, n=5): - return pandas.Series(data=self._data[:n], index=self._index[:n], name=self._name) + def hpat_pandas_series_head_index_impl(self, n=5): + return pandas.Series(data=self._data[:n], index=self._index[:n], name=self._name) - return hpat_pandas_series_head_index_impl + return hpat_pandas_series_head_index_impl @sdc_overload_method(SeriesType, 'isnull') @@ -2705,14 +2694,6 @@ def hpat_pandas_series_take(self, indices, axis=0, is_copy=False): if not isinstance(indices, (types.List, types.Array)): ty_checker.raise_exc(indices, 'array-like', 'indices') - if isinstance(self.index, types.NoneType) or self.index is None: - def hpat_pandas_series_take_noindex_impl(self, indices, axis=0, is_copy=False): - local_data = [self._data[i] for i in indices] - - return pandas.Series(local_data, indices) - - return hpat_pandas_series_take_noindex_impl - def hpat_pandas_series_take_impl(self, indices, axis=0, is_copy=False): local_data = [self._data[i] for i 
in indices] local_index = [self._index[i] for i in indices] @@ -2779,7 +2760,7 @@ def hpat_pandas_series_idxmax(self, axis=None, skipna=None): if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'None', 'axis') - none_index = isinstance(self.index, types.NoneType) or self.index is None + positional_index = isinstance(self.index, PositionalIndexType) if isinstance(self.data, StringArrayType): def hpat_pandas_series_idxmax_str_impl(self, axis=None, skipna=None): if skipna is None: @@ -2788,7 +2769,7 @@ def hpat_pandas_series_idxmax_str_impl(self, axis=None, skipna=None): raise ValueError("Method idxmax(). Unsupported parameter 'skipna'=False with str data") result = numpy.argmax(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -2807,7 +2788,7 @@ def hpat_pandas_series_idxmax_impl(self, axis=None, skipna=None): else: result = numpy_like.argmax(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -2996,7 +2977,7 @@ def hpat_pandas_series_rename(self, index=None, copy=True, inplace=False, level= types.StringLiteral, types.Integer)) and level is not None: ty_checker.raise_exc(level, 'Integer or string', 'level') - def hpat_pandas_series_rename_idx_impl(self, index=None, copy=True, inplace=False, level=None): + def hpat_pandas_series_rename_impl(self, index=None, copy=True, inplace=False, level=None): if copy is True: series_data = self._data.copy() series_index = self._index.copy() @@ -3006,17 +2987,7 @@ def hpat_pandas_series_rename_idx_impl(self, index=None, copy=True, inplace=Fals return pandas.Series(data=series_data, index=series_index, name=index) - def hpat_pandas_series_rename_noidx_impl(self, index=None, copy=True, inplace=False, level=None): - if copy is True: - series_data = self._data.copy() - else: - series_data = self._data - - return 
pandas.Series(data=series_data, index=self._index, name=index) - - if isinstance(self.index, types.NoneType): - return hpat_pandas_series_rename_noidx_impl - return hpat_pandas_series_rename_idx_impl + return hpat_pandas_series_rename_impl @sdc_overload_method(SeriesType, 'min') @@ -3316,7 +3287,7 @@ def hpat_pandas_series_idxmin(self, axis=None, skipna=None): if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'None', 'axis') - none_index = isinstance(self.index, types.NoneType) or self.index is None + positional_index = isinstance(self.index, PositionalIndexType) if isinstance(self.data, StringArrayType): def hpat_pandas_series_idxmin_str_impl(self, axis=None, skipna=None): if skipna is None: @@ -3325,7 +3296,7 @@ def hpat_pandas_series_idxmin_str_impl(self, axis=None, skipna=None): raise ValueError("Method idxmin(). Unsupported parameter 'skipna'=False with str data") result = numpy.argmin(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -3344,7 +3315,7 @@ def hpat_pandas_series_idxmin_impl(self, axis=None, skipna=None): else: result = numpy_like.argmin(self._data) - if none_index == True: # noqa + if positional_index == True: # noqa return result else: return self._index[int(result)] @@ -3807,7 +3778,7 @@ def hpat_pandas_series_argsort(self, axis=0, kind='quicksort', order=None): and order is not None: ty_checker.raise_exc(order, 'None', 'order') - if not isinstance(self.index, types.NoneType): + if not isinstance(self.index, PositionalIndexType): def hpat_pandas_series_argsort_idx_impl(self, axis=0, kind='quicksort', order=None): if kind != 'quicksort' and kind != 'mergesort': raise ValueError("Method argsort(). Unsupported parameter. 
Given 'kind' != 'quicksort' or 'mergesort'") @@ -4033,21 +4004,19 @@ def hpat_pandas_series_dropna(self, axis=0, inplace=False): if not (inplace is False or isinstance(inplace, types.Omitted)): ty_checker.raise_exc(inplace, 'bool', 'inplace') - if (isinstance(self.data.dtype, types.Number) - and (isinstance(self.index, types.NoneType) - or isinstance(self.index.dtype, types.Number))): + # if both data and index are numeric (i.e. types.Array) dispatch to numpy_like.dropna impl + if (isinstance(self.dtype, types.Number) and isinstance(self.index.dtype, types.Number)): def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False): - index = self.index - return numpy_like.dropna(self._data, index, self._name) + return numpy_like.dropna(self._data, self._index, self._name) return hpat_pandas_series_dropna_impl else: def hpat_pandas_series_dropna_str_impl(self, axis=0, inplace=False): - # generate Series index if needed by using SeriesType.index (i.e. not self._index) + # TO-DO: verify these operations are fused na_data_arr = sdc.hiframes.api.get_nan_mask(self._data) data = self._data[~na_data_arr] - index = self.index[~na_data_arr] + index = self._index[~na_data_arr] return pandas.Series(data, index, self._name) return hpat_pandas_series_dropna_str_impl @@ -4529,9 +4498,7 @@ def sdc_pandas_str_series_operator_add(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -4554,7 +4521,7 @@ def _series_operator_add_scalar_impl(self, other): else: # both operands are string series # TO-DO: None indexes branch is dead code, remove? - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): + if (isinstance(self.index, PositionalIndexType) and isinstance(other.index, PositionalIndexType)): def _series_operator_add_none_indexes_impl(self, other): if (len(self._data) == len(other._data)): @@ -4583,32 +4550,18 @@ def _series_operator_add_none_indexes_impl(self, other): return _series_operator_add_none_indexes_impl else: - left_index_is_range = isinstance(self.index, RangeIndexType) - numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( - [self.index.dtype, other.index.dtype], []) - common_dtype_different = (numba_index_common_dtype != self.index.dtype - or numba_index_common_dtype != other.index.dtype) - - def _series_operator_add_common_impl(self, other): - left_index, right_index = self.index, other.index - - # TO-DO: coversion of RangeIndexType to np.array may happen several times here: - # in array_equal, in astype or left_index.values - need caching of array allocated once - - # check if indexes are equal and series don't have to be aligned - if (left_index is right_index or numpy_like.array_equal(left_index, right_index)): - result_data = self._data + other._data - - if common_dtype_different == True: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa - - return pandas.Series(result_data, index=result_index) + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = 
sdc_join_series_indexes(left_index, right_index) + def _series_operator_add_str_impl(self, other): + left_index, right_index = self._index, other._index + if index_api_supported == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) result_size = len(joined_index) result_nan_mask = numpy.zeros(result_size, dtype=numpy.bool_) result_data_as_list = [] @@ -4627,7 +4580,7 @@ def _series_operator_add_common_impl(self, other): return pandas.Series(result_data, joined_index) - return _series_operator_add_common_impl + return _series_operator_add_str_impl return None @@ -4660,9 +4613,7 @@ def sdc_pandas_str_series_operator_mul(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -4689,7 +4640,7 @@ def _series_operator_mul_scalar_impl(self, other): self_is_series = isinstance(self, SeriesType) # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): + if (isinstance(self.index, PositionalIndexType) and isinstance(other.index, PositionalIndexType)): def _series_operator_mul_none_indexes_impl(self, other): series_operand = self if self_is_series == True else other # noqa @@ -4717,32 +4668,18 @@ def _series_operator_mul_none_indexes_impl(self, other): return _series_operator_mul_none_indexes_impl else: - left_index_is_range = isinstance(self.index, RangeIndexType) - numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( - [self.index.dtype, other.index.dtype], []) - common_dtype_different = (numba_index_common_dtype != self.index.dtype - or numba_index_common_dtype != other.index.dtype) + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) def _series_operator_mul_common_impl(self, other): - left_index, right_index = self.index, other.index - - # TO-DO: coversion of RangeIndexType to np.array may happen several times here: - # in array_equal, in astype or left_index.values - need caching of array allocated once - - # check if indexes are equal and series don't have to be aligned - if (left_index is right_index or numpy_like.array_equal(left_index, right_index)): - result_data = self._data * other._data - - if common_dtype_different == True: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa - - return pandas.Series(result_data, index=result_index) - - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - 
joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) + left_index, right_index = self._index, other._index + if index_api_supported == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) str_series_operand = self if self_is_string_series == True else other # noqa str_series_indexer = left_indexer if self_is_string_series == True else right_indexer # noqa result_size = len(joined_index) diff --git a/sdc/datatypes/hpat_pandas_stringmethods_functions.py b/sdc/datatypes/hpat_pandas_stringmethods_functions.py index cd8067a0d..04616c275 100644 --- a/sdc/datatypes/hpat_pandas_stringmethods_functions.py +++ b/sdc/datatypes/hpat_pandas_stringmethods_functions.py @@ -89,7 +89,7 @@ def hpat_pandas_stringmethods_upper_impl(self): from sdc.utilities.utils import sdc_overload_method, sdc_register_jitable from sdc.hiframes.api import get_nan_mask from sdc.str_arr_ext import str_arr_set_na_by_mask, create_str_arr_from_list -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation @sdc_overload_method(StringMethodsType, 'center') diff --git a/sdc/datatypes/indexes/__init__.py b/sdc/datatypes/indexes/__init__.py new file mode 100644 index 000000000..52d144708 --- /dev/null +++ b/sdc/datatypes/indexes/__init__.py @@ -0,0 +1,32 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +# modules are dependent on each other e.g. 
positional_index_type +# needs range_index_type to be imported, so below order matters +from .range_index_type import RangeIndexType +from .positional_index_type import PositionalIndexType +from .empty_index_type import EmptyIndexType +from .int64_index_type import Int64IndexType diff --git a/sdc/datatypes/indexes/empty_index_type.py b/sdc/datatypes/indexes/empty_index_type.py new file mode 100644 index 000000000..76eda45a2 --- /dev/null +++ b/sdc/datatypes/indexes/empty_index_type.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper +) + + +class EmptyIndexType(types.Type): + + # this index represents special case of pd.Index([]) with dtype='object' + # for overload typing functions assume it has following dtype + dtype = types.pyobject + + def __init__(self, is_named=False): + self.is_named = is_named + super(EmptyIndexType, self).__init__( + name='EmptyIndexType({})'.format(is_named)) + + +@register_model(EmptyIndexType) +class EmptyIndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + name_type = types.unicode_type if fe_type.is_named else types.none + members = [ + ('name', name_type), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +# FIXME_Numba#3372: add into numba.types to allow returning from objmode +types.EmptyIndexType = EmptyIndexType + + +make_attribute_wrapper(EmptyIndexType, 'name', '_name') diff --git a/sdc/datatypes/int64_index_type.py b/sdc/datatypes/indexes/int64_index_type.py similarity index 100% rename from sdc/datatypes/int64_index_type.py rename to sdc/datatypes/indexes/int64_index_type.py diff --git a/sdc/datatypes/indexes/positional_index_type.py b/sdc/datatypes/indexes/positional_index_type.py new file mode 100644 index 000000000..2cbf1ae77 --- /dev/null +++ b/sdc/datatypes/indexes/positional_index_type.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from numba import types +from numba.extending import ( + models, + register_model, + make_attribute_wrapper +) + +from sdc.datatypes.indexes import RangeIndexType + + +class PositionalIndexType(types.IterableType): + dtype = types.int64 + + def __init__(self, is_named=False): + self.data = RangeIndexType(is_named) + self.is_named = is_named + super(PositionalIndexType, self).__init__( + name='PositionalIndexType({})'.format(is_named)) + + @property + def iterator_type(self): + res = self.data.iterator_type + return res + + +@register_model(PositionalIndexType) +class PositionalIndexModel(models.StructModel): + def __init__(self, dmm, fe_type): + + members = [ + ('data', fe_type.data), + ] + models.StructModel.__init__(self, dmm, fe_type, members) + + +# FIXME_Numba#3372: add into numba.types to allow returning from objmode +types.PositionalIndexType = PositionalIndexType + + +make_attribute_wrapper(PositionalIndexType, 'data', '_data') diff --git a/sdc/datatypes/range_index_type.py b/sdc/datatypes/indexes/range_index_type.py similarity index 100% rename from sdc/datatypes/range_index_type.py rename to sdc/datatypes/indexes/range_index_type.py diff --git a/sdc/extensions/indexes/empty_index_ext.py b/sdc/extensions/indexes/empty_index_ext.py new file mode 100644 index 000000000..470331de7 --- /dev/null +++ b/sdc/extensions/indexes/empty_index_ext.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2019-2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd + +from numba import types +from numba.core import cgutils +from numba.extending import (NativeValue, intrinsic, box, unbox, ) +from numba.core.typing.templates import signature + +from sdc.datatypes.indexes import EmptyIndexType +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method +from sdc.utilities.sdc_typing_utils import TypeChecker + + +@intrinsic +def init_empty_index(typingctx, name=None): + name = types.none if name is None else name + is_named = False if name is types.none else True + + def codegen(context, builder, sig, args): + name_val, = args + # create series struct and store values + index_struct = cgutils.create_struct_proxy( + sig.return_type)(context, builder) + + if is_named: + if isinstance(name, types.StringLiteral): + index_struct.name = numba.cpython.unicode.make_string_from_constant( + context, builder, types.unicode_type, name.literal_value) + else: + index_struct.name = name_val + + if context.enable_nrt and is_named: + context.nrt.incref(builder, sig.args[1], name_val) + + return index_struct._getvalue() + + ret_typ = EmptyIndexType(is_named) + sig = signature(ret_typ, name) + return sig, codegen + + +@box(EmptyIndexType) +def box_empty_index(typ, val, c): + + mod_name = c.context.insert_const_string(c.builder.module, "pandas") + pd_class_obj = c.pyapi.import_module_noblock(mod_name) + + empty_index = cgutils.create_struct_proxy( + typ)(c.context, c.builder, val) + + data = c.pyapi.list_new(c.context.get_constant(types.int64, 0)) + if typ.is_named: + name = c.pyapi.from_native_value(types.unicode_type, empty_index.name) + else: + name = c.pyapi.make_none() + + res = c.pyapi.call_method(pd_class_obj, "Index", (data, name)) + + c.pyapi.decref(data) + c.pyapi.decref(name) + c.pyapi.decref(pd_class_obj) 
+ return res + + +@unbox(EmptyIndexType) +def unbox_empty_index(typ, val, c): + + index_struct = cgutils.create_struct_proxy(typ)(c.context, c.builder) + + if typ.is_named: + name_obj = c.pyapi.object_getattr_string(val, "name") + index_struct.name = numba.cpython.unicode.unbox_unicode_str( + types.unicode_type, name_obj, c).value + c.pyapi.decref(name_obj) + + is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred()) + return NativeValue(index_struct._getvalue(), is_error=is_error) + + +@sdc_overload_method(EmptyIndexType, 'take') +def pd_empty_index_take_overload(self, indexes): + if not isinstance(self, EmptyIndexType): + return None + + _func_name = 'Method take().' + ty_checker = TypeChecker(_func_name) + + valid_indexes_types = (types.Array, types.List) + sdc_pandas_index_types + if not (isinstance(indexes, valid_indexes_types) and isinstance(indexes.dtype, types.Integer)): + ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes') + + def pd_empty_index_take_impl(self, indexes): + return init_empty_index(name=self._name) + + return pd_empty_index_take_impl + + +@sdc_overload(len) +def pd_empty_index_len_overload(self): + if not isinstance(self, EmptyIndexType): + return None + + def pd_empty_index_len_impl(self): + return 0 + + return pd_empty_index_len_impl diff --git a/sdc/extensions/indexes/indexes_generic.py b/sdc/extensions/indexes/indexes_generic.py index 397698565..3462067cc 100644 --- a/sdc/extensions/indexes/indexes_generic.py +++ b/sdc/extensions/indexes/indexes_generic.py @@ -30,11 +30,255 @@ import pandas as pd from numba import types +from numba.typed import Dict +from numba.typed.typedobjectutils import _nonoptional +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types, sdc_old_index_types +from sdc.datatypes.indexes import * +from sdc.utilities.utils import sdc_overload_method, sdc_overload +from sdc.utilities.sdc_typing_utils import ( + find_index_common_dtype, + sdc_indexes_wo_values_cache, + ) 
from sdc.hiframes.api import fix_df_index
from sdc.functions import numpy_like
from sdc.datatypes.common_functions import _sdc_internal_join


def sdc_numeric_indexes_equals(left, right):
    """ Stub resolved by the overload below: True if two numeric indexes hold equal values """
    pass


@sdc_overload(sdc_numeric_indexes_equals)
def sdc_numeric_indexes_equals_ovld(left, right):
    """ Compares the underlying arrays of left and right with numpy_like.array_equal """

    # NOTE(review): the guard passes when only ONE operand is an index type; the other
    # operand is then still accessed via .values unless it is a types.Array - confirm
    # that all call sites validate both operands before changing 'or' to 'and'
    if not (isinstance(left, sdc_pandas_index_types)
            or isinstance(right, sdc_pandas_index_types)):
        return None

    # anything that is not already a numpy array is converted through .values
    convert_A = not isinstance(left, types.Array)
    convert_B = not isinstance(right, types.Array)

    def sdc_numeric_indexes_equals_impl(left, right):
        left = left.values if convert_A == True else left  # noqa
        right = right.values if convert_B == True else right  # noqa

        return numpy_like.array_equal(left, right)

    return sdc_numeric_indexes_equals_impl


def sdc_indexes_attribute_dtype(self):
    """ Stub resolved by the overload below: dtype of the index values """
    pass


@sdc_overload(sdc_indexes_attribute_dtype)
def sdc_indexes_attribute_dtype_ovld(self):
    """ Returns dtype of the index data as a compile-time constant """

    if not isinstance(self, sdc_pandas_index_types):
        return None

    # resolved once at typing time and captured by the impl as a constant
    index_dtype = self.data.dtype

    def sdc_indexes_attribute_dtype_impl(self):
        return index_dtype

    return sdc_indexes_attribute_dtype_impl


def sdc_indexes_operator_eq(self, other):
    """ Stub resolved by the overload below (signature mirrors the overload) """
    pass


@sdc_overload(sdc_indexes_operator_eq)
def sdc_indexes_operator_eq_ovld(self, other):
    """ Element-wise equality of an index with another index, array or scalar """

    # TO-DO: this is for numeric indexes only now, extend to string-index when it's added
    use_self_values = isinstance(self, sdc_pandas_index_types) and not isinstance(self, types.Array)
    use_other_values = isinstance(other, sdc_pandas_index_types) and not isinstance(other, types.Array)
    one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number)

    def sdc_indexes_operator_eq_impl(self, other):

        # lengths are compared only when both operands are array-like
        if one_operand_is_scalar == False:  # noqa
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")

        left = self.values if use_self_values == True else self  # noqa
        right = other.values if use_other_values == True else other  # noqa
        return list(left == right)  # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed

    return sdc_indexes_operator_eq_impl


def sdc_indexes_reindex(self, target, method=None, level=None, limit=None, tolerance=None):
    """ Stub resolved by the overload below (signature mirrors the overload) """
    pass


@sdc_overload(sdc_indexes_reindex)
def pd_indexes_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None):
    """ Builds an indexer that maps every value of target to its position in self (-1 if absent).
        Arguments method, level, limit and tolerance are accepted but not used by the implementation """

    index_dtype = self.dtype

    def pd_indexes_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None):
        """ Simplified version of pandas.core.index.base.reindex """

        if (self is target or self.equals(target)):
            return target, None

        # build a dict of 'self' index values to their positions; positions are kept
        # as int64 to match the indexer dtype and avoid truncation on large indexes
        map_index_to_position = Dict.empty(
            key_type=index_dtype,
            value_type=types.int64
        )

        # TO-DO: needs concurrent hash map
        for i, value in enumerate(self):
            if value in map_index_to_position:
                raise ValueError("cannot reindex from a duplicate axis")
            else:
                map_index_to_position[value] = i

        res_size = len(target)
        indexer = np.empty(res_size, dtype=np.int64)
        for i in numba.prange(res_size):
            val = target[i]
            if val in map_index_to_position:
                indexer[i] = map_index_to_position[val]
            else:
                indexer[i] = -1

        return target, indexer

    return pd_indexes_index_reindex_impl


def sdc_indexes_join_outer(left, right):
    """ Stub resolved by the overload below: outer join of two indexes """
    pass


@sdc_overload(sdc_indexes_join_outer, jit_options={'parallel': False})
def pd_indexes_join_overload(left, right):
    """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""

    # check that both operands are of types used for representing Pandas indexes
    if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types)
            and not isinstance(left, EmptyIndexType)
            and not isinstance(right, EmptyIndexType)):
        return None

    int64_like_indexes = (PositionalIndexType, RangeIndexType, Int64IndexType)

    # for index types with dtype=int64 resulting index should be of Int64Index type
    if isinstance(left, int64_like_indexes) and isinstance(right, int64_like_indexes):

        def _convert_to_arrays_impl(left, right):

            # equal operands need no indexers at all
            if (left is right or left.equals(right)):
                return pd.Int64Index(left.values), None, None

            joined_data, indexer1, indexer2 = _sdc_internal_join(left.values, right.values)
            return pd.Int64Index(joined_data), indexer1, indexer2

        return _convert_to_arrays_impl

    # for joining with deprecated types.Array indexes (e.g. representing UInt64Index)
    # resulting index will be of numpy array type. TO-DO: remove once pd.Index overload
    # is supported and all indexes are represented with distinct types
    convert_left = isinstance(left, int64_like_indexes)
    convert_right = isinstance(right, int64_like_indexes)
    index_dtypes_match, res_index_dtype = find_index_common_dtype(left, right)

    def pd_indexes_join_array_indexes_impl(left, right):

        _left = left.values if convert_left == True else left  # noqa
        _right = right.values if convert_right == True else right  # noqa
        if (_left is _right
                or numpy_like.array_equal(_left, _right)):
            # equal data of different dtypes is converted to the common dtype
            if index_dtypes_match == False:  # noqa
                joined_index = numpy_like.astype(_left, res_index_dtype)
            else:
                joined_index = _left
            return joined_index, None, None

        return _sdc_internal_join(_left, _right)

    return pd_indexes_join_array_indexes_impl


def sdc_fix_indexes_join(joined, indexer1, indexer2):
    """ Stub resolved by the overload below """
    pass


@sdc_overload(sdc_fix_indexes_join)
def pd_fix_indexes_join_overload(joined, indexer1, indexer2):
    """ Wraps pandas index.join() into new function that returns indexers as arrays and not optional(array) """

    # This function is simply a workaround for problem with parfor lowering
    # broken by indexers typed as types.Optional(Array) - FIXME_Numba#XXXX: remove it
    # in all places when parfor issue is fixed
    def pd_fix_indexes_join_impl(joined, indexer1, indexer2):
        # a missing indexer means "no reordering", i.e. the identity indexer
        if indexer1 is not None:
            _indexer1 = _nonoptional(indexer1)
        else:
            _indexer1 = np.arange(len(joined))

        if indexer2 is not None:
            _indexer2 = _nonoptional(indexer2)
        else:
            # built independently of indexer1, which may be a non-trivial indexer
            _indexer2 = np.arange(len(joined))

        return joined, _indexer1, _indexer2

    return pd_fix_indexes_join_impl


def sdc_unify_index_types(left, right):
    """ Stub resolved by the overload below """
    pass


@sdc_overload(sdc_unify_index_types)
def sdc_unify_index_types_overload(left, right):
    """ For equal indexes of different dtypes produced index of common dtype """

    index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(left, right)
    is_left_index_cached = not isinstance(left, sdc_indexes_wo_values_cache)
    is_left_index_array = isinstance(left, types.Array)
    is_right_index_cached = not isinstance(right, sdc_indexes_wo_values_cache)
    is_right_index_array = isinstance(right, types.Array)

    def sdc_unify_index_types_impl(left, right):
        if index_dtypes_match == True:  # noqa
            return left
        else:
            # NOTE(review): the two 'cached' branches reuse existing data without casting
            # it to numba_index_common_dtype - confirm this is intended
            if is_left_index_cached == True:  # noqa
                index_data = left.values if is_left_index_array == False else left  # noqa
            elif is_right_index_cached == True:  # noqa
                index_data = right.values if is_right_index_array == False else right  # noqa
            else:
                # using numpy_like.astype but not index.astype since latter works differently
                index_data = numpy_like.astype(left, numba_index_common_dtype)

            return fix_df_index(index_data)

    return sdc_unify_index_types_impl


@sdc_overload(np.array)
def sdc_np_array_overload(A):
    """ Overload provides np.array(A) implementations for internal pandas index types """

    if not (isinstance(A, sdc_pandas_index_types)
            and not isinstance(A, sdc_old_index_types)):
        return None

    if isinstance(A, PositionalIndexType):
        return lambda A: np.arange(len(A))

    if isinstance(A, RangeIndexType):
        return lambda A: np.arange(A.start, A.stop, A.step)

    if isinstance(A, Int64IndexType):
        return lambda A: A._data

    # no implementation for the remaining index types
    return None
ty_checker = TypeChecker(_func_name) + convertible_indexes = (PositionalIndexType, RangeIndexType, Int64IndexType) if not (isinstance(data, (types.Array, types.List)) and isinstance(data.dtype, types.Integer) - or isinstance(data, (RangeIndexType, Int64IndexType))): + or isinstance(data, convertible_indexes)): ty_checker.raise_exc(data, 'array/list of integers or integer index', 'data') dtype_is_number_class = isinstance(dtype, types.NumberClass) @@ -108,7 +116,7 @@ def pd_int64_index_overload(data, dtype=None, copy=False, name=None): ty_checker.raise_exc(name, 'string or none', 'name') is_data_array = isinstance(data, types.Array) - is_data_index = isinstance(data, (RangeIndexType, Int64IndexType)) + is_data_index = isinstance(data, convertible_indexes) data_dtype_is_int64 = data.dtype is types.int64 def pd_int64_index_ctor_impl(data, dtype=None, copy=False, name=None): @@ -123,6 +131,7 @@ def pd_int64_index_ctor_impl(data, dtype=None, copy=False, name=None): elif is_data_index == True: # noqa _data = data.values else: + # using fix_df_index to get array since it handles index=None _data = fix_df_index(data)._data if data_dtype_is_int64 == False: # noqa @@ -212,10 +221,8 @@ def pd_int64_index_dtype_overload(self): if not isinstance(self, Int64IndexType): return None - range_index_dtype = self.dtype - def pd_int64_index_dtype_impl(self): - return range_index_dtype + return sdc_indexes_attribute_dtype(self) return pd_int64_index_dtype_impl @@ -276,7 +283,7 @@ def pd_int64_index_copy_overload(self, name=None, deep=False, dtype=None): if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None): ty_checker.raise_exc(name, 'string or none', 'name') - if not (isinstance(deep, (types.Omitted, types.Boolean)) or deep is False): + if not (isinstance(deep, (types.NoneType, types.Omitted, types.Boolean)) or deep is False): ty_checker.raise_exc(deep, 'boolean', 'deep') if not _check_dtype_param_type(dtype): @@ -326,29 +333,25 @@ def 
pd_int64_index_getitem_impl(self, idx): return pd_int64_index_getitem_impl -# TO-DO: this and many other impls are generic and should be moved to indexes_generic.py @sdc_overload(operator.eq) def pd_int64_index_eq_overload(self, other): - self_is_index = isinstance(self, Int64IndexType) - other_is_index = isinstance(other, Int64IndexType) + _func_name = 'Operator eq.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) - if not (self_is_index and other_is_index - or (self_is_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_index)): + self_is_int64_index = isinstance(self, Int64IndexType) + other_is_int64_index = isinstance(other, Int64IndexType) + + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types + if not (self_is_int64_index and other_is_int64_index + or (self_is_int64_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_int64_index)): return None - one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) def pd_int64_index_eq_impl(self, other): - - if one_operand_is_scalar == False: # noqa - if len(self) != len(other): - raise ValueError("Lengths must match to compare") - - # names do not matter when comparing pd.Int64Index - left = self.values if self_is_index == True else self # noqa - right = other.values if other_is_index == True else other # noqa - return list(left == right) # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed + return sdc_indexes_operator_eq(self, other) return pd_int64_index_eq_impl @@ -356,12 +359,18 @@ def pd_int64_index_eq_impl(self, other): @sdc_overload(operator.ne) def pd_int64_index_ne_overload(self, other): - self_is_index = isinstance(self, Int64IndexType) - 
@sdc_overload_method(Int64IndexType, 'equals')
def pd_int64_index_equals_overload(self, other):
    """ Int64Index.equals(other): True if both indexes hold the same values.

    Raises SDCLimitation if other is not one of the supported index types and
    TypingError if the two index types are not comparable. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method equals().'
    if not isinstance(other, sdc_pandas_index_types):
        raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}")

    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, other={}'.format(_func_name, self, other))

    def pd_int64_index_equals_impl(self, other):
        return sdc_numeric_indexes_equals(self, other)

    return pd_int64_index_equals_impl


@sdc_overload_method(Int64IndexType, 'reindex')
def pd_int64_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None):
    """ Int64Index.reindex(target): delegates to the generic sdc_indexes_reindex.

    Raises SDCLimitation if target is not one of the supported index types and
    TypingError if the two index types are not comparable. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method reindex().'
    if not isinstance(target, sdc_pandas_index_types):
        raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}")

    if not check_types_comparable(self, target):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, target={}'.format(_func_name, self, target))

    def pd_int64_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None):
        # every argument is forwarded, including limit
        return sdc_indexes_reindex(
            self, target=target, method=method, level=level, limit=limit, tolerance=tolerance)

    return pd_int64_index_reindex_impl


@sdc_overload_method(Int64IndexType, 'take')
def pd_int64_index_take_overload(self, indexes):
    """ Int64Index.take(indexes): new Int64Index (same name) built from the given positions.

    indexes may be an array, list or index of integers, or a typed list of lists. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method take().'
    ty_checker = TypeChecker(_func_name)

    valid_indexes_types = (types.Array, types.List, types.ListType) + sdc_pandas_index_types
    if not (isinstance(indexes, valid_indexes_types)
            and isinstance(indexes.dtype, (types.Integer, types.ListType))):
        ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes')

    # separate handling when indexes is nested lists produces with parallel impls
    if isinstance(indexes.dtype, types.ListType):
        def pd_int64_index_take_chunked_impl(self, indexes):
            new_index_data = numpy_like.take(self.values, indexes)
            return pd.Int64Index(new_index_data, name=self._name)

        return pd_int64_index_take_chunked_impl

    # index objects (but not plain arrays) are converted through .values
    convert_target = isinstance(indexes, sdc_pandas_index_types) and not isinstance(indexes, types.Array)

    def pd_int64_index_take_impl(self, indexes):
        _indexes = indexes.values if convert_target == True else indexes  # noqa
        new_index_data = numpy_like.take(self._data, _indexes)
        return pd.Int64Index(new_index_data, name=self._name)

    return pd_int64_index_take_impl


@sdc_overload_method(Int64IndexType, 'append')
def pd_int64_index_append_overload(self, other):
    """ Int64Index.append(other): concatenates the data of self and other.

    Returns pd.Int64Index when the common dtype is int64, otherwise a numpy array. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method append().'
    ty_checker = TypeChecker(_func_name)

    if not isinstance(other, sdc_pandas_index_types):
        ty_checker.raise_exc(other, 'pandas index', 'other')

    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, other={}'.format(_func_name, self, other))

    convert_other = not isinstance(other, types.Array)
    _, res_index_dtype = find_index_common_dtype(self, other)
    return_as_array_index = res_index_dtype is not types.int64

    def pd_int64_index_append_impl(self, other):
        _other = other.values if convert_other == True else other  # noqa
        new_index_data = hpat_arrays_append(self._data, _other)
        # this is only needed while some indexes are represented with arrays
        # TO-DO: support pd.Index() overload with dtype arg to create indexes
        if return_as_array_index == False:  # noqa
            return pd.Int64Index(new_index_data)
        else:
            return new_index_data

    return pd_int64_index_append_impl


@sdc_overload_method(Int64IndexType, 'join')
def pd_int64_index_join_overload(self, other, how, level=None, return_indexers=False, sort=False):
    """ Int64Index.join(other, how='outer'): outer join of two indexes.

    Returns the joined index, or the tuple (joined index, indexer for self, indexer for other)
    when return_indexers is True. Raises SDCLimitation for any how other than 'outer'. """
    if not isinstance(self, Int64IndexType):
        return None

    _func_name = 'Method join().'
    ty_checker = TypeChecker(_func_name)

    if not isinstance(other, sdc_pandas_index_types):
        ty_checker.raise_exc(other, 'pandas index', 'other')

    if not isinstance(how, types.StringLiteral):
        ty_checker.raise_exc(how, 'string', 'how')
    if not how.literal_value == 'outer':
        raise SDCLimitation(f"{_func_name} Only supporting 'outer' now. Given 'how': {how.literal_value}")

    if not (isinstance(level, (types.Omitted, types.NoneType)) or level is None):
        ty_checker.raise_exc(level, 'None', 'level')

    if not (isinstance(return_indexers, (types.Omitted, BooleanLiteral)) or return_indexers is False):
        ty_checker.raise_exc(return_indexers, 'boolean', 'return_indexers')

    if not (isinstance(sort, (types.Omitted, types.Boolean)) or sort is False):
        ty_checker.raise_exc(sort, 'boolean', 'sort')

    # return_indexers arrives as a literal type, as types.Omitted (argument not passed)
    # or as the plain Python default; only the literal type has 'literal_value'
    if isinstance(return_indexers, BooleanLiteral):
        _return_indexers = return_indexers.literal_value
    elif isinstance(return_indexers, types.Omitted):
        _return_indexers = return_indexers.value
    else:
        _return_indexers = False

    def pd_int64_index_join_impl(self, other, how, level=None, return_indexers=False, sort=False):

        if _return_indexers == True:  # noqa
            return sdc_indexes_join_outer(self, other)
        else:
            # the generic join always returns (joined, indexer1, indexer2)
            return sdc_indexes_join_outer(self, other)[0]

    return pd_int64_index_join_impl
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

import numba
import numpy as np
import operator
import pandas as pd

from numba import types
from numba.core import cgutils
from numba.extending import (NativeValue, intrinsic, box, unbox, lower_builtin, )
from numba.core.errors import TypingError
from numba.core.typing.templates import signature
from numba.core.imputils import impl_ret_untracked, call_getiter

from sdc.datatypes.indexes import PositionalIndexType, RangeIndexType
from sdc.datatypes.indexes.range_index_type import RangeIndexDataType
from sdc.utilities.sdc_typing_utils import SDCLimitation
from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral
from sdc.extensions.indexes.range_index_ext import box_range_index, unbox_range_index
from sdc.utilities.sdc_typing_utils import (
    TypeChecker,
    _check_dtype_param_type,
    check_types_comparable,
    sdc_pandas_index_types,
    )
from sdc.extensions.indexes.indexes_generic import *


@intrinsic
def init_positional_index(typingctx, size, name=None):
    """ Creates a PositionalIndexType instance wrapping pd.RangeIndex(size, name=name) """
    name = types.none if name is None else name
    is_named = name is not types.none

    ret_typ = PositionalIndexType(is_named)
    inner_sig = signature(ret_typ.data, size, name)

    def codegen(context, builder, sig, args):
        data_val, name_val = args

        # create positional_index struct and store created instance
        # of RangeIndexType as data member
        positional_index = cgutils.create_struct_proxy(
            sig.return_type)(context, builder)
        positional_index.data = context.compile_internal(
            builder,
            lambda size, name: pd.RangeIndex(size, name=name),
            inner_sig,
            [data_val, name_val]
        )

        return positional_index._getvalue()

    sig = signature(ret_typ, size, name)
    return sig, codegen


@box(PositionalIndexType)
def box_positional_index(typ, val, c):
    """ Boxes the wrapped range index using box_range_index """

    positional_index = numba.core.cgutils.create_struct_proxy(typ)(c.context, c.builder, val)
    data_range_index = numba.core.cgutils.create_struct_proxy(typ.data)(
        c.context, c.builder, positional_index.data)
    return box_range_index(typ.data, data_range_index._getvalue(), c)


@unbox(PositionalIndexType)
def unbox_positional_index(typ, val, c):
    """ Unboxes val with unbox_range_index and stores the result as the data member """

    positional_index = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    res = unbox_range_index(typ.data, val, c)
    positional_index.data = res.value
    is_error = res.is_error

    return NativeValue(positional_index._getvalue(), is_error=is_error)


@sdc_overload_attribute(PositionalIndexType, 'start')
def pd_positional_index_start_overload(self):
    """ Attribute start: taken from the wrapped range index """
    if not isinstance(self, PositionalIndexType):
        return None

    def pd_positional_index_start_impl(self):
        _self = self._data
        return _self.start

    return pd_positional_index_start_impl


@sdc_overload_attribute(PositionalIndexType, 'stop')
def pd_positional_index_stop_overload(self):
    """ Attribute stop: taken from the wrapped range index """
    if not isinstance(self, PositionalIndexType):
        return None

    def pd_positional_index_stop_impl(self):
        _self = self._data
        return _self.stop

    return pd_positional_index_stop_impl


@sdc_overload_attribute(PositionalIndexType, 'step')
def pd_positional_index_step_overload(self):
    """ Attribute step: taken from the wrapped range index """
    if not isinstance(self, PositionalIndexType):
        return None

    def pd_positional_index_step_impl(self):
        _self = self._data
        return _self.step

    return pd_positional_index_step_impl


@sdc_overload_attribute(PositionalIndexType, 'name')
def pd_positional_index_name_overload(self):
    """ Attribute name: name of the wrapped range index, or None for unnamed index type """
    if not isinstance(self, PositionalIndexType):
        return None

    is_named_index = self.is_named

    def pd_positional_index_name_impl(self):
        _self = self._data
        if is_named_index == True:  # noqa
            return _self.name
        else:
            return None

    return pd_positional_index_name_impl


@sdc_overload_attribute(PositionalIndexType, 'dtype')
def pd_positional_index_dtype_overload(self):
    """ Attribute dtype: delegates to the generic sdc_indexes_attribute_dtype """
    if not isinstance(self, PositionalIndexType):
        return None

    def pd_positional_index_dtype_impl(self):
        return sdc_indexes_attribute_dtype(self)

    return pd_positional_index_dtype_impl


@sdc_overload_attribute(PositionalIndexType, 'values')
def pd_positional_index_values_overload(self):
    """ Attribute values: index values as produced by np.array(self) """
    if not isinstance(self, PositionalIndexType):
        return None

    def pd_positional_index_values_impl(self):
        # TO-DO: add caching when Numba supports writable attributes?
        return np.array(self)

    return pd_positional_index_values_impl


@sdc_overload(len)
def pd_positional_index_len_overload(self):
    """ len(index): length of the wrapped range index """
    if not isinstance(self, PositionalIndexType):
        return None

    def pd_positional_index_len_impl(self):
        return len(self._data)

    return pd_positional_index_len_impl


# NOTE: the 'pd_range_index_' prefix is kept as is, although this overload
# is the one for PositionalIndexType
@sdc_overload(operator.contains)
def pd_range_index_contains_overload(self, val):
    """ val in index: membership test delegated to the wrapped range index """
    if not isinstance(self, PositionalIndexType):
        return None

    def pd_range_index_contains_impl(self, val):
        _self = self._data
        return val in _self

    return pd_range_index_contains_impl


@sdc_overload_method(PositionalIndexType, 'copy')
def pd_positional_index_copy_overload(self, name=None, deep=False, dtype=None):
    """ PositionalIndex.copy(): new positional index of the same length.

    The name of self is kept when name is not given and self is a named index. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Method copy().'
    ty_checker = TypeChecker(_func_name)

    if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None):
        ty_checker.raise_exc(name, 'string or none', 'name')

    if not (isinstance(deep, (types.NoneType, types.Omitted, types.Boolean)) or deep is False):
        ty_checker.raise_exc(deep, 'boolean', 'deep')

    if not _check_dtype_param_type(dtype):
        ty_checker.raise_exc(dtype, 'int64 dtype', 'dtype')

    name_is_none = isinstance(name, (types.NoneType, types.Omitted)) or name is None
    keep_name = name_is_none and self.is_named

    def pd_positional_index_copy_impl(self, name=None, deep=False, dtype=None):

        _name = self.name if keep_name == True else name  # noqa
        return init_positional_index(len(self), _name)

    return pd_positional_index_copy_impl


@sdc_overload(operator.getitem)
def pd_positional_index_getitem_overload(self, idx):
    """ index[idx]: indexing delegated to the wrapped range index.

    idx may be an integer, a slice or an array/list of integers or booleans. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Operator getitem().'
    ty_checker = TypeChecker(_func_name)

    if not (isinstance(idx, (types.Integer, types.SliceType))
            or isinstance(idx, (types.Array, types.List)) and isinstance(idx.dtype, (types.Integer, types.Boolean))):
        ty_checker.raise_exc(idx, 'integer, slice, integer array or list', 'idx')

    def pd_positional_index_getitem_impl(self, idx):
        _self = self._data
        return _self[idx]

    return pd_positional_index_getitem_impl


@sdc_overload(operator.eq)
def pd_positional_index_eq_overload(self, other):
    """ Operator ==: applies when at least one operand is a positional index.

    Raises TypingError if the operand types are not comparable. """

    _func_name = 'Operator eq.'
    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, other={}'.format(_func_name, self, other))

    self_is_positional_index = isinstance(self, PositionalIndexType)
    other_is_positional_index = isinstance(other, PositionalIndexType)

    possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types
    if not (self_is_positional_index and other_is_positional_index
            or (self_is_positional_index and isinstance(other, possible_arg_types))
            or (isinstance(self, possible_arg_types) and other_is_positional_index)):
        return None

    def pd_positional_index_eq_impl(self, other):
        return sdc_indexes_operator_eq(self, other)

    return pd_positional_index_eq_impl


@sdc_overload(operator.ne)
def pd_positional_index_ne_overload(self, other):
    """ Operator !=: element-wise negation of the == result.

    Raises TypingError if the operand types are not comparable. """

    _func_name = 'Operator ne.'
    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, other={}'.format(_func_name, self, other))

    self_is_positional_index = isinstance(self, PositionalIndexType)
    other_is_positional_index = isinstance(other, PositionalIndexType)

    possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types
    if not (self_is_positional_index and other_is_positional_index
            or (self_is_positional_index and isinstance(other, possible_arg_types))
            or (isinstance(self, possible_arg_types) and other_is_positional_index)):
        return None

    def pd_positional_index_ne_impl(self, other):

        eq_res = np.asarray(self == other)  # FIXME_Numba#5157: remove np.asarray and return as list
        return list(~eq_res)

    return pd_positional_index_ne_impl


@lower_builtin(operator.is_, PositionalIndexType, PositionalIndexType)
def pd_positional_index_is_overload(context, builder, sig, args):
    """ Lowering of 'lhs is rhs': false for different types, otherwise compares
        the first operand of each value converted to an integer """

    ty_lhs, ty_rhs = sig.args
    if ty_lhs != ty_rhs:
        return cgutils.false_bit

    lhs, rhs = args
    lhs_ptr = builder.ptrtoint(lhs.operands[0], cgutils.intp_t)
    rhs_ptr = builder.ptrtoint(rhs.operands[0], cgutils.intp_t)
    return builder.icmp_signed('==', lhs_ptr, rhs_ptr)


@lower_builtin('getiter', PositionalIndexType)
def pd_positional_index_getiter(context, builder, sig, args):
    """ Returns a new iterator object for PositionalIndexType by delegating to range.__iter__ """
    (value,) = args
    positional_index = cgutils.create_struct_proxy(sig.args[0])(context, builder, value)
    range_index = cgutils.create_struct_proxy(sig.args[0].data)(context, builder, positional_index.data)
    res = call_getiter(context, builder, RangeIndexDataType, range_index.data)
    return impl_ret_untracked(context, builder, PositionalIndexType, res)


@sdc_overload_method(PositionalIndexType, 'ravel')
def pd_positional_index_ravel_overload(self, order='C'):
    """ PositionalIndex.ravel(): values of the wrapped range index.

    Raises TypingError for unsupported type of order. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Method ravel().'
    # np.ravel argument order is not supported in Numba
    if not (isinstance(order, (types.Omitted, types.StringLiteral, types.UnicodeType)) or order == 'C'):
        raise TypingError('{} Unsupported parameters. Given order: {}'.format(_func_name, order))

    def pd_positional_index_ravel_impl(self, order='C'):
        _self = self._data
        return _self.values

    return pd_positional_index_ravel_impl


@sdc_overload_method(PositionalIndexType, 'equals')
def pd_positional_index_equals_overload(self, other):
    """ PositionalIndex.equals(other): delegated to equals() of the wrapped range index.

    Raises SDCLimitation if other is not one of the supported index types and
    TypingError if the two index types are not comparable. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Method equals().'
    if not isinstance(other, sdc_pandas_index_types):
        raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}")

    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, other={}'.format(_func_name, self, other))

    def pd_positional_index_equals_impl(self, other):

        _self = self._data
        return _self.equals(other)

    return pd_positional_index_equals_impl


@sdc_overload_method(PositionalIndexType, 'reindex')
def pd_positional_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None):
    """ PositionalIndex.reindex(target): delegates to the generic sdc_indexes_reindex.

    Raises SDCLimitation if target is not one of the supported index types and
    TypingError if the two index types are not comparable. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Method reindex().'
    if not isinstance(target, sdc_pandas_index_types):
        raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}")

    if not check_types_comparable(self, target):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, target={}'.format(_func_name, self, target))

    def pd_positional_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None):
        # every argument is forwarded, including limit
        return sdc_indexes_reindex(
            self, target=target, method=method, level=level, limit=limit, tolerance=tolerance)

    return pd_positional_index_reindex_impl


@sdc_overload_method(PositionalIndexType, 'take')
def pd_positional_index_take_overload(self, indexes):
    """ PositionalIndex.take(indexes): delegated to take() of the wrapped range index.

    indexes may be an array, list or index of integers. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Method take().'
    ty_checker = TypeChecker(_func_name)

    valid_indexes_types = (types.Array, types.List) + sdc_pandas_index_types
    if not (isinstance(indexes, valid_indexes_types) and isinstance(indexes.dtype, types.Integer)):
        ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes')

    def pd_positional_index_take_impl(self, indexes):
        _self = self._data
        return _self.take(indexes)

    return pd_positional_index_take_impl


@sdc_overload_method(PositionalIndexType, 'append')
def pd_positional_index_append_overload(self, other):
    """ PositionalIndex.append(other): delegated to append() of the wrapped range index.

    Raises TypingError if the two index types are not comparable. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Method append().'
    ty_checker = TypeChecker(_func_name)

    if not isinstance(other, sdc_pandas_index_types):
        ty_checker.raise_exc(other, 'pandas index', 'other')

    if not check_types_comparable(self, other):
        raise TypingError('{} Not allowed for non comparable indexes. '
                          'Given: self={}, other={}'.format(_func_name, self, other))

    def pd_positional_index_append_impl(self, other):
        _self = self._data
        return _self.append(other)

    return pd_positional_index_append_impl


@sdc_overload_method(PositionalIndexType, 'join')
def pd_positional_index_join_overload(self, other, how, level=None, return_indexers=False, sort=False):
    """ PositionalIndex.join(other, how='outer'): outer join of two indexes.

    Joining two positional indexes gives a positional index of the larger size;
    any other supported index type goes through the generic sdc_indexes_join_outer.
    Returns the joined index, or the tuple (joined index, indexer for self, indexer for other)
    when return_indexers is True. Raises SDCLimitation for any how other than 'outer'. """
    if not isinstance(self, PositionalIndexType):
        return None

    _func_name = 'Method join().'
    ty_checker = TypeChecker(_func_name)

    if not isinstance(other, sdc_pandas_index_types):
        ty_checker.raise_exc(other, 'pandas index', 'other')

    if not isinstance(how, types.StringLiteral):
        ty_checker.raise_exc(how, 'string', 'how')
    if not how.literal_value == 'outer':
        raise SDCLimitation(f"{_func_name} Only supporting 'outer' now. Given 'how': {how.literal_value}")

    if not (isinstance(level, (types.Omitted, types.NoneType)) or level is None):
        ty_checker.raise_exc(level, 'None', 'level')

    if not (isinstance(return_indexers, (types.Omitted, BooleanLiteral)) or return_indexers is False):
        ty_checker.raise_exc(return_indexers, 'boolean', 'return_indexers')

    if not (isinstance(sort, (types.Omitted, types.Boolean)) or sort is False):
        ty_checker.raise_exc(sort, 'boolean', 'sort')

    # return_indexers arrives as a literal type, as types.Omitted (argument not passed)
    # or as the plain Python default; only the literal type has 'literal_value'
    if isinstance(return_indexers, BooleanLiteral):
        _return_indexers = return_indexers.literal_value
    elif isinstance(return_indexers, types.Omitted):
        _return_indexers = return_indexers.value
    else:
        _return_indexers = False

    if isinstance(other, PositionalIndexType):

        def pd_indexes_join_positional_impl(self, other, how, level=None, return_indexers=False, sort=False):
            self_size, other_size = len(self), len(other)
            min_size = min(self_size, other_size)
            max_size = max(self_size, other_size)

            joined_index = init_positional_index(max_size)
            if _return_indexers == True:  # noqa
                # no indexers are needed for indexes of equal size
                self_indexer = None if self_size == other_size else np.arange(max_size)
                other_indexer = None if self_size == other_size else np.arange(max_size)
                # positions past the end of the shorter index have no match
                if self_size > other_size:
                    other_indexer[min_size:] = -1
                elif self_size < other_size:
                    self_indexer[min_size:] = -1

                result = joined_index, self_indexer, other_indexer
            else:
                result = joined_index

            return result

        return pd_indexes_join_positional_impl

    else:

        def pd_positional_index_join_common_impl(self, other, how, level=None, return_indexers=False, sort=False):
            if _return_indexers == True:  # noqa
                return sdc_indexes_join_outer(self, other)
            else:
                return sdc_indexes_join_outer(self, other)[0]

        return pd_positional_index_join_common_impl
impl_ret_untracked, call_getiter -from sdc.datatypes.range_index_type import RangeIndexType, RangeIndexDataType -from sdc.datatypes.common_functions import SDCLimitation, _sdc_take -from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method -from sdc.utilities.sdc_typing_utils import TypeChecker, check_is_numeric_array, check_signed_integer -from sdc.functions.numpy_like import getitem_by_mask -from sdc.functions.numpy_like import astype as nplike_astype -from numba.core.boxing import box_array, unbox_array -from sdc.extensions.indexes.indexes_generic import _check_dtype_param_type +from sdc.datatypes.indexes import PositionalIndexType, RangeIndexType +from sdc.datatypes.indexes.range_index_type import RangeIndexDataType +from sdc.utilities.sdc_typing_utils import SDCLimitation +from sdc.utilities.utils import sdc_overload, sdc_overload_attribute, sdc_overload_method, BooleanLiteral +from sdc.utilities.sdc_typing_utils import ( + TypeChecker, + check_signed_integer, + sdc_pandas_index_types, + check_types_comparable, + _check_dtype_param_type, + sdc_indexes_range_like, + ) +from sdc.functions import numpy_like +from sdc.extensions.indexes.indexes_generic import * @intrinsic @@ -144,8 +150,19 @@ def pd_range_index_ctor_impl(start=None, stop=None, step=None, dtype=None, copy= @typeof_impl.register(pd.RangeIndex) def typeof_range_index(val, c): + # Note: unboxing pd.RangeIndex creates instance of PositionalIndexType + # if index values are trivial range, but creating pd.RangeIndex() with same + # parameters via ctor will create instance of RangeIndexType. 
+ + # This is needed for specializing of Series and DF methods on combination of + # index types and preserving PositionalIndexType as result index type (when possible), + # since in pandas operations on two range indexes may give: + # either RangeIndex or Int64Index (in common case) is_named = val.name is not None - return RangeIndexType(is_named=is_named) + if not (val.start == 0 and val.stop > 0 and val.step == 1): + return RangeIndexType(is_named=is_named) + else: + return PositionalIndexType(is_named=is_named) @box(RangeIndexType) @@ -266,10 +283,8 @@ def pd_range_index_dtype_overload(self): if not isinstance(self, RangeIndexType): return None - range_index_dtype = self.dtype - def pd_range_index_dtype_impl(self): - return range_index_dtype + return sdc_indexes_attribute_dtype(self) return pd_range_index_dtype_impl @@ -319,7 +334,7 @@ def pd_range_index_copy_overload(self, name=None, deep=False, dtype=None): if not (isinstance(name, (types.NoneType, types.Omitted, types.UnicodeType)) or name is None): ty_checker.raise_exc(name, 'string or none', 'name') - if not (isinstance(deep, (types.Omitted, types.Boolean)) or deep is False): + if not (isinstance(deep, (types.NoneType, types.Omitted, types.Boolean)) or deep is False): ty_checker.raise_exc(deep, 'boolean', 'deep') if not _check_dtype_param_type(dtype): @@ -374,13 +389,13 @@ def pd_range_index_getitem_impl(self, idx): if isinstance(idx.dtype, types.Integer): def pd_range_index_getitem_impl(self, idx): - res_as_arr = _sdc_take(self, idx) + res_as_arr = self.take(idx) return pd.Int64Index(res_as_arr, name=self._name) return pd_range_index_getitem_impl elif isinstance(idx.dtype, types.Boolean): def pd_range_index_getitem_impl(self, idx): - return getitem_by_mask(self, idx) + return numpy_like.getitem_by_mask(self, idx) return pd_range_index_getitem_impl @@ -388,25 +403,22 @@ def pd_range_index_getitem_impl(self, idx): @sdc_overload(operator.eq) def pd_range_index_eq_overload(self, other): + _func_name = 
'Operator eq.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + self_is_range_index = isinstance(self, RangeIndexType) other_is_range_index = isinstance(other, RangeIndexType) + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types if not (self_is_range_index and other_is_range_index - or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_range_index)): + or (self_is_range_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_range_index)): return None - one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number) def pd_range_index_eq_impl(self, other): - - if one_operand_is_scalar == False: # noqa - if len(self) != len(other): - raise ValueError("Lengths must match to compare") - - # names do not matter when comparing pd.RangeIndex - left = self.values if self_is_range_index == True else self # noqa - right = other.values if other_is_range_index == True else other # noqa - return list(left == right) # FIXME_Numba#5157: result must be np.array, remove list when Numba is fixed + return sdc_indexes_operator_eq(self, other) return pd_range_index_eq_impl @@ -414,12 +426,18 @@ def pd_range_index_eq_impl(self, other): @sdc_overload(operator.ne) def pd_range_index_ne_overload(self, other): + _func_name = 'Operator ne.' + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. 
\ + Given: self={}, other={}'.format(_func_name, self, other)) + self_is_range_index = isinstance(self, RangeIndexType) other_is_range_index = isinstance(other, RangeIndexType) + possible_arg_types = (types.Array, types.Number) + sdc_pandas_index_types if not (self_is_range_index and other_is_range_index - or (self_is_range_index and (check_is_numeric_array(other) or isinstance(other, types.Number))) - or ((check_is_numeric_array(self) or isinstance(self, types.Number)) and other_is_range_index)): + or (self_is_range_index and isinstance(other, possible_arg_types)) + or (isinstance(self, possible_arg_types) and other_is_range_index)): return None def pd_range_index_ne_impl(self, other): @@ -470,3 +488,134 @@ def pd_range_index_ravel_impl(self, order='C'): return self.values return pd_range_index_ravel_impl + + +@sdc_overload_method(RangeIndexType, 'equals') +def pd_range_index_equals_overload(self, other): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method equals().' + if not isinstance(other, sdc_pandas_index_types): + raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'other': {other}") + + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. 
\ + Given: self={}, other={}'.format(_func_name, self, other)) + + if isinstance(other, sdc_indexes_range_like): + + def pd_range_index_equals_impl(self, other): + + if len(self) != len(other): + return False + if len(self) == 0: + return True + + if len(self) == 1: + return self.start == other.start + + return self.start == other.start and self.step == other.step + else: + + def pd_range_index_equals_impl(self, other): + return sdc_numeric_indexes_equals(self, other) + + return pd_range_index_equals_impl + + +@sdc_overload_method(RangeIndexType, 'reindex') +def pd_range_index_reindex_overload(self, target, method=None, level=None, limit=None, tolerance=None): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method reindex().' + if not isinstance(target, sdc_pandas_index_types): + raise SDCLimitation(f"{_func_name} Unsupported parameter. Given 'target': {target}") + + if not check_types_comparable(self, target): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, target={}'.format(_func_name, self, target)) + + def pd_range_index_reindex_impl(self, target, method=None, level=None, limit=None, tolerance=None): + return sdc_indexes_reindex(self, target=target, method=method, level=level, tolerance=tolerance) + + return pd_range_index_reindex_impl + + +@sdc_overload_method(RangeIndexType, 'take') +def pd_range_index_take_overload(self, indexes): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method take().' 
+ ty_checker = TypeChecker(_func_name) + + valid_indexes_types = (types.Array, types.List) + sdc_pandas_index_types + if not (isinstance(indexes, valid_indexes_types) and isinstance(indexes.dtype, types.Integer)): + ty_checker.raise_exc(indexes, 'array/list of integers or integer index', 'indexes') + + def pd_range_index_take_impl(self, indexes): + _self = pd.Int64Index(self.values, name=self._name) + return _self.take(indexes) + + return pd_range_index_take_impl + + +@sdc_overload_method(RangeIndexType, 'append') +def pd_range_index_append_overload(self, other): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method append().' + ty_checker = TypeChecker(_func_name) + + if not isinstance(other, sdc_pandas_index_types): + ty_checker.raise_exc(other, 'pandas index', 'other') + + if not check_types_comparable(self, other): + raise TypingError('{} Not allowed for non comparable indexes. \ + Given: self={}, other={}'.format(_func_name, self, other)) + + def pd_range_index_append_impl(self, other): + int64_index = pd.Int64Index(self.values, name=self._name) + return int64_index.append(other) + + return pd_range_index_append_impl + + +@sdc_overload_method(RangeIndexType, 'join') +def pd_range_index_join_overload(self, other, how, level=None, return_indexers=False, sort=False): + if not isinstance(self, RangeIndexType): + return None + + _func_name = 'Method join().' + ty_checker = TypeChecker(_func_name) + + if not isinstance(other, sdc_pandas_index_types): + ty_checker.raise_exc(other, 'pandas index', 'other') + + if not isinstance(how, types.StringLiteral): + ty_checker.raise_exc(how, 'string', 'how') + if not how.literal_value == 'outer': + raise SDCLimitation(f"{_func_name} Only supporting 'outer' now. 
Given 'how': {how.literal_value}") + + if not (isinstance(level, (types.Omitted, types.NoneType)) or level is None): + ty_checker.raise_exc(level, 'None', 'level') + + if not (isinstance(return_indexers, (types.Omitted, BooleanLiteral)) or return_indexers is False): + ty_checker.raise_exc(return_indexers, 'boolean', 'return_indexers') + + if not (isinstance(sort, (types.Omitted, types.Boolean)) or sort is False): + ty_checker.raise_exc(sort, 'boolean', 'sort') + + _return_indexers = return_indexers.literal_value + + def pd_range_index_join_impl(self, other, how, level=None, return_indexers=False, sort=False): + if _return_indexers == True: # noqa + return sdc_indexes_join_outer(self, other) + else: + joined_index, = sdc_indexes_join_outer(self, other) + return joined_index + + return pd_range_index_join_impl diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index aa9e00a70..8636e0bc2 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -46,19 +46,19 @@ import sdc from sdc.functions.statistics import skew_formula from sdc.hiframes.api import isna -from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType + +from sdc.datatypes.indexes import * from sdc.utilities.sdc_typing_utils import TypeChecker, is_default from sdc.utilities.utils import (sdc_overload, sdc_register_jitable, min_dtype_int_val, max_dtype_int_val, min_dtype_float_val, max_dtype_float_val) from sdc.str_arr_ext import (StringArrayType, pre_alloc_string_array, get_utf8_size, string_array_type, create_str_arr_from_list, str_arr_set_na_by_mask, - num_total_chars, str_arr_is_na) + num_total_chars, str_arr_is_na, str_arr_set_na) from sdc.utilities.prange_utils import parallel_chunks -from sdc.utilities.sdc_typing_utils import check_types_comparable +from sdc.utilities.sdc_typing_utils import check_types_comparable, SDCLimitation from sdc.functions.sort import parallel_sort, parallel_stable_sort, 
parallel_argsort, parallel_stable_argsort -from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types +from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types, sdc_pandas_df_column_types def astype(self, dtype): @@ -123,11 +123,13 @@ def sdc_astype_overload(self, dtype): """ ty_checker = TypeChecker("numpy-like 'astype'") - valid_self_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(self, valid_self_types) - and not isinstance(self, types.NoneType)): + valid_self_types = sdc_pandas_df_column_types + if not (isinstance(self, valid_self_types)): return None + if isinstance(self, StringArrayType): + return SDCLimitation("numpy_like.astype not implemented for string data") + accepted_dtype_types = (types.functions.NumberClass, types.Function, types.StringLiteral) if not isinstance(dtype, accepted_dtype_types): def impl(self, dtype): @@ -161,7 +163,7 @@ def sdc_astype_number_to_string_impl(self, dtype): return sdc_astype_number_to_string_impl - if (isinstance(self, (types.Array, RangeIndexType, Int64IndexType)) + if (isinstance(self, types.Array) and isinstance(dtype, (types.StringLiteral, types.functions.NumberClass))): def sdc_astype_number_impl(self, dtype): arr = numpy.empty(len(self), dtype=numpy.dtype(dtype)) @@ -349,9 +351,8 @@ def sdc_copy_overload(self): Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k copy """ - valid_self_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(self, valid_self_types) - and not isinstance(self, types.NoneType)): + valid_self_types = sdc_pandas_df_column_types + if not (isinstance(self, valid_self_types)): return None if isinstance(self, types.Array): @@ -367,12 +368,14 @@ def sdc_copy_array_impl(self): return sdc_copy_array_impl - if isinstance(self, (StringArrayType, RangeIndexType, Int64IndexType)): + elif isinstance(self, StringArrayType): def sdc_copy_str_arr_impl(self): return self.copy() return sdc_copy_str_arr_impl + else: + return None @sdc_overload(notnan) def 
sdc_notnan_overload(self): @@ -979,13 +982,18 @@ def getitem_by_mask_overload(self, idx): """ valid_self_types = (types.Array,) + sdc_pandas_index_types - if not (isinstance(self, valid_self_types) - and not isinstance(self, types.NoneType)): + if not isinstance(self, valid_self_types): return None + # for empty index assume it's returns itself + if isinstance(self, EmptyIndexType): + def getitem_by_mask_empty_index_impl(self, idx): + return self + return getitem_by_mask_empty_index_impl + res_dtype = self.dtype is_str_arr = self == string_array_type - is_numeric_index = isinstance(self, (RangeIndexType, Int64IndexType)) + is_numeric_index = isinstance(self, (PositionalIndexType, RangeIndexType, Int64IndexType)) def getitem_by_mask_impl(self, idx): chunks = parallel_chunks(len(self)) @@ -1024,7 +1032,7 @@ def getitem_by_mask_impl(self, idx): str_arr_set_na_by_mask(result_data_as_str_arr, result_nan_mask) return result_data_as_str_arr elif is_numeric_index == True: # noqa - return pandas.Int64Index(result_data, name=self._name) + return pandas.Int64Index(result_data, name=self.name) else: return result_data @@ -1101,7 +1109,7 @@ def array_equal(A, B): def sdc_array_equal_overload(A, B): """ Checks 1D sequences A and B of comparable dtypes are equal """ - valid_arg_types = (types.Array,) + sdc_pandas_index_types + valid_arg_types = sdc_pandas_df_column_types if not (isinstance(A, valid_arg_types) or isinstance(B, valid_arg_types)): return None @@ -1122,27 +1130,16 @@ def sdc_array_equal_str_arr_impl(A, B): return sdc_array_equal_str_arr_impl else: - both_range_indexes = isinstance(A, RangeIndexType) and isinstance(B, RangeIndexType) def sdc_array_equal_impl(A, B): - if both_range_indexes == True: # noqa - if len(A) != len(B): - return False - if len(A) == 0: - return True - if len(A) == 1: - return A.start == B.start - - return A.start == B.start and A.step == B.step - else: - if len(A) != len(B): - return False - # FIXME_Numba#5157: change to simple A == B when 
issue is resolved - eq_res_size = len(A) - eq_res = numpy.empty(eq_res_size, dtype=types.bool_) - for i in numba.prange(eq_res_size): - eq_res[i] = A[i] == B[i] - return numpy.all(eq_res) + if len(A) != len(B): + return False + # FIXME_Numba#5157: change to simple A == B when issue is resolved + eq_res_size = len(A) + eq_res = numpy.empty(eq_res_size, dtype=types.bool_) + for i in numba.prange(eq_res_size): + eq_res[i] = A[i] == B[i] + return numpy.all(eq_res) return sdc_array_equal_impl @@ -1152,12 +1149,6 @@ def sdc_np_array_overload(A): if isinstance(A, types.Array): return lambda A: A - if isinstance(A, RangeIndexType): - return lambda A: np.arange(A.start, A.stop, A.step) - - if isinstance(A, Int64IndexType): - return lambda A: A._data - if isinstance(A, types.containers.Set): # TODO: naive implementation, data from set can probably # be copied to array more efficienty @@ -1279,3 +1270,118 @@ def argsort_impl(a, axis=-1, kind=None, order=None, ascending=True): raise ValueError("Unsupported value of 'kind' parameter") return argsort_impl + + +def take(data, indices): + pass + + +@sdc_overload(take) +def sdc_take_overload(data, indices): + + valid_data_types = sdc_pandas_df_column_types + if not (isinstance(data, valid_data_types)): + return None + + valid_indexes_types = (types.Array, types.List, types.ListType) + sdc_pandas_index_types + valid_indexes_dtypes = (types.Integer, types.ListType) + if not (isinstance(indices, valid_indexes_types) + and isinstance(indices.dtype, valid_indexes_dtypes) + and (isinstance(indices.dtype, types.Integer) + or isinstance(indices.dtype.dtype, types.Integer))): + return None + + data_dtype = data.dtype + if isinstance(indices.dtype, types.ListType): + + if isinstance(data_dtype, types.Number): + + def sdc_take_array_indices_seq_impl(data, indices): + res_size = 0 + for i in numba.prange(len(indices)): + res_size += len(indices[i]) + res_arr = numpy.empty(res_size, dtype=data_dtype) + for i in numba.prange(len(indices)): + 
start = 0 + for list_elem in range(len(indices[0:i])): + start += len(indices[list_elem]) + current_pos = start + for j in range(len(indices[i])): + res_arr[current_pos] = data[indices[i][j]] + current_pos += 1 + return res_arr + + return sdc_take_array_indices_seq_impl + + elif isinstance(data, StringArrayType): + def sdc_take_str_arr_indices_seq_impl(data, indices): + res_size = 0 + for i in numba.prange(len(indices)): + res_size += len(indices[i]) + nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) + num_total_bytes = 0 + for i in numba.prange(len(indices)): + start = 0 + for list_elem in range(len(indices[0:i])): + start += len(indices[list_elem]) + current_pos = start + for j in range(len(indices[i])): + num_total_bytes += get_utf8_size(data[indices[i][j]]) + if isna(data, indices[i][j]): + nan_mask[current_pos] = True + current_pos += 1 + res_arr = pre_alloc_string_array(res_size, num_total_bytes) + for i in numba.prange(len(indices)): + start = 0 + for list_elem in range(len(indices[0:i])): + start += len(indices[list_elem]) + current_pos = start + for j in range(len(indices[i])): + res_arr[current_pos] = data[indices[i][j]] + if nan_mask[current_pos]: + str_arr_set_na(res_arr, current_pos) + current_pos += 1 + + return res_arr + + return sdc_take_str_arr_indices_seq_impl + + else: + return None + + else: + if isinstance(data_dtype, (types.Number, types.Boolean)): + + def sdc_take_array_impl(data, indices): + res_size = len(indices) + res_arr = numpy.empty(res_size, dtype=data_dtype) + for i in numba.prange(res_size): + res_arr[i] = data[indices[i]] + return res_arr + + return sdc_take_array_impl + + elif isinstance(data, StringArrayType): + def sdc_take_str_arr_impl(data, indices): + res_size = len(indices) + nan_mask = numpy.zeros(res_size, dtype=numpy.bool_) + num_total_bytes = 0 + for i in numba.prange(res_size): + num_total_bytes += get_utf8_size(data[indices[i]]) + if isna(data, indices[i]): + nan_mask[i] = True + + res_arr = 
pre_alloc_string_array(res_size, num_total_bytes) + for i in numpy.arange(res_size): + res_arr[i] = data[indices[i]] + if nan_mask[i]: + str_arr_set_na(res_arr, i) + + return res_arr + + return sdc_take_str_arr_impl + + else: + return None + + return None diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index 8a0b7f622..c06203ecd 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -24,7 +24,6 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** - import numpy as np import pandas as pd @@ -38,13 +37,14 @@ import sdc from sdc.str_ext import string_type, list_string_array_type from sdc.str_arr_ext import (StringArrayType, string_array_type) -from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType + +from sdc.datatypes.indexes import * from sdc.hiframes.pd_series_ext import ( SeriesType, if_series_to_array_type) from numba.core.errors import TypingError from sdc.datatypes.categorical.types import Categorical +from sdc.utilities.sdc_typing_utils import sdc_pandas_df_column_types def isna(arr, i): @@ -168,31 +168,43 @@ def fix_df_array_list_str_impl(column): # pragma: no cover return lambda column: column -def fix_df_index(index): +def fix_df_index(index, coldata=None): return index @overload(fix_df_index) -def fix_df_index_overload(index): - - # TO-DO: replace types.none index with separate type, e.g. 
DefaultIndex - if (index is None or isinstance(index, types.NoneType)): - def fix_df_index_impl(index): +def fix_df_index_overload(index, coldata=None): + + # FIXME: import here due to circular import between indexes, numpy_like, and api + from sdc.extensions.indexes.empty_index_ext import init_empty_index + from sdc.extensions.indexes.positional_index_ext import init_positional_index + + # index here is param supplied to Series/DF ctors, so it can be None + if index is None or isinstance(index, types.NoneType): + if coldata is None or isinstance(coldata, (types.NoneType, types.Omitted)): + def fix_df_index_impl(index, coldata=None): + return init_empty_index() + elif isinstance(coldata, sdc_pandas_df_column_types): + def fix_df_index_impl(index, coldata=None): + return init_positional_index(len(coldata)) + else: return None - elif isinstance(index, (RangeIndexType, Int64IndexType)): - def fix_df_index_impl(index): + return fix_df_index_impl + + elif isinstance(index, (RangeIndexType, Int64IndexType, EmptyIndexType, PositionalIndexType)): + def fix_df_index_impl(index, coldata=None): return index # currently only signed integer indexes are represented with own type # TO-DO: support Uint64Index and Float64Indexes elif isinstance(index.dtype, types.Integer) and index.dtype.signed: - def fix_df_index_impl(index): + def fix_df_index_impl(index, coldata=None): index_data = fix_df_array(index) return pd.Int64Index(index_data) else: # default case, transform index the same as df data - def fix_df_index_impl(index): + def fix_df_index_impl(index, coldata=None): return fix_df_array(index) return fix_df_index_impl diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index 12cb8850f..656b3833c 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -54,11 +54,13 @@ import llvmlite.binding as ll from llvmlite import ir as lir from llvmlite.llvmpy.core import Type as LLType -from sdc.datatypes.range_index_type import RangeIndexType -from 
sdc.datatypes.int64_index_type import Int64IndexType + +from sdc.datatypes.indexes import * from sdc.extensions.indexes.range_index_ext import box_range_index, unbox_range_index +from sdc.extensions.indexes.empty_index_ext import box_empty_index, unbox_empty_index from sdc.extensions.indexes.int64_index_ext import box_int64_index, unbox_int64_index from sdc.str_arr_type import StringArrayType +from sdc.extensions.indexes.positional_index_ext import unbox_positional_index, box_positional_index ll.add_symbol('array_size', hstr_ext.array_size) ll.add_symbol('array_getptr1', hstr_ext.array_getptr1) @@ -195,17 +197,15 @@ def _infer_series_list_dtype(S): def _infer_index_type(index): """ Deduces native Numba type used to represent index Python object """ + index_is_named = index.name is not None # more specific types go first (e.g. RangeIndex is subtype of Int64Index) if isinstance(index, pd.RangeIndex): # depending on actual index value unbox to diff types: none-index if it matches # positions or to RangeIndexType in general case - if (index.start == 0 and index.step == 1 and index.name is None): - return types.none + if (index.start == 0 and index.step == 1): + return PositionalIndexType(is_named=index_is_named) else: - if index.name is None: - return RangeIndexType() - else: - return RangeIndexType(is_named=True) + return RangeIndexType(is_named=index_is_named) # for unsupported pandas indexes we explicitly unbox to None if isinstance(index, pd.DatetimeIndex): @@ -213,16 +213,16 @@ def _infer_index_type(index): if isinstance(index, pd.Int64Index): index_data_type = numba.typeof(index._data) - if index.name is None: - return Int64IndexType(index_data_type) - else: - return Int64IndexType(index_data_type, is_named=True) + return Int64IndexType(index_data_type, is_named=index_is_named) if index.dtype == np.dtype('O'): # TO-DO: should we check that all elements are strings? 
if len(index) > 0 and isinstance(index[0], str): return string_array_type + elif len(index) == 0: + return EmptyIndexType(is_named=index_is_named) else: + assert False, f"Unboxing failed: cannot infer type for index:\n\t{index}" return types.none numba_index_type = numpy_support.from_dtype(index.dtype) @@ -276,11 +276,9 @@ def box_dataframe(typ, val, c): df_obj = pyapi.call_method(class_obj, "DataFrame", (df_dict,)) pyapi.decref(df_dict) - # set df.index if necessary - if typ.index != types.none: - index_obj = _box_index_data(typ.index, dataframe.index, c) - pyapi.object_setattr_string(df_obj, 'index', index_obj) - pyapi.decref(index_obj) + index_obj = _box_index_data(typ.index, dataframe.index, c) + pyapi.object_setattr_string(df_obj, 'index', index_obj) + pyapi.decref(index_obj) for arrays_list_obj in arrays_list_objs.values(): pyapi.decref(arrays_list_obj) @@ -332,6 +330,13 @@ def _unbox_index_data(index_typ, index_obj, c): c: LLVM context object Returns: LLVM instructions to generate native value """ + + if isinstance(index_typ, EmptyIndexType): + return unbox_empty_index(index_typ, index_obj, c) + + if isinstance(index_typ, PositionalIndexType): + return unbox_positional_index(index_typ, index_obj, c) + if isinstance(index_typ, RangeIndexType): return unbox_range_index(index_typ, index_obj, c) @@ -350,6 +355,7 @@ def _unbox_index_data(index_typ, index_obj, c): return res if isinstance(index_typ, types.NoneType): + assert False, "unboxing to None index!" 
return unbox_none(index_typ, index_obj, c) assert False, f"_unbox_index_data: unexpected index type({index_typ}) while unboxing" @@ -399,11 +405,7 @@ def box_series(typ, val, c): typ)(c.context, c.builder, val) arr = _box_series_data(dtype, typ.data, series.data, c) - - if typ.index is types.none: - index = c.pyapi.make_none() - else: - index = _box_index_data(typ.index, series.index, c) + index = _box_index_data(typ.index, series.index, c) if typ.is_named: name = c.pyapi.from_native_value(string_type, series.name) @@ -456,7 +458,11 @@ def _box_index_data(index_typ, val, c): """ assert isinstance(index_typ, sdc_pandas_index_types) - if isinstance(index_typ, RangeIndexType): + if isinstance(index_typ, EmptyIndexType): + index = box_empty_index(index_typ, val, c) + elif isinstance(index_typ, PositionalIndexType): + index = box_positional_index(index_typ, val, c) + elif isinstance(index_typ, RangeIndexType): index = box_range_index(index_typ, val, c) elif isinstance(index_typ, Int64IndexType): index = box_int64_index(index_typ, val, c) @@ -464,8 +470,8 @@ def _box_index_data(index_typ, val, c): index = box_array(index_typ, val, c) elif isinstance(index_typ, StringArrayType): index = box_str_arr(string_array_type, val, c) - else: # index_typ is types.none - index = c.pyapi.make_none() + else: + assert False, f"_box_index_data called with unknown index type: {index_typ}" return index diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index 0f062a4bf..7bee3bf48 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -137,7 +137,7 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): fix_data = sdc.hiframes.api.fix_df_array(data) - fix_index = sdc.hiframes.api.fix_df_index(index) + fix_index = sdc.hiframes.api.fix_df_index(index, fix_data) return sdc.hiframes.api.init_series(fix_data, 
fix_index, name) return hpat_pandas_series_ctor_impl diff --git a/sdc/hiframes/pd_series_type.py b/sdc/hiframes/pd_series_type.py index 32e004a14..569df88a3 100644 --- a/sdc/hiframes/pd_series_type.py +++ b/sdc/hiframes/pd_series_type.py @@ -38,6 +38,7 @@ from sdc.str_ext import string_type, list_string_array_type from sdc.str_arr_ext import (string_array_type, iternext_str_array, StringArrayType) from sdc.datatypes.categorical.types import CategoricalDtypeType, Categorical +from sdc.datatypes.indexes.positional_index_type import PositionalIndexType class SeriesType(types.IterableType): @@ -54,7 +55,7 @@ def __init__(self, dtype, data=None, index=None, is_named=False): if isinstance(dtype, types.Record) else dtype) self.data = data if index is None: - index = types.none + index = PositionalIndexType(False) self.index = index # keep is_named in type to enable boxing self.is_named = is_named diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py index 78aaafeb6..e772d8b4c 100644 --- a/sdc/io/csv_ext.py +++ b/sdc/io/csv_ext.py @@ -61,6 +61,8 @@ import pyarrow import pyarrow.csv +from sdc.datatypes.indexes.empty_index_type import EmptyIndexType +from sdc.datatypes.indexes.positional_index_type import PositionalIndexType class CsvReader(ir.Stmt): @@ -524,9 +526,10 @@ def _gen_pandas_read_csv_func_text(col_names, col_typs, py_col_dtypes, usecols, return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names column_loc, _, _ = get_structure_maps(col_typs, return_columns) + index_type = PositionalIndexType(False) if col_typs else EmptyIndexType(False) df_type = DataFrameType( tuple(col_typs), - types.none, + index_type, tuple(col_names), column_loc=column_loc ) diff --git a/sdc/rewrites/dataframe_constructor.py b/sdc/rewrites/dataframe_constructor.py index debf0b73c..cb3051684 100644 --- a/sdc/rewrites/dataframe_constructor.py +++ b/sdc/rewrites/dataframe_constructor.py @@ -48,6 +48,10 @@ from sdc.hiframes.pd_dataframe_ext import get_structure_maps from 
sdc.hiframes.api import fix_df_array, fix_df_index from sdc.str_ext import string_type +from sdc.extensions.indexes.empty_index_ext import init_empty_index +from sdc.datatypes.indexes.empty_index_type import EmptyIndexType +from sdc.utilities.sdc_typing_utils import TypeChecker +from sdc.str_arr_type import StringArrayType @register_rewrite('before-inference') @@ -114,7 +118,9 @@ def apply(self): 'string_type': string_type, 'intrinsic': intrinsic, 'fix_df_array': fix_df_array, - 'fix_df_index': fix_df_index + 'fix_df_index': fix_df_index, + 'init_empty_index': init_empty_index, + 'EmptyIndexType': EmptyIndexType }) setattr(pd_dataframe_ext_module, func_name, init_df) @@ -197,6 +203,7 @@ def {func_name}(typingctx, {params}): """ n_cols = {n_cols} + is_df_empty = {n_cols == 0} input_data_typs = ({', '.join(args_col_data) + suffix}) fnty = typingctx.resolve_value_type(fix_df_array) @@ -209,7 +216,8 @@ def {func_name}(typingctx, {params}): input_index_typ = index fnty = typingctx.resolve_value_type(fix_df_index) - fixed_index_sig = fnty.get_call_type(typingctx, (input_index_typ,), {{}}) + fixed_index_sig = fnty.get_call_type(typingctx, + (input_index_typ, {'data_typs[0]' if n_cols > 0 else ''}), {{}}) index_typ = fixed_index_sig.return_type need_fix_index = index_typ != input_index_typ @@ -244,7 +252,14 @@ def codegen(context, builder, sig, args): builder, types.Tuple(data_list_type), data_lists) if need_fix_index == True: - index = context.compile_internal(builder, lambda a: fix_df_index(a), fixed_index_sig, [index]) + if is_df_empty == True: + first_col_data = context.get_dummy_value() + else: + first_col_data = data_arrs_transformed[0] + index = context.compile_internal(builder, + lambda a, d: fix_df_index(a, d), + fixed_index_sig, + [index, first_col_data]) dataframe.data = data_tup dataframe.index = index @@ -286,21 +301,21 @@ def pd_dataframe_overload(data, index=None, columns=None, dtype=None, copy=False """ ty_checker = TypeChecker('Method DataFrame') - 
ty_checker.check(self, DataFrameType) - if not isinstance(data, dict): - ty_checker.raise_exc(pat, 'dict', 'data') + if not isinstance(data, (types.DictType, types.LiteralStrKeyDict)): + ty_checker.raise_exc(data, 'dict', 'data') - if not isinstance(index, (types.Ommited, types.Array, StringArray, types.NoneType)) and index is not None: - ty_checker.raise_exc(na, 'array-like', 'index') + if not (isinstance(index, (types.Omitted, types.ListType, types.List, + types.Array, StringArrayType, types.NoneType) or index is None)): + ty_checker.raise_exc(index, 'array-like', 'index') - if not isinstance(columns, (types.Ommited, types.NoneType)) and columns is not None: - ty_checker.raise_exc(na, 'None', 'columns') + if not (isinstance(columns, (types.Omitted, types.NoneType, types.Tuple, types.UniTuple) or columns is None)): + ty_checker.raise_exc(columns, 'tuple of strings', 'columns') - if not isinstance(dtype, (types.Ommited, types.NoneType)) and dtype is not None: - ty_checker.raise_exc(na, 'None', 'dtype') + if not (isinstance(dtype, (types.Omitted, types.NoneType) or dtype is None)): + ty_checker.raise_exc(dtype, 'None', 'dtype') - if not isinstance(copy, (types.Ommited, types.NoneType)) and columns is not False: - ty_checker.raise_exc(na, 'False', 'copy') + if not (isinstance(copy, (types.Omitted, types.NoneType) or columns is False)): + ty_checker.raise_exc(copy, 'False', 'copy') return None diff --git a/sdc/sdc_autogenerated.py b/sdc/sdc_autogenerated.py index 66567e94c..f701cf5fb 100644 --- a/sdc/sdc_autogenerated.py +++ b/sdc/sdc_autogenerated.py @@ -39,15 +39,13 @@ from numba.core.errors import TypingError from numba import types -from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, - find_common_dtype_from_numpy_dtypes, find_index_common_dtype) -from sdc.datatypes.common_functions import (sdc_join_series_indexes, ) +from sdc.utilities.sdc_typing_utils import (TypeChecker, check_types_comparable, 
sdc_old_index_types,) from sdc.hiframes.api import isna from sdc.hiframes.pd_series_type import SeriesType from sdc.str_arr_ext import (string_array_type, str_arr_is_na) from sdc.utilities.utils import sdc_overload, sdc_overload_method from sdc.functions import numpy_like -from sdc.datatypes.range_index_type import RangeIndexType +from sdc.extensions.indexes.indexes_generic import sdc_indexes_join_outer, sdc_fix_indexes_join, sdc_unify_index_types def sdc_add(self, other, fill_value=None): @@ -79,63 +77,37 @@ def sdc_add_impl(self, other, fill_value=None): return sdc_add_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_add_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - - return pandas.Series(result_data) - - return sdc_add_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_add_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is 
right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_add_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.add(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left 
+ _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_add_impl + return sdc_add_impl @sdc_overload_method(SeriesType, 'add') @@ -190,9 +162,7 @@ def sdc_pandas_series_add(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -247,63 +217,37 @@ def sdc_div_impl(self, other, fill_value=None): return sdc_div_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_div_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - return pandas.Series(result_data) + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - return sdc_div_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_div_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_div_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.div(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', 
return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - return pandas.Series(result_data, 
index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_div_impl + return sdc_div_impl @sdc_overload_method(SeriesType, 'div') @@ -358,9 +302,7 @@ def sdc_pandas_series_div(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -415,63 +357,37 @@ def sdc_sub_impl(self, other, fill_value=None): return sdc_sub_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_sub_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left - _right - - return pandas.Series(result_data) - - return sdc_sub_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, 
types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - def sdc_sub_impl(self, other, fill_value=None): + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_sub_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.sub(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in 
numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left - _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left - _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_sub_impl + return sdc_sub_impl @sdc_overload_method(SeriesType, 'sub') @@ -526,9 +442,7 @@ def sdc_pandas_series_sub(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -583,63 +497,37 @@ def sdc_mul_impl(self, other, fill_value=None): return sdc_mul_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_mul_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left * _right - - return pandas.Series(result_data) - return sdc_mul_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - def sdc_mul_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_mul_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.mul(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', 
return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left * _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left * _right - return pandas.Series(result_data, 
index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_mul_impl + return sdc_mul_impl @sdc_overload_method(SeriesType, 'mul') @@ -694,9 +582,7 @@ def sdc_pandas_series_mul(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -751,63 +637,37 @@ def sdc_truediv_impl(self, other, fill_value=None): return sdc_truediv_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_truediv_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - return pandas.Series(result_data) + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or 
isinstance(other.index, sdc_old_index_types)) - return sdc_truediv_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_truediv_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_truediv_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.truediv(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = 
numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left / _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_truediv_impl + return sdc_truediv_impl @sdc_overload_method(SeriesType, 'truediv') @@ -862,9 +722,7 @@ def sdc_pandas_series_truediv(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -919,63 +777,37 @@ def sdc_floordiv_impl(self, other, fill_value=None): return sdc_floordiv_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_floordiv_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left // _right + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - return pandas.Series(result_data) - - return sdc_floordiv_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_floordiv_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_floordiv_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.floordiv(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = 
left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left // _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left // _right - return 
pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_floordiv_impl + return sdc_floordiv_impl @sdc_overload_method(SeriesType, 'floordiv') @@ -1030,9 +862,7 @@ def sdc_pandas_series_floordiv(self, other, level=None, fill_value=None, axis=0) operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1087,63 +917,37 @@ def sdc_mod_impl(self, other, fill_value=None): return sdc_mod_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_mod_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and 
right_nan) else _left % _right - - return pandas.Series(result_data) - - return sdc_mod_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_mod_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_mod_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.mod(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = 
numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left % _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left % _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_mod_impl + return sdc_mod_impl @sdc_overload_method(SeriesType, 'mod') @@ -1198,9 +1002,7 @@ def sdc_pandas_series_mod(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1255,63 +1057,37 @@ def sdc_pow_impl(self, other, fill_value=None): return sdc_pow_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_pow_impl(self, other, fill_value=None): - - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) - - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left ** _right - return pandas.Series(result_data) + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - return sdc_pow_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_pow_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_pow_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.pow(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', 
return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left ** _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left ** _right - return pandas.Series(result_data, 
index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_pow_impl + return sdc_pow_impl @sdc_overload_method(SeriesType, 'pow') @@ -1366,9 +1142,7 @@ def sdc_pandas_series_pow(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1415,50 +1189,35 @@ def _series_lt_scalar_impl(self, other, fill_value=None): else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_lt_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: - raise ValueError("Can only compare identically-labeled Series objects") + def _series_lt_common_impl(self, other, fill_value=None): - if fill_value_is_none == True: # noqa - result_data = self._data < other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else 
_left < _right - - return pandas.Series(result_data) - - return _series_lt_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_lt_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): + raise ValueError("Can only compare identically-labeled Series objects") + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.lt(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data < other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left < _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_lt_common_impl + return _series_lt_common_impl return None @@ -1521,9 +1280,7 @@ def sdc_pandas_series_lt(self, other, level=None, 
fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1552,59 +1309,44 @@ def sdc_gt_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_gt_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self > other, index=self._index, name=self._name) - return _series_gt_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_gt_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: - raise ValueError("Can only compare identically-labeled Series objects") + def _series_lt_common_impl(self, other, fill_value=None): - if fill_value_is_none == True: # noqa - result_data = self._data > other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = 
isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left > _right - - return pandas.Series(result_data) - - return _series_gt_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_gt_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): + raise ValueError("Can only compare identically-labeled Series objects") + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.gt(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data > other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left > _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, 
right_index) + return pandas.Series(res_data, index=res_index) - return _series_gt_common_impl + return _series_lt_common_impl return None @@ -1667,9 +1409,7 @@ def sdc_pandas_series_gt(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1698,59 +1438,44 @@ def sdc_le_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_le_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self <= other, index=self._index, name=self._name) - return _series_le_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_le_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: - raise ValueError("Can only compare identically-labeled Series objects") + def _series_lt_common_impl(self, other, 
fill_value=None): - if fill_value_is_none == True: # noqa - result_data = self._data <= other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left <= _right - - return pandas.Series(result_data) - - return _series_le_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_le_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): + raise ValueError("Can only compare identically-labeled Series objects") + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.le(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data <= other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = 
fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left <= _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_le_common_impl + return _series_lt_common_impl return None @@ -1813,9 +1538,7 @@ def sdc_pandas_series_le(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1844,59 +1567,44 @@ def sdc_ge_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_ge_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self >= other, index=self._index, name=self._name) - return _series_ge_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_ge_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: - raise ValueError("Can only compare identically-labeled Series objects") + def _series_lt_common_impl(self, other, fill_value=None): - if fill_value_is_none == True: # noqa - result_data = self._data >= other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left >= _right - - return pandas.Series(result_data) - - return _series_ge_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_ge_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, 
other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): + raise ValueError("Can only compare identically-labeled Series objects") + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.ge(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data >= other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left >= _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_ge_common_impl + return _series_lt_common_impl return None @@ -1959,9 +1667,7 @@ def sdc_pandas_series_ge(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or 
none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -1990,59 +1696,44 @@ def sdc_ne_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_ne_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self != other, index=self._index, name=self._name) - return _series_ne_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_ne_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: - raise ValueError("Can only compare identically-labeled Series objects") + def _series_lt_common_impl(self, other, fill_value=None): - if fill_value_is_none == True: # noqa - result_data = self._data != other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left != _right - - return pandas.Series(result_data) - - return _series_ne_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, 
(RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_ne_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): + raise ValueError("Can only compare identically-labeled Series objects") + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.ne(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data != other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left != _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_ne_common_impl + return _series_lt_common_impl return None @@ -2105,9 +1796,7 @@ def sdc_pandas_series_ne(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = 
((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2136,59 +1825,44 @@ def sdc_eq_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_eq_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self == other, index=self._index, name=self._name) - return _series_eq_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_eq_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: - raise ValueError("Can only compare identically-labeled Series objects") + def _series_lt_common_impl(self, other, fill_value=None): - if fill_value_is_none == True: # noqa - result_data = self._data == other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else 
other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left == _right - - return pandas.Series(result_data) - - return _series_eq_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_eq_common_impl(self, other, fill_value=None): - - left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): + raise ValueError("Can only compare identically-labeled Series objects") + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.eq(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data == other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left == _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_eq_common_impl + return _series_lt_common_impl return None @@ 
-2251,9 +1925,7 @@ def sdc_pandas_series_eq(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2317,9 +1989,7 @@ def sdc_pandas_series_operator_add(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2382,9 +2052,7 @@ def sdc_pandas_series_operator_sub(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2447,9 +2115,7 @@ def sdc_pandas_series_operator_mul(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2512,9 +2178,7 @@ def sdc_pandas_series_operator_truediv(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2577,9 +2241,7 @@ def sdc_pandas_series_operator_floordiv(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2642,9 +2304,7 @@ def sdc_pandas_series_operator_mod(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2707,9 +2367,7 @@ def sdc_pandas_series_operator_pow(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2762,9 +2420,7 @@ def sdc_pandas_series_operator_lt(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2817,9 +2473,7 @@ def sdc_pandas_series_operator_gt(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2872,9 +2526,7 @@ def sdc_pandas_series_operator_le(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2927,9 +2579,7 @@ def sdc_pandas_series_operator_ge(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -2982,9 +2632,7 @@ def sdc_pandas_series_operator_ne(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -3037,9 +2685,7 @@ def sdc_pandas_series_operator_eq(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) diff --git a/sdc/sdc_function_templates.py b/sdc/sdc_function_templates.py index 0c6d34356..2f58cdeee 100644 --- a/sdc/sdc_function_templates.py +++ b/sdc/sdc_function_templates.py @@ -40,15 +40,13 @@ from numba.core.errors import TypingError from numba import types -from sdc.utilities.sdc_typing_utils import (TypeChecker, check_index_is_numeric, check_types_comparable, - find_common_dtype_from_numpy_dtypes, find_index_common_dtype) -from sdc.datatypes.common_functions import (sdc_join_series_indexes, ) +from sdc.utilities.sdc_typing_utils import (TypeChecker, check_types_comparable, sdc_old_index_types,) from sdc.hiframes.api import isna from sdc.hiframes.pd_series_type import SeriesType from sdc.str_arr_ext import (string_array_type, str_arr_is_na) from sdc.utilities.utils import sdc_overload, sdc_overload_method from sdc.functions import numpy_like -from sdc.datatypes.range_index_type import RangeIndexType +from sdc.extensions.indexes.indexes_generic import sdc_indexes_join_outer, sdc_fix_indexes_join, sdc_unify_index_types def sdc_binop(self, other, fill_value=None): @@ -79,63 +77,37 @@ def sdc_binop_impl(self, other, fill_value=None): return sdc_binop_impl else: # both operands are numeric series - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def sdc_binop_impl(self, other, fill_value=None): - left_size, right_size = len(self._data), len(other._data) - max_data_size = max(left_size, right_size) - result_data = numpy.empty(max_data_size, dtype=numpy.float64) + use_index_methods = not (isinstance(self.index, sdc_old_index_types) + or isinstance(other.index, sdc_old_index_types)) - _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa - for i in numba.prange(max_data_size): - left_nan = (i >= left_size or 
numpy.isnan(self._data[i])) - right_nan = (i >= right_size or numpy.isnan(other._data[i])) - _left = _fill_value if left_nan else self._data[i] - _right = _fill_value if right_nan else other._data[i] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - - return pandas.Series(result_data) - - return sdc_binop_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def sdc_binop_impl(self, other, fill_value=None): - - # check if indexes are equal and series don't have to be aligned - left_index, right_index = self.index, other.index - if (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + def sdc_binop_impl(self, other, fill_value=None): - _left = pandas.Series(self._data) - _right = pandas.Series(other._data) - partial_res = _left.binop(_right, fill_value=fill_value) + left_index, right_index = self._index, other._index + if use_index_methods == True: # noqa + indexes_join_res = left_index.join(right_index, 'outer', return_indexers=True) + else: + indexes_join_res = sdc_indexes_join_outer(left_index, right_index) - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + # FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed + joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res) + result_size = len(joined_index) + result_data = numpy.empty(result_size, dtype=numpy.float64) - return pandas.Series(partial_res._data, index=result_index) + _fill_value = numpy.nan if fill_value_is_none == True else fill_value # noqa + for i in numba.prange(result_size): + left_pos = left_indexer[i] + right_pos = right_indexer[i] - _fill_value = numpy.nan if fill_value_is_none == True else fill_value 
# noqa - # TODO: replace below with core join(how='outer', return_indexers=True) when implemented - joined_index, left_indexer, right_indexer = sdc_join_series_indexes(left_index, right_index) - result_size = len(joined_index) - result_data = numpy.empty(result_size, dtype=numpy.float64) - for i in numba.prange(result_size): - left_pos, right_pos = left_indexer[i], right_indexer[i] - left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) - right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) - _left = _fill_value if left_nan else self._data[left_pos] - _right = _fill_value if right_nan else other._data[right_pos] - result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right + left_nan = (left_pos == -1 or numpy.isnan(self._data[left_pos])) + right_nan = (right_pos == -1 or numpy.isnan(other._data[right_pos])) + _left = _fill_value if left_nan else self._data[left_pos] + _right = _fill_value if right_nan else other._data[right_pos] + result_data[i] = numpy.nan if (left_nan and right_nan) else _left + _right - return pandas.Series(result_data, index=joined_index) + return pandas.Series(result_data, index=joined_index) - return sdc_binop_impl + return sdc_binop_impl def sdc_pandas_series_binop(self, other, level=None, fill_value=None, axis=0): @@ -189,9 +161,7 @@ def sdc_pandas_series_binop(self, other, level=None, fill_value=None, axis=0): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -228,59 +198,44 @@ def sdc_comp_binop_ovld(self, other, fill_value=None): fill_value_is_none = isinstance(fill_value, (types.NoneType, types.Omitted)) or fill_value is None if not operands_are_series: - def _series_comp_binop_scalar_impl(self, other, fill_value=None): + def _series_lt_scalar_impl(self, other, fill_value=None): _self = numpy_like.fillna(self._data, inplace=False, value=fill_value) return pandas.Series(_self < other, index=self._index, name=self._name) - return _series_comp_binop_scalar_impl + return _series_lt_scalar_impl else: - # optimization for series with default indexes, that can be aligned differently - if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): - def _series_comp_binop_none_indexes_impl(self, other, fill_value=None): + index_api_supported = not (isinstance(self.index, sdc_old_index_types) + and isinstance(other.index, sdc_old_index_types)) - left_size, right_size = len(self._data), len(other._data) - if left_size != right_size: - raise ValueError("Can only compare identically-labeled Series objects") + def _series_lt_common_impl(self, other, fill_value=None): - if fill_value_is_none == True: # noqa - result_data = self._data < other._data - else: - result_data = numpy.empty(left_size, dtype=types.bool_) - for i in numba.prange(left_size): - left_nan = isna(self._data, i) - right_nan = isna(other._data, i) - _left = fill_value if left_nan else self._data[i] - _right = fill_value if right_nan else other._data[i] - result_data[i] = False if (left_nan and right_nan) else _left < _right - - return pandas.Series(result_data) - - return _series_comp_binop_none_indexes_impl - else: - left_index_is_range = isinstance(self.index, (RangeIndexType, types.NoneType)) - index_dtypes_match, numba_index_common_dtype = find_index_common_dtype(self, other) - - def _series_comp_binop_common_impl(self, other, fill_value=None): - - 
left_index, right_index = self.index, other.index - if not (left_index is right_index - or numpy_like.array_equal(left_index, right_index)): + left_index, right_index = self.index, other.index + if index_api_supported == True: # noqa + if not (left_index is right_index or left_index.equals(right_index)): + raise ValueError("Can only compare identically-labeled Series objects") + else: + if not (left_index is right_index or numpy_like.array_equal(left_index, right_index)): raise ValueError("Can only compare identically-labeled Series objects") - _left, _right = pandas.Series(self._data), pandas.Series(other._data) - partial_res = _left.comp_binop(_right, fill_value=fill_value) - - if index_dtypes_match == False: # noqa - result_index = numpy_like.astype(left_index, numba_index_common_dtype) - else: - result_index = left_index.values if left_index_is_range == True else left_index # noqa + res_size = len(left_index) + if fill_value_is_none == True: # noqa + res_data = self._data < other._data + else: + res_data = numpy.empty(res_size, dtype=types.bool_) + for i in numba.prange(res_size): + left_nan = isna(self._data, i) + right_nan = isna(other._data, i) + _left = fill_value if left_nan else self._data[i] + _right = fill_value if right_nan else other._data[i] + res_data[i] = False if (left_nan and right_nan) else _left < _right - return pandas.Series(partial_res._data, result_index) + res_index = sdc_unify_index_types(left_index, right_index) + return pandas.Series(res_data, index=res_index) - return _series_comp_binop_common_impl + return _series_lt_common_impl return None @@ -342,9 +297,7 @@ def sdc_pandas_series_comp_binop(self, other, level=None, fill_value=None, axis= operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = 
check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -407,9 +360,7 @@ def sdc_pandas_series_operator_binop(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) @@ -461,9 +412,7 @@ def sdc_pandas_series_operator_comp_binop(self, other): operands_are_series = self_is_series and other_is_series if operands_are_series: - none_or_numeric_indexes = ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) - and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))) - series_indexes_comparable = check_types_comparable(self.index, other.index) or none_or_numeric_indexes + series_indexes_comparable = check_types_comparable(self.index, other.index) if not series_indexes_comparable: raise TypingError('{} Not implemented for series with not-comparable indexes. 
\ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) diff --git a/sdc/tests/indexes/__init__.py b/sdc/tests/indexes/__init__.py index 756d8cb55..c0adc55e5 100644 --- a/sdc/tests/indexes/__init__.py +++ b/sdc/tests/indexes/__init__.py @@ -24,6 +24,8 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +from sdc.tests.indexes.test_empty_index import TestEmptyIndex from sdc.tests.indexes.test_range_index import TestRangeIndex +from sdc.tests.indexes.test_positional_index import TestPositionalIndex from sdc.tests.indexes.test_int64_index import TestInt64Index from sdc.tests.indexes.test_indexes import TestIndexes diff --git a/sdc/tests/indexes/index_datagens.py b/sdc/tests/indexes/index_datagens.py index ba1ea5700..244fa52f8 100644 --- a/sdc/tests/indexes/index_datagens.py +++ b/sdc/tests/indexes/index_datagens.py @@ -27,15 +27,16 @@ import numpy as np import pandas as pd -from itertools import (combinations_with_replacement, filterfalse, chain) -from sdc.tests.test_utils import gen_strlist +from itertools import (product, combinations_with_replacement, filterfalse, chain) +from sdc.tests.test_utils import gen_strlist +from sdc.datatypes.indexes import * test_global_index_names = [None, 'abc', 'index'] test_global_range_member_values = [1, 2, 10, -5, 0, None] -def _generate_valid_range_params(): +def _generate_all_range_params(): def valid_params_predicate(range_params): # if step is zero or all start/stop/step are None range is invalid @@ -48,15 +49,45 @@ def valid_params_predicate(range_params): ) -def _generate_range_indexes_fixed(size, start=1, step=3): +def _generate_positional_range_params(): + + # for PositionalIndexType represented ranges only + starts, stops, steps = [0, ], [1, 2, 10, ], [1, ] + return product(starts, stops, steps) + + +def _generate_custom_range_params(): + + # for non PositionalIndexType represented range objects + def 
valid_positional_index_predicate(range_params): + index = pd.RangeIndex(*range_params) + return index.start == 0 and index.stop > 0 and index.step == 1 + + return filterfalse( + valid_positional_index_predicate, + _generate_all_range_params() + ) + + +def _generate_positional_indexes_fixed(size, start=1, step=3): yield pd.RangeIndex(size) yield pd.RangeIndex(size, name='abc') + + +def _generate_custom_range_indexes_fixed(size, start=1, step=3): yield pd.RangeIndex(stop=step * size, step=step) yield pd.RangeIndex(stop=2*step*size, step=2*step) yield pd.RangeIndex(start=start, stop=start + size*step - step//2, step=step) yield pd.RangeIndex(start=start + step, stop=start + (size + 1)*step, step=step) +def _generate_range_indexes_fixed(size, start=1, step=3): + return chain( + _generate_positional_indexes_fixed(size, start, step), + _generate_custom_range_indexes_fixed(size, start, step), + ) + + def _generate_index_param_values(n): return chain( [None], @@ -86,3 +117,14 @@ def _generate_int64_indexes_fixed(size): yield pd.Int64Index([i if i % 2 else 0 for i in range(size)]) yield pd.Int64Index([i // 2 for i in range(size)]) yield pd.Int64Index(np.ones(size)) + + +def get_sample_index(size, sdc_index_type): + if sdc_index_type is PositionalIndexType: + return pd.RangeIndex(size) + if sdc_index_type is RangeIndexType: + return pd.RangeIndex(-1, size - 1, 1) + if sdc_index_type is Int64IndexType: + return pd.Int64Index(np.arange(size)) + + assert False, f"Refusing to create index of non-specific index type: {sdc_index_type}" diff --git a/sdc/tests/indexes/test_empty_index.py b/sdc/tests/indexes/test_empty_index.py new file mode 100644 index 000000000..1b6baaaec --- /dev/null +++ b/sdc/tests/indexes/test_empty_index.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd +import unittest + +from sdc.tests.test_base import TestCase + + +class TestEmptyIndex(TestCase): + """ Verifies basic support for empty DF and using special EmptyIndexType + for respresnting it's index """ + + def test_create_empty_df(self): + def test_impl(): + df = pd.DataFrame({}, index=None) + return len(df) + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + self.assertEqual(result, result_ref) + + def test_unbox_empty_df(self): + def test_impl(df): + return len(df) + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = test_impl(df) + self.assertEqual(result, result_ref) + + def test_box_empty_df(self): + def test_impl(): + df = pd.DataFrame({}, index=None) + return df + sdc_func = self.jit(test_impl) + + result = sdc_func() + result_ref = test_impl() + pd.testing.assert_frame_equal(result, result_ref) + + def test_empty_df_round_trip(self): + def test_impl(df): + return df + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = test_impl(df) + pd.testing.assert_frame_equal(result, result_ref) + + def test_empty_df_unboxed_get_index_1(self): + def test_impl(df): + return df.index + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = test_impl(df) + pd.testing.assert_index_equal(result, result_ref) + + def test_empty_df_unboxed_get_index_2(self): + + def py_func(df): + return df.index + + @self.jit + def sdc_func(df): + return df._index + + df = pd.DataFrame({}, index=None) + result = sdc_func(df) + result_ref = py_func(df) + pd.testing.assert_index_equal(result, result_ref) + + def test_empty_df_created_get_index_1(self): + def test_impl(): + df = pd.DataFrame({}, index=None) + return df.index + sdc_func = self.jit(test_impl) + 
+ result = sdc_func() + result_ref = test_impl() + pd.testing.assert_index_equal(result, result_ref) + + def test_empty_df_created_get_index_2(self): + + def py_func(): + df = pd.DataFrame({}, index=None) + return df.index + + @self.jit + def sdc_func(): + df = pd.DataFrame({}, index=None) + return df._index + + result = sdc_func() + result_ref = py_func() + pd.testing.assert_index_equal(result, result_ref) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/indexes/test_indexes.py b/sdc/tests/indexes/test_indexes.py index fa8bf6f71..b40dd1008 100644 --- a/sdc/tests/indexes/test_indexes.py +++ b/sdc/tests/indexes/test_indexes.py @@ -28,12 +28,21 @@ import numpy as np import pandas as pd import unittest +from itertools import product -from sdc.tests.indexes import TestRangeIndex, TestInt64Index -from sdc.tests.indexes.index_datagens import _generate_index_param_values +from sdc.tests.indexes import ( + TestEmptyIndex, + TestPositionalIndex, + TestRangeIndex, + TestInt64Index, + ) +from sdc.tests.indexes.index_datagens import _generate_index_param_values, get_sample_index +from sdc.datatypes.indexes import * class TestIndexes( + TestEmptyIndex, + TestPositionalIndex, TestRangeIndex, TestInt64Index ): @@ -155,7 +164,7 @@ def test_indexes_unbox_series_with_index(self): @self.jit def test_impl(S): # TO-DO: this actually includes calling 'index' attribute overload, should really be S._index, - # but this requires separate type (e.g. DefaultIndexType) instead of types.none as default index + # but this requires separate type (e.g. 
PositionalIndexType) instead of types.none as default index return S.index n = 11 @@ -193,7 +202,7 @@ def test_impl(data, index): result_ref = test_impl(series_data, index) pd.testing.assert_series_equal(result, result_ref) - def test_indexes_index_get_series_index(self): + def test_indexes_get_series_index(self): def test_impl(S): return S.index sdc_func = self.jit(test_impl) @@ -206,11 +215,11 @@ def test_impl(S): result_ref = test_impl(S) self.assert_indexes_equal(result, result_ref) - def test_indexes_index_unbox_df_with_index(self): + def test_indexes_unbox_df_with_index(self): @self.jit def test_impl(df): # TO-DO: this actually includes calling 'index' attribute overload, should really be df._index, - # but this requires separate type (e.g. DefaultIndexType) instead of types.none as default index + # but this requires separate type (e.g. PositionalIndexType) instead of types.none as default index return df.index n = 11 @@ -221,7 +230,7 @@ def test_impl(df): result = test_impl(df) self.assert_indexes_equal(result, expected_res) - def test_indexes_index_create_df_with_index(self): + def test_indexes_create_df_with_index(self): @self.jit def test_impl(A, B, index): df = pd.DataFrame({'A': A, 'B': B}, index=index) @@ -235,7 +244,7 @@ def test_impl(A, B, index): result = test_impl(A, B, index) self.assert_indexes_equal(result, expected_res) - def test_indexes_index_box_df_with_index(self): + def test_indexes_box_df_with_index(self): def test_impl(A, B, index): return pd.DataFrame({'A': A, 'B': B}, index=index) sdc_func = self.jit(test_impl) @@ -248,7 +257,7 @@ def test_impl(A, B, index): result_ref = test_impl(A, B, index) pd.testing.assert_frame_equal(result, result_ref) - def test_indexes_index_get_df_index(self): + def test_indexes_get_df_index(self): def test_impl(df): return df.index sdc_func = self.jit(test_impl) @@ -261,6 +270,107 @@ def test_impl(df): result_ref = test_impl(df) self.assert_indexes_equal(result, result_ref) + def 
test_indexes_support_numpy_like_take_by(self): + """ Verifies numpy_like.take can handle SDC index types as indices """ + + from sdc.functions import numpy_like + + def pyfunc(arr, index): + return np.take(arr, index) + + @self.jit + def sdc_func(arr, index): + return numpy_like.take(arr, index) + + n, k = 1000, 200 + np.random.seed(0) + arr = np.arange(n) * 2 + indexes_to_test = [ + get_sample_index(k, PositionalIndexType), + get_sample_index(k, RangeIndexType), + get_sample_index(k, Int64IndexType), + ] + for index in indexes_to_test: + with self.subTest(index=index): + result = sdc_func(arr, index) + result_ref = pyfunc(arr, index) + np.testing.assert_array_equal(result, result_ref) + + def test_indexes_support_series_operator_add(self): + def test_impl(data, index1, index2): + S1 = pd.Series(data, index=index1) + S2 = pd.Series(2 * data + 1, index=index2) + return S1 + S2 + sdc_func = self.jit(test_impl) + + n = 11 + series_data = np.arange(n, dtype=np.float64) + index_params_to_test = [ + None, + pd.RangeIndex(0, -n, -1), + pd.Int64Index(np.arange(n) * 2), + ] + + for index1, index2 in product(index_params_to_test, repeat=2): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(series_data, index1, index2) + result_ref = test_impl(series_data, index1, index2) + pd.testing.assert_series_equal(result, result_ref, check_dtype=False) + + def test_indexes_support_series_operator_lt(self): + def test_impl(data, index1, index2): + S1 = pd.Series(data, index=index1) + S2 = pd.Series(2 * data + 1, index=index2) + return S1 < S2 + sdc_func = self.jit(test_impl) + + n = 11 + series_data = np.arange(n, dtype=np.float64) + index_params_to_test = [ + None, + pd.RangeIndex(0, -n, -1), + pd.Int64Index(np.arange(n) * 2), + ] + + for index1 in index_params_to_test: + index2 = index1 + with self.subTest(index1=index1, index2=index2): + result = sdc_func(series_data, index1, index2) + result_ref = test_impl(series_data, index1, index2) + 
pd.testing.assert_series_equal(result, result_ref, check_dtype=False) + + def test_indexes_support_series_reindexing(self): + from sdc.datatypes.common_functions import sdc_reindex_series + + def pyfunc(data, index, name, by_index): + S = pd.Series(data, index, name=name) + return S.reindex(by_index) + + @self.jit + def sdc_func(data, index, name, by_index): + return sdc_reindex_series(data, index, name, by_index) + + n = 17 + np.random.seed(0) + mask = np.random.choice([True, False], n) + name = 'asdf' + + range_index = pd.RangeIndex(n) + int64_index = pd.Int64Index(np.random.choice(range_index.values, n, replace=False)) + indexes_combinations = [ + (range_index, range_index), + (range_index, range_index[::-1]), + (range_index[::-1], range_index), + (range_index, int64_index), + (int64_index, range_index), + ] + + for index1, index2 in indexes_combinations: + with self.subTest(index1=index1, index2=index2): + result = sdc_func(mask, index1, name, index2) + result_ref = pyfunc(mask, index1, name, index2) + pd.testing.assert_series_equal(result, result_ref) + if __name__ == "__main__": unittest.main() diff --git a/sdc/tests/indexes/test_int64_index.py b/sdc/tests/indexes/test_int64_index.py index 875d6e6dc..552e01f7b 100644 --- a/sdc/tests/indexes/test_int64_index.py +++ b/sdc/tests/indexes/test_int64_index.py @@ -25,6 +25,7 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** +import numba import numpy as np import pandas as pd import unittest @@ -34,12 +35,23 @@ test_global_index_names, _generate_valid_int64_index_data, _generate_int64_indexes_fixed, + get_sample_index, ) from sdc.tests.test_base import TestCase +from sdc.datatypes.indexes import * class TestInt64Index(TestCase): + def test_int64_index_type_inferred(self): + + for data in _generate_valid_int64_index_data(): + for name in test_global_index_names: + index = pd.Int64Index(data, name=name) + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, Int64IndexType) + def test_int64_index_create_and_box(self): def test_impl(data, name): return pd.Int64Index(data, name=name) @@ -265,24 +277,21 @@ def test_impl(index, idx): return index[idx] sdc_func = self.jit(test_impl) - index_len = 11 + n = 17 slices_params = combinations_with_replacement( - [None, 0, -1, index_len // 2, index_len, index_len - 3, index_len + 3, -(index_len + 3)], - 3 + [None, 0, -1, n // 2, n, n - 3, n + 3, -(n + 3)], + 2 ) for data in _generate_valid_int64_index_data(): - for slice_start, slice_stop, slice_step in slices_params: - # slice step cannot be zero - if slice_step == 0: - continue - - idx = slice(slice_start, slice_stop, slice_step) - index = pd.Int64Index(data, name='abc') - with self.subTest(index=index, idx=idx): - result = sdc_func(index, idx) - result_ref = test_impl(index, idx) - pd.testing.assert_index_equal(result, result_ref) + index = pd.Int64Index(data, name='abc') + for slice_start, slice_stop in slices_params: + for slice_step in [1, -1, 2]: + idx = slice(slice_start, slice_stop, slice_step) + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) def test_int64_index_iterator_1(self): def test_impl(index): @@ -317,7 +326,7 @@ def 
test_impl(index): sdc_func = self.jit(test_impl) n = 11 - index = pd.Int64Index(np.arange(n) * 2) + index = get_sample_index(n, Int64IndexType) result = sdc_func(index) result_ref = test_impl(index) np.testing.assert_array_equal(result, result_ref) @@ -422,106 +431,63 @@ def test_impl(index, mask): result_ref = test_impl(index, mask) pd.testing.assert_index_equal(result, result_ref) - def test_int64_index_support_reindexing(self): - from sdc.datatypes.common_functions import sdc_reindex_series + def test_int64_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n, k = 11, 7 + np.random.seed(0) + idx = np.random.choice(np.arange(n), k) + for index in _generate_int64_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) - def pyfunc(data, index, name, by_index): - S = pd.Series(data, index, name=name) - return S.reindex(by_index) + def test_int64_index_reindex_equal_indexes(self): - @self.jit - def sdc_func(data, index, name, by_index): - return sdc_reindex_series(data, index, name, by_index) + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) n = 10 np.random.seed(0) - mask = np.random.choice([True, False], n) - name = 'asdf' index1 = pd.Int64Index(np.arange(n)) - index2 = pd.Int64Index(np.arange(n))[::-1] - result = sdc_func(mask, index1, name, index2) - result_ref = pyfunc(mask, index1, name, index2) - pd.testing.assert_series_equal(result, result_ref) + index2 = pd.Int64Index(np.copy(index1.values)) - def test_int64_index_support_join(self): - from sdc.datatypes.common_functions import sdc_join_series_indexes - - def pyfunc(index1, index2): - return index1.join(index2, how='outer', return_indexers=True) - - @self.jit - def sdc_func(index1, index2): - return sdc_join_series_indexes(index1, index2) - - index1 = pd.Int64Index(np.arange(-5, 5, 1), name='asv') - index2 = 
pd.Int64Index(np.arange(0, 10, 2), name='df') result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - results_names = ['result index', 'left indexer', 'right indexer'] - for i, name in enumerate(results_names): - result_elem = result[i] - result_ref_elem = result_ref[i].values if not i else result_ref[i] - np.testing.assert_array_equal(result_elem, result_ref_elem, f"Mismatch in {name}") - - def test_int64_index_support_take_from(self): - from sdc.datatypes.common_functions import _sdc_take - - def pyfunc(index1, indexes): - return index1.values.take(indexes) - - @self.jit - def sdc_func(index1, indexes): - return _sdc_take(index1, indexes) - - n, k = 1000, 200 - np.random.seed(0) - index = pd.Int64Index(np.arange(n) * 2, name='asd') - indexes = np.random.choice(np.arange(n), n)[:k] - result = sdc_func(index, indexes) - result_ref = pyfunc(index, indexes) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_support_take_by(self): - from sdc.datatypes.common_functions import _sdc_take + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) - def pyfunc(arr, index): - return np.take(arr, index) + def test_int64_index_reindex(self): - @self.jit - def sdc_func(arr, index): - return _sdc_take(arr, index) + def test_impl(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_impl) - n, k = 1000, 200 + n = 10 np.random.seed(0) - arr = np.arange(n) * 2 - index = pd.Int64Index(np.random.choice(np.arange(n), n)[:k]) - result = sdc_func(arr, index) - result_ref = pyfunc(arr, index) - np.testing.assert_array_equal(result, result_ref) - - def test_int64_index_support_astype(self): - from sdc.functions.numpy_like import astype - - def pyfunc(index): - return index.values.astype(np.int64) - - @self.jit - def sdc_func(index): - return astype(index, np.int64) - - n = 100 - index = pd.Int64Index(np.arange(n) * 2, 
name='asd') - np.testing.assert_array_equal(sdc_func(index), pyfunc(index)) - - def test_int64_index_support_array_equal(self): - from sdc.functions.numpy_like import array_equal + index_data = np.arange(n) + index1 = pd.Int64Index(np.random.choice(index_data, n, replace=False)) + reindex_by = [ + pd.RangeIndex(n + 2), + pd.RangeIndex(0, n, 2), + pd.Int64Index(np.random.choice(index_data, n, replace=False)), + pd.Int64Index(np.random.choice([0, 1, 11, 12, 100], n)) + ] - def pyfunc(index1, index2): - return np.array_equal(index1.values, index2.values) + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) - @self.jit - def sdc_func(index1, index2): - return array_equal(index1, index2) + def test_int64_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) n = 11 indexes_to_test = [ @@ -533,40 +499,9 @@ def sdc_func(index1, index2): for index1, index2 in combinations_with_replacement(indexes_to_test, 2): with self.subTest(index1=index1, index2=index2): result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) + result_ref = test_impl(index1, index2) self.assertEqual(result, result_ref) - def test_int64_index_support_copy(self): - from sdc.functions.numpy_like import copy - - @self.jit - def sdc_func(index): - return copy(index) - - for data in _generate_valid_int64_index_data(): - for name in test_global_index_names: - index = pd.Int64Index(data, name=name) - with self.subTest(index=index): - result = sdc_func(index) - pd.testing.assert_index_equal(result, index) - - def test_int64_index_support_append(self): - from sdc.datatypes.common_functions import hpat_arrays_append - - def pyfunc(index1, index2): - return index1.append(index2) - - @self.jit - def sdc_func(index1, index2): - return 
hpat_arrays_append(index1, index2) - - n = 11 - index1 = pd.Int64Index(np.arange(n), name='asv') - index2 = pd.Int64Index(2 * np.arange(n), name='df') - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - np.testing.assert_array_equal(result, result_ref) - def test_int64_index_ravel(self): def test_impl(index): return index.ravel() @@ -578,6 +513,67 @@ def test_impl(index): result_ref = test_impl(index) np.testing.assert_array_equal(result, result_ref) + def test_int64_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_int64_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(n, PositionalIndexType), + get_sample_index(n, RangeIndexType), + get_sample_index(n, Int64IndexType), + ] + for index, other in product( + _generate_int64_indexes_fixed(n), + other_indexes): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) + + def test_int64_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(2 * n, PositionalIndexType), + get_sample_index(2 * n, RangeIndexType), + get_sample_index(2 * n, Int64IndexType), + ] + for index, other in product( + 
_generate_int64_indexes_fixed(n), + other_indexes): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) + if __name__ == "__main__": unittest.main() diff --git a/sdc/tests/indexes/test_positional_index.py b/sdc/tests/indexes/test_positional_index.py new file mode 100644 index 000000000..1585969c6 --- /dev/null +++ b/sdc/tests/indexes/test_positional_index.py @@ -0,0 +1,573 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numpy as np +import pandas as pd +import unittest +from itertools import (combinations_with_replacement, product, chain, ) + +import numba +from sdc.tests.indexes.index_datagens import ( + test_global_index_names, + _generate_positional_range_params, + _generate_positional_indexes_fixed, + get_sample_index, + ) +from sdc.tests.test_base import TestCase +from sdc.extensions.indexes.positional_index_ext import init_positional_index +from sdc.datatypes.indexes import * + + +class TestPositionalIndex(TestCase): + + def test_positional_index_type_inferred(self): + + for params in _generate_positional_range_params(): + start, stop, step = params + for name in test_global_index_names: + index = pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, PositionalIndexType) + + def test_positional_index_create_and_box(self): + @self.jit + def sdc_func(stop, name): + return init_positional_index(stop, name=name) + + for size, name in product([1, 5, 17], test_global_index_names): + with self.subTest(size=size, name=name): + result = sdc_func(size, name) + expected_res = pd.RangeIndex(size, name=name) + pd.testing.assert_index_equal(result, expected_res) + + def test_positional_index_unbox_and_box(self): + def test_impl(index): + return index + sdc_func = 
self.jit(test_impl) + + for params in _generate_positional_range_params(): + start, stop, step = params + for name in test_global_index_names: + index = pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_create_param_name_literal_str(self): + @self.jit + def sdc_func(stop): + return init_positional_index(stop, name='index') + + n = 11 + result = sdc_func(n) + expected_res = pd.RangeIndex(n, name='index') + pd.testing.assert_index_equal(result, expected_res) + + def test_positional_index_attribute_start(self): + def test_impl(index): + return index.start + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_stop(self): + def test_impl(index): + return index.stop + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_step(self): + def test_impl(index): + return index.step + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_dtype(self): + def test_impl(index): + return index.dtype + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(11) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_name(self): + def 
test_impl(index): + return index.name + sdc_func = self.jit(test_impl) + + n = 11 + for name in test_global_index_names: + with self.subTest(name=name): + index = pd.RangeIndex(n, name=name) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_len(self): + def test_impl(index): + return len(index) + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_attribute_values(self): + def test_impl(index): + return index.values + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + with self.subTest(index=index): + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_contains(self): + def test_impl(index, value): + return value in index + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(11) + values_to_test = [-5, 15, 1, 11, 5, 6] + for value in values_to_test: + with self.subTest(value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_copy(self): + def test_impl(index, new_name): + return index.copy(name=new_name) + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + start, stop, step = params + for name, new_name in product(test_global_index_names, repeat=2): + index = pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index, new_name=new_name): + result = sdc_func(index, new_name) + result_ref = test_impl(index, new_name) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_getitem_scalar(self): + def test_impl(index, 
idx): + return index[idx] + sdc_func = self.jit(test_impl) + + for params in _generate_positional_range_params(): + index = pd.RangeIndex(*params) + n = len(index) + if not n: # test only non-empty ranges + continue + values_to_test = [-n, n // 2, n - 1] + for idx in values_to_test: + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + self.assertEqual(result, result_ref) + + def test_positional_index_getitem_scalar_idx_bounds(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.RangeIndex(n, name='abc') + values_to_test = [-(n + 1), n] + for idx in values_to_test: + with self.subTest(idx=idx): + with self.assertRaises(Exception) as context: + test_impl(index, idx) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(index, idx) + sdc_exception = context.exception + self.assertIsInstance(sdc_exception, type(pandas_exception)) + self.assertIn("out of bounds", str(sdc_exception)) + + def test_positional_index_getitem_slice(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + index_len = 17 + slices_params = combinations_with_replacement( + [None, 0, -1, index_len // 2, index_len, index_len - 3, index_len + 3, -(index_len + 3)], + 2, + ) + + index = pd.RangeIndex(0, index_len, 1, name='abc') + for slice_start, slice_stop in slices_params: + for slice_step in [1, -1, 2]: + idx = slice(slice_start, slice_stop, slice_step) + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_iterator_1(self): + def test_impl(index): + res = [] + for i, label in enumerate(index): + res.append((i, label)) + return res + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(0, 21, 1) + result = sdc_func(index) + result_ref = 
test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_iterator_2(self): + def test_impl(index): + res = [] + for label in index: + if not label % 2: + res.append(label) + return res + sdc_func = self.jit(test_impl) + + index = pd.RangeIndex(0, 21, 1) + result = sdc_func(index) + result_ref = test_impl(index) + self.assertEqual(result, result_ref) + + def test_positional_index_nparray(self): + def test_impl(index): + return np.array(index) + sdc_func = self.jit(test_impl) + + n = 11 + index = get_sample_index(n, PositionalIndexType) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_eq_index_1(self): + """ Verifies operator.eq implementation for pandas PositionalIndex in a case of equal range sizes """ + def test_impl(index1, index2): + return index1 == index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_positional_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_eq_index_2(self): + """ Verifies operator.eq implementation for pandas PositionalIndex in a case of non equal range sizes """ + def test_impl(index1, index2): + return index1 == index2 + sdc_func = self.jit(test_impl) + + index1 = pd.RangeIndex(11) + index2 = pd.RangeIndex(22) + with self.assertRaises(Exception) as context: + test_impl(index1, index2) + pandas_exception = context.exception + + with self.assertRaises(type(pandas_exception)) as context: + sdc_func(index1, index2) + sdc_exception = context.exception + self.assertIn(str(sdc_exception), str(pandas_exception)) + + def test_positional_index_operator_eq_scalar(self): + """ Verifies operator.eq implementation for pandas PositionalIndex 
and a scalar value """ + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + A = pd.RangeIndex(n) + scalars_to_test = [ + A.start, + float(A.start), + A.start + 1, + (A.start + A.stop) / 2, + A.stop, + ] + for scalar in scalars_to_test: + for swap_operands in (False, True): + # rebind per pass: an in-place 'A, B = B, A' would lose the index after the first scalar + left, right = (scalar, A) if swap_operands else (A, scalar) + with self.subTest(left=left, right=right): + result = np.asarray(sdc_func(left, right)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(left, right) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_eq_nparray(self): + """ Verifies operator.eq implementation for pandas PositionalIndex and a numpy array """ + def test_impl(A, B): + return A == B + sdc_func = self.jit(test_impl) + + n = 11 + for A, B in product( + _generate_positional_indexes_fixed(n), + map(lambda x: np.array(x), _generate_positional_indexes_fixed(n)) + ): + for swap_operands in (False, True): + if swap_operands: + A, B = B, A + with self.subTest(left=A, right=B): + result = np.asarray(sdc_func(A, B)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(A, B) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_ne_index(self): + """ Verifies operator.ne implementation for pandas PositionalIndex in a case of equal range sizes """ + def test_impl(index1, index2): + return index1 != index2 + sdc_func = self.jit(test_impl) + + n = 11 + for index1, index2 in product(_generate_positional_indexes_fixed(n), repeat=2): + with self.subTest(index1=index1, index2=index2): + result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray + result_ref = test_impl(index1, index2) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_operator_is_nounbox(self): + def test_impl_1(*args): + index1 = pd.RangeIndex(*args) + index2 = index1 + return index1 is index2 + sdc_func_1 = self.jit(test_impl_1) + + def test_impl_2(*args): + index1 = 
pd.RangeIndex(*args) + index2 = pd.RangeIndex(*args) + return index1 is index2 + sdc_func_2 = self.jit(test_impl_2) + + # positive testcase + params = 1, 21, 3 + with self.subTest(subtest="same indexes"): + result = sdc_func_1(*params) + result_ref = test_impl_1(*params) + self.assertEqual(result, result_ref) + self.assertEqual(result, True) + + # negative testcase + with self.subTest(subtest="not same indexes"): + result = sdc_func_2(*params) + result_ref = test_impl_2(*params) + self.assertEqual(result, result_ref) + self.assertEqual(result, False) + + def test_positional_index_getitem_by_mask(self): + def test_impl(index, mask): + return index[mask] + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + mask = np.random.choice([True, False], n) + for index in _generate_positional_indexes_fixed(n): + result = sdc_func(index, mask) + result_ref = test_impl(index, mask) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) + + n, k = 11, 7 + np.random.seed(0) + idx = np.random.choice(np.arange(n), k) + for index in _generate_positional_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) + + n = 11 + self_indexes = list(chain( + _generate_positional_indexes_fixed(n), + _generate_positional_indexes_fixed(2 * n) + )) + + all_positional_indexes = list(_generate_positional_indexes_fixed(n)) + other_indexes = chain( + all_positional_indexes, + map(lambda x: pd.Int64Index(x), all_positional_indexes), + ) + + for index1, index2 in product(self_indexes, other_indexes): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + 
self.assertEqual(result, result_ref) + + def test_positional_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) + + n = 11 + index = pd.RangeIndex(n) + result = sdc_func(index) + result_ref = test_impl(index) + np.testing.assert_array_equal(result, result_ref) + + def test_positional_index_reindex_equal_indexes(self): + + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) + + n = 20 + np.random.seed(0) + index1 = pd.RangeIndex(0, n, 1) + index2 = index1.copy(deep=True) + + result = sdc_func(index1, index2) + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_positional_index_reindex(self): + + def test_impl(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_impl) + + n = 20 + np.random.seed(0) + index1 = pd.RangeIndex(0, n, 1) + reindex_by = [ + pd.RangeIndex(n + 2), + pd.RangeIndex(0, n, 2), + pd.Int64Index(np.random.choice(index1.values, n, replace=False)), + pd.Int64Index(np.random.choice([0, 1, 11, 12, 100], n)) + ] + + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) + + def test_positional_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) + + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_positional_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, 
value) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(n, PositionalIndexType), + get_sample_index(n, RangeIndexType), + get_sample_index(n, Int64IndexType), + ] + for index, other in product( + _generate_positional_indexes_fixed(n), + other_indexes): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) + + def test_positional_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) + sdc_func = self.jit(test_impl) + + n = 11 + other_indexes = [ + get_sample_index(2 * n, PositionalIndexType), + get_sample_index(2 * n, RangeIndexType), + get_sample_index(2 * n, Int64IndexType), + ] + for index, other in product( + _generate_positional_indexes_fixed(n), + other_indexes): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/tests/indexes/test_range_index.py b/sdc/tests/indexes/test_range_index.py index e9369c9f7..1bc7a34c8 100644 --- a/sdc/tests/indexes/test_range_index.py +++ b/sdc/tests/indexes/test_range_index.py @@ -25,31 +25,44 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** +import numba import numpy as np import pandas as pd import unittest -from itertools import (combinations_with_replacement, product, ) +from itertools import (combinations_with_replacement, product, chain, ) from numba.core.errors import TypingError from sdc.tests.indexes.index_datagens import ( test_global_index_names, - _generate_valid_range_params, + _generate_custom_range_params, _generate_range_indexes_fixed, - _generate_index_param_values, + _generate_custom_range_indexes_fixed, + get_sample_index ) from sdc.tests.test_base import TestCase from sdc.utilities.sdc_typing_utils import kwsparams2list from sdc.tests.test_series import _make_func_from_text +from sdc.datatypes.indexes import * class TestRangeIndex(TestCase): + def test_range_index_type_inferred(self): + + for params in _generate_custom_range_params(): + start, stop, step = params + for name in test_global_index_names: + index = pd.RangeIndex(start, stop, step, name=name) + with self.subTest(index=index): + native_index_type = numba.typeof(index) + self.assertIsInstance(native_index_type, RangeIndexType) + def test_range_index_create_and_box(self): def test_impl(start, stop, step, name): return pd.RangeIndex(start, stop, step, name=name) sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params for name in test_global_index_names: with self.subTest(start=start, stop=stop, step=step, name=name): @@ -62,7 +75,7 @@ def test_impl(index): return index sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params for name in test_global_index_names: index = pd.RangeIndex(start, stop, step, name=name) @@ -189,7 +202,7 @@ def test_impl(*args): return index.start sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + 
for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -202,7 +215,7 @@ def test_impl(*args): return index.stop sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -215,7 +228,7 @@ def test_impl(*args): return index.step sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -251,7 +264,7 @@ def test_impl(*args): return len(index) sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params with self.subTest(start=start, stop=stop, step=step): result = sdc_func(*params) @@ -263,7 +276,7 @@ def test_impl(index): return index.values sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): index = pd.RangeIndex(*params) with self.subTest(index=index): result = sdc_func(index) @@ -288,7 +301,7 @@ def test_impl(index, new_name): return index.copy(name=new_name) sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): start, stop, step = params for name, new_name in product(test_global_index_names, repeat=2): index = pd.RangeIndex(start, stop, step, name=name) @@ -302,7 +315,7 @@ def test_impl(index, idx): return index[idx] sdc_func = self.jit(test_impl) - for params in _generate_valid_range_params(): + for params in _generate_custom_range_params(): index = pd.RangeIndex(*params) n = len(index) if not n: # test only non-empty ranges @@ -320,7 +333,7 @@ def test_impl(index, 
idx): sdc_func = self.jit(test_impl) n = 11 - index = pd.RangeIndex(n, name='abc') + index = pd.RangeIndex(start=0, stop=-n, step=-1, name='abc') values_to_test = [-(n + 1), n] for idx in values_to_test: with self.subTest(idx=idx): @@ -339,22 +352,19 @@ def test_impl(index, idx): return index[idx] sdc_func = self.jit(test_impl) - index_len = 17 - start_values, step_values = [0, 5, -5], [1, 2, 7] + n = 17 slices_params = combinations_with_replacement( - [None, 0, -1, index_len // 2, index_len, index_len - 3, index_len + 3, -(index_len + 3)], + [None, 0, -1, n // 2, n, n - 3, n + 3, -(n + 3)], 2 ) - - for start, step, slice_step in product(start_values, step_values, step_values): - stop = start + index_len + for index in _generate_custom_range_indexes_fixed(n): for slice_start, slice_stop in slices_params: - idx = slice(slice_start, slice_stop, slice_step) - index = pd.RangeIndex(start, stop, step, name='abc') - with self.subTest(index=index, idx=idx): - result = sdc_func(index, idx) - result_ref = test_impl(index, idx) - pd.testing.assert_index_equal(result, result_ref) + for slice_step in [1, -1, 2]: + idx = slice(slice_start, slice_stop, slice_step) + with self.subTest(index=index, idx=idx): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) def test_range_index_iterator_1(self): def test_impl(index): @@ -388,7 +398,8 @@ def test_impl(index): return np.array(index) sdc_func = self.jit(test_impl) - index = pd.RangeIndex(1, 21, 3) + n = 11 + index = get_sample_index(n, RangeIndexType) result = sdc_func(index) result_ref = test_impl(index) np.testing.assert_array_equal(result, result_ref) @@ -400,7 +411,7 @@ def test_impl(index1, index2): sdc_func = self.jit(test_impl) n = 11 - for index1, index2 in product(_generate_range_indexes_fixed(n), repeat=2): + for index1, index2 in product(_generate_custom_range_indexes_fixed(n), repeat=2): with self.subTest(index1=index1, index2=index2): result = 
np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray result_ref = test_impl(index1, index2) @@ -455,8 +466,8 @@ def test_impl(A, B): n = 11 for A, B in product( - _generate_range_indexes_fixed(n), - map(lambda x: np.array(x), _generate_range_indexes_fixed(n)) + _generate_custom_range_indexes_fixed(n), + map(lambda x: np.array(x), _generate_custom_range_indexes_fixed(n)) ): for swap_operands in (False, True): if swap_operands: @@ -473,7 +484,7 @@ def test_impl(index1, index2): sdc_func = self.jit(test_impl) n = 11 - for index1, index2 in product(_generate_range_indexes_fixed(n), repeat=2): + for index1, index2 in product(_generate_custom_range_indexes_fixed(n), repeat=2): with self.subTest(index1=index1, index2=index2): result = np.asarray(sdc_func(index1, index2)) # FIXME_Numba#5157: remove np.asarray result_ref = test_impl(index1, index2) @@ -515,143 +526,156 @@ def test_impl(index, mask): n = 11 np.random.seed(0) mask = np.random.choice([True, False], n) - for index in _generate_range_indexes_fixed(n): + for index in _generate_custom_range_indexes_fixed(n): result = sdc_func(index, mask) result_ref = test_impl(index, mask) pd.testing.assert_index_equal(result, result_ref) - def test_range_index_support_reindexing(self): - from sdc.datatypes.common_functions import sdc_reindex_series - - def pyfunc(data, index, name, by_index): - S = pd.Series(data, index, name=name) - return S.reindex(by_index) - - @self.jit - def sdc_func(data, index, name, by_index): - return sdc_reindex_series(data, index, name, by_index) + def test_range_index_getitem_by_array(self): + def test_impl(index, idx): + return index[idx] + sdc_func = self.jit(test_impl) - n = 100 + n, k = 11, 7 np.random.seed(0) - mask = np.random.choice([True, False], n) - name = 'asdf' - index1 = pd.RangeIndex(n) - index2 = index1[::-1] - result = sdc_func(mask, index1, name, index2) - result_ref = pyfunc(mask, index1, name, index2) - pd.testing.assert_series_equal(result, result_ref) - - 
def test_range_index_support_join(self): - from sdc.datatypes.common_functions import sdc_join_series_indexes - - def pyfunc(index1, index2): - return index1.join(index2, how='outer', return_indexers=True) - - @self.jit - def sdc_func(index1, index2): - return sdc_join_series_indexes(index1, index2) - - index1 = pd.RangeIndex(1, 21, 3, name='asv') - index2 = pd.RangeIndex(19, -1, -3, name='df') - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - results_names = ['result index', 'left indexer', 'right indexer'] - for i, name in enumerate(results_names): - result_elem = result[i] - result_ref_elem = result_ref[i].values if not i else result_ref[i] - np.testing.assert_array_equal(result_elem, result_ref_elem, f"Mismatch in {name}") - - def test_range_index_support_take(self): - from sdc.datatypes.common_functions import _sdc_take - - def pyfunc(index1, indexes): - return index1.values.take(indexes) + idx = np.random.choice(np.arange(n), k) + for index in _generate_custom_range_indexes_fixed(n): + result = sdc_func(index, idx) + result_ref = test_impl(index, idx) + pd.testing.assert_index_equal(result, result_ref) - @self.jit - def sdc_func(index1, indexes): - return _sdc_take(index1, indexes) + def test_range_index_ravel(self): + def test_impl(index): + return index.ravel() + sdc_func = self.jit(test_impl) - n, k = 1000, 200 - np.random.seed(0) - index = pd.RangeIndex(stop=3 * n, step=3, name='asd') - indexes = np.random.choice(np.arange(n), n)[:k] - result = sdc_func(index, indexes) - result_ref = pyfunc(index, indexes) + n = 11 + index = pd.RangeIndex(n) + result = sdc_func(index) + result_ref = test_impl(index) np.testing.assert_array_equal(result, result_ref) - def test_range_index_support_astype(self): - from sdc.functions.numpy_like import astype + def test_range_index_equals(self): + def test_impl(index1, index2): + return index1.equals(index2) + sdc_func = self.jit(test_impl) - def pyfunc(index): - return index.values.astype(np.int64) + 
n = 11 + self_indexes = list(chain( + _generate_custom_range_indexes_fixed(n), + _generate_custom_range_indexes_fixed(2 * n) + )) + + all_range_indexes = list(_generate_range_indexes_fixed(n)) + other_indexes = chain( + all_range_indexes, + map(lambda x: pd.Int64Index(x), all_range_indexes), + ) - @self.jit - def sdc_func(index): - return astype(index, np.int64) + for index1, index2 in product(self_indexes, other_indexes): + with self.subTest(index1=index1, index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + self.assertEqual(result, result_ref) - index = pd.RangeIndex(stop=11, name='asd') - np.testing.assert_array_equal(sdc_func(index), pyfunc(index)) + def test_range_index_reindex_equal_indexes(self): - def test_range_index_support_array_equal(self): - from sdc.functions.numpy_like import array_equal + def test_func(index1, index2): + return index1.reindex(index2) + sdc_func = self.jit(test_func) - def pyfunc(index1, index2): - return np.array_equal(index1.values, index2.values) + n = 20 + np.random.seed(0) + index1 = pd.RangeIndex(-1, n, 1) + index2 = index1.copy(deep=True) - @self.jit - def sdc_func(index1, index2): - return array_equal(index1, index2) + result = sdc_func(index1, index2) + result_ref = test_func(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) - for params1, params2 in product(_generate_valid_range_params(), repeat=2): - for name1, name2 in product(test_global_index_names, repeat=2): - index1 = pd.RangeIndex(*params1, name=name1) - index2 = pd.RangeIndex(*params2, name=name2) - with self.subTest(index1=index1, index2=index2): - result = sdc_func(index1, index2) - result_ref = pyfunc(index1, index2) - self.assertEqual(result, result_ref) + def test_range_index_reindex(self): - def test_range_index_support_copy(self): - from sdc.functions.numpy_like import copy + def test_impl(index1, index2): + return 
index1.reindex(index2) + sdc_func = self.jit(test_impl) - @self.jit - def sdc_func(index): - return copy(index) + n = 20 + np.random.seed(0) + index1 = pd.RangeIndex(-1, n, 1) + reindex_by = [ + pd.RangeIndex(0, n + 2, 2), + pd.Int64Index(np.random.choice(index1.values, n, replace=False)), + pd.Int64Index(np.random.choice([0, 1, 11, 12, 100], n)) + ] - for params in _generate_valid_range_params(): - for name in test_global_index_names: - index = pd.RangeIndex(*params, name=name) - with self.subTest(index=index): - result = sdc_func(index) - pd.testing.assert_index_equal(result, index) + for index2 in reindex_by: + with self.subTest(index2=index2): + result = sdc_func(index1, index2) + result_ref = test_impl(index1, index2) + pd.testing.assert_index_equal(result[0], result_ref[0]) + np.testing.assert_array_equal(result[1], result_ref[1]) - def test_range_index_support_append(self): - from sdc.datatypes.common_functions import hpat_arrays_append + def test_range_index_take(self): + def test_impl(index, value): + return index.take(value) + sdc_func = self.jit(test_impl) - def pyfunc(index1, index2): - return index1.append(index2) + n = 11 + np.random.seed(0) + index_pos = np.arange(n) + values_to_test = [ + np.random.choice(index_pos, 2*n), + list(np.random.choice(index_pos, n, replace=False)), + pd.RangeIndex(n // 2), + pd.Int64Index(index_pos[n // 2:]) + ] + for index, value in product(_generate_range_indexes_fixed(n), values_to_test): + with self.subTest(index=index, value=value): + result = sdc_func(index, value) + result_ref = test_impl(index, value) + pd.testing.assert_index_equal(result, result_ref) - @self.jit - def sdc_func(index1, index2): - return hpat_arrays_append(index1, index2) + def test_range_index_append(self): + def test_impl(index, other): + return index.append(other) + sdc_func = self.jit(test_impl) n = 11 - index1 = pd.RangeIndex(1, 21, 3, name='asv') - index2 = pd.RangeIndex(19, -1, -3, name='df') - result = sdc_func(index1, index2) - result_ref 
= pyfunc(index1, index2) - np.testing.assert_array_equal(result, result_ref) + other_indexes = [ + get_sample_index(n, PositionalIndexType), + get_sample_index(n, RangeIndexType), + get_sample_index(n, Int64IndexType), + ] + for index, other in product( + _generate_range_indexes_fixed(n), + other_indexes): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + pd.testing.assert_index_equal(result, result_ref) - def test_range_index_ravel(self): - def test_impl(index): - return index.ravel() + def test_range_index_join(self): + def test_impl(index, other): + return index.join(other, 'outer', return_indexers=True) sdc_func = self.jit(test_impl) n = 11 - index = pd.RangeIndex(n) - result = sdc_func(index) - result_ref = test_impl(index) - np.testing.assert_array_equal(result, result_ref) + other_indexes = [ + get_sample_index(2 * n, PositionalIndexType), + get_sample_index(2 * n, RangeIndexType), + get_sample_index(2 * n, Int64IndexType), + ] + for index, other in product( + _generate_range_indexes_fixed(n), + other_indexes): + with self.subTest(index=index, other=other): + result = sdc_func(index, other) + result_ref = test_impl(index, other) + # check_names=False, since pandas behavior is not type-stable + pd.testing.assert_index_equal(result[0], result_ref[0], check_names=False) + np.testing.assert_array_equal(result[1], result_ref[1]) + np.testing.assert_array_equal(result[2], result_ref[2]) if __name__ == "__main__": diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index aa90c7f23..23ec45639 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -39,7 +39,7 @@ from pandas.core.indexing import IndexingError import sdc -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation from sdc.tests.gen_test_data import ParquetGenerator from sdc.tests.test_base import TestCase from sdc.tests.test_utils 
import (check_numba_version, @@ -1864,7 +1864,7 @@ def test_df_drop_one_column(self): """ Verifies df.drop handles string literal as columns param """ def test_impl(): df = pd.DataFrame({ - 'A': [1.0, 2.0, np.nan, 1.0], + 'A': np.array([1.0, 2.0, np.nan, 1.0]), 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0] }) diff --git a/sdc/tests/test_date.py b/sdc/tests/test_date.py index 83d001349..99509b74b 100644 --- a/sdc/tests/test_date.py +++ b/sdc/tests/test_date.py @@ -74,16 +74,17 @@ def test_impl(df): df = self._gen_str_date_df() np.testing.assert_array_equal(hpat_func(df), test_impl(df)) + @skip_numba_jit("DatetimeIndex unboxing not supported") def test_datetime_arg(self): def test_impl(A): return A - hpat_func = self.jit(test_impl) + sdc_func = self.jit(test_impl) df = self._gen_str_date_df() A = pd.DatetimeIndex(df['str_date']).to_series() - result = hpat_func(A) + result = sdc_func(A) result_ref = test_impl(A) - np.testing.assert_array_equal(result, result_ref) + pd.testing.assert_series_equal(result, result_ref) @skip_numba_jit def test_datetime_getitem(self): diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index f431aa0d7..4ec29885d 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -62,7 +62,7 @@ gen_frand_array, gen_strlist, _make_func_from_text) -from sdc.datatypes.common_functions import SDCLimitation +from sdc.utilities.sdc_typing_utils import SDCLimitation _cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [ @@ -4792,7 +4792,36 @@ def test_series_cov_impl(s1, s2, min_periods=None): msg = 'Method cov(). 
The object min_periods' self.assertIn(msg, str(raises.exception)) - @skip_numba_jit + def test_series_div_special(self): + @self.jit + def test_func(S1, S2): + return S1.div(S2) + # return S1 + S2 + + S1 = pd.Series( + np.arange(12), + index=pd.RangeIndex(start=0, stop=12, step=1) + ) + S2 = pd.Series( + # [1.1, 0.3, np.nan, 1., np.inf, 0., 1.1, np.nan, 2.2, np.inf, 2., 2.], + np.arange(12), + index=pd.RangeIndex(start=0, stop=12, step=1) + ) + + res = test_func(S1, S2) + + def test_series_get_index(self): + @self.jit + def test_func(S1): + return S1._index.values + + S1 = pd.Series( + np.arange(12), + index=pd.RangeIndex(start=0, stop=12, step=1) + ) + + res = test_func(S1) + def test_series_pct_change(self): def test_series_pct_change_impl(S, periods, method): return S.pct_change(periods=periods, fill_method=method, limit=None, freq=None) @@ -5005,7 +5034,7 @@ def test_impl(S, idx, value): 'not a Boolean or integer indexer or a Slice. Given: self.index={}, idx={}' with self.assertRaises(TypingError) as raises: hpat_func(S, idx, value) - msg = msg_tmpl.format('none', 'unicode_type') + msg = msg_tmpl.format('PositionalIndexType(False)', 'unicode_type') self.assertIn(msg, str(raises.exception)) def test_series_istitle_str(self): diff --git a/sdc/tests/test_series_ops.py b/sdc/tests/test_series_ops.py index cbf9782b2..5dbda0d42 100644 --- a/sdc/tests/test_series_ops.py +++ b/sdc/tests/test_series_ops.py @@ -1157,6 +1157,22 @@ def test_impl(a, b, value): result_ref = test_impl(S1, scalar, fill_value) pd.testing.assert_series_equal(result, result_ref) + @skip_numba_jit("Expected to fail due to type-stability of index operations") + def test_series_operator_add_index_type_check(self): + def test_impl(S1, S2): + return S1 + S2 + hpat_func = self.jit(test_impl) + + n = 11 + series_data = np.arange(n, dtype=np.float64) + index_data = pd.RangeIndex(n, 0, -1) + S1 = pd.Series(series_data, index_data) + S2 = pd.Series(2 * series_data + 1, index_data) + result = hpat_func(S1, 
S2) + result_ref = test_impl(S1, S2) + pd.testing.assert_series_equal(result, result_ref) + pd.testing.assert_index_equal(result.index, result_ref.index, exact=True) + if __name__ == "__main__": unittest.main() diff --git a/sdc/utilities/sdc_typing_utils.py b/sdc/utilities/sdc_typing_utils.py index 1d489d17f..b1b6a8ebf 100644 --- a/sdc/utilities/sdc_typing_utils.py +++ b/sdc/utilities/sdc_typing_utils.py @@ -23,7 +23,6 @@ # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** - """ | This file contains SDC utility functions related to typing compilation phase @@ -39,19 +38,34 @@ from numba.np import numpy_support from sdc.str_arr_type import string_array_type -from sdc.datatypes.range_index_type import RangeIndexType -from sdc.datatypes.int64_index_type import Int64IndexType +from sdc.datatypes.indexes import * from sdc.str_arr_ext import StringArrayType +sdc_old_index_types = (types.Array, StringArrayType, ) sdc_pandas_index_types = ( - types.NoneType, - types.Array, - StringArrayType, + EmptyIndexType, + PositionalIndexType, RangeIndexType, Int64IndexType, + ) + sdc_old_index_types + +sdc_indexes_range_like = ( + PositionalIndexType, + RangeIndexType, + ) + +# TO-DO: support caching of data allocated for range indexes at request for .values +sdc_indexes_wo_values_cache = ( + EmptyIndexType, + PositionalIndexType, + RangeIndexType, ) +sdc_pandas_df_column_types = ( + types.Array, + StringArrayType, + ) class TypeChecker: """ @@ -102,6 +116,11 @@ def check(self, data, accepted_type, name=''): self.raise_exc(data, accepted_type.__name__, name=name) +class SDCLimitation(Exception): + """Exception to be raised in case of SDC limitation""" + pass + + def kwsparams2list(params): """Convert parameters dict to a list of string of a format 'key=value'""" return ['{}={}'.format(k, v) for k, v in params.items()] @@ -193,17 +212,17 @@ 
def find_common_dtype_from_numpy_dtypes(array_types, scalar_types): return numba_common_dtype -def find_index_common_dtype(self, other): +def find_index_common_dtype(left, right): """Used to find common dtype for indexes of two series and verify if index dtypes are equal""" - self_index_dtype = RangeIndexType.dtype if isinstance(self.index, types.NoneType) else self.index.dtype - other_index_dtype = RangeIndexType.dtype if isinstance(other.index, types.NoneType) else other.index.dtype - index_dtypes_match = self_index_dtype == other_index_dtype + left_index_dtype = left.dtype + right_index_dtype = right.dtype + index_dtypes_match = left_index_dtype == right_index_dtype if not index_dtypes_match: numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( - [self_index_dtype, other_index_dtype], []) + [left_index_dtype, right_index_dtype], []) else: - numba_index_common_dtype = self_index_dtype + numba_index_common_dtype = left_index_dtype return index_dtypes_match, numba_index_common_dtype @@ -224,3 +243,11 @@ def _df_impl_generator(*args, **kwargs): def check_signed_integer(ty): return isinstance(ty, types.Integer) and ty.signed + + +def _check_dtype_param_type(dtype): + """ Returns True if dtype is a valid type for dtype parameter and False otherwise. + Used in RangeIndex ctor and other methods that take dtype parameter. """ + + valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass) + return isinstance(dtype, valid_dtype_types) or dtype is None