diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index d359d2c97..1561d0ed1 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -31,14 +31,17 @@ """ import numpy +import pandas +import numba from numba import types from numba.errors import TypingError from numba.extending import overload from numba import numpy_support import sdc -from sdc.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to) +from sdc.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to, + str_arr_is_na, pre_alloc_string_array, str_arr_set_na) class TypeChecker: @@ -91,7 +94,7 @@ def check(self, data, accepted_type, name=''): def has_literal_value(var, value): - '''Used during typing to check that variable var is a Numba literal value equal to value''' + """Used during typing to check that variable var is a Numba literal value equal to value""" if not isinstance(var, types.Literal): return False @@ -103,7 +106,7 @@ def has_literal_value(var, value): def has_python_value(var, value): - '''Used during typing to check that variable var was resolved as Python type and has specific value''' + """Used during typing to check that variable var was resolved as Python type and has specific value""" if not isinstance(var, type(value)): return False @@ -114,13 +117,18 @@ def has_python_value(var, value): return var == value +def check_index_is_numeric(ty_series): + """Used during typing to check that series has numeric index""" + return isinstance(ty_series.index, types.Array) and isinstance(ty_series.index.dtype, types.Number) + + def hpat_arrays_append(A, B): pass @overload(hpat_arrays_append) def hpat_arrays_append_overload(A, B): - '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A''' + """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A""" if isinstance(A, types.Array): if isinstance(B, 
types.Array): @@ -131,9 +139,7 @@ def _append_single_numeric_impl(A, B): elif isinstance(B, (types.UniTuple, types.List)): # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way # to resolve common dtype of heterogeneous sequence of arrays - np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)] - np_common_dtype = numpy.find_common_type([], np_dtypes) - numba_common_dtype = numpy_support.from_dtype(np_common_dtype) + numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], []) # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime def _append_list_numeric_impl(A, B): @@ -181,3 +187,257 @@ def _append_list_string_array_impl(A, B): return new_data return _append_list_string_array_impl + + +@numba.njit +def _hpat_ensure_array_capacity(new_size, arr): + """ Function ensuring that the size of numpy array is at least as specified + Returns newly allocated array of bigger size with copied elements if existing size is less than requested + """ + + k = len(arr) + if k >= new_size: + return arr + + n = max(k, 1) # start doubling from 1 so the loop terminates when arr is empty + while n < new_size: + n = 2 * n + res = numpy.empty(n, arr.dtype) + res[:k] = arr[:k] + return res + + +def find_common_dtype_from_numpy_dtypes(array_types, scalar_types): + """Used to find common numba dtype for sequences of numba dtypes each representing some numpy dtype""" + np_array_dtypes = [numpy_support.as_dtype(dtype) for dtype in array_types] + np_scalar_dtypes = [numpy_support.as_dtype(dtype) for dtype in scalar_types] + np_common_dtype = numpy.find_common_type(np_array_dtypes, np_scalar_dtypes) + numba_common_dtype = numpy_support.from_dtype(np_common_dtype) + + return numba_common_dtype + + +def hpat_join_series_indexes(left, right): + pass + + +@overload(hpat_join_series_indexes) +def hpat_join_series_indexes_overload(left, right): + """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm""" + + #
TODO: eliminate code duplication by merging implementations for numeric and StringArray + # requires equivalents of numpy.argsort and _hpat_ensure_array_capacity for StringArrays + if (isinstance(left, types.Array) and isinstance(right, types.Array)): + + numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], []) + if isinstance(numba_common_dtype, types.Number): + + def hpat_join_series_indexes_impl(left, right): + + # allocate result arrays + lsize = len(left) + rsize = len(right) + est_total_size = int(1.1 * (lsize + rsize)) + + lidx = numpy.empty(est_total_size, numpy.int64) + ridx = numpy.empty(est_total_size, numpy.int64) + joined = numpy.empty(est_total_size, numba_common_dtype) + + # sort arrays saving the old positions + sorted_left = numpy.argsort(left, kind='mergesort') + sorted_right = numpy.argsort(right, kind='mergesort') + + i, j, k = 0, 0, 0 + while (i < lsize and j < rsize): + joined = _hpat_ensure_array_capacity(k + 1, joined) + lidx = _hpat_ensure_array_capacity(k + 1, lidx) + ridx = _hpat_ensure_array_capacity(k + 1, ridx) + + left_index = left[sorted_left[i]] + right_index = right[sorted_right[j]] + + if (left_index < right_index): + joined[k] = left_index + lidx[k] = sorted_left[i] + ridx[k] = -1 + i += 1 + k += 1 + elif (left_index > right_index): + joined[k] = right_index + lidx[k] = -1 + ridx[k] = sorted_right[j] + j += 1 + k += 1 + else: + # find ends of sequences of equal index values in left and right + ni, nj = i, j + while (ni < lsize and left[sorted_left[ni]] == left_index): + ni += 1 + while (nj < rsize and right[sorted_right[nj]] == right_index): + nj += 1 + + # join the blocks found into results + for s in numpy.arange(i, ni, 1): + block_size = nj - j + to_joined = numpy.repeat(left_index, block_size) + to_lidx = numpy.repeat(sorted_left[s], block_size) + to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) + + joined = _hpat_ensure_array_capacity(k + block_size, joined)
+ lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + + joined[k:k + block_size] = to_joined + lidx[k:k + block_size] = to_lidx + ridx[k:k + block_size] = to_ridx + k += block_size + i = ni + j = nj + + # fill the end of joined with remaining part of left or right + if i < lsize: + block_size = lsize - i + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + ridx[k: k + block_size] = numpy.repeat(-1, block_size) + while i < lsize: + joined[k] = left[sorted_left[i]] + lidx[k] = sorted_left[i] + i += 1 + k += 1 + + elif j < rsize: + block_size = rsize - j + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + lidx[k: k + block_size] = numpy.repeat(-1, block_size) + while j < rsize: + joined[k] = right[sorted_right[j]] + ridx[k] = sorted_right[j] + j += 1 + k += 1 + + return joined[:k], lidx[:k], ridx[:k] + + return hpat_join_series_indexes_impl + + else: + # TODO: support joining indexes with common dtype=object - requires Numba + # support of such numpy arrays in nopython mode, for now just return None + return None + + elif (left == string_array_type and right == string_array_type): + + def hpat_join_series_indexes_impl(left, right): + + # allocate result arrays + lsize = len(left) + rsize = len(right) + est_total_size = int(1.1 * (lsize + rsize)) + + lidx = numpy.empty(est_total_size, numpy.int64) + ridx = numpy.empty(est_total_size, numpy.int64) + + # use Series.sort_values since argsort for StringArrays not implemented + original_left_series = pandas.Series(left) + original_right_series = pandas.Series(right) + + # sort arrays saving the old positions + left_series = original_left_series.sort_values(kind='mergesort') + right_series = 
original_right_series.sort_values(kind='mergesort') + sorted_left = left_series._index + sorted_right = right_series._index + + i, j, k = 0, 0, 0 + while (i < lsize and j < rsize): + lidx = _hpat_ensure_array_capacity(k + 1, lidx) + ridx = _hpat_ensure_array_capacity(k + 1, ridx) + + left_index = left[sorted_left[i]] + right_index = right[sorted_right[j]] + + if (left_index < right_index): + lidx[k] = sorted_left[i] + ridx[k] = -1 + i += 1 + k += 1 + elif (left_index > right_index): + lidx[k] = -1 + ridx[k] = sorted_right[j] + j += 1 + k += 1 + else: + # find ends of sequences of equal index values in left and right + ni, nj = i, j + while (ni < lsize and left[sorted_left[ni]] == left_index): + ni += 1 + while (nj < rsize and right[sorted_right[nj]] == right_index): + nj += 1 + + # join the blocks found into results + for s in numpy.arange(i, ni, 1): + block_size = nj - j + to_lidx = numpy.repeat(sorted_left[s], block_size) + to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) + + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + + lidx[k:k + block_size] = to_lidx + ridx[k:k + block_size] = to_ridx + k += block_size + i = ni + j = nj + + # fill the end of joined with remaining part of left or right + if i < lsize: + block_size = lsize - i + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + ridx[k: k + block_size] = numpy.repeat(-1, block_size) + while i < lsize: + lidx[k] = sorted_left[i] + i += 1 + k += 1 + + elif j < rsize: + block_size = rsize - j + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + lidx[k: k + block_size] = numpy.repeat(-1, block_size) + while j < rsize: + ridx[k] = sorted_right[j] + j += 1 + k += 1 + + # count total number of characters and allocate joined array + total_joined_size = k + num_chars_in_joined = 0 
+ for i in numpy.arange(total_joined_size): + if lidx[i] != -1: + num_chars_in_joined += len(left[lidx[i]]) + elif ridx[i] != -1: + num_chars_in_joined += len(right[ridx[i]]) + + joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined) + + # iterate over joined and fill it with indexes using lidx and ridx indexers + for i in numpy.arange(total_joined_size): + if lidx[i] != -1: + joined[i] = left[lidx[i]] + if (str_arr_is_na(left, lidx[i])): + str_arr_set_na(joined, i) + elif ridx[i] != -1: + joined[i] = right[ridx[i]] + if (str_arr_is_na(right, ridx[i])): + str_arr_set_na(joined, i) + else: + str_arr_set_na(joined, i) + + return joined, lidx[:total_joined_size], ridx[:total_joined_size] # trim spare capacity, as the numeric branch does + + return hpat_join_series_indexes_impl + + return None diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index dc639d94d..9fa9ebbb3 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -41,9 +41,12 @@ import sdc import sdc.datatypes.common_functions as common_functions from sdc.datatypes.common_functions import TypeChecker +from sdc.datatypes.common_functions import (check_index_is_numeric, find_common_dtype_from_numpy_dtypes, + hpat_join_series_indexes) from sdc.datatypes.hpat_pandas_stringmethods_types import StringMethodsType from sdc.hiframes.pd_series_ext import SeriesType -from sdc.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars) +from sdc.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars, string_array_type, + str_arr_is_na, pre_alloc_string_array, str_arr_set_na) from sdc.utils import to_array @@ -3722,3 +3725,132 @@ def hpat_pandas_series_pct_change_impl(self, periods=1, fill_method='pad', limit return pandas.Series(result) return hpat_pandas_series_pct_change_impl + + +@overload(operator.add) +def hpat_pandas_series_operator_add(self, other): + """ + Pandas Series operator :attr:`pandas.Series.add` implementation + + Note:
Currently implemented for numeric Series only. + Differs from Pandas in returning Series with fixed dtype :obj:`float64` + + .. only:: developer + + **Test**: python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op1 + python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op2 + python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op3 + python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op4 + python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_operator_add* + + Parameters + ---------- + series: :obj:`pandas.Series` + Input series + other: :obj:`pandas.Series` or :obj:`scalar` + Series or scalar value to be used as a second argument of binary operation + + Returns + ------- + :obj:`pandas.Series` + The result of the operation + """ + + _func_name = 'Operator add().' + + ty_checker = TypeChecker('Operator add().') + ty_checker.check(self, SeriesType) + + if not isinstance(other, (SeriesType, types.Number)): + ty_checker.raise_exc(other, 'pandas.series or scalar', 'other') + + series_indexes_alignable = False + none_or_numeric_indexes = False + if isinstance(other, SeriesType): + if (other.index == string_array_type and self.index == string_array_type): + series_indexes_alignable = True + + if ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) + and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))): + series_indexes_alignable = True + none_or_numeric_indexes = True + + if isinstance(other, SeriesType) and not series_indexes_alignable: + raise TypingError('{} Not implemented for series with not-alignable indexes. 
\ + Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) + + # specializations for numeric series - TODO: support arithmetic operation on StringArrays + if (isinstance(other, types.Number)): + def hpat_pandas_series_add_scalar_impl(self, other): + return pandas.Series(self._data + other, self._index) + + return hpat_pandas_series_add_scalar_impl + + elif (isinstance(other, SeriesType)): + + # optimization for series with default indexes, that can be aligned differently + if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): + def hpat_pandas_series_add_impl(self, other): + + if (len(self._data) == len(other._data)): + return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64)) + else: + min_data_size = min(len(self._data), len(other._data)) + max_data_size = max(len(self._data), len(other._data)) + new_data = numpy.empty(max_data_size, dtype=numpy.float64) + new_data[:min_data_size] = self._data[:min_data_size] + other._data[:min_data_size] + new_data[min_data_size:] = numpy.repeat(numpy.nan, max_data_size - min_data_size) + + return pandas.Series(new_data, self._index) + + return hpat_pandas_series_add_impl + else: + # for numeric indexes find common dtype to be used when creating joined index + if none_or_numeric_indexes: + ty_left_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype + ty_right_index_dtype = types.int64 if isinstance(other.index, types.NoneType) else other.index.dtype + numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( + [ty_left_index_dtype, ty_right_index_dtype], []) + + def hpat_pandas_series_add_impl(self, other): + left_index, right_index = self.index, other.index + + # check if indexes are equal and series don't have to be aligned + if none_or_numeric_indexes == True: # noqa + if (numpy.array_equal(left_index, right_index)): + return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64), + 
numpy.asarray(left_index, numba_index_common_dtype)) + else: + # TODO: replace with StringArrays comparison + is_index_equal = (len(self._index) == len(other._index) + and num_total_chars(self._index) == num_total_chars(other._index)) + for i in numpy.arange(len(self._index)): + if (self._index[i] != other._index[i] + or str_arr_is_na(self._index, i) is not str_arr_is_na(other._index, i)): + is_index_equal = False + + if is_index_equal: + return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64), + self._index) + + # TODO: replace below with core join(how='outer', return_indexers=True) when implemented + joined_index, left_indexer, right_indexer = hpat_join_series_indexes(left_index, right_index) + + joined_index_range = numpy.arange(len(joined_index)) + left_values = numpy.asarray( + [self._data[left_indexer[i]] for i in joined_index_range], + numpy.float64 + ) + left_values[left_indexer == -1] = numpy.nan + + right_values = numpy.asarray( + [other._data[right_indexer[i]] for i in joined_index_range], + numpy.float64 + ) + right_values[right_indexer == -1] = numpy.nan + + return pandas.Series(left_values + right_values, joined_index) + + return hpat_pandas_series_add_impl + + return None diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index f023aea48..f2e28be4a 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -914,15 +914,15 @@ def __init__(self, stype): super(SeriesIatType, self).__init__(name) -# PR135. 
This needs to be commented out -@infer_global(operator.getitem) -class GetItemSeriesIat(AbstractTemplate): - key = operator.getitem +if sdc.config.config_pipeline_hpat_default: + @infer_global(operator.getitem) + class GetItemSeriesIat(AbstractTemplate): + key = operator.getitem - def generic(self, args, kws): - # iat[] is the same as regular getitem - if isinstance(args[0], SeriesIatType): - return GetItemSeries.generic(self, (args[0].stype, args[1]), kws) + def generic(self, args, kws): + # iat[] is the same as regular getitem + if isinstance(args[0], SeriesIatType): + return GetItemSeries.generic(self, (args[0].stype, args[1]), kws) @infer @@ -1031,7 +1031,7 @@ def generic_expand_cumulative_series(self, args, kws): _non_hpat_pipeline_attrs = [ 'resolve_append', 'resolve_combine', 'resolve_corr', 'resolve_cov', 'resolve_dropna', 'resolve_fillna', 'resolve_head', 'resolve_nlargest', - 'resolve_nsmallest', 'resolve_pct_change' + 'resolve_nsmallest', 'resolve_pct_change', 'resolve_loc' ] # use ArrayAttribute for attributes not defined in SeriesAttribute @@ -1047,72 +1047,72 @@ def generic_expand_cumulative_series(self, args, kws): if attr in SeriesAttribute.__dict__: delattr(SeriesAttribute, attr) -# PR135. 
This needs to be commented out -@infer_global(operator.getitem) -class GetItemSeries(AbstractTemplate): - key = operator.getitem - - def generic(self, args, kws): - assert not kws - [in_arr, in_idx] = args - is_arr_series = False - is_idx_series = False - is_arr_dt_index = False - - if not isinstance(in_arr, SeriesType) and not isinstance(in_idx, SeriesType): - return None - - if isinstance(in_arr, SeriesType): - in_arr = series_to_array_type(in_arr) - is_arr_series = True - if in_arr.dtype == types.NPDatetime('ns'): - is_arr_dt_index = True - - if isinstance(in_idx, SeriesType): - in_idx = series_to_array_type(in_idx) - is_idx_series = True - - # TODO: dt_index - if in_arr == string_array_type: - # XXX fails due in overload - # compile_internal version results in symbol not found! - # sig = self.context.resolve_function_type( - # operator.getitem, (in_arr, in_idx), kws) - # HACK to get avoid issues for now - if isinstance(in_idx, (types.Integer, types.IntegerLiteral)): - sig = string_type(in_arr, in_idx) - else: - sig = GetItemStringArray.generic(self, (in_arr, in_idx), kws) - elif in_arr == list_string_array_type: - # TODO: split view - # mimic array indexing for list - if (isinstance(in_idx, types.Array) and in_idx.ndim == 1 - and isinstance( - in_idx.dtype, (types.Integer, types.Boolean))): - sig = signature(in_arr, in_arr, in_idx) - else: - sig = numba.typing.collections.GetItemSequence.generic( +if sdc.config.config_pipeline_hpat_default: + @infer_global(operator.getitem) + class GetItemSeries(AbstractTemplate): + key = operator.getitem + + def generic(self, args, kws): + assert not kws + [in_arr, in_idx] = args + is_arr_series = False + is_idx_series = False + is_arr_dt_index = False + + if not isinstance(in_arr, SeriesType) and not isinstance(in_idx, SeriesType): + return None + + if isinstance(in_arr, SeriesType): + in_arr = series_to_array_type(in_arr) + is_arr_series = True + if in_arr.dtype == types.NPDatetime('ns'): + is_arr_dt_index = True + + if 
isinstance(in_idx, SeriesType): + in_idx = series_to_array_type(in_idx) + is_idx_series = True + + # TODO: dt_index + if in_arr == string_array_type: + # XXX fails due in overload + # compile_internal version results in symbol not found! + # sig = self.context.resolve_function_type( + # operator.getitem, (in_arr, in_idx), kws) + # HACK to get avoid issues for now + if isinstance(in_idx, (types.Integer, types.IntegerLiteral)): + sig = string_type(in_arr, in_idx) + else: + sig = GetItemStringArray.generic(self, (in_arr, in_idx), kws) + elif in_arr == list_string_array_type: + # TODO: split view + # mimic array indexing for list + if (isinstance(in_idx, types.Array) and in_idx.ndim == 1 + and isinstance( + in_idx.dtype, (types.Integer, types.Boolean))): + sig = signature(in_arr, in_arr, in_idx) + else: + sig = numba.typing.collections.GetItemSequence.generic( + self, (in_arr, in_idx), kws) + elif in_arr == string_array_split_view_type: + sig = GetItemStringArraySplitView.generic( self, (in_arr, in_idx), kws) - elif in_arr == string_array_split_view_type: - sig = GetItemStringArraySplitView.generic( - self, (in_arr, in_idx), kws) - else: - out = get_array_index_type(in_arr, in_idx) - sig = signature(out.result, in_arr, out.index) - - if sig is not None: - arg1 = sig.args[0] - arg2 = sig.args[1] - if is_arr_series: - sig.return_type = if_arr_to_series_type(sig.return_type) - arg1 = if_arr_to_series_type(arg1) - if is_idx_series: - arg2 = if_arr_to_series_type(arg2) - sig.args = (arg1, arg2) - # dt_index and Series(dt64) should return Timestamp - if is_arr_dt_index and sig.return_type == types.NPDatetime('ns'): - sig.return_type = pandas_timestamp_type - return sig + else: + out = get_array_index_type(in_arr, in_idx) + sig = signature(out.result, in_arr, out.index) + + if sig is not None: + arg1 = sig.args[0] + arg2 = sig.args[1] + if is_arr_series: + sig.return_type = if_arr_to_series_type(sig.return_type) + arg1 = if_arr_to_series_type(arg1) + if is_idx_series: + arg2 
= if_arr_to_series_type(arg2) + sig.args = (arg1, arg2) + # dt_index and Series(dt64) should return Timestamp + if is_arr_dt_index and sig.return_type == types.NPDatetime('ns'): + sig.return_type = pandas_timestamp_type + return sig @infer_global(operator.setitem) @@ -1239,10 +1239,11 @@ def generic(self, args, kws): return series_op_generic(SeriesUnaryOpUfuncs, self, args, kws) -# TODO: change class name to Series in install_operations -SeriesOpUfuncs.install_operations() -SeriesInplaceOpUfuncs.install_operations() -SeriesUnaryOpUfuncs.install_operations() +if sdc.config.config_pipeline_hpat_default: + # TODO: change class name to Series in install_operations + SeriesOpUfuncs.install_operations() + SeriesInplaceOpUfuncs.install_operations() + SeriesUnaryOpUfuncs.install_operations() class Series_Numpy_rules_ufunc(Numpy_rules_ufunc): diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 0d47f70dd..3f836780c 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -33,7 +33,7 @@ import numpy as np import pyarrow.parquet as pq import sdc -from itertools import islice, permutations, product +from itertools import islice, permutations, product, combinations from sdc.tests.test_base import TestCase from sdc.tests.test_utils import ( count_array_REPs, count_parfor_REPs, count_array_OneDs, get_start_end, @@ -1050,6 +1050,7 @@ def test_series_op1(self): test_impl = _make_func_use_binop1(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 df = pd.DataFrame({'A': np.arange(1, n), 'B': np.ones(n - 1)}) pd.testing.assert_series_equal(hpat_func(df.A, df.B), test_impl(df.A, df.B), check_names=False) @@ -1062,6 +1063,7 @@ def test_series_op2(self): test_impl = _make_func_use_binop1(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 if platform.system() == 'Windows' and not 
IS_32BITS: df = pd.DataFrame({'A': np.arange(1, n, dtype=np.int64)}) @@ -1069,26 +1071,30 @@ def test_series_op2(self): df = pd.DataFrame({'A': np.arange(1, n)}) pd.testing.assert_series_equal(hpat_func(df.A, 1), test_impl(df.A, 1), check_names=False) + @skip_numba_jit('Not implemented in new-pipeline yet') def test_series_op3(self): - arithmetic_binops = ('+', '-', '*', '/', '//', '%', '**') + arithmetic_binops = ('+=', '-=', '*=', '/=', '//=', '%=', '**=') for operator in arithmetic_binops: test_impl = _make_func_use_binop2(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 - df = pd.DataFrame({'A': np.arange(1, n), 'B': np.ones(n - 1)}) + df = pd.DataFrame({'A': np.arange(1, n, dtype=np.float64), 'B': np.ones(n - 1)}) pd.testing.assert_series_equal(hpat_func(df.A, df.B), test_impl(df.A, df.B), check_names=False) + @skip_numba_jit('Not implemented in new-pipeline yet') def test_series_op4(self): - arithmetic_binops = ('+', '-', '*', '/', '//', '%', '**') + arithmetic_binops = ('+=', '-=', '*=', '/=', '//=', '%=', '**=') for operator in arithmetic_binops: test_impl = _make_func_use_binop2(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 - df = pd.DataFrame({'A': np.arange(1, n)}) + df = pd.DataFrame({'A': np.arange(1, n, dtype=np.float64)}) pd.testing.assert_series_equal(hpat_func(df.A, 1), test_impl(df.A, 1), check_names=False) def test_series_op5(self): @@ -4579,6 +4585,227 @@ def test_series_pct_change_impl(S, periods=1, fill_method='pad', limit=None, fre msg = 'Method pct_change(). 
The object periods' self.assertIn(msg, str(raises.exception)) + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_scalar(self): + """Verifies Series.operator.add implementation for numeric series and scalar second operand""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtype_to_index = {'None': None, + 'int': np.arange(n, dtype='int'), + 'float': np.arange(n, dtype='float'), + 'string': ['aa', 'aa', 'b', 'b', 'cccc', 'dd', 'ddd']} + + int_scalar = 24 + for dtype, index_data in dtype_to_index.items(): + with self.subTest(index_dtype=dtype, index=index_data): + if platform.system() == 'Windows' and not IS_32BITS: + A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) + else: + A = pd.Series(np.arange(n), index=index_data) + pd.testing.assert_series_equal(hpat_func(A, int_scalar), test_impl(A, int_scalar), check_names=False) + + float_scalar = 24.0 + for dtype, index_data in dtype_to_index.items(): + with self.subTest(index_dtype=dtype, index=index_data): + if platform.system() == 'Windows' and not IS_32BITS: + A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) + else: + A = pd.Series(np.arange(n), index=index_data) + ref_result = test_impl(A, float_scalar) + result = hpat_func(A, float_scalar) + pd.testing.assert_series_equal(result, ref_result, check_names=False) + + def test_series_operator_add_numeric_same_index_default(self): + """Verifies implementation of Series.operator.add between two numeric Series + with default indexes and same size""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(dtypes_to_test, 2): + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), dtype=dtype_left) + B = pd.Series(np.arange(n)**2, 
dtype=dtype_right) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @skip_numba_jit + @skip_sdc_jit("TODO: find out why pandas aligning series indexes produces Int64Index when common dtype is float\n" + "AssertionError: Series.index are different\n" + "Series.index classes are not equivalent\n" + "[left]: Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dtype='float64')\n" + "[right]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')\n") + def test_series_operator_add_numeric_same_index_numeric(self): + """Verifies implementation of Series.operator.add between two numeric Series + with the same numeric indexes of different dtypes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(dtypes_to_test, 2): + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) + B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_same_index_numeric_fixme(self): + """ Same as test_series_operator_add_same_index_numeric but with w/a for the problem. 
+ Can be deleted when the latter is fixed """ + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + int_dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(int_dtypes_to_test, 2): + # FIXME: skip the sub-test if one of the dtypes is float and the other is integer + if not (np.issubdtype(dtype_left, np.integer) and np.issubdtype(dtype_right, np.integer) + or np.issubdtype(dtype_left, np.float) and np.issubdtype(dtype_right, np.float)): + continue + + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) + B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_same_index_str(self): + """Verifies implementation of Series.operator.add between two numeric Series with the same string indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + A = pd.Series(np.arange(n), index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) + B = pd.Series(np.arange(n)**2, index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_align_index_int(self): + """Verifies implementation of Series.operator.add between two numeric Series with non-equal integer indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = [0, 1, 1, 2, 3, 3, 3, 4, 6, 8, 9] + index_B = [0, 1, 1, 3, 4, 4, 5, 5, 6, 6, 9] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = 
pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_align_index_str(self): + """Verifies implementation of Series.operator.add between two numeric Series with non-equal string indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = ['', '', 'aa', 'aa', 'ae', 'ae', 'b', 'ccc', 'cccc', 'oo', 's'] + index_B = ['', '', 'aa', 'aa', 'cc', 'cccc', 'e', 'f', 'h', 'oo', 's'] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_numba_jit('TODO: fix Series.sort_values to handle both None and '' in string series') + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_align_index_str_fixme(self): + """Same as test_series_operator_add_align_index_str but with None values in string indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = ['', '', 'aa', 'aa', 'ae', 'b', 'ccc', 'cccc', 'oo', None, None] + index_B = ['', '', 'aa', 'aa', 'cccc', 'f', 'h', 'oo', 's', None, None] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_align_index_other_dtype(self): 
+ """Verifies implementation of Series.operator.add between two numeric Series + with non-equal integer indexes of different dtypes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + A = pd.Series(np.arange(3*n), index=np.arange(-n, 2*n, 1, dtype=np.int64)) + B = pd.Series(np.arange(3*n)**2, index=np.arange(0, 3*n, 1, dtype=np.float64)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_sdc_jit('Arithmetic operations on Series with different sizes are not supported in old-style') + def test_series_operator_add_numeric_diff_series_sizes(self): + """Verifies implementation of Series.operator.add between two numeric Series with different sizes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + size_A, size_B = 7, 25 + A = pd.Series(np.arange(size_A)) + B = pd.Series(np.arange(size_B)**2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_sdc_jit('Arithmetic operations on Series requiring alignment of indexes are not supported in old-style') + def test_series_operator_add_align_index_int_capacity(self): + """Verifies implementation of Series.operator.add and alignment of numeric indexes of large size""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 20000 + np.random.seed(0) + index1 = np.random.randint(-30, 30, n) + index2 = np.random.randint(-30, 30, n) + A = pd.Series(np.random.ranf(n), index=index1) + B = pd.Series(np.random.ranf(n), index=index2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_numba_jit + @skip_sdc_jit('Test hangs due to a call of Series.sort_values') + def test_series_operator_add_align_index_str_capacity(self): + """Verifies implementation of Series.operator.add and alignment of string indexes of large size""" + def test_impl(A, B): + return A + B + hpat_func 
= self.jit(test_impl) + + n = 20000 + np.random.seed(0) + valid_ids = ['', 'aaa', 'a', 'b', 'ccc', 'ef', 'ff', 'fff', 'fa', 'dddd'] + index1 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] + index2 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] + A = pd.Series(np.random.ranf(n), index=index1) + B = pd.Series(np.random.ranf(n), index=index2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @skip_numba_jit + @skip_sdc_jit("TODO: support arithemetic operations on StringArrays and extend Series.operator.add overload") + def test_series_operator_add_str_same_index_default(self): + """Verifies implementation of Series.operator.add between two string Series + with default indexes and same size""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + A = pd.Series(['a', '', 'ae', 'b', 'cccc', 'oo', None]) + B = pd.Series(['b', 'aa', '', 'b', 'o', None, 'oo']) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + if __name__ == "__main__": unittest.main()