From 0db8ef8a29f4c9003f0681acb99eaf857e3206a7 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Mon, 18 Nov 2019 20:44:51 +0300 Subject: [PATCH 1/4] Add support for Series.operator.add in a new-style --- sdc/datatypes/common_functions.py | 17 + sdc/datatypes/hpat_pandas_series_functions.py | 345 +++++++++++++++++- sdc/hiframes/pd_series_ext.py | 9 +- sdc/tests/test_series.py | 271 +++++++++++++- 4 files changed, 636 insertions(+), 6 deletions(-) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index d359d2c97..631aebccc 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -32,6 +32,7 @@ import numpy +import numba from numba import types from numba.errors import TypingError from numba.extending import overload @@ -181,3 +182,19 @@ def _append_list_string_array_impl(A, B): return new_data return _append_list_string_array_impl + + +@numba.njit +def _hpat_ensure_array_capacity(new_size, arr): + '''Function creating a copy of numpy array with a size more than specified''' + # TODO: replace this function with np.resize when supported by Numba + k = len(arr) + if k > new_size: + return arr + + n = k + while n < new_size: + n = 2 * n + res = numpy.empty(n, arr.dtype) + res[:k] = arr[:k] + return res diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index dc639d94d..30b703940 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -37,13 +37,16 @@ from numba.errors import TypingError from numba.extending import overload, overload_method, overload_attribute from numba import types +from numba import numpy_support import sdc import sdc.datatypes.common_functions as common_functions from sdc.datatypes.common_functions import TypeChecker +from sdc.datatypes.common_functions import _hpat_ensure_array_capacity from sdc.datatypes.hpat_pandas_stringmethods_types import StringMethodsType from sdc.hiframes.pd_series_ext import SeriesType -from sdc.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars) +from sdc.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars, string_array_type, + str_arr_is_na, pre_alloc_string_array, str_arr_set_na) from sdc.utils import to_array @@ -3722,3 +3725,343 @@ def hpat_pandas_series_pct_change_impl(self, periods=1, fill_method='pad', limit return pandas.Series(result) return hpat_pandas_series_pct_change_impl + + +def hpat_join_series_indexes(left, right): + pass + +@overload(hpat_join_series_indexes) +def hpat_join_series_indexes_overload(left, right): + '''Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm''' + + if (isinstance(left, types.Array) and isinstance(right, types.Array)): + np_dtypes = [numpy_support.as_dtype(left.dtype), numpy_support.as_dtype(right.dtype)] + np_common_dtype = numpy.find_common_type([], np_dtypes) + numba_common_dtype = numpy_support.from_dtype(np_common_dtype) + if (isinstance(left.dtype, types.Number) and isinstance(right.dtype, types.Number)): + + def hpat_join_series_indexes_impl(left, right): + + # allocate result arrays + lsize = len(left) + rsize = len(right) + est_total_size = int(1.1 * (lsize + rsize)) + + lidx = numpy.empty(est_total_size, numpy.int64) + ridx = numpy.empty(est_total_size, numpy.int64) + joined = numpy.empty(est_total_size, numba_common_dtype) + + # sort arrays saving the old positions + sorted_left = numpy.argsort(left, kind='mergesort') + sorted_right = numpy.argsort(right, kind='mergesort') + + i, j, k = 0, 0, 0 + while (i < lsize and j < rsize): + joined = _hpat_ensure_array_capacity(k, joined) + lidx = _hpat_ensure_array_capacity(k, lidx) + ridx = _hpat_ensure_array_capacity(k, ridx) + + left_index = left[sorted_left[i]] + right_index = right[sorted_right[j]] + + if (left_index < right_index): + joined[k] = left_index + lidx[k] = sorted_left[i] + ridx[k] = -1 + i += 1 + k += 1 + elif (left_index > right_index): + joined[k] = right_index + lidx[k] = -1 + ridx[k] = sorted_right[j] + j += 1 + k += 1 + else: + # find ends of sequences of equivalent index values in left and right + ni, nj = i, j + while (ni < lsize and left[sorted_left[ni]] == left_index): + ni += 1 + while (nj < rsize and right[sorted_right[nj]] == right_index): + nj += 1 + + # join the blocks found into results + for s in numpy.arange(i, ni, 1): + block_size = nj - j + to_joined = numpy.repeat(left_index, block_size) + to_lidx = numpy.repeat(sorted_left[s], block_size) + to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) + + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + + joined[k:k + block_size] = to_joined + lidx[k:k + block_size] = to_lidx + ridx[k:k + block_size] = to_ridx + k += block_size + i = ni + j = nj + + # fill the end of joined with remaining part of left or right + if i < lsize: + block_size = lsize - i + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + ridx[k: k + block_size] = numpy.repeat(-1, block_size) + while i < lsize: + joined[k] = left[sorted_left[i]] + lidx[k] = sorted_left[i] + i += 1 + k += 1 + + elif j < rsize: + block_size = rsize - j + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + lidx[k: k + block_size] = numpy.repeat(-1, block_size) + while j < rsize: + joined[k] = right[sorted_right[j]] + ridx[k] = sorted_right[j] + j += 1 + k += 1 + + return joined[:k], lidx[:k], ridx[:k] + + return hpat_join_series_indexes_impl + + elif (left == string_array_type and right == string_array_type): + + def hpat_join_series_indexes_impl(left, right): + + # allocate result arrays + lsize = len(left) + rsize = len(right) + est_total_size = int(1.1 * (lsize + rsize)) + + lidx = numpy.empty(est_total_size, numpy.int64) + ridx = numpy.empty(est_total_size, numpy.int64) + + # use Series.sort_values since argsort for StringArrays not implemented + original_left_series = pandas.Series(left) + original_right_series = pandas.Series(right) + + # sort arrays saving the old positions + left_series = original_left_series.sort_values(kind='mergesort') + right_series = original_right_series.sort_values(kind='mergesort') + sorted_left = left_series._index + sorted_right = right_series._index + + i, j, k = 0, 0, 0 + while (i < lsize and j < rsize): + lidx = _hpat_ensure_array_capacity(k, lidx) + ridx = _hpat_ensure_array_capacity(k, ridx) + + left_index = left[sorted_left[i]] + right_index = right[sorted_right[j]] + + if (left_index < right_index): + lidx[k] = sorted_left[i] + ridx[k] = -1 + i += 1 + k += 1 + elif (left_index > right_index): + lidx[k] = -1 + ridx[k] = sorted_right[j] + j += 1 + k += 1 + else: + # find ends of sequences of equivalent index values in left and right + ni, nj = i, j + while (ni < lsize and left[sorted_left[ni]] == left_index): + ni += 1 + while (nj < rsize and right[sorted_right[nj]] == right_index): + nj += 1 + + # join the blocks found into results + for s in numpy.arange(i, ni, 1): + block_size = nj - j + to_lidx = numpy.repeat(sorted_left[s], block_size) + to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) + + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + + lidx[k:k + block_size] = to_lidx + ridx[k:k + block_size] = to_ridx + k += block_size + i = ni + j = nj + + # fill the end of joined with remaining part of left or right + if i < lsize: + block_size = lsize - i + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + ridx[k: k + block_size] = numpy.repeat(-1, block_size) + while i < lsize: + lidx[k] = sorted_left[i] + i += 1 + k += 1 + + elif j < rsize: + block_size = rsize - j + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + lidx[k: k + block_size] = numpy.repeat(-1, block_size) + while j < rsize: + ridx[k] = sorted_right[j] + j += 1 + k += 1 + + # count total number of characters and allocate joined array + total_joined_size = k + num_chars_in_joined = 0 + for i in numpy.arange(total_joined_size): + if lidx[i] != -1: + num_chars_in_joined += len(left[lidx[i]]) + elif ridx[i] != -1: + num_chars_in_joined += len(right[ridx[i]]) + + joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined) + + # iterate over joined and fill it with indexes using lidx and ridx indexers + for i in numpy.arange(total_joined_size): + if lidx[i] != -1: + joined[i] = left[lidx[i]] + if (str_arr_is_na(left, lidx[i])): + str_arr_set_na(joined, i) + elif ridx[i] != -1: + joined[i] = right[ridx[i]] + if (str_arr_is_na(right, ridx[i])): + str_arr_set_na(joined, i) + else: + str_arr_set_na(joined, i) + + return joined, lidx, ridx + + return hpat_join_series_indexes_impl + + +@overload(operator.add) +def hpat_pandas_series_operator_add(self, other): + """ + Pandas Series operator :attr:`pandas.Series.add` implementation + + .. only:: developer + + **Test**: python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op1 + python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op2 + python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op3 + python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op4 + python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_operator_add* + + Parameters + ---------- + series: :obj:`pandas.Series` + Input series + other: :obj:`pandas.Series` or :obj:`scalar` + Series or scalar value to be used as a second argument of binary operation + + Returns + ------- + :obj:`pandas.Series` + The result of the operation + """ + + _func_name = 'Operator add().' + + if not isinstance(self, SeriesType): + raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) + + if not (isinstance(other, SeriesType) + or isinstance(other, types.Number)): + raise TypingError('{} The object must be a pandas.series or a scalar. Given: {}'.format(_func_name, other)) + + series_indexes_alignable = False + if isinstance(other, SeriesType): + if (other.index == string_array_type and self.index == string_array_type): + series_indexes_alignable = True + + if ((isinstance(self.index, types.NoneType) or + isinstance(self.index, types.Array) and isinstance(self.index.dtype, types.Number)) + and (isinstance(other.index, types.NoneType) or + isinstance(other.index, types.Array) and isinstance(other.index.dtype, types.Number))): + series_indexes_alignable = True + + if isinstance(other, SeriesType) and not series_indexes_alignable: + raise TypingError('{} Not implemented for series with not-alignable indexes. \ + Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) + + if (isinstance(other, types.Number)): + def hpat_pandas_series_add_scalar_impl(self, other): + return pandas.Series(self._data + other, self._index) + + return hpat_pandas_series_add_scalar_impl + + elif (isinstance(other, SeriesType)): + is_numeric_index = isinstance(self.index, (types.Array, types.NoneType)) + + if is_numeric_index: + ty_left_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype + ty_right_index_dtype = types.int64 if isinstance(other.index, types.NoneType) else other.index.dtype + np_index_dtypes = [numpy_support.as_dtype(ty_left_index_dtype), numpy_support.as_dtype(ty_right_index_dtype)] + np_index_common_dtype = numpy.find_common_type([], np_index_dtypes) + numba_index_common_dtype = numpy_support.from_dtype(np_index_common_dtype) + + if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): + def hpat_pandas_series_add_impl(self, other): + + if (len(self._data) == len(other._data)): + return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64)) + else: + min_data_size = min(len(self._data), len(other._data)) + max_data_size = max(len(self._data), len(other._data)) + new_data = numpy.empty(max_data_size, dtype=numpy.float64) + new_data[:min_data_size] = self._data[:min_data_size] + other._data[:min_data_size] + new_data[min_data_size:] = numpy.repeat(numpy.nan, max_data_size - min_data_size) + + return pandas.Series(new_data, self._index) + else: + def hpat_pandas_series_add_impl(self, other): + left_index, right_index = self.index, other.index + + # check if indexes are equal and series don't have to be aligned + if is_numeric_index == True: # noqa + if (numpy.array_equal(left_index, right_index)): + return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64), + numpy.asarray(left_index, numba_index_common_dtype)) + else: + # TODO: replace with StringArrays comparison + is_index_equal = (len(self._index) == len(other._index) + and num_total_chars(self._index) == num_total_chars(other._index)) + for i in numpy.arange(len(self._index)): + if (self._index[i] != other._index[i] + or str_arr_is_na(self._index, i) is not str_arr_is_na(other._index, i)): + is_index_equal = False + + if is_index_equal: + return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64), + self._index) + + # TODO: replace below with core join(how='outer') when implemented + joined_index, left_indexer, right_indexer = hpat_join_series_indexes(left_index, right_index) + + joined_index_range = numpy.arange(len(joined_index)) + left_values = numpy.asarray( + [self._data[left_indexer[i]] for i in joined_index_range], + numpy.float64 + ) + left_values[left_indexer == -1] = numpy.nan + + right_values = numpy.asarray( + [other._data[right_indexer[i]] for i in joined_index_range], + numpy.float64 + ) + right_values[right_indexer == -1] = numpy.nan + + return pandas.Series(left_values + right_values, joined_index) + + + return hpat_pandas_series_add_impl diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index f023aea48..037658f68 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -1239,10 +1239,11 @@ def generic(self, args, kws): return series_op_generic(SeriesUnaryOpUfuncs, self, args, kws) -# TODO: change class name to Series in install_operations -SeriesOpUfuncs.install_operations() -SeriesInplaceOpUfuncs.install_operations() -SeriesUnaryOpUfuncs.install_operations() +if sdc.config.config_pipeline_hpat_default: + # TODO: change class name to Series in install_operations + SeriesOpUfuncs.install_operations() + SeriesInplaceOpUfuncs.install_operations() + SeriesUnaryOpUfuncs.install_operations() class Series_Numpy_rules_ufunc(Numpy_rules_ufunc): diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 0d47f70dd..7c24d167e 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -33,7 +33,7 @@ import numpy as np import pyarrow.parquet as pq import sdc -from itertools import islice, permutations, product +from itertools import islice, permutations, product, combinations from sdc.tests.test_base import TestCase from sdc.tests.test_utils import ( count_array_REPs, count_parfor_REPs, count_array_OneDs, get_start_end, @@ -4579,6 +4579,275 @@ def test_series_pct_change_impl(S, periods=1, fill_method='pad', limit=None, fre msg = 'Method pct_change(). The object periods' self.assertIn(msg, str(raises.exception)) + def test_series_setitem_for_value(self): + def test_impl(S, val): + S[3] = val + return S + + hpat_func = self.jit(test_impl) + S = pd.Series([0, 1, 2, 3, 4]) + value = 50 + result_ref = test_impl(S, value) + result = hpat_func(S, value) + pd.testing.assert_series_equal(result_ref, result) + + def test_series_setitem_for_slice(self): + def test_impl(S, val): + S[2:] = val + return S + + hpat_func = self.jit(test_impl) + S = pd.Series([0, 1, 2, 3, 4]) + value = 50 + result_ref = test_impl(S, value) + result = hpat_func(S, value) + pd.testing.assert_series_equal(result_ref, result) + + def test_series_setitem_for_series(self): + def test_impl(S, ind, val): + S[ind] = val + return S + + hpat_func = self.jit(test_impl) + S = pd.Series([0, 1, 2, 3, 4]) + ind = pd.Series([0, 2, 4]) + value = 50 + result_ref = test_impl(S, ind, value) + result = hpat_func(S, ind, value) + pd.testing.assert_series_equal(result_ref, result) + + def test_series_setitem_unsupported(self): + def test_impl(S, ind, val): + S[ind] = val + return S + + hpat_func = self.jit(test_impl) + S = pd.Series([0, 1, 2, 3, 4, 5]) + ind1 = 5 + ind2 = '3' + value1 = 'ababa' + value2 = 101 + + with self.assertRaises(TypingError) as raises: + hpat_func(S, ind1, value1) + msg = 'Operator setitem(). Value must be one type with series.' + self.assertIn(msg, str(raises.exception)) + + with self.assertRaises(TypingError) as raises: + hpat_func(S, ind2, value2) + msg = 'Operator setitem(). The index must be an Integer, Slice or a pandas.series.' + self.assertIn(msg, str(raises.exception)) + + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_scalar(self): + '''Verifies Series.operator.add implementation for scalar second operand''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtype_to_index = {'None': None, + 'int': np.arange(n, dtype='int'), + 'float': np.arange(n, dtype='float'), + 'string': ['aa', 'aa', 'b', 'b', 'cccc', 'dd', 'ddd']} + + int_scalar = 24 + for dtype, index_data in dtype_to_index.items(): + with self.subTest(index_dtype=dtype, index=index_data): + if platform.system() == 'Windows' and not IS_32BITS: + A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) + else: + A = pd.Series(np.arange(n), index=index_data) + pd.testing.assert_series_equal(hpat_func(A, int_scalar), test_impl(A, int_scalar), check_names=False) + + float_scalar = 24.0 + for dtype, index_data in dtype_to_index.items(): + with self.subTest(index_dtype=dtype, index=index_data): + if platform.system() == 'Windows' and not IS_32BITS: + A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) + else: + A = pd.Series(np.arange(n), index=index_data) + pd.testing.assert_series_equal(hpat_func(A, float_scalar), test_impl(A, float_scalar), check_names=False) + + def test_series_operator_add_same_index_default(self): + '''Verifies implementation of Series.operator.add between two Series with default indexes and same size''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(dtypes_to_test, 2): + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), dtype=dtype_left) + B = pd.Series(np.arange(n)**2, dtype=dtype_right) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @unittest.skip("TODO: find out why pandas aligning series indexes produces Int64Index when common dtype is float\n" + "AssertionError: Series.index are different\n" + "Series.index classes are not equivalent\n" + "[left]: Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dtype='float64')\n" + "[right]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')\n") + def test_series_operator_add_same_index_numeric_fixme(self): + '''Verifies implementation of Series.operator.add between two Series with the same numeric indexes of different dtypes''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(dtypes_to_test, 2): + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) + B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_same_index_numeric(self): + ''' Same as test_series_operator_add_same_index_numeric but with w/a for the problem. + Can be deleted when the latter is fixed ''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + int_dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(int_dtypes_to_test, 2): + # FIXME: skip the sub-test if one of the dtypes is float and the other is integer + if not (np.issubdtype(dtype_left, np.integer) and np.issubdtype(dtype_right, np.integer) + or np.issubdtype(dtype_left, np.float) and np.issubdtype(dtype_right, np.float)): + continue + + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) + B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_same_index_str(self): + '''Verifies implementation of Series.operator.add between two Series with the same string indexes''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + A = pd.Series(np.arange(n), index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) + B = pd.Series(np.arange(n)**2, index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_align_index_int(self): + '''Verifies implementation of Series.operator.add between two Series with non-equal integer indexes''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = [0, 1, 1, 2, 3, 3, 3, 4, 6, 8, 9] + index_B = [0, 1, 1, 3, 4, 4, 5, 5, 6, 6, 9] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_align_index_str(self): + '''Verifies implementation of Series.operator.add between two Series with non-equal string indexes''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = ['', '', 'aa', 'aa', 'ae', 'ae', 'b', 'ccc', 'cccc', 'oo', 's'] + index_B = ['', '', 'aa', 'aa', 'cc', 'cccc', 'e', 'f', 'h', 'oo', 's'] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skip("TODO: fix Series.sort_values to handle both None and '' in string series\n") + def test_series_operator_add_align_index_str_fixme(self): + '''Same as test_series_operator_add_align_index_str but with None values in string indexes''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = ['', '', 'aa', 'aa', 'ae', 'b', 'ccc', 'cccc', 'oo', None, None] + index_B = ['', '', 'aa', 'aa', 'cccc', 'f', 'h', 'oo', 's', None, None] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_align_index_other_dtype(self): + '''Verifies implementation of Series.operator.add between two Series + with non-equal integer indexes of different dtypes''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + A = pd.Series(np.arange(3*n), index=np.arange(-n, 2*n, 1, dtype=np.int64)) + B = pd.Series(np.arange(3*n)**2, index=np.arange(0, 3*n, 1, dtype=np.float64)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with different sizes are not supported in old-style') + def test_series_operator_add_diff_series_sizes(self): + '''Verifies implementation of Series.operator.add between two Series with different sizes''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + size_A, size_B = 7, 25 + A = pd.Series(np.arange(size_A)) + B = pd.Series(np.arange(size_B)**2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series requiring alignment of indexes are not supported in old-style') + def test_series_operator_add_align_index_int_capacity(self): + '''Verifies implementation of Series.operator.add and alignment of numeric indexes of large size''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 20000 + np.random.seed(0) + index1 = np.random.randint(-30, 30, n) + index2 = np.random.randint(-30, 30, n) + A = pd.Series(np.random.ranf(n), index=index1) + B = pd.Series(np.random.ranf(n), index=index2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skip("Test hangs due to a call of Series.sort_values") + def test_series_operator_add_align_index_str_capacity(self): + '''Verifies implementation of Series.operator.add and alignment of string indexes of large size''' + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 20000 + np.random.seed(0) + valid_ids = ['', 'aaa', 'a', 'b', 'ccc', 'ef', 'ff', 'fff', 'fa', 'dddd'] + index1 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] + index2 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] + A = pd.Series(np.random.ranf(n), index=index1) + B = pd.Series(np.random.ranf(n), index=index2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + if __name__ == "__main__": unittest.main() From 80de726e958a3154826d6be0b596b0505c023e55 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Fri, 22 Nov 2019 19:40:12 +0300 Subject: [PATCH 2/4] Applying review comments and addding tests --- sdc/datatypes/common_functions.py | 5 +- sdc/datatypes/hpat_pandas_series_functions.py | 21 +- sdc/tests/test_series.py | 225 ++++++++++++++++++ 3 files changed, 238 insertions(+), 13 deletions(-) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index 631aebccc..a794d78c3 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -121,7 +121,7 @@ def hpat_arrays_append(A, B): @overload(hpat_arrays_append) def hpat_arrays_append_overload(A, B): - '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A''' + """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A""" if isinstance(A, types.Array): if isinstance(B, types.Array): @@ -186,8 +186,7 @@ def _append_list_string_array_impl(A, B): @numba.njit def _hpat_ensure_array_capacity(new_size, arr): - '''Function creating a copy of numpy array with a size more than specified''' - # TODO: replace this function with np.resize when supported by Numba + """Function creating a copy of numpy array with a size more than specified""" k = len(arr) if k > new_size: return arr diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 30b703940..7abb7f9b7 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -3730,6 +3730,7 @@ def hpat_pandas_series_pct_change_impl(self, periods=1, fill_method='pad', limit def hpat_join_series_indexes(left, right): pass + @overload(hpat_join_series_indexes) def hpat_join_series_indexes_overload(left, right): '''Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm''' @@ -3784,7 +3785,7 @@ def hpat_join_series_indexes_impl(left, right): while (nj < rsize and right[sorted_right[nj]] == right_index): nj += 1 - # join the blocks found into results + # join the blocks found into results for s in numpy.arange(i, ni, 1): block_size = nj - j to_joined = numpy.repeat(left_index, block_size) @@ -3972,12 +3973,11 @@ def hpat_pandas_series_operator_add(self, other): _func_name = 'Operator add().' - if not isinstance(self, SeriesType): - raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) + ty_checker = TypeChecker('Operator add().') + ty_checker.check(self, SeriesType) - if not (isinstance(other, SeriesType) - or isinstance(other, types.Number)): - raise TypingError('{} The object must be a pandas.series or a scalar. Given: {}'.format(_func_name, other)) + if not isinstance(other, (SeriesType, types.Number)): + ty_checker.raise_exc(other, 'pandas.series or scalar', 'other') series_indexes_alignable = False if isinstance(other, SeriesType): @@ -3985,15 +3985,16 @@ def hpat_pandas_series_operator_add(self, other): series_indexes_alignable = True if ((isinstance(self.index, types.NoneType) or - isinstance(self.index, types.Array) and isinstance(self.index.dtype, types.Number)) + isinstance(self.index, types.Array) and isinstance(self.index.dtype, types.Number)) and (isinstance(other.index, types.NoneType) or - isinstance(other.index, types.Array) and isinstance(other.index.dtype, types.Number))): + isinstance(other.index, types.Array) and isinstance(other.index.dtype, types.Number))): series_indexes_alignable = True if isinstance(other, SeriesType) and not series_indexes_alignable: raise TypingError('{} Not implemented for series with not-alignable indexes. \ Given: self.index={}, other.index={}'.format(_func_name, self.index, other.index)) + # specializations for numeric series - TODO: support arithmetic operation on StringArrays if (isinstance(other, types.Number)): def hpat_pandas_series_add_scalar_impl(self, other): return pandas.Series(self._data + other, self._index) @@ -4006,7 +4007,8 @@ def hpat_pandas_series_add_scalar_impl(self, other): if is_numeric_index: ty_left_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype ty_right_index_dtype = types.int64 if isinstance(other.index, types.NoneType) else other.index.dtype - np_index_dtypes = [numpy_support.as_dtype(ty_left_index_dtype), numpy_support.as_dtype(ty_right_index_dtype)] + np_index_dtypes = [numpy_support.as_dtype(ty_left_index_dtype), + numpy_support.as_dtype(ty_right_index_dtype)] np_index_common_dtype = numpy.find_common_type([], np_index_dtypes) numba_index_common_dtype = numpy_support.from_dtype(np_index_common_dtype) @@ -4063,5 +4065,4 @@ def hpat_pandas_series_add_impl(self, other): return pandas.Series(left_values + right_values, joined_index) - return hpat_pandas_series_add_impl diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 7c24d167e..95073361a 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -4848,6 +4848,231 @@ def test_impl(A, B): B = pd.Series(np.random.ranf(n), index=index2) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_scalar(self): + """Verifies Series.operator.add implementation for numeric series and scalar second operand""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtype_to_index = {'None': None, + 'int': np.arange(n, dtype='int'), + 'float': np.arange(n, dtype='float'), + 'string': ['aa', 'aa', 'b', 'b', 'cccc', 'dd', 'ddd']} + + int_scalar = 24 + for dtype, index_data in dtype_to_index.items(): + with self.subTest(index_dtype=dtype, index=index_data): + if platform.system() == 'Windows' and not IS_32BITS: + A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) + else: + A = pd.Series(np.arange(n), index=index_data) + pd.testing.assert_series_equal(hpat_func(A, int_scalar), test_impl(A, int_scalar), check_names=False) + + float_scalar = 24.0 + for dtype, index_data in dtype_to_index.items(): + with self.subTest(index_dtype=dtype, index=index_data): + if platform.system() == 'Windows' and not IS_32BITS: + A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) + else: + A = pd.Series(np.arange(n), index=index_data) + ref_result = test_impl(A, float_scalar) + result = hpat_func(A, float_scalar) + pd.testing.assert_series_equal(result, ref_result, check_names=False) + + def test_series_operator_add_numeric_same_index_default(self): + """Verifies implementation of Series.operator.add between two numeric Series + with default indexes and same size""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(dtypes_to_test, 2): + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), dtype=dtype_left) + B = pd.Series(np.arange(n)**2, dtype=dtype_right) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @unittest.skip("TODO: find out why pandas aligning series indexes produces Int64Index when common dtype is float\n" + "AssertionError: Series.index are different\n" + "Series.index classes are not equivalent\n" + "[left]: Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dtype='float64')\n" + "[right]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')\n") + def test_series_operator_add_numeric_same_index_numeric_fixme(self): + """Verifies implementation of Series.operator.add between two numeric Series + with the same numeric indexes of different dtypes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(dtypes_to_test, 2): + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) + B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_same_index_numeric(self): + """ Same as test_series_operator_add_same_index_numeric but with w/a for the problem. + Can be deleted when the latter is fixed """ + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + int_dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) + for dtype_left, dtype_right in combinations(int_dtypes_to_test, 2): + # FIXME: skip the sub-test if one of the dtypes is float and the other is integer + if not (np.issubdtype(dtype_left, np.integer) and np.issubdtype(dtype_right, np.integer) + or np.issubdtype(dtype_left, np.float) and np.issubdtype(dtype_right, np.float)): + continue + + with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): + A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) + B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_same_index_str(self): + """Verifies implementation of Series.operator.add between two numeric Series with the same string indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + A = pd.Series(np.arange(n), index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) + B = pd.Series(np.arange(n)**2, index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_align_index_int(self): + """Verifies implementation of Series.operator.add between two numeric Series with non-equal integer indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = [0, 1, 1, 2, 3, 3, 3, 4, 6, 8, 9] + index_B = [0, 1, 1, 3, 4, 4, 5, 5, 6, 6, 9] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_align_index_str(self): + """Verifies implementation of Series.operator.add between two numeric Series with non-equal string indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = ['', '', 'aa', 'aa', 'ae', 'ae', 'b', 'ccc', 'cccc', 'oo', 's'] + index_B = ['', '', 'aa', 'aa', 'cc', 'cccc', 'e', 'f', 'h', 'oo', 's'] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skip("TODO: fix Series.sort_values to handle both None and '' in string series\n") + def test_series_operator_add_numeric_align_index_str_fixme(self): + """Same as test_series_operator_add_align_index_str but with None values in string indexes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 11 + index_A = ['', '', 'aa', 'aa', 'ae', 'b', 'ccc', 'cccc', 'oo', None, None] + index_B = ['', '', 'aa', 'aa', 'cccc', 'f', 'h', 'oo', 's', None, None] + np.random.shuffle(index_A) + np.random.shuffle(index_B) + A = pd.Series(np.arange(n), index=index_A) + B = pd.Series(np.arange(n)**2, index=index_B) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_align_index_other_dtype(self): + """Verifies implementation of Series.operator.add between two numeric Series + with non-equal integer indexes of different dtypes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 7 + A = pd.Series(np.arange(3*n), index=np.arange(-n, 2*n, 1, dtype=np.int64)) + B = pd.Series(np.arange(3*n)**2, index=np.arange(0, 3*n, 1, dtype=np.float64)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series with different sizes are not supported in old-style') + def test_series_operator_add_numeric_diff_series_sizes(self): + """Verifies implementation of Series.operator.add between two numeric Series with different sizes""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + size_A, size_B = 7, 25 + A = pd.Series(np.arange(size_A)) + B = pd.Series(np.arange(size_B)**2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skipIf(sdc.config.config_pipeline_hpat_default, + 'Arithmetic operations on Series requiring alignment of indexes are not supported in old-style') + def test_series_operator_add_align_index_int_capacity(self): + """Verifies implementation of Series.operator.add and alignment of numeric indexes of large size""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 20000 + np.random.seed(0) + index1 = np.random.randint(-30, 30, n) + index2 = np.random.randint(-30, 30, n) + A = pd.Series(np.random.ranf(n), index=index1) + B = pd.Series(np.random.ranf(n), index=index2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skip("Test hangs due to a call of Series.sort_values") + def test_series_operator_add_align_index_str_capacity(self): + """Verifies implementation of Series.operator.add and alignment of string indexes of large size""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + n = 20000 + np.random.seed(0) + valid_ids = ['', 'aaa', 'a', 'b', 'ccc', 'ef', 'ff', 'fff', 'fa', 'dddd'] + index1 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] + index2 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] + A = pd.Series(np.random.ranf(n), index=index1) + B = pd.Series(np.random.ranf(n), index=index2) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + + @unittest.skip("TODO: support arithemetic operations on StringArrays and extend Series.operator.add overload") + def test_series_operator_add_str_same_index_default(self): + """Verifies implementation of Series.operator.add between two string Series + with default indexes and same size""" + def test_impl(A, B): + return A + B + hpat_func = self.jit(test_impl) + + A = pd.Series(['a', '', 'ae', 'b', 'cccc', 'oo', None]) + B = pd.Series(['b', 'aa', '', 'b', 'o', None, 'oo']) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) + if __name__ == "__main__": unittest.main() From 738d74ad99fbb15005ca646b506168819309bbf0 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Wed, 4 Dec 2019 17:33:31 +0300 Subject: [PATCH 3/4] More comments and refactoring from review --- sdc/datatypes/common_functions.py | 18 +- sdc/datatypes/hpat_pandas_series_functions.py | 3 +- sdc/tests/test_series.py | 265 +++--------------- 3 files changed, 52 insertions(+), 234 deletions(-) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index a794d78c3..f22028a14 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -92,7 +92,7 @@ def check(self, data, accepted_type, name=''): def has_literal_value(var, value): - '''Used during typing to check that variable var is a Numba literal value equal to value''' + """Used during typing to check that variable var is a Numba literal value equal to value""" if not isinstance(var, types.Literal): return False @@ -104,7 +104,7 @@ def has_literal_value(var, value): def has_python_value(var, value): - '''Used during typing to check that variable var was resolved as Python type and has specific value''' + """Used during typing to check that variable var was resolved as Python type and has specific value""" if not isinstance(var, type(value)): return False @@ -115,6 +115,11 @@ def has_python_value(var, value): return var == value +def check_index_is_numeric(ty_series): + """Used during typing to check that series has numeric index""" + return isinstance(ty_series.index, types.Array) and isinstance(ty_series.index.dtype, types.Number) + + def hpat_arrays_append(A, B): pass @@ -197,3 +202,12 @@ def _hpat_ensure_array_capacity(new_size, arr): res = numpy.empty(n, arr.dtype) res[:k] = arr[:k] return res + +def find_common_dtype_for_scalar_numpy_types(dtype1, dtype2): + """Used to find common numba dtype for two numba dtypes each representing some scalar numpy dtype""" + np_dtypes = [numpy_support.as_dtype(dtype1), + numpy_support.as_dtype(dtype2)] + np_common_dtype = numpy.find_common_type([], np_dtypes) + numba_common_dtype = numpy_support.from_dtype(np_common_dtype) + + return numba_common_dtype \ No newline at end of file diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 7abb7f9b7..22361bd87 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -42,7 +42,8 @@ import sdc import sdc.datatypes.common_functions as common_functions from sdc.datatypes.common_functions import TypeChecker -from sdc.datatypes.common_functions import _hpat_ensure_array_capacity +from sdc.datatypes.common_functions import (_hpat_ensure_array_capacity, check_index_is_numeric, + find_common_dtype_for_scalar_numpy_types) from sdc.datatypes.hpat_pandas_stringmethods_types import StringMethodsType from sdc.hiframes.pd_series_ext import SeriesType from sdc.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars, string_array_type, diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 95073361a..3c85a607e 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -4639,10 +4639,9 @@ def test_impl(S, ind, val): self.assertIn(msg, str(raises.exception)) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_scalar(self): - '''Verifies Series.operator.add implementation for scalar second operand''' + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_scalar(self): + """Verifies Series.operator.add implementation for numeric series and scalar second operand""" def test_impl(A, B): return A + B hpat_func = self.jit(test_impl) @@ -4669,10 +4668,13 @@ def test_impl(A, B): A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) else: A = pd.Series(np.arange(n), index=index_data) - pd.testing.assert_series_equal(hpat_func(A, float_scalar), test_impl(A, float_scalar), check_names=False) + ref_result = test_impl(A, float_scalar) + result = hpat_func(A, float_scalar) + pd.testing.assert_series_equal(result, ref_result, check_names=False) - def test_series_operator_add_same_index_default(self): - '''Verifies implementation of Series.operator.add between two Series with default indexes and same size''' + def test_series_operator_add_numeric_same_index_default(self): + """Verifies implementation of Series.operator.add between two numeric Series + with default indexes and same size""" def test_impl(A, B): return A + B hpat_func = self.jit(test_impl) @@ -4685,224 +4687,31 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, dtype=dtype_right) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - @unittest.skip("TODO: find out why pandas aligning series indexes produces Int64Index when common dtype is float\n" - "AssertionError: Series.index are different\n" - "Series.index classes are not equivalent\n" - "[left]: Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dtype='float64')\n" - "[right]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')\n") - def test_series_operator_add_same_index_numeric_fixme(self): - '''Verifies implementation of Series.operator.add between two Series with the same numeric indexes of different dtypes''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 7 - dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) - for dtype_left, dtype_right in combinations(dtypes_to_test, 2): - with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): - A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) - B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_same_index_numeric(self): - ''' Same as test_series_operator_add_same_index_numeric but with w/a for the problem. - Can be deleted when the latter is fixed ''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 7 - int_dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) - for dtype_left, dtype_right in combinations(int_dtypes_to_test, 2): - # FIXME: skip the sub-test if one of the dtypes is float and the other is integer - if not (np.issubdtype(dtype_left, np.integer) and np.issubdtype(dtype_right, np.integer) - or np.issubdtype(dtype_left, np.float) and np.issubdtype(dtype_right, np.float)): - continue - - with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): - A = pd.Series(np.arange(n), index=np.arange(n, dtype=dtype_left)) - B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_same_index_str(self): - '''Verifies implementation of Series.operator.add between two Series with the same string indexes''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 7 - A = pd.Series(np.arange(n), index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) - B = pd.Series(np.arange(n)**2, index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_align_index_int(self): - '''Verifies implementation of Series.operator.add between two Series with non-equal integer indexes''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 11 - index_A = [0, 1, 1, 2, 3, 3, 3, 4, 6, 8, 9] - index_B = [0, 1, 1, 3, 4, 4, 5, 5, 6, 6, 9] - np.random.shuffle(index_A) - np.random.shuffle(index_B) - A = pd.Series(np.arange(n), index=index_A) - B = pd.Series(np.arange(n)**2, index=index_B) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_align_index_str(self): - '''Verifies implementation of Series.operator.add between two Series with non-equal string indexes''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 11 - index_A = ['', '', 'aa', 'aa', 'ae', 'ae', 'b', 'ccc', 'cccc', 'oo', 's'] - index_B = ['', '', 'aa', 'aa', 'cc', 'cccc', 'e', 'f', 'h', 'oo', 's'] - np.random.shuffle(index_A) - np.random.shuffle(index_B) - A = pd.Series(np.arange(n), index=index_A) - B = pd.Series(np.arange(n)**2, index=index_B) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skip("TODO: fix Series.sort_values to handle both None and '' in string series\n") - def test_series_operator_add_align_index_str_fixme(self): - '''Same as test_series_operator_add_align_index_str but with None values in string indexes''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 11 - index_A = ['', '', 'aa', 'aa', 'ae', 'b', 'ccc', 'cccc', 'oo', None, None] - index_B = ['', '', 'aa', 'aa', 'cccc', 'f', 'h', 'oo', 's', None, None] - np.random.shuffle(index_A) - np.random.shuffle(index_B) - A = pd.Series(np.arange(n), index=index_A) - B = pd.Series(np.arange(n)**2, index=index_B) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_align_index_other_dtype(self): - '''Verifies implementation of Series.operator.add between two Series - with non-equal integer indexes of different dtypes''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 7 - A = pd.Series(np.arange(3*n), index=np.arange(-n, 2*n, 1, dtype=np.int64)) - B = pd.Series(np.arange(3*n)**2, index=np.arange(0, 3*n, 1, dtype=np.float64)) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with different sizes are not supported in old-style') - def test_series_operator_add_diff_series_sizes(self): - '''Verifies implementation of Series.operator.add between two Series with different sizes''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - size_A, size_B = 7, 25 - A = pd.Series(np.arange(size_A)) - B = pd.Series(np.arange(size_B)**2) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series requiring alignment of indexes are not supported in old-style') - def test_series_operator_add_align_index_int_capacity(self): - '''Verifies implementation of Series.operator.add and alignment of numeric indexes of large size''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 20000 - np.random.seed(0) - index1 = np.random.randint(-30, 30, n) - index2 = np.random.randint(-30, 30, n) - A = pd.Series(np.random.ranf(n), index=index1) - B = pd.Series(np.random.ranf(n), index=index2) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skip("Test hangs due to a call of Series.sort_values") - def test_series_operator_add_align_index_str_capacity(self): - '''Verifies implementation of Series.operator.add and alignment of string indexes of large size''' - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 20000 - np.random.seed(0) - valid_ids = ['', 'aaa', 'a', 'b', 'ccc', 'ef', 'ff', 'fff', 'fa', 'dddd'] - index1 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] - index2 = [valid_ids[i] for i in np.random.randint(0, len(valid_ids), n)] - A = pd.Series(np.random.ranf(n), index=index1) - B = pd.Series(np.random.ranf(n), index=index2) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_numeric_scalar(self): - """Verifies Series.operator.add implementation for numeric series and scalar second operand""" + def test_series_operator_add_series_dtype_promotion(self): + """Verifies implementation of Series.operator.add between two numeric Series of different dtypes""" def test_impl(A, B): return A + B hpat_func = self.jit(test_impl) n = 7 - dtype_to_index = {'None': None, - 'int': np.arange(n, dtype='int'), - 'float': np.arange(n, dtype='float'), - 'string': ['aa', 'aa', 'b', 'b', 'cccc', 'dd', 'ddd']} - - int_scalar = 24 - for dtype, index_data in dtype_to_index.items(): - with self.subTest(index_dtype=dtype, index=index_data): - if platform.system() == 'Windows' and not IS_32BITS: - A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) - else: - A = pd.Series(np.arange(n), index=index_data) - pd.testing.assert_series_equal(hpat_func(A, int_scalar), test_impl(A, int_scalar), check_names=False) + A = pd.Series(np.array(np.arange(n), dtype=np.int32)) + B = pd.Series(np.array(np.arange(n)**2, dtype=np.float32)) + pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - float_scalar = 24.0 - for dtype, index_data in dtype_to_index.items(): - with self.subTest(index_dtype=dtype, index=index_data): - if platform.system() == 'Windows' and not IS_32BITS: - A = pd.Series(np.arange(n, dtype=np.int64), index=index_data) - else: - A = pd.Series(np.arange(n), index=index_data) - ref_result = test_impl(A, float_scalar) - result = hpat_func(A, float_scalar) - pd.testing.assert_series_equal(result, ref_result, check_names=False) - - def test_series_operator_add_numeric_same_index_default(self): - """Verifies implementation of Series.operator.add between two numeric Series - with default indexes and same size""" - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 7 dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) for dtype_left, dtype_right in combinations(dtypes_to_test, 2): with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): - A = pd.Series(np.arange(n), dtype=dtype_left) - B = pd.Series(np.arange(n)**2, dtype=dtype_right) + A = pd.Series(np.array(np.arange(n), dtype=dtype_left)) + B = pd.Series(np.array(np.arange(n)**2, dtype=dtype_right)) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - @unittest.skip("TODO: find out why pandas aligning series indexes produces Int64Index when common dtype is float\n" - "AssertionError: Series.index are different\n" - "Series.index classes are not equivalent\n" - "[left]: Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dtype='float64')\n" - "[right]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')\n") - def test_series_operator_add_numeric_same_index_numeric_fixme(self): + @skip_numba_jit + @skip_sdc_jit("TODO: find out why pandas aligning series indexes produces Int64Index when common dtype is float\n" + "AssertionError: Series.index are different\n" + "Series.index classes are not equivalent\n" + "[left]: Float64Index([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0], dtype='float64')\n" + "[right]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')\n") + def test_series_operator_add_numeric_same_index_numeric(self): """Verifies implementation of Series.operator.add between two numeric Series with the same numeric indexes of different dtypes""" def test_impl(A, B): @@ -4917,9 +4726,8 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') - def test_series_operator_add_numeric_same_index_numeric(self): + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') + def test_series_operator_add_numeric_same_index_numeric_fixme(self): """ Same as test_series_operator_add_same_index_numeric but with w/a for the problem. Can be deleted when the latter is fixed """ def test_impl(A, B): @@ -4939,8 +4747,7 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, index=np.arange(n, dtype=dtype_right)) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') def test_series_operator_add_numeric_same_index_str(self): """Verifies implementation of Series.operator.add between two numeric Series with the same string indexes""" def test_impl(A, B): @@ -4952,8 +4759,7 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, index=['a', 'c', 'e', 'c', 'b', 'a', 'o']) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') def test_series_operator_add_numeric_align_index_int(self): """Verifies implementation of Series.operator.add between two numeric Series with non-equal integer indexes""" def test_impl(A, B): @@ -4969,8 +4775,7 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, index=index_B) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') def test_series_operator_add_numeric_align_index_str(self): """Verifies implementation of Series.operator.add between two numeric Series with non-equal string indexes""" def test_impl(A, B): @@ -4986,7 +4791,7 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, index=index_B) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skip("TODO: fix Series.sort_values to handle both None and '' in string series\n") + @skip_sdc_jit("TODO: fix Series.sort_values to handle both None and '' in string series") def test_series_operator_add_numeric_align_index_str_fixme(self): """Same as test_series_operator_add_align_index_str but with None values in string indexes""" def test_impl(A, B): @@ -5002,8 +4807,7 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, index=index_B) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with non-default indexes are not supported in old-style') + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') def test_series_operator_add_numeric_align_index_other_dtype(self): """Verifies implementation of Series.operator.add between two numeric Series with non-equal integer indexes of different dtypes""" @@ -5016,8 +4820,7 @@ def test_impl(A, B): B = pd.Series(np.arange(3*n)**2, index=np.arange(0, 3*n, 1, dtype=np.float64)) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series with different sizes are not supported in old-style') + @skip_sdc_jit('Arithmetic operations on Series with different sizes are not supported in old-style') def test_series_operator_add_numeric_diff_series_sizes(self): """Verifies implementation of Series.operator.add between two numeric Series with different sizes""" def test_impl(A, B): @@ -5029,8 +4832,7 @@ def test_impl(A, B): B = pd.Series(np.arange(size_B)**2) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skipIf(sdc.config.config_pipeline_hpat_default, - 'Arithmetic operations on Series requiring alignment of indexes are not supported in old-style') + @skip_sdc_jit('Arithmetic operations on Series requiring alignment of indexes are not supported in old-style') def test_series_operator_add_align_index_int_capacity(self): """Verifies implementation of Series.operator.add and alignment of numeric indexes of large size""" def test_impl(A, B): @@ -5045,7 +4847,8 @@ def test_impl(A, B): B = pd.Series(np.random.ranf(n), index=index2) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skip("Test hangs due to a call of Series.sort_values") + @skip_numba_jit + @skip_sdc_jit('Test hangs due to a call of Series.sort_values') def test_series_operator_add_align_index_str_capacity(self): """Verifies implementation of Series.operator.add and alignment of string indexes of large size""" def test_impl(A, B): From 7c630fab503adf097208ed5f3b4d3c706dfa4109 Mon Sep 17 00:00:00 2001 From: "Kozlov, Alexey" Date: Fri, 6 Dec 2019 16:46:34 +0300 Subject: [PATCH 4/4] Bugfix in indexes join and minor changes --- sdc/datatypes/common_functions.py | 254 ++++++++++++++++- sdc/datatypes/hpat_pandas_series_functions.py | 261 ++---------------- sdc/hiframes/pd_series_ext.py | 148 +++++----- sdc/tests/test_series.py | 98 +------ 4 files changed, 354 insertions(+), 407 deletions(-) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index f22028a14..1561d0ed1 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -31,6 +31,7 @@ """ import numpy +import pandas import numba from numba import types @@ -39,7 +40,8 @@ from numba import numpy_support import sdc -from sdc.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to) +from sdc.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to, + str_arr_is_na, pre_alloc_string_array, str_arr_set_na) class TypeChecker: @@ -137,9 +139,7 @@ def _append_single_numeric_impl(A, B): elif isinstance(B, (types.UniTuple, types.List)): # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way # to resolve common dtype of heterogeneous sequence of arrays - np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)] - np_common_dtype = numpy.find_common_type([], np_dtypes) - numba_common_dtype = numpy_support.from_dtype(np_common_dtype) + numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], []) # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime def _append_list_numeric_impl(A, B): @@ -191,9 +191,12 @@ def _append_list_string_array_impl(A, B): @numba.njit def _hpat_ensure_array_capacity(new_size, arr): - """Function creating a copy of numpy array with a size more than specified""" + """ Function ensuring that the size of numpy array is at least as specified + Returns newly allocated array of bigger size with copied elements if existing size is less than requested + """ + k = len(arr) - if k > new_size: + if k >= new_size: return arr n = k @@ -203,11 +206,238 @@ def _hpat_ensure_array_capacity(new_size, arr): res[:k] = arr[:k] return res -def find_common_dtype_for_scalar_numpy_types(dtype1, dtype2): - """Used to find common numba dtype for two numba dtypes each representing some scalar numpy dtype""" - np_dtypes = [numpy_support.as_dtype(dtype1), - numpy_support.as_dtype(dtype2)] - np_common_dtype = numpy.find_common_type([], np_dtypes) + +def find_common_dtype_from_numpy_dtypes(array_types, scalar_types): + """Used to find common numba dtype for a sequences of numba dtypes each representing some numpy dtype""" + np_array_dtypes = [numpy_support.as_dtype(dtype) for dtype in array_types] + np_scalar_dtypes = [numpy_support.as_dtype(dtype) for dtype in scalar_types] + np_common_dtype = numpy.find_common_type(np_array_dtypes, np_scalar_dtypes) numba_common_dtype = numpy_support.from_dtype(np_common_dtype) - return numba_common_dtype \ No newline at end of file + return numba_common_dtype + + +def hpat_join_series_indexes(left, right): + pass + + +@overload(hpat_join_series_indexes) +def hpat_join_series_indexes_overload(left, right): + """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm""" + + # TODO: eliminate code duplication by merging implementations for numeric and StringArray + # requires equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays + if (isinstance(left, types.Array) and isinstance(right, types.Array)): + + numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], []) + if isinstance(numba_common_dtype, types.Number): + + def hpat_join_series_indexes_impl(left, right): + + # allocate result arrays + lsize = len(left) + rsize = len(right) + est_total_size = int(1.1 * (lsize + rsize)) + + lidx = numpy.empty(est_total_size, numpy.int64) + ridx = numpy.empty(est_total_size, numpy.int64) + joined = numpy.empty(est_total_size, numba_common_dtype) + + # sort arrays saving the old positions + sorted_left = numpy.argsort(left, kind='mergesort') + sorted_right = numpy.argsort(right, kind='mergesort') + + i, j, k = 0, 0, 0 + while (i < lsize and j < rsize): + joined = _hpat_ensure_array_capacity(k + 1, joined) + lidx = _hpat_ensure_array_capacity(k + 1, lidx) + ridx = _hpat_ensure_array_capacity(k + 1, ridx) + + left_index = left[sorted_left[i]] + right_index = right[sorted_right[j]] + + if (left_index < right_index): + joined[k] = left_index + lidx[k] = sorted_left[i] + ridx[k] = -1 + i += 1 + k += 1 + elif (left_index > right_index): + joined[k] = right_index + lidx[k] = -1 + ridx[k] = sorted_right[j] + j += 1 + k += 1 + else: + # find ends of sequences of equal index values in left and right + ni, nj = i, j + while (ni < lsize and left[sorted_left[ni]] == left_index): + ni += 1 + while (nj < rsize and right[sorted_right[nj]] == right_index): + nj += 1 + + # join the blocks found into results + for s in numpy.arange(i, ni, 1): + block_size = nj - j + to_joined = numpy.repeat(left_index, block_size) + to_lidx = numpy.repeat(sorted_left[s], block_size) + to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) + + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + + joined[k:k + block_size] = to_joined + lidx[k:k + block_size] = to_lidx + ridx[k:k + block_size] = to_ridx + k += block_size + i = ni + j = nj + + # fill the end of joined with remaining part of left or right + if i < lsize: + block_size = lsize - i + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + ridx[k: k + block_size] = numpy.repeat(-1, block_size) + while i < lsize: + joined[k] = left[sorted_left[i]] + lidx[k] = sorted_left[i] + i += 1 + k += 1 + + elif j < rsize: + block_size = rsize - j + joined = _hpat_ensure_array_capacity(k + block_size, joined) + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + lidx[k: k + block_size] = numpy.repeat(-1, block_size) + while j < rsize: + joined[k] = right[sorted_right[j]] + ridx[k] = sorted_right[j] + j += 1 + k += 1 + + return joined[:k], lidx[:k], ridx[:k] + + return hpat_join_series_indexes_impl + + else: + # TODO: support joining indexes with common dtype=object - requires Numba + # support of such numpy arrays in nopython mode, for now just return None + return None + + elif (left == string_array_type and right == string_array_type): + + def hpat_join_series_indexes_impl(left, right): + + # allocate result arrays + lsize = len(left) + rsize = len(right) + est_total_size = int(1.1 * (lsize + rsize)) + + lidx = numpy.empty(est_total_size, numpy.int64) + ridx = numpy.empty(est_total_size, numpy.int64) + + # use Series.sort_values since argsort for StringArrays not implemented + original_left_series = pandas.Series(left) + original_right_series = pandas.Series(right) + + # sort arrays saving the old positions + left_series = original_left_series.sort_values(kind='mergesort') + right_series = original_right_series.sort_values(kind='mergesort') + sorted_left = left_series._index + sorted_right = right_series._index + + i, j, k = 0, 0, 0 + while (i < lsize and j < rsize): + lidx = _hpat_ensure_array_capacity(k + 1, lidx) + ridx = _hpat_ensure_array_capacity(k + 1, ridx) + + left_index = left[sorted_left[i]] + right_index = right[sorted_right[j]] + + if (left_index < right_index): + lidx[k] = sorted_left[i] + ridx[k] = -1 + i += 1 + k += 1 + elif (left_index > right_index): + lidx[k] = -1 + ridx[k] = sorted_right[j] + j += 1 + k += 1 + else: + # find ends of sequences of equal index values in left and right + ni, nj = i, j + while (ni < lsize and left[sorted_left[ni]] == left_index): + ni += 1 + while (nj < rsize and right[sorted_right[nj]] == right_index): + nj += 1 + + # join the blocks found into results + for s in numpy.arange(i, ni, 1): + block_size = nj - j + to_lidx = numpy.repeat(sorted_left[s], block_size) + to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) + + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + + lidx[k:k + block_size] = to_lidx + ridx[k:k + block_size] = to_ridx + k += block_size + i = ni + j = nj + + # fill the end of joined with remaining part of left or right + if i < lsize: + block_size = lsize - i + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + ridx[k: k + block_size] = numpy.repeat(-1, block_size) + while i < lsize: + lidx[k] = sorted_left[i] + i += 1 + k += 1 + + elif j < rsize: + block_size = rsize - j + lidx = _hpat_ensure_array_capacity(k + block_size, lidx) + ridx = _hpat_ensure_array_capacity(k + block_size, ridx) + lidx[k: k + block_size] = numpy.repeat(-1, block_size) + while j < rsize: + ridx[k] = sorted_right[j] + j += 1 + k += 1 + + # count total number of characters and allocate joined array + total_joined_size = k + num_chars_in_joined = 0 + for i in numpy.arange(total_joined_size): + if lidx[i] != -1: + num_chars_in_joined += len(left[lidx[i]]) + elif ridx[i] != -1: + num_chars_in_joined += len(right[ridx[i]]) + + joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined) + + # iterate over joined and fill it with indexes using lidx and ridx indexers + for i in numpy.arange(total_joined_size): + if lidx[i] != -1: + joined[i] = left[lidx[i]] + if (str_arr_is_na(left, lidx[i])): + str_arr_set_na(joined, i) + elif ridx[i] != -1: + joined[i] = right[ridx[i]] + if (str_arr_is_na(right, ridx[i])): + str_arr_set_na(joined, i) + else: + str_arr_set_na(joined, i) + + return joined, lidx, ridx + + return hpat_join_series_indexes_impl + + return None diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 22361bd87..9fa9ebbb3 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -37,13 +37,12 @@ from numba.errors import TypingError from numba.extending import overload, overload_method, overload_attribute from numba import types -from numba import numpy_support import sdc import sdc.datatypes.common_functions as common_functions from sdc.datatypes.common_functions import TypeChecker -from sdc.datatypes.common_functions import (_hpat_ensure_array_capacity, check_index_is_numeric, - find_common_dtype_for_scalar_numpy_types) +from sdc.datatypes.common_functions import (check_index_is_numeric, find_common_dtype_from_numpy_dtypes, + hpat_join_series_indexes) from sdc.datatypes.hpat_pandas_stringmethods_types import StringMethodsType from sdc.hiframes.pd_series_ext import SeriesType from sdc.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars, string_array_type, @@ -3728,229 +3727,14 @@ def hpat_pandas_series_pct_change_impl(self, periods=1, fill_method='pad', limit return hpat_pandas_series_pct_change_impl -def hpat_join_series_indexes(left, right): - pass - - -@overload(hpat_join_series_indexes) -def hpat_join_series_indexes_overload(left, right): - '''Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm''' - - if (isinstance(left, types.Array) and isinstance(right, types.Array)): - np_dtypes = [numpy_support.as_dtype(left.dtype), numpy_support.as_dtype(right.dtype)] - np_common_dtype = numpy.find_common_type([], np_dtypes) - numba_common_dtype = numpy_support.from_dtype(np_common_dtype) - if (isinstance(left.dtype, types.Number) and isinstance(right.dtype, types.Number)): - - def hpat_join_series_indexes_impl(left, right): - - # allocate result arrays - lsize = len(left) - rsize = len(right) - est_total_size = int(1.1 * (lsize + rsize)) - - lidx = numpy.empty(est_total_size, numpy.int64) - ridx = numpy.empty(est_total_size, numpy.int64) - joined = numpy.empty(est_total_size, numba_common_dtype) - - # sort arrays saving the old positions - sorted_left = numpy.argsort(left, kind='mergesort') - sorted_right = numpy.argsort(right, kind='mergesort') - - i, j, k = 0, 0, 0 - while (i < lsize and j < rsize): - joined = _hpat_ensure_array_capacity(k, joined) - lidx = _hpat_ensure_array_capacity(k, lidx) - ridx = _hpat_ensure_array_capacity(k, ridx) - - left_index = left[sorted_left[i]] - right_index = right[sorted_right[j]] - - if (left_index < right_index): - joined[k] = left_index - lidx[k] = sorted_left[i] - ridx[k] = -1 - i += 1 - k += 1 - elif (left_index > right_index): - joined[k] = right_index - lidx[k] = -1 - ridx[k] = sorted_right[j] - j += 1 - k += 1 - else: - # find ends of sequences of equivalent index values in left and right - ni, nj = i, j - while (ni < lsize and left[sorted_left[ni]] == left_index): - ni += 1 - while (nj < rsize and right[sorted_right[nj]] == right_index): - nj += 1 - - # join the blocks found into results - for s in numpy.arange(i, ni, 1): - block_size = nj - j - to_joined = numpy.repeat(left_index, block_size) - to_lidx = numpy.repeat(sorted_left[s], block_size) - to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) - - joined = _hpat_ensure_array_capacity(k + block_size, joined) - lidx = _hpat_ensure_array_capacity(k + block_size, lidx) - ridx = _hpat_ensure_array_capacity(k + block_size, ridx) - - joined[k:k + block_size] = to_joined - lidx[k:k + block_size] = to_lidx - ridx[k:k + block_size] = to_ridx - k += block_size - i = ni - j = nj - - # fill the end of joined with remaining part of left or right - if i < lsize: - block_size = lsize - i - joined = _hpat_ensure_array_capacity(k + block_size, joined) - lidx = _hpat_ensure_array_capacity(k + block_size, lidx) - ridx = _hpat_ensure_array_capacity(k + block_size, ridx) - ridx[k: k + block_size] = numpy.repeat(-1, block_size) - while i < lsize: - joined[k] = left[sorted_left[i]] - lidx[k] = sorted_left[i] - i += 1 - k += 1 - - elif j < rsize: - block_size = rsize - j - joined = _hpat_ensure_array_capacity(k + block_size, joined) - lidx = _hpat_ensure_array_capacity(k + block_size, lidx) - ridx = _hpat_ensure_array_capacity(k + block_size, ridx) - lidx[k: k + block_size] = numpy.repeat(-1, block_size) - while j < rsize: - joined[k] = right[sorted_right[j]] - ridx[k] = sorted_right[j] - j += 1 - k += 1 - - return joined[:k], lidx[:k], ridx[:k] - - return hpat_join_series_indexes_impl - - elif (left == string_array_type and right == string_array_type): - - def hpat_join_series_indexes_impl(left, right): - - # allocate result arrays - lsize = len(left) - rsize = len(right) - est_total_size = int(1.1 * (lsize + rsize)) - - lidx = numpy.empty(est_total_size, numpy.int64) - ridx = numpy.empty(est_total_size, numpy.int64) - - # use Series.sort_values since argsort for StringArrays not implemented - original_left_series = pandas.Series(left) - original_right_series = pandas.Series(right) - - # sort arrays saving the old positions - left_series = original_left_series.sort_values(kind='mergesort') - right_series = original_right_series.sort_values(kind='mergesort') - sorted_left = left_series._index - sorted_right = right_series._index - - i, j, k = 0, 0, 0 - while (i < lsize and j < rsize): - lidx = _hpat_ensure_array_capacity(k, lidx) - ridx = _hpat_ensure_array_capacity(k, ridx) - - left_index = left[sorted_left[i]] - right_index = right[sorted_right[j]] - - if (left_index < right_index): - lidx[k] = sorted_left[i] - ridx[k] = -1 - i += 1 - k += 1 - elif (left_index > right_index): - lidx[k] = -1 - ridx[k] = sorted_right[j] - j += 1 - k += 1 - else: - # find ends of sequences of equivalent index values in left and right - ni, nj = i, j - while (ni < lsize and left[sorted_left[ni]] == left_index): - ni += 1 - while (nj < rsize and right[sorted_right[nj]] == right_index): - nj += 1 - - # join the blocks found into results - for s in numpy.arange(i, ni, 1): - block_size = nj - j - to_lidx = numpy.repeat(sorted_left[s], block_size) - to_ridx = numpy.array([sorted_right[k] for k in numpy.arange(j, nj, 1)], numpy.int64) - - lidx = _hpat_ensure_array_capacity(k + block_size, lidx) - ridx = _hpat_ensure_array_capacity(k + block_size, ridx) - - lidx[k:k + block_size] = to_lidx - ridx[k:k + block_size] = to_ridx - k += block_size - i = ni - j = nj - - # fill the end of joined with remaining part of left or right - if i < lsize: - block_size = lsize - i - lidx = _hpat_ensure_array_capacity(k + block_size, lidx) - ridx = _hpat_ensure_array_capacity(k + block_size, ridx) - ridx[k: k + block_size] = numpy.repeat(-1, block_size) - while i < lsize: - lidx[k] = sorted_left[i] - i += 1 - k += 1 - - elif j < rsize: - block_size = rsize - j - lidx = _hpat_ensure_array_capacity(k + block_size, lidx) - ridx = _hpat_ensure_array_capacity(k + block_size, ridx) - lidx[k: k + block_size] = numpy.repeat(-1, block_size) - while j < rsize: - ridx[k] = sorted_right[j] - j += 1 - k += 1 - - # count total number of characters and allocate joined array - total_joined_size = k - num_chars_in_joined = 0 - for i in numpy.arange(total_joined_size): - if lidx[i] != -1: - num_chars_in_joined += len(left[lidx[i]]) - elif ridx[i] != -1: - num_chars_in_joined += len(right[ridx[i]]) - - joined = pre_alloc_string_array(total_joined_size, num_chars_in_joined) - - # iterate over joined and fill it with indexes using lidx and ridx indexers - for i in numpy.arange(total_joined_size): - if lidx[i] != -1: - joined[i] = left[lidx[i]] - if (str_arr_is_na(left, lidx[i])): - str_arr_set_na(joined, i) - elif ridx[i] != -1: - joined[i] = right[ridx[i]] - if (str_arr_is_na(right, ridx[i])): - str_arr_set_na(joined, i) - else: - str_arr_set_na(joined, i) - - return joined, lidx, ridx - - return hpat_join_series_indexes_impl - - @overload(operator.add) def hpat_pandas_series_operator_add(self, other): """ Pandas Series operator :attr:`pandas.Series.add` implementation + Note: Currently implemented for numeric Series only. + Differs from Pandas in returning Series with fixed dtype :obj:`float64` + .. only:: developer **Test**: python -m hpat.runtests sdc.tests.test_series.TestSeries.test_series_op1 @@ -3981,15 +3765,15 @@ def hpat_pandas_series_operator_add(self, other): ty_checker.raise_exc(other, 'pandas.series or scalar', 'other') series_indexes_alignable = False + none_or_numeric_indexes = False if isinstance(other, SeriesType): if (other.index == string_array_type and self.index == string_array_type): series_indexes_alignable = True - if ((isinstance(self.index, types.NoneType) or - isinstance(self.index, types.Array) and isinstance(self.index.dtype, types.Number)) - and (isinstance(other.index, types.NoneType) or - isinstance(other.index, types.Array) and isinstance(other.index.dtype, types.Number))): + if ((isinstance(self.index, types.NoneType) or check_index_is_numeric(self)) + and (isinstance(other.index, types.NoneType) or check_index_is_numeric(other))): series_indexes_alignable = True + none_or_numeric_indexes = True if isinstance(other, SeriesType) and not series_indexes_alignable: raise TypingError('{} Not implemented for series with not-alignable indexes. \ @@ -4003,16 +3787,8 @@ def hpat_pandas_series_add_scalar_impl(self, other): return hpat_pandas_series_add_scalar_impl elif (isinstance(other, SeriesType)): - is_numeric_index = isinstance(self.index, (types.Array, types.NoneType)) - - if is_numeric_index: - ty_left_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype - ty_right_index_dtype = types.int64 if isinstance(other.index, types.NoneType) else other.index.dtype - np_index_dtypes = [numpy_support.as_dtype(ty_left_index_dtype), - numpy_support.as_dtype(ty_right_index_dtype)] - np_index_common_dtype = numpy.find_common_type([], np_index_dtypes) - numba_index_common_dtype = numpy_support.from_dtype(np_index_common_dtype) + # optimization for series with default indexes, that can be aligned differently if (isinstance(self.index, types.NoneType) and isinstance(other.index, types.NoneType)): def hpat_pandas_series_add_impl(self, other): @@ -4026,12 +3802,21 @@ def hpat_pandas_series_add_impl(self, other): new_data[min_data_size:] = numpy.repeat(numpy.nan, max_data_size - min_data_size) return pandas.Series(new_data, self._index) + + return hpat_pandas_series_add_impl else: + # for numeric indexes find common dtype to be used when creating joined index + if none_or_numeric_indexes: + ty_left_index_dtype = types.int64 if isinstance(self.index, types.NoneType) else self.index.dtype + ty_right_index_dtype = types.int64 if isinstance(other.index, types.NoneType) else other.index.dtype + numba_index_common_dtype = find_common_dtype_from_numpy_dtypes( + [ty_left_index_dtype, ty_right_index_dtype], []) + def hpat_pandas_series_add_impl(self, other): left_index, right_index = self.index, other.index # check if indexes are equal and series don't have to be aligned - if is_numeric_index == True: # noqa + if none_or_numeric_indexes == True: # noqa if (numpy.array_equal(left_index, right_index)): return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64), numpy.asarray(left_index, numba_index_common_dtype)) @@ -4048,7 +3833,7 @@ def hpat_pandas_series_add_impl(self, other): return pandas.Series(numpy.asarray(self._data + other._data, numpy.float64), self._index) - # TODO: replace below with core join(how='outer') when implemented + # TODO: replace below with core join(how='outer', return_indexers=True) when implemented joined_index, left_indexer, right_indexer = hpat_join_series_indexes(left_index, right_index) joined_index_range = numpy.arange(len(joined_index)) @@ -4066,4 +3851,6 @@ def hpat_pandas_series_add_impl(self, other): return pandas.Series(left_values + right_values, joined_index) - return hpat_pandas_series_add_impl + return hpat_pandas_series_add_impl + + return None diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index 037658f68..f2e28be4a 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -914,15 +914,15 @@ def __init__(self, stype): super(SeriesIatType, self).__init__(name) -# PR135. This needs to be commented out -@infer_global(operator.getitem) -class GetItemSeriesIat(AbstractTemplate): - key = operator.getitem +if sdc.config.config_pipeline_hpat_default: + @infer_global(operator.getitem) + class GetItemSeriesIat(AbstractTemplate): + key = operator.getitem - def generic(self, args, kws): - # iat[] is the same as regular getitem - if isinstance(args[0], SeriesIatType): - return GetItemSeries.generic(self, (args[0].stype, args[1]), kws) + def generic(self, args, kws): + # iat[] is the same as regular getitem + if isinstance(args[0], SeriesIatType): + return GetItemSeries.generic(self, (args[0].stype, args[1]), kws) @infer @@ -1031,7 +1031,7 @@ def generic_expand_cumulative_series(self, args, kws): _non_hpat_pipeline_attrs = [ 'resolve_append', 'resolve_combine', 'resolve_corr', 'resolve_cov', 'resolve_dropna', 'resolve_fillna', 'resolve_head', 'resolve_nlargest', - 'resolve_nsmallest', 'resolve_pct_change' + 'resolve_nsmallest', 'resolve_pct_change', 'resolve_loc' ] # use ArrayAttribute for attributes not defined in SeriesAttribute @@ -1047,72 +1047,72 @@ def generic_expand_cumulative_series(self, args, kws): if attr in SeriesAttribute.__dict__: delattr(SeriesAttribute, attr) -# PR135. This needs to be commented out -@infer_global(operator.getitem) -class GetItemSeries(AbstractTemplate): - key = operator.getitem - - def generic(self, args, kws): - assert not kws - [in_arr, in_idx] = args - is_arr_series = False - is_idx_series = False - is_arr_dt_index = False - - if not isinstance(in_arr, SeriesType) and not isinstance(in_idx, SeriesType): - return None - - if isinstance(in_arr, SeriesType): - in_arr = series_to_array_type(in_arr) - is_arr_series = True - if in_arr.dtype == types.NPDatetime('ns'): - is_arr_dt_index = True - - if isinstance(in_idx, SeriesType): - in_idx = series_to_array_type(in_idx) - is_idx_series = True - - # TODO: dt_index - if in_arr == string_array_type: - # XXX fails due in overload - # compile_internal version results in symbol not found! - # sig = self.context.resolve_function_type( - # operator.getitem, (in_arr, in_idx), kws) - # HACK to get avoid issues for now - if isinstance(in_idx, (types.Integer, types.IntegerLiteral)): - sig = string_type(in_arr, in_idx) - else: - sig = GetItemStringArray.generic(self, (in_arr, in_idx), kws) - elif in_arr == list_string_array_type: - # TODO: split view - # mimic array indexing for list - if (isinstance(in_idx, types.Array) and in_idx.ndim == 1 - and isinstance( - in_idx.dtype, (types.Integer, types.Boolean))): - sig = signature(in_arr, in_arr, in_idx) - else: - sig = numba.typing.collections.GetItemSequence.generic( +if sdc.config.config_pipeline_hpat_default: + @infer_global(operator.getitem) + class GetItemSeries(AbstractTemplate): + key = operator.getitem + + def generic(self, args, kws): + assert not kws + [in_arr, in_idx] = args + is_arr_series = False + is_idx_series = False + is_arr_dt_index = False + + if not isinstance(in_arr, SeriesType) and not isinstance(in_idx, SeriesType): + return None + + if isinstance(in_arr, SeriesType): + in_arr = series_to_array_type(in_arr) + is_arr_series = True + if in_arr.dtype == types.NPDatetime('ns'): + is_arr_dt_index = True + + if isinstance(in_idx, SeriesType): + in_idx = series_to_array_type(in_idx) + is_idx_series = True + + # TODO: dt_index + if in_arr == string_array_type: + # XXX fails due in overload + # compile_internal version results in symbol not found! + # sig = self.context.resolve_function_type( + # operator.getitem, (in_arr, in_idx), kws) + # HACK to get avoid issues for now + if isinstance(in_idx, (types.Integer, types.IntegerLiteral)): + sig = string_type(in_arr, in_idx) + else: + sig = GetItemStringArray.generic(self, (in_arr, in_idx), kws) + elif in_arr == list_string_array_type: + # TODO: split view + # mimic array indexing for list + if (isinstance(in_idx, types.Array) and in_idx.ndim == 1 + and isinstance( + in_idx.dtype, (types.Integer, types.Boolean))): + sig = signature(in_arr, in_arr, in_idx) + else: + sig = numba.typing.collections.GetItemSequence.generic( + self, (in_arr, in_idx), kws) + elif in_arr == string_array_split_view_type: + sig = GetItemStringArraySplitView.generic( self, (in_arr, in_idx), kws) - elif in_arr == string_array_split_view_type: - sig = GetItemStringArraySplitView.generic( - self, (in_arr, in_idx), kws) - else: - out = get_array_index_type(in_arr, in_idx) - sig = signature(out.result, in_arr, out.index) - - if sig is not None: - arg1 = sig.args[0] - arg2 = sig.args[1] - if is_arr_series: - sig.return_type = if_arr_to_series_type(sig.return_type) - arg1 = if_arr_to_series_type(arg1) - if is_idx_series: - arg2 = if_arr_to_series_type(arg2) - sig.args = (arg1, arg2) - # dt_index and Series(dt64) should return Timestamp - if is_arr_dt_index and sig.return_type == types.NPDatetime('ns'): - sig.return_type = pandas_timestamp_type - return sig + else: + out = get_array_index_type(in_arr, in_idx) + sig = signature(out.result, in_arr, out.index) + + if sig is not None: + arg1 = sig.args[0] + arg2 = sig.args[1] + if is_arr_series: + sig.return_type = if_arr_to_series_type(sig.return_type) + arg1 = if_arr_to_series_type(arg1) + if is_idx_series: + arg2 = if_arr_to_series_type(arg2) + sig.args = (arg1, arg2) + # dt_index and Series(dt64) should return Timestamp + if is_arr_dt_index and sig.return_type == types.NPDatetime('ns'): + sig.return_type = pandas_timestamp_type + return sig @infer_global(operator.setitem) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 3c85a607e..3f836780c 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -1050,6 +1050,7 @@ def test_series_op1(self): test_impl = _make_func_use_binop1(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 df = pd.DataFrame({'A': np.arange(1, n), 'B': np.ones(n - 1)}) pd.testing.assert_series_equal(hpat_func(df.A, df.B), test_impl(df.A, df.B), check_names=False) @@ -1062,6 +1063,7 @@ def test_series_op2(self): test_impl = _make_func_use_binop1(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 if platform.system() == 'Windows' and not IS_32BITS: df = pd.DataFrame({'A': np.arange(1, n, dtype=np.int64)}) @@ -1069,26 +1071,30 @@ def test_series_op2(self): df = pd.DataFrame({'A': np.arange(1, n)}) pd.testing.assert_series_equal(hpat_func(df.A, 1), test_impl(df.A, 1), check_names=False) + @skip_numba_jit('Not implemented in new-pipeline yet') def test_series_op3(self): - arithmetic_binops = ('+', '-', '*', '/', '//', '%', '**') + arithmetic_binops = ('+=', '-=', '*=', '/=', '//=', '%=', '**=') for operator in arithmetic_binops: test_impl = _make_func_use_binop2(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 - df = pd.DataFrame({'A': np.arange(1, n), 'B': np.ones(n - 1)}) + df = pd.DataFrame({'A': np.arange(1, n, dtype=np.float64), 'B': np.ones(n - 1)}) pd.testing.assert_series_equal(hpat_func(df.A, df.B), test_impl(df.A, df.B), check_names=False) + @skip_numba_jit('Not implemented in new-pipeline yet') def test_series_op4(self): - arithmetic_binops = ('+', '-', '*', '/', '//', '%', '**') + arithmetic_binops = ('+=', '-=', '*=', '/=', '//=', '%=', '**=') for operator in arithmetic_binops: test_impl = _make_func_use_binop2(operator) hpat_func = self.jit(test_impl) + # TODO: extend to test arithmetic operations between numeric Series of different dtypes n = 11 - df = pd.DataFrame({'A': np.arange(1, n)}) + df = pd.DataFrame({'A': np.arange(1, n, dtype=np.float64)}) pd.testing.assert_series_equal(hpat_func(df.A, 1), test_impl(df.A, 1), check_names=False) def test_series_op5(self): @@ -4579,66 +4585,6 @@ def test_series_pct_change_impl(S, periods=1, fill_method='pad', limit=None, fre msg = 'Method pct_change(). The object periods' self.assertIn(msg, str(raises.exception)) - def test_series_setitem_for_value(self): - def test_impl(S, val): - S[3] = val - return S - - hpat_func = self.jit(test_impl) - S = pd.Series([0, 1, 2, 3, 4]) - value = 50 - result_ref = test_impl(S, value) - result = hpat_func(S, value) - pd.testing.assert_series_equal(result_ref, result) - - def test_series_setitem_for_slice(self): - def test_impl(S, val): - S[2:] = val - return S - - hpat_func = self.jit(test_impl) - S = pd.Series([0, 1, 2, 3, 4]) - value = 50 - result_ref = test_impl(S, value) - result = hpat_func(S, value) - pd.testing.assert_series_equal(result_ref, result) - - def test_series_setitem_for_series(self): - def test_impl(S, ind, val): - S[ind] = val - return S - - hpat_func = self.jit(test_impl) - S = pd.Series([0, 1, 2, 3, 4]) - ind = pd.Series([0, 2, 4]) - value = 50 - result_ref = test_impl(S, ind, value) - result = hpat_func(S, ind, value) - pd.testing.assert_series_equal(result_ref, result) - - def test_series_setitem_unsupported(self): - def test_impl(S, ind, val): - S[ind] = val - return S - - hpat_func = self.jit(test_impl) - S = pd.Series([0, 1, 2, 3, 4, 5]) - ind1 = 5 - ind2 = '3' - value1 = 'ababa' - value2 = 101 - - with self.assertRaises(TypingError) as raises: - hpat_func(S, ind1, value1) - msg = 'Operator setitem(). Value must be one type with series.' - self.assertIn(msg, str(raises.exception)) - - with self.assertRaises(TypingError) as raises: - hpat_func(S, ind2, value2) - msg = 'Operator setitem(). The index must be an Integer, Slice or a pandas.series.' - self.assertIn(msg, str(raises.exception)) - - @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') def test_series_operator_add_numeric_scalar(self): """Verifies Series.operator.add implementation for numeric series and scalar second operand""" @@ -4687,24 +4633,6 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, dtype=dtype_right) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - def test_series_operator_add_series_dtype_promotion(self): - """Verifies implementation of Series.operator.add between two numeric Series of different dtypes""" - def test_impl(A, B): - return A + B - hpat_func = self.jit(test_impl) - - n = 7 - A = pd.Series(np.array(np.arange(n), dtype=np.int32)) - B = pd.Series(np.array(np.arange(n)**2, dtype=np.float32)) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - - dtypes_to_test = (np.int32, np.int64, np.float32, np.float64) - for dtype_left, dtype_right in combinations(dtypes_to_test, 2): - with self.subTest(left_series_dtype=dtype_left, right_series_dtype=dtype_right): - A = pd.Series(np.array(np.arange(n), dtype=dtype_left)) - B = pd.Series(np.array(np.arange(n)**2, dtype=dtype_right)) - pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False) - @skip_numba_jit @skip_sdc_jit("TODO: find out why pandas aligning series indexes produces Int64Index when common dtype is float\n" "AssertionError: Series.index are different\n" @@ -4791,7 +4719,8 @@ def test_impl(A, B): B = pd.Series(np.arange(n)**2, index=index_B) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @skip_sdc_jit("TODO: fix Series.sort_values to handle both None and '' in string series") + @skip_numba_jit('TODO: fix Series.sort_values to handle both None and '' in string series') + @skip_sdc_jit('Arithmetic operations on Series with non-default indexes are not supported in old-style') def test_series_operator_add_numeric_align_index_str_fixme(self): """Same as test_series_operator_add_align_index_str but with None values in string indexes""" def test_impl(A, B): @@ -4864,7 +4793,8 @@ def test_impl(A, B): B = pd.Series(np.random.ranf(n), index=index2) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B), check_dtype=False, check_names=False) - @unittest.skip("TODO: support arithemetic operations on StringArrays and extend Series.operator.add overload") + @skip_numba_jit + @skip_sdc_jit("TODO: support arithemetic operations on StringArrays and extend Series.operator.add overload") def test_series_operator_add_str_same_index_default(self): """Verifies implementation of Series.operator.add between two string Series with default indexes and same size"""