diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index cead5f7cd..b96464173 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -2926,7 +2926,7 @@ def hpat_pandas_series_take_impl(self, indices, axis=0, is_copy=False): @sdc_overload_method(SeriesType, 'idxmax') -def hpat_pandas_series_idxmax(self, axis=None, skipna=True): +def hpat_pandas_series_idxmax(self, axis=None, skipna=None): """ Intel Scalable Dataframe Compiler User Guide ******************************************** @@ -2975,25 +2975,48 @@ def hpat_pandas_series_idxmax(self, axis=None, skipna=True): if not isinstance(self.data.dtype, types.Number): ty_checker.raise_exc(self.data.dtype, 'int, float', 'self.data.dtype') - if not (isinstance(skipna, (types.Omitted, types.Boolean, bool)) or skipna is True): + if not (isinstance(skipna, (types.Omitted, types.Boolean, bool)) or skipna is None): ty_checker.raise_exc(skipna, 'bool', 'skipna') if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'None', 'axis') - if isinstance(self.index, types.NoneType) or self.index is None: - def hpat_pandas_series_idxmax_impl(self, axis=None, skipna=True): - return numpy.argmax(self._data) - - return hpat_pandas_series_idxmax_impl + none_index = isinstance(self.index, types.NoneType) or self.index is None + if isinstance(self.data, StringArrayType): + def hpat_pandas_series_idxmax_str_impl(self, axis=None, skipna=None): + if skipna is None: + _skipna = True + else: + raise ValueError("Method idxmax(). Unsupported parameter 'skipna'=False with str data") - else: - def hpat_pandas_series_idxmax_index_impl(self, axis=None, skipna=True): - # no numpy.nanargmax is supported by Numba at this time result = numpy.argmax(self._data) + if none_index == True: # noqa + return result + else: + return self._index[int(result)] + + return hpat_pandas_series_idxmax_str_impl + + def hpat_pandas_series_idxmax_impl(self, axis=None, skipna=None): + # return numpy.argmax(self._data) + if skipna is None: + _skipna = True + else: + _skipna = skipna + + if _skipna: + result = numpy_like.nanargmax(self._data) + else: + result = numpy_like.argmax(self._data) + + if none_index == True: # noqa + return result + else: return self._index[int(result)] - return hpat_pandas_series_idxmax_index_impl + return numpy_like.argmax(self._data) + + return hpat_pandas_series_idxmax_impl @sdc_overload_method(SeriesType, 'mul') @@ -3987,7 +4010,7 @@ def hpat_pandas_series_ge_impl(self, other, level=None, fill_value=None, axis=0) @sdc_overload_method(SeriesType, 'idxmin') -def hpat_pandas_series_idxmin(self, axis=None, skipna=True): +def hpat_pandas_series_idxmin(self, axis=None, skipna=None): """ Intel Scalable Dataframe Compiler User Guide ******************************************** @@ -4036,25 +4059,48 @@ def hpat_pandas_series_idxmin(self, axis=None, skipna=True): if not isinstance(self.data.dtype, types.Number): ty_checker.raise_exc(self.data.dtype, 'int, float', 'self.data.dtype') - if not (isinstance(skipna, (types.Omitted, types.Boolean, bool)) or skipna is True): + if not (isinstance(skipna, (types.Omitted, types.Boolean, bool)) or skipna is None): ty_checker.raise_exc(skipna, 'bool', 'skipna') if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'None', 'axis') - if isinstance(self.index, types.NoneType) or self.index is None: - def hpat_pandas_series_idxmin_impl(self, axis=None, skipna=True): - return numpy.argmin(self._data) - - return hpat_pandas_series_idxmin_impl + none_index = isinstance(self.index, types.NoneType) or self.index is None + if isinstance(self.data, StringArrayType): + def hpat_pandas_series_idxmin_str_impl(self, axis=None, skipna=None): + if skipna is None: + _skipna = True + else: + raise ValueError("Method idxmin(). Unsupported parameter 'skipna'=False with str data") - else: - def hpat_pandas_series_idxmin_index_impl(self, axis=None, skipna=True): - # no numpy.nanargmin is supported by Numba at this time result = numpy.argmin(self._data) + if none_index == True: # noqa + return result + else: + return self._index[int(result)] + + return hpat_pandas_series_idxmin_str_impl + + def hpat_pandas_series_idxmin_impl(self, axis=None, skipna=None): + # return numpy.argmin(self._data) + if skipna is None: + _skipna = True + else: + _skipna = skipna + + if _skipna: + result = numpy_like.nanargmin(self._data) + else: + result = numpy_like.argmin(self._data) + + if none_index == True: # noqa + return result + else: return self._index[int(result)] - return hpat_pandas_series_idxmin_index_impl + return numpy_like.argmin(self._data) + + return hpat_pandas_series_idxmin_impl @sdc_overload_method(SeriesType, 'lt') diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index caf310acd..3ac537d4e 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -33,6 +33,7 @@ import numba import numpy +import sys import pandas import numpy as np @@ -42,6 +43,9 @@ import sdc from sdc.utilities.sdc_typing_utils import TypeChecker +from sdc.utilities.utils import (sdc_overload, sdc_register_jitable, + min_dtype_int_val, max_dtype_int_val, min_dtype_float_val, + max_dtype_float_val) from sdc.str_arr_ext import (StringArrayType, pre_alloc_string_array, get_utf8_size, str_arr_is_na) from sdc.utilities.utils import sdc_overload, sdc_register_jitable from sdc.utilities.prange_utils import parallel_chunks @@ -51,6 +55,22 @@ def astype(self, dtype): pass +def argmin(self): + pass + + +def argmax(self): + pass + + +def nanargmin(self): + pass + + +def nanargmax(self): + pass + + def fillna(self, inplace=False, value=None): pass @@ -133,7 +153,170 @@ def sdc_astype_number_impl(self, dtype): return sdc_astype_number_impl - ty_checker.raise_exc(self.dtype, 'str or type', 'self.dtype') + +def sdc_nanarg_overload(reduce_op): + def nanarg_impl(self): + """ + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Parallel replacement of numpy.nanargmin/numpy.nanargmax. + + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k nanargmin + Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k nanargmax + + """ + + ty_checker = TypeChecker("numpy-like 'nanargmin'/'nanargmax'") + dtype = self.dtype + isnan = get_isnan(dtype) + max_int64 = max_dtype_int_val(numpy_support.from_dtype(numpy.int64)) + if isinstance(dtype, types.Integer): + initial_result = { + min: max_dtype_int_val(dtype), + max: min_dtype_int_val(dtype), + }[reduce_op] + + if isinstance(dtype, types.Float): + initial_result = { + min: max_dtype_float_val(dtype), + max: min_dtype_float_val(dtype), + }[reduce_op] + + if not isinstance(self, types.Array): + return None + + if isinstance(dtype, types.Number): + def sdc_nanargmin_impl(self): + chunks = parallel_chunks(len(self)) + arr_res = numpy.empty(shape=len(chunks), dtype=dtype) + arr_pos = numpy.empty(shape=len(chunks), dtype=numpy.int64) + for i in prange(len(chunks)): + chunk = chunks[i] + res = initial_result + pos = max_int64 + for j in range(chunk.start, chunk.stop): + if reduce_op(res, self[j]) != self[j]: + continue + if isnan(self[j]): + continue + if res == self[j]: + pos = min(pos, j) + else: + pos = j + res = self[j] + arr_res[i] = res + arr_pos[i] = pos + + general_res = initial_result + general_pos = max_int64 + for i in range(len(chunks)): + if reduce_op(general_res, arr_res[i]) != arr_res[i]: + continue + if general_res == arr_res[i]: + general_pos = min(general_pos, arr_pos[i]) + else: + general_pos = arr_pos[i] + general_res = arr_res[i] + + return general_pos + + return sdc_nanargmin_impl + + ty_checker.raise_exc(dtype, 'number', 'self.dtype') + return nanarg_impl + + +sdc_overload(nanargmin)(sdc_nanarg_overload(min)) +sdc_overload(nanargmax)(sdc_nanarg_overload(max)) + + +def sdc_arg_overload(reduce_op): + def arg_impl(self): + """ + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Parallel replacement of numpy.argmin/numpy.argmax. + + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k argmin + Test: python -m sdc.runtests sdc.tests.test_sdc_numpy -k argmax + + """ + + ty_checker = TypeChecker("numpy-like 'argmin'/'argmax'") + dtype = self.dtype + isnan = get_isnan(dtype) + max_int64 = max_dtype_int_val(numpy_support.from_dtype(numpy.int64)) + if isinstance(dtype, types.Integer): + initial_result = { + min: max_dtype_int_val(dtype), + max: min_dtype_int_val(dtype), + }[reduce_op] + + if isinstance(dtype, types.Float): + initial_result = { + min: max_dtype_float_val(dtype), + max: min_dtype_float_val(dtype), + }[reduce_op] + + if not isinstance(self, types.Array): + return None + + if isinstance(dtype, types.Number): + def sdc_argmin_impl(self): + chunks = parallel_chunks(len(self)) + arr_res = numpy.empty(shape=len(chunks), dtype=dtype) + arr_pos = numpy.empty(shape=len(chunks), dtype=numpy.int64) + for i in prange(len(chunks)): + chunk = chunks[i] + res = initial_result + pos = max_int64 + for j in range(chunk.start, chunk.stop): + if not isnan(self[j]): + if reduce_op(res, self[j]) != self[j]: + continue + if res == self[j]: + pos = min(pos, j) + else: + pos = j + res = self[j] + else: + if numpy.isnan(res): + pos = min(pos, j) + else: + pos = j + res = self[j] + + arr_res[i] = res + arr_pos[i] = pos + general_res = initial_result + general_pos = max_int64 + for i in range(len(chunks)): + if not isnan(arr_res[i]): + if reduce_op(general_res, arr_res[i]) != arr_res[i]: + continue + if general_res == arr_res[i]: + general_pos = min(general_pos, arr_pos[i]) + else: + general_pos = arr_pos[i] + general_res = arr_res[i] + else: + if numpy.isnan(general_res): + general_pos = min(general_pos, arr_pos[i]) + else: + general_pos = arr_pos[i] + general_res = arr_res[i] + return general_pos + + return sdc_argmin_impl + + ty_checker.raise_exc(dtype, 'number', 'self.dtype') + return arg_impl + + +sdc_overload(argmin)(sdc_arg_overload(min)) +sdc_overload(argmax)(sdc_arg_overload(max)) @sdc_overload(copy) diff --git a/sdc/tests/test_sdc_numpy.py b/sdc/tests/test_sdc_numpy.py index 8c630e191..8aeee04e2 100644 --- a/sdc/tests/test_sdc_numpy.py +++ b/sdc/tests/test_sdc_numpy.py @@ -240,6 +240,66 @@ def sdc_impl(): sdc_func = self.jit(sdc_impl) np.testing.assert_array_equal(sdc_func(), ref_impl()) + def test_argmin(self): + def ref_impl(a): + return np.argmin(a) + + def sdc_impl(a): + return numpy_like.argmin(a) + + sdc_func = self.jit(sdc_impl) + + cases = [[5, 2, 0, 333, -4], [3.3, 5.4, np.nan, 7.9, np.nan]] + for case in cases: + a = np.array(case) + with self.subTest(data=case): + np.testing.assert_array_equal(sdc_func(a), ref_impl(a)) + + def test_argmax(self): + def ref_impl(a): + return np.argmax(a) + + def sdc_impl(a): + return numpy_like.argmax(a) + + sdc_func = self.jit(sdc_impl) + + cases = [[np.nan, np.nan, np.inf, np.nan], [5, 2, 0, 333, -4], [3.3, 5.4, np.nan, 7.9, np.nan]] + for case in cases: + a = np.array(case) + with self.subTest(data=case): + np.testing.assert_array_equal(sdc_func(a), ref_impl(a)) + + def test_nanargmin(self): + def ref_impl(a): + return np.nanargmin(a) + + def sdc_impl(a): + return numpy_like.nanargmin(a) + + sdc_func = self.jit(sdc_impl) + + cases = [[5, 2, 0, 333, -4], [3.3, 5.4, np.nan, 7.9, np.nan]] + for case in cases: + a = np.array(case) + with self.subTest(data=case): + np.testing.assert_array_equal(sdc_func(a), ref_impl(a)) + + def test_nanargmax(self): + def ref_impl(a): + return np.nanargmax(a) + + def sdc_impl(a): + return numpy_like.nanargmax(a) + + sdc_func = self.jit(sdc_impl) + + cases = [[np.nan, np.nan, np.inf, np.nan], [5, 2, -9, 333, -4], [3.3, 5.4, np.nan, 7.9]] + for case in cases: + a = np.array(case) + with self.subTest(data=case): + np.testing.assert_array_equal(sdc_func(a), ref_impl(a)) + class TestArrayReductions(TestCase): diff --git a/sdc/tests/tests_perf/test_perf_numpy.py b/sdc/tests/tests_perf/test_perf_numpy.py index d5343cc4f..8cd5df78a 100644 --- a/sdc/tests/tests_perf/test_perf_numpy.py +++ b/sdc/tests/tests_perf/test_perf_numpy.py @@ -85,6 +85,24 @@ def _test_case(self, cases, name, total_data_length, data_num=1, input_data=test CE(type_='Numba', code='data.astype(np.int64)', jitted=True), CE(type_='SDC', code='sdc.functions.numpy_like.astype(data, np.int64)', jitted=True), ], usecase_params='data'), + TC(name='nanargmin', size=[10 ** 7], call_expr=[ + CE(type_='Python', code='np.nanargmin(data)', jitted=False), + CE(type_='SDC', code='sdc.functions.numpy_like.nanargmin(data)', jitted=True), + ], usecase_params='data'), + TC(name='nanargmax', size=[10 ** 7], call_expr=[ + CE(type_='Python', code='np.nanargmax(data)', jitted=False), + CE(type_='SDC', code='sdc.functions.numpy_like.nanargmax(data)', jitted=True), + ], usecase_params='data'), + TC(name='argmax', size=[10 ** 7], call_expr=[ + CE(type_='Python', code='np.argmax(data)', jitted=False), + CE(type_='Numba', code='np.argmax(data)', jitted=True), + CE(type_='SDC', code='sdc.functions.numpy_like.argmax(data)', jitted=True), + ], usecase_params='data'), + TC(name='argmin', size=[10 ** 7], call_expr=[ + CE(type_='Python', code='np.argmin(data)', jitted=False), + CE(type_='Numba', code='np.argmin(data)', jitted=True), + CE(type_='SDC', code='sdc.functions.numpy_like.argmin(data)', jitted=True), + ], usecase_params='data'), TC(name='copy', size=[10 ** 7], call_expr=[ CE(type_='Python', code='np.copy(data)', jitted=False), CE(type_='Numba', code='np.copy(data)', jitted=True), diff --git a/sdc/utilities/utils.py b/sdc/utilities/utils.py index bd7f0a02d..cdd0cd9bc 100644 --- a/sdc/utilities/utils.py +++ b/sdc/utilities/utils.py @@ -38,6 +38,7 @@ from numba.typing.templates import infer_global, AbstractTemplate from numba.targets.imputils import lower_builtin from numba.extending import overload, intrinsic, lower_cast +from numba import numpy_support import numpy as np import sdc from sdc.str_ext import string_type, list_string_array_type @@ -83,6 +84,26 @@ class CTypeEnum(Enum): } +def min_dtype_int_val(dtype): + numpy_dtype = numpy_support.as_dtype(dtype) + return np.iinfo(numpy_dtype).min + + +def max_dtype_int_val(dtype): + numpy_dtype = numpy_support.as_dtype(dtype) + return np.iinfo(numpy_dtype).max + + +def min_dtype_float_val(dtype): + numpy_dtype = numpy_support.as_dtype(dtype) + return np.finfo(numpy_dtype).min + + +def max_dtype_float_val(dtype): + numpy_dtype = numpy_support.as_dtype(dtype) + return np.finfo(numpy_dtype).max + + # silence Numba error messages for now # TODO: customize through @sdc.jit numba.errors.error_extras = {