From 2484e0307875ce367111c6a2296487cc775517a1 Mon Sep 17 00:00:00 2001
From: "Kozlov, Alexey"
Date: Tue, 8 Oct 2019 01:04:14 +0300
Subject: [PATCH] Refactor Series.median() in a new style via np.median()

---
 .../datatypes/hpat_pandas_series_functions.py | 64 +++++++++++++++++++
 hpat/hiframes/pd_series_ext.py                | 14 ++--
 hpat/tests/test_series.py                     | 32 ++++++++++
 3 files changed, 103 insertions(+), 7 deletions(-)

diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py
index 2545f96f8..7dd99a242 100644
--- a/hpat/datatypes/hpat_pandas_series_functions.py
+++ b/hpat/datatypes/hpat_pandas_series_functions.py
@@ -1547,3 +1547,67 @@ def hpat_pandas_series_nunique_impl(self, dropna=True):
             return len(data_set) + 1
 
     return hpat_pandas_series_nunique_impl
+
+
+@overload_method(SeriesType, 'median')
+def hpat_pandas_series_median(self, axis=None, skipna=True, level=None, numeric_only=None):
+    """
+    Pandas Series method :meth:`pandas.Series.median` implementation.
+
+    .. only:: developer
+
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_median1
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_median_skipna_default1
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_median_skipna_false1
+
+    Parameters
+    -----------
+    self: :obj:`pandas.Series`
+        input series
+    axis: :obj:`int` or :obj:`string` {0 or `index`, None}, default None
+        The axis for the function to be applied on.
+        *unsupported*
+    skipna: :obj:`bool`, default True
+        exclude NA/null values when computing the result
+    level: :obj:`int` or :obj:`string`, default None
+        *unsupported*
+    numeric_only: :obj:`bool` or None, default None
+        *unsupported*
+
+    Returns
+    -------
+    :obj:`float` or :obj:`pandas.Series` (if level is specified)
+        median of values in the series
+
+    """
+
+    _func_name = 'Method median().'
+
+    if not isinstance(self, SeriesType):
+        raise TypingError(
+            '{} The object must be a pandas.series. Given self: {}'.format(_func_name, self))
+
+    if not isinstance(self.dtype, types.Number):
+        raise TypingError(
+            '{} The function only applies to elements that are all numeric. Given data type: {}'.format(_func_name, self.dtype))
+
+    if not (isinstance(axis, (types.Integer, types.UnicodeType, types.Omitted)) or axis is None):
+        raise TypingError('{} The axis must be an Integer or a String. Currently unsupported. Given: {}'.format(_func_name, axis))
+
+    if not (isinstance(skipna, (types.Boolean, types.Omitted)) or skipna is True):
+        raise TypingError('{} The skipna must be a boolean. Given: {}'.format(_func_name, skipna))
+
+    if not ((level is None or isinstance(level, types.Omitted))
+            and (numeric_only is None or isinstance(numeric_only, types.Omitted))
+            and (axis is None or isinstance(axis, types.Omitted))
+            ):
+        raise TypingError('{} Unsupported parameters. Given level: {}, numeric_only: {}, axis: {}'.format(_func_name, level, numeric_only, axis))
+
+    # Returned implementation: NaN-ignoring median when skipna is set, plain median otherwise.
+    def hpat_pandas_series_median_impl(self, axis=None, skipna=True, level=None, numeric_only=None):
+        if skipna:
+            return numpy.nanmedian(self._data)
+
+        return numpy.median(self._data)
+
+    return hpat_pandas_series_median_impl
diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py
index 6eefa938b..61af4579f 100644
--- a/hpat/hiframes/pd_series_ext.py
+++ b/hpat/hiframes/pd_series_ext.py
@@ -683,13 +683,13 @@ def resolve_head(self, ary, args, kws):
         assert not kws
         return signature(ary, *args)
 
-    @bound_function("series.median")
-    def resolve_median(self, ary, args, kws):
-        assert not kws
-        dtype = ary.dtype
-        # median converts integer output to float
-        dtype = types.float64 if isinstance(dtype, types.Integer) else dtype
-        return signature(dtype, *args)
+#     @bound_function("series.median")
+#     def resolve_median(self, ary, args, kws):
+#         assert not kws
+#         dtype = ary.dtype
+#         # median converts integer output to float
+#         dtype = types.float64 if isinstance(dtype, types.Integer) else dtype
+#         return signature(dtype, *args)
 
     @bound_function("series.idxmin")
     def resolve_idxmin(self, ary, args, kws):
diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py
index 247a7f6bd..0f53217ca 100644
--- a/hpat/tests/test_series.py
+++ b/hpat/tests/test_series.py
@@ -1691,6 +1691,7 @@ def test_impl(S):
         self.assertTrue(count_array_OneDs() > 0)
 
     def test_series_median1(self):
+        '''Verifies median implementation for float and integer series of random data'''
         def test_impl(S):
             return S.median()
         hpat_func = hpat.jit(test_impl)
@@ -1711,6 +1712,34 @@ def test_impl(S):
         S = pd.Series(np.random.ranf(m))
         self.assertEqual(hpat_func(S), test_impl(S))
 
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     "BUG: old-style median implementation doesn't filter NaNs")
+    def test_series_median_skipna_default1(self):
+        '''Verifies median implementation with default skipna=True argument on a series with NA values'''
+        def test_impl(S):
+            return S.median()
+        hpat_func = hpat.jit(test_impl)
+
+        S = pd.Series([2., 3., 5., np.nan, 5., 6., 7.])
+        self.assertEqual(hpat_func(S), test_impl(S))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     "Skipna argument is not supported in old-style")
+    def test_series_median_skipna_false1(self):
+        '''Verifies median implementation with skipna=False on a series with NA values'''
+        def test_impl(S):
+            return S.median(skipna=False)
+        hpat_func = hpat.jit(test_impl)
+
+        # np.inf is not NaN, so verify that a correct number is returned
+        S1 = pd.Series([2., 3., 5., np.inf, 5., 6., 7.])
+        self.assertEqual(hpat_func(S1), test_impl(S1))
+
+        # TODO: both return values are 'nan', but HPAT's is not np.nan, hence checking with
+        # assertIs() doesn't work - check if it's Numba related
+        S2 = pd.Series([2., 3., 5., np.nan, 5., 6., 7.])
+        self.assertEqual(np.isnan(hpat_func(S2)), np.isnan(test_impl(S2)))
+
     def test_series_median_parallel1(self):
         # create `kde.parquet` file
         ParquetGenerator.gen_kde_pq()
@@ -1722,6 +1751,9 @@ def test_impl():
 
         hpat_func = hpat.jit(test_impl)
         self.assertEqual(hpat_func(), test_impl())
+        self.assertEqual(count_array_REPs(), 0)
+        self.assertEqual(count_parfor_REPs(), 0)
+        self.assertTrue(count_array_OneDs() > 0)
 
     def test_series_argsort_parallel(self):
         # create `kde.parquet` file