From f68846731d2f48158d30082305a8865be8d44192 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Tue, 15 Oct 2019 11:42:15 +0300 Subject: [PATCH 1/4] Implement Series.prod() in new style Improve exception message for min_count value for Series.prod() Improve test Add tests for skipna=None as default Fix prod(skipna=None) and use subTest() for better error indication when testing on a set of data Describe the parameters and add types to docstring Use SeriesType().dtype instead of SeriesType.data.dtype Use numba.typing.arraydecl.ArrayAttribute.resolve_prod() instead of hpat.hiframes.pd_series_ext.SeriesAttribute.resolve_prod() because SeriesAttribute does not have resolve_prod Update docstring for hpat_pandas_series_prod Fix docstring --- .../datatypes/hpat_pandas_series_functions.py | 69 +++++++++++++++++++ hpat/hiframes/hiframes_typed.py | 2 +- hpat/hiframes/pd_dataframe_ext.py | 2 +- hpat/hiframes/pd_series_ext.py | 3 +- hpat/tests/test_series.py | 38 ++++++++-- 5 files changed, 104 insertions(+), 10 deletions(-) diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py index 3009c12fd..ff1bea85e 100644 --- a/hpat/datatypes/hpat_pandas_series_functions.py +++ b/hpat/datatypes/hpat_pandas_series_functions.py @@ -1074,6 +1074,75 @@ def hpat_pandas_series_max_impl(self, axis=None, skipna=True, level=None, numeri return hpat_pandas_series_max_impl +@overload_method(SeriesType, 'prod') +def hpat_pandas_series_prod(self, axis=None, skipna=None, level=None, numeric_only=None, min_count=0): + """ + Pandas Series method :meth:`pandas.Series.prod` implementation. + + .. only:: developer + + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_prod + + Parameters + ----------- + self: :obj:`pandas.Series` + input series + axis: {index (0)} + Axis for the function to be applied on. 
+ *unsupported* + skipna: :obj:`bool`, default :obj:`True` + Exclude nan values when computing the result + level: :obj:`int`, :obj:`str`, default :obj:`None` + If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a scalar. + *unsupported* + numeric_only: :obj:`bool`, default :obj:`None` + Include only float, int, boolean columns. + If None, will attempt to use everything, then use only numeric data. + Not implemented for Series. + *unsupported* + min_count: :obj:`int`, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result will be NA. + *unsupported* + + Returns + ------- + :obj: + Returns scalar or Series (if level specified) + """ + + _func_name = 'Method prod().' + + if not isinstance(self, SeriesType): + raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) + + if not isinstance(self.dtype, (types.Integer, types.Float)): + raise TypingError('{} Currently function supports only numeric values. Given data type: {}'.format(_func_name, self.data.dtype)) + + if not (isinstance(skipna, (types.Omitted, types.Boolean, types.NoneType)) or skipna is None): + raise TypingError( + '{} The parameter must be a boolean type. Given type skipna: {}'.format(_func_name, skipna)) + + if not (isinstance(axis, types.Omitted) or axis is None) \ + or not (isinstance(level, types.Omitted) or level is None) \ + or not (isinstance(numeric_only, types.Omitted) or numeric_only is None) \ + or not (isinstance(min_count, types.Omitted) or min_count == 0): + raise TypingError( + '{} Unsupported parameters. 
Given axis: {}, level: {}, numeric_only: {}, min_count: {}'.format( + _func_name, axis, level, numeric_only, min_count)) + + def hpat_pandas_series_prod_impl(self, axis=None, skipna=None, level=None, numeric_only=None, min_count=0): + if skipna is None: + skipna = True + + if skipna: + return numpy.nanprod(self._data) + + return numpy.prod(self._data) + + return hpat_pandas_series_prod_impl + + @overload_method(SeriesType, 'mod') def hpat_pandas_series_mod(self, other, level=None, fill_value=None, axis=0): """ diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py index a72c13680..382fbcd75 100644 --- a/hpat/hiframes/hiframes_typed.py +++ b/hpat/hiframes/hiframes_typed.py @@ -851,7 +851,7 @@ def parse_impl(data): def _run_call_series(self, assign, lhs, rhs, series_var, func_name): # single arg functions - if func_name in ('sum', 'count', 'mean', 'var', 'min', 'max', 'prod'): + if func_name in ('sum', 'count', 'mean', 'var', 'min', 'max'): if rhs.args or rhs.kws: raise ValueError("HPAT pipeline does not support arguments for Series.{}()".format(func_name)) diff --git a/hpat/hiframes/pd_dataframe_ext.py b/hpat/hiframes/pd_dataframe_ext.py index d2f75d645..8be5aa6d8 100644 --- a/hpat/hiframes/pd_dataframe_ext.py +++ b/hpat/hiframes/pd_dataframe_ext.py @@ -1554,7 +1554,7 @@ def generic(self, args, kws): df = args[0] # TODO: ignore non-numerics # get series prod output types - dtypes = tuple(hpat.hiframes.pd_series_ext.SeriesAttribute.resolve_prod( + dtypes = tuple(numba.typing.arraydecl.ArrayAttribute.resolve_prod( self, SeriesType(d.dtype)).get_call_type(self, (), {}).return_type for d in df.data) diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py index 6eefa938b..7b98a6d58 100644 --- a/hpat/hiframes/pd_series_ext.py +++ b/hpat/hiframes/pd_series_ext.py @@ -988,7 +988,8 @@ def generic_expand_cumulative_series(self, args, kws): # TODO: add itemsize, strides, etc. 
when removed from Pandas _not_series_array_attrs = ['flat', 'ctypes', 'itemset', 'reshape', 'sort', 'flatten', - 'resolve_take', 'resolve_max', 'resolve_min', 'resolve_nunique'] + 'resolve_take', 'resolve_max', 'resolve_min', 'resolve_nunique', + 'resolve_prod'] # use ArrayAttribute for attributes not defined in SeriesAttribute for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items(): diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index 48ffde091..d55aa4fd5 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -1010,17 +1010,41 @@ def test_impl(S): S = pd.Series([np.nan, np.nan]) self.assertEqual(hpat_func(S), test_impl(S)) - def test_series_prod1(self): + def test_series_prod(self): + def test_impl(S, skipna): + return S.prod(skipna=skipna) + hpat_func = hpat.jit(test_impl) + + data_samples = [ + [6, 6, 2, 1, 3, 3, 2, 1, 2], + [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2], + [6, 6.1, 2.2, 1, 3, 3, 2.2, 1, 2], + [6, 6, np.nan, 2, np.nan, 1, 3, 3, np.inf, 2, 1, 2, np.inf], + [1.1, 0.3, np.nan, 1.0, np.inf, 0.3, 2.1, np.nan, 2.2, np.inf], + [1.1, 0.3, np.nan, 1, np.inf, 0, 1.1, np.nan, 2.2, np.inf, 2, 2], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.inf], + ] + + for skipna in [True, False, None]: + with self.subTest(skipna=skipna): + kwargs = {'skipna': skipna} + for data in data_samples: + with self.subTest(data=data): + S = pd.Series(data) + actual = hpat_func(S, **kwargs) + expected = test_impl(S, **kwargs) + if np.isnan(actual) or np.isnan(expected): + self.assertEqual(np.isnan(actual), np.isnan(expected)) + else: + self.assertEqual(actual, expected) + + def test_series_prod_skipna_default(self): def test_impl(S): return S.prod() hpat_func = hpat.jit(test_impl) - # column with NA - S = pd.Series([np.nan, 2., 3.]) - self.assertEqual(hpat_func(S), test_impl(S)) - - # all NA case should produce 1 - S = pd.Series([np.nan, np.nan]) + S = pd.Series([np.nan, 2, 3.]) self.assertEqual(hpat_func(S), test_impl(S)) 
def test_series_count1(self): From 19a83fc45ab6767fbc76e598e2e752e0c02da262 Mon Sep 17 00:00:00 2001 From: Sergey Shalnov Date: Sat, 19 Oct 2019 20:53:06 -0500 Subject: [PATCH 2/4] PR227. Test simplified. algo moved with minor changes --- .../datatypes/hpat_pandas_series_functions.py | 134 +++++++++--------- hpat/hiframes/hiframes_typed.py | 2 +- hpat/tests/test_series.py | 24 ++-- 3 files changed, 78 insertions(+), 82 deletions(-) diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py index ff1bea85e..ca60fbb7f 100644 --- a/hpat/datatypes/hpat_pandas_series_functions.py +++ b/hpat/datatypes/hpat_pandas_series_functions.py @@ -925,6 +925,71 @@ def hpat_pandas_series_pow_impl(self, other): raise TypingError('{} The object must be a pandas.series and argument must be a number. Given: {} and other: {}'.format(_func_name, self, other)) +@overload_method(SeriesType, 'prod') +def hpat_pandas_series_prod(self, axis=None, skipna=True, level=None, numeric_only=None, min_count=0): + """ + Pandas Series method :meth:`pandas.Series.prod` implementation. + + .. only:: developer + + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_prod + + Parameters + ----------- + self: :obj:`pandas.Series` + input series + axis: {index (0)} + Axis for the function to be applied on. + *unsupported* + skipna: :obj:`bool`, default :obj:`True` + Exclude nan values when computing the result + level: :obj:`int`, :obj:`str`, default :obj:`None` + If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a scalar. + *unsupported* + numeric_only: :obj:`bool`, default :obj:`None` + Include only float, int, boolean columns. + If None, will attempt to use everything, then use only numeric data. + Not implemented for Series. + *unsupported* + min_count: :obj:`int`, default 0 + The required number of valid values to perform the operation. 
+ If fewer than min_count non-NA values are present the result will be NA. + *unsupported* + + Returns + ------- + :obj: + Returns scalar or Series (if level specified) + """ + + _func_name = 'Method prod().' + + if not isinstance(self, SeriesType): + raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) + + if not isinstance(self.data.dtype, (types.Integer, types.Float)): + raise TypingError('{} Non numeric values unsupported. Given: {}'.format(_func_name, self.data.data.dtype)) + + if not (isinstance(skipna, (types.Omitted, types.Boolean)) or skipna is True): + raise TypingError("{} 'skipna' must be a boolean type. Given: {}".format(_func_name, skipna)) + + if not (isinstance(axis, types.Omitted) or axis is None) \ + or not (isinstance(level, types.Omitted) or level is None) \ + or not (isinstance(numeric_only, types.Omitted) or numeric_only is None) \ + or not (isinstance(min_count, types.Omitted) or min_count == 0): + raise TypingError( + '{} Unsupported parameters. Given axis: {}, level: {}, numeric_only: {}, min_count: {}'.format( + _func_name, axis, level, numeric_only, min_count)) + + def hpat_pandas_series_prod_impl(self, axis=None, skipna=True, level=None, numeric_only=None, min_count=0): + if skipna: + return numpy.nanprod(self._data) + else: + return numpy.prod(self._data) + + return hpat_pandas_series_prod_impl + + @overload_method(SeriesType, 'quantile') def hpat_pandas_series_quantile(self, q=0.5, interpolation='linear'): """ @@ -1074,75 +1139,6 @@ def hpat_pandas_series_max_impl(self, axis=None, skipna=True, level=None, numeri return hpat_pandas_series_max_impl -@overload_method(SeriesType, 'prod') -def hpat_pandas_series_prod(self, axis=None, skipna=None, level=None, numeric_only=None, min_count=0): - """ - Pandas Series method :meth:`pandas.Series.prod` implementation. - - .. 
only:: developer - - Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_prod - - Parameters - ----------- - self: :obj:`pandas.Series` - input series - axis: {index (0)} - Axis for the function to be applied on. - *unsupported* - skipna: :obj:`bool`, default :obj:`True` - Exclude nan values when computing the result - level: :obj:`int`, :obj:`str`, default :obj:`None` - If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a scalar. - *unsupported* - numeric_only: :obj:`bool`, default :obj:`None` - Include only float, int, boolean columns. - If None, will attempt to use everything, then use only numeric data. - Not implemented for Series. - *unsupported* - min_count: :obj:`int`, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result will be NA. - *unsupported* - - Returns - ------- - :obj: - Returns scalar or Series (if level specified) - """ - - _func_name = 'Method prod().' - - if not isinstance(self, SeriesType): - raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) - - if not isinstance(self.dtype, (types.Integer, types.Float)): - raise TypingError('{} Currently function supports only numeric values. Given data type: {}'.format(_func_name, self.data.dtype)) - - if not (isinstance(skipna, (types.Omitted, types.Boolean, types.NoneType)) or skipna is None): - raise TypingError( - '{} The parameter must be a boolean type. Given type skipna: {}'.format(_func_name, skipna)) - - if not (isinstance(axis, types.Omitted) or axis is None) \ - or not (isinstance(level, types.Omitted) or level is None) \ - or not (isinstance(numeric_only, types.Omitted) or numeric_only is None) \ - or not (isinstance(min_count, types.Omitted) or min_count == 0): - raise TypingError( - '{} Unsupported parameters. 
Given axis: {}, level: {}, numeric_only: {}, min_count: {}'.format( - _func_name, axis, level, numeric_only, min_count)) - - def hpat_pandas_series_prod_impl(self, axis=None, skipna=None, level=None, numeric_only=None, min_count=0): - if skipna is None: - skipna = True - - if skipna: - return numpy.nanprod(self._data) - - return numpy.prod(self._data) - - return hpat_pandas_series_prod_impl - - @overload_method(SeriesType, 'mod') def hpat_pandas_series_mod(self, other, level=None, fill_value=None, axis=0): """ diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py index 382fbcd75..d6c523e5d 100644 --- a/hpat/hiframes/hiframes_typed.py +++ b/hpat/hiframes/hiframes_typed.py @@ -865,7 +865,7 @@ def _run_call_series(self, assign, lhs, rhs, series_var, func_name): data = self._get_series_data(series_var, nodes) return self._replace_func(func, [data], pre_nodes=nodes) - if func_name in ('std', 'nunique', 'describe', 'isna', + if func_name in ('std', 'nunique', 'describe', 'isna', 'prod' 'isnull', 'median', 'idxmin', 'idxmax', 'unique'): if rhs.args or rhs.kws: raise ValueError("unsupported Series.{}() arguments".format( diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index d55aa4fd5..f1dfe7362 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -1026,18 +1026,18 @@ def test_impl(S, skipna): [np.nan, np.nan, np.inf], ] - for skipna in [True, False, None]: - with self.subTest(skipna=skipna): - kwargs = {'skipna': skipna} - for data in data_samples: - with self.subTest(data=data): - S = pd.Series(data) - actual = hpat_func(S, **kwargs) - expected = test_impl(S, **kwargs) - if np.isnan(actual) or np.isnan(expected): - self.assertEqual(np.isnan(actual), np.isnan(expected)) - else: - self.assertEqual(actual, expected) + for data in data_samples: + S = pd.Series(data) + + for skipna_var in [True, False]: + actual = hpat_func(S, skipna=skipna_var) + expected = test_impl(S, skipna=skipna_var) + + if np.isnan(actual) 
or np.isnan(expected): + # cannot compare NaN values directly because NaN != NaN + self.assertEqual(np.isnan(actual), np.isnan(expected)) + else: + self.assertEqual(actual, expected) def test_series_prod_skipna_default(self): def test_impl(S): From 02433972ad92df59a9160c57b4418bb88f5aa3f8 Mon Sep 17 00:00:00 2001 From: Sergey Shalnov Date: Sat, 19 Oct 2019 22:25:37 -0500 Subject: [PATCH 3/4] missed comma cause silent names changes in array --- hpat/hiframes/hiframes_typed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py index 112083836..f4b73334a 100644 --- a/hpat/hiframes/hiframes_typed.py +++ b/hpat/hiframes/hiframes_typed.py @@ -856,7 +856,7 @@ def _run_call_series(self, assign, lhs, rhs, series_var, func_name): data = self._get_series_data(series_var, nodes) return self._replace_func(func, [data], pre_nodes=nodes) - if func_name in ('std', 'nunique', 'describe', 'prod' + if func_name in ('std', 'nunique', 'describe', 'prod', 'isnull', 'median', 'idxmin', 'idxmax', 'unique'): if rhs.args or rhs.kws: raise ValueError("unsupported Series.{}() arguments".format( From d6d2a7797de049d5ab91b996b9f48769bdd4ef3f Mon Sep 17 00:00:00 2001 From: Sergey Shalnov Date: Sat, 19 Oct 2019 23:57:05 -0500 Subject: [PATCH 4/4] PR227. 
Using inline kernel --- hpat/hiframes/hiframes_typed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py index f4b73334a..6da99b89b 100644 --- a/hpat/hiframes/hiframes_typed.py +++ b/hpat/hiframes/hiframes_typed.py @@ -856,7 +856,7 @@ def _run_call_series(self, assign, lhs, rhs, series_var, func_name): data = self._get_series_data(series_var, nodes) return self._replace_func(func, [data], pre_nodes=nodes) - if func_name in ('std', 'nunique', 'describe', 'prod', + if func_name in ('std', 'nunique', 'describe', 'isnull', 'median', 'idxmin', 'idxmax', 'unique'): if rhs.args or rhs.kws: raise ValueError("unsupported Series.{}() arguments".format(