From 942d0fd2815aa74b1c33cdcc7df3673285808a6d Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 8 Oct 2019 08:08:09 +0300 Subject: [PATCH 1/4] Implement series.var() in new style --- .../datatypes/hpat_pandas_series_functions.py | 73 +++++++++++++++++++ hpat/hiframes/hiframes_typed.py | 2 +- hpat/hiframes/pd_series_ext.py | 3 +- hpat/tests/test_series.py | 53 ++++++++++++++ 4 files changed, 129 insertions(+), 2 deletions(-) diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py index 37fd48060..9bc9572fa 100644 --- a/hpat/datatypes/hpat_pandas_series_functions.py +++ b/hpat/datatypes/hpat_pandas_series_functions.py @@ -188,6 +188,79 @@ def hpat_pandas_series_values_impl(self): return hpat_pandas_series_values_impl +@overload_method(SeriesType, 'var') +def hapt_pandas_series_var(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None): + """ + Pandas Series method :meth:`pandas.Series.var` implementation. + + .. only:: developer + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_var + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_var_unboxing + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_var_str + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_var_unsupported_params + + Parameters + ---------- + self: :obj:`pandas.Series` + input series + axis: :obj:`int`, :obj:`str` + Axis along which the operation acts + 0/None - row-wise operation + 1 - column-wise operation + *unsupported* + skipna: :obj:`bool` + exclude NA/null values + level: :obj:`int`, :obj:`str` + If the axis is a MultiIndex (hierarchical), + count along a particular level, collapsing into a scalar + *unsupported* + ddof: :obj:`int` + Delta Degrees of Freedom. + The divisor used in calculations is N - ddof, + where N represents the number of elements. + numeric_only: :obj:`bool` + Include only float, int, boolean columns. 
+ If None, will attempt to use everything, then use only numeric data. + Not implemented for Series. + *unsupported* + + Returns + ------- + :obj:`scalar` + returns :obj:`scalar` + """ + + _func_name = 'Method var().' + + if not isinstance(self, SeriesType): + raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) + + if not isinstance(self.dtype, types.Number): + raise TypingError('{} The object must be a number. Given self.dtype: {}'.format(_func_name, self.dtype)) + + if not isinstance(skipna, (types.Omitted, types.Boolean, types.NoneType)) and skipna is not None: + raise TypingError('{} The object must be a boolean. Given skipna: {}'.format(_func_name, skipna)) + + if not isinstance(ddof, (types.Omitted, int, types.Integer)): + raise TypingError('{} The object must be an integer. Given ddof: {}'.format(_func_name, ddof)) + + for name, arg in [('axis', axis), ('level', level), ('numeric_only', numeric_only)]: + if not isinstance(arg, (types.Omitted, types.NoneType)) and arg is not None: + raise TypingError('{} Unsupported parameters. 
Given {}: {}'.format(_func_name, name, arg)) + + def hpat_pandas_series_var_impl(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None): + if skipna is None: + skipna = True + + if skipna: + valuable_length = len(self._data) - numpy.sum(numpy.isnan(self._data)) + return numpy.nanvar(self._data) * valuable_length / (valuable_length - ddof) + + return self._data.var() * len(self._data) / (len(self._data) - ddof) + + return hpat_pandas_series_var_impl + + @overload_attribute(SeriesType, 'index') def hpat_pandas_series_index(self): """ diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py index db3284ae9..c6cd1da50 100644 --- a/hpat/hiframes/hiframes_typed.py +++ b/hpat/hiframes/hiframes_typed.py @@ -843,7 +843,7 @@ def parse_impl(data): def _run_call_series(self, assign, lhs, rhs, series_var, func_name): # single arg functions - if func_name in ('sum', 'count', 'mean', 'var', 'min', 'max'): + if func_name in ('sum', 'count', 'mean', 'min', 'max'): if rhs.args or rhs.kws: raise ValueError("HPAT pipeline does not support arguments for Series.{}()".format(func_name)) diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py index d36986836..ed4a3ce7e 100644 --- a/hpat/hiframes/pd_series_ext.py +++ b/hpat/hiframes/pd_series_ext.py @@ -992,7 +992,8 @@ def generic_expand_cumulative_series(self, args, kws): # TODO: add itemsize, strides, etc. 
when removed from Pandas _not_series_array_attrs = ['flat', 'ctypes', 'itemset', 'reshape', 'sort', 'flatten', - 'resolve_shift', 'resolve_sum', 'resolve_copy', 'resolve_mean', + 'resolve_shift', 'resolve_sum', 'resolve_var', + 'resolve_copy', 'resolve_mean', 'resolve_take', 'resolve_max', 'resolve_min', 'resolve_nunique', 'resolve_prod', 'resolve_count'] diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index 2a27408ea..07e7303b3 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -2637,6 +2637,58 @@ def test_series_nunique_param1_impl(S, dropna): result_param1 = hpat_func_param1(S, param1) self.assertEqual(result_param1, result_param1_ref) + def test_series_var(self): + def pyfunc(): + series = pd.Series([1.3, -2.7, np.nan, 0.1, 10.9]) + return series.var() + + cfunc = hpat.jit(pyfunc) + ref_result = pyfunc() + result = cfunc() + np.testing.assert_equal(ref_result, result) + + def test_series_var_unboxing(self): + def pyfunc(series, skipna, ddof): + return series.var(skipna=skipna, ddof=ddof) + + cfunc = hpat.jit(pyfunc) + series = pd.Series([1.3, -2.7, np.nan, 0.1, 10.9]) + for ddof in [0, 1]: + for skipna in [True, False]: + ref_result = pyfunc(series, skipna=skipna, ddof=ddof) + result = cfunc(series, skipna=skipna, ddof=ddof) + np.testing.assert_equal(ref_result, result) + + def test_series_var_str(self): + def pyfunc(series): + return series.var() + + cfunc = hpat.jit(pyfunc) + series = pd.Series(['test', 'series', 'var', 'str']) + with self.assertRaises(TypingError) as raises: + cfunc(series) + msg = 'Method var(). The object must be a number. Given self.dtype: {}' + self.assertIn(msg.format(types.unicode_type), str(raises.exception)) + + def test_series_var_unsupported_params(self): + def pyfunc(series, axis, level, numeric_only): + return series.var(axis=axis, level=level, numeric_only=numeric_only) + + cfunc = hpat.jit(pyfunc) + series = pd.Series([1.3, -2.7, np.nan, 0.1, 10.9]) + msg = 'Method var(). 
Unsupported parameters. Given {}: {}' + with self.assertRaises(TypingError) as raises: + cfunc(series, axis=1, level=None, numeric_only=None) + self.assertIn(msg.format('axis', 'int'), str(raises.exception)) + + with self.assertRaises(TypingError) as raises: + cfunc(series, axis=None, level=1, numeric_only=None) + self.assertIn(msg.format('level', 'int'), str(raises.exception)) + + with self.assertRaises(TypingError) as raises: + cfunc(series, axis=None, level=None, numeric_only=True) + self.assertIn(msg.format('numeric_only', 'bool'), str(raises.exception)) + def test_series_count(self): def test_series_count_impl(S): return S.count() @@ -2665,5 +2717,6 @@ def test_series_count_impl(S): result = hpat_func(S) self.assertEqual(result, result_ref) + if __name__ == "__main__": unittest.main() From 6c273279674e11ab7a6882fa0d826f096b40129a Mon Sep 17 00:00:00 2001 From: Denis Date: Wed, 16 Oct 2019 15:00:17 +0300 Subject: [PATCH 2/4] Minor changes for series.var() --- .../datatypes/hpat_pandas_series_functions.py | 11 +++++----- hpat/tests/test_series.py | 21 ++++++++++--------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py index 9bc9572fa..6887388e0 100644 --- a/hpat/datatypes/hpat_pandas_series_functions.py +++ b/hpat/datatypes/hpat_pandas_series_functions.py @@ -189,7 +189,7 @@ def hpat_pandas_series_values_impl(self): @overload_method(SeriesType, 'var') -def hapt_pandas_series_var(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None): +def hpat_pandas_series_var(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None): """ Pandas Series method :meth:`pandas.Series.var` implementation. 
@@ -205,8 +205,8 @@ def hapt_pandas_series_var(self, axis=None, skipna=None, level=None, ddof=1, num input series axis: :obj:`int`, :obj:`str` Axis along which the operation acts - 0/None - row-wise operation - 1 - column-wise operation + 0/None/'index' - row-wise operation + 1/'columns' - column-wise operation *unsupported* skipna: :obj:`bool` exclude NA/null values @@ -235,8 +235,9 @@ def hapt_pandas_series_var(self, axis=None, skipna=None, level=None, ddof=1, num if not isinstance(self, SeriesType): raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) - if not isinstance(self.dtype, types.Number): - raise TypingError('{} The object must be a number. Given self.dtype: {}'.format(_func_name, self.dtype)) + if not isinstance(self.data.dtype, types.Number): + msg = '{} The object must be a number. Given self.data.dtype: {}' + raise TypingError(msg.format(_func_name, self.data.dtype)) if not isinstance(skipna, (types.Omitted, types.Boolean, types.NoneType)) and skipna is not None: raise TypingError('{} The object must be a boolean. 
Given skipna: {}'.format(_func_name, skipna)) diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index 07e7303b3..3ba6c32f7 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -2639,7 +2639,7 @@ def test_series_nunique_param1_impl(S, dropna): def test_series_var(self): def pyfunc(): - series = pd.Series([1.3, -2.7, np.nan, 0.1, 10.9]) + series = pd.Series([1.0, np.nan, -1.0, 0.0, 5e-324]) return series.var() cfunc = hpat.jit(pyfunc) @@ -2652,22 +2652,23 @@ def pyfunc(series, skipna, ddof): return series.var(skipna=skipna, ddof=ddof) cfunc = hpat.jit(pyfunc) - series = pd.Series([1.3, -2.7, np.nan, 0.1, 10.9]) - for ddof in [0, 1]: - for skipna in [True, False]: - ref_result = pyfunc(series, skipna=skipna, ddof=ddof) - result = cfunc(series, skipna=skipna, ddof=ddof) - np.testing.assert_equal(ref_result, result) + for data in test_global_input_data_float64: + series = pd.Series(data) + for ddof in [0, 1]: + for skipna in [True, False]: + ref_result = pyfunc(series, skipna=skipna, ddof=ddof) + result = cfunc(series, skipna=skipna, ddof=ddof) + np.testing.assert_equal(ref_result, result) def test_series_var_str(self): def pyfunc(series): return series.var() cfunc = hpat.jit(pyfunc) - series = pd.Series(['test', 'series', 'var', 'str']) + series = pd.Series(test_global_input_data_unicode_kind4) with self.assertRaises(TypingError) as raises: cfunc(series) - msg = 'Method var(). The object must be a number. Given self.dtype: {}' + msg = 'Method var(). The object must be a number. Given self.data.dtype: {}' self.assertIn(msg.format(types.unicode_type), str(raises.exception)) def test_series_var_unsupported_params(self): @@ -2675,7 +2676,7 @@ def pyfunc(series, axis, level, numeric_only): return series.var(axis=axis, level=level, numeric_only=numeric_only) cfunc = hpat.jit(pyfunc) - series = pd.Series([1.3, -2.7, np.nan, 0.1, 10.9]) + series = pd.Series(test_global_input_data_float64[0]) msg = 'Method var(). Unsupported parameters. 
Given {}: {}' with self.assertRaises(TypingError) as raises: cfunc(series, axis=1, level=None, numeric_only=None) From f783e970c892811b51fdffeead2ac54c0a7c59fe Mon Sep 17 00:00:00 2001 From: Denis Date: Thu, 24 Oct 2019 09:12:14 +0300 Subject: [PATCH 3/4] Avoid division by zero for series.var() --- hpat/datatypes/hpat_pandas_series_functions.py | 6 ++++++ hpat/tests/test_series.py | 18 +++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py index 6887388e0..b59fc3541 100644 --- a/hpat/datatypes/hpat_pandas_series_functions.py +++ b/hpat/datatypes/hpat_pandas_series_functions.py @@ -255,8 +255,14 @@ def hpat_pandas_series_var_impl(self, axis=None, skipna=None, level=None, ddof=1 if skipna: valuable_length = len(self._data) - numpy.sum(numpy.isnan(self._data)) + if valuable_length <= ddof: + return numpy.nan + return numpy.nanvar(self._data) * valuable_length / (valuable_length - ddof) + if len(self._data) <= ddof: + return numpy.nan + return self._data.var() * len(self._data) / (len(self._data) - ddof) return hpat_pandas_series_var_impl diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index 3ba6c32f7..5cc5da031 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -36,18 +36,22 @@ ), ]] +min_float64 = np.finfo('float64').min +max_float64 = np.finfo('float64').max + test_global_input_data_float64 = [ - [1.0, np.nan, -1.0, 0.0, 5e-324], + [1., np.nan, -1., 0., min_float64, max_float64], [np.nan, np.inf, np.NINF, np.NZERO] ] -min_int64 = -9223372036854775808 -max_int64 = 9223372036854775807 -max_uint64 = 18446744073709551615 +min_int64 = np.iinfo('int64').min +max_int64 = np.iinfo('int64').max +max_uint64 = np.iinfo('uint64').max test_global_input_data_integer64 = [ - [1, -1, 0, max_uint64], - [-0, min_int64, max_int64] + [1, -1, 0], + [min_int64, max_int64], + [max_uint64] ] test_global_input_data_numeric = 
test_global_input_data_integer64 + test_global_input_data_float64 @@ -2652,7 +2656,7 @@ def pyfunc(series, skipna, ddof): return series.var(skipna=skipna, ddof=ddof) cfunc = hpat.jit(pyfunc) - for data in test_global_input_data_float64: + for data in test_global_input_data_numeric + [[]]: series = pd.Series(data) for ddof in [0, 1]: for skipna in [True, False]: From a1d287a8e1bc3716cd473c1e97d277612df279ef Mon Sep 17 00:00:00 2001 From: Denis Date: Thu, 24 Oct 2019 13:27:21 +0300 Subject: [PATCH 4/4] Revert multiprocessing parallelism for series.var() --- hpat/hiframes/hiframes_typed.py | 2 +- hpat/tests/test_series.py | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py index c6cd1da50..db3284ae9 100644 --- a/hpat/hiframes/hiframes_typed.py +++ b/hpat/hiframes/hiframes_typed.py @@ -843,7 +843,7 @@ def parse_impl(data): def _run_call_series(self, assign, lhs, rhs, series_var, func_name): # single arg functions - if func_name in ('sum', 'count', 'mean', 'min', 'max'): + if func_name in ('sum', 'count', 'mean', 'var', 'min', 'max'): if rhs.args or rhs.kws: raise ValueError("HPAT pipeline does not support arguments for Series.{}()".format(func_name)) diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index 5cc5da031..93d2a14c2 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -2647,11 +2647,20 @@ def pyfunc(): return series.var() cfunc = hpat.jit(pyfunc) - ref_result = pyfunc() - result = cfunc() - np.testing.assert_equal(ref_result, result) + np.testing.assert_equal(pyfunc(), cfunc()) def test_series_var_unboxing(self): + def pyfunc(series): + return series.var() + + cfunc = hpat.jit(pyfunc) + for data in test_global_input_data_numeric + [[]]: + series = pd.Series(data) + np.testing.assert_equal(pyfunc(series), cfunc(series)) + + @unittest.skipIf(hpat.config.config_pipeline_hpat_default, + 'Series.var() parameters "ddof" and 
"skipna" unsupported') + def test_series_var_full(self): def pyfunc(series, skipna, ddof): return series.var(skipna=skipna, ddof=ddof)