diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py
index 87d544676..e7279b8d9 100644
--- a/hpat/datatypes/hpat_pandas_series_functions.py
+++ b/hpat/datatypes/hpat_pandas_series_functions.py
@@ -2274,3 +2274,49 @@ def hpat_pandas_series_median_impl(self, axis=None, skipna=True, level=None, num
         return numpy.median(self._data)
 
     return hpat_pandas_series_median_impl
+
+
+@overload_method(SeriesType, 'dropna')
+def hpat_pandas_series_dropna(self, axis=0, inplace=False):
+    """
+    Pandas Series method :meth:`pandas.Series.dropna` implementation.
+
+    .. only:: developer
+
+       Tests: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_dropna*
+
+    Parameters
+    ----------
+    self: :obj:`pandas.Series`
+        input series
+    axis: :obj:`int` or :obj:`string` {0 or `index`}, default 0
+        There is only one axis to drop values from.
+    inplace: :obj:`bool`, default False
+        If True, do operation inplace and return None.
+        *unsupported*
+
+    Returns
+    -------
+    :obj:`pandas.Series`
+         returns :obj:`pandas.Series` object with NA entries dropped from it.
+    """
+
+    _func_name = 'Method dropna().'
+
+    if not isinstance(self, SeriesType):
+        raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self))
+
+    if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType, types.Omitted)) or axis == 0):
+        raise TypingError('{} The axis must be an Integer or String. Given: {}'.format(_func_name, axis))
+
+    if not (inplace is False or isinstance(inplace, types.Omitted)):
+        raise TypingError('{} Unsupported parameters. Given inplace: {}'.format(_func_name, inplace))
+
+    def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False):
+        # generate Series index if needed by using SeriesType.index (i.e. not self._index)
+        na_data_arr = hpat.hiframes.api.get_nan_mask(self._data)
+        data = self._data[~na_data_arr]
+        index = self.index[~na_data_arr]
+        return pandas.Series(data, index, self._name)
+
+    return hpat_pandas_series_dropna_impl
diff --git a/hpat/hiframes/api.py b/hpat/hiframes/api.py
index a38046fab..84de317d1 100644
--- a/hpat/hiframes/api.py
+++ b/hpat/hiframes/api.py
@@ -47,6 +47,7 @@
     alloc_pre_shuffle_metadata)
 from hpat.hiframes.join import write_send_buff
 from hpat.hiframes.split_impl import string_array_split_view_type
+from numba.errors import TypingError
 
 # XXX: used in agg func output to avoid mutating filter, agg, join, etc.
 # TODO: fix type inferrer and remove this
@@ -533,6 +534,36 @@ def isna_overload(arr, i):
     return lambda arr, i: False
 
 
+def get_nan_mask(arr):
+    """Return a boolean NA mask for arr; this pure-Python stub marks no element as NA."""
+    return np.zeros(len(arr), np.bool_)
+
+
+@overload(get_nan_mask)
+def get_nan_mask_overload(arr):
+    """Choose the get_nan_mask implementation from the type of arr (TypingError on unsupported dtypes)."""
+
+    _func_name = 'Function get_nan_mask().'
+
+    def get_nan_mask_via_isna_impl(arr):
+        return np.array([isna(arr, i) for i in np.arange(len(arr))])
+
+    if isinstance(arr, types.Array):
+        dtype = arr.dtype
+        if isinstance(dtype, types.Float):
+            return lambda arr: np.isnan(arr)
+        elif isinstance(dtype, (types.Boolean, types.Integer)):
+            # boolean and integer arrays cannot hold NaN, so nothing is ever masked
+            return lambda arr: np.zeros(len(arr), np.bool_)
+        elif isinstance(dtype, (types.NPDatetime, types.NPTimedelta)):
+            return get_nan_mask_via_isna_impl
+        else:
+            raise TypingError('{} Not implemented for arrays with dtype: {}'.format(_func_name, dtype))
+    else:
+        # for StringArrayType and other cases rely on isna implementation
+        return get_nan_mask_via_isna_impl
+
+
 @numba.njit
 def min_heapify(arr, n, start, cmp_f):
     min_ind = start
diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py
index a2ec4dc73..630d398bf 100644
--- a/hpat/hiframes/pd_series_ext.py
+++ b/hpat/hiframes/pd_series_ext.py
@@ -532,6 +532,7 @@ def resolve_fillna(self, ary, args, kws):
             out = types.none
         return signature(out, *args)
 
+    # PR135. This needs to be commented out (for new-style impl to be called)
     @bound_function("series.dropna")
     def resolve_dropna(self, ary, args, kws):
         out = ary
@@ -994,7 +995,7 @@ def generic_expand_cumulative_series(self, args, kws):
                            'resolve_cumsum', 'resolve_shift', 'resolve_sum',
                            'resolve_copy', 'resolve_mean', 'resolve_take',
                            'resolve_max', 'resolve_min', 'resolve_nunique',
-                           'resolve_prod', 'resolve_count']
+                           'resolve_prod', 'resolve_count', 'resolve_dropna']
 
 # use ArrayAttribute for attributes not defined in SeriesAttribute
 for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items():
diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py
index 50c57d957..58e9eec12 100644
--- a/hpat/tests/test_series.py
+++ b/hpat/tests/test_series.py
@@ -1078,25 +1078,160 @@ def test_impl(S):
 
         pd.testing.assert_series_equal(hpat_func(S), test_impl(S), check_names=False)
 
-    def test_series_dropna_float1(self):
-        def test_impl(A):
-            return A.dropna().values
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'No support of axis argument in old-style Series.dropna() impl')
+    def test_series_dropna_axis1(self):
+        '''Verifies Series.dropna() implementation handles 'index' as axis argument'''
+        def test_impl(S):
+            return S.dropna(axis='index')
         hpat_func = hpat.jit(test_impl)
 
-        S1 = pd.Series([1.0, 2.0, np.nan, 1.0])
+        S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
         S2 = S1.copy()
-        np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
 
-    def test_series_dropna_str1(self):
-        def test_impl(A):
-            return A.dropna().values
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'No support of axis argument in old-style Series.dropna() impl')
+    def test_series_dropna_axis2(self):
+        '''Verifies Series.dropna() implementation handles 0 as axis argument'''
+        def test_impl(S):
+            return S.dropna(axis=0)
         hpat_func = hpat.jit(test_impl)
 
-        S1 = pd.Series(['aa', 'b', None, 'ccc'])
+        S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
         S2 = S1.copy()
-        np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'No support of axis argument in old-style Series.dropna() impl')
+    def test_series_dropna_axis3(self):
+        '''Verifies Series.dropna() implementation handles correct non-literal axis argument'''
+        def test_impl(S, axis):
+            return S.dropna(axis=axis)
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
+        S2 = S1.copy()
+        axis_values = [0, 'index']
+        for value in axis_values:
+            pd.testing.assert_series_equal(hpat_func(S1, value), test_impl(S2, value))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'BUG: old-style dropna impl returns series without index')
+    def test_series_dropna_float_index1(self):
+        '''Verifies Series.dropna() implementation for float series with default index'''
+        def test_impl(S):
+            return S.dropna()
+        hpat_func = hpat.jit(test_impl)
+
+        for data in test_global_input_data_float64:
+            S1 = pd.Series(data)
+            S2 = S1.copy()
+            pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'BUG: old-style dropna impl returns series without index')
+    def test_series_dropna_float_index2(self):
+        '''Verifies Series.dropna() implementation for float series with string index'''
+        def test_impl(S):
+            return S.dropna()
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf], ['a', 'b', 'c', 'd', 'e'])
+        S2 = S1.copy()
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'BUG: old-style dropna impl returns series without index')
+    def test_series_dropna_str_index1(self):
+        '''Verifies Series.dropna() implementation for series of strings with default index'''
+        def test_impl(S):
+            return S.dropna()
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series(['aa', 'b', None, 'cccd', ''])
+        S2 = S1.copy()
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'BUG: old-style dropna impl returns series without index')
+    def test_series_dropna_str_index2(self):
+        '''Verifies Series.dropna() implementation for series of strings with string index'''
+        def test_impl(S):
+            return S.dropna()
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series(['aa', 'b', None, 'cccd', ''], ['a', 'b', 'c', 'd', 'e'])
+        S2 = S1.copy()
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'BUG: old-style dropna impl returns series without index')
+    def test_series_dropna_str_index3(self):
+        '''Verifies Series.dropna() implementation for series of strings with integer index'''
+        def test_impl(S):
+            return S.dropna()
+
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series(['aa', 'b', None, 'cccd', ''], index=[1, 2, 5, 7, 10])
+        S2 = S1.copy()
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skip('BUG: old-style dropna impl returns series without index, in new-style inplace is unsupported')
+    def test_series_dropna_float_inplace_no_index1(self):
+        '''Verifies Series.dropna() implementation for float series with default index and inplace argument True'''
+        def test_impl(S):
+            S.dropna(inplace=True)
+            return S
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
+        S2 = S1.copy()
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skip('TODO: add reflection support and check method return value')
+    def test_series_dropna_float_inplace_no_index2(self):
+        '''Verifies Series.dropna(inplace=True) results are reflected back in the original float series'''
+        def test_impl(S):
+            return S.dropna(inplace=True)
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
+        S2 = S1.copy()
+        self.assertIsNone(hpat_func(S1))
+        self.assertIsNone(test_impl(S2))
+        pd.testing.assert_series_equal(S1, S2)
+
+    @unittest.skip('BUG: old-style dropna impl returns series without index, in new-style inplace is unsupported')
+    def test_series_dropna_str_inplace_no_index1(self):
+        '''Verifies Series.dropna() implementation for series of strings
+           with default index and inplace argument True
+        '''
+        def test_impl(S):
+            S.dropna(inplace=True)
+            return S
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series(['aa', 'b', None, 'cccd', ''])
+        S2 = S1.copy()
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
+
+    @unittest.skip('TODO: add reflection support and check method return value')
+    def test_series_dropna_str_inplace_no_index2(self):
+        '''Verifies Series.dropna(inplace=True) results are reflected back in the original string series'''
+        def test_impl(S):
+            return S.dropna(inplace=True)
+        hpat_func = hpat.jit(test_impl)
+
+        S1 = pd.Series(['aa', 'b', None, 'cccd', ''])
+        S2 = S1.copy()
+        self.assertIsNone(hpat_func(S1))
+        self.assertIsNone(test_impl(S2))
+        pd.testing.assert_series_equal(S1, S2)
 
     def test_series_dropna_str_parallel1(self):
+        '''Verifies Series.dropna() distributed work for series of strings with default index'''
         def test_impl(A):
             B = A.dropna()
             return (B == 'gg').sum()
@@ -1106,46 +1241,44 @@ def test_impl(A):
         start, end = get_start_end(len(S1))
         # TODO: gatherv
         self.assertEqual(hpat_func(S1[start:end]), test_impl(S1))
+        self.assertEqual(count_array_REPs(), 0)
+        self.assertEqual(count_parfor_REPs(), 0)
+        self.assertTrue(count_array_OneDs() > 0)
 
-    def test_series_dropna_float_inplace1(self):
-        def test_impl(A):
-            A.dropna(inplace=True)
-            return A.values
-        hpat_func = hpat.jit(test_impl)
-
-        S1 = pd.Series([1.0, 2.0, np.nan, 1.0])
-        S2 = S1.copy()
-        np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))
-
-    def test_series_dropna_str_inplace1(self):
-        def test_impl(A):
-            A.dropna(inplace=True)
-            return A.values
+    @unittest.skip('AssertionError: Series are different\n'
+                   'Series length are different\n'
+                   '[left]: 3, Int64Index([0, 1, 2], dtype=\'int64\')\n'
+                   '[right]: 2, Int64Index([1, 2], dtype=\'int64\')')
+    def test_series_dropna_dt_no_index1(self):
+        '''Verifies Series.dropna() implementation for datetime series with default index'''
+        def test_impl(S):
+            return S.dropna()
         hpat_func = hpat.jit(test_impl)
 
-        S1 = pd.Series(['aa', 'b', None, 'ccc'])
+        S1 = pd.Series([pd.NaT, pd.Timestamp('1970-12-01'), pd.Timestamp('2012-07-25')])
         S2 = S1.copy()
-        np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))
+        pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
 
-    @unittest.skip('Unsupported functionality: failed to handle index')
-    def test_series_dropna_index_str(self):
+    def test_series_dropna_bool_no_index1(self):
+        '''Verifies Series.dropna() implementation for bool series with default index'''
         def test_impl(S):
             return S.dropna()
-
         hpat_func = hpat.jit(test_impl)
 
-        S1 = pd.Series(['aa', 'b', None, 'ccc'], index=['a', 'b', 'c', 'd'])
+        S1 = pd.Series([True, False, False, True])
         S2 = S1.copy()
         pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
 
-    @unittest.skip('Unsupported functionality: failed to handle index')
-    def test_series_dropna_index_int(self):
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     'BUG: old-style dropna impl returns series without index')
+    def test_series_dropna_int_no_index1(self):
+        '''Verifies Series.dropna() implementation for integer series with default index'''
         def test_impl(S):
             return S.dropna()
-
         hpat_func = hpat.jit(test_impl)
 
-        S1 = pd.Series(['aa', 'b', None, 'ccc'], index=[1, 2, 5, 7])
+        n = 11
+        S1 = pd.Series(np.arange(n, dtype=np.int64))
         S2 = S1.copy()
         pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))
 