-
Notifications
You must be signed in to change notification settings - Fork 62
Refactor Series.dropna() to a new style w/o inplace support #239
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ | |
alloc_pre_shuffle_metadata) | ||
from hpat.hiframes.join import write_send_buff | ||
from hpat.hiframes.split_impl import string_array_split_view_type | ||
from numba.errors import TypingError | ||
|
||
# XXX: used in agg func output to avoid mutating filter, agg, join, etc. | ||
# TODO: fix type inferrer and remove this | ||
|
@@ -533,6 +534,31 @@ def isna_overload(arr, i): | |
return lambda arr, i: False | ||
|
||
|
||
def get_nan_mask(arr): | ||
return np.zeros(len(arr), np.bool_) | ||
|
||
|
||
@overload(get_nan_mask) | ||
def get_nan_mask_overload(arr): | ||
|
||
def get_nan_mask_via_isna_impl(arr): | ||
return np.array([isna(arr, i) for i in np.arange(len(arr))]) | ||
|
||
if isinstance(arr, types.Array): | ||
dtype = arr.dtype | ||
if isinstance(dtype, types.Float): | ||
return lambda arr: np.isnan(arr) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would propose to avoid lambda here and make it in the same way as |
||
elif isinstance(dtype, (types.Boolean, types.Integer)): | ||
return lambda arr: np.zeros(len(arr), np.bool_) | ||
elif isinstance(dtype, (types.NPDatetime, types.NPTimedelta)): | ||
return get_nan_mask_via_isna_impl | ||
else: | ||
raise TypingError('{} Not implemented for arrays with dtype: {}'.format(_func_name, dtype)) | ||
else: | ||
# for StringArrayType and other cases rely on isna implementation | ||
return get_nan_mask_via_isna_impl | ||
|
||
|
||
@numba.njit | ||
def min_heapify(arr, n, start, cmp_f): | ||
min_ind = start | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1078,25 +1078,159 @@ def test_impl(S): | |
pd.testing.assert_series_equal(hpat_func(S), | ||
test_impl(S), check_names=False) | ||
|
||
def test_series_dropna_float1(self): | ||
def test_impl(A): | ||
return A.dropna().values | ||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'No support of axis argument in old-style Series.dropna() impl') | ||
def test_series_dropna_axis1(self): | ||
'''Verifies Series.dropna() implementation handles 'index' as axis argument''' | ||
def test_impl(S): | ||
return S.dropna(axis='index') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This changes the original test. Please implement a new one with this parameter. |
||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No need to copy in tests that don't test inplace operation |
||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you use floats defined globally? e.g. f2435b0#diff-deca39d332649cea819383154a5d2cb3R39-R42 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @densmirn Sure, but in most of the tests indexes of different types have to be specified too. I now use test_global_input_data_float64 in one of the tests that uses the default index: test_series_dropna_float_index1. That should probably be enough. |
||
S2 = S1.copy() | ||
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2)) | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
def test_series_dropna_str1(self): | ||
def test_impl(A): | ||
return A.dropna().values | ||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'No support of axis argument in old-style Series.dropna() impl') | ||
def test_series_dropna_axis2(self): | ||
'''Verifies Series.dropna() implementation handles 0 as axis argument''' | ||
def test_impl(S): | ||
return S.dropna(axis=0) | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'ccc']) | ||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf]) | ||
S2 = S1.copy() | ||
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2)) | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'No support of axis argument in old-style Series.dropna() impl') | ||
def test_series_dropna_axis3(self): | ||
'''Verifies Series.dropna() implementation handles correct non-literal axis argument''' | ||
def test_impl(S, axis): | ||
return S.dropna(axis=axis) | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf]) | ||
S2 = S1.copy() | ||
axis_values = [0, 'index'] | ||
for value in axis_values: | ||
pd.testing.assert_series_equal(hpat_func(S1, value), test_impl(S2, value)) | ||
|
||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'BUG: old-style dropna impl returns series without index') | ||
def test_series_dropna_float_index1(self): | ||
'''Verifies Series.dropna() implementation for float series with default index''' | ||
def test_impl(S): | ||
return S.dropna() | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
for data in test_global_input_data_float64: | ||
S1 = pd.Series(data) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'BUG: old-style dropna impl returns series without index') | ||
def test_series_dropna_float_index2(self): | ||
'''Verifies Series.dropna() implementation for float series with string index''' | ||
def test_impl(S): | ||
return S.dropna() | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf], ['a', 'b', 'c', 'd', 'e']) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'BUG: old-style dropna impl returns series without index') | ||
def test_series_dropna_str_index1(self): | ||
'''Verifies Series.dropna() implementation for series of strings with default index''' | ||
def test_impl(S): | ||
return S.dropna() | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'cccd', '']) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'BUG: old-style dropna impl returns series without index') | ||
def test_series_dropna_str_index2(self): | ||
'''Verifies Series.dropna() implementation for series of strings with string index''' | ||
def test_impl(S): | ||
return S.dropna() | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'cccd', ''], ['a', 'b', 'c', 'd', 'e']) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'BUG: old-style dropna impl returns series without index') | ||
def test_series_dropna_str_index3(self): | ||
def test_impl(S): | ||
return S.dropna() | ||
|
||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'cccd', ''], index=[1, 2, 5, 7, 10]) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skip('BUG: old-style dropna impl returns series without index, in new-style inplace is unsupported') | ||
def test_series_dropna_float_inplace_no_index1(self): | ||
'''Verifies Series.dropna() implementation for float series with default index and inplace argument True''' | ||
def test_impl(S): | ||
S.dropna(inplace=True) | ||
return S | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf]) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skip('TODO: add reflection support and check method return value') | ||
def test_series_dropna_float_inplace_no_index2(self): | ||
'''Verifies Series.dropna(inplace=True) results are reflected back in the original float series''' | ||
def test_impl(S): | ||
return S.dropna(inplace=True) | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf]) | ||
S2 = S1.copy() | ||
self.assertIsNone(hpat_func(S1)) | ||
self.assertIsNone(test_impl(S2)) | ||
pd.testing.assert_series_equal(S1, S2) | ||
|
||
@unittest.skip('BUG: old-style dropna impl returns series without index, in new-style inplace is unsupported') | ||
def test_series_dropna_str_inplace_no_index1(self): | ||
'''Verifies Series.dropna() implementation for series of strings | ||
with default index and inplace argument True | ||
''' | ||
def test_impl(S): | ||
S.dropna(inplace=True) | ||
return S | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'cccd', '']) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skip('TODO: add reflection support and check method return value') | ||
def test_series_dropna_str_inplace_no_index2(self): | ||
'''Verifies Series.dropna(inplace=True) results are reflected back in the original string series''' | ||
def test_impl(S): | ||
return S.dropna(inplace=True) | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'cccd', '']) | ||
S2 = S1.copy() | ||
self.assertIsNone(hpat_func(S1)) | ||
self.assertIsNone(test_impl(S2)) | ||
pd.testing.assert_series_equal(S1, S2) | ||
|
||
def test_series_dropna_str_parallel1(self): | ||
'''Verifies Series.dropna() distributed work for series of strings with default index''' | ||
def test_impl(A): | ||
B = A.dropna() | ||
return (B == 'gg').sum() | ||
|
@@ -1106,46 +1240,44 @@ def test_impl(A): | |
start, end = get_start_end(len(S1)) | ||
# TODO: gatherv | ||
self.assertEqual(hpat_func(S1[start:end]), test_impl(S1)) | ||
self.assertEqual(count_array_REPs(), 0) | ||
self.assertEqual(count_parfor_REPs(), 0) | ||
self.assertTrue(count_array_OneDs() > 0) | ||
|
||
def test_series_dropna_float_inplace1(self): | ||
def test_impl(A): | ||
A.dropna(inplace=True) | ||
return A.values | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series([1.0, 2.0, np.nan, 1.0]) | ||
S2 = S1.copy() | ||
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
def test_series_dropna_str_inplace1(self): | ||
def test_impl(A): | ||
A.dropna(inplace=True) | ||
return A.values | ||
@unittest.skip('AssertionError: Series are different\n' | ||
'Series length are different\n' | ||
'[left]: 3, Int64Index([0, 1, 2], dtype=\'int64\')\n' | ||
'[right]: 2, Int64Index([1, 2], dtype=\'int64\')') | ||
def test_series_dropna_dt_no_index1(self): | ||
'''Verifies Series.dropna() implementation for datetime series with default index''' | ||
def test_impl(S): | ||
return S.dropna() | ||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'ccc']) | ||
S1 = pd.Series([pd.NaT, pd.Timestamp('1970-12-01'), pd.Timestamp('2012-07-25')]) | ||
S2 = S1.copy() | ||
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2)) | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skip('Unsupported functionality: failed to handle index') | ||
def test_series_dropna_index_str(self): | ||
def test_series_dropna_bool_no_index1(self): | ||
'''Verifies Series.dropna() implementation for bool series with default index''' | ||
def test_impl(S): | ||
return S.dropna() | ||
|
||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'ccc'], index=['a', 'b', 'c', 'd']) | ||
S1 = pd.Series([True, False, False, True]) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
@unittest.skip('Unsupported functionality: failed to handle index') | ||
def test_series_dropna_index_int(self): | ||
@unittest.skipIf(hpat.config.config_pipeline_hpat_default, | ||
'BUG: old-style dropna impl returns series without index') | ||
def test_series_dropna_int_no_index1(self): | ||
'''Verifies Series.dropna() implementation for integer series with default index''' | ||
def test_impl(S): | ||
return S.dropna() | ||
|
||
hpat_func = hpat.jit(test_impl) | ||
|
||
S1 = pd.Series(['aa', 'b', None, 'ccc'], index=[1, 2, 5, 7]) | ||
n = 11 | ||
S1 = pd.Series(np.arange(n, dtype=np.int64)) | ||
S2 = S1.copy() | ||
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2)) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it is better to implement (or reuse) a function which can return
self._data == numpy.nan
and use it in a loop here. It will solve the issue with index generation. Also, this function will be very helpful in other Series methods.