Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2274,3 +2274,49 @@ def hpat_pandas_series_median_impl(self, axis=None, skipna=True, level=None, num
return numpy.median(self._data)

return hpat_pandas_series_median_impl


@overload_method(SeriesType, 'dropna')
def hpat_pandas_series_dropna(self, axis=0, inplace=False):
    """
    Pandas Series method :meth:`pandas.Series.dropna` implementation.

    .. only:: developer

       Tests: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_dropna*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        series to drop NA entries from
    axis: :obj:`int` or :obj:`string` {0 or `index`}, default 0
        A Series has a single axis to drop values from. The argument is only
        type-checked; the generated implementation does not read it.
    inplace: :obj:`bool`, default False
        If True, do operation inplace and return None.
        *unsupported*: anything other than the default is rejected at typing time.

    Returns
    -------
    :obj:`pandas.Series`
        new :obj:`pandas.Series` holding only the non-NA entries, together
        with the matching index entries and the original name.

    Raises
    ------
    TypingError
        if `self` is not a Series type, if `axis` has an unexpected type,
        or if `inplace` is given a non-default value.
    """

    _func_name = 'Method dropna().'

    if not isinstance(self, SeriesType):
        raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self))

    # axis may arrive as a Numba type (literal or not) or as the raw default value 0
    accepted_axis_types = (types.Integer, types.StringLiteral, types.UnicodeType, types.Omitted)
    axis_is_valid = isinstance(axis, accepted_axis_types) or axis == 0
    if not axis_is_valid:
        raise TypingError('{} The axis must be an Integer or String. Given: {}'.format(_func_name, axis))

    # only the default (False, or omitted by the caller) is supported for inplace
    inplace_is_default = inplace is False or isinstance(inplace, types.Omitted)
    if not inplace_is_default:
        raise TypingError('{} Unsupported parameters. Given inplace: {}'.format(_func_name, inplace))

    def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False):
        # NOTE(review): a reviewer suggested obtaining a helper that performs the
        # per-element NaN check and using it in a loop here; that would also address
        # index generation and be reusable by other Series methods.
        keep_mask = ~hpat.hiframes.api.get_nan_mask(self._data)

        # self.index (not self._index) is used so that a Series index is
        # generated if needed
        kept_data = self._data[keep_mask]
        kept_index = self.index[keep_mask]
        return pandas.Series(kept_data, kept_index, self._name)

    return hpat_pandas_series_dropna_impl
26 changes: 26 additions & 0 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
alloc_pre_shuffle_metadata)
from hpat.hiframes.join import write_send_buff
from hpat.hiframes.split_impl import string_array_split_view_type
from numba.errors import TypingError

# XXX: used in agg func output to avoid mutating filter, agg, join, etc.
# TODO: fix type inferrer and remove this
Expand Down Expand Up @@ -533,6 +534,31 @@ def isna_overload(arr, i):
return lambda arr, i: False


def get_nan_mask(arr):
    """Pure-Python stub of ``get_nan_mask``: return an all-False boolean mask.

    The result is a NumPy ``bool_`` array with one element per element of
    `arr`; the contents of `arr` are not inspected here.
    """
    mask_length = len(arr)
    return np.zeros(mask_length, dtype=np.bool_)


@overload(get_nan_mask)
def get_nan_mask_overload(arr):
    """Select the compiled implementation of ``get_nan_mask`` for the type of `arr`.

    Dispatch rules:

    * NumPy array of floats: element-wise ``np.isnan``.
    * NumPy array of booleans or integers: an all-False mask.
    * NumPy array of datetime64/timedelta64: per-element ``isna`` check.
    * NumPy array of any other dtype: ``TypingError``.
    * Any non-``types.Array`` type (e.g. StringArrayType): per-element
      ``isna`` check.

    Raises
    ------
    TypingError
        if `arr` is a NumPy array whose dtype is not handled above.
    """

    # Used as the prefix of the typing error message below. It was previously
    # referenced without being defined in this function.
    _func_name = 'Function get_nan_mask().'

    def get_nan_mask_via_isna_impl(arr):
        return np.array([isna(arr, i) for i in np.arange(len(arr))])

    def get_nan_mask_float_impl(arr):
        return np.isnan(arr)

    def get_nan_mask_all_false_impl(arr):
        return np.zeros(len(arr), np.bool_)

    if not isinstance(arr, types.Array):
        # for StringArrayType and other cases rely on isna implementation
        return get_nan_mask_via_isna_impl

    dtype = arr.dtype
    if isinstance(dtype, types.Float):
        return get_nan_mask_float_impl
    if isinstance(dtype, (types.Boolean, types.Integer)):
        return get_nan_mask_all_false_impl
    if isinstance(dtype, (types.NPDatetime, types.NPTimedelta)):
        return get_nan_mask_via_isna_impl

    raise TypingError('{} Not implemented for arrays with dtype: {}'.format(_func_name, dtype))


@numba.njit
def min_heapify(arr, n, start, cmp_f):
min_ind = start
Expand Down
3 changes: 2 additions & 1 deletion hpat/hiframes/pd_series_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,7 @@ def resolve_fillna(self, ary, args, kws):
out = types.none
return signature(out, *args)

# PR135. This needs to be commented out (for new-style impl to be called)
@bound_function("series.dropna")
def resolve_dropna(self, ary, args, kws):
out = ary
Expand Down Expand Up @@ -994,7 +995,7 @@ def generic_expand_cumulative_series(self, args, kws):
'resolve_cumsum',
'resolve_shift', 'resolve_sum', 'resolve_copy', 'resolve_mean',
'resolve_take', 'resolve_max', 'resolve_min', 'resolve_nunique',
'resolve_prod', 'resolve_count']
'resolve_prod', 'resolve_count', 'resolve_dropna']

# use ArrayAttribute for attributes not defined in SeriesAttribute
for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items():
Expand Down
200 changes: 166 additions & 34 deletions hpat/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,25 +1078,159 @@ def test_impl(S):
pd.testing.assert_series_equal(hpat_func(S),
test_impl(S), check_names=False)

def test_series_dropna_float1(self):
def test_impl(A):
return A.dropna().values
@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
'No support of axis argument in old-style Series.dropna() impl')
def test_series_dropna_axis1(self):
'''Verifies Series.dropna() implementation handles 'index' as axis argument'''
def test_impl(S):
return S.dropna(axis='index')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This changes the original test. Please add a new test that uses this parameter instead.

hpat_func = hpat.jit(test_impl)

S1 = pd.Series([1.0, 2.0, np.nan, 1.0])
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to copy in tests that don't test inplace operation

S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you use floats defined globally? e.g. f2435b0#diff-deca39d332649cea819383154a5d2cb3R39-R42

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@densmirn Sure, but most of the tests also have to specify indexes of different types. I now use test_global_input_data_float64 in one of the tests that uses the default index: test_series_dropna_float_index1. That should probably be enough.

S2 = S1.copy()
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))

def test_series_dropna_str1(self):
def test_impl(A):
return A.dropna().values
@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
'No support of axis argument in old-style Series.dropna() impl')
def test_series_dropna_axis2(self):
'''Verifies Series.dropna() implementation handles 0 as axis argument'''
def test_impl(S):
return S.dropna(axis=0)
hpat_func = hpat.jit(test_impl)

S1 = pd.Series(['aa', 'b', None, 'ccc'])
S1 = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
S2 = S1.copy()
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))

@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
                 'No support of axis argument in old-style Series.dropna() impl')
def test_series_dropna_axis3(self):
    '''Checks Series.dropna() with each valid axis value passed as a run-time (non-literal) argument'''
    def test_impl(S, axis):
        return S.dropna(axis=axis)
    hpat_func = hpat.jit(test_impl)

    jit_input = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
    ref_input = jit_input.copy()
    for axis_value in [0, 'index']:
        actual = hpat_func(jit_input, axis_value)
        expected = test_impl(ref_input, axis_value)
        pd.testing.assert_series_equal(actual, expected)

@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
                 'BUG: old-style dropna impl returns series without index')
def test_series_dropna_float_index1(self):
    '''Checks Series.dropna() on float series with default index, for every shared float64 input'''
    def test_impl(S):
        return S.dropna()
    hpat_func = hpat.jit(test_impl)

    for float_values in test_global_input_data_float64:
        jit_input = pd.Series(float_values)
        ref_input = jit_input.copy()
        actual = hpat_func(jit_input)
        expected = test_impl(ref_input)
        pd.testing.assert_series_equal(actual, expected)

@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
                 'BUG: old-style dropna impl returns series without index')
def test_series_dropna_float_index2(self):
    '''Checks Series.dropna() on a float series that carries a string index'''
    def test_impl(S):
        return S.dropna()
    hpat_func = hpat.jit(test_impl)

    values = [1.0, 2.0, np.nan, 1.0, np.inf]
    labels = ['a', 'b', 'c', 'd', 'e']
    jit_input = pd.Series(values, labels)
    ref_input = jit_input.copy()
    actual = hpat_func(jit_input)
    expected = test_impl(ref_input)
    pd.testing.assert_series_equal(actual, expected)

@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
                 'BUG: old-style dropna impl returns series without index')
def test_series_dropna_str_index1(self):
    '''Checks Series.dropna() on a series of strings with default index'''
    def test_impl(S):
        return S.dropna()
    hpat_func = hpat.jit(test_impl)

    # includes both a missing value (None) and an empty string
    jit_input = pd.Series(['aa', 'b', None, 'cccd', ''])
    ref_input = jit_input.copy()
    actual = hpat_func(jit_input)
    expected = test_impl(ref_input)
    pd.testing.assert_series_equal(actual, expected)

@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
                 'BUG: old-style dropna impl returns series without index')
def test_series_dropna_str_index2(self):
    '''Checks Series.dropna() on a series of strings that carries a string index'''
    def test_impl(S):
        return S.dropna()
    hpat_func = hpat.jit(test_impl)

    values = ['aa', 'b', None, 'cccd', '']
    labels = ['a', 'b', 'c', 'd', 'e']
    jit_input = pd.Series(values, labels)
    ref_input = jit_input.copy()
    actual = hpat_func(jit_input)
    expected = test_impl(ref_input)
    pd.testing.assert_series_equal(actual, expected)

@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
                 'BUG: old-style dropna impl returns series without index')
def test_series_dropna_str_index3(self):
    '''Checks Series.dropna() on a series of strings that carries an integer index'''
    def test_impl(S):
        return S.dropna()

    hpat_func = hpat.jit(test_impl)

    values = ['aa', 'b', None, 'cccd', '']
    jit_input = pd.Series(values, index=[1, 2, 5, 7, 10])
    ref_input = jit_input.copy()
    actual = hpat_func(jit_input)
    expected = test_impl(ref_input)
    pd.testing.assert_series_equal(actual, expected)

@unittest.skip('BUG: old-style dropna impl returns series without index, in new-style inplace is unsupported')
def test_series_dropna_float_inplace_no_index1(self):
    '''Checks Series.dropna(inplace=True) on a float series with default index by returning the series itself'''
    def test_impl(S):
        S.dropna(inplace=True)
        return S
    hpat_func = hpat.jit(test_impl)

    jit_input = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
    # separate copy for the reference run, since the operation mutates its input
    ref_input = jit_input.copy()
    actual = hpat_func(jit_input)
    expected = test_impl(ref_input)
    pd.testing.assert_series_equal(actual, expected)

@unittest.skip('TODO: add reflection support and check method return value')
def test_series_dropna_float_inplace_no_index2(self):
    '''Checks that Series.dropna(inplace=True) returns None and that the change is visible in the caller's float series'''
    def test_impl(S):
        return S.dropna(inplace=True)
    hpat_func = hpat.jit(test_impl)

    jit_input = pd.Series([1.0, 2.0, np.nan, 1.0, np.inf])
    ref_input = jit_input.copy()

    jit_result = hpat_func(jit_input)
    self.assertIsNone(jit_result)
    ref_result = test_impl(ref_input)
    self.assertIsNone(ref_result)

    # both inputs must have been modified in the same way
    pd.testing.assert_series_equal(jit_input, ref_input)

@unittest.skip('BUG: old-style dropna impl returns series without index, in new-style inplace is unsupported')
def test_series_dropna_str_inplace_no_index1(self):
    '''Checks Series.dropna(inplace=True) on a series of strings
       with default index by returning the series itself
    '''
    def test_impl(S):
        S.dropna(inplace=True)
        return S
    hpat_func = hpat.jit(test_impl)

    jit_input = pd.Series(['aa', 'b', None, 'cccd', ''])
    # separate copy for the reference run, since the operation mutates its input
    ref_input = jit_input.copy()
    actual = hpat_func(jit_input)
    expected = test_impl(ref_input)
    pd.testing.assert_series_equal(actual, expected)

@unittest.skip('TODO: add reflection support and check method return value')
def test_series_dropna_str_inplace_no_index2(self):
    '''Checks that Series.dropna(inplace=True) returns None and that the change is visible in the caller's string series'''
    def test_impl(S):
        return S.dropna(inplace=True)
    hpat_func = hpat.jit(test_impl)

    jit_input = pd.Series(['aa', 'b', None, 'cccd', ''])
    ref_input = jit_input.copy()

    jit_result = hpat_func(jit_input)
    self.assertIsNone(jit_result)
    ref_result = test_impl(ref_input)
    self.assertIsNone(ref_result)

    # both inputs must have been modified in the same way
    pd.testing.assert_series_equal(jit_input, ref_input)

def test_series_dropna_str_parallel1(self):
'''Verifies Series.dropna() distributed work for series of strings with default index'''
def test_impl(A):
B = A.dropna()
return (B == 'gg').sum()
Expand All @@ -1106,46 +1240,44 @@ def test_impl(A):
start, end = get_start_end(len(S1))
# TODO: gatherv
self.assertEqual(hpat_func(S1[start:end]), test_impl(S1))
self.assertEqual(count_array_REPs(), 0)
self.assertEqual(count_parfor_REPs(), 0)
self.assertTrue(count_array_OneDs() > 0)

def test_series_dropna_float_inplace1(self):
def test_impl(A):
A.dropna(inplace=True)
return A.values
hpat_func = hpat.jit(test_impl)

S1 = pd.Series([1.0, 2.0, np.nan, 1.0])
S2 = S1.copy()
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))

def test_series_dropna_str_inplace1(self):
def test_impl(A):
A.dropna(inplace=True)
return A.values
@unittest.skip('AssertionError: Series are different\n'
'Series length are different\n'
'[left]: 3, Int64Index([0, 1, 2], dtype=\'int64\')\n'
'[right]: 2, Int64Index([1, 2], dtype=\'int64\')')
def test_series_dropna_dt_no_index1(self):
'''Verifies Series.dropna() implementation for datetime series with default index'''
def test_impl(S):
return S.dropna()
hpat_func = hpat.jit(test_impl)

S1 = pd.Series(['aa', 'b', None, 'ccc'])
S1 = pd.Series([pd.NaT, pd.Timestamp('1970-12-01'), pd.Timestamp('2012-07-25')])
S2 = S1.copy()
np.testing.assert_array_equal(hpat_func(S1), test_impl(S2))
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))

@unittest.skip('Unsupported functionality: failed to handle index')
def test_series_dropna_index_str(self):
def test_series_dropna_bool_no_index1(self):
'''Verifies Series.dropna() implementation for bool series with default index'''
def test_impl(S):
return S.dropna()

hpat_func = hpat.jit(test_impl)

S1 = pd.Series(['aa', 'b', None, 'ccc'], index=['a', 'b', 'c', 'd'])
S1 = pd.Series([True, False, False, True])
S2 = S1.copy()
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))

@unittest.skip('Unsupported functionality: failed to handle index')
def test_series_dropna_index_int(self):
@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
'BUG: old-style dropna impl returns series without index')
def test_series_dropna_int_no_index1(self):
'''Verifies Series.dropna() implementation for integer series with default index'''
def test_impl(S):
return S.dropna()

hpat_func = hpat.jit(test_impl)

S1 = pd.Series(['aa', 'b', None, 'ccc'], index=[1, 2, 5, 7])
n = 11
S1 = pd.Series(np.arange(n, dtype=np.int64))
S2 = S1.copy()
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))

Expand Down