Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Implement Series.nsmallest()/Series.nlargest() in new style (#241)
Browse files Browse the repository at this point in the history
* Implement Series.nsmallest() in new style

* Replace rand chararray generator with strlist one

* Minor changes in tests for nsmallest/nlargest
  • Loading branch information
densmirn authored and shssf committed Nov 4, 2019
1 parent 1707927 commit 6f36f5b
Show file tree
Hide file tree
Showing 4 changed files with 323 additions and 75 deletions.
104 changes: 101 additions & 3 deletions hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
import pandas

from numba.errors import TypingError
from numba.extending import (types, overload, overload_method, overload_attribute)
from numba.extending import overload, overload_method, overload_attribute
from numba import types

import hpat
Expand Down Expand Up @@ -139,6 +139,104 @@ def hpat_pandas_series_iloc_impl(self):
return hpat_pandas_series_iloc_impl


@overload_method(SeriesType, 'nsmallest')
def hpat_pandas_series_nsmallest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nsmallest` implementation.

    .. only:: developer

       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nsmallest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Number of smallest values to return, in ascending order.
        A negative value is treated as 0 (empty selection).
    keep: :obj:`str`, default 'first'
        How to resolve duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        all : keep all occurrences. This can result in a Series of size larger than n.
        Only 'first' is implemented; any other value raises ValueError at run time.

    Returns
    -------
    :obj:`series`
         returns :obj:`series`
    """

    method_label = 'Method nsmallest().'
    accepted_n_types = (types.Omitted, int, types.Integer)
    accepted_keep_types = (types.Omitted, str, types.UnicodeType, types.StringLiteral)

    # Typing-time validation of the receiver and of both arguments.
    if not isinstance(self, SeriesType):
        raise TypingError('{} The object\n given: {}\n expected: {}'.format(method_label, self, 'series'))

    if not isinstance(n, accepted_n_types):
        raise TypingError('{} The object n\n given: {}\n expected: {}'.format(method_label, n, 'int'))

    if not isinstance(keep, accepted_keep_types):
        raise TypingError('{} The object keep\n given: {}\n expected: {}'.format(method_label, keep, 'str'))

    def hpat_pandas_series_nsmallest_impl(self, n=5, keep='first'):
        if keep != 'first':
            raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")

        # Clamp so that a negative n selects nothing instead of slicing from the end.
        limit = max(n, 0)

        # A stable sort (mergesort) keeps equal values in their order of appearance.
        order = self._data.argsort(kind='mergesort')

        return self.take(order[:limit])

    return hpat_pandas_series_nsmallest_impl


@overload_method(SeriesType, 'nlargest')
def hpat_pandas_series_nlargest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nlargest` implementation.

    .. only:: developer

       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nlargest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Return this many descending sorted values.
        A negative value is treated as 0 (empty selection).
    keep: :obj:`str`, default 'first'
        When there are duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        all : keep all occurrences. This can result in a Series of size larger than n.
        Only 'first' is implemented; any other value raises ValueError at run time.

    Returns
    -------
    :obj:`series`
         returns :obj:`series`
    """

    _func_name = 'Method nlargest().'

    if not isinstance(self, SeriesType):
        raise TypingError('{} The object\n given: {}\n expected: {}'.format(_func_name, self, 'series'))

    if not isinstance(n, (types.Omitted, int, types.Integer)):
        raise TypingError('{} The object n\n given: {}\n expected: {}'.format(_func_name, n, 'int'))

    if not isinstance(keep, (types.Omitted, str, types.UnicodeType, types.StringLiteral)):
        raise TypingError('{} The object keep\n given: {}\n expected: {}'.format(_func_name, keep, 'str'))

    # The sort key is selected at typing time from the series element type.
    if isinstance(self.dtype, types.Float):
        def hpat_pandas_series_nlargest_float_impl(self, n=5, keep='first'):
            if keep != 'first':
                raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")

            # Negation is exact for floats, so it reverses the order without merging values.
            # Subtracting 1 here would round distinct values to the same key
            # (e.g. 1e-20 and 2e-20 both become -1.0) and break the descending order.
            # mergesort is stable: equal values keep their order of appearance.
            indices = (-self._data).argsort(kind='mergesort')[:max(n, 0)]

            return self.take(indices)

        return hpat_pandas_series_nlargest_float_impl

    def hpat_pandas_series_nlargest_impl(self, n=5, keep='first'):
        if keep != 'first':
            raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")

        # data: [0, 1, -1, 1, 0] -> [1, 1, 0, 0, -1]
        # index: [0, 1, 2, 3, 4] -> [1, 3, 0, 4, 2] (not [3, 1, 4, 0, 2])
        # subtract 1 to ensure reverse ordering at boundaries:
        # plain negation of the minimum signed integer wraps around to itself
        indices = (-self._data - 1).argsort(kind='mergesort')[:max(n, 0)]

        return self.take(indices)

    return hpat_pandas_series_nlargest_impl


@overload_attribute(SeriesType, 'shape')
def hpat_pandas_series_shape(self):
"""
Expand Down Expand Up @@ -1185,8 +1283,8 @@ def hpat_pandas_series_take(self, indices, axis=0, is_copy=False):
if not isinstance(self, SeriesType):
raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self))

if not isinstance(indices, types.List):
raise TypingError('{} The indices must be a List. Given: {}'.format(_func_name, indices))
if not isinstance(indices, (types.List, types.Array)):
raise TypingError('{} The indices must be an array-like. Given: {}'.format(_func_name, indices))

if not (isinstance(axis, (types.Integer, types.Omitted)) or axis == 0):
raise TypingError('{} The axis must be an Integer. Currently unsupported. Given: {}'.format(_func_name, axis))
Expand Down
4 changes: 2 additions & 2 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,13 +585,13 @@ def select_k_nonan_overload(A, m, k):
dtype = A.dtype
if isinstance(dtype, types.Integer):
# ints don't have nans
return lambda A, m, k: (A[:k].copy(), k)
return lambda A, m, k: (A[:max(k, 0)].copy(), k)

assert isinstance(dtype, types.Float)

def select_k_nonan_float(A, m, k):
# select the first k elements but ignore NANs
min_heap_vals = np.empty(k, A.dtype)
min_heap_vals = np.empty(max(k, 0), A.dtype)
i = 0
ind = 0
while i < m and ind < k:
Expand Down
7 changes: 7 additions & 0 deletions hpat/hiframes/pd_series_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,13 +1000,20 @@ def generic_expand_cumulative_series(self, args, kws):
# Outside the default HPAT pipeline 'resolve_std' is added to the exclusion list used below.
if not hpat.config.config_pipeline_hpat_default:
    _not_series_array_attrs.append('resolve_std')

_non_hpat_pipeline_attrs = ['resolve_nsmallest', 'resolve_nlargest']

# Copy onto SeriesAttribute every ArrayAttribute resolver that SeriesAttribute
# does not define itself and that is not explicitly excluded.
for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items():
    if not attr.startswith('resolve_'):
        continue
    if attr in SeriesAttribute.__dict__ or attr in _not_series_array_attrs:
        continue
    setattr(SeriesAttribute, attr, func)

# Outside the default HPAT pipeline, drop the listed resolvers from SeriesAttribute when present.
if not hpat.config.config_pipeline_hpat_default:
    for attr in _non_hpat_pipeline_attrs:
        if attr in SeriesAttribute.__dict__:
            delattr(SeriesAttribute, attr)

# PR135. This needs to be commented out
@infer_global(operator.getitem)
Expand Down

0 comments on commit 6f36f5b

Please sign in to comment.