-
Notifications
You must be signed in to change notification settings - Fork 62
Implement Series.nsmallest()/Series.nlargest() in new style #241
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,7 @@ | |
import pandas | ||
|
||
from numba.errors import TypingError | ||
from numba.extending import (types, overload, overload_method, overload_attribute) | ||
from numba.extending import overload, overload_method, overload_attribute | ||
from numba import types | ||
|
||
import hpat | ||
|
@@ -139,6 +139,104 @@ def hpat_pandas_series_iloc_impl(self): | |
return hpat_pandas_series_iloc_impl | ||
|
||
|
||
@overload_method(SeriesType, 'nsmallest')
def hpat_pandas_series_nsmallest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nsmallest` implementation.

    .. only:: developer
       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nsmallest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Return this many ascending sorted values.
        Negative values are treated as 0 and give an empty result.
    keep: :obj:`str`, default 'first'
        When there are duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        all : keep all occurrences. This can result in a Series of size larger than n.
        Only 'first' is supported; any other value raises ValueError at run time.

    Returns
    -------
    :obj:`series`
         returns :obj:`series`

    Raises
    ------
    TypingError
        At typing time, if self is not a series, n is not an integer or keep is not a string.
    ValueError
        At run time, if keep is not 'first'.
    """

    _func_name = 'Method nsmallest().'

    # Typing-time validation of the argument types.
    if not isinstance(self, SeriesType):
        raise TypingError('{} The object\n given: {}\n expected: {}'.format(_func_name, self, 'series'))

    if not isinstance(n, (types.Omitted, int, types.Integer)):
        raise TypingError('{} The object n\n given: {}\n expected: {}'.format(_func_name, n, 'int'))

    if not isinstance(keep, (types.Omitted, str, types.UnicodeType, types.StringLiteral)):
        raise TypingError('{} The object keep\n given: {}\n expected: {}'.format(_func_name, keep, 'str'))

    def hpat_pandas_series_nsmallest_impl(self, n=5, keep='first'):
        # The value of keep is only known at run time, so it is validated here.
        if keep != 'first':
            raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")

        # mergesort is used for stable sorting of repeated values
        # max(n, 0) clamps a negative n so that the slice is empty instead of
        # dropping elements from the end
        indices = self._data.argsort(kind='mergesort')[:max(n, 0)]

        return self.take(indices)

    return hpat_pandas_series_nsmallest_impl
|
||
|
||
@overload_method(SeriesType, 'nlargest')
def hpat_pandas_series_nlargest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nlargest` implementation.

    .. only:: developer
       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nlargest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Return this many descending sorted values.
        Negative values are treated as 0 and give an empty result.
    keep: :obj:`str`, default 'first'
        When there are duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        all : keep all occurrences. This can result in a Series of size larger than n.
        Only 'first' is supported; any other value raises ValueError at run time.

    Returns
    -------
    :obj:`series`
         returns :obj:`series`

    Raises
    ------
    TypingError
        At typing time, if self is not a series, n is not an integer or keep is not a string.
    ValueError
        At run time, if keep is not 'first'.
    """

    _func_name = 'Method nlargest().'

    # Typing-time validation of the argument types.
    if not isinstance(self, SeriesType):
        raise TypingError('{} The object\n given: {}\n expected: {}'.format(_func_name, self, 'series'))

    if not isinstance(n, (types.Omitted, int, types.Integer)):
        raise TypingError('{} The object n\n given: {}\n expected: {}'.format(_func_name, n, 'int'))

    if not isinstance(keep, (types.Omitted, str, types.UnicodeType, types.StringLiteral)):
        raise TypingError('{} The object keep\n given: {}\n expected: {}'.format(_func_name, keep, 'str'))

    def hpat_pandas_series_nlargest_impl(self, n=5, keep='first'):
        # The value of keep is only known at run time, so it is validated here.
        if keep != 'first':
            raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")

        # A stable ascending sort of the negated data yields descending order
        # while keeping equal values in their order of appearance:
        # data: [0, 1, -1, 1, 0] -> [1, 1, 0, 0, -1]
        # index: [0, 1, 2, 3, 4] -> [1, 3, 0, 4, 2] (not [3, 1, 4, 0, 2])
        # subtract 1 to ensure reverse ordering at boundaries
        # (pandas uses a similar approach)
        # max(n, 0) clamps a negative n so that the slice is empty instead of
        # dropping elements from the end
        indices = (-self._data - 1).argsort(kind='mergesort')[:max(n, 0)]

        return self.take(indices)

    return hpat_pandas_series_nlargest_impl
|
||
|
||
@overload_attribute(SeriesType, 'shape') | ||
def hpat_pandas_series_shape(self): | ||
""" | ||
|
@@ -1185,8 +1283,8 @@ def hpat_pandas_series_take(self, indices, axis=0, is_copy=False): | |
if not isinstance(self, SeriesType): | ||
raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) | ||
|
||
if not isinstance(indices, types.List): | ||
raise TypingError('{} The indices must be a List. Given: {}'.format(_func_name, indices)) | ||
if not isinstance(indices, (types.List, types.Array)): | ||
raise TypingError('{} The indices must be an array-like. Given: {}'.format(_func_name, indices)) | ||
|
||
if not (isinstance(axis, (types.Integer, types.Omitted)) or axis == 0): | ||
raise TypingError('{} The axis must be an Integer. Currently unsupported. Given: {}'.format(_func_name, axis)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -585,13 +585,13 @@ def select_k_nonan_overload(A, m, k): | |
dtype = A.dtype | ||
if isinstance(dtype, types.Integer): | ||
# ints don't have nans | ||
return lambda A, m, k: (A[:k].copy(), k) | ||
return lambda A, m, k: (A[:max(k, 0)].copy(), k) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure we have to cut off negatives here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW old functionality didn't work with negative |
||
|
||
assert isinstance(dtype, types.Float) | ||
|
||
def select_k_nonan_float(A, m, k): | ||
# select the first k elements but ignore NANs | ||
min_heap_vals = np.empty(k, A.dtype) | ||
min_heap_vals = np.empty(max(k, 0), A.dtype) | ||
i = 0 | ||
ind = 0 | ||
while i < m and ind < k: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1000,13 +1000,20 @@ def generic_expand_cumulative_series(self, args, kws): | |
if not hpat.config.config_pipeline_hpat_default: | ||
_not_series_array_attrs.append('resolve_std') | ||
|
||
_non_hpat_pipeline_attrs = ['resolve_nsmallest', 'resolve_nlargest'] | ||
|
||
# use ArrayAttribute for attributes not defined in SeriesAttribute | ||
for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items(): | ||
if (attr.startswith('resolve_') | ||
and attr not in SeriesAttribute.__dict__ | ||
and attr not in _not_series_array_attrs): | ||
setattr(SeriesAttribute, attr, func) | ||
|
||
# remove some attributes from SeriesAttribute for non-hpat pipeline | ||
if not hpat.config.config_pipeline_hpat_default: | ||
for attr in _non_hpat_pipeline_attrs: | ||
if attr in SeriesAttribute.__dict__: | ||
delattr(SeriesAttribute, attr) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still don't understand why we need to remove attributes after adding them. I still think it would be better to merge There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We cannot merge them. The key idea of the code is to remove predefined series attributes from |
||
|
||
# PR135. This needs to be commented out | ||
@infer_global(operator.getitem) | ||
|
Uh oh!
There was an error while loading. Please reload this page.