Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 101 additions & 3 deletions hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
import pandas

from numba.errors import TypingError
from numba.extending import (types, overload, overload_method, overload_attribute)
from numba.extending import overload, overload_method, overload_attribute
from numba import types

import hpat
Expand Down Expand Up @@ -139,6 +139,104 @@ def hpat_pandas_series_iloc_impl(self):
return hpat_pandas_series_iloc_impl


@overload_method(SeriesType, 'nsmallest')
def hpat_pandas_series_nsmallest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nsmallest` implementation.

    .. only:: developer
       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nsmallest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Return this many ascending sorted values.
        A negative value is clamped to 0, so no elements are selected.
    keep: :obj:`str`, default 'first'
        When there are duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        all : keep all occurrences. This can result in a Series of size larger than n.
        *unsupported*: only 'first' is accepted, any other value raises ValueError
        when the compiled method is called.

    Returns
    -------
    :obj:`series`
        returns :obj:`series`
    """

    method_label = 'Method nsmallest().'

    # Typing-time validation: each argument is checked in turn and the first
    # mismatch aborts compilation with a TypingError.
    if not isinstance(self, SeriesType):
        raise TypingError('{} The object\n given: {}\n expected: {}'.format(method_label, self, 'series'))

    accepted_n_types = (types.Omitted, int, types.Integer)
    if not isinstance(n, accepted_n_types):
        raise TypingError('{} The object n\n given: {}\n expected: {}'.format(method_label, n, 'int'))

    accepted_keep_types = (types.Omitted, str, types.UnicodeType, types.StringLiteral)
    if not isinstance(keep, accepted_keep_types):
        raise TypingError('{} The object keep\n given: {}\n expected: {}'.format(method_label, keep, 'str'))

    def hpat_pandas_series_nsmallest_impl(self, n=5, keep='first'):
        if keep != 'first':
            raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")

        # a negative n must select nothing rather than slice from the end
        count = max(n, 0)

        # mergesort is stable, so repeated values keep their order of appearance
        ascending_order = self._data.argsort(kind='mergesort')

        return self.take(ascending_order[:count])

    return hpat_pandas_series_nsmallest_impl


@overload_method(SeriesType, 'nlargest')
def hpat_pandas_series_nlargest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nlargest` implementation.

    .. only:: developer
       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nlargest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Return this many descending sorted values.
        A negative value is clamped to 0, so no elements are selected.
    keep: :obj:`str`, default 'first'
        When there are duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        all : keep all occurrences. This can result in a Series of size larger than n.
        *unsupported*: only 'first' is accepted, any other value raises ValueError
        when the compiled method is called.

    Returns
    -------
    :obj:`series`
        returns :obj:`series`

    Raises
    ------
    TypingError
        at compile time, if `self` is not a series, `n` is not an integer
        or `keep` is not a string.
    ValueError
        at run time, if `keep` is not 'first'.
    """

    _func_name = 'Method nlargest().'

    if not isinstance(self, SeriesType):
        raise TypingError('{} The object\n given: {}\n expected: {}'.format(_func_name, self, 'series'))

    if not isinstance(n, (types.Omitted, int, types.Integer)):
        raise TypingError('{} The object n\n given: {}\n expected: {}'.format(_func_name, n, 'int'))

    if not isinstance(keep, (types.Omitted, str, types.UnicodeType, types.StringLiteral)):
        raise TypingError('{} The object keep\n given: {}\n expected: {}'.format(_func_name, keep, 'str'))

    # The implementation sorts the negated data ascending. For integers the
    # negated values are additionally shifted by 1 to ensure reverse ordering
    # at the boundaries: the minimum of the type negates onto itself, and the
    # shift turns it into the maximum so that it sorts last (pandas uses a
    # similar approach).
    # The shift must not be applied to floats: `-x - 1` rounds every value of
    # tiny magnitude to exactly -1.0, which would erase the ordering between
    # such values. The shift is therefore chosen once, at typing time.
    # NOTE(review): assumes SeriesType exposes its element type as `self.dtype` - confirm.
    boundary_shift = 1 if isinstance(self.dtype, types.Integer) else 0

    def hpat_pandas_series_nlargest_impl(self, n=5, keep='first'):
        if keep != 'first':
            raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")

        # data: [0, 1, -1, 1, 0] -> [1, 1, 0, 0, -1]
        # index: [0, 1, 2, 3, 4] -> [1, 3, 0, 4, 2] (not [3, 1, 4, 0, 2])
        # Reversing an ascending argsort (argsort(kind='mergesort')[::-1]) is
        # invalid when the data has duplicates, because it also reverses their
        # order of appearance; a stable sort of the negated data keeps it.
        indices = (-self._data - boundary_shift).argsort(kind='mergesort')[:max(n, 0)]

        return self.take(indices)

    return hpat_pandas_series_nlargest_impl


@overload_attribute(SeriesType, 'shape')
def hpat_pandas_series_shape(self):
"""
Expand Down Expand Up @@ -1185,8 +1283,8 @@ def hpat_pandas_series_take(self, indices, axis=0, is_copy=False):
if not isinstance(self, SeriesType):
raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self))

if not isinstance(indices, types.List):
raise TypingError('{} The indices must be a List. Given: {}'.format(_func_name, indices))
if not isinstance(indices, (types.List, types.Array)):
raise TypingError('{} The indices must be an array-like. Given: {}'.format(_func_name, indices))

if not (isinstance(axis, (types.Integer, types.Omitted)) or axis == 0):
raise TypingError('{} The axis must be an Integer. Currently unsupported. Given: {}'.format(_func_name, axis))
Expand Down
4 changes: 2 additions & 2 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,13 +585,13 @@ def select_k_nonan_overload(A, m, k):
dtype = A.dtype
if isinstance(dtype, types.Integer):
# ints don't have nans
return lambda A, m, k: (A[:k].copy(), k)
return lambda A, m, k: (A[:max(k, 0)].copy(), k)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure we have to cut off negatives here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nsmallest(0) and nsmallest(-1) both return an empty series, so a negative k has to be clamped to 0 in order to get an empty array. E.g. with k = -2 and A = [1, 2, 3, 4, 5]:
A[:k] # [1, 2, 3]
A[:0] # [] - what we want to get when we have negative k

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW old functionality didn't work with negative k.


assert isinstance(dtype, types.Float)

def select_k_nonan_float(A, m, k):
# select the first k elements but ignore NANs
min_heap_vals = np.empty(k, A.dtype)
min_heap_vals = np.empty(max(k, 0), A.dtype)
i = 0
ind = 0
while i < m and ind < k:
Expand Down
7 changes: 7 additions & 0 deletions hpat/hiframes/pd_series_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,13 +1000,20 @@ def generic_expand_cumulative_series(self, args, kws):
# When the default hpat pipeline is disabled, also exclude 'resolve_std' from
# the ArrayAttribute resolvers copied onto SeriesAttribute by the loop below.
if not hpat.config.config_pipeline_hpat_default:
_not_series_array_attrs.append('resolve_std')

# Resolvers deleted from SeriesAttribute further down when the default hpat
# pipeline is disabled. Per the author's explanation in review, these are
# resolvers predefined on SeriesAttribute itself rather than ones copied in by
# the loop, so excluding them from the copy alone would leave them in place.
_non_hpat_pipeline_attrs = ['resolve_nsmallest', 'resolve_nlargest']

# use ArrayAttribute for attributes not defined in SeriesAttribute
# (only 'resolve_*' entries are copied, and names listed in
# _not_series_array_attrs are skipped)
for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items():
if (attr.startswith('resolve_')
and attr not in SeriesAttribute.__dict__
and attr not in _not_series_array_attrs):
setattr(SeriesAttribute, attr, func)

# remove some attributes from SeriesAttribute for non-hpat pipeline
# (only names present in SeriesAttribute's own __dict__ are deleted, so an
# absent resolver is skipped instead of making delattr raise)
if not hpat.config.config_pipeline_hpat_default:
for attr in _non_hpat_pipeline_attrs:
if attr in SeriesAttribute.__dict__:
delattr(SeriesAttribute, attr)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still don't understand why we need to remove attributes after adding them. I still think it would be better to merge _non_hpat_pipeline_attrs with _not_series_array_attrs and remove this piece of code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot merge them. The point of this code is to remove attributes that are predefined on SeriesAttribute itself, not the ones added to SeriesAttribute by the preceding loop. For example, if we add 'resolve_nsmallest' to _not_series_array_attrs and remove the "deleter", SeriesAttribute will still contain the attribute 'resolve_nsmallest' and the "new style" implementation won't be used.


# PR135. This needs to be commented out
@infer_global(operator.getitem)
Expand Down
Loading