Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Merge c0a52c2 into 33742c4
Browse files Browse the repository at this point in the history
  • Loading branch information
densmirn committed Oct 29, 2019
2 parents 33742c4 + c0a52c2 commit d7759cb
Show file tree
Hide file tree
Showing 4 changed files with 419 additions and 52 deletions.
193 changes: 192 additions & 1 deletion hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
import pandas

from numba.errors import TypingError
from numba.extending import (types, overload, overload_method, overload_attribute)
from numba.extending import overload, overload_method, overload_attribute
from numba.typed import Dict, List
from numba import types

import hpat
Expand All @@ -43,6 +44,40 @@
from hpat.utils import to_array


class TypingChecker:
    """
    Validate argument types inside overload typing functions.

    Every ``check_*`` method tests the given object with ``isinstance``
    against a tuple of accepted types and raises ``TypingError`` with a
    uniformly formatted message when the test fails.

    Parameters
    ----------
    func_name: :obj:`str`
        Text placed at the beginning of every error message,
        e.g. ``'Method nsmallest().'``
    """

    msg_template = '{} The object must be {}. Given{}: {}'

    # Accepted types for each kind of check. types.Omitted is part of every
    # tuple (presumably so that arguments left at their defaults pass - confirm
    # against the Numba documentation).
    integer_types = (types.Omitted, int, types.Integer)
    float_types = (types.Omitted, float, types.Float)
    number_types = (types.Omitted, int, float, types.Number)
    string_types = (types.Omitted, str, types.UnicodeType, types.StringLiteral)

    def __init__(self, func_name):
        self.func_name = func_name

    def msg(self, ty, val, name=''):
        """
        Build the error message for a failed check.

        Parameters
        ----------
        ty: :obj:`str`
            Description of the expected type, e.g. ``'an integer'``
        val:
            The object that failed the check
        name: :obj:`str`, default ''
            Name of the checked parameter; omitted from the message when empty

        Returns
        -------
        :obj:`str`
            ``msg_template`` filled with the function name, expected type,
            parameter name and given object
        """
        if name:
            name = ' {}'.format(name)
        return self.msg_template.format(self.func_name, ty, name, val)

    def _check(self, data, accepted, description, name=''):
        """Raise ``TypingError`` unless `data` is an instance of `accepted`."""
        if not isinstance(data, accepted):
            raise TypingError(self.msg(description, data, name=name))

    def check_series(self, data):
        """Raise ``TypingError`` unless `data` is a ``SeriesType`` instance."""
        self._check(data, SeriesType, 'a pandas.series')

    def check_integer(self, data, name):
        """Raise ``TypingError`` unless `data` matches ``integer_types``."""
        self._check(data, self.integer_types, 'an integer', name)

    def check_float(self, data, name):
        """Raise ``TypingError`` unless `data` matches ``float_types``."""
        self._check(data, self.float_types, 'a float', name)

    def check_number(self, data, name):
        """Raise ``TypingError`` unless `data` matches ``number_types``."""
        self._check(data, self.number_types, 'a number', name)

    def check_string(self, data, name):
        """Raise ``TypingError`` unless `data` matches ``string_types``."""
        self._check(data, self.string_types, 'a string', name)


@overload(operator.getitem)
def hpat_pandas_series_getitem(self, idx):
"""
Expand Down Expand Up @@ -138,6 +173,162 @@ def hpat_pandas_series_iloc_impl(self):
return hpat_pandas_series_iloc_impl


@overload_method(SeriesType, 'nsmallest')
def hpat_pandas_series_nsmallest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nsmallest` implementation.

    .. only:: developer

       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nsmallest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Return this many ascending sorted values.
    keep: :obj:`str`, default 'first'
        When there are duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        *unsupported*
        all : keep all occurrences. This can result in a Series of size larger than n.
        *unsupported*

    Returns
    -------
    :obj:`series`
         returns :obj:`series`

    Raises
    ------
    TypingError
        During typing, when `self` is not a series, `n` is not an integer
        or `keep` is not a string.
    ValueError
        At run time, when `keep` is anything other than 'first'.
    """

    _func_name = 'Method nsmallest().'

    # Typing-time validation: every check raises TypingError on a mismatch
    ty_checker = TypingChecker(_func_name)
    ty_checker.check_series(self)
    ty_checker.check_integer(n, 'n')
    ty_checker.check_string(keep, 'keep')

    # The implementation is chosen by the type of the series index
    if isinstance(self.index, types.NoneType):

        # No index type: positions from the stable argsort are used directly
        # as the index of the result
        def hpat_pandas_series_nsmallest_no_index_impl(self, n=5, keep='first'):
            if keep != 'first':
                raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")

            # invoke existing functionality because numpy does not cover that
            # NOTE(review): the helper is named nlargest; presumably the False flag
            # together with the lt_f kernel selects the smallest values - confirm
            nsmallest = hpat.hiframes.api.nlargest(self._data, n, False,
                                                   hpat.hiframes.series_kernels.lt_f)
            # mergesort is used for stable sorting of repeated values
            # keep only as many positions as there are selected values
            local_index = self._data.argsort(kind='mergesort')[:len(nsmallest)]

            return pandas.Series(nsmallest, local_index, name=self._name)

        return hpat_pandas_series_nsmallest_no_index_impl

    # Index present: argsort positions are translated into index labels
    def hpat_pandas_series_nsmallest_impl(self, n=5, keep='first'):
        if keep != 'first':
            raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")

        # invoke existing functionality because numpy does not cover that
        # NOTE(review): the helper is named nlargest; presumably the False flag
        # together with the lt_f kernel selects the smallest values - confirm
        nsmallest = hpat.hiframes.api.nlargest(self._data, n, False,
                                               hpat.hiframes.series_kernels.lt_f)
        # mergesort is used for stable sorting of repeated values
        # keep only as many positions as there are selected values
        indices = self._data.argsort(kind='mergesort')[:len(nsmallest)]
        # map each selected position to the label stored in the series index
        local_index = [self._index[i] for i in indices]

        return pandas.Series(nsmallest, local_index, name=self._name)

    return hpat_pandas_series_nsmallest_impl


@overload_method(SeriesType, 'nlargest')
def hpat_pandas_series_nlargest(self, n=5, keep='first'):
    """
    Pandas Series method :meth:`pandas.Series.nlargest` implementation.

    .. only:: developer

       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nlargest*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        input series
    n: :obj:`int`, default 5
        Return this many descending sorted values.
    keep: :obj:`str`, default 'first'
        When there are duplicate values that cannot all fit in a Series of n elements:
        first : return the first n occurrences in order of appearance.
        last : return the last n occurrences in reverse order of appearance.
        *unsupported*
        all : keep all occurrences. This can result in a Series of size larger than n.
        *unsupported*

    Returns
    -------
    :obj:`series`
         returns :obj:`series`

    Raises
    ------
    TypingError
        During typing, when `self` is not a series, `n` is not an integer
        or `keep` is not a string.
    ValueError
        At run time, when `keep` is anything other than 'first'.
    """

    _func_name = 'Method nlargest().'

    # Typing-time validation: every check raises TypingError on a mismatch
    ty_checker = TypingChecker(_func_name)
    ty_checker.check_series(self)
    ty_checker.check_integer(n, 'n')
    ty_checker.check_string(keep, 'keep')

    # Element type of the series data; used as the key type of the typed dict
    # built inside the implementations below
    dtype = self.data.dtype
    # The implementation is chosen by the type of the series index
    if isinstance(self.index, types.NoneType):
        # No index type: the dict values are typed lists of integer positions
        inner_list_type = types.ListType(types.intp)

        def hpat_pandas_series_nlargest_no_index_impl(self, n=5, keep='first'):
            if keep != 'first':
                raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")

            # invoke existing functionality because numpy does not cover that
            nlargest = hpat.hiframes.api.nlargest(self._data, n, True,
                                                  hpat.hiframes.series_kernels.gt_f)
            # np.argsort is not applicable here due to descending ordering
            # group the positions of every value, in order of appearance
            indices = Dict.empty(dtype, inner_list_type)
            for idx, item in enumerate(self._data):
                if item not in indices:
                    indices[item] = List.empty_list(types.intp)
                indices[item].append(idx)

            # collect exactly as many positions as there are selected values
            max_length = len(nlargest)
            local_index = []
            # distinct selected values, visited in reversed numpy.unique order
            for item in numpy.unique(nlargest)[::-1]:
                for index in indices[item]:
                    # the break leaves only the inner loop; once the limit is
                    # reached every later outer iteration stops immediately
                    if len(local_index) == max_length:
                        break
                    local_index.append(index)

            return pandas.Series(nlargest, local_index, name=self._name)

        return hpat_pandas_series_nlargest_no_index_impl

    # Index present: the dict values are typed lists of index labels
    index_type = self.index.dtype
    inner_list_type = types.ListType(index_type)

    def hpat_pandas_series_nlargest_impl(self, n=5, keep='first'):
        if keep != 'first':
            raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")

        # invoke existing functionality because numpy does not cover that
        nlargest = hpat.hiframes.api.nlargest(self._data, n, True,
                                              hpat.hiframes.series_kernels.gt_f)
        # np.argsort is not applicable here due to descending ordering
        # group the index labels of every value, in order of appearance
        indices = Dict.empty(dtype, inner_list_type)
        for idx, item in enumerate(self._data):
            if item not in indices:
                indices[item] = List.empty_list(index_type)
            indices[item].append(self._index[idx])

        # collect exactly as many labels as there are selected values
        max_length = len(nlargest)
        local_index = []
        # distinct selected values, visited in reversed numpy.unique order
        for item in numpy.unique(nlargest)[::-1]:
            for index in indices[item]:
                # the break leaves only the inner loop; once the limit is
                # reached every later outer iteration stops immediately
                if len(local_index) == max_length:
                    break
                local_index.append(index)

        return pandas.Series(nlargest, local_index, name=self._name)

    return hpat_pandas_series_nlargest_impl


@overload_attribute(SeriesType, 'shape')
def hpat_pandas_series_shape(self):
"""
Expand Down
4 changes: 2 additions & 2 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,13 +585,13 @@ def select_k_nonan_overload(A, m, k):
dtype = A.dtype
if isinstance(dtype, types.Integer):
# ints don't have nans
return lambda A, m, k: (A[:k].copy(), k)
return lambda A, m, k: (A[:max(k, 0)].copy(), k)

assert isinstance(dtype, types.Float)

def select_k_nonan_float(A, m, k):
# select the first k elements but ignore NANs
min_heap_vals = np.empty(k, A.dtype)
min_heap_vals = np.empty(max(k, 0), A.dtype)
i = 0
ind = 0
while i < m and ind < k:
Expand Down
7 changes: 7 additions & 0 deletions hpat/hiframes/pd_series_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,13 +999,20 @@ def generic_expand_cumulative_series(self, args, kws):
if not hpat.config.config_pipeline_hpat_default:
    _not_series_array_attrs.append('resolve_std')

# resolvers dropped from SeriesAttribute when the non-hpat pipeline is active
_non_hpat_pipeline_attrs = ['resolve_nsmallest', 'resolve_nlargest']

# Copy every ArrayAttribute resolver onto SeriesAttribute unless
# SeriesAttribute already defines it or it is explicitly excluded
for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items():
    if not attr.startswith('resolve_'):
        continue
    if attr in SeriesAttribute.__dict__ or attr in _not_series_array_attrs:
        continue
    setattr(SeriesAttribute, attr, func)

# For the non-hpat pipeline, remove the listed resolvers that
# SeriesAttribute defines itself
if not hpat.config.config_pipeline_hpat_default:
    for attr in _non_hpat_pipeline_attrs:
        if attr in SeriesAttribute.__dict__:
            delattr(SeriesAttribute, attr)

# PR135. This needs to be commented out
@infer_global(operator.getitem)
Expand Down
Loading

0 comments on commit d7759cb

Please sign in to comment.