Merge c0a52c2 into 33742c4

IntelPython · Oct 29, 2019 · d7759cb · d7759cb
2 parents 33742c4 + c0a52c2
commit d7759cb
Show file tree

Hide file tree

Showing 4 changed files with 419 additions and 52 deletions.
diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py
@@ -34,7 +34,8 @@
 import pandas
 
 from numba.errors import TypingError
-from numba.extending import (types, overload, overload_method, overload_attribute)
+from numba.extending import overload, overload_method, overload_attribute
+from numba.typed import Dict, List
 from numba import types
 
 import hpat
@@ -43,6 +44,40 @@
 from hpat.utils import to_array
 
 
+class TypingChecker:
+    """
+    Validate the types of arguments passed to an overloaded function.
+
+    Every ``check_*`` method raises :class:`TypingError`, with a message built from
+    `msg_template`, when the checked object is not an instance of one of the accepted types.
+    """
+
+    msg_template = '{} The object must be {}. Given{}: {}'
+
+    # Accepted types per check, declared once so each check has a single definition.
+    # NOTE(review): types.Omitted is accepted by every parameter check, presumably because
+    # it is the Numba type of an argument left at its default value - confirm.
+    integer_types = (types.Omitted, int, types.Integer)
+    float_types = (types.Omitted, float, types.Float)
+    number_types = (types.Omitted, int, float, types.Number)
+    string_types = (types.Omitted, str, types.UnicodeType, types.StringLiteral)
+
+    def __init__(self, func_name):
+        """
+        Parameters
+        ----------
+        func_name: :obj:`str`
+            text placed at the beginning of every error message
+        """
+        self.func_name = func_name
+
+    def msg(self, ty, val, name=''):
+        """
+        Build the error message for an object of an unexpected type.
+
+        Parameters
+        ----------
+        ty: :obj:`str`
+            description of the expected type, e.g. 'an integer'
+        val:
+            the object that failed the check
+        name: :obj:`str`, default ''
+            name of the checked parameter; omitted from the message when empty
+
+        Returns
+        -------
+        :obj:`str`
+            `msg_template` filled with the function name, `ty`, `name` and `val`
+        """
+        if name:
+            # leading space separates the name from the word 'Given' in the template
+            name = ' {}'.format(name)
+        return self.msg_template.format(self.func_name, ty, name, val)
+
+    def check_series(self, data):
+        """Raise :class:`TypingError` unless `data` is a ``SeriesType`` instance."""
+        if not isinstance(data, SeriesType):
+            raise TypingError(self.msg('a pandas.series', data))
+
+    def check_integer(self, data, name):
+        """Raise :class:`TypingError` unless `data` is an instance of `integer_types`."""
+        if not isinstance(data, self.integer_types):
+            raise TypingError(self.msg('an integer', data, name=name))
+
+    def check_float(self, data, name):
+        """Raise :class:`TypingError` unless `data` is an instance of `float_types`."""
+        if not isinstance(data, self.float_types):
+            raise TypingError(self.msg('a float', data, name=name))
+
+    def check_number(self, data, name):
+        """Raise :class:`TypingError` unless `data` is an instance of `number_types`."""
+        if not isinstance(data, self.number_types):
+            raise TypingError(self.msg('a number', data, name=name))
+
+    def check_string(self, data, name):
+        """Raise :class:`TypingError` unless `data` is an instance of `string_types`."""
+        if not isinstance(data, self.string_types):
+            raise TypingError(self.msg('a string', data, name=name))
+
+
 @overload(operator.getitem)
 def hpat_pandas_series_getitem(self, idx):
     """
@@ -138,6 +173,162 @@ def hpat_pandas_series_iloc_impl(self):
     return hpat_pandas_series_iloc_impl
 
 
+@overload_method(SeriesType, 'nsmallest')
+def hpat_pandas_series_nsmallest(self, n=5, keep='first'):
+    """
+    Pandas Series method :meth:`pandas.Series.nsmallest` implementation.
+
+    .. only:: developer
+       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nsmallest*
+
+    Parameters
+    ----------
+    self: :obj:`pandas.Series`
+        input series
+    n: :obj:`int`, default 5
+        Return this many ascending sorted values.
+    keep: :obj:`str`, default 'first'
+        When there are duplicate values that cannot all fit in a Series of n elements:
+        first : return the first n occurrences in order of appearance.
+        last : return the last n occurrences in reverse order of appearance.
+        all : keep all occurrences. This can result in a Series of size larger than n.
+        Only 'first' is supported; any other value raises :obj:`ValueError`.
+
+    Returns
+    -------
+    :obj:`series`
+         returns :obj:`series`
+
+    Raises
+    ------
+    TypingError
+        If `self` is not a series, `n` is not an integer or `keep` is not a string.
+    ValueError
+        Raised by the returned implementation if `keep` is not 'first'.
+    """
+
+    _func_name = 'Method nsmallest().'
+
+    # argument types are validated here, before an implementation is selected
+    ty_checker = TypingChecker(_func_name)
+    ty_checker.check_series(self)
+    ty_checker.check_integer(n, 'n')
+    ty_checker.check_string(keep, 'keep')
+
+    # the index type is NoneType: positions returned by argsort are used as the result labels
+    if isinstance(self.index, types.NoneType):
+
+        def hpat_pandas_series_nsmallest_no_index_impl(self, n=5, keep='first'):
+            if keep != 'first':
+                raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")
+
+            # invoke existing functionality because numpy does not cover that
+            # NOTE(review): nlargest() is reused with the flag False and the lt_f kernel,
+            # presumably to select the smallest values - confirm against hpat.hiframes.api
+            nsmallest = hpat.hiframes.api.nlargest(self._data, n, False,
+                                                   hpat.hiframes.series_kernels.lt_f)
+            # mergesort is used for stable sorting of repeated values
+            # the slice is sized from the result rather than from n
+            local_index = self._data.argsort(kind='mergesort')[:len(nsmallest)]
+
+            return pandas.Series(nsmallest, local_index, name=self._name)
+
+        return hpat_pandas_series_nsmallest_no_index_impl
+
+    # the series carries an index: sorted positions are translated to its labels
+    def hpat_pandas_series_nsmallest_impl(self, n=5, keep='first'):
+        if keep != 'first':
+            raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")
+
+        # invoke existing functionality because numpy does not cover that
+        nsmallest = hpat.hiframes.api.nlargest(self._data, n, False,
+                                               hpat.hiframes.series_kernels.lt_f)
+        # mergesort is used for stable sorting of repeated values
+        indices = self._data.argsort(kind='mergesort')[:len(nsmallest)]
+        # map each selected position to the label stored in the series index
+        local_index = [self._index[i] for i in indices]
+
+        return pandas.Series(nsmallest, local_index, name=self._name)
+
+    return hpat_pandas_series_nsmallest_impl
+
+
+@overload_method(SeriesType, 'nlargest')
+def hpat_pandas_series_nlargest(self, n=5, keep='first'):
+    """
+    Pandas Series method :meth:`pandas.Series.nlargest` implementation.
+
+    .. only:: developer
+       Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_nlargest*
+
+    Parameters
+    ----------
+    self: :obj:`pandas.Series`
+        input series
+    n: :obj:`int`, default 5
+        Return this many descending sorted values.
+    keep: :obj:`str`, default 'first'
+        When there are duplicate values that cannot all fit in a Series of n elements:
+        first : return the first n occurrences in order of appearance.
+        last : return the last n occurrences in reverse order of appearance.
+        all : keep all occurrences. This can result in a Series of size larger than n.
+        Only 'first' is supported; any other value raises :obj:`ValueError`.
+
+    Returns
+    -------
+    :obj:`series`
+         returns :obj:`series`
+
+    Raises
+    ------
+    TypingError
+        If `self` is not a series, `n` is not an integer or `keep` is not a string.
+    ValueError
+        Raised by the returned implementation if `keep` is not 'first'.
+    """
+
+    _func_name = 'Method nlargest().'
+
+    # argument types are validated here, before an implementation is selected
+    ty_checker = TypingChecker(_func_name)
+    ty_checker.check_series(self)
+    ty_checker.check_integer(n, 'n')
+    ty_checker.check_string(keep, 'keep')
+
+    # element type of the series data; key type of the typed Dict built in both implementations
+    dtype = self.data.dtype
+    # the index type is NoneType: positions are used as the result labels
+    if isinstance(self.index, types.NoneType):
+        # value type of the Dict: a typed list of positions
+        inner_list_type = types.ListType(types.intp)
+
+        def hpat_pandas_series_nlargest_no_index_impl(self, n=5, keep='first'):
+            if keep != 'first':
+                raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")
+
+            # invoke existing functionality because numpy does not cover that
+            nlargest = hpat.hiframes.api.nlargest(self._data, n, True,
+                                                  hpat.hiframes.series_kernels.gt_f)
+            # np.argsort is not applicable here due to descending ordering
+            # group positions by value, keeping the order of appearance within each value
+            indices = Dict.empty(dtype, inner_list_type)
+            for idx, item in enumerate(self._data):
+                if item not in indices:
+                    indices[item] = List.empty_list(types.intp)
+                indices[item].append(idx)
+
+            max_length = len(nlargest)
+            local_index = []
+            # walk the distinct selected values in reverse of numpy.unique's sorted order
+            for item in numpy.unique(nlargest)[::-1]:
+                for index in indices[item]:
+                    # leaves the inner loop only; for every later value the inner loop
+                    # stops again on its first iteration, so no more labels are added
+                    if len(local_index) == max_length:
+                        break
+                    local_index.append(index)
+
+            return pandas.Series(nlargest, local_index, name=self._name)
+
+        return hpat_pandas_series_nlargest_no_index_impl
+
+    # the series carries an index: labels of its dtype are collected instead of positions
+    index_type = self.index.dtype
+    inner_list_type = types.ListType(index_type)
+
+    def hpat_pandas_series_nlargest_impl(self, n=5, keep='first'):
+        if keep != 'first':
+            raise ValueError("Method nlargest(). Unsupported parameter. Given 'keep' != 'first'")
+
+        # invoke existing functionality because numpy does not cover that
+        nlargest = hpat.hiframes.api.nlargest(self._data, n, True,
+                                              hpat.hiframes.series_kernels.gt_f)
+        # np.argsort is not applicable here due to descending ordering
+        # group index labels by value, keeping the order of appearance within each value
+        indices = Dict.empty(dtype, inner_list_type)
+        for idx, item in enumerate(self._data):
+            if item not in indices:
+                indices[item] = List.empty_list(index_type)
+            indices[item].append(self._index[idx])
+
+        max_length = len(nlargest)
+        local_index = []
+        # walk the distinct selected values in reverse of numpy.unique's sorted order
+        for item in numpy.unique(nlargest)[::-1]:
+            for index in indices[item]:
+                # leaves the inner loop only; for every later value the inner loop
+                # stops again on its first iteration, so no more labels are added
+                if len(local_index) == max_length:
+                    break
+                local_index.append(index)
+
+        return pandas.Series(nlargest, local_index, name=self._name)
+
+    return hpat_pandas_series_nlargest_impl
+
+
 @overload_attribute(SeriesType, 'shape')
 def hpat_pandas_series_shape(self):
     """

diff --git a/hpat/hiframes/api.py b/hpat/hiframes/api.py
@@ -585,13 +585,13 @@ def select_k_nonan_overload(A, m, k):
     dtype = A.dtype
     if isinstance(dtype, types.Integer):
         # ints don't have nans
-        return lambda A, m, k: (A[:k].copy(), k)
+        return lambda A, m, k: (A[:max(k, 0)].copy(), k)
 
     assert isinstance(dtype, types.Float)
 
     def select_k_nonan_float(A, m, k):
         # select the first k elements but ignore NANs
-        min_heap_vals = np.empty(k, A.dtype)
+        min_heap_vals = np.empty(max(k, 0), A.dtype)
         i = 0
         ind = 0
         while i < m and ind < k:

diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py
@@ -999,13 +999,20 @@ def generic_expand_cumulative_series(self, args, kws):
 if not hpat.config.config_pipeline_hpat_default:
     _not_series_array_attrs.append('resolve_std')
 
+# resolvers deleted from SeriesAttribute further down when the HPAT default pipeline is disabled
+# NOTE(review): presumably so that the overload_method implementations of nsmallest/nlargest
+# are picked up instead - confirm
+_non_hpat_pipeline_attrs = ['resolve_nsmallest', 'resolve_nlargest']
+
 # use ArrayAttribute for attributes not defined in SeriesAttribute
 for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items():
     if (attr.startswith('resolve_')
             and attr not in SeriesAttribute.__dict__
             and attr not in _not_series_array_attrs):
         setattr(SeriesAttribute, attr, func)
 
+# remove some attributes from SeriesAttribute for non-hpat pipeline
+# (placed after the copy loop above, so a resolver attached by that loop is removed as well)
+if not hpat.config.config_pipeline_hpat_default:
+    for attr in _non_hpat_pipeline_attrs:
+        # delattr raises AttributeError for a name missing from the class __dict__, hence the guard
+        if attr in SeriesAttribute.__dict__:
+            delattr(SeriesAttribute, attr)
 
 # PR135. This needs to be commented out
 @infer_global(operator.getitem)