Add Series.sort_values() and tests

IntelPython · Oct 25, 2019 · b7d4481 · b7d4481
1 parent d658914
commit b7d4481
Show file tree

Hide file tree

Showing 3 changed files with 295 additions and 11 deletions.
diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py
@@ -2181,10 +2181,13 @@ def hpat_pandas_series_argsort(self, axis=0, kind='quicksort', order=None):
         input arg
     axis: :obj:`int`
         Has no effect but is accepted for compatibility with numpy.
-    kind: {‘mergesort’, ‘quicksort’, ‘heapsort’}, default ‘quicksort’
-        Choice of sorting algorithm. See np.sort for more information. ‘mergesort’ is the only stable algorithm
+        *unsupported*
+    kind: {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort'
+        Choice of sorting algorithm. See np.sort for more information. 'mergesort' is the only stable algorithm
+        *unsupported, uses python func - sorted()*
     order: None
         Has no effect but is accepted for compatibility with numpy.
+        *unsupported*
 
     Returns
     -------
@@ -2262,3 +2265,219 @@ def hpat_pandas_series_argsort_impl(self, axis=0, kind='quicksort', order=None):
     return hpat_pandas_series_argsort_impl
 
 
+@overload_method(SeriesType, 'sort_values')
+def hpat_pandas_series_sort_values(self, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'):
+    """
+    Pandas Series method :meth:`pandas.Series.sort_values` implementation.
+
+    .. only:: developer
+
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values1
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values2
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_index1
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_noidx
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_idx
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_parallel1
+
+    Parameters
+    -----------
+    self: :class:`pandas.Series`
+        input arg
+    axis: 0 or :obj:`pandas.Series.index`
+        Axis to direct sorting.
+        *unsupported*
+    ascending: :obj:`bool`, default: True
+        If True, sort values in ascending order, otherwise descending.
+    inplace: :obj:`bool`, default: False
+        Accepted for compatibility with pandas.
+        *unsupported, the argument is ignored and a new Series is always returned*
+    kind: {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort'
+        Choice of sorting algorithm.
+        *unsupported, uses python func - sorted() for strings and numpy.sort() for numbers*
+    na_position: {'first' or 'last'}, default 'last'
+        Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end.
+        *unsupported*
+
+    Returns
+    -------
+    :obj:`pandas.Series`
+         returns: Series ordered by values.
+
+    Raises
+    ------
+    TypingError
+        If `self` is not a Series or `ascending` is not a boolean.
+    """
+
+    _func_name = 'Method sort_values().'
+
+    if not isinstance(self, SeriesType):
+        raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self))
+
+    # `ascending` arrives either as a Numba type (omitted default / boolean) or as a plain Python bool
+    if not isinstance(ascending, (types.Omitted, types.Boolean, bool)):
+        raise TypingError('{} Unsupported parameters. Given ascending: {}'.format(_func_name, ascending))
+
+    # Four variants follow: {default index, explicit index} x {string data, numeric data}.
+    # Each one sorts the values, then rebuilds the index by matching every sorted value back
+    # to a source position that has not been consumed yet.
+    # Any other dtype falls through and no implementation is returned.
+
+    if isinstance(self.index, types.NoneType) and isinstance(self.data.dtype, types.UnicodeType):
+        def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort',
+                                                na_position='last'):
+
+            index = numpy.arange(len(self._data))
+            my_index = numpy.arange(len(self._data))
+            # used_index[k] is the source position assigned to output slot k (-1 means unassigned)
+            used_index = numpy.full((len(self._data)), -1)
+            result = sorted(self._data)
+            cycle = range(len(self._data))
+            if ascending is False:
+                result = result[::-1]
+                cycle = range(len(self._data) - 1, -1, -1)
+            result_index = index.copy()
+            for i in range(len(result_index)):
+                find = 0
+                for search in cycle:
+                    # check == 1 when source position `search` has already been consumed
+                    check = 0
+                    for j in used_index:
+                        if my_index[search] == j:
+                            check = 1
+                    # take the first unused source position (in `cycle` order) with an equal value
+                    if (self._data[search] == result[i]) and check == 0 and find == 0:
+                        result_index[i] = index[search]
+                        used_index[i] = my_index[search]
+                        find = 1
+
+            na = 0
+            for i in self.isna():
+                if i:
+                    na += 1
+            # rows flagged by isna() whose slot is still unassigned get the trailing output slots
+            num = 0
+            for i in self.isna():
+                j = len(result_index) - na
+                if i and used_index[j] == -1:
+                    result_index[j] = index[num]
+                    used_index[j] = my_index[num]
+                    na -= 1
+                num += 1
+
+            return pandas.Series(result, result_index)
+
+        return hpat_pandas_series_sort_values_impl
+
+    if isinstance(self.index, types.NoneType) and isinstance(self.data.dtype, types.Number):
+        def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort',
+                                                na_position='last'):
+
+            na = 0
+            for i in self.isna():
+                if i:
+                    na += 1
+            index = numpy.arange(len(self._data))
+            my_index = numpy.arange(len(self._data))
+            # used_index[k] is the source position assigned to output slot k (-1 means unassigned)
+            used_index = numpy.full((len(self._data)), -1)
+            result = numpy.sort(self._data)
+            # number of non-NaN values
+            i = len(self._data) - na
+            cycle = range(len(self._data))
+            if ascending is False:
+                # numpy.sort() places NaNs last: reverse only the non-NaN prefix
+                result[:i] = result[:i][::-1]
+                # walk source positions from the last valid one (len - 1) down to 0
+                cycle = range(len(self._data) - 1, -1, -1)
+            result_index = index.copy()
+
+            for i in range(len(result_index)):
+                find = 0
+                for search in cycle:
+                    check = 0
+                    for j in used_index:
+                        if my_index[search] == j:
+                            check = 1
+                    if (self._data[search] == result[i]) and check == 0 and find == 0:
+                        result_index[i] = index[search]
+                        used_index[i] = my_index[search]
+                        find = 1
+
+            # NaN never compares equal, so NaN rows are unassigned here: give them the trailing slots
+            num = 0
+            for i in self.isna():
+                j = len(result_index) - na
+                if i and used_index[j] == -1:
+                    result_index[j] = index[num]
+                    used_index[j] = my_index[num]
+                    na -= 1
+                num += 1
+
+            return pandas.Series(result, result_index)
+
+        return hpat_pandas_series_sort_values_impl
+
+    if isinstance(self.data.dtype, types.UnicodeType):
+        def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort',
+                                                na_position='last'):
+
+            index = self._index
+            my_index = numpy.arange(len(self._data))
+            # used_index[k] is the source position assigned to output slot k (-1 means unassigned)
+            used_index = numpy.full((len(self._data)), -1)
+            result = sorted(self._data)
+            cycle = range(len(self._data))
+            if ascending is False:
+                result = result[::-1]
+                cycle = range(len(self._data) - 1, -1, -1)
+            result_index = self._index.copy()
+            for i in range(len(result_index)):
+                find = 0
+                for search in cycle:
+                    check = 0
+                    for j in used_index:
+                        if my_index[search] == j:
+                            check = 1
+                    if (self._data[search] == result[i]) and check == 0 and find == 0:
+                        result_index[i] = index[search]
+                        used_index[i] = my_index[search]
+                        find = 1
+
+            na = 0
+            for i in self.isna():
+                if i:
+                    na += 1
+            # rows flagged by isna() whose slot is still unassigned get the trailing output slots
+            num = 0
+            for i in self.isna():
+                j = len(result_index) - na
+                if i and used_index[j] == -1:
+                    result_index[j] = index[num]
+                    used_index[j] = my_index[num]
+                    na -= 1
+                num += 1
+
+            return pandas.Series(result, result_index)
+
+        return hpat_pandas_series_sort_values_impl
+
+    if isinstance(self.data.dtype, types.Number):
+        def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort',
+                                                na_position='last'):
+
+            na = 0
+            for i in self.isna():
+                if i:
+                    na += 1
+            # number of non-NaN values
+            i = len(self._data) - na
+            index = self._index
+            my_index = numpy.arange(len(self._data))
+            # used_index[k] is the source position assigned to output slot k (-1 means unassigned)
+            used_index = numpy.full((len(self._data)), -1)
+            result = numpy.sort(self._data)
+            cycle = range(len(self._data))
+            if ascending is False:
+                # numpy.sort() places NaNs last: reverse only the non-NaN prefix
+                result[:i] = result[:i][::-1]
+                # walk source positions from the last valid one (len - 1) down to 0
+                cycle = range(len(self._data) - 1, -1, -1)
+            result_index = self._index.copy()
+            for i in range(len(result_index)):
+                find = 0
+                for search in cycle:
+                    check = 0
+                    for j in used_index:
+                        if my_index[search] == j:
+                            check = 1
+                    if (self._data[search] == result[i]) and check == 0 and find == 0:
+                        result_index[i] = index[search]
+                        used_index[i] = my_index[search]
+                        find = 1
+
+            # NaN never compares equal, so NaN rows are unassigned here: give them the trailing slots
+            num = 0
+            for i in self.isna():
+                j = len(result_index) - na
+                if i and used_index[j] == -1:
+                    result_index[j] = index[num]
+                    used_index[j] = my_index[num]
+                    na -= 1
+                num += 1
+
+            return pandas.Series(result, result_index)
+
+        return hpat_pandas_series_sort_values_impl
diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py
@@ -484,14 +484,14 @@ def resolve_rolling(self, ary, args, kws):
     #     sig.return_type = if_arr_to_series_type(sig.return_type)
     #     return sig
 
-    @bound_function("series.sort_values")
-    def resolve_sort_values(self, ary, args, kws):
-        # output will have permuted input index
-        out_index = ary.index
-        if out_index == types.none:
-            out_index = types.Array(types.intp, 1, 'C')
-        out = SeriesType(ary.dtype, ary.data, out_index)
-        return signature(out, *args)
+    # @bound_function("series.sort_values")
+    # def resolve_sort_values(self, ary, args, kws):
+    #     # output will have permuted input index
+    #     out_index = ary.index
+    #     if out_index == types.none:
+    #         out_index = types.Array(types.intp, 1, 'C')
+    #     out = SeriesType(ary.dtype, ary.data, out_index)
+    #     return signature(out, *args)
 
 #     @bound_function("array.take")
 #     def resolve_take(self, ary, args, kws):
@@ -994,7 +994,7 @@ def generic_expand_cumulative_series(self, args, kws):
 _not_series_array_attrs = ['flat', 'ctypes', 'itemset', 'reshape', 'sort', 'flatten',
                            'resolve_shift', 'resolve_sum', 'resolve_copy', 'resolve_mean',
                            'resolve_take', 'resolve_max', 'resolve_min', 'resolve_nunique',
-                           'resolve_prod', 'resolve_count', 'resolve_argsort']
+                           'resolve_prod', 'resolve_count', 'resolve_argsort', 'resolve_sort_values']
 
 
 # use ArrayAttribute for attributes not defined in SeriesAttribute

diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py
@@ -2220,6 +2220,14 @@ def test_impl(A):
         S = pd.Series(np.random.ranf(n))
         pd.testing.assert_series_equal(hpat_func(S), test_impl(S))
 
+    def test_series_sort_values2(self):
+        """Descending sort_values() on an integer Series with repeated values must match pandas."""
+        def test_impl(S):
+            return S.sort_values(ascending=False)
+
+        jitted = hpat.jit(test_impl)
+        series = pd.Series([6, 6, 2, 1, 3, 3, 2, 1, 2])
+
+        expected = test_impl(series)
+        actual = jitted(series)
+        pd.testing.assert_series_equal(expected, actual)
+
     def test_series_sort_values_index1(self):
         def test_impl(A, B):
             S = pd.Series(A, B)
@@ -2234,6 +2242,63 @@ def test_impl(A, B):
         B = np.random.ranf(n)
         pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B))
 
+    def test_series_sort_values_noidx(self):
+        """sort_values() on Series built without an explicit index must match pandas in both directions."""
+        def test_impl_true(S):
+            return S.sort_values(ascending=True)
+
+        def test_impl_false(S):
+            return S.sort_values(ascending=False)
+
+        # (pandas reference, HPAT-compiled) pairs; ascending is checked before descending
+        impl_pairs = [(test_impl_true, hpat.jit(test_impl_true)),
+                      (test_impl_false, hpat.jit(test_impl_false))]
+
+        # integers, floats, mixed numbers, NaN-containing data and strings (including empty ones)
+        cases = [[6, 6, 2, 1, 3, 3, 2, 1, 2],
+                 [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2],
+                 [6, 6.1, 2.2, 1, 3, 0, 2.2, 1, 2],
+                 [6, 6, 2, 1, 3, np.nan, np.nan, np.nan, np.nan],
+                 [3., 5.3, np.nan, np.nan, 33.2, 56.3, 4.4, 3.7, 8.9],
+                 ['a', 's', 'dd', 'm', 'll', '345', 'xrt', 'kd', 'qq'],
+                 ['dh', 'a', '', 'cv', 'b', '', 'b', 'b', 'p']
+                 ]
+
+        for values in cases:
+            series = pd.Series(values)
+            for reference_impl, jitted_impl in impl_pairs:
+                expected = reference_impl(series)
+                actual = jitted_impl(series)
+                pd.testing.assert_series_equal(actual, expected)
+
+    def test_series_sort_values_idx(self):
+        """sort_values() on Series with an explicit index must match pandas in both directions."""
+        def test_impl_true(S):
+            return S.sort_values(ascending=True)
+
+        def test_impl_false(S):
+            return S.sort_values(ascending=False)
+
+        # (pandas reference, HPAT-compiled) pairs; ascending is checked before descending
+        impl_pairs = [(test_impl_true, hpat.jit(test_impl_true)),
+                      (test_impl_false, hpat.jit(test_impl_false))]
+
+        # every list is used both as Series values and as Series index
+        cases = [[6, 6, 2, 1, 3, 3, 2, 1, 2],
+                 [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2],
+                 [6, 6.1, 2.2, 1, 3, 0, 2.2, 1, 2],
+                 [6, 6, 2, 1, 3, np.nan, np.nan, np.nan, np.nan],
+                 [3., 5.3, np.nan, np.nan, np.inf, np.inf, 4.4, 3.7, 8.9],
+                 ['a', 's', 'dd', 'm', 'll', '345', 'xrt', 'kd', 'qq'],
+                 ['dh', 'a', '', 'cv', 'b', '', 'b', 'b', 'p']
+                 ]
+
+        for values in cases:
+            for labels in cases:
+                series = pd.Series(values, labels)
+                for reference_impl, jitted_impl in impl_pairs:
+                    expected = reference_impl(series)
+                    actual = jitted_impl(series)
+                    pd.testing.assert_series_equal(actual, expected)
+
     def test_series_sort_values_parallel1(self):
         # create `kde.parquet` file
         ParquetGenerator.gen_kde_pq()