From d6589144def4ad2437423f0a7e66241edd5b3151 Mon Sep 17 00:00:00 2001 From: etotmeni Date: Tue, 22 Oct 2019 18:21:10 +0300 Subject: [PATCH 1/2] Add functional and 3 tests --- .../datatypes/hpat_pandas_series_functions.py | 102 ++++++++++++++++++ hpat/hiframes/hiframes_typed.py | 6 +- hpat/hiframes/pd_series_ext.py | 15 +-- hpat/tests/test_series.py | 48 +++++++++ 4 files changed, 161 insertions(+), 10 deletions(-) diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py index 3172c2c6c..6abb36d63 100644 --- a/hpat/datatypes/hpat_pandas_series_functions.py +++ b/hpat/datatypes/hpat_pandas_series_functions.py @@ -2160,3 +2160,105 @@ def hpat_pandas_series_median_impl(self, axis=None, skipna=True, level=None, num return numpy.median(self._data) return hpat_pandas_series_median_impl + + +@overload_method(SeriesType, 'argsort') +def hpat_pandas_series_argsort(self, axis=0, kind='quicksort', order=None): + """ + Pandas Series method :meth:`pandas.Series.argsort` implementation. + + .. only:: developer + + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_argsort1 + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_argsort2 + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_argsort_noidx + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_argsort_idx + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_argsort_parallel + + Parameters + ----------- + self: :class:`pandas.Series` + input arg + axis: :obj:`int` + Has no effect but is accepted for compatibility with numpy. + kind: {‘mergesort’, ‘quicksort’, ‘heapsort’}, default ‘quicksort’ + Choice of sorting algorithm. See np.sort for more information. ‘mergesort’ is the only stable algorithm + order: None + Has no effect but is accepted for compatibility with numpy. 
+ + Returns + ------- + :obj:`pandas.Series` + returns: Positions of values within the sort order with -1 indicating nan values. + """ + + _func_name = 'Method argsort().' + + if not isinstance(self, SeriesType): + raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) + + if not isinstance(self.data.dtype, types.Number): + raise TypingError('{} Currently function supports only numeric values. Given data type: {}'.format(_func_name, + self.data.dtype)) + + if not (isinstance(axis, types.Omitted) or isinstance(axis, types.Integer) or axis == 0): + raise TypingError('{} Unsupported parameters. Given axis: {}'.format(_func_name, axis)) + + if not isinstance(self.index, types.NoneType): + def hpat_pandas_series_argsort_impl(self, axis=0, kind='quicksort', order=None): + + sort = numpy.argsort(self._data) + series_data = pandas.Series(self._data) + na = 0 + for i in series_data.isna(): + if i: + na += 1 + id = 0 + i = 0 + list_no_nan = numpy.empty(len(self._data) - na) + for bool_value in series_data.isna(): + if not bool_value: + list_no_nan[id] = self._data[i] + id += 1 + i += 1 + sort_no_nan = numpy.argsort(list_no_nan) + ne_na = sort[:len(sort) - na] + num = 0 + result = numpy.full((len(self._data)), -1) + for i in numpy.sort(ne_na): + result[i] = sort_no_nan[num] + num += 1 + + return pandas.Series(result, self._index) + + return hpat_pandas_series_argsort_impl + + def hpat_pandas_series_argsort_impl(self, axis=0, kind='quicksort', order=None): + + sort = numpy.argsort(self._data) + series_data = pandas.Series(self._data) + na = 0 + for i in series_data.isna(): + if i: + na += 1 + id = 0 + i = 0 + list_no_nan = numpy.empty(len(self._data) - na) + for bool_value in series_data.isna(): + if not bool_value: + list_no_nan[id] = self._data[i] + id += 1 + i += 1 + sort_no_nan = numpy.argsort(list_no_nan) + ne_na = sort[:len(sort) - na] + num = 0 + result = numpy.full((len(self._data)), -1) + for i in numpy.sort(ne_na): + result[i] 
= sort_no_nan[num] + num += 1 + + return pandas.Series(result) + + return hpat_pandas_series_argsort_impl + + diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py index 423bb11db..0d9eeac1c 100644 --- a/hpat/hiframes/hiframes_typed.py +++ b/hpat/hiframes/hiframes_typed.py @@ -969,9 +969,9 @@ def run_call_series_quantile_default(A): func = series_replace_funcs[func_name] return self._replace_func(func, [series_var, S2]) - if func_name in ('argsort', 'sort_values'): - return self._handle_series_sort( - lhs, rhs, series_var, func_name == 'argsort') + # if func_name in ('argsort', 'sort_values'): + # return self._handle_series_sort( + # lhs, rhs, series_var, func_name == 'argsort') if func_name == 'rolling': # XXX: remove rolling setup call, assuming still available in definitions diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py index 3a33c5331..195b93bc4 100644 --- a/hpat/hiframes/pd_series_ext.py +++ b/hpat/hiframes/pd_series_ext.py @@ -477,12 +477,12 @@ def resolve_astype(self, ary, args, kws): def resolve_rolling(self, ary, args, kws): return signature(SeriesRollingType(ary.dtype), *args) - @bound_function("array.argsort") - def resolve_argsort(self, ary, args, kws): - resolver = ArrayAttribute.resolve_argsort.__wrapped__ - sig = resolver(self, ary.data, args, kws) - sig.return_type = if_arr_to_series_type(sig.return_type) - return sig + # @bound_function("array.argsort") + # def resolve_argsort(self, ary, args, kws): + # resolver = ArrayAttribute.resolve_argsort.__wrapped__ + # sig = resolver(self, ary.data, args, kws) + # sig.return_type = if_arr_to_series_type(sig.return_type) + # return sig @bound_function("series.sort_values") def resolve_sort_values(self, ary, args, kws): @@ -994,7 +994,8 @@ def generic_expand_cumulative_series(self, args, kws): _not_series_array_attrs = ['flat', 'ctypes', 'itemset', 'reshape', 'sort', 'flatten', 'resolve_shift', 'resolve_sum', 'resolve_copy', 'resolve_mean', 
'resolve_take', 'resolve_max', 'resolve_min', 'resolve_nunique', - 'resolve_prod', 'resolve_count'] + 'resolve_prod', 'resolve_count', 'resolve_argsort'] + # use ArrayAttribute for attributes not defined in SeriesAttribute for attr, func in numba.typing.arraydecl.ArrayAttribute.__dict__.items(): diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index dc64fc064..1d396203b 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -266,6 +266,54 @@ def test_impl(A): A = pd.Series(np.random.ranf(n)) pd.testing.assert_series_equal(hpat_func(A), test_impl(A)) + def test_series_argsort2(self): + def test_impl(S): + return S.argsort() + hpat_func = hpat.jit(test_impl) + + S = pd.Series([5, np.nan, 3, 3, np.nan]) + pd.testing.assert_series_equal(test_impl(S), hpat_func(S)) + + def test_series_argsort_noidx(self): + def test_impl(S): + return S.argsort() + + hpat_func = hpat.jit(test_impl) + + data_test = [[6, 6, 2, 1, 3, 3, 2, 1, 2], + [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2], + [6, 6.1, 2.2, 1, 3, 0, 2.2, 1, 2], + [6, 6, 2, 1, 3, np.nan, np.nan, np.nan, np.nan], + [3., 5.3, np.nan, np.nan, 33.2, 56.3, 4.4, 3.7, 8.9] + ] + + for input_data in data_test: + S = pd.Series(input_data) + + result_ref = test_impl(S) + result = hpat_func(S) + pd.testing.assert_series_equal(result, result_ref) + + def test_series_argsort_idx(self): + def test_impl(S): + return S.argsort() + + hpat_func = hpat.jit(test_impl) + + data_test = [[6, 6, 2, 1, 3, 3, 2, 1, 2], + [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2], + [6, 6.1, 2.2, 1, 3, 0, 2.2, 1, 2], + [6, 6, 2, 1, 3, np.nan, np.nan, np.nan, np.nan], + [3., 5.3, np.nan, np.nan, np.inf, np.inf, 4.4, 3.7, 8.9] + ] + + for input_data in data_test: + for index_data in data_test: + S = pd.Series(input_data, index_data) + result_ref = test_impl(S) + result = hpat_func(S) + pd.testing.assert_series_equal(result, result_ref) + def test_series_attr6(self): def test_impl(A): return A.take([2, 3]).values From 
b7d44818c0c8c5524256911dee5ae4771f7fed26 Mon Sep 17 00:00:00 2001 From: etotmeni Date: Fri, 25 Oct 2019 14:24:35 +0300 Subject: [PATCH 2/2] Add Series.sort_values() and tests --- .../datatypes/hpat_pandas_series_functions.py | 223 +++++++++++++++++- hpat/hiframes/pd_series_ext.py | 18 +- hpat/tests/test_series.py | 65 +++++ 3 files changed, 295 insertions(+), 11 deletions(-) diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py index 6abb36d63..ec1cbb46c 100644 --- a/hpat/datatypes/hpat_pandas_series_functions.py +++ b/hpat/datatypes/hpat_pandas_series_functions.py @@ -2181,10 +2181,13 @@ def hpat_pandas_series_argsort(self, axis=0, kind='quicksort', order=None): input arg axis: :obj:`int` Has no effect but is accepted for compatibility with numpy. - kind: {‘mergesort’, ‘quicksort’, ‘heapsort’}, default ‘quicksort’ - Choice of sorting algorithm. See np.sort for more information. ‘mergesort’ is the only stable algorithm + *unsupported* + kind: {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See np.sort for more information. 'mergesort' is the only stable algorithm + *unsupported, numpy.argsort() is always called with its default algorithm* order: None Has no effect but is accepted for compatibility with numpy. + *unsupported* Returns ------- @@ -2262,3 +2265,219 @@ def hpat_pandas_series_argsort_impl(self, axis=0, kind='quicksort', order=None): return hpat_pandas_series_argsort_impl +@overload_method(SeriesType, 'sort_values') +def hpat_pandas_series_sort_values(self, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): + """ + Pandas Series method :meth:`pandas.Series.sort_values` implementation. + + ..
only:: developer + + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values1 + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values2 + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_index1 + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_noidx + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_idx + Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_sort_values_parallel1 + + Parameters + ----------- + self: :class:`pandas.Series` + input arg + axis: 0 or :obj:`pandas.Series.index` + Axis to direct sorting. + *unsupported* + ascending: :obj:`bool`, default: True + If True, sort values in ascending order, otherwise descending. + kind: {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. + *unsupported, uses sorted() for strings and numpy.sort() for numbers* + na_position: {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. + *unsupported* + + Returns + ------- + :obj:`pandas.Series` + returns: Series ordered by values. + """ + + _func_name = 'Method sort_values().' + + if not isinstance(self, SeriesType): + raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) + + if not (isinstance(ascending, types.Omitted) or isinstance(ascending, types.Boolean) or isinstance(ascending, bool)): + raise TypingError('{} Unsupported parameters. 
Given ascending: {}'.format(_func_name, ascending)) + + if isinstance(self.index, types.NoneType) and isinstance(self.data.dtype, types.UnicodeType): + def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort', + na_position='last'): + + index = numpy.arange(len(self._data)) + my_index = numpy.arange(len(self._data)) + used_index = numpy.full((len(self._data)), -1) + result = sorted(self._data) + cycle = range(len(self._data)) + if ascending is False: + result = result[::-1] + cycle = range(len(self._data) - 1, -1, -1) + result_index = index.copy() + for i in range(len(result_index)): + find = 0 + for search in cycle: + check = 0 + for j in used_index: + if my_index[search] == j: + check = 1 + if (self._data[search] == result[i]) and check == 0 and find == 0: + result_index[i] = index[search] + used_index[i] = my_index[search] + find = 1 + + na = 0 + for i in self.isna(): + if i: + na += 1 + num = 0 + for i in self.isna(): + j = len(result_index) - na + if i and used_index[j] == -1: + result_index[j] = index[num] + used_index[j] = my_index[num] + na -= 1 + num += 1 + + return pandas.Series(result, result_index) + + return hpat_pandas_series_sort_values_impl + + if isinstance(self.index, types.NoneType) and isinstance(self.data.dtype, types.Number): + def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort', + na_position='last'): + + na = 0 + for i in self.isna(): + if i: + na += 1 + index = numpy.arange(len(self._data)) + my_index = numpy.arange(len(self._data)) + used_index = numpy.full((len(self._data)), -1) + result = numpy.sort(self._data) + i = len(self._data) - na + cycle = range(len(self._data)) + if ascending is False: + result[:i] = result[:i][::-1] + cycle = range(len(self._data) - 1, -1, -1) + result_index = index.copy() + + for i in range(len(result_index)): + find = 0 + for search in cycle: + check = 0 + for j in used_index: + if my_index[search] == j: + check = 1 + 
if (self._data[search] == result[i]) and check == 0 and find == 0: + result_index[i] = index[search] + used_index[i] = my_index[search] + find = 1 + + + num = 0 + for i in self.isna(): + j = len(result_index) - na + if i and used_index[j] == -1: + result_index[j] = index[num] + used_index[j] = my_index[num] + na -= 1 + num += 1 + + return pandas.Series(result, result_index) + + return hpat_pandas_series_sort_values_impl + + if isinstance(self.data.dtype, types.UnicodeType): + def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort', + na_position='last'): + + index = self._index + my_index = numpy.arange(len(self._data)) + used_index = numpy.full((len(self._data)), -1) + result = sorted(self._data) + cycle = range(len(self._data)) + if ascending is False: + result = result[::-1] + cycle = range(len(self._data) - 1, -1, -1) + result_index = self._index.copy() + for i in range(len(result_index)): + find = 0 + for search in cycle: + check = 0 + for j in used_index: + if my_index[search] == j: + check = 1 + if (self._data[search] == result[i]) and check == 0 and find == 0: + result_index[i] = index[search] + used_index[i] = my_index[search] + find = 1 + + na = 0 + for i in self.isna(): + if i: + na += 1 + num = 0 + for i in self.isna(): + j = len(result_index) - na + if i and used_index[j] == -1: + result_index[j] = index[num] + used_index[j] = my_index[num] + na -= 1 + num += 1 + + return pandas.Series(result, result_index) + + return hpat_pandas_series_sort_values_impl + + if isinstance(self.data.dtype, types.Number): + def hpat_pandas_series_sort_values_impl(self, axis=0, ascending=True, inplace=False, kind='quicksort', + na_position='last'): + + na = 0 + for i in self.isna(): + if i: + na += 1 + i = len(self._data) - na + index = self._index + my_index = numpy.arange(len(self._data)) + used_index = numpy.full((len(self._data)), -1) + result = numpy.sort(self._data) + cycle = range(len(self._data)) + if ascending is 
False: + result[:i] = result[:i][::-1] + cycle = range(len(self._data) - 1, -1, -1) + result_index = self._index.copy() + for i in range(len(result_index)): + find = 0 + for search in cycle: + check = 0 + for j in used_index: + if my_index[search] == j: + check = 1 + if (self._data[search] == result[i]) and check == 0 and find == 0: + result_index[i] = index[search] + used_index[i] = my_index[search] + find = 1 + + + num = 0 + for i in self.isna(): + j = len(result_index) - na + if i and used_index[j] == -1: + result_index[j] = index[num] + used_index[j] = my_index[num] + na -= 1 + num += 1 + + return pandas.Series(result, result_index) + + return hpat_pandas_series_sort_values_impl diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py index 195b93bc4..0d06c0e3d 100644 --- a/hpat/hiframes/pd_series_ext.py +++ b/hpat/hiframes/pd_series_ext.py @@ -484,14 +484,14 @@ def resolve_rolling(self, ary, args, kws): # sig.return_type = if_arr_to_series_type(sig.return_type) # return sig - @bound_function("series.sort_values") - def resolve_sort_values(self, ary, args, kws): - # output will have permuted input index - out_index = ary.index - if out_index == types.none: - out_index = types.Array(types.intp, 1, 'C') - out = SeriesType(ary.dtype, ary.data, out_index) - return signature(out, *args) + # @bound_function("series.sort_values") + # def resolve_sort_values(self, ary, args, kws): + # # output will have permuted input index + # out_index = ary.index + # if out_index == types.none: + # out_index = types.Array(types.intp, 1, 'C') + # out = SeriesType(ary.dtype, ary.data, out_index) + # return signature(out, *args) # @bound_function("array.take") # def resolve_take(self, ary, args, kws): @@ -994,7 +994,7 @@ def generic_expand_cumulative_series(self, args, kws): _not_series_array_attrs = ['flat', 'ctypes', 'itemset', 'reshape', 'sort', 'flatten', 'resolve_shift', 'resolve_sum', 'resolve_copy', 'resolve_mean', 'resolve_take', 'resolve_max', 'resolve_min', 
'resolve_nunique', - 'resolve_prod', 'resolve_count', 'resolve_argsort'] + 'resolve_prod', 'resolve_count', 'resolve_argsort', 'resolve_sort_values'] # use ArrayAttribute for attributes not defined in SeriesAttribute diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py index 1d396203b..209275d6b 100644 --- a/hpat/tests/test_series.py +++ b/hpat/tests/test_series.py @@ -2220,6 +2220,14 @@ def test_impl(A): S = pd.Series(np.random.ranf(n)) pd.testing.assert_series_equal(hpat_func(S), test_impl(S)) + def test_series_sort_values2(self): + def test_impl(S): + return S.sort_values(ascending=False) + hpat_func = hpat.jit(test_impl) + + S = pd.Series([6, 6, 2, 1, 3, 3, 2, 1, 2]) + pd.testing.assert_series_equal(test_impl(S), hpat_func(S)) + def test_series_sort_values_index1(self): def test_impl(A, B): S = pd.Series(A, B) @@ -2234,6 +2242,63 @@ def test_impl(A, B): B = np.random.ranf(n) pd.testing.assert_series_equal(hpat_func(A, B), test_impl(A, B)) + def test_series_sort_values_noidx(self): + def test_impl_true(S): + return S.sort_values(ascending=True) + + def test_impl_false(S): + return S.sort_values(ascending=False) + + hpat_func1 = hpat.jit(test_impl_true) + hpat_func2 = hpat.jit(test_impl_false) + + data_test = [[6, 6, 2, 1, 3, 3, 2, 1, 2], + [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2], + [6, 6.1, 2.2, 1, 3, 0, 2.2, 1, 2], + [6, 6, 2, 1, 3, np.nan, np.nan, np.nan, np.nan], + [3., 5.3, np.nan, np.nan, 33.2, 56.3, 4.4, 3.7, 8.9], + ['a', 's', 'dd', 'm', 'll', '345', 'xrt', 'kd', 'qq'], + ['dh', 'a', '', 'cv', 'b', '', 'b', 'b', 'p'] + ] + + for input_data in data_test: + S = pd.Series(input_data) + result_ref = test_impl_true(S) + result = hpat_func1(S) + pd.testing.assert_series_equal(result, result_ref) + result_ref = test_impl_false(S) + result = hpat_func2(S) + pd.testing.assert_series_equal(result, result_ref) + + def test_series_sort_values_idx(self): + def test_impl_true(S): + return S.sort_values(ascending=True) + + def test_impl_false(S): + 
return S.sort_values(ascending=False) + + hpat_func1 = hpat.jit(test_impl_true) + hpat_func2 = hpat.jit(test_impl_false) + + data_test = [[6, 6, 2, 1, 3, 3, 2, 1, 2], + [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2], + [6, 6.1, 2.2, 1, 3, 0, 2.2, 1, 2], + [6, 6, 2, 1, 3, np.nan, np.nan, np.nan, np.nan], + [3., 5.3, np.nan, np.nan, np.inf, np.inf, 4.4, 3.7, 8.9], + ['a', 's', 'dd', 'm', 'll', '345', 'xrt', 'kd', 'qq'], + ['dh', 'a', '', 'cv', 'b', '', 'b', 'b', 'p'] + ] + + for input_data in data_test: + for index_data in data_test: + S = pd.Series(input_data, index_data) + result_ref = test_impl_true(S) + result = hpat_func1(S) + pd.testing.assert_series_equal(result, result_ref) + result_ref = test_impl_false(S) + result = hpat_func2(S) + pd.testing.assert_series_equal(result, result_ref) + def test_series_sort_values_parallel1(self): # create `kde.parquet` file ParquetGenerator.gen_kde_pq()