From 2484e0307875ce367111c6a2296487cc775517a1 Mon Sep 17 00:00:00 2001
From: "Kozlov, Alexey" <alexey.kozlov@intel.com>
Date: Tue, 8 Oct 2019 01:04:14 +0300
Subject: [PATCH] Refactor Series.median() in a new style via np.median()

---
 .../datatypes/hpat_pandas_series_functions.py | 64 +++++++++++++++++++
 hpat/hiframes/pd_series_ext.py                | 14 ++--
 hpat/tests/test_series.py                     | 32 ++++++++++
 3 files changed, 103 insertions(+), 7 deletions(-)

diff --git a/hpat/datatypes/hpat_pandas_series_functions.py b/hpat/datatypes/hpat_pandas_series_functions.py
index 2545f96f8..7dd99a242 100644
--- a/hpat/datatypes/hpat_pandas_series_functions.py
+++ b/hpat/datatypes/hpat_pandas_series_functions.py
@@ -1547,3 +1547,67 @@ def hpat_pandas_series_nunique_impl(self, dropna=True):
             return len(data_set) + 1
 
     return hpat_pandas_series_nunique_impl
+
+
+@overload_method(SeriesType, 'median')
+def hpat_pandas_series_median(self, axis=None, skipna=True, level=None, numeric_only=None):
+    """
+    Pandas Series method :meth:`pandas.Series.median` implementation.
+
+    .. only:: developer
+
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_median1
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_median_skipna_default1
+       Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_median_skipna_false1
+
+    Parameters
+    ----------
+    self: :obj:`pandas.Series`
+          input series
+    axis: :obj:`int` or :obj:`string` {0 or `index`, None}, default None
+        The axis for the function to be applied on.
+        *unsupported*
+    skipna: :obj:`bool`, default True
+        exclude NA/null values when computing the result (numpy.nanmedian is used instead of numpy.median)
+    level: :obj:`int` or :obj:`string`, default None
+         *unsupported*
+    numeric_only: :obj:`bool` or None, default None
+         *unsupported*
+
+    Returns
+    -------
+    :obj:`float`
+         median of values in the series
+
+    Raises
+    ------
+    TypingError
+         if self is not a numeric Series, an argument has a wrong type or an unsupported argument is given
+    """
+    _func_name = 'Method median().'
+
+    if not isinstance(self, SeriesType):
+        raise TypingError('{} The object must be a pandas.series. Given self: {}'.format(_func_name, self))
+
+    if not isinstance(self.dtype, types.Number):
+        raise TypingError(
+            '{} The function only applies to elements that are all numeric. Given data type: {}'.format(_func_name, self.dtype))
+
+    if not (isinstance(axis, (types.Integer, types.UnicodeType, types.Omitted)) or axis is None):
+        raise TypingError('{} The axis must be an Integer or a String. Currently unsupported. Given: {}'.format(_func_name, axis))
+
+    if not (isinstance(skipna, (types.Boolean, types.Omitted)) or skipna is True):
+        raise TypingError('{} The skipna must be a boolean. Given: {}'.format(_func_name, skipna))
+
+    # level, numeric_only and axis are not implemented: ALL of them must be None/omitted, hence 'and' (not 'or')
+    if not ((level is None or isinstance(level, types.Omitted))
+            and (numeric_only is None or isinstance(numeric_only, types.Omitted))
+            and (axis is None or isinstance(axis, types.Omitted))):
+        raise TypingError('{} Unsupported parameters. Given level: {}, numeric_only: {}, axis: {}'.format(_func_name, level, numeric_only, axis))
+
+    def hpat_pandas_series_median_impl(self, axis=None, skipna=True, level=None, numeric_only=None):
+        if skipna:
+            return numpy.nanmedian(self._data)
+        return numpy.median(self._data)
+
+    return hpat_pandas_series_median_impl
diff --git a/hpat/hiframes/pd_series_ext.py b/hpat/hiframes/pd_series_ext.py
index 6eefa938b..61af4579f 100644
--- a/hpat/hiframes/pd_series_ext.py
+++ b/hpat/hiframes/pd_series_ext.py
@@ -683,13 +683,13 @@ def resolve_head(self, ary, args, kws):
         assert not kws
         return signature(ary, *args)
 
-    @bound_function("series.median")
-    def resolve_median(self, ary, args, kws):
-        assert not kws
-        dtype = ary.dtype
-        # median converts integer output to float
-        dtype = types.float64 if isinstance(dtype, types.Integer) else dtype
-        return signature(dtype, *args)
+#     @bound_function("series.median")
+#     def resolve_median(self, ary, args, kws):
+#         assert not kws
+#         dtype = ary.dtype
+#         # median converts integer output to float
+#         dtype = types.float64 if isinstance(dtype, types.Integer) else dtype
+#         return signature(dtype, *args)
 
     @bound_function("series.idxmin")
     def resolve_idxmin(self, ary, args, kws):
diff --git a/hpat/tests/test_series.py b/hpat/tests/test_series.py
index 247a7f6bd..0f53217ca 100644
--- a/hpat/tests/test_series.py
+++ b/hpat/tests/test_series.py
@@ -1691,6 +1691,7 @@ def test_impl(S):
         self.assertTrue(count_array_OneDs() > 0)
 
     def test_series_median1(self):
+        '''Verifies median implementation for float and integer series of random data'''
         def test_impl(S):
             return S.median()
         hpat_func = hpat.jit(test_impl)
@@ -1711,6 +1712,34 @@ def test_impl(S):
         S = pd.Series(np.random.ranf(m))
         self.assertEqual(hpat_func(S), test_impl(S))
 
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     "BUG: old-style median implementation doesn't filter NaNs")
+    def test_series_median_skipna_default1(self):
+        '''Checks that median() called without arguments gives the pandas result on a series holding a NaN'''
+        def pandas_median(series):
+            return series.median()
+        jitted_median = hpat.jit(pandas_median)
+
+        with_nan = pd.Series([2., 3., 5., np.nan, 5., 6., 7.])
+        self.assertEqual(jitted_median(with_nan), pandas_median(with_nan))
+
+    @unittest.skipIf(hpat.config.config_pipeline_hpat_default,
+                     "Skipna argument is not supported in old-style")
+    def test_series_median_skipna_false1(self):
+        '''Checks median(skipna=False) against pandas on a series with an infinite and with a NaN element'''
+        def pandas_median(series):
+            return series.median(skipna=False)
+        jitted_median = hpat.jit(pandas_median)
+
+        # an infinite element is not NA, so both implementations must return the same number
+        with_inf = pd.Series([2., 3., 5., np.inf, 5., 6., 7.])
+        self.assertEqual(jitted_median(with_inf), pandas_median(with_inf))
+
+        # TODO: both results are 'nan', but HPAT's is not the np.nan object, so assertIs() cannot
+        # be used here - check if it's Numba related; for now only the isnan() flags are compared
+        with_nan = pd.Series([2., 3., 5., np.nan, 5., 6., 7.])
+        self.assertEqual(np.isnan(jitted_median(with_nan)), np.isnan(pandas_median(with_nan)))
+
     def test_series_median_parallel1(self):
         # create `kde.parquet` file
         ParquetGenerator.gen_kde_pq()
@@ -1722,6 +1751,9 @@ def test_impl():
         hpat_func = hpat.jit(test_impl)
 
         self.assertEqual(hpat_func(), test_impl())
+        self.assertEqual(count_array_REPs(), 0)
+        self.assertEqual(count_parfor_REPs(), 0)
+        self.assertTrue(count_array_OneDs() > 0)
 
     def test_series_argsort_parallel(self):
         # create `kde.parquet` file