Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Fix for 'appending StringArrays drops NaNs' problem and minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
kozlov-alexey committed Nov 5, 2019
1 parent 6b14375 commit 03c9265
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 118 deletions.
129 changes: 129 additions & 0 deletions hpat/datatypes/common_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# *****************************************************************************
# Copyright (c) 2019, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

"""
| This file contains internal common functions used in SDC implementation across different files
"""

import numpy

from numba import types
from numba.extending import overload
from numba import numpy_support

import hpat
from hpat.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to)


def has_literal_value(var, value):
    """Check whether Numba type *var* is a Literal wrapping exactly *value*.

    Parameters
    ----------
    var: a Numba type object; non-Literal types always yield False
    value: plain Python value to compare against ``var.literal_value``

    Returns
    -------
    bool. For ``None`` and ``bool`` values an identity comparison is used
    (so e.g. a literal ``1`` does not match ``True``); all other values
    are compared with equality.
    """
    if not isinstance(var, types.Literal):
        return False

    # BUG FIX: the original tested isinstance(value, type(bool)), which is
    # isinstance(value, type) and never matches an actual bool value, so
    # booleans silently fell through to the equality branch.
    if value is None or isinstance(value, bool):
        return var.literal_value is value
    else:
        return var.literal_value == value


def has_python_value(var, value):
    """Check whether plain Python object *var* equals *value* and has its type.

    Parameters
    ----------
    var: any Python object
    value: value to compare with; ``var`` must be an instance of
        ``type(value)`` for the comparison to proceed

    Returns
    -------
    bool. ``None`` and ``bool`` values are compared by identity, all other
    values by equality.
    """
    if not isinstance(var, type(value)):
        return False

    # BUG FIX: the original tested isinstance(value, type(bool)), which is
    # isinstance(value, type) and never matches an actual bool value.
    if value is None or isinstance(value, bool):
        return var is value
    else:
        return var == value


def _append(A, B):
    """Stub that anchors the Numba overload below; never executed directly.

    Calls to _append inside jitted code are resolved at compile time by
    _append_overload; in plain Python this function simply returns None.
    """
    return None


@overload(_append)
def _append_overload(A, B):
    '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A

    Compile-time dispatch on the Numba types of A and B:
      * numeric array + numeric array          -> numpy.concatenate
      * numeric array + tuple/list of arrays   -> manual copy into a buffer
        pre-sized to the total length, using the common dtype of A and B
      * string array + string array            -> pre-allocated string array
        filled via append_string_array_to
      * string array + tuple/list of string arrays -> same, iterating over B
    Returns None (no implementation) for any other type combination.
    '''

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            def _append_single_numeric_impl(A, B):
                return numpy.concatenate((A, B,))

            return _append_single_numeric_impl
        elif isinstance(B, (types.UniTuple, types.List)):
            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)]
            np_common_dtype = numpy.find_common_type([], np_dtypes)
            numba_common_dtype = numpy_support.from_dtype(np_common_dtype)

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):

                # pre-size the result so each source array is copied exactly once
                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                # copy A first, then each array of B, advancing the write window
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                # pre-allocate the result with exact element and character counts
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # append_string_array_to copies values and re-marks NA entries,
                # so NaNs are not dropped when appending string arrays
                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)

                return new_data

            return _append_single_string_array_impl
        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                # total sizes across A and every array of B, computed up front
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)

                return new_data

            return _append_list_string_array_impl
85 changes: 36 additions & 49 deletions hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************
from hpat.datatypes import common_functions

"""
| :class:`pandas.Series` functions and operators implementations in HPAT
Expand All @@ -39,6 +40,7 @@
from numba import types

import hpat
import hpat.datatypes.common_functions as common_functions
from hpat.hiframes.pd_series_ext import SeriesType
from hpat.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars)
from hpat.utils import to_array
Expand Down Expand Up @@ -706,26 +708,26 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr
.. only:: developer
Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_append*
Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_append*
Parameters
-----------
self: :obj:`pandas.Series`
input series
input series
to_append : :obj:`pandas.Series` object or :obj:`list` or :obj:`set`
Series (or list or tuple of Series) to append with self
Series (or list or tuple of Series) to append with self
ignore_index: :obj:`bool`, default False
If True, do not use the index labels.
Supported as literal value only
If True, do not use the index labels.
Supported as literal value only
verify_integrity: :obj:`bool`, default False
If True, raise Exception on creating index with duplicates.
*unsupported*
If True, raise Exception on creating index with duplicates.
*unsupported*
Returns
-------
:obj:`pandas.Series`
returns :obj:`pandas.Series` object
Concatenated Series
returns :obj:`pandas.Series` object
Concatenated Series
"""

Expand Down Expand Up @@ -754,53 +756,38 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr
'{} Unsupported parameters. Given verify_integrity: {}'.format(_func_name, verify_integrity))

# ignore_index value has to be known at compile time to select between implementations with different signatures
if ((isinstance(ignore_index, types.Literal) and ignore_index.literal_value is True)
or (isinstance(ignore_index, bool) and ignore_index is True)):
# implementations that ignore series index
if isinstance(to_append, SeriesType):
def hpat_pandas_series_append_single_impl(self, to_append, ignore_index=False, verify_integrity=False):
ignore_index_is_false = (common_functions.has_literal_value(ignore_index, False)
or common_functions.has_python_value(ignore_index, False)
or isinstance(ignore_index, types.Omitted))
to_append_is_series = isinstance(to_append, SeriesType)

if ignore_index_is_false:
def hpat_pandas_series_append_impl(self, to_append, ignore_index=False, verify_integrity=False):
if to_append_is_series == True:
new_data = common_functions._append(self._data, to_append._data)
new_index = common_functions._append(self.index, to_append.index)
else:
data_arrays_to_append = [series._data for series in to_append]
index_arrays_to_append = [series.index for series in to_append]
new_data = common_functions._append(self._data, data_arrays_to_append)
new_index = common_functions._append(self.index, index_arrays_to_append)

new_data = hpat.hiframes.api._append(self._data, to_append._data)
new_index = numpy.arange(len(self._data) + len(to_append._data))
return pandas.Series(new_data, new_index)
return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_single_impl
return hpat_pandas_series_append_impl

elif isinstance(to_append, (types.UniTuple, types.List)):
def hpat_pandas_series_append_list_impl(self, to_append, ignore_index=False, verify_integrity=False):
else:
def hpat_pandas_series_append_ignore_index_impl(self, to_append, ignore_index=False, verify_integrity=False):

if to_append_is_series == True:
new_data = common_functions._append(self._data, to_append._data)
else:
arrays_to_append = [series._data for series in to_append]
sum_of_sizes = numpy.array([len(arr) for arr in arrays_to_append]).sum()
new_data = hpat.hiframes.api._append(self._data, arrays_to_append)
new_index = numpy.arange(len(self._data) + sum_of_sizes)
return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_list_impl

elif ((isinstance(ignore_index, types.Literal) and ignore_index.literal_value is False)
or (isinstance(ignore_index, bool) and ignore_index is False)
or isinstance(ignore_index, types.Omitted)):
# implementations that handle series index (ignore_index is False)
if isinstance(to_append, SeriesType):
def hpat_pandas_series_append_single_impl(self, to_append, ignore_index=False, verify_integrity=False):

new_data = hpat.hiframes.api._append(self._data, to_append._data)
new_index = hpat.hiframes.api._append(self.index, to_append.index)
return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_single_impl

elif isinstance(to_append, (types.UniTuple, types.List)):
def hpat_pandas_series_append_list_impl(self, to_append, ignore_index=False, verify_integrity=False):

data_arrays_to_append = [series._data for series in to_append]
index_arrays_to_append = [series.index for series in to_append]
new_data = common_functions._append(self._data, arrays_to_append)

new_data = hpat.hiframes.api._append(self._data, data_arrays_to_append)
new_index = hpat.hiframes.api._append(self.index, index_arrays_to_append)
return pandas.Series(new_data, new_index)
return pandas.Series(new_data, None)

return hpat_pandas_series_append_list_impl
return hpat_pandas_series_append_ignore_index_impl


@overload_method(SeriesType, 'copy')
Expand Down
72 changes: 6 additions & 66 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@
from hpat.utils import _numba_to_c_type_map, unliteral_all
from hpat.str_ext import string_type, list_string_array_type
from hpat.set_ext import build_set
from hpat.str_arr_ext import (StringArrayType, string_array_type, is_str_arr_typ, cp_str_list_to_array, num_total_chars)
from hpat.str_arr_ext import (
StringArrayType,
string_array_type,
is_str_arr_typ,
num_total_chars,
append_string_array_to)
from hpat.hiframes.pd_timestamp_ext import (pandas_timestamp_type, datetime_date_type, set_df_datetime_date_lower)
from hpat.hiframes.pd_series_ext import (
SeriesType,
Expand Down Expand Up @@ -1763,68 +1768,3 @@ def _analyze_op_pair_first(self, scope, equiv_set, expr):


numba.array_analysis.ArrayAnalysis._analyze_op_pair_first = _analyze_op_pair_first


def _append(A, B):
    """Stub that anchors the Numba overload below; never executed directly.

    Calls to _append inside jitted code are resolved at compile time by
    _append_overload; in plain Python this function simply returns None.
    """
    return None


@overload(_append)
def _append_overload(A, B):
    '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A

    Compile-time dispatch on the Numba types of A and B:
      * numeric array + numeric array          -> np.concatenate
      * numeric array + tuple/list of arrays   -> manual copy into a buffer
        pre-sized to the total length, using the common dtype of A and B
      * string array + string array            -> pre-allocated string array
        filled through an intermediate Python list of strings
      * string array + tuple/list of string arrays -> same, extending the list
    Returns None (no implementation) for any other type combination.
    '''

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            def _append_single_numeric_impl(A, B):
                return np.concatenate((A, B,))

            return _append_single_numeric_impl
        elif isinstance(B, (types.UniTuple, types.List)):
            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)]
            np_common_dtype = np.find_common_type([], np_dtypes)
            numba_common_dtype = numpy_support.from_dtype(np_common_dtype)

            # TODO: refactor to use np.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):

                # pre-size the result so each source array is copied exactly once
                total_length = len(A) + np.array([len(arr) for arr in B]).sum()
                new_data = np.empty(total_length, numba_common_dtype)

                # copy A first, then each array of B, advancing the write window
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # NOTE(review): round-tripping through list(A) + list(B) and
                # cp_str_list_to_array copies string values only; it does not
                # appear to carry over NA markers - presumably why this path
                # was superseded by the append_string_array_to based variant.
                list_of_strings = list(A) + list(B)
                hpat.str_arr_ext.cp_str_list_to_array(new_data, list_of_strings)
                return new_data

            return _append_single_string_array_impl
        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                array_list = [A] + list(B)
                total_size = np.array([len(arr) for arr in array_list]).sum()
                total_chars = np.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)
                # NOTE(review): same list round-trip as above; NA information
                # is presumably lost here as well - verify against the newer
                # common_functions implementation.
                list_of_strings = list(A)
                for arr in B:
                    list_of_strings.extend(list(arr))
                hpat.str_arr_ext.cp_str_list_to_array(new_data, list_of_strings)
                return new_data

            return _append_list_string_array_impl
14 changes: 14 additions & 0 deletions hpat/str_arr_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1392,3 +1392,17 @@ def lower_glob(context, builder, sig, args):
# context.nrt.decref(builder, ty, ret)

return impl_ret_new_ref(context, builder, typ, ret)


@numba.njit(no_cpython_wrapper=True)
def append_string_array_to(result, pos, A):
    """Copy all elements of string array *A* into *result*, starting at *pos*.

    Precondition: result is allocated with a size large enough to contain A.
    NA entries of A are re-marked as NA in result (via str_arr_set_na), so
    appending does not drop NaNs.

    Returns the number of elements copied, i.e. len(A), which lets callers
    chain appends as ``pos += append_string_array_to(result, pos, arr)``.
    """
    # renamed loop variable: the original used 'str', shadowing the builtin
    i, j = 0, pos
    for value in A:
        result[j] = value
        if str_arr_is_na(A, i):
            hpat.str_arr_ext.str_arr_set_na(result, j)
        i += 1
        j += 1

    return i
5 changes: 2 additions & 3 deletions hpat/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1325,7 +1325,6 @@ def test_impl(S):
S2 = S1.copy()
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))


@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
'No support of axis argument in old-style Series.dropna() impl')
def test_series_dropna_axis1(self):
Expand Down Expand Up @@ -2040,7 +2039,7 @@ def test_impl(S, other):
hpat_func = hpat.jit(test_impl)

dtype_to_data = {'float': [[-2., 3., 9.1], [-2., 5.0]],
'string': [['a', 'b', 'q'], ['d', 'e']]}
'string': [['a', None, 'q', ''], ['d', '', 'e']]}

for dtype, data_list in dtype_to_data.items():
with self.subTest(series_dtype=dtype, concatenated_data=data_list):
Expand All @@ -2057,7 +2056,7 @@ def test_impl(S1, S2, S3):

dtype_to_data = {'float': [[-2., 3., 9.1], [-2., 5.0], [1.0]]}
if not hpat.config.config_pipeline_hpat_default:
dtype_to_data['string'] = [['a', 'b', 'q'], ['d', 'e'], ['s']]
dtype_to_data['string'] = [['a', None, ''], ['d', None], ['']]

for dtype, data_list in dtype_to_data.items():
with self.subTest(series_dtype=dtype, concatenated_data=data_list):
Expand Down

0 comments on commit 03c9265

Please sign in to comment.