Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Fix for 'appending StringArrays drops NaNs' problem and minor changes
Browse files Browse the repository at this point in the history
  • Loading branch information
kozlov-alexey committed Nov 5, 2019
1 parent 6b14375 commit 03c9265
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 118 deletions.
129 changes: 129 additions & 0 deletions hpat/datatypes/common_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# *****************************************************************************
# Copyright (c) 2019, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

"""
| This file contains internal common functions used in SDC implementation across different files
"""

import numpy

from numba import types
from numba.extending import overload
from numba import numpy_support

import hpat
from hpat.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to)


def has_literal_value(var, value):
    """Check whether Numba type *var* is a Literal wrapping exactly *value*.

    Parameters
    ----------
    var: a Numba type object; non-Literal types always yield False
    value: plain Python value to compare against ``var.literal_value``

    Returns
    -------
    bool. For ``None`` and ``bool`` values an identity comparison is used
    (so e.g. a literal ``1`` does not match ``True``); all other values
    are compared with equality.
    """
    if not isinstance(var, types.Literal):
        return False

    # BUG FIX: the original tested isinstance(value, type(bool)), which is
    # isinstance(value, type) and never matches an actual bool value, so
    # booleans silently fell through to the equality branch.
    if value is None or isinstance(value, bool):
        return var.literal_value is value
    else:
        return var.literal_value == value


def has_python_value(var, value):
    """Check whether plain Python object *var* equals *value* and has its type.

    Parameters
    ----------
    var: any Python object
    value: value to compare with; ``var`` must be an instance of
        ``type(value)`` for the comparison to proceed

    Returns
    -------
    bool. ``None`` and ``bool`` values are compared by identity, all other
    values by equality.
    """
    if not isinstance(var, type(value)):
        return False

    # BUG FIX: the original tested isinstance(value, type(bool)), which is
    # isinstance(value, type) and never matches an actual bool value.
    if value is None or isinstance(value, bool):
        return var is value
    else:
        return var == value


def _append(A, B):
    """Stub that anchors the Numba overload below; never executed directly.

    Calls to _append inside jitted code are resolved at compile time by
    _append_overload; in plain Python this function simply returns None.
    """
    return None


@overload(_append)
def _append_overload(A, B):
    '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A

    Compile-time dispatch on the Numba types of A and B:
      * numeric array + numeric array          -> numpy.concatenate
      * numeric array + tuple/list of arrays   -> manual copy into a buffer
        pre-sized to the total length, using the common dtype of A and B
      * string array + string array            -> pre-allocated string array
        filled via append_string_array_to
      * string array + tuple/list of string arrays -> same, iterating over B
    Returns None (no implementation) for any other type combination.
    '''

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            def _append_single_numeric_impl(A, B):
                return numpy.concatenate((A, B,))

            return _append_single_numeric_impl
        elif isinstance(B, (types.UniTuple, types.List)):
            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)]
            np_common_dtype = numpy.find_common_type([], np_dtypes)
            numba_common_dtype = numpy_support.from_dtype(np_common_dtype)

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):

                # pre-size the result so each source array is copied exactly once
                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                # copy A first, then each array of B, advancing the write window
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                # pre-allocate the result with exact element and character counts
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # append_string_array_to copies values and re-marks NA entries,
                # so NaNs are not dropped when appending string arrays
                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)

                return new_data

            return _append_single_string_array_impl
        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                # total sizes across A and every array of B, computed up front
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)

                return new_data

            return _append_list_string_array_impl
85 changes: 36 additions & 49 deletions hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************
from hpat.datatypes import common_functions

"""
| :class:`pandas.Series` functions and operators implementations in HPAT
Expand All @@ -39,6 +40,7 @@
from numba import types

import hpat
import hpat.datatypes.common_functions as common_functions
from hpat.hiframes.pd_series_ext import SeriesType
from hpat.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars)
from hpat.utils import to_array
Expand Down Expand Up @@ -706,26 +708,26 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr
.. only:: developer
Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_append*
Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_append*
Parameters
-----------
self: :obj:`pandas.Series`
input series
input series
to_append : :obj:`pandas.Series` object or :obj:`list` or :obj:`set`
Series (or list or tuple of Series) to append with self
Series (or list or tuple of Series) to append with self
ignore_index: :obj:`bool`, default False
If True, do not use the index labels.
Supported as literal value only
If True, do not use the index labels.
Supported as literal value only
verify_integrity: :obj:`bool`, default False
If True, raise Exception on creating index with duplicates.
*unsupported*
If True, raise Exception on creating index with duplicates.
*unsupported*
Returns
-------
:obj:`pandas.Series`
returns :obj:`pandas.Series` object
Concatenated Series
returns :obj:`pandas.Series` object
Concatenated Series
"""

Expand Down Expand Up @@ -754,53 +756,38 @@ def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integr
'{} Unsupported parameters. Given verify_integrity: {}'.format(_func_name, verify_integrity))

# ignore_index value has to be known at compile time to select between implementations with different signatures
if ((isinstance(ignore_index, types.Literal) and ignore_index.literal_value is True)
or (isinstance(ignore_index, bool) and ignore_index is True)):
# implementations that ignore series index
if isinstance(to_append, SeriesType):
def hpat_pandas_series_append_single_impl(self, to_append, ignore_index=False, verify_integrity=False):
ignore_index_is_false = (common_functions.has_literal_value(ignore_index, False)
or common_functions.has_python_value(ignore_index, False)
or isinstance(ignore_index, types.Omitted))
to_append_is_series = isinstance(to_append, SeriesType)

if ignore_index_is_false:
def hpat_pandas_series_append_impl(self, to_append, ignore_index=False, verify_integrity=False):
if to_append_is_series == True:
new_data = common_functions._append(self._data, to_append._data)
new_index = common_functions._append(self.index, to_append.index)
else:
data_arrays_to_append = [series._data for series in to_append]
index_arrays_to_append = [series.index for series in to_append]
new_data = common_functions._append(self._data, data_arrays_to_append)
new_index = common_functions._append(self.index, index_arrays_to_append)

new_data = hpat.hiframes.api._append(self._data, to_append._data)
new_index = numpy.arange(len(self._data) + len(to_append._data))
return pandas.Series(new_data, new_index)
return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_single_impl
return hpat_pandas_series_append_impl

elif isinstance(to_append, (types.UniTuple, types.List)):
def hpat_pandas_series_append_list_impl(self, to_append, ignore_index=False, verify_integrity=False):
else:
def hpat_pandas_series_append_ignore_index_impl(self, to_append, ignore_index=False, verify_integrity=False):

if to_append_is_series == True:
new_data = common_functions._append(self._data, to_append._data)
else:
arrays_to_append = [series._data for series in to_append]
sum_of_sizes = numpy.array([len(arr) for arr in arrays_to_append]).sum()
new_data = hpat.hiframes.api._append(self._data, arrays_to_append)
new_index = numpy.arange(len(self._data) + sum_of_sizes)
return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_list_impl

elif ((isinstance(ignore_index, types.Literal) and ignore_index.literal_value is False)
or (isinstance(ignore_index, bool) and ignore_index is False)
or isinstance(ignore_index, types.Omitted)):
# implementations that handle series index (ignore_index is False)
if isinstance(to_append, SeriesType):
def hpat_pandas_series_append_single_impl(self, to_append, ignore_index=False, verify_integrity=False):

new_data = hpat.hiframes.api._append(self._data, to_append._data)
new_index = hpat.hiframes.api._append(self.index, to_append.index)
return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_single_impl

elif isinstance(to_append, (types.UniTuple, types.List)):
def hpat_pandas_series_append_list_impl(self, to_append, ignore_index=False, verify_integrity=False):

data_arrays_to_append = [series._data for series in to_append]
index_arrays_to_append = [series.index for series in to_append]
new_data = common_functions._append(self._data, arrays_to_append)

new_data = hpat.hiframes.api._append(self._data, data_arrays_to_append)
new_index = hpat.hiframes.api._append(self.index, index_arrays_to_append)
return pandas.Series(new_data, new_index)
return pandas.Series(new_data, None)

return hpat_pandas_series_append_list_impl
return hpat_pandas_series_append_ignore_index_impl


@overload_method(SeriesType, 'copy')
Expand Down
72 changes: 6 additions & 66 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@
from hpat.utils import _numba_to_c_type_map, unliteral_all
from hpat.str_ext import string_type, list_string_array_type
from hpat.set_ext import build_set
from hpat.str_arr_ext import (StringArrayType, string_array_type, is_str_arr_typ, cp_str_list_to_array, num_total_chars)
from hpat.str_arr_ext import (
StringArrayType,
string_array_type,
is_str_arr_typ,
num_total_chars,
append_string_array_to)
from hpat.hiframes.pd_timestamp_ext import (pandas_timestamp_type, datetime_date_type, set_df_datetime_date_lower)
from hpat.hiframes.pd_series_ext import (
SeriesType,
Expand Down Expand Up @@ -1763,68 +1768,3 @@ def _analyze_op_pair_first(self, scope, equiv_set, expr):


numba.array_analysis.ArrayAnalysis._analyze_op_pair_first = _analyze_op_pair_first


def _append(A, B):
    """Stub that anchors the Numba overload below; never executed directly.

    Calls to _append inside jitted code are resolved at compile time by
    _append_overload; in plain Python this function simply returns None.
    """
    return None


@overload(_append)
def _append_overload(A, B):
    '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A

    Compile-time dispatch on the Numba types of A and B:
      * numeric array + numeric array          -> np.concatenate
      * numeric array + tuple/list of arrays   -> manual copy into a buffer
        pre-sized to the total length, using the common dtype of A and B
      * string array + string array            -> pre-allocated string array
        filled through an intermediate Python list of strings
      * string array + tuple/list of string arrays -> same, extending the list
    Returns None (no implementation) for any other type combination.
    '''

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            def _append_single_numeric_impl(A, B):
                return np.concatenate((A, B,))

            return _append_single_numeric_impl
        elif isinstance(B, (types.UniTuple, types.List)):
            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)]
            np_common_dtype = np.find_common_type([], np_dtypes)
            numba_common_dtype = numpy_support.from_dtype(np_common_dtype)

            # TODO: refactor to use np.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):

                # pre-size the result so each source array is copied exactly once
                total_length = len(A) + np.array([len(arr) for arr in B]).sum()
                new_data = np.empty(total_length, numba_common_dtype)

                # copy A first, then each array of B, advancing the write window
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # NOTE(review): round-tripping through list(A) + list(B) and
                # cp_str_list_to_array copies string values only; it does not
                # appear to carry over NA markers - presumably why this path
                # was superseded by the append_string_array_to based variant.
                list_of_strings = list(A) + list(B)
                hpat.str_arr_ext.cp_str_list_to_array(new_data, list_of_strings)
                return new_data

            return _append_single_string_array_impl
        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                array_list = [A] + list(B)
                total_size = np.array([len(arr) for arr in array_list]).sum()
                total_chars = np.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)
                # NOTE(review): same list round-trip as above; NA information
                # is presumably lost here as well - verify against the newer
                # common_functions implementation.
                list_of_strings = list(A)
                for arr in B:
                    list_of_strings.extend(list(arr))
                hpat.str_arr_ext.cp_str_list_to_array(new_data, list_of_strings)
                return new_data

            return _append_list_string_array_impl
14 changes: 14 additions & 0 deletions hpat/str_arr_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1392,3 +1392,17 @@ def lower_glob(context, builder, sig, args):
# context.nrt.decref(builder, ty, ret)

return impl_ret_new_ref(context, builder, typ, ret)


@numba.njit(no_cpython_wrapper=True)
def append_string_array_to(result, pos, A):
    """Copy all elements of string array *A* into *result*, starting at *pos*.

    Precondition: result is allocated with a size large enough to contain A.
    NA entries of A are re-marked as NA in result (via str_arr_set_na), so
    appending does not drop NaNs.

    Returns the number of elements copied, i.e. len(A), which lets callers
    chain appends as ``pos += append_string_array_to(result, pos, arr)``.
    """
    # renamed loop variable: the original used 'str', shadowing the builtin
    i, j = 0, pos
    for value in A:
        result[j] = value
        if str_arr_is_na(A, i):
            hpat.str_arr_ext.str_arr_set_na(result, j)
        i += 1
        j += 1

    return i
5 changes: 2 additions & 3 deletions hpat/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1325,7 +1325,6 @@ def test_impl(S):
S2 = S1.copy()
pd.testing.assert_series_equal(hpat_func(S1), test_impl(S2))


@unittest.skipIf(hpat.config.config_pipeline_hpat_default,
'No support of axis argument in old-style Series.dropna() impl')
def test_series_dropna_axis1(self):
Expand Down Expand Up @@ -2040,7 +2039,7 @@ def test_impl(S, other):
hpat_func = hpat.jit(test_impl)

dtype_to_data = {'float': [[-2., 3., 9.1], [-2., 5.0]],
'string': [['a', 'b', 'q'], ['d', 'e']]}
'string': [['a', None, 'q', ''], ['d', '', 'e']]}

for dtype, data_list in dtype_to_data.items():
with self.subTest(series_dtype=dtype, concatenated_data=data_list):
Expand All @@ -2057,7 +2056,7 @@ def test_impl(S1, S2, S3):

dtype_to_data = {'float': [[-2., 3., 9.1], [-2., 5.0], [1.0]]}
if not hpat.config.config_pipeline_hpat_default:
dtype_to_data['string'] = [['a', 'b', 'q'], ['d', 'e'], ['s']]
dtype_to_data['string'] = [['a', None, ''], ['d', None], ['']]

for dtype, data_list in dtype_to_data.items():
with self.subTest(series_dtype=dtype, concatenated_data=data_list):
Expand Down

0 comments on commit 03c9265

Please sign in to comment.