Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Fix new-style implementation of Series.append() method (#262)
Browse files Browse the repository at this point in the history
* Fix new-style implementation of Series.append() method

* Fix for the 'appending StringArrays drops NaNs' problem and minor changes
  • Loading branch information
kozlov-alexey authored and shssf committed Nov 10, 2019
1 parent 34b1a80 commit b857c10
Show file tree
Hide file tree
Showing 6 changed files with 475 additions and 32 deletions.
133 changes: 133 additions & 0 deletions hpat/datatypes/common_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# *****************************************************************************
# Copyright (c) 2019, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

"""
| This file contains internal common functions used in SDC implementation across different files
"""

import numpy

from numba import types
from numba.extending import overload
from numba import numpy_support

import hpat
from hpat.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to)


def has_literal_value(var, value):
    '''Used during typing to check that variable var is a Numba literal value equal to value.

    Parameters
    ----------
    var
        Object seen at typing time. Anything that is not an instance of
        ``types.Literal`` makes the function return False.
    value
        Expected Python value. ``None`` and booleans are matched by identity,
        everything else by equality.

    Returns
    -------
    bool
        True when ``var`` is a literal type whose ``literal_value`` matches ``value``.
    '''

    if not isinstance(var, types.Literal):
        return False

    # Booleans must be compared by identity: with ``==`` a literal 0 would be
    # accepted as False (and 1 as True) because bool is an int subclass.
    # The check has to be against ``bool`` itself; ``type(bool)`` is ``type``
    # and would never match a boolean value.
    if value is None or isinstance(value, bool):
        return var.literal_value is value

    return var.literal_value == value


def has_python_value(var, value):
    '''Used during typing to check that variable var was resolved as Python type and has specific value.

    Parameters
    ----------
    var
        Object seen at typing time (a plain Python object, not a Numba type).
    value
        Expected Python value. ``None`` and booleans are matched by identity,
        everything else by equality.

    Returns
    -------
    bool
        True when ``var`` is an instance of ``type(value)`` and matches ``value``.
    '''

    # Reject objects of an unrelated type first, e.g. the int 0 is not
    # an instance of bool and therefore never matches False.
    if not isinstance(var, type(value)):
        return False

    # The check has to be against ``bool`` itself; ``type(bool)`` is ``type``
    # and would never match a boolean value.
    if value is None or isinstance(value, bool):
        return var is value

    return var == value


def hpat_arrays_append(A, B):
    '''Placeholder that the @overload-registered implementation is attached to.

    Called as plain Python it does nothing and returns None.
    '''
    return None


@overload(hpat_arrays_append)
def hpat_arrays_append_overload(A, B):
    '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A.

    Runs at typing time: A and B are Numba types, and the returned function is
    the implementation selected for them.

    * A is types.Array, B is types.Array: numpy.concatenate of the two.
    * A is types.Array, B is a UniTuple/List of types.Array: the arrays are copied
      one after another into a preallocated array of the common dtype.
    * A is a string array, B is a string array: both are copied into a
      preallocated string array.
    * A is a string array, B is a UniTuple/List of string arrays: same as above,
      for A followed by every array in B.

    For any other combination the function falls through and returns None,
    i.e. it provides no implementation.
    '''

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            def _append_single_numeric_impl(A, B):
                return numpy.concatenate((A, B,))

            return _append_single_numeric_impl
        elif isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, types.Array):
            # The element type is checked above because B.dtype.dtype below is only
            # meaningful for a sequence of arrays; other element types fall through
            # to "no implementation" like the rest of the unsupported combinations.

            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)]
            np_common_dtype = numpy.find_common_type([], np_dtypes)
            numba_common_dtype = numpy_support.from_dtype(np_common_dtype)

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):

                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                # copy A first, then every array of B right after the previous one
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # pos is the next free slot in new_data
                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)

                return new_data

            return _append_single_string_array_impl
        elif (isinstance(B, (types.UniTuple, types.List)) and B.dtype == string_array_type):
            def _append_list_string_array_impl(A, B):
                # array_list is only used to compute the sizes for preallocation
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # pos is the next free slot in new_data
                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)

                return new_data

            return _append_list_string_array_impl
87 changes: 71 additions & 16 deletions hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@
from numba import types

import hpat
import hpat.datatypes.common_functions as common_functions
from hpat.hiframes.pd_series_ext import SeriesType
from hpat.str_arr_ext import StringArrayType
from hpat.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars)
from hpat.utils import to_array


class TypeChecker:
"""
Validate object type and raise TypingError if the type is invalid, e.g.:
Expand Down Expand Up @@ -753,37 +753,92 @@ def hpat_pandas_series_isin_impl(self, values):


@overload_method(SeriesType, 'append')
def hpat_pandas_series_append(self, to_append):
def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integrity=False):
"""
Pandas Series method :meth:`pandas.Series.append` implementation.
.. only:: developer
Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_append1
Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_append*
Parameters
-----------
to_append : :obj:`pandas.Series` object
input argument
ignore_index:
*unsupported*
verify_integrity:
*unsupported*
self: :obj:`pandas.Series`
input series
to_append : :obj:`pandas.Series` object or :obj:`list` or :obj:`tuple`
Series (or list or tuple of Series) to append with self
ignore_index: :obj:`bool`, default False
If True, do not use the index labels.
Supported as literal value only
verify_integrity: :obj:`bool`, default False
If True, raise Exception on creating index with duplicates.
*unsupported*
Returns
-------
:obj:`pandas.Series`
returns :obj:`pandas.Series` object
returns :obj:`pandas.Series` object
Concatenated Series
"""

_func_name = 'Method append().'

if not isinstance(self, SeriesType) or not isinstance(to_append, SeriesType):
if not isinstance(self, SeriesType):
raise TypingError(
'{} The object must be a pandas.series. Given self: {}'.format(_func_name, self))

if not (isinstance(to_append, SeriesType)
or (isinstance(to_append, (types.UniTuple, types.List)) and isinstance(to_append.dtype, SeriesType))):
raise TypingError(
'{} The argument must be a pandas.series or list/tuple of pandas.series. \
Given to_append: {}'.format(_func_name, to_append))

# currently we will always raise this in the end, i.e. if no impl was found
# TODO: find a way to stop compilation early and not proceed with unliteral step
if not (isinstance(ignore_index, types.Literal) and isinstance(ignore_index, types.Boolean)
or isinstance(ignore_index, types.Omitted)
or ignore_index is False):
raise TypingError(
'{} The ignore_index must be a literal Boolean constant. Given: {}'.format(_func_name, ignore_index))

if not (verify_integrity is False or isinstance(verify_integrity, types.Omitted)):
raise TypingError(
'{} The object must be a pandas.series. Given self: {}, to_append: {}'.format(_func_name, self, to_append))
'{} Unsupported parameters. Given verify_integrity: {}'.format(_func_name, verify_integrity))

# ignore_index value has to be known at compile time to select between implementations with different signatures
ignore_index_is_false = (common_functions.has_literal_value(ignore_index, False)
or common_functions.has_python_value(ignore_index, False)
or isinstance(ignore_index, types.Omitted))
to_append_is_series = isinstance(to_append, SeriesType)

if ignore_index_is_false:
def hpat_pandas_series_append_impl(self, to_append, ignore_index=False, verify_integrity=False):
if to_append_is_series == True: # noqa
new_data = common_functions.hpat_arrays_append(self._data, to_append._data)
new_index = common_functions.hpat_arrays_append(self.index, to_append.index)
else:
data_arrays_to_append = [series._data for series in to_append]
index_arrays_to_append = [series.index for series in to_append]
new_data = common_functions.hpat_arrays_append(self._data, data_arrays_to_append)
new_index = common_functions.hpat_arrays_append(self.index, index_arrays_to_append)

return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_impl

else:
def hpat_pandas_series_append_ignore_index_impl(self, to_append, ignore_index=False, verify_integrity=False):

if to_append_is_series == True: # noqa
new_data = common_functions.hpat_arrays_append(self._data, to_append._data)
else:
arrays_to_append = [series._data for series in to_append]
new_data = common_functions.hpat_arrays_append(self._data, arrays_to_append)

def hpat_pandas_series_append_impl(self, to_append):
return pandas.Series(self._data + to_append._data)
return pandas.Series(new_data, None)

return hpat_pandas_series_append_impl
return hpat_pandas_series_append_ignore_index_impl


@overload_method(SeriesType, 'copy')
Expand Down
9 changes: 7 additions & 2 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

import numba
from numba import ir, ir_utils
from numba import types, cgutils
from numba import numpy_support, types, cgutils
from numba.ir_utils import require, mk_unique_var
import numba.array_analysis
from numba.typing import signature
Expand All @@ -56,7 +56,12 @@
from hpat.utils import _numba_to_c_type_map, unliteral_all
from hpat.str_ext import string_type, list_string_array_type
from hpat.set_ext import build_set
from hpat.str_arr_ext import (StringArrayType, string_array_type, is_str_arr_typ)
from hpat.str_arr_ext import (
StringArrayType,
string_array_type,
is_str_arr_typ,
num_total_chars,
append_string_array_to)
from hpat.hiframes.pd_timestamp_ext import (pandas_timestamp_type, datetime_date_type, set_df_datetime_date_lower)
from hpat.hiframes.pd_series_ext import (
SeriesType,
Expand Down
2 changes: 1 addition & 1 deletion hpat/hiframes/pd_series_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,7 +1288,7 @@ def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, c

'''' use binop here as otherwise Numba's dead branch pruning doesn't work
TODO: replace with 'if not is_index_none' when resolved '''
if is_index_none == False:
if is_index_none == False: # noqa
fix_index = hpat.hiframes.api.fix_df_array(index)
else:
fix_index = index
Expand Down
14 changes: 14 additions & 0 deletions hpat/str_arr_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1419,3 +1419,17 @@ def lower_glob(context, builder, sig, args):
# context.nrt.decref(builder, ty, ret)

return impl_ret_new_ref(context, builder, typ, ret)


@numba.njit(no_cpython_wrapper=True)
def append_string_array_to(result, pos, A):
    # Copies every element of string array A into result, starting at index pos,
    # and marks as NA in result the elements that are NA in A.
    # Returns the number of elements copied.
    # precondition: result is allocated with the size enough to contain A
    copied, dest = 0, pos
    for item in A:
        result[dest] = item
        # assigning the item alone does not carry the NA flag over, so set it explicitly
        if str_arr_is_na(A, copied):
            hpat.str_arr_ext.str_arr_set_na(result, dest)
        copied += 1
        dest += 1

    return copied

0 comments on commit b857c10

Please sign in to comment.