Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Fix new-style implementation of Series.append() method #262

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
133 changes: 133 additions & 0 deletions hpat/datatypes/common_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# *****************************************************************************
# Copyright (c) 2019, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

"""

| This file contains internal common functions used in SDC implementation across different files

"""

import numpy

from numba import types
from numba.extending import overload
from numba import numpy_support

import hpat
from hpat.str_arr_ext import (string_array_type, num_total_chars, append_string_array_to)


def has_literal_value(var, value):
    '''Used during typing to check that variable var is a Numba literal value equal to value.

    Parameters
    ----------
    var
        Object received by the typing code (expected to be a Numba type).
    value
        Python value the literal is expected to carry.

    Returns
    -------
    bool
        True only if var is an instance of types.Literal whose literal_value matches value,
        False otherwise (including when var is not a literal at all).
    '''

    if not isinstance(var, types.Literal):
        return False

    # None and booleans are compared by identity, so that e.g. an integer literal 0
    # is not reported as equal to False (0 == False is True in Python).
    # NOTE: the check must be isinstance(value, bool); type(bool) is `type`, which would
    # only match when value itself is a class.
    if value is None or isinstance(value, bool):
        return var.literal_value is value

    return var.literal_value == value


def has_python_value(var, value):
    '''Used during typing to check that variable var was resolved as Python type and has specific value.

    Parameters
    ----------
    var
        Object received by the typing code.
    value
        Python value var is compared against.

    Returns
    -------
    bool
        True only if var is an instance of type(value) and matches value,
        False otherwise.
    '''

    if not isinstance(var, type(value)):
        return False

    # None and booleans are singletons and are compared by identity.
    # NOTE: the check must be isinstance(value, bool); type(bool) is `type`, which would
    # only match when value itself is a class.
    if value is None or isinstance(value, bool):
        return var is value

    return var == value


def hpat_arrays_append(A, B):
    '''Stub for appending B (an array, or a list/tuple of arrays) to the array A.

    The body is intentionally empty. The implementations are supplied by
    hpat_arrays_append_overload, which is registered on this function with @overload,
    so that the same append operation gets different overloads for the various
    underlying data types (numpy arrays, string arrays).
    '''


@overload(hpat_arrays_append)
def hpat_arrays_append_overload(A, B):
    '''Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A.

    Parameters
    ----------
    A
        Numba type of the array being extended: a types.Array or string_array_type.
    B
        Numba type of the appended data: the same kind of array as A, or a
        types.UniTuple / types.List whose elements are such arrays.

    Returns
    -------
    function or None
        The implementation matching the given types, or None when the combination
        of A and B is not supported.
    '''

    B_is_sequence = isinstance(B, (types.UniTuple, types.List))

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            def _append_single_numeric_impl(A, B):
                return numpy.concatenate((A, B,))

            return _append_single_numeric_impl

        # Check the element type before reading B.dtype.dtype: a list/tuple of anything
        # other than arrays must fall through to "no implementation" instead of raising
        # AttributeError in the middle of typing.
        if B_is_sequence and isinstance(B.dtype, types.Array):
            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            np_dtypes = [numpy_support.as_dtype(A.dtype), numpy_support.as_dtype(B.dtype.dtype)]
            np_common_dtype = numpy.find_common_type([], np_dtypes)
            numba_common_dtype = numpy_support.from_dtype(np_common_dtype)

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):

                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                # copy A first, then every array of B right after the previous one
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

    elif A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # append_string_array_to returns the number of elements it has written
                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)

                return new_data

            return _append_single_string_array_impl

        if B_is_sequence and B.dtype == string_array_type:
            def _append_list_string_array_impl(A, B):
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = hpat.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                # append_string_array_to returns the number of elements it has written
                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)

                return new_data

            return _append_list_string_array_impl

    # unsupported combination of argument types: no implementation is provided
    return None
87 changes: 71 additions & 16 deletions hpat/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@
from numba import types

import hpat
import hpat.datatypes.common_functions as common_functions
from hpat.hiframes.pd_series_ext import SeriesType
from hpat.str_arr_ext import StringArrayType
from hpat.str_arr_ext import (StringArrayType, cp_str_list_to_array, num_total_chars)
from hpat.utils import to_array


class TypeChecker:
"""
Validate object type and raise TypingError if the type is invalid, e.g.:
Expand Down Expand Up @@ -753,37 +753,92 @@ def hpat_pandas_series_isin_impl(self, values):


@overload_method(SeriesType, 'append')
def hpat_pandas_series_append(self, to_append):
def hpat_pandas_series_append(self, to_append, ignore_index=False, verify_integrity=False):
"""
Pandas Series method :meth:`pandas.Series.append` implementation.

.. only:: developer

Test: python -m hpat.runtests hpat.tests.test_series.TestSeries.test_series_append1
Test: python -m hpat.runtests -k hpat.tests.test_series.TestSeries.test_series_append*

Parameters
-----------
to_append : :obj:`pandas.Series` object
input argument
ignore_index:
*unsupported*
verify_integrity:
*unsupported*
self: :obj:`pandas.Series`
input series
to_append : :obj:`pandas.Series` object or :obj:`list` or :obj:`tuple`
Series (or list or tuple of Series) to append with self
ignore_index: :obj:`bool`, default False
If True, do not use the index labels.
Supported as literal value only
verify_integrity: :obj:`bool`, default False
If True, raise Exception on creating index with duplicates.
*unsupported*

Returns
-------
:obj:`pandas.Series`
returns :obj:`pandas.Series` object
returns :obj:`pandas.Series` object
Concatenated Series

"""

_func_name = 'Method append().'

if not isinstance(self, SeriesType) or not isinstance(to_append, SeriesType):
if not isinstance(self, SeriesType):
raise TypingError(
'{} The object must be a pandas.series. Given self: {}'.format(_func_name, self))

if not (isinstance(to_append, SeriesType)
or (isinstance(to_append, (types.UniTuple, types.List)) and isinstance(to_append.dtype, SeriesType))):
raise TypingError(
'{} The argument must be a pandas.series or list/tuple of pandas.series. \
Given to_append: {}'.format(_func_name, to_append))

# currently we will always raise this in the end, i.e. if no impl was found
# TODO: find a way to stop compilation early and not proceed with unliteral step
if not (isinstance(ignore_index, types.Literal) and isinstance(ignore_index, types.Boolean)
or isinstance(ignore_index, types.Omitted)
or ignore_index is False):
raise TypingError(
'{} The ignore_index must be a literal Boolean constant. Given: {}'.format(_func_name, ignore_index))

if not (verify_integrity is False or isinstance(verify_integrity, types.Omitted)):
raise TypingError(
'{} The object must be a pandas.series. Given self: {}, to_append: {}'.format(_func_name, self, to_append))
'{} Unsupported parameters. Given verify_integrity: {}'.format(_func_name, verify_integrity))

# ignore_index value has to be known at compile time to select between implementations with different signatures
ignore_index_is_false = (common_functions.has_literal_value(ignore_index, False)
or common_functions.has_python_value(ignore_index, False)
or isinstance(ignore_index, types.Omitted))
to_append_is_series = isinstance(to_append, SeriesType)

if ignore_index_is_false:
def hpat_pandas_series_append_impl(self, to_append, ignore_index=False, verify_integrity=False):
if to_append_is_series == True: # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix the style according to PEP 8.

new_data = common_functions.hpat_arrays_append(self._data, to_append._data)
new_index = common_functions.hpat_arrays_append(self.index, to_append.index)
else:
data_arrays_to_append = [series._data for series in to_append]
index_arrays_to_append = [series.index for series in to_append]
new_data = common_functions.hpat_arrays_append(self._data, data_arrays_to_append)
new_index = common_functions.hpat_arrays_append(self.index, index_arrays_to_append)

return pandas.Series(new_data, new_index)

return hpat_pandas_series_append_impl

else:
def hpat_pandas_series_append_ignore_index_impl(self, to_append, ignore_index=False, verify_integrity=False):

if to_append_is_series == True: # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix the style according to PEP 8.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shssf It's not possible - with "if to_append_is_series" Numba will fail to compile, as dead branch pruning won't eliminate the dead branch (it's similar as in
https://github.com/kozlov-alexey/hpat/blob/45c06272f576ce53dc76038da08d59338770b39a/hpat/hiframes/pd_series_ext.py#L1278).

new_data = common_functions.hpat_arrays_append(self._data, to_append._data)
else:
arrays_to_append = [series._data for series in to_append]
new_data = common_functions.hpat_arrays_append(self._data, arrays_to_append)

def hpat_pandas_series_append_impl(self, to_append):
return pandas.Series(self._data + to_append._data)
return pandas.Series(new_data, None)

return hpat_pandas_series_append_impl
return hpat_pandas_series_append_ignore_index_impl


@overload_method(SeriesType, 'copy')
Expand Down
9 changes: 7 additions & 2 deletions hpat/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

import numba
from numba import ir, ir_utils
from numba import types, cgutils
from numba import numpy_support, types, cgutils
from numba.ir_utils import require, mk_unique_var
import numba.array_analysis
from numba.typing import signature
Expand All @@ -56,7 +56,12 @@
from hpat.utils import _numba_to_c_type_map, unliteral_all
from hpat.str_ext import string_type, list_string_array_type
from hpat.set_ext import build_set
from hpat.str_arr_ext import (StringArrayType, string_array_type, is_str_arr_typ)
from hpat.str_arr_ext import (
StringArrayType,
string_array_type,
is_str_arr_typ,
num_total_chars,
append_string_array_to)
from hpat.hiframes.pd_timestamp_ext import (pandas_timestamp_type, datetime_date_type, set_df_datetime_date_lower)
from hpat.hiframes.pd_series_ext import (
SeriesType,
Expand Down
2 changes: 1 addition & 1 deletion hpat/hiframes/pd_series_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,7 +1288,7 @@ def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, c

'''' use binop here as otherwise Numba's dead branch pruning doesn't work
TODO: replace with 'if not is_index_none' when resolved '''
if is_index_none == False:
if is_index_none == False: # noqa
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fix it according to PEP 8.

fix_index = hpat.hiframes.api.fix_df_array(index)
else:
fix_index = index
Expand Down
14 changes: 14 additions & 0 deletions hpat/str_arr_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1419,3 +1419,17 @@ def lower_glob(context, builder, sig, args):
# context.nrt.decref(builder, ty, ret)

return impl_ret_new_ref(context, builder, typ, ret)


@numba.njit(no_cpython_wrapper=True)
def append_string_array_to(result, pos, A):
    '''Copy all elements of string array A into result, starting at position pos.

    Elements flagged as NA in A are flagged as NA in result as well.

    Precondition: result is allocated with the size enough to contain A.

    Parameters
    ----------
    result
        Destination string array, written in place.
    pos
        Index in result at which the first element of A is stored.
    A
        Source string array.

    Returns
    -------
    Number of elements copied from A.
    '''
    copied = 0
    for item in A:
        dst = pos + copied
        result[dst] = item
        # the NA flag is set after the assignment, on the element just written
        if str_arr_is_na(A, copied):
            hpat.str_arr_ext.str_arr_set_na(result, dst)
        copied += 1

    return copied