From f405df1018e4e63f3d0675bf794631252f8f6656 Mon Sep 17 00:00:00 2001 From: akharche Date: Mon, 9 Dec 2019 19:08:20 +0300 Subject: [PATCH 01/12] DataFrame.append base implementation --- examples/dataframe_append.py | 46 +++++++ .../hpat_pandas_dataframe_functions.py | 112 +++++++++++++++++- sdc/hiframes/pd_dataframe_ext.py | 33 +++--- sdc/tests/test_dataframe.py | 8 +- 4 files changed, 179 insertions(+), 20 deletions(-) create mode 100644 examples/dataframe_append.py diff --git a/examples/dataframe_append.py b/examples/dataframe_append.py new file mode 100644 index 000000000..9f04ac124 --- /dev/null +++ b/examples/dataframe_append.py @@ -0,0 +1,46 @@ +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd +from numba import njit + + +@njit +def dataframe_append(): + df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) + df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB')) + result = df.append(df2) + + return result + # Expect dataframe: + # A B + # 0 1 2 + # 1 3 4 + # 0 5 6 + # 1 7 8 + + +print(dataframe_append()) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index b17ceb47c..86e583f68 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -31,12 +31,121 @@ import operator import pandas +import numpy +import sdc from numba import types from numba.extending import (overload, overload_method, overload_attribute) +from sdc.hiframes.pd_dataframe_ext import DataFrameType +from sdc.datatypes.common_functions import TypeChecker from numba.errors import TypingError -from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType + + +@overload_method(DataFrameType, 'append') +def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None): + """ + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: pandas.DataFrame.append + + Examples + -------- + .. 
literalinclude:: ../../../examples/dataframe_append.py + :language: python + :lines: 27- + :caption: Pad strings in the Series by prepending '0' characters + :name: ex_dataframe_append + .. code-block:: console + > python ./dataframe_append.py + A B + 0 1 2 + 1 3 4 + 0 5 6 + 1 7 8 + dtype: object + + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_append_df_no_index + Parameters + ----------- + df: :obj:`pandas.DataFrame` + input arg + other: :obj:`pandas.DataFrame` object or :obj:`pandas.Series` or :obj:`dict` + The data to append + ignore_index: :obj:`bool` + *unsupported* + verify_integrity: :obj:`bool` + *unsupported* + sort: :obj:`bool` + *unsupported* + + Returns + ------- + :obj: `pandas.DataFrame` + return DataFrame with appended rows to the end + """ + + name = 'append' + + ty_checker = TypeChecker('Method {}().'.format(name)) + ty_checker.check(df, DataFrameType) + # TODO: support other array-like types + ty_checker.check(other, DataFrameType) + # TODO: support index in series from df-columns + if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)) and not ignore_index: + ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index') + + if not isinstance(verify_integrity, (bool, types.Boolean, types.Omitted)) and verify_integrity: + ty_checker.raise_exc(verify_integrity, 'boolean', 'verify_integrity') + + if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and verify_integrity: + ty_checker.raise_exc(verify_integrity, 'boolean', 'sort') + + args = (('other', other), ('ignore_index', ignore_index), ('verify_integrity', False), ('sort', None)) + + def sdc_pandas_dataframe_append_impl(df, name, args): + df_columns = df.columns + n_cols = len(df_columns) + data_args = tuple('data{}'.format(i) for i in 
range(n_cols)) + func_args = ['df', 'other'] + + for key, value in args: + #TODO: improve check + if key not in func_args: + if isinstance(value, types.Literal): + value = value.literal_value + func_args.append('{}={}'.format(key, value)) + + func_definition = 'def sdc_pandas_dataframe_{}_impl({}):'.format(name, ', '.join(func_args)) + func_lines = [func_definition] + for i, d in enumerate(data_args): + line = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' + line2 = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(other, {}))' + func_lines.append(line.format(d + '_S', i)) + func_lines.append(line2.format('to_append_{}'.format(i) + '_S', i)) + func_lines.append( + ' {} = {}.{}({})._data'.format(d + '_O', d + '_S', name, 'to_append_{}'.format(i) + '_S')) + data = ", ".join(d + '_O' for d in data_args) + # TODO: Handle index + index = None + col_names = ", ".join("'{}'".format(c) for c in df_columns) + func_lines.append(" return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, {}, {})\n".format( + data, + index, + col_names)) + loc_vars = {} + func_text = '\n'.join(func_lines) + + exec(func_text, {'sdc': sdc, 'np': numpy}, loc_vars) + _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] + + return _append_impl + + return sdc_pandas_dataframe_append_impl(df, name, args) @overload_method(DataFrameType, 'count') @@ -92,3 +201,4 @@ def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False return pandas.Series(data=result_data, index=result_index) return sdc_pandas_dataframe_count_impl + diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index 61e988095..d3c975079 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -1240,21 +1240,21 @@ def lower_isin_dummy(context, builder, sig, args): return out_obj._getvalue() -@overload_method(DataFrameType, 'append') -def append_overload(df, other, 
ignore_index=False, verify_integrity=False, - sort=None): - if isinstance(other, DataFrameType): - return (lambda df, other, ignore_index=False, verify_integrity=False, - sort=None: pd.concat((df, other))) - - # TODO: tuple case - # TODO: non-homogenous build_list case - if isinstance(other, types.List) and isinstance(other.dtype, DataFrameType): - return (lambda df, other, ignore_index=False, verify_integrity=False, - sort=None: pd.concat([df] + other)) - - raise ValueError("invalid df.append() input. Only dataframe and list" - " of dataframes supported") +# @overload_method(DataFrameType, 'append') +# def append_overload(df, other, ignore_index=False, verify_integrity=False, +# sort=None): +# if isinstance(other, DataFrameType): +# return (lambda df, other, ignore_index=False, verify_integrity=False, +# sort=None: pd.concat((df, other))) +# +# # TODO: tuple case +# # TODO: non-homogenous build_list case +# if isinstance(other, types.List) and isinstance(other.dtype, DataFrameType): +# return (lambda df, other, ignore_index=False, verify_integrity=False, +# sort=None: pd.concat([df] + other)) +# +# raise ValueError("invalid df.append() input. 
Only dataframe and list" +# " of dataframes supported") @overload_method(DataFrameType, 'pct_change') @@ -1636,3 +1636,6 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None, date_format, doublequote, escapechar, decimal) return _impl + + +from sdc.datatypes.hpat_pandas_dataframe_functions import * \ No newline at end of file diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index bba797cfa..d20aefd26 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1040,20 +1040,20 @@ def test_impl(df): df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) - @skip_numba_jit - def test_append1(self): + + def test_append_df_no_index(self): def test_impl(df, df2): return df.append(df2, ignore_index=True) hpat_func = self.jit(test_impl) n = 11 df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) - df2 = pd.DataFrame({'A': np.arange(n), 'C': np.arange(n)**2}) + df2 = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) df2.A[n // 2:] = n pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) @skip_numba_jit - def test_append2(self): + def test_append_no_index(self): def test_impl(df, df2, df3): return df.append([df2, df3], ignore_index=True) From 82188fc2696c966e35ac72e9487e5e8cae0e8b96 Mon Sep 17 00:00:00 2001 From: akharche Date: Mon, 16 Dec 2019 14:16:59 +0300 Subject: [PATCH 02/12] Added functionality for appending columns with different names --- examples/dataframe/dataframe_append.py | 47 +++++++++ sdc/datatypes/common_functions.py | 14 ++- .../hpat_pandas_dataframe_functions.py | 95 +++++++++++++------ sdc/hiframes/dataframe_pass.py | 24 ++--- sdc/hiframes/pd_series_ext.py | 44 ++++----- sdc/tests/test_dataframe.py | 41 +++++++- 6 files changed, 200 insertions(+), 65 deletions(-) create mode 100644 examples/dataframe/dataframe_append.py diff --git a/examples/dataframe/dataframe_append.py 
b/examples/dataframe/dataframe_append.py new file mode 100644 index 000000000..c55aa26ca --- /dev/null +++ b/examples/dataframe/dataframe_append.py @@ -0,0 +1,47 @@ +# ***************************************************************************** +# Copyright (c) 2019, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import pandas as pd +from numba import njit + + +@njit +def dataframe_append(): + # Concat dfs with the same column names + df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) + df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB')) + result1 = df.append(df2) + + # Concat dfs with the different column names + df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) + df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('CD')) + result2 = df.append(df2) + + return result1, result2 + + +print(dataframe_append()[0]) +print(dataframe_append()[1]) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index d359d2c97..4435f49ff 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -34,7 +34,7 @@ from numba import types from numba.errors import TypingError -from numba.extending import overload +from numba.extending import overload, register_jitable from numba import numpy_support import sdc @@ -181,3 +181,15 @@ def _append_list_string_array_impl(A, B): return new_data return _append_list_string_array_impl + + +@register_jitable +def fill_array(data, size, fill_value=numpy.nan, push_back=True): + """ + Fill array with given values to reach the size + """ + + if push_back: + return numpy.append(data, numpy.repeat(fill_value, size - data.size)) + + return numpy.append(numpy.repeat(fill_value, size - data.size), data) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 86e583f68..ac2fd996a 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -41,7 +41,6 @@ from numba.errors import TypingError - @overload_method(DataFrameType, 'append') def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None): """ @@ -54,7 +53,8 @@ def sdc_pandas_dataframe_append(df, other, 
ignore_index=True, verify_integrity=F .. literalinclude:: ../../../examples/dataframe_append.py :language: python :lines: 27- - :caption: Pad strings in the Series by prepending '0' characters + :caption: Appending rows of other to the end of caller, returning a new object. + Columns in other that are not in the caller are added as new columns. :name: ex_dataframe_append .. code-block:: console > python ./dataframe_append.py @@ -65,11 +65,26 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F 1 7 8 dtype: object + A B C D + 0 1.0 2.0 NaN NaN + 1 3.0 4.0 NaN NaN + 0 NaN NaN 5.0 6.0 + 1 NaN NaN 7.0 8.0 + dtype: object + + .. note:: + Parameter ignore_index, verify_integrity, sort are currently unsupported by Intel Scalable Dataframe Compiler + Currently only pandas.DataFrame is supported as "other" parameter + + .. seealso:: + :ref:`concat ` + General function to concatenate DataFrame or Series objects. + Intel Scalable Dataframe Compiler Developer Guide ************************************************* Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation. .. 
only:: developer - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_append_df_no_index + Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_append* Parameters ----------- df: :obj:`pandas.DataFrame` @@ -105,12 +120,10 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and verify_integrity: ty_checker.raise_exc(verify_integrity, 'boolean', 'sort') - args = (('other', other), ('ignore_index', ignore_index), ('verify_integrity', False), ('sort', None)) + args = (('ignore_index', ignore_index), ('verify_integrity', False), ('sort', None)) - def sdc_pandas_dataframe_append_impl(df, name, args): - df_columns = df.columns - n_cols = len(df_columns) - data_args = tuple('data{}'.format(i) for i in range(n_cols)) + def sdc_pandas_dataframe_append_impl(df, other, name, args): + spaces = 4 * ' ' func_args = ['df', 'other'] for key, value in args: @@ -120,32 +133,58 @@ def sdc_pandas_dataframe_append_impl(df, name, args): value = value.literal_value func_args.append('{}={}'.format(key, value)) - func_definition = 'def sdc_pandas_dataframe_{}_impl({}):'.format(name, ', '.join(func_args)) - func_lines = [func_definition] - for i, d in enumerate(data_args): - line = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' - line2 = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(other, {}))' - func_lines.append(line.format(d + '_S', i)) - func_lines.append(line2.format('to_append_{}'.format(i) + '_S', i)) - func_lines.append( - ' {} = {}.{}({})._data'.format(d + '_O', d + '_S', name, 'to_append_{}'.format(i) + '_S')) - data = ", ".join(d + '_O' for d in data_args) + df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} + other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} + + def get_dataframe_column(df, column, idx): + 
return f'new_col_{column}_data_{df} = get_dataframe_data({df}, {idx})' + + def get_append_result(df1, df2, column): + return f'new_col_{column} = init_series(new_col_{column}_data_{df1}).append(init_series(new_col_{column}_data_{df2}))._data' + + func_definition = [f'def sdc_pandas_dataframe_{name}_impl({", ".join(func_args)}):'] + func_text = [] + column_list = [] + + func_text.append(f'len_df = len(get_dataframe_data(df, {0}))') + func_text.append(f'len_other = len(get_dataframe_data(other, {0}))') + + for col_name, i in df_columns_indx.items(): + if col_name in other_columns_indx: + func_text.append(get_dataframe_column('df', col_name, i)) + func_text.append(get_dataframe_column('other', col_name, other_columns_indx.get(col_name))) + func_text.append(get_append_result('df', 'other', col_name)) + column_list.append((f'new_col_{col_name}', col_name)) + else: + func_text.append(get_dataframe_column('df', col_name, i)) + func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data') + func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') + column_list.append((f'new_col_{col_name}', col_name)) + + for col_name, i in other_columns_indx.items(): + if col_name not in df_columns_indx: + func_text.append(get_dataframe_column('other', col_name, i)) + func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') + func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') + column_list.append((f'new_col_{col_name}', col_name)) + + data = ', '.join(column for column, _ in column_list) # TODO: Handle index index = None - col_names = ", ".join("'{}'".format(c) for c in df_columns) - func_lines.append(" return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, {}, {})\n".format( - data, - index, - col_names)) - loc_vars = {} - func_text = '\n'.join(func_lines) + col_names = ', '.join(f"'{column_name}'" for _, column_name 
in column_list) + func_text.append(f"return sdc.hiframes.pd_dataframe_ext.init_dataframe({data}, {index}, {col_names})\n") - exec(func_text, {'sdc': sdc, 'np': numpy}, loc_vars) - _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] + func_definition.extend([spaces + func_line for func_line in func_text]) + + func_def = '\n'.join(func_definition) + loc_vars = {} + exec(func_def, {'sdc': sdc, 'np': numpy, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, + 'init_series': sdc.hiframes.api.init_series, 'fill_array': sdc.datatypes.common_functions.fill_array}, loc_vars) + _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] return _append_impl - return sdc_pandas_dataframe_append_impl(df, name, args) + return sdc_pandas_dataframe_append_impl(df, other, name, args) @overload_method(DataFrameType, 'count') diff --git a/sdc/hiframes/dataframe_pass.py b/sdc/hiframes/dataframe_pass.py index c080f29e7..bb3e45ce5 100644 --- a/sdc/hiframes/dataframe_pass.py +++ b/sdc/hiframes/dataframe_pass.py @@ -855,18 +855,18 @@ def _run_call_dataframe(self, assign, lhs, rhs, df_var, func_name): pysig=numba.utils.pysignature(stub), kws=dict(rhs.kws)) - if func_name == 'append': - rhs.args.insert(0, df_var) - arg_typs = tuple(self.state.typemap[v.name] for v in rhs.args) - kw_typs = {name: self.state.typemap[v.name] - for name, v in dict(rhs.kws).items()} - impl = sdc.hiframes.pd_dataframe_ext.append_overload( - *arg_typs, **kw_typs) - stub = (lambda df, other, ignore_index=False, - verify_integrity=False, sort=None: None) - return self._replace_func(impl, rhs.args, - pysig=numba.utils.pysignature(stub), - kws=dict(rhs.kws)) + # if func_name == 'append': + # rhs.args.insert(0, df_var) + # arg_typs = tuple(self.state.typemap[v.name] for v in rhs.args) + # kw_typs = {name: self.state.typemap[v.name] + # for name, v in dict(rhs.kws).items()} + # impl = sdc.hiframes.pd_dataframe_ext.append_overload( + # *arg_typs, **kw_typs) + # stub = (lambda df, other, 
ignore_index=False, + # verify_integrity=False, sort=None: None) + # return self._replace_func(impl, rhs.args, + # pysig=numba.utils.pysignature(stub), + # kws=dict(rhs.kws)) if func_name == 'pct_change': rhs.args.insert(0, df_var) diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index f023aea48..3abff299a 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -633,28 +633,28 @@ def resolve_corr(self, ary, args, kws): return self._resolve_cov_func(ary, args, kws) # PR135. This needs to be commented out - @bound_function("series.append") - def resolve_append(self, ary, args, kws): - # TODO: ignore_index - assert not kws - arr_typ = if_series_to_array_type(ary) - other, = args - if isinstance(other, (SeriesType, types.Array)): - all_arrs = types.Tuple((arr_typ, if_series_to_array_type(other))) - elif isinstance(other, types.BaseTuple): - all_arrs = types.Tuple((arr_typ, *[if_series_to_array_type(a) for a in other.types])) - elif isinstance(other, (types.List, types.Set)): - # add only one value from the list for typing since it shouldn't - # matter for np.concatenate typing - all_arrs = types.Tuple((arr_typ, if_series_to_array_type(other.dtype))) - else: - raise ValueError("Invalid input for Series.append (Series, or tuple/list of Series expected)") - - # TODO: list - # call np.concatenate to handle type promotion e.g. 
int, float -> float - ret_typ = self.context.resolve_function_type(np.concatenate, (all_arrs,), kws).return_type - ret_typ = if_arr_to_series_type(ret_typ) - return signature(ret_typ, *args) +# @bound_function("series.append") +# def resolve_append(self, ary, args, kws): +# # TODO: ignore_index +# assert not kws +# arr_typ = if_series_to_array_type(ary) +# other, = args +# if isinstance(other, (SeriesType, types.Array)): +# all_arrs = types.Tuple((arr_typ, if_series_to_array_type(other))) +# elif isinstance(other, types.BaseTuple): +# all_arrs = types.Tuple((arr_typ, *[if_series_to_array_type(a) for a in other.types])) +# elif isinstance(other, (types.List, types.Set)): +# # add only one value from the list for typing since it shouldn't +# # matter for np.concatenate typing +# all_arrs = types.Tuple((arr_typ, if_series_to_array_type(other.dtype))) +# else: +# raise ValueError("Invalid input for Series.append (Series, or tuple/list of Series expected)") +# +# # TODO: list +# # call np.concatenate to handle type promotion e.g. 
int, float -> float +# ret_typ = self.context.resolve_function_type(np.concatenate, (all_arrs,), kws).return_type +# ret_typ = if_arr_to_series_type(ret_typ) +# return signature(ret_typ, *args) # @bound_function("series.isna") # def resolve_isna(self, ary, args, kws): diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index d20aefd26..1c2521aec 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1040,8 +1040,7 @@ def test_impl(df): df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2}) pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) - - def test_append_df_no_index(self): + def test_append_df_same_cols_no_index(self): def test_impl(df, df2): return df.append(df2, ignore_index=True) @@ -1052,6 +1051,44 @@ def test_impl(df, df2): df2.A[n // 2:] = n pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) + def test_append_df_diff_cols_no_index(self): + def test_impl(df, df2): + return df.append(df2, ignore_index=True) + + hpat_func = self.jit(test_impl) + n1 = 11 + n2 = n1 * 2 + df = pd.DataFrame({'A': np.arange(n1), 'B': np.arange(n1)**2}) + df2 = pd.DataFrame({'C': np.arange(n2), 'D': np.arange(n2)**2, 'E': np.arange(n2) + 100}) + + pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) + + def test_append_df_cross_cols_no_index(self): + def test_impl(df, df2): + return df.append(df2, ignore_index=True) + + hpat_func = self.jit(test_impl) + n1 = 11 + n2 = n1 * 2 + df = pd.DataFrame({'A': np.arange(n1), 'B': np.arange(n1)**2}) + df2 = pd.DataFrame({'A': np.arange(n2), 'D': np.arange(n2)**2, 'E': np.arange(n2) + 100}) + + print(hpat_func(df, df2)) + print(test_impl(df, df2)) + + pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) + + def test_append_df_diff_types_no_index(self): + def test_impl(df, df2): + return df.append(df2, ignore_index=True) + + hpat_func = self.jit(test_impl) + + df = pd.DataFrame({'A': ['e', 'm', 'l'], 'B': [.2, .3, np.nan]}) + df2 = 
pd.DataFrame({'C': [5, 6, 7], 'A': ['a', 'b', 'c']}) + + pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) + @skip_numba_jit def test_append_no_index(self): def test_impl(df, df2, df3): From 7b2ffb61423c117abed798402c360423ad8b6d14 Mon Sep 17 00:00:00 2001 From: akharche Date: Mon, 16 Dec 2019 14:26:17 +0300 Subject: [PATCH 03/12] Delete duplicate --- examples/dataframe_append.py | 46 ------------------------------------ 1 file changed, 46 deletions(-) delete mode 100644 examples/dataframe_append.py diff --git a/examples/dataframe_append.py b/examples/dataframe_append.py deleted file mode 100644 index 9f04ac124..000000000 --- a/examples/dataframe_append.py +++ /dev/null @@ -1,46 +0,0 @@ -# ***************************************************************************** -# Copyright (c) 2019, Intel Corporation All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ***************************************************************************** - -import pandas as pd -from numba import njit - - -@njit -def dataframe_append(): - df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) - df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB')) - result = df.append(df2) - - return result - # Expect dataframe: - # A B - # 0 1 2 - # 1 3 4 - # 0 5 6 - # 1 7 8 - - -print(dataframe_append()) From e664723da4acede7df170eb27b97ed598d02b2c2 Mon Sep 17 00:00:00 2001 From: akharche Date: Wed, 18 Dec 2019 14:13:03 +0300 Subject: [PATCH 04/12] Handle StringArrayType --- examples/dataframe/dataframe_append.py | 23 ++++--- sdc/datatypes/common_functions.py | 40 +++++++++++ .../hpat_pandas_dataframe_functions.py | 66 ++++++++++--------- sdc/tests/test_dataframe.py | 9 +-- 4 files changed, 92 insertions(+), 46 deletions(-) diff --git a/examples/dataframe/dataframe_append.py b/examples/dataframe/dataframe_append.py index c55aa26ca..86228b947 100644 --- a/examples/dataframe/dataframe_append.py +++ b/examples/dataframe/dataframe_append.py @@ -30,18 +30,21 @@ @njit def dataframe_append(): - # Concat dfs with the same column names - df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) - df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB')) - result1 = df.append(df2) + """ + Expected result: + A B C + 0 1.0 2 NaN + 1 3.0 4 NaN + 2 NaN 5 6.0 + 3 NaN 7 8.0 + dtype: object + """ - # Concat dfs with the different column 
names df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB')) - df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('CD')) - result2 = df.append(df2) + df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('BC')) + result = df.append(df2) - return result1, result2 + return result -print(dataframe_append()[0]) -print(dataframe_append()[1]) +print(dataframe_append()) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index 772277776..1021b6013 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -201,6 +201,46 @@ def fill_array(data, size, fill_value=numpy.nan, push_back=True): return numpy.append(numpy.repeat(fill_value, size - data.size), data) +@register_jitable +def fill_str_array(data, size, push_back=True): + """ + Fill StringArrayType array with given values to reach the size + """ + + string_array_size = len(data) + none_array_size = size - string_array_size + num_chars = sdc.str_arr_ext.num_total_chars(data) + + result_data = sdc.str_arr_ext.pre_alloc_string_array(size, num_chars) + + # Keep NaN values of initial array + str_arr_is_na_mask = [] + for i in numba.prange(string_array_size): + if sdc.hiframes.api.isna(data, i): + str_arr_is_na_mask.append(i) + + str_arr_is_na_mask = numpy.array(str_arr_is_na_mask) + + if push_back: + str_arr_is_na_mask = numpy.append(str_arr_is_na_mask, numpy.arange(string_array_size, size)) + else: + # Make offset to push front + str_arr_is_na_mask = str_arr_is_na_mask + none_array_size + str_arr_is_na_mask = numpy.append(numpy.arange(none_array_size), str_arr_is_na_mask) + + data_str_list = sdc.str_arr_ext.to_string_list(data) + nan_list = ['' for _ in numba.prange(none_array_size)] + + result_list = data_str_list + nan_list if push_back else nan_list + data_str_list + + sdc.str_arr_ext.cp_str_list_to_array(result_data, result_list) + + for i in numba.prange(len(str_arr_is_na_mask)): + str_arr_set_na(result_data, str_arr_is_na_mask[i]) + + return result_data + + 
@numba.njit def _hpat_ensure_array_capacity(new_size, arr): """ Function ensuring that the size of numpy array is at least as specified diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 24cd44099..4e61abb69 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -39,6 +39,7 @@ from sdc.hiframes.pd_dataframe_ext import DataFrameType from sdc.datatypes.common_functions import TypeChecker from numba.errors import TypingError +from sdc.str_arr_ext import StringArrayType # from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType from sdc.utils import sdc_overload_method @@ -61,18 +62,11 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F :name: ex_dataframe_append .. code-block:: console > python ./dataframe_append.py - A B - 0 1 2 - 1 3 4 - 0 5 6 - 1 7 8 - dtype: object - - A B C D - 0 1.0 2.0 NaN NaN - 1 3.0 4.0 NaN NaN - 0 NaN NaN 5.0 6.0 - 1 NaN NaN 7.0 8.0 + A B C + 0 1.0 2 NaN + 1 3.0 4 NaN + 2 NaN 5 6.0 + 3 NaN 7 8.0 dtype: object .. 
note:: @@ -107,9 +101,9 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F return DataFrame with appended rows to the end """ - name = 'append' + _func_name = 'append' - ty_checker = TypeChecker('Method {}().'.format(name)) + ty_checker = TypeChecker(f'Method {_func_name}().') ty_checker.check(df, DataFrameType) # TODO: support other array-like types ty_checker.check(other, DataFrameType) @@ -120,13 +114,13 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F if not isinstance(verify_integrity, (bool, types.Boolean, types.Omitted)) and verify_integrity: ty_checker.raise_exc(verify_integrity, 'boolean', 'verify_integrity') - if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and verify_integrity: - ty_checker.raise_exc(verify_integrity, 'boolean', 'sort') + if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and sort is not None: + ty_checker.raise_exc(sort, 'boolean, None', 'sort') - args = (('ignore_index', ignore_index), ('verify_integrity', False), ('sort', None)) + args = (('ignore_index', True), ('verify_integrity', False), ('sort', None)) - def sdc_pandas_dataframe_append_impl(df, other, name, args): - spaces = 4 * ' ' + def sdc_pandas_dataframe_append_impl(df, other, _func_name, args): + indent = 4 * ' ' func_args = ['df', 'other'] for key, value in args: @@ -139,6 +133,14 @@ def sdc_pandas_dataframe_append_impl(df, other, name, args): df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} + # Keep columns that are StringArrayType + string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns) + if isinstance(typ, StringArrayType)) + + for typ, col_name in zip(other.data, other.columns): + if isinstance(typ, StringArrayType): + string_type_columns.add(col_name) + def get_dataframe_column(df, column, idx): return f'new_col_{column}_data_{df} = 
get_dataframe_data({df}, {idx})' @@ -146,7 +148,7 @@ def get_append_result(df1, df2, column): return f'new_col_{column} = ' \ f'init_series(new_col_{column}_data_{df1}).append(init_series(new_col_{column}_data_{df2}))._data' - func_definition = [f'def sdc_pandas_dataframe_{name}_impl({", ".join(func_args)}):'] + func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):'] func_text = [] column_list = [] @@ -154,23 +156,27 @@ def get_append_result(df1, df2, column): func_text.append(f'len_other = len(get_dataframe_data(other, {0}))') for col_name, i in df_columns_indx.items(): + func_text.append(get_dataframe_column('df', col_name, i)) if col_name in other_columns_indx: - func_text.append(get_dataframe_column('df', col_name, i)) func_text.append(get_dataframe_column('other', col_name, other_columns_indx.get(col_name))) func_text.append(get_append_result('df', 'other', col_name)) - column_list.append((f'new_col_{col_name}', col_name)) else: - func_text.append(get_dataframe_column('df', col_name, i)) func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data') - func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') - column_list.append((f'new_col_{col_name}', col_name)) + if col_name in string_type_columns: + func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)') + else: + func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') + column_list.append((f'new_col_{col_name}', col_name)) for col_name, i in other_columns_indx.items(): if col_name not in df_columns_indx: func_text.append(get_dataframe_column('other', col_name, i)) func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') - func_text.append(f'new_col_{col_name} = ' - f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') + if col_name in string_type_columns: + 
func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') + else: + func_text.append(f'new_col_{col_name} = ' + f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') column_list.append((f'new_col_{col_name}', col_name)) data = ', '.join(column for column, _ in column_list) @@ -179,18 +185,18 @@ def get_append_result(df1, df2, column): col_names = ', '.join(f"'{column_name}'" for _, column_name in column_list) func_text.append(f"return sdc.hiframes.pd_dataframe_ext.init_dataframe({data}, {index}, {col_names})\n") - func_definition.extend([spaces + func_line for func_line in func_text]) + func_definition.extend([indent + func_line for func_line in func_text]) func_def = '\n'.join(func_definition) loc_vars = {} exec(func_def, {'sdc': sdc, 'np': numpy, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, 'init_series': sdc.hiframes.api.init_series, - 'fill_array': sdc.datatypes.common_functions.fill_array}, loc_vars) + 'fill_array': sdc.datatypes.common_functions.fill_array, 'fill_str_array': sdc.datatypes.common_functions.fill_str_array}, loc_vars) _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] return _append_impl - return sdc_pandas_dataframe_append_impl(df, other, name, args) + return sdc_pandas_dataframe_append_impl(df, other, _func_name, args) @sdc_overload_method(DataFrameType, 'count') diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 35a51a6dc..e0bbdc790 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1074,9 +1074,6 @@ def test_impl(df, df2): df = pd.DataFrame({'A': np.arange(n1), 'B': np.arange(n1)**2}) df2 = pd.DataFrame({'A': np.arange(n2), 'D': np.arange(n2)**2, 'E': np.arange(n2) + 100}) - print(hpat_func(df, df2)) - print(test_impl(df, df2)) - pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) def test_append_df_diff_types_no_index(self): @@ -1085,12 +1082,12 @@ def 
test_impl(df, df2): hpat_func = self.jit(test_impl) - df = pd.DataFrame({'A': ['e', 'm', 'l'], 'B': [.2, .3, np.nan]}) - df2 = pd.DataFrame({'C': [5, 6, 7], 'A': ['a', 'b', 'c']}) + df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]}) + df2 = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': [5, 6, 7], 'D': ['a', np.nan, '']}) pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) - @skip_numba_jit + @unittest.skip('Unsupported functionality df.append([df2, df3]') def test_append_no_index(self): def test_impl(df, df2, df3): return df.append([df2, df3], ignore_index=True) From 661ad74f621fc4d6ecf80b1396addf4f245ba8ea Mon Sep 17 00:00:00 2001 From: akharche Date: Wed, 18 Dec 2019 14:25:06 +0300 Subject: [PATCH 05/12] Refactor --- sdc/datatypes/common_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index 1021b6013..d50eebb0f 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -229,7 +229,7 @@ def fill_str_array(data, size, push_back=True): str_arr_is_na_mask = numpy.append(numpy.arange(none_array_size), str_arr_is_na_mask) data_str_list = sdc.str_arr_ext.to_string_list(data) - nan_list = ['' for _ in numba.prange(none_array_size)] + nan_list = [''] * none_array_size result_list = data_str_list + nan_list if push_back else nan_list + data_str_list From a364c37efd4e5bff501f2ae57a672df757d55386 Mon Sep 17 00:00:00 2001 From: akharche Date: Fri, 20 Dec 2019 15:23:07 +0300 Subject: [PATCH 06/12] Refactoring --- sdc/datatypes/hpat_pandas_dataframe_functions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 4e61abb69..814ace192 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -41,7 +41,6 @@ from 
numba.errors import TypingError from sdc.str_arr_ext import StringArrayType -# from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType from sdc.utils import sdc_overload_method @@ -60,7 +59,9 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F :caption: Appending rows of other to the end of caller, returning a new object. Columns in other that are not in the caller are added as new columns. :name: ex_dataframe_append + .. code-block:: console + > python ./dataframe_append.py A B C 0 1.0 2 NaN @@ -82,6 +83,7 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation. .. only:: developer Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_append* + Parameters ----------- df: :obj:`pandas.DataFrame` @@ -145,8 +147,9 @@ def get_dataframe_column(df, column, idx): return f'new_col_{column}_data_{df} = get_dataframe_data({df}, {idx})' def get_append_result(df1, df2, column): - return f'new_col_{column} = ' \ - f'init_series(new_col_{column}_data_{df1}).append(init_series(new_col_{column}_data_{df2}))._data' + s1 = f'init_series(new_col_{column}_data_{df1})' + s2 = f'init_series(new_col_{column}_data_{df2})' + return f'new_col_{column} = {s1}.append({s2})._data' func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):'] func_text = [] From c360dd39f53b7f35fcff1266c14b8eb7966fc4c2 Mon Sep 17 00:00:00 2001 From: akharche Date: Tue, 24 Dec 2019 15:48:05 +0300 Subject: [PATCH 07/12] Separated codegen func+refactoring --- examples/dataframe/dataframe_append.py | 14 +- sdc/datatypes/common_functions.py | 32 ++- .../hpat_pandas_dataframe_functions.py | 189 ++++++++++-------- sdc/hiframes/dataframe_pass.py | 24 +-- sdc/hiframes/pd_dataframe_ext.py | 2 +- sdc/hiframes/pd_series_ext.py | 46 +++-- sdc/tests/test_dataframe.py | 2 +- 7 files changed, 170 insertions(+), 139 deletions(-) diff 
@njit
def dataframe_append():
    """Append a DataFrame with partially overlapping columns.

    Expected result (non-shared columns are filled with NaN):
         A  B    C
    0  1.0  3  NaN
    1  2.0  4  NaN
    2  NaN  5  7.0
    3  NaN  6  8.0
    """

    left = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    right = pd.DataFrame({'B': [5, 6], 'C': [7, 8]})
    return left.append(right)


print(dataframe_append())
def sdc_pandas_dataframe_append_codegen(df, other, _func_name, args):
    """
    Generate the source text of a DataFrame.append implementation
    specialized for the column layouts of `df` and `other`.

    Input:
    df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]})
    other = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': ['a', np.nan, '']})

    Func generated:
    def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integrity=False, sort=None):
        len_df = len(get_dataframe_data(df, 0))
        len_other = len(get_dataframe_data(other, 0))
        new_col_A_data_df = get_dataframe_data(df, 0)
        new_col_A_data_other = get_dataframe_data(other, 0)
        new_col_A = init_series(new_col_A_data_df).append(init_series(new_col_A_data_other))._data
        new_col_B_data_df = get_dataframe_data(df, 1)
        new_col_B_data = init_series(new_col_B_data_df)._data
        new_col_B = fill_array(new_col_B_data, len_df+len_other)
        new_col_C_data_other = get_dataframe_data(other, 1)
        new_col_C_data = init_series(new_col_C_data_other)._data
        new_col_C = fill_str_array(new_col_C_data, len_df+len_other, push_back=False)
        return init_dataframe(new_col_A, new_col_B, new_col_C, None, 'A', 'B', 'C')

    Returns the function source and the globals dict needed to exec() it.
    """
    indent = 4 * ' '
    func_args = ['df', 'other']

    # Render keyword parameters (name=default) into the generated signature.
    for key, value in args:
        # TODO: improve check
        if key not in func_args:
            if isinstance(value, types.Literal):
                value = value.literal_value
            func_args.append(f'{key}={value}')

    # Map column name -> positional index within each frame's data tuple.
    df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)}
    other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)}

    # Keep columns that are StringArrayType — they need fill_str_array
    # instead of the numeric fill_array when padded with NaN.
    string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns)
                              if isinstance(typ, StringArrayType))

    for typ, col_name in zip(other.data, other.columns):
        if isinstance(typ, StringArrayType):
            string_type_columns.add(col_name)

    # Emit a line extracting one column's underlying array.
    def get_dataframe_column(df, column, idx):
        return f'new_col_{column}_data_{df} = get_dataframe_data({df}, {idx})'

    # Emit a line appending one frame's column to the other's via Series.append.
    def get_append_result(df1, df2, column):
        s1 = f'init_series(new_col_{column}_data_{df1})'
        s2 = f'init_series(new_col_{column}_data_{df2})'
        return f'new_col_{column} = {s1}.append({s2})._data'

    func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):']
    func_text = []
    column_list = []

    func_text.append(f'len_df = len(get_dataframe_data(df, 0))')
    func_text.append(f'len_other = len(get_dataframe_data(other, 0))')

    # Columns present in df: shared columns are concatenated; df-only
    # columns are padded at the back with NaN up to the combined length.
    for col_name, i in df_columns_indx.items():
        func_text.append(get_dataframe_column('df', col_name, i))
        if col_name in other_columns_indx:
            func_text.append(get_dataframe_column('other', col_name, other_columns_indx.get(col_name)))
            func_text.append(get_append_result('df', 'other', col_name))
        else:
            func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data')
            if col_name in string_type_columns:
                func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)')
            else:
                func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)')
        column_list.append((f'new_col_{col_name}', col_name))

    # Columns present only in other: padded at the front (push_back=False),
    # since their values belong to the appended rows.
    for col_name, i in other_columns_indx.items():
        if col_name not in df_columns_indx:
            func_text.append(get_dataframe_column('other', col_name, i))
            func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data')
            if col_name in string_type_columns:
                func_text.append(
                    f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)')
            else:
                func_text.append(f'new_col_{col_name} = '
                                 f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)')
            column_list.append((f'new_col_{col_name}', col_name))

    data = ', '.join(column for column, _ in column_list)
    # TODO: Handle index
    index = None
    col_names = ', '.join(f"'{column_name}'" for _, column_name in column_list)
    func_text.append(f"return init_dataframe({data}, {index}, {col_names})\n")

    func_definition.extend([indent + func_line for func_line in func_text])

    func_def = '\n'.join(func_definition)

    # Globals required by the generated function when it is exec'ed.
    global_vars = {'sdc': sdc, 'np': numpy, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data,
                   'init_series': sdc.hiframes.api.init_series,
                   'fill_array': sdc.datatypes.common_functions.fill_array,
                   'fill_str_array': sdc.datatypes.common_functions.fill_str_array,
                   'init_dataframe': sdc.hiframes.pd_dataframe_ext.init_dataframe}

    return func_def, global_vars
Intel Scalable Dataframe Compiler Developer Guide @@ -122,80 +223,10 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F args = (('ignore_index', True), ('verify_integrity', False), ('sort', None)) def sdc_pandas_dataframe_append_impl(df, other, _func_name, args): - indent = 4 * ' ' - func_args = ['df', 'other'] - - for key, value in args: - # TODO: improve check - if key not in func_args: - if isinstance(value, types.Literal): - value = value.literal_value - func_args.append('{}={}'.format(key, value)) - - df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} - other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} - - # Keep columns that are StringArrayType - string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns) - if isinstance(typ, StringArrayType)) - - for typ, col_name in zip(other.data, other.columns): - if isinstance(typ, StringArrayType): - string_type_columns.add(col_name) - - def get_dataframe_column(df, column, idx): - return f'new_col_{column}_data_{df} = get_dataframe_data({df}, {idx})' - - def get_append_result(df1, df2, column): - s1 = f'init_series(new_col_{column}_data_{df1})' - s2 = f'init_series(new_col_{column}_data_{df2})' - return f'new_col_{column} = {s1}.append({s2})._data' - - func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):'] - func_text = [] - column_list = [] - - func_text.append(f'len_df = len(get_dataframe_data(df, {0}))') - func_text.append(f'len_other = len(get_dataframe_data(other, {0}))') - - for col_name, i in df_columns_indx.items(): - func_text.append(get_dataframe_column('df', col_name, i)) - if col_name in other_columns_indx: - func_text.append(get_dataframe_column('other', col_name, other_columns_indx.get(col_name))) - func_text.append(get_append_result('df', 'other', col_name)) - else: - func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data') - if 
col_name in string_type_columns: - func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)') - else: - func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') - column_list.append((f'new_col_{col_name}', col_name)) - - for col_name, i in other_columns_indx.items(): - if col_name not in df_columns_indx: - func_text.append(get_dataframe_column('other', col_name, i)) - func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') - if col_name in string_type_columns: - func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') - else: - func_text.append(f'new_col_{col_name} = ' - f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') - column_list.append((f'new_col_{col_name}', col_name)) - - data = ', '.join(column for column, _ in column_list) - # TODO: Handle index - index = None - col_names = ', '.join(f"'{column_name}'" for _, column_name in column_list) - func_text.append(f"return sdc.hiframes.pd_dataframe_ext.init_dataframe({data}, {index}, {col_names})\n") - - func_definition.extend([indent + func_line for func_line in func_text]) - - func_def = '\n'.join(func_definition) - loc_vars = {} - exec(func_def, {'sdc': sdc, 'np': numpy, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, - 'init_series': sdc.hiframes.api.init_series, - 'fill_array': sdc.datatypes.common_functions.fill_array, 'fill_str_array': sdc.datatypes.common_functions.fill_str_array}, loc_vars) + func_def, global_vars = sdc_pandas_dataframe_append_codegen(df, other, _func_name, args) + + exec(func_def, global_vars, loc_vars) _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] return _append_impl diff --git a/sdc/hiframes/dataframe_pass.py b/sdc/hiframes/dataframe_pass.py index bb3e45ce5..c080f29e7 100644 --- a/sdc/hiframes/dataframe_pass.py +++ b/sdc/hiframes/dataframe_pass.py 
@@ -855,18 +855,18 @@ def _run_call_dataframe(self, assign, lhs, rhs, df_var, func_name): pysig=numba.utils.pysignature(stub), kws=dict(rhs.kws)) - # if func_name == 'append': - # rhs.args.insert(0, df_var) - # arg_typs = tuple(self.state.typemap[v.name] for v in rhs.args) - # kw_typs = {name: self.state.typemap[v.name] - # for name, v in dict(rhs.kws).items()} - # impl = sdc.hiframes.pd_dataframe_ext.append_overload( - # *arg_typs, **kw_typs) - # stub = (lambda df, other, ignore_index=False, - # verify_integrity=False, sort=None: None) - # return self._replace_func(impl, rhs.args, - # pysig=numba.utils.pysignature(stub), - # kws=dict(rhs.kws)) + if func_name == 'append': + rhs.args.insert(0, df_var) + arg_typs = tuple(self.state.typemap[v.name] for v in rhs.args) + kw_typs = {name: self.state.typemap[v.name] + for name, v in dict(rhs.kws).items()} + impl = sdc.hiframes.pd_dataframe_ext.append_overload( + *arg_typs, **kw_typs) + stub = (lambda df, other, ignore_index=False, + verify_integrity=False, sort=None: None) + return self._replace_func(impl, rhs.args, + pysig=numba.utils.pysignature(stub), + kws=dict(rhs.kws)) if func_name == 'pct_change': rhs.args.insert(0, df_var) diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index ba9c5f455..c34ef98bb 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -1630,4 +1630,4 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None, return _impl -from sdc.datatypes.hpat_pandas_dataframe_functions import * \ No newline at end of file +from sdc.datatypes.hpat_pandas_dataframe_functions import * diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index ff38f8ce6..6262d8c8b 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -69,6 +69,7 @@ StringArrayType, GetItemStringArray) from sdc.str_ext import string_type, list_string_array_type +from sdc.config import config_pipeline_hpat_default class 
    if config_pipeline_hpat_default:
        @bound_function("series.append")
        def resolve_append(self, ary, args, kws):
            # Typing resolver for Series.append under the legacy HPAT pipeline:
            # computes the result Series type for append(other) where other is
            # a Series/array, a tuple of them, or a list/set of them.
            # TODO: ignore_index
            assert not kws
            arr_typ = if_series_to_array_type(ary)
            other, = args
            if isinstance(other, (SeriesType, types.Array)):
                all_arrs = types.Tuple((arr_typ, if_series_to_array_type(other)))
            elif isinstance(other, types.BaseTuple):
                all_arrs = types.Tuple((arr_typ, *[if_series_to_array_type(a) for a in other.types]))
            elif isinstance(other, (types.List, types.Set)):
                # add only one value from the list for typing since it shouldn't
                # matter for np.concatenate typing
                all_arrs = types.Tuple((arr_typ, if_series_to_array_type(other.dtype)))
            else:
                raise ValueError("Invalid input for Series.append (Series, or tuple/list of Series expected)")

            # TODO: list
            # call np.concatenate to handle type promotion e.g. int, float -> float
            ret_typ = self.context.resolve_function_type(np.concatenate, (all_arrs,), kws).return_type
            ret_typ = if_arr_to_series_type(ret_typ)
            return signature(ret_typ, *args)
@register_jitable
def fill_str_array(data, size, push_back=True):
    """
    Fill a StringArrayType array with NaN entries up to the requested size.

    Parameters
    ----------
    data: StringArrayType array with the original strings
    size: total length of the resulting array
    push_back: if True, NaNs are appended after the data;
               otherwise they are prepended before it

    Returns
    -------
    A new string array of length `size` whose padded positions are NaN and
    whose original NaN positions are preserved.
    """

    string_array_size = len(data)
    none_array_size = size - string_array_size
    num_chars = sdc.str_arr_ext.num_total_chars(data)

    result_data = sdc.str_arr_ext.pre_alloc_string_array(size, num_chars)

    # Keep NaN values of initial array
    arr_is_na_mask = numpy.array([sdc.hiframes.api.isna(data, i) for i in
                                  numba.prange(string_array_size)])
    data_str_list = sdc.str_arr_ext.to_string_list(data)
    nan_list = [''] * none_array_size

    result_list = data_str_list + nan_list if push_back else nan_list + data_str_list
    sdc.str_arr_ext.cp_str_list_to_array(result_data, result_list)

    # Batch=64 iteration to avoid threads competition.
    # Each prange iteration owns the disjoint index slice
    # [i*batch_size, min((i+1)*batch_size, limit)).  The previous code used
    # range(i, max(i + batch_size, limit)), which made every outer iteration
    # sweep almost the whole array (overlapping writes, quadratic work) and
    # ran past `size` in the padding loops.
    batch_size = 64
    if push_back:
        # Original strings occupy [0, string_array_size); restore their NaNs.
        for i in numba.prange(string_array_size // batch_size + 1):
            for j in range(i * batch_size, min((i + 1) * batch_size, string_array_size)):
                if arr_is_na_mask[j]:
                    str_arr_set_na(result_data, j)
        # Appended padding occupies [string_array_size, size); all NaN.
        for i in numba.prange(none_array_size // batch_size + 1):
            for j in range(string_array_size + i * batch_size,
                           min(string_array_size + (i + 1) * batch_size, size)):
                str_arr_set_na(result_data, j)
    else:
        # Prepended padding occupies [0, none_array_size); all NaN.
        for i in numba.prange(none_array_size // batch_size + 1):
            for j in range(i * batch_size, min((i + 1) * batch_size, none_array_size)):
                str_arr_set_na(result_data, j)
        # Original strings are shifted to [none_array_size, size).
        for i in numba.prange(string_array_size // batch_size + 1):
            for j in range(none_array_size + i * batch_size,
                           min(none_array_size + (i + 1) * batch_size, size)):
                if arr_is_na_mask[j - none_array_size]:
                    str_arr_set_na(result_data, j)

    return result_data
sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integrity=False, sort=None): - len_df = len(get_dataframe_data(df, 0)) - len_other = len(get_dataframe_data(other, 0)) - new_col_A_data_df = get_dataframe_data(df, 0) - new_col_A_data_other = get_dataframe_data(other, 0) - new_col_A = init_series(new_col_A_data_df).append(init_series(new_col_A_data_other))._data - new_col_B_data_df = get_dataframe_data(df, 1) - new_col_B_data = init_series(new_col_B_data_df)._data - new_col_B = fill_array(new_col_B_data, len_df+len_other) - new_col_C_data_other = get_dataframe_data(other, 1) - new_col_C_data = init_series(new_col_C_data_other)._data - new_col_C = fill_str_array(new_col_C_data, len_df+len_other, push_back=False) - return init_dataframe(new_col_A, new_col_B, new_col_C, None, 'A', 'B', 'C') - """ - indent = 4 * ' ' - func_args = ['df', 'other'] - - for key, value in args: - # TODO: improve check - if key not in func_args: - if isinstance(value, types.Literal): - value = value.literal_value - func_args.append(f'{key}={value}') - - df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} - other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} - - # Keep columns that are StringArrayType - string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns) - if isinstance(typ, StringArrayType)) - - for typ, col_name in zip(other.data, other.columns): - if isinstance(typ, StringArrayType): - string_type_columns.add(col_name) - - def get_dataframe_column(df, column, idx): - return f'new_col_{column}_data_{df} = get_dataframe_data({df}, {idx})' - - def get_append_result(df1, df2, column): - s1 = f'init_series(new_col_{column}_data_{df1})' - s2 = f'init_series(new_col_{column}_data_{df2})' - return f'new_col_{column} = {s1}.append({s2})._data' - - func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):'] - func_text = [] - column_list = [] - - func_text.append(f'len_df = 
len(get_dataframe_data(df, 0))') - func_text.append(f'len_other = len(get_dataframe_data(other, 0))') - - for col_name, i in df_columns_indx.items(): - func_text.append(get_dataframe_column('df', col_name, i)) - if col_name in other_columns_indx: - func_text.append(get_dataframe_column('other', col_name, other_columns_indx.get(col_name))) - func_text.append(get_append_result('df', 'other', col_name)) - else: - func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data') - if col_name in string_type_columns: - func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)') - else: - func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') - column_list.append((f'new_col_{col_name}', col_name)) - - for col_name, i in other_columns_indx.items(): - if col_name not in df_columns_indx: - func_text.append(get_dataframe_column('other', col_name, i)) - func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') - if col_name in string_type_columns: - func_text.append( - f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') +if not config_pipeline_hpat_default: + def sdc_pandas_dataframe_append_codegen(df, other, _func_name, args): + """ + Input: + df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]}) + other = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': ['a', np.nan, '']}) + + Func generated: + def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integrity=False, sort=None): + len_df = len(get_dataframe_data(df, 0)) + len_other = len(get_dataframe_data(other, 0)) + new_col_A_data_df = get_dataframe_data(df, 0) + new_col_A_data_other = get_dataframe_data(other, 0) + new_col_A = init_series(new_col_A_data_df).append(init_series(new_col_A_data_other))._data + new_col_B_data_df = get_dataframe_data(df, 1) + new_col_B_data = 
init_series(new_col_B_data_df)._data + new_col_B = fill_array(new_col_B_data, len_df+len_other) + new_col_C_data_other = get_dataframe_data(other, 1) + new_col_C_data = init_series(new_col_C_data_other)._data + new_col_C = fill_str_array(new_col_C_data, len_df+len_other, push_back=False) + return init_dataframe(new_col_A, new_col_B, new_col_C, None, 'A', 'B', 'C') + """ + indent = 4 * ' ' + func_args = ['df', 'other'] + + for key, value in args: + # TODO: improve check + if key not in func_args: + if isinstance(value, types.Literal): + value = value.literal_value + func_args.append(f'{key}={value}') + + df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} + other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} + + # Keep columns that are StringArrayType + string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns) + if isinstance(typ, StringArrayType)) + + for typ, col_name in zip(other.data, other.columns): + if isinstance(typ, StringArrayType): + string_type_columns.add(col_name) + + func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):'] + func_text = [] + column_list = [] + + func_text.append(f'len_df = len(get_dataframe_data(df, 0))') + func_text.append(f'len_other = len(get_dataframe_data(other, 0))') + + for col_name, i in df_columns_indx.items(): + func_text.append(f'new_col_{col_name}_data_{"df"} = get_dataframe_data({"df"}, {i})') + if col_name in other_columns_indx: + func_text.append(f'new_col_{col_name}_data_{"other"} = ' + f'get_dataframe_data({"other"}, {other_columns_indx.get(col_name)})') + s1 = f'init_series(new_col_{col_name}_data_{"df"})' + s2 = f'init_series(new_col_{col_name}_data_{"other"})' + func_text.append(f'new_col_{col_name} = {s1}.append({s2})._data') else: - func_text.append(f'new_col_{col_name} = ' - f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') + func_text.append(f'new_col_{col_name}_data = 
init_series(new_col_{col_name}_data_df)._data') + if col_name in string_type_columns: + func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)') + else: + func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') column_list.append((f'new_col_{col_name}', col_name)) - data = ', '.join(column for column, _ in column_list) - # TODO: Handle index - index = None - col_names = ', '.join(f"'{column_name}'" for _, column_name in column_list) - func_text.append(f"return init_dataframe({data}, {index}, {col_names})\n") - - func_definition.extend([indent + func_line for func_line in func_text]) - - func_def = '\n'.join(func_definition) - - global_vars = {'sdc': sdc, 'np': numpy, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, - 'init_series': sdc.hiframes.api.init_series, - 'fill_array': sdc.datatypes.common_functions.fill_array, - 'fill_str_array': sdc.datatypes.common_functions.fill_str_array, - 'init_dataframe': sdc.hiframes.pd_dataframe_ext.init_dataframe} - - return func_def, global_vars - - -@sdc_overload_method(DataFrameType, 'append') -def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None): - """ - Intel Scalable Dataframe Compiler User Guide - ******************************************** - Pandas API: pandas.DataFrame.append - - Examples - -------- - .. literalinclude:: ../../../examples/dataframe_append.py - :language: python - :lines: 27- - :caption: Appending rows of other to the end of caller, returning a new object. - Columns in other that are not in the caller are added as new columns. - :name: ex_dataframe_append - - .. code-block:: console - - > python ./dataframe_append.py - A B C - 0 1.0 3 NaN - 1 2.0 4 NaN - 2 NaN 5 7.0 - 3 NaN 6 8.0 - - .. 
note:: - Parameter ignore_index, verify_integrity, sort are currently unsupported by Intel Scalable Dataframe Compiler - Currently only pandas.DataFrame is supported as "other" parameter - - .. seealso:: - - `pandas.concat `_ - General function to concatenate DataFrame or Series objects. - - Intel Scalable Dataframe Compiler Developer Guide - ************************************************* - Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation. - .. only:: developer - Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_append* - - Parameters - ----------- - df: :obj:`pandas.DataFrame` - input arg - other: :obj:`pandas.DataFrame` object or :obj:`pandas.Series` or :obj:`dict` - The data to append - ignore_index: :obj:`bool` - *unsupported* - verify_integrity: :obj:`bool` - *unsupported* - sort: :obj:`bool` - *unsupported* - - Returns - ------- - :obj: `pandas.DataFrame` - return DataFrame with appended rows to the end - """ - - _func_name = 'append' - - ty_checker = TypeChecker(f'Method {_func_name}().') - ty_checker.check(df, DataFrameType) - # TODO: support other array-like types - ty_checker.check(other, DataFrameType) - # TODO: support index in series from df-columns - if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)) and not ignore_index: - ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index') - - if not isinstance(verify_integrity, (bool, types.Boolean, types.Omitted)) and verify_integrity: - ty_checker.raise_exc(verify_integrity, 'boolean', 'verify_integrity') - - if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and sort is not None: - ty_checker.raise_exc(sort, 'boolean, None', 'sort') - - args = (('ignore_index', True), ('verify_integrity', False), ('sort', None)) - - def sdc_pandas_dataframe_append_impl(df, other, _func_name, args): - loc_vars = {} - func_def, global_vars = sdc_pandas_dataframe_append_codegen(df, other, _func_name, args) - - exec(func_def, global_vars, 
loc_vars) - _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] - return _append_impl - - return sdc_pandas_dataframe_append_impl(df, other, _func_name, args) - - -@sdc_overload_method(DataFrameType, 'count') -def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): - """ - Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation. - - .. only:: developer - - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count - - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* - - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame. - """ - - _func_name = 'Method pandas.dataframe.count().' + for col_name, i in other_columns_indx.items(): + if col_name not in df_columns_indx: + func_text.append(f'new_col_{col_name}_data_{"other"} = get_dataframe_data({"other"}, {i})') + func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') + if col_name in string_type_columns: + func_text.append( + f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') + else: + func_text.append(f'new_col_{col_name} = ' + f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') + column_list.append((f'new_col_{col_name}', col_name)) + + data = ', '.join(column for column, _ in column_list) + # TODO: Handle index + index = None + col_names = ', '.join(f"'{column_name}'" for _, column_name in column_list) + func_text.append(f"return init_dataframe({data}, {index}, {col_names})\n") + func_definition.extend([indent + func_line for func_line in func_text]) + func_def = '\n'.join(func_definition) + + global_vars = {'sdc': sdc, 'np': numpy, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, + 'init_series': 
sdc.hiframes.api.init_series, + 'fill_array': sdc.datatypes.common_functions.fill_array, + 'fill_str_array': sdc.datatypes.common_functions.fill_str_array, + 'init_dataframe': sdc.hiframes.pd_dataframe_ext.init_dataframe} + + return func_def, global_vars + + + @sdc_overload_method(DataFrameType, 'append') + def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None): + """ + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: pandas.DataFrame.append + + Examples + -------- + .. literalinclude:: ../../../examples/dataframe_append.py + :language: python + :lines: 27- + :caption: Appending rows of other to the end of caller, returning a new object. + Columns in other that are not in the caller are added as new columns. + :name: ex_dataframe_append + + .. code-block:: console + + > python ./dataframe_append.py + A B C + 0 1.0 3 NaN + 1 2.0 4 NaN + 2 NaN 5 7.0 + 3 NaN 6 8.0 + + .. note:: + Parameter ignore_index, verify_integrity, sort are currently unsupported by Intel Scalable Dataframe Compiler + Currently only pandas.DataFrame is supported as "other" parameter + + .. seealso:: + + `pandas.concat `_ + General function to concatenate DataFrame or Series objects. + + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation. + .. 
only:: developer + Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_append* + + Parameters + ----------- + df: :obj:`pandas.DataFrame` + input arg + other: :obj:`pandas.DataFrame` object or :obj:`pandas.Series` or :obj:`dict` + The data to append + ignore_index: :obj:`bool` + *unsupported* + verify_integrity: :obj:`bool` + *unsupported* + sort: :obj:`bool` + *unsupported* + + Returns + ------- + :obj: `pandas.DataFrame` + return DataFrame with appended rows to the end + """ + + _func_name = 'append' + + ty_checker = TypeChecker(f'Method {_func_name}().') + ty_checker.check(df, DataFrameType) + # TODO: support other array-like types + ty_checker.check(other, DataFrameType) + # TODO: support index in series from df-columns + if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)) and not ignore_index: + ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index') + + if not isinstance(verify_integrity, (bool, types.Boolean, types.Omitted)) and verify_integrity: + ty_checker.raise_exc(verify_integrity, 'boolean', 'verify_integrity') + + if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and sort is not None: + ty_checker.raise_exc(sort, 'boolean, None', 'sort') + + args = (('ignore_index', True), ('verify_integrity', False), ('sort', None)) + + def sdc_pandas_dataframe_append_impl(df, other, _func_name, args): + loc_vars = {} + func_def, global_vars = sdc_pandas_dataframe_append_codegen(df, other, _func_name, args) + + exec(func_def, global_vars, loc_vars) + _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] + return _append_impl + + return sdc_pandas_dataframe_append_impl(df, other, _func_name, args) + + + @sdc_overload_method(DataFrameType, 'count') + def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation. + + .. 
only:: developer + + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count + + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + axis: + *unsupported* + level: + *unsupported* + numeric_only: + *unsupported* + + Returns + ------- + :obj:`pandas.Series` or `pandas.DataFrame` + returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame. + """ + + _func_name = 'Method pandas.dataframe.count().' - if not isinstance(self, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self)) - - if not (isinstance(axis, types.Omitted) or axis == 0): - raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis)) - - if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis)) + if not isinstance(self, DataFrameType): + raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self)) + + if not (isinstance(axis, types.Omitted) or axis == 0): + raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis)) + + if not (isinstance(level, types.Omitted) or level is None): + raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis)) - if not (isinstance(numeric_only, types.Omitted) or numeric_only is False): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, axis)) + if not (isinstance(numeric_only, types.Omitted) or numeric_only is False): + raise TypingError("{} 'numeric_only' unsupported. 
Given: {}".format(_func_name, axis)) - def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False): - result_data = [] - result_index = [] - - for dataframe_item in self._data: - item_count = dataframe_item.count() - item_name = dataframe_item._name - result_data.append(item_count) - result_index.append(item_name) - - return pandas.Series(data=result_data, index=result_index) + def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False): + result_data = [] + result_index = [] + + for dataframe_item in self._data: + item_count = dataframe_item.count() + item_name = dataframe_item._name + result_data.append(item_count) + result_index.append(item_name) + + return pandas.Series(data=result_data, index=result_index) - return sdc_pandas_dataframe_count_impl + return sdc_pandas_dataframe_count_impl diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index d0fa6a8e5..22022258d 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1153,8 +1153,8 @@ def test_impl(df, df2): hpat_func = self.jit(test_impl) - df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]}) - df2 = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': [5, 6, 7], 'D': ['a', np.nan, '']}) + df = pd.DataFrame({'A': ['cat', 'dog', np.nan] * 64, 'B': [.2, .3, np.nan] * 64}) + df2 = pd.DataFrame({'C': [5, 6, 7]*63, 'D': ['a', np.nan, '']*63}) pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) From f44d2b2cb118b894c9a9f32e8d55792db1d03b89 Mon Sep 17 00:00:00 2001 From: akharche Date: Wed, 25 Dec 2019 15:47:42 +0300 Subject: [PATCH 09/12] Style fixes --- .../hpat_pandas_dataframe_functions.py | 11 ++++--- sdc/hiframes/pd_dataframe_ext.py | 31 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 79f5b0dcd..63c3d228a 100644 --- 
a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -116,7 +116,8 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integr func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') if col_name in string_type_columns: func_text.append( - f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') + f'new_col_{col_name} = ' + f'fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') else: func_text.append(f'new_col_{col_name} = ' f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') @@ -138,7 +139,6 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integr return func_def, global_vars - @sdc_overload_method(DataFrameType, 'append') def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None): """ @@ -165,7 +165,8 @@ def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=F 3 NaN 6 8.0 .. note:: - Parameter ignore_index, verify_integrity, sort are currently unsupported by Intel Scalable Dataframe Compiler + Parameter ignore_index, verify_integrity, sort are currently unsupported + by Intel Scalable Dataframe Compiler Currently only pandas.DataFrame is supported as "other" parameter .. seealso:: @@ -226,7 +227,6 @@ def sdc_pandas_dataframe_append_impl(df, other, _func_name, args): return sdc_pandas_dataframe_append_impl(df, other, _func_name, args) - @sdc_overload_method(DataFrameType, 'count') def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): """ @@ -250,7 +250,8 @@ def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): Returns ------- :obj:`pandas.Series` or `pandas.DataFrame` - returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame. 
+ returns: For each column/row the number of non-NA/null entries. + If level is specified returns a DataFrame. """ _func_name = 'Method pandas.dataframe.count().' diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index c34ef98bb..d32ca68b0 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -1232,21 +1232,22 @@ def lower_isin_dummy(context, builder, sig, args): return out_obj._getvalue() -# @overload_method(DataFrameType, 'append') -# def append_overload(df, other, ignore_index=False, verify_integrity=False, -# sort=None): -# if isinstance(other, DataFrameType): -# return (lambda df, other, ignore_index=False, verify_integrity=False, -# sort=None: pd.concat((df, other))) -# -# # TODO: tuple case -# # TODO: non-homogenous build_list case -# if isinstance(other, types.List) and isinstance(other.dtype, DataFrameType): -# return (lambda df, other, ignore_index=False, verify_integrity=False, -# sort=None: pd.concat([df] + other)) -# -# raise ValueError("invalid df.append() input. Only dataframe and list" -# " of dataframes supported") +if sdc.config.config_pipeline_hpat_default: + @overload_method(DataFrameType, 'append') + def append_overload(df, other, ignore_index=False, verify_integrity=False, + sort=None): + if isinstance(other, DataFrameType): + return (lambda df, other, ignore_index=False, verify_integrity=False, + sort=None: pd.concat((df, other))) + + # TODO: tuple case + # TODO: non-homogenous build_list case + if isinstance(other, types.List) and isinstance(other.dtype, DataFrameType): + return (lambda df, other, ignore_index=False, verify_integrity=False, + sort=None: pd.concat([df] + other)) + + raise ValueError("invalid df.append() input. 
Only dataframe and list" + " of dataframes supported") @overload_method(DataFrameType, 'pct_change') From 65c28dc7fecacb1ab3b6f84c8bceb6a32b0417b6 Mon Sep 17 00:00:00 2001 From: akharche Date: Wed, 25 Dec 2019 17:55:00 +0300 Subject: [PATCH 10/12] Create df through rewrite --- sdc/datatypes/hpat_pandas_dataframe_functions.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 63c3d228a..e1b08ead9 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -123,19 +123,17 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integr f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') column_list.append((f'new_col_{col_name}', col_name)) - data = ', '.join(column for column, _ in column_list) + data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list) # TODO: Handle index - index = None - col_names = ', '.join(f"'{column_name}'" for _, column_name in column_list) - func_text.append(f"return init_dataframe({data}, {index}, {col_names})\n") + func_text.append(f"return pandas.DataFrame({{{data}}})\n") func_definition.extend([indent + func_line for func_line in func_text]) func_def = '\n'.join(func_definition) - global_vars = {'sdc': sdc, 'np': numpy, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, + global_vars = {'sdc': sdc, 'np': numpy, 'pandas': pandas, + 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, 'init_series': sdc.hiframes.api.init_series, 'fill_array': sdc.datatypes.common_functions.fill_array, - 'fill_str_array': sdc.datatypes.common_functions.fill_str_array, - 'init_dataframe': sdc.hiframes.pd_dataframe_ext.init_dataframe} + 'fill_str_array': sdc.datatypes.common_functions.fill_str_array} return func_def, global_vars From 
c8a6430538020b8ed8c8133162088eb8cc0f99f7 Mon Sep 17 00:00:00 2001 From: akharche Date: Thu, 9 Jan 2020 23:10:30 +0300 Subject: [PATCH 11/12] Fix appending nones to StringArrayType columns --- sdc/__init__.py | 1 + sdc/datatypes/common_functions.py | 32 +- .../hpat_pandas_dataframe_functions.py | 317 +++++++++--------- sdc/hiframes/pd_dataframe_ext.py | 4 - sdc/tests/test_dataframe.py | 5 +- 5 files changed, 176 insertions(+), 183 deletions(-) diff --git a/sdc/__init__.py b/sdc/__init__.py index 162ed61fd..37f2c1e35 100644 --- a/sdc/__init__.py +++ b/sdc/__init__.py @@ -63,6 +63,7 @@ import sdc.rewrites.dataframe_constructor import sdc.datatypes.hpat_pandas_functions + import sdc.datatypes.hpat_pandas_dataframe_functions else: import sdc.compiler diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index b0c17176a..f250d42f8 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -225,8 +225,7 @@ def fill_str_array(data, size, push_back=True): result_data = sdc.str_arr_ext.pre_alloc_string_array(size, num_chars) # Keep NaN values of initial array - arr_is_na_mask = numpy.array([sdc.hiframes.api.isna(data, i) for i in - numba.prange(string_array_size)]) + arr_is_na_mask = numpy.array([sdc.hiframes.api.isna(data, i) for i in range(string_array_size)]) data_str_list = sdc.str_arr_ext.to_string_list(data) nan_list = [''] * none_array_size @@ -235,23 +234,22 @@ def fill_str_array(data, size, push_back=True): # Batch=64 iteration to avoid threads competition batch_size = 64 + if push_back: - for i in numba.prange(string_array_size//batch_size + 1): - for j in range(i, max(i + batch_size, string_array_size)): - if arr_is_na_mask[j]: - str_arr_set_na(result_data, j) - for i in numba.prange(none_array_size//batch_size + 1): - for j in range(string_array_size, string_array_size + max(i + batch_size, size)): - str_arr_set_na(result_data, j) + string_array_shift = 0 + none_array_shift = string_array_size else: - 
for i in numba.prange(none_array_size//batch_size + 1): - for j in range(i, max(i + batch_size, none_array_size)): - str_arr_set_na(result_data, j) - for i in numba.prange(string_array_size//batch_size + 1): - for j in range(none_array_size, none_array_size + max(i + batch_size, size)): - off_set = j - none_array_size - if arr_is_na_mask[off_set]: - str_arr_set_na(result_data, j) + string_array_shift = none_array_size + none_array_shift = 0 + + for i in numba.prange(string_array_size//batch_size + 1): + for j in range(i*batch_size, min((i+1)*batch_size, string_array_size)): + if arr_is_na_mask[j]: + str_arr_set_na(result_data, string_array_shift + j) + + for i in numba.prange(none_array_size//batch_size + 1): + for j in range(i*batch_size, min((i+1)*batch_size, none_array_size)): + str_arr_set_na(result_data, none_array_shift + j) return result_data diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index f400ab2ac..21d311e54 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -51,177 +51,174 @@ from sdc.hiframes.pd_dataframe_ext import get_dataframe_data -if not config_pipeline_hpat_default: - def sdc_pandas_dataframe_append_codegen(df, other, _func_name, args): - """ - Input: - df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]}) - other = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': ['a', np.nan, '']}) - Func generated: - def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integrity=False, sort=None): - len_df = len(get_dataframe_data(df, 0)) - len_other = len(get_dataframe_data(other, 0)) - new_col_A_data_df = get_dataframe_data(df, 0) - new_col_A_data_other = get_dataframe_data(other, 0) - new_col_A = init_series(new_col_A_data_df).append(init_series(new_col_A_data_other))._data - new_col_B_data_df = get_dataframe_data(df, 1) - new_col_B_data = init_series(new_col_B_data_df)._data - 
new_col_B = fill_array(new_col_B_data, len_df+len_other) - new_col_C_data_other = get_dataframe_data(other, 1) - new_col_C_data = init_series(new_col_C_data_other)._data - new_col_C = fill_str_array(new_col_C_data, len_df+len_other, push_back=False) - return init_dataframe(new_col_A, new_col_B, new_col_C, None, 'A', 'B', 'C') - """ - indent = 4 * ' ' - func_args = ['df', 'other'] - - for key, value in args: - # TODO: improve check - if key not in func_args: - if isinstance(value, types.Literal): - value = value.literal_value - func_args.append(f'{key}={value}') - - df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} - other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} - - # Keep columns that are StringArrayType - string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns) - if isinstance(typ, StringArrayType)) - - for typ, col_name in zip(other.data, other.columns): - if isinstance(typ, StringArrayType): - string_type_columns.add(col_name) - - func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):'] - func_text = [] - column_list = [] - - func_text.append(f'len_df = len(get_dataframe_data(df, 0))') - func_text.append(f'len_other = len(get_dataframe_data(other, 0))') - - for col_name, i in df_columns_indx.items(): - func_text.append(f'new_col_{col_name}_data_{"df"} = get_dataframe_data({"df"}, {i})') - if col_name in other_columns_indx: - func_text.append(f'new_col_{col_name}_data_{"other"} = ' - f'get_dataframe_data({"other"}, {other_columns_indx.get(col_name)})') - s1 = f'init_series(new_col_{col_name}_data_{"df"})' - s2 = f'init_series(new_col_{col_name}_data_{"other"})' - func_text.append(f'new_col_{col_name} = {s1}.append({s2})._data') +def sdc_pandas_dataframe_append_codegen(df, other, _func_name, args): + """ + Input: + df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]}) + other = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': ['a', 
np.nan, '']}) + Func generated: + def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integrity=False, sort=None): + len_df = len(get_dataframe_data(df, 0)) + len_other = len(get_dataframe_data(other, 0)) + new_col_A_data_df = get_dataframe_data(df, 0) + new_col_A_data_other = get_dataframe_data(other, 0) + new_col_A = init_series(new_col_A_data_df).append(init_series(new_col_A_data_other))._data + new_col_B_data_df = get_dataframe_data(df, 1) + new_col_B_data = init_series(new_col_B_data_df)._data + new_col_B = fill_array(new_col_B_data, len_df+len_other) + new_col_C_data_other = get_dataframe_data(other, 1) + new_col_C_data = init_series(new_col_C_data_other)._data + new_col_C = fill_str_array(new_col_C_data, len_df+len_other, push_back=False) + return pandas.DataFrame({"A": new_col_A, "B": new_col_B, "C": new_col_C) + """ + indent = 4 * ' ' + func_args = ['df', 'other'] + + for key, value in args: + # TODO: improve check + if key not in func_args: + if isinstance(value, types.Literal): + value = value.literal_value + func_args.append(f'{key}={value}') + + df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} + other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} + + # Keep columns that are StringArrayType + string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns) + if isinstance(typ, StringArrayType)) + + for typ, col_name in zip(other.data, other.columns): + if isinstance(typ, StringArrayType): + string_type_columns.add(col_name) + + func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):'] + func_text = [] + column_list = [] + + func_text.append(f'len_df = len(get_dataframe_data(df, 0))') + func_text.append(f'len_other = len(get_dataframe_data(other, 0))') + + for col_name, i in df_columns_indx.items(): + func_text.append(f'new_col_{col_name}_data_{"df"} = get_dataframe_data({"df"}, {i})') + if col_name in other_columns_indx: + 
func_text.append(f'new_col_{col_name}_data_{"other"} = ' + f'get_dataframe_data({"other"}, {other_columns_indx.get(col_name)})') + s1 = f'init_series(new_col_{col_name}_data_{"df"})' + s2 = f'init_series(new_col_{col_name}_data_{"other"})' + func_text.append(f'new_col_{col_name} = {s1}.append({s2})._data') + else: + func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data') + if col_name in string_type_columns: + func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)') + else: + func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') + column_list.append((f'new_col_{col_name}', col_name)) + + for col_name, i in other_columns_indx.items(): + if col_name not in df_columns_indx: + func_text.append(f'new_col_{col_name}_data_{"other"} = get_dataframe_data({"other"}, {i})') + func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') + if col_name in string_type_columns: + func_text.append( + f'new_col_{col_name} = ' + f'fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') else: - func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data') - if col_name in string_type_columns: - func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)') - else: - func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)') + func_text.append(f'new_col_{col_name} = ' + f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') column_list.append((f'new_col_{col_name}', col_name)) - for col_name, i in other_columns_indx.items(): - if col_name not in df_columns_indx: - func_text.append(f'new_col_{col_name}_data_{"other"} = get_dataframe_data({"other"}, {i})') - func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data') - if col_name in string_type_columns: 
- func_text.append( - f'new_col_{col_name} = ' - f'fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') - else: - func_text.append(f'new_col_{col_name} = ' - f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)') - column_list.append((f'new_col_{col_name}', col_name)) - - data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list) - # TODO: Handle index - func_text.append(f"return pandas.DataFrame({{{data}}})\n") - func_definition.extend([indent + func_line for func_line in func_text]) - func_def = '\n'.join(func_definition) - - global_vars = {'sdc': sdc, 'np': numpy, 'pandas': pandas, - 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, - 'init_series': sdc.hiframes.api.init_series, - 'fill_array': sdc.datatypes.common_functions.fill_array, - 'fill_str_array': sdc.datatypes.common_functions.fill_str_array} - - return func_def, global_vars - - - @sdc_overload_method(DataFrameType, 'append') - def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None): - """ - Intel Scalable Dataframe Compiler User Guide - ******************************************** - Pandas API: pandas.DataFrame.append - Examples - -------- - .. literalinclude:: ../../../examples/dataframe_append.py - :language: python - :lines: 27- - :caption: Appending rows of other to the end of caller, returning a new object. - Columns in other that are not in the caller are added as new columns. - :name: ex_dataframe_append - .. code-block:: console - > python ./dataframe_append.py - A B C - 0 1.0 3 NaN - 1 2.0 4 NaN - 2 NaN 5 7.0 - 3 NaN 6 8.0 - .. note:: - Parameter ignore_index, verify_integrity, sort are currently unsupported - by Intel Scalable Dataframe Compiler - Currently only pandas.DataFrame is supported as "other" parameter - .. seealso:: - `pandas.concat `_ - General function to concatenate DataFrame or Series objects. 
- Intel Scalable Dataframe Compiler Developer Guide - ************************************************* - Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation. - .. only:: developer - Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_append* - Parameters - ----------- - df: :obj:`pandas.DataFrame` - input arg - other: :obj:`pandas.DataFrame` object or :obj:`pandas.Series` or :obj:`dict` - The data to append - ignore_index: :obj:`bool` - *unsupported* - verify_integrity: :obj:`bool` - *unsupported* - sort: :obj:`bool` - *unsupported* - Returns - ------- - :obj: `pandas.DataFrame` - return DataFrame with appended rows to the end - """ + data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list) + # TODO: Handle index + func_text.append(f"return pandas.DataFrame({{{data}}})\n") + func_definition.extend([indent + func_line for func_line in func_text]) + func_def = '\n'.join(func_definition) + + global_vars = {'sdc': sdc, 'np': numpy, 'pandas': pandas, + 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, + 'init_series': sdc.hiframes.api.init_series, + 'fill_array': sdc.datatypes.common_functions.fill_array, + 'fill_str_array': sdc.datatypes.common_functions.fill_str_array} + + return func_def, global_vars + + +@sdc_overload_method(DataFrameType, 'append') +def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None): + """ + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: pandas.DataFrame.append + Examples + -------- + .. literalinclude:: ../../../examples/dataframe_append.py + :language: python + :lines: 27- + :caption: Appending rows of other to the end of caller, returning a new object. + Columns in other that are not in the caller are added as new columns. + :name: ex_dataframe_append + + .. command-output:: python ./dataframe_append.py + :cwd: ../../../examples + + .. 
note:: + Parameter ignore_index, verify_integrity, sort are currently unsupported + by Intel Scalable Dataframe Compiler + Currently only pandas.DataFrame is supported as "other" parameter + + .. seealso:: + `pandas.concat `_ + General function to concatenate DataFrame or Series objects. + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation. + .. only:: developer + Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_append* + Parameters + ----------- + df: :obj:`pandas.DataFrame` + input arg + other: :obj:`pandas.DataFrame` object or :obj:`pandas.Series` or :obj:`dict` + The data to append + ignore_index: :obj:`bool` + *unsupported* + verify_integrity: :obj:`bool` + *unsupported* + sort: :obj:`bool` + *unsupported* + Returns + ------- + :obj: `pandas.DataFrame` + return DataFrame with appended rows to the end + """ - _func_name = 'append' + _func_name = 'append' - ty_checker = TypeChecker(f'Method {_func_name}().') - ty_checker.check(df, DataFrameType) - # TODO: support other array-like types - ty_checker.check(other, DataFrameType) - # TODO: support index in series from df-columns - if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)) and not ignore_index: - ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index') + ty_checker = TypeChecker(f'Method {_func_name}().') + ty_checker.check(df, DataFrameType) + # TODO: support other array-like types + ty_checker.check(other, DataFrameType) + # TODO: support index in series from df-columns + if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)) and not ignore_index: + ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index') - if not isinstance(verify_integrity, (bool, types.Boolean, types.Omitted)) and verify_integrity: - ty_checker.raise_exc(verify_integrity, 'boolean', 'verify_integrity') + if not isinstance(verify_integrity, 
(bool, types.Boolean, types.Omitted)) and verify_integrity: + ty_checker.raise_exc(verify_integrity, 'boolean', 'verify_integrity') - if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and sort is not None: - ty_checker.raise_exc(sort, 'boolean, None', 'sort') + if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and sort is not None: + ty_checker.raise_exc(sort, 'boolean, None', 'sort') - args = (('ignore_index', True), ('verify_integrity', False), ('sort', None)) + args = (('ignore_index', True), ('verify_integrity', False), ('sort', None)) - def sdc_pandas_dataframe_append_impl(df, other, _func_name, args): - loc_vars = {} - func_def, global_vars = sdc_pandas_dataframe_append_codegen(df, other, _func_name, args) + def sdc_pandas_dataframe_append_impl(df, other, _func_name, args): + loc_vars = {} + func_def, global_vars = sdc_pandas_dataframe_append_codegen(df, other, _func_name, args) - exec(func_def, global_vars, loc_vars) - _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] - return _append_impl + exec(func_def, global_vars, loc_vars) + _append_impl = loc_vars['sdc_pandas_dataframe_append_impl'] + return _append_impl - return sdc_pandas_dataframe_append_impl(df, other, _func_name, args) + return sdc_pandas_dataframe_append_impl(df, other, _func_name, args) # Example func_text for func_name='count' columns=('A', 'B'): diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index 7bb91e42a..00ddc13f9 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -1559,7 +1559,3 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None, date_format, doublequote, escapechar, decimal) return _impl - - -if not sdc.config.config_pipeline_hpat_default: - from sdc.datatypes.hpat_pandas_dataframe_functions import * diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 8d14293f6..438db816a 100644 --- a/sdc/tests/test_dataframe.py +++ 
b/sdc/tests/test_dataframe.py @@ -1138,14 +1138,15 @@ def test_impl(df, df2): pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) + @skip_sdc_jit def test_append_df_diff_types_no_index(self): def test_impl(df, df2): return df.append(df2, ignore_index=True) hpat_func = self.jit(test_impl) - df = pd.DataFrame({'A': ['cat', 'dog', np.nan] * 64, 'B': [.2, .3, np.nan] * 64}) - df2 = pd.DataFrame({'C': [5, 6, 7]*63, 'D': ['a', np.nan, '']*63}) + df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]}) + df2 = pd.DataFrame({'C': [5, 6, 7, 8]*64, 'D': ['a', 'b', np.nan, '']*64}) pd.testing.assert_frame_equal(hpat_func(df, df2), test_impl(df, df2)) From 80ce11b85ae720f4ac85a03362fc05b8dc9a8055 Mon Sep 17 00:00:00 2001 From: akharche Date: Mon, 13 Jan 2020 16:15:14 +0300 Subject: [PATCH 12/12] Fix threads competition cases --- sdc/datatypes/common_functions.py | 34 ++++++++++--------- .../hpat_pandas_dataframe_functions.py | 5 +-- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py index f250d42f8..8730136d8 100644 --- a/sdc/datatypes/common_functions.py +++ b/sdc/datatypes/common_functions.py @@ -219,7 +219,7 @@ def fill_str_array(data, size, push_back=True): """ string_array_size = len(data) - none_array_size = size - string_array_size + nan_array_size = size - string_array_size num_chars = sdc.str_arr_ext.num_total_chars(data) result_data = sdc.str_arr_ext.pre_alloc_string_array(size, num_chars) @@ -227,29 +227,31 @@ def fill_str_array(data, size, push_back=True): # Keep NaN values of initial array arr_is_na_mask = numpy.array([sdc.hiframes.api.isna(data, i) for i in range(string_array_size)]) data_str_list = sdc.str_arr_ext.to_string_list(data) - nan_list = [''] * none_array_size + nan_list = [''] * nan_array_size result_list = data_str_list + nan_list if push_back else nan_list + data_str_list sdc.str_arr_ext.cp_str_list_to_array(result_data, result_list) # 
Batch=64 iteration to avoid threads competition batch_size = 64 - if push_back: - string_array_shift = 0 - none_array_shift = string_array_size - else: - string_array_shift = none_array_size - none_array_shift = 0 - - for i in numba.prange(string_array_size//batch_size + 1): - for j in range(i*batch_size, min((i+1)*batch_size, string_array_size)): - if arr_is_na_mask[j]: - str_arr_set_na(result_data, string_array_shift + j) + for i in numba.prange(size//batch_size + 1): + for j in range(i*batch_size, min((i+1)*batch_size, size)): + if j < string_array_size: + if arr_is_na_mask[j]: + str_arr_set_na(result_data, j) + else: + str_arr_set_na(result_data, j) - for i in numba.prange(none_array_size//batch_size + 1): - for j in range(i*batch_size, min((i+1)*batch_size, none_array_size)): - str_arr_set_na(result_data, none_array_shift + j) + else: + for i in numba.prange(size//batch_size + 1): + for j in range(i*batch_size, min((i+1)*batch_size, size)): + if j < nan_array_size: + str_arr_set_na(result_data, j) + else: + str_arr_j = j - nan_array_size + if arr_is_na_mask[str_arr_j]: + str_arr_set_na(result_data, j) return result_data diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 21d311e54..77c1d956b 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -84,6 +84,8 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integr df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)} other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)} + + # Keep columns that are StringArrayType string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns) if isinstance(typ, StringArrayType)) @@ -134,8 +136,7 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integr func_definition.extend([indent + func_line for func_line in func_text]) func_def = 
'\n'.join(func_definition) - global_vars = {'sdc': sdc, 'np': numpy, 'pandas': pandas, - 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, + global_vars = {'pandas': pandas, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data, 'init_series': sdc.hiframes.api.init_series, 'fill_array': sdc.datatypes.common_functions.fill_array, 'fill_str_array': sdc.datatypes.common_functions.fill_str_array}