This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
50 changes: 50 additions & 0 deletions examples/dataframe/dataframe_append.py
@@ -0,0 +1,50 @@
# *****************************************************************************
# Copyright (c) 2019, Intel Corporation All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

import pandas as pd
from numba import njit


@njit
def dataframe_append():
"""
Expected result:
A B C
0 1.0 3 NaN
1 2.0 4 NaN
2 NaN 5 7.0
3 NaN 6 8.0
"""
    df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
    df2 = pd.DataFrame({'B': [5, 6], 'C': [7, 8]})
    result = df.append(df2)

    return result


print(dataframe_append())
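For comparison, the same expected output can be reproduced with stock pandas (no Numba compilation); the following minimal sketch uses pandas.concat, the general concatenation API referenced in the documentation of this change, and is not part of the example file above:

import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'B': [5, 6], 'C': [7, 8]})
# concat aligns on column names and fills the missing 'A'/'C' entries with NaN,
# matching the expected result in the docstring above
print(pd.concat([df, df2], ignore_index=True))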
1 change: 1 addition & 0 deletions sdc/__init__.py
@@ -64,6 +64,7 @@

    import sdc.rewrites.dataframe_constructor
    import sdc.datatypes.hpat_pandas_functions
    import sdc.datatypes.hpat_pandas_dataframe_functions
else:
    import sdc.compiler

58 changes: 57 additions & 1 deletion sdc/datatypes/common_functions.py
@@ -36,7 +36,7 @@
import numba
from numba import types
from numba.errors import TypingError
from numba.extending import overload
from numba.extending import overload, register_jitable
from numba import numpy_support

import sdc
@@ -200,6 +200,62 @@ def _append_list_string_array_impl(A, B):
    return _append_list_string_array_impl


@register_jitable
def fill_array(data, size, fill_value=numpy.nan, push_back=True):
    """
    Fill the array with the given fill value so that it reaches the requested size
    """

    if push_back:
        return numpy.append(data, numpy.repeat(fill_value, size - data.size))

    return numpy.append(numpy.repeat(fill_value, size - data.size), data)
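
# Illustrative usage sketch (hypothetical values, assuming a plain NumPy context):
#   fill_array(numpy.array([1., 2.]), 4)                  -> array([ 1.,  2., nan, nan])
#   fill_array(numpy.array([1., 2.]), 4, push_back=False) -> array([nan, nan,  1.,  2.])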


@register_jitable
def fill_str_array(data, size, push_back=True):
    """
    Fill a StringArrayType array with NaN values so that it reaches the requested size
    """

    string_array_size = len(data)
    nan_array_size = size - string_array_size
    num_chars = sdc.str_arr_ext.num_total_chars(data)

    result_data = sdc.str_arr_ext.pre_alloc_string_array(size, num_chars)

    # Keep NaN values of the initial array
    arr_is_na_mask = numpy.array([sdc.hiframes.api.isna(data, i) for i in range(string_array_size)])
    data_str_list = sdc.str_arr_ext.to_string_list(data)
    nan_list = [''] * nan_array_size

    result_list = data_str_list + nan_list if push_back else nan_list + data_str_list
    sdc.str_arr_ext.cp_str_list_to_array(result_data, result_list)

    # Iterate in batches of 64 to avoid contention between threads
    batch_size = 64
    if push_back:
        for i in numba.prange(size//batch_size + 1):
            for j in range(i*batch_size, min((i+1)*batch_size, size)):
                if j < string_array_size:
                    if arr_is_na_mask[j]:
                        str_arr_set_na(result_data, j)
                else:
                    str_arr_set_na(result_data, j)

    else:
        for i in numba.prange(size//batch_size + 1):
            for j in range(i*batch_size, min((i+1)*batch_size, size)):
                if j < nan_array_size:
                    str_arr_set_na(result_data, j)
                else:
                    str_arr_j = j - nan_array_size
                    if arr_is_na_mask[str_arr_j]:
                        str_arr_set_na(result_data, j)

    return result_data
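
# Illustrative behavior sketch (hypothetical values): for a two-element string column
# ['cat', 'dog'] and size=4, the result is laid out as ['cat', 'dog', NA, NA];
# with push_back=False the NA entries come first instead.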


@numba.njit
def _hpat_ensure_array_capacity(new_size, arr):
""" Function ensuring that the size of numpy array is at least as specified
185 changes: 180 additions & 5 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -29,14 +29,22 @@
| Also, it contains Numba internal operators which are required for DataFrame type handling
'''


import operator
import pandas
import copy
import numpy
import sdc
import copy

from numba import types
from numba.extending import (overload, overload_method, overload_attribute)
from sdc.hiframes.pd_dataframe_ext import DataFrameType
Review comment (Collaborator): DataFrameType is already imported from the correct module at line 48.
from sdc.datatypes.common_functions import TypeChecker
from numba.errors import TypingError
from sdc.str_arr_ext import StringArrayType
from sdc.config import config_pipeline_hpat_default

from sdc.utils import sdc_overload_method
from sdc.hiframes.pd_dataframe_type import DataFrameType

from sdc.datatypes.hpat_pandas_dataframe_rolling_types import _hpat_pandas_df_rolling_init
@@ -47,6 +55,177 @@
from sdc.utils import sdc_overload_method


def sdc_pandas_dataframe_append_codegen(df, other, _func_name, args):
    """
    Input:
    df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]})
    other = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': ['a', np.nan, '']})

    Func generated:
    def sdc_pandas_dataframe_append_impl(df, other, ignore_index=True, verify_integrity=False, sort=None):
        len_df = len(get_dataframe_data(df, 0))
        len_other = len(get_dataframe_data(other, 0))
        new_col_A_data_df = get_dataframe_data(df, 0)
        new_col_A_data_other = get_dataframe_data(other, 0)
        new_col_A = init_series(new_col_A_data_df).append(init_series(new_col_A_data_other))._data
        new_col_B_data_df = get_dataframe_data(df, 1)
        new_col_B_data = init_series(new_col_B_data_df)._data
        new_col_B = fill_array(new_col_B_data, len_df+len_other)
        new_col_C_data_other = get_dataframe_data(other, 1)
        new_col_C_data = init_series(new_col_C_data_other)._data
        new_col_C = fill_str_array(new_col_C_data, len_df+len_other, push_back=False)
        return pandas.DataFrame({"A": new_col_A, "B": new_col_B, "C": new_col_C})
    """
    indent = 4 * ' '
    func_args = ['df', 'other']

    for key, value in args:
        # TODO: improve check
        if key not in func_args:
            if isinstance(value, types.Literal):
                value = value.literal_value
            func_args.append(f'{key}={value}')

    df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)}
    other_columns_indx = {col_name: i for i, col_name in enumerate(other.columns)}

    # Keep columns that are StringArrayType
    string_type_columns = set(col_name for typ, col_name in zip(df.data, df.columns)
                              if isinstance(typ, StringArrayType))

    for typ, col_name in zip(other.data, other.columns):
        if isinstance(typ, StringArrayType):
            string_type_columns.add(col_name)

    func_definition = [f'def sdc_pandas_dataframe_{_func_name}_impl({", ".join(func_args)}):']
    func_text = []
    column_list = []

    func_text.append(f'len_df = len(get_dataframe_data(df, 0))')
    func_text.append(f'len_other = len(get_dataframe_data(other, 0))')
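
    # Columns present in both frames are concatenated via Series.append; columns present
    # only in df are padded at the end with NaN (fill_array for numeric data,
    # fill_str_array for string data) so that every column reaches len_df + len_other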

    for col_name, i in df_columns_indx.items():
        func_text.append(f'new_col_{col_name}_data_{"df"} = get_dataframe_data({"df"}, {i})')
        if col_name in other_columns_indx:
            func_text.append(f'new_col_{col_name}_data_{"other"} = '
                             f'get_dataframe_data({"other"}, {other_columns_indx.get(col_name)})')
            s1 = f'init_series(new_col_{col_name}_data_{"df"})'
            s2 = f'init_series(new_col_{col_name}_data_{"other"})'
            func_text.append(f'new_col_{col_name} = {s1}.append({s2})._data')
        else:
            func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_df)._data')
            if col_name in string_type_columns:
                func_text.append(f'new_col_{col_name} = fill_str_array(new_col_{col_name}_data, len_df+len_other)')
            else:
                func_text.append(f'new_col_{col_name} = fill_array(new_col_{col_name}_data, len_df+len_other)')
        column_list.append((f'new_col_{col_name}', col_name))
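
    # Columns present only in other are padded at the front (push_back=False) so that
    # their values line up with the appended rows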

    for col_name, i in other_columns_indx.items():
        if col_name not in df_columns_indx:
            func_text.append(f'new_col_{col_name}_data_{"other"} = get_dataframe_data({"other"}, {i})')
            func_text.append(f'new_col_{col_name}_data = init_series(new_col_{col_name}_data_other)._data')
            if col_name in string_type_columns:
                func_text.append(
                    f'new_col_{col_name} = '
                    f'fill_str_array(new_col_{col_name}_data, len_df+len_other, push_back=False)')
            else:
                func_text.append(f'new_col_{col_name} = '
                                 f'fill_array(new_col_{col_name}_data, len_df+len_other, push_back=False)')
            column_list.append((f'new_col_{col_name}', col_name))

    data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list)
    # TODO: Handle index
    func_text.append(f"return pandas.DataFrame({{{data}}})\n")
    func_definition.extend([indent + func_line for func_line in func_text])
    func_def = '\n'.join(func_definition)

    global_vars = {'pandas': pandas, 'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data,
                   'init_series': sdc.hiframes.api.init_series,
                   'fill_array': sdc.datatypes.common_functions.fill_array,
                   'fill_str_array': sdc.datatypes.common_functions.fill_str_array}

    return func_def, global_vars
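
# Note: the append implementation is generated as source text and exec'ed per concrete
# pair of frame types (see sdc_pandas_dataframe_append below), so each distinct column
# layout gets its own specialized function compiled by Numba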


@sdc_overload_method(DataFrameType, 'append')
def sdc_pandas_dataframe_append(df, other, ignore_index=True, verify_integrity=False, sort=None):
"""
Intel Scalable Dataframe Compiler User Guide
********************************************
Pandas API: pandas.DataFrame.append
Examples
--------
.. literalinclude:: ../../../examples/dataframe_append.py
:language: python
:lines: 27-
:caption: Appending rows of other to the end of caller, returning a new object.
Columns in other that are not in the caller are added as new columns.
:name: ex_dataframe_append

.. command-output:: python ./dataframe_append.py
:cwd: ../../../examples

.. note::
Parameter ignore_index, verify_integrity, sort are currently unsupported
by Intel Scalable Dataframe Compiler
Currently only pandas.DataFrame is supported as "other" parameter

.. seealso::
`pandas.concat <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html>`_
General function to concatenate DataFrame or Series objects.
Intel Scalable Dataframe Compiler Developer Guide
*************************************************
Pandas DataFrame method :meth:`pandas.DataFrame.append` implementation.
.. only:: developer
Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_append*
Parameters
-----------
df: :obj:`pandas.DataFrame`
input arg
other: :obj:`pandas.DataFrame` object or :obj:`pandas.Series` or :obj:`dict`
The data to append
ignore_index: :obj:`bool`
*unsupported*
verify_integrity: :obj:`bool`
*unsupported*
sort: :obj:`bool`
*unsupported*
Returns
-------
:obj: `pandas.DataFrame`
return DataFrame with appended rows to the end
"""

    _func_name = 'append'

    ty_checker = TypeChecker(f'Method {_func_name}().')
    ty_checker.check(df, DataFrameType)
    # TODO: support other array-like types
    ty_checker.check(other, DataFrameType)
    # TODO: support index in series from df-columns
    if not isinstance(ignore_index, (bool, types.Boolean, types.Omitted)) and not ignore_index:
        ty_checker.raise_exc(ignore_index, 'boolean', 'ignore_index')

    if not isinstance(verify_integrity, (bool, types.Boolean, types.Omitted)) and verify_integrity:
        ty_checker.raise_exc(verify_integrity, 'boolean', 'verify_integrity')

    if not isinstance(sort, (bool, types.Boolean, types.Omitted)) and sort is not None:
        ty_checker.raise_exc(sort, 'boolean, None', 'sort')
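
    # The checks above accept an omitted argument, a boolean type, or the parameter's
    # default value; anything else raises a typing error at compile time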

    args = (('ignore_index', True), ('verify_integrity', False), ('sort', None))

    def sdc_pandas_dataframe_append_impl(df, other, _func_name, args):
        loc_vars = {}
        func_def, global_vars = sdc_pandas_dataframe_append_codegen(df, other, _func_name, args)

        exec(func_def, global_vars, loc_vars)
        _append_impl = loc_vars['sdc_pandas_dataframe_append_impl']
        return _append_impl

    return sdc_pandas_dataframe_append_impl(df, other, _func_name, args)


# Example func_text for func_name='count' columns=('A', 'B'):
#
# def _df_count_impl(df, axis=0, level=None, numeric_only=False):
@@ -445,11 +624,8 @@ def prod_overload(df, axis=None, skipna=None, level=None, numeric_only=None, min
def count_overload(df, axis=0, level=None, numeric_only=False):
"""
Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation.

.. only:: developer

Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_count*

Parameters
-----------
self: :class:`pandas.DataFrame`
@@ -460,7 +636,6 @@ def count_overload(df, axis=0, level=None, numeric_only=False):
*unsupported*
numeric_only:
*unsupported*

Returns
-------
:obj:`pandas.Series` or `pandas.DataFrame`
31 changes: 14 additions & 17 deletions sdc/hiframes/pd_dataframe_ext.py
@@ -1161,21 +1161,22 @@ def lower_isin_dummy(context, builder, sig, args):
    return out_obj._getvalue()


@overload_method(DataFrameType, 'append')
def append_overload(df, other, ignore_index=False, verify_integrity=False,
                    sort=None):
    if isinstance(other, DataFrameType):
        return (lambda df, other, ignore_index=False, verify_integrity=False,
                sort=None: pd.concat((df, other)))
if sdc.config.config_pipeline_hpat_default:
    @overload_method(DataFrameType, 'append')
    def append_overload(df, other, ignore_index=False, verify_integrity=False,
                        sort=None):
        if isinstance(other, DataFrameType):
            return (lambda df, other, ignore_index=False, verify_integrity=False,
                    sort=None: pd.concat((df, other)))

    # TODO: tuple case
    # TODO: non-homogenous build_list case
    if isinstance(other, types.List) and isinstance(other.dtype, DataFrameType):
        return (lambda df, other, ignore_index=False, verify_integrity=False,
                sort=None: pd.concat([df] + other))
        # TODO: tuple case
        # TODO: non-homogenous build_list case
        if isinstance(other, types.List) and isinstance(other.dtype, DataFrameType):
            return (lambda df, other, ignore_index=False, verify_integrity=False,
                    sort=None: pd.concat([df] + other))

    raise ValueError("invalid df.append() input. Only dataframe and list"
                     " of dataframes supported")
        raise ValueError("invalid df.append() input. Only dataframe and list"
                         " of dataframes supported")


@overload_method(DataFrameType, 'pct_change')
@@ -1558,7 +1559,3 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None,
date_format, doublequote, escapechar, decimal)

return _impl


if not sdc.config.config_pipeline_hpat_default:
    from sdc.datatypes.hpat_pandas_dataframe_functions import *