Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1500,7 +1500,7 @@ def df_getitem_slice_idx_main_codelines(self, idx):
res_data = f'res_data_{i}'
func_lines += [
f' data_{i} = self._data[{type_id}][{col_id}][idx]',
f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")'
f' {res_data} = data_{i}'
]
results.append((col, res_data))

Expand Down
10 changes: 5 additions & 5 deletions sdc/hiframes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,25 +167,25 @@ def fix_df_array_list_str_impl(column): # pragma: no cover
return lambda column: column


def fix_df_index(index, *columns):
def fix_df_index(index):
return index


@overload(fix_df_index)
def fix_df_index_overload(index, *columns):
def fix_df_index_overload(index):

# TO-DO: replace types.none index with separate type, e.g. DefaultIndex
if (index is None or isinstance(index, types.NoneType)):
def fix_df_index_impl(index, *columns):
def fix_df_index_impl(index):
return None

elif isinstance(index, RangeIndexType):
def fix_df_index_impl(index, *columns):
def fix_df_index_impl(index):
return index

else:
# default case, transform index the same as df data
def fix_df_index_impl(index, *columns):
def fix_df_index_impl(index):
return fix_df_array(index)

return fix_df_index_impl
Expand Down
53 changes: 0 additions & 53 deletions sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,59 +80,6 @@ def get_structure_maps(col_types, col_names):
return column_loc, data_typs_map, types_order


@intrinsic
def init_dataframe(typingctx, *args):
    """Create a DataFrame with provided data, index and columns values.

    Expects 2 * n_cols + 1 arguments laid out as:
    (data_0, ..., data_{n-1}, index, name_0, ..., name_{n-1}),
    where the names are literal types whose values are known at compile time.

    Used as a single constructor for DataFrame and assigning its data, so that
    optimization passes can look for init_dataframe() to see if underlying
    data has changed, and get the array variables from init_dataframe() args if
    not changed.
    """

    # recover the column count from the arg layout: n data + 1 index + n names
    n_cols = len(args) // 2
    data_typs = tuple(args[:n_cols])
    index_typ = args[n_cols]
    # column names arrive as literal types; extract their compile-time values
    column_names = tuple(a.literal_value for a in args[n_cols + 1:])

    # group columns by dtype to get the per-type storage layout of the struct
    column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names)

    def codegen(context, builder, signature, args):
        # all actual arguments are packed into a single tuple value
        in_tup = args[0]
        data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)]
        index = builder.extract_value(in_tup, n_cols)

        # create dataframe struct and store values
        dataframe = cgutils.create_struct_proxy(
            signature.return_type)(context, builder)

        data_list_type = [types.List(typ) for typ in types_order]

        # build one runtime list per distinct column type, each holding
        # the column arrays of that type (order given by data_typs_map)
        data_lists = []
        for typ_id, typ in enumerate(types_order):
            data_list_typ = context.build_list(builder, data_list_type[typ_id],
                                               [data_arrs[data_id] for data_id in data_typs_map[typ][1]])
            data_lists.append(data_list_typ)

        data_tup = context.make_tuple(
            builder, types.Tuple(data_list_type), data_lists)

        dataframe.data = data_tup
        dataframe.index = index
        # null parent marks this as a native dataframe (not unboxed from Python)
        dataframe.parent = context.get_constant_null(types.pyobject)

        # increase refcount of stored values
        if context.enable_nrt:
            context.nrt.incref(builder, index_typ, index)
            for var, typ in zip(data_arrs, data_typs):
                context.nrt.incref(builder, typ, var)

        return dataframe._getvalue()

    ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc)
    sig = signature(ret_typ, types.Tuple(args))
    return sig, codegen


# TODO: alias analysis
# this function should be used for getting df._data for alias analysis to work
# no_cpython_wrapper since Array(DatetimeDate) cannot be boxed
Expand Down
1 change: 0 additions & 1 deletion sdc/hiframes/pd_dataframe_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,5 +126,4 @@ class ColumnLoc(NamedTuple):

make_attribute_wrapper(DataFrameType, 'data', '_data')
make_attribute_wrapper(DataFrameType, 'index', '_index')
make_attribute_wrapper(DataFrameType, 'unboxed', '_unboxed')
make_attribute_wrapper(DataFrameType, 'parent', '_parent')
2 changes: 1 addition & 1 deletion sdc/hiframes/pd_series_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False,
def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False):

fix_data = sdc.hiframes.api.fix_df_array(data)
fix_index = sdc.hiframes.api.fix_df_index(index, fix_data)
fix_index = sdc.hiframes.api.fix_df_index(index)
return sdc.hiframes.api.init_series(fix_data, fix_index, name)

return hpat_pandas_series_ctor_impl
159 changes: 132 additions & 27 deletions sdc/rewrites/dataframe_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,30 @@
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************


import numba
from numba.core import cgutils, types
from numba.core.rewrites import (register_rewrite, Rewrite)
from numba.core.ir_utils import (guard, find_callname)
from numba.core.ir import (Expr)
from numba.extending import overload
from numba.core.extending import intrinsic
from numba.core.typing import signature

from pandas import DataFrame
from sys import modules
from textwrap import dedent

from sdc.rewrites.ir_utils import (find_operations, is_dict,
get_tuple_items, get_dict_items, remove_unused_recursively,
get_call_parameters,
declare_constant,
import_function, make_call,
insert_before)
from sdc.hiframes.pd_dataframe_ext import (init_dataframe, DataFrameType)

from sdc.hiframes import pd_dataframe_ext as pd_dataframe_ext_module
from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc
from sdc.hiframes.pd_dataframe_ext import get_structure_maps
from sdc.hiframes.api import fix_df_array, fix_df_index
from sdc.str_ext import string_type


@register_rewrite('before-inference')
Expand All @@ -54,6 +61,7 @@ class RewriteDataFrame(Rewrite):
_df_arg_list = ('data', 'index', 'columns', 'dtype', 'copy')

def __init__(self, pipeline):
self._pipeline = pipeline
super().__init__(pipeline)

self._reset()
Expand All @@ -79,18 +87,45 @@ def match(self, func_ir, block, typemap, calltypes):
return len(self._calls_to_rewrite) > 0

def apply(self):
init_df_stmt = import_function(init_dataframe, self._block, self._func_ir)

for stmt in self._calls_to_rewrite:
args = get_call_parameters(call=stmt.value, arg_names=self._df_arg_list)

old_data = args['data']

args['data'], args['columns'] = self._extract_dict_args(args, self._func_ir)

args_len = len(args['data'])
func_name = f'init_dataframe_{args_len}'

# injected_module = modules[pd_dataframe_ext_module.__name__]
init_df = getattr(pd_dataframe_ext_module, func_name, None)
if init_df is None:
init_df_text = gen_init_dataframe_text(func_name, args_len)
init_df = gen_init_dataframe_func(
func_name,
init_df_text,
{
'numba': numba,
'cgutils': cgutils,
'signature': signature,
'types': types,
'get_structure_maps': get_structure_maps,
'intrinsic': intrinsic,
'DataFrameType': DataFrameType,
'ColumnLoc': ColumnLoc,
'string_type': string_type,
'intrinsic': intrinsic,
'fix_df_array': fix_df_array,
'fix_df_index': fix_df_index
})

setattr(pd_dataframe_ext_module, func_name, init_df)
init_df.__module__ = pd_dataframe_ext_module.__name__
init_df._defn.__module__ = pd_dataframe_ext_module.__name__

init_df_stmt = import_function(init_df, self._block, self._func_ir)
self._replace_call(stmt, init_df_stmt.target, args, self._block, self._func_ir)

remove_unused_recursively(old_data, self._block, self._func_ir)
self._pipeline.typingctx.refresh()

return self._block

Expand Down Expand Up @@ -130,42 +165,112 @@ def _replace_call(stmt, new_call, args, block, func_ir):
columns_args = args['columns']
index_args = args.get('index')

data_args = RewriteDataFrame._replace_data_with_arrays(data_args, stmt, block, func_ir)

if index_args is None: # index arg was omitted
none_stmt = declare_constant(None, block, func_ir, stmt.loc)
index_args = none_stmt.target

index_and_data_args = [index_args] + data_args
index_args = RewriteDataFrame._replace_index_with_arrays(index_and_data_args, stmt, block, func_ir)
index_args = [index_args]

all_args = data_args + index_args + columns_args
call = Expr.call(new_call, all_args, {}, func.loc)

stmt.value = call

@staticmethod
def _replace_data_with_arrays(args, stmt, block, func_ir):
new_args = []

for var in args:
call_stmt = make_call(fix_df_array, [var], {}, block, func_ir, var.loc)
insert_before(block, call_stmt, stmt)
new_args.append(call_stmt.target)
def gen_init_dataframe_text(func_name, n_cols):
args_col_data = ['c' + str(i) for i in range(n_cols)]
args_col_names = ['n' + str(i) for i in range(n_cols)]
params = ', '.join(args_col_data + ['index'] + args_col_names)
suffix = ('' if n_cols == 0 else ', ')

func_text = dedent(
f'''
@intrinsic
def {func_name}(typingctx, {params}):
"""Create a DataFrame with provided columns data and index values.
Takes 2n+1 args: data for n columns, the index data, and n column names.
Each column data is passed as separate argument to have compact LLVM IR.
Used as a generic constructor for native DataFrame objects, which
can be used with different input column types (e.g. lists), and
resulting DataFrameType is deduced by applying transform functions
(fix_df_array and fix_df_index) to input argument types.
"""

n_cols = {n_cols}

input_data_typs = ({', '.join(args_col_data) + suffix})
fnty = typingctx.resolve_value_type(fix_df_array)
fixed_col_sigs = []
for i in range({n_cols}):
to_sig = fnty.get_call_type(typingctx, (input_data_typs[i],), {{}})
fixed_col_sigs.append(to_sig)
data_typs = tuple(fixed_col_sigs[i].return_type for i in range({n_cols}))
need_fix_cols = tuple(data_typs[i] != input_data_typs[i] for i in range({n_cols}))

input_index_typ = index
fnty = typingctx.resolve_value_type(fix_df_index)
fixed_index_sig = fnty.get_call_type(typingctx, (input_index_typ,), {{}})
index_typ = fixed_index_sig.return_type
need_fix_index = index_typ != input_index_typ

column_names = tuple(a.literal_value for a in ({', '.join(args_col_names) + suffix}))
column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names)
col_needs_transform = tuple(not isinstance(data_typs[i], types.Array) for i in range(len(data_typs)))

def codegen(context, builder, sig, args):
{params}, = args
data_arrs = [{', '.join(args_col_data) + suffix}]
data_arrs_transformed = []
for i, arr in enumerate(data_arrs):
if need_fix_cols[i] == False:
data_arrs_transformed.append(arr)
else:
res = context.compile_internal(builder, lambda a: fix_df_array(a), fixed_col_sigs[i], [arr])
data_arrs_transformed.append(res)

return new_args
# create dataframe struct and store values
dataframe = cgutils.create_struct_proxy(
sig.return_type)(context, builder)

@staticmethod
def _replace_index_with_arrays(args, stmt, block, func_ir):
new_args = []
data_list_type = [types.List(typ) for typ in types_order]

data_lists = []
for typ_id, typ in enumerate(types_order):
data_arrs_of_typ = [data_arrs_transformed[data_id] for data_id in data_typs_map[typ][1]]
data_list_typ = context.build_list(builder, data_list_type[typ_id], data_arrs_of_typ)
data_lists.append(data_list_typ)

data_tup = context.make_tuple(
builder, types.Tuple(data_list_type), data_lists)

if need_fix_index == True:
index = context.compile_internal(builder, lambda a: fix_df_index(a), fixed_index_sig, [index])

dataframe.data = data_tup
dataframe.index = index
dataframe.parent = context.get_constant_null(types.pyobject)

# increase refcount of stored values
if context.enable_nrt:
context.nrt.incref(builder, index_typ, index)
for var, typ in zip(data_arrs_transformed, data_typs):
context.nrt.incref(builder, typ, var)

return dataframe._getvalue()

ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc)
sig = signature(ret_typ, {params})
return sig, codegen
''')

return func_text

call_stmt = make_call(fix_df_index, args, {}, block, func_ir, args[0].loc)
insert_before(block, call_stmt, stmt)
new_args.append(call_stmt.target)

return new_args
def gen_init_dataframe_func(func_name, func_text, global_vars):

return new_args
loc_vars = {}
exec(func_text, global_vars, loc_vars)
return loc_vars[func_name]


@overload(DataFrame)
Expand Down
27 changes: 26 additions & 1 deletion sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,31 @@ def test_impl(n):
n = 11
self.assertEqual(hpat_func(n), test_impl(n))

def test_create_empty_df(self):
""" Verifies empty DF can be created """
def test_impl():
df = pd.DataFrame({})
return len(df)
hpat_func = self.jit(test_impl)

self.assertEqual(hpat_func(), test_impl())

def test_create_multiple_dfs(self):
""" Verifies generated dataframe ctor is added to pd_dataframe_ext module
correctly (and numba global context is refreshed), so that subsequent
compilations are not broken. """
def test_impl(a, b, c):
df1 = pd.DataFrame({'A': a, 'B': b})
df2 = pd.DataFrame({'C': c})
total_cols = len(df1.columns) + len(df2.columns)
return total_cols
hpat_func = self.jit(test_impl)

a1 = np.array([1, 2, 3, 4.0, 5])
a2 = [7, 6, 5, 4, 3]
a3 = ['a', 'b', 'c', 'd', 'e']
self.assertEqual(hpat_func(a1, a2, a3), test_impl(a1, a2, a3))

def test_create_str(self):
def test_impl():
df = pd.DataFrame({'A': ['a', 'b', 'c']})
Expand Down Expand Up @@ -159,7 +184,7 @@ def test_impl(A, B, index):
result_ref = test_impl(A, B, index)
pd.testing.assert_frame_equal(result, result_ref)

def test_create_empty_df(self):
def test_unbox_empty_df(self):
def test_impl(df):
return df
sdc_func = self.jit(test_impl)
Expand Down