diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 52e06b79f..98c724705 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -1500,7 +1500,7 @@ def df_getitem_slice_idx_main_codelines(self, idx): res_data = f'res_data_{i}' func_lines += [ f' data_{i} = self._data[{type_id}][{col_id}][idx]', - f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")' + f' {res_data} = data_{i}' ] results.append((col, res_data)) diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py index b1e22f536..77436f49b 100644 --- a/sdc/hiframes/api.py +++ b/sdc/hiframes/api.py @@ -167,25 +167,25 @@ def fix_df_array_list_str_impl(column): # pragma: no cover return lambda column: column -def fix_df_index(index, *columns): +def fix_df_index(index): return index @overload(fix_df_index) -def fix_df_index_overload(index, *columns): +def fix_df_index_overload(index): # TO-DO: replace types.none index with separate type, e.g. DefaultIndex if (index is None or isinstance(index, types.NoneType)): - def fix_df_index_impl(index, *columns): + def fix_df_index_impl(index): return None elif isinstance(index, RangeIndexType): - def fix_df_index_impl(index, *columns): + def fix_df_index_impl(index): return index else: # default case, transform index the same as df data - def fix_df_index_impl(index, *columns): + def fix_df_index_impl(index): return fix_df_array(index) return fix_df_index_impl diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index 5c77bc41a..56218e99c 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -80,59 +80,6 @@ def get_structure_maps(col_types, col_names): return column_loc, data_typs_map, types_order -@intrinsic -def init_dataframe(typingctx, *args): - """Create a DataFrame with provided data, index and columns values. 
- Used as a single constructor for DataFrame and assigning its data, so that - optimization passes can look for init_dataframe() to see if underlying - data has changed, and get the array variables from init_dataframe() args if - not changed. - """ - - n_cols = len(args) // 2 - data_typs = tuple(args[:n_cols]) - index_typ = args[n_cols] - column_names = tuple(a.literal_value for a in args[n_cols + 1:]) - - column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names) - - def codegen(context, builder, signature, args): - in_tup = args[0] - data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)] - index = builder.extract_value(in_tup, n_cols) - - # create dataframe struct and store values - dataframe = cgutils.create_struct_proxy( - signature.return_type)(context, builder) - - data_list_type = [types.List(typ) for typ in types_order] - - data_lists = [] - for typ_id, typ in enumerate(types_order): - data_list_typ = context.build_list(builder, data_list_type[typ_id], - [data_arrs[data_id] for data_id in data_typs_map[typ][1]]) - data_lists.append(data_list_typ) - - data_tup = context.make_tuple( - builder, types.Tuple(data_list_type), data_lists) - - dataframe.data = data_tup - dataframe.index = index - dataframe.parent = context.get_constant_null(types.pyobject) - - # increase refcount of stored values - if context.enable_nrt: - context.nrt.incref(builder, index_typ, index) - for var, typ in zip(data_arrs, data_typs): - context.nrt.incref(builder, typ, var) - - return dataframe._getvalue() - - ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc) - sig = signature(ret_typ, types.Tuple(args)) - return sig, codegen - - # TODO: alias analysis # this function should be used for getting df._data for alias analysis to work # no_cpython_wrapper since Array(DatetimeDate) cannot be boxed diff --git a/sdc/hiframes/pd_dataframe_type.py b/sdc/hiframes/pd_dataframe_type.py index c600a0209..9dd5fcaf4 100644 --- 
a/sdc/hiframes/pd_dataframe_type.py +++ b/sdc/hiframes/pd_dataframe_type.py @@ -126,5 +126,4 @@ class ColumnLoc(NamedTuple): make_attribute_wrapper(DataFrameType, 'data', '_data') make_attribute_wrapper(DataFrameType, 'index', '_index') -make_attribute_wrapper(DataFrameType, 'unboxed', '_unboxed') make_attribute_wrapper(DataFrameType, 'parent', '_parent') diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index 245426643..d48aaf0f1 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -138,7 +138,7 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): fix_data = sdc.hiframes.api.fix_df_array(data) - fix_index = sdc.hiframes.api.fix_df_index(index, fix_data) + fix_index = sdc.hiframes.api.fix_df_index(index) return sdc.hiframes.api.init_series(fix_data, fix_index, name) return hpat_pandas_series_ctor_impl diff --git a/sdc/rewrites/dataframe_constructor.py b/sdc/rewrites/dataframe_constructor.py index c9538759e..debf0b73c 100644 --- a/sdc/rewrites/dataframe_constructor.py +++ b/sdc/rewrites/dataframe_constructor.py @@ -24,13 +24,18 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** - +import numba +from numba.core import cgutils, types from numba.core.rewrites import (register_rewrite, Rewrite) from numba.core.ir_utils import (guard, find_callname) from numba.core.ir import (Expr) from numba.extending import overload +from numba.core.extending import intrinsic +from numba.core.typing import signature from pandas import DataFrame +from sys import modules +from textwrap import dedent from sdc.rewrites.ir_utils import (find_operations, is_dict, get_tuple_items, get_dict_items, remove_unused_recursively, @@ -38,9 +43,11 @@ declare_constant, import_function, make_call, insert_before) -from sdc.hiframes.pd_dataframe_ext import (init_dataframe, DataFrameType) - +from sdc.hiframes import pd_dataframe_ext as pd_dataframe_ext_module +from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc +from sdc.hiframes.pd_dataframe_ext import get_structure_maps from sdc.hiframes.api import fix_df_array, fix_df_index +from sdc.str_ext import string_type @register_rewrite('before-inference') @@ -54,6 +61,7 @@ class RewriteDataFrame(Rewrite): _df_arg_list = ('data', 'index', 'columns', 'dtype', 'copy') def __init__(self, pipeline): + self._pipeline = pipeline super().__init__(pipeline) self._reset() @@ -79,18 +87,45 @@ def match(self, func_ir, block, typemap, calltypes): return len(self._calls_to_rewrite) > 0 def apply(self): - init_df_stmt = import_function(init_dataframe, self._block, self._func_ir) - for stmt in self._calls_to_rewrite: args = get_call_parameters(call=stmt.value, arg_names=self._df_arg_list) - old_data = args['data'] - args['data'], args['columns'] = self._extract_dict_args(args, self._func_ir) + args_len = len(args['data']) + func_name = f'init_dataframe_{args_len}' + + # injected_module = modules[pd_dataframe_ext_module.__name__] + init_df = getattr(pd_dataframe_ext_module, func_name, None) + if init_df is None: + init_df_text = 
gen_init_dataframe_text(func_name, args_len) + init_df = gen_init_dataframe_func( + func_name, + init_df_text, + { + 'numba': numba, + 'cgutils': cgutils, + 'signature': signature, + 'types': types, + 'get_structure_maps': get_structure_maps, + 'intrinsic': intrinsic, + 'DataFrameType': DataFrameType, + 'ColumnLoc': ColumnLoc, + 'string_type': string_type, + 'intrinsic': intrinsic, + 'fix_df_array': fix_df_array, + 'fix_df_index': fix_df_index + }) + + setattr(pd_dataframe_ext_module, func_name, init_df) + init_df.__module__ = pd_dataframe_ext_module.__name__ + init_df._defn.__module__ = pd_dataframe_ext_module.__name__ + + init_df_stmt = import_function(init_df, self._block, self._func_ir) self._replace_call(stmt, init_df_stmt.target, args, self._block, self._func_ir) remove_unused_recursively(old_data, self._block, self._func_ir) + self._pipeline.typingctx.refresh() return self._block @@ -130,42 +165,112 @@ def _replace_call(stmt, new_call, args, block, func_ir): columns_args = args['columns'] index_args = args.get('index') - data_args = RewriteDataFrame._replace_data_with_arrays(data_args, stmt, block, func_ir) - if index_args is None: # index arg was omitted none_stmt = declare_constant(None, block, func_ir, stmt.loc) index_args = none_stmt.target - index_and_data_args = [index_args] + data_args - index_args = RewriteDataFrame._replace_index_with_arrays(index_and_data_args, stmt, block, func_ir) + index_args = [index_args] all_args = data_args + index_args + columns_args call = Expr.call(new_call, all_args, {}, func.loc) stmt.value = call - @staticmethod - def _replace_data_with_arrays(args, stmt, block, func_ir): - new_args = [] - for var in args: - call_stmt = make_call(fix_df_array, [var], {}, block, func_ir, var.loc) - insert_before(block, call_stmt, stmt) - new_args.append(call_stmt.target) +def gen_init_dataframe_text(func_name, n_cols): + args_col_data = ['c' + str(i) for i in range(n_cols)] + args_col_names = ['n' + str(i) for i in range(n_cols)] + 
params = ', '.join(args_col_data + ['index'] + args_col_names) + suffix = ('' if n_cols == 0 else ', ') + + func_text = dedent( + f''' + @intrinsic + def {func_name}(typingctx, {params}): + """Create a DataFrame with provided columns data and index values. + Takes 2n+1 args: n columns data, index data and n column names. + Each column data is passed as separate argument to have compact LLVM IR. + Used as a generic constructor for native DataFrame objects, which + can be used with different input column types (e.g. lists), and + resulting DataFrameType is deduced by applying transform functions + (fix_df_array and fix_df_index) to input argument types. + """ + + n_cols = {n_cols} + + input_data_typs = ({', '.join(args_col_data) + suffix}) + fnty = typingctx.resolve_value_type(fix_df_array) + fixed_col_sigs = [] + for i in range({n_cols}): + to_sig = fnty.get_call_type(typingctx, (input_data_typs[i],), {{}}) + fixed_col_sigs.append(to_sig) + data_typs = tuple(fixed_col_sigs[i].return_type for i in range({n_cols})) + need_fix_cols = tuple(data_typs[i] != input_data_typs[i] for i in range({n_cols})) + + input_index_typ = index + fnty = typingctx.resolve_value_type(fix_df_index) + fixed_index_sig = fnty.get_call_type(typingctx, (input_index_typ,), {{}}) + index_typ = fixed_index_sig.return_type + need_fix_index = index_typ != input_index_typ + + column_names = tuple(a.literal_value for a in ({', '.join(args_col_names) + suffix})) + column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names) + col_needs_transform = tuple(not isinstance(data_typs[i], types.Array) for i in range(len(data_typs))) + + def codegen(context, builder, sig, args): + {params}, = args + data_arrs = [{', '.join(args_col_data) + suffix}] + data_arrs_transformed = [] + for i, arr in enumerate(data_arrs): + if need_fix_cols[i] == False: + data_arrs_transformed.append(arr) + else: + res = context.compile_internal(builder, lambda a: fix_df_array(a), fixed_col_sigs[i], [arr]) + 
data_arrs_transformed.append(res) - return new_args + # create dataframe struct and store values + dataframe = cgutils.create_struct_proxy( + sig.return_type)(context, builder) - @staticmethod - def _replace_index_with_arrays(args, stmt, block, func_ir): - new_args = [] + data_list_type = [types.List(typ) for typ in types_order] + + data_lists = [] + for typ_id, typ in enumerate(types_order): + data_arrs_of_typ = [data_arrs_transformed[data_id] for data_id in data_typs_map[typ][1]] + data_list_typ = context.build_list(builder, data_list_type[typ_id], data_arrs_of_typ) + data_lists.append(data_list_typ) + + data_tup = context.make_tuple( + builder, types.Tuple(data_list_type), data_lists) + + if need_fix_index == True: + index = context.compile_internal(builder, lambda a: fix_df_index(a), fixed_index_sig, [index]) + + dataframe.data = data_tup + dataframe.index = index + dataframe.parent = context.get_constant_null(types.pyobject) + + # increase refcount of stored values + if context.enable_nrt: + context.nrt.incref(builder, index_typ, index) + for var, typ in zip(data_arrs_transformed, data_typs): + context.nrt.incref(builder, typ, var) + + return dataframe._getvalue() + + ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc) + sig = signature(ret_typ, {params}) + return sig, codegen + ''') + + return func_text - call_stmt = make_call(fix_df_index, args, {}, block, func_ir, args[0].loc) - insert_before(block, call_stmt, stmt) - new_args.append(call_stmt.target) - return new_args +def gen_init_dataframe_func(func_name, func_text, global_vars): - return new_args + loc_vars = {} + exec(func_text, global_vars, loc_vars) + return loc_vars[func_name] @overload(DataFrame) diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index f8f15e121..d2b2d3547 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -98,6 +98,31 @@ def test_impl(n): n = 11 self.assertEqual(hpat_func(n), test_impl(n)) + def 
test_create_empty_df(self): + """ Verifies empty DF can be created """ + def test_impl(): + df = pd.DataFrame({}) + return len(df) + hpat_func = self.jit(test_impl) + + self.assertEqual(hpat_func(), test_impl()) + + def test_create_multiple_dfs(self): + """ Verifies generated dataframe ctor is added to pd_dataframe_ext module + correctly (and numba global context is refreshed), so that subsequent + compilations are not broken. """ + def test_impl(a, b, c): + df1 = pd.DataFrame({'A': a, 'B': b}) + df2 = pd.DataFrame({'C': c}) + total_cols = len(df1.columns) + len(df2.columns) + return total_cols + hpat_func = self.jit(test_impl) + + a1 = np.array([1, 2, 3, 4.0, 5]) + a2 = [7, 6, 5, 4, 3] + a3 = ['a', 'b', 'c', 'd', 'e'] + self.assertEqual(hpat_func(a1, a2, a3), test_impl(a1, a2, a3)) + def test_create_str(self): def test_impl(): df = pd.DataFrame({'A': ['a', 'b', 'c']}) @@ -159,7 +184,7 @@ def test_impl(A, B, index): result_ref = test_impl(A, B, index) pd.testing.assert_frame_equal(result, result_ref) - def test_create_empty_df(self): + def test_unbox_empty_df(self): def test_impl(df): return df sdc_func = self.jit(test_impl)