From a39d73decf020f5574bc5cee43de95d7c2183d6d Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Tue, 10 Nov 2020 15:33:13 +0300 Subject: [PATCH 1/8] Return back csv_reader_py changes from #918 (#943) This reverts commit 30122b26c25c51464b5fd0c8f39f28300b26761b. --- sdc/datatypes/hpat_pandas_functions.py | 65 ++++++---- sdc/hiframes/pd_dataframe_ext.py | 7 +- sdc/hiframes/pd_dataframe_type.py | 23 +++- sdc/io/csv_ext.py | 159 ++++++++++--------------- sdc/str_arr_type.py | 2 + sdc/tests/test_io.py | 4 + 6 files changed, 139 insertions(+), 121 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_functions.py b/sdc/datatypes/hpat_pandas_functions.py index a82813836..7bbbf951d 100644 --- a/sdc/datatypes/hpat_pandas_functions.py +++ b/sdc/datatypes/hpat_pandas_functions.py @@ -39,12 +39,13 @@ from sdc.io.csv_ext import ( _gen_csv_reader_py_pyarrow_py_func, - _gen_csv_reader_py_pyarrow_func_text_dataframe, + _gen_pandas_read_csv_func_text, ) from sdc.str_arr_ext import string_array_type from sdc.hiframes import join, aggregate, sort from sdc.types import CategoricalDtypeType, Categorical +from sdc.datatypes.categorical.pdimpl import _reconstruct_CategoricalDtype def get_numba_array_types_for_csv(df): @@ -255,45 +256,69 @@ def sdc_pandas_read_csv( usecols = [col.literal_value for col in usecols] if infer_from_params: - # dtype should be constants and is important only for inference from params + # dtype is a tuple of format ('A', A_dtype, 'B', B_dtype, ...) + # where column names should be constants and is important only for inference from params if isinstance(dtype, types.Tuple): - assert all(isinstance(key, types.Literal) for key in dtype[::2]) + assert all(isinstance(key, types.StringLiteral) for key in dtype[::2]) keys = (k.literal_value for k in dtype[::2]) - values = dtype[1::2] - values = [v.typing_key if isinstance(v, types.Function) else v for v in values] - values = [types.Array(numba.from_dtype(np.dtype(v.literal_value)), 1, 'C') - if isinstance(v, types.Literal) else v for v in values] - values = [types.Array(types.int_, 1, 'C') if v == int else v for v in values] - values = [types.Array(types.float64, 1, 'C') if v == float else v for v in values] - values = [string_array_type if v == str else v for v in values] - values = [Categorical(v) if isinstance(v, CategoricalDtypeType) else v for v in values] - dtype = dict(zip(keys, values)) + def _get_df_col_type(dtype): + if isinstance(dtype, types.Function): + if dtype.typing_key == int: + return types.Array(types.int_, 1, 'C') + elif dtype.typing_key == float: + return types.Array(types.float64, 1, 'C') + elif dtype.typing_key == str: + return string_array_type + else: + assert False, f"map_dtype_to_col_type: failing to infer column type for dtype={dtype}" + + if isinstance(dtype, types.StringLiteral): + if dtype.literal_value == 'str': + return string_array_type + else: + return types.Array(numba.from_dtype(np.dtype(dtype.literal_value)), 1, 'C') + + if isinstance(dtype, types.NumberClass): + return types.Array(dtype.dtype, 1, 'C') + + if isinstance(dtype, CategoricalDtypeType): + return Categorical(dtype) + + col_types_map = dict(zip(keys, map(_get_df_col_type, values))) # in case of both are available # inferencing from params has priority over inferencing from file if infer_from_params: - col_names = names # all names should be in dtype - return_columns = usecols if usecols else names - col_typs = [dtype[n] for n in return_columns] + col_names = usecols if usecols else names + 
col_types = [col_types_map[n] for n in col_names] elif infer_from_file: - col_names, col_typs = infer_column_names_and_types_from_constant_filename( + col_names, col_types = infer_column_names_and_types_from_constant_filename( filepath_or_buffer, delimiter, names, usecols, skiprows) else: return None - dtype_present = not isinstance(dtype, (types.Omitted, type(None))) + def _get_py_col_dtype(ctype): + """ Re-creates column dtype as python type to be used in read_csv call """ + dtype = ctype.dtype + if ctype == string_array_type: + return str + if isinstance(ctype, Categorical): + return _reconstruct_CategoricalDtype(ctype.pd_dtype) + return numpy_support.as_dtype(dtype) + + py_col_dtypes = {cname: _get_py_col_dtype(ctype) for cname, ctype in zip(col_names, col_types)} # generate function text with signature and returning DataFrame - func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_dataframe( - col_names, col_typs, dtype_present, usecols, signature) + func_text, func_name, global_vars = _gen_pandas_read_csv_func_text( + col_names, col_types, py_col_dtypes, usecols, signature) # compile with Python - csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name) + csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars) return csv_reader_py diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index f863423a3..7c6d5e40a 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -26,7 +26,6 @@ import operator -from typing import NamedTuple import numba from numba import types @@ -39,7 +38,7 @@ from numba.core.imputils import impl_ret_new_ref, impl_ret_borrowed from sdc.hiframes.pd_series_ext import SeriesType -from sdc.hiframes.pd_dataframe_type import DataFrameType +from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc from sdc.str_ext import string_type @@ -54,10 +53,6 @@ def generic_resolve(self, df, attr): return SeriesType(arr_typ.dtype, arr_typ, df.index, True) -class ColumnLoc(NamedTuple): - type_id: int - col_id: int - def get_structure_maps(col_types, col_names): # Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)} diff --git a/sdc/hiframes/pd_dataframe_type.py b/sdc/hiframes/pd_dataframe_type.py index a962329c3..85255b22c 100644 --- a/sdc/hiframes/pd_dataframe_type.py +++ b/sdc/hiframes/pd_dataframe_type.py @@ -24,6 +24,8 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import re +from typing import NamedTuple import numba from numba import types @@ -48,7 +50,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, column self.has_parent = has_parent self.column_loc = column_loc super(DataFrameType, self).__init__( - name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent)) + name="DataFrameType({}, {}, {}, {})".format(data, index, columns, has_parent)) def copy(self, index=None, has_parent=None): # XXX is copy necessary? @@ -83,6 +85,16 @@ def unify(self, typingctx, other): def is_precise(self): return all(a.is_precise() for a in self.data) and self.index.is_precise() + def __repr__(self): + + # To have correct repr of DataFrame we need some changes to what types.Type gives: + # (1) e.g. 
array(int64, 1d, C) should be Array(int64, 1, 'C') + # (2) ColumnLoc is not part of DataFrame name, so we need to add it + default_repr = super(DataFrameType, self).__repr__() + res = re.sub(r'array\((\w+), 1d, C\)', r'Array(\1, 1, \'C\')', default_repr) + res = re.sub(r'\)$', f', column_loc={self.column_loc})', res) + return res + @register_model(DataFrameType) class DataFrameModel(models.StructModel): @@ -104,6 +116,15 @@ def __init__(self, dmm, fe_type): super(DataFrameModel, self).__init__(dmm, fe_type, members) +class ColumnLoc(NamedTuple): + type_id: int + col_id: int + + +# FIXME_Numba#3372: add into numba.types to allow returning from objmode +types.DataFrameType = DataFrameType +types.ColumnLoc = ColumnLoc + make_attribute_wrapper(DataFrameType, 'data', '_data') make_attribute_wrapper(DataFrameType, 'index', '_index') make_attribute_wrapper(DataFrameType, 'columns', '_columns') diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py index 2a569945d..78aaafeb6 100644 --- a/sdc/io/csv_ext.py +++ b/sdc/io/csv_ext.py @@ -24,7 +24,6 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** - import contextlib import functools @@ -45,6 +44,8 @@ from sdc.utilities.utils import (debug_prints, alloc_arr_tup, empty_like_type, _numba_to_c_type_map) from sdc.distributed_analysis import Distribution +from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc +from sdc.hiframes.pd_dataframe_ext import get_structure_maps from sdc.str_ext import string_type from sdc.str_arr_ext import (string_array_type, to_string_list, cp_str_list_to_array, str_list_to_array, @@ -56,13 +57,14 @@ import pandas as pd import numpy as np -from sdc.types import CategoricalDtypeType, Categorical +from sdc.types import Categorical import pyarrow import pyarrow.csv class CsvReader(ir.Stmt): + def __init__(self, file_name, df_out, sep, df_colnames, out_vars, out_types, usecols, loc, skiprows=0): self.file_name = file_name self.df_out = df_out @@ -263,8 +265,10 @@ def csv_distributed_run(csv_node, array_dists, typemap, calltypes, typingctx, ta # get global array sizes by calling allreduce on chunk lens # TODO: get global size from C for arr in csv_node.out_vars: + def f(A): return sdc.distributed_api.dist_reduce(len(A), np.int32(_op)) + f_block = compile_to_numba_ir( f, {'sdc': sdc, 'np': np, '_op': sdc.distributed_api.Reduce_Type.Sum.value}, @@ -287,6 +291,7 @@ def f(A): class StreamReaderType(types.Opaque): + def __init__(self): super(StreamReaderType, self).__init__(name='StreamReaderType') @@ -300,36 +305,6 @@ def box_stream_reader(typ, val, c): return val -def _get_dtype_str(t): - dtype = t.dtype - - if isinstance(t, Categorical): - # return categorical representation - # for some reason pandas and pyarrow read_csv() return CategoricalDtype with - # ordered=False in case when dtype is with ordered=None - return str(t).replace('ordered=None', 'ordered=False') - - if dtype == types.NPDatetime('ns'): - dtype = 'NPDatetime("ns")' - if t == string_array_type: - # HACK: add string_array_type to numba.types - # FIXME: fix after Numba #3372 is resolved - types.string_array_type = string_array_type - return 'string_array_type' - return '{}[::1]'.format(dtype) - - -def _get_pd_dtype_str(t): - dtype = t.dtype - if isinstance(t, Categorical): - return 'pd.{}'.format(t.pd_dtype) - if dtype == types.NPDatetime('ns'): - dtype = 'str' - if t == string_array_type: - return 'str' - return 'np.{}'.format(dtype) - - # XXX: temporary fix pending Numba's 
#3378 # keep the compiled functions around to make sure GC doesn't delete them and # the reference to the dynamic function inside them @@ -343,7 +318,7 @@ def to_varname(string): Replaces unavailable symbols with _ and insert _ if string starts with digit. """ import re - return re.sub(r'\W|^(?=\d)','_', string) + return re.sub(r'\W|^(?=\d)', '_', string) @contextlib.contextmanager @@ -358,10 +333,12 @@ def pyarrow_cpu_count(cpu_count=pyarrow.cpu_count()): def pyarrow_cpu_count_equal_numba_num_treads(func): """Decorator. Set pyarrow cpu_count the same as NUMBA_NUM_THREADS.""" + @functools.wraps(func) def wrapper(*args, **kwargs): with pyarrow_cpu_count(numba.config.NUMBA_NUM_THREADS): return func(*args, **kwargs) + return wrapper @@ -541,84 +518,81 @@ def pandas_read_csv( return dataframe -def _gen_csv_reader_py_pyarrow(col_names, col_typs, usecols, sep, typingctx, targetctx, parallel, skiprows): - func_text, func_name = _gen_csv_reader_py_pyarrow_func_text(col_names, col_typs, usecols, sep, skiprows) - csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name) - return _gen_csv_reader_py_pyarrow_jit_func(csv_reader_py) +def _gen_pandas_read_csv_func_text(col_names, col_typs, py_col_dtypes, usecols, signature=None): + func_name = 'csv_reader_py' + return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names + + column_loc, _, _ = get_structure_maps(col_typs, return_columns) + df_type = DataFrameType( + tuple(col_typs), + types.none, + tuple(col_names), + column_loc=column_loc + ) + + df_type_repr = repr(df_type) + # for some reason pandas and pyarrow read_csv() return CategoricalDtype with + # ordered=False in case when dtype is with ordered=None + df_type_repr = df_type_repr.replace('ordered=None', 'ordered=False') -def _gen_csv_reader_py_pyarrow_func_text_core(col_names, col_typs, dtype_present, usecols, signature=None): # TODO: support non-numpy types like strings date_inds = ", ".join(str(i) for i, t in enumerate(col_typs) if t.dtype == types.NPDatetime('ns')) return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names - nb_objmode_vars = ", ".join([ - '{}="{}"'.format(to_varname(cname), _get_dtype_str(t)) - for cname, t in zip(return_columns, col_typs) - ]) - pd_dtype_strs = ", ".join([ - "'{}': {}".format(cname, _get_pd_dtype_str(t)) - for cname, t in zip(return_columns, col_typs) - ]) if signature is None: signature = "filepath_or_buffer" - func_text = "def csv_reader_py({}):\n".format(signature) - func_text += " with objmode({}):\n".format(nb_objmode_vars) - func_text += " df = pandas_read_csv(filepath_or_buffer,\n" + + # map generated func params into values used in inner call of pandas_read_csv + # if no transformation is needed just use outer param name (since APIs match) + # otherwise use value in the dictionary + inner_call_params = {'parse_dates': f"[{date_inds}]"} + used_read_csv_params = ( + 'filepath_or_buffer', + 'names', + 'skiprows', + 'parse_dates', + 'dtype', + 'usecols', + 'sep', + 'delimiter' + ) # pyarrow reads unnamed header as " ", pandas reads it as "Unnamed: N" # during inference from file names should be raplaced with "Unnamed: N" # passing names to pyarrow means that one row is header and should be skipped if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)): - func_text += " names={},\n".format(col_names) - func_text += " skiprows=(skiprows and skiprows + 1) or 1,\n" - else: - func_text += " names=names,\n" - func_text += " skiprows=skiprows,\n" - - func_text += " 
parse_dates=[{}],\n".format(date_inds) - - # Python objects (e.g. str, np.float) could not be jitted and passed to objmode - # so they are hardcoded to function - # func_text += " dtype={{{}}},\n".format(pd_dtype_strs) if dtype_present else \ - # " dtype=dtype,\n" - # dtype is hardcoded because datetime should be read as string - func_text += " dtype={{{}}},\n".format(pd_dtype_strs) - - func_text += " usecols=usecols,\n" - func_text += " sep=sep,\n" - func_text += " delimiter=delimiter,\n" - func_text += " )\n" - for cname in return_columns: - func_text += " {} = df['{}'].values\n".format(to_varname(cname), cname) - # func_text += " print({})\n".format(cname) - return func_text, 'csv_reader_py' - - -def _gen_csv_reader_py_pyarrow_func_text(col_names, col_typs, usecols): - func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_core(col_names, col_typs, usecols) + inner_call_params['names'] = str(col_names) + inner_call_params['skiprows'] = "(skiprows and skiprows + 1) or 1" - func_text += " return ({},)\n".format(", ".join(to_varname(c) for c in col_names)) + # dtype parameter of compiled function is not used at all, instead a python dict + # of columns dtypes is captured at compile time, because some dtypes (like datetime) + # are converted and also to avoid penalty of creating dict in objmode + inner_call_params['dtype'] = 'read_as_dtypes' - return func_text, func_name - - -def _gen_csv_reader_py_pyarrow_func_text_dataframe(col_names, col_typs, dtype_present, usecols, signature): - func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_core( - col_names, col_typs, dtype_present, usecols, signature) - return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names + params_str = '\n'.join([ + f" {param}={inner_call_params.get(param, param)}," for param in used_read_csv_params + ]) + func_text = '\n'.join([ + f"def {func_name}({signature}):", + f" with objmode(df=\"{df_type_repr}\"):", + f" df = pandas_read_csv(\n{params_str}", + f" )", + f" return df" + ]) - func_text += " return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, None, {})\n".format( - ", ".join(to_varname(c) for c in return_columns), - ", ".join("'{}'".format(c) for c in return_columns) - ) + global_vars = { + 'read_as_dtypes': py_col_dtypes, + 'objmode': objmode, + 'pandas_read_csv': pandas_read_csv, + } - return func_text, func_name + return func_text, func_name, global_vars -def _gen_csv_reader_py_pyarrow_py_func(func_text, func_name): +def _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars): locals = {} - exec(func_text, globals(), locals) + exec(func_text, global_vars, locals) func = locals[func_name] return func @@ -628,6 +602,3 @@ def _gen_csv_reader_py_pyarrow_jit_func(csv_reader_py): jit_func = numba.njit(csv_reader_py) compiled_funcs.append(jit_func) return jit_func - - -_gen_csv_reader_py = _gen_csv_reader_py_pyarrow diff --git a/sdc/str_arr_type.py b/sdc/str_arr_type.py index b18967f63..c9cf59ce1 100644 --- a/sdc/str_arr_type.py +++ b/sdc/str_arr_type.py @@ -66,6 +66,8 @@ def copy(self): string_array_type = StringArrayType() +# FIXME_Numba#3372: add into numba.types to allow returning from objmode +types.StringArrayType = StringArrayType class StringArrayPayloadType(types.Type): diff --git a/sdc/tests/test_io.py b/sdc/tests/test_io.py index 768f2827e..55fead5ba 100644 --- a/sdc/tests/test_io.py +++ b/sdc/tests/test_io.py @@ -663,6 +663,8 @@ def test_impl(): return test_impl + @skip_numba_jit("Lowering error while returning df from objmode:\n" + "Can only insert i8* 
at [4] in {i8*, i8*, i64, i64, i8*, [1 x i64], [1 x i64]}: got i64*")
     def test_csv_cat1(self):
         test_impl = self.pd_csv_cat1()
         hpat_func = self.jit(test_impl)
@@ -682,6 +684,8 @@ def test_impl():
 
         return test_impl
 
+    @skip_numba_jit("Lowering error while returning df from objmode:\n"
+                    "Can only insert i8* at [4] in {i8*, i8*, i64, i64, i8*, [1 x i64], [1 x i64]}: got i64*")
     def test_csv_cat2(self):
         test_impl = self.pd_csv_cat2()
         hpat_func = self.jit(test_impl)

From 9deb0296987949322409b0ae6a22971d08c234a5 Mon Sep 17 00:00:00 2001
From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com>
Date: Wed, 11 Nov 2020 19:34:25 +0300
Subject: [PATCH 2/8] Adding debug decorator to print Numba compile stats
 (#925)

* Adding debug decorator to print Numba compile stats

This adds a decorator to print Numba compilation stats for a decorated
function and all nested functions (literally all compiled overloads).
Filtering can be done via decorator arguments.
NOTE: this may show slightly lower times than the compile time estimated
with the common first_run_exec_time - second_run_exec_time approach.

* Adding tests for compile time log format

* Remove check for CPUDispatcher to work with numba DPPL
---
 sdc/decorators.py              |  29 +++++++++
 sdc/tests/__init__.py          |   1 +
 sdc/tests/test_compile_time.py | 114 +++++++++++++++++++++++++++++++++
 sdc/utilities/utils.py         |  58 +++++++++++++++++
 4 files changed, 202 insertions(+)
 create mode 100644 sdc/tests/test_compile_time.py

diff --git a/sdc/decorators.py b/sdc/decorators.py
index aa5ad0f5e..adb779059 100644
--- a/sdc/decorators.py
+++ b/sdc/decorators.py
@@ -31,6 +31,9 @@
 import numba
 import sdc
 
+from functools import wraps
+from sdc.utilities.utils import print_compile_times
+
 
 def jit(signature_or_function=None, **options):
@@ -44,3 +47,29 @@ def jit(signature_or_function=None, **options):
     Use Numba compiler pipeline
     '''
     return numba.jit(signature_or_function, **options)
+
+
+def debug_compile_time(level=1, func_names=None):
+    """ Decorates a Numba Dispatcher object to print compile stats after each call.
+    Usage:
+    @debug_compile_time()
+    @numba.njit
+
+    Args:
+        level: if zero, prints only a short summary
+        func_names: filters output to include only functions whose names contain the listed strings
+    """
+
+    def get_wrapper(disp):
+
+        @wraps(disp)
+        def wrapper(*args, **kwargs):
+            res = disp(*args, **kwargs)
+            print('*' * 40, 'COMPILE STATS', '*' * 40)
+            print_compile_times(disp, level=level, func_names=func_names)
+            print('*' * 95)
+            return res
+
+        return wrapper
+
+    return get_wrapper
diff --git a/sdc/tests/__init__.py b/sdc/tests/__init__.py
index c36067f0f..eaba6a8a9 100644
--- a/sdc/tests/__init__.py
+++ b/sdc/tests/__init__.py
@@ -48,6 +48,7 @@
 from sdc.tests.test_sdc_numpy import *
 from sdc.tests.test_prange_utils import *
 
+from sdc.tests.test_compile_time import *
 
 # performance tests
 import sdc.tests.tests_perf
diff --git a/sdc/tests/test_compile_time.py b/sdc/tests/test_compile_time.py
new file mode 100644
index 000000000..03b5fd46a
--- /dev/null
+++ b/sdc/tests/test_compile_time.py
@@ -0,0 +1,114 @@
+# *****************************************************************************
+# Copyright (c) 2019-2020, Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numba +import numpy as np +import pandas as pd +import re +import unittest + +from contextlib import redirect_stdout +from io import StringIO +from sdc.tests.test_base import TestCase +from sdc.decorators import debug_compile_time + + +# regexp patterns for lines in @debug_compile_time output log +line_heading = r'\*+\s+COMPILE STATS\s+\*+\n' +line_function = r'Function: [^\s]+\n' +line_args = r'\s+Args:.*\n' +line_pipeline = r'\s+Pipeline: \w+\n' +line_passes = r'(\s+\w+\s+[\d.]+\n)+' +line_time = r'\s+Time: [\d.]+\n' +line_ending = r'\*+\n' + + +class TestCompileTime(TestCase): + + @staticmethod + def _gen_usecase_data(): + n = 11 + S1 = pd.Series(np.ones(n)) + S2 = pd.Series(2 ** np.arange(n)) + return S1, S2 + + def test_log_format_summary(self): + """ Verifies shortened log format when only summary info is printed """ + + @debug_compile_time(level=0) + @self.jit + def test_impl(S1, S2): + return S1 + S2 + + buffer = StringIO() + with redirect_stdout(buffer): + S1, S2 = self._gen_usecase_data() + test_impl(S1, S2) + + entry_format = fr'{line_function}{line_pipeline}{line_time}\n' + log_format = fr'^{line_heading}({entry_format})+{line_ending}$' + self.assertRegex(buffer.getvalue(), log_format) + + def test_log_format_detailed(self): + """ Verifies detailed log format with passes and args information """ + + @debug_compile_time() + @self.jit + def test_impl(S1, S2): + return S1 + S2 + + buffer = StringIO() + with redirect_stdout(buffer): + S1, S2 = self._gen_usecase_data() + test_impl(S1, S2) + + entry_format = fr'{line_function}{line_args}{line_pipeline}{line_passes}{line_time}\n' + log_format = fr'{line_heading}({entry_format})+{line_ending}' + self.assertRegex(buffer.getvalue(), log_format) + + def test_func_names_filter(self): + """ Verifies filtering log entries via func_names paramter """ + searched_name = 'add' + + @debug_compile_time(func_names=[searched_name]) + @self.jit + def test_impl(S1, S2): + return S1 + S2 + + buffer = StringIO() + with redirect_stdout(buffer): + S1, S2 = self._gen_usecase_data() + test_impl(S1, S2) + + line_function = r'Function: ([^\s]+)\n' + match_iter = re.finditer(line_function, buffer.getvalue()) + next(match_iter) # skip entry for top-level func + for m in match_iter: + self.assertIn(searched_name, m.group(1)) + + +if __name__ == "__main__": + unittest.main() diff --git a/sdc/utilities/utils.py b/sdc/utilities/utils.py index c17285a58..bdce82b0a 100644 --- 
a/sdc/utilities/utils.py
+++ b/sdc/utilities/utils.py
@@ -51,6 +51,8 @@
 from numba.extending import overload, overload_method, overload_attribute
 from numba.extending import register_jitable, register_model
 from numba.core.datamodel.registry import register_default
+from functools import wraps
+from itertools import filterfalse, chain
 
 
 # int values for types to pass to C code
@@ -686,3 +688,59 @@ def sdc_overload_attribute(typ, name, jit_options={}, parallel=None, strict=True
     return overload_attribute(
         typ, name, jit_options=jit_options, strict=strict, inline=inline, prefer_literal=prefer_literal
     )
+
+
+def print_compile_times(disp, level, func_names=None):
+
+    def print_times(cres, args):
+        print(f'Function: {cres.fndesc.unique_name}')
+        pad = ' ' * 2
+        if level:
+            print(f'{pad * 1}Args: {args}')
+        times = cres.metadata['pipeline_times']
+        for pipeline, pass_times in times.items():
+            print(f'{pad * 1}Pipeline: {pipeline}')
+            if level:
+                for name, timings in pass_times.items():
+                    print(f'{pad * 2}{name:50s}{timings.run:.13f}')
+
+            pipeline_total = sum(t.init + t.run + t.finalize for t in pass_times.values())
+            print(f'{pad * 1}Time: {pipeline_total}\n')
+
+    # print times for the compiled function indicated by disp
+    for args, cres in disp.overloads.items():
+        print_times(cres, args)
+
+    def has_no_cache(ovld):
+        return not (getattr(ovld, '_impl_cache', False) and ovld._impl_cache)
+
+    known_funcs = disp.typingctx._functions
+    all_templs = chain.from_iterable(known_funcs.values())
+    compiled_templs = filterfalse(has_no_cache, all_templs)
+
+    # keep only functions whose names match entries in the func_names list
+    if func_names:
+        compiled_templs = filterfalse(
+            lambda x: not any(f in str(x) for f in func_names),
+            compiled_templs
+        )
+
+    dispatchers_list = []
+    for template in compiled_templs:
+        tmpl_cached_impls = template._impl_cache.values()
+        dispatchers_list.extend(tmpl_cached_impls)
+
+    for impl_cache in set(dispatchers_list):
+        # impl_cache is usually a tuple of format (dispatcher, args)
+        # if not, just skip these entries
+        if not (isinstance(impl_cache, tuple)
+                and len(impl_cache) == 2
+                and isinstance(impl_cache[0], type(disp))):
+            continue
+
+        fndisp, args = impl_cache
+        if not getattr(fndisp, 'overloads', False):
+            continue
+
+        cres, = list(fndisp.overloads.values())
+        print_times(cres, args)

From ce4142a11bb23d571d274d5c185edd357b0621ee Mon Sep 17 00:00:00 2001
From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com>
Date: Thu, 12 Nov 2020 17:23:02 +0300
Subject: [PATCH 3/8] Remove column names from DataFrame model (#944)

Motivation: before this change column names were passed to the DF ctor as
arguments of LiteralString types (each name of its own type), which appears
to make LLVM IR size grow linearly with the number of columns and hence
hurts DF ctor compile time. Since this information is saved in the DF type
itself and can be captured in any DF method at typing time, it is proposed
to remove columns from the DF model struct as redundant.
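
To illustrate the capture-at-typing pattern this change relies on, here is
a condensed sketch of the df.columns overload added in this patch (names
match the diff below; argument checking omitted):

    @sdc_overload_attribute(DataFrameType, 'columns')
    def hpat_pandas_dataframe_columns(df):
        # 'df' here is the Numba type, not a runtime value, so the column
        # names are known at compile time and captured as a constant tuple
        df_columns = df.columns

        def hpat_pandas_df_columns_impl(df):
            return df_columns  # no lookup in the runtime DF struct needed

        return hpat_pandas_df_columns_impl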
--- examples/dataframe/dataframe_columns.py | 39 +++++++++++++++++++ .../hpat_pandas_dataframe_functions.py | 38 ++++++++++++++++++ .../hpat_pandas_groupby_functions.py | 4 +- sdc/hiframes/boxing.py | 25 ------------ sdc/hiframes/pd_dataframe_ext.py | 9 +---- sdc/hiframes/pd_dataframe_type.py | 2 - sdc/tests/test_dataframe.py | 3 -- 7 files changed, 81 insertions(+), 39 deletions(-) create mode 100644 examples/dataframe/dataframe_columns.py diff --git a/examples/dataframe/dataframe_columns.py b/examples/dataframe/dataframe_columns.py new file mode 100644 index 000000000..c4094bb8a --- /dev/null +++ b/examples/dataframe/dataframe_columns.py @@ -0,0 +1,39 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd +from numba import njit + + +@njit +def dataframe_columns(): + df = pd.DataFrame({'A': [1, 2], 'AA': [3, 4], 'B': [5, 6]}, index=['a', 'b']) + result = df.columns + + return result # A tuple of column names ('A', 'AA', 'B') + + +print(dataframe_columns()) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 8b3cf79bc..52e06b79f 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -116,6 +116,44 @@ def hpat_pandas_df_index_impl(df): return hpat_pandas_df_index_impl +@sdc_overload_attribute(DataFrameType, 'columns') +def hpat_pandas_dataframe_columns(df): + """ + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: pandas.DataFrame.columns + + Examples + -------- + .. literalinclude:: ../../../examples/dataframe/dataframe_columns.py + :language: python + :lines: 27- + :caption: The column names of the DataFrame. + :name: ex_dataframe_columns + + .. command-output:: python ./dataframe/dataframe_columns.py + :cwd: ../../../examples + + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Pandas DataFrame attribute :attr:`pandas.DataFrame.columns` implementation. + + .. 
only:: developer + Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_dataframe_columns* + """ + + ty_checker = TypeChecker('Attribute columns.') + ty_checker.check(df, DataFrameType) + + # no columns in DF model to avoid impact on DF ctor IR size (captured when needed only) + df_columns = df.columns + + def hpat_pandas_df_columns_impl(df): + return df_columns + + return hpat_pandas_df_columns_impl + + def sdc_pandas_dataframe_values_codegen(self, numba_common_dtype): """ Example of generated implementation: diff --git a/sdc/datatypes/hpat_pandas_groupby_functions.py b/sdc/datatypes/hpat_pandas_groupby_functions.py index 9d5d86878..83f752e9d 100644 --- a/sdc/datatypes/hpat_pandas_groupby_functions.py +++ b/sdc/datatypes/hpat_pandas_groupby_functions.py @@ -160,6 +160,8 @@ def sdc_pandas_dataframe_getitem(self, idx): target_col_loc = self.parent.column_loc[self.parent.columns[target_col_id_literal]] target_type_id, target_col_id = target_col_loc.type_id, target_col_loc.col_id + parent_df_col_names = self.parent.columns + def sdc_pandas_dataframe_getitem_common_impl(self, idx): # calling getitem twice raises IndexError, just as in pandas @@ -170,7 +172,7 @@ def sdc_pandas_dataframe_getitem_common_impl(self, idx): # no need to pass index into this series, as we group by array target_series = pandas.Series( data=self._parent._data[target_type_id][target_col_id], - name=self._parent._columns[target_col_id_literal] + name=parent_df_col_names[target_col_id_literal] ) by_arr_data = self._parent._data[by_type_id][by_col_id] return init_series_groupby(target_series, by_arr_data, self._data, self._sort) diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index 232096472..ac14e1ce9 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -88,30 +88,11 @@ def unbox_dataframe(typ, val, c): columns will be extracted later if necessary. """ n_cols = len(typ.columns) - column_strs = [numba.cpython.unicode.make_string_from_constant( - c.context, c.builder, string_type, a) for a in typ.columns] # create dataframe struct and store values dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder) errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) - col_list_type = types.List(string_type) - ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, col_list_type, n_cols) - - with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok): - with if_ok: - inst.size = c.context.get_constant(types.intp, n_cols) - for i, column_str in enumerate(column_strs): - inst.setitem(c.context.get_constant(types.intp, i), column_str, incref=False) - dataframe.columns = inst.value - - with if_not_ok: - c.builder.store(cgutils.true_bit, errorptr) - - # If an error occurred, drop the whole native list - with c.builder.if_then(c.builder.load(errorptr)): - c.context.nrt.decref(c.builder, col_list_type, inst.value) - _, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns) for col_typ in types_order: @@ -150,12 +131,6 @@ def unbox_dataframe(typ, val, c): dataframe.parent = val - # increase refcount of stored values - if c.context.enable_nrt: - # TODO: other objects? 
- for var in column_strs: - c.context.nrt.incref(c.builder, string_type, var) - return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr)) diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index 7c6d5e40a..5c77bc41a 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -100,8 +100,7 @@ def codegen(context, builder, signature, args): in_tup = args[0] data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)] index = builder.extract_value(in_tup, n_cols) - column_strs = [numba.cpython.unicode.make_string_from_constant( - context, builder, string_type, c) for c in column_names] + # create dataframe struct and store values dataframe = cgutils.create_struct_proxy( signature.return_type)(context, builder) @@ -117,12 +116,8 @@ def codegen(context, builder, signature, args): data_tup = context.make_tuple( builder, types.Tuple(data_list_type), data_lists) - col_list_type = types.List(string_type) - column_list = context.build_list(builder, col_list_type, column_strs) - dataframe.data = data_tup dataframe.index = index - dataframe.columns = column_list dataframe.parent = context.get_constant_null(types.pyobject) # increase refcount of stored values @@ -130,8 +125,6 @@ def codegen(context, builder, signature, args): context.nrt.incref(builder, index_typ, index) for var, typ in zip(data_arrs, data_typs): context.nrt.incref(builder, typ, var) - for var in column_strs: - context.nrt.incref(builder, string_type, var) return dataframe._getvalue() diff --git a/sdc/hiframes/pd_dataframe_type.py b/sdc/hiframes/pd_dataframe_type.py index 85255b22c..c600a0209 100644 --- a/sdc/hiframes/pd_dataframe_type.py +++ b/sdc/hiframes/pd_dataframe_type.py @@ -110,7 +110,6 @@ def __init__(self, dmm, fe_type): members = [ ('data', types.Tuple([types.List(typ) for typ in df_types])), ('index', fe_type.index), - ('columns', types.List(string_type)), ('parent', types.pyobject), ] super(DataFrameModel, self).__init__(dmm, fe_type, members) @@ -127,6 +126,5 @@ class ColumnLoc(NamedTuple): make_attribute_wrapper(DataFrameType, 'data', '_data') make_attribute_wrapper(DataFrameType, 'index', '_index') -make_attribute_wrapper(DataFrameType, 'columns', '_columns') make_attribute_wrapper(DataFrameType, 'unboxed', '_unboxed') make_attribute_wrapper(DataFrameType, 'parent', '_parent') diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 9102c916f..f8f15e121 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -242,7 +242,6 @@ def test_impl(df): hpat_func = self.jit(test_impl) pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) - @skip_numba_jit def test_box1(self): def test_impl(n): df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}) @@ -2557,7 +2556,6 @@ def test_impl(): hpat_func = self.jit(test_impl) pd.testing.assert_series_equal(hpat_func(), test_impl(), check_names=False) - @unittest.skip("Implement getting columns attribute") def test_dataframe_columns_attribute(self): def test_impl(): df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]}) @@ -2566,7 +2564,6 @@ def test_impl(): hpat_func = self.jit(test_impl) np.testing.assert_array_equal(hpat_func(), test_impl()) - @unittest.skip("Implement getting columns attribute") def test_dataframe_columns_iterator(self): def test_impl(): df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]}) From 70b5ae813243d120571eec908e3b7bf1178cb8dd Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Tue, 
17 Nov 2020 17:46:10 +0300
Subject: [PATCH 4/8] Implements init_dataframe as multiple codegen functions
 (#936)

Motivation: init_dataframe was implemented via a Numba intrinsic taking
*args, which seems to generate redundant extractvalue/insertvalue LLVM
instructions, producing quadratic IR when the number of DF columns grows
and affecting total compilation time of functions that create large DFs.
This PR replaces the single init_dataframe with multiple functions
generated at compile time based on the number of columns in a DF, thus
avoiding the use of *args.
---
 .../hpat_pandas_dataframe_functions.py |   2 +-
 sdc/hiframes/api.py                    |  10 +-
 sdc/hiframes/pd_dataframe_ext.py       |  53 ------
 sdc/hiframes/pd_dataframe_type.py      |   1 -
 sdc/hiframes/pd_series_ext.py          |   2 +-
 sdc/rewrites/dataframe_constructor.py  | 159 +++++++++++++++---
 sdc/tests/test_dataframe.py            |  27 ++-
 7 files changed, 165 insertions(+), 89 deletions(-)

diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
index 52e06b79f..98c724705 100644
--- a/sdc/datatypes/hpat_pandas_dataframe_functions.py
+++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -1500,7 +1500,7 @@ def df_getitem_slice_idx_main_codelines(self, idx):
         res_data = f'res_data_{i}'
         func_lines += [
             f'  data_{i} = self._data[{type_id}][{col_id}][idx]',
-            f'  {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")'
+            f'  {res_data} = data_{i}'
         ]
         results.append((col, res_data))
 
diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py
index b1e22f536..77436f49b 100644
--- a/sdc/hiframes/api.py
+++ b/sdc/hiframes/api.py
@@ -167,25 +167,25 @@ def fix_df_array_list_str_impl(column):  # pragma: no cover
     return lambda column: column
 
 
-def fix_df_index(index, *columns):
+def fix_df_index(index):
     return index
 
 
 @overload(fix_df_index)
-def fix_df_index_overload(index, *columns):
+def fix_df_index_overload(index):
 
     # TO-DO: replace types.none index with separate type, e.g. DefaultIndex
     if (index is None or isinstance(index, types.NoneType)):
-        def fix_df_index_impl(index, *columns):
+        def fix_df_index_impl(index):
             return None
 
     elif isinstance(index, RangeIndexType):
-        def fix_df_index_impl(index, *columns):
+        def fix_df_index_impl(index):
             return index
 
     else:  # default case, transform index the same as df data
-        def fix_df_index_impl(index, *columns):
+        def fix_df_index_impl(index):
             return fix_df_array(index)
 
     return fix_df_index_impl
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
index 5c77bc41a..56218e99c 100644
--- a/sdc/hiframes/pd_dataframe_ext.py
+++ b/sdc/hiframes/pd_dataframe_ext.py
@@ -80,59 +80,6 @@ def get_structure_maps(col_types, col_names):
     return column_loc, data_typs_map, types_order
 
 
-@intrinsic
-def init_dataframe(typingctx, *args):
-    """Create a DataFrame with provided data, index and columns values.
-    Used as a single constructor for DataFrame and assigning its data, so that
-    optimization passes can look for init_dataframe() to see if underlying
-    data has changed, and get the array variables from init_dataframe() args if
-    not changed.
- """ - - n_cols = len(args) // 2 - data_typs = tuple(args[:n_cols]) - index_typ = args[n_cols] - column_names = tuple(a.literal_value for a in args[n_cols + 1:]) - - column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names) - - def codegen(context, builder, signature, args): - in_tup = args[0] - data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)] - index = builder.extract_value(in_tup, n_cols) - - # create dataframe struct and store values - dataframe = cgutils.create_struct_proxy( - signature.return_type)(context, builder) - - data_list_type = [types.List(typ) for typ in types_order] - - data_lists = [] - for typ_id, typ in enumerate(types_order): - data_list_typ = context.build_list(builder, data_list_type[typ_id], - [data_arrs[data_id] for data_id in data_typs_map[typ][1]]) - data_lists.append(data_list_typ) - - data_tup = context.make_tuple( - builder, types.Tuple(data_list_type), data_lists) - - dataframe.data = data_tup - dataframe.index = index - dataframe.parent = context.get_constant_null(types.pyobject) - - # increase refcount of stored values - if context.enable_nrt: - context.nrt.incref(builder, index_typ, index) - for var, typ in zip(data_arrs, data_typs): - context.nrt.incref(builder, typ, var) - - return dataframe._getvalue() - - ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc) - sig = signature(ret_typ, types.Tuple(args)) - return sig, codegen - - # TODO: alias analysis # this function should be used for getting df._data for alias analysis to work # no_cpython_wrapper since Array(DatetimeDate) cannot be boxed diff --git a/sdc/hiframes/pd_dataframe_type.py b/sdc/hiframes/pd_dataframe_type.py index c600a0209..9dd5fcaf4 100644 --- a/sdc/hiframes/pd_dataframe_type.py +++ b/sdc/hiframes/pd_dataframe_type.py @@ -126,5 +126,4 @@ class ColumnLoc(NamedTuple): make_attribute_wrapper(DataFrameType, 'data', '_data') make_attribute_wrapper(DataFrameType, 'index', '_index') -make_attribute_wrapper(DataFrameType, 'unboxed', '_unboxed') make_attribute_wrapper(DataFrameType, 'parent', '_parent') diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index 245426643..d48aaf0f1 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -138,7 +138,7 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): fix_data = sdc.hiframes.api.fix_df_array(data) - fix_index = sdc.hiframes.api.fix_df_index(index, fix_data) + fix_index = sdc.hiframes.api.fix_df_index(index) return sdc.hiframes.api.init_series(fix_data, fix_index, name) return hpat_pandas_series_ctor_impl diff --git a/sdc/rewrites/dataframe_constructor.py b/sdc/rewrites/dataframe_constructor.py index c9538759e..debf0b73c 100644 --- a/sdc/rewrites/dataframe_constructor.py +++ b/sdc/rewrites/dataframe_constructor.py @@ -24,13 +24,18 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** - +import numba +from numba.core import cgutils, types from numba.core.rewrites import (register_rewrite, Rewrite) from numba.core.ir_utils import (guard, find_callname) from numba.core.ir import (Expr) from numba.extending import overload +from numba.core.extending import intrinsic +from numba.core.typing import signature from pandas import DataFrame +from sys import modules +from textwrap import dedent from sdc.rewrites.ir_utils import (find_operations, is_dict, get_tuple_items, get_dict_items, remove_unused_recursively, @@ -38,9 +43,11 @@ declare_constant, import_function, make_call, insert_before) -from sdc.hiframes.pd_dataframe_ext import (init_dataframe, DataFrameType) - +from sdc.hiframes import pd_dataframe_ext as pd_dataframe_ext_module +from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc +from sdc.hiframes.pd_dataframe_ext import get_structure_maps from sdc.hiframes.api import fix_df_array, fix_df_index +from sdc.str_ext import string_type @register_rewrite('before-inference') @@ -54,6 +61,7 @@ class RewriteDataFrame(Rewrite): _df_arg_list = ('data', 'index', 'columns', 'dtype', 'copy') def __init__(self, pipeline): + self._pipeline = pipeline super().__init__(pipeline) self._reset() @@ -79,18 +87,45 @@ def match(self, func_ir, block, typemap, calltypes): return len(self._calls_to_rewrite) > 0 def apply(self): - init_df_stmt = import_function(init_dataframe, self._block, self._func_ir) - for stmt in self._calls_to_rewrite: args = get_call_parameters(call=stmt.value, arg_names=self._df_arg_list) - old_data = args['data'] - args['data'], args['columns'] = self._extract_dict_args(args, self._func_ir) + args_len = len(args['data']) + func_name = f'init_dataframe_{args_len}' + + # injected_module = modules[pd_dataframe_ext_module.__name__] + init_df = getattr(pd_dataframe_ext_module, func_name, None) + if init_df is None: + init_df_text = gen_init_dataframe_text(func_name, args_len) + init_df = gen_init_dataframe_func( + func_name, + init_df_text, + { + 'numba': numba, + 'cgutils': cgutils, + 'signature': signature, + 'types': types, + 'get_structure_maps': get_structure_maps, + 'intrinsic': intrinsic, + 'DataFrameType': DataFrameType, + 'ColumnLoc': ColumnLoc, + 'string_type': string_type, + 'intrinsic': intrinsic, + 'fix_df_array': fix_df_array, + 'fix_df_index': fix_df_index + }) + + setattr(pd_dataframe_ext_module, func_name, init_df) + init_df.__module__ = pd_dataframe_ext_module.__name__ + init_df._defn.__module__ = pd_dataframe_ext_module.__name__ + + init_df_stmt = import_function(init_df, self._block, self._func_ir) self._replace_call(stmt, init_df_stmt.target, args, self._block, self._func_ir) remove_unused_recursively(old_data, self._block, self._func_ir) + self._pipeline.typingctx.refresh() return self._block @@ -130,42 +165,112 @@ def _replace_call(stmt, new_call, args, block, func_ir): columns_args = args['columns'] index_args = args.get('index') - data_args = RewriteDataFrame._replace_data_with_arrays(data_args, stmt, block, func_ir) - if index_args is None: # index arg was omitted none_stmt = declare_constant(None, block, func_ir, stmt.loc) index_args = none_stmt.target - index_and_data_args = [index_args] + data_args - index_args = RewriteDataFrame._replace_index_with_arrays(index_and_data_args, stmt, block, func_ir) + index_args = [index_args] all_args = data_args + index_args + columns_args call = Expr.call(new_call, all_args, {}, func.loc) stmt.value = call - 
@staticmethod - def _replace_data_with_arrays(args, stmt, block, func_ir): - new_args = [] - for var in args: - call_stmt = make_call(fix_df_array, [var], {}, block, func_ir, var.loc) - insert_before(block, call_stmt, stmt) - new_args.append(call_stmt.target) +def gen_init_dataframe_text(func_name, n_cols): + args_col_data = ['c' + str(i) for i in range(n_cols)] + args_col_names = ['n' + str(i) for i in range(n_cols)] + params = ', '.join(args_col_data + ['index'] + args_col_names) + suffix = ('' if n_cols == 0 else ', ') + + func_text = dedent( + f''' + @intrinsic + def {func_name}(typingctx, {params}): + """Create a DataFrame with provided columns data and index values. + Takes 2n+1 args: n columns data, index data and n column names. + Each column data is passed as separate argument to have compact LLVM IR. + Used as as generic constructor for native DataFrame objects, which + can be used with different input column types (e.g. lists), and + resulting DataFrameType is deduced by applying transform functions + (fix_df_array and fix_df_index) to input argument types. + """ + + n_cols = {n_cols} + + input_data_typs = ({', '.join(args_col_data) + suffix}) + fnty = typingctx.resolve_value_type(fix_df_array) + fixed_col_sigs = [] + for i in range({n_cols}): + to_sig = fnty.get_call_type(typingctx, (input_data_typs[i],), {{}}) + fixed_col_sigs.append(to_sig) + data_typs = tuple(fixed_col_sigs[i].return_type for i in range({n_cols})) + need_fix_cols = tuple(data_typs[i] != input_data_typs[i] for i in range({n_cols})) + + input_index_typ = index + fnty = typingctx.resolve_value_type(fix_df_index) + fixed_index_sig = fnty.get_call_type(typingctx, (input_index_typ,), {{}}) + index_typ = fixed_index_sig.return_type + need_fix_index = index_typ != input_index_typ + + column_names = tuple(a.literal_value for a in ({', '.join(args_col_names) + suffix})) + column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names) + col_needs_transform = tuple(not isinstance(data_typs[i], types.Array) for i in range(len(data_typs))) + + def codegen(context, builder, sig, args): + {params}, = args + data_arrs = [{', '.join(args_col_data) + suffix}] + data_arrs_transformed = [] + for i, arr in enumerate(data_arrs): + if need_fix_cols[i] == False: + data_arrs_transformed.append(arr) + else: + res = context.compile_internal(builder, lambda a: fix_df_array(a), fixed_col_sigs[i], [arr]) + data_arrs_transformed.append(res) - return new_args + # create dataframe struct and store values + dataframe = cgutils.create_struct_proxy( + sig.return_type)(context, builder) - @staticmethod - def _replace_index_with_arrays(args, stmt, block, func_ir): - new_args = [] + data_list_type = [types.List(typ) for typ in types_order] + + data_lists = [] + for typ_id, typ in enumerate(types_order): + data_arrs_of_typ = [data_arrs_transformed[data_id] for data_id in data_typs_map[typ][1]] + data_list_typ = context.build_list(builder, data_list_type[typ_id], data_arrs_of_typ) + data_lists.append(data_list_typ) + + data_tup = context.make_tuple( + builder, types.Tuple(data_list_type), data_lists) + + if need_fix_index == True: + index = context.compile_internal(builder, lambda a: fix_df_index(a), fixed_index_sig, [index]) + + dataframe.data = data_tup + dataframe.index = index + dataframe.parent = context.get_constant_null(types.pyobject) + + # increase refcount of stored values + if context.enable_nrt: + context.nrt.incref(builder, index_typ, index) + for var, typ in zip(data_arrs_transformed, data_typs): + 
context.nrt.incref(builder, typ, var) + + return dataframe._getvalue() + + ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc) + sig = signature(ret_typ, {params}) + return sig, codegen + ''') + + return func_text - call_stmt = make_call(fix_df_index, args, {}, block, func_ir, args[0].loc) - insert_before(block, call_stmt, stmt) - new_args.append(call_stmt.target) - return new_args +def gen_init_dataframe_func(func_name, func_text, global_vars): - return new_args + loc_vars = {} + exec(func_text, global_vars, loc_vars) + return loc_vars[func_name] @overload(DataFrame) diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index f8f15e121..d2b2d3547 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -98,6 +98,31 @@ def test_impl(n): n = 11 self.assertEqual(hpat_func(n), test_impl(n)) + def test_create_empty_df(self): + """ Verifies empty DF can be created """ + def test_impl(): + df = pd.DataFrame({}) + return len(df) + hpat_func = self.jit(test_impl) + + self.assertEqual(hpat_func(), test_impl()) + + def test_create_multiple_dfs(self): + """ Verifies generated dataframe ctor is added to pd_dataframe_ext module + correctly (and numba global context is refreshed), so that subsequent + compilations are not broken. """ + def test_impl(a, b, c): + df1 = pd.DataFrame({'A': a, 'B': b}) + df2 = pd.DataFrame({'C': c}) + total_cols = len(df1.columns) + len(df2.columns) + return total_cols + hpat_func = self.jit(test_impl) + + a1 = np.array([1, 2, 3, 4.0, 5]) + a2 = [7, 6, 5, 4, 3] + a3 = ['a', 'b', 'c', 'd', 'e'] + self.assertEqual(hpat_func(a1, a2, a3), test_impl(a1, a2, a3)) + def test_create_str(self): def test_impl(): df = pd.DataFrame({'A': ['a', 'b', 'c']}) @@ -159,7 +184,7 @@ def test_impl(A, B, index): result_ref = test_impl(A, B, index) pd.testing.assert_frame_equal(result, result_ref) - def test_create_empty_df(self): + def test_unbox_empty_df(self): def test_impl(df): return df sdc_func = self.jit(test_impl) From 8520b6238d541c0aa1c55476b10b2099420bbf2f Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Fri, 20 Nov 2020 19:08:50 +0300 Subject: [PATCH 5/8] Fix for python objects leaked in df unboxing (#946) --- sdc/hiframes/boxing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index ac14e1ce9..5e6930da9 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -115,6 +115,8 @@ def unbox_dataframe(typ, val, c): native_val = _unbox_series_data(column_dtype, ty_series, arr_obj, c) inst.setitem(c.context.get_constant(types.intp, i), native_val.value, incref=False) + c.pyapi.decref(arr_obj) + c.pyapi.decref(series_obj) dataframe.data = c.builder.insert_value(dataframe.data, inst.value, type_id) From d5f706fe42a45c33153e3fbca25cc31c4f28b966 Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Mon, 7 Dec 2020 17:05:29 +0300 Subject: [PATCH 6/8] Refactor df.drop to improve compile times (#945) --- .../hpat_pandas_dataframe_functions.py | 132 +++++++--- sdc/hiframes/pd_dataframe_ext.py | 26 +- sdc/tests/test_dataframe.py | 235 ++++++++++++------ 3 files changed, 277 insertions(+), 116 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 98c724705..56ae09d1e 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -52,6 +52,7 
@@ from sdc.datatypes.range_index_type import RangeIndexType from sdc.hiframes.pd_dataframe_type import DataFrameType +from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps from sdc.hiframes.pd_series_type import SeriesType from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType, @@ -1337,40 +1338,69 @@ def isna_overload(df): return sdc_pandas_dataframe_isna_codegen(df, 'isna') -def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols): +def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_col_names): """ Example of generated implementation: def sdc_pandas_dataframe_drop_impl(df, labels=None, axis=0, index=None, columns=None, - level=None, inplace=False, errors="raise"): - new_col_0_data_df = df._data[1][0] - new_col_1_data_df = df._data[0][1] - return pandas.DataFrame({"B": new_col_0_data_df, "C": new_col_1_data_df}, index=df.index) + level=None, inplace=False, errors="raise"): + list_0 = df._data[0].copy() + for col_id in old_scheme_drop_idxs_0[::-1]: + list_0.pop(col_id) + list_1 = df._data[1].copy() + new_data = (list_1, list_0, ) + return init_dataframe_internal(new_data, df._index, df_type) """ indent = 4 * ' ' - saved_df_columns = [column for column in df.columns if column not in drop_cols] - func_definition = [f'def sdc_pandas_dataframe_{func_name}_impl({", ".join(func_args)}):'] + func_definition = [f'def {func_name}({", ".join(func_args)}):'] func_text = [] - column_list = [] - for label in drop_cols: + old_column_loc, old_data_typs_map, old_types_order = get_structure_maps(df.data, df.columns) + + new_data_typs = tuple(t for i, t in enumerate(df.data) if df.columns[i] not in drop_col_names) + new_column_names = tuple(c for c in df.columns if c not in drop_col_names) + new_column_loc, new_data_typs_map, new_types_order = get_structure_maps(new_data_typs, new_column_names) + + old_types_idxs_map = dict(zip(old_types_order, range(len(old_types_order)))) + reorder_scheme = tuple(old_types_idxs_map[t] for t in new_types_order) + df_type = DataFrameType(new_data_typs, df.index, new_column_names, column_loc=new_column_loc) + + old_scheme_drop_idxs = [] + for i, k in enumerate(old_types_order): + a = [j for j, x in enumerate(old_data_typs_map[k][1]) if df.columns[x] in drop_col_names] + old_scheme_drop_idxs.append(tuple(a) or None) + + for label in drop_col_names: if label not in df.columns: func_text.append(f'if errors == "raise":') func_text.append(indent + f'raise ValueError("The label {label} is not found in the selected axis")') break - for column_id, column_name in enumerate(saved_df_columns): - col_loc = df.column_loc[column_name] - type_id, col_id = col_loc.type_id, col_loc.col_id - func_text.append(f'new_col_{column_id}_data_df = df._data[{type_id}][{col_id}]') - column_list.append((f'new_col_{column_id}_data_df', column_name)) - - data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list) - index = 'df.index' - func_text.append(f"return pandas.DataFrame({{{data}}}, index={index})\n") + old_ntypes = len(old_types_order) + for type_id in range(old_ntypes): + func_text.append(f'list_{type_id} = df._data[{type_id}].copy()') + if old_scheme_drop_idxs[type_id]: + func_text.append(f'for col_id in old_scheme_drop_idxs_{type_id}[::-1]:') + func_text.append(indent + f'list_{type_id}.pop(col_id)') + + # in new df the order of array lists (i.e. 
types_order) can be different, so
+    # a new tuple of lists is built, reordered as needed
+    new_ntypes = len(new_types_order)
+    data_lists_reordered = ', '.join(['list_' + str(reorder_scheme[i]) for i in range(new_ntypes)])
+    data_val = '(' + data_lists_reordered + ', )' if new_ntypes > 0 else '()'
+
+    data, index = 'new_data', 'df._index'
+    func_text.append(f'{data} = {data_val}')
+    func_text.append(f"return init_dataframe_internal({data}, {index}, df_type)\n")

     func_definition.extend([indent + func_line for func_line in func_text])
     func_def = '\n'.join(func_definition)
-    global_vars = {'pandas': pandas}
+    global_vars = {
+        'pandas': pandas,
+        'init_dataframe_internal': init_dataframe_internal,
+        'df_type': df_type
+    }
+
+    global_vars.update({f'old_scheme_drop_idxs_{i}': old_scheme_drop_idxs[i] for i in range(old_ntypes)})

     return func_def, global_vars

@@ -1387,7 +1417,8 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
     -----------
     - Parameters ``labels``, ``axis``, ``index``, ``level`` and ``inplace`` are currently unsupported.
     - Parameter ``columns`` is required and is expected to be a Literal value with one column name
-      or Tuple with columns names.
+      or a List with column names. Mutating a list of column names after it was defined and then using it as
+      the columns argument results in an SDCLimitation exception at runtime.
     - Supported ``errors`` can be {``raise``, ``ignore``}, default ``raise``.
       If ``ignore``, suppress error and only existing labels are dropped.
@@ -1420,36 +1451,66 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,

     """

-    _func_name = 'drop'
+    method_name = 'Method drop().'

-    ty_checker = TypeChecker(f'Method {_func_name}().')
+    ty_checker = TypeChecker(method_name)
     ty_checker.check(df, DataFrameType)

-    if not isinstance(labels, types.Omitted) and labels is not None:
+    if not isinstance(labels, (types.Omitted, types.NoneType)) and labels is not None:
         ty_checker.raise_exc(labels, 'None', 'labels')

-    if not isinstance(axis, (int, types.Omitted)):
+    if not isinstance(axis, (types.Omitted, types.Integer)) and axis != 0:
         ty_checker.raise_exc(axis, 'int', 'axis')

-    if not isinstance(index, types.Omitted) and index is not None:
+    if not isinstance(index, (types.Omitted, types.NoneType)) and index is not None:
         ty_checker.raise_exc(index, 'None', 'index')

-    if not isinstance(columns, (types.Omitted, types.Tuple, types.Literal)):
-        ty_checker.raise_exc(columns, 'str, tuple of str', 'columns')
+    if not (isinstance(columns, (types.Omitted, types.StringLiteral))
+            or (isinstance(columns, types.Tuple)
+                and all(isinstance(c, types.StringLiteral) for c in columns))
+            or (isinstance(columns, types.UniTuple) and isinstance(columns.dtype, types.StringLiteral))
+            or isinstance(columns, types.List) and isinstance(columns.dtype, types.UnicodeType)
+            ):
+        ty_checker.raise_exc(columns, 'str, list of const str', 'columns')

-    if not isinstance(level, (types.Omitted, types.Literal)) and level is not None:
+    if not isinstance(level, (types.Omitted, types.NoneType, types.Literal)) and level is not None:
         ty_checker.raise_exc(level, 'None', 'level')

-    if not isinstance(inplace, (bool, types.Omitted)) and inplace:
+    if not isinstance(inplace, (types.Omitted, types.NoneType, types.Boolean)) and inplace:
         ty_checker.raise_exc(inplace, 'bool', 'inplace')

-    if not isinstance(errors, (str, types.Omitted, types.Literal)):
+    if not isinstance(errors, (types.Omitted, types.UnicodeType, types.StringLiteral)) and errors != "raise":
        ty_checker.raise_exc(errors, 'str', 'errors')

+    if isinstance(columns, types.List):
+        if columns.initial_value is None:
+            raise TypingError('{} Unsupported use of parameter columns:'
+                              ' expected list of constant strings. Given: {}'.format(method_name, columns))
+        else:
+            # this works because a global tuple of strings is captured as a Tuple of StringLiterals
+            columns_as_tuple = tuple(columns.initial_value)
+            def _sdc_pandas_dataframe_drop_wrapper_impl(df, labels=None, axis=0, index=None,
+                                                        columns=None, level=None, inplace=False, errors="raise"):
+
+                # if at runtime the columns list differs from its initial value (known at compile time),
+                # we cannot tell which columns to drop or what the resulting DataFrameType is, so raise an exception
+                if list(columns_as_tuple) != columns:
+                    raise SDCLimitation("Unsupported use of parameter columns: non-const list was used.")
+
+                return df.drop(labels=labels,
+                               axis=axis,
+                               index=index,
+                               columns=columns_as_tuple,
+                               level=level,
+                               inplace=inplace,
+                               errors=errors)
+
+            return _sdc_pandas_dataframe_drop_wrapper_impl
+
     args = {'labels': None, 'axis': 0, 'index': None, 'columns': None, 'level': None,
             'inplace': False, 'errors': f'"raise"'}

-    def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
+    def sdc_pandas_dataframe_drop_impl(df, args, columns):
         func_args = ['df']
         for key, value in args.items():
             if key not in func_args:
@@ -1459,18 +1520,19 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):

         if isinstance(columns, types.StringLiteral):
             drop_cols = (columns.literal_value,)
-        elif isinstance(columns, types.Tuple):
+        elif isinstance(columns, (types.Tuple, types.UniTuple)):
             drop_cols = tuple(column.literal_value for column in columns)
         else:
             raise ValueError('Only drop by one column or tuple of columns is currently supported in df.drop()')

-        func_def, global_vars = sdc_pandas_dataframe_drop_codegen(_func_name, func_args, df, drop_cols)
+        func_name = 'sdc_pandas_dataframe_drop_impl'
+        func_def, global_vars = sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols)
         loc_vars = {}
         exec(func_def, global_vars, loc_vars)
-        _drop_impl = loc_vars['sdc_pandas_dataframe_drop_impl']
+        _drop_impl = loc_vars[func_name]

         return _drop_impl

-    return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)
+    return sdc_pandas_dataframe_drop_impl(df, args, columns)


 def df_length_expr(self):
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
index 56218e99c..65d10ef52 100644
--- a/sdc/hiframes/pd_dataframe_ext.py
+++ b/sdc/hiframes/pd_dataframe_ext.py
@@ -53,7 +53,6 @@ def generic_resolve(self, df, attr):

         return SeriesType(arr_typ.dtype, arr_typ, df.index, True)

-
 def get_structure_maps(col_types, col_names):
     # Define map column name to column location ex.
{'A': (0,0), 'B': (1,0), 'C': (0,1)} column_loc = {} @@ -80,6 +79,31 @@ def get_structure_maps(col_types, col_names): return column_loc, data_typs_map, types_order +@intrinsic +def init_dataframe_internal(typingctx, data, index, df_type): + + ret_type = df_type.instance_type + + def codegen(context, builder, sig, args): + data_val, index_val = args[:2] + + dataframe = cgutils.create_struct_proxy( + sig.return_type)(context, builder) + dataframe.data = data_val + dataframe.index = index_val + dataframe.parent = context.get_constant_null(types.pyobject) + + # increase refcount of stored values + if context.enable_nrt: + context.nrt.incref(builder, sig.args[0], data_val) + context.nrt.incref(builder, sig.args[1], index_val) + + return dataframe._getvalue() + + sig = signature(ret_type, data, index, df_type) + return sig, codegen + + # TODO: alias analysis # this function should be used for getting df._data for alias analysis to work # no_cpython_wrapper since Array(DatetimeDate) cannot be boxed diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index d2b2d3547..c245f0694 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -31,7 +31,7 @@ import random import string import unittest -from itertools import permutations, product +from itertools import permutations, product, combinations_with_replacement from numba import types from numba.core.config import IS_32BITS from numba import literal_unroll @@ -1836,103 +1836,145 @@ def test_impl(df): h_out = hpat_func(df) pd.testing.assert_frame_equal(out, h_out) - def test_df_drop_one_column_unboxing(self): - def test_impl(df): - return df.drop(columns='C D') - - index_to_test = [[1, 2, 3, 4], - [.1, .2, .3, .4], - None, - ['a', 'b', 'c', 'd']] - - sdc_func = self.jit(test_impl) - - for index in index_to_test: - with self.subTest(index=index): - df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C D': [1.0, 2.0, np.nan, 1.0]}, - index=index) - pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) - def test_df_drop_one_column(self): - def test_impl(index): + """ Verifies df.drop handles string literal as columns param """ + def test_impl(): df = pd.DataFrame({ 'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0] - }, index=index) + }) return df.drop(columns='A') sdc_func = self.jit(test_impl) - for index in [[1, 2, 3, 4], [.1, .2, .3, .4], ['a', 'b', 'c', 'd']]: - with self.subTest(index=index): - pd.testing.assert_frame_equal(sdc_func(index), test_impl(index)) + pd.testing.assert_frame_equal(sdc_func(), test_impl()) - def test_df_drop_tuple_column_unboxing(self): - def gen_test_impl(do_jit=False): - def test_impl(df): - if do_jit == True: # noqa - return df.drop(columns=('A', 'C')) - else: - return df.drop(columns=['A', 'C']) + def test_df_drop_inplace_default_false(self): + """ Verifies df.drop doesn't change original dataframe (default inplace=False) """ + def test_impl(df): + return df, df.drop(columns='A') - return test_impl + sdc_func = self.jit(test_impl) - index_to_test = [[1, 2, 3, 4], - [.1, .2, .3, .4], - None, - ['a', 'b', 'c', 'd']] + df = pd.DataFrame({ + 'A': [1.0, 2.0, np.nan, 1.0], + 'B': [4, 5, 6, 7], + 'C': [1.0, 2.0, np.nan, 1.0] + }) - test_impl = gen_test_impl() - sdc_func = self.jit(gen_test_impl(do_jit=True)) + df_old, df_new = sdc_func(df.copy()) + _, df_new_ref = test_impl(df.copy()) - for index in index_to_test: + pd.testing.assert_frame_equal(df_old, df) + pd.testing.assert_frame_equal(df_new, df_new_ref) + 
self.assertFalse(df.equals(df_new), "Column was dropped, but DF not changed") + + def test_df_drop_handles_index(self): + """ Verifies df.drop handles DataFrames with indexes of different types """ + def test_impl(df): + return df.drop(columns=['A', 'C']) + + sdc_func = self.jit(test_impl) + indexes_to_test = [ + None, + pd.RangeIndex(4, 0, -1), + [1, 2, 3, 4], + [.1, .2, .3, .4], + ['a', 'b', 'c', 'd'] + ] + + for index in indexes_to_test: with self.subTest(index=index): - df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, - index=index) + df = pd.DataFrame({ + 'A': [1.0, 2.0, np.nan, 1.0], + 'B': [4, 5, 6, 7], + 'C': [1.0, 2.0, np.nan, 1.0] + }, index=index) + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) - def test_df_drop_tuple_column(self): - def gen_test_impl(do_jit=False): - def test_impl(index): - df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, - index=index) - if do_jit == True: # noqa - return df.drop(columns=('A', 'C')) - else: - return df.drop(columns=['A', 'C']) + def test_df_drop_columns_list(self): + """ Verifies df.drop handles columns as list of const column names """ + def test_impl(df): + return df.drop(columns=['D', 'B']) + sdc_func = self.jit(test_impl) - return test_impl + n_cols = 5 + col_names = ['A', 'B', 'C', 'D', 'E'] + columns_data = { + 'int': [4, 5, 6, 7], + 'float': [1.0, 2.0, np.nan, 1.0], + 'str': ['a', 'b', 'c', 'd'] + } + + for scheme in combinations_with_replacement(columns_data.keys(), n_cols): + with self.subTest(col_types=scheme): + df_data = [columns_data[scheme[i]] for i in range(n_cols)] + df = pd.DataFrame(dict(zip(col_names, df_data))) + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) - index_to_test = [[1, 2, 3, 4], - [.1, .2, .3, .4], - ['a', 'b', 'c', 'd']] + def test_df_drop_columns_list_single_column(self): + """ Verifies df.drop handles list with single column """ + def test_impl(df): + return df.drop(columns=['A']) + sdc_func = self.jit(test_impl) - test_impl = gen_test_impl() - sdc_func = self.jit(gen_test_impl(do_jit=True)) + df = pd.DataFrame({ + 'A': [1.0, 2.0, np.nan, 1.0], + 'B': [4, 5, 6, 7], + 'C': [1.0, 2.0, np.nan, 1.0], + }) - for index in index_to_test: + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) + + def test_df_drop_columns_list_repeat(self): + """ Verifies multiple invocations with different dropped column lists do not interfere """ + def test_impl(df): + res = df + res = res.drop(columns=['A', 'E']) + res = res.drop(columns=['B', 'C']) + return res + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({ + 'A': [1.0, 2.0, np.nan, 1.0], + 'B': [4, 5, 6, 7], + 'C': [1.0, 2.0, np.nan, 1.0], + 'D': ['a', 'b', '', 'c'], + 'E': [8, 9, 10, 11], + }) + + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) + + @unittest.skip("BUG: empty df cannot be unboxed") + def test_df_unbox_empty_df(self): + def test_impl(index): + df = pd.DataFrame({}, index=index) + return df + + sdc_func = self.jit(test_impl) + for index in [ + [1, 2, 3, 4], + [.1, .2, .3, .4], + ['a', 'b', 'c', 'd'] + ]: with self.subTest(index=index): - pd.testing.assert_frame_equal(sdc_func(index), test_impl(index)) + result = sdc_func(index) + result_ref = test_impl(index) + pd.testing.assert_frame_equal(result, result_ref) - @unittest.skip("ValueError when return empty dataframe") - def test_df_drop_tuple_columns_all(self): - def gen_test_impl(do_jit=False): - def test_impl(df): - if do_jit == True: # noqa - return 
df.drop(columns=('A', 'B', 'C')) - else: - return df.drop(columns=['A', 'B', 'C']) + @unittest.skip("BUG: empty df cannot be unboxed") + def test_df_drop_all_columns(self): + def test_impl(df): + return df.drop(columns=['A', 'B', 'C']) + sdc_func = self.jit(test_impl) - return test_impl index_to_test = [[1, 2, 3, 4], [.1, .2, .3, .4], None, ['a', 'b', 'c', 'd']] - test_impl = gen_test_impl() - sdc_func = self.jit(gen_test_impl(do_jit=True)) - for index in index_to_test: with self.subTest(index=index): df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, @@ -1940,7 +1982,8 @@ def test_impl(df): pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) - def test_df_drop_by_column_errors_ignore(self): + def test_df_drop_errors_ignore(self): + """ Verifies df.drop handles errors argument """ def test_impl(df): return df.drop(columns='M', errors='ignore') @@ -1948,18 +1991,37 @@ def test_impl(df): hpat_func = self.jit(test_impl) pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) - @skip_numba_jit - def test_df_drop_inplace2(self): - # test droping after setting the column + def test_df_drop_columns_non_literals(self): + """ Verifies that SDC supports only list of literals as dropped column names """ def test_impl(df): - df2 = df[['A', 'B']] - df2['D'] = np.ones(3) - df2.drop(columns=['D'], inplace=True) - return df2 + drop_cols = [c for c in df.columns if c != 'B'] + return df.drop(columns=drop_cols) + sdc_func = self.jit(test_impl) - df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]}) - hpat_func = self.jit(test_impl) - pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) + df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}) + with self.assertRaises(TypingError) as raises: + sdc_func(df) + msg = 'Unsupported use of parameter columns: expected list of constant strings.' + self.assertIn(msg, str(raises.exception)) + + def test_df_drop_columns_list_mutation_unsupported(self): + """ Verifies SDCLimitation is raised if non-const list is used as columns """ + def test_impl(df): + drop_cols = ['A', 'C'] + drop_cols.pop(-1) + return df.drop(columns=drop_cols) + sdc_func = self.jit(test_impl) + + df = pd.DataFrame({ + 'A': [1.0, 2.0, np.nan, 1.0], + 'B': [4, 5, 6, 7], + 'C': [1.0, 2.0, np.nan, 1.0], + }) + + with self.assertRaises(SDCLimitation) as raises: + sdc_func(df) + msg = 'Unsupported use of parameter columns: non-const list was used.' 
+        self.assertIn(msg, str(raises.exception))

     @skip_numba_jit
     def test_df_drop_inplace1(self):
@@ -1972,6 +2034,19 @@ def test_impl(df):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df2))

+    @skip_numba_jit
+    def test_df_drop_inplace2(self):
+        # test dropping after setting the column
+        def test_impl(df):
+            df2 = df[['A', 'B']]
+            df2['D'] = np.ones(3)
+            df2.drop(columns=['D'], inplace=True)
+            return df2
+
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
+        hpat_func = self.jit(test_impl)
+        pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
+
     def _test_df_getitem_str_literal_idx(self, df):
         def test_impl(df):
             return df['A']

From a91d01c83945b01fc443f61ee60d79256d9fd758 Mon Sep 17 00:00:00 2001
From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com>
Date: Mon, 7 Dec 2020 17:06:33 +0300
Subject: [PATCH 7/8] Refactor df.getitem by slice idx to improve compile time
 (#949)

Motivation: df[idx] when idx is a slice produces a DataFrame with the same
internal structure as the original one (only the index type may change).
Hence it can be copied and filled with column[idx] slices, which improves
compilation time.

---
 .../hpat_pandas_dataframe_functions.py        | 61 +++++++++++++------
 1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
index 56ae09d1e..31f3738d9 100644
--- a/sdc/datatypes/hpat_pandas_dataframe_functions.py
+++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -1554,24 +1554,33 @@ def df_index_expr(self, length_expr=None):

 def df_getitem_slice_idx_main_codelines(self, idx):
     """Generate main code lines for df.getitem with idx of slice"""
+
+    types_order = get_structure_maps(self.data, self.columns)[2]
+    n_lists = len(types_order)
+
     results = []
-    func_lines = [f'  res_index = self.index[idx]']
-    for i, col in enumerate(self.columns):
-        col_loc = self.column_loc[col]
-        type_id, col_id = col_loc.type_id, col_loc.col_id
-        res_data = f'res_data_{i}'
+    func_lines = []
+    for i in range(n_lists):
         func_lines += [
-            f'  data_{i} = self._data[{type_id}][{col_id}][idx]',
-            f'  {res_data} = data_{i}'
+            f'  list_{i} = self._data[{i}].copy()',
+            f'  for i, item in enumerate(list_{i}):',
+            f'    list_{i}[i] = item[idx]'
         ]
-        results.append((col, res_data))
-
-    data = ', '.join(f'"{col}": {data}' for col, data in results)
-    func_lines += [f'  return pandas.DataFrame({{{data}}}, index=res_index)']
+    all_lists_joined = ', '.join([f'list_{i}' for i in range(n_lists)]) + ', '
+    res_data = f'({all_lists_joined})' if n_lists > 0 else '()'
+    func_lines += [
+        f'  if self_index_is_none == True:',
+        f'    old_index = pandas.RangeIndex(len(self))',
+        f'  else:',
+        f'    old_index = self._index',
+        f'  res_data = {res_data}',
+        f'  res_index = old_index[idx]',
+        f'  return init_dataframe_internal(res_data, res_index, df_type)'
+    ]

     return func_lines

-
 def df_getitem_tuple_idx_main_codelines(self, literal_idx):
     """Generate main code lines for df.getitem with idx of tuple"""
     results = []
@@ -1686,13 +1695,17 @@ def df_getitem_key_error_codelines():

 def df_getitem_slice_idx_codegen(self, idx):
     """
     Example of generated implementation with provided index:
-        def _df_getitem_slice_idx_impl(self, idx)
-            res_index = self._index
-            data_0 = self._data[0]
-            res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
-            data_1 = self._data [1]
-            res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
-            return pandas.DataFrame({"A": res_data_0, "B": res_data_1},
index=res_index[idx]) + def _df_getitem_slice_idx_impl(self, idx): + list_0 = self._data[0].copy() + for i, item in enumerate(list_0): + list_0[i] = item[idx] + if self_index_is_none == True: + old_index = pandas.RangeIndex(len(self)) + else: + old_index = self._index + res_data = (list_0, ) + res_index = old_index[idx] + return init_dataframe_internal(res_data, res_index, df_type) """ func_lines = ['def _df_getitem_slice_idx_impl(self, idx):'] if self.columns: @@ -1701,7 +1714,17 @@ def _df_getitem_slice_idx_impl(self, idx) # raise KeyError if input DF is empty func_lines += df_getitem_key_error_codelines() func_text = '\n'.join(func_lines) - global_vars = {'pandas': pandas, 'numpy': numpy} + + # TO-DO: need DefaultIndex to handle self.index[idx] construct inside func + self_index_is_none = isinstance(self.index, types.NoneType) + new_index_type = RangeIndexType(False) if self_index_is_none else self.index + df_type = DataFrameType(self.data, new_index_type, self.columns, column_loc=self.column_loc) + + global_vars = {'pandas': pandas, + 'numpy': numpy, + 'df_type': df_type, + 'init_dataframe_internal': init_dataframe_internal, + 'self_index_is_none': self_index_is_none} return func_text, global_vars From 79fb01b77c9d76b81b96d694f879a088dc193bc0 Mon Sep 17 00:00:00 2001 From: Alexey Kozlov <52973316+kozlov-alexey@users.noreply.github.com> Date: Wed, 27 Jan 2021 03:08:21 +0300 Subject: [PATCH 8/8] Fix segfault in NRT_dealloc for str_arr getitem (#955) * Fix segfault in NRT_dealloc for str_arr getitem Motivation: legacy SDC implementation copied Numba internals (MemInfo struct and API functions), which if changed on the Numba side (as it's done in IntelPython/Numba where external_allocator is added) may cause segfaults. This PR removes duplication of Numba internals and moves hstr extension to NRT API functions * Updating conda-recipe with build requirement on Numba --- conda-recipe/meta.yaml | 1 + sdc/_meminfo.h | 182 ----------------------------------------- sdc/_str_decode.cpp | 52 ++++++++---- sdc/str_arr_ext.py | 7 +- setup.py | 4 +- 5 files changed, 44 insertions(+), 202 deletions(-) delete mode 100644 sdc/_meminfo.h diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 11369b6ba..40868e5e2 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -20,6 +20,7 @@ requirements: - {{ compiler('cxx') }} # [not osx] - wheel - python + - numba {{ NUMBA_VERSION }} host: - python diff --git a/sdc/_meminfo.h b/sdc/_meminfo.h deleted file mode 100644 index 8a893a95d..000000000 --- a/sdc/_meminfo.h +++ /dev/null @@ -1,182 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2020, Intel Corporation All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -//***************************************************************************** - -#ifndef _MEMINFO_INCLUDED -#define _MEMINFO_INCLUDED - -// #include "_import_py.h" -// #include - -// /* Import MemInfo_* from numba.runtime._nrt_python. -// */ -// static void * -// import_meminfo_func(const char * func) { -// #define CHECK(expr, msg) if(!(expr)){std::cerr << msg << std::endl; PyGILState_Release(gilstate); return NULL;} -// auto gilstate = PyGILState_Ensure(); -// PyObject * helperdct = import_sym("numba.runtime._nrt_python", "c_helpers"); -// CHECK(helperdct, "getting numba.runtime._nrt_python.c_helpers failed"); -// /* helperdct[func] */ -// PyObject * mi_rel_fn = PyDict_GetItemString(helperdct, func); -// CHECK(mi_rel_fn, "getting meminfo func failed"); -// void * fnptr = PyLong_AsVoidPtr(mi_rel_fn); - -// Py_XDECREF(helperdct); -// PyGILState_Release(gilstate); -// return fnptr; -// #undef CHECK -// } - -// typedef void (*MemInfo_release_type)(void*); -// typedef MemInfo* (*MemInfo_alloc_aligned_type)(size_t size, unsigned align); -// typedef void* (*MemInfo_data_type)(MemInfo* mi); - -// ******** copied from Numba -// TODO: make Numba C library -typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info); -struct MemInfo -{ - size_t refct; - NRT_dtor_function dtor; - void* dtor_info; - void* data; - size_t size; /* only used for NRT allocated memory */ -}; - -typedef struct MemInfo NRT_MemInfo; - -void nrt_debug_print(const char* fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vfprintf(stderr, fmt, args); - va_end(args); -} - -#if 0 -#define NRT_Debug(X) X -#else -#define NRT_Debug(X) \ - if (0) \ - { \ - X; \ - } -#endif - -#if !defined MIN -#define MIN(a, b) ((a) < (b)) ? 
(a) : (b) -#endif - -void NRT_Free(void* ptr) -{ - NRT_Debug(nrt_debug_print("NRT_Free %p\n", ptr)); - free(ptr); - // TheMSys.allocator.free(ptr); - // TheMSys.atomic_inc(&TheMSys.stats_free); -} - -void NRT_MemInfo_destroy(NRT_MemInfo* mi) -{ - NRT_Free(mi); - // TheMSys.atomic_inc(&TheMSys.stats_mi_free); -} - -void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) -{ - NRT_Debug(nrt_debug_print("NRT_MemInfo_call_dtor %p\n", mi)); - if (mi->dtor) // && !TheMSys.shutting) - /* We have a destructor and the system is not shutting down */ - mi->dtor(mi->data, mi->size, mi->dtor_info); - /* Clear and release MemInfo */ - NRT_MemInfo_destroy(mi); -} - -void* NRT_Allocate(size_t size) -{ - // void *ptr = TheMSys.allocator.malloc(size); - void* ptr = malloc(size); - NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); - // TheMSys.atomic_inc(&TheMSys.stats_alloc); - return ptr; -} - -static void* nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo** mi_out) -{ - NRT_MemInfo* mi; - char* base = (char*)NRT_Allocate(sizeof(NRT_MemInfo) + size); - mi = (NRT_MemInfo*)base; - *mi_out = mi; - return base + sizeof(NRT_MemInfo); -} - -void NRT_MemInfo_init(NRT_MemInfo* mi, void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) -{ - mi->refct = 1; /* starts with 1 refct */ - mi->dtor = dtor; - mi->dtor_info = dtor_info; - mi->data = data; - mi->size = size; - /* Update stats */ - // TheMSys.atomic_inc(&TheMSys.stats_mi_alloc); -} - -static void nrt_internal_dtor_safe(void* ptr, size_t size, void* info) -{ - NRT_Debug(nrt_debug_print("nrt_internal_dtor_safe %p, %p\n", ptr, info)); - /* See NRT_MemInfo_alloc_safe() */ - memset(ptr, 0xDE, MIN(size, 256)); -} - -static void nrt_internal_custom_dtor_safe(void* ptr, size_t size, void* info) -{ - NRT_dtor_function dtor = (NRT_dtor_function)info; - NRT_Debug(nrt_debug_print("nrt_internal_custom_dtor_safe %p, %p\n", ptr, info)); - if (dtor) - { - dtor(ptr, size, NULL); - } - - nrt_internal_dtor_safe(ptr, size, NULL); -} - -NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor) -{ - NRT_MemInfo* mi; - void* data = nrt_allocate_meminfo_and_data(size, &mi); - /* Only fill up a couple cachelines with debug markers, to minimize - overhead. 
*/ - memset(data, 0xCB, MIN(size, 256)); - NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_dtor_safe %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, (void*)dtor); - return mi; -} - -NRT_MemInfo* NRT_MemInfo_alloc_safe(size_t size) -{ - return NRT_MemInfo_alloc_dtor_safe(size, NULL); -} - -#endif // #ifndef _MEMINFO_INCLUDED diff --git a/sdc/_str_decode.cpp b/sdc/_str_decode.cpp index 88ac1f842..9c64bcaf3 100644 --- a/sdc/_str_decode.cpp +++ b/sdc/_str_decode.cpp @@ -26,8 +26,9 @@ #include #include +#include -#include "_meminfo.h" +#include "numba/core/runtime/nrt_external.h" #ifndef Py_UNREACHABLE #define Py_UNREACHABLE() abort() @@ -37,6 +38,7 @@ typedef struct { + NRT_api_functions* nrt; NRT_MemInfo* buffer; void* data; enum PyUnicode_Kind kind; @@ -120,7 +122,8 @@ void _C_UnicodeWriter_Init(_C_UnicodeWriter* writer) #include "stringlib/undef.h" static inline int _C_UnicodeWriter_WriteCharInline(_C_UnicodeWriter* writer, Py_UCS4 ch); -static int _copy_characters(NRT_MemInfo* to, +static int _copy_characters(NRT_api_functions* nrt, + NRT_MemInfo* to, Py_ssize_t to_start, NRT_MemInfo* from, Py_ssize_t from_start, @@ -128,12 +131,19 @@ static int _copy_characters(NRT_MemInfo* to, unsigned int from_kind, unsigned int to_kind); + +static void str_data_dtor(void* data_ptr) +{ + free(data_ptr); +} + // similar to PyUnicode_New() NRT_MemInfo* alloc_writer(_C_UnicodeWriter* writer, Py_ssize_t newlen, Py_UCS4 maxchar) { enum PyUnicode_Kind kind; int is_ascii = 0; Py_ssize_t char_size; + auto nrt = writer->nrt; if (maxchar < 128) { @@ -161,20 +171,22 @@ NRT_MemInfo* alloc_writer(_C_UnicodeWriter* writer, Py_ssize_t newlen, Py_UCS4 m kind = PyUnicode_4BYTE_KIND; char_size = 4; } - NRT_MemInfo* newbuffer = NRT_MemInfo_alloc_safe((newlen + 1) * char_size); - if (newbuffer == NULL) + + char* str_data = (char*)malloc((newlen + 1) * char_size); + if (str_data == NULL) { return NULL; } + auto newbuffer = nrt->manage_memory(str_data, str_data_dtor); if (writer->buffer != NULL) { - _copy_characters(newbuffer, 0, writer->buffer, 0, writer->pos, writer->kind, kind); - NRT_MemInfo_call_dtor(writer->buffer); + _copy_characters(nrt, newbuffer, 0, writer->buffer, 0, writer->pos, writer->kind, kind); + nrt->release(writer->buffer); } writer->buffer = newbuffer; writer->maxchar = KIND_MAX_CHAR_VALUE(kind); - writer->data = writer->buffer->data; + writer->data = nrt->get_data(writer->buffer); if (!writer->readonly) { @@ -356,19 +368,22 @@ static Py_ssize_t ascii_decode(const char* start, const char* end, Py_UCS1* dest return p - start; } + // ported from CPython PyUnicode_DecodeUTF8Stateful: https://github.com/python/cpython/blob/31e8d69bfe7cf5d4ffe0967cb225d2a8a229cc97/Objects/unicodeobject.c#L4813 -void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* length, NRT_MemInfo** meminfo) +void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* length, NRT_MemInfo** meminfo, void* nrt_table) { _C_UnicodeWriter writer; const char* end = s + size; + auto nrt = (NRT_api_functions*)nrt_table; const char* errmsg = ""; *is_ascii = 0; if (size == 0) { - (*meminfo) = NRT_MemInfo_alloc_safe(1); - ((char*)((*meminfo)->data))[0] = 0; + char* str_data = (char*)malloc(1); + (*meminfo) = nrt->manage_memory(str_data, str_data_dtor); + ((char*)(nrt->get_data(*meminfo)))[0] = 0; *kind = PyUnicode_1BYTE_KIND; *is_ascii = 1; *length = 0; @@ -379,9 +394,10 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* if (size == 1 
&& (unsigned char)s[0] < 128) { // TODO interning - (*meminfo) = NRT_MemInfo_alloc_safe(2); - ((char*)((*meminfo)->data))[0] = s[0]; - ((char*)((*meminfo)->data))[1] = 0; + char* str_data = (char*)malloc(2); + (*meminfo) = nrt->manage_memory(str_data, str_data_dtor); + ((char*)(nrt->get_data(*meminfo)))[0] = s[0]; + ((char*)(nrt->get_data(*meminfo)))[1] = 0; *kind = PyUnicode_1BYTE_KIND; *is_ascii = 1; *length = 1; @@ -390,6 +406,7 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* _C_UnicodeWriter_Init(&writer); writer.min_length = size; + writer.nrt = nrt; if (_C_UnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) goto onError; @@ -469,7 +486,7 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* onError: std::cerr << "utf8 decode error:" << errmsg << std::endl; - NRT_MemInfo_call_dtor(writer.buffer); + nrt->release(*meminfo); return; } @@ -499,7 +516,8 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* *_to++ = (to_type)*_iter++; \ } while (0) -static int _copy_characters(NRT_MemInfo* to, +static int _copy_characters(NRT_api_functions* nrt, + NRT_MemInfo* to, Py_ssize_t to_start, NRT_MemInfo* from, Py_ssize_t from_start, @@ -516,8 +534,8 @@ static int _copy_characters(NRT_MemInfo* to, if (how_many == 0) return 0; - from_data = from->data; - to_data = to->data; + from_data = nrt->get_data(from); + to_data = nrt->get_data(to); if (from_kind == to_kind) { diff --git a/sdc/str_arr_ext.py b/sdc/str_arr_ext.py index 6b363d013..1d12a80a5 100644 --- a/sdc/str_arr_ext.py +++ b/sdc/str_arr_ext.py @@ -1026,6 +1026,7 @@ def str_arr_getitem_by_array_impl(A, arg): def decode_utf8(typingctx, ptr_t, len_t=None): def codegen(context, builder, sig, args): ptr, length = args + nrt_table = context.nrt.get_nrt_api(builder) # create str and call decode with internal pointers uni_str = cgutils.create_struct_proxy(string_type)(context, builder) @@ -1034,14 +1035,16 @@ def codegen(context, builder, sig, args): lir.IntType(32).as_pointer(), lir.IntType(32).as_pointer(), lir.IntType(64).as_pointer(), - uni_str.meminfo.type.as_pointer()]) + uni_str.meminfo.type.as_pointer(), + lir.IntType(8).as_pointer()]) fn_decode = builder.module.get_or_insert_function( fnty, name="decode_utf8") builder.call(fn_decode, [ptr, length, uni_str._get_ptr_by_name('kind'), uni_str._get_ptr_by_name('is_ascii'), uni_str._get_ptr_by_name('length'), - uni_str._get_ptr_by_name('meminfo')]) + uni_str._get_ptr_by_name('meminfo'), + nrt_table]) uni_str.hash = context.get_constant(_Py_hash_t, -1) uni_str.data = context.nrt.meminfo_data(builder, uni_str.meminfo) # Set parent to NULL diff --git a/setup.py b/setup.py index fd128f02b..d4c8f1fab 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ import platform import os import sys +import numba from docs.source.buildscripts.sdc_build_doc import SDCBuildDoc @@ -185,6 +186,7 @@ def check_file_at_path(path2file): ) str_libs = np_compile_args['libraries'] +numba_include_path = numba.extending.include_path() ext_str = Extension(name="sdc.hstr_ext", sources=["sdc/_str_ext.cpp"], @@ -192,7 +194,7 @@ def check_file_at_path(path2file): define_macros=np_compile_args['define_macros'], extra_compile_args=eca, extra_link_args=ela, - include_dirs=np_compile_args['include_dirs'] + ind, + include_dirs=np_compile_args['include_dirs'] + ind + [numba_include_path], library_dirs=np_compile_args['library_dirs'] + lid, )
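
Note: below is a minimal self-contained sketch of the external NRT API pattern that
the patch above switches to. Native code no longer duplicates Numba's MemInfo layout;
instead it receives the NRT function table (obtained on the Python side via
context.nrt.get_nrt_api(builder) and passed in as a void pointer, as done in
str_arr_ext.py above) and goes through that table for all memory management. The
helper names (make_managed_buffer, example_dtor) are illustrative only, not part of
SDC or Numba; the NRT_api_functions calls (manage_memory, get_data, acquire, release)
come from numba/core/runtime/nrt_external.h.

    #include <cstddef>
    #include <cstdlib>
    #include "numba/core/runtime/nrt_external.h"

    // Destructor invoked by Numba's NRT once the meminfo refcount drops to zero.
    static void example_dtor(void* data_ptr)
    {
        free(data_ptr);
    }

    // Allocate a raw buffer and hand its ownership to Numba's runtime,
    // mirroring what alloc_writer()/decode_utf8() do after this patch.
    static NRT_MemInfo* make_managed_buffer(void* nrt_table, size_t size)
    {
        auto nrt = (NRT_api_functions*)nrt_table;
        void* data = malloc(size);
        if (data == NULL)
        {
            return NULL;
        }
        // NRT wraps the buffer in a MemInfo and calls example_dtor on release.
        NRT_MemInfo* mi = nrt->manage_memory(data, example_dtor);
        // From here on only the NRT API touches the allocation:
        // nrt->get_data(mi) returns the payload pointer, and
        // nrt->acquire(mi) / nrt->release(mi) adjust the refcount.
        return mi;
    }

This keeps a single owner of the MemInfo definition (Numba itself), so changes such
as the external_allocator field mentioned in the commit message cannot desynchronize
SDC's view of the struct.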