diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 11369b6ba..40868e5e2 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -20,6 +20,7 @@ requirements: - {{ compiler('cxx') }} # [not osx] - wheel - python + - numba {{ NUMBA_VERSION }} host: - python diff --git a/examples/dataframe/dataframe_columns.py b/examples/dataframe/dataframe_columns.py new file mode 100644 index 000000000..c4094bb8a --- /dev/null +++ b/examples/dataframe/dataframe_columns.py @@ -0,0 +1,39 @@ +# ***************************************************************************** +# Copyright (c) 2020, Intel Corporation All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import pandas as pd +from numba import njit + + +@njit +def dataframe_columns(): + df = pd.DataFrame({'A': [1, 2], 'AA': [3, 4], 'B': [5, 6]}, index=['a', 'b']) + result = df.columns + + return result # A tuple of column names ('A', 'AA', 'B') + + +print(dataframe_columns()) diff --git a/sdc/_meminfo.h b/sdc/_meminfo.h deleted file mode 100644 index 8a893a95d..000000000 --- a/sdc/_meminfo.h +++ /dev/null @@ -1,182 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2020, Intel Corporation All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, -// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -//***************************************************************************** - -#ifndef _MEMINFO_INCLUDED -#define _MEMINFO_INCLUDED - -// #include "_import_py.h" -// #include - -// /* Import MemInfo_* from numba.runtime._nrt_python. -// */ -// static void * -// import_meminfo_func(const char * func) { -// #define CHECK(expr, msg) if(!(expr)){std::cerr << msg << std::endl; PyGILState_Release(gilstate); return NULL;} -// auto gilstate = PyGILState_Ensure(); -// PyObject * helperdct = import_sym("numba.runtime._nrt_python", "c_helpers"); -// CHECK(helperdct, "getting numba.runtime._nrt_python.c_helpers failed"); -// /* helperdct[func] */ -// PyObject * mi_rel_fn = PyDict_GetItemString(helperdct, func); -// CHECK(mi_rel_fn, "getting meminfo func failed"); -// void * fnptr = PyLong_AsVoidPtr(mi_rel_fn); - -// Py_XDECREF(helperdct); -// PyGILState_Release(gilstate); -// return fnptr; -// #undef CHECK -// } - -// typedef void (*MemInfo_release_type)(void*); -// typedef MemInfo* (*MemInfo_alloc_aligned_type)(size_t size, unsigned align); -// typedef void* (*MemInfo_data_type)(MemInfo* mi); - -// ******** copied from Numba -// TODO: make Numba C library -typedef void (*NRT_dtor_function)(void* ptr, size_t size, void* info); -struct MemInfo -{ - size_t refct; - NRT_dtor_function dtor; - void* dtor_info; - void* data; - size_t size; /* only used for NRT allocated memory */ -}; - -typedef struct MemInfo NRT_MemInfo; - -void nrt_debug_print(const char* fmt, ...) -{ - va_list args; - - va_start(args, fmt); - vfprintf(stderr, fmt, args); - va_end(args); -} - -#if 0 -#define NRT_Debug(X) X -#else -#define NRT_Debug(X) \ - if (0) \ - { \ - X; \ - } -#endif - -#if !defined MIN -#define MIN(a, b) ((a) < (b)) ? 
(a) : (b) -#endif - -void NRT_Free(void* ptr) -{ - NRT_Debug(nrt_debug_print("NRT_Free %p\n", ptr)); - free(ptr); - // TheMSys.allocator.free(ptr); - // TheMSys.atomic_inc(&TheMSys.stats_free); -} - -void NRT_MemInfo_destroy(NRT_MemInfo* mi) -{ - NRT_Free(mi); - // TheMSys.atomic_inc(&TheMSys.stats_mi_free); -} - -void NRT_MemInfo_call_dtor(NRT_MemInfo* mi) -{ - NRT_Debug(nrt_debug_print("NRT_MemInfo_call_dtor %p\n", mi)); - if (mi->dtor) // && !TheMSys.shutting) - /* We have a destructor and the system is not shutting down */ - mi->dtor(mi->data, mi->size, mi->dtor_info); - /* Clear and release MemInfo */ - NRT_MemInfo_destroy(mi); -} - -void* NRT_Allocate(size_t size) -{ - // void *ptr = TheMSys.allocator.malloc(size); - void* ptr = malloc(size); - NRT_Debug(nrt_debug_print("NRT_Allocate bytes=%zu ptr=%p\n", size, ptr)); - // TheMSys.atomic_inc(&TheMSys.stats_alloc); - return ptr; -} - -static void* nrt_allocate_meminfo_and_data(size_t size, NRT_MemInfo** mi_out) -{ - NRT_MemInfo* mi; - char* base = (char*)NRT_Allocate(sizeof(NRT_MemInfo) + size); - mi = (NRT_MemInfo*)base; - *mi_out = mi; - return base + sizeof(NRT_MemInfo); -} - -void NRT_MemInfo_init(NRT_MemInfo* mi, void* data, size_t size, NRT_dtor_function dtor, void* dtor_info) -{ - mi->refct = 1; /* starts with 1 refct */ - mi->dtor = dtor; - mi->dtor_info = dtor_info; - mi->data = data; - mi->size = size; - /* Update stats */ - // TheMSys.atomic_inc(&TheMSys.stats_mi_alloc); -} - -static void nrt_internal_dtor_safe(void* ptr, size_t size, void* info) -{ - NRT_Debug(nrt_debug_print("nrt_internal_dtor_safe %p, %p\n", ptr, info)); - /* See NRT_MemInfo_alloc_safe() */ - memset(ptr, 0xDE, MIN(size, 256)); -} - -static void nrt_internal_custom_dtor_safe(void* ptr, size_t size, void* info) -{ - NRT_dtor_function dtor = (NRT_dtor_function)info; - NRT_Debug(nrt_debug_print("nrt_internal_custom_dtor_safe %p, %p\n", ptr, info)); - if (dtor) - { - dtor(ptr, size, NULL); - } - - nrt_internal_dtor_safe(ptr, size, NULL); -} - -NRT_MemInfo* NRT_MemInfo_alloc_dtor_safe(size_t size, NRT_dtor_function dtor) -{ - NRT_MemInfo* mi; - void* data = nrt_allocate_meminfo_and_data(size, &mi); - /* Only fill up a couple cachelines with debug markers, to minimize - overhead. 
*/ - memset(data, 0xCB, MIN(size, 256)); - NRT_Debug(nrt_debug_print("NRT_MemInfo_alloc_dtor_safe %p %zu\n", data, size)); - NRT_MemInfo_init(mi, data, size, nrt_internal_custom_dtor_safe, (void*)dtor); - return mi; -} - -NRT_MemInfo* NRT_MemInfo_alloc_safe(size_t size) -{ - return NRT_MemInfo_alloc_dtor_safe(size, NULL); -} - -#endif // #ifndef _MEMINFO_INCLUDED diff --git a/sdc/_str_decode.cpp b/sdc/_str_decode.cpp index 88ac1f842..9c64bcaf3 100644 --- a/sdc/_str_decode.cpp +++ b/sdc/_str_decode.cpp @@ -26,8 +26,9 @@ #include #include +#include -#include "_meminfo.h" +#include "numba/core/runtime/nrt_external.h" #ifndef Py_UNREACHABLE #define Py_UNREACHABLE() abort() @@ -37,6 +38,7 @@ typedef struct { + NRT_api_functions* nrt; NRT_MemInfo* buffer; void* data; enum PyUnicode_Kind kind; @@ -120,7 +122,8 @@ void _C_UnicodeWriter_Init(_C_UnicodeWriter* writer) #include "stringlib/undef.h" static inline int _C_UnicodeWriter_WriteCharInline(_C_UnicodeWriter* writer, Py_UCS4 ch); -static int _copy_characters(NRT_MemInfo* to, +static int _copy_characters(NRT_api_functions* nrt, + NRT_MemInfo* to, Py_ssize_t to_start, NRT_MemInfo* from, Py_ssize_t from_start, @@ -128,12 +131,19 @@ static int _copy_characters(NRT_MemInfo* to, unsigned int from_kind, unsigned int to_kind); + +static void str_data_dtor(void* data_ptr) +{ + free(data_ptr); +} + // similar to PyUnicode_New() NRT_MemInfo* alloc_writer(_C_UnicodeWriter* writer, Py_ssize_t newlen, Py_UCS4 maxchar) { enum PyUnicode_Kind kind; int is_ascii = 0; Py_ssize_t char_size; + auto nrt = writer->nrt; if (maxchar < 128) { @@ -161,20 +171,22 @@ NRT_MemInfo* alloc_writer(_C_UnicodeWriter* writer, Py_ssize_t newlen, Py_UCS4 m kind = PyUnicode_4BYTE_KIND; char_size = 4; } - NRT_MemInfo* newbuffer = NRT_MemInfo_alloc_safe((newlen + 1) * char_size); - if (newbuffer == NULL) + + char* str_data = (char*)malloc((newlen + 1) * char_size); + if (str_data == NULL) { return NULL; } + auto newbuffer = nrt->manage_memory(str_data, str_data_dtor); if (writer->buffer != NULL) { - _copy_characters(newbuffer, 0, writer->buffer, 0, writer->pos, writer->kind, kind); - NRT_MemInfo_call_dtor(writer->buffer); + _copy_characters(nrt, newbuffer, 0, writer->buffer, 0, writer->pos, writer->kind, kind); + nrt->release(writer->buffer); } writer->buffer = newbuffer; writer->maxchar = KIND_MAX_CHAR_VALUE(kind); - writer->data = writer->buffer->data; + writer->data = nrt->get_data(writer->buffer); if (!writer->readonly) { @@ -356,19 +368,22 @@ static Py_ssize_t ascii_decode(const char* start, const char* end, Py_UCS1* dest return p - start; } + // ported from CPython PyUnicode_DecodeUTF8Stateful: https://github.com/python/cpython/blob/31e8d69bfe7cf5d4ffe0967cb225d2a8a229cc97/Objects/unicodeobject.c#L4813 -void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* length, NRT_MemInfo** meminfo) +void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* length, NRT_MemInfo** meminfo, void* nrt_table) { _C_UnicodeWriter writer; const char* end = s + size; + auto nrt = (NRT_api_functions*)nrt_table; const char* errmsg = ""; *is_ascii = 0; if (size == 0) { - (*meminfo) = NRT_MemInfo_alloc_safe(1); - ((char*)((*meminfo)->data))[0] = 0; + char* str_data = (char*)malloc(1); + (*meminfo) = nrt->manage_memory(str_data, str_data_dtor); + ((char*)(nrt->get_data(*meminfo)))[0] = 0; *kind = PyUnicode_1BYTE_KIND; *is_ascii = 1; *length = 0; @@ -379,9 +394,10 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* if (size == 1 
&& (unsigned char)s[0] < 128) { // TODO interning - (*meminfo) = NRT_MemInfo_alloc_safe(2); - ((char*)((*meminfo)->data))[0] = s[0]; - ((char*)((*meminfo)->data))[1] = 0; + char* str_data = (char*)malloc(2); + (*meminfo) = nrt->manage_memory(str_data, str_data_dtor); + ((char*)(nrt->get_data(*meminfo)))[0] = s[0]; + ((char*)(nrt->get_data(*meminfo)))[1] = 0; *kind = PyUnicode_1BYTE_KIND; *is_ascii = 1; *length = 1; @@ -390,6 +406,7 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* _C_UnicodeWriter_Init(&writer); writer.min_length = size; + writer.nrt = nrt; if (_C_UnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1) goto onError; @@ -469,7 +486,7 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* onError: std::cerr << "utf8 decode error:" << errmsg << std::endl; - NRT_MemInfo_call_dtor(writer.buffer); + nrt->release(*meminfo); return; } @@ -499,7 +516,8 @@ void decode_utf8(const char* s, Py_ssize_t size, int* kind, int* is_ascii, int* *_to++ = (to_type)*_iter++; \ } while (0) -static int _copy_characters(NRT_MemInfo* to, +static int _copy_characters(NRT_api_functions* nrt, + NRT_MemInfo* to, Py_ssize_t to_start, NRT_MemInfo* from, Py_ssize_t from_start, @@ -516,8 +534,8 @@ static int _copy_characters(NRT_MemInfo* to, if (how_many == 0) return 0; - from_data = from->data; - to_data = to->data; + from_data = nrt->get_data(from); + to_data = nrt->get_data(to); if (from_kind == to_kind) { diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 8b3cf79bc..31f3738d9 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -52,6 +52,7 @@ from sdc.datatypes.range_index_type import RangeIndexType from sdc.hiframes.pd_dataframe_type import DataFrameType +from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps from sdc.hiframes.pd_series_type import SeriesType from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType, @@ -116,6 +117,44 @@ def hpat_pandas_df_index_impl(df): return hpat_pandas_df_index_impl +@sdc_overload_attribute(DataFrameType, 'columns') +def hpat_pandas_dataframe_columns(df): + """ + Intel Scalable Dataframe Compiler User Guide + ******************************************** + Pandas API: pandas.DataFrame.columns + + Examples + -------- + .. literalinclude:: ../../../examples/dataframe/dataframe_columns.py + :language: python + :lines: 27- + :caption: The column names of the DataFrame. + :name: ex_dataframe_columns + + .. command-output:: python ./dataframe/dataframe_columns.py + :cwd: ../../../examples + + Intel Scalable Dataframe Compiler Developer Guide + ************************************************* + Pandas DataFrame attribute :attr:`pandas.DataFrame.columns` implementation. + + .. 
only:: developer + Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_dataframe_columns* + """ + + ty_checker = TypeChecker('Attribute columns.') + ty_checker.check(df, DataFrameType) + + # no columns in DF model to avoid impact on DF ctor IR size (captured when needed only) + df_columns = df.columns + + def hpat_pandas_df_columns_impl(df): + return df_columns + + return hpat_pandas_df_columns_impl + + def sdc_pandas_dataframe_values_codegen(self, numba_common_dtype): """ Example of generated implementation: @@ -1299,40 +1338,69 @@ def isna_overload(df): return sdc_pandas_dataframe_isna_codegen(df, 'isna') -def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols): +def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_col_names): """ Example of generated implementation: def sdc_pandas_dataframe_drop_impl(df, labels=None, axis=0, index=None, columns=None, - level=None, inplace=False, errors="raise"): - new_col_0_data_df = df._data[1][0] - new_col_1_data_df = df._data[0][1] - return pandas.DataFrame({"B": new_col_0_data_df, "C": new_col_1_data_df}, index=df.index) + level=None, inplace=False, errors="raise"): + list_0 = df._data[0].copy() + for col_id in old_scheme_drop_idxs_0[::-1]: + list_0.pop(col_id) + list_1 = df._data[1].copy() + new_data = (list_1, list_0, ) + return init_dataframe_internal(new_data, df._index, df_type) """ indent = 4 * ' ' - saved_df_columns = [column for column in df.columns if column not in drop_cols] - func_definition = [f'def sdc_pandas_dataframe_{func_name}_impl({", ".join(func_args)}):'] + func_definition = [f'def {func_name}({", ".join(func_args)}):'] func_text = [] - column_list = [] - for label in drop_cols: + old_column_loc, old_data_typs_map, old_types_order = get_structure_maps(df.data, df.columns) + + new_data_typs = tuple(t for i, t in enumerate(df.data) if df.columns[i] not in drop_col_names) + new_column_names = tuple(c for c in df.columns if c not in drop_col_names) + new_column_loc, new_data_typs_map, new_types_order = get_structure_maps(new_data_typs, new_column_names) + + old_types_idxs_map = dict(zip(old_types_order, range(len(old_types_order)))) + reorder_scheme = tuple(old_types_idxs_map[t] for t in new_types_order) + df_type = DataFrameType(new_data_typs, df.index, new_column_names, column_loc=new_column_loc) + + old_scheme_drop_idxs = [] + for i, k in enumerate(old_types_order): + a = [j for j, x in enumerate(old_data_typs_map[k][1]) if df.columns[x] in drop_col_names] + old_scheme_drop_idxs.append(tuple(a) or None) + + for label in drop_col_names: if label not in df.columns: func_text.append(f'if errors == "raise":') func_text.append(indent + f'raise ValueError("The label {label} is not found in the selected axis")') break - for column_id, column_name in enumerate(saved_df_columns): - col_loc = df.column_loc[column_name] - type_id, col_id = col_loc.type_id, col_loc.col_id - func_text.append(f'new_col_{column_id}_data_df = df._data[{type_id}][{col_id}]') - column_list.append((f'new_col_{column_id}_data_df', column_name)) - - data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list) - index = 'df.index' - func_text.append(f"return pandas.DataFrame({{{data}}}, index={index})\n") + old_ntypes = len(old_types_order) + for type_id in range(old_ntypes): + func_text.append(f'list_{type_id} = df._data[{type_id}].copy()') + if old_scheme_drop_idxs[type_id]: + func_text.append(f'for col_id in old_scheme_drop_idxs_{type_id}[::-1]:') + func_text.append(indent + 
f'list_{type_id}.pop(col_id)')
+
+    # in the new df the order of array lists (i.e. types_order) can be different, so
+    # make a new tuple of lists reordered as needed
+    new_ntypes = len(new_types_order)
+    data_lists_reordered = ', '.join(['list_' + str(reorder_scheme[i]) for i in range(new_ntypes)])
+    data_val = '(' + data_lists_reordered + ', )' if new_ntypes > 0 else '()'
+
+    data, index = 'new_data', 'df._index'
+    func_text.append(f'{data} = {data_val}')
+    func_text.append(f"return init_dataframe_internal({data}, {index}, df_type)\n")
 
     func_definition.extend([indent + func_line for func_line in func_text])
     func_def = '\n'.join(func_definition)
-    global_vars = {'pandas': pandas}
+    global_vars = {
+        'pandas': pandas,
+        'init_dataframe_internal': init_dataframe_internal,
+        'df_type': df_type
+    }
+
+    global_vars.update({f'old_scheme_drop_idxs_{i}': old_scheme_drop_idxs[i] for i in range(old_ntypes)})
 
     return func_def, global_vars
@@ -1349,7 +1417,8 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
     -----------
     - Parameters ``labels``, ``axis``, ``index``, ``level`` and ``inplace`` are currently unsupported.
     - Parameter ``columns`` is required and is expected to be a Literal value with one column name
-      or Tuple with columns names.
+      or List with column names. Mutating a list of column names after it is defined and then passing it
+      as the columns argument results in an SDCLimitation exception at runtime.
     - Supported ``errors`` can be {``raise``, ``ignore``}, default ``raise``. If ``ignore``, suppress error and only
       existing labels are dropped.
@@ -1382,36 +1451,66 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
 
     """
 
-    _func_name = 'drop'
+    method_name = f'Method drop().'
 
-    ty_checker = TypeChecker(f'Method {_func_name}().')
+    ty_checker = TypeChecker(method_name)
     ty_checker.check(df, DataFrameType)
 
-    if not isinstance(labels, types.Omitted) and labels is not None:
+    if not isinstance(labels, (types.Omitted, types.NoneType)) and labels is not None:
         ty_checker.raise_exc(labels, 'None', 'labels')
 
-    if not isinstance(axis, (int, types.Omitted)):
+    if not isinstance(axis, (types.Omitted, types.Integer)) and axis != 0:
         ty_checker.raise_exc(axis, 'int', 'axis')
 
-    if not isinstance(index, types.Omitted) and index is not None:
+    if not isinstance(index, (types.Omitted, types.NoneType)) and index is not None:
         ty_checker.raise_exc(index, 'None', 'index')
 
-    if not isinstance(columns, (types.Omitted, types.Tuple, types.Literal)):
-        ty_checker.raise_exc(columns, 'str, tuple of str', 'columns')
+    if not (isinstance(columns, (types.Omitted, types.StringLiteral))
+            or (isinstance(columns, types.Tuple)
+                and all(isinstance(c, types.StringLiteral) for c in columns))
+            or (isinstance(columns, types.UniTuple) and isinstance(columns.dtype, types.StringLiteral))
+            or isinstance(columns, types.List) and isinstance(columns.dtype, types.UnicodeType)
+            ):
+        ty_checker.raise_exc(columns, 'str, list of const str', 'columns')
 
-    if not isinstance(level, (types.Omitted, types.Literal)) and level is not None:
+    if not isinstance(level, (types.Omitted, types.NoneType, types.Literal)) and level is not None:
         ty_checker.raise_exc(level, 'None', 'level')
 
-    if not isinstance(inplace, (bool, types.Omitted)) and inplace:
+    if not isinstance(inplace, (types.Omitted, types.NoneType, types.Boolean)) and inplace:
         ty_checker.raise_exc(inplace, 'bool', 'inplace')
 
-    if not isinstance(errors, (str, types.Omitted, types.Literal)):
+    if not isinstance(errors, (types.Omitted, 
types.UnicodeType, types.StringLiteral)) and errors != "raise":
         ty_checker.raise_exc(errors, 'str', 'errors')
 
+    if isinstance(columns, types.List):
+        if columns.initial_value is None:
+            raise TypingError('{} Unsupported use of parameter columns:'
+                              ' expected list of constant strings. Given: {}'.format(method_name, columns))
+        else:
+            # this works because a global tuple of strings is captured as a Tuple of StringLiterals
+            columns_as_tuple = tuple(columns.initial_value)
+
+            def _sdc_pandas_dataframe_drop_wrapper_impl(df, labels=None, axis=0, index=None,
+                                                        columns=None, level=None, inplace=False, errors="raise"):
+
+                # if at runtime the columns list differs from its initial value (known at compile time)
+                # we cannot tell which columns to drop or what the resulting DataFrameType is, so raise exception
+                if list(columns_as_tuple) != columns:
+                    raise SDCLimitation("Unsupported use of parameter columns: non-const list was used.")
+
+                return df.drop(labels=labels,
+                               axis=axis,
+                               index=index,
+                               columns=columns_as_tuple,
+                               level=level,
+                               inplace=inplace,
+                               errors=errors)
+
+            return _sdc_pandas_dataframe_drop_wrapper_impl
+
     args = {'labels': None, 'axis': 0, 'index': None, 'columns': None, 'level': None, 'inplace': False,
             'errors': f'"raise"'}
 
-    def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
+    def sdc_pandas_dataframe_drop_impl(df, args, columns):
         func_args = ['df']
         for key, value in args.items():
             if key not in func_args:
@@ -1421,18 +1520,19 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
 
         if isinstance(columns, types.StringLiteral):
             drop_cols = (columns.literal_value,)
-        elif isinstance(columns, types.Tuple):
+        elif isinstance(columns, (types.Tuple, types.UniTuple)):
             drop_cols = tuple(column.literal_value for column in columns)
         else:
             raise ValueError('Only drop by one column or tuple of columns is currently supported in df.drop()')
 
-        func_def, global_vars = sdc_pandas_dataframe_drop_codegen(_func_name, func_args, df, drop_cols)
+        func_name = 'sdc_pandas_dataframe_drop_impl'
+        func_def, global_vars = sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols)
         loc_vars = {}
         exec(func_def, global_vars, loc_vars)
-        _drop_impl = loc_vars['sdc_pandas_dataframe_drop_impl']
+        _drop_impl = loc_vars[func_name]
         return _drop_impl
 
-    return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)
+    return sdc_pandas_dataframe_drop_impl(df, args, columns)
 
 
 def df_length_expr(self):
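Note (reviewer comment, not part of the patch): a minimal usage sketch of the new list form of ``columns`` in ``df.drop()``. Function name and data are illustrative; the list literal must reach drop() unmodified so that its compile-time initial_value matches the runtime value:

    import pandas as pd
    import sdc  # noqa: F401 -- registers the DataFrame overloads
    from numba import njit

    @njit
    def drop_two(df):
        # the list literal is typed as types.List with a known initial_value,
        # so the overload re-dispatches with a tuple of string literals
        return df.drop(columns=['A', 'AA'])

    df = pd.DataFrame({'A': [1, 2], 'AA': [3, 4], 'B': [5, 6]})
    print(drop_two(df))  # only column 'B' remains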
@@ -1454,24 +1554,33 @@ def df_index_expr(self, length_expr=None):
 
 def df_getitem_slice_idx_main_codelines(self, idx):
     """Generate main code lines for df.getitem with idx of slice"""
+
+    types_order = get_structure_maps(self.data, self.columns)[2]
+    n_lists = len(types_order)
+
     results = []
-    func_lines = [f'  res_index = self.index[idx]']
-    for i, col in enumerate(self.columns):
-        col_loc = self.column_loc[col]
-        type_id, col_id = col_loc.type_id, col_loc.col_id
-        res_data = f'res_data_{i}'
+    func_lines = []
+    for i in range(n_lists):
         func_lines += [
-            f'  data_{i} = self._data[{type_id}][{col_id}][idx]',
-            f'  {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")'
+            f'  list_{i} = self._data[{i}].copy()',
+            f'  for i, item in enumerate(list_{i}):',
+            f'    list_{i}[i] = item[idx]'
         ]
-        results.append((col, res_data))
-
-    data = ', '.join(f'"{col}": {data}' for col, data in results)
-    func_lines += [f'  return pandas.DataFrame({{{data}}}, index=res_index)']
+    all_lists_joined = ', '.join([f'list_{i}' for i in range(n_lists)]) + ', '
+    res_data = f'({all_lists_joined})' if n_lists > 0 else '()'
+    func_lines += [
+        f'  if self_index_is_none == True:',
+        f'    old_index = pandas.RangeIndex(len(self))',
+        f'  else:',
+        f'    old_index = self._index',
+        f'  res_data = {res_data}',
+        f'  res_index = old_index[idx]',
+        f'  return init_dataframe_internal(res_data, res_index, df_type)'
+    ]
 
     return func_lines
 
-
 def df_getitem_tuple_idx_main_codelines(self, literal_idx):
     """Generate main code lines for df.getitem with idx of tuple"""
     results = []
@@ -1586,13 +1695,17 @@ def df_getitem_key_error_codelines():
 
 def df_getitem_slice_idx_codegen(self, idx):
     """
    Example of generated implementation with provided index:
-        def _df_getitem_slice_idx_impl(self, idx)
-            res_index = self._index
-            data_0 = self._data[0]
-            res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
-            data_1 = self._data [1]
-            res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
-            return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
+        def _df_getitem_slice_idx_impl(self, idx):
+            list_0 = self._data[0].copy()
+            for i, item in enumerate(list_0):
+                list_0[i] = item[idx]
+            if self_index_is_none == True:
+                old_index = pandas.RangeIndex(len(self))
+            else:
+                old_index = self._index
+            res_data = (list_0, )
+            res_index = old_index[idx]
+            return init_dataframe_internal(res_data, res_index, df_type)
     """
     func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
     if self.columns:
@@ -1601,7 +1714,17 @@ def _df_getitem_slice_idx_impl(self, idx)
         # raise KeyError if input DF is empty
         func_lines += df_getitem_key_error_codelines()
     func_text = '\n'.join(func_lines)
-    global_vars = {'pandas': pandas, 'numpy': numpy}
+
+    # TO-DO: need DefaultIndex to handle self.index[idx] construct inside func
+    self_index_is_none = isinstance(self.index, types.NoneType)
+    new_index_type = RangeIndexType(False) if self_index_is_none else self.index
+    df_type = DataFrameType(self.data, new_index_type, self.columns, column_loc=self.column_loc)
+
+    global_vars = {'pandas': pandas,
+                   'numpy': numpy,
+                   'df_type': df_type,
+                   'init_dataframe_internal': init_dataframe_internal,
+                   'self_index_is_none': self_index_is_none}
 
     return func_text, global_vars
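Note (reviewer comment, not part of the patch): a minimal usage sketch of the reworked slice getitem, assuming ``import sdc`` registers the overloads; function name and data are illustrative:

    import pandas as pd
    import sdc  # noqa: F401 -- registers the DataFrame overloads
    from numba import njit

    @njit
    def head_two(df):
        # compiled through the generated _df_getitem_slice_idx_impl above,
        # which rebuilds the frame with init_dataframe_internal
        return df[:2]

    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4.0, 5.0, 6.0]})
    print(head_two(df))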
diff --git a/sdc/datatypes/hpat_pandas_functions.py b/sdc/datatypes/hpat_pandas_functions.py
index a82813836..7bbbf951d 100644
--- a/sdc/datatypes/hpat_pandas_functions.py
+++ b/sdc/datatypes/hpat_pandas_functions.py
@@ -39,12 +39,13 @@
 from sdc.io.csv_ext import (
     _gen_csv_reader_py_pyarrow_py_func,
-    _gen_csv_reader_py_pyarrow_func_text_dataframe,
+    _gen_pandas_read_csv_func_text,
 )
 from sdc.str_arr_ext import string_array_type
 from sdc.hiframes import join, aggregate, sort
 from sdc.types import CategoricalDtypeType, Categorical
+from sdc.datatypes.categorical.pdimpl import _reconstruct_CategoricalDtype
 
 
 def get_numba_array_types_for_csv(df):
@@ -255,45 +256,69 @@ def sdc_pandas_read_csv(
         usecols = [col.literal_value for col in usecols]
 
     if infer_from_params:
-        # dtype should be constants and is important only for inference from params
+        # dtype is a tuple of format ('A', A_dtype, 'B', B_dtype, ...)
+        # where column names must be constants; it matters only for inference from params
         if isinstance(dtype, types.Tuple):
-            assert all(isinstance(key, types.Literal) for key in dtype[::2])
+            assert all(isinstance(key, types.StringLiteral) for key in dtype[::2])
             keys = (k.literal_value for k in dtype[::2])
             values = dtype[1::2]
-            values = [v.typing_key if isinstance(v, types.Function) else v for v in values]
-            values = [types.Array(numba.from_dtype(np.dtype(v.literal_value)), 1, 'C')
-                      if isinstance(v, types.Literal) else v for v in values]
-            values = [types.Array(types.int_, 1, 'C') if v == int else v for v in values]
-            values = [types.Array(types.float64, 1, 'C') if v == float else v for v in values]
-            values = [string_array_type if v == str else v for v in values]
-            values = [Categorical(v) if isinstance(v, CategoricalDtypeType) else v for v in values]
-            dtype = dict(zip(keys, values))
+            def _get_df_col_type(dtype):
+                if isinstance(dtype, types.Function):
+                    if dtype.typing_key == int:
+                        return types.Array(types.int_, 1, 'C')
+                    elif dtype.typing_key == float:
+                        return types.Array(types.float64, 1, 'C')
+                    elif dtype.typing_key == str:
+                        return string_array_type
+                    else:
+                        assert False, f"_get_df_col_type: failed to infer column type for dtype={dtype}"
+
+                if isinstance(dtype, types.StringLiteral):
+                    if dtype.literal_value == 'str':
+                        return string_array_type
+                    else:
+                        return types.Array(numba.from_dtype(np.dtype(dtype.literal_value)), 1, 'C')
+
+                if isinstance(dtype, types.NumberClass):
+                    return types.Array(dtype.dtype, 1, 'C')
+
+                if isinstance(dtype, CategoricalDtypeType):
+                    return Categorical(dtype)
+
+            col_types_map = dict(zip(keys, map(_get_df_col_type, values)))
 
     # in case both are available,
     # inferencing from params has priority over inferencing from file
     if infer_from_params:
-        col_names = names  # all names should be in dtype
-        return_columns = usecols if usecols else names
-        col_typs = [dtype[n] for n in return_columns]
+        col_names = usecols if usecols else names
+        col_types = [col_types_map[n] for n in col_names]
     elif infer_from_file:
-        col_names, col_typs = infer_column_names_and_types_from_constant_filename(
+        col_names, col_types = infer_column_names_and_types_from_constant_filename(
             filepath_or_buffer, delimiter, names, usecols, skiprows)
     else:
         return None
 
-    dtype_present = not isinstance(dtype, (types.Omitted, type(None)))
+    def _get_py_col_dtype(ctype):
+        """ Re-creates column dtype as python type to be used in read_csv call """
+        dtype = ctype.dtype
+        if ctype == string_array_type:
+            return str
+        if isinstance(ctype, Categorical):
+            return _reconstruct_CategoricalDtype(ctype.pd_dtype)
+        return numpy_support.as_dtype(dtype)
+
+    py_col_dtypes = {cname: _get_py_col_dtype(ctype) for cname, ctype in zip(col_names, col_types)}
 
     # generate function text with signature and returning DataFrame
-    func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_dataframe(
-        col_names, col_typs, dtype_present, usecols, signature)
+    func_text, func_name, global_vars = _gen_pandas_read_csv_func_text(
+        col_names, col_types, py_col_dtypes, usecols, signature)
 
     # compile with Python
-    csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name)
+    csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars)
 
     return csv_reader_py
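Note (reviewer comment, not part of the patch): an illustrative sketch of the function text that _gen_pandas_read_csv_func_text (see sdc/io/csv_ext.py below) emits for this overload, assuming two columns A: int64, B: float64, no date columns and no unnamed headers; the actual signature and the DataFrameType repr depend on the inferred types:

    def csv_reader_py(filepath_or_buffer, names, usecols, dtype, skiprows, sep, delimiter):
        # the string passed to objmode is the repr of the inferred DataFrameType (abbreviated here)
        with objmode(df="DataFrameType((Array(int64, 1, 'C'), Array(float64, 1, 'C')), none, ('A', 'B'), False, column_loc=...)"):
            df = pandas_read_csv(
                filepath_or_buffer=filepath_or_buffer,
                names=names,
                skiprows=skiprows,
                parse_dates=[],
                dtype=read_as_dtypes,  # python dict of column dtypes captured at compile time
                usecols=usecols,
                sep=sep,
                delimiter=delimiter,
            )
        return df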
diff --git a/sdc/datatypes/hpat_pandas_groupby_functions.py b/sdc/datatypes/hpat_pandas_groupby_functions.py
index 9d5d86878..83f752e9d 100644
--- a/sdc/datatypes/hpat_pandas_groupby_functions.py
+++ b/sdc/datatypes/hpat_pandas_groupby_functions.py
@@ -160,6 +160,8 @@ def sdc_pandas_dataframe_getitem(self, idx):
     target_col_loc = self.parent.column_loc[self.parent.columns[target_col_id_literal]]
     target_type_id, target_col_id = target_col_loc.type_id, target_col_loc.col_id
 
+    parent_df_col_names = self.parent.columns
+
     def sdc_pandas_dataframe_getitem_common_impl(self, idx):
 
         # calling getitem twice raises IndexError, just as in pandas
@@ -170,7 +172,7 @@ def sdc_pandas_dataframe_getitem_common_impl(self, idx):
             # no need to pass index into this series, as we group by array
             target_series = pandas.Series(
                 data=self._parent._data[target_type_id][target_col_id],
-                name=self._parent._columns[target_col_id_literal]
+                name=parent_df_col_names[target_col_id_literal]
             )
             by_arr_data = self._parent._data[by_type_id][by_col_id]
             return init_series_groupby(target_series, by_arr_data, self._data, self._sort)
diff --git a/sdc/decorators.py b/sdc/decorators.py
index aa5ad0f5e..adb779059 100644
--- a/sdc/decorators.py
+++ b/sdc/decorators.py
@@ -31,6 +31,9 @@
 import numba
 import sdc
 
+from functools import wraps
+from sdc.utilities.utils import print_compile_times
+
 
 def jit(signature_or_function=None, **options):
@@ -44,3 +47,29 @@ def jit(signature_or_function=None, **options):
     Use Numba compiler pipeline
     '''
     return numba.jit(signature_or_function, **options)
+
+
+def debug_compile_time(level=1, func_names=None):
+    """ Decorates a Numba Dispatcher object to print compile stats after the call.
+    Usage:
+        @debug_compile_time()
+        @numba.njit
+        def func(...): ...
+
+    Args:
+        level: if zero, prints only a short summary
+        func_names: filters output to include only functions whose names contain one of the listed strings
+    """
+
+    def get_wrapper(disp):
+
+        @wraps(disp)
+        def wrapper(*args, **kwargs):
+            res = disp(*args, **kwargs)
+            print('*' * 40, 'COMPILE STATS', '*' * 40)
+            print_compile_times(disp, level=level, func_names=func_names)
+            print('*' * 95)
+            return res
+
+        return wrapper
+
+    return get_wrapper
diff --git a/sdc/hiframes/api.py b/sdc/hiframes/api.py
index b1e22f536..77436f49b 100644
--- a/sdc/hiframes/api.py
+++ b/sdc/hiframes/api.py
@@ -167,25 +167,25 @@ def fix_df_array_list_str_impl(column):  # pragma: no cover
     return lambda column: column
 
 
-def fix_df_index(index, *columns):
+def fix_df_index(index):
     return index
 
 
 @overload(fix_df_index)
-def fix_df_index_overload(index, *columns):
+def fix_df_index_overload(index):
 
     # TO-DO: replace types.none index with separate type, e.g. DefaultIndex
     if (index is None or isinstance(index, types.NoneType)):
-        def fix_df_index_impl(index, *columns):
+        def fix_df_index_impl(index):
             return None
 
     elif isinstance(index, RangeIndexType):
-        def fix_df_index_impl(index, *columns):
+        def fix_df_index_impl(index):
             return index
 
     else:  # default case, transform index the same as df data
-        def fix_df_index_impl(index, *columns):
+        def fix_df_index_impl(index):
             return fix_df_array(index)
 
     return fix_df_index_impl
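Note (reviewer comment, not part of the patch): a minimal usage sketch of the new debug_compile_time decorator from sdc/decorators.py above; function name and data are illustrative:

    import numpy as np
    import pandas as pd
    from numba import njit
    from sdc.decorators import debug_compile_time

    @debug_compile_time(level=0)  # level=0 prints only the short summary
    @njit
    def series_sum(S1, S2):
        return S1 + S2

    # first call triggers compilation; compile stats are printed after the call
    print(series_sum(pd.Series(np.ones(5)), pd.Series(np.arange(5))))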
""" n_cols = len(typ.columns) - column_strs = [numba.cpython.unicode.make_string_from_constant( - c.context, c.builder, string_type, a) for a in typ.columns] # create dataframe struct and store values dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder) errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit) - col_list_type = types.List(string_type) - ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, col_list_type, n_cols) - - with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok): - with if_ok: - inst.size = c.context.get_constant(types.intp, n_cols) - for i, column_str in enumerate(column_strs): - inst.setitem(c.context.get_constant(types.intp, i), column_str, incref=False) - dataframe.columns = inst.value - - with if_not_ok: - c.builder.store(cgutils.true_bit, errorptr) - - # If an error occurred, drop the whole native list - with c.builder.if_then(c.builder.load(errorptr)): - c.context.nrt.decref(c.builder, col_list_type, inst.value) - _, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns) for col_typ in types_order: @@ -134,6 +115,8 @@ def unbox_dataframe(typ, val, c): native_val = _unbox_series_data(column_dtype, ty_series, arr_obj, c) inst.setitem(c.context.get_constant(types.intp, i), native_val.value, incref=False) + c.pyapi.decref(arr_obj) + c.pyapi.decref(series_obj) dataframe.data = c.builder.insert_value(dataframe.data, inst.value, type_id) @@ -150,12 +133,6 @@ def unbox_dataframe(typ, val, c): dataframe.parent = val - # increase refcount of stored values - if c.context.enable_nrt: - # TODO: other objects? - for var in column_strs: - c.context.nrt.incref(c.builder, string_type, var) - return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr)) diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index f863423a3..65d10ef52 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -26,7 +26,6 @@ import operator -from typing import NamedTuple import numba from numba import types @@ -39,7 +38,7 @@ from numba.core.imputils import impl_ret_new_ref, impl_ret_borrowed from sdc.hiframes.pd_series_ext import SeriesType -from sdc.hiframes.pd_dataframe_type import DataFrameType +from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc from sdc.str_ext import string_type @@ -54,11 +53,6 @@ def generic_resolve(self, df, attr): return SeriesType(arr_typ.dtype, arr_typ, df.index, True) -class ColumnLoc(NamedTuple): - type_id: int - col_id: int - - def get_structure_maps(col_types, col_names): # Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)} column_loc = {} @@ -86,62 +80,27 @@ def get_structure_maps(col_types, col_names): @intrinsic -def init_dataframe(typingctx, *args): - """Create a DataFrame with provided data, index and columns values. - Used as a single constructor for DataFrame and assigning its data, so that - optimization passes can look for init_dataframe() to see if underlying - data has changed, and get the array variables from init_dataframe() args if - not changed. 
- """ - - n_cols = len(args) // 2 - data_typs = tuple(args[:n_cols]) - index_typ = args[n_cols] - column_names = tuple(a.literal_value for a in args[n_cols + 1:]) - - column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names) - - def codegen(context, builder, signature, args): - in_tup = args[0] - data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)] - index = builder.extract_value(in_tup, n_cols) - column_strs = [numba.cpython.unicode.make_string_from_constant( - context, builder, string_type, c) for c in column_names] - # create dataframe struct and store values - dataframe = cgutils.create_struct_proxy( - signature.return_type)(context, builder) +def init_dataframe_internal(typingctx, data, index, df_type): - data_list_type = [types.List(typ) for typ in types_order] + ret_type = df_type.instance_type - data_lists = [] - for typ_id, typ in enumerate(types_order): - data_list_typ = context.build_list(builder, data_list_type[typ_id], - [data_arrs[data_id] for data_id in data_typs_map[typ][1]]) - data_lists.append(data_list_typ) + def codegen(context, builder, sig, args): + data_val, index_val = args[:2] - data_tup = context.make_tuple( - builder, types.Tuple(data_list_type), data_lists) - - col_list_type = types.List(string_type) - column_list = context.build_list(builder, col_list_type, column_strs) - - dataframe.data = data_tup - dataframe.index = index - dataframe.columns = column_list + dataframe = cgutils.create_struct_proxy( + sig.return_type)(context, builder) + dataframe.data = data_val + dataframe.index = index_val dataframe.parent = context.get_constant_null(types.pyobject) # increase refcount of stored values if context.enable_nrt: - context.nrt.incref(builder, index_typ, index) - for var, typ in zip(data_arrs, data_typs): - context.nrt.incref(builder, typ, var) - for var in column_strs: - context.nrt.incref(builder, string_type, var) + context.nrt.incref(builder, sig.args[0], data_val) + context.nrt.incref(builder, sig.args[1], index_val) return dataframe._getvalue() - ret_typ = DataFrameType(data_typs, index_typ, column_names, column_loc=column_loc) - sig = signature(ret_typ, types.Tuple(args)) + sig = signature(ret_type, data, index, df_type) return sig, codegen diff --git a/sdc/hiframes/pd_dataframe_type.py b/sdc/hiframes/pd_dataframe_type.py index a962329c3..9dd5fcaf4 100644 --- a/sdc/hiframes/pd_dataframe_type.py +++ b/sdc/hiframes/pd_dataframe_type.py @@ -24,6 +24,8 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import re +from typing import NamedTuple import numba from numba import types @@ -48,7 +50,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, column self.has_parent = has_parent self.column_loc = column_loc super(DataFrameType, self).__init__( - name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent)) + name="DataFrameType({}, {}, {}, {})".format(data, index, columns, has_parent)) def copy(self, index=None, has_parent=None): # XXX is copy necessary? @@ -83,6 +85,16 @@ def unify(self, typingctx, other): def is_precise(self): return all(a.is_precise() for a in self.data) and self.index.is_precise() + def __repr__(self): + + # To have correct repr of DataFrame we need some changes to what types.Type gives: + # (1) e.g. 
array(int64, 1d, C) should be Array(int64, 1, 'C') + # (2) ColumnLoc is not part of DataFrame name, so we need to add it + default_repr = super(DataFrameType, self).__repr__() + res = re.sub(r'array\((\w+), 1d, C\)', r'Array(\1, 1, \'C\')', default_repr) + res = re.sub(r'\)$', f', column_loc={self.column_loc})', res) + return res + @register_model(DataFrameType) class DataFrameModel(models.StructModel): @@ -98,14 +110,20 @@ def __init__(self, dmm, fe_type): members = [ ('data', types.Tuple([types.List(typ) for typ in df_types])), ('index', fe_type.index), - ('columns', types.List(string_type)), ('parent', types.pyobject), ] super(DataFrameModel, self).__init__(dmm, fe_type, members) +class ColumnLoc(NamedTuple): + type_id: int + col_id: int + + +# FIXME_Numba#3372: add into numba.types to allow returning from objmode +types.DataFrameType = DataFrameType +types.ColumnLoc = ColumnLoc + make_attribute_wrapper(DataFrameType, 'data', '_data') make_attribute_wrapper(DataFrameType, 'index', '_index') -make_attribute_wrapper(DataFrameType, 'columns', '_columns') -make_attribute_wrapper(DataFrameType, 'unboxed', '_unboxed') make_attribute_wrapper(DataFrameType, 'parent', '_parent') diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py index 245426643..d48aaf0f1 100644 --- a/sdc/hiframes/pd_series_ext.py +++ b/sdc/hiframes/pd_series_ext.py @@ -138,7 +138,7 @@ def pd_series_overload(data=None, index=None, dtype=None, name=None, copy=False, def hpat_pandas_series_ctor_impl(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): fix_data = sdc.hiframes.api.fix_df_array(data) - fix_index = sdc.hiframes.api.fix_df_index(index, fix_data) + fix_index = sdc.hiframes.api.fix_df_index(index) return sdc.hiframes.api.init_series(fix_data, fix_index, name) return hpat_pandas_series_ctor_impl diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py index 2a569945d..78aaafeb6 100644 --- a/sdc/io/csv_ext.py +++ b/sdc/io/csv_ext.py @@ -24,7 +24,6 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** - import contextlib import functools @@ -45,6 +44,8 @@ from sdc.utilities.utils import (debug_prints, alloc_arr_tup, empty_like_type, _numba_to_c_type_map) from sdc.distributed_analysis import Distribution +from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc +from sdc.hiframes.pd_dataframe_ext import get_structure_maps from sdc.str_ext import string_type from sdc.str_arr_ext import (string_array_type, to_string_list, cp_str_list_to_array, str_list_to_array, @@ -56,13 +57,14 @@ import pandas as pd import numpy as np -from sdc.types import CategoricalDtypeType, Categorical +from sdc.types import Categorical import pyarrow import pyarrow.csv class CsvReader(ir.Stmt): + def __init__(self, file_name, df_out, sep, df_colnames, out_vars, out_types, usecols, loc, skiprows=0): self.file_name = file_name self.df_out = df_out @@ -263,8 +265,10 @@ def csv_distributed_run(csv_node, array_dists, typemap, calltypes, typingctx, ta # get global array sizes by calling allreduce on chunk lens # TODO: get global size from C for arr in csv_node.out_vars: + def f(A): return sdc.distributed_api.dist_reduce(len(A), np.int32(_op)) + f_block = compile_to_numba_ir( f, {'sdc': sdc, 'np': np, '_op': sdc.distributed_api.Reduce_Type.Sum.value}, @@ -287,6 +291,7 @@ def f(A): class StreamReaderType(types.Opaque): + def __init__(self): super(StreamReaderType, self).__init__(name='StreamReaderType') @@ -300,36 +305,6 @@ def box_stream_reader(typ, val, c): return val -def _get_dtype_str(t): - dtype = t.dtype - - if isinstance(t, Categorical): - # return categorical representation - # for some reason pandas and pyarrow read_csv() return CategoricalDtype with - # ordered=False in case when dtype is with ordered=None - return str(t).replace('ordered=None', 'ordered=False') - - if dtype == types.NPDatetime('ns'): - dtype = 'NPDatetime("ns")' - if t == string_array_type: - # HACK: add string_array_type to numba.types - # FIXME: fix after Numba #3372 is resolved - types.string_array_type = string_array_type - return 'string_array_type' - return '{}[::1]'.format(dtype) - - -def _get_pd_dtype_str(t): - dtype = t.dtype - if isinstance(t, Categorical): - return 'pd.{}'.format(t.pd_dtype) - if dtype == types.NPDatetime('ns'): - dtype = 'str' - if t == string_array_type: - return 'str' - return 'np.{}'.format(dtype) - - # XXX: temporary fix pending Numba's #3378 # keep the compiled functions around to make sure GC doesn't delete them and # the reference to the dynamic function inside them @@ -343,7 +318,7 @@ def to_varname(string): Replaces unavailable symbols with _ and insert _ if string starts with digit. """ import re - return re.sub(r'\W|^(?=\d)','_', string) + return re.sub(r'\W|^(?=\d)', '_', string) @contextlib.contextmanager @@ -358,10 +333,12 @@ def pyarrow_cpu_count(cpu_count=pyarrow.cpu_count()): def pyarrow_cpu_count_equal_numba_num_treads(func): """Decorator. 
Set pyarrow cpu_count the same as NUMBA_NUM_THREADS."""
+
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
         with pyarrow_cpu_count(numba.config.NUMBA_NUM_THREADS):
             return func(*args, **kwargs)
+
     return wrapper
 
@@ -541,84 +518,81 @@ def pandas_read_csv(
     return dataframe
 
-def _gen_csv_reader_py_pyarrow(col_names, col_typs, usecols, sep, typingctx, targetctx, parallel, skiprows):
-    func_text, func_name = _gen_csv_reader_py_pyarrow_func_text(col_names, col_typs, usecols, sep, skiprows)
-    csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name)
-    return _gen_csv_reader_py_pyarrow_jit_func(csv_reader_py)
+def _gen_pandas_read_csv_func_text(col_names, col_typs, py_col_dtypes, usecols, signature=None):
+    func_name = 'csv_reader_py'
+    return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names
+
+    column_loc, _, _ = get_structure_maps(col_typs, return_columns)
+    df_type = DataFrameType(
+        tuple(col_typs),
+        types.none,
+        tuple(col_names),
+        column_loc=column_loc
+    )
+
+    df_type_repr = repr(df_type)
+    # for some reason pandas and pyarrow read_csv() return CategoricalDtype with
+    # ordered=False in case when dtype is with ordered=None
+    df_type_repr = df_type_repr.replace('ordered=None', 'ordered=False')
 
-def _gen_csv_reader_py_pyarrow_func_text_core(col_names, col_typs, dtype_present, usecols, signature=None):
     # TODO: support non-numpy types like strings
     date_inds = ", ".join(str(i) for i, t in enumerate(col_typs) if t.dtype == types.NPDatetime('ns'))
     return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names
-    nb_objmode_vars = ", ".join([
-        '{}="{}"'.format(to_varname(cname), _get_dtype_str(t))
-        for cname, t in zip(return_columns, col_typs)
-    ])
-    pd_dtype_strs = ", ".join([
-        "'{}': {}".format(cname, _get_pd_dtype_str(t))
-        for cname, t in zip(return_columns, col_typs)
-    ])
 
     if signature is None:
         signature = "filepath_or_buffer"
-    func_text = "def csv_reader_py({}):\n".format(signature)
-    func_text += "  with objmode({}):\n".format(nb_objmode_vars)
-    func_text += "    df = pandas_read_csv(filepath_or_buffer,\n"
+
+    # map generated func params into values used in inner call of pandas_read_csv
+    # if no transformation is needed just use outer param name (since APIs match)
+    # otherwise use value in the dictionary
+    inner_call_params = {'parse_dates': f"[{date_inds}]"}
+    used_read_csv_params = (
+        'filepath_or_buffer',
+        'names',
+        'skiprows',
+        'parse_dates',
+        'dtype',
+        'usecols',
+        'sep',
+        'delimiter'
+    )
 
     # pyarrow reads unnamed header as " ", pandas reads it as "Unnamed: N"
     # during inference from file, names should be replaced with "Unnamed: N"
     # passing names to pyarrow means that one row is header and should be skipped
     if col_names and any(map(lambda x: x.startswith('Unnamed: '), col_names)):
-        func_text += "      names={},\n".format(col_names)
-        func_text += "      skiprows=(skiprows and skiprows + 1) or 1,\n"
-    else:
-        func_text += "      names=names,\n"
-        func_text += "      skiprows=skiprows,\n"
-
-    func_text += "      parse_dates=[{}],\n".format(date_inds)
-
-    # Python objects (e.g. 
str, np.float) could not be jitted and passed to objmode - # so they are hardcoded to function - # func_text += " dtype={{{}}},\n".format(pd_dtype_strs) if dtype_present else \ - # " dtype=dtype,\n" - # dtype is hardcoded because datetime should be read as string - func_text += " dtype={{{}}},\n".format(pd_dtype_strs) - - func_text += " usecols=usecols,\n" - func_text += " sep=sep,\n" - func_text += " delimiter=delimiter,\n" - func_text += " )\n" - for cname in return_columns: - func_text += " {} = df['{}'].values\n".format(to_varname(cname), cname) - # func_text += " print({})\n".format(cname) - return func_text, 'csv_reader_py' - - -def _gen_csv_reader_py_pyarrow_func_text(col_names, col_typs, usecols): - func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_core(col_names, col_typs, usecols) + inner_call_params['names'] = str(col_names) + inner_call_params['skiprows'] = "(skiprows and skiprows + 1) or 1" - func_text += " return ({},)\n".format(", ".join(to_varname(c) for c in col_names)) + # dtype parameter of compiled function is not used at all, instead a python dict + # of columns dtypes is captured at compile time, because some dtypes (like datetime) + # are converted and also to avoid penalty of creating dict in objmode + inner_call_params['dtype'] = 'read_as_dtypes' - return func_text, func_name - - -def _gen_csv_reader_py_pyarrow_func_text_dataframe(col_names, col_typs, dtype_present, usecols, signature): - func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_core( - col_names, col_typs, dtype_present, usecols, signature) - return_columns = usecols if usecols and isinstance(usecols[0], str) else col_names + params_str = '\n'.join([ + f" {param}={inner_call_params.get(param, param)}," for param in used_read_csv_params + ]) + func_text = '\n'.join([ + f"def {func_name}({signature}):", + f" with objmode(df=\"{df_type_repr}\"):", + f" df = pandas_read_csv(\n{params_str}", + f" )", + f" return df" + ]) - func_text += " return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, None, {})\n".format( - ", ".join(to_varname(c) for c in return_columns), - ", ".join("'{}'".format(c) for c in return_columns) - ) + global_vars = { + 'read_as_dtypes': py_col_dtypes, + 'objmode': objmode, + 'pandas_read_csv': pandas_read_csv, + } - return func_text, func_name + return func_text, func_name, global_vars -def _gen_csv_reader_py_pyarrow_py_func(func_text, func_name): +def _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars): locals = {} - exec(func_text, globals(), locals) + exec(func_text, global_vars, locals) func = locals[func_name] return func @@ -628,6 +602,3 @@ def _gen_csv_reader_py_pyarrow_jit_func(csv_reader_py): jit_func = numba.njit(csv_reader_py) compiled_funcs.append(jit_func) return jit_func - - -_gen_csv_reader_py = _gen_csv_reader_py_pyarrow diff --git a/sdc/rewrites/dataframe_constructor.py b/sdc/rewrites/dataframe_constructor.py index c9538759e..debf0b73c 100644 --- a/sdc/rewrites/dataframe_constructor.py +++ b/sdc/rewrites/dataframe_constructor.py @@ -24,13 +24,18 @@ # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** - +import numba +from numba.core import cgutils, types from numba.core.rewrites import (register_rewrite, Rewrite) from numba.core.ir_utils import (guard, find_callname) from numba.core.ir import (Expr) from numba.extending import overload +from numba.core.extending import intrinsic +from numba.core.typing import signature from pandas import DataFrame +from sys import modules +from textwrap import dedent from sdc.rewrites.ir_utils import (find_operations, is_dict, get_tuple_items, get_dict_items, remove_unused_recursively, @@ -38,9 +43,11 @@ declare_constant, import_function, make_call, insert_before) -from sdc.hiframes.pd_dataframe_ext import (init_dataframe, DataFrameType) - +from sdc.hiframes import pd_dataframe_ext as pd_dataframe_ext_module +from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc +from sdc.hiframes.pd_dataframe_ext import get_structure_maps from sdc.hiframes.api import fix_df_array, fix_df_index +from sdc.str_ext import string_type @register_rewrite('before-inference') @@ -54,6 +61,7 @@ class RewriteDataFrame(Rewrite): _df_arg_list = ('data', 'index', 'columns', 'dtype', 'copy') def __init__(self, pipeline): + self._pipeline = pipeline super().__init__(pipeline) self._reset() @@ -79,18 +87,45 @@ def match(self, func_ir, block, typemap, calltypes): return len(self._calls_to_rewrite) > 0 def apply(self): - init_df_stmt = import_function(init_dataframe, self._block, self._func_ir) - for stmt in self._calls_to_rewrite: args = get_call_parameters(call=stmt.value, arg_names=self._df_arg_list) - old_data = args['data'] - args['data'], args['columns'] = self._extract_dict_args(args, self._func_ir) + args_len = len(args['data']) + func_name = f'init_dataframe_{args_len}' + + # injected_module = modules[pd_dataframe_ext_module.__name__] + init_df = getattr(pd_dataframe_ext_module, func_name, None) + if init_df is None: + init_df_text = gen_init_dataframe_text(func_name, args_len) + init_df = gen_init_dataframe_func( + func_name, + init_df_text, + { + 'numba': numba, + 'cgutils': cgutils, + 'signature': signature, + 'types': types, + 'get_structure_maps': get_structure_maps, + 'intrinsic': intrinsic, + 'DataFrameType': DataFrameType, + 'ColumnLoc': ColumnLoc, + 'string_type': string_type, + 'intrinsic': intrinsic, + 'fix_df_array': fix_df_array, + 'fix_df_index': fix_df_index + }) + + setattr(pd_dataframe_ext_module, func_name, init_df) + init_df.__module__ = pd_dataframe_ext_module.__name__ + init_df._defn.__module__ = pd_dataframe_ext_module.__name__ + + init_df_stmt = import_function(init_df, self._block, self._func_ir) self._replace_call(stmt, init_df_stmt.target, args, self._block, self._func_ir) remove_unused_recursively(old_data, self._block, self._func_ir) + self._pipeline.typingctx.refresh() return self._block @@ -130,42 +165,112 @@ def _replace_call(stmt, new_call, args, block, func_ir): columns_args = args['columns'] index_args = args.get('index') - data_args = RewriteDataFrame._replace_data_with_arrays(data_args, stmt, block, func_ir) - if index_args is None: # index arg was omitted none_stmt = declare_constant(None, block, func_ir, stmt.loc) index_args = none_stmt.target - index_and_data_args = [index_args] + data_args - index_args = RewriteDataFrame._replace_index_with_arrays(index_and_data_args, stmt, block, func_ir) + index_args = [index_args] all_args = data_args + index_args + columns_args call = Expr.call(new_call, all_args, {}, func.loc) stmt.value = call - 
@staticmethod
-    def _replace_data_with_arrays(args, stmt, block, func_ir):
-        new_args = []
-        for var in args:
-            call_stmt = make_call(fix_df_array, [var], {}, block, func_ir, var.loc)
-            insert_before(block, call_stmt, stmt)
-            new_args.append(call_stmt.target)
+def gen_init_dataframe_text(func_name, n_cols):
+    args_col_data = ['c' + str(i) for i in range(n_cols)]
+    args_col_names = ['n' + str(i) for i in range(n_cols)]
+    params = ', '.join(args_col_data + ['index'] + args_col_names)
+    suffix = ('' if n_cols == 0 else ', ')
+
+    func_text = dedent(
+        f'''
+        @intrinsic
+        def {func_name}(typingctx, {params}):
+            """Create a DataFrame with provided column data and index values.
+            Takes 2n+1 args: n columns of data, index data and n column names.
+            Each column's data is passed as a separate argument to have compact LLVM IR.
+            Used as a generic constructor for native DataFrame objects, which
+            can be used with different input column types (e.g. lists), and
+            the resulting DataFrameType is deduced by applying transform functions
+            (fix_df_array and fix_df_index) to input argument types.
+            """
+
+            n_cols = {n_cols}
+
+            input_data_typs = ({', '.join(args_col_data) + suffix})
+            fnty = typingctx.resolve_value_type(fix_df_array)
+            fixed_col_sigs = []
+            for i in range({n_cols}):
+                to_sig = fnty.get_call_type(typingctx, (input_data_typs[i],), {{}})
+                fixed_col_sigs.append(to_sig)
+            data_typs = tuple(fixed_col_sigs[i].return_type for i in range({n_cols}))
+            need_fix_cols = tuple(data_typs[i] != input_data_typs[i] for i in range({n_cols}))
+
+            input_index_typ = index
+            fnty = typingctx.resolve_value_type(fix_df_index)
+            fixed_index_sig = fnty.get_call_type(typingctx, (input_index_typ,), {{}})
+            index_typ = fixed_index_sig.return_type
+            need_fix_index = index_typ != input_index_typ
+
+            column_names = tuple(a.literal_value for a in ({', '.join(args_col_names) + suffix}))
+            column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names)
+            col_needs_transform = tuple(not isinstance(data_typs[i], types.Array) for i in range(len(data_typs)))
+
+            def codegen(context, builder, sig, args):
+                {params}, = args
+                data_arrs = [{', '.join(args_col_data) + suffix}]
+                data_arrs_transformed = []
+                for i, arr in enumerate(data_arrs):
+                    if need_fix_cols[i] == False:
+                        data_arrs_transformed.append(arr)
+                    else:
+                        res = context.compile_internal(builder, lambda a: fix_df_array(a), fixed_col_sigs[i], [arr])
+                        data_arrs_transformed.append(res)
 
-        return new_args
+                # create dataframe struct and store values
+                dataframe = cgutils.create_struct_proxy(
+                    sig.return_type)(context, builder)
 
-    @staticmethod
-    def _replace_index_with_arrays(args, stmt, block, func_ir):
-        new_args = []
+                data_list_type = [types.List(typ) for typ in types_order]
+
+                data_lists = []
+                for typ_id, typ in enumerate(types_order):
+                    data_arrs_of_typ = [data_arrs_transformed[data_id] for data_id in data_typs_map[typ][1]]
+                    data_list_typ = context.build_list(builder, data_list_type[typ_id], data_arrs_of_typ)
+                    data_lists.append(data_list_typ)
+
+                data_tup = context.make_tuple(
+                    builder, types.Tuple(data_list_type), data_lists)
+
+                if need_fix_index == True:
+                    index = context.compile_internal(builder, lambda a: fix_df_index(a), fixed_index_sig, [index])
+
+                dataframe.data = data_tup
+                dataframe.index = index
+                dataframe.parent = context.get_constant_null(types.pyobject)
+
+                # increase refcount of stored values
+                if context.enable_nrt:
+                    context.nrt.incref(builder, index_typ, index)
+                    for var, typ in zip(data_arrs_transformed, data_typs): 
diff --git a/sdc/str_arr_ext.py b/sdc/str_arr_ext.py
index 6b363d013..1d12a80a5 100644
--- a/sdc/str_arr_ext.py
+++ b/sdc/str_arr_ext.py
@@ -1026,6 +1026,7 @@ def str_arr_getitem_by_array_impl(A, arg):
 def decode_utf8(typingctx, ptr_t, len_t=None):
     def codegen(context, builder, sig, args):
         ptr, length = args
+        nrt_table = context.nrt.get_nrt_api(builder)
 
         # create str and call decode with internal pointers
         uni_str = cgutils.create_struct_proxy(string_type)(context, builder)
@@ -1034,14 +1035,16 @@ def codegen(context, builder, sig, args):
                                 lir.IntType(32).as_pointer(),
                                 lir.IntType(32).as_pointer(),
                                 lir.IntType(64).as_pointer(),
-                                uni_str.meminfo.type.as_pointer()])
+                                uni_str.meminfo.type.as_pointer(),
+                                lir.IntType(8).as_pointer()])
         fn_decode = builder.module.get_or_insert_function(
             fnty, name="decode_utf8")
         builder.call(fn_decode, [ptr, length,
                                  uni_str._get_ptr_by_name('kind'),
                                  uni_str._get_ptr_by_name('is_ascii'),
                                  uni_str._get_ptr_by_name('length'),
                                  uni_str._get_ptr_by_name('meminfo'),
+                                 nrt_table])
         uni_str.hash = context.get_constant(_Py_hash_t, -1)
         uni_str.data = context.nrt.meminfo_data(builder, uni_str.meminfo)
         # Set parent to NULL
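A hedged sketch of the Python-side plumbing behind the new `nrt_table` argument: `context.nrt.get_nrt_api(builder)` yields an `i8*` to Numba's NRT function table, which native code such as `decode_utf8` can then use to allocate NRT-managed memory. The intrinsic below (`nrt_table_addr` is a hypothetical name, not SDC code) only demonstrates obtaining that pointer in a lowering:

```python
from llvmlite import ir as lir
from numba import njit
from numba.core import types
from numba.core.typing import signature
from numba.extending import intrinsic


@intrinsic
def nrt_table_addr(typingctx):
    # zero-argument intrinsic: return the NRT API table pointer as an integer
    def codegen(context, builder, sig, args):
        table = context.nrt.get_nrt_api(builder)  # i8* to the NRT function table
        return builder.ptrtoint(table, lir.IntType(64))
    return signature(types.uint64), codegen


@njit
def demo():
    return nrt_table_addr()


print(hex(demo()))  # a non-null in-process address
```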
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numba
+import numpy as np
+import pandas as pd
+import re
+import unittest
+
+from contextlib import redirect_stdout
+from io import StringIO
+from sdc.tests.test_base import TestCase
+from sdc.decorators import debug_compile_time
+
+
+# regexp patterns for lines in the @debug_compile_time output log
+line_heading = r'\*+\s+COMPILE STATS\s+\*+\n'
+line_function = r'Function: [^\s]+\n'
+line_args = r'\s+Args:.*\n'
+line_pipeline = r'\s+Pipeline: \w+\n'
+line_passes = r'(\s+\w+\s+[\d.]+\n)+'
+line_time = r'\s+Time: [\d.]+\n'
+line_ending = r'\*+\n'
+
+
+class TestCompileTime(TestCase):
+
+    @staticmethod
+    def _gen_usecase_data():
+        n = 11
+        S1 = pd.Series(np.ones(n))
+        S2 = pd.Series(2 ** np.arange(n))
+        return S1, S2
+
+    def test_log_format_summary(self):
+        """ Verifies the shortened log format when only summary info is printed """
+
+        @debug_compile_time(level=0)
+        @self.jit
+        def test_impl(S1, S2):
+            return S1 + S2
+
+        buffer = StringIO()
+        with redirect_stdout(buffer):
+            S1, S2 = self._gen_usecase_data()
+            test_impl(S1, S2)
+
+        entry_format = fr'{line_function}{line_pipeline}{line_time}\n'
+        log_format = fr'^{line_heading}({entry_format})+{line_ending}$'
+        self.assertRegex(buffer.getvalue(), log_format)
+
+    def test_log_format_detailed(self):
+        """ Verifies the detailed log format with passes and args information """
+
+        @debug_compile_time()
+        @self.jit
+        def test_impl(S1, S2):
+            return S1 + S2
+
+        buffer = StringIO()
+        with redirect_stdout(buffer):
+            S1, S2 = self._gen_usecase_data()
+            test_impl(S1, S2)
+
+        entry_format = fr'{line_function}{line_args}{line_pipeline}{line_passes}{line_time}\n'
+        log_format = fr'{line_heading}({entry_format})+{line_ending}'
+        self.assertRegex(buffer.getvalue(), log_format)
+
+    def test_func_names_filter(self):
+        """ Verifies filtering of log entries via the func_names parameter """
+        searched_name = 'add'
+
+        @debug_compile_time(func_names=[searched_name])
+        @self.jit
+        def test_impl(S1, S2):
+            return S1 + S2
+
+        buffer = StringIO()
+        with redirect_stdout(buffer):
+            S1, S2 = self._gen_usecase_data()
+            test_impl(S1, S2)
+
+        line_function = r'Function: ([^\s]+)\n'
+        match_iter = re.finditer(line_function, buffer.getvalue())
+        next(match_iter)  # skip the entry for the top-level func
+        for m in match_iter:
+            self.assertIn(searched_name, m.group(1))
+
+
+if __name__ == "__main__":
+    unittest.main()
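Typical usage of the decorator exercised by these tests, assuming SDC is installed (`level` and `func_names` are the knobs the tests above rely on; the log layout is what the regexes describe):

```python
import numpy as np
import pandas as pd
from numba import njit

from sdc.decorators import debug_compile_time


# level=0 prints only the per-pipeline summary; the default level also
# prints the Args line and the per-pass timings checked by the regexes above
@debug_compile_time(level=0)
@njit
def series_sum(S1, S2):
    return S1 + S2


series_sum(pd.Series(np.ones(5)), pd.Series(np.ones(5)))
# prints the COMPILE STATS block to stdout after compilation
```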
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
index 9102c916f..c245f0694 100644
--- a/sdc/tests/test_dataframe.py
+++ b/sdc/tests/test_dataframe.py
@@ -31,7 +31,7 @@
 import random
 import string
 import unittest
-from itertools import permutations, product
+from itertools import permutations, product, combinations_with_replacement
 from numba import types
 from numba.core.config import IS_32BITS
 from numba import literal_unroll
@@ -98,6 +98,31 @@ def test_impl(n):
         n = 11
         self.assertEqual(hpat_func(n), test_impl(n))
 
+    def test_create_empty_df(self):
+        """ Verifies an empty DF can be created """
+        def test_impl():
+            df = pd.DataFrame({})
+            return len(df)
+        hpat_func = self.jit(test_impl)
+
+        self.assertEqual(hpat_func(), test_impl())
+
+    def test_create_multiple_dfs(self):
+        """ Verifies the generated dataframe ctor is added to the pd_dataframe_ext
+            module correctly (and the numba global context is refreshed), so that
+            subsequent compilations are not broken. """
+        def test_impl(a, b, c):
+            df1 = pd.DataFrame({'A': a, 'B': b})
+            df2 = pd.DataFrame({'C': c})
+            total_cols = len(df1.columns) + len(df2.columns)
+            return total_cols
+        hpat_func = self.jit(test_impl)
+
+        a1 = np.array([1, 2, 3, 4.0, 5])
+        a2 = [7, 6, 5, 4, 3]
+        a3 = ['a', 'b', 'c', 'd', 'e']
+        self.assertEqual(hpat_func(a1, a2, a3), test_impl(a1, a2, a3))
+
     def test_create_str(self):
         def test_impl():
             df = pd.DataFrame({'A': ['a', 'b', 'c']})
@@ -159,7 +184,7 @@ def test_impl(A, B, index):
         result_ref = test_impl(A, B, index)
         pd.testing.assert_frame_equal(result, result_ref)
 
-    def test_create_empty_df(self):
+    def test_unbox_empty_df(self):
         def test_impl(df):
             return df
         sdc_func = self.jit(test_impl)
@@ -242,7 +267,6 @@ def test_impl(df):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
 
-    @skip_numba_jit
     def test_box1(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
@@ -1812,103 +1836,145 @@ def test_impl(df):
         h_out = hpat_func(df)
         pd.testing.assert_frame_equal(out, h_out)
 
-    def test_df_drop_one_column_unboxing(self):
-        def test_impl(df):
-            return df.drop(columns='C D')
-
-        index_to_test = [[1, 2, 3, 4],
-                         [.1, .2, .3, .4],
-                         None,
-                         ['a', 'b', 'c', 'd']]
-
-        sdc_func = self.jit(test_impl)
-
-        for index in index_to_test:
-            with self.subTest(index=index):
-                df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C D': [1.0, 2.0, np.nan, 1.0]},
-                                  index=index)
-                pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
-
     def test_df_drop_one_column(self):
-        def test_impl(index):
+        """ Verifies df.drop handles a string literal as the columns param """
+        def test_impl():
             df = pd.DataFrame({
                 'A': [1.0, 2.0, np.nan, 1.0],
                 'B': [4, 5, 6, 7],
                 'C': [1.0, 2.0, np.nan, 1.0]
-            }, index=index)
+            })
             return df.drop(columns='A')
         sdc_func = self.jit(test_impl)
 
-        for index in [[1, 2, 3, 4], [.1, .2, .3, .4], ['a', 'b', 'c', 'd']]:
-            with self.subTest(index=index):
-                pd.testing.assert_frame_equal(sdc_func(index), test_impl(index))
+        pd.testing.assert_frame_equal(sdc_func(), test_impl())
 
-    def test_df_drop_tuple_column_unboxing(self):
-        def gen_test_impl(do_jit=False):
-            def test_impl(df):
-                if do_jit == True:  # noqa
-                    return df.drop(columns=('A', 'C'))
-                else:
-                    return df.drop(columns=['A', 'C'])
-
-            return test_impl
-
-        index_to_test = [[1, 2, 3, 4],
-                         [.1, .2, .3, .4],
-                         None,
-                         ['a', 'b', 'c', 'd']]
-
-        test_impl = gen_test_impl()
-        sdc_func = self.jit(gen_test_impl(do_jit=True))
-
-        for index in index_to_test:
-            with self.subTest(index=index):
-                df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]},
-                                  index=index)
-                pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
-
+    def test_df_drop_inplace_default_false(self):
+        """ Verifies df.drop doesn't change the original dataframe (default inplace=False) """
+        def test_impl(df):
+            return df, df.drop(columns='A')
+
+        sdc_func = self.jit(test_impl)
+
+        df = pd.DataFrame({
+            'A': [1.0, 2.0, np.nan, 1.0],
+            'B': [4, 5, 6, 7],
+            'C': [1.0, 2.0, np.nan, 1.0]
+        })
+
+        df_old, df_new = sdc_func(df.copy())
+        _, df_new_ref = test_impl(df.copy())
+
+        pd.testing.assert_frame_equal(df_old, df)
+        pd.testing.assert_frame_equal(df_new, df_new_ref)
+        self.assertFalse(df.equals(df_new), "Column was dropped, but the DF was not changed")
+
+    def test_df_drop_handles_index(self):
+        """ Verifies df.drop handles DataFrames with indexes of different types """
+        def test_impl(df):
+            return df.drop(columns=['A', 'C'])
+
+        sdc_func = self.jit(test_impl)
+        indexes_to_test = [
+            None,
+            pd.RangeIndex(4, 0, -1),
+            [1, 2, 3, 4],
+            [.1, .2, .3, .4],
+            ['a', 'b', 'c', 'd']
+        ]
+
+        for index in indexes_to_test:
+            with self.subTest(index=index):
+                df = pd.DataFrame({
+                    'A': [1.0, 2.0, np.nan, 1.0],
+                    'B': [4, 5, 6, 7],
+                    'C': [1.0, 2.0, np.nan, 1.0]
+                }, index=index)
+                pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
 
-    def test_df_drop_tuple_column(self):
-        def gen_test_impl(do_jit=False):
-            def test_impl(index):
-                df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]},
-                                  index=index)
-                if do_jit == True:  # noqa
-                    return df.drop(columns=('A', 'C'))
-                else:
-                    return df.drop(columns=['A', 'C'])
-
-            return test_impl
-
-        index_to_test = [[1, 2, 3, 4],
-                         [.1, .2, .3, .4],
-                         ['a', 'b', 'c', 'd']]
-
-        test_impl = gen_test_impl()
-        sdc_func = self.jit(gen_test_impl(do_jit=True))
-
-        for index in index_to_test:
-            with self.subTest(index=index):
-                pd.testing.assert_frame_equal(sdc_func(index), test_impl(index))
-
+    def test_df_drop_columns_list(self):
+        """ Verifies df.drop handles columns given as a list of const column names """
+        def test_impl(df):
+            return df.drop(columns=['D', 'B'])
+        sdc_func = self.jit(test_impl)
+
+        n_cols = 5
+        col_names = ['A', 'B', 'C', 'D', 'E']
+        columns_data = {
+            'int': [4, 5, 6, 7],
+            'float': [1.0, 2.0, np.nan, 1.0],
+            'str': ['a', 'b', 'c', 'd']
+        }
+
+        for scheme in combinations_with_replacement(columns_data.keys(), n_cols):
+            with self.subTest(col_types=scheme):
+                df_data = [columns_data[scheme[i]] for i in range(n_cols)]
+                df = pd.DataFrame(dict(zip(col_names, df_data)))
+                pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
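For reference, the scheme enumeration used by `test_df_drop_columns_list` above is plain `itertools`; with 3 dtype kinds over 5 column slots it yields 21 type combinations:

```python
from itertools import combinations_with_replacement

# order-insensitive selection with repetition: C(3 + 5 - 1, 5) = 21 schemes
schemes = list(combinations_with_replacement(['int', 'float', 'str'], 5))

assert len(schemes) == 21
assert schemes[0] == ('int',) * 5
assert schemes[-1] == ('str',) * 5
```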
+
+    def test_df_drop_columns_list_single_column(self):
+        """ Verifies df.drop handles a list with a single column """
+        def test_impl(df):
+            return df.drop(columns=['A'])
+        sdc_func = self.jit(test_impl)
+
+        df = pd.DataFrame({
+            'A': [1.0, 2.0, np.nan, 1.0],
+            'B': [4, 5, 6, 7],
+            'C': [1.0, 2.0, np.nan, 1.0],
+        })
+
+        pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
+
+    def test_df_drop_columns_list_repeat(self):
+        """ Verifies multiple invocations with different dropped column lists do not interfere """
+        def test_impl(df):
+            res = df
+            res = res.drop(columns=['A', 'E'])
+            res = res.drop(columns=['B', 'C'])
+            return res
+        sdc_func = self.jit(test_impl)
+
+        df = pd.DataFrame({
+            'A': [1.0, 2.0, np.nan, 1.0],
+            'B': [4, 5, 6, 7],
+            'C': [1.0, 2.0, np.nan, 1.0],
+            'D': ['a', 'b', '', 'c'],
+            'E': [8, 9, 10, 11],
+        })
+
+        pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
+
+    @unittest.skip("BUG: empty df cannot be unboxed")
+    def test_df_unbox_empty_df(self):
+        def test_impl(index):
+            df = pd.DataFrame({}, index=index)
+            return df
+
+        sdc_func = self.jit(test_impl)
+        for index in [
+            [1, 2, 3, 4],
+            [.1, .2, .3, .4],
+            ['a', 'b', 'c', 'd']
+        ]:
+            with self.subTest(index=index):
+                result = sdc_func(index)
+                result_ref = test_impl(index)
+                pd.testing.assert_frame_equal(result, result_ref)
 
-    @unittest.skip("ValueError when return empty dataframe")
-    def test_df_drop_tuple_columns_all(self):
-        def gen_test_impl(do_jit=False):
-            def test_impl(df):
-                if do_jit == True:  # noqa
-                    return df.drop(columns=('A', 'B', 'C'))
-                else:
-                    return df.drop(columns=['A', 'B', 'C'])
-
-            return test_impl
+    @unittest.skip("BUG: empty df cannot be unboxed")
+    def test_df_drop_all_columns(self):
+        def test_impl(df):
+            return df.drop(columns=['A', 'B', 'C'])
+
+        sdc_func = self.jit(test_impl)
 
         index_to_test = [[1, 2, 3, 4],
                          [.1, .2, .3, .4],
                          None,
                          ['a', 'b', 'c', 'd']]
 
-        test_impl = gen_test_impl()
-        sdc_func = self.jit(gen_test_impl(do_jit=True))
-
         for index in index_to_test:
             with self.subTest(index=index):
                 df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]},
@@ -1916,7 +1982,8 @@ def test_impl(df):
                 pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
 
-    def test_df_drop_by_column_errors_ignore(self):
+    def test_df_drop_errors_ignore(self):
+        """ Verifies df.drop handles the errors argument """
         def test_impl(df):
             return df.drop(columns='M', errors='ignore')
 
@@ -1924,18 +1991,37 @@ def test_impl(df):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
 
-    @skip_numba_jit
-    def test_df_drop_inplace2(self):
-        # test droping after setting the column
-        def test_impl(df):
-            df2 = df[['A', 'B']]
-            df2['D'] = np.ones(3)
-            df2.drop(columns=['D'], inplace=True)
-            return df2
-
-        df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
-        hpat_func = self.jit(test_impl)
-        pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
+    def test_df_drop_columns_non_literals(self):
+        """ Verifies that SDC supports only a list of literals as dropped column names """
+        def test_impl(df):
+            drop_cols = [c for c in df.columns if c != 'B']
+            return df.drop(columns=drop_cols)
+
+        sdc_func = self.jit(test_impl)
+
+        df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]})
+        with self.assertRaises(TypingError) as raises:
+            sdc_func(df)
+        msg = 'Unsupported use of parameter columns: expected list of constant strings.'
+        self.assertIn(msg, str(raises.exception))
+
+    def test_df_drop_columns_list_mutation_unsupported(self):
+        """ Verifies SDCLimitation is raised if a non-const list is used as columns """
+        def test_impl(df):
+            drop_cols = ['A', 'C']
+            drop_cols.pop(-1)
+            return df.drop(columns=drop_cols)
+        sdc_func = self.jit(test_impl)
+
+        df = pd.DataFrame({
+            'A': [1.0, 2.0, np.nan, 1.0],
+            'B': [4, 5, 6, 7],
+            'C': [1.0, 2.0, np.nan, 1.0],
+        })
+
+        with self.assertRaises(SDCLimitation) as raises:
+            sdc_func(df)
+        msg = 'Unsupported use of parameter columns: non-const list was used.'
+        self.assertIn(msg, str(raises.exception))
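The two negative tests above draw the supported/unsupported line; a positive counterpart (a sketch assuming SDC is installed, which registers DataFrame support under plain `njit` as in the repo's examples) drops columns with a compile-time constant list:

```python
import numpy as np
import pandas as pd
from numba import njit


@njit
def drop_a_and_c(df):
    # a list of string constants is resolvable at compile time, so this works;
    # building the list from df.columns at runtime would raise a TypingError
    return df.drop(columns=['A', 'C'])


df = pd.DataFrame({'A': [1.0, 2.0], 'B': [3, 4], 'C': [5.0, 6.0]})
print(drop_a_and_c(df).columns)  # only 'B' remains
```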
 
     @skip_numba_jit
     def test_df_drop_inplace1(self):
@@ -1948,6 +2034,19 @@ def test_impl(df):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df2))
 
+    @skip_numba_jit
+    def test_df_drop_inplace2(self):
+        # test dropping after setting the column
+        def test_impl(df):
+            df2 = df[['A', 'B']]
+            df2['D'] = np.ones(3)
+            df2.drop(columns=['D'], inplace=True)
+            return df2
+
+        df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
+        hpat_func = self.jit(test_impl)
+        pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
+
     def _test_df_getitem_str_literal_idx(self, df):
         def test_impl(df):
             return df['A']
@@ -2557,7 +2656,6 @@ def test_impl():
         hpat_func = self.jit(test_impl)
         pd.testing.assert_series_equal(hpat_func(), test_impl(), check_names=False)
 
-    @unittest.skip("Implement getting columns attribute")
     def test_dataframe_columns_attribute(self):
         def test_impl():
             df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
@@ -2566,7 +2664,6 @@ def test_impl():
         hpat_func = self.jit(test_impl)
         np.testing.assert_array_equal(hpat_func(), test_impl())
 
-    @unittest.skip("Implement getting columns attribute")
     def test_dataframe_columns_iterator(self):
         def test_impl():
             df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
diff --git a/sdc/tests/test_io.py b/sdc/tests/test_io.py
index 768f2827e..55fead5ba 100644
--- a/sdc/tests/test_io.py
+++ b/sdc/tests/test_io.py
@@ -663,6 +663,8 @@ def test_impl():
 
         return test_impl
 
+    @skip_numba_jit("Lowering error while returning df from objmode:\n"
+                    "Can only insert i8* at [4] in {i8*, i8*, i64, i64, i8*, [1 x i64], [1 x i64]}: got i64*")
     def test_csv_cat1(self):
         test_impl = self.pd_csv_cat1()
         hpat_func = self.jit(test_impl)
@@ -682,6 +684,8 @@ def test_impl():
 
         return test_impl
 
+    @skip_numba_jit("Lowering error while returning df from objmode:\n"
+                    "Can only insert i8* at [4] in {i8*, i8*, i64, i64, i8*, [1 x i64], [1 x i64]}: got i64*")
     def test_csv_cat2(self):
         test_impl = self.pd_csv_cat2()
         hpat_func = self.jit(test_impl)
diff --git a/sdc/utilities/utils.py b/sdc/utilities/utils.py
index c17285a58..bdce82b0a 100644
--- a/sdc/utilities/utils.py
+++ b/sdc/utilities/utils.py
@@ -51,6 +51,8 @@
 from numba.extending import overload, overload_method, overload_attribute
 from numba.extending import register_jitable, register_model
 from numba.core.datamodel.registry import register_default
+from functools import wraps
+from itertools import filterfalse, chain
 
 
 # int values for types to pass to C code
@@ -686,3 +688,59 @@ def sdc_overload_attribute(typ, name, jit_options={}, parallel=None, strict=True
     return overload_attribute(
         typ, name, jit_options=jit_options, strict=strict, inline=inline, prefer_literal=prefer_literal
     )
+
+
+def print_compile_times(disp, level, func_names=None):
+
+    def print_times(cres, args):
+        print(f'Function: {cres.fndesc.unique_name}')
+        pad = ' ' * 2
+        if level:
+            print(f'{pad * 1}Args: {args}')
+        times = cres.metadata['pipeline_times']
+        for pipeline, pass_times in times.items():
+            print(f'{pad * 1}Pipeline: {pipeline}')
+            if level:
+                for name, timings in pass_times.items():
+                    print(f'{pad * 2}{name:50s}{timings.run:.13f}')
+
+            pipeline_total = sum(t.init + t.run + t.finalize for t in pass_times.values())
+            print(f'{pad * 1}Time: {pipeline_total}\n')
+
+    # print times for the compiled function indicated by disp
+    for args, cres in disp.overloads.items():
+        print_times(cres, args)
+
+    def has_no_cache(ovld):
+        return not (getattr(ovld, '_impl_cache', False) and ovld._impl_cache)
+
+    known_funcs = disp.typingctx._functions
+    all_templs = chain.from_iterable(known_funcs.values())
+    compiled_templs = filterfalse(has_no_cache, all_templs)
+
+    # keep only templates whose function names are in the func_names list
+    if func_names:
+        compiled_templs = filterfalse(
+            lambda x: not any(f in str(x) for f in func_names),
+            compiled_templs
+        )
+
+    dispatchers_list = []
+    for template in compiled_templs:
+        tmpl_cached_impls = template._impl_cache.values()
+        dispatchers_list.extend(tmpl_cached_impls)
+
+    for impl_cache in set(dispatchers_list):
+        # impl_cache entries are usually tuples of format (dispatcher, args);
+        # if not, just skip these entries
+        if not (isinstance(impl_cache, tuple)
+                and len(impl_cache) == 2
+                and isinstance(impl_cache[0], type(disp))):
+            continue
+
+        fndisp, args = impl_cache
+        if not getattr(fndisp, 'overloads', False):
+            continue
+
+        cres, = list(fndisp.overloads.values())
+        print_times(cres, args)
diff --git a/setup.py b/setup.py
index fd128f02b..d4c8f1fab 100644
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,7 @@
 import platform
 import os
 import sys
+import numba
 
 from docs.source.buildscripts.sdc_build_doc import SDCBuildDoc
 
@@ -185,6 +186,7 @@ def check_file_at_path(path2file):
 )
 
 str_libs = np_compile_args['libraries']
+numba_include_path = numba.extending.include_path()
 
 ext_str = Extension(name="sdc.hstr_ext",
                     sources=["sdc/_str_ext.cpp"],
@@ -192,7 +194,7 @@ def check_file_at_path(path2file):
                     define_macros=np_compile_args['define_macros'],
                     extra_compile_args=eca,
                     extra_link_args=ela,
-                    include_dirs=np_compile_args['include_dirs'] + ind,
+                    include_dirs=np_compile_args['include_dirs'] + ind + [numba_include_path],
                     library_dirs=np_compile_args['library_dirs'] + lid,
                     )
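The `setup.py` change relies on `numba.extending.include_path()`, which points native extensions at Numba's bundled C headers so they can declare against the NRT API instead of carrying private copies; a quick check of what the build picks up:

```python
import os

import numba

# the include directory appended to ext_str above; it holds numba's public
# C headers (location varies by install)
inc = numba.extending.include_path()
print(inc)
print(os.path.isdir(inc))  # True for a working numba install
```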