diff --git a/sdc/__init__.py b/sdc/__init__.py
index 0e11d5bea..c7ac6dab5 100644
--- a/sdc/__init__.py
+++ b/sdc/__init__.py
@@ -61,9 +61,10 @@
     """
 
     # sdc.config.numba_compiler_define_nopython_pipeline_orig = \
-        # numba.compiler.DefaultPassBuilder.define_nopython_pipeline
+    # numba.compiler.DefaultPassBuilder.define_nopython_pipeline
     # numba.compiler.DefaultPassBuilder.define_nopython_pipeline = \
-        # sdc.datatypes.hpat_pandas_dataframe_pass.sdc_nopython_pipeline_lite_register
+    # sdc.datatypes.hpat_pandas_dataframe_pass.sdc_nopython_pipeline_lite_register
+
 
 def _init_extension():
     '''Register Pandas classes and functions with Numba.
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
index b474c8824..2641df2c6 100644
--- a/sdc/datatypes/hpat_pandas_dataframe_functions.py
+++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -31,65 +31,155 @@
 
 import operator
 import pandas
+import copy
+import numpy
+
+import sdc
 
 from numba import types
 from numba.extending import (overload, overload_method, overload_attribute)
+from sdc.hiframes.pd_dataframe_ext import DataFrameType
 from numba.errors import TypingError
+import sdc.datatypes.hpat_pandas_dataframe_types
+
+from sdc.datatypes.hpat_pandas_series_functions import TypeChecker
+
+
+# Example func_text for func_name='count' columns=('A', 'B'):
+#
+#     def _df_count_impl(df, axis=0, level=None, numeric_only=False):
+#         series_0 = init_series(get_dataframe_data(df, 0))
+#         result_0 = series_0.count(level=level)
+#         series_1 = init_series(get_dataframe_data(df, 1))
+#         result_1 = series_1.count(level=level)
+#         return pandas.Series([result_0, result_1], ['A', 'B'])
+
+
+def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns):
+    """Build the source text of a column-wise DataFrame reduction.
+
+    The generated function wraps every column of the DataFrame in a Series,
+    calls ``func_name`` on it and returns the per-column results as a
+    ``pandas.Series`` labelled with the column names.
+
+    Parameters
+    ----------
+    func_name: :obj:`str`
+        name of the Series method to call on each column
+    func_params: :obj:`list` of :obj:`str`
+        parameters of the generated function as source text,
+        the first one names the DataFrame argument
+    series_params: :obj:`str`
+        argument list passed to every Series method call, as source text
+    columns: sequence
+        column names of the DataFrame
+
+    Returns
+    -------
+    :obj:`tuple`
+        source text of the generated function and the globals it must be executed with
+    """
+    result_name_list = []
+    joined = ', '.join(func_params)
+    func_lines = [f'def _df_{func_name}_impl({joined}):']
+    # Local names are derived from the column position, not from the column name,
+    # so names that are not valid Python identifiers cannot break the generated code.
+    for i, _ in enumerate(columns):
+        result_i = f'result_{i}'
+        func_lines += [f'    series_{i} = init_series(get_dataframe_data({func_params[0]}, {i}))',
+                       f'    {result_i} = series_{i}.{func_name}({series_params})']
+        result_name_list.append(result_i)
+    all_results = ', '.join(result_name_list)
+    # repr() quotes and escapes each label, so quotes or backslashes in a column name stay inside the literal.
+    all_columns = ', '.join(repr(str(c)) for c in columns)
+
+    func_lines += [f'    return pandas.Series([{all_results}], [{all_columns}])']
+    func_text = '\n'.join(func_lines)
+
+    global_vars = {'pandas': pandas, 'np': numpy,
+                   'init_series': sdc.hiframes.api.init_series,
+                   'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data}
+
+    return func_text, global_vars
 
-from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType
-from sdc.utils import sdc_overload_method
 
+def sdc_pandas_dataframe_reduce_columns(df, func_name, params, ser_params):
+    """Create the implementation of a column-wise DataFrame reduction.
+
+    Parameters
+    ----------
+    df:
+        DataFrame type object handled by the overload, only ``df.columns`` is read
+    func_name: :obj:`str`
+        name of the Series method applied to every column
+    params: :obj:`dict`
+        parameter names of the generated function mapped to their default values;
+        the values are inserted into the source text with ``str()``
+    ser_params: :obj:`dict`
+        keyword arguments of the Series method mapped to the expressions passed for them
+
+    Returns
+    -------
+    function
+        the generated ``_df_{func_name}_impl`` function
+    """
+    all_params = ['df'] + [f'{key}={value}' for key, value in params.items()]
+    s_par = ', '.join(f'{key}={value}' for key, value in ser_params.items())
+
+    df_func_name = f'_df_{func_name}_impl'
+
+    func_text, global_vars = _dataframe_reduce_columns_codegen(func_name, all_params, s_par, df.columns)
+
+    # The source text comes from the code generator above, exec() turns it into a
+    # function object that is looked up by its generated name.
+    loc_vars = {}
+    exec(func_text, global_vars, loc_vars)
+
+    return loc_vars[df_func_name]
+
 
-@sdc_overload_method(DataFrameType, 'count')
-def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False):
+@overload_method(DataFrameType, 'count')
+def count_overload(df, axis=0, level=None, numeric_only=False):
     """
     Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation.
 
     .. only:: developer
 
-      Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count
+        Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count
+        Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count1
 
     Parameters
     -----------
-    self: :class:`pandas.DataFrame`
-      input arg
+    df: :class:`pandas.DataFrame`
+        input arg
     axis:
-      *unsupported*
+        *unsupported*
     level:
-      *unsupported*
+        *unsupported*
     numeric_only:
-      *unsupported*
+        *unsupported*
 
     Returns
     -------
     :obj:`pandas.Series` or `pandas.DataFrame`
-      returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame.
+        for each column/row the number of non-NA/null entries. If level is specified returns a DataFrame.
     """
 
-    _func_name = 'Method pandas.dataframe.count().'
+    name = 'count'
 
-    if not isinstance(self, DataFrameType):
-        raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self))
+    ty_checker = TypeChecker('Method {}().'.format(name))
+    ty_checker.check(df, DataFrameType)
 
     if not (isinstance(axis, types.Omitted) or axis == 0):
-        raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis))
+        ty_checker.raise_exc(axis, 'unsupported', 'axis')
 
     if not (isinstance(level, types.Omitted) or level is None):
-        raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis))
+        ty_checker.raise_exc(level, 'unsupported', 'level')
 
     if not (isinstance(numeric_only, types.Omitted) or numeric_only is False):
-        raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, axis))
-
-    def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False):
-        result_data = []
-        result_index = []
-
-        for dataframe_item in self._data:
-            item_count = dataframe_item.count()
-            item_name = dataframe_item._name
-            result_data.append(item_count)
-            result_index.append(item_name)
+        ty_checker.raise_exc(numeric_only, 'unsupported', 'numeric_only')
 
-        return pandas.Series(data=result_data, index=result_index)
+    params = {'axis': 0, 'level': None, 'numeric_only': False}
+    ser_par = {'level': 'level'}
 
-    return sdc_pandas_dataframe_count_impl
+    return sdc_pandas_dataframe_reduce_columns(df, name, params, ser_par)
diff --git a/sdc/datatypes/hpat_pandas_dataframe_types.py b/sdc/datatypes/hpat_pandas_dataframe_types.py
index ff8a16766..03f79934a 100644
--- a/sdc/datatypes/hpat_pandas_dataframe_types.py
+++ b/sdc/datatypes/hpat_pandas_dataframe_types.py
@@ -110,7 +110,7 @@ def iterator_type(self):
         return DataFrameTypeIterator(self)
 
 
-if config_pipeline_hpat_default is 0:
+if not config_pipeline_hpat_default:
     @register_model(DataFrameType)
     class DataFrameTypeModel(StructModel):
         """
@@ -163,7 +163,7 @@ def _hpat_pandas_dataframe_init_codegen(context, builder, signature, args):
     return sig, _hpat_pandas_dataframe_init_codegen
 
 
-if config_pipeline_hpat_default is 0:
+if not config_pipeline_hpat_default:
     @overload(pandas.DataFrame)
     def hpat_pandas_dataframe(data=None, index=None, columns=None, dtype=None, copy=False):
         """
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
index 27bf593b0..3d44c7b18 100644
--- a/sdc/hiframes/pd_dataframe_ext.py
+++ b/sdc/hiframes/pd_dataframe_ext.py
@@ -1628,3 +1628,9 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None,
             date_format, doublequote, escapechar, decimal)
 
     return _impl
+
+
+# Imported for its side effect of registering the DataFrame overloads. Kept at the end of
+# the module because hpat_pandas_dataframe_functions imports DataFrameType from this module.
+if not sdc.config.config_pipeline_hpat_default:
+    from sdc.datatypes.hpat_pandas_dataframe_functions import *
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
index 99a2e6807..da3906531 100644
--- a/sdc/tests/test_dataframe.py
+++ b/sdc/tests/test_dataframe.py
@@ -929,7 +929,6 @@ def test_impl(n):
         n = 11
         pd.testing.assert_series_equal(hpat_func(n), test_impl(n))
 
-    @skip_numba_jit
     def test_count(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)})
@@ -939,7 +938,6 @@ def test_impl(n):
         n = 11
         pd.testing.assert_series_equal(hpat_func(n), test_impl(n))
 
-    @skip_numba_jit
     def test_count1(self):
         # TODO: non-numeric columns should be ignored automatically
         def test_impl(n):