From bf8c0238f400c989fc1e9749c0ba975247a14f68 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Fri, 29 Nov 2019 12:49:58 +0300 Subject: [PATCH 1/8] Init dataframe --- sdc/config.py | 5 + .../hpat_pandas_dataframe_functions.py | 334 +++++++++++++++--- sdc/hiframes/pd_dataframe_ext.py | 2 + sdc/tests/test_dataframe.py | 22 ++ 4 files changed, 322 insertions(+), 41 deletions(-) diff --git a/sdc/config.py b/sdc/config.py index 13c4c07b3..321ca5c4b 100644 --- a/sdc/config.py +++ b/sdc/config.py @@ -67,3 +67,8 @@ ''' Default value for a pointer intended to use as Numba.DefaultPassBuilder.define_nopython_pipeline() in overloaded function ''' + +use_default_dataframe = distutils_util.strtobool(os.getenv('SDC_CONFIG_USE_DEFAULT_DATAFRAME', 'True')) +''' +Default value used to select compiler pipeline in a function decorator +''' \ No newline at end of file diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index b17ceb47c..e3480543e 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -31,64 +31,316 @@ import operator import pandas +import numpy + +import sdc from numba import types from numba.extending import (overload, overload_method, overload_attribute) +from sdc.hiframes.pd_dataframe_ext import DataFrameType from numba.errors import TypingError -from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType +if not sdc.config.use_default_dataframe: + from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType + + + @overload_method(DataFrameType, 'count') + def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + axis: + *unsupported* + level: + *unsupported* + numeric_only: + *unsupported* + Returns + ------- + :obj:`pandas.Series` or `pandas.DataFrame` + returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame. + """ + + _func_name = 'Method pandas.dataframe.count().' + + if not isinstance(self, DataFrameType): + raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self)) + + if not (isinstance(axis, types.Omitted) or axis == 0): + raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis)) + + if not (isinstance(level, types.Omitted) or level is None): + raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis)) + + if not (isinstance(numeric_only, types.Omitted) or numeric_only is False): + raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, axis)) + + def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False): + result_data = [] + result_index = [] + + for dataframe_item in self._data: + item_count = dataframe_item.count() + item_name = dataframe_item._name + result_data.append(item_count) + result_index.append(item_name) + + return pandas.Series(data=result_data, index=result_index) + + return sdc_pandas_dataframe_count_impl + +else: + def reduce(df, name): + saved_columns = df.columns + n_cols = len(saved_columns) + data_args = tuple('data{}'.format(i) for i in range(n_cols)) + func_text = "def _reduce_impl(df, axis=None, skipna=None, level=None, numeric_only=None):\n" + for i, d in enumerate(data_args): + func_text += " {} = hpat.hiframes.api.init_series(hpat.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))\n".format( + d + '_S', i) + func_text += " {} = {}.{}()\n".format(d + '_O', d + '_S', name) + func_text += " data = np.array(({},))\n".format( + ", ".join(d + '_O' for d in data_args)) + func_text += " index = hpat.str_arr_ext.StringArray(({},))\n".format( + ", ".join("'{}'".format(c) for c in saved_columns)) + func_text += " return hpat.hiframes.api.init_series(data, index)\n" + loc_vars = {} + + print() + print(func_text) + print() + + exec(func_text, {'hpat': sdc, 'np': numpy}, loc_vars) + _reduce_impl = loc_vars['_reduce_impl'] + + return _reduce_impl + + + @overload_method(DataFrameType, 'median') + def median_overload(df, axis=None, skipna=None, level=None, numeric_only=None): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.median` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_median1 + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + axis: + *unsupported* + skipna: + *unsupported* + level: + *unsupported* + numeric_only: + *unsupported* + Returns + ------- + :obj:`pandas.Series` or `pandas.DataFrame` + return the median of the values for the requested axis. + """ + + name = 'median' + + if not isinstance(df, DataFrameType): + raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) + + if not (isinstance(axis, types.Omitted) or axis is None): + raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) + + if not (isinstance(skipna, types.Omitted) or skipna is None): + raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) + + if not (isinstance(level, types.Omitted) or level is None): + raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) + + if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): + raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) + + return reduce(df, name) + + + @overload_method(DataFrameType, 'mean') + def mean_overload(df, axis=None, skipna=None, level=None, numeric_only=None): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.mean` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_mean1 + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + axis: + *unsupported* + skipna: + *unsupported* + level: + *unsupported* + numeric_only: + *unsupported* + Returns + ------- + :obj:`pandas.Series` or `pandas.DataFrame` + return the mean of the values for the requested axis. + """ + + name = 'mean' + + if not isinstance(df, DataFrameType): + raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) + + if not (isinstance(axis, types.Omitted) or axis is None): + raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) + + if not (isinstance(skipna, types.Omitted) or skipna is None): + raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) + + if not (isinstance(level, types.Omitted) or level is None): + raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) + + if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): + raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) + + return reduce(df, name) + + + @overload_method(DataFrameType, 'max') + def max_overload(df, axis=None, skipna=None, level=None, numeric_only=None): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.max` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_max1 + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + axis: + *unsupported* + skipna: + *unsupported* + level: + *unsupported* + numeric_only: + *unsupported* + Returns + ------- + :obj:`pandas.Series` or `pandas.DataFrame` + return the maximum of the values for the requested axis. + """ + + name = 'max' + + if not isinstance(df, DataFrameType): + raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) + + if not (isinstance(axis, types.Omitted) or axis is None): + raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) + + if not (isinstance(skipna, types.Omitted) or skipna is None): + raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) + + if not (isinstance(level, types.Omitted) or level is None): + raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) + + if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): + raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) + + return reduce(df, name) + + + @overload_method(DataFrameType, 'min') + def min_overload(df, axis=None, skipna=None, level=None, numeric_only=None): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.min` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_min1 + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + axis: + *unsupported* + skipna: + *unsupported* + level: + *unsupported* + numeric_only: + *unsupported* + Returns + ------- + :obj:`pandas.Series` or `pandas.DataFrame` + returns: the minimum of the values for the requested axis. + """ + name = 'min' + + if not isinstance(df, DataFrameType): + raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) -@overload_method(DataFrameType, 'count') -def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): - """ - Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation. + if not (isinstance(axis, types.Omitted) or axis is None): + raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) - .. only:: developer + if not (isinstance(skipna, types.Omitted) or skipna is None): + raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count + if not (isinstance(level, types.Omitted) or level is None): + raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* + if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): + raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) + + return reduce(df, name) - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame. - """ - _func_name = 'Method pandas.dataframe.count().' + @overload_method(DataFrameType, 'sum') + def sum_overload(df, axis=None, skipna=None, level=None, numeric_only=None, min_count=0): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.sum` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_sum1 + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + axis: + *unsupported* + skipna: + *unsupported* + level: + *unsupported* + numeric_only: + *unsupported* + min_count: + *unsupported* + Returns + ------- + :obj:`pandas.Series` or `pandas.DataFrame` + return the sum of the values for the requested axis. + """ - if not isinstance(self, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self)) + name = 'sum' - if not (isinstance(axis, types.Omitted) or axis == 0): - raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis)) + if not isinstance(df, DataFrameType): + raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) - if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis)) + if not (isinstance(axis, types.Omitted) or axis is None): + raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) - if not (isinstance(numeric_only, types.Omitted) or numeric_only is False): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, axis)) + if not (isinstance(skipna, types.Omitted) or skipna is None): + raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) - def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False): - result_data = [] - result_index = [] + if not (isinstance(level, types.Omitted) or level is None): + raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) - for dataframe_item in self._data: - item_count = dataframe_item.count() - item_name = dataframe_item._name - result_data.append(item_count) - result_index.append(item_name) + if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): + raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) - return pandas.Series(data=result_data, index=result_index) + if not (isinstance(min_count, types.Omitted) or min_count == 0): + raise TypingError("{} 'min_count' unsupported. Given: {}".format(name, min_count)) - return sdc_pandas_dataframe_count_impl + return reduce(df, name) diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index 61e988095..1291173d7 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -1636,3 +1636,5 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None, date_format, doublequote, escapechar, decimal) return _impl + +from sdc.datatypes.hpat_pandas_dataframe_functions import * \ No newline at end of file diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index a3bc3c51a..7b7c75690 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1151,5 +1151,27 @@ def test_impl(n): pd.testing.assert_series_equal(hpat_func(n), test_impl(n)) + def test_dataframe_min(self): + def test_impl(df): + return df.min() + sdc_func = sdc.jit(test_impl) + df = pd.DataFrame({"A": [12, 4, 5, 44, 1], + "B": [5, 2, 54, 3, 2], + "C": [20, 16, 7, 3, 8], + "D": [14, 3, 17, 2, 6]}) + print(sdc_func(df)) + pd.testing.assert_series_equal(sdc_func(df), test_impl(df)) + + def test_dataframe_min2(self): + def test_impl(df): + return df.min() + sdc_func = sdc.jit(test_impl) + df = pd.DataFrame({"A": [12, 4, 5, None, 1], + "B": [5, 2, 54, 3, None], + "C": [20, 16, 7, 3, 8], + "D": [14, 3, None, 2, 6]}) + print(sdc_func(df)) + pd.testing.assert_series_equal(sdc_func(df), test_impl(df)) + if __name__ == "__main__": unittest.main() From c966f8e428bfac498a839f91077fbe9b07d22a2f Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Tue, 3 Dec 2019 15:39:58 +0300 Subject: [PATCH 2/8] Implement Dataframe.head() --- .../hpat_pandas_dataframe_functions.py | 274 ++++-------------- sdc/tests/test_dataframe.py | 35 +-- sdc/tests/test_utils.py | 8 + 3 files changed, 74 insertions(+), 243 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index e3480543e..0bdc5830a 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -34,6 +34,7 @@ import numpy import sdc +from sdc.datatypes.hpat_pandas_series_functions import TypeChecker from numba import types from numba.extending import (overload, overload_method, overload_attribute) @@ -95,252 +96,83 @@ def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False return sdc_pandas_dataframe_count_impl else: - def reduce(df, name): + def sdc_pandas_dataframe_reduce_columns(df, name, param): saved_columns = df.columns n_cols = len(saved_columns) data_args = tuple('data{}'.format(i) for i in range(n_cols)) - func_text = "def _reduce_impl(df, axis=None, skipna=None, level=None, numeric_only=None):\n" + help_param = ', {}={}):' + func_text = 'def _reduce_impl(df):' + all_params = ['df'] + for name, value in param: + all_params.append('{}={}'.format(name, value)) + func_definition = 'def _reduce_impl({}):'.format(', '.join(all_params)) + func_lines = [func_definition] for i, d in enumerate(data_args): - func_text += " {} = hpat.hiframes.api.init_series(hpat.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))\n".format( - d + '_S', i) - func_text += " {} = {}.{}()\n".format(d + '_O', d + '_S', name) - func_text += " data = np.array(({},))\n".format( - ", ".join(d + '_O' for d in data_args)) - func_text += " index = hpat.str_arr_ext.StringArray(({},))\n".format( - ", ".join("'{}'".format(c) for c in saved_columns)) - func_text += " return hpat.hiframes.api.init_series(data, index)\n" + line = ' {} = hpat.hiframes.api.init_series(hpat.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' + func_lines.append(line.format(d + '_S', i)) + func_lines.append(' {} = {}.{}()'.format(d + '_O', d + '_S', name)) + func_lines.append(' data = np.array(({},))'.format( + ", ".join(d + '_O' for d in data_args))) + func_lines.append(' index = hpat.str_arr_ext.StringArray(({},))'.format( + ', '.join('"{}"'.format(c) for c in saved_columns))) + func_lines.append(' return hpat.hiframes.api.init_series(data, index)') loc_vars = {} - - print() - print(func_text) - print() + func_text = '\n'.join(func_lines) exec(func_text, {'hpat': sdc, 'np': numpy}, loc_vars) _reduce_impl = loc_vars['_reduce_impl'] return _reduce_impl - - @overload_method(DataFrameType, 'median') - def median_overload(df, axis=None, skipna=None, level=None, numeric_only=None): - """ - Pandas DataFrame method :meth:`pandas.DataFrame.median` implementation. - .. only:: developer - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_median1 - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - skipna: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - return the median of the values for the requested axis. - """ - - name = 'median' - - if not isinstance(df, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) - - if not (isinstance(axis, types.Omitted) or axis is None): - raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) - - if not (isinstance(skipna, types.Omitted) or skipna is None): - raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) - - if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) - - if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) - - return reduce(df, name) - - - @overload_method(DataFrameType, 'mean') - def mean_overload(df, axis=None, skipna=None, level=None, numeric_only=None): - """ - Pandas DataFrame method :meth:`pandas.DataFrame.mean` implementation. - .. only:: developer - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_mean1 - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - skipna: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - return the mean of the values for the requested axis. - """ - - name = 'mean' - - if not isinstance(df, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) - - if not (isinstance(axis, types.Omitted) or axis is None): - raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) - - if not (isinstance(skipna, types.Omitted) or skipna is None): - raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) - - if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) - - if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) - - return reduce(df, name) - - - @overload_method(DataFrameType, 'max') - def max_overload(df, axis=None, skipna=None, level=None, numeric_only=None): - """ - Pandas DataFrame method :meth:`pandas.DataFrame.max` implementation. - .. only:: developer - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_max1 - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - skipna: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - return the maximum of the values for the requested axis. - """ - - name = 'max' - - if not isinstance(df, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) + + def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0): + ty_checker = TypeChecker('Method {}().'.format(name)) + ty_checker.check(df, DataFrameType) if not (isinstance(axis, types.Omitted) or axis is None): - raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) + ty_checker.raise_exc(axis, 'unsupported', 'axis') - if not (isinstance(skipna, types.Omitted) or skipna is None): - raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) + if not (isinstance(skipna, (types.Omitted, types.NoneType, types.Boolean)) or skipna is None): + ty_checker.raise_exc(skipna, 'bool', 'skipna') if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) + ty_checker.raise_exc(level, 'unsupported', 'level') if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) + ty_checker.raise_exc(numeric_only, 'unsupported', 'numeric_only') - return reduce(df, name) + if not (isinstance(ddof, types.Omitted) or ddof == 1): + ty_checker.raise_exc(ddof, 'unsupported', 'ddof') - - @overload_method(DataFrameType, 'min') - def min_overload(df, axis=None, skipna=None, level=None, numeric_only=None): - """ - Pandas DataFrame method :meth:`pandas.DataFrame.min` implementation. - .. only:: developer - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_min1 - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - skipna: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - returns: the minimum of the values for the requested axis. - """ - - name = 'min' - - if not isinstance(df, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) - - if not (isinstance(axis, types.Omitted) or axis is None): - raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) - - if not (isinstance(skipna, types.Omitted) or skipna is None): - raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) - - if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) - - if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) + if not (isinstance(min_count, types.Omitted) or min_count == 0): + ty_checker.raise_exc(min_count, 'unsupported', 'min_count') - return reduce(df, name) + if not isinstance(n, (types.Integer, types.Omitted)) and n != 5: + ty_checker.raise_exc(n, 'integer', 'n') - @overload_method(DataFrameType, 'sum') - def sum_overload(df, axis=None, skipna=None, level=None, numeric_only=None, min_count=0): + @overload_method(DataFrameType, 'head') + def median_overload(df, n=5): + """ + Pandas DataFrame method :meth:`pandas.DataFrame.head` implementation. + .. only:: developer + Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_head1 + Parameters + ----------- + self: :class:`pandas.DataFrame` + input arg + n: :obj:`int`, default 5 + input arg, default 5 + Returns + ------- + :obj:`pandas.Series` + returns: The first n rows of the caller object. """ - Pandas DataFrame method :meth:`pandas.DataFrame.sum` implementation. - .. only:: developer - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_sum1 - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - skipna: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* - min_count: - *unsupported* - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - return the sum of the values for the requested axis. - """ - - name = 'sum' - - if not isinstance(df, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(name, df)) - - if not (isinstance(axis, types.Omitted) or axis is None): - raise TypingError("{} 'axis' unsupported. Given: {}".format(name, axis)) - - if not (isinstance(skipna, types.Omitted) or skipna is None): - raise TypingError("{} 'skipna' unsupported. Given: {}".format(name, skipna)) - if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(name, level)) + name = 'head' - if not (isinstance(numeric_only, types.Omitted) or numeric_only is None): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(name, numeric_only)) + check_type(name, df, n=n) - if not (isinstance(min_count, types.Omitted) or min_count == 0): - raise TypingError("{} 'min_count' unsupported. Given: {}".format(name, min_count)) + params = [('n', n)] - return reduce(df, name) + return sdc_pandas_dataframe_reduce_columns(df, name, params) diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 7b7c75690..7055bfb70 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -40,6 +40,9 @@ skip_numba_jit, TestCase) from sdc.tests.gen_test_data import ParquetGenerator +from sdc.tests.test_utils import (min_float64, max_float64, test_global_input_data_float64, + test_global_input_data_unicode_kind4, test_datatime, + min_int64, max_int64, test_global_input_data_int64) from numba.config import IS_32BITS @@ -1150,28 +1153,16 @@ def test_impl(n): hpat_func = sdc.jit(test_impl) pd.testing.assert_series_equal(hpat_func(n), test_impl(n)) - - def test_dataframe_min(self): - def test_impl(df): - return df.min() - sdc_func = sdc.jit(test_impl) - df = pd.DataFrame({"A": [12, 4, 5, 44, 1], - "B": [5, 2, 54, 3, 2], - "C": [20, 16, 7, 3, 8], - "D": [14, 3, 17, 2, 6]}) - print(sdc_func(df)) - pd.testing.assert_series_equal(sdc_func(df), test_impl(df)) - - def test_dataframe_min2(self): - def test_impl(df): - return df.min() + def test_dataframe_head1(self): + def test_impl(df, n): + return df.head(n) sdc_func = sdc.jit(test_impl) - df = pd.DataFrame({"A": [12, 4, 5, None, 1], - "B": [5, 2, 54, 3, None], - "C": [20, 16, 7, 3, 8], - "D": [14, 3, None, 2, 6]}) - print(sdc_func(df)) - pd.testing.assert_series_equal(sdc_func(df), test_impl(df)) - + df = pd.DataFrame({"FLOAT": test_global_input_data_float64[0][:5], + "DATATIME": test_datatime, + "INT": test_global_input_data_int64[:5], + "STRING": ['a', 'dd', 'c', '12', 'ddf']}) + for n in [-1, 0, 2, 5]: + pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) + if __name__ == "__main__": unittest.main() diff --git a/sdc/tests/test_utils.py b/sdc/tests/test_utils.py index 798359d86..df0ecf145 100644 --- a/sdc/tests/test_utils.py +++ b/sdc/tests/test_utils.py @@ -37,16 +37,24 @@ '¡Y tú quién te crees?', '🐍⚡', '大处 着眼,c小处着手c。大大c大处', + 'c小处着手c。', + 'te crees?' ] min_float64 = np.finfo('float64').min max_float64 = np.finfo('float64').max +min_int64 = np.iinfo(np.int64).min +max_int64 = np.iinfo(np.int64).max test_global_input_data_float64 = [ [1., np.nan, -1., 0., min_float64, max_float64, max_float64, min_float64], [np.nan, np.inf, np.inf, np.nan, np.nan, np.nan, np.NINF, np.NZERO] ] +test_datatime = np.array(['2007-07-13', '2006-01-13', '2010-08-13', + '2005-02-27', '2005-02-28'], dtype='datetime64') + +test_global_input_data_int64 = [min_int64, 0, 1, -23, max_int64, min_int64, max_int64] def count_array_REPs(): if sdc.config.config_pipeline_hpat_default: From d036cc7c731c2568ed3f943696756d1c95348afd Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Tue, 3 Dec 2019 15:57:33 +0300 Subject: [PATCH 3/8] Fix codestyle --- sdc/config.py | 2 +- sdc/datatypes/hpat_pandas_dataframe_functions.py | 10 ++++------ sdc/hiframes/pd_dataframe_ext.py | 2 +- sdc/tests/test_dataframe.py | 10 +++++----- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/sdc/config.py b/sdc/config.py index 321ca5c4b..1bccbe3bf 100644 --- a/sdc/config.py +++ b/sdc/config.py @@ -71,4 +71,4 @@ use_default_dataframe = distutils_util.strtobool(os.getenv('SDC_CONFIG_USE_DEFAULT_DATAFRAME', 'True')) ''' Default value used to select compiler pipeline in a function decorator -''' \ No newline at end of file +''' diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 0bdc5830a..1cb556c3e 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -44,7 +44,6 @@ if not sdc.config.use_default_dataframe: from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType - @overload_method(DataFrameType, 'count') def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): """ @@ -64,7 +63,8 @@ def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): Returns ------- :obj:`pandas.Series` or `pandas.DataFrame` - returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame. + returns: For each column/row the number of non-NA/null entries.\ + If level is specified returns a DataFrame. """ _func_name = 'Method pandas.dataframe.count().' @@ -105,7 +105,7 @@ def sdc_pandas_dataframe_reduce_columns(df, name, param): all_params = ['df'] for name, value in param: all_params.append('{}={}'.format(name, value)) - func_definition = 'def _reduce_impl({}):'.format(', '.join(all_params)) + func_definition = 'def _reduce_impl({}):'.format(', '.join(all_params)) func_lines = [func_definition] for i, d in enumerate(data_args): line = ' {} = hpat.hiframes.api.init_series(hpat.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' @@ -124,7 +124,6 @@ def sdc_pandas_dataframe_reduce_columns(df, name, param): return _reduce_impl - def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0): ty_checker = TypeChecker('Method {}().'.format(name)) ty_checker.check(df, DataFrameType) @@ -146,11 +145,10 @@ def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, if not (isinstance(min_count, types.Omitted) or min_count == 0): ty_checker.raise_exc(min_count, 'unsupported', 'min_count') - + if not isinstance(n, (types.Integer, types.Omitted)) and n != 5: ty_checker.raise_exc(n, 'integer', 'n') - @overload_method(DataFrameType, 'head') def median_overload(df, n=5): """ diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py index 1291173d7..3256f8c2b 100644 --- a/sdc/hiframes/pd_dataframe_ext.py +++ b/sdc/hiframes/pd_dataframe_ext.py @@ -1637,4 +1637,4 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None, return _impl -from sdc.datatypes.hpat_pandas_dataframe_functions import * \ No newline at end of file +from sdc.datatypes.hpat_pandas_dataframe_functions import * diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 96803b698..da205d27c 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -37,13 +37,13 @@ import sdc from sdc.tests.test_base import TestCase from sdc.tests.test_utils import (count_array_REPs, count_parfor_REPs, count_parfor_OneDs, - count_array_OneDs, dist_IR_contains, get_start_end, check_numba_version, - skip_numba_jit) + count_array_OneDs, dist_IR_contains, get_start_end, check_numba_version, + skip_numba_jit) from sdc.tests.gen_test_data import ParquetGenerator from sdc.tests.test_utils import (min_float64, max_float64, test_global_input_data_float64, - test_global_input_data_unicode_kind4, test_datatime, - min_int64, max_int64, test_global_input_data_int64) + test_global_input_data_unicode_kind4, test_datatime, + min_int64, max_int64, test_global_input_data_int64) from numba.config import IS_32BITS @@ -1163,6 +1163,6 @@ def test_impl(df, n): "STRING": ['a', 'dd', 'c', '12', 'ddf']}) for n in [-1, 0, 2, 5]: pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) - + if __name__ == "__main__": unittest.main() From 12d8f2c3405866ab0fc094ec22c3131649d109c0 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Wed, 11 Dec 2019 14:23:02 +0300 Subject: [PATCH 4/8] impl dataframe.head --- .../hpat_pandas_dataframe_functions.py | 99 +++++-------------- sdc/datatypes/hpat_pandas_series_functions.py | 8 +- sdc/tests/test_dataframe.py | 23 ++++- 3 files changed, 50 insertions(+), 80 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 1cb556c3e..665a5e991 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -32,6 +32,7 @@ import operator import pandas import numpy +import numba import sdc from sdc.datatypes.hpat_pandas_series_functions import TypeChecker @@ -39,95 +40,43 @@ from numba import types from numba.extending import (overload, overload_method, overload_attribute) from sdc.hiframes.pd_dataframe_ext import DataFrameType +from sdc.hiframes.pd_series_ext import SeriesType from numba.errors import TypingError if not sdc.config.use_default_dataframe: from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType - @overload_method(DataFrameType, 'count') - def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False): - """ - Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation. - .. only:: developer - Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count - Parameters - ----------- - self: :class:`pandas.DataFrame` - input arg - axis: - *unsupported* - level: - *unsupported* - numeric_only: - *unsupported* - Returns - ------- - :obj:`pandas.Series` or `pandas.DataFrame` - returns: For each column/row the number of non-NA/null entries.\ - If level is specified returns a DataFrame. - """ - - _func_name = 'Method pandas.dataframe.count().' - - if not isinstance(self, DataFrameType): - raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self)) - - if not (isinstance(axis, types.Omitted) or axis == 0): - raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis)) - - if not (isinstance(level, types.Omitted) or level is None): - raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis)) - - if not (isinstance(numeric_only, types.Omitted) or numeric_only is False): - raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, axis)) - - def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False): - result_data = [] - result_index = [] - - for dataframe_item in self._data: - item_count = dataframe_item.count() - item_name = dataframe_item._name - result_data.append(item_count) - result_index.append(item_name) - - return pandas.Series(data=result_data, index=result_index) - - return sdc_pandas_dataframe_count_impl - else: - def sdc_pandas_dataframe_reduce_columns(df, name, param): + def sdc_pandas_dataframe_reduce_columns_series(df, name, params): saved_columns = df.columns n_cols = len(saved_columns) data_args = tuple('data{}'.format(i) for i in range(n_cols)) - help_param = ', {}={}):' - func_text = 'def _reduce_impl(df):' - all_params = ['df'] - for name, value in param: - all_params.append('{}={}'.format(name, value)) - func_definition = 'def _reduce_impl({}):'.format(', '.join(all_params)) + space = [] + if len(params) > 0: + space.append(', ') + func_definition = 'def _reduce_impl(df{}{}):'.format("".join(space), ", ".join( + str(key) + '=' + str(value) for key, value in params)) func_lines = [func_definition] for i, d in enumerate(data_args): - line = ' {} = hpat.hiframes.api.init_series(hpat.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' + line = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' func_lines.append(line.format(d + '_S', i)) - func_lines.append(' {} = {}.{}()'.format(d + '_O', d + '_S', name)) - func_lines.append(' data = np.array(({},))'.format( - ", ".join(d + '_O' for d in data_args))) - func_lines.append(' index = hpat.str_arr_ext.StringArray(({},))'.format( - ', '.join('"{}"'.format(c) for c in saved_columns))) - func_lines.append(' return hpat.hiframes.api.init_series(data, index)') + func_lines.append(' {} = {}.{}({})'.format(d + '_O', d + '_S', name, ", ".join( + str(key) for key, value in params))) + func_lines.append(" return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, None, {})\n".format( + ", ".join(d + '_O._data' for d in data_args), + ", ".join("'" + c + "'" for c in saved_columns))) + loc_vars = {} func_text = '\n'.join(func_lines) - - exec(func_text, {'hpat': sdc, 'np': numpy}, loc_vars) + exec(func_text, {'sdc': sdc, 'np': numpy}, loc_vars) _reduce_impl = loc_vars['_reduce_impl'] return _reduce_impl - def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0): + def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0, n=5): ty_checker = TypeChecker('Method {}().'.format(name)) ty_checker.check(df, DataFrameType) - + if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'unsupported', 'axis') @@ -142,15 +91,15 @@ def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, if not (isinstance(ddof, types.Omitted) or ddof == 1): ty_checker.raise_exc(ddof, 'unsupported', 'ddof') - + if not (isinstance(min_count, types.Omitted) or min_count == 0): ty_checker.raise_exc(min_count, 'unsupported', 'min_count') - if not isinstance(n, (types.Integer, types.Omitted)) and n != 5: - ty_checker.raise_exc(n, 'integer', 'n') + if not (isinstance(n, (types.Omitted, types.Integer)) or n == 5): + ty_checker.raise_exc(n, 'int64', 'n') @overload_method(DataFrameType, 'head') - def median_overload(df, n=5): + def head_overload(df, n=5): """ Pandas DataFrame method :meth:`pandas.DataFrame.head` implementation. .. only:: developer @@ -171,6 +120,6 @@ def median_overload(df, n=5): check_type(name, df, n=n) - params = [('n', n)] + params = [('n', 5)] - return sdc_pandas_dataframe_reduce_columns(df, name, params) + return sdc_pandas_dataframe_reduce_columns_series(df, name, params) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 3f36f6fdb..ca928b5e0 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -3419,7 +3419,7 @@ def hpat_pandas_series_fillna(self, value=None, method=None, axis=None, inplace= if not isinstance(self, SeriesType): raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) - if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType, types.Omitted)) or axis is None): + if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType, types.Omitted, types.NoneType)) or axis is None): raise TypingError('{} The axis must be an Integer or String. Given: {}'.format(_func_name, axis)) if not (isinstance(inplace, types.Literal) and isinstance(inplace, types.Boolean) @@ -3427,9 +3427,9 @@ def hpat_pandas_series_fillna(self, value=None, method=None, axis=None, inplace= or inplace is False): raise TypingError('{} The inplace must be a literal Boolean constant. Given: {}'.format(_func_name, inplace)) - if not ((method is None or isinstance(method, types.Omitted)) - and (limit is None or isinstance(limit, types.Omitted)) - and (downcast is None or isinstance(downcast, types.Omitted)) + if not ((method is None or isinstance(method, (types.Omitted, types.NoneType))) + and (limit is None or isinstance(limit, (types.Omitted, types.NoneType))) + and (downcast is None or isinstance(downcast, (types.Omitted, types.NoneType))) ): raise TypingError('{} Unsupported parameters. Given method: {}, limit: {}, downcast: {}'.format( _func_name, method, limit, downcast)) diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index da205d27c..9578d2673 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -891,7 +891,7 @@ def test_impl(n): def test_df_fillna1(self): def test_impl(df): - return df.fillna(5.0) + return df.fillna(0.) df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0]}) hpat_func = self.jit(test_impl) @@ -1153,6 +1153,16 @@ def test_impl(n): hpat_func = self.jit(test_impl) pd.testing.assert_series_equal(hpat_func(n), test_impl(n)) + def test_dataframe_head(self): + def test_impl(df): + return df.head() + sdc_func = sdc.jit(test_impl) + df = pd.DataFrame({"FLOAT": test_global_input_data_float64[0][:5], + "DATATIME": test_datatime, + "INT": test_global_input_data_int64[:5], + "STRING": ['a', 'dd', 'c', '12', 'ddf']}) + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) + def test_dataframe_head1(self): def test_impl(df, n): return df.head(n) @@ -1163,6 +1173,17 @@ def test_impl(df, n): "STRING": ['a', 'dd', 'c', '12', 'ddf']}) for n in [-1, 0, 2, 5]: pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) + + def test_dataframe_head2(self): + def test_impl(df, n): + return df.head(n) + sdc_func = sdc.jit(test_impl) + df = pd.DataFrame({"A": [12, 4, 5, 1, 6, 8], + "B": [5, 2, 54, 3, 6, 4], + "C": [20, 16, 3, 8, 2, 3], + "D": [14, 3, 2, 6, 4, 5]}) + for n in [-1, 0, 2, 5]: + pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) if __name__ == "__main__": unittest.main() From a1872ad630355d11c5bd267e6f159b627d7c2e59 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Wed, 11 Dec 2019 14:28:17 +0300 Subject: [PATCH 5/8] codestyle --- sdc/datatypes/hpat_pandas_dataframe_functions.py | 4 ++-- sdc/datatypes/hpat_pandas_series_functions.py | 3 ++- sdc/tests/test_dataframe.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 665a5e991..29975cdc6 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -76,7 +76,7 @@ def sdc_pandas_dataframe_reduce_columns_series(df, name, params): def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0, n=5): ty_checker = TypeChecker('Method {}().'.format(name)) ty_checker.check(df, DataFrameType) - + if not (isinstance(axis, types.Omitted) or axis is None): ty_checker.raise_exc(axis, 'unsupported', 'axis') @@ -91,7 +91,7 @@ def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, if not (isinstance(ddof, types.Omitted) or ddof == 1): ty_checker.raise_exc(ddof, 'unsupported', 'ddof') - + if not (isinstance(min_count, types.Omitted) or min_count == 0): ty_checker.raise_exc(min_count, 'unsupported', 'min_count') diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index ca928b5e0..3def65a2f 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -3419,7 +3419,8 @@ def hpat_pandas_series_fillna(self, value=None, method=None, axis=None, inplace= if not isinstance(self, SeriesType): raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self)) - if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType, types.Omitted, types.NoneType)) or axis is None): + if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType, + types.Omitted, types.NoneType)) or axis is None): raise TypingError('{} The axis must be an Integer or String. Given: {}'.format(_func_name, axis)) if not (isinstance(inplace, types.Literal) and isinstance(inplace, types.Boolean) diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 9578d2673..d1e8af803 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1173,7 +1173,7 @@ def test_impl(df, n): "STRING": ['a', 'dd', 'c', '12', 'ddf']}) for n in [-1, 0, 2, 5]: pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) - + def test_dataframe_head2(self): def test_impl(df, n): return df.head(n) From 655bba382552164fce16680deef3324b4c7a9da1 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Thu, 12 Dec 2019 15:12:12 +0300 Subject: [PATCH 6/8] Fix type check --- sdc/datatypes/hpat_pandas_dataframe_functions.py | 13 ++++++++----- sdc/tests/test_dataframe.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 29975cdc6..8295f79e9 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -73,7 +73,7 @@ def sdc_pandas_dataframe_reduce_columns_series(df, name, params): return _reduce_impl - def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0, n=5): + def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0): ty_checker = TypeChecker('Method {}().'.format(name)) ty_checker.check(df, DataFrameType) @@ -95,9 +95,6 @@ def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, if not (isinstance(min_count, types.Omitted) or min_count == 0): ty_checker.raise_exc(min_count, 'unsupported', 'min_count') - if not (isinstance(n, (types.Omitted, types.Integer)) or n == 5): - ty_checker.raise_exc(n, 'int64', 'n') - @overload_method(DataFrameType, 'head') def head_overload(df, n=5): """ @@ -118,7 +115,13 @@ def head_overload(df, n=5): name = 'head' - check_type(name, df, n=n) + check_type(name, df) + + ty_checker = TypeChecker('Method {}().'.format(name)) + ty_checker.check(df, DataFrameType) + + if not (isinstance(n, (types.Omitted, types.Integer)) or n == 5): + ty_checker.raise_exc(n, 'int64', 'n') params = [('n', 5)] diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index d1e8af803..b4c41e41f 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1174,6 +1174,19 @@ def test_impl(df, n): for n in [-1, 0, 2, 5]: pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) + @unittest.skip('Dataframe.index not support') + def test_dataframe_head1_index(self): + def test_impl(df, n): + return df.head(n) + sdc_func = sdc.jit(test_impl) + df = pd.DataFrame({"FLOAT": test_global_input_data_float64[0][:5], + "DATATIME": test_datatime, + "INT": test_global_input_data_int64[:5], + "STRING": ['a', 'dd', 'c', '12', 'ddf']}, + index=[32, 3, 6, 17, 23]) + for n in [-1, 0, 2, 5]: + pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n)) + def test_dataframe_head2(self): def test_impl(df, n): return df.head(n) From fe0490e8a45e6cf91f89e955e1953ed9afc8defc Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Mon, 16 Dec 2019 16:26:56 +0300 Subject: [PATCH 7/8] little fixes --- sdc/datatypes/hpat_pandas_dataframe_functions.py | 15 ++++----------- sdc/tests/test_dataframe.py | 5 ++--- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 8295f79e9..50341c264 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -51,11 +51,9 @@ def sdc_pandas_dataframe_reduce_columns_series(df, name, params): saved_columns = df.columns n_cols = len(saved_columns) data_args = tuple('data{}'.format(i) for i in range(n_cols)) - space = [] - if len(params) > 0: - space.append(', ') - func_definition = 'def _reduce_impl(df{}{}):'.format("".join(space), ", ".join( - str(key) + '=' + str(value) for key, value in params)) + all_params = ['df'] + [f'{key}={value}' for key, value in params] + func_definition = "def _reduce_impl(" + ', '.join(all_params) + "):" + func_lines = [func_definition] for i, d in enumerate(data_args): line = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' @@ -117,12 +115,7 @@ def head_overload(df, n=5): check_type(name, df) - ty_checker = TypeChecker('Method {}().'.format(name)) - ty_checker.check(df, DataFrameType) - if not (isinstance(n, (types.Omitted, types.Integer)) or n == 5): ty_checker.raise_exc(n, 'int64', 'n') - params = [('n', 5)] - - return sdc_pandas_dataframe_reduce_columns_series(df, name, params) + return sdc_pandas_dataframe_reduce_columns_series(df, name, [('n', 5)]) diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index b4c41e41f..1f92346b6 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -126,6 +126,7 @@ def test_impl(df): hpat_func = self.jit(test_impl) pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) + @unittest.skip('returned NULL without setting an error') def test_box1(self): def test_impl(n): df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}) @@ -166,8 +167,7 @@ def test_impl(df): dtype=pd.api.types.CategoricalDtype(['N', 'Y']))}) pd.testing.assert_frame_equal(hpat_func(df.copy(deep=True)), test_impl(df)) - @unittest.skipIf(check_numba_version('0.46.0'), - "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690") + @unittest.skip('does not support option: "distributed"') def test_box_dist_return(self): def test_impl(n): df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}) @@ -1136,7 +1136,6 @@ def test_impl(): @unittest.skip("Implement iterrows for DataFrame") def test_dataframe_iterrows(self): def test_impl(df): - print(df.iterrows()) return [row for _, row in df.iterrows()] df = pd.DataFrame({'A': [1, 2, 3], 'B': [0.2, 0.5, 0.001], 'C': ['a', 'bb', 'ccc']}) From 785b80e5763e7ee9bfb1f4703cbf7662d4fd46f4 Mon Sep 17 00:00:00 2001 From: "elena.totmenina" Date: Mon, 16 Dec 2019 17:01:48 +0300 Subject: [PATCH 8/8] format fix --- sdc/datatypes/hpat_pandas_dataframe_functions.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 50341c264..f845cfe64 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -52,17 +52,17 @@ def sdc_pandas_dataframe_reduce_columns_series(df, name, params): n_cols = len(saved_columns) data_args = tuple('data{}'.format(i) for i in range(n_cols)) all_params = ['df'] + [f'{key}={value}' for key, value in params] - func_definition = "def _reduce_impl(" + ', '.join(all_params) + "):" + func_definition = 'def _reduce_impl({}):'.format(', '.join(all_params)) func_lines = [func_definition] for i, d in enumerate(data_args): line = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))' func_lines.append(line.format(d + '_S', i)) - func_lines.append(' {} = {}.{}({})'.format(d + '_O', d + '_S', name, ", ".join( - str(key) for key, value in params))) + func_lines.append(' {}_O = {}_S.{}({})'.format(d, d, name, ", ".join( + key for key, _ in params))) func_lines.append(" return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, None, {})\n".format( ", ".join(d + '_O._data' for d in data_args), - ", ".join("'" + c + "'" for c in saved_columns))) + ", ".join(f"'{c}'" for c in saved_columns))) loc_vars = {} func_text = '\n'.join(func_lines) @@ -113,7 +113,8 @@ def head_overload(df, n=5): name = 'head' - check_type(name, df) + ty_checker = TypeChecker('Method {}().'.format(name)) + ty_checker.check(df, DataFrameType) if not (isinstance(n, (types.Omitted, types.Integer)) or n == 5): ty_checker.raise_exc(n, 'int64', 'n')