-
Notifications
You must be signed in to change notification settings - Fork 62
Dataframe.head() #363
Dataframe.head() #363
Changes from all commits
bf8c023
c966f8e
eef527d
d036cc7
12d8f2c
a1872ad
2c1bd6b
655bba3
f6bb9c6
0356771
b26ac5f
fe0490e
c4be0ab
785b80e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,65 +31,92 @@ | |
|
|
||
| import operator | ||
| import pandas | ||
| import numpy | ||
| import numba | ||
|
|
||
| import sdc | ||
| from sdc.datatypes.hpat_pandas_series_functions import TypeChecker | ||
|
|
||
| from numba import types | ||
| from numba.extending import (overload, overload_method, overload_attribute) | ||
| from sdc.hiframes.pd_dataframe_ext import DataFrameType | ||
| from sdc.hiframes.pd_series_ext import SeriesType | ||
| from numba.errors import TypingError | ||
|
|
||
| from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType | ||
| from sdc.utils import sdc_overload_method | ||
|
|
||
|
|
||
@sdc_overload_method(DataFrameType, 'count')
def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False):
    """
    Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation.

    .. only:: developer

        Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count

    Parameters
    -----------
    self: :class:`pandas.DataFrame`
        input arg
    axis:
        *unsupported* (only the default ``0`` is accepted)
    level:
        *unsupported* (only the default ``None`` is accepted)
    numeric_only:
        *unsupported* (only the default ``False`` is accepted)

    Returns
    -------
    :obj:`pandas.Series`
        returns: For each column the result of its ``count()`` call, indexed by the column names.

    Raises
    ------
    TypingError
        If `self` is not a DataFrame or any unsupported argument is given a non-default value.
    """

    _func_name = 'Method pandas.dataframe.count().'

    if not isinstance(self, DataFrameType):
        raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self))

    if not (isinstance(axis, types.Omitted) or axis == 0):
        raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis))

    # Each message reports the argument that was actually rejected
    # (previously both of the checks below echoed `axis`).
    if not (isinstance(level, types.Omitted) or level is None):
        raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, level))

    if not (isinstance(numeric_only, types.Omitted) or numeric_only is False):
        raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, numeric_only))

    def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False):
        result_data = []
        result_index = []

        # One entry per column: the column's own count() and its name.
        for dataframe_item in self._data:
            item_count = dataframe_item.count()
            item_name = dataframe_item._name
            result_data.append(item_count)
            result_index.append(item_name)

        return pandas.Series(data=result_data, index=result_index)

    return sdc_pandas_dataframe_count_impl
| if not sdc.config.use_default_dataframe: | ||
| from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType | ||
|
|
||
| else: | ||
def sdc_pandas_dataframe_reduce_columns_series(df, name, params):
    """
    Generate a DataFrame implementation that calls one Series method on every column.

    The source text of ``_reduce_impl(df, <key>=<value>, ...)`` is assembled line by line:
    each column is wrapped with ``init_series``, the Series method `name` is called on it
    with the parameter names from `params`, and the per-column results are passed to
    ``init_dataframe`` together with ``None`` as the index argument and the original
    column names. The text is then compiled with ``exec``.

    Parameters
    -----------
    df:
        object exposing ``columns``; one Series call is generated per column
    name: :obj:`str`
        name of the Series method to call on each column
    params:
        sequence of ``(key, value)`` pairs; every pair becomes a ``key=value`` parameter
        of the generated function and `key` is forwarded to the Series call

    Returns
    -------
    function
        the generated ``_reduce_impl``
    """
    # Review note (Contributor): "Please use f-string or formatted string instead of concatenation."
    # Review note (Contributor): "You forgot to use f-string here."
    # Review note (Contributor, truncated in the captured page): "Do you exactly need ..."
    # NOTE(review): the generated body lines use the one-space indent visible in the captured
    # source; confirm the intended indent width against the repository.
    column_names = df.columns
    signature = ', '.join(['df'] + [f'{key}={value}' for key, value in params])
    call_args = ", ".join(key for key, _ in params)

    source_lines = [f'def _reduce_impl({signature}):']
    output_exprs = []
    for position in range(len(column_names)):
        column = f'data{position}'
        source_lines.append(
            f' {column}_S = sdc.hiframes.api.init_series('
            f'sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {position}))'
        )
        source_lines.append(f' {column}_O = {column}_S.{name}({call_args})')
        output_exprs.append(f'{column}_O._data')

    outputs = ", ".join(output_exprs)
    quoted_names = ", ".join(f"'{c}'" for c in column_names)
    source_lines.append(
        f" return sdc.hiframes.pd_dataframe_ext.init_dataframe({outputs}, None, {quoted_names})\n"
    )

    # Compile the generated text; the function object ends up in `namespace`.
    namespace = {}
    exec('\n'.join(source_lines), {'sdc': sdc, 'np': numpy}, namespace)

    return namespace['_reduce_impl']
|
|
||
def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0):
    """
    Validate the argument types of a DataFrame method at typing time.

    `df` is checked against ``DataFrameType``; every other argument must either be
    omitted or keep its default, except `skipna`, which may also be a boolean type.
    Violations are reported through ``TypeChecker.raise_exc`` in the order
    axis, skipna, level, numeric_only, ddof, min_count.

    Parameters
    -----------
    name: :obj:`str`
        method name used in the error message prefix
    df:
        object expected to be a ``DataFrameType``
    axis, level, numeric_only:
        *unsupported* (omitted or ``None``)
    skipna:
        omitted, ``None`` or a boolean type
    ddof:
        *unsupported* (omitted or ``1``)
    min_count:
        *unsupported* (omitted or ``0``)
    """
    # Review note (Contributor): "Is it a common function for checking all parameters?
    # How is it related to df.head()?"
    # Review note (Contributor, truncated in the captured page): "Is ..."
    checker = TypeChecker('Method {}().'.format(name))
    checker.check(df, DataFrameType)

    def _omitted(value):
        return isinstance(value, types.Omitted)

    if not _omitted(axis) and axis is not None:
        checker.raise_exc(axis, 'unsupported', 'axis')

    if not isinstance(skipna, (types.Omitted, types.NoneType, types.Boolean)) and skipna is not None:
        checker.raise_exc(skipna, 'bool', 'skipna')

    if not _omitted(level) and level is not None:
        checker.raise_exc(level, 'unsupported', 'level')

    if not _omitted(numeric_only) and numeric_only is not None:
        checker.raise_exc(numeric_only, 'unsupported', 'numeric_only')

    # `not (x == y)` is kept instead of `x != y` so the same comparison operator is used.
    if not _omitted(ddof) and not (ddof == 1):
        checker.raise_exc(ddof, 'unsupported', 'ddof')

    if not _omitted(min_count) and not (min_count == 0):
        checker.raise_exc(min_count, 'unsupported', 'min_count')
|
|
||
@overload_method(DataFrameType, 'head')
def head_overload(df, n=5):
    """
    Pandas DataFrame method :meth:`pandas.DataFrame.head` implementation.

    .. only:: developer

        Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_head1

    Parameters
    -----------
    df: :class:`pandas.DataFrame`
        input arg
    n: :obj:`int`, default 5
        number of rows to select

    Returns
    -------
    :obj:`pandas.DataFrame`
        returns: The first n rows of the caller object, built column by column
        from the ``head`` of each column's Series.
    """
    method_name = 'head'

    checker = TypeChecker('Method {}().'.format(method_name))
    checker.check(df, DataFrameType)

    # `n` must be omitted, an integer type, or the literal default.
    if not isinstance(n, (types.Omitted, types.Integer)) and not (n == 5):
        checker.raise_exc(n, 'int64', 'n')

    head_params = [('n', 5)]
    return sdc_pandas_dataframe_reduce_columns_series(df, method_name, head_params)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -41,6 +41,9 @@ | |
| skip_numba_jit, skip_sdc_jit) | ||
|
|
||
| from sdc.tests.gen_test_data import ParquetGenerator | ||
| from sdc.tests.test_utils import (min_float64, max_float64, test_global_input_data_float64, | ||
| test_global_input_data_unicode_kind4, test_datatime, | ||
| min_int64, max_int64, test_global_input_data_int64) | ||
| from numba.config import IS_32BITS | ||
|
|
||
|
|
||
|
|
@@ -123,6 +126,7 @@ def test_impl(df): | |
| hpat_func = self.jit(test_impl) | ||
| pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) | ||
|
|
||
| @unittest.skip('returned NULL without setting an error') | ||
| def test_box1(self): | ||
| def test_impl(n): | ||
| df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)}) | ||
|
|
@@ -889,7 +893,7 @@ def test_impl(n): | |
|
|
||
| def test_df_fillna1(self): | ||
| def test_impl(df): | ||
| return df.fillna(5.0) | ||
| return df.fillna(0.) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the reason to change it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This change belongs to another PR; I forgot to delete it. |
||
|
|
||
| df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0]}) | ||
| hpat_func = self.jit(test_impl) | ||
|
|
@@ -1134,7 +1138,6 @@ def test_impl(): | |
| @unittest.skip("Implement iterrows for DataFrame") | ||
| def test_dataframe_iterrows(self): | ||
| def test_impl(df): | ||
| print(df.iterrows()) | ||
| return [row for _, row in df.iterrows()] | ||
|
|
||
| df = pd.DataFrame({'A': [1, 2, 3], 'B': [0.2, 0.5, 0.001], 'C': ['a', 'bb', 'ccc']}) | ||
|
|
@@ -1151,6 +1154,50 @@ def test_impl(n): | |
| hpat_func = self.jit(test_impl) | ||
| pd.testing.assert_series_equal(hpat_func(n), test_impl(n)) | ||
|
|
||
def test_dataframe_head(self):
    """Jitted ``df.head()`` with the default ``n`` must match pandas on a mixed-dtype frame."""
    def test_impl(df):
        return df.head()

    jitted = sdc.jit(test_impl)
    columns = {
        "FLOAT": test_global_input_data_float64[0][:5],
        "DATATIME": test_datatime,
        "INT": test_global_input_data_int64[:5],
        "STRING": ['a', 'dd', 'c', '12', 'ddf'],
    }
    df = pd.DataFrame(columns)

    actual = jitted(df)
    expected = test_impl(df)
    pd.testing.assert_frame_equal(actual, expected)
|
|
||
def test_dataframe_head1(self):
    """Jitted ``df.head(n)`` must match pandas for negative, zero, partial and full ``n``."""
    # Review note (Contributor): "What if Index is set explicitly, this solution is ok
    # for that case? I don't see tests on it"
    def test_impl(df, n):
        return df.head(n)

    jitted = sdc.jit(test_impl)
    columns = {
        "FLOAT": test_global_input_data_float64[0][:5],
        "DATATIME": test_datatime,
        "INT": test_global_input_data_int64[:5],
        "STRING": ['a', 'dd', 'c', '12', 'ddf'],
    }
    df = pd.DataFrame(columns)

    for row_count in (-1, 0, 2, 5):
        actual = jitted(df, row_count)
        expected = test_impl(df, row_count)
        pd.testing.assert_frame_equal(actual, expected)
|
|
||
@unittest.skip('Dataframe.index not support')
def test_dataframe_head1_index(self):
    """Same as the ``head(n)`` check, but on a frame created with an explicit index."""
    def test_impl(df, n):
        return df.head(n)

    jitted = sdc.jit(test_impl)
    columns = {
        "FLOAT": test_global_input_data_float64[0][:5],
        "DATATIME": test_datatime,
        "INT": test_global_input_data_int64[:5],
        "STRING": ['a', 'dd', 'c', '12', 'ddf'],
    }
    df = pd.DataFrame(columns, index=[32, 3, 6, 17, 23])

    for row_count in (-1, 0, 2, 5):
        actual = jitted(df, row_count)
        expected = test_impl(df, row_count)
        pd.testing.assert_frame_equal(actual, expected)
|
|
||
def test_dataframe_head2(self):
    """Jitted ``df.head(n)`` must match pandas on an all-integer frame for several ``n``."""
    def test_impl(df, n):
        return df.head(n)

    jitted = sdc.jit(test_impl)
    columns = {
        "A": [12, 4, 5, 1, 6, 8],
        "B": [5, 2, 54, 3, 6, 4],
        "C": [20, 16, 3, 8, 2, 3],
        "D": [14, 3, 2, 6, 4, 5],
    }
    df = pd.DataFrame(columns)

    for row_count in (-1, 0, 2, 5):
        actual = jitted(df, row_count)
        expected = test_impl(df, row_count)
        pd.testing.assert_frame_equal(actual, expected)
|
|
||
| if __name__ == "__main__": | ||
| unittest.main() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this variable is needed here.
You can just delete the implementation of the count method (or keep it commented out if you need it).