diff --git a/sdc/datatypes/hpat_pandas_groupby_functions.py b/sdc/datatypes/hpat_pandas_groupby_functions.py index fd1ca4fb9..52e902fce 100644 --- a/sdc/datatypes/hpat_pandas_groupby_functions.py +++ b/sdc/datatypes/hpat_pandas_groupby_functions.py @@ -31,6 +31,7 @@ import pandas import numba import numpy +import operator import sdc from numba import types @@ -39,20 +40,31 @@ from numba.targets.registry import cpu_target from numba.typed import List, Dict from numba.typing import signature +from numba.special import literally from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray, _sdc_take from sdc.datatypes.hpat_pandas_groupby_types import DataFrameGroupByType from sdc.utilities.sdc_typing_utils import TypeChecker, kwsparams2list, sigparams2list -from sdc.utilities.utils import sdc_overload_method, sdc_overload_attribute +from sdc.utilities.utils import sdc_overload, sdc_overload_method, sdc_overload_attribute from sdc.hiframes.pd_dataframe_ext import get_dataframe_data from sdc.hiframes.pd_series_type import SeriesType +from sdc.str_ext import string_type @intrinsic -def init_dataframe_groupby(typingctx, parent, column_id, data, sort): +def init_dataframe_groupby(typingctx, parent, column_id, data, sort, target_columns=None): + target_columns = types.none if target_columns is None else target_columns + if isinstance(target_columns, types.NoneType): + target_not_specified = True + selected_col_names = tuple([a for i, a in enumerate(parent.columns) if i != column_id.literal_value]) + else: + target_not_specified = False + selected_col_names = tuple([a.literal_value for a in target_columns]) + + n_target_cols = len(selected_col_names) def codegen(context, builder, signature, args): - parent_val, column_id_val, data_val, sort_val = args + parent_val, column_id_val, data_val, sort_val, target_columns = args # create series struct and store values groupby_obj = cgutils.create_struct_proxy( signature.return_type)(context, builder) @@ -60,20 +72,62
@@ def codegen(context, builder, signature, args): groupby_obj.col_id = column_id_val groupby_obj.data = data_val groupby_obj.sort = sort_val + groupby_obj.target_default = context.get_constant(types.bool_, target_not_specified) + + column_strs = [numba.unicode.make_string_from_constant( + context, builder, string_type, c) for c in selected_col_names] + column_tup = context.make_tuple( + builder, types.UniTuple(string_type, n_target_cols), column_strs) + + groupby_obj.target_columns = column_tup # increase refcount of stored values if context.enable_nrt: context.nrt.incref(builder, signature.args[0], parent_val) context.nrt.incref(builder, signature.args[1], column_id_val) context.nrt.incref(builder, signature.args[2], data_val) + for var in column_strs: + context.nrt.incref(builder, string_type, var) return groupby_obj._getvalue() - ret_typ = DataFrameGroupByType(parent, column_id) - sig = signature(ret_typ, parent, column_id, data, sort) + ret_typ = DataFrameGroupByType(parent, column_id, selected_col_names) + sig = signature(ret_typ, parent, column_id, data, sort, target_columns) return sig, codegen +@sdc_overload(operator.getitem) +def sdc_pandas_dataframe_getitem(self, idx): + + if not isinstance(self, DataFrameGroupByType): + return None + + idx_is_literal_str = isinstance(idx, types.StringLiteral) + if (idx_is_literal_str + or (isinstance(idx, types.Tuple) + and all(isinstance(a, types.StringLiteral) for a in idx))): + + col_id_literal = self.col_id.literal_value + idx_literal = idx.literal_value if idx_is_literal_str else None + def sdc_pandas_dataframe_getitem_common_impl(self, idx): + + _idx = (idx_literal, ) if idx_is_literal_str == True else idx # noqa + # calling getitem twice raises IndexError, just as in pandas + if not self._target_default: + raise IndexError("DataFrame.GroupBy.getitem: Columns already selected") + return init_dataframe_groupby(self._parent, col_id_literal, self._data, self._sort, _idx) + + return
sdc_pandas_dataframe_getitem_common_impl + + if isinstance(idx, types.UnicodeType): + def sdc_pandas_dataframe_getitem_idx_unicode_str_impl(self, idx): + # just call literally as it will raise and compilation will continue via common impl + return literally(idx) + return sdc_pandas_dataframe_getitem_idx_unicode_str_impl + + return None + + def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, func_params, defaults, impl_params): all_params_as_str = ', '.join(sigparams2list(func_params, defaults)) @@ -155,7 +209,8 @@ def sdc_pandas_groupby_apply_func(self, func_name, func_args, defaults=None, imp df_column_types = self.parent.data df_column_names = self.parent.columns by_column_id = self.col_id.literal_value - subject_columns = [(name, i) for i, name in enumerate(df_column_names) if i != by_column_id] + selected_cols_set = set(self.target_columns) + subject_columns = [(name, i) for i, name in enumerate(df_column_names) if name in selected_cols_set] # resolve types of result dataframe columns res_arrays_dtypes = tuple( diff --git a/sdc/datatypes/hpat_pandas_groupby_types.py b/sdc/datatypes/hpat_pandas_groupby_types.py index b60311443..ac902462f 100644 --- a/sdc/datatypes/hpat_pandas_groupby_types.py +++ b/sdc/datatypes/hpat_pandas_groupby_types.py @@ -26,9 +26,8 @@ import numba -from numba import types, cgutils +from numba import types from numba.extending import (models, register_model, make_attribute_wrapper) -from numba.typed import Dict, List from sdc.str_ext import string_type @@ -37,15 +36,16 @@ class DataFrameGroupByType(types.Type): Type definition for DataFrameGroupBy functions handling.
""" - def __init__(self, parent, col_id): + def __init__(self, parent, col_id, target_columns): self.parent = parent self.col_id = col_id + self.target_columns = target_columns super(DataFrameGroupByType, self).__init__( - name="DataFrameGroupByType({}, {})".format(parent, col_id)) + name="DataFrameGroupByType({}, {}, {})".format(parent, col_id, target_columns)) @property def key(self): - return self.parent, self.col_id + return self.parent, self.col_id, self.target_columns @register_model(DataFrameGroupByType) @@ -56,11 +56,15 @@ def __init__(self, dmm, fe_type): by_series_dtype, types.containers.ListType(types.int64) ) + + n_target_cols = len(fe_type.target_columns) members = [ ('parent', fe_type.parent), ('col_id', types.int64), ('data', ty_data), - ('sort', types.bool_) + ('sort', types.bool_), + ('target_default', types.bool_), + ('target_columns', types.UniTuple(string_type, n_target_cols)) ] super(DataFrameGroupByModel, self).__init__(dmm, fe_type, members) @@ -69,3 +73,5 @@ def __init__(self, dmm, fe_type): make_attribute_wrapper(DataFrameGroupByType, 'col_id', '_col_id') make_attribute_wrapper(DataFrameGroupByType, 'data', '_data') make_attribute_wrapper(DataFrameGroupByType, 'sort', '_sort') +make_attribute_wrapper(DataFrameGroupByType, 'target_default', '_target_default') +make_attribute_wrapper(DataFrameGroupByType, 'target_columns', '_target_columns') diff --git a/sdc/tests/test_groupby.py b/sdc/tests/test_groupby.py index e6858f917..f853295d6 100644 --- a/sdc/tests/test_groupby.py +++ b/sdc/tests/test_groupby.py @@ -258,45 +258,53 @@ def test_impl(df): # np.testing.assert_array_equal(hpat_func(df), test_impl(df)) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) - @skip_numba_jit + @skip_numba_jit("BUG: SDC impl of Series.sum returns float64 on as series of ints") def test_agg_seq_sum(self): def test_impl(df): - A = df.groupby('A')['B'].sum() - return A.values + return df.groupby('A')['B'].sum() hpat_func = self.jit(test_impl) df =
pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)), set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) - @skip_numba_jit + @skip_sdc_jit("Old-style implementation returns ndarray, not a Series") def test_agg_seq_count(self): def test_impl(df): - A = df.groupby('A')['B'].count() - return A.values + return df.groupby('A')['B'].count() hpat_func = self.jit(test_impl) df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)), set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) - @skip_numba_jit + @skip_sdc_jit("Old-style implementation returns ndarray, not a Series") def test_agg_seq_mean(self): def test_impl(df): - A = df.groupby('A')['B'].mean() - return A.values + return df.groupby('A')['B'].mean() hpat_func = self.jit(test_impl) df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)), set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) - @skip_numba_jit + @skip_sdc_jit("Old-style implementation returns ndarray, not a Series") def test_agg_seq_min(self): def test_impl(df): - A = df.groupby('A')['B'].min() - return A.values + return df.groupby('A')['B'].min() hpat_func = self.jit(test_impl) df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)),
set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) @skip_numba_jit def test_agg_seq_min_date(self): @@ -308,15 +316,17 @@ def test_impl(df): df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': pd.date_range('2019-1-3', '2019-1-9')}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) - @skip_numba_jit + @skip_sdc_jit("Old-style implementation returns ndarray, not a Series") def test_agg_seq_max(self): def test_impl(df): - A = df.groupby('A')['B'].max() - return A.values + return df.groupby('A')['B'].max() hpat_func = self.jit(test_impl) df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)), set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) @skip_numba_jit def test_agg_seq_all_col(self): @@ -338,37 +348,43 @@ def test_impl(df): df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) self.assertEqual(set(hpat_func(df)), set(test_impl(df))) - @skip_numba_jit + @skip_sdc_jit("Old-style implementation returns ndarray, not a Series") def test_agg_seq_prod(self): def test_impl(df): - A = df.groupby('A')['B'].prod() - return A.values + return df.groupby('A')['B'].prod() hpat_func = self.jit(test_impl) df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)), set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) @skip_sdc_jit @skip_numba_jit
def test_agg_seq_var(self): def test_impl(df): - A = df.groupby('A')['B'].var() - return A.values + return df.groupby('A')['B'].var() hpat_func = self.jit(test_impl) df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)), set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) @skip_sdc_jit @skip_numba_jit def test_agg_seq_std(self): def test_impl(df): - A = df.groupby('A')['B'].std() - return A.values + return df.groupby('A')['B'].std() hpat_func = self.jit(test_impl) df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]}) - self.assertEqual(set(hpat_func(df)), set(test_impl(df))) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df) + result_ref = pd.DataFrame(test_impl(df)) + pd.testing.assert_frame_equal(result, result_ref, check_names=False) @skip_numba_jit def test_agg_seq_multiselect(self): @@ -661,6 +677,53 @@ def test_impl(df): hpat_func = self.jit(test_impl) pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) + def test_dataframe_groupby_getitem_literal_tuple(self): + def test_impl(df): + return df.groupby('A')['B', 'C'].count() + hpat_func = self.jit(test_impl) + + df = pd.DataFrame(_default_df_numeric_data) + result = hpat_func(df) + result_ref = test_impl(df) + # TODO: implement index classes, as current indexes do not have names + pd.testing.assert_frame_equal(result, result_ref, check_names=False) + + def test_dataframe_groupby_getitem_literal_str(self): + def test_impl(df): + return df.groupby('C')['B'].count() + hpat_func = self.jit(test_impl) + + df = pd.DataFrame(_default_df_numeric_data) + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df)
+ result_ref = pd.DataFrame(test_impl(df)) + # TODO: implement index classes, as current indexes do not have names + pd.testing.assert_frame_equal(result, result_ref, check_names=False) + + def test_dataframe_groupby_getitem_unicode_str(self): + def test_impl(df, col_name): + return df.groupby('A')[col_name].count() + hpat_func = self.jit(test_impl) + + df = pd.DataFrame(_default_df_numeric_data) + col_name = 'C' + # pandas returns groupby.generic.SeriesGroupBy object in this case, hence align result_ref + result = hpat_func(df, col_name) + result_ref = pd.DataFrame(test_impl(df, col_name)) + # TODO: implement index classes, as current indexes do not have names + pd.testing.assert_frame_equal(result, result_ref, check_names=False) + + def test_dataframe_groupby_getitem_repeated(self): + def test_impl(df): + return df.groupby('A')['B', 'C']['D'] + hpat_func = self.jit(test_impl) + + df = pd.DataFrame(_default_df_numeric_data) + with self.assertRaises(Exception) as context: + test_impl(df) + pandas_exception = context.exception + + self.assertRaises(type(pandas_exception), hpat_func, df) if __name__ == "__main__": unittest.main()