Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 61 additions & 6 deletions sdc/datatypes/hpat_pandas_groupby_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import pandas
import numba
import numpy
import operator
import sdc

from numba import types
Expand All @@ -39,41 +40,94 @@
from numba.targets.registry import cpu_target
from numba.typed import List, Dict
from numba.typing import signature
from numba.special import literally

from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray, _sdc_take
from sdc.datatypes.hpat_pandas_groupby_types import DataFrameGroupByType
from sdc.utilities.sdc_typing_utils import TypeChecker, kwsparams2list, sigparams2list
from sdc.utilities.utils import sdc_overload_method, sdc_overload_attribute
from sdc.utilities.utils import sdc_overload, sdc_overload_method, sdc_overload_attribute
from sdc.hiframes.pd_dataframe_ext import get_dataframe_data
from sdc.hiframes.pd_series_type import SeriesType
from sdc.str_ext import string_type


@intrinsic
def init_dataframe_groupby(typingctx, parent, column_id, data, sort, target_columns=None):
    """Intrinsic that assembles a DataFrameGroupBy struct from its parts.

    All arguments are Numba types at typing time:

    parent         -- type of the grouped dataframe (its ``columns`` are read)
    column_id      -- literal type whose ``literal_value`` is the index of the
                      'by' column in ``parent.columns``
    data, sort     -- stored in the resulting struct unchanged
    target_columns -- optional tuple type of string literals naming the
                      selected columns; when omitted (or typed as none) every
                      parent column except the 'by' column is selected

    Returns the ``(signature, codegen)`` pair expected from an intrinsic.
    """

    target_columns = types.none if target_columns is None else target_columns
    target_not_specified = isinstance(target_columns, types.NoneType)
    if target_not_specified:
        by_column_id = column_id.literal_value
        selected_col_names = tuple(
            name for i, name in enumerate(parent.columns) if i != by_column_id)
    else:
        selected_col_names = tuple(a.literal_value for a in target_columns)

    n_target_cols = len(selected_col_names)

    def codegen(context, builder, signature, args):
        # the runtime value of target_columns is not needed: the selected
        # names are known at compile time and emitted as constants below
        parent_val, column_id_val, data_val, sort_val, _ = args

        # create groupby struct and store values
        groupby_obj = cgutils.create_struct_proxy(
            signature.return_type)(context, builder)
        groupby_obj.parent = parent_val
        groupby_obj.col_id = column_id_val
        groupby_obj.data = data_val
        groupby_obj.sort = sort_val
        groupby_obj.target_default = context.get_constant(types.bool_, target_not_specified)

        column_strs = [
            numba.unicode.make_string_from_constant(context, builder, string_type, c)
            for c in selected_col_names
        ]
        column_tup = context.make_tuple(
            builder, types.UniTuple(string_type, n_target_cols), column_strs)

        groupby_obj.target_columns = column_tup

        # increase refcount of stored values
        if context.enable_nrt:
            context.nrt.incref(builder, signature.args[0], parent_val)
            context.nrt.incref(builder, signature.args[1], column_id_val)
            context.nrt.incref(builder, signature.args[2], data_val)
            for var in column_strs:
                context.nrt.incref(builder, string_type, var)

        return groupby_obj._getvalue()

    ret_typ = DataFrameGroupByType(parent, column_id, selected_col_names)
    sig = signature(ret_typ, parent, column_id, data, sort, target_columns)
    return sig, codegen


@sdc_overload(operator.getitem)
def sdc_pandas_dataframe_getitem(self, idx):
    """Overload of ``operator.getitem`` for DataFrameGroupByType objects.

    Handles three kinds of ``idx`` types:
      * a string literal - selects a single column;
      * a tuple of string literals - selects several columns;
      * a non-literal unicode string - literal typing is requested through
        ``literally`` so that one of the cases above can be used.
    Returns None for anything else, i.e. no implementation is provided.
    """

    if not isinstance(self, DataFrameGroupByType):
        return None

    single_name = isinstance(idx, types.StringLiteral)
    tuple_of_names = (
        not single_name
        and isinstance(idx, types.Tuple)
        and all(isinstance(item, types.StringLiteral) for item in idx)
    )

    if single_name or tuple_of_names:
        by_col_id = self.col_id.literal_value
        single_name_value = idx.literal_value if single_name else None

        def sdc_pandas_dataframe_getitem_common_impl(self, idx):

            # NOTE(review): the explicit comparison with True looks deliberate
            # (original carries a noqa) - keep it as is unless confirmed otherwise
            selected = (single_name_value, ) if single_name == True else idx  # noqa
            # calling getitem twice raises IndexError, just as in pandas
            if not self._target_default:
                raise IndexError("DataFrame.GroupBy.getitem: Columns already selected")
            return init_dataframe_groupby(self._parent, by_col_id, self._data, self._sort, selected)

        return sdc_pandas_dataframe_getitem_common_impl

    if isinstance(idx, types.UnicodeType):
        def sdc_pandas_dataframe_getitem_idx_unicode_str_impl(self, idx):
            # just call literally as it will raise and compilation will continue via common impl
            return literally(idx)

        return sdc_pandas_dataframe_getitem_idx_unicode_str_impl

    return None


def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, func_params, defaults, impl_params):

all_params_as_str = ', '.join(sigparams2list(func_params, defaults))
Expand Down Expand Up @@ -155,7 +209,8 @@ def sdc_pandas_groupby_apply_func(self, func_name, func_args, defaults=None, imp
df_column_types = self.parent.data
df_column_names = self.parent.columns
by_column_id = self.col_id.literal_value
subject_columns = [(name, i) for i, name in enumerate(df_column_names) if i != by_column_id]
selected_cols_set = set(self.target_columns)
subject_columns = [(name, i) for i, name in enumerate(df_column_names) if name in selected_cols_set]

# resolve types of result dataframe columns
res_arrays_dtypes = tuple(
Expand Down
18 changes: 12 additions & 6 deletions sdc/datatypes/hpat_pandas_groupby_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@


import numba
from numba import types, cgutils
from numba import types
from numba.extending import (models, register_model, make_attribute_wrapper)
from numba.typed import Dict, List
from sdc.str_ext import string_type


Expand All @@ -37,15 +36,16 @@ class DataFrameGroupByType(types.Type):
Type definition for DataFrameGroupBy functions handling.
"""

def __init__(self, parent, col_id, target_columns):
    # parent: type of the grouped dataframe
    # col_id: literal type identifying the 'by' column
    # target_columns: names of the columns selected for aggregation
    self.parent = parent
    self.col_id = col_id
    self.target_columns = target_columns
    # target_columns is part of the type name as well: the format string used
    # to have only two placeholders, so the third argument was silently dropped
    # and types that differ only in the selected columns shared one name
    super(DataFrameGroupByType, self).__init__(
        name="DataFrameGroupByType({}, {}, {})".format(parent, col_id, target_columns))

@property
def key(self):
    # the selected columns are part of the key, so groupby types that differ
    # only in target_columns are not treated as the same type
    return self.parent, self.col_id, self.target_columns


@register_model(DataFrameGroupByType)
Expand All @@ -56,11 +56,15 @@ def __init__(self, dmm, fe_type):
by_series_dtype,
types.containers.ListType(types.int64)
)

n_target_cols = len(fe_type.target_columns)
members = [
('parent', fe_type.parent),
('col_id', types.int64),
('data', ty_data),
('sort', types.bool_)
('sort', types.bool_),
('target_default', types.bool_),
('target_columns', types.UniTuple(string_type, n_target_cols))
]
super(DataFrameGroupByModel, self).__init__(dmm, fe_type, members)

Expand All @@ -69,3 +73,5 @@ def __init__(self, dmm, fe_type):
# expose struct members as underscore-prefixed attributes of DataFrameGroupByType
for _member_name in ('col_id', 'data', 'sort', 'target_default', 'target_columns'):
    make_attribute_wrapper(DataFrameGroupByType, _member_name, '_' + _member_name)
123 changes: 93 additions & 30 deletions sdc/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,45 +258,53 @@ def test_impl(df):
# np.testing.assert_array_equal(hpat_func(df), test_impl(df))
self.assertEqual(set(hpat_func(df)), set(test_impl(df)))

@skip_numba_jit("BUG: SDC impl of Series.sum returns float64 on as series of ints")
def test_agg_seq_sum(self):
    def test_impl(df):
        return df.groupby('A')['B'].sum()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_sdc_jit("Old-style implementation returns ndarray, not a Series")
def test_agg_seq_count(self):
    def test_impl(df):
        return df.groupby('A')['B'].count()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_sdc_jit("Old-style implementation returns ndarray, not a Series")
def test_agg_seq_mean(self):
    def test_impl(df):
        return df.groupby('A')['B'].mean()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_sdc_jit("Old-style implementation returns ndarray, not a Series")
def test_agg_seq_min(self):
    def test_impl(df):
        return df.groupby('A')['B'].min()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_numba_jit
def test_agg_seq_min_date(self):
Expand All @@ -308,15 +316,17 @@ def test_impl(df):
df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': pd.date_range('2019-1-3', '2019-1-9')})
self.assertEqual(set(hpat_func(df)), set(test_impl(df)))

@skip_sdc_jit("Old-style implementation returns ndarray, not a Series")
def test_agg_seq_max(self):
    def test_impl(df):
        return df.groupby('A')['B'].max()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_numba_jit
def test_agg_seq_all_col(self):
Expand All @@ -338,37 +348,43 @@ def test_impl(df):
df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
self.assertEqual(set(hpat_func(df)), set(test_impl(df)))

@skip_sdc_jit("Old-style implementation returns ndarray, not a Series")
def test_agg_seq_prod(self):
    def test_impl(df):
        return df.groupby('A')['B'].prod()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_sdc_jit
@skip_numba_jit
def test_agg_seq_var(self):
    def test_impl(df):
        return df.groupby('A')['B'].var()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_sdc_jit
@skip_numba_jit
def test_agg_seq_std(self):
    def test_impl(df):
        return df.groupby('A')['B'].std()

    hpat_func = self.jit(test_impl)
    df = pd.DataFrame({'A': [2, 1, 1, 1, 2, 2, 1], 'B': [-8, 2, 3, 1, 5, 6, 7]})
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before it is compared with the SDC result
    result = hpat_func(df)
    result_ref = pd.DataFrame(test_impl(df))
    pd.testing.assert_frame_equal(result, result_ref, check_names=False)

@skip_numba_jit
def test_agg_seq_multiselect(self):
Expand Down Expand Up @@ -661,6 +677,53 @@ def test_impl(df):
hpat_func = self.jit(test_impl)
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))

def test_dataframe_groupby_getitem_literal_tuple(self):
    # several columns are selected with a tuple of string constants
    def test_impl(df):
        return df.groupby('A')['B', 'C'].count()

    jitted = self.jit(test_impl)
    frame = pd.DataFrame(_default_df_numeric_data)

    actual = jitted(frame)
    expected = test_impl(frame)
    # TODO: implement index classes, as current indexes do not have names
    pd.testing.assert_frame_equal(actual, expected, check_names=False)

def test_dataframe_groupby_getitem_literal_str(self):
    # a single column is selected with a string constant
    def test_impl(df):
        return df.groupby('C')['B'].count()

    jitted = self.jit(test_impl)
    frame = pd.DataFrame(_default_df_numeric_data)

    actual = jitted(frame)
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before the comparison
    expected = pd.DataFrame(test_impl(frame))
    # TODO: implement index classes, as current indexes do not have names
    pd.testing.assert_frame_equal(actual, expected, check_names=False)

def test_dataframe_groupby_getitem_unicode_str(self):
    # the column name reaches the jitted function as an argument, not as a constant
    def test_impl(df, col_name):
        return df.groupby('A')[col_name].count()

    jitted = self.jit(test_impl)
    frame = pd.DataFrame(_default_df_numeric_data)
    selected = 'C'

    actual = jitted(frame, selected)
    # with a single column selected pandas gives back a Series, so the reference
    # is wrapped into a DataFrame before the comparison
    expected = pd.DataFrame(test_impl(frame, selected))
    # TODO: implement index classes, as current indexes do not have names
    pd.testing.assert_frame_equal(actual, expected, check_names=False)

def test_dataframe_groupby_getitem_repeated(self):
    # selecting columns twice must fail with the same exception type as in pandas
    def test_impl(df):
        return df.groupby('A')['B', 'C']['D']

    jitted = self.jit(test_impl)
    frame = pd.DataFrame(_default_df_numeric_data)

    # first find out which exception pandas itself raises
    with self.assertRaises(Exception) as pandas_ctx:
        test_impl(frame)
    expected_exc_type = type(pandas_ctx.exception)

    with self.assertRaises(expected_exc_type):
        jitted(frame)

# run the tests of this module when it is executed as a script
if __name__ == "__main__":
unittest.main()