Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 84 additions & 18 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,21 +929,46 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)


def df_getitem_slice_idx_main_codelines(self):
"""Generate main code lines for df.getitem"""
def df_index_codelines(self):
"""Generate code lines to get or create index of DF"""
if isinstance(self.index, types.NoneType):
func_lines = [' length = len(get_dataframe_data(self, 0))',
' _index = numpy.arange(length)',
' res_index = _index[idx]']
' res_index = _index']
else:
func_lines = [' res_index = self._index[idx]']
func_lines = [' res_index = self._index']

return func_lines


def df_getitem_slice_idx_main_codelines(self, idx):
"""Generate main code lines for df.getitem with idx of slice"""
results = []
func_lines = df_index_codelines(self)
for i, col in enumerate(self.columns):
res_data = f'res_data_{i}'
func_lines += [
f' data_{i} = get_dataframe_data(self, {i})',
f' {res_data} = pandas.Series(data_{i}[idx], index=res_index, name="{col}")'
f' {res_data} = pandas.Series(data_{i}[idx], index=res_index[idx], name="{col}")'
]
results.append((col, res_data))

data = ', '.join(f'"{col}": {data}' for col, data in results)
func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index[idx])']

return func_lines


def df_getitem_tuple_idx_main_codelines(self, literal_idx):
"""Generate main code lines for df.getitem with idx of tuple"""
results = []
func_lines = df_index_codelines(self)
needed_cols = {col: i for i, col in enumerate(self.columns) if col in literal_idx}
for col, i in needed_cols.items():
res_data = f'res_data_{i}'
func_lines += [
f' data_{i} = get_dataframe_data(self, {i})',
f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")'
]
results.append((col, res_data))

Expand All @@ -953,20 +978,20 @@ def df_getitem_slice_idx_main_codelines(self):
return func_lines


def df_getitem_str_slice_codegen(self):
def df_getitem_slice_idx_codegen(self, idx):
"""
Example of generated implementation with provided index:
def _df_getitem_slice_idx_impl(self, idx):
res_index = self._index[idx]
def _df_getitem_slice_idx_impl(self, idx):
res_index = self._index
data_0 = get_dataframe_data(self, 0)
res_data_0 = pandas.Series(data_0[idx], index=res_index, name="A")
res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
data_1 = get_dataframe_data(self, 1)
res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index)
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
"""
func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
if self.columns:
func_lines += df_getitem_slice_idx_main_codelines(self)
func_lines += df_getitem_slice_idx_main_codelines(self, idx)
else:
# raise KeyError if input DF is empty
func_lines += [' raise KeyError']
Expand All @@ -978,14 +1003,52 @@ def _df_getitem_slice_idx_impl(self, idx):
return func_text, global_vars


def gen_df_getitem_slice_idx_impl(self):
func_text, global_vars = df_getitem_str_slice_codegen(self)
def df_getitem_tuple_idx_codegen(self, idx):
"""
Example of generated implementation with provided index:
def _df_getitem_tuple_idx_impl(self, idx):
res_index = self._index
data_1 = get_dataframe_data(self, 1)
res_data_1 = pandas.Series(data_1, index=res_index, name="B")
data_2 = get_dataframe_data(self, 2)
res_data_2 = pandas.Series(data_2, index=res_index, name="C")
return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index)
"""
func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):']
literal_idx = {col.literal_value for col in idx}
key_error = any(i not in self.columns for i in literal_idx)

loc_vars = {}
exec(func_text, global_vars, loc_vars)
_impl = loc_vars['_df_getitem_slice_idx_impl']
if self.columns and not key_error:
func_lines += df_getitem_tuple_idx_main_codelines(self, literal_idx)
else:
# raise KeyError if input DF is empty or idx is invalid
func_lines += [' raise KeyError']
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exception should really have a message; maybe something like "Column is not in the DataFrame" would fit?


func_text = '\n'.join(func_lines)
global_vars = {'pandas': pandas, 'numpy': numpy,
'get_dataframe_data': get_dataframe_data}

return func_text, global_vars


def gen_df_getitem_impl_generator(codegen, impl_name):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a very generic function that we could use everywhere else, so it might be useful to move it to sdc_typing_utils. Its responsibility would then be to call exec on the provided func_code and return the created function object.

"""Generate generator of df.getitem"""
def _df_getitem_impl_generator(self, idx):
func_text, global_vars = codegen(self, idx)

loc_vars = {}
exec(func_text, global_vars, loc_vars)
_impl = loc_vars[impl_name]

return _impl
return _impl

return _df_getitem_impl_generator


# Implementation generators for the slice and tuple idx cases. Each one pairs
# a codegen function with the name of the function its generated source defines,
# which is the name the factory looks up after exec-ing that source.
gen_df_getitem_slice_idx_impl = gen_df_getitem_impl_generator(
    codegen=df_getitem_slice_idx_codegen,
    impl_name='_df_getitem_slice_idx_impl',
)
gen_df_getitem_tuple_idx_impl = gen_df_getitem_impl_generator(
    codegen=df_getitem_tuple_idx_codegen,
    impl_name='_df_getitem_tuple_idx_impl',
)


@sdc_overload(operator.getitem)
Expand Down Expand Up @@ -1018,8 +1081,11 @@ def _df_getitem_unicode_idx_impl(self, idx):

return _df_getitem_unicode_idx_impl

if isinstance(idx, types.Tuple):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should probably be a stricter check, e.g.

Suggested change
if isinstance(idx, types.Tuple):
if (isinstance(idx, types.Tuple)
and all([isinstance(item, types.StringLiteral) for item in idx])):

return gen_df_getitem_tuple_idx_impl(self, idx)

if isinstance(idx, types.SliceType):
return gen_df_getitem_slice_idx_impl(self)
return gen_df_getitem_slice_idx_impl(self, idx)

ty_checker = TypeChecker('Operator getitem().')
ty_checker.raise_exc(idx, 'str', 'idx')
Expand Down
34 changes: 23 additions & 11 deletions sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1236,6 +1236,17 @@ def test_impl(df, start, end):
ref_result = test_impl(df, start, end)
pd.testing.assert_frame_equal(jit_result, ref_result)

@skip_sdc_jit('DF.getitem unsupported Series name')
def _test_df_getitem_tuple_idx(self, df):
def test_impl(df):
# pd.df.getitem does not support idx as a tuple
return df[['A', 'C']]

# SDC pd.df.getitem does not support idx as a list
sdc_func = self.jit(lambda df: df[('A', 'C')])
Comment on lines +1245 to +1246
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should state this as a limitation in our documentation too.
Also it might look better if you write it this way:

        test_impl = lambda df: df[['A', 'C']]
        sdc_func = self.jit(lambda df: df[('A', 'C')])

or use a do_jit parameter, e.g.

      # Pandas implementation does not support idx as a tuple
       # but SDC implementation only supports tuple, so adjust tested code
       def test_impl(df, do_jit):
           if do_jit == True:  # noqa
               return df[('A', 'C')]
           else:
               return df[['A', 'C']]
       sdc_func = self.jit(test_impl)

       pd.testing.assert_frame_equal(sdc_func(df, True), test_impl(df, False))


pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))

@skip_sdc_jit('DF.getitem unsupported exceptions')
def test_df_getitem_str_literal_idx_exception_key_error(self):
def test_impl(df):
Expand All @@ -1260,17 +1271,27 @@ def test_impl(df, idx):
with self.assertRaises(KeyError):
sdc_func(df, 'ABC')

@skip_sdc_jit('DF.getitem unsupported Series name')
def test_df_getitem_tuple_idx_exception_key_error(self):
    """Check that jitted df[('A', 'Z')] raises KeyError.

    Two inputs are exercised, each in its own subTest: a frame built by
    gen_df(test_global_input_data_float64) and an empty DataFrame.
    Presumably 'Z' is not a column of the generated frame - confirm
    against gen_df.
    """
    jitted_getitem = self.jit(lambda df: df[('A', 'Z')])

    frames = (gen_df(test_global_input_data_float64), pd.DataFrame())
    for frame in frames:
        # One subTest per frame so a failure reports which input broke.
        with self.subTest(df=frame), self.assertRaises(KeyError):
            jitted_getitem(frame)

@skip_sdc_jit('DF.getitem unsupported Series name')
def test_df_getitem_idx(self):
dfs = [gen_df(test_global_input_data_float64),
gen_df(test_global_input_data_float64, with_index=True),
pd.DataFrame({'A': []})]
pd.DataFrame({'A': [], 'B': [], 'C': []})]
for df in dfs:
with self.subTest(df=df):
self._test_df_getitem_str_literal_idx(df)
self._test_df_getitem_unicode_idx(df, 'A')
self._test_df_getitem_slice_idx(df)
self._test_df_getitem_unbox_slice_idx(df, 1, 3)
self._test_df_getitem_tuple_idx(df)

@skip_sdc_jit('DF.getitem unsupported Series name')
def test_df_getitem_idx_multiple_types(self):
Expand All @@ -1284,6 +1305,7 @@ def test_df_getitem_idx_multiple_types(self):
self._test_df_getitem_unicode_idx(df, 'A')
self._test_df_getitem_slice_idx(df)
self._test_df_getitem_unbox_slice_idx(df, 1, 3)
self._test_df_getitem_tuple_idx(df)

@unittest.skip('DF.getitem unsupported integer columns')
def test_df_getitem_int_literal_idx(self):
Expand All @@ -1295,16 +1317,6 @@ def test_impl(df):

pd.testing.assert_series_equal(sdc_func(df), test_impl(df))

@unittest.skip('DF.getitem unsupported idx as a tuple')
def test_df_getitem_unicode_tuple_idx(self):
    """Compare jitted df[('A', 'B')] with pandas df[['A', 'B']].

    The jitted function indexes with a tuple while the pandas reference
    indexes with a list; both results must be equal frames. Currently
    skipped via unittest.skip.
    """
    jitted_getitem = self.jit(lambda df: df[('A', 'B')])
    frame = gen_df(test_global_input_data_float64)

    # Evaluate the jitted result first, then the pandas reference.
    actual = jitted_getitem(frame)
    expected = frame[['A', 'B']]
    pd.testing.assert_frame_equal(actual, expected)

@skip_numba_jit
def test_isin_df1(self):
def test_impl(df, df2):
Expand Down