-
Notifications
You must be signed in to change notification settings - Fork 62
Overload df.getitem with idx of tuple #573
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -929,21 +929,46 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns): | |||||||||
| return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def df_getitem_slice_idx_main_codelines(self): | ||||||||||
| """Generate main code lines for df.getitem""" | ||||||||||
| def df_index_codelines(self): | ||||||||||
| """Generate code lines to get or create index of DF""" | ||||||||||
| if isinstance(self.index, types.NoneType): | ||||||||||
| func_lines = [' length = len(get_dataframe_data(self, 0))', | ||||||||||
| ' _index = numpy.arange(length)', | ||||||||||
| ' res_index = _index[idx]'] | ||||||||||
| ' res_index = _index'] | ||||||||||
| else: | ||||||||||
| func_lines = [' res_index = self._index[idx]'] | ||||||||||
| func_lines = [' res_index = self._index'] | ||||||||||
|
|
||||||||||
| return func_lines | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def df_getitem_slice_idx_main_codelines(self, idx): | ||||||||||
| """Generate main code lines for df.getitem with idx of slice""" | ||||||||||
| results = [] | ||||||||||
| func_lines = df_index_codelines(self) | ||||||||||
| for i, col in enumerate(self.columns): | ||||||||||
| res_data = f'res_data_{i}' | ||||||||||
| func_lines += [ | ||||||||||
| f' data_{i} = get_dataframe_data(self, {i})', | ||||||||||
| f' {res_data} = pandas.Series(data_{i}[idx], index=res_index, name="{col}")' | ||||||||||
| f' {res_data} = pandas.Series(data_{i}[idx], index=res_index[idx], name="{col}")' | ||||||||||
| ] | ||||||||||
| results.append((col, res_data)) | ||||||||||
|
|
||||||||||
| data = ', '.join(f'"{col}": {data}' for col, data in results) | ||||||||||
| func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index[idx])'] | ||||||||||
|
|
||||||||||
| return func_lines | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def df_getitem_tuple_idx_main_codelines(self, literal_idx): | ||||||||||
| """Generate main code lines for df.getitem with idx of tuple""" | ||||||||||
| results = [] | ||||||||||
| func_lines = df_index_codelines(self) | ||||||||||
| needed_cols = {col: i for i, col in enumerate(self.columns) if col in literal_idx} | ||||||||||
| for col, i in needed_cols.items(): | ||||||||||
| res_data = f'res_data_{i}' | ||||||||||
| func_lines += [ | ||||||||||
| f' data_{i} = get_dataframe_data(self, {i})', | ||||||||||
| f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")' | ||||||||||
| ] | ||||||||||
| results.append((col, res_data)) | ||||||||||
|
|
||||||||||
|
|
@@ -953,20 +978,20 @@ def df_getitem_slice_idx_main_codelines(self): | |||||||||
| return func_lines | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def df_getitem_str_slice_codegen(self): | ||||||||||
| def df_getitem_slice_idx_codegen(self, idx): | ||||||||||
| """ | ||||||||||
| Example of generated implementation with provided index: | ||||||||||
| def _df_getitem_slice_idx_impl(self, idx): | ||||||||||
| res_index = self._index[idx] | ||||||||||
| def _df_getitem_slice_idx_impl(self, idx) | ||||||||||
| res_index = self._index | ||||||||||
| data_0 = get_dataframe_data(self, 0) | ||||||||||
| res_data_0 = pandas.Series(data_0[idx], index=res_index, name="A") | ||||||||||
| res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A") | ||||||||||
| data_1 = get_dataframe_data(self, 1) | ||||||||||
| res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B") | ||||||||||
| return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index) | ||||||||||
| return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx]) | ||||||||||
| """ | ||||||||||
| func_lines = ['def _df_getitem_slice_idx_impl(self, idx):'] | ||||||||||
| if self.columns: | ||||||||||
| func_lines += df_getitem_slice_idx_main_codelines(self) | ||||||||||
| func_lines += df_getitem_slice_idx_main_codelines(self, idx) | ||||||||||
| else: | ||||||||||
| # raise KeyError if input DF is empty | ||||||||||
| func_lines += [' raise KeyError'] | ||||||||||
|
|
@@ -978,14 +1003,52 @@ def _df_getitem_slice_idx_impl(self, idx): | |||||||||
| return func_text, global_vars | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def gen_df_getitem_slice_idx_impl(self): | ||||||||||
| func_text, global_vars = df_getitem_str_slice_codegen(self) | ||||||||||
| def df_getitem_tuple_idx_codegen(self, idx): | ||||||||||
| """ | ||||||||||
| Example of generated implementation with provided index: | ||||||||||
| def _df_getitem_tuple_idx_impl(self, idx) | ||||||||||
| res_index = self._index | ||||||||||
| data_1 = get_dataframe_data(self, 1) | ||||||||||
| res_data_1 = pandas.Series(data_1, index=res_index, name="B") | ||||||||||
| data_2 = get_dataframe_data(self, 2) | ||||||||||
| res_data_2 = pandas.Series(data_2, index=res_index, name="C") | ||||||||||
| return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index) | ||||||||||
| """ | ||||||||||
| func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):'] | ||||||||||
| literal_idx = {col.literal_value for col in idx} | ||||||||||
| key_error = any(i not in self.columns for i in literal_idx) | ||||||||||
|
|
||||||||||
| loc_vars = {} | ||||||||||
| exec(func_text, global_vars, loc_vars) | ||||||||||
| _impl = loc_vars['_df_getitem_slice_idx_impl'] | ||||||||||
| if self.columns and not key_error: | ||||||||||
| func_lines += df_getitem_tuple_idx_main_codelines(self, literal_idx) | ||||||||||
| else: | ||||||||||
| # raise KeyError if input DF is empty or idx is invalid | ||||||||||
| func_lines += [' raise KeyError'] | ||||||||||
|
|
||||||||||
| func_text = '\n'.join(func_lines) | ||||||||||
| global_vars = {'pandas': pandas, 'numpy': numpy, | ||||||||||
| 'get_dataframe_data': get_dataframe_data} | ||||||||||
|
|
||||||||||
| return func_text, global_vars | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def gen_df_getitem_impl_generator(codegen, impl_name): | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This looks like a very generic function that we could use everywhere else, so it might be useful to move it to sdc_typing_utils. Its responsibility would be to call exec on the provided func_code and return the created func object. |
||||||||||
| """Generate generator of df.getitem""" | ||||||||||
| def _df_getitem_impl_generator(self, idx): | ||||||||||
| func_text, global_vars = codegen(self, idx) | ||||||||||
|
|
||||||||||
| loc_vars = {} | ||||||||||
| exec(func_text, global_vars, loc_vars) | ||||||||||
| _impl = loc_vars[impl_name] | ||||||||||
|
|
||||||||||
| return _impl | ||||||||||
| return _impl | ||||||||||
|
|
||||||||||
| return _df_getitem_impl_generator | ||||||||||
|
|
||||||||||
|
|
||||||||||
| gen_df_getitem_slice_idx_impl = gen_df_getitem_impl_generator( | ||||||||||
| df_getitem_slice_idx_codegen, '_df_getitem_slice_idx_impl') | ||||||||||
| gen_df_getitem_tuple_idx_impl = gen_df_getitem_impl_generator( | ||||||||||
| df_getitem_tuple_idx_codegen, '_df_getitem_tuple_idx_impl') | ||||||||||
|
|
||||||||||
|
|
||||||||||
| @sdc_overload(operator.getitem) | ||||||||||
|
|
@@ -1018,8 +1081,11 @@ def _df_getitem_unicode_idx_impl(self, idx): | |||||||||
|
|
||||||||||
| return _df_getitem_unicode_idx_impl | ||||||||||
|
|
||||||||||
| if isinstance(idx, types.Tuple): | ||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should probably be a stricter check, e.g.
Suggested change
|
||||||||||
| return gen_df_getitem_tuple_idx_impl(self, idx) | ||||||||||
|
|
||||||||||
| if isinstance(idx, types.SliceType): | ||||||||||
| return gen_df_getitem_slice_idx_impl(self) | ||||||||||
| return gen_df_getitem_slice_idx_impl(self, idx) | ||||||||||
|
|
||||||||||
| ty_checker = TypeChecker('Operator getitem().') | ||||||||||
| ty_checker.raise_exc(idx, 'str', 'idx') | ||||||||||
|
|
||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1236,6 +1236,17 @@ def test_impl(df, start, end): | |
| ref_result = test_impl(df, start, end) | ||
| pd.testing.assert_frame_equal(jit_result, ref_result) | ||
|
|
||
| @skip_sdc_jit('DF.getitem unsupported Series name') | ||
| def _test_df_getitem_tuple_idx(self, df): | ||
| def test_impl(df): | ||
| # pd.df.getitem does not support idx as a tuple | ||
| return df[['A', 'C']] | ||
|
|
||
| # SDC pd.df.getitem does not support idx as a list | ||
| sdc_func = self.jit(lambda df: df[('A', 'C')]) | ||
|
Comment on lines
+1245
to
+1246
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should state this as a limitation in our documentation too, or use the do_jit parameter, e.g. |
||
|
|
||
| pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) | ||
|
|
||
| @skip_sdc_jit('DF.getitem unsupported exceptions') | ||
| def test_df_getitem_str_literal_idx_exception_key_error(self): | ||
| def test_impl(df): | ||
|
|
@@ -1260,17 +1271,27 @@ def test_impl(df, idx): | |
| with self.assertRaises(KeyError): | ||
| sdc_func(df, 'ABC') | ||
|
|
||
| @skip_sdc_jit('DF.getitem unsupported Series name') | ||
| def test_df_getitem_tuple_idx_exception_key_error(self): | ||
| sdc_func = self.jit(lambda df: df[('A', 'Z')]) | ||
|
|
||
| for df in [gen_df(test_global_input_data_float64), pd.DataFrame()]: | ||
| with self.subTest(df=df): | ||
| with self.assertRaises(KeyError): | ||
| sdc_func(df) | ||
|
|
||
| @skip_sdc_jit('DF.getitem unsupported Series name') | ||
| def test_df_getitem_idx(self): | ||
| dfs = [gen_df(test_global_input_data_float64), | ||
| gen_df(test_global_input_data_float64, with_index=True), | ||
| pd.DataFrame({'A': []})] | ||
| pd.DataFrame({'A': [], 'B': [], 'C': []})] | ||
| for df in dfs: | ||
| with self.subTest(df=df): | ||
| self._test_df_getitem_str_literal_idx(df) | ||
| self._test_df_getitem_unicode_idx(df, 'A') | ||
| self._test_df_getitem_slice_idx(df) | ||
| self._test_df_getitem_unbox_slice_idx(df, 1, 3) | ||
| self._test_df_getitem_tuple_idx(df) | ||
|
|
||
| @skip_sdc_jit('DF.getitem unsupported Series name') | ||
| def test_df_getitem_idx_multiple_types(self): | ||
|
|
@@ -1284,6 +1305,7 @@ def test_df_getitem_idx_multiple_types(self): | |
| self._test_df_getitem_unicode_idx(df, 'A') | ||
| self._test_df_getitem_slice_idx(df) | ||
| self._test_df_getitem_unbox_slice_idx(df, 1, 3) | ||
| self._test_df_getitem_tuple_idx(df) | ||
|
|
||
| @unittest.skip('DF.getitem unsupported integer columns') | ||
| def test_df_getitem_int_literal_idx(self): | ||
|
|
@@ -1295,16 +1317,6 @@ def test_impl(df): | |
|
|
||
| pd.testing.assert_series_equal(sdc_func(df), test_impl(df)) | ||
|
|
||
| @unittest.skip('DF.getitem unsupported idx as a tuple') | ||
| def test_df_getitem_unicode_tuple_idx(self): | ||
| def test_impl(df): | ||
| return df[['A', 'B']] | ||
|
|
||
| sdc_func = self.jit(lambda df: df[('A', 'B')]) | ||
| df = gen_df(test_global_input_data_float64) | ||
|
|
||
| pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) | ||
|
|
||
| @skip_numba_jit | ||
| def test_isin_df1(self): | ||
| def test_impl(df, df2): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The exception should really have a message; maybe something like "Column is not in the DataFrame" would fit?