From 71bf261412847c105f14ac5c9c4491036ac8d8a5 Mon Sep 17 00:00:00 2001 From: Denis Date: Thu, 6 Feb 2020 15:59:26 +0300 Subject: [PATCH] Overload df.getitem with idx of tuple --- .../hpat_pandas_dataframe_functions.py | 102 ++++++++++++++---- sdc/tests/test_dataframe.py | 34 ++++-- 2 files changed, 107 insertions(+), 29 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 362ea02d3..a30b08b7c 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -929,21 +929,46 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns): return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns) -def df_getitem_slice_idx_main_codelines(self): - """Generate main code lines for df.getitem""" +def df_index_codelines(self): + """Generate code lines to get or create index of DF""" if isinstance(self.index, types.NoneType): func_lines = [' length = len(get_dataframe_data(self, 0))', ' _index = numpy.arange(length)', - ' res_index = _index[idx]'] + ' res_index = _index'] else: - func_lines = [' res_index = self._index[idx]'] + func_lines = [' res_index = self._index'] + return func_lines + + +def df_getitem_slice_idx_main_codelines(self, idx): + """Generate main code lines for df.getitem with idx of slice""" results = [] + func_lines = df_index_codelines(self) for i, col in enumerate(self.columns): res_data = f'res_data_{i}' func_lines += [ f' data_{i} = get_dataframe_data(self, {i})', - f' {res_data} = pandas.Series(data_{i}[idx], index=res_index, name="{col}")' + f' {res_data} = pandas.Series(data_{i}[idx], index=res_index[idx], name="{col}")' + ] + results.append((col, res_data)) + + data = ', '.join(f'"{col}": {data}' for col, data in results) + func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index[idx])'] + + return func_lines + + +def df_getitem_tuple_idx_main_codelines(self, literal_idx): + """Generate 
main code lines for df.getitem with idx of tuple""" + results = [] + func_lines = df_index_codelines(self) + needed_cols = {col: i for i, col in enumerate(self.columns) if col in literal_idx} + for col, i in needed_cols.items(): + res_data = f'res_data_{i}' + func_lines += [ + f' data_{i} = get_dataframe_data(self, {i})', + f' {res_data} = pandas.Series(data_{i}, index=res_index, name="{col}")' ] results.append((col, res_data)) @@ -953,20 +978,20 @@ def df_getitem_slice_idx_main_codelines(self): return func_lines -def df_getitem_str_slice_codegen(self): +def df_getitem_slice_idx_codegen(self, idx): """ Example of generated implementation with provided index: - def _df_getitem_slice_idx_impl(self, idx): - res_index = self._index[idx] + def _df_getitem_slice_idx_impl(self, idx): + res_index = self._index data_0 = get_dataframe_data(self, 0) - res_data_0 = pandas.Series(data_0[idx], index=res_index, name="A") + res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A") data_1 = get_dataframe_data(self, 1) res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B") - return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index) + return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx]) """ func_lines = ['def _df_getitem_slice_idx_impl(self, idx):'] if self.columns: - func_lines += df_getitem_slice_idx_main_codelines(self) + func_lines += df_getitem_slice_idx_main_codelines(self, idx) else: # raise KeyError if input DF is empty func_lines += [' raise KeyError'] @@ -978,14 +1003,52 @@ def _df_getitem_slice_idx_impl(self, idx): return func_text, global_vars -def gen_df_getitem_slice_idx_impl(self): - func_text, global_vars = df_getitem_str_slice_codegen(self) +def df_getitem_tuple_idx_codegen(self, idx): + """ + Example of generated implementation with provided index: + def _df_getitem_tuple_idx_impl(self, idx): + res_index = self._index + data_1 = get_dataframe_data(self, 1) + res_data_1 = pandas.Series(data_1, 
index=res_index, name="B") + data_2 = get_dataframe_data(self, 2) + res_data_2 = pandas.Series(data_2, index=res_index, name="C") + return pandas.DataFrame({"B": res_data_1, "C": res_data_2}, index=res_index) + """ + func_lines = ['def _df_getitem_tuple_idx_impl(self, idx):'] + literal_idx = {col.literal_value for col in idx} + key_error = any(i not in self.columns for i in literal_idx) - loc_vars = {} - exec(func_text, global_vars, loc_vars) - _impl = loc_vars['_df_getitem_slice_idx_impl'] + if self.columns and not key_error: + func_lines += df_getitem_tuple_idx_main_codelines(self, literal_idx) + else: + # raise KeyError if input DF is empty or idx is invalid + func_lines += [' raise KeyError'] + + func_text = '\n'.join(func_lines) + global_vars = {'pandas': pandas, 'numpy': numpy, + 'get_dataframe_data': get_dataframe_data} + + return func_text, global_vars + + +def gen_df_getitem_impl_generator(codegen, impl_name): + """Generate generator of df.getitem""" + def _df_getitem_impl_generator(self, idx): + func_text, global_vars = codegen(self, idx) + + loc_vars = {} + exec(func_text, global_vars, loc_vars) + _impl = loc_vars[impl_name] - return _impl + return _impl + + return _df_getitem_impl_generator + + +gen_df_getitem_slice_idx_impl = gen_df_getitem_impl_generator( + df_getitem_slice_idx_codegen, '_df_getitem_slice_idx_impl') +gen_df_getitem_tuple_idx_impl = gen_df_getitem_impl_generator( + df_getitem_tuple_idx_codegen, '_df_getitem_tuple_idx_impl') @sdc_overload(operator.getitem) @@ -1018,8 +1081,11 @@ def _df_getitem_unicode_idx_impl(self, idx): return _df_getitem_unicode_idx_impl + if isinstance(idx, types.Tuple): + return gen_df_getitem_tuple_idx_impl(self, idx) + if isinstance(idx, types.SliceType): - return gen_df_getitem_slice_idx_impl(self) + return gen_df_getitem_slice_idx_impl(self, idx) ty_checker = TypeChecker('Operator getitem().') ty_checker.raise_exc(idx, 'str', 'idx') diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 
2f2b4bc08..dd4e2853d 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -1236,6 +1236,17 @@ def test_impl(df, start, end): ref_result = test_impl(df, start, end) pd.testing.assert_frame_equal(jit_result, ref_result) + @skip_sdc_jit('DF.getitem unsupported Series name') + def _test_df_getitem_tuple_idx(self, df): + def test_impl(df): + # pd.df.getitem does not support idx as a tuple + return df[['A', 'C']] + + # SDC pd.df.getitem does not support idx as a list + sdc_func = self.jit(lambda df: df[('A', 'C')]) + + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) + @skip_sdc_jit('DF.getitem unsupported exceptions') def test_df_getitem_str_literal_idx_exception_key_error(self): def test_impl(df): @@ -1260,17 +1271,27 @@ def test_impl(df, idx): with self.assertRaises(KeyError): sdc_func(df, 'ABC') + @skip_sdc_jit('DF.getitem unsupported Series name') + def test_df_getitem_tuple_idx_exception_key_error(self): + sdc_func = self.jit(lambda df: df[('A', 'Z')]) + + for df in [gen_df(test_global_input_data_float64), pd.DataFrame()]: + with self.subTest(df=df): + with self.assertRaises(KeyError): + sdc_func(df) + @skip_sdc_jit('DF.getitem unsupported Series name') def test_df_getitem_idx(self): dfs = [gen_df(test_global_input_data_float64), gen_df(test_global_input_data_float64, with_index=True), - pd.DataFrame({'A': []})] + pd.DataFrame({'A': [], 'B': [], 'C': []})] for df in dfs: with self.subTest(df=df): self._test_df_getitem_str_literal_idx(df) self._test_df_getitem_unicode_idx(df, 'A') self._test_df_getitem_slice_idx(df) self._test_df_getitem_unbox_slice_idx(df, 1, 3) + self._test_df_getitem_tuple_idx(df) @skip_sdc_jit('DF.getitem unsupported Series name') def test_df_getitem_idx_multiple_types(self): @@ -1284,6 +1305,7 @@ def test_df_getitem_idx_multiple_types(self): self._test_df_getitem_unicode_idx(df, 'A') self._test_df_getitem_slice_idx(df) self._test_df_getitem_unbox_slice_idx(df, 1, 3) + self._test_df_getitem_tuple_idx(df) 
@unittest.skip('DF.getitem unsupported integer columns') def test_df_getitem_int_literal_idx(self): @@ -1295,16 +1317,6 @@ def test_impl(df): pd.testing.assert_series_equal(sdc_func(df), test_impl(df)) - @unittest.skip('DF.getitem unsupported idx as a tuple') - def test_df_getitem_unicode_tuple_idx(self): - def test_impl(df): - return df[['A', 'B']] - - sdc_func = self.jit(lambda df: df[('A', 'B')]) - df = gen_df(test_global_input_data_float64) - - pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) - @skip_numba_jit def test_isin_df1(self): def test_impl(df, df2):