From 5dca3455bcf673fd4f47a235bcf6e17b3525e856 Mon Sep 17 00:00:00 2001 From: akharche Date: Wed, 22 Jan 2020 13:43:17 +0300 Subject: [PATCH 1/2] Add index to DataFrame.drop() --- .../hpat_pandas_dataframe_functions.py | 4 +- sdc/tests/test_dataframe.py | 69 +++++++++++++++---- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 35baec3aa..58f7e468a 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -790,8 +790,8 @@ def sdc_pandas_dataframe_drop_impl(df, labels=None, axis=0, index=None, columns= column_list.append((f'new_col_{column}_data_df', column)) data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list) - # TODO: Handle index - func_text.append(f"return pandas.DataFrame({{{data}}})\n") + func_text.append(f'return pandas.DataFrame({{{data}}}, index=df.index)\n') + func_definition.extend([indent + func_line for func_line in func_text]) func_def = '\n'.join(func_definition) diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index a44b5c7a1..f077ec765 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -53,6 +53,9 @@ def inner_get_column(df): COL_IND = 0 +global_index_to_test = [[1, 2, 3, 4], + [.1, .2, .3, .4], + ['a', 'b', 'c', 'd']] class TestDataFrame(TestCase): @@ -1073,29 +1076,69 @@ def test_impl(df): h_out = hpat_func(df) pd.testing.assert_frame_equal(out, h_out) - def test_df_drop_one_column(self): + def test_df_drop_one_column_unboxing(self): def test_impl(df): return df.drop(columns='A') - df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}) - hpat_func = self.jit(test_impl) - pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) + index_to_test = global_index_to_test + [None] - @skip_sdc_jit - def test_df_drop_tuple_column(self): - # Pandas supports only list as a parameter + sdc_func = self.jit(test_impl) + + for index in index_to_test: + with self.subTest(index=index): + df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, + index=index) + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) + + def test_df_drop_one_column(self): + def test_impl(index): + df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, + index=index) + return df.drop(columns='A') + + sdc_func = self.jit(test_impl) + + for index in global_index_to_test: + with self.subTest(index=index): + pd.testing.assert_frame_equal(sdc_func(index), test_impl(index)) + + def test_df_drop_tuple_column_unboxing(self): def test_impl(df): - return df.drop(columns=['A', 'B']) + return df.drop(columns=['A', 'C']) # Numba supports only tuple iteration def test_sdc_impl(df): - return df.drop(columns=('A', 'B')) + return df.drop(columns=('A', 'C')) - df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}) - hpat_func = self.jit(test_sdc_impl) - pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) + index_to_test = global_index_to_test + [None] + + sdc_func = self.jit(test_sdc_impl) + + for index in index_to_test: + with self.subTest(index=index): + df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, + index=index) + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) + + def test_df_drop_tuple_column(self): + def test_impl(index): + df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, + index=index) + return df.drop(columns=['A', 'C']) + + # Numba supports only tuple iteration + def test_sdc_impl(index): + df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, + index=index) + return df.drop(columns=('A', 'C')) + + sdc_func = self.jit(test_sdc_impl) + + for index in global_index_to_test: + with self.subTest(index=index): + pd.testing.assert_frame_equal(sdc_func(index), test_impl(index)) - @unittest.skip("Implement Index for DataFrames") + @unittest.skip("Error in return empty DataFrame") def test_df_drop_tuple_columns_all(self): def test_impl(df): return df.drop(columns=['A', 'B', 'C']) From 56e0d8d12a9db17e533387e4d1e7f0ae45359727 Mon Sep 17 00:00:00 2001 From: akharche Date: Mon, 27 Jan 2020 17:32:35 +0300 Subject: [PATCH 2/2] Fix segfault in boxing --- .../hpat_pandas_dataframe_functions.py | 5 ++- sdc/hiframes/boxing.py | 13 +++--- sdc/tests/test_dataframe.py | 41 ++++++++++++++----- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py index 58f7e468a..e901fb316 100644 --- a/sdc/datatypes/hpat_pandas_dataframe_functions.py +++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py @@ -790,8 +790,9 @@ def sdc_pandas_dataframe_drop_impl(df, labels=None, axis=0, index=None, columns= column_list.append((f'new_col_{column}_data_df', column)) data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list) - func_text.append(f'return pandas.DataFrame({{{data}}}, index=df.index)\n') + index = 'df.index' + func_text.append(f'return pandas.DataFrame({{{data}}}, index={index})\n') func_definition.extend([indent + func_line for func_line in func_text]) func_def = '\n'.join(func_definition) @@ -842,7 +843,7 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None, ************************************************* Pandas DataFrame method :meth:`pandas.DataFrame.drop` implementation. .. only:: developer - Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_drop* + Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_df_drop* Parameters ----------- df: :obj:`pandas.DataFrame` diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py index 6b2a69113..f0e6c2d35 100644 --- a/sdc/hiframes/boxing.py +++ b/sdc/hiframes/boxing.py @@ -224,7 +224,13 @@ def box_dataframe(typ, val, c): mod_name = context.insert_const_string(c.builder.module, "pandas") class_obj = pyapi.import_module_noblock(mod_name) - df_obj = pyapi.call_method(class_obj, "DataFrame", ()) + + # set df.index if necessary + if typ.index != types.none: + arr_obj = _box_series_data(typ.index.dtype, typ.index, dataframe.index, c) + df_obj = pyapi.call_method(class_obj, "DataFrame", (c.pyapi.make_none(), arr_obj)) + else: + df_obj = pyapi.call_method(class_obj, "DataFrame", ()) for i, cname, arr, arr_typ, dtype in zip(range(n_cols), col_names, col_arrs, arr_typs, dtypes): # df['cname'] = boxed_arr @@ -262,11 +268,6 @@ def box_dataframe(typ, val, c): # pyapi.decref(arr_obj) pyapi.decref(cname_obj) - # set df.index if necessary - if typ.index != types.none: - arr_obj = _box_series_data(typ.index.dtype, typ.index, dataframe.index, c) - pyapi.object_setattr_string(df_obj, 'index', arr_obj) - pyapi.decref(class_obj) # pyapi.gil_release(gil_state) # release GIL return df_obj diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py index 2b70916ec..c575f027e 100644 --- a/sdc/tests/test_dataframe.py +++ b/sdc/tests/test_dataframe.py @@ -56,9 +56,6 @@ def inner_get_column(df): COL_IND = 0 -global_index_to_test = [[1, 2, 3, 4], - [.1, .2, .3, .4], - ['a', 'b', 'c', 'd']] class TestDataFrame(TestCase): @@ -1084,7 +1081,10 @@ def test_df_drop_one_column_unboxing(self): def test_impl(df): return df.drop(columns='A') - index_to_test = global_index_to_test + [None] + index_to_test = [[1, 2, 3, 4], + [.1, .2, .3, .4], + None, + ['a', 'b', 'c', 'd']] sdc_func = self.jit(test_impl) @@ -1100,9 +1100,13 @@ def test_impl(index): index=index) return df.drop(columns='A') + index_to_test = [[1, 2, 3, 4], + [.1, .2, .3, .4], + ['a', 'b', 'c', 'd']] + sdc_func = self.jit(test_impl) - for index in global_index_to_test: + for index in index_to_test: with self.subTest(index=index): pd.testing.assert_frame_equal(sdc_func(index), test_impl(index)) @@ -1114,7 +1118,10 @@ def test_impl(df): def test_sdc_impl(df): return df.drop(columns=('A', 'C')) - index_to_test = global_index_to_test + [None] + index_to_test = [[1, 2, 3, 4], + [.1, .2, .3, .4], + None, + ['a', 'b', 'c', 'd']] sdc_func = self.jit(test_sdc_impl) @@ -1136,13 +1143,16 @@ def test_sdc_impl(index): index=index) return df.drop(columns=('A', 'C')) + index_to_test = [[1, 2, 3, 4], + [.1, .2, .3, .4], + ['a', 'b', 'c', 'd']] + sdc_func = self.jit(test_sdc_impl) - for index in global_index_to_test: + for index in index_to_test: with self.subTest(index=index): pd.testing.assert_frame_equal(sdc_func(index), test_impl(index)) - @unittest.skip("Error in return empty DataFrame") def test_df_drop_tuple_columns_all(self): def test_impl(df): return df.drop(columns=['A', 'B', 'C']) @@ -1151,9 +1161,18 @@ def test_impl(df): def test_sdc_impl(df): return df.drop(columns=('A', 'B', 'C')) - df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}) - hpat_func = self.jit(test_sdc_impl) - pd.testing.assert_frame_equal(hpat_func(df), test_impl(df)) + index_to_test = [[1, 2, 3, 4], + [.1, .2, .3, .4], + None, + ['a', 'b', 'c', 'd']] + + sdc_func = self.jit(test_sdc_impl) + + for index in index_to_test: + with self.subTest(index=index): + df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]}, + index=index) + pd.testing.assert_frame_equal(sdc_func(df), test_impl(df)) @skip_sdc_jit def test_df_drop_by_column_errors_ignore(self):