Fix wrong overload handling of the 'on' argument in pandas.merge

Problem description: the merge_overload and merge_asof_overload functions
use the value of the 'on' argument to compute the 'left_on' and 'right_on'
kwargs of the join function that provides the implementation. That function
relies on these variables being either lists or constant string literals,
and in the latter case simply copying on's value inside a runtime
'if on is not None:' branch loses the constant, so the typed dataframe pass
cannot resolve the join key.

Error:
  File "../hpat/hiframes/dataframe_pass.py", line 202, in _run_assign
    return self._run_call(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 522, in _run_call
    return self._run_call_join(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 1488, in
_run_call_join
    left_on = self._get_const_or_list(left_on_var)
  File "../hpat/hiframes/dataframe_pass.py", line 2135, in
_get_const_or_list
    raise ValueError(err_msg)
ValueError: Failed in hpat mode pipeline (step: typed dataframe pass)
None

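The fix resolves the None-check at overload time, before the implementation
is compiled, instead of branching on 'on' at runtime inside it. Below is a
minimal, self-contained sketch of this pattern; it is not the hpat code, the
names column_or_default and use_key are invented for illustration, and for
brevity the overloaded function just returns either the array or the key:

    import numpy as np
    from numba import njit, types
    from numba.extending import overload


    def column_or_default(arr, on=None):
        # pure-Python fallback; jitted callers use the overload below
        return arr if on is None else on


    @overload(column_or_default)
    def column_or_default_overload(arr, on=None):
        # here 'on' is what numba inferred for the argument (a type object),
        # or the plain default None when the caller omitted it, so the check
        # is decided once, before the returned _impl is compiled
        on_is_none = on is None or isinstance(on, types.NoneType)

        if on_is_none:
            def _impl(arr, on=None):
                return arr
        else:
            def _impl(arr, on=None):
                # 'on' is used unconditionally, so a constant string key
                # stays visible as a constant to later compiler passes
                return on
        return _impl


    @njit
    def use_key(arr):
        return column_or_default(arr, on='key')


    print(use_key(np.zeros(3)))  # prints: key

The commit gets the same effect with a single _impl: it computes the boolean
onHasNoneType in the enclosing overload function and branches on that
captured compile-time constant inside _impl, as the diff below shows.
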
The following tests are fixed by this commit (all of them follow the pattern sketched after the list):
    test_join_cat1 (hpat.tests.test_join.TestJoin)
    test_join_cat2 (hpat.tests.test_join.TestJoin)
    test_join_cat_parallel1 (hpat.tests.test_join.TestJoin)
    test_join_datetime_parallel1 (hpat.tests.test_join.TestJoin)
    test_join_datetime_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq2 (hpat.tests.test_join.TestJoin)
    test_join_outer_seq1 (hpat.tests.test_join.TestJoin)
    test_join_right_seq1 (hpat.tests.test_join.TestJoin)
    test_merge_asof_parallel1 (hpat.tests.test_join.TestJoin)
    test_merge_asof_seq1 (hpat.tests.test_join.TestJoin)
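
All of them exercise the same shape of user code, also visible in the test
bodies in the diff below: pd.merge or pd.merge_asof called with a constant
string 'on' key inside a function compiled with hpat.jit. Schematically (the
frames and the 'key' column here are made-up placeholders rather than taken
from a specific test):

    import numpy as np
    import pandas as pd
    import hpat


    def test_impl(df1, df2):
        # 'on' is a constant string literal; the overload has to turn it into
        # constant left_on/right_on keys for the typed dataframe pass
        return pd.merge(df1, df2, on='key')

    hpat_func = hpat.jit(test_impl)

    n = 11
    df1 = pd.DataFrame({'key': np.arange(n), 'A': np.arange(n) + 1.0})
    df2 = pd.DataFrame({'key': np.arange(n), 'B': np.arange(n) + 2.0})

    # the jitted and the plain-pandas results should agree (the real tests
    # compare them with assert_frame_equal or set-based checks)
    print(hpat_func(df1, df2))
    print(test_impl(df1, df2))
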
kozlov-alexey committed Aug 1, 2019 (commit 571f3f2, parent 6640831)
Showing 2 changed files with 8 additions and 24 deletions.
10 changes: 8 additions & 2 deletions hpat/hiframes/pd_dataframe_ext.py
@@ -628,10 +628,13 @@ def merge_overload(left, right, how='inner', on=None, left_on=None,
         right_on=None, left_index=False, right_index=False, sort=False,
         suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
 
+    # check if on's inferred type is NoneType and store the result,
+    # use it later to branch based on the value available at compile time
+    onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
     def _impl(left, right, how='inner', on=None, left_on=None,
             right_on=None, left_index=False, right_index=False, sort=False,
             suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
-        if on is not None:
+        if not onHasNoneType:
             left_on = right_on = on
 
         return hpat.hiframes.api.join_dummy(
@@ -645,11 +648,14 @@ def merge_asof_overload(left, right, on=None, left_on=None, right_on=None,
         right_by=None, suffixes=('_x', '_y'), tolerance=None,
         allow_exact_matches=True, direction='backward'):
 
+    # check if on's inferred type is NoneType and store the result,
+    # use it later to branch based on the value available at compile time
+    onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
     def _impl(left, right, on=None, left_on=None, right_on=None,
             left_index=False, right_index=False, by=None, left_by=None,
             right_by=None, suffixes=('_x', '_y'), tolerance=None,
             allow_exact_matches=True, direction='backward'):
-        if on is not None:
+        if not onHasNoneType:
             left_on = right_on = on
 
         return hpat.hiframes.api.join_dummy(

22 changes: 0 additions & 22 deletions hpat/tests/test_join.py
@@ -169,8 +169,6 @@ def test_impl(A1, B1, C1, A2, B2, D2):
         self.assertEqual(h_res, p_res)
         self.assertEqual(count_array_OneDs(), 3)
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_datetime_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, on='time')
@@ -184,8 +182,6 @@ def test_impl(df1, df2):
             ['2017-01-01', '2017-01-06', '2017-01-03']), 'A': [7, 8, 9]})
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_datetime_parallel1(self):
         def test_impl(df1, df2):
             df3 = pd.merge(df1, df2, on='time')
@@ -206,8 +202,6 @@ def test_impl(df1, df2):
         self.assertEqual(count_array_REPs(), 0)
         self.assertEqual(count_parfor_REPs(), 0)
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_merge_asof_seq1(self):
         def test_impl(df1, df2):
             return pd.merge_asof(df1, df2, on='time')
@@ -222,8 +216,6 @@ def test_impl(df1, df2):
             '2017-02-25']), 'A': [2,3,7,8,9]})
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_merge_asof_parallel1(self):
         def test_impl():
             df1 = pd.read_parquet('asof1.pq')
@@ -234,8 +226,6 @@ def test_impl():
         hpat_func = hpat.jit(test_impl)
         self.assertEqual(hpat_func(), test_impl())
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_left_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='left', on='key')
@@ -253,8 +243,6 @@ def test_impl(df1, df2):
         self.assertEqual(
             set(h_res.B.dropna().values), set(res.B.dropna().values))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_left_seq2(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='left', on='key')
@@ -273,8 +261,6 @@ def test_impl(df1, df2):
         self.assertEqual(
             set(h_res.B.dropna().values), set(res.B.dropna().values))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_right_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='right', on='key')
@@ -292,8 +278,6 @@ def test_impl(df1, df2):
         self.assertEqual(
             set(h_res.A.dropna().values), set(res.A.dropna().values))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_outer_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='outer', on='key')
@@ -327,8 +311,6 @@ def test_impl(df1, df2, df3, df4):
         df4 = pd.DataFrame({'B': 2*np.arange(n)+1, 'BBB': n+np.arange(n)+1.0})
         pd.testing.assert_frame_equal(hpat_func(df1, df2, df3, df4)[1], test_impl(df1, df2, df3, df4)[1])
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_cat1(self):
         def test_impl():
             ct_dtype = CategoricalDtype(['A', 'B', 'C'])
@@ -345,8 +327,6 @@ def test_impl():
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_cat2(self):
         # test setting NaN in categorical array
         def test_impl():
@@ -366,8 +346,6 @@ def test_impl():
             hpat_func().sort_values('C1').reset_index(drop=True),
             test_impl().sort_values('C1').reset_index(drop=True))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_cat_parallel1(self):
         # TODO: cat as keys
         def test_impl():
