Fix wrong overload handling of the 'on' argument in pandas.merge

Problem description: the merge_overload and merge_asof_overload functions
use the value of the 'on' argument to compute the 'left_on' and 'right_on'
kwargs of the join function that provides the implementation. That function
relies on these variables being either lists or constant string literals,
and in the latter case simply copying on's value inside a runtime
'if on is not None:' branch loses the constant, so the typed dataframe pass
cannot resolve the join key.

Error:
  File "../hpat/hiframes/dataframe_pass.py", line 202, in _run_assign
    return self._run_call(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 522, in _run_call
    return self._run_call_join(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 1488, in
_run_call_join
    left_on = self._get_const_or_list(left_on_var)
  File "../hpat/hiframes/dataframe_pass.py", line 2135, in
_get_const_or_list
    raise ValueError(err_msg)
ValueError: Failed in hpat mode pipeline (step: typed dataframe pass)
None

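The fix resolves the None-check at overload time, before the implementation
is compiled, instead of branching on 'on' at runtime inside it. Below is a
minimal, self-contained sketch of this pattern; it is not the hpat code, the
names column_or_default and use_key are invented for illustration, and for
brevity the overloaded function just returns either the array or the key:

    import numpy as np
    from numba import njit, types
    from numba.extending import overload


    def column_or_default(arr, on=None):
        # pure-Python fallback; jitted callers use the overload below
        return arr if on is None else on


    @overload(column_or_default)
    def column_or_default_overload(arr, on=None):
        # here 'on' is what numba inferred for the argument (a type object),
        # or the plain default None when the caller omitted it, so the check
        # is decided once, before the returned _impl is compiled
        on_is_none = on is None or isinstance(on, types.NoneType)

        if on_is_none:
            def _impl(arr, on=None):
                return arr
        else:
            def _impl(arr, on=None):
                # 'on' is used unconditionally, so a constant string key
                # stays visible as a constant to later compiler passes
                return on
        return _impl


    @njit
    def use_key(arr):
        return column_or_default(arr, on='key')


    print(use_key(np.zeros(3)))  # prints: key

The commit gets the same effect with a single _impl: it computes the boolean
onHasNoneType in the enclosing overload function and branches on that
captured compile-time constant inside _impl, as the diff below shows.
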
The following tests are fixed by this commit (all of them follow the pattern sketched after the list):
    test_join_cat1 (hpat.tests.test_join.TestJoin)
    test_join_cat2 (hpat.tests.test_join.TestJoin)
    test_join_cat_parallel1 (hpat.tests.test_join.TestJoin)
    test_join_datetime_parallel1 (hpat.tests.test_join.TestJoin)
    test_join_datetime_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq2 (hpat.tests.test_join.TestJoin)
    test_join_outer_seq1 (hpat.tests.test_join.TestJoin)
    test_join_right_seq1 (hpat.tests.test_join.TestJoin)
    test_merge_asof_parallel1 (hpat.tests.test_join.TestJoin)
    test_merge_asof_seq1 (hpat.tests.test_join.TestJoin)
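
All of them exercise the same shape of user code, also visible in the test
bodies in the diff below: pd.merge or pd.merge_asof called with a constant
string 'on' key inside a function compiled with hpat.jit. Schematically (the
frames and the 'key' column here are made-up placeholders rather than taken
from a specific test):

    import numpy as np
    import pandas as pd
    import hpat


    def test_impl(df1, df2):
        # 'on' is a constant string literal; the overload has to turn it into
        # constant left_on/right_on keys for the typed dataframe pass
        return pd.merge(df1, df2, on='key')

    hpat_func = hpat.jit(test_impl)

    n = 11
    df1 = pd.DataFrame({'key': np.arange(n), 'A': np.arange(n) + 1.0})
    df2 = pd.DataFrame({'key': np.arange(n), 'B': np.arange(n) + 2.0})

    # the jitted and the plain-pandas results should agree (the real tests
    # compare them with assert_frame_equal or set-based checks)
    print(hpat_func(df1, df2))
    print(test_impl(df1, df2))
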
kozlov-alexey committed Aug 1, 2019 (commit 571f3f2, parent 6640831)
Showing 2 changed files with 8 additions and 24 deletions.
10 changes: 8 additions & 2 deletions hpat/hiframes/pd_dataframe_ext.py
@@ -628,10 +628,13 @@ def merge_overload(left, right, how='inner', on=None, left_on=None,
         right_on=None, left_index=False, right_index=False, sort=False,
         suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
 
+    # check if on's inferred type is NoneType and store the result,
+    # use it later to branch based on the value available at compile time
+    onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
     def _impl(left, right, how='inner', on=None, left_on=None,
             right_on=None, left_index=False, right_index=False, sort=False,
             suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
-        if on is not None:
+        if not onHasNoneType:
             left_on = right_on = on
 
         return hpat.hiframes.api.join_dummy(
@@ -645,11 +648,14 @@ def merge_asof_overload(left, right, on=None, left_on=None, right_on=None,
         right_by=None, suffixes=('_x', '_y'), tolerance=None,
         allow_exact_matches=True, direction='backward'):
 
+    # check if on's inferred type is NoneType and store the result,
+    # use it later to branch based on the value available at compile time
+    onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
     def _impl(left, right, on=None, left_on=None, right_on=None,
             left_index=False, right_index=False, by=None, left_by=None,
             right_by=None, suffixes=('_x', '_y'), tolerance=None,
             allow_exact_matches=True, direction='backward'):
-        if on is not None:
+        if not onHasNoneType:
             left_on = right_on = on
 
         return hpat.hiframes.api.join_dummy(

22 changes: 0 additions & 22 deletions hpat/tests/test_join.py
@@ -169,8 +169,6 @@ def test_impl(A1, B1, C1, A2, B2, D2):
         self.assertEqual(h_res, p_res)
         self.assertEqual(count_array_OneDs(), 3)
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_datetime_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, on='time')
@@ -184,8 +182,6 @@ def test_impl(df1, df2):
             ['2017-01-01', '2017-01-06', '2017-01-03']), 'A': [7, 8, 9]})
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_datetime_parallel1(self):
         def test_impl(df1, df2):
             df3 = pd.merge(df1, df2, on='time')
@@ -206,8 +202,6 @@ def test_impl(df1, df2):
         self.assertEqual(count_array_REPs(), 0)
         self.assertEqual(count_parfor_REPs(), 0)
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_merge_asof_seq1(self):
         def test_impl(df1, df2):
             return pd.merge_asof(df1, df2, on='time')
@@ -222,8 +216,6 @@ def test_impl(df1, df2):
             '2017-02-25']), 'A': [2,3,7,8,9]})
         pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_merge_asof_parallel1(self):
         def test_impl():
             df1 = pd.read_parquet('asof1.pq')
@@ -234,8 +226,6 @@ def test_impl():
         hpat_func = hpat.jit(test_impl)
         self.assertEqual(hpat_func(), test_impl())
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_left_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='left', on='key')
@@ -253,8 +243,6 @@ def test_impl(df1, df2):
         self.assertEqual(
             set(h_res.B.dropna().values), set(res.B.dropna().values))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_left_seq2(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='left', on='key')
@@ -273,8 +261,6 @@ def test_impl(df1, df2):
         self.assertEqual(
             set(h_res.B.dropna().values), set(res.B.dropna().values))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_right_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='right', on='key')
@@ -292,8 +278,6 @@ def test_impl(df1, df2):
         self.assertEqual(
             set(h_res.A.dropna().values), set(res.A.dropna().values))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_outer_seq1(self):
         def test_impl(df1, df2):
             return pd.merge(df1, df2, how='outer', on='key')
@@ -327,8 +311,6 @@ def test_impl(df1, df2, df3, df4):
         df4 = pd.DataFrame({'B': 2*np.arange(n)+1, 'BBB': n+np.arange(n)+1.0})
         pd.testing.assert_frame_equal(hpat_func(df1, df2, df3, df4)[1], test_impl(df1, df2, df3, df4)[1])
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_cat1(self):
         def test_impl():
             ct_dtype = CategoricalDtype(['A', 'B', 'C'])
@@ -345,8 +327,6 @@ def test_impl():
         hpat_func = hpat.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(), test_impl())
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_cat2(self):
         # test setting NaN in categorical array
         def test_impl():
@@ -366,8 +346,6 @@ def test_impl():
             hpat_func().sort_values('C1').reset_index(drop=True),
             test_impl().sort_values('C1').reset_index(drop=True))
 
-    @unittest.skip('ValueError - fix needed\n'
-                   'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
     def test_join_cat_parallel1(self):
         # TODO: cat as keys
         def test_impl():
