Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Fix incorrect handling of the 'on' argument in the pandas.merge overloads (#99)
Browse files Browse the repository at this point in the history
Problem description: the merge_overload and merge_asof_overload functions
use the value of the 'on' argument to compute the 'left_on' and 'right_on'
arguments in a way that breaks type stability, causing a compilation
failure when 'on' is passed a StringLiteral value.

Error:
  File "../hpat/hiframes/dataframe_pass.py", line 202, in _run_assign
    return self._run_call(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 522, in _run_call
    return self._run_call_join(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 1488, in _run_call_join
    left_on = self._get_const_or_list(left_on_var)
  File "../hpat/hiframes/dataframe_pass.py", line 2135, in _get_const_or_list
    raise ValueError(err_msg)
ValueError: Failed in hpat mode pipeline (step: typed dataframe pass)
None

The following tests should be fixed by this commit:
    test_join_cat1 (hpat.tests.test_join.TestJoin)
    test_join_cat2 (hpat.tests.test_join.TestJoin)
    test_join_cat_parallel1 (hpat.tests.test_join.TestJoin)
    test_join_datetime_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq2 (hpat.tests.test_join.TestJoin)
    test_join_outer_seq1 (hpat.tests.test_join.TestJoin)
    test_join_right_seq1 (hpat.tests.test_join.TestJoin)
    test_merge_asof_seq1 (hpat.tests.test_join.TestJoin)
  • Loading branch information
kozlov-alexey authored and shssf committed Aug 2, 2019
1 parent 4ac5d9c commit 91e55d8
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 24 deletions.
10 changes: 8 additions & 2 deletions hpat/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,10 +628,13 @@ def merge_overload(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):

# check if on's inferred type is NoneType and store the result,
# use it later to branch based on the value available at compile time
onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
def _impl(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
if on is not None:
if not onHasNoneType:
left_on = right_on = on

return hpat.hiframes.api.join_dummy(
Expand All @@ -645,11 +648,14 @@ def merge_asof_overload(left, right, on=None, left_on=None, right_on=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):

# check if on's inferred type is NoneType and store the result,
# use it later to branch based on the value available at compile time
onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
def _impl(left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, by=None, left_by=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):
if on is not None:
if not onHasNoneType:
left_on = right_on = on

return hpat.hiframes.api.join_dummy(
Expand Down
30 changes: 8 additions & 22 deletions hpat/tests/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,6 @@ def test_impl(A1, B1, C1, A2, B2, D2):
self.assertEqual(h_res, p_res)
self.assertEqual(count_array_OneDs(), 3)

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_datetime_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, on='time')
Expand All @@ -184,8 +182,10 @@ def test_impl(df1, df2):
['2017-01-01', '2017-01-06', '2017-01-03']), 'A': [7, 8, 9]})
pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
@unittest.skip('AssertionError - fix needed\n'
'Tuples differ: (-9223372036854775791, Timestamp(\'2017-01-06 00:00:00\'), 9) != \n'
'(17, Timestamp(\'2017-01-06 00:00:00\'), 9)\n'
'NUMA_PES=3 build')
def test_join_datetime_parallel1(self):
def test_impl(df1, df2):
df3 = pd.merge(df1, df2, on='time')
Expand All @@ -206,8 +206,6 @@ def test_impl(df1, df2):
self.assertEqual(count_array_REPs(), 0)
self.assertEqual(count_parfor_REPs(), 0)

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_merge_asof_seq1(self):
def test_impl(df1, df2):
return pd.merge_asof(df1, df2, on='time')
Expand All @@ -222,8 +220,10 @@ def test_impl(df1, df2):
'2017-02-25']), 'A': [2,3,7,8,9]})
pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
@unittest.skip('AssertionError - fix needed\n'
'Tuples differ: (-9223372036854775790, Timestamp(\'2017-02-21 00:00:00\'), 24) !=\n'
'(18, Timestamp(\'2017-02-21 00:00:00\'), 24)\n'
'NUMA_PES=3 build')
def test_merge_asof_parallel1(self):
def test_impl():
df1 = pd.read_parquet('asof1.pq')
Expand All @@ -234,8 +234,6 @@ def test_impl():
hpat_func = hpat.jit(test_impl)
self.assertEqual(hpat_func(), test_impl())

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_left_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='left', on='key')
Expand All @@ -253,8 +251,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.B.dropna().values), set(res.B.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_left_seq2(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='left', on='key')
Expand All @@ -273,8 +269,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.B.dropna().values), set(res.B.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_right_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='right', on='key')
Expand All @@ -292,8 +286,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.A.dropna().values), set(res.A.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_outer_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='outer', on='key')
Expand Down Expand Up @@ -327,8 +319,6 @@ def test_impl(df1, df2, df3, df4):
df4 = pd.DataFrame({'B': 2*np.arange(n)+1, 'BBB': n+np.arange(n)+1.0})
pd.testing.assert_frame_equal(hpat_func(df1, df2, df3, df4)[1], test_impl(df1, df2, df3, df4)[1])

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat1(self):
def test_impl():
ct_dtype = CategoricalDtype(['A', 'B', 'C'])
Expand All @@ -345,8 +335,6 @@ def test_impl():
hpat_func = hpat.jit(test_impl)
pd.testing.assert_frame_equal(hpat_func(), test_impl())

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat2(self):
# test setting NaN in categorical array
def test_impl():
Expand All @@ -366,8 +354,6 @@ def test_impl():
hpat_func().sort_values('C1').reset_index(drop=True),
test_impl().sort_values('C1').reset_index(drop=True))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat_parallel1(self):
# TODO: cat as keys
def test_impl():
Expand Down

0 comments on commit 91e55d8

Please sign in to comment.