Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Commit

Permalink
Fix incorrect handling of string literals in the Pandas merge overloads
Browse files Browse the repository at this point in the history
Problem description: the merge_overload and merge_asof_overload functions
use the value of the 'on' argument to compute the 'left_on' and 'right_on'
kwargs of the join function that provides the implementation. That join
function relies on these arguments being either lists or constant string
literals, and in the latter case simply copying the value of 'on' is wrong.

Error:
  File "../hpat/hiframes/dataframe_pass.py", line 202, in _run_assign
    return self._run_call(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 522, in _run_call
    return self._run_call_join(assign, lhs, rhs)
  File "../hpat/hiframes/dataframe_pass.py", line 1488, in _run_call_join
    left_on = self._get_const_or_list(left_on_var)
  File "../hpat/hiframes/dataframe_pass.py", line 2135, in _get_const_or_list
    raise ValueError(err_msg)
ValueError: Failed in hpat mode pipeline (step: typed dataframe pass)
None

The following tests should be fixed by this commit:
    test_join_cat1 (hpat.tests.test_join.TestJoin)
    test_join_cat2 (hpat.tests.test_join.TestJoin)
    test_join_cat_parallel1 (hpat.tests.test_join.TestJoin)
    test_join_datetime_parallel1 (hpat.tests.test_join.TestJoin)
    test_join_datetime_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq1 (hpat.tests.test_join.TestJoin)
    test_join_left_seq2 (hpat.tests.test_join.TestJoin)
    test_join_outer_seq1 (hpat.tests.test_join.TestJoin)
    test_join_right_seq1 (hpat.tests.test_join.TestJoin)
    test_merge_asof_parallel1 (hpat.tests.test_join.TestJoin)
    test_merge_asof_seq1 (hpat.tests.test_join.TestJoin)
  • Loading branch information
kozlov-alexey committed Jul 26, 2019
1 parent 6640831 commit ca99c6c
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 43 deletions.
67 changes: 46 additions & 21 deletions hpat/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,38 +625,63 @@ def generic(self, args, kws):
@overload_method(DataFrameType, 'merge')
@overload(pd.merge)
def merge_overload(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):

def _impl(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
if on is not None:
left_on = right_on = on
# 'on' can be a list or a string literal, use different overloads for them
if on is not None and isinstance(on, types.StringLiteral):
# use literal strings value to fetch them as consts from IR later
new_right_on = on.literal_value
new_left_on = on.literal_value

return hpat.hiframes.api.join_dummy(
left, right, left_on, right_on, how)
def _impl(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):

return hpat.hiframes.api.join_dummy(left, right, new_left_on, new_right_on, how)
else:
def _impl(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
if on is not None:
left_on = right_on = on

return hpat.hiframes.api.join_dummy(left, right, left_on, right_on, how)

return _impl


@overload(pd.merge_asof)
def merge_asof_overload(left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, by=None, left_by=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):

def _impl(left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, by=None, left_by=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):
if on is not None:
left_on = right_on = on
left_index=False, right_index=False, by=None, left_by=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):

# 'on' can be a list or a string literal, use different overloads for them
if on is not None and isinstance(on, types.StringLiteral):
# use literal strings value to fetch them as consts from IR later
new_right_on = on.literal_value
new_left_on = on.literal_value

def _impl(left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, by=None, left_by=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):

return hpat.hiframes.api.join_dummy(left, right, new_left_on, new_right_on, 'asof')
else:
def _impl(left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, by=None, left_by=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):
if on is not None:
left_on = right_on = on

return hpat.hiframes.api.join_dummy(
left, right, left_on, right_on, 'asof')
return hpat.hiframes.api.join_dummy(left, right, left_on, right_on, 'asof')

return _impl


@overload_method(DataFrameType, 'pivot_table')
def pivot_table_overload(df, values=None, index=None, columns=None, aggfunc='mean',
fill_value=None, margins=False, dropna=True, margins_name='All',
Expand Down
22 changes: 0 additions & 22 deletions hpat/tests/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,6 @@ def test_impl(A1, B1, C1, A2, B2, D2):
self.assertEqual(h_res, p_res)
self.assertEqual(count_array_OneDs(), 3)

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_datetime_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, on='time')
Expand All @@ -184,8 +182,6 @@ def test_impl(df1, df2):
['2017-01-01', '2017-01-06', '2017-01-03']), 'A': [7, 8, 9]})
pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_datetime_parallel1(self):
def test_impl(df1, df2):
df3 = pd.merge(df1, df2, on='time')
Expand All @@ -206,8 +202,6 @@ def test_impl(df1, df2):
self.assertEqual(count_array_REPs(), 0)
self.assertEqual(count_parfor_REPs(), 0)

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_merge_asof_seq1(self):
def test_impl(df1, df2):
return pd.merge_asof(df1, df2, on='time')
Expand All @@ -222,8 +216,6 @@ def test_impl(df1, df2):
'2017-02-25']), 'A': [2,3,7,8,9]})
pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_merge_asof_parallel1(self):
def test_impl():
df1 = pd.read_parquet('asof1.pq')
Expand All @@ -234,8 +226,6 @@ def test_impl():
hpat_func = hpat.jit(test_impl)
self.assertEqual(hpat_func(), test_impl())

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_left_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='left', on='key')
Expand All @@ -253,8 +243,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.B.dropna().values), set(res.B.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_left_seq2(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='left', on='key')
Expand All @@ -273,8 +261,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.B.dropna().values), set(res.B.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_right_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='right', on='key')
Expand All @@ -292,8 +278,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.A.dropna().values), set(res.A.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_outer_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='outer', on='key')
Expand Down Expand Up @@ -327,8 +311,6 @@ def test_impl(df1, df2, df3, df4):
df4 = pd.DataFrame({'B': 2*np.arange(n)+1, 'BBB': n+np.arange(n)+1.0})
pd.testing.assert_frame_equal(hpat_func(df1, df2, df3, df4)[1], test_impl(df1, df2, df3, df4)[1])

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat1(self):
def test_impl():
ct_dtype = CategoricalDtype(['A', 'B', 'C'])
Expand All @@ -345,8 +327,6 @@ def test_impl():
hpat_func = hpat.jit(test_impl)
pd.testing.assert_frame_equal(hpat_func(), test_impl())

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat2(self):
# test setting NaN in categorical array
def test_impl():
Expand All @@ -366,8 +346,6 @@ def test_impl():
hpat_func().sort_values('C1').reset_index(drop=True),
test_impl().sort_values('C1').reset_index(drop=True))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat_parallel1(self):
# TODO: cat as keys
def test_impl():
Expand Down

0 comments on commit ca99c6c

Please sign in to comment.