Skip to content
This repository has been archived by the owner on Feb 2, 2024. It is now read-only.

Fix for Pandas merge overload wrong handling of string literals #99

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions hpat/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -628,10 +628,13 @@ def merge_overload(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):

# Check whether the inferred type of `on` is NoneType and store the result;
# _impl uses the stored flag to branch on a value that is available at compile time.
onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
def _impl(left, right, how='inner', on=None, left_on=None,
right_on=None, left_index=False, right_index=False, sort=False,
suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
if on is not None:
if not onHasNoneType:
left_on = right_on = on

return hpat.hiframes.api.join_dummy(
Expand All @@ -645,11 +648,14 @@ def merge_asof_overload(left, right, on=None, left_on=None, right_on=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):

# Check whether the inferred type of `on` is NoneType and store the result;
# _impl uses the stored flag to branch on a value that is available at compile time.
onHasNoneType = isinstance(numba.typeof(on), types.NoneType)
def _impl(left, right, on=None, left_on=None, right_on=None,
left_index=False, right_index=False, by=None, left_by=None,
right_by=None, suffixes=('_x', '_y'), tolerance=None,
allow_exact_matches=True, direction='backward'):
if on is not None:
if not onHasNoneType:
left_on = right_on = on

return hpat.hiframes.api.join_dummy(
Expand Down
30 changes: 8 additions & 22 deletions hpat/tests/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,6 @@ def test_impl(A1, B1, C1, A2, B2, D2):
self.assertEqual(h_res, p_res)
self.assertEqual(count_array_OneDs(), 3)

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_datetime_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, on='time')
Expand All @@ -184,8 +182,10 @@ def test_impl(df1, df2):
['2017-01-01', '2017-01-06', '2017-01-03']), 'A': [7, 8, 9]})
pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
@unittest.skip('AssertionError - fix needed\n'
'Tuples differ: (-9223372036854775791, Timestamp(\'2017-01-06 00:00:00\'), 9) != \n'
'(17, Timestamp(\'2017-01-06 00:00:00\'), 9)\n'
'NUMA_PES=3 build')
def test_join_datetime_parallel1(self):
def test_impl(df1, df2):
df3 = pd.merge(df1, df2, on='time')
Expand All @@ -206,8 +206,6 @@ def test_impl(df1, df2):
self.assertEqual(count_array_REPs(), 0)
self.assertEqual(count_parfor_REPs(), 0)

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_merge_asof_seq1(self):
def test_impl(df1, df2):
return pd.merge_asof(df1, df2, on='time')
Expand All @@ -222,8 +220,10 @@ def test_impl(df1, df2):
'2017-02-25']), 'A': [2,3,7,8,9]})
pd.testing.assert_frame_equal(hpat_func(df1, df2), test_impl(df1, df2))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
@unittest.skip('AssertionError - fix needed\n'
'Tuples differ: (-9223372036854775790, Timestamp(\'2017-02-21 00:00:00\'), 24) !=\n'
'(18, Timestamp(\'2017-02-21 00:00:00\'), 24)\n'
'NUMA_PES=3 build')
def test_merge_asof_parallel1(self):
def test_impl():
df1 = pd.read_parquet('asof1.pq')
Expand All @@ -234,8 +234,6 @@ def test_impl():
hpat_func = hpat.jit(test_impl)
self.assertEqual(hpat_func(), test_impl())

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_left_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='left', on='key')
Expand All @@ -253,8 +251,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.B.dropna().values), set(res.B.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_left_seq2(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='left', on='key')
Expand All @@ -273,8 +269,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.B.dropna().values), set(res.B.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_right_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='right', on='key')
Expand All @@ -292,8 +286,6 @@ def test_impl(df1, df2):
self.assertEqual(
set(h_res.A.dropna().values), set(res.A.dropna().values))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_outer_seq1(self):
def test_impl(df1, df2):
return pd.merge(df1, df2, how='outer', on='key')
Expand Down Expand Up @@ -327,8 +319,6 @@ def test_impl(df1, df2, df3, df4):
df4 = pd.DataFrame({'B': 2*np.arange(n)+1, 'BBB': n+np.arange(n)+1.0})
pd.testing.assert_frame_equal(hpat_func(df1, df2, df3, df4)[1], test_impl(df1, df2, df3, df4)[1])

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat1(self):
def test_impl():
ct_dtype = CategoricalDtype(['A', 'B', 'C'])
Expand All @@ -345,8 +335,6 @@ def test_impl():
hpat_func = hpat.jit(test_impl)
pd.testing.assert_frame_equal(hpat_func(), test_impl())

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat2(self):
# test setting NaN in categorical array
def test_impl():
Expand All @@ -366,8 +354,6 @@ def test_impl():
hpat_func().sort_values('C1').reset_index(drop=True),
test_impl().sort_values('C1').reset_index(drop=True))

@unittest.skip('ValueError - fix needed\n'
'Failed in hpat mode pipeline (step: typed dataframe pass)\n')
def test_join_cat_parallel1(self):
# TODO: cat as keys
def test_impl():
Expand Down