From 77f8d378029f4207a4611d7944cb588cca86d8e9 Mon Sep 17 00:00:00 2001
From: Ivan Butygin
Date: Wed, 27 Nov 2019 12:23:40 +0300
Subject: [PATCH] Revert "WIP: Port to numba master (#338)"

This reverts commit e3f53bcdd6edc4920e7cfea0cac93d4673ae8808.
---
 sdc/distributed.py               |  2 +-
 sdc/hiframes/aggregate.py        |  2 +-
 sdc/hiframes/dataframe_pass.py   |  3 +--
 sdc/hiframes/filter.py           |  2 +-
 sdc/hiframes/hiframes_typed.py   | 20 +++++++++++++++-----
 sdc/hiframes/join.py             |  2 +-
 sdc/hiframes/pd_dataframe_ext.py | 12 ++++++++++--
 sdc/hiframes/pd_series_ext.py    | 21 +++++++++++++++++----
 sdc/io/csv_ext.py                |  2 +-
 sdc/tests/test_basic.py          | 12 ++++++++----
 sdc/tests/test_dataframe.py      |  3 ++-
 sdc/tests/test_ml.py             |  3 ++-
 setup.py                         |  3 +++
 13 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/sdc/distributed.py b/sdc/distributed.py
index 1be684b4e..26ad23cc3 100644
--- a/sdc/distributed.py
+++ b/sdc/distributed.py
@@ -1770,7 +1770,7 @@ def _gen_parfor_reductions(self, parfor, namevar_table):
         _, reductions = get_parfor_reductions(
             parfor, parfor.params, self.state.calltypes)
 
-        for reduce_varname, (init_val, reduce_nodes, _) in reductions.items():
+        for reduce_varname, (init_val, reduce_nodes) in reductions.items():
             reduce_op = guard(self._get_reduce_op, reduce_nodes)
             # TODO: initialize reduction vars (arrays)
             reduce_var = namevar_table[reduce_varname]
diff --git a/sdc/hiframes/aggregate.py b/sdc/hiframes/aggregate.py
index d6364ebb8..161812e46 100644
--- a/sdc/hiframes/aggregate.py
+++ b/sdc/hiframes/aggregate.py
@@ -438,7 +438,7 @@ def aggregate_array_analysis(aggregate_node, equiv_set, typemap,
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/hiframes/dataframe_pass.py b/sdc/hiframes/dataframe_pass.py
index 0d8cc31c3..3ae428e95 100644
--- a/sdc/hiframes/dataframe_pass.py
+++ b/sdc/hiframes/dataframe_pass.py
@@ -140,8 +140,7 @@ def run_pass(self):
                 out_nodes = [inst]
 
                 if isinstance(inst, ir.Assign):
-                    if inst.value in self.state.func_ir._definitions[inst.target.name]:
-                        self.state.func_ir._definitions[inst.target.name].remove(inst.value)
+                    self.state.func_ir._definitions[inst.target.name].remove(inst.value)
                     out_nodes = self._run_assign(inst)
                 elif isinstance(inst, (ir.SetItem, ir.StaticSetItem)):
                     out_nodes = self._run_setitem(inst)
diff --git a/sdc/hiframes/filter.py b/sdc/hiframes/filter.py
index fe3d143e7..3db62ff79 100644
--- a/sdc/hiframes/filter.py
+++ b/sdc/hiframes/filter.py
@@ -100,7 +100,7 @@ def filter_array_analysis(filter_node, equiv_set, typemap, array_analysis):
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/hiframes/hiframes_typed.py b/sdc/hiframes/hiframes_typed.py
index bede365ea..19e6c6f05 100644
--- a/sdc/hiframes/hiframes_typed.py
+++ b/sdc/hiframes/hiframes_typed.py
@@ -1212,7 +1212,10 @@ def _handle_series_map(self, assign, lhs, rhs, series_var):
         # error checking: make sure there is function input only
         if len(rhs.args) != 1:
            raise ValueError("map expects 1 argument")
-        func = guard(get_definition, self.state.func_ir, rhs.args[0]).value.py_func
+        func = guard(get_definition, self.state.func_ir, rhs.args[0])
+        if func is None or not (isinstance(func, ir.Expr)
+                                and func.op == 'make_function'):
+            raise ValueError("lambda for map not found")
 
         dtype = self.state.typemap[series_var.name].dtype
         nodes = []
@@ -1379,7 +1382,11 @@ def _handle_series_combine(self, assign, lhs, rhs, series_var):
             raise ValueError("not enough arguments in call to combine")
         if len(rhs.args) > 3:
             raise ValueError("too many arguments in call to combine")
-        func = guard(get_definition, self.state.func_ir, rhs.args[1]).value.py_func
+        func = guard(get_definition, self.state.func_ir, rhs.args[1])
+        if func is None or not (isinstance(func, ir.Expr)
+                                and func.op == 'make_function'):
+            raise ValueError("lambda for combine not found")
+
         out_typ = self.state.typemap[lhs.name].dtype
         other = rhs.args[0]
         nodes = []
@@ -1526,16 +1533,19 @@ def f(arr, w, center):  # pragma: no cover
     def _handle_rolling_apply_func(self, func_node, dtype, out_dtype):
         if func_node is None:
             raise ValueError("cannot find kernel function for rolling.apply() call")
-        func_node = func_node.value.py_func
         # TODO: more error checking on the kernel to make sure it doesn't
         # use global/closure variables
+        if func_node.closure is not None:
+            raise ValueError("rolling apply kernel functions cannot have closure variables")
+        if func_node.defaults is not None:
+            raise ValueError("rolling apply kernel functions cannot have default arguments")
         # create a function from the code object
         glbs = self.state.func_ir.func_id.func.__globals__
         lcs = {}
         exec("def f(A): return A", glbs, lcs)
         kernel_func = lcs['f']
-        kernel_func.__code__ = func_node.__code__
-        kernel_func.__name__ = func_node.__code__.co_name
+        kernel_func.__code__ = func_node.code
+        kernel_func.__name__ = func_node.code.co_name
         # use hpat's sequential pipeline to enable pandas operations
         # XXX seq pipeline used since dist pass causes a hang
         m = numba.ir_utils._max_label
diff --git a/sdc/hiframes/join.py b/sdc/hiframes/join.py
index 540d3fd7d..246e0a176 100644
--- a/sdc/hiframes/join.py
+++ b/sdc/hiframes/join.py
@@ -131,7 +131,7 @@ def join_array_analysis(join_node, equiv_set, typemap, array_analysis):
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
index 27bf593b0..61e988095 100644
--- a/sdc/hiframes/pd_dataframe_ext.py
+++ b/sdc/hiframes/pd_dataframe_ext.py
@@ -142,6 +142,10 @@ def resolve_values(self, ary):
     def resolve_apply(self, df, args, kws):
         kws = dict(kws)
         func = args[0] if len(args) > 0 else kws.get('func', None)
+        # check lambda
+        if not isinstance(func, types.MakeFunctionLiteral):
+            raise ValueError("df.apply(): lambda not found")
+
         # check axis
         axis = args[1] if len(args) > 1 else kws.get('axis', None)
         if (axis is None or not isinstance(axis, types.IntegerLiteral)
@@ -161,8 +165,12 @@ def resolve_apply(self, df, args, kws):
             dtypes.append(el_typ)
 
         row_typ = types.NamedTuple(dtypes, Row)
-        t = func.get_call_type(self.context, (row_typ,), {})
-        return signature(SeriesType(t.return_type), *args)
+        code = func.literal_value.code
+        f_ir = numba.ir_utils.get_ir_of_code({'np': np}, code)
+        _, f_return_type, _ = numba.typed_passes.type_inference_stage(
+            self.context, f_ir, (row_typ,), None)
+
+        return signature(SeriesType(f_return_type), *args)
 
     @bound_function("df.describe")
     def resolve_describe(self, df, args, kws):
diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py
index c85614b2e..7366036d8 100644
--- a/sdc/hiframes/pd_series_ext.py
+++ b/sdc/hiframes/pd_series_ext.py
@@ -564,8 +564,18 @@ def _resolve_map_func(self, ary, args, kws):
         # getitem returns Timestamp for dt_index and series(dt64)
         if dtype == types.NPDatetime('ns'):
             dtype = pandas_timestamp_type
-        t = args[0].get_call_type(self.context, (dtype,), {})
-        return signature(SeriesType(t.return_type), *args)
+        code = args[0].literal_value.code
+        _globals = {'np': np}
+        # XXX hack in hiframes_typed to make globals available
+        if hasattr(args[0].literal_value, 'globals'):
+            # TODO: use code.co_names to find globals actually used?
+            _globals = args[0].literal_value.globals
+
+        f_ir = numba.ir_utils.get_ir_of_code(_globals, code)
+        f_typemap, f_return_type, f_calltypes = numba.typed_passes.type_inference_stage(
+            self.context, f_ir, (dtype,), None)
+
+        return signature(SeriesType(f_return_type), *args)
 
     @bound_function("series.map")
     def resolve_map(self, ary, args, kws):
@@ -584,8 +594,11 @@ def _resolve_combine_func(self, ary, args, kws):
         dtype2 = args[0].dtype
         if dtype2 == types.NPDatetime('ns'):
             dtype2 = pandas_timestamp_type
-        t = args[1].get_call_type(self.context, (dtype1, dtype2,), {})
-        return signature(SeriesType(t.return_type), *args)
+        code = args[1].literal_value.code
+        f_ir = numba.ir_utils.get_ir_of_code({'np': np}, code)
+        f_typemap, f_return_type, f_calltypes = numba.typed_passes.type_inference_stage(
+            self.context, f_ir, (dtype1, dtype2,), None)
+        return signature(SeriesType(f_return_type), *args)
 
     @bound_function("series.combine")
     def resolve_combine(self, ary, args, kws):
diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py
index 2765081d9..292d82204 100644
--- a/sdc/io/csv_ext.py
+++ b/sdc/io/csv_ext.py
@@ -93,7 +93,7 @@ def csv_array_analysis(csv_node, equiv_set, typemap, array_analysis):
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/tests/test_basic.py b/sdc/tests/test_basic.py
index 65537fbef..d89df5012 100644
--- a/sdc/tests/test_basic.py
+++ b/sdc/tests/test_basic.py
@@ -327,7 +327,8 @@ def test_array_reduce(self):
         self.assertEqual(count_array_OneDs(), 0)
         self.assertEqual(count_parfor_OneDs(), 1)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_dist_return(self):
         def test_impl(N):
             A = np.arange(N)
@@ -344,7 +345,8 @@ def test_impl(N):
         self.assertEqual(count_array_OneDs(), 1)
         self.assertEqual(count_parfor_OneDs(), 1)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_dist_return_tuple(self):
         def test_impl(N):
             A = np.arange(N)
@@ -373,7 +375,8 @@ def test_impl(A):
         np.testing.assert_allclose(hpat_func(arr) / self.num_ranks, test_impl(arr))
         self.assertEqual(count_array_OneDs(), 1)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_rebalance(self):
         def test_impl(N):
             A = np.arange(n)
@@ -391,7 +394,8 @@ def test_impl(N):
         finally:
             sdc.distributed_analysis.auto_rebalance = False
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_rebalance_loop(self):
         def test_impl(N):
             A = np.arange(n)
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
index 7614e3a57..cf9ab680f 100644
--- a/sdc/tests/test_dataframe.py
+++ b/sdc/tests/test_dataframe.py
@@ -160,7 +160,8 @@ def test_impl(df):
                              dtype=pd.api.types.CategoricalDtype(['N', 'Y']))})
         pd.testing.assert_frame_equal(hpat_func(df.copy(deep=True)), test_impl(df))
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_box_dist_return(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
diff --git a/sdc/tests/test_ml.py b/sdc/tests/test_ml.py
index 95c2bf5c5..40d27d85d 100644
--- a/sdc/tests/test_ml.py
+++ b/sdc/tests/test_ml.py
@@ -117,7 +117,8 @@ def test_impl(n):
         self.assertEqual(count_array_OneDs(), 1)
         self.assertEqual(count_parfor_OneDs(), 2)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_kmeans(self):
         def test_impl(numCenter, numIter, N, D):
             A = np.ones((N, D))
diff --git a/setup.py b/setup.py
index 6e2b606ba..966390040 100644
--- a/setup.py
+++ b/setup.py
@@ -202,6 +202,9 @@ def readme():
 
 str_libs = np_compile_args['libraries']
 
+if not is_win:
+    str_libs += ['boost_regex']
+
 ext_str = Extension(name="sdc.hstr_ext",
                     sources=["sdc/_str_ext.cpp"],
                     libraries=str_libs,