From 77f8d378029f4207a4611d7944cb588cca86d8e9 Mon Sep 17 00:00:00 2001
From: Ivan Butygin
Date: Wed, 27 Nov 2019 12:23:40 +0300
Subject: [PATCH] Revert "WIP: Port to numba master (#338)"

This reverts commit e3f53bcdd6edc4920e7cfea0cac93d4673ae8808.
---
 sdc/distributed.py               |  2 +-
 sdc/hiframes/aggregate.py        |  2 +-
 sdc/hiframes/dataframe_pass.py   |  3 +--
 sdc/hiframes/filter.py           |  2 +-
 sdc/hiframes/hiframes_typed.py   | 20 +++++++++++++++-----
 sdc/hiframes/join.py             |  2 +-
 sdc/hiframes/pd_dataframe_ext.py | 12 ++++++++++--
 sdc/hiframes/pd_series_ext.py    | 21 +++++++++++++++++----
 sdc/io/csv_ext.py                |  2 +-
 sdc/tests/test_basic.py          | 12 ++++++++----
 sdc/tests/test_dataframe.py      |  3 ++-
 sdc/tests/test_ml.py             |  3 ++-
 setup.py                         |  3 +++
 13 files changed, 63 insertions(+), 24 deletions(-)

diff --git a/sdc/distributed.py b/sdc/distributed.py
index 1be684b4e..26ad23cc3 100644
--- a/sdc/distributed.py
+++ b/sdc/distributed.py
@@ -1770,7 +1770,7 @@ def _gen_parfor_reductions(self, parfor, namevar_table):
         _, reductions = get_parfor_reductions(
             parfor, parfor.params, self.state.calltypes)
 
-        for reduce_varname, (init_val, reduce_nodes, _) in reductions.items():
+        for reduce_varname, (init_val, reduce_nodes) in reductions.items():
             reduce_op = guard(self._get_reduce_op, reduce_nodes)
             # TODO: initialize reduction vars (arrays)
             reduce_var = namevar_table[reduce_varname]
diff --git a/sdc/hiframes/aggregate.py b/sdc/hiframes/aggregate.py
index d6364ebb8..161812e46 100644
--- a/sdc/hiframes/aggregate.py
+++ b/sdc/hiframes/aggregate.py
@@ -438,7 +438,7 @@ def aggregate_array_analysis(aggregate_node, equiv_set, typemap,
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/hiframes/dataframe_pass.py b/sdc/hiframes/dataframe_pass.py
index 0d8cc31c3..3ae428e95 100644
--- a/sdc/hiframes/dataframe_pass.py
+++ b/sdc/hiframes/dataframe_pass.py
@@ -140,8 +140,7 @@ def run_pass(self):
                 out_nodes = [inst]
 
                 if isinstance(inst, ir.Assign):
-                    if inst.value in self.state.func_ir._definitions[inst.target.name]:
-                        self.state.func_ir._definitions[inst.target.name].remove(inst.value)
+                    self.state.func_ir._definitions[inst.target.name].remove(inst.value)
                     out_nodes = self._run_assign(inst)
                 elif isinstance(inst, (ir.SetItem, ir.StaticSetItem)):
                     out_nodes = self._run_setitem(inst)
diff --git a/sdc/hiframes/filter.py b/sdc/hiframes/filter.py
index fe3d143e7..3db62ff79 100644
--- a/sdc/hiframes/filter.py
+++ b/sdc/hiframes/filter.py
@@ -100,7 +100,7 @@ def filter_array_analysis(filter_node, equiv_set, typemap, array_analysis):
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/hiframes/hiframes_typed.py b/sdc/hiframes/hiframes_typed.py
index bede365ea..19e6c6f05 100644
--- a/sdc/hiframes/hiframes_typed.py
+++ b/sdc/hiframes/hiframes_typed.py
@@ -1212,7 +1212,10 @@ def _handle_series_map(self, assign, lhs, rhs, series_var):
         # error checking: make sure there is function input only
         if len(rhs.args) != 1:
            raise ValueError("map expects 1 argument")
-        func = guard(get_definition, self.state.func_ir, rhs.args[0]).value.py_func
+        func = guard(get_definition, self.state.func_ir, rhs.args[0])
+        if func is None or not (isinstance(func, ir.Expr)
+                                and func.op == 'make_function'):
+            raise ValueError("lambda for map not found")
 
         dtype = self.state.typemap[series_var.name].dtype
         nodes = []
@@ -1379,7 +1382,11 @@ def _handle_series_combine(self, assign, lhs, rhs, series_var):
             raise ValueError("not enough arguments in call to combine")
         if len(rhs.args) > 3:
             raise ValueError("too many arguments in call to combine")
-        func = guard(get_definition, self.state.func_ir, rhs.args[1]).value.py_func
+        func = guard(get_definition, self.state.func_ir, rhs.args[1])
+        if func is None or not (isinstance(func, ir.Expr)
+                                and func.op == 'make_function'):
+            raise ValueError("lambda for combine not found")
+
         out_typ = self.state.typemap[lhs.name].dtype
         other = rhs.args[0]
         nodes = []
@@ -1526,16 +1533,19 @@ def f(arr, w, center):  # pragma: no cover
     def _handle_rolling_apply_func(self, func_node, dtype, out_dtype):
         if func_node is None:
             raise ValueError("cannot find kernel function for rolling.apply() call")
-        func_node = func_node.value.py_func
         # TODO: more error checking on the kernel to make sure it doesn't
         # use global/closure variables
+        if func_node.closure is not None:
+            raise ValueError("rolling apply kernel functions cannot have closure variables")
+        if func_node.defaults is not None:
+            raise ValueError("rolling apply kernel functions cannot have default arguments")
         # create a function from the code object
         glbs = self.state.func_ir.func_id.func.__globals__
         lcs = {}
         exec("def f(A): return A", glbs, lcs)
         kernel_func = lcs['f']
-        kernel_func.__code__ = func_node.__code__
-        kernel_func.__name__ = func_node.__code__.co_name
+        kernel_func.__code__ = func_node.code
+        kernel_func.__name__ = func_node.code.co_name
         # use hpat's sequential pipeline to enable pandas operations
         # XXX seq pipeline used since dist pass causes a hang
         m = numba.ir_utils._max_label
diff --git a/sdc/hiframes/join.py b/sdc/hiframes/join.py
index 540d3fd7d..246e0a176 100644
--- a/sdc/hiframes/join.py
+++ b/sdc/hiframes/join.py
@@ -131,7 +131,7 @@ def join_array_analysis(join_node, equiv_set, typemap, array_analysis):
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
index 27bf593b0..61e988095 100644
--- a/sdc/hiframes/pd_dataframe_ext.py
+++ b/sdc/hiframes/pd_dataframe_ext.py
@@ -142,6 +142,10 @@ def resolve_values(self, ary):
     def resolve_apply(self, df, args, kws):
         kws = dict(kws)
         func = args[0] if len(args) > 0 else kws.get('func', None)
+        # check lambda
+        if not isinstance(func, types.MakeFunctionLiteral):
+            raise ValueError("df.apply(): lambda not found")
+
         # check axis
         axis = args[1] if len(args) > 1 else kws.get('axis', None)
         if (axis is None or not isinstance(axis, types.IntegerLiteral)
@@ -161,8 +165,12 @@ def resolve_apply(self, df, args, kws):
             dtypes.append(el_typ)
 
         row_typ = types.NamedTuple(dtypes, Row)
-        t = func.get_call_type(self.context, (row_typ,), {})
-        return signature(SeriesType(t.return_type), *args)
+        code = func.literal_value.code
+        f_ir = numba.ir_utils.get_ir_of_code({'np': np}, code)
+        _, f_return_type, _ = numba.typed_passes.type_inference_stage(
+            self.context, f_ir, (row_typ,), None)
+
+        return signature(SeriesType(f_return_type), *args)
 
     @bound_function("df.describe")
     def resolve_describe(self, df, args, kws):
diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py
index c85614b2e..7366036d8 100644
--- a/sdc/hiframes/pd_series_ext.py
+++ b/sdc/hiframes/pd_series_ext.py
@@ -564,8 +564,18 @@ def _resolve_map_func(self, ary, args, kws):
         # getitem returns Timestamp for dt_index and series(dt64)
         if dtype == types.NPDatetime('ns'):
             dtype = pandas_timestamp_type
-        t = args[0].get_call_type(self.context, (dtype,), {})
-        return signature(SeriesType(t.return_type), *args)
+        code = args[0].literal_value.code
+        _globals = {'np': np}
+        # XXX hack in hiframes_typed to make globals available
+        if hasattr(args[0].literal_value, 'globals'):
+            # TODO: use code.co_names to find globals actually used?
+            _globals = args[0].literal_value.globals
+
+        f_ir = numba.ir_utils.get_ir_of_code(_globals, code)
+        f_typemap, f_return_type, f_calltypes = numba.typed_passes.type_inference_stage(
+            self.context, f_ir, (dtype,), None)
+
+        return signature(SeriesType(f_return_type), *args)
 
     @bound_function("series.map")
     def resolve_map(self, ary, args, kws):
@@ -584,8 +594,11 @@ def _resolve_combine_func(self, ary, args, kws):
         dtype2 = args[0].dtype
         if dtype2 == types.NPDatetime('ns'):
             dtype2 = pandas_timestamp_type
-        t = args[1].get_call_type(self.context, (dtype1, dtype2,), {})
-        return signature(SeriesType(t.return_type), *args)
+        code = args[1].literal_value.code
+        f_ir = numba.ir_utils.get_ir_of_code({'np': np}, code)
+        f_typemap, f_return_type, f_calltypes = numba.typed_passes.type_inference_stage(
+            self.context, f_ir, (dtype1, dtype2,), None)
+        return signature(SeriesType(f_return_type), *args)
 
     @bound_function("series.combine")
     def resolve_combine(self, ary, args, kws):
diff --git a/sdc/io/csv_ext.py b/sdc/io/csv_ext.py
index 2765081d9..292d82204 100644
--- a/sdc/io/csv_ext.py
+++ b/sdc/io/csv_ext.py
@@ -93,7 +93,7 @@ def csv_array_analysis(csv_node, equiv_set, typemap, array_analysis):
         equiv_set.insert_equiv(col_var, shape)
         post.extend(c_post)
         all_shapes.append(shape[0])
-        equiv_set.define(col_var, {})
+        equiv_set.define(col_var)
 
     if len(all_shapes) > 1:
         equiv_set.insert_equiv(*all_shapes)
diff --git a/sdc/tests/test_basic.py b/sdc/tests/test_basic.py
index 65537fbef..d89df5012 100644
--- a/sdc/tests/test_basic.py
+++ b/sdc/tests/test_basic.py
@@ -327,7 +327,8 @@ def test_array_reduce(self):
         self.assertEqual(count_array_OneDs(), 0)
         self.assertEqual(count_parfor_OneDs(), 1)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_dist_return(self):
         def test_impl(N):
             A = np.arange(N)
@@ -344,7 +345,8 @@ def test_impl(N):
         self.assertEqual(count_array_OneDs(), 1)
         self.assertEqual(count_parfor_OneDs(), 1)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_dist_return_tuple(self):
         def test_impl(N):
             A = np.arange(N)
@@ -373,7 +375,8 @@ def test_impl(A):
         np.testing.assert_allclose(hpat_func(arr) / self.num_ranks, test_impl(arr))
         self.assertEqual(count_array_OneDs(), 1)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_rebalance(self):
         def test_impl(N):
             A = np.arange(n)
@@ -391,7 +394,8 @@ def test_impl(N):
         finally:
             sdc.distributed_analysis.auto_rebalance = False
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_rebalance_loop(self):
         def test_impl(N):
             A = np.arange(n)
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
index 7614e3a57..cf9ab680f 100644
--- a/sdc/tests/test_dataframe.py
+++ b/sdc/tests/test_dataframe.py
@@ -160,7 +160,8 @@ def test_impl(df):
                              dtype=pd.api.types.CategoricalDtype(['N', 'Y']))})
         pd.testing.assert_frame_equal(hpat_func(df.copy(deep=True)), test_impl(df))
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_box_dist_return(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
diff --git a/sdc/tests/test_ml.py b/sdc/tests/test_ml.py
index 95c2bf5c5..40d27d85d 100644
--- a/sdc/tests/test_ml.py
+++ b/sdc/tests/test_ml.py
@@ -117,7 +117,8 @@ def test_impl(n):
         self.assertEqual(count_array_OneDs(), 1)
         self.assertEqual(count_parfor_OneDs(), 2)
 
-    @unittest.expectedFailure  # https://github.com/numba/numba/issues/4690
+    @unittest.skipIf(check_numba_version('0.46.0'),
+                     "Broken in numba 0.46.0. https://github.com/numba/numba/issues/4690")
     def test_kmeans(self):
         def test_impl(numCenter, numIter, N, D):
             A = np.ones((N, D))
diff --git a/setup.py b/setup.py
index 6e2b606ba..966390040 100644
--- a/setup.py
+++ b/setup.py
@@ -202,6 +202,9 @@ def readme():
 
 str_libs = np_compile_args['libraries']
 
+if not is_win:
+    str_libs += ['boost_regex']
+
 ext_str = Extension(name="sdc.hstr_ext",
                     sources=["sdc/_str_ext.cpp"],
                     libraries=str_libs,