From 8015bced8d17ce663b7c8beb7778d7452daf2bc9 Mon Sep 17 00:00:00 2001
From: "Kozlov, Alexey" <alexey.kozlov@intel.com>
Date: Mon, 23 Nov 2020 20:04:32 +0300
Subject: [PATCH] Refactor df.geitem by slice idx to improve compile time

Motivation: df[idx] when idx is a slice produces DataFrame with the same
internal structure as original one (only index type may change). Hence
it can be copied and filled with column[idx] slices, which improves
compilation time.
---
 .../hpat_pandas_dataframe_functions.py        | 62 +++++++++++++------
 sdc/hiframes/pd_dataframe_ext.py              | 26 +++++++-
 2 files changed, 68 insertions(+), 20 deletions(-)

diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
index 98c724705..bdf1e6f14 100644
--- a/sdc/datatypes/hpat_pandas_dataframe_functions.py
+++ b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -52,6 +52,7 @@
 from sdc.datatypes.range_index_type import RangeIndexType
 
 from sdc.hiframes.pd_dataframe_type import DataFrameType
+from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps
 from sdc.hiframes.pd_series_type import SeriesType
 
 from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType,
@@ -1492,24 +1493,33 @@ def df_index_expr(self, length_expr=None):
 
 def df_getitem_slice_idx_main_codelines(self, idx):
     """Generate main code lines for df.getitem with idx of slice"""
+
+    types_order = get_structure_maps(self.data, self.columns)[2]
+    n_lists = len(types_order)
+
     results = []
-    func_lines = [f'  res_index = self.index[idx]']
-    for i, col in enumerate(self.columns):
-        col_loc = self.column_loc[col]
-        type_id, col_id = col_loc.type_id, col_loc.col_id
-        res_data = f'res_data_{i}'
+    func_lines = []
+    for i in range(n_lists):
         func_lines += [
-            f'  data_{i} = self._data[{type_id}][{col_id}][idx]',
-            f'  {res_data} = data_{i}'
+            f'  list_{i} = self._data[{i}].copy()',
+            f'  for i, item in enumerate(list_{i}):',
+            f'    list_{i}[i] = item[idx]'
         ]
-        results.append((col, res_data))
 
-    data = ', '.join(f'"{col}": {data}' for col, data in results)
-    func_lines += [f'  return pandas.DataFrame({{{data}}}, index=res_index)']
+    all_lists_joined = ', '.join([f'list_{i}' for i in range(n_lists)]) + ', '
+    res_data = f'({all_lists_joined})' if n_lists > 0 else '()'
+    func_lines += [
+        f'  if self_index_is_none == True:',
+        f'    old_index = pandas.RangeIndex(len(self))',
+        f'  else:',
+        f'    old_index = self._index',
+        f'  res_data = {res_data}',
+        f'  res_index = old_index[idx]',
+        f'  return init_dataframe_internal(res_data, res_index, df_type)'
+    ]
 
     return func_lines
 
-
 def df_getitem_tuple_idx_main_codelines(self, literal_idx):
     """Generate main code lines for df.getitem with idx of tuple"""
     results = []
@@ -1624,13 +1634,17 @@ def df_getitem_key_error_codelines():
 def df_getitem_slice_idx_codegen(self, idx):
     """
     Example of generated implementation with provided index:
-        def _df_getitem_slice_idx_impl(self, idx)
-          res_index = self._index
-          data_0 = self._data[0]
-          res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
-          data_1 = self._data [1]
-          res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
-          return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
+        def _df_getitem_slice_idx_impl(self, idx):
+          list_0 = self._data[0].copy()
+          for i, item in enumerate(list_0):
+            list_0[i] = item[idx]
+          if self_index_is_none == True:
+            old_index = pandas.RangeIndex(len(self))
+          else:
+            old_index = self._index
+          res_data = (list_0, )
+          res_index = old_index[idx]
+          return init_dataframe_internal(res_data, res_index, df_type)
     """
     func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
     if self.columns:
@@ -1639,7 +1653,17 @@ def _df_getitem_slice_idx_impl(self, idx)
         # raise KeyError if input DF is empty
         func_lines += df_getitem_key_error_codelines()
     func_text = '\n'.join(func_lines)
-    global_vars = {'pandas': pandas, 'numpy': numpy}
+
+    # TO-DO: need DefaultIndex to handle self.index[idx] construct inside func
+    self_index_is_none = isinstance(self.index, types.NoneType)
+    new_index_type = RangeIndexType(False) if self_index_is_none else self.index
+    df_type = DataFrameType(self.data, new_index_type, self.columns, column_loc=self.column_loc)
+
+    global_vars = {'pandas': pandas,
+                   'numpy': numpy,
+                   'df_type': df_type,
+                   'init_dataframe_internal': init_dataframe_internal,
+                   'self_index_is_none': self_index_is_none}
 
     return func_text, global_vars
 
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
index 56218e99c..65d10ef52 100644
--- a/sdc/hiframes/pd_dataframe_ext.py
+++ b/sdc/hiframes/pd_dataframe_ext.py
@@ -53,7 +53,6 @@ def generic_resolve(self, df, attr):
             return SeriesType(arr_typ.dtype, arr_typ, df.index, True)
 
 
-
 def get_structure_maps(col_types, col_names):
     # Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
     column_loc = {}
@@ -80,6 +79,31 @@ def get_structure_maps(col_types, col_names):
     return column_loc, data_typs_map, types_order
 
 
+@intrinsic
+def init_dataframe_internal(typingctx, data, index, df_type):
+
+    ret_type = df_type.instance_type
+
+    def codegen(context, builder, sig, args):
+        data_val, index_val = args[:2]
+
+        dataframe = cgutils.create_struct_proxy(
+            sig.return_type)(context, builder)
+        dataframe.data = data_val
+        dataframe.index = index_val
+        dataframe.parent = context.get_constant_null(types.pyobject)
+
+        # increase refcount of stored values
+        if context.enable_nrt:
+            context.nrt.incref(builder, sig.args[0], data_val)
+            context.nrt.incref(builder, sig.args[1], index_val)
+
+        return dataframe._getvalue()
+
+    sig = signature(ret_type, data, index, df_type)
+    return sig, codegen
+
+
 # TODO: alias analysis
 # this function should be used for getting df._data for alias analysis to work
 # no_cpython_wrapper since Array(DatetimeDate) cannot be boxed