Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 20 additions & 45 deletions sdc/datatypes/hpat_pandas_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,12 @@

from sdc.io.csv_ext import (
_gen_csv_reader_py_pyarrow_py_func,
_gen_pandas_read_csv_func_text,
_gen_csv_reader_py_pyarrow_func_text_dataframe,
)
from sdc.str_arr_ext import string_array_type

from sdc.hiframes import join, aggregate, sort
from sdc.types import CategoricalDtypeType, Categorical
from sdc.datatypes.categorical.pdimpl import _reconstruct_CategoricalDtype


def get_numba_array_types_for_csv(df):
Expand Down Expand Up @@ -256,69 +255,45 @@ def sdc_pandas_read_csv(
usecols = [col.literal_value for col in usecols]

if infer_from_params:
# dtype is a tuple of format ('A', A_dtype, 'B', B_dtype, ...)
# where column names should be constants and is important only for inference from params
# dtype should be constants and is important only for inference from params
if isinstance(dtype, types.Tuple):
assert all(isinstance(key, types.StringLiteral) for key in dtype[::2])
assert all(isinstance(key, types.Literal) for key in dtype[::2])
keys = (k.literal_value for k in dtype[::2])
values = dtype[1::2]

def _get_df_col_type(dtype):
    """Map a Numba type object describing one read_csv 'dtype' entry to the
    Numba array type of the resulting DataFrame column.

    :param dtype: Numba type of the dtype value passed by the user — a
        types.Function wrapping a python builtin (int/float/str), a
        StringLiteral naming a numpy dtype (or 'str'), a NumberClass,
        or a CategoricalDtypeType
    :return: the column's array type (types.Array, string_array_type,
        or Categorical)

    NOTE(review): falls through with an implicit None return if *dtype*
    matches none of the four cases below — presumably unreachable for
    inputs validated upstream; confirm against the caller.
    """
    # python builtins passed as dtype arrive wrapped as types.Function;
    # dispatch on the underlying builtin via typing_key
    if isinstance(dtype, types.Function):
        if dtype.typing_key == int:
            return types.Array(types.int_, 1, 'C')
        elif dtype.typing_key == float:
            return types.Array(types.float64, 1, 'C')
        elif dtype.typing_key == str:
            return string_array_type
        else:
            # any other builtin function is not a supported dtype
            assert False, f"map_dtype_to_col_type: failing to infer column type for dtype={dtype}"

    # string literals name a dtype, e.g. dtype={'A': 'int64'}
    if isinstance(dtype, types.StringLiteral):
        if dtype.literal_value == 'str':
            return string_array_type
        else:
            # delegate parsing of the dtype name to numpy, then lift to Numba
            return types.Array(numba.from_dtype(np.dtype(dtype.literal_value)), 1, 'C')

    # numpy scalar classes passed directly, e.g. dtype={'A': np.int64}
    if isinstance(dtype, types.NumberClass):
        return types.Array(dtype.dtype, 1, 'C')

    # pandas CategoricalDtype maps to SDC's Categorical array type
    if isinstance(dtype, CategoricalDtypeType):
        return Categorical(dtype)
values = dtype[1::2]
values = [v.typing_key if isinstance(v, types.Function) else v for v in values]
values = [types.Array(numba.from_dtype(np.dtype(v.literal_value)), 1, 'C')
if isinstance(v, types.Literal) else v for v in values]
values = [types.Array(types.int_, 1, 'C') if v == int else v for v in values]
values = [types.Array(types.float64, 1, 'C') if v == float else v for v in values]
values = [string_array_type if v == str else v for v in values]
values = [Categorical(v) if isinstance(v, CategoricalDtypeType) else v for v in values]

col_types_map = dict(zip(keys, map(_get_df_col_type, values)))
dtype = dict(zip(keys, values))

# in case of both are available
# inferencing from params has priority over inferencing from file
if infer_from_params:
col_names = names
# all names should be in dtype
col_names = usecols if usecols else names
col_types = [col_types_map[n] for n in col_names]
return_columns = usecols if usecols else names
col_typs = [dtype[n] for n in return_columns]

elif infer_from_file:
col_names, col_types = infer_column_names_and_types_from_constant_filename(
col_names, col_typs = infer_column_names_and_types_from_constant_filename(
filepath_or_buffer, delimiter, names, usecols, skiprows)

else:
return None

def _get_py_col_dtype(ctype):
    """Re-create a column's dtype as a python-level object suitable for
    use in a pandas.read_csv call.

    :param ctype: Numba array type of a DataFrame column
    :return: ``str`` for string columns, a reconstructed pandas
        CategoricalDtype for categorical columns, otherwise the numpy
        dtype corresponding to the array's element type
    """
    # String columns are read back as python 'str'.
    if ctype == string_array_type:
        return str
    # Categorical columns need their pandas CategoricalDtype rebuilt
    # from the Numba-level representation.
    if isinstance(ctype, Categorical):
        return _reconstruct_CategoricalDtype(ctype.pd_dtype)
    # Access .dtype only on this fallback path: the original read it
    # unconditionally up front, which needlessly required string and
    # categorical array types to expose an element dtype as well.
    return numpy_support.as_dtype(ctype.dtype)

py_col_dtypes = {cname: _get_py_col_dtype(ctype) for cname, ctype in zip(col_names, col_types)}
dtype_present = not isinstance(dtype, (types.Omitted, type(None)))

# generate function text with signature and returning DataFrame
func_text, func_name, global_vars = _gen_pandas_read_csv_func_text(
col_names, col_types, py_col_dtypes, usecols, signature)
func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_dataframe(
col_names, col_typs, dtype_present, usecols, signature)

# compile with Python
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars)
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name)

return csv_reader_py

Expand Down
7 changes: 6 additions & 1 deletion sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@


import operator
from typing import NamedTuple

import numba
from numba import types
Expand All @@ -38,7 +39,7 @@
from numba.core.imputils import impl_ret_new_ref, impl_ret_borrowed

from sdc.hiframes.pd_series_ext import SeriesType
from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc
from sdc.hiframes.pd_dataframe_type import DataFrameType
from sdc.str_ext import string_type


Expand All @@ -53,6 +54,10 @@ def generic_resolve(self, df, attr):
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)


class ColumnLoc(NamedTuple):
    """Location of a DataFrame column within the block-wise data layout.

    NOTE(review): presumably ``type_id`` is the index of the same-typed
    column block and ``col_id`` the position within that block — confirm
    against ``get_structure_maps``, which builds maps of the form
    ``{'A': (0, 0), 'B': (1, 0), 'C': (0, 1)}``.
    """
    # index of the column block (grouping of same-typed columns)
    type_id: int
    # position of the column inside that block
    col_id: int


def get_structure_maps(col_types, col_names):
# Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
Expand Down
23 changes: 1 addition & 22 deletions sdc/hiframes/pd_dataframe_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

import re
from typing import NamedTuple

import numba
from numba import types
Expand All @@ -50,7 +48,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, column
self.has_parent = has_parent
self.column_loc = column_loc
super(DataFrameType, self).__init__(
name="DataFrameType({}, {}, {}, {})".format(data, index, columns, has_parent))
name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent))

def copy(self, index=None, has_parent=None):
# XXX is copy necessary?
Expand Down Expand Up @@ -85,16 +83,6 @@ def unify(self, typingctx, other):
def is_precise(self):
return all(a.is_precise() for a in self.data) and self.index.is_precise()

def __repr__(self):
    """Return a repr of the DataFrame type in constructor-like form.

    Post-processes the default ``types.Type`` repr with two fixes, as
    the original comments below explain: array types are rewritten into
    ``Array(...)`` constructor syntax, and ``column_loc`` — which is not
    part of the type's name — is appended.
    """
    # To have correct repr of DataFrame we need some changes to what types.Type gives:
    # (1) e.g. array(int64, 1d, C) should be Array(int64, 1, 'C')
    # (2) ColumnLoc is not part of DataFrame name, so we need to add it
    default_repr = super(DataFrameType, self).__repr__()
    # rewrite the short-form array repr into constructor syntax
    res = re.sub(r'array\((\w+), 1d, C\)', r'Array(\1, 1, \'C\')', default_repr)
    # splice column_loc in before the closing parenthesis
    res = re.sub(r'\)$', f', column_loc={self.column_loc})', res)
    return res


@register_model(DataFrameType)
class DataFrameModel(models.StructModel):
Expand All @@ -116,15 +104,6 @@ def __init__(self, dmm, fe_type):
super(DataFrameModel, self).__init__(dmm, fe_type, members)


class ColumnLoc(NamedTuple):
type_id: int
col_id: int


# FIXME_Numba#3372: add into numba.types to allow returning from objmode
types.DataFrameType = DataFrameType
types.ColumnLoc = ColumnLoc

make_attribute_wrapper(DataFrameType, 'data', '_data')
make_attribute_wrapper(DataFrameType, 'index', '_index')
make_attribute_wrapper(DataFrameType, 'columns', '_columns')
Expand Down
Loading