Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 20 additions & 45 deletions sdc/datatypes/hpat_pandas_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,12 @@

from sdc.io.csv_ext import (
_gen_csv_reader_py_pyarrow_py_func,
_gen_pandas_read_csv_func_text,
_gen_csv_reader_py_pyarrow_func_text_dataframe,
)
from sdc.str_arr_ext import string_array_type

from sdc.hiframes import join, aggregate, sort
from sdc.types import CategoricalDtypeType, Categorical
from sdc.datatypes.categorical.pdimpl import _reconstruct_CategoricalDtype


def get_numba_array_types_for_csv(df):
Expand Down Expand Up @@ -256,69 +255,45 @@ def sdc_pandas_read_csv(
usecols = [col.literal_value for col in usecols]

if infer_from_params:
# dtype is a tuple of format ('A', A_dtype, 'B', B_dtype, ...)
# where column names should be constants and is important only for inference from params
# dtype should be constants and is important only for inference from params
if isinstance(dtype, types.Tuple):
assert all(isinstance(key, types.StringLiteral) for key in dtype[::2])
assert all(isinstance(key, types.Literal) for key in dtype[::2])
keys = (k.literal_value for k in dtype[::2])
values = dtype[1::2]

def _get_df_col_type(dtype):
    """Map a Numba type object describing one read_csv 'dtype' entry to the
    Numba array type of the resulting DataFrame column.

    :param dtype: Numba type of the dtype value passed by the user — a
        types.Function wrapping a python builtin (int/float/str), a
        StringLiteral naming a numpy dtype (or 'str'), a NumberClass,
        or a CategoricalDtypeType
    :return: the column's array type (types.Array, string_array_type,
        or Categorical)

    NOTE(review): falls through with an implicit None return if *dtype*
    matches none of the four cases below — presumably unreachable for
    inputs validated upstream; confirm against the caller.
    """
    # python builtins passed as dtype arrive wrapped as types.Function;
    # dispatch on the underlying builtin via typing_key
    if isinstance(dtype, types.Function):
        if dtype.typing_key == int:
            return types.Array(types.int_, 1, 'C')
        elif dtype.typing_key == float:
            return types.Array(types.float64, 1, 'C')
        elif dtype.typing_key == str:
            return string_array_type
        else:
            # any other builtin function is not a supported dtype
            assert False, f"map_dtype_to_col_type: failing to infer column type for dtype={dtype}"

    # string literals name a dtype, e.g. dtype={'A': 'int64'}
    if isinstance(dtype, types.StringLiteral):
        if dtype.literal_value == 'str':
            return string_array_type
        else:
            # delegate parsing of the dtype name to numpy, then lift to Numba
            return types.Array(numba.from_dtype(np.dtype(dtype.literal_value)), 1, 'C')

    # numpy scalar classes passed directly, e.g. dtype={'A': np.int64}
    if isinstance(dtype, types.NumberClass):
        return types.Array(dtype.dtype, 1, 'C')

    # pandas CategoricalDtype maps to SDC's Categorical array type
    if isinstance(dtype, CategoricalDtypeType):
        return Categorical(dtype)
values = dtype[1::2]
values = [v.typing_key if isinstance(v, types.Function) else v for v in values]
values = [types.Array(numba.from_dtype(np.dtype(v.literal_value)), 1, 'C')
if isinstance(v, types.Literal) else v for v in values]
values = [types.Array(types.int_, 1, 'C') if v == int else v for v in values]
values = [types.Array(types.float64, 1, 'C') if v == float else v for v in values]
values = [string_array_type if v == str else v for v in values]
values = [Categorical(v) if isinstance(v, CategoricalDtypeType) else v for v in values]

col_types_map = dict(zip(keys, map(_get_df_col_type, values)))
dtype = dict(zip(keys, values))

# in case of both are available
# inferencing from params has priority over inferencing from file
if infer_from_params:
col_names = names
# all names should be in dtype
col_names = usecols if usecols else names
col_types = [col_types_map[n] for n in col_names]
return_columns = usecols if usecols else names
col_typs = [dtype[n] for n in return_columns]

elif infer_from_file:
col_names, col_types = infer_column_names_and_types_from_constant_filename(
col_names, col_typs = infer_column_names_and_types_from_constant_filename(
filepath_or_buffer, delimiter, names, usecols, skiprows)

else:
return None

def _get_py_col_dtype(ctype):
    """Re-create a column's dtype as a python-level object suitable for
    use in a pandas.read_csv call.

    :param ctype: Numba array type of a DataFrame column
    :return: ``str`` for string columns, a reconstructed pandas
        CategoricalDtype for categorical columns, otherwise the numpy
        dtype corresponding to the array's element type
    """
    # String columns are read back as python 'str'.
    if ctype == string_array_type:
        return str
    # Categorical columns need their pandas CategoricalDtype rebuilt
    # from the Numba-level representation.
    if isinstance(ctype, Categorical):
        return _reconstruct_CategoricalDtype(ctype.pd_dtype)
    # Access .dtype only on this fallback path: the original read it
    # unconditionally up front, which needlessly required string and
    # categorical array types to expose an element dtype as well.
    return numpy_support.as_dtype(ctype.dtype)

py_col_dtypes = {cname: _get_py_col_dtype(ctype) for cname, ctype in zip(col_names, col_types)}
dtype_present = not isinstance(dtype, (types.Omitted, type(None)))

# generate function text with signature and returning DataFrame
func_text, func_name, global_vars = _gen_pandas_read_csv_func_text(
col_names, col_types, py_col_dtypes, usecols, signature)
func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_dataframe(
col_names, col_typs, dtype_present, usecols, signature)

# compile with Python
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars)
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name)

return csv_reader_py

Expand Down
7 changes: 6 additions & 1 deletion sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@


import operator
from typing import NamedTuple

import numba
from numba import types
Expand All @@ -38,7 +39,7 @@
from numba.core.imputils import impl_ret_new_ref, impl_ret_borrowed

from sdc.hiframes.pd_series_ext import SeriesType
from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc
from sdc.hiframes.pd_dataframe_type import DataFrameType
from sdc.str_ext import string_type


Expand All @@ -53,6 +54,10 @@ def generic_resolve(self, df, attr):
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)


class ColumnLoc(NamedTuple):
    """Location of a DataFrame column within the block-wise data layout.

    NOTE(review): presumably ``type_id`` is the index of the same-typed
    column block and ``col_id`` the position within that block — confirm
    against ``get_structure_maps``, which builds maps of the form
    ``{'A': (0, 0), 'B': (1, 0), 'C': (0, 1)}``.
    """
    # index of the column block (grouping of same-typed columns)
    type_id: int
    # position of the column inside that block
    col_id: int


def get_structure_maps(col_types, col_names):
# Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
Expand Down
23 changes: 1 addition & 22 deletions sdc/hiframes/pd_dataframe_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************

import re
from typing import NamedTuple

import numba
from numba import types
Expand All @@ -50,7 +48,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, column
self.has_parent = has_parent
self.column_loc = column_loc
super(DataFrameType, self).__init__(
name="DataFrameType({}, {}, {}, {})".format(data, index, columns, has_parent))
name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent))

def copy(self, index=None, has_parent=None):
# XXX is copy necessary?
Expand Down Expand Up @@ -85,16 +83,6 @@ def unify(self, typingctx, other):
def is_precise(self):
return all(a.is_precise() for a in self.data) and self.index.is_precise()

def __repr__(self):
    """Return a repr of the DataFrame type in constructor-like form.

    Post-processes the default ``types.Type`` repr with two fixes, as
    the original comments below explain: array types are rewritten into
    ``Array(...)`` constructor syntax, and ``column_loc`` — which is not
    part of the type's name — is appended.
    """
    # To have correct repr of DataFrame we need some changes to what types.Type gives:
    # (1) e.g. array(int64, 1d, C) should be Array(int64, 1, 'C')
    # (2) ColumnLoc is not part of DataFrame name, so we need to add it
    default_repr = super(DataFrameType, self).__repr__()
    # rewrite the short-form array repr into constructor syntax
    res = re.sub(r'array\((\w+), 1d, C\)', r'Array(\1, 1, \'C\')', default_repr)
    # splice column_loc in before the closing parenthesis
    res = re.sub(r'\)$', f', column_loc={self.column_loc})', res)
    return res


@register_model(DataFrameType)
class DataFrameModel(models.StructModel):
Expand All @@ -116,15 +104,6 @@ def __init__(self, dmm, fe_type):
super(DataFrameModel, self).__init__(dmm, fe_type, members)


class ColumnLoc(NamedTuple):
type_id: int
col_id: int


# FIXME_Numba#3372: add into numba.types to allow returning from objmode
types.DataFrameType = DataFrameType
types.ColumnLoc = ColumnLoc

make_attribute_wrapper(DataFrameType, 'data', '_data')
make_attribute_wrapper(DataFrameType, 'index', '_index')
make_attribute_wrapper(DataFrameType, 'columns', '_columns')
Expand Down
Loading