Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 97 additions & 35 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
from sdc.datatypes.range_index_type import RangeIndexType

from sdc.hiframes.pd_dataframe_type import DataFrameType
from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps
from sdc.hiframes.pd_series_type import SeriesType

from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType,
Expand Down Expand Up @@ -1337,40 +1338,69 @@ def isna_overload(df):
return sdc_pandas_dataframe_isna_codegen(df, 'isna')


def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols):
def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_col_names):
"""
Example of generated implementation:
def sdc_pandas_dataframe_drop_impl(df, labels=None, axis=0, index=None, columns=None,
level=None, inplace=False, errors="raise"):
new_col_0_data_df = df._data[1][0]
new_col_1_data_df = df._data[0][1]
return pandas.DataFrame({"B": new_col_0_data_df, "C": new_col_1_data_df}, index=df.index)
level=None, inplace=False, errors="raise"):
list_0 = df._data[0].copy()
for col_id in old_scheme_drop_idxs_0[::-1]:
list_0.pop(col_id)
list_1 = df._data[1].copy()
new_data = (list_1, list_0, )
return init_dataframe_internal(new_data, df._index, df_type)
"""
indent = 4 * ' '
saved_df_columns = [column for column in df.columns if column not in drop_cols]
func_definition = [f'def sdc_pandas_dataframe_{func_name}_impl({", ".join(func_args)}):']
func_definition = [f'def {func_name}({", ".join(func_args)}):']
func_text = []
column_list = []

for label in drop_cols:
old_column_loc, old_data_typs_map, old_types_order = get_structure_maps(df.data, df.columns)

new_data_typs = tuple(t for i, t in enumerate(df.data) if df.columns[i] not in drop_col_names)
new_column_names = tuple(c for c in df.columns if c not in drop_col_names)
new_column_loc, new_data_typs_map, new_types_order = get_structure_maps(new_data_typs, new_column_names)

old_types_idxs_map = dict(zip(old_types_order, range(len(old_types_order))))
reorder_scheme = tuple(old_types_idxs_map[t] for t in new_types_order)
df_type = DataFrameType(new_data_typs, df.index, new_column_names, column_loc=new_column_loc)

old_scheme_drop_idxs = []
for i, k in enumerate(old_types_order):
a = [j for j, x in enumerate(old_data_typs_map[k][1]) if df.columns[x] in drop_col_names]
old_scheme_drop_idxs.append(tuple(a) or None)

for label in drop_col_names:
if label not in df.columns:
func_text.append(f'if errors == "raise":')
func_text.append(indent + f'raise ValueError("The label {label} is not found in the selected axis")')
break

for column_id, column_name in enumerate(saved_df_columns):
col_loc = df.column_loc[column_name]
type_id, col_id = col_loc.type_id, col_loc.col_id
func_text.append(f'new_col_{column_id}_data_df = df._data[{type_id}][{col_id}]')
column_list.append((f'new_col_{column_id}_data_df', column_name))

data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list)
index = 'df.index'
func_text.append(f"return pandas.DataFrame({{{data}}}, index={index})\n")
old_ntypes = len(old_types_order)
for type_id in range(old_ntypes):
func_text.append(f'list_{type_id} = df._data[{type_id}].copy()')
if old_scheme_drop_idxs[type_id]:
func_text.append(f'for col_id in old_scheme_drop_idxs_{type_id}[::-1]:')
func_text.append(indent + f'list_{type_id}.pop(col_id)')

# in new df the order of array lists (i.e. types_order) can be different, so
# making a new tuple of lists reorder as needed
new_ntypes = len(new_types_order)
data_lists_reordered = ', '.join(['list_' + str(reorder_scheme[i]) for i in range(new_ntypes)])
data_val = '(' + data_lists_reordered + ', )' if new_ntypes > 0 else '()'

data, index = 'new_data', 'df._index'
func_text.append(f'{data} = {data_val}')
func_text.append(f"return init_dataframe_internal({data}, {index}, df_type)\n")
func_definition.extend([indent + func_line for func_line in func_text])
func_def = '\n'.join(func_definition)

global_vars = {'pandas': pandas}
global_vars = {
'pandas': pandas,
'init_dataframe_internal': init_dataframe_internal,
'df_type': df_type
}

global_vars.update({f'old_scheme_drop_idxs_{i}': old_scheme_drop_idxs[i] for i in range(old_ntypes)})

return func_def, global_vars

Expand All @@ -1387,7 +1417,8 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
-----------
- Parameters ``labels``, ``axis``, ``index``, ``level`` and ``inplace`` are currently unsupported.
- Parameter ``columns`` is required and is expected to be a Literal value with one column name
or Tuple with columns names.
or List with columns names. Mutating a list of column names after it was defined and then using it as a
columns argument results in an SDCLimitation exception at runtime.
- Supported ``errors`` can be {``raise``, ``ignore``}, default ``raise``. If ``ignore``, suppress error and only
existing labels are dropped.

Expand Down Expand Up @@ -1420,36 +1451,66 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,

"""

_func_name = 'drop'
method_name = f'Method drop().'

ty_checker = TypeChecker(f'Method {_func_name}().')
ty_checker = TypeChecker(method_name)
ty_checker.check(df, DataFrameType)

if not isinstance(labels, types.Omitted) and labels is not None:
if not isinstance(labels, (types.Omitted, types.NoneType)) and labels is not None:
ty_checker.raise_exc(labels, 'None', 'labels')

if not isinstance(axis, (int, types.Omitted)):
if not isinstance(axis, (types.Omitted, types.Integer)) and axis != 0:
ty_checker.raise_exc(axis, 'int', 'axis')

if not isinstance(index, types.Omitted) and index is not None:
if not isinstance(index, (types.Omitted, types.NoneType)) and index is not None:
ty_checker.raise_exc(index, 'None', 'index')

if not isinstance(columns, (types.Omitted, types.Tuple, types.Literal)):
ty_checker.raise_exc(columns, 'str, tuple of str', 'columns')
if not (isinstance(columns, (types.Omitted, types.StringLiteral))
or (isinstance(columns, types.Tuple)
and all(isinstance(c, types.StringLiteral) for c in columns))
or (isinstance(columns, types.UniTuple) and isinstance(columns.dtype, types.StringLiteral))
or isinstance(columns, types.List) and isinstance(columns.dtype, types.UnicodeType)
):
ty_checker.raise_exc(columns, 'str, list of const str', 'columns')

if not isinstance(level, (types.Omitted, types.Literal)) and level is not None:
if not isinstance(level, (types.Omitted, types.NoneType, types.Literal)) and level is not None:
ty_checker.raise_exc(level, 'None', 'level')

if not isinstance(inplace, (bool, types.Omitted)) and inplace:
if not isinstance(inplace, (types.Omitted, types.NoneType, types.Boolean)) and inplace:
ty_checker.raise_exc(inplace, 'bool', 'inplace')

if not isinstance(errors, (str, types.Omitted, types.Literal)):
if not isinstance(errors, (types.Omitted, types.UnicodeType, types.StringLiteral)) and errors != "raise":
ty_checker.raise_exc(errors, 'str', 'errors')

if isinstance(columns, types.List):
if columns.initial_value is None:
raise TypingError('{} Unsupported use of parameter columns:'
' expected list of constant strings. Given: {}'.format(method_name, columns))
else:
# this works because global tuple of strings is captured as Tuple of StringLiterals
columns_as_tuple = tuple(columns.initial_value)
def _sdc_pandas_dataframe_drop_wrapper_impl(df, labels=None, axis=0, index=None,
columns=None, level=None, inplace=False, errors="raise"):

# if at runtime the columns list differs from its initial value (known at compile time)
# we cannot tell which columns to drop and what is the resulting DataFrameType, so raise exception
if list(columns_as_tuple) != columns:
raise SDCLimitation("Unsupported use of parameter columns: non-const list was used.")

return df.drop(labels=labels,
axis=axis,
index=index,
columns=columns_as_tuple,
level=level,
inplace=inplace,
errors=errors)

return _sdc_pandas_dataframe_drop_wrapper_impl

args = {'labels': None, 'axis': 0, 'index': None, 'columns': None, 'level': None, 'inplace': False,
'errors': f'"raise"'}

def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
def sdc_pandas_dataframe_drop_impl(df, args, columns):
func_args = ['df']
for key, value in args.items():
if key not in func_args:
Expand All @@ -1459,18 +1520,19 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):

if isinstance(columns, types.StringLiteral):
drop_cols = (columns.literal_value,)
elif isinstance(columns, types.Tuple):
elif isinstance(columns, (types.Tuple, types.UniTuple)):
drop_cols = tuple(column.literal_value for column in columns)
else:
raise ValueError('Only drop by one column or tuple of columns is currently supported in df.drop()')

func_def, global_vars = sdc_pandas_dataframe_drop_codegen(_func_name, func_args, df, drop_cols)
func_name = 'sdc_pandas_dataframe_drop_impl'
func_def, global_vars = sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols)
loc_vars = {}
exec(func_def, global_vars, loc_vars)
_drop_impl = loc_vars['sdc_pandas_dataframe_drop_impl']
_drop_impl = loc_vars[func_name]
return _drop_impl

return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)
return sdc_pandas_dataframe_drop_impl(df, args, columns)


def df_length_expr(self):
Expand Down
26 changes: 25 additions & 1 deletion sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def generic_resolve(self, df, attr):
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)



def get_structure_maps(col_types, col_names):
# Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
column_loc = {}
Expand All @@ -80,6 +79,31 @@ def get_structure_maps(col_types, col_names):
return column_loc, data_typs_map, types_order


@intrinsic
def init_dataframe_internal(typingctx, data, index, df_type):

ret_type = df_type.instance_type

def codegen(context, builder, sig, args):
data_val, index_val = args[:2]

dataframe = cgutils.create_struct_proxy(
sig.return_type)(context, builder)
dataframe.data = data_val
dataframe.index = index_val
dataframe.parent = context.get_constant_null(types.pyobject)

# increase refcount of stored values
if context.enable_nrt:
context.nrt.incref(builder, sig.args[0], data_val)
context.nrt.incref(builder, sig.args[1], index_val)

return dataframe._getvalue()

sig = signature(ret_type, data, index, df_type)
return sig, codegen


# TODO: alias analysis
# this function should be used for getting df._data for alias analysis to work
# no_cpython_wrapper since Array(DatetimeDate) cannot be boxed
Expand Down
Loading