Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 43 additions & 19 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
from sdc.datatypes.range_index_type import RangeIndexType

from sdc.hiframes.pd_dataframe_type import DataFrameType
from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps
from sdc.hiframes.pd_series_type import SeriesType

from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType,
Expand Down Expand Up @@ -1492,24 +1493,33 @@ def df_index_expr(self, length_expr=None):

def df_getitem_slice_idx_main_codelines(self, idx):
    """Generate main code lines for df.getitem with idx of slice.

    The returned lines form the body of the generated
    ``_df_getitem_slice_idx_impl(self, idx)`` function. For every entry of
    the types order reported by ``get_structure_maps`` the generated code
    copies ``self._data[<n>]`` and replaces each item of the copy with
    ``item[idx]``. It then slices the index (a ``pandas.RangeIndex`` over
    ``len(self)`` when ``self_index_is_none`` is true) and builds the result
    with ``init_dataframe_internal``.

    The generated code refers to ``pandas``, ``self_index_is_none``,
    ``df_type`` and ``init_dataframe_internal``; the caller must supply them
    as globals when the text is compiled.

    :param self: dataframe type being indexed; ``self.data`` and
        ``self.columns`` are read here.
    :param idx: slice index type; unused at generation time, it only appears
        by name inside the emitted code.
    :return: list of source lines (without the ``def`` header).
    """
    # Third element of the tuple returned by get_structure_maps: types order.
    types_order = get_structure_maps(self.data, self.columns)[2]
    n_lists = len(types_order)

    func_lines = []
    for i in range(n_lists):
        # Nested statements in the emitted code need a deeper indent than
        # their header line, otherwise the generated function does not parse.
        func_lines += [
            f'  list_{i} = self._data[{i}].copy()',
            f'  for i, item in enumerate(list_{i}):',
            f'    list_{i}[i] = item[idx]',
        ]

    # Trailing ', ' keeps a single-element result a tuple: "(list_0, )".
    all_lists_joined = ', '.join(f'list_{i}' for i in range(n_lists)) + ', '
    res_data = f'({all_lists_joined})' if n_lists > 0 else '()'
    func_lines += [
        '  if self_index_is_none == True:',
        '    old_index = pandas.RangeIndex(len(self))',
        '  else:',
        '    old_index = self._index',
        f'  res_data = {res_data}',
        '  res_index = old_index[idx]',
        '  return init_dataframe_internal(res_data, res_index, df_type)',
    ]

    return func_lines


def df_getitem_tuple_idx_main_codelines(self, literal_idx):
"""Generate main code lines for df.getitem with idx of tuple"""
results = []
Expand Down Expand Up @@ -1624,13 +1634,17 @@ def df_getitem_key_error_codelines():
def df_getitem_slice_idx_codegen(self, idx):
"""
Example of generated implementation with provided index:
def _df_getitem_slice_idx_impl(self, idx)
res_index = self._index
data_0 = self._data[0]
res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
data_1 = self._data [1]
res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
def _df_getitem_slice_idx_impl(self, idx):
list_0 = self._data[0].copy()
for i, item in enumerate(list_0):
list_0[i] = item[idx]
if self_index_is_none == True:
old_index = pandas.RangeIndex(len(self))
else:
old_index = self._index
res_data = (list_0, )
res_index = old_index[idx]
return init_dataframe_internal(res_data, res_index, df_type)
"""
func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
if self.columns:
Expand All @@ -1639,7 +1653,17 @@ def _df_getitem_slice_idx_impl(self, idx)
# raise KeyError if input DF is empty
func_lines += df_getitem_key_error_codelines()
func_text = '\n'.join(func_lines)
global_vars = {'pandas': pandas, 'numpy': numpy}

# TO-DO: need DefaultIndex to handle self.index[idx] construct inside func
self_index_is_none = isinstance(self.index, types.NoneType)
new_index_type = RangeIndexType(False) if self_index_is_none else self.index
df_type = DataFrameType(self.data, new_index_type, self.columns, column_loc=self.column_loc)

global_vars = {'pandas': pandas,
'numpy': numpy,
'df_type': df_type,
'init_dataframe_internal': init_dataframe_internal,
'self_index_is_none': self_index_is_none}

return func_text, global_vars

Expand Down
26 changes: 25 additions & 1 deletion sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def generic_resolve(self, df, attr):
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)



def get_structure_maps(col_types, col_names):
# Map each column name to its column location, e.g. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
column_loc = {}
Expand All @@ -80,6 +79,31 @@ def get_structure_maps(col_types, col_names):
return column_loc, data_typs_map, types_order


@intrinsic
def init_dataframe_internal(typingctx, data, index, df_type):
    """Build a dataframe value directly from its data and index.

    The return type is taken from ``df_type.instance_type``; ``df_type`` is
    presumably a type reference to the dataframe type (confirm against
    callers). Only ``data`` and ``index`` contribute runtime values; the
    third argument is used for typing only.

    :param typingctx: typing context passed in by the intrinsic machinery.
    :param data: type of the value stored in the dataframe's ``data`` field.
    :param index: type of the value stored in the dataframe's ``index`` field.
    :param df_type: object whose ``instance_type`` is the resulting type.
    :return: ``(signature, codegen)`` pair.
    """
    result_type = df_type.instance_type
    intrinsic_sig = signature(result_type, data, index, df_type)

    def codegen(context, builder, sig, args):
        # args[2] corresponds to df_type and is not stored in the struct.
        data_value = args[0]
        index_value = args[1]

        proxy_cls = cgutils.create_struct_proxy(sig.return_type)
        df_struct = proxy_cls(context, builder)
        df_struct.data = data_value
        df_struct.index = index_value
        # The parent field is set to a null pyobject constant.
        df_struct.parent = context.get_constant_null(types.pyobject)

        # The struct now holds the stored values, so bump their refcounts
        # when NRT is enabled.
        if context.enable_nrt:
            context.nrt.incref(builder, sig.args[0], data_value)
            context.nrt.incref(builder, sig.args[1], index_value)

        return df_struct._getvalue()

    return intrinsic_sig, codegen


# TODO: alias analysis
# this function should be used for getting df._data for alias analysis to work
# no_cpython_wrapper since Array(DatetimeDate) cannot be boxed
Expand Down