Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 115 additions & 62 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1059,53 +1059,32 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)


def df_getitem_bool_series_idx_main_codelines(self, idx):
"""Generate main code lines for df.getitem"""
func_lines = [' self_length = len(get_dataframe_data(self, 0))',
' trimmed_idx_data = idx._data[:self_length]']

if isinstance(self.index, types.NoneType):
func_lines += [' self_index = numpy.arange(self_length)']
else:
func_lines += [' self_index = self._index']

results = []
for i, col in enumerate(self.columns):
res_data = f'res_data_{i}'
func_lines += [
f' data_{i} = get_dataframe_data(self, {i})',
f' series = pandas.Series(data_{i}, index=self_index, name="{col}")',
f' {res_data} = series[trimmed_idx_data]',
]
results.append((col, res_data))

data = ', '.join(f'"{col}": {data}' for col, data in results)
func_lines += [f' return pandas.DataFrame({{{data}}}, index=self_index[trimmed_idx_data])']
def df_length_codelines(self):
"""Generate code lines to get length of DF"""
if self.columns:
return [' length = len(get_dataframe_data(self, 0))']

return func_lines
return [' length = 0']


def df_index_codelines(self):
def df_index_codelines(self, with_length=False):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better for this function to generate only the index lines: the df_length_codelines call should be made not inside it, but at the call sites that currently pass with_length=True. This simplifies things, because at the call site you then see exactly that you generate the line for the length and then the line for the index.
Alternatively, we could use a format string and pass the name of the variable that will hold the df's length as a second argument, i.e.

def df_index_codelines(self, length):
...
        func_lines += [f'  res_index = numpy.arange({length})']

"""Generate code lines to get or create index of DF"""
func_lines = []
if isinstance(self.index, types.NoneType):
func_lines = [' length = len(get_dataframe_data(self, 0))',
' _index = numpy.arange(length)',
' res_index = _index']
if with_length:
func_lines += df_length_codelines(self)

func_lines += [' res_index = numpy.arange(length)']
else:
func_lines = [' res_index = self._index']
func_lines += [' res_index = self._index']

return func_lines


def df_getitem_key_error_codelines():
    """Return the generated-code lines that raise KeyError for a missing column."""
    key_error_line = ' raise KeyError("Column is not in the DataFrame")'
    return [key_error_line]


def df_getitem_slice_idx_main_codelines(self, idx):
"""Generate main code lines for df.getitem with idx of slice"""
results = []
func_lines = df_index_codelines(self)
func_lines = df_index_codelines(self, with_length=True)
for i, col in enumerate(self.columns):
res_data = f'res_data_{i}'
func_lines += [
Expand All @@ -1123,7 +1102,7 @@ def df_getitem_slice_idx_main_codelines(self, idx):
def df_getitem_tuple_idx_main_codelines(self, literal_idx):
"""Generate main code lines for df.getitem with idx of tuple"""
results = []
func_lines = df_index_codelines(self)
func_lines = df_index_codelines(self, with_length=True)
needed_cols = {col: i for i, col in enumerate(self.columns) if col in literal_idx}
for col, i in needed_cols.items():
res_data = f'res_data_{i}'
Expand All @@ -1139,33 +1118,53 @@ def df_getitem_tuple_idx_main_codelines(self, literal_idx):
return func_lines


def df_getitem_bool_series_codegen(self, idx):
"""
Example of generated implementation with provided index:
def _df_getitem_bool_series_idx_impl(self, idx):
self_length = len(get_dataframe_data(self, 0))
trimmed_idx_data = idx._data[:self_length]
self_index = self._index
data_0 = get_dataframe_data(self, 0)
series = pandas.Series(data_0, index=self_index, name="A")
res_data_0 = series[trimmed_idx_data]
data_1 = get_dataframe_data(self, 1)
series = pandas.Series(data_1, index=self_index, name="B")
res_data_1 = series[trimmed_idx_data]
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=self_index[trimmed_idx_data])
"""
func_lines = ['def _df_getitem_bool_series_idx_impl(self, idx):']
if self.columns:
func_lines += df_getitem_bool_series_idx_main_codelines(self, idx)
else:
# raise KeyError if input DF is empty
func_lines += df_getitem_key_error_codelines()
def df_getitem_bool_series_idx_main_codelines(self, idx):
"""Generate main code lines for df.getitem"""
func_lines = df_length_codelines(self)
func_lines += [' _idx_data = idx._data[:length]']
func_lines += df_index_codelines(self)

func_text = '\n'.join(func_lines)
global_vars = {'pandas': pandas, 'numpy': numpy,
'get_dataframe_data': get_dataframe_data}
results = []
for i, col in enumerate(self.columns):
res_data = f'res_data_{i}'
func_lines += [
f' data_{i} = get_dataframe_data(self, {i})',
f' series_{i} = pandas.Series(data_{i}, index=res_index, name="{col}")',
f' {res_data} = series_{i}[_idx_data]'
]
results.append((col, res_data))

data = ', '.join(f'"{col}": {data}' for col, data in results)
func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index[_idx_data])']

return func_lines

return func_text, global_vars

def df_getitem_bool_array_idx_main_codelines(self, idx):
    """Generate the body lines of df.getitem for a boolean-array idx.

    The emitted code verifies that idx has the same length as the DataFrame,
    appends the index lines produced by df_index_codelines, filters every
    column and the index with idx, and returns the filtered columns as a new
    DataFrame.

    Parameters
    ----------
    self
        DataFrame type being compiled. Only ``self.columns`` is read here;
        ``self`` is also forwarded to the length and index line helpers.
    idx
        Type of the indexer. It is not inspected here; the generated code
        refers to the runtime value through the name ``idx``.

    Returns
    -------
    list of str
        Source lines forming the body of the generated function.
    """
    func_lines = df_length_codelines(self)
    func_lines += [
        ' if length != len(idx):',
        # The raise must sit deeper than the `if` so that it forms the body
        # of the check; emitted at the same depth the generated function
        # would not compile.
        '     raise ValueError("Item wrong length.")',
    ]
    func_lines += df_index_codelines(self)

    results = []
    for i, col in enumerate(self.columns):
        res_data = f'res_data_{i}'
        func_lines += [
            f' data_{i} = get_dataframe_data(self, {i})',
            f' {res_data} = pandas.Series(data_{i}[idx], index=res_index[idx], name="{col}")',
        ]
        results.append((col, res_data))

    # Assemble the `"name": variable` pairs for the resulting DataFrame literal.
    data = ', '.join(f'"{col}": {data}' for col, data in results)
    func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index[idx])']

    return func_lines


def df_getitem_key_error_codelines():
    """Produce the code lines that make the generated function raise KeyError."""
    lines = []
    lines.append(' raise KeyError("Column is not in the DataFrame")')
    return lines


def df_getitem_slice_idx_codegen(self, idx):
Expand Down Expand Up @@ -1221,12 +1220,61 @@ def _df_getitem_tuple_idx_impl(self, idx)
return func_text, global_vars


def df_getitem_bool_series_idx_codegen(self, idx):
    """
    Build the source text and globals of df.getitem for a boolean Series idx.

    Returns a tuple ``(func_text, global_vars)``: the text of the generated
    function and the names that text needs in scope.

    Example of generated implementation with provided index:
    def _df_getitem_bool_series_idx_impl(self, idx):
      length = len(get_dataframe_data(self, 0))
      _idx_data = idx._data[:length]
      res_index = self._index
      data_0 = get_dataframe_data(self, 0)
      series_0 = pandas.Series(data_0, index=res_index, name="A")
      res_data_0 = series_0[_idx_data]
      data_1 = get_dataframe_data(self, 1)
      series_1 = pandas.Series(data_1, index=res_index, name="B")
      res_data_1 = series_1[_idx_data]
      return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[_idx_data])
    """
    header = 'def _df_getitem_bool_series_idx_impl(self, idx):'
    body = df_getitem_bool_series_idx_main_codelines(self, idx)
    func_text = '\n'.join([header] + body)

    # Names the generated text refers to.
    global_vars = {
        'pandas': pandas,
        'numpy': numpy,
        'get_dataframe_data': get_dataframe_data,
    }

    return func_text, global_vars


def df_getitem_bool_array_idx_codegen(self, idx):
    """
    Build the source text and globals of df.getitem for a boolean array idx.

    Returns a tuple ``(func_text, global_vars)``: the text of the generated
    function and the names that text needs in scope.

    Example of generated implementation for a DataFrame without an explicit
    index (the index is generated with numpy.arange):
    def _df_getitem_bool_array_idx_impl(self, idx):
      length = len(get_dataframe_data(self, 0))
      if length != len(idx):
        raise ValueError("Item wrong length.")
      res_index = numpy.arange(length)
      data_0 = get_dataframe_data(self, 0)
      res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
      data_1 = get_dataframe_data(self, 1)
      res_data_1 = pandas.Series(data_1[idx], index=res_index[idx], name="B")
      return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
    """
    header = 'def _df_getitem_bool_array_idx_impl(self, idx):'
    body = df_getitem_bool_array_idx_main_codelines(self, idx)
    func_text = '\n'.join([header] + body)

    # Names the generated text refers to.
    global_vars = {
        'pandas': pandas,
        'numpy': numpy,
        'get_dataframe_data': get_dataframe_data,
    }

    return func_text, global_vars


gen_df_getitem_slice_idx_impl = gen_df_impl_generator(
df_getitem_slice_idx_codegen, '_df_getitem_slice_idx_impl')
gen_df_getitem_tuple_idx_impl = gen_df_impl_generator(
df_getitem_tuple_idx_codegen, '_df_getitem_tuple_idx_impl')
gen_df_getitem_bool_series_idx_impl = gen_df_impl_generator(
df_getitem_bool_series_codegen, '_df_getitem_bool_series_idx_impl')
df_getitem_bool_series_idx_codegen, '_df_getitem_bool_series_idx_impl')
gen_df_getitem_bool_array_idx_impl = gen_df_impl_generator(
df_getitem_bool_array_idx_codegen, '_df_getitem_bool_array_idx_impl')


@sdc_overload(operator.getitem)
Expand Down Expand Up @@ -1285,7 +1333,12 @@ def _df_getitem_unicode_idx_impl(self, idx):

return gen_df_getitem_bool_series_idx_impl(self, idx)

ty_checker.raise_exc(idx, 'str', 'idx')
if isinstance(idx, types.Array) and isinstance(idx.dtype, types.Boolean):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would propose moving it to the top, right after we have checked that self is of DataFrameType, because this check should really intercept wrong types (for which we do not want compilation to happen). Otherwise we might compile (or at least try to compile) some specialization that we do not want to compile (e.g. if some of the argument types can be cast to another type).

return gen_df_getitem_bool_array_idx_impl(self, idx)

ty_checker = TypeChecker('Operator getitem().')
expected_types = 'str, tuple(str), slice, series(bool), array(bool)'
ty_checker.raise_exc(idx, expected_types, 'idx')


@sdc_overload_method(DataFrameType, 'pct_change')
Expand Down
25 changes: 24 additions & 1 deletion sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1283,6 +1283,15 @@ def test_impl(df, series):
sdc_func = self.jit(test_impl)
pd.testing.assert_frame_equal(sdc_func(df, s), test_impl(df, s))

def _test_df_getitem_bool_array_even_idx(self, df):
    # Helper: compare df[bool_array] under the SDC jit with plain pandas,
    # using a mask of the DataFrame's length that is True at odd positions.
    def test_impl(df, arr):
        return df[arr]

    mask = (np.arange(len(df)) % 2).astype(np.bool_)
    sdc_func = self.jit(test_impl)

    result = sdc_func(df, mask)
    expected = test_impl(df, mask)
    pd.testing.assert_frame_equal(result, expected)

@skip_sdc_jit('DF.getitem unsupported exceptions')
def test_df_getitem_str_literal_idx_exception_key_error(self):
def test_impl(df):
Expand All @@ -1307,7 +1316,7 @@ def test_impl(df, idx):
with self.assertRaises(KeyError):
sdc_func(df, 'ABC')

@skip_sdc_jit('DF.getitem unsupported Series name')
@skip_sdc_jit('DF.getitem unsupported exceptions')
def test_df_getitem_tuple_idx_exception_key_error(self):
sdc_func = self.jit(lambda df: df[('A', 'Z')])

Expand All @@ -1316,6 +1325,18 @@ def test_df_getitem_tuple_idx_exception_key_error(self):
with self.assertRaises(KeyError):
sdc_func(df)

@skip_sdc_jit('DF.getitem unsupported exceptions')
def test_df_getitem_bool_array_idx_exception_value_error(self):
    # df[bool_array] must raise ValueError when the mask is one element
    # longer than the DataFrame, for a populated and for an empty frame.
    sdc_func = self.jit(lambda df, arr: df[arr])

    frames = [gen_df(test_global_input_data_float64), pd.DataFrame()]
    for df in frames:
        too_long = len(df) + 1
        arr = np.array([i % 2 for i in range(too_long)], dtype=np.bool_)
        with self.subTest(df=df, arr=arr):
            with self.assertRaises(ValueError) as raises:
                sdc_func(df, arr)
            message = str(raises.exception)
            self.assertIn('Item wrong length', message)


@skip_sdc_jit('DF.getitem unsupported Series name')
def test_df_getitem_idx(self):
dfs = [gen_df(test_global_input_data_float64),
Expand All @@ -1336,6 +1357,7 @@ def test_df_getitem_idx_no_index(self):
for df in dfs:
with self.subTest(df=df):
self._test_df_getitem_bool_series_even_idx(df)
self._test_df_getitem_bool_array_even_idx(df)

@skip_sdc_jit('DF.getitem unsupported Series name')
def test_df_getitem_idx_multiple_types(self):
Expand All @@ -1351,6 +1373,7 @@ def test_df_getitem_idx_multiple_types(self):
self._test_df_getitem_unbox_slice_idx(df, 1, 3)
self._test_df_getitem_tuple_idx(df)
self._test_df_getitem_bool_series_even_idx(df)
self._test_df_getitem_bool_array_even_idx(df)

@unittest.skip('DF.getitem df[bool_series] unsupported index')
def test_df_getitem_bool_series_even_idx_with_index(self):
Expand Down