Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions sdc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,8 @@
'''
Default value for a pointer intended to use as Numba.DefaultPassBuilder.define_nopython_pipeline() in overloaded function
'''

# Module-level switch read once, at import time, from the SDC_CONFIG_USE_DEFAULT_DATAFRAME
# environment variable; when the variable is unset the string 'True' is parsed instead.
# NOTE(review): the description string below talks about selecting a compiler pipeline, while
# the visible consumer of this flag is an `if not sdc.config.use_default_dataframe` guard
# around DataFrame method definitions -- confirm the intent and reword the description.
use_default_dataframe = distutils_util.strtobool(os.getenv('SDC_CONFIG_USE_DEFAULT_DATAFRAME', 'True'))
'''
Default value used to select compiler pipeline in a function decorator
'''
141 changes: 84 additions & 57 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,65 +31,92 @@

import operator
import pandas
import numpy
import numba

import sdc
from sdc.datatypes.hpat_pandas_series_functions import TypeChecker

from numba import types
from numba.extending import (overload, overload_method, overload_attribute)
from sdc.hiframes.pd_dataframe_ext import DataFrameType
from sdc.hiframes.pd_series_ext import SeriesType
from numba.errors import TypingError

from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType
from sdc.utils import sdc_overload_method


@sdc_overload_method(DataFrameType, 'count')
def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False):
    """
    Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation.

    .. only:: developer

        Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count

    Parameters
    -----------
    self: :class:`pandas.DataFrame`
        input arg
    axis:
        *unsupported* (only the default ``0`` is accepted)
    level:
        *unsupported* (only the default ``None`` is accepted)
    numeric_only:
        *unsupported* (only the default ``False`` is accepted)

    Returns
    -------
    :obj:`pandas.Series`
        returns: For each column the number of non-NA/null entries, indexed by the column name.

    Raises
    ------
    TypingError
        If ``self`` is not a ``DataFrameType`` or if ``axis``, ``level`` or ``numeric_only``
        is given a value other than its default.
    """

    _func_name = 'Method pandas.dataframe.count().'

    if not isinstance(self, DataFrameType):
        raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self))

    if not (isinstance(axis, types.Omitted) or axis == 0):
        raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis))

    if not (isinstance(level, types.Omitted) or level is None):
        # Echo the rejected 'level' value itself so the message points at the real culprit.
        raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, level))

    if not (isinstance(numeric_only, types.Omitted) or numeric_only is False):
        # Echo the rejected 'numeric_only' value itself, not an unrelated argument.
        raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, numeric_only))

    def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False):
        # Parallel lists: one count and one label per item stored in the dataframe.
        result_data = []
        result_index = []

        for dataframe_item in self._data:
            item_count = dataframe_item.count()
            item_name = dataframe_item._name
            result_data.append(item_count)
            result_index.append(item_name)

        return pandas.Series(data=result_data, index=result_index)

    return sdc_pandas_dataframe_count_impl
if not sdc.config.use_default_dataframe:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this variable is needed here.
You can just delete the implementation of the count method (or keep it commented out if you need it).

from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType

else:
def sdc_pandas_dataframe_reduce_columns_series(df, name, params):
saved_columns = df.columns
n_cols = len(saved_columns)
data_args = tuple('data{}'.format(i) for i in range(n_cols))
all_params = ['df'] + [f'{key}={value}' for key, value in params]
func_definition = 'def _reduce_impl({}):'.format(', '.join(all_params))

func_lines = [func_definition]
for i, d in enumerate(data_args):
line = ' {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))'
func_lines.append(line.format(d + '_S', i))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use f-string or formatted string instead of concatenation.

func_lines.append(' {}_O = {}_S.{}({})'.format(d, d, name, ", ".join(
key for key, _ in params)))
func_lines.append(" return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, None, {})\n".format(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you really need the \n at the end?

", ".join(d + '_O._data' for d in data_args),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You forgot to use f-string here.

", ".join(f"'{c}'" for c in saved_columns)))

loc_vars = {}
func_text = '\n'.join(func_lines)
exec(func_text, {'sdc': sdc, 'np': numpy}, loc_vars)
_reduce_impl = loc_vars['_reduce_impl']

return _reduce_impl

def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it a common function for checking all parameters? How is it related to df.head()?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is check_type used somewhere?

ty_checker = TypeChecker('Method {}().'.format(name))
ty_checker.check(df, DataFrameType)

if not (isinstance(axis, types.Omitted) or axis is None):
ty_checker.raise_exc(axis, 'unsupported', 'axis')

if not (isinstance(skipna, (types.Omitted, types.NoneType, types.Boolean)) or skipna is None):
ty_checker.raise_exc(skipna, 'bool', 'skipna')

if not (isinstance(level, types.Omitted) or level is None):
ty_checker.raise_exc(level, 'unsupported', 'level')

if not (isinstance(numeric_only, types.Omitted) or numeric_only is None):
ty_checker.raise_exc(numeric_only, 'unsupported', 'numeric_only')

if not (isinstance(ddof, types.Omitted) or ddof == 1):
ty_checker.raise_exc(ddof, 'unsupported', 'ddof')

if not (isinstance(min_count, types.Omitted) or min_count == 0):
ty_checker.raise_exc(min_count, 'unsupported', 'min_count')

@overload_method(DataFrameType, 'head')
def head_overload(df, n=5):
    """
    Pandas DataFrame method :meth:`pandas.DataFrame.head` implementation.

    .. only:: developer

        Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_head1

    Parameters
    -----------
    df: :class:`pandas.DataFrame`
        input arg
    n: :obj:`int`, default 5
        input arg, default 5

    Returns
    -------
    :obj:`pandas.DataFrame`
        returns: The first n rows of the caller object.
    """

    method_name = 'head'

    checker = TypeChecker(f'Method {method_name}().')
    checker.check(df, DataFrameType)

    # 'n' may be left out, typed as an integer, or be the literal default 5.
    n_is_supported = isinstance(n, (types.Omitted, types.Integer)) or n == 5
    if not n_is_supported:
        checker.raise_exc(n, 'int64', 'n')

    # The per-column reducer generates the implementation: Series.head is applied to every
    # column and the results are reassembled into a dataframe.
    default_params = [('n', 5)]
    return sdc_pandas_dataframe_reduce_columns_series(df, method_name, default_params)
9 changes: 5 additions & 4 deletions sdc/datatypes/hpat_pandas_series_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3749,17 +3749,18 @@ def hpat_pandas_series_fillna(self, value=None, method=None, axis=None, inplace=
if not isinstance(self, SeriesType):
raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self))

if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType, types.Omitted)) or axis is None):
if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType,
types.Omitted, types.NoneType)) or axis is None):
raise TypingError('{} The axis must be an Integer or String. Given: {}'.format(_func_name, axis))

if not (isinstance(inplace, types.Literal) and isinstance(inplace, types.Boolean)
or isinstance(inplace, types.Omitted)
or inplace is False):
raise TypingError('{} The inplace must be a literal Boolean constant. Given: {}'.format(_func_name, inplace))

if not ((method is None or isinstance(method, types.Omitted))
and (limit is None or isinstance(limit, types.Omitted))
and (downcast is None or isinstance(downcast, types.Omitted))
if not ((method is None or isinstance(method, (types.Omitted, types.NoneType)))
and (limit is None or isinstance(limit, (types.Omitted, types.NoneType)))
and (downcast is None or isinstance(downcast, (types.Omitted, types.NoneType)))
):
raise TypingError('{} Unsupported parameters. Given method: {}, limit: {}, downcast: {}'.format(
_func_name, method, limit, downcast))
Expand Down
2 changes: 2 additions & 0 deletions sdc/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -1628,3 +1628,5 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None,
date_format, doublequote, escapechar, decimal)

return _impl

from sdc.datatypes.hpat_pandas_dataframe_functions import *
51 changes: 49 additions & 2 deletions sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
skip_numba_jit, skip_sdc_jit)

from sdc.tests.gen_test_data import ParquetGenerator
from sdc.tests.test_utils import (min_float64, max_float64, test_global_input_data_float64,
test_global_input_data_unicode_kind4, test_datatime,
min_int64, max_int64, test_global_input_data_int64)
from numba.config import IS_32BITS


Expand Down Expand Up @@ -123,6 +126,7 @@ def test_impl(df):
hpat_func = self.jit(test_impl)
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))

@unittest.skip('returned NULL without setting an error')
def test_box1(self):
def test_impl(n):
df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
Expand Down Expand Up @@ -889,7 +893,7 @@ def test_impl(n):

def test_df_fillna1(self):
def test_impl(df):
return df.fillna(5.0)
return df.fillna(0.)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the reason to change it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It relates to another PR; I forgot to delete this.


df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0]})
hpat_func = self.jit(test_impl)
Expand Down Expand Up @@ -1134,7 +1138,6 @@ def test_impl():
@unittest.skip("Implement iterrows for DataFrame")
def test_dataframe_iterrows(self):
def test_impl(df):
print(df.iterrows())
return [row for _, row in df.iterrows()]

df = pd.DataFrame({'A': [1, 2, 3], 'B': [0.2, 0.5, 0.001], 'C': ['a', 'bb', 'ccc']})
Expand All @@ -1151,6 +1154,50 @@ def test_impl(n):
hpat_func = self.jit(test_impl)
pd.testing.assert_series_equal(hpat_func(n), test_impl(n))

def test_dataframe_head(self):
    """DataFrame.head() with the default n must match pandas on a mixed-dtype frame."""
    def test_impl(df):
        return df.head()

    jitted = sdc.jit(test_impl)

    # One column per dtype family: float, datetime, integer and string (five rows each).
    columns = {
        "FLOAT": test_global_input_data_float64[0][:5],
        "DATATIME": test_datatime,
        "INT": test_global_input_data_int64[:5],
        "STRING": ['a', 'dd', 'c', '12', 'ddf'],
    }
    frame = pd.DataFrame(columns)

    actual = jitted(frame)
    expected = test_impl(frame)
    pd.testing.assert_frame_equal(actual, expected)

def test_dataframe_head1(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if the index is set explicitly: does this solution handle that case? I don't see any tests for it.

def test_impl(df, n):
return df.head(n)
sdc_func = sdc.jit(test_impl)
df = pd.DataFrame({"FLOAT": test_global_input_data_float64[0][:5],
"DATATIME": test_datatime,
"INT": test_global_input_data_int64[:5],
"STRING": ['a', 'dd', 'c', '12', 'ddf']})
for n in [-1, 0, 2, 5]:
pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))

@unittest.skip('Dataframe.index not support')
def test_dataframe_head1_index(self):
    """DataFrame.head(n) on a frame with an explicit, non-default index (currently skipped)."""
    def test_impl(df, n):
        return df.head(n)

    jitted = sdc.jit(test_impl)

    columns = {
        "FLOAT": test_global_input_data_float64[0][:5],
        "DATATIME": test_datatime,
        "INT": test_global_input_data_int64[:5],
        "STRING": ['a', 'dd', 'c', '12', 'ddf'],
    }
    row_labels = [32, 3, 6, 17, 23]
    frame = pd.DataFrame(columns, index=row_labels)

    # Negative, zero, partial and full-length row counts.
    for row_count in (-1, 0, 2, 5):
        actual = jitted(frame, row_count)
        expected = test_impl(frame, row_count)
        pd.testing.assert_frame_equal(actual, expected)

def test_dataframe_head2(self):
    """DataFrame.head(n) must match pandas on an all-integer frame of six rows."""
    def test_impl(df, n):
        return df.head(n)

    jitted = sdc.jit(test_impl)

    columns = {
        "A": [12, 4, 5, 1, 6, 8],
        "B": [5, 2, 54, 3, 6, 4],
        "C": [20, 16, 3, 8, 2, 3],
        "D": [14, 3, 2, 6, 4, 5],
    }
    frame = pd.DataFrame(columns)

    # Negative, zero and two partial row counts.
    for row_count in (-1, 0, 2, 5):
        actual = jitted(frame, row_count)
        expected = test_impl(frame, row_count)
        pd.testing.assert_frame_equal(actual, expected)

if __name__ == "__main__":
unittest.main()
8 changes: 8 additions & 0 deletions sdc/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,14 @@
'¡Y tú quién te crees?',
'🐍⚡',
'大处 着眼,c小处着手c。大大c大处',
'c小处着手c。',
'te crees?'
]

min_float64 = np.finfo('float64').min
max_float64 = np.finfo('float64').max
min_int64 = np.iinfo(np.int64).min
max_int64 = np.iinfo(np.int64).max

test_global_input_data_float64 = [
[1., -1., 0.1, min_float64, max_float64, max_float64, min_float64, -0.1],
Expand All @@ -49,6 +53,10 @@
[np.nan, np.inf, np.inf, np.nan, np.nan, np.nan, np.NINF, np.NZERO],
]

test_datatime = np.array(['2007-07-13', '2006-01-13', '2010-08-13',
'2005-02-27', '2005-02-28'], dtype='datetime64')

test_global_input_data_int64 = [min_int64, 0, 1, -23, max_int64, min_int64, max_int64]

def count_array_REPs():
if sdc.config.config_pipeline_hpat_default:
Expand Down