IntelPython · 1e-to · Nov 29, 2019 · Dec 3, 2019 · Dec 3, 2019 · Dec 3, 2019
diff --git a/sdc/config.py b/sdc/config.py
@@ -72,3 +72,8 @@
 '''
 Default value for a pointer intended to use as Numba.DefaultPassBuilder.define_nopython_pipeline() in overloaded function
 '''
+
+use_default_dataframe = distutils_util.strtobool(os.getenv('SDC_CONFIG_USE_DEFAULT_DATAFRAME', 'True'))
+'''
+Default value used to select compiler pipeline in a function decorator
+'''
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -31,65 +31,92 @@
 
 import operator
 import pandas
+import numpy
+import numba
+
+import sdc
+from sdc.datatypes.hpat_pandas_series_functions import TypeChecker
 
 from numba import types
 from numba.extending import (overload, overload_method, overload_attribute)
+from sdc.hiframes.pd_dataframe_ext import DataFrameType
+from sdc.hiframes.pd_series_ext import SeriesType
 from numba.errors import TypingError
 
-from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType
-from sdc.utils import sdc_overload_method
-
-
-@sdc_overload_method(DataFrameType, 'count')
-def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False):
-    """
-    Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation.
-
-    .. only:: developer
-
-        Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count
-
-    Parameters
-    -----------
-    self: :class:`pandas.DataFrame`
-        input arg
-    axis:
-        *unsupported*
-    level:
-        *unsupported*
-    numeric_only:
-        *unsupported*
-
-    Returns
-    -------
-    :obj:`pandas.Series` or `pandas.DataFrame`
-            returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame.
-    """
-
-    _func_name = 'Method pandas.dataframe.count().'
-
-    if not isinstance(self, DataFrameType):
-        raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self))
-
-    if not (isinstance(axis, types.Omitted) or axis == 0):
-        raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis))
-
-    if not (isinstance(level, types.Omitted) or level is None):
-        raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis))
-
-    if not (isinstance(numeric_only, types.Omitted) or numeric_only is False):
-        raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, axis))
-
-    def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False):
-        result_data = []
-        result_index = []
-
-        for dataframe_item in self._data:
-            item_count = dataframe_item.count()
-            item_name = dataframe_item._name
-            result_data.append(item_count)
-            result_index.append(item_name)
-
-        return pandas.Series(data=result_data, index=result_index)
-
-    return sdc_pandas_dataframe_count_impl
+if not sdc.config.use_default_dataframe:
+    from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType
+
+else:
+    def sdc_pandas_dataframe_reduce_columns_series(df, name, params):
+        saved_columns = df.columns
+        n_cols = len(saved_columns)
+        data_args = tuple('data{}'.format(i) for i in range(n_cols))
+        all_params = ['df'] + [f'{key}={value}' for key, value in params]
+        func_definition = 'def _reduce_impl({}):'.format(', '.join(all_params))
+
+        func_lines = [func_definition]
+        for i, d in enumerate(data_args):
+            line = '  {} = sdc.hiframes.api.init_series(sdc.hiframes.pd_dataframe_ext.get_dataframe_data(df, {}))'
+            func_lines.append(line.format(d + '_S', i))
+            func_lines.append('  {}_O = {}_S.{}({})'.format(d, d, name, ", ".join(
+                key for key, _ in params)))
+        func_lines.append("  return sdc.hiframes.pd_dataframe_ext.init_dataframe({}, None, {})\n".format(
+            ", ".join(d + '_O._data' for d in data_args),
+            ", ".join(f"'{c}'" for c in saved_columns)))
+
+        loc_vars = {}
+        func_text = '\n'.join(func_lines)
+        exec(func_text, {'sdc': sdc, 'np': numpy}, loc_vars)
+        _reduce_impl = loc_vars['_reduce_impl']
+
+        return _reduce_impl
+
+    def check_type(name, df, axis=None, skipna=None, level=None, numeric_only=None, ddof=1, min_count=0):
+        ty_checker = TypeChecker('Method {}().'.format(name))
+        ty_checker.check(df, DataFrameType)
+
+        if not (isinstance(axis, types.Omitted) or axis is None):
+            ty_checker.raise_exc(axis, 'unsupported', 'axis')
+
+        if not (isinstance(skipna, (types.Omitted, types.NoneType, types.Boolean)) or skipna is None):
+            ty_checker.raise_exc(skipna, 'bool', 'skipna')
+
+        if not (isinstance(level, types.Omitted) or level is None):
+            ty_checker.raise_exc(level, 'unsupported', 'level')
+
+        if not (isinstance(numeric_only, types.Omitted) or numeric_only is None):
+            ty_checker.raise_exc(numeric_only, 'unsupported', 'numeric_only')
+
+        if not (isinstance(ddof, types.Omitted) or ddof == 1):
+            ty_checker.raise_exc(ddof, 'unsupported', 'ddof')
+
+        if not (isinstance(min_count, types.Omitted) or min_count == 0):
+            ty_checker.raise_exc(min_count, 'unsupported', 'min_count')
+
+    @overload_method(DataFrameType, 'head')
+    def head_overload(df, n=5):
+        """
+        Pandas DataFrame method :meth:`pandas.DataFrame.head` implementation.
+        .. only:: developer
+            Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_head1
+        Parameters
+        -----------
+        self: :class:`pandas.DataFrame`
+            input arg
+        n: :obj:`int`, default 5
+            input arg, default 5
+        Returns
+        -------
+        :obj:`pandas.Series`
+        returns: The first n rows of the caller object.
+        """
+
+        name = 'head'
+
+        ty_checker = TypeChecker('Method {}().'.format(name))
+        ty_checker.check(df, DataFrameType)
+
+        if not (isinstance(n, (types.Omitted, types.Integer)) or n == 5):
+            ty_checker.raise_exc(n, 'int64', 'n')
+
+        return sdc_pandas_dataframe_reduce_columns_series(df, name, [('n', 5)])
diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py
@@ -3749,17 +3749,18 @@ def hpat_pandas_series_fillna(self, value=None, method=None, axis=None, inplace=
     if not isinstance(self, SeriesType):
         raise TypingError('{} The object must be a pandas.series. Given: {}'.format(_func_name, self))
 
-    if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType, types.Omitted)) or axis is None):
+    if not (isinstance(axis, (types.Integer, types.StringLiteral, types.UnicodeType,
+                              types.Omitted, types.NoneType)) or axis is None):
         raise TypingError('{} The axis must be an Integer or String. Given: {}'.format(_func_name, axis))
 
     if not (isinstance(inplace, types.Literal) and isinstance(inplace, types.Boolean)
             or isinstance(inplace, types.Omitted)
             or inplace is False):
         raise TypingError('{} The inplace must be a literal Boolean constant. Given: {}'.format(_func_name, inplace))
 
-    if not ((method is None or isinstance(method, types.Omitted))
-            and (limit is None or isinstance(limit, types.Omitted))
-            and (downcast is None or isinstance(downcast, types.Omitted))
+    if not ((method is None or isinstance(method, (types.Omitted, types.NoneType)))
+            and (limit is None or isinstance(limit, (types.Omitted, types.NoneType)))
+            and (downcast is None or isinstance(downcast, (types.Omitted, types.NoneType)))
     ):
         raise TypingError('{} Unsupported parameters. Given method: {}, limit: {}, downcast: {}'.format(
                 _func_name, method, limit, downcast))

diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
@@ -1628,3 +1628,5 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None,
                       date_format, doublequote, escapechar, decimal)
 
     return _impl
+
+from sdc.datatypes.hpat_pandas_dataframe_functions import *
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
@@ -41,6 +41,9 @@
                                    skip_numba_jit, skip_sdc_jit)
 
 from sdc.tests.gen_test_data import ParquetGenerator
+from sdc.tests.test_utils import (min_float64, max_float64, test_global_input_data_float64,
+                                  test_global_input_data_unicode_kind4, test_datatime,
+                                  min_int64, max_int64, test_global_input_data_int64)
 from numba.config import IS_32BITS
 
 
@@ -123,6 +126,7 @@ def test_impl(df):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
 
+    @unittest.skip('returned NULL without setting an error')
     def test_box1(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
@@ -889,7 +893,7 @@ def test_impl(n):
 
     def test_df_fillna1(self):
         def test_impl(df):
-            return df.fillna(5.0)
+            return df.fillna(0.)
 
         df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0]})
         hpat_func = self.jit(test_impl)
@@ -1134,7 +1138,6 @@ def test_impl():
     @unittest.skip("Implement iterrows for DataFrame")
     def test_dataframe_iterrows(self):
         def test_impl(df):
-            print(df.iterrows())
             return [row for _, row in df.iterrows()]
 
         df = pd.DataFrame({'A': [1, 2, 3], 'B': [0.2, 0.5, 0.001], 'C': ['a', 'bb', 'ccc']})
@@ -1151,6 +1154,50 @@ def test_impl(n):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_series_equal(hpat_func(n), test_impl(n))
 
+    def test_dataframe_head(self):
+        def test_impl(df):
+            return df.head()
+        sdc_func = sdc.jit(test_impl)
+        df = pd.DataFrame({"FLOAT": test_global_input_data_float64[0][:5],
+                           "DATATIME": test_datatime,
+                           "INT": test_global_input_data_int64[:5],
+                           "STRING": ['a', 'dd', 'c', '12', 'ddf']})
+        pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
+
+    def test_dataframe_head1(self):
+        def test_impl(df, n):
+            return df.head(n)
+        sdc_func = sdc.jit(test_impl)
+        df = pd.DataFrame({"FLOAT": test_global_input_data_float64[0][:5],
+                           "DATATIME": test_datatime,
+                           "INT": test_global_input_data_int64[:5],
+                           "STRING": ['a', 'dd', 'c', '12', 'ddf']})
+        for n in [-1, 0, 2, 5]:
+            pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
+
+    @unittest.skip('Dataframe.index not support')
+    def test_dataframe_head1_index(self):
+        def test_impl(df, n):
+            return df.head(n)
+        sdc_func = sdc.jit(test_impl)
+        df = pd.DataFrame({"FLOAT": test_global_input_data_float64[0][:5],
+                           "DATATIME": test_datatime,
+                           "INT": test_global_input_data_int64[:5],
+                           "STRING": ['a', 'dd', 'c', '12', 'ddf']},
+                           index=[32, 3, 6, 17, 23])
+        for n in [-1, 0, 2, 5]:
+            pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
+
+    def test_dataframe_head2(self):
+        def test_impl(df, n):
+            return df.head(n)
+        sdc_func = sdc.jit(test_impl)
+        df = pd.DataFrame({"A": [12, 4, 5, 1, 6, 8],
+                           "B": [5, 2, 54, 3, 6, 4],
+                           "C": [20, 16, 3, 8, 2, 3],
+                           "D": [14, 3, 2, 6, 4, 5]})
+        for n in [-1, 0, 2, 5]:
+            pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/sdc/tests/test_utils.py b/sdc/tests/test_utils.py
@@ -37,10 +37,14 @@
     '¡Y tú quién te crees?',
     '🐍⚡',
     '大处 着眼，c小处着手c。大大c大处',
+    'c小处着手c。',
+    'te crees?'
 ]
 
 min_float64 = np.finfo('float64').min
 max_float64 = np.finfo('float64').max
+min_int64 = np.iinfo(np.int64).min
+max_int64 = np.iinfo(np.int64).max
 
 test_global_input_data_float64 = [
     [1., -1., 0.1, min_float64, max_float64, max_float64, min_float64, -0.1],
@@ -49,6 +53,10 @@
     [np.nan, np.inf, np.inf, np.nan, np.nan, np.nan, np.NINF, np.NZERO],
 ]
 
+test_datatime = np.array(['2007-07-13', '2006-01-13', '2010-08-13',
+                         '2005-02-27', '2005-02-28'], dtype='datetime64')
+
+test_global_input_data_int64 = [min_int64, 0, 1, -23, max_int64, min_int64, max_int64]
 
 def count_array_REPs():
     if sdc.config.config_pipeline_hpat_default:
Original file line number	Diff line number	Diff line change
Expand Up		@@ -1628,3 +1628,5 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None,
		date_format, doublequote, escapechar, decimal)

		return _impl

		from sdc.datatypes.hpat_pandas_dataframe_functions import *